Commit 626892a3 authored by Dan Povey
Browse files

Fixes to paper.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@530 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 4dd5597d
%!PS-Adobe-2.0 EPSF-2.0
%%Title: decodebeam.eps
%%Creator: gnuplot 4.4 patchlevel 0
%%CreationDate: Tue Sep 27 14:24:27 2011
%%CreationDate: Tue Sep 27 19:34:29 2011
%%DocumentFonts: (atend)
%%BoundingBox: 50 50 625 481
%%EndComments
......@@ -50,7 +50,7 @@ SDict begin [
/Author (dpovey)
% /Producer (gnuplot)
% /Keywords ()
/CreationDate (Tue Sep 27 14:24:27 2011)
/CreationDate (Tue Sep 27 19:34:29 2011)
/DOCINFO pdfmark
end
} ifelse
......@@ -1027,14 +1027,34 @@ LTb
0.500 UL
LTb
% Begin plot #1
2.000 UP
0.500 UL
LT0
LCb setrgbcolor
4304 3700 M
[ [(Helvetica) 270.0 0.0 true true 0 (Rescored WER)]
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT0
4442 3700 M
5333 1986 Crs
4851 1986 Crs
4369 1998 Crs
3886 2015 Crs
3404 2087 Crs
2922 2163 Crs
2440 2327 Crs
1957 2812 Crs
1475 3629 Crs
4749 3700 Crs
% End plot #1
% Begin plot #2
0.500 UL
LT1
LCb setrgbcolor
4304 3470 M
[ [(Helvetica) 270.0 0.0 true true 0 (Rescored WER)]
] -90.0 MRshow
LT1
4442 3470 M
615 0 V
5333 1177 M
-482 0 V
......@@ -1045,29 +1065,9 @@ LT0
-482 122 V
-483 535 V
-482 831 V
% End plot #1
% Begin plot #2
2.000 UP
stroke
LT1
LCb setrgbcolor
4304 3470 M
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT1
5333 1986 Pls
4851 1986 Pls
4369 1998 Pls
3886 2015 Pls
3404 2087 Pls
2922 2163 Pls
2440 2327 Pls
1957 2812 Pls
1475 3629 Pls
4749 3470 Pls
% End plot #2
1.000 UP
0.500 UL
stroke
LTb
grestore % colour palette end
gsave % colour palette begin
......@@ -1346,16 +1346,36 @@ LTb
0.500 UL
LTb
% Begin plot #1
2.000 UP
0.500 UL
LT0
LCb setrgbcolor
9367 7784 M
[ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)]
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT0
9505 7784 M
10396 7016 Crs
9914 7016 Crs
9432 7022 Crs
8949 7031 Crs
8467 7066 Crs
7985 7104 Crs
7503 7187 Crs
7020 7429 Crs
6538 7838 Crs
9812 7784 Crs
% End plot #1
% Begin plot #2
0.500 UL
LT1
LCb setrgbcolor
9367 7554 M
[ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)]
] -90.0 MRshow
LT1
9505 7554 M
615 0 V
276 -2555 R
276 -2325 R
-482 11 V
-482 33 V
-483 28 V
......@@ -1364,29 +1384,9 @@ LT0
-482 147 V
-483 365 V
-482 575 V
% End plot #1
% Begin plot #2
2.000 UP
stroke
LT1
LCb setrgbcolor
9367 7554 M
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT1
10396 7016 Pls
9914 7016 Pls
9432 7022 Pls
8949 7031 Pls
8467 7066 Pls
7985 7104 Pls
7503 7187 Pls
7020 7429 Pls
6538 7838 Pls
9812 7554 Pls
% End plot #2
1.000 UP
0.500 UL
stroke
LTb
grestore % colour palette end
gsave % colour palette begin
......
# run this in octave
figure(1)
hold off
markersize = 8;
labelsz = 23;
#set(0,"Defaulttextfontsize",labelsz)
%beam = [15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0]
......@@ -20,6 +16,8 @@ rescore_err =[ 9.59 9.59 9.59 9.59 9.56 9.59
rescore_utt_wrong = [205 205 205 205 204 204 204 202 205 212 ];;
labelsz = 27;
figure(1)
hold off
set(1,"Defaulttextfontsize",labelsz)
set(1,"Defaultaxesfontsize",labelsz)
......@@ -29,10 +27,12 @@ plot(latbeam, density)
xlabel('Lattice beam');
ylabel('Lattice density');
subplot(2,2,2)
plot(latbeam, oracle);
plot(latbeam, baseline_err, 'kx');
hold on
plot(latbeam, baseline_err, '.');
plot(latbeam, oracle);
hold off
set(gca(), "ylim", [2.5, 14.0]);
legend('One-best WER','Oracle WER')
xlabel('Lattice beam');
ylabel('Oracle WER');
subplot(2, 2, 3);
......@@ -65,18 +65,18 @@ plot(decode_beam, density)
xlabel('Decoding beam');
ylabel('Lattice density');
subplot(2,2,2)
plot(decode_beam, oracle);
plot(decode_beam, wer, 'kx');
hold on
plot(decode_beam, wer, 'k+');
legend('Oracle WER', 'One-best WER')
plot(decode_beam, oracle);
legend('One-best WER', 'Oracle WER')
xlabel('Decoding beam');
ylabel('WER, oracle');
hold off
subplot(2, 2, 3);
plot(decode_beam, rescore);
plot(decode_beam, wer, 'kx');
hold on
plot(decode_beam, wer, 'k+');
legend('Rescored WER', 'One-best WER')
plot(decode_beam, rescore);
legend('One-best WER', 'Rescored WER')
xlabel('Decoding beam');
ylabel('WER, rescoring with trigram LM');
hold off
......@@ -88,3 +88,4 @@ ylabel('Real time factor');
print -F:23 -deps 'decodebeam.eps'
%!PS-Adobe-2.0 EPSF-2.0
%%Title: latbeam.eps
%%Creator: gnuplot 4.4 patchlevel 0
%%CreationDate: Tue Sep 27 14:07:22 2011
%%CreationDate: Tue Sep 27 19:36:35 2011
%%DocumentFonts: (atend)
%%BoundingBox: 50 50 625 481
%%EndComments
......@@ -50,7 +50,7 @@ SDict begin [
/Author (dpovey)
% /Producer (gnuplot)
% /Keywords ()
/CreationDate (Tue Sep 27 14:07:22 2011)
/CreationDate (Tue Sep 27 19:36:35 2011)
/DOCINFO pdfmark
end
} ifelse
......@@ -1067,53 +1067,53 @@ Color InterpolatedColor or { % COLOUR vs. GRAY map
} ifelse
0.500 UL
LTb
6538 5012 M
6538 5397 M
88 0 V
3770 0 R
-88 0 V
stroke
6400 5012 M
[ [(Helvetica) 270.0 0.0 true true 0 (2)]
6400 5397 M
[ [(Helvetica) 270.0 0.0 true true 0 (4)]
] -90.0 MRshow
0.500 UL
LTb
6538 5602 M
6538 5910 M
88 0 V
3770 0 R
-88 0 V
stroke
6400 5602 M
[ [(Helvetica) 270.0 0.0 true true 0 (4)]
6400 5910 M
[ [(Helvetica) 270.0 0.0 true true 0 (6)]
] -90.0 MRshow
0.500 UL
LTb
6538 6192 M
6538 6423 M
88 0 V
3770 0 R
-88 0 V
stroke
6400 6192 M
[ [(Helvetica) 270.0 0.0 true true 0 (6)]
6400 6423 M
[ [(Helvetica) 270.0 0.0 true true 0 (8)]
] -90.0 MRshow
0.500 UL
LTb
6538 6782 M
6538 6936 M
88 0 V
3770 0 R
-88 0 V
stroke
6400 6782 M
[ [(Helvetica) 270.0 0.0 true true 0 (8)]
6400 6936 M
[ [(Helvetica) 270.0 0.0 true true 0 (10)]
] -90.0 MRshow
0.500 UL
LTb
6538 7372 M
6538 7449 M
88 0 V
3770 0 R
-88 0 V
stroke
6400 7372 M
[ [(Helvetica) 270.0 0.0 true true 0 (10)]
6400 7449 M
[ [(Helvetica) 270.0 0.0 true true 0 (12)]
] -90.0 MRshow
0.500 UL
LTb
......@@ -1123,7 +1123,7 @@ LTb
-88 0 V
stroke
6400 7962 M
[ [(Helvetica) 270.0 0.0 true true 0 (12)]
[ [(Helvetica) 270.0 0.0 true true 0 (14)]
] -90.0 MRshow
0.500 UL
LTb
......@@ -1211,36 +1211,49 @@ LTb
0.500 UL
LTb
% Begin plot #1
2.000 UP
0.500 UL
LT0
10396 5195 M
-386 47 V
-386 21 V
-385 68 V
-386 79 V
-386 151 V
-386 194 V
-386 278 V
-385 448 V
-386 484 V
LCb setrgbcolor
9367 7784 M
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT0
10396 7323 Crs
10010 7323 Crs
9624 7323 Crs
9239 7323 Crs
8853 7323 Crs
8467 7323 Crs
8081 7323 Crs
7695 7323 Crs
7310 7323 Crs
6924 7323 Crs
9812 7784 Crs
% End plot #1
% Begin plot #2
2.000 UP
stroke
0.500 UL
LT1
10396 7817 Pnt
10010 7817 Pnt
9624 7817 Pnt
9239 7817 Pnt
8853 7817 Pnt
8467 7817 Pnt
8081 7817 Pnt
7695 7817 Pnt
7310 7817 Pnt
6924 7817 Pnt
LCb setrgbcolor
9367 7554 M
[ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)]
] -90.0 MRshow
LT1
9505 7554 M
615 0 V
276 -2511 R
-386 41 V
-386 18 V
-385 59 V
-386 69 V
-386 131 V
-386 169 V
-386 241 V
-385 390 V
-386 421 V
% End plot #2
1.000 UP
0.500 UL
stroke
LTb
grestore % colour palette end
gsave % colour palette begin
......
......@@ -87,7 +87,7 @@
\name{ \em Daniel Povey$^1$, Mirko Hannemann$^{1,2}$, \\
\em {Gilles Boulianne}$^3$, {Luk\'{a}\v{s} Burget}$^4$, {Arnab Ghoshal}$^5$, {Milos Janda}$^2$, {Stefan Kombrink}$^2$, \\
\em {Petr Motl\'{i}\v{c}ek}$^6$, {Yanmin Qian}$^7$, {Ngoc Thang Vu}$^8$, {Korbinian Riedhammer}$^9$, {Karel Vesel\'{y}}$^2$
\thanks{Thanks to Sanjeev Khudanpur for his help in preparing the paper}}
\thanks{Thanks to Sanjeev Khudanpur for his help in preparing the paper, and to Honza Cernocky, Renata Kohlova, Martin Karafiat and Tomas Mikolov for their help relating to the Kaldi'11 workshop at BUT.}}
%%% TODO: fix thanks.
......@@ -149,14 +149,14 @@ where the Weighted Finite State Transducer (WFST) decoding graph is
\begin{equation}
\HCLG = \min(\det(H \circ C \circ L \circ G)),
\end{equation}
where $\circ$ is WFST composisition (note: view $\HCLG$ as a single symbol).
where $\circ$ is WFST composition (note: view $\HCLG$ as a single symbol).
For concreteness we will speak of ``costs'' rather
than weights, where a cost is a floating point number that typically represents a negated
log-probability. A WFST has a set of states with one distinguished
start state\footnote{This is the formulation that corresponds best with the toolkit we use.},
each state has a final-cost (or $\infty$ for non-final states);
and there is a set of arcs, where each arc has a weight,
weight (just think of this as a cost for now), an input label and an output
and there is a set of arcs, where each arc has a weight
(just think of this as a cost for now), an input label and an output
label. In $\HCLG$, the input labels are the identifiers of context-dependent
HMM states, and the output labels represent words. For both the input and output
symbols, the special label $\epsilon$ may appear meaning ``no label is present.''
......@@ -412,18 +412,16 @@ corresponding to a full state-level lattice. That is, for every arc of $\HCLG$
on every frame, we create a separate arc in the state-level lattice. These arcs
contain the acoustic and graph costs separately. We prune the state-level graph using
a beam $\alpha$; we do this periodically (every 25 frames) but this is equivalent to
doing it just once at the end (similar to~\cite{efficient_general}). Let the pruned
state-level lattice that we get after pruning it at the end of the utterance be $P$.
doing it just once at the end, as in~\cite{efficient_general}. Let the
final pruned state-level lattice be $P$.
Let $Q = \inv(P)$, and let $E$ be an encoded version of $Q$ as described above (with the
state labels as part of the weights). The final lattice we output is
state labels as part of the weights). The final lattice is
\begin{equation}
L = \mathrm{prune}(\mathrm{det}(\mathrm{rmeps}(E)), \alpha) .
\end{equation}
The determinization and epsilon removal are done together by a single algorithm
that we will describe below.
The resulting lattice $L$ is a deterministic, acyclic weighted acceptor with the
that we will describe below. $L$ is a deterministic, acyclic weighted acceptor with the
words as the labels, and the graph and acoustic costs and the alignments
encoded into the weights. The costs and alignments are not ``synchronized''
with the words.
......@@ -519,8 +517,7 @@ For these experiments we generate lattices with the bigram language model
supplied with the WSJ database, and for rescoring experiments we
use the trigram language model. The acoustic scale was $1/16$ for first-pass
decoding and $1/15$ for LM rescoring.
For simplicity, our results were all obtained using a decoder that
For simplicity we used a decoder that
does not support a ``maximum active states'' option, so the only variables
to consider are the beam used in the Viterbi beam search, and the separate
beam $\alpha$ used for lattice generation.
......@@ -540,9 +537,10 @@ $\alpha$, with the Viterbi beam fixed at 15. Note that we get all the improveme
LM rescoring by increasing $\alpha$ to 4. The time taken by our algorithm started to increase rapidly after about
$\alpha=8$, so a value of $\alpha$ anywhere between about 4 and 8 is sufficient for LM rescoring
and still does not slow down decoding too much.
Note that out of vocabularly words (OOVs) provide a floor on the lattice oracle error rate.
Of the 333 test utterances, 87 contained an OOV word, yet only 93 sentences had oracle errors
with $\alpha=10$. Figure~\ref{fig:viterbibeam} shows the effect of varying the
Out-of-vocabulary words (OOVs) provide a floor on the lattice oracle error rate:
of 333 test utterances, 87 contained at least one OOV word, yet only 93 sentences (6 more) had oracle errors
with $\alpha=10$. Lattice density is defined as the average number of arcs crossing each frame.
Figure~\ref{fig:viterbibeam} shows the effect of varying the
Viterbi decoding beam, while leaving $\alpha$ fixed at 7.
\begin{figure}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment