Commit 626892a3 authored by Dan Povey's avatar Dan Povey
Browse files

Fixes to paper.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@530 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 4dd5597d
%!PS-Adobe-2.0 EPSF-2.0 %!PS-Adobe-2.0 EPSF-2.0
%%Title: decodebeam.eps %%Title: decodebeam.eps
%%Creator: gnuplot 4.4 patchlevel 0 %%Creator: gnuplot 4.4 patchlevel 0
%%CreationDate: Tue Sep 27 14:24:27 2011 %%CreationDate: Tue Sep 27 19:34:29 2011
%%DocumentFonts: (atend) %%DocumentFonts: (atend)
%%BoundingBox: 50 50 625 481 %%BoundingBox: 50 50 625 481
%%EndComments %%EndComments
...@@ -50,7 +50,7 @@ SDict begin [ ...@@ -50,7 +50,7 @@ SDict begin [
/Author (dpovey) /Author (dpovey)
% /Producer (gnuplot) % /Producer (gnuplot)
% /Keywords () % /Keywords ()
/CreationDate (Tue Sep 27 14:24:27 2011) /CreationDate (Tue Sep 27 19:34:29 2011)
/DOCINFO pdfmark /DOCINFO pdfmark
end end
} ifelse } ifelse
...@@ -1027,14 +1027,34 @@ LTb ...@@ -1027,14 +1027,34 @@ LTb
0.500 UL 0.500 UL
LTb LTb
% Begin plot #1 % Begin plot #1
2.000 UP
0.500 UL 0.500 UL
LT0 LT0
LCb setrgbcolor LCb setrgbcolor
4304 3700 M 4304 3700 M
[ [(Helvetica) 270.0 0.0 true true 0 (Rescored WER)] [ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow ] -90.0 MRshow
LT0 LT0
4442 3700 M 5333 1986 Crs
4851 1986 Crs
4369 1998 Crs
3886 2015 Crs
3404 2087 Crs
2922 2163 Crs
2440 2327 Crs
1957 2812 Crs
1475 3629 Crs
4749 3700 Crs
% End plot #1
% Begin plot #2
0.500 UL
LT1
LCb setrgbcolor
4304 3470 M
[ [(Helvetica) 270.0 0.0 true true 0 (Rescored WER)]
] -90.0 MRshow
LT1
4442 3470 M
615 0 V 615 0 V
5333 1177 M 5333 1177 M
-482 0 V -482 0 V
...@@ -1045,29 +1065,9 @@ LT0 ...@@ -1045,29 +1065,9 @@ LT0
-482 122 V -482 122 V
-483 535 V -483 535 V
-482 831 V -482 831 V
% End plot #1
% Begin plot #2
2.000 UP
stroke
LT1
LCb setrgbcolor
4304 3470 M
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT1
5333 1986 Pls
4851 1986 Pls
4369 1998 Pls
3886 2015 Pls
3404 2087 Pls
2922 2163 Pls
2440 2327 Pls
1957 2812 Pls
1475 3629 Pls
4749 3470 Pls
% End plot #2 % End plot #2
1.000 UP 1.000 UP
0.500 UL stroke
LTb LTb
grestore % colour palette end grestore % colour palette end
gsave % colour palette begin gsave % colour palette begin
...@@ -1346,16 +1346,36 @@ LTb ...@@ -1346,16 +1346,36 @@ LTb
0.500 UL 0.500 UL
LTb LTb
% Begin plot #1 % Begin plot #1
2.000 UP
0.500 UL 0.500 UL
LT0 LT0
LCb setrgbcolor LCb setrgbcolor
9367 7784 M 9367 7784 M
[ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)] [ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow ] -90.0 MRshow
LT0 LT0
9505 7784 M 10396 7016 Crs
9914 7016 Crs
9432 7022 Crs
8949 7031 Crs
8467 7066 Crs
7985 7104 Crs
7503 7187 Crs
7020 7429 Crs
6538 7838 Crs
9812 7784 Crs
% End plot #1
% Begin plot #2
0.500 UL
LT1
LCb setrgbcolor
9367 7554 M
[ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)]
] -90.0 MRshow
LT1
9505 7554 M
615 0 V 615 0 V
276 -2555 R 276 -2325 R
-482 11 V -482 11 V
-482 33 V -482 33 V
-483 28 V -483 28 V
...@@ -1364,29 +1384,9 @@ LT0 ...@@ -1364,29 +1384,9 @@ LT0
-482 147 V -482 147 V
-483 365 V -483 365 V
-482 575 V -482 575 V
% End plot #1
% Begin plot #2
2.000 UP
stroke
LT1
LCb setrgbcolor
9367 7554 M
[ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
] -90.0 MRshow
LT1
10396 7016 Pls
9914 7016 Pls
9432 7022 Pls
8949 7031 Pls
8467 7066 Pls
7985 7104 Pls
7503 7187 Pls
7020 7429 Pls
6538 7838 Pls
9812 7554 Pls
% End plot #2 % End plot #2
1.000 UP 1.000 UP
0.500 UL stroke
LTb LTb
grestore % colour palette end grestore % colour palette end
gsave % colour palette begin gsave % colour palette begin
......
# run this in octave # run this in octave
figure(1)
hold off
markersize = 8;
labelsz = 23;
#set(0,"Defaulttextfontsize",labelsz)
%beam = [15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0] %beam = [15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0 15.0]
...@@ -20,6 +16,8 @@ rescore_err =[ 9.59 9.59 9.59 9.59 9.56 9.59 ...@@ -20,6 +16,8 @@ rescore_err =[ 9.59 9.59 9.59 9.59 9.56 9.59
rescore_utt_wrong = [205 205 205 205 204 204 204 202 205 212 ];; rescore_utt_wrong = [205 205 205 205 204 204 204 202 205 212 ];;
labelsz = 27; labelsz = 27;
figure(1)
hold off
set(1,"Defaulttextfontsize",labelsz) set(1,"Defaulttextfontsize",labelsz)
set(1,"Defaultaxesfontsize",labelsz) set(1,"Defaultaxesfontsize",labelsz)
...@@ -29,10 +27,12 @@ plot(latbeam, density) ...@@ -29,10 +27,12 @@ plot(latbeam, density)
xlabel('Lattice beam'); xlabel('Lattice beam');
ylabel('Lattice density'); ylabel('Lattice density');
subplot(2,2,2) subplot(2,2,2)
plot(latbeam, oracle); plot(latbeam, baseline_err, 'kx');
hold on hold on
plot(latbeam, baseline_err, '.'); plot(latbeam, oracle);
hold off hold off
set(gca(), "ylim", [2.5, 14.0]);
legend('One-best WER','Oracle WER')
xlabel('Lattice beam'); xlabel('Lattice beam');
ylabel('Oracle WER'); ylabel('Oracle WER');
subplot(2, 2, 3); subplot(2, 2, 3);
...@@ -65,18 +65,18 @@ plot(decode_beam, density) ...@@ -65,18 +65,18 @@ plot(decode_beam, density)
xlabel('Decoding beam'); xlabel('Decoding beam');
ylabel('Lattice density'); ylabel('Lattice density');
subplot(2,2,2) subplot(2,2,2)
plot(decode_beam, oracle); plot(decode_beam, wer, 'kx');
hold on hold on
plot(decode_beam, wer, 'k+'); plot(decode_beam, oracle);
legend('Oracle WER', 'One-best WER') legend('One-best WER', 'Oracle WER')
xlabel('Decoding beam'); xlabel('Decoding beam');
ylabel('WER, oracle'); ylabel('WER, oracle');
hold off hold off
subplot(2, 2, 3); subplot(2, 2, 3);
plot(decode_beam, rescore); plot(decode_beam, wer, 'kx');
hold on hold on
plot(decode_beam, wer, 'k+'); plot(decode_beam, rescore);
legend('Rescored WER', 'One-best WER') legend('One-best WER', 'Rescored WER')
xlabel('Decoding beam'); xlabel('Decoding beam');
ylabel('WER, rescoring with trigram LM'); ylabel('WER, rescoring with trigram LM');
hold off hold off
...@@ -88,3 +88,4 @@ ylabel('Real time factor'); ...@@ -88,3 +88,4 @@ ylabel('Real time factor');
print -F:23 -deps 'decodebeam.eps' print -F:23 -deps 'decodebeam.eps'
%!PS-Adobe-2.0 EPSF-2.0 %!PS-Adobe-2.0 EPSF-2.0
%%Title: latbeam.eps %%Title: latbeam.eps
%%Creator: gnuplot 4.4 patchlevel 0 %%Creator: gnuplot 4.4 patchlevel 0
%%CreationDate: Tue Sep 27 14:07:22 2011 %%CreationDate: Tue Sep 27 19:36:35 2011
%%DocumentFonts: (atend) %%DocumentFonts: (atend)
%%BoundingBox: 50 50 625 481 %%BoundingBox: 50 50 625 481
%%EndComments %%EndComments
...@@ -50,7 +50,7 @@ SDict begin [ ...@@ -50,7 +50,7 @@ SDict begin [
/Author (dpovey) /Author (dpovey)
% /Producer (gnuplot) % /Producer (gnuplot)
% /Keywords () % /Keywords ()
/CreationDate (Tue Sep 27 14:07:22 2011) /CreationDate (Tue Sep 27 19:36:35 2011)
/DOCINFO pdfmark /DOCINFO pdfmark
end end
} ifelse } ifelse
...@@ -1067,53 +1067,53 @@ Color InterpolatedColor or { % COLOUR vs. GRAY map ...@@ -1067,53 +1067,53 @@ Color InterpolatedColor or { % COLOUR vs. GRAY map
} ifelse } ifelse
0.500 UL 0.500 UL
LTb LTb
6538 5012 M 6538 5397 M
88 0 V 88 0 V
3770 0 R 3770 0 R
-88 0 V -88 0 V
stroke stroke
6400 5012 M 6400 5397 M
[ [(Helvetica) 270.0 0.0 true true 0 (2)] [ [(Helvetica) 270.0 0.0 true true 0 (4)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
6538 5602 M 6538 5910 M
88 0 V 88 0 V
3770 0 R 3770 0 R
-88 0 V -88 0 V
stroke stroke
6400 5602 M 6400 5910 M
[ [(Helvetica) 270.0 0.0 true true 0 (4)] [ [(Helvetica) 270.0 0.0 true true 0 (6)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
6538 6192 M 6538 6423 M
88 0 V 88 0 V
3770 0 R 3770 0 R
-88 0 V -88 0 V
stroke stroke
6400 6192 M 6400 6423 M
[ [(Helvetica) 270.0 0.0 true true 0 (6)] [ [(Helvetica) 270.0 0.0 true true 0 (8)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
6538 6782 M 6538 6936 M
88 0 V 88 0 V
3770 0 R 3770 0 R
-88 0 V -88 0 V
stroke stroke
6400 6782 M 6400 6936 M
[ [(Helvetica) 270.0 0.0 true true 0 (8)] [ [(Helvetica) 270.0 0.0 true true 0 (10)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
6538 7372 M 6538 7449 M
88 0 V 88 0 V
3770 0 R 3770 0 R
-88 0 V -88 0 V
stroke stroke
6400 7372 M 6400 7449 M
[ [(Helvetica) 270.0 0.0 true true 0 (10)] [ [(Helvetica) 270.0 0.0 true true 0 (12)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
...@@ -1123,7 +1123,7 @@ LTb ...@@ -1123,7 +1123,7 @@ LTb
-88 0 V -88 0 V
stroke stroke
6400 7962 M 6400 7962 M
[ [(Helvetica) 270.0 0.0 true true 0 (12)] [ [(Helvetica) 270.0 0.0 true true 0 (14)]
] -90.0 MRshow ] -90.0 MRshow
0.500 UL 0.500 UL
LTb LTb
...@@ -1211,36 +1211,49 @@ LTb ...@@ -1211,36 +1211,49 @@ LTb
0.500 UL 0.500 UL
LTb LTb
% Begin plot #1 % Begin plot #1
2.000 UP
0.500 UL 0.500 UL
LT0 LT0
10396 5195 M LCb setrgbcolor
-386 47 V 9367 7784 M
-386 21 V [ [(Helvetica) 270.0 0.0 true true 0 (One-best WER)]
-385 68 V ] -90.0 MRshow
-386 79 V LT0
-386 151 V 10396 7323 Crs
-386 194 V 10010 7323 Crs
-386 278 V 9624 7323 Crs
-385 448 V 9239 7323 Crs
-386 484 V 8853 7323 Crs
8467 7323 Crs
8081 7323 Crs
7695 7323 Crs
7310 7323 Crs
6924 7323 Crs
9812 7784 Crs
% End plot #1 % End plot #1
% Begin plot #2 % Begin plot #2
2.000 UP 0.500 UL
stroke
LT1 LT1
10396 7817 Pnt LCb setrgbcolor
10010 7817 Pnt 9367 7554 M
9624 7817 Pnt [ [(Helvetica) 270.0 0.0 true true 0 (Oracle WER)]
9239 7817 Pnt ] -90.0 MRshow
8853 7817 Pnt LT1
8467 7817 Pnt 9505 7554 M
8081 7817 Pnt 615 0 V
7695 7817 Pnt 276 -2511 R
7310 7817 Pnt -386 41 V
6924 7817 Pnt -386 18 V
-385 59 V
-386 69 V
-386 131 V
-386 169 V
-386 241 V
-385 390 V
-386 421 V
% End plot #2 % End plot #2
1.000 UP 1.000 UP
0.500 UL stroke
LTb LTb
grestore % colour palette end grestore % colour palette end
gsave % colour palette begin gsave % colour palette begin
......
...@@ -87,7 +87,7 @@ ...@@ -87,7 +87,7 @@
\name{ \em Daniel Povey$^1$, Mirko Hannemann$^{1,2}$, \\ \name{ \em Daniel Povey$^1$, Mirko Hannemann$^{1,2}$, \\
\em {Gilles Boulianne}$^3$, {Luk\'{a}\v{s} Burget}$^4$, {Arnab Ghoshal}$^5$, {Milos Janda}$^2$, {Stefan Kombrink}$^2$, \\ \em {Gilles Boulianne}$^3$, {Luk\'{a}\v{s} Burget}$^4$, {Arnab Ghoshal}$^5$, {Milos Janda}$^2$, {Stefan Kombrink}$^2$, \\
\em {Petr Motl\'{i}\v{c}ek}$^6$, {Yanmin Qian}$^7$, {Ngoc Thang Vu}$^8$, {Korbinian Riedhammer}$^9$, {Karel Vesel\'{y}}$^2$ \em {Petr Motl\'{i}\v{c}ek}$^6$, {Yanmin Qian}$^7$, {Ngoc Thang Vu}$^8$, {Korbinian Riedhammer}$^9$, {Karel Vesel\'{y}}$^2$
\thanks{Thanks to Sanjeev Khudanpur for his help in preparing the paper}} \thanks{Thanks to Sanjeev Khudanpur for his help in preparing the paper, and to Honza Cernocky, Renata Kohlova, Martin Karafiat and Tomas Mikolov for their help relating to the Kaldi'11 workshop at BUT.}}
%%% TODO: fix thanks. %%% TODO: fix thanks.
...@@ -149,14 +149,14 @@ where the Weighted Finite State Transducer (WFST) decoding graph is ...@@ -149,14 +149,14 @@ where the Weighted Finite State Transducer (WFST) decoding graph is
\begin{equation} \begin{equation}
\HCLG = \min(\det(H \circ C \circ L \circ G)), \HCLG = \min(\det(H \circ C \circ L \circ G)),
\end{equation} \end{equation}
where $\circ$ is WFST composisition (note: view $\HCLG$ as a single symbol). where $\circ$ is WFST composition (note: view $\HCLG$ as a single symbol).
For concreteness we will speak of ``costs'' rather For concreteness we will speak of ``costs'' rather
than weights, where a cost is a floating point number that typically represents a negated than weights, where a cost is a floating point number that typically represents a negated
log-probability. A WFST has a set of states with one distinguished log-probability. A WFST has a set of states with one distinguished
start state\footnote{This is the formulation that corresponds best with the toolkit we use.}, start state\footnote{This is the formulation that corresponds best with the toolkit we use.},
each state has a final-cost (or $\infty$ for non-final states); each state has a final-cost (or $\infty$ for non-final states);
and there is a set of arcs, where each arc has a weight, and there is a set of arcs, where each arc has a weight
weight (just think of this as a cost for now), an input label and an output (just think of this as a cost for now), an input label and an output
label. In $\HCLG$, the input labels are the identifiers of context-dependent label. In $\HCLG$, the input labels are the identifiers of context-dependent
HMM states, and the output labels represent words. For both the input and output HMM states, and the output labels represent words. For both the input and output
symbols, the special label $\epsilon$ may appear meaning ``no label is present.'' symbols, the special label $\epsilon$ may appear meaning ``no label is present.''
...@@ -412,18 +412,16 @@ corresponding to a full state-level lattice. That is, for every arc of $\HCLG$ ...@@ -412,18 +412,16 @@ corresponding to a full state-level lattice. That is, for every arc of $\HCLG$
on every frame, we create a separate arc in the state-level lattice. These arcs on every frame, we create a separate arc in the state-level lattice. These arcs
contain the acoustic and graph costs separately. We prune the state-level graph using contain the acoustic and graph costs separately. We prune the state-level graph using
a beam $\alpha$; we do this periodically (every 25 frames) but this is equivalent to a beam $\alpha$; we do this periodically (every 25 frames) but this is equivalent to
doing it just once at the end (similar to~\cite{efficient_general}). Let the pruned doing it just once at the end, as in~\cite{efficient_general}. Let the
state-level lattice that we get after pruning it at the end of the utterance be $P$. final pruned state-level lattice be $P$.
Let $Q = \inv(P)$, and let $E$ be an encoded version of $Q$ as described above (with the Let $Q = \inv(P)$, and let $E$ be an encoded version of $Q$ as described above (with the
state labels as part of the weights). The final lattice we output is state labels as part of the weights). The final lattice is
\begin{equation} \begin{equation}
L = \mathrm{prune}(\mathrm{det}(\mathrm{rmeps}(E)), \alpha) . L = \mathrm{prune}(\mathrm{det}(\mathrm{rmeps}(E)), \alpha) .
\end{equation} \end{equation}
The determinization and epsilon removal are done together by a single algorithm The determinization and epsilon removal are done together by a single algorithm
that we will describe below. that we will describe below. $L$ is a deterministic, acyclic weighted acceptor with the
The resulting lattice $L$ is a deterministic, acyclic weighted acceptor with the
words as the labels, and the graph and acoustic costs and the alignments words as the labels, and the graph and acoustic costs and the alignments
encoded into the weights. The costs and alignments are not ``synchronized'' encoded into the weights. The costs and alignments are not ``synchronized''
with the words. with the words.
...@@ -519,8 +517,7 @@ For these experiments we generate lattices with the bigram language model ...@@ -519,8 +517,7 @@ For these experiments we generate lattices with the bigram language model
supplied with the WSJ database, and for rescoring experiments we supplied with the WSJ database, and for rescoring experiments we
use the trigram language model. The acoustic scale was $1/16$ for first-pass use the trigram language model. The acoustic scale was $1/16$ for first-pass
decoding and $1/15$ for LM rescoring. decoding and $1/15$ for LM rescoring.
For simplicity we used a decoder that
For simplicity, our results were all obtained using a decoder that
does not support a ``maximum active states'' option, so the only variables does not support a ``maximum active states'' option, so the only variables
to consider are the beam used in the Viterbi beam search, and the separate to consider are the beam used in the Viterbi beam search, and the separate
beam $\alpha$ used for lattice generation. beam $\alpha$ used for lattice generation.
...@@ -540,9 +537,10 @@ $\alpha$, with the Viterbi beam fixed at 15. Note that we get all the improveme ...@@ -540,9 +537,10 @@ $\alpha$, with the Viterbi beam fixed at 15. Note that we get all the improveme
LM rescoring by increasing $\alpha$ to 4. The time taken by our algorithm started to increase rapidly after about LM rescoring by increasing $\alpha$ to 4. The time taken by our algorithm started to increase rapidly after about
$\alpha=8$, so a value of $\alpha$ anywhere between about 4 and 8 is sufficient for LM rescoring $\alpha=8$, so a value of $\alpha$ anywhere between about 4 and 8 is sufficient for LM rescoring
and still does not slow down decoding too much. and still does not slow down decoding too much.
Note that out of vocabulary words (OOVs) provide a floor on the lattice oracle error rate. Out of vocabulary words (OOVs) provide a floor on the lattice oracle error rate:
Of the 333 test utterances, 87 contained an OOV word, yet only 93 sentences had oracle errors of 333 test utterances, 87 contained at least one OOV word, yet only 93 sentences (6 more) had oracle errors
with $\alpha=10$. Figure~\ref{fig:viterbibeam} shows the effect of varying the with $\alpha=10$. Lattice density is defined as the average number of arcs crossing each frame.
Figure~\ref{fig:viterbibeam} shows the effect of varying the
Viterbi decoding beam, while leaving $\alpha$ fixed at 7. Viterbi decoding beam, while leaving $\alpha$ fixed at 7.
\begin{figure} \begin{figure}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment