Commit 80462b98 authored by Dan Povey's avatar Dan Povey
Browse files

Final Kaldi poster

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@655 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 73477735
all: kaldi-poster.pdf code-samples.ps script-toplevel.ps script-lowlevel.ps
# The poster source pulls in all three snippet PDFs via \includegraphics
# (code-samples.pdf, script-toplevel.pdf, script-lowlevel.pdf), so every one
# of them must be listed here; otherwise a clean or parallel build can run
# pdflatex on the poster before the included PDFs exist.
kaldi-poster.pdf: kaldi-poster.tex beamerthemekaldi1.sty code-samples.pdf \
		script-toplevel.pdf script-lowlevel.pdf
	pdflatex kaldi-poster
# Snippet targets: each code/script excerpt lives in its own small .tex file
# that is compiled to a PDF with pdflatex. The poster source notes that
# verbatim could not be used directly inside its blocks, so these PDFs are
# included as cropped images instead. The matching .ps files are produced
# with pdf2ps purely so the crop bounding box can be read off in a viewer.
code-samples.pdf: code-samples.tex
pdflatex code-samples.tex
code-samples.ps: code-samples.pdf # only needed so we can use ghostview to find bounding box.
pdf2ps code-samples.pdf code-samples.ps
script-toplevel.pdf: script-toplevel.tex
pdflatex script-toplevel.tex
script-toplevel.ps: script-toplevel.pdf # only needed so we can use ghostview to find bounding box.
pdf2ps script-toplevel.pdf script-toplevel.ps
script-lowlevel.pdf: script-lowlevel.tex
pdflatex script-lowlevel.tex
script-lowlevel.ps: script-lowlevel.pdf # only needed so we can use ghostview to find bounding box.
pdf2ps script-lowlevel.pdf script-lowlevel.ps
#latex code-samples.tex
# dvips code-samples.dvi -o code-samples.eps
# mv code-samples.eps tmp.eps
# cat tmp.eps | awk '/BoundingBox/{ print "%%BoundingBox: 128 517 486 725"; next; } {print;}' \
> code-samples.eps
# ps2pdf -dEPSCrop code-samples.eps code-samples.pdf
...@@ -133,12 +133,12 @@ ...@@ -133,12 +133,12 @@
\end{beamercolorbox} \end{beamercolorbox}
\leavevmode% \leavevmode%
% \begin{beamercolorbox}[ht=4ex,leftskip=1em,rightskip=1em]{author in head/foot}% \begin{beamercolorbox}[ht=4ex,leftskip=1em,rightskip=1em]{author in head/foot}%
% \texttt{http://kaldi.sf.net} \texttt{http://kaldi.sf.net}
% \hfill \hfill
% % \texttt{<surname>@cs.rwth-aachen.de} \texttt{kaldi-developers@lists.sourceforge.net}
% \vskip1ex \vskip1ex
% \end{beamercolorbox} \end{beamercolorbox}
\vskip0pt% \vskip0pt%
\begin{beamercolorbox}[wd=\paperwidth]{lower separation line foot} \begin{beamercolorbox}[wd=\paperwidth]{lower separation line foot}
\rule{0pt}{3pt} \rule{0pt}{3pt}
......
% Standalone helper document: typesets a single C++ function verbatim so
% that the resulting PDF can be included in the poster as a cropped image.
% NOTE(review): the poster crops this PDF with a fixed bounding box; the
% line count is unchanged here, but re-check the box if the text is edited.
\documentclass{article}
\begin{document}
\begin{verbatim}
void DiagGmm::LogLikelihoods(const VectorBase<BaseFloat> &data,
                             Vector<BaseFloat> *loglikes) const {
  loglikes->Resize(gconsts_.Dim(), kUndefined);
  loglikes->CopyFromVec(gconsts_);
  if (static_cast<int32>(data.Dim()) != Dim()) {
    KALDI_ERR << "DiagGmm::LogLikelihoods, dimension "
              << "mismatch " << (data.Dim()) << " vs. " << (Dim());
  }
  Vector<BaseFloat> data_sq(data);
  data_sq.ApplyPow(2.0);
  // loglikes += means * inv(vars) * data.
  loglikes->AddMatVec(1.0, means_invvars_, kNoTrans, data, 1.0);
  // loglikes += -0.5 * inv(vars) * data_sq.
  loglikes->AddMatVec(-0.5, inv_vars_, kNoTrans, data_sq, 1.0);
}
\end{verbatim}
\vspace{0.4in}
\end{document}
...@@ -61,43 +61,63 @@ ...@@ -61,43 +61,63 @@
% Since all columns are the same length, it is all nice and tidy. You have to get the height empirically % Since all columns are the same length, it is all nice and tidy. You have to get the height empirically
% ---------------------------------------------------------% % ---------------------------------------------------------%
% fill each column with content % fill each column with content
\begin{block}{Features of Kaldi}
\begin{block}{Problem Statement}
\begin{itemize}
\item You are a researcher who wants to try out a new method.
\item You want other people to use your idea, if it works.
\item You find that older toolkits (HTK, CMUSphinx) are too hard to modify.
\item Also their license may not allow you to release your changes (HTK).
\end{itemize}
\end{block}
\vfill
\begin{block}{The Kaldi project}
\begin{itemize}
\item Open-source speech recognition toolkit, Apache-licensed.
\item Began at 2009 Johns Hopkins University CLSP summer workshop.
\item Further development included 2 summer workshops in Brno, Czech Republic, and ongoing work by the participants.
\item Uses OpenFst code for decoding graph construction.
\item Uses the BLAS and LAPACK libraries for linear algebra.
\item Our closest competitor is probably the RWTH Aachen toolkit.
\end{itemize}
\end{block}
\vfill
\begin{block}{Our vision}
\begin{itemize}
\item Distributed community of users and contributors.
\begin{itemize}
\item Not a free-for-all; original authors moderate contributions.
\item The Apache license allows you to fork the project.
\end{itemize}
\item Complete state-of-the-art recipes that run from public data
\begin{itemize}
\item These already exist for Resource Management and Wall Street Journal
\item Switchboard recipe exists but is not yet state-of-the-art.
\end{itemize}
\item Code that's well structured and simple to understand.
\item Thorough testing and documentation.
\end{itemize}
\end{block}
\vfill
\begin{block}{The structure of Kaldi}
\begin{columns} \begin{columns}
\begin{column}{.44\textwidth} \begin{column}{.44\textwidth}
\begin{itemize} \begin{itemize}
\item Integration with Finite State Transducers \item OpenFst for FST functions
\item Extensive linear algebra support \item BLAS/LAPACK for linear algebra
\item Extensible design \item Core functions in C++
\item Open license \item Many simple command-line utilities
\item Complete recipes \item Example shell scripts
\item Thorough testing
\end{itemize} \end{itemize}
\end{column} \end{column}
\begin{column}{.55\textwidth} \begin{column}{.55\textwidth}
\centering \centering
\includegraphics[width=0.85\linewidth]{figures/kaldi-lib.pdf} \includegraphics[width=0.99\linewidth]{figures/kaldi-lib.pdf} \vspace*{0.3in}
\end{column} \end{column}
\end{columns} \end{columns}
\vskip-1ex \vskip-1ex
\end{block} \end{block}
\vfill \vfill
\begin{block}{Standard ASR techniques supported in Kaldi}
\begin{itemize}
\item Acoustic front-end supports MFCC and PLP features, with
cepstral mean and variance normalization, LDA, STC/MLLT, HLDA,
VTLN, etc.
\item HMM/GMM acoustic models
\item No language modeling code, but support converting ARPA
format LMs to FSTs.
\end{itemize}
\end{block}
\vfill
\begin{block}{Features unique to Kaldi}
\begin{itemize}
\item SGMM acoustic models
\item Exponential transform
\end{itemize}
\end{block}
} }
\end{minipage} \end{minipage}
\end{beamercolorbox} \end{beamercolorbox}
...@@ -115,16 +135,29 @@ ...@@ -115,16 +135,29 @@
% ---------------------------------------------------------% % ---------------------------------------------------------%
% fill each column with content % fill each column with content
\begin{block}{The Kaldi Decoder}
% \begin{verbatim} \begin{block}{Example of Kaldi code}
% class DecodableInterface { %% We couldn't get verbatim to work in here, so used a temporary tex file.
% public: \includegraphics[width=0.9\linewidth,bb=128 517 486 725]{code-samples.pdf}
% virtual float LogLikelihood(int frame, int index) = 0; \end{block}
% virtual bool IsLastFrame(int frame) = 0; \vfill
% virtual int NumIndices() = 0; \begin{block}{Acoustic modeling techniques supported in Kaldi}
% virtual ~DecodableInterface() {} \begin{itemize}
% }; \item Acoustic front-end supports MFCC and PLP features, with
% \end{verbatim} cepstral mean and variance normalization, LDA, STC/MLLT, HLDA,
VTLN, etc.
\item HMM/GMM acoustic models; phonetic decision trees.
\item Also SGMMs, exponential transform.
\item No language modeling code, but support converting ARPA
format LMs to FSTs.
\item WFST-based decoders, lattice generation.
\item Discriminative training with MMI, boosted MMI (fMPE unfinished).
\end{itemize}
\end{block}
\vfill
\begin{block}{Example of top-level system building script}
%% We couldn't get verbatim to work in here, so used a temporary tex file.
\includegraphics[width=0.9\linewidth,bb=128 472 495 725]{script-toplevel.pdf}
\end{block} \end{block}
} }
\end{minipage} \end{minipage}
...@@ -142,19 +175,10 @@ ...@@ -142,19 +175,10 @@
% Since all columns are the same length, it is all nice and tidy. You have to get the height empirically % Since all columns are the same length, it is all nice and tidy. You have to get the height empirically
% ---------------------------------------------------------% % ---------------------------------------------------------%
% fill each column with content % fill each column with content
\begin{block}{Databases} \begin{block}{Segment of triphone training script}
\begin{itemize} %% We couldn't get verbatim to work in here, so used a temporary tex file.
\item Resource Management (RM) \includegraphics[width=0.95\linewidth,bb=134 284 552 713]{script-lowlevel.pdf}
\begin{itemize}
\item
\item
\end{itemize}
\item Wall Street Journal (WSJ)
\begin{itemize}
\item
\end{itemize}
\end{itemize}
\end{block} \end{block}
\vfill \vfill
\begin{block}{Comparison with previously published results} \begin{block}{Comparison with previously published results}
...@@ -190,37 +214,51 @@ ...@@ -190,37 +214,51 @@
\end{table} \end{table}
\end{column} \end{column}
\end{columns} \end{columns}
\end{block}
\vfill
\begin{block}{Other Results}
\begin{itemize} \begin{itemize}
\item AR-Face: 110 classes, 110 train (``one-shot'' training), 550 test \item These are not our best results: they just show that with similar system setups, we get similar results.
\end{itemize} \begin{itemize}
\vskip-0.5ex \item WSJ results in table use bigram LM.
\begin{table} \end{itemize}
\small \item Current best RM result: 1.78\%
\centering \begin{itemize}
\begin{tabular}{l @{} c c c@{}} \toprule \item System combination: LDA+MLLT+SAT+MMI with LDA+MLLT+SAT+SGMM+fMLLR.
& RM (Avg) & WSJ Nov'92 & WSJ Nov'93 \\ \end{itemize}
\cmidrule(lr){2-2} \cmidrule(lr){3-3} \cmidrule(r){4-4} \item Current best WSJ result: 4.39\% on eval'92 open-vocabulary test set.
Triphone & 3.97 & 12.5 & 18.3 \\ \begin{itemize}
\,\, + fMLLR & 3.59 & 11.4 & 15.5 \\ \item Train on SI-284, LDA+MLLT+SAT+SGMM, extended vocabulary, 4-gram LM trained from supplied transcripts.
\,\, + LVTLN & 3.30 & 11.1 & 16.4 \\ \end{itemize}
Splice-9 + LDA + MLLT & 3.88 & 12.2 & 17.7 \\
\,\, + SAT (fMLLR) & 2.70 & 9.6 & 13.7 \\
\,\, + SGMM + spk-vecs & 2.45 & 10.0 & 13.4 \\
\qquad + fMLLR & 2.31 & 9.8 & 12.9 \\
\qquad + ET & 2.15 & 9.0 & 12.3 \\
\bottomrule
\end{tabular}
\end{table}
\end{block}
\vfill
\begin{block}{Conclusions}
\begin{itemize}
\item
\end{itemize} \end{itemize}
\end{block} \end{block}
%% \vfill
%% \begin{block}{Other Results}
%% \begin{itemize}
%% \item AR-Face: 110 classes, 110 train (``one-shot'' training), 550 test
%% \end{itemize}
%% \vskip-0.5ex
%% \begin{table}
%% \small
%% \centering
%% \begin{tabular}{l @{} c c c@{}} \toprule
%% & RM (Avg) & WSJ Nov'92 & WSJ Nov'93 \\
%% \cmidrule(lr){2-2} \cmidrule(lr){3-3} \cmidrule(r){4-4}
%% Triphone & 3.97 & 12.5 & 18.3 \\
%% \,\, + fMLLR & 3.59 & 11.4 & 15.5 \\
%% \,\, + LVTLN & 3.30 & 11.1 & 16.4 \\
%% Splice-9 + LDA + MLLT & 3.88 & 12.2 & 17.7 \\
%% \,\, + SAT (fMLLR) & 2.70 & 9.6 & 13.7 \\
%% \,\, + SGMM + spk-vecs & 2.45 & 10.0 & 13.4 \\
%% \qquad + fMLLR & 2.31 & 9.8 & 12.9 \\
%% \qquad + ET & 2.15 & 9.0 & 12.3 \\
%% \bottomrule
%% \end{tabular}
%% \end{table}
%% \end{block}
%% \vfill
%% \begin{block}{Conclusions}
%% \begin{itemize}
%% \item
%% \end{itemize}
%% \end{block}
} }
% ---------------------------------------------------------% % ---------------------------------------------------------%
% end the column % end the column
......
% Standalone helper document: typesets a shell-script excerpt verbatim so
% the resulting PDF can be included elsewhere as an image.
% NOTE(review): presumably script-lowlevel.tex, shown on the poster as the
% triphone training segment -- confirm against the build rules.
%
% The excerpt shows, in order:
%   * a per-split feature pipeline string (apply-cmvn piped into add-deltas)
%     stored in featspart[n];
%   * a training loop over iterations x that
%       - realigns the data with gmm-align-compiled on the iterations listed
%         in realign_iters,
%       - accumulates statistics per split with gmm-acc-stats-ali,
%       - re-estimates the model with gmm-est from the summed accumulators,
%       - removes the previous iteration's model, accumulators and occs,
%       - raises numgauss by incgauss while x <= maxiterinc.
% Everything between \begin{verbatim} and \end{verbatim} is rendered
% literally, including whitespace, so edits there change the output.
\documentclass{article}
\begin{document}
\begin{verbatim}
for n in `get_splits.pl $nj`; do
featspart[$n]="ark:apply-cmvn --norm-vars=false \
--utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn \
scp:$data/split$nj/$n/feats.scp ark:- | add-deltas ark:- ark:- |"
done
# tree building, graph compilation omitted.
while [ $x -lt $numiters ]; do
echo "Iteration $x"
if echo $realign_iters | grep -w $x >/dev/null; then
echo "Aligning data"
for n in `get_splits.pl $nj`; do
$cmd $dir/log/align.$x.$n.log \
gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 \
$dir/$x.mdl "ark:gunzip -c $dir/$n.fsts.gz|" "${featspart[$n]}" \
"ark:|gzip -c >$dir/$n.ali.gz" || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Alignment error on iteration $x" && exit 1;
fi
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali $dir/$x.mdl "${featspart[$n]}" \
"ark,s,cs:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Accumulation error on iteration $x" && exit 1;
$cmd $dir/log/update.$x.log \
gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
rm $dir/$x.mdl $dir/$x.*.acc
rm $dir/$x.occs
if [[ $x -le $maxiterinc ]]; then
numgauss=$[$numgauss+$incgauss];
fi
x=$[$x+1];
done
\end{verbatim}
\vspace{0.4in}
\end{document}
% Standalone helper document: typesets a top-level system-building script
% verbatim so the resulting PDF can be included elsewhere as an image.
% NOTE(review): presumably script-toplevel.tex, shown on the poster as the
% top-level system building example -- confirm against the build rules.
%
% The excerpt shows, in order: WSJ data/dictionary/format preparation,
% MFCC extraction for four data sets, definition of the queue.pl commands
% used for decoding and training, then monophone training, alignment and
% delta-feature triphone training.
% Everything between \begin{verbatim} and \end{verbatim} is rendered
% literally, including whitespace, so edits there change the output.
\documentclass{article}
\begin{document}
\begin{verbatim}
## Example of WSJ system building.
local/wsj_data_prep.sh /mnt/speech_data/WSJ?/??-{?,??}.?
local/wsj_prepare_dict.sh
local/wsj_format_data.sh
mfccdir=/mnt/my_storage/kaldi_wsj_mfcc
for x in test_eval92 test_eval93 test_dev93 train_si284; do
steps/make_mfcc.sh data/$x exp/make_mfcc/$x $mfccdir 4
done
# skipped some data-subsetting commands here.
# This setup would use GridEngine.
decode_cmd="queue.pl -q queue_name -l ram_free=1200M,mem_free=1200M"
train_cmd="queue.pl -q queue_name -l ram_free=700M,mem_free=700M"
steps/train_mono.sh --num-jobs 10 --cmd "$train_cmd" \
data/train_si84_2kshort data/lang exp/mono0a
steps/align_deltas.sh --num-jobs 10 --cmd "$train_cmd" \
data/train_si84_half data/lang exp/mono0a exp/mono0a_ali
steps/train_deltas.sh --num-jobs 10 --cmd "$train_cmd" \
2000 10000 data/train_si84_half data/lang exp/mono0a_ali exp/tri1
# ...
\end{verbatim}
\vspace{0.4in}
\end{document}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment