Commit bad94ae0 authored by Dan Povey

Adding fMPE scripts; changes to fMPE code.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@772 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 138a71fa
Update to legal notice, made Feb. 2012. We would like to clarify that we
are using a convention where multiple names in the Apache copyright headers,
for example
// Copyright 2009-2012 Yanmin Qian Arnab Ghoshal
do not necessarily signify joint ownership of copyright of that file, except
in cases where all those names were present in the original release made in
March 2011 (you can use the version history to work this out, if this matters
to you). Instead, we intend that those contributors who later modified the file
agree to release their changes under the Apache license, but do not claim to
jointly own the copyright of the original material (which would require an
agreement with the original contributors). The conventional way of signifying
this would be to duplicate the Apache headers at the top of each file each time
a change is made by a different author, but that would quickly become impractical.
The original legal notice is below. Note: we are continuing to modify it by
adding the names of new contributors.
---
Legal Notices
Each of the files comprising Kaldi v1.0 has been separately licensed by
@@ -18,6 +38,7 @@ Individual Contributors (in alphabetical order)
Arnab Ghoshal
Go Vivace Inc.
Mirko Hannemann
Navdeep Jaitly
Microsoft Corporation
Petr Motlicek
Ariya Rastrow
@@ -26,4 +26,7 @@ Recipes in progress:
sampling rate).
This directory is a work in progress.
gp: GlobalPhone. This is a multilingual speech corpus.
timit: TIMIT, which is an old corpus of carefully read speech.
@@ -28,7 +28,7 @@ exit 1;
# shorten to WAV to take out the empty files and those with compression errors.
# So set WORKDIR to someplace with enough disk space. That is where MFCCs will
# get created, as well as the FST versions of LMs.
WORKDIR=/path/with/disk/space
WORKDIR=/mnt/matylda6/jhu09/qpovey/temp_gp
cp -r conf local utils steps install.sh path.sh $WORKDIR
cd $WORKDIR
# INSTALLING REQUIRED TOOLS:
@@ -39,7 +39,7 @@ cd $WORKDIR
{ echo "shorten and/or sox not found on PATH. Installing...";
install.sh; }
local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/path/to/GlobalPhone --lm-dir=/path/to/lms --work-dir=$WORKDIR
local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=/mnt/matylda2/data/GLOBALPHONE --lm-dir=/path/to/lms --work-dir=$WORKDIR
# On Eddie: local/gp_data_prep.sh --config-dir=$PWD/conf --corpus-dir=$PWD/corpus --lm-dir=$PWD/corpus/language_models --work-dir=$PWD
local/gp_format_data.sh --hmm-proto=conf/topo.proto --work-dir=$PWD
@@ -5,38 +5,38 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | scripts/best_wer.sh;
# monophone; delta+accel
exp/mono/decode/wer_4:%WER 9.830049 [ 1232 / 12533, 143 ins, 289 del, 800 sub ]
# First triphone pass; delta+accel
exp/tri1/decode/wer_6:%WER 3.694247 [ 463 / 12533, 69 ins, 100 del, 294 sub ]
exp/tri1/decode/wer_6:%WER 3.893721 [ 488 / 12533, 69 ins, 96 del, 323 sub ]
# Second triphone pass; delta+accel
exp/tri2a/decode/wer_7:%WER 3.638395 [ 456 / 12533, 61 ins, 107 del, 288 sub ]
exp/tri2a/decode/wer_7:%WER 3.486795 [ 437 / 12533, 65 ins, 91 del, 281 sub ]
# [as tri2a, but] LDA+MLLT
exp/tri2b/decode/wer_7:%WER 3.534668 [ 443 / 12533, 74 ins, 88 del, 281 sub ]
exp/tri2b/decode/wer_6:%WER 3.359132 [ 421 / 12533, 73 ins, 71 del, 277 sub ]
# LDA + exponential transform (note: this is with speaker adaptation)
exp/tri2c/decode/wer_5:%WER 2.848480 [ 357 / 12533, 62 ins, 61 del, 234 sub ]
exp/tri2c/decode/wer_5:%WER 2.905492 [ 364 / 12528, 68 ins, 59 del, 237 sub ]
# LDA+MLLT+MMI.
exp/tri3a/decode/wer_7:%WER 3.502753 [ 439 / 12533, 75 ins, 83 del, 281 sub ]
exp/tri3a/decode/wer_7:%WER 3.084052 [ 386 / 12516, 54 ins, 67 del, 265 sub ]
# LDA+MLLT+boosted MMI [note: errors are not identical, although WER is same]
exp/tri3b/decode/wer_7:%WER 3.454879 [ 433 / 12533, 75 ins, 80 del, 278 sub ]
exp/tri3b/decode/wer_5:%WER 3.155960 [ 395 / 12516, 74 ins, 50 del, 271 sub ]
# LDA+MLLT+MCE
exp/tri3c/decode/wer_7:%WER 3.183595 [ 399 / 12533, 62 ins, 79 del, 258 sub ]
exp/tri3c/decode/wer_6:%WER 3.047953 [ 382 / 12533, 56 ins, 69 del, 257 sub ]
# LDA+MLLT+SAT
exp/tri3d/decode/wer_6:%WER 2.553259 [ 320 / 12533, 43 ins, 63 del, 214 sub ]
exp/tri3d/decode/wer_7:%WER 2.234102 [ 280 / 12533, 35 ins, 62 del, 183 sub ]
# LDA+MLLT+SAT+MMI
exp/tri4a/decode/wer_6:%WER 2.473470 [ 310 / 12533, 43 ins, 62 del, 205 sub ]
exp/tri4a/decode/wer_6:%WER 2.146334 [ 269 / 12533, 37 ins, 43 del, 189 sub ]
# LDA+MLLT+SAT, extra phase of building on top of 3d (no help)
exp/tri4d/decode/wer_5:%WER 2.800606 [ 351 / 12533, 47 ins, 68 del, 236 sub ]
exp/tri4d/decode/wer_5:%WER 2.457512 [ 308 / 12533, 50 ins, 54 del, 204 sub ]
# LDA+MLLT + SGMM with speaker vectors
exp/sgmm3d/decode/wer_4:%WER 2.186228 [ 274 / 12533, 41 ins, 42 del, 191 sub ]
exp/sgmm3d/decode/wer_6:%WER 2.305912 [ 289 / 12533, 53 ins, 52 del, 184 sub ]
# LDA+ET + SGMM with speaker vectors.
exp/sgmm3e/decode/wer_5:%WER 2.242081 [ 281 / 12533, 44 ins, 47 del, 190 sub ]
exp/sgmm3e/decode/wer_4:%WER 2.042608 [ 256 / 12533, 39 ins, 38 del, 179 sub ]
# LDA+MLLT+SAT + SGMM with speaker vectors.
exp/sgmm4f/decode/wer_5:%WER 2.226123 [ 279 / 12533, 56 ins, 49 del, 174 sub ]
exp/sgmm4f/decode/wer_7:%WER 1.970797 [ 247 / 12533, 36 ins, 56 del, 155 sub ]
# + FMLLR on top of it all.
exp/sgmm4f/decode_fmllr/wer_6:%WER 2.202186 [ 276 / 12533, 39 ins, 59 del, 178 sub ]
exp/sgmm4f/decode_fmllr/wer_5:%WER 1.954839 [ 245 / 12533, 40 ins, 47 del, 158 sub ]
# System combination via lattices: combine tri1 and tri2a
exp/combine_1_2a/decode/wer_6:%WER 3.518711 [ 441 / 12533, 62 ins, 97 del, 282 sub ]
# System combination via lattices: combine sgmm4f and tri3d.
exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 2.082502 [ 261 / 12533, 36 ins, 48 del, 177 sub ]
exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 1.763345 [ 221 / 12533, 32 ins, 42 del, 147 sub ]
# System combination via lattices: combine sgmm4f and tri4a.
exp/combine_sgmm4f_tri4a/decode/wer_5:%WER 2.082502 [ 261 / 12533, 37 ins, 49 del, 175 sub ]
exp/combine_sgmm4f_tri4a/decode/wer_6:%WER 1.715471 [ 215 / 12533, 31 ins, 39 del, 145 sub ]
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Copyright 2010-2012 Microsoft Corporation Daniel Povey
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -27,13 +27,24 @@
# ali, final.mdl, final.mat
boost=0 # boosting constant, for boosted MMI.
tau=100 # Tau value.
if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
shift;
boost=$1;
shift;
fi
tau=200 # Tau value.
merge=true # if true, cancel num and den counts as described in
# the boosted MMI paper.
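# (A sketch of what the cancellation means, in our notation: sum-post merges
# the posteriors frame by frame, gamma_t(j) = gamma_t^num(j) - gamma_t^den(j),
# and gmm-acc-stats2 then accumulates the positive residuals as numerator
# stats and the negative ones, sign-flipped, as denominator stats.)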
for x in `seq 4`; do
if [ $1 == "--boost" ]; then # e.g. "--boost 0.05"
boost=$2;
shift 2;
fi
if [ $1 == "--smooth-to-model" ]; then
shift;
smooth_to_model=true
fi
if [ $1 == "--tau" ]; then # e.g. "--tau 200
tau=$2
shift 2;
fi
done
if [ $# != 4 ]; then
echo "Usage: steps/train_lda_etc_mmi.sh <data-dir> <lang-dir> <ali-dir> <exp-dir>"
@@ -99,7 +110,7 @@ scripts/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;
echo "Making denominator lattices"
if false; then ##temp
rm $dir/.error 2>/dev/null
for n in 0 1 2 3; do
gmm-latgen-simple --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
@@ -113,45 +124,33 @@ if [ -f $dir/.error ]; then
echo "Error creating denominator lattices"
exit 1;
fi
fi ##temp
# No need to create "numerator" alignments/lattices: we just use the
# alignments in $alidir.
echo "Note: ignore absolute offsets in the objective function values"
echo "This is caused by not having LM, lexicon or transition-probs in numerator"
x=0;
while [ $x -lt $num_iters ]; do
echo "Iteration $x: getting denominator stats."
# Get denominator stats...
if [ $x -eq 0 ]; then
( lattice-to-post --acoustic-scale=$acwt "ark:gunzip -c $dir/lat?.gz|" ark:- | \
gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
2>$dir/acc_den.$x.log || exit 1;
else # Need to recompute acoustic likelihoods...
( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
gmm-acc-stats $dir/$x.mdl "$feats" ark:- $dir/den_acc.$x.acc ) \
2>$dir/acc_den.$x.log || exit 1;
fi
echo "Iteration $x: getting numerator stats."
# Get numerator stats...
gmm-acc-stats-ali $dir/$x.mdl "$feats" ark:$alidir/ali $dir/num_acc.$x.acc \
2>$dir/acc_num.$x.log || exit 1;
( gmm-est-gaussians-ebw $dir/$x.mdl "gmm-ismooth-stats --tau=$tau $dir/num_acc.$x.acc $dir/num_acc.$x.acc -|" \
$dir/den_acc.$x.acc - | \
echo "Iteration $x: getting stats."
( gmm-rescore-lattice $dir/$x.mdl "ark:gunzip -c $dir/lat?.gz|" "$feats" ark:- | \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- | \
sum-post --merge=$merge --scale1=-1 \
ark:- "ark,s,cs:ali-to-post ark:$alidir/ali ark:- |" ark:- | \
gmm-acc-stats2 $dir/$x.mdl "$feats" ark:- $dir/num_acc.$x.acc $dir/den_acc.$x.acc ) \
2>$dir/acc.$x.log || exit 1;
# This tau is only used for smoothing "to the model".
( gmm-est-gaussians-ebw --tau=$tau $dir/$x.mdl $dir/num_acc.$x.acc $dir/den_acc.$x.acc - | \
gmm-est-weights-ebw - $dir/num_acc.$x.acc $dir/den_acc.$x.acc $dir/$[$x+1].mdl ) \
2>$dir/update.$x.log || exit 1;
den=`grep Overall $dir/acc_den.$x.log | grep lattice-to-post | awk '{print $7}'`
num=`grep Overall $dir/acc_num.$x.log | grep gmm-acc-stats-ali | awk '{print $11}'`
diff=`perl -e "print ($num * $acwt - $den);"`
impr=`grep Overall $dir/update.$x.log | head -1 | awk '{print $10;}'`
impr=`perl -e "print ($impr * $acwt);"` # auxf impr normalized by multiplying by
# kappa, so it's comparable to an objective-function change.
echo On iter $x, objf was $diff, auxf improvement was $impr | tee $dir/objf.$x.log
objf=`grep Overall $dir/acc.$x.log | grep gmm-acc-stats2 | awk '{print $10}'`
nf=`grep Overall $dir/acc.$x.log | grep gmm-acc-stats2 | awk '{print $12}'`
impr=`grep Overall $dir/update.$x.log | head -1 | awk '{print $10*$12;}'`
impr=`perl -e "print ($impr/$nf);"` # renormalize by "real" #frames, to correct
# for the canceling of stats.
echo On iter $x, objf was $objf, auxf improvement from MMI was $impr | tee $dir/objf.$x.log
rm $dir/*.acc
x=$[$x+1]
done
--use-energy=false # only non-default option.
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
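The proto above is a template: NONSILENCEPHONES and SILENCEPHONES are
placeholders that the format-data stage substitutes with the actual integer
phone ids. A minimal sketch of such a substitution, assuming colon-separated
silphones.csl/nonsilphones.csl lists as in the other s3 recipes (the exact
file names here are illustrative, not taken from this commit):

# Turn "1:2:3" style lists into space-separated ids and fill in the proto.
silphones=$(sed 's/:/ /g' data/lang/silphones.csl)
nonsilphones=$(sed 's/:/ /g' data/lang/nonsilphones.csl)
sed -e "s:NONSILENCEPHONES:$nonsilphones:" \
    -e "s:SILENCEPHONES:$silphones:" conf/topo.proto > data/lang/topo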
@@ -103,4 +103,3 @@ done
# example of showing the alignments:
# show-alignments data/lang/phones.txt $dir/30.mdl ark:$dir/cur.ali | head -4
. path.sh
local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
local/timit_train_lms.sh data/local
local/timit_format_data.sh
#local/timit_data_prep.sh /ais/gobi2/speech/TIMIT
local/timit_data_prep.sh /mnt/matylda2/data/TIMIT || exit 1;
local/timit_train_lms.sh data/local || exit 1;
local/timit_format_data.sh || exit 1;
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
@@ -9,13 +10,13 @@ mfccdir=mfccs
steps/make_mfcc.sh data/train exp/make_mfcc/train $mfccdir 4
for test in train test dev ; do
steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4
steps/make_mfcc.sh data/$test exp/make_mfcc/$test $mfccdir 4 || exit 1;
done
# train monophone system.
steps/train_mono.sh data/train data/lang exp/mono
steps/train_mono.sh data/train data/lang exp/mono || exit 1;
scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
scripts/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph || exit 1;
echo "Decoding test datasets."
for test in dev test ; do
steps/decode_deltas.sh exp/mono data/$test data/lang exp/mono/decode_$test &
@@ -25,8 +26,7 @@ scripts/average_wer.sh exp/mono/decode_*/wer > exp/mono/wer
# Get alignments from monophone system.
echo "Creating training alignments to use to train other systems such as ANN-HMM."
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali || exit 1;
echo "Creating dev alignments to use to train other systems such as ANN-HMM."
steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev
steps/align_deltas.sh data/dev data/lang exp/mono exp/mono_ali_dev || exit 1;
@@ -22,12 +22,15 @@ exp/tri2b/decode_tgpr_dev93_fromlats/wer_15:%WER 16.71 [ 1376 / 8234, 267 ins, 1
exp/tri2b/decode_tgpr_dev93_tg/wer_16:%WER 16.26 [ 1339 / 8234, 267 ins, 141 del, 931 sub ]
exp/tri2b/decode_tgpr_dev93_tg_biglm/wer_16:%WER 16.42 [ 1352 / 8234, 269 ins, 142 del, 941 sub ]
exp/tri2b/decode_tgpr_eval92/wer_16:%WER 11.54 [ 651 / 5643, 146 ins, 42 del, 463 sub ]
exp/tri2b/decode_tgpr_eval92/wer_17:%WER 11.45 [ 646 / 5643, 140 ins, 46 del, 460 sub ]
# +MMI
exp/tri2b_mmi/decode_tgpr_eval92/wer_16:%WER 11.08 [ 625 / 5643, 125 ins, 44 del, 456 sub ]
exp/tri2b_mmi/decode_tgpr_eval92/wer_14:%WER 10.63 [ 600 / 5643, 124 ins, 45 del, 431 sub ]
# +boosting
exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.83 [ 611 / 5643, 122 ins, 43 del, 446 sub ]
exp/tri2b_mmi_b0.1/decode_tgpr_eval92/wer_16:%WER 10.69 [ 603 / 5643, 119 ins, 48 del, 436 sub ]
# +fMMI
exp/tri2b_fmmi_b0.1/decode_tgpr_eval92/wer_15:%WER 10.26 [ 579 / 5643, 111 ins, 39 del, 429 sub ]
# +MCE
exp/tri2b_mce/decode_tgpr_eval92/wer_16:%WER 11.15 [ 629 / 5643, 132 ins, 45 del, 452 sub ]
@@ -69,8 +72,17 @@ exp/tri4b/decode_tgpr_dev93/wer_13:%WER 12.53 [ 1032 / 8234, 242 ins, 79 del, 71
exp/tri4b/decode_tgpr_eval92/wer_16:%WER 8.05 [ 454 / 5643, 119 ins, 23 del, 312 sub ]
# +MMI
exp/tri4b_mmi/decode_tgpr_dev93/wer_14:%WER 11.53 [ 949 / 8234, 203 ins, 82 del, 664 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.45 [ 943 / 8234, 191 ins, 87 del, 665 sub ]
exp/tri4b_mmi/decode_tgpr_dev93/wer_12:%WER 11.28 [ 929 / 8234, 206 ins, 76 del, 647 sub ]
#+boosting
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.25 [ 926 / 8234, 176 ins, 94 del, 656 sub ]
# increasing beam from 13 to 15 to see effect.
exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15/wer_14:%WER 10.72 [ 883 / 8234, 172 ins, 84 del, 627 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_eval92/wer_14:%WER 7.34 [ 414 / 5643, 105 ins, 20 del, 289 sub ]
#+fMMI
exp/tri4b_fmmi_b0.1/decode_tgpr_dev93/wer_13:%WER 10.86 [ 894 / 8234, 167 ins, 89 del, 638 sub ]
exp/tri4b_fmmi_b0.1/decode_tgpr_eval92/wer_12:%WER 7.25 [ 409 / 5643, 111 ins, 14 del, 284 sub ]
# LDA+MLLT+SAT, SI-284, full retraining starting from 3b [c.f. 4b]
exp/tri4c/decode_tgpr_dev93/wer_16:%WER 12.10 [ 996 / 8234, 220 ins, 83 del, 693 sub ]
@@ -164,6 +164,18 @@ steps/train_lda_etc_mmi.sh --num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt.sh exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92
# The next 3 commands train and test fMMI+MMI (on top of LDA+MLLT).
steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
--num-jobs 10 --cmd "$train_cmd" 400 data/train_si84 \
data/lang exp/tri2b_ali_si84 exp/dubm2b
steps/train_lda_etc_mmi_fmmi.sh \
--num-jobs 10 --boost 0.1 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/dubm2b exp/tri2b_denlats_si84 \
exp/tri2b exp/tri2b_fmmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_fmpe.sh \
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_fmmi_b0.1/decode_tgpr_eval92
steps/train_lda_etc_mce.sh --cmd "$train_cmd" --num-jobs 10 data/train_si84 data/lang \
exp/tri2b_ali_si84 exp/tri2b_denlats_si84 exp/tri2b exp/tri2b_mce
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt.sh \
@@ -222,7 +234,8 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/tri4b exp/tri4b/graph_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b/decode_tgpr_eval92
# Train and test MMI, and boosted MMI, on tri4b.
# Train and test MMI, and boosted MMI, on tri4b (LDA+MLLT+SAT on
# all the data).
# Making num-jobs 40, as we want to keep each job under 4 hours long (otherwise
# they will fail on the regular queue at BUT).
steps/align_lda_mllt_sat.sh --num-jobs 40 --cmd "$train_cmd" \
@@ -235,6 +248,25 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tg
steps/train_lda_etc_mmi.sh --boost 0.1 --num-jobs 40 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/tri4b_denlats_si284 exp/tri4b exp/tri4b_mmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --opts "--beam 15" --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi_b0.1/decode_tgpr_dev93_b15 exp/tri4b/decode_tgpr_dev93
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc.sh exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_mmi_b0.1/decode_tgpr_eval92 exp/tri4b/decode_tgpr_eval92
# Train fMMI+MMI system on top of 4b.
steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
--num-jobs 40 --cmd "$train_cmd" 600 data/train_si284 \
data/lang exp/tri4b_ali_si284 exp/dubm4b
steps/train_lda_etc_mmi_fmmi.sh \
--num-jobs 40 --boost 0.1 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri4b_ali_si284 exp/dubm4b exp/tri4b_denlats_si284 \
exp/tri4b exp/tri4b_fmmi_b0.1
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_fmmi_b0.1/decode_tgpr_eval92 \
exp/tri4b/decode_tgpr_eval92
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_fmmi_b0.1/decode_tgpr_dev93 \
exp/tri4b/decode_tgpr_dev93
# Train UBM, for SGMM system on top of LDA+MLLT.
steps/train_ubm_lda_etc.sh --num-jobs 10 --cmd "$train_cmd" \
@@ -245,6 +277,7 @@ scripts/mkgraph.sh data/lang_test_tgpr exp/sgmm3c exp/sgmm3c/graph_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr \
data/test_dev93 exp/sgmm3c/decode_tgpr_dev93
# Decode using 3 Gaussians (not 15) for gselect in 1st pass, for fast decoding.
scripts/decode.sh --opts "--first-pass-gselect 3" --cmd "$decode_cmd" \
steps/decode_sgmm_lda_etc.sh exp/sgmm3c/graph_tgpr data/test_dev93 exp/sgmm3c/decode_tgpr_dev93_gs3
@@ -62,7 +62,7 @@ fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt.sh: no such file $f";
echo "decode_lda_etc.sh: no such file $f";
exit 1;
fi
done
#!/bin/bash
# Decoding script for LDA + optionally MLLT + [some speaker-specific transforms]
# + fMPE.
# This decoding script takes as an argument a previous decoding directory where it
# can find some transforms.
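# For context, the wsj run.sh later in this commit invokes it like this
# (scripts/decode.sh supplies the -j job options; the last argument is the
# previous SAT decoding directory that holds the transforms):
#   scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_etc_fmpe.sh \
#     exp/tri4b/graph_tgpr data/test_eval92 exp/tri4b_fmmi_b0.1/decode_tgpr_eval92 \
#     exp/tri4b/decode_tgpr_eval92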
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
beam=13.0
rescore=false
for x in `seq 3`; do
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift 2;
fi
if [ "$1" == "--beam" ]; then
beam=$2;
shift 2;
fi
done
if [ $# != 4 ]; then
# Note: transform-dir has to be last because scripts/decode.sh expects decode-dir to be #3 arg.
echo "Usage: steps/decode_lda_etc.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir> <transform-dir>"
echo " e.g.: steps/decode_lda_etc.sh -j 8 0 exp/tri4b/graph_tgpr data/test_dev93 exp/tri4b_mmi/decode_tgpr_dev93 exp/tri4b/decode_tgpr_dev93"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
transdir=$4
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst $transdir/$jobid.trans"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_etc_fmpe.sh: no such file $f";
exit 1;
fi
done
basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |"
# Get the Gaussian-selection info for the fMPE.
ngselect=2; # Just the 2 top Gaussians.
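# (fMPE builds its high-dimensional features from Gaussian posteriors, so
# precomputing the top-Gaussian selection here avoids evaluating the full
# GMM on every frame at decode time.)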
gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
"ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log
# Now set up the fMPE features.
feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 \
--acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode.$jobid.log || exit 1;
#!/bin/bash
# Decoding script that works with a GMM model and the baseline
# [e.g. MFCC] features plus cepstral mean subtraction plus
# LDA+MLLT or similar transform, plus fMPE/FMMI.
# This script just generates lattices for a single broken-up
# piece of the data.
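# For context, the wsj run.sh later in this commit invokes it like this:
#   scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_fmpe.sh \
#     exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_fmmi_b0.1/decode_tgpr_eval92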
if [ -f ./path.sh ]; then . ./path.sh; fi
numjobs=1
jobid=0
rescore=false
if [ "$1" == "-j" ]; then
shift;
numjobs=$1;
jobid=$2;
shift; shift;
fi
if [ $# != 3 ]; then
echo "Usage: steps/decode_lda_mllt_fmpe.sh [-j num-jobs job-number] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: steps/decode_lda_mllt_fmpe.sh -j 8 0 exp/tri2b/graph_tgpr data/test_dev93 exp/tri2b_fmmi/decode_dev93_tgpr"
exit 1;
fi
graphdir=$1
data=$2
dir=$3
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
mkdir -p $dir
if [ $numjobs -gt 1 ]; then
mydata=$data/split$numjobs/$jobid
else
mydata=$data
fi
requirements="$mydata/feats.scp $srcdir/final.mdl $srcdir/final.fmpe $srcdir/final.mat $graphdir/HCLG.fst"
for f in $requirements; do
if [ ! -f $f ]; then
echo "decode_lda_mllt_fmpe.sh: no such file $f";
exit 1;
fi
done
basefeats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
# Get the Gaussian-selection info for the fMPE.
ngselect=2; # Just the 2 top Gaussians.
gmm-gselect --n=$ngselect $srcdir/final.fmpe "$basefeats" \
"ark:|gzip -c >$dir/gselect.$jobid.gz" 2>$dir/gselect.$jobid.log
# Now set up the fMPE features.
feats="$basefeats fmpe-apply-transform $srcdir/final.fmpe ark:- 'ark,s,cs:gunzip -c $dir/gselect.$jobid.gz|' ark:- |"
gmm-latgen-faster --max-active=7000 --beam=13.0 --lattice-beam=6.0 --acoustic-scale=0.083333 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode.$jobid.log || exit 1;
#!/bin/bash
# This trains a diagonal-covariance UBM (i.e. just a global
# mixture of Gaussians, or GMM).
# Train UBM from a trained HMM/GMM system [with splice+LDA+[MLLT/ET/MLLT+SAT] features]
# Alignment directory is used for the CMN and transforms.
# A UBM is just a single mixture of Gaussians (diagonal-covariance, in our case), that's trained
# on all the data. This will later be used in Subspace Gaussian Mixture Model (SGMM)
# training.
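# For context, the wsj run.sh in this commit invokes it like this (400 is
# <num-comps>, the number of Gaussians in the UBM):
#   steps/train_dubm_lda_etc.sh --silence-weight 0.5 \
#     --num-jobs 10 --cmd "$train_cmd" 400 data/train_si84 \
#     data/lang exp/tri2b_ali_si84 exp/dubm2b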
nj=4
cmd=scripts/run.pl
silweight=
for x in 1 2 3; do # up to 3 option/value pairs, in any order
if [ $1 == "--num-jobs" ]; then
shift
nj=$1
shift
fi
if [ $1 == "--cmd" ]; then
shift
cmd=$1
shift
fi
if [ $1 == "--silence-weight" ]; then
shift
silweight=$1 # e.g. to weight down silence in training.
shift
fi
done
if [ $# != 5 ]; then
echo "Usage: steps/train_ubm_lda_etc.sh <num-comps> <data-dir> <lang-dir> <ali-dir> <exp-dir>"
echo " e.g.: steps/train_ubm_lda_etc.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
numcomps=$1
data=$2
lang=$3
alidir=$4
dir=$5
silphonelist=`cat $lang/silphones.csl`
mkdir -p $dir/log
if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then
scripts/split_data.sh $data $nj
fi
n1=`get_splits.pl $nj | awk '{print $1}'`
[ -f $alidir/$n1.trans ] && echo "Using speaker transforms from $alidir"
for n in `get_splits.pl $nj`; do
featspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
if [ -f $alidir/$n1.trans ]; then
featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
fi
if [ ! -z "$silweight" ]; then
weightspart[$n]="--weights='ark,s,cs:gunzip -c $alidir/$n.ali.gz | ali-to-post ark:- ark:- | weight-silence-post $silweight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
fi
done
ngselect=50
intermediate=2000
if [ $[$numcomps*2] -gt $intermediate ]; then
intermediate=$[$numcomps*2];
fi
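# (i.e. intermediate = max(2000, 2*numcomps); init-ubm presumably clusters
# the model's Gaussians first to the intermediate size, then down to numcomps.)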
echo "Clustering model $alidir/final.mdl to get initial UBM"
# typically: --intermediate-numcomps=2000 --ubm-numcomps=400
if [ ! -s $dir/0.dubm ]; then
$cmd $dir/log/cluster.log \
init-ubm --intermediate-numcomps=$intermediate --ubm-numcomps=$numcomps \
--verbose=2 --fullcov-ubm=false $alidir/final.mdl $alidir/final.occs \
$dir/0.dubm || exit 1;
fi
rm $dir/.error 2>/dev/null
# First do Gaussian selection to 50 components, which will be used
# as the initial screen for all further passes.
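# (This selection is reused on every pass below, which is why low-count
# Gaussians are only removed on the final iteration; see lowcount_opt.)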
for n in `get_splits.pl $nj`; do
$cmd $dir/log/gselect.$n.log \
gmm-gselect --n=$ngselect $dir/0.dubm "${featspart[$n]}" \
"ark:|gzip -c >$dir/gselect.$n.gz" &
done
wait
[ -f $dir/.error ] && echo "Error doing GMM selection" && exit 1;
for x in 0 1 2 3; do
echo "Pass $x"
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-global-acc-stats ${weightspart[$n]} "--gselect=ark,s,cs:gunzip -c $dir/gselect.$n.gz|" \
$dir/$x.dubm "${featspart[$n]}" $dir/$x.$n.acc || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error accumulating stats for UBM estimation on pass $x" && exit 1;
lowcount_opt="--remove-low-count-gaussians=false"
[ $x -eq 3 ] && lowcount_opt= # Only remove low-count Gaussians on last iter-- keeps gselect info valid.
$cmd $dir/log/update.$x.log \
gmm-global-est $lowcount_opt --verbose=2 $dir/$x.dubm "gmm-global-sum-accs - $dir/$x.*.acc |" \
$dir/$[$x+1].dubm || exit 1;
rm $dir/$x.*.acc $dir/$x.dubm
done
rm $dir/gselect.*.gz
rm $dir/final.dubm 2>/dev/null
mv $dir/4.dubm $dir/final.dubm || exit 1;
@@ -20,10 +20,8 @@
# [something] may be MLLT, or ET, or MLLT + SAT. Any speaker-specific
# transforms are expected to be located in the alignment directory.
# This script never re-estimates any transforms, it just does model
# training. To make this faster, it initializes the model from the
# old system's model, i.e. for each p.d.f., it takes the best-match pdf
# from the old system (based on overlap of tree-stats counts), and
# uses that GMM to initialize the current GMM.
# training.
# Basically we are doing 4 iterations of Extended Baum-Welch (EBW)
# estimation, as described in Dan Povey's thesis, with a few differences:
# (i) we have the option of "boosting", as in "Boosted MMI", which increases
@@ -47,7 +45,9 @@
niters=4
nj=4
boost=0.0
tau=100
tau=200
merge=true # if true, cancel num and den counts as described in
# the boosted MMI paper.
cmd=scripts/run.pl
acwt=0.1
stage=0
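# For reference, the Gaussian mean update these EBW iterations perform is,
# as a sketch in our notation (not code from this commit):
#   \hat{\mu}_j = ( x^{num}(j) - x^{den}(j) + D_j \mu_j )
#               / ( n^{num}(j) - n^{den}(j) + D_j )
# where x(.) and n(.) are the weighted feature sums and occupancies from the
# two accumulators and D_j is a per-Gaussian smoothing constant; --tau
# additionally I-smooths the numerator stats towards the current model.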
@@ -69,6 +69,9 @@ for x in `seq 8`; do
if [ $1 == "--acwt" ]; then
shift; acwt=$1; shift
fi
if [ $1 == "--tau" ]; then
shift; tau=$1; shift
fi
if [ $1 == "--stage" ]; then
shift; stage=$1; shift
fi
@@ -121,58 +124,60 @@ rm $dir/.error 2>/dev/null
cur_mdl=$srcdir/final.mdl
x=0
while [ $x -lt $niters ]; do
echo "Iteration $x: getting denominator stats."
# Get denominator stats... For simplicity we rescore the lattice
echo "Iteration $x: getting stats."
# Get denominator and numerator stats together... This involves
# merging the num and den posteriors, and (if $merge==true), canceling
# the +ve and -ve occupancies on each frame.
# For simplicity we rescore the lattice
# on all iterations, even though it shouldn't be necessary on the zeroth
# (but we want this script to work even if $srcdir doesn't contain the
# model used to generate the lattice).
if [ $stage -le $x ]; then
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc_den.$x.$n.log \
$cmd $dir/log/acc.$x.$n.log \
gmm-rescore-lattice $cur_mdl "${latspart[$n]}" "${featspart[$n]}" ark:- \| \
lattice-to-post --acoustic-scale=$acwt ark:- ark:- \| \
gmm-acc-stats $cur_mdl "${featspart[$n]}" ark:- $dir/den_acc.$x.$n.acc \