Commit 3a64393c authored by Dan Povey's avatar Dan Povey
Browse files

Updates to scripts (to be final version of papers); minor fixes to programs;...

Updates to scripts (to be final version of papers); minor fixes to programs; added scale-vecs program.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@125 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 88c5ee94
......@@ -24,9 +24,9 @@ exp/decode_tri2a_dfmllr/wer:Average WER is 3.861805 (484 / 12533) # + diagonal
exp/decode_tri2a_dfmllr_utt/wer:Average WER is 3.933615 (493 / 12533) # [ diagonal fMLLR per utterance]
exp/decode_tri2a_dfmllr_fmllr/wer:Average WER is 3.622437 (454 / 12533) # diagonal fMLLR, then estimate fMLLR and re-decode
exp/decode_tri2b/wer:Average WER is 3.303279 (414 / 12533) # Exponential transform
exp/decode_tri2b_fmllr/wer:Average WER is 3.047953 (382 / 12533) # +fMLLR
exp/decode_tri2b_utt/wer:Average WER is 3.335195 (418 / 12533) # [adapt per-utt]
exp/decode_tri2b/wer:Average WER is 3.143701 (394 / 12533) # Exponential transform
exp/decode_tri2b_fmllr/wer:Average WER is 3.055932 (383 / 12533) # +fMLLR
exp/decode_tri2b_utt/wer:Average WER is 3.295300 (413 / 12533) # [adapt per-utt]
exp/decode_tri2c/wer:Average WER is 3.957552 (496 / 12533) # Cepstral mean subtraction (per-spk)
exp/decode_tri2d/wer:Average WER is 4.316604 (541 / 12533) # MLLT (= global STC)
exp/decode_tri2e/wer:Average WER is 4.659698 (584 / 12533) # splice-9-frames + LDA features
......@@ -43,10 +43,12 @@ exp/decode_tri2g_vtln_nofmllr/wer:Average WER is 3.694247 (463 / 12533) # featur
exp/decode_tri2h/wer:Average WER is 4.252773 (533 / 12533) # Splice-9-frames + HLDA
exp/decode_tri2i/wer:Average WER is 3.981489 (499 / 12533) # Triple-deltas + HLDA
exp/decode_tri2j/wer:Average WER is 3.853826 (483 / 12533) # Triple-deltas + LDA + MLLT
exp/decode_tri2k/wer:Average WER is 2.968164 (372 / 12533) # LDA + exponential transform
exp/decode_tri2k_utt/wer:Average WER is 3.175616 (398 / 12533) # per-utterance adaptation.
exp/decode_tri2k_fmllr/wer:Average WER is 2.505386 (314 / 12533) # +fMLLR (per-spk)
exp/decode_tri2k_regtree_fmllr/wer:Average WER is 2.513365 (315 / 12533) # +regression tree
exp/decode_tri2k/wer:Average WER is 3.071890 (385 / 12533) # LDA + exponential transform
exp/decode_tri2k_utt/wer:Average WER is 3.039974 (381 / 12533) # per-utterance adaptation
exp/decode_tri2k_fmllr/wer:Average WER is 2.641028 (331 / 12533) # fMLLR (per-spk)
exp/decode_tri2k_regtree_fmllr/wer:Average WER is 2.688901 (337 / 12533) # +regression-tree
exp/decode_tri2l/wer:Average WER is 2.704859 (339 / 12533) # Splice-9-frames + LDA + MLLT + SAT (fMLLR in test)
exp/decode_tri2l_utt/wer:Average WER is 4.930982 (618 / 12533) # [ as decode_tri2l but per-utt in test. ]
......@@ -69,6 +71,9 @@ exp/decode_sgmmc_fmllr/wer:Average WER is 2.457512 (308 / 12533)
exp/decode_sgmmc_norm/wer:Average WER is 2.696880 (338 / 12533)
exp/decode_sgmmc_fmllr_norm/wer:Average WER is 2.425596 (304 / 12533)
# sgmmd is like sgmmb but with LDA+MLLT features.
exp/decode_sgmmd/wer:Average WER is 2.449533 (307 / 12533)
exp/decode_sgmmd_fmllr/wer:Average WER is 2.305912 (289 / 12533)
......
......@@ -117,6 +117,9 @@ steps/train_ubma.sh
# + gender dependency.
(steps/train_ubmb.sh; steps/train_sgmmc.sh; steps/decode_sgmmc.sh; steps/decode_sgmmc_fmllr.sh )&
# as sgmmb but with LDA+STC features.
(steps/train_ubmc.sh; steps/train_sgmmd.sh; steps/decode_sgmmd.sh; steps/decode_sgmmd_fmllr.sh )&
......
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# SGMM decoding with adaptation.
#
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
# (1) decode with "alignment model"
# (2) get GMM posteriors with "alignment model" and estimate speaker
# vectors with final model
# (3) decode with final model.
if [ -f path.sh ]; then . path.sh; fi

# Inputs (the trained "sgmmd" system) and the output directory of this run.
dir=exp/decode_sgmmd
tree=exp/sgmmd/tree
model=exp/sgmmd/final.mdl
alimodel=exp/sgmmd/final.alimdl
graphdir=exp/graph_sgmmd
silphonelist=$(cat data/silphones.csl)
mat=exp/sgmmd/final.mat

mkdir -p $dir

scripts/mkgraph.sh $tree $model $graphdir

# Every test set is processed in its own background subshell; the "wait"
# after the loop blocks until all of them have finished.
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
  (
    # Feature pipeline: splice-feats followed by transform-feats with $mat.
    feats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
    spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
    utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"

    # Gaussian selection, stored gzipped per test set.  The log file name
    # includes the test set because these jobs run concurrently and would
    # otherwise overwrite each other's log.
    # Note: "|| exit 1" only sees the status of gzip (the last pipeline stage),
    # and it exits this subshell, not the whole script.
    sgmm-gselect $model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
      gzip -c > $dir/${test}_gselect.gz || exit 1;
    gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"

    # (1) First pass with the alignment model.  Use smaller beam first time.
    sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" \
      ark,t:$dir/test_${test}.pre_tra ark,t:$dir/test_${test}.pre_ali \
      2> $dir/predecode_${test}.log

    # (2a) Posteriors from the first-pass alignment (silence weighted by 0.01),
    # converted to Gaussian-level posteriors with the alignment model, are used
    # to estimate speaker vectors with the final model -> vecs1.
    ( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
        weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
        sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
        sgmm-est-spkvecs-gpost $spk2utt_opt $model "$feats" ark,s,cs:- \
          ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log

    # (2b) Second estimation pass, given vecs1 via --spk-vecs -> vecs2.
    ( ali-to-post ark:$dir/test_${test}.pre_ali ark:- | \
        weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
        sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 $spk2utt_opt \
          $model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log

    # (3) Final decoding with the final model and the speaker vectors.
    sgmm-decode-faster "$gselect_opt" $utt2spk_opt \
      --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" \
      ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali \
      2> $dir/decode_${test}.log

    # the ,p option lets it score partial output without dying..
    scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
      compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
  ) &
done

wait

# Pool the per-test-set WER lines: sum field 4 (numerator) and field 6
# (denominator) and write a single average to $dir/wer.
grep WER $dir/wer_* | \
  awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
  > $dir/wer
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation, Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# SGMM decoding with adaptation.
#
# SGMM decoding; use a different acoustic scale from normal (0.1 vs 0.08333)
# (1) decode with "alignment model"
# (2) get GMM posteriors with "alignment model" and estimate speaker
# vectors with final model
# (3) decode with final model.
# (4) get GMM posteriors from this decoded output and estimate fMLLR transforms
# with this final model
# (5) decode with the final model using both the speaker vectors and fMLLR
if [ -f path.sh ]; then . path.sh; fi

# Inputs (the trained "sgmmd" system) and the output directory of this run.
dir=exp/decode_sgmmd_fmllr
tree=exp/sgmmd/tree
model=exp/sgmmd/final.mdl
occs=exp/sgmmd/final.occs
fmllr_model=exp/sgmmd/final_fmllr.mdl
alimodel=exp/sgmmd/final.alimdl
graphdir=exp/graph_sgmmd
silphonelist=$(cat data/silphones.csl)
mat=exp/sgmmd/final.mat

mincount=1000  # min occupancy to estimate fMLLR transform
iters=10       # number of iters of fMLLR estimation

# Create the model used for fMLLR estimation/decoding if it is not there yet.
if [ ! -f $fmllr_model ]; then
  if [ ! -f $model ]; then
    echo "Cannot find $model. Maybe training didn't finish?"
    exit 1;
  fi
  # Stop here on failure: every later fMLLR step reads $fmllr_model.
  sgmm-comp-prexform $model $occs $fmllr_model || exit 1;
fi

mkdir -p $dir

scripts/mkgraph.sh $tree $model $graphdir

# Every test set is processed in its own background subshell; the "wait"
# after the loop blocks until all of them have finished.
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
  (
    # Feature pipeline: splice-feats followed by transform-feats with $mat.
    feats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- |"
    spk2utt_opt="--spk2utt=ark:data/test_${test}.spk2utt"
    utt2spk_opt="--utt2spk=ark:data/test_${test}.utt2spk"

    # Gaussian selection, stored gzipped per test set.  The log file name
    # includes the test set because these jobs run concurrently and would
    # otherwise overwrite each other's log.
    # Note: "|| exit 1" only sees the status of gzip (the last pipeline stage),
    # and it exits this subshell, not the whole script.
    sgmm-gselect $model "$feats" ark,t:- 2>$dir/gselect_${test}.log | \
      gzip -c > $dir/${test}_gselect.gz || exit 1;
    gselect_opt="--gselect=ark:gunzip -c $dir/${test}_gselect.gz|"

    # Use smaller beam for the first pass decoding.
    sgmm-decode-faster "$gselect_opt" --beam=15.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/words.txt $alimodel $graphdir/HCLG.fst "$feats" \
      ark,t:$dir/test_${test}.pass1.tra ark,t:$dir/test_${test}.pass1.ali \
      2> $dir/pass1_${test}.log

    # Estimate the speaker vectors
    # First from Gaussian-level posteriors of the alignment model -> vecs1 ...
    ( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
        weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
        sgmm-post-to-gpost "$gselect_opt" $alimodel "$feats" ark,s,cs:- ark:- | \
        sgmm-est-spkvecs-gpost "$spk2utt_opt" $model "$feats" ark,s,cs:- \
          ark:$dir/test_${test}.vecs1 ) 2>$dir/vecs1_${test}.log

    # ... then a second estimation pass, given vecs1 via --spk-vecs -> vecs2.
    ( ali-to-post ark:$dir/test_${test}.pass1.ali ark:- | \
        weight-silence-post 0.01 $silphonelist $alimodel ark:- ark:- | \
        sgmm-est-spkvecs --spk-vecs=ark:$dir/test_${test}.vecs1 "$spk2utt_opt" \
          $model "$feats" ark,s,cs:- ark:$dir/test_${test}.vecs2 ) 2>$dir/vecs2_${test}.log

    # Second-pass decoding with the speaker vectors.
    sgmm-decode-faster "$gselect_opt" $utt2spk_opt \
      --spk-vecs=ark:$dir/test_${test}.vecs2 --beam=20.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/words.txt $model $graphdir/HCLG.fst "$feats" \
      ark,t:$dir/test_${test}.pass2.tra ark,t:$dir/test_${test}.pass2.ali \
      2> $dir/pass2_${test}.log

    # Estimate the fMLLR transforms.
    # Uses the second-pass alignment, the final model and the speaker vectors.
    ( ali-to-post ark:$dir/test_${test}.pass2.ali ark:- | \
        weight-silence-post 0.01 $silphonelist $model ark:- ark:- | \
        sgmm-post-to-gpost --spk-vecs=ark:$dir/test_${test}.vecs2 $utt2spk_opt \
          "$gselect_opt" $model "$feats" ark,s,cs:- ark:- | \
        sgmm-est-fmllr-gpost --fmllr-iters=$iters --fmllr-min-count=$mincount \
          --spk-vecs=ark:$dir/test_${test}.vecs2 "$spk2utt_opt" $fmllr_model \
          "$feats" ark,s,cs:- ark:$dir/test_${test}.fmllr ) \
      2>$dir/est_fmllr_${test}.log

    # Same feature pipeline as $feats, plus the per-speaker fMLLR transform.
    adapt_feats="ark:splice-feats scp:data/test_${test}.scp ark:- | transform-feats $mat ark:- ark:- | transform-feats $utt2spk_opt ark:$dir/test_${test}.fmllr ark:- ark:- |"

    # Now decode with fMLLR-adapted features. Gaussian selection is also done
    # with the adapted features. This causes a small improvement in WER on RM.
    # (No --gselect option is passed here, unlike the earlier passes.)
    sgmm-decode-faster $utt2spk_opt --beam=20.0 --acoustic-scale=0.1 \
      --word-symbol-table=data/words.txt \
      --spk-vecs=ark:$dir/test_${test}.vecs2 $fmllr_model $graphdir/HCLG.fst \
      "$adapt_feats" ark,t:$dir/test_${test}.tra ark,t:$dir/test_${test}.ali \
      2> $dir/decode_${test}.log

    # the ,p option lets it score partial output without dying..
    scripts/sym2int.pl --ignore-first-field data/words.txt data_prep/test_${test}_trans.txt | \
      compute-wer --mode=present ark:- ark,p:$dir/test_${test}.tra >& $dir/wer_${test}
  ) &
done

wait

# Pool the per-test-set WER lines: sum field 4 (numerator) and field 6
# (denominator) and write a single average to $dir/wer.
grep WER $dir/wer_* | \
  awk '{n=n+$4; d=d+$6} END{ printf("Average WER is %f (%d / %d) \n", 100.0*n/d, n, d); }' \
  > $dir/wer
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
if [ -f path.sh ]; then . path.sh; fi

# This is SGMM with speaker vectors (as sgmmb) but on top of LDA+STC features.
# To be run from ..
# You must run train_ubmc.sh first, as well as train_tri2f.sh
# We rely on the UBM exp/ubmc/4.ubm being there

dir=exp/sgmmd
srcdir=exp/tri2f
mat=$srcdir/final.mat
srcmodel=$srcdir/final.mdl
srcgraphs="ark:gunzip -c $srcdir/graphs.fsts.gz|"
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=25   # Total number of iterations
ubm=exp/ubmc/4.ubm

realign_iters="5 10 15";
spkvec_iters="5 8 12 17 22"
silphonelist=$(cat data/silphones.csl)
numleaves=2500
numsubstates=2500 # Initial #-substates.
totsubstates=7500 # Target #-substates.
maxiterinc=15 # Last iter to increase #substates on.
incsubstates=$(( (totsubstates - numsubstates) / maxiterinc )) # per-iter increment for #substates
gselect_opt="--gselect=ark,s,cs:gunzip -c $dir/gselect.gz|"
# Initially don't have speaker vectors, but change this after
# we estimate them.
spkvecs_opt=
randprune=0.1

mkdir -p $dir

utt2spk_opt="--utt2spk=ark:data/train.utt2spk"
spk2utt_opt="--spk2utt=ark:data/train.spk2utt"

# Feature pipeline: splice-feats followed by transform-feats with $mat.
feats="ark:splice-feats scp:data/train.scp ark:- | transform-feats $mat ark:- ark:- |"

# Fail early: sgmm-init below needs the UBM, so there is no point in running
# the alignment and tree building without it.
if [ ! -f $ubm ]; then
  echo "No UBM in $ubm"
  exit 1
fi

cp $srcdir/topo $dir

echo "aligning all training data"
if [ ! -f $dir/0.ali ]; then
  gmm-align-compiled $scale_opts --beam=8 --retry-beam=40 $srcmodel "$srcgraphs" \
    "$feats" ark,t:$dir/0.ali 2> $dir/align.0.log || exit 1;
fi

# We rebuild the tree because we want a larger #states than for a normal
# GMM system (the optimum #states for SGMMs tends to be a bit higher).
if [ ! -f $dir/treeacc ]; then
  acc-tree-stats --ci-phones=$silphonelist $srcmodel "$feats" ark:$dir/0.ali \
    $dir/treeacc 2> $dir/acc.tree.log || exit 1;
fi

# Last field of every line of phones.txt, minus the entry "0".
awk '{print $NF}' data/phones.txt | grep -v -w 0 > $dir/phones.list
cluster-phones $dir/treeacc $dir/phones.list $dir/questions.txt 2> $dir/questions.log || exit 1;
scripts/int2sym.pl data/phones.txt < $dir/questions.txt > $dir/questions_syms.txt
compile-questions $dir/topo $dir/questions.txt $dir/questions.qst 2>$dir/compile_questions.log || exit 1;

scripts/make_roots.pl --separate data/phones.txt $silphonelist shared split > $dir/roots.txt 2>$dir/roots.log || exit 1;

build-tree --verbose=1 --max-leaves=$numleaves \
  $dir/treeacc $dir/roots.txt \
  $dir/questions.qst $dir/topo $dir/tree 2> $dir/train_tree.log || exit 1;

# the sgmm-init program accepts a GMM, so we just create a temporary GMM "0.gmm"
gmm-init-model --write-occs=$dir/0.occs \
  $dir/tree $dir/treeacc $dir/topo $dir/0.gmm 2> $dir/init_gmm.log || exit 1;

sgmm-init --spk-space-dim=40 $dir/0.gmm $ubm $dir/0.mdl 2> $dir/init_sgmm.log || exit 1;

if [ ! -f $dir/0.mdl ]; then
  echo "sgmm-init did not create $dir/0.mdl (you must run train_ubmc.sh before train_sgmmd.sh)"
  exit 1
fi

if [ ! -f $dir/gselect.gz ]; then
  sgmm-gselect $dir/0.mdl "$feats" ark,t:- 2>$dir/gselect.log | gzip -c > $dir/gselect.gz || exit 1;
fi

# Convert the source-model alignments to the new tree/model.  Check the
# result before deleting 0.ali, which is the only input of this step.
convert-ali $srcmodel $dir/0.mdl $dir/tree ark:$dir/0.ali \
  ark:$dir/cur.ali 2>$dir/convert.log || exit 1;

rm $dir/0.ali

# Make training graphs
echo "Compiling training graphs"
compile-train-graphs $dir/tree $dir/0.mdl data/L.fst ark:data/train.tra \
  "ark:|gzip -c >$dir/graphs.fsts.gz" 2>$dir/compile_graphs.log || exit 1

iter=0
while [ $iter -lt $numiters ]; do
  echo "Pass $iter ... "

  # Realign on the iterations listed in $realign_iters.
  if echo $realign_iters | grep -w $iter >/dev/null; then
    echo "Aligning data"
    sgmm-align-compiled $spkvecs_opt $utt2spk_opt $scale_opts "$gselect_opt" \
      --retry-beam=40 $dir/$iter.mdl "ark:gunzip -c $dir/graphs.fsts.gz|" "$feats" \
      ark:$dir/cur.ali 2> $dir/align.$iter.log || exit 1;
  fi

  # (Re-)estimate speaker vectors on the iterations listed in $spkvec_iters;
  # from then on $spkvecs_opt passes them to the other programs.
  if echo $spkvec_iters | grep -w $iter >/dev/null; then
    ( ali-to-post ark:$dir/cur.ali ark:- | \
        weight-silence-post 0.01 $silphonelist $dir/$iter.mdl ark:- ark:- | \
        sgmm-est-spkvecs $spk2utt_opt $spkvecs_opt "$gselect_opt" \
          --rand-prune=$randprune $dir/$iter.mdl \
          "$feats" ark:- ark:$dir/tmp.vecs ) 2>$dir/spkvecs.$iter.log || exit 1;
    mv $dir/tmp.vecs $dir/cur.vecs
    spkvecs_opt="--spk-vecs=ark:$dir/cur.vecs"
  fi

  # Choose which parameter sets to update on this iteration.
  if [ $iter -eq 0 ]; then
    flags=vwcS
  elif [ $((iter % 2)) -eq 1 ] && [ $iter -gt 4 ]; then
    # Odd iterations above 4 (5, 7, 9, ...).
    # NOTE(review): this comment used to say "even iters after 4 (i.e.
    # starting from 6)", which does not match the test; confirm the intent.
    flags=vNwcS
  else
    flags=vMwcS
  fi

  sgmm-acc-stats $spkvecs_opt $utt2spk_opt --update-flags=$flags "$gselect_opt" \
    --rand-prune=$randprune --binary=false $dir/$iter.mdl "$feats" \
    "ark:ali-to-post ark:$dir/cur.ali ark:-|" $dir/$iter.acc \
    2> $dir/acc.$iter.log || exit 1;
  sgmm-est --update-flags=$flags --split-substates=$numsubstates \
    --write-occs=$dir/$((iter+1)).occs $dir/$iter.mdl $dir/$iter.acc \
    $dir/$((iter+1)).mdl 2> $dir/update.$iter.log || exit 1;

  # Only the newest model/occs are kept.
  rm $dir/$iter.mdl $dir/$iter.acc
  rm $dir/$iter.occs

  if [ $iter -lt $maxiterinc ]; then
    numsubstates=$((numsubstates + incsubstates))
  fi
  iter=$((iter + 1));
done

# The point of this last phase of accumulation is to get Gaussian-level
# alignments with the speaker vectors but accumulate stats without
# any speaker vectors; we re-estimate M, w, c and S to get a model
# that's compatible with not having speaker vectors.
flags=MwcS
( ali-to-post ark:$dir/cur.ali ark:- | \
    sgmm-post-to-gpost $spkvecs_opt $utt2spk_opt "$gselect_opt" \
      $dir/$iter.mdl "$feats" ark,s,cs:- ark:- | \
    sgmm-acc-stats-gpost --update-flags=$flags $dir/$iter.mdl "$feats" \
      ark,s,cs:- $dir/$iter.aliacc ) 2> $dir/acc_ali.$iter.log || exit 1;
sgmm-est --update-flags=$flags --remove-speaker-space=true $dir/$iter.mdl \
  $dir/$iter.aliacc $dir/$iter.alimdl 2>$dir/update_ali.$iter.log || exit 1;

# Point the final.* names at the last iteration.  Old links (including
# final.alimdl) are removed first so that "ln -s" also works on a re-run.
( cd $dir; rm final.mdl final.alimdl final.occs 2>/dev/null;
  ln -s $iter.mdl final.mdl;
  ln -s $iter.alimdl final.alimdl;
  ln -s $iter.occs final.occs )

cp $mat $dir/final.mat
......@@ -18,7 +18,7 @@
# To be run from ..
# This (train_tri2f) is training with splice-9-frames+LDA features,
# plus MLLT.
# plus STC/MLLT.
if [ -f path.sh ]; then . path.sh; fi
dir=exp/tri2f
......@@ -115,15 +115,15 @@ while [ $x -lt $numiters ]; do
featsub="ark:scripts/subset_scp.pl 800 data/train.scp | splice-feats scp:- ark:- | transform-feats $cur_lda ark:- ark:-|"
else # do GMM update.
gmm-acc-stats-ali --binary=false $dir/$x.mdl "$feats" ark:$dir/cur.ali $dir/$x.acc 2> $dir/acc.$x.log || exit 1;
gmm-est --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1;
gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl $dir/$x.acc $dir/$[$x+1].mdl 2> $dir/update.$x.log || exit 1;
fi
rm $dir/$x.mdl $dir/$x.acc 2>/dev/null
rm $dir/$x.mdl $dir/$x.acc $dir/$x.occs 2>/dev/null
if [ $x -le $maxiterinc ]; then
numgauss=$[$numgauss+$incgauss];
fi
x=$[$x+1]
done
( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl;
( cd $dir; rm final.mdl 2>/dev/null; ln -s $x.mdl final.mdl; ln -s $x.occs final.occs
ln -s `basename $cur_lda` final.mat )
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Train UBM from a trained HMM/GMM system [with splice+LDA+STC/MLLT features]
if [ -f path.sh ]; then . path.sh; fi

dir=exp/ubmc
mkdir -p $dir
srcdir=exp/tri2f
mat=$srcdir/final.mat

# Initialise the UBM (0.ubm) from the source model and its occupancies.
# Stop on failure: the loop below starts from $dir/0.ubm.
init-ubm --intermediate-numcomps=2000 --ubm-numcomps=400 --verbose=2 \
  --fullcov-ubm=true $srcdir/final.mdl $srcdir/final.occs \
  $dir/0.ubm 2> $dir/cluster.log || exit 1;

# Size of the training-data subset (argument to subset_scp.pl) for each pass.
subset[0]=1000
subset[1]=1500
subset[2]=2000
subset[3]=2500

# Four accumulate/update passes: pass x reads $x.ubm and writes $((x+1)).ubm,
# so the result of the last pass is $dir/4.ubm.
for x in 0 1 2 3; do
  echo "Pass $x"
  # Feature pipeline: subset of train.scp, splice-feats, then transform-feats with $mat.
  feats="ark:scripts/subset_scp.pl ${subset[$x]} data/train.scp | splice-feats scp:- ark:- | transform-feats $mat ark:- ark:- |"
  fgmm-global-acc-stats --diag-gmm-nbest=15 --binary=false --verbose=2 $dir/$x.ubm "$feats" $dir/$x.acc \
    2> $dir/acc.$x.log || exit 1;
  fgmm-global-est --verbose=2 $dir/$x.ubm $dir/$x.acc \
    $dir/$((x+1)).ubm 2> $dir/update.$x.log || exit 1;
  # Only the newest UBM is kept.
  rm $dir/$x.acc $dir/$x.ubm
done
......@@ -42,10 +42,10 @@ system:
[diagonal] 10.5 12.7
+fmllr[utt] 10.4 13.9
[diagonal] 10.6 13.3
tri2b[spk] 11.3 15.7 | tri2b is exponential transform
+fmllr[spk] 10.5 13.9
[utt] 11.3 15.1 | [estimating ET per utt.]
+fmllr[utt] 11.2 15.3 | [estimating ET and fMLLR per utt]
tri2b[spk] 11.5 15.0 | tri2b is exponential transform
+fmllr[spk] 10.6 13.9
[utt] 11.7 15.0 | [estimating ET per utt.]
+fmllr[utt] 11.5 15.1 | [estimating ET and fMLLR per utt]
tri2c 12.7 16.6 | as tri2a plus cepstral mean subtraction.
[utt] 13.0 17.0 | [per utterance CMS in test]
tri2d 13.0 19.4 | as tri2a plus STC/MLLT (worse).
......@@ -61,9 +61,9 @@ system:
tri2h 13.4 20.2 | [ splice-9-frames + HLDA... worse than tri2a]
tri2i 12.4 18.4 | [ triple-deltas + HLDA... same as tri2a]
tri2j 12.8 18.3 | [ triple-deltas+LDA+MLLT... slightly worse than tri2a]
tri2k 10.3 15.0 | [ splice-9-frames + LDA + ET ]
[utt] 10.3 15.2 | [adaptation per utterance]
[spk,+fmllr] 9.9 14.4 | [per speaker, plus fMLLR]
tri2k 10.6 14.7 | [ splice-9-frames + LDA + ET ]
[utt] 10.4 14.6 | [adaptation per utterance]
[spk,+fmllr] 10.0 13.7 | [per speaker, plus fMLLR]
tri2l 9.6 13.7 | train with SAT; test with fMLLR
[utt] 12.0 16.8 | [adaptation per utterance]
tri2m 10.8 15.0 | [LDA + MLLT + Linear VTLN]
......@@ -77,9 +77,13 @@ system:
sgmm2b 10.0 13.8 | [sgmm, spk-vector adaptation only]
[utt] 10.0 13.9 | [adapt per utt]
[spk,+fmllr] 9.9 13.5 | [per-spk, plus fMLLR]
tri3k 8.9 12.3 | [ splice-9-frames + LDA + ET; SI-284 ]
[utt] 8.9 12.3 | [adaptation per utterance]
[spk;+fmllr] 8.3 11.3 | [per-speaker adaptation; +fMLLR]
|[note: sgmm2d has acwt 1/13, not 1/12 as in sgmm2b]
sgmm2d 10.0 13.4 | [sgmm, spk-vectors, on LDA+STC features]
[utt] 10.2 13.3 | [adapt per utt]
[spk,+fmllr] 9.8 12.9 | [per-spk, plus fMLLR]
tri3k 9.0 11.9 | [ splice-9-frames + LDA + ET; SI-284 ]
[utt] 9.1 11.8 | [adaptation per utterance]
[spk;+fmllr] 8.4 11.7 | [per-speaker adaptation; +fMLLR]
sgmm3b 7.8 10.4 | [ SGMM with speaker vectors, on SI-284]
[utt] 7.8 10.4 | [per-utterance adaptation]
[spk;+fmllr] 7.8 10.0 | [per-speaker adaptation, with fMLLR]
......@@ -98,6 +102,16 @@ exp/decode_tri2a_tgpr_eval92/wer:%WER 12.52 [ 706 / 5641, 127 ins, 60 del, 519 s
exp/decode_tri2a_tgpr_eval93/wer:%WER 18.29 [ 629 / 3439, 47 ins, 104 del, 478 sub ]
exp/decode_tri2a_tgpr_fmllr_eval92/wer:%WER 11.42 [ 644 / 5641, 116 ins, 60 del, 468 sub ]
exp/decode_tri2a_tgpr_fmllr_utt_eval92/wer:%WER 12.48 [ 704 / 5641, 128 ins, 56 del, 520 sub ]
exp/decode_tri2b_tgpr_eval92/wer:%WER 11.52 [ 650 / 5641, 124 ins, 61 del, 465 sub ]
exp/decode_tri2b_tgpr_eval93/wer:%WER 14.98 [ 515 / 3439, 55 ins, 80 del, 380 sub ]
exp/decode_tri2b_tgpr_fmllr_eval92/wer:%WER 10.55 [ 595 / 5641, 115 ins, 55 del, 425 sub ]
exp/decode_tri2b_tgpr_fmllr_eval93/wer:%WER 13.87 [ 477 / 3439, 53 ins, 78 del, 346 sub ]
exp/decode_tri2b_tgpr_utt_eval92/wer:%WER 11.72 [ 661 / 5641, 126 ins, 63 del, 472 sub ]
exp/decode_tri2b_tgpr_utt_eval93/wer:%WER 15.00 [ 516 / 3439, 56 ins, 82 del, 378 sub ]
exp/decode_tri2b_tgpr_utt_fmllr_eval92/wer:%WER 11.51 [ 649 / 5641, 122 ins, 61 del, 466 sub ]
exp/decode_tri2b_tgpr_utt_fmllr_eval93/wer:%WER 15.06 [ 518 / 3439, 55 ins, 86 del, 377 sub ]
exp/decode_tri2c_tgpr_eval92/wer:%WER 12.71 [ 717 / 5641, 137 ins, 72 del, 508 sub ]
exp/decode_tri2c_tgpr_eval93/wer:%WER 16.57 [ 570 / 3439, 62 ins, 87 del, 421 sub ]
exp/decode_tri2c_tgpr_utt_eval92/wer:%WER 12.96 [ 731 / 5641, 148 ins, 67 del, 516 sub ]
......@@ -108,6 +122,10 @@ exp/decode_tri2e_tgpr_eval92/wer:%WER 14.29 [ 806 / 5641, 155 ins, 79 del, 572 s
exp/decode_tri2e_tgpr_eval93/wer:%WER 19.08 [ 656 / 3439, 71 ins, 120 del, 465 sub ]
exp/decode_tri2f_tgpr_eval92/wer:%WER 12.23 [ 690 / 5641, 138 ins, 57 del, 495 sub ]
exp/decode_tri2f_tgpr_eval93/wer:%WER 17.74 [ 610 / 3439, 68 ins, 85 del, 457 sub ]
# with acwt 1/17:
exp/decode_tri2f_tgpr_eval92/wer:%WER 12.16 [ 686 / 5641, 128 ins, 59 del, 499 sub ]
exp/decode_tri2f_tgpr_eval93/wer:%WER 17.56 [ 604 / 3439, 61 ins, 92 del, 451 sub ]
exp/decode_tri2g_tgpr_diag_eval92/wer:%WER 10.65 [ 601 / 5641, 111 ins, 55 del, 435 sub ]
exp/decode_tri2g_tgpr_diag_eval93/wer:%WER 16.49 [ 567 / 3439, 77 ins, 72 del, 418 sub ]
exp/decode_tri2g_tgpr_diag_fmllr_eval92/wer:%WER 10.25 [ 578 / 5641, 115 ins, 60 del, 403 sub ]
......@@ -128,12 +146,14 @@ exp/decode_tri2i_tgpr_eval92/wer:%WER 12.39 [ 699 / 5641, 130 ins, 72 del, 497 s
exp/decode_tri2i_tgpr_eval93/wer:%WER 18.35 [ 631 / 3439, 58 ins, 102 del, 471 sub ]
exp/decode_tri2j_tgpr_eval92/wer:%WER 12.82 [ 723 / 5641, 127 ins, 70 del, 526 sub ]
exp/decode_tri2j_tgpr_eval93/wer:%WER 18.26 [ 628 / 3439, 59 ins, 99 del, 470 sub ]
exp/decode_tri2k_tgpr_eval92/wer:%WER 10.26 [ 579 / 5641, 117 ins, 45 del, 417 sub ]
exp/decode_tri2k_tgpr_eval93/wer:%WER 15.03 [ 517 / 3439, 73 ins, 71 del, 373 sub ]
exp/decode_tri2k_tgpr_fmllr_eval92/wer:%WER 9.86 [ 556 / 5641, 119 ins, 49 del, 388 sub ]
exp/decode_tri2k_tgpr_fmllr_eval93/wer:%WER 14.39 [ 495 / 3439, 72 ins, 67 del, 356 sub ]
exp/decode_tri2k_tgpr_utt_eval92/wer:%WER 10.30 [ 581 / 5641, 117 ins, 47 del, 417 sub ]
exp/decode_tri2k_tgpr_utt_eval93/wer:%WER 15.18 [ 522 / 3439, 76 ins, 69 del, 377 sub ]
exp/decode_tri2k_tgpr_eval92/wer:%WER 10.60 [ 598 / 5641, 122 ins, 48 del, 428 sub ]
exp/decode_tri2k_tgpr_eval93/wer:%WER 14.66 [ 504 / 3439, 69 ins, 63 del, 372 sub ]
exp/decode_tri2k_tgpr_fmllr_eval92/wer:%WER 9.98 [ 563 / 5641, 113 ins, 52 del, 398 sub ]
exp/decode_tri2k_tgpr_fmllr_eval93/wer:%WER 13.70 [ 471 / 3439, 71 ins, 60 del, 340 sub ]
exp/decode_tri2k_tgpr_utt_eval92/wer:%WER 10.41 [ 587 / 5641, 112 ins, 50 del, 425 sub ]
exp/decode_tri2k_tgpr_utt_eval93/wer:%WER 14.63 [ 503 / 3439, 67 ins, 65 del, 371 sub ]
exp/decode_tri2l_tgpr_eval92/wer:%WER 9.64 [ 544 / 5641, 121 ins, 44 del, 379 sub ]
exp/decode_tri2l_tgpr_eval93/wer:%WER 13.72 [ 472 / 3439, 68 ins, 66 del, 338 sub ]
exp/decode_tri2l_tgpr_utt_eval92/wer:%WER 12.00 [ 677 / 5641, 141 ins, 60 del, 476 sub ]
......@@ -175,8 +195,13 @@ exp/decode_sgmm2b_tgpr_eval93/wer:%WER 13.84 [ 476 / 3439, 61 ins, 67 del, 348 s
exp/decode_sgmm2b_tgpr_utt_eval92/wer:%WER 9.96 [ 562 / 5641, 131 ins, 40 del, 391 sub ]
exp/decode_sgmm2b_tgpr_utt_eval93/wer:%WER 13.90 [ 478 / 3439, 59 ins, 68 del, 351 sub ]
exp/decode_sgmm2d_fmllr_tgpr_eval92/wer:%WER 9.77 [ 551 / 5641, 126 ins, 43 del, 382 sub ]
exp/decode_sgmm2d_fmllr_tgpr_eval93/wer:%WER 12.88 [ 443 / 3439, 55 ins, 60 del, 328 sub ]
exp/decode_sgmm2d_tgpr_eval92/wer:%WER 9.96 [ 562 / 5641, 124 ins, 44 del, 394 sub ]
exp/decode_sgmm2d_tgpr_eval93/wer:%WER 13.38 [ 460 / 3439, 57 ins, 59 del, 344 sub ]
exp/decode_sgmm2d_tgpr_utt_eval92/wer:%WER 10.16 [ 573 / 5641, 137 ins, 45 del, 391 sub ]
exp/decode_sgmm2d_tgpr_utt_eval93/wer:%WER 13.26 [ 456 / 3439, 61 ins, 63 del, 332 sub ]
svatava:s1:
# [old:]
......@@ -251,12 +276,12 @@ exp/decode_tri3a_tgpr_uttdfmllr_eval93/wer:%WER 13.29 [ 457 / 3439, 49 ins, 57 d
exp/decode_tri3a_tgpr_uttfmllr_eval92/wer:%WER 10.44 [ 589 / 5641, 122 ins, 47 del, 420 sub ]
exp/decode_tri3a_tgpr_uttfmllr_eval93/wer:%WER 13.93 [ 479 / 3439, 56 ins, 69 del, 354 sub ]
exp/decode_tri3k_tgpr_eval92/wer:%WER 8.90 [ 502 / 5641, 114 ins, 34 del, 354 sub ]
exp/decode_tri3k_tgpr_eval93/wer:%WER 12.27 [ 422 / 3439, 67 ins, 54 del, 301 sub ]
exp/decode_tri3k_tgpr_fmllr_eval92/wer:%WER 8.28 [ 467 / 5641, 117 ins, 26 del, 324 sub ]
exp/decode_tri3k_tgpr_fmllr_eval93/wer:%WER 11.34 [ 390 / 3439, 68 ins, 42 del, 280 sub ]
exp/decode_tri3k_tgpr_utt_eval92/wer:%WER 8.93 [ 504 / 5641, 114 ins, 34 del, 356 sub ]
exp/decode_tri3k_tgpr_utt_eval93/wer:%WER 12.27 [ 422 / 3439, 64 ins, 57 del, 301 sub ]
exp/decode_tri3k_tgpr_eval92/wer:%WER 9.02 [ 509 / 5641, 125 ins, 27 del, 357 sub ]
exp/decode_tri3k_tgpr_eval93/wer:%WER 11.92 [ 410 / 3439, 65 ins, 47 del, 298 sub ]
exp/decode_tri3k_tgpr_fmllr_eval92/wer:%WER 8.39 [ 473 / 5641, 117 ins, 27 del, 329 sub ]
exp/decode_tri3k_tgpr_fmllr_eval93/wer:%WER 11.66 [ 401 / 3439, 68 ins, 45 del, 288 sub ]
exp/decode_tri3k_tgpr_utt_eval92/wer:%WER 9.08 [ 512 / 5641, 125 ins, 31 del, 356 sub ]
exp/decode_tri3k_tgpr_utt_eval93/wer:%WER 11.75 [ 404 / 3439, 57 ins, 51 del, 296 sub ]
exp/decode_sgmm2a_tgpr_eval92/wer:%WER 10.44 [ 589 / 5641, 129 ins, 38 del, 422 sub ]
exp/decode_sgmm2a_tgpr_eval93/wer:%WER 16.40 [ 564 / 3439, 68 ins, 92 del, 404 sub ]
......
......@@ -396,6 +396,18 @@ steps/train_sgmm2b.sh || exit 1;
done
)&
# as sgmm2b, but with LDA+STC.
# Note: increased acwt from 12 to 13.