Commit 1fb53037 authored by Dan Povey
Browse files

Script changes, mostly efficiency improvements.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@621 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent a42ca060
......@@ -103,6 +103,3 @@ done
echo Data preparation and formatting completed for Eval 2000
echo "(but not MFCC extraction)"
......@@ -43,6 +43,7 @@ local/make_mfcc_segs.sh --num-jobs 10 --cmd "$cmd" data/train exp/make_mfcc/trai
scripts/fix_data_dir.sh data/train
local/make_mfcc_segs.sh --num-jobs 4 data/eval2000 exp/make_mfcc/eval2000 $mfccdir
scripts/fix_data_dir.sh data/eval2000
# Now-- there are 264k utterances, and we want to start the monophone training
# on relatively short utterances (easier to align), but not only the very shortest
......@@ -104,8 +105,8 @@ steps/train_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
4000 20000 data/train_100k_nodup data/lang exp/tri3a_ali exp/tri4a
scripts/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri4a/graph \
data/eval2000 exp/tri4a/decode_eval2000
scripts/decode.sh -l data/lang_test --num-jobs 10 --cmd "$decode_cmd" \
steps/decode_lda_mllt_sat.sh exp/tri4a/graph data/eval2000 exp/tri4a/decode_eval2000
steps/align_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
data/train_nodup data/lang exp/tri4a exp/tri4a_ali_all_nodup
......@@ -115,8 +116,8 @@ steps/train_lda_mllt_sat.sh --num-jobs 30 --cmd "$train_cmd" \
4000 150000 data/train_nodup data/lang exp/tri4a_ali_all_nodup exp/tri5a
scripts/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph
scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri5a/graph \
data/eval2000 exp/tri5a/decode_eval2000
scripts/decode.sh -l data/lang_test --num-jobs 10 --cmd "$decode_cmd" \
steps/decode_lda_mllt_sat.sh exp/tri5a/graph data/eval2000 exp/tri5a/decode_eval2000
# Align the 5a system; we'll train an SGMM system on top of
# LDA+MLLT+SAT, and use 5a system for 1st pass.
......@@ -133,3 +134,4 @@ scripts/decode.sh --num-jobs 10 --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh
exp/sgmm6a/graph_tgpr data/eval2000 exp/sgmm6a/decode_eval2000 exp/tri5a/decode_eval2000
for x in exp/*/decode_*; do [ -d $x ] && grep Mean $x/score_*/*.sys | scripts/best_wer.sh; done
\ No newline at end of file
# Note: RESULTS_2 is same results as run by Mirko, but
# at the current time is less up to date.
# Note: also see RESULTS_2 which is more up-to-date.
# These results will be finalized soon.
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | scripts/best_wer.sh; done
......@@ -18,6 +18,7 @@ exp/tri1/decode_tgpr_dev93_tg4/wer_14:%WER 18.17 [ 1496 / 8234, 275 ins, 154 del
exp/tri2a/decode_tgpr_dev93/wer_15:%WER 17.77 [ 1463 / 8234, 245 ins, 150 del, 1068 sub ]
# triphone, LDA+MLLT, SI-84
exp/tri2b/decode_tgpr_dev93/wer_16:%WER 17.78 [ 1464 / 8234, 284 ins, 141 del, 1039 sub ]
exp/tri2b/decode_tgpr_dev93_fromlats/wer_16:%WER 17.08 [ 1406 / 8234, 256 ins, 164 del, 986 sub ] # rescore 2a lats
exp/tri2b/decode_tgpr_dev93_tg/wer_15:%WER 16.75 [ 1379 / 8234, 284 ins, 130 del, 965 sub ]
......@@ -39,27 +40,37 @@ exp/tri3b/decode_tgpr_dev93/wer_15:%WER 14.95 [ 1231 / 8234, 247 ins, 113 del, 8
exp/tri3b/decode_tgpr_dev93_tg/wer_16:%WER 14.26 [ 1174 / 8234, 235 ins, 122 del, 817 sub ]
exp/tri3b/decode_tgpr_eval92/wer_15:%WER 10.23 [ 577 / 5643, 141 ins, 42 del, 394 sub ]
exp/tri3b/decode_tgpr_eval92_tg/wer_16:%WER 9.87 [ 557 / 5643, 136 ins, 38 del, 383 sub ]
# same with big-dict, on eval'92
exp/tri3b/decode_bd_tgpr_eval92/wer_15:%WER 7.94 [ 448 / 5643, 70 ins, 49 del, 329 sub ]
exp/tri3b/decode_bd_tgpr_eval92_fg/wer_16:%WER 6.84 [ 386 / 5643, 66 ins, 43 del, 277 sub ]
exp/tri3b/decode_bd_tgpr_eval92_tg/wer_16:%WER 7.04 [ 397 / 5643, 62 ins, 46 del, 289 sub ]
# Big-dict and our own LM (trigram, pruned)
exp/tri3b/decode_bd_tgpr_eval92/wer_15:%WER 8.05 [ 454 / 5643, 81 ins, 43 del, 330 sub ]
# and rescoring with trigram and 4-gram LMs:
exp/tri3b/decode_bd_tgpr_eval92_tg/wer_16:%WER 7.51 [ 424 / 5643, 78 ins, 46 del, 300 sub ]
exp/tri3b/decode_bd_tgpr_eval92_fg/wer_16:%WER 7.32 [ 413 / 5643, 76 ins, 43 del, 294 sub ]
# LDA+MLLT+SAT, SI-284, quick retraining from 3b
exp/tri4b/decode_tgpr_dev93/wer_15:%WER 12.89 [ 1061 / 8234, 231 ins, 93 del, 737 sub ]
exp/tri4b/decode_tgpr_dev93/wer_16:%WER 12.95 [ 1066 / 8234, 223 ins, 95 del, 748 sub ]
exp/tri4b/decode_tgpr_eval92/wer_15:%WER 9.06 [ 511 / 5643, 135 ins, 26 del, 350 sub ]
# +MMI
exp/tri4b_mmi/decode_tgpr_dev93/wer_16:%WER 11.79 [ 971 / 8234, 203 ins, 87 del, 681 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_16:%WER 11.63 [ 958 / 8234, 199 ins, 88 del, 671 sub ]
exp/tri4b_mmi/decode_tgpr_dev93/wer_15:%WER 11.87 [ 977 / 8234, 204 ins, 90 del, 683 sub ]
exp/tri4b_mmi_b0.1/decode_tgpr_dev93/wer_15:%WER 11.82 [ 973 / 8234, 204 ins, 86 del, 683 sub ]
# LDA+MLLT+SAT, SI-284, full retraining starting from 3b [c.f. 4b]
exp/tri4c/decode_tgpr_dev93/wer_16:%WER 12.82 [ 1056 / 8234, 207 ins, 97 del, 752 sub ]
exp/sgmm3c/decode_tgpr_dev93/wer_11:%WER 14.02 [ 1153 / 8224, 218 ins, 108 del, 827 sub ] [PARTIAL]
exp/sgmm4b/decode_tgpr_dev93/wer_15:%WER 12.89 [ 1061 / 8234, 190 ins, 128 del, 743 sub ]
exp/sgmm4c/decode_tgpr_dev93/wer_12:%WER 11.10 [ 912 / 8213, 180 ins, 94 del, 638 sub ] [PARTIAL]
exp/sgmm4c/decode_tgpr_dev93_tg/wer_11:%WER 10.31 [ 847 / 8213, 185 ins, 80 del, 582 sub ] [PARTIAL]
exp/sgmm4c/decode_tgpr_eval92/wer_13:%WER 7.53 [ 425 / 5643, 88 ins, 29 del, 308 sub ]
exp/sgmm4c/decode_tgpr_eval92_tg/wer_13:%WER 7.14 [ 403 / 5643, 93 ins, 22 del, 288 sub ]
exp/tri4c/decode_tgpr_dev93/wer_16:%WER 12.92 [ 1064 / 8234, 224 ins, 94 del, 746 sub ]
# sgmm4b is LDA+MLLT+SAT, on just SI-84 data.
exp/sgmm4b/decode_tgpr_dev93/wer_14:%WER 13.29 [ 1094 / 8234, 213 ins, 124 del, 757 sub ]
exp/sgmm4b/decode_tgpr_eval92/wer_12:%WER 8.79 [ 496 / 5643, 122 ins, 30 del, 344 sub ]
# sgmm4c is the same, but on all SI-284 data.
exp/sgmm4c/decode_tgpr_dev93/wer_12:%WER 11.19 [ 921 / 8234, 185 ins, 91 del, 645 sub ]
exp/sgmm4c/decode_tgpr_dev93_tg/wer_10:%WER 10.64 [ 876 / 8234, 197 ins, 81 del, 598 sub ]
exp/sgmm4c/decode_tgpr_eval92/wer_10:%WER 8.17 [ 461 / 5643, 123 ins, 25 del, 313 sub ]
exp/sgmm4c/decode_tgpr_eval92_tg/wer_12:%WER 7.51 [ 424 / 5643, 108 ins, 22 del, 294 sub ]
# using big-dict.
exp/sgmm4c/decode_bd_tgpr_eval92/wer_11:%WER 5.35 [ 302 / 5643, 51 ins, 28 del, 223 sub ]
exp/sgmm4c/decode_bd_tgpr_eval92_fg/wer_11:%WER 4.70 [ 265 / 5643, 46 ins, 18 del, 201 sub ]
exp/sgmm4c/decode_bd_tgpr_eval92_tg/wer_11:%WER 4.91 [ 277 / 5643, 44 ins, 23 del, 210 sub ]
......@@ -268,12 +268,21 @@ scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh \
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \
data/test_dev93 exp/sgmm4c/decode_tgpr_dev93 exp/sgmm4c/decode_tgpr_dev93_tg
# decode the above with nov'92 too
# decode sgmm4c with nov'92 too
scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh \
exp/sgmm4c/graph_tgpr data/test_eval92 exp/sgmm4c/decode_tgpr_eval92 exp/tri3b/decode_tgpr_eval92
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_tgpr data/lang_test_tg \
data/test_eval92 exp/sgmm4c/decode_tgpr_eval92 exp/sgmm4c/decode_tgpr_eval92_tg
# Decode sgmm4c with the "big-dict" decoding graph.
scripts/mkgraph.sh data/lang_test_bd_tgpr exp/sgmm4c exp/sgmm4c/graph_bd_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_sgmm_lda_etc.sh exp/sgmm4c/graph_bd_tgpr \
data/test_eval92 exp/sgmm4c/decode_bd_tgpr_eval92 exp/tri3b/decode_tgpr_eval92
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
data/test_eval92 exp/sgmm4c/decode_bd_tgpr_eval92 exp/sgmm4c/decode_bd_tgpr_eval92_fg
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_tg \
data/test_eval92 exp/sgmm4c/decode_bd_tgpr_eval92 exp/sgmm4c/decode_bd_tgpr_eval92_tg
......
......@@ -146,7 +146,7 @@ while [ $x -lt $numiters ]; do
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
"ark,s,cs:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
......
......@@ -176,7 +176,7 @@ while [ $x -lt $numiters ]; do
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
"ark,s,cs:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
......@@ -200,7 +200,7 @@ if [ "$feats" != "$sifeats" ]; then
$cmd $dir/acc_alimdl.$n.log \
ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- \| \
gmm-acc-stats-twofeats $dir/$x.mdl "${featspart[$n]}" "${sifeatspart[$n]}" \
ark:- $dir/$x.$n.acc2 || touch $dir/.error &
ark,s,cs:- $dir/$x.$n.acc2 || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating alignment statistics." && exit 1;
......
......@@ -186,7 +186,7 @@ while [ $x -lt $numiters ]; do
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
"ark,s,cs:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
......
......@@ -197,7 +197,7 @@ while [ $x -lt $numiters ]; do
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
"ark,s,cs:gunzip -c $dir/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
......@@ -218,7 +218,7 @@ for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc_alimdl.$n.log \
ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- \| \
gmm-acc-stats-twofeats $dir/$x.mdl "${featspart[$n]}" "${sifeatspart[$n]}" \
ark:- $dir/$x.$n.acc2 || touch $dir/.error &
ark,s,cs:- $dir/$x.$n.acc2 || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating alignment statistics." && exit 1;
......
......@@ -215,7 +215,7 @@ while [ $x -lt $numiters ]; do
$cmd $dir/log/acc.$x.$n.log \
sgmm-acc-stats ${spkvecs_opt[$n]} --utt2spk=ark:$data/split$nj/$n/utt2spk \
--update-flags=$flags "${gselect_opt[$n]}" --rand-prune=$randprune \
$dir/$x.mdl "${featspart[$n]}" "ark:ali-to-post 'ark:gunzip -c $dir/$n.ali.gz|' ark:-|" \
$dir/$x.mdl "${featspart[$n]}" "ark,s,cs:ali-to-post 'ark:gunzip -c $dir/$n.ali.gz|' ark:-|" \
$dir/$x.$n.acc || touch $dir/.error &
done
wait;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment