Commit 48992741 authored by Dan Povey's avatar Dan Povey

Add script to mix up system; various script fixes and extensions.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@631 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 892b9b50
@@ -145,11 +145,17 @@ scripts/decode.sh -l data/lang_test --num-jobs 30 --cmd "$decode_cmd" steps/deco
data/lang_test data/eval2000 exp/sgmm6a/decode_eval2000_fromlats exp/tri5a/decode_eval2000
scripts/decode.sh -l data/lang_test --opts '--scale-opts "--transition-scale=1.0 --self-loop-scale=0.0"' \
--num-jobs 30 --cmd "$decode_cmd" steps/decode_sgmm_lda_etc_fromlats.sh \
data/lang_test data/eval2000 exp/sgmm6a/decode_eval2000_fromlats_0selfloop exp/tri5a/decode_eval2000
# MMI starting from the system in tri5a.
steps/align_lda_mllt_sat.sh --num-jobs 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a exp/tri5a_ali
steps/make_denlats_lda_etc.sh --num-jobs 40 --num-split 40 --cmd "$train_cmd" \
steps/make_denlats_lda_etc.sh --num-jobs 40 --sub-split 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats
steps/train_lda_etc_mmi.sh --num-jobs 40 --cmd "$train_cmd" \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a exp/tri5a_mmi
@@ -45,6 +45,9 @@ exp/tri3b/decode_tgpr_eval92_tg/wer_16:%WER 9.87 [ 557 / 5643, 136 ins, 38 del,
exp/tri3b/decode_bd_tgpr_eval92_fg/wer_16:%WER 6.84 [ 386 / 5643, 66 ins, 43 del, 277 sub ]
exp/tri3b/decode_bd_tgpr_eval92_tg/wer_16:%WER 7.04 [ 397 / 5643, 62 ins, 46 del, 289 sub ]
# note: with 20k Gaussians, it's a bit better [14.95->14.72]
exp/tri3b_20k/decode_tgpr_dev93/wer_16:%WER 14.72 [ 1212 / 8234, 239 ins, 117 del, 856 sub ]
# sgmm3c is SGMM on top of LDA+MLLT.
# In this case decoding from lattices is worse, presumably because the lattices used
# (tri2b) had a bad WER (17.78%).
@@ -197,7 +197,13 @@ scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_b
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_tg \
data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 exp/tri3b/decode_bd_tgpr_eval92_tg
# The following two steps, which are a kind of side-branch, try mixing up
( # from the 3b system. This is to demonstrate the mixup_lda_etc.sh script.
steps/mixup_lda_etc.sh --num-jobs 10 --cmd "$train_cmd" \
20000 data/train_si84 exp/tri3b exp/tri2b_ali_si84 exp/tri3b_20k
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_tgpr \
data/test_dev93 exp/tri3b_20k/decode_tgpr_dev93
)
# From 3b system, align all si284 data.
steps/align_lda_mllt_sat.sh --num-jobs 10 --cmd "$train_cmd" \
@@ -22,7 +22,7 @@ nj=
lang=
opts=
cmd=scripts/run.pl
for x in 1 2; do
for x in 1 2 3 4; do
if [ $1 == "--num-jobs" ]; then
shift
nj=$1
@@ -57,16 +57,19 @@ for n in `scripts/get_splits.pl $numsplit`; do
utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
done
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/utt2spk $utt2spks
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/feats.scp $feats
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/utt2spk $utt2spks || exit 1
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/feats.scp $feats || exit 1
[ -f $data/wav.scp ] && \
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/wav.scp $wavs
[ -f $data/text ] && \
scripts/split_scp.pl --utt2spk=$data/utt2spk $data/text $texts
for n in `scripts/get_splits.pl $numsplit`; do
scripts/utt2spk_to_spk2utt.pl $data/split$numsplit/$n/utt2spk > $data/split$numsplit/$n/spk2utt
scripts/utt2spk_to_spk2utt.pl $data/split$numsplit/$n/utt2spk > $data/split$numsplit/$n/spk2utt || exit 1;
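# (For reference: utt2spk has one line per utterance, "<utt-id> <spk-id>", and
#  spk2utt has one line per speaker, "<spk-id> <utt-id1> <utt-id2> ...".)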
# for completeness, also split the spk2gender file
[ -f $data/spk2gender ] && \
scripts/filter_scp.pl $data/split$numsplit/$n/spk2utt $data/spk2gender > $data/split$numsplit/$n/spk2gender
scripts/filter_scp.pl $data/split$numsplit/$n/spk2utt $data/spk2gender > $data/split$numsplit/$n/spk2gender
done
exit 0
@@ -19,6 +19,12 @@ beam=13.0
latticebeam=7.0
acwt=0.1
maxactive=5000
maxmem=20000000 # This will stop the processes from getting too large.
# (The default is 50M, but that can result in the process growing to around 2G;
# the units are not quite "real" units, due to inaccuracies in the way the
# program measures how much memory it is using.)
subsplit=1 # If this option is given, it will go sequentially over each
# part of the data, and decode it in parallel with this many jobs.
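# As a concrete illustration (this matches the swbd recipe change above; the particular
# directories are just the ones used there, not requirements of this script):
#   steps/make_denlats_lda_etc.sh --num-jobs 40 --sub-split 40 --cmd "$train_cmd" \
#     data/train data/lang exp/tri5a_ali exp/tri5a_denlats
# With --sub-split 40, each of the 40 data parts is in turn decoded with up to 40 parallel jobs.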
for x in 1 2 3; do
if [ "$1" == "--num-jobs" ]; then
@@ -26,6 +32,11 @@ for x in 1 2 3; do
nj=$1
shift
fi
if [ "$1" == "--sub-split" ]; then
shift
subsplit=$1
shift
fi
if [ "$1" == "--cmd" ]; then
shift
cmd=$1
@@ -79,10 +90,15 @@ cat $data/text | \
# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
# final.mdl from $alidir; the output HCLG.fst goes in $dir/dengraph.
scripts/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;
if [ -s $dir/dengraph/HCLG.fst ]; then
echo Not creating denominator graph $dir/dengraph/HCLG.fst since it already exists.
else
scripts/mkgraph.sh $dir/lang $alidir $dir/dengraph || exit 1;
fi
if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then
scripts/split_data.sh $data $nj
scripts/split_data.sh $data $nj || exit 1;
fi
n=`get_splits.pl $nj | awk '{print $1}'`
@@ -96,18 +112,56 @@ fi
rm $dir/.error 2>/dev/null
for n in `get_splits.pl $nj`; do
featspart[$n]="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
$use_trans && featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
$cmd $dir/decode_den.$n.log \
gmm-latgen-faster --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
--max-active=$maxactive --word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "${featspart[$n]}" "ark:|gzip -c >$dir/lat.$n.gz" \
|| touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo Error generating denominator lattices && exit 1;
if [ $subsplit -eq 1 ]; then
for n in `get_splits.pl $nj`; do
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
$use_trans && feats="$feats transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.trans ark:- ark:- |"
$cmd $dir/decode_den.$n.log \
gmm-latgen-faster --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
--max-mem=$maxmem --max-active=$maxactive --word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.$n.gz" \
|| touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo Error generating denominator lattices && exit 1;
else # Decode each subset of the data with multiple jobs.
for n in `get_splits.pl $nj`; do
if [ -f $dir/.done.$n ]; then
echo Not processing subset $n because file $dir/.done.$n exists # This is so we
else # can rerun this script without redoing everything, if we succeeded with some parts.
nk=$subsplit
if [ ! -d $data/split$nj/$n/split$nk -o $data/split$nj/$n/split$nk -ot $data/split$nj/$n/feats.scp ]; then
scripts/split_data.sh $data/split$nj/$n $nk || exit 1;
fi
mkdir -p $dir/log$n
for o in `get_splits.pl $nk`; do
if [ ! -s $data/split$nj/$n/split$nk/$o/feats.scp ]; then
echo "Empty subset; no lines in $data/split$nj/$n/split$nk/$o/feats.scp"
else
feats="ark:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/split$nk/$o/utt2spk ark,s,cs:$alidir/$n.cmvn scp:$data/split$nj/$n/split$nk/$o/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
$use_trans && feats="$feats transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark,s,cs:$alidir/$n.trans ark:- ark:- |"
$cmd $dir/log$n/decode_den.$o.log \
gmm-latgen-faster --beam=$beam --lattice-beam=$latticebeam --acoustic-scale=$acwt \
--max-mem=$maxmem --max-active=$maxactive --word-symbol-table=$lang/words.txt $alidir/final.mdl \
$dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.$n.$o.gz" \
|| touch $dir/.error &
fi
done
wait
[ -f $dir/.error ] && echo Error generating denominator lattices for subset $n && exit 1;
echo Merging archives for data subset $n
for o in `get_splits.pl $nk`; do
gunzip -c $dir/lat.$n.$o.gz || touch $dir/.error;
done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
[ -f $dir/.error ] && echo Error merging denominator lattices for subset $n && exit 1;
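# (At this point lat.$n.gz has the same form as in the --sub-split 1 case:
#  one gzipped lattice archive per main data split.)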
rm $dir/lat.$n.*.gz
touch $dir/.done.$n # so we don't re-do it if we run this script again.
fi
done
fi
echo "Done generating denominator lattices."
#!/bin/bash
# Copyright 2010-2011 Microsoft Corporation Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# To be run from ..
# This does mixing-up, three iterations of model training, realignment,
# and two more iterations of model training.
# It's intended to be used for experiments with LDA+MLLT or LDA+MLLT+SAT
# models where you increase the number of mixtures and see if it helps.
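# As an example of how this script is invoked (taken from the wsj run.sh change above;
# the directory names are particular to that setup, not requirements of this script):
#   steps/mixup_lda_etc.sh --num-jobs 10 --cmd "$train_cmd" \
#     20000 data/train_si84 exp/tri3b exp/tri2b_ali_si84 exp/tri3b_20k
#   scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_tgpr \
#     data/test_dev93 exp/tri3b_20k/decode_tgpr_dev93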
nj=4
cmd=scripts/run.pl
for x in 1 2; do
if [ "$1" == "--num-jobs" ]; then
shift
nj=$1
shift
fi
if [ "$1" == "--cmd" ]; then
shift
cmd=$1
shift
fi
done
if [ $# != 5 ]; then
echo "Usage: steps/mixup_lda_etc.sh <num-gauss> <data-dir> <old-exp-dir> <alignment-dir> <exp-dir>"
echo "Note: <alignment-dir> is only provided so we can get the CMVN data from there."
echo " e.g.: steps/mixup_lda_etc.sh 20000 data/train_si84 exp/tri3b exp/tri2b_ali_si84 exp/tri3b_20k"
exit 1;
fi
if [ -f path.sh ]; then . path.sh; fi
numgauss=$1
data=$2
olddir=$3
alidir=$4 # only needed for CMVN data.
dir=$5
for f in $data/feats.scp $olddir/final.mdl $olddir/final.occs $olddir/final.mat; do
[ ! -f $f ] && echo "mixup_lda_etc.sh: no such file $f" && exit 1;
done
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
mkdir -p $dir/log
cp $olddir/final.mat $olddir/tree $dir/
if [ ! -d $data/split$nj -o $data/split$nj -ot $data/feats.scp ]; then
echo "Splitting data-dir $data into $nj pieces, but watch out: we require #jobs" \
"to be matched with $olddir"
split_data.sh $data $nj
fi
for n in `get_splits.pl $nj`; do
sifeatspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$alidir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
done
# Adjust the features to reflect any transforms we may have in $olddir.
first=`get_splits.pl $nj | awk '{print $1}'`
if [ -f $olddir/$first.trans ]; then
have_trans=true
echo Using transforms in $olddir
for n in `get_splits.pl $nj`; do
featspart[$n]="${sifeatspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$olddir/$n.trans ark:- ark:- |"
done
else
have_trans=false
echo "No transforms in $olddir, assuming you are not using fMLLR."
fi
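# (If present, $olddir/$n.trans holds the per-speaker fMLLR transforms estimated when
#  $olddir was trained; applying them here keeps the features consistent with that system.)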
echo Mixing up old model to $numgauss Gaussians
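# (Roughly speaking, gmm-mixup raises the total number of Gaussians to $numgauss by
#  splitting existing Gaussians, using the occupation counts in final.occs to decide
#  how many Gaussians each state should get.)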
$cmd $dir/log/mixup.log \
gmm-mixup --mix-up=$numgauss $olddir/final.mdl $olddir/final.occs $dir/0.mdl || exit 1;
rm $dir/.error 2>/dev/null
dir_for_alignments=$olddir # This is where we find the alignments...
# after we realign, on iter 3, we'll use the ones in $dir
niters=4
for x in `seq 0 $niters`; do # Do five iterations of E-M; on 3rd iter, realign.
echo Iteration $x
if [ $x -eq 2 ]; then
echo Realigning data on iteration $x
for n in `get_splits.pl $nj`; do
[ ! -f $olddir/$n.fsts.gz ] && echo Expecting FSTs to exist: no such file $olddir/$n.fsts.gz \
&& exit 1;
$cmd $dir/log/align.$x.$n.log \
gmm-align-compiled $scale_opts --beam=10 --retry-beam=40 $dir/$x.mdl \
"ark:gunzip -c $olddir/$n.fsts.gz|" "${featspart[$n]}" \
"ark:|gzip -c >$dir/$n.ali.gz" || touch $dir/.error &
done
wait
[ -f $dir/.error ] && echo "Error computing alignments" && exit 1;
dir_for_alignments=$dir
fi
echo "Accumulating statistics"
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \
gmm-acc-stats-ali --binary=false $dir/$x.mdl "${featspart[$n]}" \
"ark,s,cs:gunzip -c $dir_for_alignments/$n.ali.gz|" $dir/$x.$n.acc || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating stats on iteration $x" && exit 1;
$cmd $dir/log/update.$x.log \
gmm-est --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
rm $dir/$x.mdl $dir/$x.*.acc
rm $dir/$x.occs
done
x=$[$niters+1]
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
if $have_trans; then # we have transforms, so compute the alignment model,
# which is like the final model but estimated with the speaker-independent
# features (it shares the Gaussian-level alignments with the final model).
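# (Here gmm-acc-stats-twofeats computes the Gaussian-level posteriors with the
#  fMLLR-adapted features but accumulates the statistics with the speaker-independent
#  features, so final.alimdl can be used for a first-pass alignment before any
#  transforms have been estimated.)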
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc_alimdl.$n.log \
ali-to-post "ark:gunzip -c $dir_for_alignments/$n.ali.gz|" ark:- \| \
gmm-acc-stats-twofeats $dir/$x.mdl "${featspart[$n]}" "${sifeatspart[$n]}" \
ark,s,cs:- $dir/$x.$n.acc2 || touch $dir/.error &
done
wait;
[ -f $dir/.error ] && echo "Error accumulating alignment statistics." && exit 1;
# Update model.
$cmd $dir/log/est_alimdl.log \
gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc2|" $dir/$x.alimdl || exit 1;
rm $dir/$x.*.acc2
rm $dir/final.alimdl 2>/dev/null
ln -s $x.alimdl $dir/final.alimdl
fi
# Print out summary of the warning messages.
for x in $dir/log/*.log; do
n=`grep WARNING $x | wc -l`;
if [ $n -ne 0 ]; then echo $n warnings in $x; fi;
done
echo Done
@@ -188,11 +188,10 @@ while [ $x -lt $numiters ]; do
done
wait
[ -f $dir/.error ] && echo "Error estimating or composing fMLLR transforms on iter $x" && exit 1;
transdir=$dir # This is now used as the place where the "current" transforms are.
for n in `get_splits.pl $nj`; do
featspart[$n]="${sifeatspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$transdir/$n.trans ark:- ark:- |"
featspart[$n]="${sifeatspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$dir/$n.trans ark:- ark:- |"
done
feats="$sifeats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $transdir/*.trans|' ark:- ark:- |" # not used, but in case...
feats="$sifeats transform-feats --utt2spk=ark:$data/utt2spk 'ark:cat $dir/*.trans|' ark:- ark:- |" # not used, but in case...
fi
for n in `get_splits.pl $nj`; do
$cmd $dir/log/acc.$x.$n.log \