Commit 5894033d authored by Dan Povey's avatar Dan Povey
Browse files

Minor fixes and finishing up the top-level (run.sh) scripts for big-dict decoding.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@606 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 029582ba
......@@ -33,10 +33,17 @@ exp/tri2c/decode_tgpr_dev93/wer_16:%WER 16.19 [ 1333 / 8234, 251 ins, 139 del, 9
exp/tri2c/decode_tgpr_dev93_2pass/wer_16:%WER 15.99 [ 1317 / 8234, 246 ins, 134 del, 937 sub ]
# LDA+MLLT+SAT, SI-84
exp/tri3b/decode_tgpr_dev93/wer_16:%WER 15.14 [ 1247 / 8234, 259 ins, 118 del, 870 sub ]
exp/tri3b/decode_tgpr_dev93_tg/wer_16:%WER 14.36 [ 1182 / 8234, 263 ins, 107 del, 812 sub ] # rescoring w/ tg LM
exp/tri3b/decode_tgpr_eval92/wer_16:%WER 10.24 [ 578 / 5643, 137 ins, 38 del, 403 sub ]
exp/tri3b/decode_tgpr_eval92_tg/wer_14:%WER 9.71 [ 548 / 5643, 144 ins, 28 del, 376 sub ] # rescoring w/ tg LM
exp/tri3b/decode_tgpr_dev93/wer_16:%WER 15.51 [ 1277 / 8234, 263 ins, 123 del, 891 sub ]
exp/tri3b/decode_tgpr_dev93_tg/wer_16:%WER 14.55 [ 1198 / 8234, 258 ins, 116 del, 824 sub ]
exp/tri3b/decode_tgpr_eval92/wer_14:%WER 10.86 [ 613 / 5643, 160 ins, 32 del, 421 sub ]
exp/tri3b/decode_tgpr_eval92_tg/wer_13:%WER 10.15 [ 573 / 5643, 158 ins, 30 del, 385 sub ]
# Big-dict and our own LM (trigram, pruned)
exp/tri3b/decode_bd_tgpr_eval92/wer_15:%WER 8.05 [ 454 / 5643, 81 ins, 43 del, 330 sub ]
# and rescoring with trigram and 4-gram LMs:
exp/tri3b/decode_bd_tgpr_eval92_tg/wer_16:%WER 7.51 [ 424 / 5643, 78 ins, 46 del, 300 sub ]
exp/tri3b/decode_bd_tgpr_eval92_fg/wer_16:%WER 7.32 [ 413 / 5643, 76 ins, 43 del, 294 sub ]
# LDA+MLLT+SAT, SI-284, quick retraining from 3b
exp/tri4b/decode_tgpr_dev93/wer_15:%WER 12.89 [ 1061 / 8234, 231 ins, 93 del, 737 sub ]
......
......@@ -8,14 +8,16 @@
# The "fgpr" LM is a locally estimated one (4-gram, pruned)
. path.sh || exit 1;
dict_srcdir=data/local/dict_larger_prep/
lm_srcdir=data/local/lm/4gram-mincount
lang=data/lang_test_bd_fgpr
lang_unpruned=data/lang_test_bd_fg
lm_srcdir_3g=data/local/lm/3gram-mincount
lm_srcdir_4g=data/local/lm/4gram-mincount
lang=data/lang_test_bd_tg
mkdir -p $lang
[ ! -f $dict_srcdir/lexicon.txt ] && \
echo "First run wsj_prepare_local_dict.sh" && exit 1;
[ ! -f $lm_srcdir/lm_pr7.0.gz -o ! -f $lm_srcdir/lm_unpruned.gz ] && \
[ ! -f $lm_srcdir_4g/lm_pr7.0.gz -o ! -f $lm_srcdir_4g/lm_unpruned.gz \
-o ! -f $lm_srcdir_3g/lm_pr6.0.gz -o ! -f $lm_srcdir_3g/lm_unpruned.gz ] && \
echo "First run wsj_train_lms.sh" && exit 1;
......@@ -97,13 +99,13 @@ cat $dict_srcdir/lexicon.txt | \
echo "Preparing language models for test"
# Note: at this point, $lang=="data/lang_test_bd_fgpr", we put a pruned 4-gram model
# Note: at this point, $lang=="data/lang_test_bd_tg", we put an unpruned 3-gram model
# there.
echo "Checking there are no OOVs" # there shouldn't be in this LM.
# If you have an LM with OOVs you'd have to put back the command
# "remove_oovs.pl" below, as it is used in wsj_format_data.sh.
gunzip -c $lm_srcdir/lm_pr7.0.gz | \
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
scripts/find_arpa_oovs.pl $lang/words.txt | cmp - /dev/null || \
exit 1;
......@@ -113,7 +115,7 @@ gunzip -c $lm_srcdir/lm_pr7.0.gz | \
# broken. But we'll leave this in the script just in case it gets modified
# later.
# Note: ~1.5M N-grams.
gunzip -c $lm_srcdir/lm_pr7.0.gz | \
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
......@@ -123,17 +125,38 @@ gunzip -c $lm_srcdir/lm_pr7.0.gz | \
fstrmepsilon > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
mkdir -p $lang_unpruned
mkdir -p data/lang_test_bd_tgpr
cp $lang/* $lang_unpruned
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir/lm_unpruned.gz | \
gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $lang_unpruned/G.fst || exit 1;
fstisstochastic $lang_unpruned/G.fst
fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
mkdir -p data/lang_test_bd_fg
cp $lang/* data/lang_test_bd_fg
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
mkdir -p data/lang_test_bd_fgpr
cp $lang/* data/lang_test_bd_fgpr
# Be careful: this time we dispense with the grep -v '<s> <s>' so this might
# not work for LMs generated from all toolkits.
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
scripts/eps2disambig.pl | scripts/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
# The commands below are just diagnostic tests.
mkdir -p tmpdir.g
......@@ -147,5 +170,4 @@ gunzip -c $lm_srcdir/lm_unpruned.gz | \
echo "Succeeded in formatting data."
......@@ -63,6 +63,14 @@ gunzip -c $dir/train_nounk.gz | awk -v wmap=$dir/word_map 'BEGIN{while((getline<
# easily generate it again if needed.
rm $dir/train_nounk.gz
train_lm.sh --arpa --lmtype 3gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 141.444826
# 7.8 million N-grams.
prune_lm.sh --arpa 6.0 $dir/3gram-mincount/
# 1.45 million N-grams.
# Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 165.394139
train_lm.sh --arpa --lmtype 4gram-mincount $dir
#Perplexity over 228518.000000 words (excluding 478.000000 OOVs) is 126.734180
# 10.3 million N-grams.
......
......@@ -189,6 +189,17 @@ scripts/mkgraph.sh data/lang_test_bd_tgpr exp/tri3b exp/tri3b/graph_bd_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_bd_tgpr \
data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92
scripts/mkgraph.sh data/lang_test_bd_tgpr exp/tri3b exp/tri3b/graph_bd_tgpr
scripts/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt_sat.sh exp/tri3b/graph_bd_tgpr \
data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_fg \
data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 exp/tri3b/decode_bd_tgpr_eval92_fg
scripts/lmrescore.sh --cmd "$decode_cmd" data/lang_test_bd_tgpr data/lang_test_bd_tg \
data/test_eval92 exp/tri3b/decode_bd_tgpr_eval92 exp/tri3b/decode_bd_tgpr_eval92_tg
# From 3b system, align all si284 data.
steps/align_lda_mllt_sat.sh --num-jobs 10 --cmd "$train_cmd" \
data/train_si284 data/lang exp/tri3b exp/tri3b_ali_si284
......
......@@ -55,6 +55,7 @@ if [ "$mode" == 4 ]; then
fi
rm $dir/.error 2>/dev/null
for lat in $indir/lat.*.gz; do
number=`basename $lat | cut -d. -f2`;
......@@ -63,7 +64,8 @@ for lat in $indir/lat.*.gz; do
1) # 1 is inexact, the original way of doing it.
$cmd $outdir/rescorelm.$number.log \
lattice-lmrescore --lm-scale=-1.0 "ark:gunzip -c $lat|" "$oldlmcommand" ark:- \| \
lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$newlat" &
lattice-lmrescore --lm-scale=1.0 ark:- "$newlmcommand" "ark,t:|gzip -c>$newlat" \
|| touch $dir/.error &
;;
2) # 2 is equivalent to 1, but using more basic operations, combined.
$cmd $outdir/rescorelm.$number.log \
......@@ -74,7 +76,7 @@ for lat in $indir/lat.*.gz; do
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose ark:- "fstproject --project_output=true $newlm |" ark:- \| \
lattice-determinize ark:- ark:- \| \
gzip -c \>$newlat &
gzip -c \>$newlat || touch $dir/.error &
;;
3) # 3 is "exact" in that we remove the old LM scores accepting any path
# through G.fst (which is what we want as that happened in lattice
......@@ -88,7 +90,7 @@ for lat in $indir/lat.*.gz; do
lattice-scale --acoustic-scale=-1 --lm-scale=-1 ark:- ark:- \| \
lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
lattice-determinize ark:- ark:- \| \
gzip -c \>$newlat &
gzip -c \>$newlat || touch $dir/.error &
;;
4) # 4 is also exact (like 3), but instead of subtracting the old LM-scores,
# it removes the old graph scores entirely and adds in the lexicon,
......@@ -104,12 +106,13 @@ for lat in $indir/lat.*.gz; do
lattice-compose --phi-label=$phi ark:- $newlm ark:- \| \
lattice-add-trans-probs --transition-scale=1.0 --self-loop-scale=0.1 \
$mdl ark:- ark:- \| \
gzip -c \>$newlat &
gzip -c \>$newlat || touch $dir/.error &
;;
esac
done
wait
[ -f $dir/.error ] && echo Error doing LM rescoring && exit 1
rm $outdir/Ldet.fst 2>/dev/null
scripts/score_lats.sh $outdir $newlang/words.txt $data
......@@ -85,7 +85,7 @@ int main(int argc, char *argv[]) {
clat_writer.Write(lattice_reader.Key(), clat);
n_done++;
}
KALDI_LOG << "Done adding transition probabilities " << n_done << " lattices.";
KALDI_LOG << "Done adding transition probabilities to " << n_done << " lattices.";
return (n_done != 0 ? 0 : 1);
} catch(const std::exception& e) {
std::cerr << e.what();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment