Commit 6c43009b authored by Mirko Hannemann
Browse files

fix min/max_lmwt


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1280 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent c2f20127
......@@ -5,10 +5,19 @@
[ -f ./path.sh ] && . ./path.sh
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
min_lmwt=5
max_lmwt=20
#end configuration section.
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
......@@ -24,12 +33,12 @@ mkdir -p $dir/scoring/log
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
$cmd LMWT=5:20 $dir/scoring/log/best_path.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=5:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
......
......@@ -4,11 +4,21 @@
[ -f ./path.sh ] && . ./path.sh
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
min_lmwt=2
max_lmwt=13
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
......@@ -22,12 +32,12 @@ done
mkdir -p $dir/scoring/log
$cmd LMWT=2:13 $dir/scoring/log/best_path.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=2:13 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
......
......@@ -18,6 +18,8 @@ if [ $# -ne 3 ]; then
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
......
......@@ -3,15 +3,19 @@
# begin configuration section.
cmd=run.pl
min_lmwt=9
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_basic.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>";
echo "Usage: local/score_basic.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
......@@ -41,18 +45,18 @@ function filter_text {
'[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'
}
$cmd LMWT=9:20 $dir/scoring/log/best_path.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$lang/words.txt \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
for lmwt in `seq 9 20`; do
for lmwt in `seq $min_lmwt $max_lmwt`; do
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \
filter_text > $dir/scoring/$lmwt.txt || exit 1;
done
filter_text <$data/text >$dir/scoring/text.filt
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
compute-wer --text --mode=present \
ark:$dir/scoring/text.filt ark:$dir/scoring/LMWT.txt ">&" $dir/wer_LMWT || exit 1;
......
......@@ -4,16 +4,20 @@
# begin configuration section.
cmd=run.pl
stage=0
min_lmwt=9
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_sclite.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit;
echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
......@@ -37,7 +41,7 @@ name=`basename $data`; # e.g. eval2000
mkdir -p $dir/scoring/log
if [ $stage -le 0 ]; then
$cmd LMWT=9:20 $dir/scoring/log/get_ctm.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
mkdir -p $dir/score_LMWT/ '&&' \
lattice-1best --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
......@@ -57,7 +61,7 @@ if [ $stage -le 1 ]; then
fi
if [ $stage -le 2 ]; then
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cp $data/stm $dir/score_LMWT/ '&&' \
$hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT/stm $dir/score_LMWT/${name}.ctm || exit 1;
fi
......
......@@ -5,16 +5,21 @@
cmd=run.pl
stage=0
decode_mbr=true
min_lmwt=9
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score_sclite_conf.sh [options] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit;
echo "Usage: local/score_sclite_conf.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
......@@ -39,7 +44,7 @@ mkdir -p $dir/scoring/log
if [ $stage -le 0 ]; then
# the escaping gets a bit crazy here, sorry...
$cmd LMWT=9:20 $dir/scoring/log/get_ctm.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.log \
mkdir -p $dir/score_LMWT/ '&&' \
ACWT=\`perl -e \"print 1.0/LMWT\;\"\` '&&' \
lattice-align-words $lang/phones/word_boundary.int $model "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
......@@ -59,7 +64,7 @@ if [ $stage -le 1 ]; then
fi
if [ $stage -le 2 ]; then
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cp $data/stm $dir/score_LMWT/ '&&' \
$hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT/stm $dir/score_LMWT/${name}.ctm || exit 1;
fi
......
......@@ -4,11 +4,23 @@
[ -f ./path.sh ] && . ./path.sh
# begin configuration section.
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
min_lmwt=9
max_lmwt=19
#end configuration section.
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
......@@ -22,12 +34,12 @@ done
mkdir -p $dir/scoring/log
$cmd LMWT=9:19 $dir/scoring/log/best_path.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=9:19 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
......
......@@ -4,11 +4,23 @@
[ -f ./path.sh ] && . ./path.sh
# begin configuration section.
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
min_lmwt=9
max_lmwt=20
#end configuration section.
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
lang_or_graph=$2
......@@ -24,12 +36,12 @@ mkdir -p $dir/scoring/log
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
$cmd LMWT=9:20 $dir/scoring/log/best_path.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
......
......@@ -3,7 +3,7 @@ LM: | Pruned trigram | Rescore Trigram |
Test set: | Eval92 | Dev93 | Eval92 | Eval93 |
system: | WER acwt| | WER acwt| | Train set | Leaf/PDFs|
mono0a *| 24.72% 10 | 34.22% 9 | | | 2k short | 146 1k | flat start,monophone,delta-deltas, CMN
+| 24.97% 9 | 34.37% 10 | | |
+| 25.13% 9 | 34.52% 10 | | |
tri1 *| 13.98% 13 | 20.09% 15 | 13.08% 14 | 19.30% 16 | 3500 (half) | 2000 10k | mono_ali, triphones, delta-deltas, CMN
+| | 19.87% 16 | | 19.16% 16 |
tri2a *| 11.61% 15 | 17.55% 16 | | SI-84 | 2500 15k | tri1_ali, -"-
......
......@@ -37,7 +37,8 @@ fi
trans=$data/text
cat $trans | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/test_trans.filt
#cat $trans | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/test_trans.filt
cat $trans | sed 's:<hes>::g' | sed 's:<unk>::g' > $dir/test_trans.filt
for inv_acwt in 9 10 11 12 13 14 15 16 17 18 19 20; do
acwt=`perl -e "print (1.0/$inv_acwt);"`
......@@ -46,7 +47,7 @@ for inv_acwt in 9 10 11 12 13 14 15 16 17 18 19 20; do
2>$dir/rescore_${inv_acwt}.log
cat $dir/${inv_acwt}.tra | \
scripts/int2sym.pl --ignore-first-field $symtab | sed 's:<UNK>::g' | \
scripts/int2sym.pl --ignore-first-field $symtab | sed 's:<unk>::g' | sed 's:<hes>::g' | \
compute-wer --text --mode=present ark:$dir/test_trans.filt ark,p:- >& $dir/wer_$inv_acwt
done
......@@ -72,7 +72,7 @@ done
# CMVN stats-- we make them part of a pipe.
feats="ark:compute-cmvn-stats --spk2utt=ark:$mydata/spk2utt scp:$mydata/feats.scp ark:- | apply-cmvn --norm-vars=false --utt2spk=ark:$mydata/utt2spk ark:- scp:$mydata/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- | transform-feats --utt2spk=ark:$mydata/utt2spk ark:$transdir/$jobid.trans ark:- ark:- |"
gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 --acoustic-scale=0.083333 \
gmm-latgen-faster --max-active=7000 --beam=$beam --lattice-beam=6.0 --acoustic-scale=0.055556 \
--allow-partial=true --word-symbol-table=$graphdir/words.txt \
$srcdir/final.mdl $graphdir/HCLG.fst "$feats" "ark:|gzip -c > $dir/lat.$jobid.gz" \
2> $dir/decode$jobid.log || exit 1;
......
......@@ -54,6 +54,7 @@ graphdir=$1
data=$2
dir=$3
acwt=0.08333 # Just used for adaptation and beam-pruning..
#acwt=0.0625 # Just used for adaptation and beam-pruning..
silphonelist=`cat $graphdir/silphones.csl`
srcdir=`dirname $dir`; # Assume model directory one level up from decoding directory.
......
......@@ -6,20 +6,21 @@
# the number of cpus on your machine.
#a) JHU cluster options
export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
#export cuda_cmd="..."
#export mkgraph_cmd="queue.pl -q all.q@a*.clsp.jhu.edu -l ram_free=4G,mem_free=4G"
#b) BUT cluseter options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=700M,mem_free=700M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#b) BUT cluster options
export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
#c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
export cuda_cmd=run.pl
export mkgraph_cmd=run.pl
#export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl
......@@ -33,7 +33,7 @@
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 \
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
steps/train_mmi_sgmm2.sh --cmd "$decodebig_cmd" --transform-dir exp/tri4b_ali_si84 --boost 0.1 \
data/train_si84 data/lang exp/sgmm2_5a_ali_si84 exp/sgmm2_5a_denlats_si84 exp/sgmm2_5a_mmi_b0.1
for iter in 1 2 3 4; do
......@@ -85,18 +85,18 @@
steps/make_denlats_sgmm2.sh --nj 30 --sub-split 30 --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 \
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284
steps/train_mmi_sgmm2.sh --cmd "$decode_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
steps/train_mmi_sgmm2.sh --cmd "$decodebig_cmd" --transform-dir exp/tri4b_ali_si284 --boost 0.1 \
data/train_si284 data/lang exp/sgmm2_5b_ali_si284 exp/sgmm2_5b_denlats_si284 exp/sgmm2_5b_mmi_b0.1
for iter in 1 2 3 4; do
for test in dev93 eval92; do
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_tgpr_${test} data/lang_test_tgpr data/test_${test} exp/sgmm2_5b/decode_tgpr_${test} \
exp/sgmm2_5b_mmi_b0.1/decode_tgpr_${test}_it$iter &
for test in eval92; do # dev93
#steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
#--transform-dir exp/tri4b/decode_${test}_tgpr data/lang_test_tgpr data/test_${test} exp/sgmm2_5b/decode_${test}_tgpr \
#exp/sgmm2_5b_mmi_b0.1/decode_${test}_tgpr_it$iter &
steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $iter \
--transform-dir exp/tri4b/decode_bd_tgpr_${test} data/lang_test_bd_tgpr data/test_${test} exp/sgmm2_5b/decode_bd_tgpr_${test} \
exp/sgmm2_5b_mmi_b0.1/decode_bd_tgpr_${test}_it$iter &
--transform-dir exp/tri4b/decode_${test}_bd_tgpr data/lang_test_bd_fg data/test_${test} exp/sgmm2_5b/decode_${test}_bd_tgpr \
exp/sgmm2_5b_mmi_b0.1/decode_${test}_bd_tgpr_it$iter &
done
done
) &
......@@ -4,16 +4,29 @@
[ -f ./path.sh ] && . ./path.sh
# begin configuration section.
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
min_lmwt=9
max_lmwt=20
#end configuration section.
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $1 == "--reverse" ] && reverse=$2 && shift 2;
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
[ $# -ne 3 ] && \
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
if [ $# -ne 3 ]; then
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --decode_mbr (true/false) # minimum Bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
data=$1
lang_or_graph=$2
......
......@@ -4,16 +4,22 @@
[ -f ./path.sh ] && . ./path.sh;
# begin configuration section.
cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $# -ne 3 ] && \
echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
min_lmwt=9
max_lmwt=20
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# -ne 3 ]; then
echo "Usage: scripts/score_mbr.sh <decode-dir> <word-symbol-table> <data-dir>"
exit 1;
echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minimum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
exit 1;
fi
data=$1
......@@ -33,7 +39,7 @@ cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/t
# We submit the jobs separately, not as an array, because it's hard
# to get the inverse of the LM scales.
rm $dir/.error 2>/dev/null
for inv_acwt in `seq 9 20`; do
for inv_acwt in `seq $min_lmwt $max_lmwt`; do
acwt=`perl -e "print (1.0/$inv_acwt);"`
$cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
......@@ -44,7 +50,7 @@ wait;
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output.";
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
compute-wer --text --mode=present \
......
......@@ -28,6 +28,7 @@ utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/
local/wsj_format_data.sh || exit 1;
# We suggest to run the next three commands in the background,
# as they are not a precondition for the system building and
# most of the tests: these commands build a dictionary
......@@ -54,7 +55,6 @@ local/wsj_format_data.sh || exit 1;
)
) &
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
......@@ -142,6 +142,19 @@ steps/decode.sh --nj 10 --cmd "$decode_cmd" \
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_tgpr data/test_eval92 exp/tri2a/decode_tgpr_eval92 || exit 1;
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
steps/decode.sh --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k || exit 1;
#prepare reverse lexicon and language model for backwards decoding
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
local/wsj_reverse_lm.sh || exit 1;
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5kr
steps/decode_fwdbwd.sh --reverse true --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5kr data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse || exit 1;
steps/decode_fwdbwd.sh --reverse true --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k exp/tri2a/graph_bg5kr data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong || exit 1;
steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
......
......@@ -81,7 +81,7 @@ else
$cmd JOB=1:$nj $dir/log/align.JOB.log \
compile-train-graphs $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam "$mdl" ark:- \
"$feats" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
"$feats" "ark,t:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi
echo "$0: done aligning data."
......@@ -13,14 +13,14 @@ transform_dir= # dir to find fMLLR transforms.
nj=4 # number of decoding jobs.
acwt=0.1 # Just a default value, used for adaptation and beam-pruning..
cmd=run.pl
beam=13.0
beam=15.0
gselect=15 # Number of Gaussian-selection indices for SGMMs. [Note:
# the first_pass_gselect variable is used for the 1st pass of
# decoding and can be tighter.]
first_pass_gselect=3 # Use a smaller number of Gaussian-selection indices in
# the 1st pass of decoding (lattice generation).
max_active=7000
lat_beam=6.0 # Beam we use in lattice generation.
lat_beam=8.0 # Beam we use in lattice generation.
vecs_beam=4.0 # Beam we use to prune lattices while getting posteriors for
# speaker-vector computation. Can be quite tight (actually we could
# probably just do best-path).
......@@ -185,6 +185,10 @@ rm $dir/pre_lat.*.gz
if [ $stage -le 7 ]; then
[ ! -x local/score.sh ] && \
echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
echo "score best paths"
local/score.sh --cmd "$cmd" $data $graphdir $dir
echo "score confidence and timing with sclite"
#local/score_sclite_conf.sh --cmd "$cmd" --language turkish $data $graphdir $dir
fi
echo "Decoding done."
exit 0;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment