Commit b1560b5e authored by Guoguo Chen
Browse files

trunk: adding higher order LM support in segmentation scripts

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5193 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 7c99af63
...@@ -29,14 +29,17 @@ echo "$0 $@" # Print the command line for logging ...@@ -29,14 +29,17 @@ echo "$0 $@" # Print the command line for logging
. parse_options.sh || exit 1; . parse_options.sh || exit 1;
if [ $# != 3 ]; then if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the"
echo " directory where the model is."
echo ""
echo "This is a special decoding script for segmentation where we use one" echo "This is a special decoding script for segmentation where we use one"
echo "decoding graph for each segment. We assume a file HCLG.fsts.scp exists" echo "decoding graph for each segment. We assume a file HCLG.fsts.scp exists"
echo "which is the scp file of the graphs for each segment." echo "which is the scp file of the graphs for each segment."
echo "" echo ""
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: $0 exp/tri2b/graph_train_si284_split \\"
echo " data/train_si284_split exp/tri2b/decode_train_si284_split"
echo ""
echo "where <decode-dir> is assumed to be a sub-directory of the directory"
echo "where the model is."
echo ""
echo "main options (for others, see top of script file)" echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options" echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs" echo " --nj <nj> # number of parallel jobs"
...@@ -76,7 +79,7 @@ sort -k1,1 -u < $graphdir/HCLG.fsts.scp > $graphdir/HCLG.fsts.scp.sorted ...@@ -76,7 +79,7 @@ sort -k1,1 -u < $graphdir/HCLG.fsts.scp > $graphdir/HCLG.fsts.scp.sorted
mv $graphdir/HCLG.fsts.scp.sorted $graphdir/HCLG.fsts.scp mv $graphdir/HCLG.fsts.scp.sorted $graphdir/HCLG.fsts.scp
for x in `seq 1 $nj`; do for x in `seq 1 $nj`; do
cat $graphdir/HCLG.fsts.scp |\ cat $graphdir/HCLG.fsts.scp |\
utils/filter_scp.pl -f 1 <(cut -f 1 -d ' ' $sdata/$x/feats.scp) > $sdata/$x/graphs.scp utils/filter_scp.pl -f 1 $sdata/$x/feats.scp > $sdata/$x/graphs.scp
num_feats=`cat $sdata/$x/feats.scp | wc -l` num_feats=`cat $sdata/$x/feats.scp | wc -l`
num_graphs=`cat $sdata/$x/graphs.scp | wc -l` num_graphs=`cat $sdata/$x/graphs.scp | wc -l`
if [ $num_graphs -ne $num_feats ]; then if [ $num_graphs -ne $num_feats ]; then
......
...@@ -9,6 +9,8 @@ cmd=run.pl ...@@ -9,6 +9,8 @@ cmd=run.pl
tscale=1.0 # transition scale. tscale=1.0 # transition scale.
loopscale=0.1 # scale for self-loops. loopscale=0.1 # scale for self-loops.
cleanup=true cleanup=true
ngram_order=1
srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM
# End configuration section. # End configuration section.
set -e set -e
...@@ -19,16 +21,23 @@ echo "$0 $@" ...@@ -19,16 +21,23 @@ echo "$0 $@"
. parse_options.sh || exit 1; . parse_options.sh || exit 1;
if [ $# -ne 4 ]; then if [ $# -ne 4 ]; then
echo "This script is a wrapper of steps/cleanup/make_transcript_graph.sh. In" echo "This script builds one decoding graph for each truncated utterance in"
echo "the segmentation case graphs are created for the original transcript" echo "segmentation. It first calls steps/cleanup/make_utterance_graph.sh to"
echo "(the long transcript before split), therefore we have to duplicate the" echo "build one decoding graph for each original utterance, which will be"
echo "graphs for the new utterances. We do this in the scp file so that we" echo "shared by the truncated utterances from the same original utterance."
echo "can avoid storing the duplicate graphs on the disk." echo "We assign the decoding graph to each truncated utterance using the scp"
echo "file so that we can avoid duplicating the graphs on the disk."
echo "" echo ""
echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir> <graph-dir>" echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir> <graph-dir>"
echo " e.g.: $0 data/train_si284_split/ \\"
echo " data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
echo ""
echo "Options:" echo "Options:"
echo " --lm-order # order of n-gram language model" echo " --ngram-order # order of n-gram language model"
echo " --lm-options # options for ngram-count in SRILM tool" echo " --srilm-options # options for ngram-count in SRILM tool"
echo " --tscale # transition scale"
echo " --loopscale # scale for self-loops"
echo " --cleanup # if true, removes the intermediate files"
exit 1; exit 1;
fi fi
...@@ -45,6 +54,27 @@ for f in $data/text.orig $data/orig2utt $lang/L_disambig.fst \ ...@@ -45,6 +54,27 @@ for f in $data/text.orig $data/orig2utt $lang/L_disambig.fst \
fi fi
done done
# If --ngram-order is larger than 1, we will have to use SRILM.
# Look for ngram-count on PATH first; if it is not there, fall back to the
# SRILM binaries under ../../../tools/srilm/bin (relative to the current
# directory) and append that directory to PATH. Exits with status 1 when
# ngram-count cannot be found in either place.
if [ "$ngram_order" -gt 1 ]; then
  # This script runs under `set -e`, so a failing lookup must not be a bare
  # assignment: that would abort the script here, before the fallback below
  # gets a chance to run. `|| true` keeps the failure non-fatal and leaves
  # ngram_count empty.
  ngram_count=`command -v ngram-count` || true
  if [ -z "$ngram_count" ]; then
    if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
      sdir=`pwd`/../../../tools/srilm/bin/i686-m64
    else
      sdir=`pwd`/../../../tools/srilm/bin/i686
    fi
    # Quoted so that a working directory containing spaces does not break
    # the file test.
    if [ -f "$sdir/ngram-count" ]; then
      echo Using SRILM tools from $sdir
      export PATH=$PATH:$sdir
    else
      echo You appear to not have SRILM tools installed, either on your path,
      echo or installed in $sdir. See tools/install_srilm.sh for installation
      echo instructions.
      exit 1
    fi
  fi
fi
# Creates one graph for each transcript. We parallelize the process a little # Creates one graph for each transcript. We parallelize the process a little
# bit. # bit.
num_lines=`cat $data/text.orig | wc -l` num_lines=`cat $data/text.orig | wc -l`
...@@ -63,9 +93,10 @@ for n in $(seq $nj); do ...@@ -63,9 +93,10 @@ for n in $(seq $nj); do
done done
utils/split_scp.pl $data/text.orig $split_texts utils/split_scp.pl $data/text.orig $split_texts
$cmd JOB=1:$nj $graph_dir/log/make_transcript_graph.JOB.log \ $cmd JOB=1:$nj $graph_dir/log/make_utterance_graph.JOB.log \
steps/cleanup/make_transcript_graph.sh --cleanup $cleanup \ steps/cleanup/make_utterance_graph.sh --cleanup $cleanup \
--tscale $tscale --loopscale $loopscale \ --tscale $tscale --loopscale $loopscale \
--ngram-order $ngram_order --srilm-options "$srilm_options" \
$graph_dir/split$nj/JOB/text $lang \ $graph_dir/split$nj/JOB/text $lang \
$model_dir $graph_dir/split$nj/JOB || exit 1; $model_dir $graph_dir/split$nj/JOB || exit 1;
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
tscale=1.0 # transition scale. tscale=1.0 # transition scale.
loopscale=0.1 # scale for self-loops. loopscale=0.1 # scale for self-loops.
cleanup=true cleanup=true
ngram_order=1
srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM
# End configuration section. # End configuration section.
set -e set -e
...@@ -17,13 +19,24 @@ echo "$0 $@" ...@@ -17,13 +19,24 @@ echo "$0 $@"
. parse_options.sh || exit 1; . parse_options.sh || exit 1;
if [ $# -ne 4 ]; then if [ $# -ne 4 ]; then
echo "This script builds one decoding graph for each transcript in the given" echo "This script builds one decoding graph for each utterance using the"
echo "<text> file." echo "corresponding text in the given <text> file. If --ngram-order is 1,"
echo "then utils/make_unigram_grammar.pl will be used to build the unigram"
echo "language model. Otherwise SRILM will be used instead. You are supposed"
echo "to have SRILM installed if --ngram-order is larger than 1. The format"
echo "of the given <text> file is same as the transcript text files in data"
echo "directory."
echo "" echo ""
echo "Usage: $0 [options] <text> <lang-dir> <model-dir> <graph-dir>" echo "Usage: $0 [options] <text> <lang-dir> <model-dir> <graph-dir>"
echo " e.g.: $0 data/train_si284_split/text \\"
echo " data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
echo ""
echo "Options:" echo "Options:"
echo " --lm-order # order of n-gram language model" echo " --ngram-order # order of n-gram language model"
echo " --lm-options # options for ngram-count in SRILM tool" echo " --srilm-options # options for ngram-count in SRILM tool"
echo " --tscale # transition scale"
echo " --loopscale # scale for self-loops"
echo " --cleanup # if true, removes the intermediate files"
exit 1; exit 1;
fi fi
...@@ -42,8 +55,30 @@ done ...@@ -42,8 +55,30 @@ done
mkdir -p $graph_dir/sub_graphs mkdir -p $graph_dir/sub_graphs
# If --ngram-order is larger than 1, we will have to use SRILM.
# Look for ngram-count on PATH first; if it is not there, fall back to the
# SRILM binaries under ../../../tools/srilm/bin (relative to the current
# directory) and append that directory to PATH. Exits with status 1 when
# ngram-count cannot be found in either place.
if [ "$ngram_order" -gt 1 ]; then
  # This script runs under `set -e`, so a failing lookup must not be a bare
  # assignment: that would abort the script here, before the fallback below
  # gets a chance to run. `|| true` keeps the failure non-fatal and leaves
  # ngram_count empty.
  ngram_count=`command -v ngram-count` || true
  if [ -z "$ngram_count" ]; then
    if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
      sdir=`pwd`/../../../tools/srilm/bin/i686-m64
    else
      sdir=`pwd`/../../../tools/srilm/bin/i686
    fi
    # Quoted so that a working directory containing spaces does not break
    # the file test.
    if [ -f "$sdir/ngram-count" ]; then
      echo Using SRILM tools from $sdir
      export PATH=$PATH:$sdir
    else
      echo You appear to not have SRILM tools installed, either on your path,
      echo or installed in $sdir. See tools/install_srilm.sh for installation
      echo instructions.
      exit 1
    fi
  fi
fi
# Maps OOV words to the oov symbol. # Maps OOV words to the oov symbol.
oov=`cat $lang/oov.int` oov=`cat $lang/oov.int`
oov_txt=`cat $lang/oov.txt`
N=`tree-info --print-args=false $model_dir/tree |\ N=`tree-info --print-args=false $model_dir/tree |\
grep "context-width" | awk '{print $NF}'` grep "context-width" | awk '{print $NF}'`
...@@ -62,11 +97,25 @@ while read line; do ...@@ -62,11 +97,25 @@ while read line; do
wdir=$graph_dir/sub_graphs/$uttid wdir=$graph_dir/sub_graphs/$uttid
mkdir -p $wdir mkdir -p $wdir
echo $words > $wdir/text
cat $wdir/text | utils/sym2int.pl --map-oov $oov -f 1- $lang/words.txt | \ # Compiles G.fst
utils/make_unigram_grammar.pl | fstcompile |\ if [ $ngram_order -eq 1 ]; then
fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1; echo $words > $wdir/text
cat $wdir/text | utils/sym2int.pl --map-oov $oov -f 1- $lang/words.txt | \
utils/make_unigram_grammar.pl | fstcompile |\
fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
else
# Higher-order case: build G.fst for this utterance with SRILM.
# First replace every word that does not appear in the first column of
# $lang/words.txt with the OOV word ($oov_txt), and write the resulting
# single-line transcript to $wdir/text.
echo $words | awk -v voc=$lang/words.txt -v oov="$oov_txt" '
  BEGIN{ while((getline<voc)>0) { invoc[$1]=1; } } {
  for (x=1;x<=NF;x++) {
    if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } }
  printf("\n"); }' > $wdir/text
# Estimate an n-gram LM of order $ngram_order on that text, write it to
# stdout (-lm -), and pipe it through arpa2fst and the symbol clean-up
# scripts before compiling it against $lang/words.txt.
# $srilm_options is intentionally left unquoted: it holds a list of
# ngram-count options, so it has to be word-split. Quoting it would hand
# "-opt1 -opt2" (or an empty string) to ngram-count as one argument.
ngram-count -text $wdir/text -order $ngram_order $srilm_options -lm - |\
  arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\
  fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
  --keep_isymbols=false --keep_osymbols=false |\
  fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
fi
fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic." fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic."
# Builds LG.fst # Builds LG.fst
......
...@@ -15,7 +15,11 @@ echo "$0 $@" ...@@ -15,7 +15,11 @@ echo "$0 $@"
. parse_options.sh || exit 1; . parse_options.sh || exit 1;
if [ $# -ne 2 ]; then if [ $# -ne 2 ]; then
echo "This script truncates the long audio into smaller overlapping segments"
echo ""
echo "Usage: $0 [options] <input-dir> <output-dir>" echo "Usage: $0 [options] <input-dir> <output-dir>"
echo " e.g.: $0 data/train_si284_long data/train_si284_split"
echo ""
echo "Options:" echo "Options:"
echo " --min-seg-length # minimal segment length" echo " --min-seg-length # minimal segment length"
echo " --seg-length # length of segments in seconds." echo " --seg-length # length of segments in seconds."
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment