Commit b1560b5e authored by Guoguo Chen

trunk: adding higher order LM support in segmentation scripts

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5193 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 7c99af63
......@@ -29,14 +29,17 @@ echo "$0 $@" # Print the command line for logging
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the"
echo " directory where the model is."
echo ""
echo "This is a special decoding script for segmentation where we use one"
echo "decoding graph for each segment. We assume a file HCLG.fsts.scp exists"
echo "which is the scp file of the graphs for each segment."
echo ""
echo "Usage: $0 [options] <graph-dir> <data-dir> <decode-dir>"
echo " e.g.: $0 exp/tri2b/graph_train_si284_split \\"
echo " data/train_si284_split exp/tri2b/decode_train_si284_split"
echo ""
echo "where <decode-dir> is assumed to be a sub-directory of the directory"
echo "where the model is."
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
......@@ -76,7 +79,7 @@ sort -k1,1 -u < $graphdir/HCLG.fsts.scp > $graphdir/HCLG.fsts.scp.sorted
mv $graphdir/HCLG.fsts.scp.sorted $graphdir/HCLG.fsts.scp
for x in `seq 1 $nj`; do
cat $graphdir/HCLG.fsts.scp |\
utils/filter_scp.pl -f 1 <(cut -f 1 -d ' ' $sdata/$x/feats.scp) > $sdata/$x/graphs.scp
utils/filter_scp.pl -f 1 $sdata/$x/feats.scp > $sdata/$x/graphs.scp
num_feats=`cat $sdata/$x/feats.scp | wc -l`
num_graphs=`cat $sdata/$x/graphs.scp | wc -l`
if [ $num_graphs -ne $num_feats ]; then
......
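For context, HCLG.fsts.scp maps each segment's utterance id to its decoding graph, and filter_scp.pl keeps only the entries whose first field also appears in that split's feats.scp. A minimal sketch of the same filtering for split 1, with made-up utterance ids and paths (not taken from this commit):

# HCLG.fsts.scp lines look like "<utt-id> <path-to-fst>", e.g.
#   utt001-seg1 exp/tri2b/graph_train_si284_split/fsts/utt001-seg1.fst
#   utt001-seg2 exp/tri2b/graph_train_si284_split/fsts/utt001-seg2.fst
# Keep only the graphs for utterances present in split 1:
utils/filter_scp.pl -f 1 $sdata/1/feats.scp < $graphdir/HCLG.fsts.scp \
  > $sdata/1/graphs.scp
# filter_scp.pl only reads the first field of each line of the id-list file,
# which is why passing feats.scp directly (without cutting out column 1
# beforehand) selects the same utterances.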
......@@ -9,6 +9,8 @@ cmd=run.pl
tscale=1.0 # transition scale.
loopscale=0.1 # scale for self-loops.
cleanup=true
ngram_order=1
srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM
# End configuration section.
set -e
......@@ -19,16 +21,23 @@ echo "$0 $@"
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "This script is a wrapper of steps/cleanup/make_transcript_graph.sh. In"
echo "the segmentation case graphs are created for the original transcript"
echo "(the long transcript before split), therefore we have to duplicate the"
echo "graphs for the new utterances. We do this in the scp file so that we"
echo "can avoid storing the duplicate graphs on the disk."
echo "This script builds one decoding graph for each truncated utterance in"
echo "segmentation. It first calls steps/cleanup/make_utterance_graph.sh to"
echo "build one decoding graph for each original utterance, which will be"
echo "shared by the truncated utterances from the same original utterance."
echo "We assign the decoding graph to each truncated utterance using the scp"
echo "file so that we can avoid duplicating the graphs on the disk."
echo ""
echo "Usage: $0 [options] <data-dir> <lang-dir> <model-dir> <graph-dir>"
echo " e.g.: $0 data/train_si284_split/ \\"
echo " data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
echo ""
echo "Options:"
echo " --lm-order # order of n-gram language model"
echo " --lm-options # options for ngram-count in SRILM tool"
echo " --ngram-order # order of n-gram language model"
echo " --srilm-options # options for ngram-count in SRILM tool"
echo " --tscale # transition scale"
echo " --loopscale # scale for self-loops"
echo " --cleanup # if true, removes the intermediate files"
exit 1;
fi
......@@ -45,6 +54,27 @@ for f in $data/text.orig $data/orig2utt $lang/L_disambig.fst \
fi
done
# If --ngram-order is larger than 1, we will have to use SRILM
if [ $ngram_order -gt 1 ]; then
ngram_count=`which ngram-count`;
if [ -z $ngram_count ]; then
if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
sdir=`pwd`/../../../tools/srilm/bin/i686-m64
else
sdir=`pwd`/../../../tools/srilm/bin/i686
fi
if [ -f $sdir/ngram-count ]; then
echo Using SRILM tools from $sdir
export PATH=$PATH:$sdir
else
echo You do not appear to have SRILM tools installed, either on your
echo path or in $sdir. See tools/install_srilm.sh for installation
echo instructions.
exit 1
fi
fi
fi
# Creates one graph for each transcript. We parallelize the process a little
# bit.
num_lines=`cat $data/text.orig | wc -l`
......@@ -63,9 +93,10 @@ for n in $(seq $nj); do
done
utils/split_scp.pl $data/text.orig $split_texts
$cmd JOB=1:$nj $graph_dir/log/make_transcript_graph.JOB.log \
steps/cleanup/make_transcript_graph.sh --cleanup $cleanup \
$cmd JOB=1:$nj $graph_dir/log/make_utterance_graph.JOB.log \
steps/cleanup/make_utterance_graph.sh --cleanup $cleanup \
--tscale $tscale --loopscale $loopscale \
--ngram-order $ngram_order --srilm-options "$srilm_options" \
$graph_dir/split$nj/JOB/text $lang \
$model_dir $graph_dir/split$nj/JOB || exit 1;
......
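A hedged example of invoking this wrapper with the new options; the wrapper's own filename is not visible in this diff, so the script path below is an assumption, and the option values are illustrative:

# Assumed script path; arguments follow the usage message above:
#   <data-dir> <lang-dir> <model-dir> <graph-dir>
steps/cleanup/make_segmentation_graph.sh \
  --ngram-order 2 --srilm-options "-wbdiscount" \
  data/train_si284_split data/lang exp/tri2b \
  exp/tri2b/graph_train_si284_split

With --ngram-order 1 (the default) the behavior is unchanged and SRILM is not required.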
......@@ -7,6 +7,8 @@
tscale=1.0 # transition scale.
loopscale=0.1 # scale for self-loops.
cleanup=true
ngram_order=1
srilm_options="-wbdiscount" # By default, use Witten-Bell discounting in SRILM
# End configuration section.
set -e
......@@ -17,13 +19,24 @@ echo "$0 $@"
. parse_options.sh || exit 1;
if [ $# -ne 4 ]; then
echo "This script builds one decoding graph for each transcript in the given"
echo "<text> file."
echo "This script builds one decoding graph for each utterance using the"
echo "corresponding text in the given <text> file. If --ngram-order is 1,"
echo "then utils/make_unigram_grammar.pl will be used to build the unigram"
echo "language model. Otherwise SRILM will be used instead. You are supposed"
echo "to have SRILM installed if --ngram-order is larger than 1. The format"
echo "of the given <text> file is same as the transcript text files in data"
echo "directory."
echo ""
echo "Usage: $0 [options] <text> <lang-dir> <model-dir> <graph-dir>"
echo " e.g.: $0 data/train_si284_split/text \\"
echo " data/lang exp/tri2b/ exp/tri2b/graph_train_si284_split"
echo ""
echo "Options:"
echo " --lm-order # order of n-gram language model"
echo " --lm-options # options for ngram-count in SRILM tool"
echo " --ngram-order # order of n-gram language model"
echo " --srilm-options # options for ngram-count in SRILM tool"
echo " --tscale # transition scale"
echo " --loopscale # scale for self-loops"
echo " --cleanup # if true, removes the intermediate files"
exit 1;
fi
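The <text> argument follows the usual Kaldi transcript format: one utterance per line, with the utterance id in the first field followed by the words. A made-up illustration (ids and words are not from this commit):

# Example contents of data/train_si284_split/text (hypothetical):
#   utt001-seg1 THIS IS THE FIRST TRUNCATED SEGMENT
#   utt001-seg2 AND THIS IS THE SECOND ONE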
......@@ -42,8 +55,30 @@ done
mkdir -p $graph_dir/sub_graphs
# If --ngram-order is larger than 1, we will have to use SRILM
if [ $ngram_order -gt 1 ]; then
ngram_count=`which ngram-count`;
if [ -z $ngram_count ]; then
if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
sdir=`pwd`/../../../tools/srilm/bin/i686-m64
else
sdir=`pwd`/../../../tools/srilm/bin/i686
fi
if [ -f $sdir/ngram-count ]; then
echo Using SRILM tools from $sdir
export PATH=$PATH:$sdir
else
echo You do not appear to have SRILM tools installed, either on your
echo path or in $sdir. See tools/install_srilm.sh for installation
echo instructions.
exit 1
fi
fi
fi
# Maps OOV words to the oov symbol.
oov=`cat $lang/oov.int`
oov_txt=`cat $lang/oov.txt`
N=`tree-info --print-args=false $model_dir/tree |\
grep "context-width" | awk '{print $NF}'`
......@@ -62,11 +97,25 @@ while read line; do
wdir=$graph_dir/sub_graphs/$uttid
mkdir -p $wdir
echo $words > $wdir/text
# Compiles G.fst
if [ $ngram_order -eq 1 ]; then
echo $words > $wdir/text
cat $wdir/text | utils/sym2int.pl --map-oov $oov -f 1- $lang/words.txt | \
utils/make_unigram_grammar.pl | fstcompile |\
fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
else
echo $words | awk -v voc=$lang/words.txt -v oov="$oov_txt" '
BEGIN{ while((getline<voc)>0) { invoc[$1]=1; } } {
for (x=1;x<=NF;x++) {
if (invoc[$x]) { printf("%s ", $x); } else { printf("%s ", oov); } }
printf("\n"); }' > $wdir/text
ngram-count -text $wdir/text -order $ngram_order "$srilm_options" -lm - |\
arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\
fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
--keep_isymbols=false --keep_osymbols=false |\
fstrmepsilon | fstarcsort --sort_type=ilabel > $wdir/G.fst || exit 1;
fi
fstisstochastic $wdir/G.fst || echo "$0: $uttid/G.fst not stochastic."
# Builds LG.fst
......
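To make the new SRILM branch concrete, here is a stand-alone sketch of the same bigram G.fst construction for a single made-up utterance; the sentence is illustrative only, and it assumes every word is already in $lang/words.txt (the script above first maps OOV words to the symbol in $lang/oov.txt):

# Build a bigram LM over one utterance's words and compile it into G.fst,
# mirroring the pipeline in the higher-order branch above.
echo "THIS IS A TEST SENTENCE" > text
ngram-count -text text -order 2 -wbdiscount -lm - |\
  arpa2fst - | fstprint | utils/eps2disambig.pl | utils/s2eps.pl |\
  fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \
    --keep_isymbols=false --keep_osymbols=false |\
  fstrmepsilon | fstarcsort --sort_type=ilabel > G.fst
fstisstochastic G.fst || echo "G.fst is not stochastic."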
......@@ -15,7 +15,11 @@ echo "$0 $@"
. parse_options.sh || exit 1;
if [ $# -ne 2 ]; then
echo "This script truncates the long audio into smaller overlapping segments"
echo ""
echo "Usage: $0 [options] <input-dir> <output-dir>"
echo " e.g.: $0 data/train_si284_long data/train_si284_split"
echo ""
echo "Options:"
echo " --min-seg-length # minimal segment length"
echo " --seg-length # length of segments in seconds."
......
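A hedged example of running this segmentation step; the script path and the option values are assumptions, since only the usage message is visible in this hunk:

# Truncate the long recordings into overlapping segments (values illustrative):
steps/cleanup/split_long_utterance.sh --seg-length 30 --min-seg-length 5 \
  data/train_si284_long data/train_si284_split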