Commit 55c7917e authored by Jan Trmal
Browse files

Fixing one more recipe. The script add-start-end.sh is in bin/ as well, so...

Fixing one more recipe. The script add-start-end.sh is in bin/ as well, so using it from there, not from scripts/
parent c6ed8b61
......@@ -9,7 +9,18 @@
# data/train_si284, data/train_si84, etc.
. ./path.sh || exit 1;
export PATH=$KALDI_ROOT/tools/irstlm/bin:$PATH
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
srcdict=$1
newtext=$2
......@@ -119,7 +130,7 @@ if [ ! -f $lmdir/extra4.ngt ];
awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
# Envelop LM training data in context cues
$irstbin/add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
echo "Creating new binary ngram table $lmdir/extra4.ngt"
......
......@@ -14,6 +14,19 @@
. ./path.sh || exit 1;
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
echo "Preparing train and test data"
srcdir=$4
lmdir=$5
......@@ -25,13 +38,12 @@ lm_suffix=$3
mkdir -p $lmdir
mkdir -p $tmpdir
irstbin=$KALDI_ROOT/tools/irstlm/bin
#grep -P -v '^[\s?|\.|\!]*$' $lexicon | grep -v '^ *$' | \
#awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
# Envelop LM training data in context cues
$irstbin/add-start-end.sh < $lexicon | awk '{if(NF>=3){ printf("%s\n",$0); }}' > $lmdir/lm_input
add-start-end.sh < $lexicon | awk '{if(NF>=3){ printf("%s\n",$0); }}' > $lmdir/lm_input
wait
# Next, for each type of language model, create the corresponding FST
......@@ -40,10 +52,10 @@ wait
echo "Preparing language models for test"
# Create Ngram table
$irstbin/ngt -i=$lmdir/lm_input -n=$ngram -o=$lmdir/train${ngram}.ngt -b=yes
ngt -i=$lmdir/lm_input -n=$ngram -o=$lmdir/train${ngram}.ngt -b=yes
wait
# Estimate trigram and quadrigram models in ARPA format
$irstbin/tlm -tr=$lmdir/train${ngram}.ngt -n=$ngram -lm=wb -o=$lmdir/train${ngram}.arpa
tlm -tr=$lmdir/train${ngram}.ngt -n=$ngram -lm=wb -o=$lmdir/train${ngram}.arpa
wait
......@@ -76,4 +88,4 @@ utils/validate_lang.pl $test || exit 1;
echo "Succeeded in formatting data."
exit 0;
#rm -rf $tmpdir
#rm -f $ccs
\ No newline at end of file
#rm -f $ccs
export KALDI_ROOT=`pwd`/../../..
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
export LC_ALL=C
......@@ -190,7 +190,7 @@ fi
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \
gzip -c > $idir/train.gz
dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment