Commit 827a5d62 authored by Daniel Povey's avatar Daniel Povey
Browse files

Merge pull request #12 from jtrmal/irstlm-checks

Irstlm checks
parents 514e1ac1 55c7917e
export KALDI_ROOT=`pwd`/../../..
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
#export KALDI_ROOT=/home/dpovey/kaldi-trunk-test
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$PWD:$PATH
export LC_ALL=C
......
......@@ -12,6 +12,19 @@ exit 1;
. cmd.sh
. path.sh
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
# Data prep
# Here we make some Edinburgh-specific changes from the Kaldi recipe in
# trunk/egs/swbd/s5 (rev. 1841). The major differences are that everything is
......
......@@ -18,6 +18,19 @@ set -e # exit on error
# want to store MFCC features.
mfccdir=mfcc
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
# Prepare Switchboard data. This command can also take a second optional argument
# which specifies the directory to Switchboard documentations. Specifically, if
......@@ -54,6 +67,7 @@ local/swbd1_train_lms.sh data/local/train/text \
data/local/dict/lexicon.txt data/local/lm $fisher_dirs
# We don't really need all these options for SRILM, since the LM training script
# does some of the same processings (e.g. -subset -tolower)
for order in 3 4; do
lm_suffix="tg"
[ $order -eq 3 ] || lm_suffix="fg"
......@@ -66,7 +80,6 @@ for order in 3 4; do
utils/build_const_arpa_lm.sh $LM data/lang data/lang_sw1_fsh_$lm_suffix
# For some funny reason we are still using IRSTLM for doing LM pruning :)
export PATH=$PATH:../../../tools/irstlm/bin/
prune-lm --threshold=1e-7 data/local/lm/sw1_fsh.o${order}g.kn.gz /dev/stdout \
| gzip -c > data/local/lm/sw1_fsh.o${order}g.pr1-7.kn.gz || exit 1
LM=data/local/lm/sw1_fsh.o${order}g.pr1-7.kn.gz
......
......@@ -97,13 +97,30 @@ cut -f1 data/local/lexicon.txt \
# (4) Create the phone bigram LM
(
[ -z "$IRSTLM" ] && \
error_exit "LM building wo'nt work without setting the IRSTLM env variable"
cut -d' ' -f2- data/local/train.trans2 | sed -e 's:^:<s> :' -e 's:$: </s>:' \
> data/local/lm_train.txt
build-lm.sh -i data/local/lm_train.txt -n 2 -o data/local/lm_phone_bg.ilm.gz
compile-lm data/local/lm_phone_bg.ilm.gz --text yes /dev/stdout \
| grep -v unk | gzip -c > data/local/lm_phone_bg.arpa.gz
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
cut -d' ' -f2- $srcdir/text | sed -e 's:^:<s> :' -e 's:$: </s>:' \
> $srcdir/lm_train
cut -d' ' -f2- data/local/train.trans2 | sed -e 's:^:<s> :' -e 's:$: </s>:' \
> data/local/lm_train.txt
build-lm.sh -i data/local/lm_train.txt -n 2 \
-o data/local/lm_phone_bg.ilm.gz
compile-lm data/local/lm_phone_bg.ilm.gz --text yes /dev/stdout \
| grep -v unk | gzip -c > data/local/lm_phone_bg.arpa.gz
) >& data/prepare_lm.log
......
......@@ -3,6 +3,7 @@
# The KALDIROOT enviromnent variable must be set by the user.
# KALDIROOT=/absolute/path/to/kaldi/installation
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
KALDISRC=$KALDIROOT/src
KALDIBIN=$KALDISRC/bin:$KALDISRC/featbin:$KALDISRC/fgmmbin:$KALDISRC/fstbin
KALDIBIN=$KALDIBIN:$KALDISRC/gmmbin:$KALDISRC/latbin:$KALDISRC/nnetbin
......@@ -28,7 +29,6 @@ TOOLS=$SPH2PIPE
export PATH=$PATH:$KALDIBIN:$FSTBIN:$LMBIN:$SCRIPTS:$TOOLS
export LC_ALL=C
export IRSTLM=$KALDIROOT/tools/irstlm
## Site-specific configs for Edinburgh
# [ `hostname -y` == ecdf ] && \
......
......@@ -61,17 +61,26 @@ cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_))
>> $dir/extra_questions.txt || exit 1;
# (2) Create the phone bigram LM
[ -z "$IRSTLM" ] && \
echo "LM building won't work without setting the IRSTLM env variable" && exit 1;
! which build-lm.sh 2>/dev/null && \
echo "IRSTLM does not seem to be installed (build-lm.sh not on your path): " && \
echo "go to <kaldi-root>/tools and try 'make irstlm_tgt'" && exit 1;
cut -d' ' -f2- $srcdir/train.text | sed -e 's:^:<s> :' -e 's:$: </s>:' \
> $srcdir/lm_train.text
build-lm.sh -i $srcdir/lm_train.text -n 2 -o $tmpdir/lm_phone_bg.ilm.gz
compile-lm $tmpdir/lm_phone_bg.ilm.gz -t=yes /dev/stdout | \
grep -v unk | gzip -c > $lmdir/lm_phone_bg.arpa.gz
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
cut -d' ' -f2- $srcdir/train.text | sed -e 's:^:<s> :' -e 's:$: </s>:' \
> $srcdir/lm_train.text
build-lm.sh -i $srcdir/lm_train.text -n 2 \
-o $tmpdir/lm_phone_bg.ilm.gz
compile-lm $tmpdir/lm_phone_bg.ilm.gz -t=yes /dev/stdout | \
grep -v unk | gzip -c > $lmdir/lm_phone_bg.arpa.gz
echo "Dictionary & language model preparation succeeded"
export KALDI_ROOT=`pwd`/../../..
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
export LC_ALL=C
export IRSTLM=$KALDI_ROOT/tools/irstlm
......@@ -25,13 +25,25 @@ local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
cd $dir
# This version for SI-84
......
......@@ -17,15 +17,26 @@ local=`pwd`/local
utils=`pwd`/utils
. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
exit 1;
fi
cd $dir
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
cd $dir
# Make directory of links to the WSJ disks such as 11-13.1. This relies on the command
# line arguments being absolute pathnames.
rm -r links/ 2>/dev/null
......
......@@ -174,19 +174,31 @@ ngram -lm $sdir/srilm.o3g.pr7.kn.gz -ppl $sdir/cleaned.heldout
## From here is how to train with
# IRSTLM. This is not really working at the moment.
export IRSTLM=$KALDI_ROOT/tools/irstlm/
if [ -z $IRSTLM ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v prune-lm >/dev/null 2>&1 ; then
echo "$0: Error: the IRSTLM is not available or compiled" >&2
echo "$0: Error: We used to install it by default, but." >&2
echo "$0: Error: this is no longer the case." >&2
echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
echo "$0: Error: and run extras/install_irstlm.sh" >&2
exit 1
fi
idir=$dir/irstlm
mkdir $idir
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | $IRSTLM/scripts/add-start-end.sh | \
gunzip -c $srcdir/cleaned.gz | tail -n +$heldout_sent | add-start-end.sh | \
gzip -c > $idir/train.gz
$IRSTLM/bin/dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
dict -i=WSJ.cleaned.irstlm.txt -o=dico -f=y -sort=no
cat dico | gawk 'BEGIN{while (getline<"vocab.20k.nooov") v[$1]=1; print "DICTIONARY 0 "length(v);}FNR>1{if ($1 in v)\
{print $0;}}' > vocab.irstlm.20k
$IRSTLM/bin/build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
build-lm.sh -i "gunzip -c $idir/train.gz" -o $idir/lm_3gram.gz -p yes \
-n 3 -s improved-kneser-ney -b yes
# Testing perplexity with SRILM tools:
ngram -lm $idir/lm_3gram.gz -ppl $sdir/cleaned.heldout
......
export KALDI_ROOT=`pwd`/../../..
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$PWD:$PATH
export LC_ALL=C
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment