Commit 7bb41d14 authored by Hainan Xu

Add gale_mandarin setup; fix a few things in the gale_arabic setup

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4560 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent e0e7519a
@@ -5,7 +5,7 @@
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine.)
export train_cmd="queue.pl"
export decode_cmd="queue.pl"
export cuda_cmd="run.pl"
export train_cmd="queue.pl -l 'arch=*64*'"
export decode_cmd="queue.pl -l 'arch=*64*'"
export cuda_cmd="queue.pl -l gpu=1"
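# If GridEngine is not available, a minimal local alternative (a sketch;
# run.pl executes jobs on this machine, so keep --nj / --num-jobs small):
#   export train_cmd="run.pl"
#   export decode_cmd="run.pl"
#   export cuda_cmd="run.pl"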
#!/bin/bash
# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely)
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
# This example script trains a DNN on top of fMLLR features.
# The training is done in 3 stages,
#
# 1) RBM pre-training:
# in this unsupervised stage we train a stack of RBMs,
# a good starting point for frame cross-entropy training.
# 2) frame cross-entropy training:
# the objective is to classify frames into the correct pdfs.
# 3) sequence-training optimizing sMBR:
# the objective is to emphasize state-sequences with better
# frame accuracy w.r.t. the reference alignment.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh ## Source the tools/utils (import the queue.pl)
# Config:
gmmdir=exp/tri4b
data_fmllr=data-fmllr-tri4b
stage=0 # resume training with --stage=N
# End of config.
. utils/parse_options.sh || exit 1;
#
#train DNN
mfcc_fmllr_dir=mfcc_fmllr
baseDir=exp/tri3b
alignDir=exp/tri3b_ali
dnnDir=exp/tri3b_dnn_2048x5
align_dnnDir=exp/tri3b_dnn_2048x5_ali
dnnLatDir=exp/tri3b_dnn_2048x5_denlats
dnnMPEDir=exp/tri3b_dnn_2048x5_smb
if [ $stage -le 0 ]; then
# generate fbank features
mkdir -p data_fbank
for set in train dev.conversational test.conversational dev.report test.report; do
[ ! -d data_fbank/${set} ] && cp -r data/${set} data_fbank/${set}
if [ ! -f data_fbank/${set}/feats.scp ]; then
( cd data_fbank/${set}; rm -rf split* cmvn.scp feats.scp; )
steps/make_fbank.sh --cmd "$train_cmd" --nj 8 data_fbank/${set} exp/make_fbank/${set} exp/fbank || exit 1;
steps/compute_cmvn_stats.sh data_fbank/${set} exp/make_fbank/${set} exp/fbank || exit 1;
fi
done
fi
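# A quick sanity check on the generated features (a sketch, assuming the
# standard Kaldi utilities are on the PATH via path.sh):
#   utils/validate_data_dir.sh data_fbank/train   # checks feats.scp/cmvn.scp consistency
#   feat-to-dim scp:data_fbank/train/feats.scp -  # prints the fbank dimension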
trainTr90=data/train_tr90
trainCV=data/train_cv10
exit 0 # everything below this line is skipped; remove this exit to run the remaining stages
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$cuda_cmd" \
--transform-dir $baseDir/decode data/test_fmllr data/test \
$baseDir $mfcc_fmllr_dir/log_test $mfcc_fmllr_dir || exit 1;
if [ $stage -le 1 ]; then
# Pre-train DBN, i.e. a stack of RBMs
dir=exp/dnn5b_pretrain-dbn
(tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log
$cuda_cmd $dir/log/pretrain_dbn.log \
steps/nnet/pretrain_dbn.sh --rbm-iter 3 $data_fmllr/train_si284 $dir || exit 1;
fi
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$cuda_cmd" \
--transform-dir $alignDir data/train_fmllr data/train \
$baseDir $mfcc_fmllr_dir/log_train $mfcc_fmllr_dir || exit 1;
utils/subset_data_dir_tr_cv.sh data/train_fmllr $trainTr90 $trainCV || exit 1;
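# subset_data_dir_tr_cv.sh splits the training data roughly 90/10 into a
# training part and a cross-validation part (used to monitor the nnet
# objective); a quick size check (sketch):
#   wc -l $trainTr90/utt2spk $trainCV/utt2spk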
if [ $stage -le 2 ]; then
# Train the DNN optimizing per-frame cross-entropy.
dir=exp/dnn5b_pretrain-dbn_dnn
ali=${gmmdir}_ali_si284
feature_transform=exp/dnn5b_pretrain-dbn/final.feature_transform
dbn=exp/dnn5b_pretrain-dbn/6.dbn
(tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log
# Train
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
$data_fmllr/train_si284_tr90 $data_fmllr/train_si284_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmmdir/graph_bd_tgpr $data_fmllr/test_dev93 $dir/decode_bd_tgpr_dev93 || exit 1;
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmmdir/graph_bd_tgpr $data_fmllr/test_eval92 $dir/decode_bd_tgpr_eval92 || exit 1;
fi
(tail --pid=$$ -F $dnnDir/train_nnet.log 2>/dev/null)&
$cuda_cmd $dnnDir/train_nnet.log \
steps/train_nnet.sh --hid-dim 2048 --hid-layers 5 --learn-rate 0.008 \
$trainTr90 $trainCV data/lang $alignDir $alignDir $dnnDir || exit 1;
steps/decode_nnet.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dnnDir/final.nnet --acwt 0.08 $baseDir/graph data/test_fmllr $dnnDir/decode || exit 1;
# Sequence training using the sMBR criterion; we do stochastic GD
# with per-utterance updates. An acwt of 0.1 is usually a good choice.
# Lattices are re-generated after the 1st epoch, to get faster convergence.
dir=exp/dnn5b_pretrain-dbn_dnn_smbr
srcdir=exp/dnn5b_pretrain-dbn_dnn
acwt=0.1
if [ $stage -le 3 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 100 --cmd "$train_cmd" \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 4 ]; then
# Re-train the DNN by 1 iteration of sMBR
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1; do
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph_bd_tgpr $data_fmllr/test_dev93 $dir/decode_bd_tgpr_dev93_it${ITER} || exit 1;
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph_bd_tgpr $data_fmllr/test_eval92 $dir/decode_bd_tgpr_eval92_it${ITER} || exit 1;
done
fi
#
steps/nnet/align.sh --nj $nJobs --cmd "$train_cmd" data/train_fmllr data/lang \
$dnnDir $align_dnnDir || exit 1;
# Re-generate lattices, run 4 more sMBR iterations
dir=exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats
srcdir=exp/dnn5b_pretrain-dbn_dnn_smbr
acwt=0.1
steps/nnet/make_denlats.sh --nj $nJobs --cmd "$train_cmd" --config conf/decode_dnn.config --acwt 0.1 \
data/train_fmllr data/lang $dnnDir $dnnLatDir || exit 1;
if [ $stage -le 5 ]; then
# Generate lattices and alignments:
steps/nnet/align.sh --nj 100 --cmd "$train_cmd" \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt 0.1 --do-smbr true \
data/train_fmllr data/lang $dnnDir $align_dnnDir $dnnLatDir $dnnMPEDir || exit 1;
if [ $stage -le 6 ]; then
# Re-train the DNN by 4 iterations of sMBR
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \
$data_fmllr/train_si284 data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2 3 4; do
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph_bd_tgpr $data_fmllr/test_dev93 $dir/decode_bd_tgpr_dev93_iter${ITER} || exit 1;
steps/nnet/decode.sh --nj 8 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph_bd_tgpr $data_fmllr/test_eval92 $dir/decode_bd_tgpr_eval92_iter${ITER} || exit 1;
done
fi
#decode
for n in 1 2 3 4 5 6; do
steps/decode_nnet.sh --nj $nDecodeJobs --cmd "$train_cmd" --config conf/decode_dnn.config \
--nnet $dnnMPEDir/$n.nnet --acwt 0.08 \
$baseDir/graph data/test_fmllr $dnnMPEDir/decode_test_it$n || exit 1;
done
# End of DNN
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
@@ -214,53 +214,7 @@ for n in 1 2 3 4; do
data/test exp/sgmm_5a/decode exp/sgmm_5a_mmi_onlyRescoreb0.1/decode$n
done
#train DNN
mfcc_fmllr_dir=mfcc_fmllr
baseDir=exp/tri3b
alignDir=exp/tri3b_ali
dnnDir=exp/tri3b_dnn_2048x5
align_dnnDir=exp/tri3b_dnn_2048x5_ali
dnnLatDir=exp/tri3b_dnn_2048x5_denlats
dnnMPEDir=exp/tri3b_dnn_2048x5_smb
trainTr90=data/train_tr90
trainCV=data/train_cv10
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$cuda_cmd" \
--transform-dir $baseDir/decode data/test_fmllr data/test \
$baseDir $mfcc_fmllr_dir/log_test $mfcc_fmllr_dir || exit 1;
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$cuda_cmd" \
--transform-dir $alignDir data/train_fmllr data/train \
$baseDir $mfcc_fmllr_dir/log_train $mfcc_fmllr_dir || exit 1;
utils/subset_data_dir_tr_cv.sh data/train_fmllr $trainTr90 $trainCV || exit 1;
(tail --pid=$$ -F $dnnDir/train_nnet.log 2>/dev/null)&
$cuda_cmd $dnnDir/train_nnet.log \
steps/train_nnet.sh --hid-dim 2048 --hid-layers 5 --learn-rate 0.008 \
$trainTr90 $trainCV data/lang $alignDir $alignDir $dnnDir || exit 1;
steps/decode_nnet.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode_dnn.config \
--nnet $dnnDir/final.nnet --acwt 0.08 $baseDir/graph data/test_fmllr $dnnDir/decode || exit 1;
#
steps/nnet/align.sh --nj $nJobs --cmd "$train_cmd" data/train_fmllr data/lang \
$dnnDir $align_dnnDir || exit 1;
steps/nnet/make_denlats.sh --nj $nJobs --cmd "$train_cmd" --config conf/decode_dnn.config --acwt 0.1 \
data/train_fmllr data/lang $dnnDir $dnnLatDir || exit 1;
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 6 --acwt 0.1 --do-smbr true \
data/train_fmllr data/lang $dnnDir $align_dnnDir $dnnLatDir $dnnMPEDir || exit 1;
#decode
for n in 1 2 3 4 5 6; do
steps/decode_nnet.sh --nj $nDecodeJobs --cmd "$train_cmd" --config conf/decode_dnn.config \
--nnet $dnnMPEDir/$n.nnet --acwt 0.08 \
$baseDir/graph data/test_fmllr $dnnMPEDir/decode_test_it$n || exit 1;
done
# End of DNN
local/run_dnn.sh
time=$(date +"%Y-%m-%d-%H-%M-%S")
#get WER
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine.)
export train_cmd="queue.pl -l 'arch=*64*'"
export decode_cmd="queue.pl -l 'arch=*64*'"
export cuda_cmd="queue.pl -l gpu=1"
AA A
AE A
AH A
AO UO
AW U
AY AI
B B
CH CH
D D
DH S I
EH AI
ER E
EY AI
F F
G G
HH H
IH I
IY I
JH ZH
K K
L L
M M
N N
NG N
OW UO
OY UO
P P
R R
S S
SH SH
T T
TH S
UH U
UW U
V W
W W
Y Y
Z Z
ZH X
beam=18.0 # beam for decoding. Was 13.0 in the scripts.
lattice_beam=10.0 # this has most effect on size of the lattices.
# No non-default options for now.
--sample-frequency=16000
--num-mel-bins=30
--use-energy=false # only non-default option.
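# These options are read by compute-fbank-feats; a sketch of passing the
# file explicitly (assuming it is saved as conf/fbank.conf):
#   steps/make_fbank.sh --fbank-config conf/fbank.conf --cmd "$train_cmd" --nj 8 \
#     data_fbank/train exp/make_fbank/train exp/fbank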
A AA
AI AY
AN AE N
ANG AE NG
AO AW
B B
CH CH
C T S
D D
E ER
EI EY
EN AH N
ENG AH NG
ER AA R
F F
G G
H HH
IA IY AA
IANG IY AE NG
IAN IY AE N
IAO IY AW
IE IY EH
I IY
ING IY NG
IN IY N
IONG IY UH NG
IU IY UH
J J
K K
L L
M M
N N
O AO
ONG UH NG
OU OW
P P
Q Q
R R
SH SH
S S
T T
UAI UW AY
UANG UW AE NG
UAN UW AE N
UA UW AA
UI UW IY
UN UW AH N
UO UW AO
U UW
UE IY EH
VE IY EH
V IY UW
VN IY N
W W
X X
Y Y
ZH JH
Z Z
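# The two tables above are phone maps: CMU phone -> pinyin-style unit, and
# pinyin unit -> CMU phone sequence (presumably used when building the
# Mandarin lexicon). A hypothetical lookup sketch (the map's file name is
# not shown in this diff; "pinyin2cmu" is assumed):
#   awk -v unit=ZH '$1 == unit {$1=""; sub(/^ /,""); print}' pinyin2cmu
#   # -> JH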
#!/bin/bash
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
if [ $# -ne 2 ]; then
echo "Arguments should be the <output folder> <data folder> "; exit 1
fi
# check that sox is installed
which sox &>/dev/null
if [[ $? != 0 ]]; then
echo "sox is not installed"
exit 1
fi
galeData=$1
wavedir=$galeData/wav
mkdir -p $wavedir
audio_path=$2
#copy and convert the flac to wav
find $audio_path -type f -name "*.flac" | while read file; do
f_name=$(basename $file)
if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then
echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" "
sox $file $wavedir/"${f_name%.flac}.wav"
fi
done
find $wavedir -name "*.wav" > $galeData/wav$$
awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$
paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp
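# The result is a standard Kaldi wav.scp with one "<recording-id> <path>"
# pair per line, e.g. (illustrative id):
#   SOME_SHOW_20070101 /path/to/gale/wav/SOME_SHOW_20070101.wav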
#clean
rm -fr $galeData/id$$ $galeData/wav$$
echo "data prep audio succeeded"
exit 0
#!/bin/bash
# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Apache 2.0
if [ $# -ne 1 ]; then
echo "Arguments should be the <gale folder>"; exit 1
fi
# data will be in data/local
galeData=$(readlink -f $1)
mkdir -p data/local
dir=$(readlink -f data/local)
cat $galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist
cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled
# we want about 6h dev data; 300 is manually chosen
cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev
cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list
# There is a problem with the text data: the same utt id can appear with
# different transcriptions, so we exclude all duplicated ids.
cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list
utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup
mv $galeData/all_nodup $galeData/all
utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > $galeData/all.dev
utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train
cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list
cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list
mkdir -p $dir/dev
mkdir -p $dir/train
utils/filter_scp.pl -f 1 $galeData/dev_utt_list $galeData/utt2spk > $dir/dev/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/dev/utt2spk | sort -u > $dir/dev/spk2utt
utils/filter_scp.pl -f 1 $galeData/train_utt_list $galeData/utt2spk > $dir/train/utt2spk
utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt
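# utt2spk and spk2utt are the usual Kaldi speaker maps: utt2spk holds one
# "<utt-id> <speaker-id>" pair per line, and spk2utt inverts it to
# "<speaker-id> <utt-id1> <utt-id2> ...".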
for x in dev train; do
outdir=$dir/$x
file=$galeData/all.$x
mkdir -p $outdir
awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
done
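# The files written above follow the standard Kaldi formats:
#   segments: <utt-id> <recording-id> <start-time> <end-time>
#   text:     <utt-id> <transcript>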
cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list
cat $dir/train/segments | awk '{print$2}' | sort -u > $galeData/train.wav.list
utils/filter_scp.pl -f 1 $galeData/dev.wav.list $galeData/wav.scp > $dir/dev/wav.scp
utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train/wav.scp
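# The awk below rebuilds train/wav.scp, keeping only recordings that
# actually occur (column 2) in the train segments file; the "seen" hash is
# filled while reading the segments file via getline.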
cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg) >0) {seen[$2]=1;}}
{if (seen[$1]) { print $0}}' > $dir/train/wav.scp
echo data prep split succeeded
#!/bin/bash
# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Apache 2.0
if [ $# -ne 2 ]; then
echo "Arguments should be the <gale folder> <txt data folder>"; exit 1
fi
export LC_ALL=C
galeData=$1
text=$2
cur=`pwd`
txtdir=$galeData/txt
mkdir -p $galeData/txt
cd $txtdir
find $text -type f -name "*.tdf" | while read file; do
sed '1,3d' $file # drop the 3-line tdf header
done > all.tmp
perl -e '
($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0];
open(IN, "$inFile");
open(ID, ">$idFile");
open(TXT, ">$txtFile");
open(SPK, ">$spk");
open(MAP, ">$mapf");
while (<IN>) {
@arr= split /\t/,$_;
$arr[4] =~ s/ //g;
$spkid = "$arr[0]_$arr[4]";
$spkfix = sprintf("%060s", $spkid);
# format start/end times to 3 decimal places; keep the raw values in
# $rStart/$rEnd and build compact forms (dot and leading zeros removed)
# for use inside the utterance id
$start=sprintf ("%0.3f",$arr[2]); $rStart=$start;
$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/;
$end=sprintf ("%0.3f",$arr[3]); $rEnd=$end;
$end=~s/^0+([^0])/$1/; $end=~s/\.//;
$id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n";
next if ($rStart == $rEnd);
$id =~ s/\.sph//g;
print ID $id;
print TXT "$arr[7]\n";
print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n";
print MAP "$arr[0] ${spkfix}_$arr[0]\n";
}' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"
perl -p -i -e 's=/.$==g' contentall.tmp
cd $cur
pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-packages
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "--- Downloading mmseg-1.3.0 ..."
echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
tar xf tools/mmseg-1.3.0.tar.gz -C tools
cd tools/mmseg-1.3.0
mkdir -p lib/python${pyver}/site-packages
python setup.py build
python setup.py install --prefix=.
cd ../..
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "mmseg is not found - installation failed?"
exit 1
fi
fi
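# A quick import check with the selected python (a sketch; the "mmseg"
# module is presumably what local/gale_segment.py uses for word
# segmentation):
#   python -c "import mmseg"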
cat $txtdir/contentall.tmp |\
sed -e 's/,//g' |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
perl -pe 's/<Event.*?>/ /g' |\
sed -e 's/\[NS\]//g' |\
sed -e 's/\[ns\]//g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
local/gale_normalize.pl | \
python local/gale_segment.py \
> $txtdir/text
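# Illustrative effect of the pipeline above on a hypothetical input line:
#   <foreign language="English"> OK </foreign> ((word)) [NS]
# would come out roughly as (after normalization and word segmentation):
#   OK word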
paste $txtdir/allid.tmp $txtdir/text | sed 's: $::' | awk '{if (NF>5) {print $0}}' > $txtdir/all_1.tmp
awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all
cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk
cat $txtdir/map.tmp | sort -u > $txtdir/../map
sort -c $txtdir/../utt2spk
utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt
rm -fr $txtdir
echo data prep text succeeded
#!/bin/bash
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
if [ -f path.sh ]; then
. path.sh
else
echo "missing path.sh"; exit 1;
fi
for dir in dev train; do
cp -pr data/local/$dir data/$dir
done
export LC_ALL=C
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
[ ! -f $arpa_lm ] && echo "No such file $arpa_lm" && exit 1;
rm -rf data/lang_dev
cp -r data/lang data/lang_dev
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in