Commit 406b3348 authored by Chao Weng's avatar Chao Weng

hkust mandarin telephone speech (LDC2005S15) example

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1634 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent f521d7f2
About the HKUST Mandarin Telephone Speech corpus:
LDC2005S15 (speech): http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2005S15
LDC2005T32 (transcripts): http://www.ldc.upenn.edu/Catalog/catalogEntry.jsp?catalogId=LDC2005T32
# Character error rates on the dev set (the scoring tool prints a %WER header even when the tokens are characters):
exp/mono0a/decode/cer_9:%WER 80.54 [ 45228 / 56154, 1435 ins, 11484 del, 32309 sub ]
exp/tri1/decode/cer_12:%WER 60.34 [ 33881 / 56154, 2720 ins, 6019 del, 25142 sub ]
exp/tri2/decode/cer_12:%WER 59.69 [ 33521 / 56154, 2800 ins, 5618 del, 25103 sub ]
exp/tri3a/decode/cer_13:%WER 57.65 [ 32370 / 56154, 2535 ins, 5673 del, 24162 sub ]
exp/tri4a/decode/cer_12:%WER 53.02 [ 29774 / 56154, 2724 ins, 4791 del, 22259 sub ]
exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 sub ]
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine.
export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu"
#export train_cmd=run.pl
#export decode_cmd=run.pl
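These variables are consumed by the training and decoding scripts via their --cmd option. A rough sketch of typical usage (the script names and arguments below are illustrative, not part of this commit):

steps/train_mono.sh --nj 30 --cmd "$train_cmd" data/train data/lang exp/mono0a
steps/decode.sh --nj 10 --cmd "$decode_cmd" exp/mono0a/graph data/dev exp/mono0a/decode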
AA A
AE A
AH A
AO UO
AW U
AY AI
B B
CH CH
D D
DH S I
EH AI
ER E
EY AI
F F
G G
HH H
IH I
IY I
JH ZH
K K
L L
M M
N N
NG N
OW UO
OY UO
P P
R R
S S
SH SH
T T
TH S
UH U
UW U
V W
W W
Y Y
Z Z
ZH X
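The table above maps each CMU phone to one or more phones of the Chinese set (e.g. DH to the two phones S I), so English pronunciations can be rewritten in Chinese phones. A minimal awk sketch of how such a map could be applied to a lexicon; the file names here are hypothetical:

awk 'NR==FNR { m=$2; for (i=3; i<=NF; i++) m=m" "$i; map[$1]=m; next }
     { printf("%s", $1);
       for (i=2; i<=NF; i++) printf(" %s", ($i in map) ? map[$i] : $i);
       printf("\n") }' cmu2ch.map lexicon-en-phn.txt > lexicon-en-ch.txt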
beam=11.0 # beam for decoding. Was 13.0 in the scripts.
first_beam=8.0 # beam for 1st-pass decoding in SAT.
--use-energy=false # only non-default option.
--sample-frequency=8000 # HKUST is sampled at 8kHz
A AA
AI AY
AN AE N
ANG AE NG
AO AW
B B
CH CH
C T S
D D
E ER
EI EY
EN AH N
ENG AH NG
ER AA R
F F
G G
H HH
IA IY AA
IANG IY AE NG
IAN IY AE N
IAO IY AW
IE IY EH
I IY
ING IY NG
IN IY N
IONG IY UH NG
IU IY UH
J J
K K
L L
M M
N N
O AO
ONG UH NG
OU OW
P P
Q Q
R R
SH SH
S S
T T
UAI UW AY
UANG UW AE NG
UAN UW AE N
UA UW AA
UI UW IY
UN UW AH N
UO UW AO
U UW
UE IY EH
VE IY EH
V IY UW
VN IY N
W W
X X
Y Y
ZH JH
Z Z
B
C
CH
D
F
G
H
J
K
L
M
N
P
Q
R
S
SH
T
W
X
Y
Z
ZH
<Topology>
<TopologyEntry>
<ForPhones>
NONSILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.75 <Transition> 1 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.75 <Transition> 2 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 2 0.75 <Transition> 3 0.25 </State>
<State> 3 </State>
</TopologyEntry>
<TopologyEntry>
<ForPhones>
SILENCEPHONES
</ForPhones>
<State> 0 <PdfClass> 0 <Transition> 0 0.25 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 </State>
<State> 1 <PdfClass> 1 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 2 <PdfClass> 2 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 3 <PdfClass> 3 <Transition> 1 0.25 <Transition> 2 0.25 <Transition> 3 0.25 <Transition> 4 0.25 </State>
<State> 4 <PdfClass> 4 <Transition> 4 0.25 <Transition> 5 0.75 </State>
<State> 5 </State>
</TopologyEntry>
</Topology>
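For the non-silence entry, each emitting state has self-loop probability 0.75, so the number of frames spent in a state is geometric with mean

  E[frames per state] = 1 / (1 - 0.75) = 4,

giving an expected phone duration of about 3 x 4 = 12 frames, i.e. roughly 120 ms at the standard 10 ms frame shift. The silence entry allows more flexible durations by letting states 1-3 jump among each other.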
#!/bin/bash
. path.sh
if [ $# != 1 ]; then
echo "Usage: hkust_data_prep.sh /path/to/HKUST"
exit 1;
fi
HKUST_DIR=$1
train_dir=data/local/train
dev_dir=data/local/dev
case 0 in # no-op "goto" hack: code moved between '1)' and ';;' is skipped, handy when debugging
1)
;; # end of skipped region
esac
mkdir -p $train_dir
mkdir -p $dev_dir
#data directory check
if [ ! -d $HKUST_DIR ]; then
echo "Error: run.sh requires a directory argument"
exit 1;
fi
# find the sph audio files for train and dev, respectively
find $HKUST_DIR -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist
find $HKUST_DIR -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist
n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
[ $n -ne 897 ] && \
echo Warning: expected 897 data files, found $n
# Transcription preparation: collect all transcripts and convert encodings to UTF-8
find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $train_dir/transcripts.txt
find $HKUST_DIR -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
if (@A <= 1) { next; }
if ($A[0] eq "#") { $utt_id = $A[1]; }
if (@A >= 3) {
$A[2] =~ s:^([AB])\:$:$1:;
printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5;
for($n = 3; $n < @A; $n++) { print " $A[$n]" };
print "\n";
}
}
' | sort -k1 > $dev_dir/transcripts.txt
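For illustration (the content and naming below are made up; real HKUST transcripts may differ), a transcript line such as

  525.06 529.31 B: 我 觉得 还 好

following a '#' header line that names the conversation would be rewritten as

  UTTID-B-052506-052931 我 觉得 还 好

i.e. the start and end times are scaled to centiseconds, zero-padded to six digits, and folded into the utterance id together with the channel side.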
# transcript normalization and word segmentation (this needs external tools):
# download and configure the segmentation tools
pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-packages
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "--- Downloading mmseg-1.3.0 ..."
echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
tar xf tools/mmseg-1.3.0.tar.gz -C tools
cd tools/mmseg-1.3.0
mkdir -p lib/python${pyver}/site-packages
python setup.py build
python setup.py install --prefix=.
cd ../..
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "mmseg is not found - installation failed?"
exit 1
fi
fi
cat $train_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
local/hkust_normalize.pl |\
python local/hkust_segment.py |\
awk '{if (NF > 1) print $0;}' > $train_dir/text
cat $dev_dir/transcripts.txt |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
local/hkust_normalize.pl |\
python local/hkust_segment.py |\
awk '{if (NF > 1) print $0;}' > $dev_dir/text
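As an example of what the sed filters do (the line is made up), an input such as

  UTTID 那 ((我)) <noise>就</noise> 不知道

comes out as 'UTTID 那 我 就 不知道': double parentheses around uncertain words and the <noise>...</noise> tags are removed while their contents are kept, before normalization and word segmentation.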
#Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$train_dir/text > $train_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $train_dir/sph.flist > $train_dir/sph.scp
awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4];
print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments
awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp
sph2pipe=`cd ../../..; echo $PWD/tools/sph2pipe_v2.5/sph2pipe`
[ ! -f $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1;
cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
sort > $train_dir/wav.scp || exit 1;
cat $dev_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \
sort > $dev_dir/wav.scp || exit 1;
#side A - channel 1, side B - channel 2
# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious. Later it will be needed for ctm scoring.
cat $train_dir/wav.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $train_dir/reco2file_and_channel || exit 1;
cat $dev_dir/wav.scp | awk '{print $1}' | \
perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2\n"; ' \
> $dev_dir/reco2file_and_channel || exit 1;
cat $train_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $train_dir/utt2spk || exit 1;
cat $train_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $train_dir/spk2utt || exit 1;
cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir/utt2spk || exit 1;
cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1;
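The substr($1,1,33) above takes the recording name plus the channel side as the speaker id: for an HKUST-style name such as 20040503_210850_A000105_B000104 (31 characters), the first 33 characters of an utterance id like 20040503_210850_A000105_B000104-A-052506-052931 are exactly 20040503_210850_A000105_B000104-A, so each conversation side is treated as one speaker. (The example name is illustrative; the 33 must match the actual naming convention.)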
echo HKUST data preparation succeeded
exit 0;
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi
silprob=0.5
mkdir -p data/lang_test data/train data/dev
arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;
# Copy stuff into its final locations...
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/train/$f data/train/$f || exit 1;
done
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
cp data/local/dev/$f data/dev/$f || exit 1;
done
rm -r data/lang_test
cp -r data/lang data/lang_test
# grep -v '<s> <s>' etc. is only for future-proofing this script. Our
# LM doesn't have these "invalid combinations". These can cause
# determinization failures of CLG [ends up being epsilon cycles].
# Note: remove_oovs.pl takes a list of words in the LM that aren't in
# our word list. Since our LM doesn't have any, we just give it
# /dev/null [we leave it in the script to show how you'd do it].
gunzip -c "$arpa_lm" | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst - | fstprint | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic data/lang_test/G.fst
## Check lexicon.
## just have a look and make sure it seems sane.
echo "First few lines of lexicon FST:"
fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head
echo Performing further checks
# Checking that G.fst is determinizable.
fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
# Note: we do this with fstdeterminizestar not fstdeterminize, as
# fstdeterminize was taking forever (presumably related to a bug
# in this version of OpenFst that makes determinization slow for
# some cases).
fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
fstdeterminizestar >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \
fstisstochastic || echo LG is not stochastic
echo hkust_format_data succeeded.
#!/usr/bin/perl -w
# Copyright Chao Weng
# normalizations for the HKUST transcripts
# see the docs/trans-guidelines.pdf for details
while (<STDIN>) {
@A = split(" ", $_);
print "$A[0] ";
for ($n = 1; $n < @A; $n++) {
$a = $A[$n];
if (($a eq "{breath}")||($a eq "{cough}")||($a eq "{sneeze}")
|| ($a eq "{lipsmack}")) {print "[VOCALIZED-NOISE] "; next;}
if (($a eq "{laugh}")) {print "[LAUGHTER] "; next;}
if (($a eq "<noise>")) {print "[NOISE] "; next;}
$tmp = $a;
if ($tmp =~ /[^.,?+-]{0,}[.,?+-]+/) { $tmp =~ s:([^.,?+-]{0,})[.,?+-]+:$1:; }
if ($tmp =~ /\~[A-Z]/) { $tmp =~ s:\~([A-Z]):$1:; }
if ($tmp =~ /%\S/) { $tmp =~ s:%(\S):$1:; }
if ($tmp =~ /[a-zA-Z]/) {$tmp=uc($tmp);}
print "$tmp ";
}
print "\n";
}
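A quick illustrative check of the normalization (the input tokens are made up):

  echo 'UTTID {laugh} ~A %um ok.' | local/hkust_normalize.pl
  # -> UTTID [LAUGHTER] A UM OK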
#!/bin/bash
# prepare dictionary for HKUST.
# This is done separately for English and Chinese.
# For English we use the CMU dictionary, with Sequitur G2P
# for OOVs; the whole English phone set is converted to the
# Chinese phone set at the end. For Chinese we use an online
# dictionary (CEDICT); for OOVs we produce pronunciations by
# character-to-pinyin mapping.
. path.sh
[ $# != 0 ] && echo "Usage: local/hkust_prepare_dict.sh" && exit 1;
train_dir=data/local/train
dev_dir=data/local/dev
dict_dir=data/local/dict
mkdir -p $dict_dir
case 0 in # no-op "goto" hack: code moved between '1)' and ';;' is skipped, handy when debugging
1)
;; # end of skipped region
esac
# extract full vocabulary
cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\
sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\
grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt
# split into English and Chinese
cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt
cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt
# produce pronunciations for english
if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then
echo "--- Downloading CMU dictionary ..."
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
$dict_dir/cmudict || exit 1;
fi
echo "--- Striping stress and pronunciation variant markers from cmudict ..."
perl $dict_dir/cmudict/scripts/make_baseform.pl \
$dict_dir/cmudict/cmudict.0.7a /dev/stdout |\
sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $dict_dir/cmudict-plain.txt
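For instance, a cmudict entry along the lines of 'ABUSE(1) AH0 B Y UW1 Z' would come out as 'ABUSE AH B Y UW Z': make_baseform.pl drops the stress digits and the sed expression removes the '(1)' pronunciation-variant marker. (Entry quoted for illustration.)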
echo "--- Searching for English OOV words ..."
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
$dict_dir/cmudict-plain.txt $dict_dir/vocab-en.txt |\
egrep -v '<.?s>' > $dict_dir/vocab-en-oov.txt
gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
$dict_dir/vocab-en.txt $dict_dir/cmudict-plain.txt |\
egrep -v '<.?s>' > $dict_dir/lexicon-en-iv.txt
wc -l $dict_dir/vocab-en-oov.txt
wc -l $dict_dir/lexicon-en-iv.txt
pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages
if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
echo "--- Downloading Sequitur G2P ..."
echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!"
wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz
tar xf tools/g2p-r1668.tar.gz -C tools
cd tools/g2p
echo '#include <cstdio>' >> Utility.hh # won't compile on my system w/o this "patch"
python setup.py build
python setup.py install --prefix=.
cd ../..
if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
echo "Sequitur G2P is not found - installation failed?"
exit 1
fi
fi
if [ ! -f conf/g2p_model ]; then
echo "--- Downloading a pre-trained Sequitur G2P model ..."
wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model
if [ ! -f conf/g2p_model ]; then
echo "Failed to download the g2p model!"
exit 1
fi
fi
echo "--- Preparing pronunciations for OOV words ..."
python tools/g2p/lib/python${pyver}/site-packages/g2p.py \
--model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt
cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\
sort > $dict_dir/lexicon-en-phn.txt
# produce pronunciations for chinese
if [ ! -f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then
wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz
gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz
fi
cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\
perl -e '
while (<STDIN>) {
@A = split(" ", $_);
print $A[1];
for($n = 2; $n < @A; $n++) {
$A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:;
$tmp = uc($A[$n]);
print " $tmp";
}
print "\n";
}
' | sort -k1 > $dict_dir/ch-dict.txt
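For illustration, a CEDICT entry like

  中國 中国 [zhong1 guo2] /China/

becomes '中国 ZHONG1 GUO2': the awk keeps everything before the first '/', and the perl prints the simplified form followed by the pinyin syllables, uppercased and with the brackets stripped.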
echo "--- Searching for Chinese OOV words ..."
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
$dict_dir/ch-dict.txt $dict_dir/vocab-ch.txt |\
egrep -v '<.?s>' > $dict_dir/vocab-ch-oov.txt
gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
$dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\
egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt
wc -l $dict_dir/vocab-ch-oov.txt
wc -l $dict_dir/lexicon-ch-iv.txt
# unset LC_ALL so the user's LANG (usually a UTF-8 locale) takes effect and
# the perl/gawk below treat multi-byte Chinese characters as single characters
unset LC_ALL
# first make sure the number of characters and pinyins are equal
cat $dict_dir/ch-dict.txt |\
perl -e '
use encoding utf8;
while (<STDIN>) {
@A = split(" ", $_);
$word_len = length($A[0]);
$proun_len = @A - 1 ;
if ($word_len == $proun_len) {print $_;}
}
' > $dict_dir/ch-dict-1.txt
cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep -v '^$' > $dict_dir/ch-char.txt
cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt
wc -l $dict_dir/ch-char.txt
wc -l $dict_dir/ch-char-pinyin.txt
paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt
cat $dict_dir/ch-char-dict.txt |\
perl -e '
my $prev = "";
my $out_line = "";
while (<STDIN>) {
@A = split(" ", $_);
$cur = $A[0];
$cur_py = $A[1];
#print length($prev);
if (length($prev) == 0) { $out_line = $_; chomp($out_line);}
if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);}
if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";}
$prev = $cur;
}
print $out_line;
' > $dict_dir/ch-char-dict-1.txt
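For example, if the sorted character/pinyin pairs contain

  中 ZHONG1
  中 ZHONG4

the loop above merges them into the single line '中 ZHONG1/ZHONG4', i.e. all pronunciations of a character joined by '/'.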
cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \
'BEGIN{while((getline<w)>0) dict[$1]=$2;}
{printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \
> $dict_dir/lexicon-ch-oov.txt
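Continuing the example above, a hypothetical OOV word 中好 would get the line '中好 ZHONG1/ZHONG4 HAO3' here, with the '/'-separated alternatives presumably expanded by the steps that follow.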
cat $dict_dir/lexicon-ch-oov.txt |\