Commit 347f241e authored by Pawel Swietojanski
Browse files

sandbox/pawel: test for stm, gzipped vocabulary

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/pawel@4262 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 9b3ffde0
......@@ -25,6 +25,7 @@ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
#cat eddie_data/rt09.ami.ihmtrain09.v3.dct | sort > $dir/lexicon1_raw_nosil.txt
# limit the vocabulary to the predefined 50k words
# Decompress with -c so local/wordlist.50k.gz is left in place: a plain
# `gunzip` removes the archive (so a second run of this script fails) and
# refuses to overwrite an already-extracted local/wordlist.50k.
gunzip -c local/wordlist.50k.gz > local/wordlist.50k
# Keep only the lexicon entries whose first field appears in the word list.
join "$dir/lexicon1_raw_nosil.txt" local/wordlist.50k > "$dir/lexicon1_raw_nosil_50k.txt"
# Add prons for laughter, noise, oov
......
......@@ -21,6 +21,7 @@
# Default values for the LM-training options.
# order appears in the names of the LM files written further down
# (e.g. google.o${order}g.kn.gz).
order=3
# Optional extra text sources. The blocks visible below skip a source whose
# variable is empty (see the tests on $fisher and $web_sw); presumably the
# remaining ones are handled the same way -- TODO confirm.
fisher=""
swbd=""
google=""
web_sw=""
web_fsh=""
web_mtg=""
......@@ -120,6 +121,20 @@ if [ ! -z "$fisher" ]; then
num_lms=$[ num_lms + 1 ]
fi
# Optional LM component: download the LM published at cantabresearch.com,
# filter it with this recipe's word list and register the result for mixing.
# Runs when either $google (the option declared with the other source
# options) or $google1B (the name this block originally tested) is non-empty.
if [ -n "${google1B:-${google:-}}" ]; then
  mkdir -p "$dir/google"
  wget -O "$dir/google/cantab.lm3.bz2" http://vm.cantabresearch.com:6080/demo/cantab.lm3.bz2
  wget -O "$dir/google/150000.lex" http://vm.cantabresearch.com:6080/demo/150000.lex
  # Read the LM from the path wget just wrote; the previous argument
  # ($dir/google.cantab.lm3.bz3) named a file that is never created.
  ngram -unk -limit-vocab -vocab "$dir/wordlist" -lm "$dir/google/cantab.lm3.bz2" \
    -write-lm "$dir/google/google.o${order}g.kn.gz"
  # NOTE(review): nothing in this block writes $dir/google/ppl2; it has to be
  # produced before whatever consumes $mix_ppl runs -- confirm.
  mix_ppl="$mix_ppl $dir/google/ppl2"
  # Tag this component as google (was "_fsh", a leftover from the fisher block).
  mix_tag="${mix_tag}_google"
  mix_lms=("${mix_lms[@]}" "$dir/google/google.o${order}g.kn.gz")
  num_lms=$(( num_lms + 1 ))
fi
## The University of Washington conversational web data can be obtained as:
## wget --no-check-certificate http://ssli.ee.washington.edu/data/191M_conversational_web-filt+periods.gz
if [ ! -z "$web_sw" ]; then
......
......@@ -85,13 +85,16 @@ foreach $utt (sort keys(%utt2reco)) {
__END__
# Test example [also test it without the 0.5's]
# Manual check for utils/convert_ctm.pl, run from a directory where that
# relative path resolves. It creates four scratch files in the current
# directory and removes them at the end.
# One segment "utt" of recording "reco", with times 10.0 and 20.0.
echo utt reco 10.0 20.0 > segments
# Recording "reco" maps to file "file", channel "A".
echo reco file A > reco2file_and_channel
# Input CTM line keyed by utterance: time 8.0, duration 1.0, trailing 0.5.
echo utt 1 8.0 1.0 word 0.5 > ctm_in
# Expected output keyed by file/channel; 18.00 matches 8.0 plus the segment's
# 10.0 (presumably the segment start offset -- TODO confirm).
echo file A 18.00 1.00 word 0.5 > ctm_out
# Prints "error" when the script's output differs from ctm_out.
utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
rm segments reco2file_and_channel ctm_in ctm_out
# Test example
# Manual check for utils/convert2stm.pl. A sample line in the same layout as
# the expected output below (file, channel, speaker, two times, words):
# ES2011a.Headset-0 A AMI_ES2011a_H00_FEE041 34.27 37.14 HERE WE GO
# Build a minimal data directory holding the four files written below.
mkdir tmpdir
echo utt reco 10.0 20.0 > tmpdir/segments
echo utt word > tmpdir/text
echo reco file A > tmpdir/reco2file_and_channel
echo utt spk > tmpdir/utt2spk
# Expected STM line for the single utterance.
echo file A spk 10.0 20.00 word > stm_tst
# Prints "error" when the script's output differs from stm_tst.
utils/convert2stm.pl tmpdir | cmp - stm_tst || echo error
rm -r tmpdir stm_tst
......
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment