Commit f1d92720 authored by Dan Povey
Browse files

trunk: merging sandbox/dan back to trunk. Includes addition of recipe for the...

trunk: merging sandbox/dan back to trunk.  Includes addition of recipe for the LibriSpeech corpus, and the capability to rescore lattices using ARPA language models that are too big to convert into FSTs.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4504 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 2aa9af84
......@@ -381,6 +381,7 @@
# /src/latbin/
/src/latbin/.depend.mk
/src/latbin/lattice-lmrescore-const-arpa
/src/latbin/lattice-prune
/src/latbin/lattice-rmali
/src/latbin/lattice-compose
......@@ -434,6 +435,9 @@
/src/lm/.depend.mk
/src/lm/lm-lib-test
# /src/lmbin/
/src/lmbin/arpa-to-const-arpa
# /src/matrix/
/src/matrix/.depend.mk
/src/matrix/Matrix.vcxproj
......@@ -821,6 +825,7 @@
/src/nnet2bin/nnet1-to-raw-nnet
/src/nnet2bin/raw-nnet-copy
/src/online2bin/apply-cmvn-online
/src/online2bin/online2-wav-nnet2-am-compute
/src/online2bin/compress-uncompress-speex
/src/online2bin/extend-wav-with-silence
/src/online2bin/ivector-extract-online2
......@@ -849,3 +854,6 @@
/src/online2bin/.depend.mk
/src/online2/.depend.mk
/src/ivector/.depend.mk
/egs/librispeech/s5/data
/egs/librispeech/s5/mfcc
/egs/librispeech/s5/exp
......@@ -29,7 +29,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
......
......@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -25,28 +25,28 @@ gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;
......@@ -46,7 +46,7 @@ gunzip -c $lmfile | \
utils/s2eps.pl | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $destdir/G.fst || exit 1
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true
exit 0
......@@ -44,7 +44,7 @@ gunzip -c $lmfile | \
fstcompile --isymbols=$langdir/words.txt \
--osymbols=$langdir/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrhocompose "$rho" - $destdir/rho.fst | \
fstrmepsilon > $destdir/G.fst || exit 1
fstrmepsilon | fstarcsort --sort_type=ilabel > $destdir/G.fst || exit 1
fstisstochastic $destdir/G.fst || true
......
......@@ -211,7 +211,7 @@ gunzip -c $gzipped_ARPA_LM | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $lang/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > $lang/G.fst || exit 1;
fstisstochastic $lang/G.fst
##################################################################
......
......@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -27,28 +27,28 @@ gunzip -c $lm_srcdir_3g/lm_pr6.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tgpr/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tgpr/G.fst
gunzip -c $lm_srcdir_3g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_tg/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_tg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_tg/G.fst
gunzip -c $lm_srcdir_4g/lm_unpruned.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fg/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fg/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fg/G.fst
gunzip -c $lm_srcdir_4g/lm_pr7.0.gz | \
arpa2fst - | fstprint | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$lang/words.txt \
--osymbols=$lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_bd_fgpr/G.fst || exit 1;
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_bd_fgpr/G.fst || exit 1;
fstisstochastic data/lang_test_bd_fgpr/G.fst
exit 0;
......@@ -24,7 +24,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
......@@ -59,7 +59,7 @@ for lm_suffix in bg tgpr tg bg_5k tgpr_5k tg_5k; do
utils/remove_oovs.pl $tmpdir/oovs_${lm_suffix}.txt | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=$test/words.txt \
--osymbols=$test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > $test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > $test/G.fst
fstisstochastic $test/G.fst
# The output is like:
# 9.14233e-05 -0.259833
......
......@@ -24,7 +24,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
......@@ -24,7 +24,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
......@@ -24,7 +24,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test_fsh/words.txt \
--osymbols=data/lang_test_fsh/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test_fsh/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test_fsh/G.fst
fstisstochastic data/lang_test_fsh/G.fst
......
......@@ -35,7 +35,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
......@@ -37,7 +37,7 @@ gunzip -c "$arpa_lm" | \
utils/remove_oovs.pl /dev/null | \
utils/eps2disambig.pl | utils/s2eps.pl | fstcompile --isymbols=data/lang_test/words.txt \
--osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon > data/lang_test/G.fst
fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
fstisstochastic data/lang_test/G.fst
......
The LibriSpeech corpus is a large (1000 hour) corpus of read English speech
derived from audiobooks in the LibriVox project, sampled at 16kHz. The
speakers' accents vary and are not marked, but the majority are US English. It is
available as a free download at http://www.openslr.org/12/. It was prepared
as a speech recognition corpus by Vassil Panayotov.
The recipe is in s5/.
# Settings that choose how jobs are launched.
#
# "queue.pl" uses qsub, and the options that follow it are options to qsub.
# If you have GridEngine installed, change these to a queue you have access
# to.  Otherwise, use "run.pl", which will run jobs locally (make sure your
# --num-jobs options are no more than the number of cpus on your machine).

# a) JHU cluster options
train_cmd='queue.pl -l arch=*64'
decode_cmd='queue.pl -l arch=*64,mem_free=2G,ram_free=2G'
mkgraph_cmd='queue.pl -l arch=*64,ram_free=4G,mem_free=4G'
big_memory_cmd='queue.pl -l arch=*64,ram_free=8G,mem_free=8G'
cuda_cmd='queue.pl -l gpu=1'
# Export everything in one go so child processes see the settings.
export train_cmd decode_cmd mkgraph_cmd big_memory_cmd cuda_cmd

# b) BUT cluster options
#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"

# c) run it locally...
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export cuda_cmd=run.pl
#export mkgraph_cmd=run.pl
# empty config, just use the defaults.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment