Commit 406ad668 authored by Hainan Xu
Browse files

Fixed some data preparation issues; Added --cmd options to train/decode scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4282 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent ac225741
...@@ -3,12 +3,13 @@ ...@@ -3,12 +3,13 @@
. path.sh . path.sh
if [ $# != 1 ]; then if [ $# != 2 ]; then
echo "Usage: hkust_data_prep.sh /path/to/HKUST" echo "Usage: hkust_data_prep.sh AUDIO_PATH TEXT_PATH"
exit 1; exit 1;
fi fi
HKUST_DIR=$1 HKUST_AUDIO_DIR=$1
HKUST_TEXT_DIR=$2
train_dir=data/local/train train_dir=data/local/train
dev_dir=data/local/dev dev_dir=data/local/dev
...@@ -23,14 +24,14 @@ mkdir -p $train_dir ...@@ -23,14 +24,14 @@ mkdir -p $train_dir
mkdir -p $dev_dir mkdir -p $dev_dir
#data directory check #data directory check
if [ ! -d $HKUST_DIR ]; then if [ ! -d $HKUST_AUDIO_DIR ] || [ ! -d $HKUST_TEXT_DIR ]; then
echo "Error: run.sh requires a directory argument" echo "Error: run.sh requires two directory arguments"
exit 1; exit 1;
fi fi
#find sph audio file for train dev resp. #find sph audio file for train dev resp.
find $HKUST_DIR -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist find $HKUST_AUDIO_DIR -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist
find $HKUST_DIR -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist find $HKUST_AUDIO_DIR -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist
n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
[ $n -ne 897 ] && \ [ $n -ne 897 ] && \
...@@ -40,7 +41,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` ...@@ -40,7 +41,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
#Transcriptions preparation #Transcriptions preparation
#collect all trans, convert encodings to utf-8, #collect all trans, convert encodings to utf-8,
find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\ find $HKUST_TEXT_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e ' iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) { while (<STDIN>) {
@A = split(" ", $_); @A = split(" ", $_);
...@@ -55,7 +56,7 @@ find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\ ...@@ -55,7 +56,7 @@ find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
} }
' | sort -k1 > $train_dir/transcripts.txt ' | sort -k1 > $train_dir/transcripts.txt
find $HKUST_DIR -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ find $HKUST_TEXT_DIR -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e ' iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) { while (<STDIN>) {
@A = split(" ", $_); @A = split(" ", $_);
...@@ -111,6 +112,10 @@ cat $dev_dir/transcripts.txt |\ ...@@ -111,6 +112,10 @@ cat $dev_dir/transcripts.txt |\
python local/hkust_segment.py |\ python local/hkust_segment.py |\
awk '{if (NF > 1) print $0;}' > $dev_dir/text awk '{if (NF > 1) print $0;}' > $dev_dir/text
# some data is corrupted. Delete them
cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp
mv tmp $train_dir/text
#Make segment files from transcript #Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.: #segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56 #sw02001-A_000098-001156 sw02001-A 0.98 11.56
......
...@@ -33,7 +33,7 @@ cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt ...@@ -33,7 +33,7 @@ cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt
# produce pronunciations for english # produce pronunciations for english
if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then
echo "--- Downloading CMU dictionary ..." echo "--- Downloading CMU dictionary ..."
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \ svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
$dict_dir/cmudict || exit 1; $dict_dir/cmudict || exit 1;
fi fi
......
...@@ -11,9 +11,9 @@ ...@@ -11,9 +11,9 @@
. cmd.sh . cmd.sh
# Data Preparation, # Data Preparation,
local/hkust_data_prep.sh /mnt/spdb/LDC2005S15 local/hkust_data_prep.sh /export/corpora/LDC/LDC2005S15/ /export/corpora/LDC/LDC2005T32/
# Lexicon Preparation, # Lexicon Preparation,
local/hkust_prepare_dict.sh local/hkust_prepare_dict.sh
...@@ -34,14 +34,14 @@ local/hkust_format_data.sh ...@@ -34,14 +34,14 @@ local/hkust_format_data.sh
# want to store MFCC features. # want to store MFCC features.
mfccdir=mfcc mfccdir=mfcc
for x in train dev; do for x in train dev; do
steps/make_mfcc.sh --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1; steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1; steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done done
# after this, the next command will remove the small number of utterances # after this, the next command will remove the small number of utterances
# that couldn't be extracted for some reason (e.g. too short; no such file). # that couldn't be extracted for some reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/train || exit 1; utils/fix_data_dir.sh data/train || exit 1;
steps/train_mono.sh --nj 10 \ steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/mono0a || exit 1; data/train data/lang exp/mono0a || exit 1;
...@@ -53,101 +53,102 @@ utils/mkgraph.sh --mono data/lang_test exp/mono0a exp/mono0a/graph || exit 1 ...@@ -53,101 +53,102 @@ utils/mkgraph.sh --mono data/lang_test exp/mono0a exp/mono0a/graph || exit 1
steps/decode.sh --config conf/decode.config --nj 10 \ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/mono0a/graph data/dev exp/mono0a/decode exp/mono0a/graph data/dev exp/mono0a/decode
# Get alignments from monophone system. # Get alignments from monophone system.
steps/align_si.sh --nj 10 \ steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/mono0a exp/mono_ali || exit 1; data/train data/lang exp/mono0a exp/mono_ali || exit 1;
# train tri1 [first triphone pass] # train tri1 [first triphone pass]
steps/train_deltas.sh \ steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; 2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
# decode tri1 # decode tri1
utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
steps/decode.sh --config conf/decode.config --nj 10 \ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/tri1/graph data/dev exp/tri1/decode exp/tri1/graph data/dev exp/tri1/decode
# align tri1 # align tri1
steps/align_si.sh --nj 10 \ steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1; data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
# train tri2 [delta+delta-deltas] # train tri2 [delta+delta-deltas]
steps/train_deltas.sh \ steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; 2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
# decode tri2 # decode tri2
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
steps/decode.sh --config conf/decode.config --nj 10 \ steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/tri2/graph data/dev exp/tri2/decode exp/tri2/graph data/dev exp/tri2/decode
# train and decode tri2b [LDA+MLLT] # train and decode tri2b [LDA+MLLT]
steps/align_si.sh --nj 10 \ steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1; data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT, # Train tri3a, which is LDA+MLLT,
steps/train_lda_mllt.sh \ steps/train_lda_mllt.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1; 2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1; utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \ steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri3a/graph data/dev exp/tri3a/decode exp/tri3a/graph data/dev exp/tri3a/decode
# From now, we start building a more serious system (with SAT), and we'll # From now, we start building a more serious system (with SAT), and we'll
# do the alignment with fMLLR. # do the alignment with fMLLR.
steps/align_fmllr.sh --nj 10 \ steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri3a exp/tri3a_ali || exit 1; data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh \ steps/train_sat.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1; 2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 10 --config conf/decode.config \ steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri4a/graph data/dev exp/tri4a/decode exp/tri4a/graph data/dev exp/tri4a/decode
steps/align_fmllr.sh --nj 10 \ steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri4a exp/tri4a_ali data/train data/lang exp/tri4a exp/tri4a_ali
# Building a larger SAT system. # Building a larger SAT system.
steps/train_sat.sh \ steps/train_sat.sh --cmd "$train_cmd" \
3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1; 3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1; utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
steps/decode_fmllr.sh --nj 10 --config conf/decode.config \ steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode || exit 1; exp/tri5a/graph data/dev exp/tri5a/decode || exit 1;
# MMI starting from system in tri5a. Use the same data (100k_nodup). # MMI starting from system in tri5a. Use the same data (100k_nodup).
# Later we'll use all of it. # Later we'll use all of it.
steps/align_fmllr.sh --nj 10 \ steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri5a exp/tri5a_ali || exit 1; data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
steps/make_denlats.sh --nj 10 --transform-dir exp/tri5a_ali \
steps/make_denlats.sh --cmd "$train_cmd" --nj 10 --transform-dir exp/tri5a_ali \
--config conf/decode.config \ --config conf/decode.config \
data/train data/lang exp/tri5a exp/tri5a_denlats || exit 1; data/train data/lang exp/tri5a exp/tri5a_denlats || exit 1;
steps/train_mmi.sh --boost 0.1 \ steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mmi_b0.1 || exit 1; data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mmi_b0.1 || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \ steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \ --transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mmi_b0.1/decode || exit 1 ; exp/tri5a/graph data/dev exp/tri5a_mmi_b0.1/decode || exit 1 ;
# Do MPE. # Do MPE.
steps/train_mpe.sh data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mpe || exit 1; steps/train_mpe.sh --cmd "$train_cmd" data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mpe || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \ steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \ --transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mpe/decode || exit 1 ; exp/tri5a/graph data/dev exp/tri5a_mpe/decode || exit 1 ;
# Do MCE. # Do MCE.
steps/train_mce.sh data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mce || exit 1; steps/train_mce.sh --cmd "$train_cmd" data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mce || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \ steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \ --transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mce/decode || exit 1 ; exp/tri5a/graph data/dev exp/tri5a_mce/decode || exit 1 ;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment