Commit 406ad668 authored by Hainan Xu
Browse files

Fixed some data preparation issues; Added --cmd options to train/decode scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4282 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent ac225741
......@@ -3,12 +3,13 @@
. path.sh
if [ $# != 1 ]; then
echo "Usage: hkust_data_prep.sh /path/to/HKUST"
if [ $# != 2 ]; then
echo "Usage: hkust_data_prep.sh AUDIO_PATH TEXT_PATH"
exit 1;
fi
HKUST_DIR=$1
HKUST_AUDIO_DIR=$1
HKUST_TEXT_DIR=$2
train_dir=data/local/train
dev_dir=data/local/dev
......@@ -23,14 +24,14 @@ mkdir -p $train_dir
mkdir -p $dev_dir
# Check that the supplied data directories exist
if [ ! -d $HKUST_DIR ]; then
echo "Error: run.sh requires a directory argument"
if [ ! -d $HKUST_AUDIO_DIR ] || [ ! -d $HKUST_TEXT_DIR ]; then
echo "Error: run.sh requires two directory arguments"
exit 1;
fi
# Find the sph audio files for the train and dev sets, respectively.
find $HKUST_DIR -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist
find $HKUST_DIR -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist
find $HKUST_AUDIO_DIR -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist
find $HKUST_AUDIO_DIR -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist
n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
[ $n -ne 897 ] && \
......@@ -40,7 +41,7 @@ n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l`
# Transcription preparation
# Collect all transcripts and convert their encoding from GBK to UTF-8.
find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
find $HKUST_TEXT_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
......@@ -55,7 +56,7 @@ find $HKUST_DIR -iname "*.txt" | grep -i "trans/train" | xargs cat |\
}
' | sort -k1 > $train_dir/transcripts.txt
find $HKUST_DIR -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
find $HKUST_TEXT_DIR -iname "*.txt" | grep -i "trans/dev" | xargs cat |\
iconv -f GBK -t utf-8 - | perl -e '
while (<STDIN>) {
@A = split(" ", $_);
......@@ -111,6 +112,10 @@ cat $dev_dir/transcripts.txt |\
python local/hkust_segment.py |\
awk '{if (NF > 1) print $0;}' > $dev_dir/text
# Some of the data is corrupted; remove the affected lines from the training text.
cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp
mv tmp $train_dir/text
#Make segment files from transcript
#segments file format is: utt-id side-id start-time end-time, e.g.:
#sw02001-A_000098-001156 sw02001-A 0.98 11.56
......
......@@ -33,7 +33,7 @@ cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt
# Produce pronunciations for English
if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then
echo "--- Downloading CMU dictionary ..."
svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
$dict_dir/cmudict || exit 1;
fi
......
......@@ -11,9 +11,9 @@
. cmd.sh
# Data Preparation,
local/hkust_data_prep.sh /mnt/spdb/LDC2005S15
local/hkust_data_prep.sh /export/corpora/LDC/LDC2005S15/ /export/corpora/LDC/LDC2005T32/
# Lexicon Preparation,
local/hkust_prepare_dict.sh
......@@ -34,14 +34,14 @@ local/hkust_format_data.sh
# want to store MFCC features.
mfccdir=mfcc
for x in train dev; do
steps/make_mfcc.sh --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir || exit 1;
done
# after this, the next command will remove the small number of utterances
# that couldn't be extracted for some reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/train || exit 1;
steps/train_mono.sh --nj 10 \
steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/mono0a || exit 1;
......@@ -53,101 +53,102 @@ utils/mkgraph.sh --mono data/lang_test exp/mono0a exp/mono0a/graph || exit 1
steps/decode.sh --config conf/decode.config --nj 10 \
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/mono0a/graph data/dev exp/mono0a/decode
# Get alignments from monophone system.
steps/align_si.sh --nj 10 \
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/mono0a exp/mono_ali || exit 1;
# train tri1 [first triphone pass]
steps/train_deltas.sh \
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;
# decode tri1
utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
steps/decode.sh --config conf/decode.config --nj 10 \
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/tri1/graph data/dev exp/tri1/decode
# align tri1
steps/align_si.sh --nj 10 \
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
# train tri2 [delta+delta-deltas]
steps/train_deltas.sh \
steps/train_deltas.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;
# decode tri2
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
steps/decode.sh --config conf/decode.config --nj 10 \
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
exp/tri2/graph data/dev exp/tri2/decode
# align tri2, in preparation for training tri3a [LDA+MLLT]
steps/align_si.sh --nj 10 \
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri2 exp/tri2_ali || exit 1;
# Train tri3a, which is LDA+MLLT,
steps/train_lda_mllt.sh \
steps/train_lda_mllt.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri3a/graph data/dev exp/tri3a/decode
# From now, we start building a more serious system (with SAT), and we'll
# do the alignment with fMLLR.
steps/align_fmllr.sh --nj 10 \
steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;
steps/train_sat.sh \
steps/train_sat.sh --cmd "$train_cmd" \
2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph
steps/decode_fmllr.sh --nj 10 --config conf/decode.config \
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri4a/graph data/dev exp/tri4a/decode
steps/align_fmllr.sh --nj 10 \
steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri4a exp/tri4a_ali
# Building a larger SAT system.
steps/train_sat.sh \
steps/train_sat.sh --cmd "$train_cmd" \
3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;
utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
steps/decode_fmllr.sh --nj 10 --config conf/decode.config \
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
exp/tri5a/graph data/dev exp/tri5a/decode || exit 1;
# MMI starting from system in tri5a. Use the same data (100k_nodup).
# Later we'll use all of it.
steps/align_fmllr.sh --nj 10 \
steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;
steps/make_denlats.sh --nj 10 --transform-dir exp/tri5a_ali \
steps/make_denlats.sh --cmd "$train_cmd" --nj 10 --transform-dir exp/tri5a_ali \
--config conf/decode.config \
data/train data/lang exp/tri5a exp/tri5a_denlats || exit 1;
steps/train_mmi.sh --boost 0.1 \
steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mmi_b0.1 || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mmi_b0.1/decode || exit 1 ;
# Do MPE.
steps/train_mpe.sh data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mpe || exit 1;
steps/train_mpe.sh --cmd "$train_cmd" data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mpe || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mpe/decode || exit 1 ;
# Do MCE.
steps/train_mce.sh data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mce || exit 1;
steps/train_mce.sh --cmd "$train_cmd" data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mce || exit 1;
steps/decode.sh --nj 10 --config conf/decode.config \
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
--transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev exp/tri5a_mce/decode || exit 1 ;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment