#!/bin/bash

# Copyright 2012 Chao Weng
# Apache 2.0

#exit 1;
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
# Caution: some of the graph creation steps use quite a bit of memory, so you
# should run this on a machine that has sufficient memory.

# Load site settings; $train_cmd and $decode_cmd used by the steps below are
# expected to be defined there. The explicit ./ stops the shell from searching
# PATH for some other cmd.sh.
. ./cmd.sh

# Data preparation. The two arguments are the LDC2005S15 and LDC2005T32
# corpus directories; adjust the paths for your site.
local/hkust_data_prep.sh /export/corpora/LDC/LDC2005S15/ \
  /export/corpora/LDC/LDC2005T32/ || exit 1;

# Lexicon preparation.
local/hkust_prepare_dict.sh || exit 1;

# Phone sets, questions, L compilation.
utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang || exit 1;

# LM training.
local/hkust_train_lms.sh || exit 1;

# G compilation, check LG composition.
local/hkust_format_data.sh || exit 1;

# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
mfccdir=mfcc
for x in train dev; do
  # Feature extraction followed by CMVN statistics for each data set;
  # either failing aborts the script.
  steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 \
    "data/$x" "exp/make_mfcc/$x" "$mfccdir" || exit 1;
  steps/compute_cmvn_stats.sh "data/$x" "exp/make_mfcc/$x" "$mfccdir" || exit 1;
done
# after this, the next command will remove the small number of utterances
# that couldn't be extracted for some reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/train || exit 1;

# Train the monophone system (exp/mono0a).
steps/train_mono.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/mono0a || exit 1;

# Monophone decoding: build the decoding graph, then decode data/dev.
utils/mkgraph.sh --mono data/lang_test exp/mono0a exp/mono0a/graph || exit 1;
# NOTE(review): an older note here said that local/decode.sh runs once per
# test set and averages the WERs into exp/mono/decode/. This script calls
# steps/decode.sh and writes to exp/mono0a/decode, so that note looks stale;
# confirm before relying on it.
# The decode has no "|| exit 1": a failed decode does not stop the script.
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
  exp/mono0a/graph data/dev exp/mono0a/decode



# Get alignments from monophone system.
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/mono0a exp/mono_ali || exit 1;

# train tri1 [first triphone pass]
steps/train_deltas.sh --cmd "$train_cmd" \
  2500 20000 data/train data/lang exp/mono_ali exp/tri1 || exit 1;

# decode tri1
utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1;
# The decode has no "|| exit 1": a failed decode does not stop the script.
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
  exp/tri1/graph data/dev exp/tri1/decode



# align tri1
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;

# train tri2 [delta+delta-deltas]
steps/train_deltas.sh --cmd "$train_cmd" \
  2500 20000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1;

# decode tri2
# Graph construction is fatal on failure, like the other mkgraph calls in
# this script; the decode itself is not.
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph || exit 1;
steps/decode.sh --cmd "$decode_cmd" --config conf/decode.config --nj 10 \
  exp/tri2/graph data/dev exp/tri2/decode

# train and decode tri3a [LDA+MLLT]

# Align with tri2; these alignments feed the LDA+MLLT training below.
steps/align_si.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/tri2 exp/tri2_ali || exit 1;

# Train tri3a, which is LDA+MLLT,
steps/train_lda_mllt.sh --cmd "$train_cmd" \
  2500 20000 data/train data/lang exp/tri2_ali exp/tri3a || exit 1;

utils/mkgraph.sh data/lang_test exp/tri3a exp/tri3a/graph || exit 1;
# The decode has no "|| exit 1": a failed decode does not stop the script.
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  exp/tri3a/graph data/dev exp/tri3a/decode
# From now, we start building a more serious system (with SAT), and we'll
# do the alignment with fMLLR.

steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/tri3a exp/tri3a_ali || exit 1;

steps/train_sat.sh --cmd "$train_cmd" \
  2500 20000 data/train data/lang exp/tri3a_ali exp/tri4a || exit 1;

# Graph construction is fatal on failure, like the other mkgraph calls in
# this script; the decode itself is not.
utils/mkgraph.sh data/lang_test exp/tri4a exp/tri4a/graph || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  exp/tri4a/graph data/dev exp/tri4a/decode

# Align with tri4a. The training step below reads exp/tri4a_ali, so a failed
# alignment must stop the script here.
steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/tri4a exp/tri4a_ali || exit 1;

# Building a larger SAT system.

steps/train_sat.sh --cmd "$train_cmd" \
  3500 100000 data/train data/lang exp/tri4a_ali exp/tri5a || exit 1;

utils/mkgraph.sh data/lang_test exp/tri5a exp/tri5a/graph || exit 1;
steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  exp/tri5a/graph data/dev exp/tri5a/decode || exit 1;


# MMI starting from system in tri5a, using data/train.
# NOTE(review): the earlier comment here mentioned a "100k_nodup" subset and
# using all of the data later; no such subset appears in this script, so it
# looks like a leftover from another recipe. Confirm.
steps/align_fmllr.sh --cmd "$train_cmd" --nj 10 \
  data/train data/lang exp/tri5a exp/tri5a_ali || exit 1;

# Denominator lattices; these are also read by the MPE and MCE training
# commands later in this script.
steps/make_denlats.sh --cmd "$train_cmd" --nj 10 --transform-dir exp/tri5a_ali \
  --config conf/decode.config \
  data/train data/lang exp/tri5a exp/tri5a_denlats || exit 1;

steps/train_mmi.sh --cmd "$train_cmd" --boost 0.1 \
  data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mmi_b0.1 || exit 1;

# Decode with the tri5a graph, passing exp/tri5a/decode as --transform-dir.
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  --transform-dir exp/tri5a/decode \
  exp/tri5a/graph data/dev exp/tri5a_mmi_b0.1/decode || exit 1;

# Do MPE.
# Uses the tri5a alignments and denominator lattices (exp/tri5a_ali,
# exp/tri5a_denlats).
steps/train_mpe.sh --cmd "$train_cmd" \
  data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mpe || exit 1;

# Decode with the tri5a graph, passing exp/tri5a/decode as --transform-dir.
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  --transform-dir exp/tri5a/decode \
  exp/tri5a/graph data/dev exp/tri5a_mpe/decode || exit 1;
# Do MCE.
# Uses the tri5a alignments and denominator lattices (exp/tri5a_ali,
# exp/tri5a_denlats).
steps/train_mce.sh --cmd "$train_cmd" \
  data/train data/lang exp/tri5a_ali exp/tri5a_denlats exp/tri5a_mce || exit 1;

# Decode with the tri5a graph, passing exp/tri5a/decode as --transform-dir.
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  --transform-dir exp/tri5a/decode \
  exp/tri5a/graph data/dev exp/tri5a_mce/decode || exit 1;

# getting results (see RESULTS file)
# For every exp/*/decode directory, pipe the matching score lines through
# utils/best_wer.sh. stderr is discarded for the whole loop, presumably so
# that decode directories without score files stay quiet.
for x in exp/*/decode; do
  [ -d "$x" ] && grep Sum "$x"/score_*/*.sys | utils/best_wer.sh
done 2>/dev/null
for x in exp/*/decode; do
  [ -d "$x" ] && grep WER "$x"/cer_* | utils/best_wer.sh
done 2>/dev/null

# Every fatal step above has passed by the time we get here, so report
# success rather than a failure status.
exit 0;