#!/bin/bash

# This recipe is based on the run_edin.sh recipe, by Arnab Ghoshal,
# in the s5/ directory.
# This is supposed to be the "new" version of the switchboard recipe,
# after the s5/ one became a bit messy.  It is not 100% checked-through yet.

#exit 1;
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
# Caution: some of the graph creation steps use quite a bit of memory, so you
# should run this on a machine that has sufficient memory.

. cmd.sh
. path.sh
set -e # exit on error
# mfccdir should be some place with a largish disk where you
# want to store MFCC features. 
mfccdir=mfcc
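# If the current directory sits on a small or slow partition, you can point
# this somewhere else instead, e.g. (illustrative path only, adjust for your
# site):
# mfccdir=/path/to/large/disk/swbd_mfcc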

# Prepare Switchboard data. This command can also take a second optional argument
# which specifies the directory containing the Switchboard documentation. If
# this argument is given, the script will look for the conv.tab file and correct
# the speaker IDs to the actual speaker personal identification numbers released
# in the documentation. The documentation can be found here:
# https://catalog.ldc.upenn.edu/docs/LDC97S62/
# Note: if you are using this link, make sure you rename conv_tab.csv to conv.tab
# after downloading.
# Usage: local/swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_docs]
local/swbd1_data_prep.sh /export/corpora3/LDC/LDC97S62
# local/swbd1_data_prep.sh /home/dpovey/data/LDC97S62
# local/swbd1_data_prep.sh /data/corpora0/LDC97S62
# local/swbd1_data_prep.sh /mnt/matylda2/data/SWITCHBOARD_1R2
# local/swbd1_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/switchboard1

local/swbd1_prepare_dict.sh

utils/prepare_lang.sh data/local/dict "<unk>" data/local/lang data/lang
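# Optionally sanity-check the newly created lang directory (this validation
# script ships with Kaldi's utils/ directory):
# utils/validate_lang.pl data/lang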

# Now train the language models. We are using SRILM and interpolating with an
# LM trained on the Fisher transcripts (the part 2 disk is currently missing,
# so only the part 1 transcripts, ~700hr, are used).

# If you have the Fisher data, you can set this "fisher_opt" variable.
fisher_opt="--fisher /export/corpora3/LDC/LDC2004T19/fe_03_p1_tran/"
#fisher_opt="--fisher /home/dpovey/data/LDC2004T19/fe_03_p1_tran/"
#fisher_opt="--fisher /data/corpora0/LDC2004T19/fe_03_p1_tran/"
# edinburgh:
# fisher_opt="--fisher /exports/work/inf_hcrc_cstr_general/corpora/fisher/transcripts"
# brno:
# fisher_opt="--fisher /mnt/matylda2/data/FISHER/fe_03_p1_tran" # BUT
local/swbd1_train_lms.sh $fisher_opt \
  data/local/train/text data/local/dict/lexicon.txt data/local/lm
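
# Optional: check the perplexity of the interpolated LM with SRILM's "ngram"
# tool.  This assumes SRILM is on your PATH and that the LM training script
# leaves a held-out text file in data/local/lm (the exact filename below is an
# assumption):
# ngram -order 3 -unk -lm data/local/lm/sw1_fsh.o3g.kn.gz -ppl data/local/lm/heldout
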
# We don't really need all these options for SRILM, since the LM training script
# does some of the same processing (e.g. -subset -tolower).
srilm_opts="-subset -prune-lowprobs -unk -tolower -order 3"
LM=data/local/lm/sw1.o3g.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang $LM data/local/dict/lexicon.txt data/lang_sw1_tg

LM=data/local/lm/sw1_fsh.o3g.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang $LM data/local/dict/lexicon.txt data/lang_sw1_fsh_tg

# For some funny reason we are still using IRSTLM for doing LM pruning :)
export PATH=$PATH:../../../tools/irstlm/bin/
prune-lm --threshold=1e-7 data/local/lm/sw1_fsh.o3g.kn.gz /dev/stdout \
  | gzip -c > data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz || exit 1
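
# To see how much the pruning shrank the LM, you can compare the n-gram counts
# in the ARPA headers of the two files (purely informational):
# gunzip -c data/local/lm/sw1_fsh.o3g.kn.gz | head -5
# gunzip -c data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz | head -5
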
LM=data/local/lm/sw1_fsh.o3g.pr1-7.kn.gz
utils/format_lm_sri.sh --srilm-opts "$srilm_opts" \
  data/lang $LM data/local/dict/lexicon.txt data/lang_sw1_fsh_tgpr


# Data preparation and formatting for eval2000 (note: the "text" file
# is not heavily preprocessed; for actual WER reporting we'll use
# sclite).

# local/eval2000_data_prep.sh /data/corpora0/LDC2002S09/hub5e_00 /data/corpora0/LDC2002T43
# local/eval2000_data_prep.sh /mnt/matylda2/data/HUB5_2000/ /mnt/matylda2/data/HUB5_2000/2000_hub5_eng_eval_tr
# local/eval2000_data_prep.sh /exports/work/inf_hcrc_cstr_general/corpora/switchboard/hub5/2000 /exports/work/inf_hcrc_cstr_general/corpora/switchboard/hub5/2000/transcr
# local/eval2000_data_prep.sh /home/dpovey/data/LDC2002S09/hub5e_00 /home/dpovey/data/LDC2002T43
local/eval2000_data_prep.sh /export/corpora2/LDC/LDC2002S09/hub5e_00 /export/corpora2/LDC/LDC2002T43

steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" data/train exp/make_mfcc/train $mfccdir
steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train $mfccdir 

# Remove the small number of utterances that couldn't be extracted for some 
# reason (e.g. too short; no such file).
utils/fix_data_dir.sh data/train 
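# A quick way to confirm the directory is now internally consistent (this
# validation script is part of Kaldi's utils/):
# utils/validate_data_dir.sh data/train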

# Create MFCCs for the eval set
steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/eval2000 exp/make_mfcc/eval2000 $mfccdir
steps/compute_cmvn_stats.sh data/eval2000 exp/make_mfcc/eval2000 $mfccdir
utils/fix_data_dir.sh data/eval2000  # remove segments with problems

# Use the first 4k sentences as dev set.  Note: when we trained the LM, we used
# the 1st 10k sentences as dev set, so the 1st 4k won't have been used in the
# LM training data.   However, they will be in the lexicon, plus speakers
# may overlap, so it's still not quite equivalent to a test set.
utils/subset_data_dir.sh --first data/train 4000 data/train_dev # 5hr 6min
n=$[`cat data/train/segments | wc -l` - 4000]
utils/subset_data_dir.sh --last data/train $n data/train_nodev

# perl -ne '@A=split; $s+=($A[3]-$A[2]); END{$h=int($s/3600); $r=($s-$h*3600); $m=int($r/60); $r-=$m*60; printf "%.1f sec -- %d:%d:%.1f\n", $s, $h, $m, $r;}' data/local/train/segments
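# (The commented one-liner above just sums the segment durations, printing the
# total in seconds and as hours:minutes:seconds.)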


# Now-- there are 260k utterances (313hr 23min), and we want to start the 
# monophone training on relatively short utterances (easier to align), but not 
# only the shortest ones (mostly uh-huh).  So take the 100k shortest ones;
# remove most of the repeated utterances (these are the uh-huh type ones), and 
# then take 10k random utterances from those (about 4hr 40mins)

utils/subset_data_dir.sh --shortest data/train_nodev 100000 data/train_100kshort
local/remove_dup_utts.sh 10 data/train_100kshort data/train_100kshort_nodup
utils/subset_data_dir.sh data/train_100kshort_nodup 10000 data/train_10k_nodup
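
# To see why the de-duplication step is needed, you can look at the most
# frequent transcripts in the short subset (purely informational):
# cut -d' ' -f2- data/train_100kshort/text | sort | uniq -c | sort -rn | head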

# Take the first 30k utterances (about 1/8th of the data)
utils/subset_data_dir.sh --first data/train_nodev 30000 data/train_30k
local/remove_dup_utts.sh 200 data/train_30k data/train_30k_nodup  # 33hr

# Take the first 100k utterances (just under half the data); we'll use
# this for later stages of training.
utils/subset_data_dir.sh --first data/train_nodev 100000 data/train_100k
local/remove_dup_utts.sh 200 data/train_100k data/train_100k_nodup  # 110hr

# Finally, the full training set:
local/remove_dup_utts.sh 300 data/train_nodev data/train_nodup  # 286hr
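# (If you want to verify the hour counts quoted in these comments, the duration
# one-liner shown earlier can be run on e.g. data/train_nodup/segments.)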

## Starting basic training on MFCC features
steps/train_mono.sh --nj 10 --cmd "$train_cmd" \
  data/train_10k_nodup data/lang exp/mono 

steps/align_si.sh --nj 30 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang exp/mono exp/mono_ali 

steps/train_deltas.sh --cmd "$train_cmd" \
  3200 30000 data/train_30k_nodup data/lang exp/mono_ali exp/tri1 

for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri1/graph_sw1_${lm_suffix}
    $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri1 $graph_dir
    steps/decode_si.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      $graph_dir data/eval2000 exp/tri1/decode_eval2000_sw1_${lm_suffix}
  ) &
done
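
# Once these background decodes finish, a quick way to check WER (mirroring the
# scoring loop at the bottom of this script):
# grep 'Percent Total Error' exp/tri1/decode_eval2000_sw1_tg/score_*/eval2000.ctm.filt.dtl | sort -k5 -g | head -1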

steps/align_si.sh --nj 30 --cmd "$train_cmd" \
  data/train_30k_nodup data/lang exp/tri1 exp/tri1_ali 

steps/train_deltas.sh --cmd "$train_cmd" \
  3200 30000 data/train_30k_nodup data/lang exp/tri1_ali exp/tri2 


for lm_suffix in tg fsh_tgpr; do
  (
    # The previous mkgraph (for tri1) might still be writing this file.  If it
    # is not running, you can remove this wait loop and the mkgraph below will
    # create the file itself.
    while [ ! -s data/lang_sw1_${lm_suffix}/tmp/CLG_3_1.fst ]; do sleep 60; done
    sleep 20; # in case still writing.
    graph_dir=exp/tri2/graph_sw1_${lm_suffix}
    $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri2 $graph_dir
    steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      $graph_dir data/eval2000 exp/tri2/decode_eval2000_sw1_${lm_suffix}
  ) &
done

# From this point we start building a bigger system (on train_100k_nodup, which
# has 110 hours of data), starting with the LDA+MLLT system.
steps/align_si.sh --nj 30 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang exp/tri2 exp/tri2_ali_100k_nodup 

# Train tri3b, which is LDA+MLLT, on 100k_nodup data.
steps/train_lda_mllt.sh --cmd "$train_cmd" \
  5500 90000 data/train_100k_nodup data/lang exp/tri2_ali_100k_nodup exp/tri3b 

for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri3b/graph_sw1_${lm_suffix}
    $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri3b $graph_dir
    steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      $graph_dir data/eval2000 exp/tri3b/decode_eval2000_sw1_${lm_suffix}
  ) &
done

# Train tri4a, which is LDA+MLLT+SAT, on 100k_nodup data.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang exp/tri3b exp/tri3b_ali_100k_nodup 

steps/train_sat.sh  --cmd "$train_cmd" \
  5500 90000 data/train_100k_nodup data/lang exp/tri3b_ali_100k_nodup \
   exp/tri4a 

for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri4a/graph_sw1_${lm_suffix}
    $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri4a $graph_dir
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
      $graph_dir data/eval2000 exp/tri4a/decode_eval2000_sw1_${lm_suffix}
  ) &
done


#local/run_resegment.sh

# Now train an LDA+MLLT+SAT model on the entire training data (train_nodup;
# 286 hours).
# Train tri4b, which is LDA+MLLT+SAT, on train_nodup data.
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri3b exp/tri3b_ali_nodup 

steps/train_sat.sh  --cmd "$train_cmd" \
  11500 200000 data/train_nodup data/lang exp/tri3b_ali_nodup exp/tri4b

for lm_suffix in tg fsh_tgpr; do
  (
    graph_dir=exp/tri4b/graph_sw1_${lm_suffix}
    $train_cmd $graph_dir/mkgraph.log \
      utils/mkgraph.sh data/lang_sw1_${lm_suffix} exp/tri4b $graph_dir
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
       $graph_dir data/eval2000 exp/tri4b/decode_eval2000_sw1_${lm_suffix}
    steps/decode_fmllr.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
       $graph_dir data/train_dev exp/tri4b/decode_train_dev_sw1_${lm_suffix}
  ) &
done
wait

steps/lmrescore.sh --mode 3 --cmd "$decode_cmd" \
  data/lang_sw1_fsh_tgpr data/lang_sw1_fsh_tg data/eval2000 \
  exp/tri4b/decode_eval2000_sw1_fsh_tgpr exp/tri4b/decode_eval2000_sw1_fsh_tg.3 || exit 1


# MMI training starting from the LDA+MLLT+SAT systems on both the 
# train_100k_nodup (110hr) and train_nodup (286hr) sets
steps/align_fmllr.sh --nj 50 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang exp/tri4a exp/tri4a_ali_100k_nodup || exit 1

steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri4b exp/tri4b_ali_nodup || exit 1

steps/make_denlats.sh --nj 50 --cmd "$decode_cmd" --config conf/decode.config \
  --transform-dir exp/tri4a_ali_100k_nodup \
  data/train_100k_nodup data/lang exp/tri4a exp/tri4a_denlats_100k_nodup

steps/make_denlats.sh --nj 100 --cmd "$decode_cmd" --config conf/decode.config \
  --transform-dir exp/tri4b_ali_nodup \
  data/train_nodup data/lang exp/tri4b exp/tri4b_denlats_nodup 

# 4 iterations of MMI seems to work well overall. The number of iterations is
# used as an explicit argument even though train_mmi.sh will use 4 iterations by
# default.
num_mmi_iters=4
steps/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --num-iters $num_mmi_iters \
  data/train_100k_nodup data/lang exp/tri4a_{ali,denlats}_100k_nodup \
  exp/tri4a_mmi_b0.1 

steps/train_mmi.sh --cmd "$decode_cmd" --boost 0.1 --num-iters $num_mmi_iters \
  data/train_nodup data/lang exp/tri4b_{ali,denlats}_nodup \
  exp/tri4b_mmi_b0.1 

for iter in 1 2 3 4; do
  for lm_suffix in tg fsh_tgpr; do
    (
      graph_dir=exp/tri4a/graph_sw1_${lm_suffix}
      decode_dir=exp/tri4a_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_${lm_suffix}
      steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
        --iter $iter --transform-dir exp/tri4a/decode_eval2000_sw1_${lm_suffix} \
        $graph_dir data/eval2000 $decode_dir
    ) &
  done
done

for iter in 1 2 3 4; do
  for lm_suffix in tg fsh_tgpr; do
    (
      graph_dir=exp/tri4b/graph_sw1_${lm_suffix}
      decode_dir=exp/tri4b_mmi_b0.1/decode_eval2000_${iter}.mdl_sw1_${lm_suffix}
      steps/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
        --iter $iter --transform-dir exp/tri4b/decode_eval2000_sw1_${lm_suffix} \
        $graph_dir data/eval2000 $decode_dir
    ) &
  done
done

#TODO(arnab): add lmrescore here
# ./steps/lmrescore.sh --mode 3 --cmd "$highmem_cmd" data/lang_sw1_fsh_tgpr data/lang_sw1_fsh_tg data/eval2000 exp/tri3a/decode_eval2000_sw1_fsh_tgpr exp/tri3a/decode_eval2000_sw1_fsh_tg.3 &

# Now do fMMI+MMI training
steps/train_diag_ubm.sh --silence-weight 0.5 --nj 50 --cmd "$train_cmd" \
  700 data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup exp/tri4a_dubm

steps/train_diag_ubm.sh --silence-weight 0.5 --nj 100 --cmd "$train_cmd" \
  700 data/train_nodup data/lang exp/tri4b_ali_nodup exp/tri4b_dubm

steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
  data/train_100k_nodup data/lang exp/tri4a_ali_100k_nodup exp/tri4a_dubm \
  exp/tri4a_denlats_100k_nodup exp/tri4a_fmmi_b0.1 

steps/train_mmi_fmmi.sh --learning-rate 0.005 --boost 0.1 --cmd "$train_cmd" \
  data/train_nodup data/lang exp/tri4b_ali_nodup exp/tri4b_dubm \
  exp/tri4b_denlats_nodup exp/tri4b_fmmi_b0.1  

for iter in 4 5 6 7 8; do
  for lm_suffix in tg fsh_tgpr; do
    (
      graph_dir=exp/tri4a/graph_sw1_${lm_suffix}
      decode_dir=exp/tri4a_fmmi_b0.1/decode_eval2000_it${iter}_sw1_${lm_suffix}
      steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
	--transform-dir exp/tri4a/decode_eval2000_sw1_${lm_suffix} \
	--config conf/decode.config $graph_dir data/eval2000 $decode_dir
    ) &
  done
done

for iter in 4 5 6 7 8; do
  for lm_suffix in tg fsh_tgpr; do
    (
      graph_dir=exp/tri4b/graph_sw1_${lm_suffix}
      decode_dir=exp/tri4b_fmmi_b0.1/decode_eval2000_it${iter}_sw1_${lm_suffix}
      steps/decode_fmmi.sh --nj 30 --cmd "$decode_cmd" --iter $iter \
	--transform-dir exp/tri4b/decode_eval2000_sw1_${lm_suffix} \
	--config conf/decode.config $graph_dir data/eval2000 $decode_dir
    ) &
  done
done

# local/run_sgmm2.sh

# demonstration script for raw-fMLLR.  You should probably ignore this.
# local/run_raw_fmllr.sh

# # Karel's DNN recipe on top of fMLLR features
# local/run_dnn.sh


# # Dan's nnet recipe
# # you might want to look into that script and run parts,
# # rather than just running the whole thing.
# local/run_nnet2.sh

# # Dan's nnet recipe with online decoding.
# local/online/run_nnet2.sh
# local/online/run_nnet2_baseline.sh


# # getting results (see RESULTS file)
# for x in 1 2 3b 4a 4b; do grep 'Percent Total Error' exp/tri$x/decode_eval2000_sw1_tg/score_*/eval2000.ctm.filt.dtl | sort -k5 -g | head -1; done
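# # A similar check over the discriminative (MMI/fMMI) decode directories
# # created above would be, e.g.:
# # for d in exp/tri4b_mmi_b0.1/decode_eval2000_*; do grep 'Percent Total Error' $d/score_*/eval2000.ctm.filt.dtl | sort -k5 -g | head -1; done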