#!/bin/bash
### WORK IN PROGRESS ###
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
# IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

exit 1 # Don't run this... it's to be run line by line from the shell.

# Run the next command with the directory where the RM data is located
# (the argument below is just an example).  That directory should contain
# subdirectories named as follows:
#    rm1_audio1  rm1_audio2  rm2_audio
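
# Optional guard (not in the original recipe): confirm the expected RM layout
# before running the data prep step below.  RM_ROOT stands in for your local
# path.
RM_ROOT=/mnt/matylda2/data/RM
for d in rm1_audio1 rm1_audio2 rm2_audio; do
  [ -d "$RM_ROOT/$d" ] || echo "warning: missing $RM_ROOT/$d" >&2
done
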
local/RM_data_prep.sh /mnt/matylda2/data/RM/

local/RM_format_data.sh

# mfccdir should be some place with a largish disk where you
# want to store MFCC features. 
mfccdir=/mnt/matylda6/jhu09/qpovey/kaldi_rm_mfcc
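
# Hedged convenience check (not in the original): make_mfcc.sh writes feature
# archives into $mfccdir, so it should be an absolute path on a disk with
# enough space; warn early otherwise.
case "$mfccdir" in
  /*) mkdir -p "$mfccdir" || echo "warning: cannot create $mfccdir" >&2 ;;
  *)  echo "warning: mfccdir should be an absolute path" >&2 ;;
esac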

steps/make_mfcc.sh data/train exp/make_mfcc/train $mfccdir 4
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
  steps/make_mfcc.sh data/test_$test exp/make_mfcc/test_$test $mfccdir 4
done

scripts/subset_data_dir.sh data/train 1000 data/train.1k

# train monophone system.
steps/train_mono.sh data/train.1k data/lang exp/mono


local/decode.sh --mono steps/decode_deltas.sh exp/mono/decode


# Get alignments from monophone system.
steps/align_deltas.sh data/train data/lang exp/mono exp/mono_ali

# train tri1 [first triphone pass]
steps/train_deltas.sh data/train data/lang exp/mono_ali exp/tri1
# decode tri1
local/decode.sh steps/decode_deltas.sh exp/tri1/decode

# align tri1
steps/align_deltas.sh --graphs "ark,s,cs:gunzip -c exp/tri1/graphs.fsts.gz|" \
    data/train data/lang exp/tri1 exp/tri1_ali

# 2-level full-covariance training...
steps/train-2lvl.sh data/train data/lang exp/tri1_ali exp/tri1-2lvl 100 1024 1800 0 0 0

# train tri2a [delta + delta-delta features]
steps/train_deltas.sh data/train data/lang exp/tri1_ali exp/tri2a
# decode tri2a
local/decode.sh steps/decode_deltas.sh exp/tri2a/decode

# Train a classic semi-continuous model using {diag,full} densities
# the numeric parameters following exp/tri1-semi are: 
#   number of gaussians, something like 4096 for diag, 2048 for full
#   number of tree leaves 
#   type of suff-stats interpolation (0 regular, 1 preserves counts)
#   rho-stats, rho value for the smoothing of the statistics (0 for no smoothing)
#   rho-iters, rho value to interpolate the parameters with the last iteration (0 for no interpolation)
steps/train_semi_full.sh data/train data/lang exp/tri1_ali exp/tri1-semi 4096 1800 1 10 0
local/decode.sh steps/decode_tied_full.sh exp/tri1-semi
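
# The same training call with the positional parameters bound to names, as a
# hedged restatement (the names follow the comment block above; the command is
# left commented out since it duplicates the call above):
num_gauss=4096; num_leaves=1800; interp_type=1; rho_stats=10; rho_iters=0
# steps/train_semi_full.sh data/train data/lang exp/tri1_ali exp/tri1-semi \
#   $num_gauss $num_leaves $interp_type $rho_stats $rho_iters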

# Train a 2-lvl semi-continuous model using {diag,full} densities
# the numeric parameters following exp/tri1-2lvl are:
#   number of codebooks, typically 1-3 times number of phones, the more, the faster
#   total number of gaussians, something like 2048 for full, 4096 for diag
#   number of tree leaves
#   type of suff-stats interpolation (0 regular, 1 preserves counts)
#   rho-stats, rho value for the smoothing of the statistics (0 for no smoothing)
#   rho-iters, rho value to interpolate the parameters with the last iteration (0 for no interpolation)
steps/train_2lvl_full.sh data/train data/lang exp/tri1_ali exp/tri1-2lvl 104 2048 2500 0 1 10 0
local/decode.sh steps/decode_tied_full.sh exp/tri1-2lvl
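
# The first numeric argument above (104 codebooks) follows the 1-3x-phones
# guideline.  A hedged helper to sanity-check it against your lang directory
# (assumes phones.txt has one symbol per line, including <eps>):
num_phones=$(($(wc -l < data/lang/phones.txt) - 1))
echo "phones: $num_phones; suggested codebook range: $num_phones to $((3 * num_phones))"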

# train tri2b [LDA+MLLT]
steps/train_lda_mllt.sh data/train data/lang exp/tri1_ali exp/tri2b
# decode tri2b
local/decode.sh steps/decode_lda_mllt.sh exp/tri2b/decode

# Train and test ET (exponential transform).
steps/train_lda_et.sh data/train data/lang exp/tri1_ali exp/tri2c
scripts/mkgraph.sh data/lang_test exp/tri2c exp/tri2c/graph
local/decode.sh steps/decode_lda_et.sh exp/tri2c/decode

# Align all data with LDA+MLLT system (tri2b)
steps/align_lda_mllt.sh --graphs "ark,s,cs:gunzip -c exp/tri2b/graphs.fsts.gz|" \
   data/train data/lang exp/tri2b exp/tri2b_ali

#  Do MMI on top of LDA+MLLT.
steps/train_lda_etc_mmi.sh data/train data/lang exp/tri2b_ali exp/tri3a &
local/decode.sh steps/decode_lda_mllt.sh exp/tri3a/decode

# Do the same with boosting.
steps/train_lda_etc_mmi.sh --boost 0.05 data/train data/lang exp/tri2b_ali exp/tri3b &
local/decode.sh steps/decode_lda_mllt.sh exp/tri3b/decode


# Do LDA+MLLT+SAT
steps/train_lda_mllt_sat.sh data/train data/lang exp/tri2b_ali exp/tri3d
local/decode.sh steps/decode_lda_mllt_sat.sh exp/tri3d/decode

# Align all data with LDA+MLLT+SAT system (tri3d)
steps/align_lda_mllt_sat.sh --graphs "ark,s,cs:gunzip -c exp/tri3d/graphs.fsts.gz|" \
    data/train data/lang exp/tri3d exp/tri3d_ali

# MMI on top of that.
steps/train_lda_etc_mmi.sh data/train data/lang exp/tri3d_ali exp/tri4a &
local/decode.sh steps/decode_lda_mllt_sat.sh exp/tri4a/decode

# Try another pass on top of that.
steps/train_lda_mllt_sat.sh data/train data/lang exp/tri3d_ali exp/tri4d
scripts/mkgraph.sh data/lang_test exp/tri4d exp/tri4d/graph
local/decode.sh steps/decode_lda_mllt_sat.sh exp/tri4d/decode

# Next, SGMM system-- train SGMM system with speaker vectors, on top 
# of LDA+MLLT features.

steps/train_ubm_lda_etc.sh data/train data/lang exp/tri2b_ali exp/ubm3d
steps/train_sgmm_lda_etc.sh data/train data/lang exp/tri2b_ali exp/ubm3d/final.ubm exp/sgmm3d

scripts/mkgraph.sh data/lang_test exp/sgmm3d exp/sgmm3d/graph
local/decode.sh steps/decode_sgmm_lda_etc.sh exp/sgmm3d/decode

# Align LDA+ET system prior to training corresponding SGMM system.
steps/align_lda_et.sh --graphs "ark,s,cs:gunzip -c exp/tri2c/graphs.fsts.gz|" \
  data/train data/lang exp/tri2c exp/tri2c_ali 

# Train SGMM system on top of LDA+ET.
steps/train_ubm_lda_etc.sh data/train data/lang exp/tri2c_ali exp/ubm3e
steps/train_sgmm_lda_etc.sh data/train data/lang exp/tri2c_ali exp/ubm3e/final.ubm exp/sgmm3e

local/decode.sh steps/decode_sgmm_lda_etc.sh exp/sgmm3e/decode exp/tri2c/decode

# Now train SGMM system on top of LDA+MLLT+SAT
steps/train_ubm_lda_etc.sh data/train data/lang exp/tri3d_ali exp/ubm4f
steps/train_sgmm_lda_etc.sh data/train data/lang exp/tri3d_ali exp/ubm4f/final.ubm exp/sgmm4f

local/decode.sh steps/decode_sgmm_lda_etc.sh exp/sgmm4f/decode exp/tri3d/decode

# Decode with fMLLR
sgmm-comp-prexform exp/sgmm4f/final.{mdl,occs,fmllr_mdl}
local/decode.sh steps/decode_sgmm_lda_etc_fmllr.sh exp/sgmm4f/decode_fmllr exp/sgmm4f/decode exp/tri3d/decode


# Some system combination experiments (just compose lattices).
local/decode_combine.sh steps/decode_combine.sh exp/tri1/decode exp/tri2a/decode exp/combine_1_2a/decode
local/decode_combine.sh steps/decode_combine.sh exp/sgmm4f/decode/ exp/tri3d/decode exp/combine_sgmm4f_tri3d/decode

for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | scripts/best_wer.sh; done

exp/combine_1_2a/decode/wer_7:%WER 3.399027 [ 426 / 12533, 55 ins, 94 del, 277 sub ]
exp/combine_sgmm4f_tri3d/decode/wer_5:%WER 1.731429 [ 217 / 12533, 30 ins, 43 del, 144 sub ]
exp/mono/decode/wer_6:%WER 10.340701 [ 1296 / 12533, 95 ins, 391 del, 810 sub ]
exp/sgmm3d/decode/wer_5:%WER 2.267284 [ 284 / 12526, 38 ins, 51 del, 195 sub ]
exp/sgmm3e/decode/wer_6:%WER 2.122397 [ 266 / 12533, 37 ins, 51 del, 178 sub ]
exp/sgmm4f/decode/wer_4:%WER 1.795261 [ 225 / 12533, 45 ins, 37 del, 143 sub ]
exp/tri1/decode/wer_6:%WER 3.566584 [ 447 / 12533, 74 ins, 88 del, 285 sub ]
exp/tri2a/decode/wer_7:%WER 3.518711 [ 441 / 12533, 57 ins, 91 del, 293 sub ]
exp/tri2b/decode/wer_9:%WER 3.614458 [ 453 / 12533, 59 ins, 111 del, 283 sub ]
exp/tri2c/decode/wer_6:%WER 2.833653 [ 355 / 12528, 54 ins, 71 del, 230 sub ]
exp/tri3d/decode/wer_7:%WER 2.489428 [ 312 / 12533, 43 ins, 63 del, 206 sub ]
exp/tri4d/decode/wer_7:%WER 2.649007 [ 332 / 12533, 53 ins, 67 del, 212 sub ]

local/decode_combine.sh steps/decode_combine.sh exp/tri3d/decode exp/sgmm4f/decode exp/combine_tri3d_sgmm4f/decode

##### Below here is trash. #####

#steps/train_lda_mllt.sh.bak data/train data/train.1k data/lang exp/tri1 exp/tri2b_tmp

#scripts/subset_data_dir.sh data/train 800 data/train.800
#steps/train_lda_mllt.sh data/train data/train.800 data/lang exp/tri1_ali exp/tri2b_tmp2


scripts/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph
for test in mar87 oct87 feb89 oct89 feb91 sep92; do
  steps/decode_deltas.sh exp/tri1 data/test_$test data/lang exp/tri1/decode_$test &
done
wait
scripts/average_wer.sh exp/tri1/decode_?????/wer > exp/tri1/wer



scripts/mkgraph.sh --mono exp/mono/tree exp/mono/final.mdl exp/mono/graph



notes on structure...




scripts/ contains generic scripts
local/ contains more corpus-specific scripts
steps/ contains system-building steps...


data/local  contains temporary, local files
data/train
data/train.1k
data/lang  [note: could have separate dirs like this for different test sets]
data/test_feb89


local/RM_data_prep.sh


steps/train_mono.sh




exp/ contains experiments.
  [ Decode_dirs in subdir of exp. dir? ]


local_scripts/ contains the most RM-specific scripts. [used to create data_prep/]

scripts/ will contain generic scripts.

Stuff that's about the language:

lang/
  words.txt phones.txt silphones.csl nonsilphones.csl topo
  L.fst

maybe also, later:
  phonesets.txt [ phonesets used in building questions... if not supplied, use the "base phones" ]
  extra_questions.txt [ extra questions appended to automatically generated questions.  Should ask 
        questions that elicit information that's lost when we go to "phonesets.txt", e.g. about stress
        and position ]
  questions.txt [ if you supply the questions, this file should exist. ]
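
A hedged sanity check for this layout (not in the original notes): verify that
a lang directory has the required files before training, e.g.

for f in words.txt phones.txt silphones.csl nonsilphones.csl topo L.fst; do
  [ -f data/lang/$f ] || echo "missing data/lang/$f" >&2
done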


lang_test/
 words.txt phones.txt silphones.csl nonsilphones.csl topo  
 phones_disambig.txt L_disambig.fst G.fst


[for training:]
 phones.txt [for testing too?]
 phonesets.txt [ phonesets used in building questions... if not supplied, use the "base phones" ]
 extra_questions.txt [ extra questions appended to automatically generated questions.  Should ask 
        questions that elicit information that's lost when we go to "phonesets.txt", e.g. about stress
        and position ]
 questions.txt [ if you supply the questions, this file should exist. ]
 L.fst

 [for testing:]
 phones_disambig.txt
 L_disambig.fst
 G.fst

data/
 spk2utt
 utt2spk
 txt
 scp
 spk2gender
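
A hedged consistency check (the directory name is a hypothetical example):
utt2spk and spk2utt should describe the same set of speakers.

data=data/train
awk '{print $2}' $data/utt2spk | sort -u \
  | diff - <(awk '{print $1}' $data/spk2utt | sort -u) \
  || echo "utt2spk/spk2utt speaker mismatch in $data" >&2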


steps/train_mono.sh data.1h/ lang/ exp/mono

steps/train_tri1.sh exp/mono data.1h/ lang/ exp/tri1



# This script file cannot be run as-is; some paths in it need to be changed
# before you can run it.
# Search for /path/to.
# It is recommended that you do not invoke this file from the shell, but
# run the commands one by one, by hand.

# the step in data_prep/ will need to be modified for your system.

# First step is to do data preparation:
# This just creates some text files, it is fast.
# If not on the BUT system, you would have to change run.sh to reflect
# your own paths.
#

# Example arguments to run.sh: /mnt/matylda2/data/RM, /ais/gobi2/speech/RM, /cygdrive/e/data/RM
# RM is a directory with subdirectories rm1_audio1, rm1_audio2, rm2_audio
cd data_prep
#*** You have to change the pathname below.***
./run.sh /path/to/RM
cd ..

mkdir -p data
( cd data; cp ../data_prep/{train,test*}.{spk2utt,utt2spk} . ; cp ../data_prep/spk2gender.map . )

# This next step converts the lexicon, grammar, etc., into FST format.
steps/prepare_graphs.sh

# Next, make sure that "exp/" is someplace you can write a significant amount of
# data to (e.g. make it a link to a file on some reasonably large file system).
# If it doesn't exist, the scripts below will make the directory "exp".

# mfccdir should be set to some place with enough space to store the training
# MFCCs.  Make sure you create the directory.
# e.g.: mfccdir=/mnt/matylda6/jhu09/qpovey/kaldi_rm_mfccb
# Note: mfccdir should be an absolute pathname.
mfccdir=/path/to/mfccdir
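# Hedged convenience (not in the original): create the directory, as the note
# above instructs.
mkdir -p "$mfccdir"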
steps/make_mfcc_train.sh $mfccdir
steps/make_mfcc_test.sh $mfccdir

steps/train_mono.sh
steps/decode_mono.sh  &
steps/train_tri1.sh
(steps/decode_tri1.sh ; steps/decode_tri1_fmllr.sh; steps/decode_tri1_regtree_fmllr.sh ; steps/decode_tri1_latgen.sh) &

steps/train_tri2a.sh
(steps/decode_tri2a.sh ; steps/decode_tri2a_fmllr.sh; steps/decode_tri2a_fmllr_utt.sh ;
 steps/decode_tri2a_dfmllr.sh;  steps/decode_tri2a_dfmllr_fmllr.sh;  
 steps/decode_tri2a_dfmllr_utt.sh; 
)&


# Then do the same for 2b, 2c, and so on
# 2a = basic triphone (all features double-deltas unless stated).
# 2b = exponential transform
# 2c = mean normalization (cmn)
# 2d = MLLT
# 2e = splice-9-frames + LDA
# 2f = splice-9-frames + LDA + MLLT
# 2g = linear VTLN (+ regular VTLN); various decode scripts available.
# 2h = splice-9-frames + HLDA
# 2i = triple-deltas + HLDA
# 2j = triple-deltas + LDA + MLLT
# 2k = LDA + ET (equiv to LDA+MLLT+ET)
# 2l = splice-9-frames + LDA + MLLT + SAT (i.e. train with CMLLR)
# 2m = splice-9-frames + LDA + MLLT + LVTLN [depends on 2f]

for group in "b c d e" "f g h i" "j k l m"; do
  for x in $group; do
    steps/train_tri2$x.sh &
  done
  wait
  for x in $group; do
    for y in steps/decode_tri2$x*.sh; do
      $y
    done
  done
done


# To train and test SGMM systems:



# Note: if the SGMM decoding is too slow, then aside from adjusting the decoder
# beams and max-leaves, you can pass e.g. --full-gmm-nbest=5 to the
# sgmm-gselect program (the default is 15, so the maximum possible speedup
# from this setting is 3x).  For the best WER it should have the same value in
# training and test ("matched training"), but you can get the speed
# improvement by changing it just in test.  You can take it all the way down
# to 1 for the fastest speed, although this will degrade results.  (See the
# hedged example below.)
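
# A hedged illustration (model path and rspecifiers are hypothetical): running
# the Gaussian selection by hand with a smaller n-best, as described above.
# sgmm-gselect --full-gmm-nbest=5 exp/sgmma/final.mdl \
#   "ark:train_feats.ark" "ark:gselect.ark"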


steps/train_ubma.sh

(steps/train_sgmma.sh; steps/decode_sgmma.sh; steps/decode_sgmma_fmllr.sh;
 steps/decode_sgmma_fmllr_utt.sh; steps/train_sgmma_fmllrbasis.sh; 
 steps/decode_sgmma_fmllrbasis_utt.sh )&

# train and test system with speaker vectors.
(steps/train_sgmmb.sh; steps/decode_sgmmb.sh; steps/decode_sgmmb_fmllr.sh; steps/decode_sgmmb_utt.sh )&

# + gender dependency.
(steps/train_ubmb.sh; steps/train_sgmmc.sh; steps/decode_sgmmc.sh; steps/decode_sgmmc_fmllr.sh )&

# as sgmmb but with LDA+STC features.
(steps/train_ubmc.sh; steps/train_sgmmd.sh; steps/decode_sgmmd.sh; steps/decode_sgmmd_fmllr.sh )&

(steps/train_ubmd.sh; steps/train_sgmme.sh; steps/decode_sgmme.sh; steps/decode_sgmme_fmllr.sh;
  steps/decode_sgmme_latgen.sh )&