Commit 99f60c70 authored by Dan Povey

trunk: Merging sandbox/dan2.  Code and script changes RE my neural-network training setup, including refactored scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@2830 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parents 7c6eedfb af78be4c
@@ -3,119 +3,98 @@ for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; d
exit 0
# Monophone, MFCC+delta+accel
%WER 8.82 [ 1106 / 12533, 132 ins, 249 del, 725 sub ] exp/mono/decode/wer_3
%WER 8.58 [ 1075 / 12533, 137 ins, 230 del, 708 sub ] exp/mono/decode/wer_2
# MFCC+delta+accel
%WER 3.27 [ 410 / 12533, 59 ins, 85 del, 266 sub ] exp/tri1/decode/wer_6
%WER 3.41 [ 428 / 12533, 53 ins, 94 del, 281 sub ] exp/tri1/decode/wer_6
# MFCC+delta+accel (on top of better alignments, but didn't help).
%WER 3.23 [ 405 / 12533, 60 ins, 84 del, 261 sub ] exp/tri2a/decode/wer_6
# MFCC+delta+accel (on top of better alignments)
%WER 3.26 [ 409 / 12533, 54 ins, 87 del, 268 sub ] exp/tri2a/decode/wer_6
# LDA+MLLT
%WER 3.08 [ 386 / 12533, 41 ins, 89 del, 256 sub ] exp/tri2b/decode/wer_8
%WER 2.78 [ 349 / 12533, 52 ins, 66 del, 231 sub ] exp/tri2b/decode/wer_5
# Some MMI/MPE experiments (MMI, boosted MMI, MPE) on top of the LDA+MLLT system.
%WER 2.59 [ 325 / 12533, 49 ins, 55 del, 221 sub ] exp/tri2b_mmi_b0.05/decode_it3/wer_6
%WER 2.65 [ 332 / 12533, 55 ins, 48 del, 229 sub ] exp/tri2b_mmi_b0.05/decode_it4/wer_6
%WER 2.53 [ 317 / 12533, 42 ins, 57 del, 218 sub ] exp/tri2b_mmi/decode_it3/wer_7
%WER 2.67 [ 335 / 12533, 54 ins, 49 del, 232 sub ] exp/tri2b_mmi/decode_it4/wer_6
%WER 2.90 [ 364 / 12533, 48 ins, 68 del, 248 sub ] exp/tri2b_mpe/decode_it3/wer_7
%WER 2.88 [ 361 / 12533, 39 ins, 73 del, 249 sub ] exp/tri2b_mpe/decode_it4/wer_8
%WER 2.54 [ 318 / 12533, 57 ins, 45 del, 216 sub ] exp/tri2b_mmi/decode_it3/wer_5
%WER 2.66 [ 333 / 12533, 53 ins, 64 del, 216 sub ] exp/tri2b_mmi/decode_it4/wer_7
%WER 2.51 [ 314 / 12533, 58 ins, 45 del, 211 sub ] exp/tri2b_mmi_b0.05/decode_it3/wer_5
%WER 2.55 [ 319 / 12533, 56 ins, 54 del, 209 sub ] exp/tri2b_mmi_b0.05/decode_it4/wer_6
%WER 2.50 [ 313 / 12533, 39 ins, 65 del, 209 sub ] exp/tri2b_mpe/decode_it3/wer_7
%WER 2.50 [ 313 / 12533, 44 ins, 56 del, 213 sub ] exp/tri2b_mpe/decode_it4/wer_6
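# For reference, a rough sketch of how these discriminative systems are built on top
# of tri2b (a hedged example, not copied from this commit; the alignment and
# denominator-lattice directory names are assumptions):
#   steps/make_denlats.sh --nj 8 --cmd "$train_cmd" \
#     data/train data/lang exp/tri2b exp/tri2b_denlats
#   steps/train_mmi.sh --boost 0.05 --cmd "$train_cmd" \
#     data/train data/lang exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mmi_b0.05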
# LDA+MLLT+SAT
%WER 3.27 [ 410 / 12533, 37 ins, 113 del, 260 sub ] exp/tri3b/decode.si/wer_8
%WER 2.04 [ 256 / 12533, 34 ins, 47 del, 175 sub ] exp/tri3b/decode/wer_4
%WER 1.90 [ 238 / 12533, 26 ins, 50 del, 162 sub ] exp/tri3b/decode/wer_4
%WER 2.96 [ 371 / 12533, 54 ins, 62 del, 255 sub ] exp/tri3b/decode.si/wer_4 # This is the speaker-independent decoding pass.
# LDA+MLLT+SAT (on training set)
%WER 0.88 [ 307 / 34722, 44 ins, 67 del, 196 sub ] exp/tri3b/decode_train/wer_4
# Decoding tri3b with the unigram language model, which gives a higher WER.
%WER 10.27 [ 1287 / 12533, 119 ins, 205 del, 963 sub ] exp/tri3b/decode_ug/wer_13
%WER 13.47 [ 1688 / 12533, 172 ins, 258 del, 1258 sub ] exp/tri3b/decode_ug.si/wer_12
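# For reference, a hedged sketch of how the unigram decodes are produced: data/lang_ug
# comes from the unigram grammar script included in this commit; the graph and decode
# directory names below are assumptions:
#   utils/mkgraph.sh data/lang_ug exp/tri3b exp/tri3b/graph_ug
#   steps/decode_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
#     exp/tri3b/graph_ug data/test exp/tri3b/decode_ug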
# LDA+MLLT+SAT+MMI (on training set)
%WER 0.32 [ 112 / 34722, 12 ins, 26 del, 74 sub ] exp/tri3b_mmi/decode_train/wer_7
# LDA+MLLT+SAT+MMI (MMI on top of the SAT system)
%WER 3.27 [ 410 / 12533, 37 ins, 113 del, 260 sub ] exp/tri3b_mmi/decode.si/wer_8
%WER 1.87 [ 234 / 12533, 33 ins, 44 del, 157 sub ] exp/tri3b_mmi/decode/wer_6
%WER 2.96 [ 371 / 12533, 54 ins, 62 del, 255 sub ] exp/tri3b_mmi/decode.si/wer_4
%WER 1.73 [ 217 / 12533, 20 ins, 45 del, 152 sub ] exp/tri3b_mmi/decode/wer_7
%WER 1.84 [ 231 / 12533, 27 ins, 41 del, 163 sub ] exp/tri3b_mmi/decode2/wer_7 # with transforms from tri3b
# LDA+MLLT+SAT+fMMI (fMMI+MMI on top of the SAT system).  Various configurations.
# Note: it doesn't really help here. Probably not enough data.
%WER 1.89 [ 237 / 12533, 30 ins, 41 del, 166 sub ] exp/tri3b_fmmi_b/decode_it3/wer_7
%WER 2.13 [ 267 / 12533, 36 ins, 47 del, 184 sub ] exp/tri3b_fmmi_b/decode_it4/wer_8
%WER 2.11 [ 265 / 12533, 39 ins, 38 del, 188 sub ] exp/tri3b_fmmi_b/decode_it5/wer_7
%WER 2.18 [ 273 / 12533, 42 ins, 35 del, 196 sub ] exp/tri3b_fmmi_b/decode_it6/wer_7
%WER 2.29 [ 287 / 12533, 48 ins, 35 del, 204 sub ] exp/tri3b_fmmi_b/decode_it7/wer_7
%WER 2.51 [ 314 / 12533, 53 ins, 45 del, 216 sub ] exp/tri3b_fmmi_b/decode_it8/wer_8
%WER 1.87 [ 234 / 12533, 38 ins, 36 del, 160 sub ] exp/tri3b_fmmi_c/decode_it3/wer_4
%WER 1.90 [ 238 / 12533, 36 ins, 37 del, 165 sub ] exp/tri3b_fmmi_c/decode_it4/wer_5
%WER 1.82 [ 228 / 12533, 36 ins, 33 del, 159 sub ] exp/tri3b_fmmi_c/decode_it5/wer_4
%WER 1.76 [ 220 / 12533, 34 ins, 31 del, 155 sub ] exp/tri3b_fmmi_c/decode_it6/wer_5
%WER 1.81 [ 227 / 12533, 27 ins, 42 del, 158 sub ] exp/tri3b_fmmi_c/decode_it7/wer_8
%WER 1.82 [ 228 / 12533, 36 ins, 30 del, 162 sub ] exp/tri3b_fmmi_c/decode_it8/wer_5
%WER 1.99 [ 250 / 12533, 33 ins, 43 del, 174 sub ] exp/tri3b_fmmi_d/decode_it3/wer_7
%WER 2.12 [ 266 / 12533, 30 ins, 55 del, 181 sub ] exp/tri3b_fmmi_d/decode_it4/wer_9
%WER 2.08 [ 261 / 12533, 41 ins, 37 del, 183 sub ] exp/tri3b_fmmi_d/decode_it5/wer_6
%WER 2.16 [ 271 / 12533, 37 ins, 50 del, 184 sub ] exp/tri3b_fmmi_d/decode_it6/wer_7
%WER 2.22 [ 278 / 12533, 38 ins, 47 del, 193 sub ] exp/tri3b_fmmi_d/decode_it7/wer_7
%WER 2.35 [ 294 / 12533, 46 ins, 48 del, 200 sub ] exp/tri3b_fmmi_d/decode_it8/wer_7
# SGMM experiments.
%WER 1.69 [ 212 / 12533, 36 ins, 37 del, 139 sub ] exp/sgmm4a/decode/wer_3
%WER 1.70 [ 213 / 12533, 35 ins, 36 del, 142 sub ] exp/sgmm4a/decode_fmllr/wer_3
%WER 1.56 [ 196 / 12533, 30 ins, 34 del, 132 sub ] exp/sgmm4a_mmi_b0.2/decode_it1/wer_5
%WER 1.54 [ 193 / 12533, 31 ins, 32 del, 130 sub ] exp/sgmm4a_mmi_b0.2/decode_it2/wer_5
%WER 1.57 [ 197 / 12533, 31 ins, 31 del, 135 sub ] exp/sgmm4a_mmi_b0.2/decode_it3/wer_5
%WER 1.58 [ 198 / 12533, 32 ins, 32 del, 134 sub ] exp/sgmm4a_mmi_b0.2/decode_it4/wer_5
# Note: it doesn't really help here. Probably not enough data.
%WER 1.68 [ 210 / 12533, 26 ins, 35 del, 149 sub ] exp/tri3b_fmmi_b/decode_it3/wer_6
%WER 1.84 [ 231 / 12533, 35 ins, 31 del, 165 sub ] exp/tri3b_fmmi_b/decode_it4/wer_5
%WER 1.80 [ 226 / 12533, 31 ins, 35 del, 160 sub ] exp/tri3b_fmmi_b/decode_it5/wer_6
%WER 1.91 [ 239 / 12533, 39 ins, 35 del, 165 sub ] exp/tri3b_fmmi_b/decode_it6/wer_6
%WER 2.01 [ 252 / 12533, 20 ins, 52 del, 180 sub ] exp/tri3b_fmmi_b/decode_it7/wer_10
%WER 2.09 [ 262 / 12533, 33 ins, 43 del, 186 sub ] exp/tri3b_fmmi_b/decode_it8/wer_8
%WER 1.80 [ 226 / 12533, 30 ins, 38 del, 158 sub ] exp/tri3b_fmmi_c/decode_it3/wer_4
%WER 1.73 [ 217 / 12533, 28 ins, 38 del, 151 sub ] exp/tri3b_fmmi_c/decode_it4/wer_5
%WER 1.69 [ 212 / 12533, 24 ins, 38 del, 150 sub ] exp/tri3b_fmmi_c/decode_it5/wer_6
%WER 1.71 [ 214 / 12533, 24 ins, 37 del, 153 sub ] exp/tri3b_fmmi_c/decode_it6/wer_6
%WER 1.79 [ 224 / 12533, 31 ins, 37 del, 156 sub ] exp/tri3b_fmmi_c/decode_it7/wer_6
%WER 1.80 [ 226 / 12533, 37 ins, 31 del, 158 sub ] exp/tri3b_fmmi_c/decode_it8/wer_4
%WER 1.87 [ 234 / 12533, 20 ins, 45 del, 169 sub ] exp/tri3b_fmmi_d/decode_it3/wer_7
%WER 2.11 [ 265 / 12533, 29 ins, 47 del, 189 sub ] exp/tri3b_fmmi_d/decode_it4/wer_6
%WER 2.20 [ 276 / 12533, 37 ins, 48 del, 191 sub ] exp/tri3b_fmmi_d/decode_it5/wer_5
%WER 2.15 [ 270 / 12533, 17 ins, 69 del, 184 sub ] exp/tri3b_fmmi_d/decode_it6/wer_10
%WER 2.12 [ 266 / 12533, 14 ins, 71 del, 181 sub ] exp/tri3b_fmmi_d/decode_it7/wer_12
%WER 2.25 [ 282 / 12533, 17 ins, 65 del, 200 sub ] exp/tri3b_fmmi_d/decode_it8/wer_11
# Some "SGMM2" experiments. SGMM2 is a new version of the code that
# has tying of the substates a bit like "state-clustered tied mixture" systems;
# and which has speaker-dependent mixture weights.
# we don't any longer show the old SGMM results, although the script is still
# there, commented out.
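# For reference, a rough sketch of how an SGMM2 system like sgmm2_4a is built and
# decoded (a hedged example modelled on the sgmm2_4c commands later in this commit;
# the UBM size, the substate counts and the tri3b alignment/decode directories are
# assumptions, not copied from the actual sgmm2_4a script):
#   steps/train_ubm.sh --cmd "$train_cmd" 400 data/train data/lang exp/tri3b_ali exp/ubm4a
#   steps/train_sgmm2.sh --cmd "$train_cmd" 5000 7000 \
#     data/train data/lang exp/tri3b_ali exp/ubm4a/final.ubm exp/sgmm2_4a
#   utils/mkgraph.sh data/lang exp/sgmm2_4a exp/sgmm2_4a/graph
#   steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
#     --transform-dir exp/tri3b/decode exp/sgmm2_4a/graph data/test exp/sgmm2_4a/decode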
%WER 1.63 [ 204 / 12533, 19 ins, 40 del, 145 sub ] exp/sgmm2_4a/decode/wer_5
%WER 1.65 [ 207 / 12533, 26 ins, 33 del, 148 sub ] exp/sgmm2_4a/decode_fmllr/wer_3
%WER 1.50 [ 188 / 12533, 18 ins, 33 del, 137 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it1/wer_6
%WER 1.49 [ 187 / 12533, 18 ins, 32 del, 137 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it2/wer_6
%WER 1.48 [ 186 / 12533, 21 ins, 27 del, 138 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it3/wer_5
%WER 1.47 [ 184 / 12533, 21 ins, 26 del, 137 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it4/wer_5
%WER 1.58 [ 198 / 12533, 22 ins, 33 del, 143 sub ] exp/sgmm2_4a/decode/wer_4
%WER 1.60 [ 200 / 12533, 27 ins, 29 del, 144 sub ] exp/sgmm2_4a/decode_fmllr/wer_3
%WER 1.50 [ 188 / 12533, 24 ins, 24 del, 140 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it1/wer_4
%WER 1.47 [ 184 / 12533, 27 ins, 21 del, 136 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it2/wer_3
%WER 1.44 [ 181 / 12533, 26 ins, 22 del, 133 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it3/wer_3
%WER 1.44 [ 181 / 12533, 31 ins, 17 del, 133 sub ] exp/sgmm2_4a_mmi_b0.2/decode_it4/wer_2
# This is testing an option "--zero-if-disjoint true" to MMI-- no clear difference here.
%WER 1.50 [ 188 / 12533, 27 ins, 22 del, 139 sub ] exp/sgmm2_4a_mmi_b0.2_x/decode_it1/wer_3
%WER 1.49 [ 187 / 12533, 18 ins, 32 del, 137 sub ] exp/sgmm2_4a_mmi_b0.2_x/decode_it2/wer_6
%WER 1.48 [ 186 / 12533, 20 ins, 26 del, 140 sub ] exp/sgmm2_4a_mmi_b0.2_x/decode_it3/wer_5
%WER 1.45 [ 182 / 12533, 20 ins, 26 del, 136 sub ] exp/sgmm2_4a_mmi_b0.2_x/decode_it4/wer_5
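# For reference, the "_x" systems above would be produced with something like the
# following (a hedged sketch: only the --zero-if-disjoint flag is taken from the
# comment above; the script name and the other arguments are assumptions):
#   steps/train_mmi_sgmm2.sh --zero-if-disjoint true --boost 0.2 --cmd "$train_cmd" \
#     data/train data/lang exp/sgmm2_4a_ali exp/sgmm2_4a_denlats exp/sgmm2_4a_mmi_b0.2_x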
# Note: sgmm2x is sgmm2 excluding the speaker-dependent mixture weights.
# It doesn't make much difference.
%WER 1.63 [ 204 / 12533, 23 ins, 37 del, 144 sub ] exp/sgmm2x_4a/decode/wer_4
%WER 1.59 [ 199 / 12533, 22 ins, 37 del, 140 sub ] exp/sgmm2x_4a/decode_fmllr/wer_4
%WER 1.46 [ 183 / 12533, 24 ins, 24 del, 135 sub ] exp/sgmm2x_4a_mmi_b0.2/decode_it1/wer_4
%WER 1.44 [ 181 / 12533, 25 ins, 22 del, 134 sub ] exp/sgmm2x_4a_mmi_b0.2/decode_it2/wer_4
%WER 1.44 [ 180 / 12533, 26 ins, 21 del, 133 sub ] exp/sgmm2x_4a_mmi_b0.2/decode_it3/wer_4
%WER 1.43 [ 179 / 12533, 26 ins, 21 del, 132 sub ] exp/sgmm2x_4a_mmi_b0.2/decode_it4/wer_4
# Deep neural net -- hybrid system.
%WER 1.89 [ 237 / 12533, 28 ins, 62 del, 147 sub ] exp/tri4a1_nnet/decode/wer_6
%WER 0.59 [ 204 / 34722, 23 ins, 50 del, 131 sub ] exp/tri4a1_nnet/decode_train/wer_2
%WER 1.87 [ 234 / 12533, 34 ins, 45 del, 155 sub ] exp/tri4a1_mmi_a/decode/wer_4
%WER 1.85 [ 232 / 12533, 28 ins, 52 del, 152 sub ] exp/tri4a1_mmi_b/decode/wer_5
%WER 1.87 [ 234 / 12533, 28 ins, 52 del, 154 sub ] exp/tri4a1_mmi_c/decode/wer_5
%WER 0.51 [ 178 / 34722, 25 ins, 34 del, 119 sub ] exp/tri4a1_mmi_c/decode_train/wer_2
%WER 1.81 [ 227 / 12533, 26 ins, 49 del, 152 sub ] exp/tri4a1_mmi_d/decode/wer_5
%WER 0.54 [ 187 / 34722, 30 ins, 28 del, 129 sub ] exp/tri4a1_mmi_d/decode_train/wer_2
%WER 1.84 [ 231 / 12533, 39 ins, 41 del, 151 sub ] exp/tri4a1_mmi_e/decode/wer_4
%WER 0.51 [ 178 / 34722, 19 ins, 38 del, 121 sub ] exp/tri4a1_mmi_e/decode_train/wer_4
%WER 1.81 [ 227 / 12533, 37 ins, 37 del, 153 sub ] exp/tri4a1_mmi_e2/decode/wer_4
%WER 0.53 [ 184 / 34722, 14 ins, 47 del, 123 sub ] exp/tri4a1_mmi_e2/decode_train/wer_6
%WER 8.48 [ 1063 / 12533, 238 ins, 146 del, 679 sub ] exp/tri4a1_mmi_f/decode/wer_9
%WER 4.84 [ 1679 / 34722, 506 ins, 192 del, 981 sub ] exp/tri4a1_mmi_f/decode_train/wer_9
%WER 2.07 [ 260 / 12533, 51 ins, 29 del, 180 sub ] exp/tri4a1_mmi_g/decode/wer_3
%WER 0.61 [ 211 / 34722, 33 ins, 34 del, 144 sub ] exp/tri4a1_mmi_g/decode_train/wer_4
# Deep neural net -- various types of hybrid system.
%WER 2.02 [ 253 / 12533, 27 ins, 64 del, 162 sub ] exp/nnet4a/decode/wer_4
%WER 9.77 [ 1224 / 12533, 95 ins, 251 del, 878 sub ] exp/nnet4a/decode_ug/wer_9
%WER 1.68 [ 211 / 12533, 20 ins, 53 del, 138 sub ] exp/nnet4b/decode/wer_5
%WER 8.96 [ 1123 / 12533, 97 ins, 166 del, 860 sub ] exp/nnet4b/decode_ug/wer_8
%WER 1.71 [ 214 / 12533, 23 ins, 46 del, 145 sub ] exp/nnet4c/decode/wer_4
%WER 9.02 [ 1130 / 12533, 91 ins, 181 del, 858 sub ] exp/nnet4c/decode_ug/wer_8
# DNN systems (Karel)
# Note from Dan: these are from an older RESULTS file, as I did not rerun these
# experiments the last time I regenerated this file.
# Per-frame cross-entropy training
%WER 1.58 [ 198 / 12533, 28 ins, 44 del, 126 sub ] exp/tri3b_pretrain-dbn_dnn/decode/wer_3
# Sequence-based sMBR training
@@ -128,13 +107,8 @@ exit 0
# Some system combination experiments.
%WER 3.02 [ 378 / 12533, 58 ins, 68 del, 252 sub ] exp/combine_1_2a/decode/wer_4
%WER 1.64 [ 206 / 12533, 29 ins, 35 del, 142 sub ] exp/combine_4a_3b/decode/wer_2
%WER 1.60 [ 201 / 12533, 29 ins, 39 del, 133 sub ] exp/combine_4a_3b_fmmic5/decode/wer_4
%WER 1.58 [ 198 / 12533, 34 ins, 30 del, 134 sub ] exp/combine_4a_mmi_3b_fmmic5/decode/wer_3
%WER 1.58 [ 198 / 12533, 24 ins, 33 del, 141 sub ] exp/combine_sgmm2_4a_3b/decode/wer_2
%WER 1.54 [ 193 / 12533, 20 ins, 40 del, 133 sub ] exp/combine_sgmm2_4a_3b_fmmic5/decode/wer_5
%WER 1.49 [ 187 / 12533, 22 ins, 33 del, 132 sub ] exp/combine_sgmm2_4a_mmi_3b_fmmic5/decode/wer_5
%WER 1.60 [ 200 / 12533, 26 ins, 34 del, 140 sub ] exp/combine_sgmm2x_4a_3b/decode/wer_2
%WER 1.51 [ 189 / 12533, 23 ins, 34 del, 132 sub ] exp/combine_sgmm2x_4a_3b_fmmic5/decode/wer_4
%WER 1.48 [ 186 / 12533, 24 ins, 29 del, 133 sub ] exp/combine_sgmm2x_4a_mmi_3b_fmmic5/decode/wer_4
%WER 3.18 [ 398 / 12533, 60 ins, 75 del, 263 sub ] exp/combine_1_2a/decode/wer_4
%WER 1.56 [ 196 / 12533, 27 ins, 32 del, 137 sub ] exp/combine_sgmm2_4a_3b/decode/wer_2
%WER 1.53 [ 192 / 12533, 23 ins, 30 del, 139 sub ] exp/combine_sgmm2_4a_3b_fmmic5/decode/wer_4
%WER 1.47 [ 184 / 12533, 23 ins, 27 del, 134 sub ] exp/combine_sgmm2_4a_mmi_3b_fmmic5/decode/wer_4
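# For reference, the combination results above come from lattice-level combination of
# pairs of systems; a hedged sketch (the script name, argument order and the choice of
# decode directories are assumptions, not taken from this commit):
#   steps/decode_combine.sh data/test data/lang \
#     exp/sgmm2_4a/decode exp/tri3b/decode exp/combine_sgmm2_4a_3b/decode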
#!/bin/bash
. cmd.sh
# This example runs on top of "raw-fMLLR" features.
( steps/nnet2/train_tanh.sh --splice-width 7 \
--cleanup false \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--num-hidden-layers 2 \
--num-epochs-extra 10 --add-layers-period 1 \
--mix-up 4000 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3c_ali exp/nnet4a2 || exit 1
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --feat-type raw \
--transform-dir exp/tri3c/decode \
exp/tri3c/graph data/test exp/nnet4a2/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --feat-type raw \
--transform-dir exp/tri3c/decode_ug \
exp/tri3c/graph_ug data/test exp/nnet4a2/decode_ug
)
#!/bin/bash
stage=0
train_stage=-100
# This trains on unadapted (just cepstral-mean-normalized) features only,
# and uses various combinations of VTLN warping factor and time-warping
# factor to artificially expand the amount of training data.
. cmd.sh
. utils/parse_options.sh # to parse the --stage option, if given
[ $# != 0 ] && echo "Usage: local/run_4b.sh [--stage <stage> --train-stage <train-stage>]" && exit 1;
set -e
if [ $stage -le 0 ]; then
# Create the training data.
featdir=`pwd`/mfcc/nnet4b; mkdir -p $featdir
fbank_conf=conf/fbank_40.conf
echo "--num-mel-bins=40" > $fbank_conf
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" \
$fbank_conf $featdir exp/perturbed_fbanks data/train data/train_perturbed_fbank &
steps/nnet2/get_perturbed_feats.sh --cmd "$train_cmd" --feature-type mfcc \
conf/mfcc.conf $featdir exp/perturbed_mfcc data/train data/train_perturbed_mfcc &
wait
fi
if [ $stage -le 1 ]; then
steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
data/train_perturbed_mfcc data/lang exp/tri3b exp/tri3b_ali_perturbed_mfcc
fi
if [ $stage -le 2 ]; then
steps/nnet2/train_block.sh --stage "$train_stage" \
--bias-stddev 0.5 --splice-width 7 --egs-opts "--feat-type raw" \
--softmax-learning-rate-factor 0.5 --cleanup false \
--initial-learning-rate 0.04 --final-learning-rate 0.004 \
--num-epochs-extra 10 --add-layers-period 1 \
--mix-up 4000 \
--cmd "$decode_cmd" \
--hidden-layer-dim 450 \
data/train_perturbed_fbank data/lang exp/tri3b_ali_perturbed_mfcc exp/nnet4b || exit 1
fi
if [ $stage -le 3 ]; then
# Create the testing data.
featdir=`pwd`/mfcc
mkdir -p $featdir
fbank_conf=conf/fbank_40.conf
echo "--num-mel-bins=40" > $fbank_conf
for x in test_mar87 test_oct87 test_feb89 test_oct89 test_feb91 test_sep92 train; do
cp -rT data/$x data/${x}_fbank
rm -r data/${x}_fbank/split* || true
steps/make_fbank.sh --fbank-config "$fbank_conf" --nj 8 \
--cmd "run.pl" data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
steps/compute_cmvn_stats.sh data/${x}_fbank exp/make_fbank/$x $featdir || exit 1;
done
utils/combine_data.sh data/test_fbank data/test_{mar87,oct87,feb89,oct89,feb91,sep92}_fbank
steps/compute_cmvn_stats.sh data/test_fbank exp/make_fbank/test $featdir
fi
if [ $stage -le 4 ]; then
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --feat-type raw \
exp/tri3b/graph data/test_fbank exp/nnet4b/decode
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --feat-type raw \
exp/tri3b/graph_ug data/test_fbank exp/nnet4b/decode_ug
fi
#!/bin/bash
# This is neural net training on top of adapted 40-dimensional features.
#
. cmd.sh
( steps/nnet2/train_tanh.sh --num-epochs 20 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--hidden-layer-dim 375 \
data/train data/lang exp/tri3b_ali exp/nnet4c_nnet
steps/decode_nnet_cpu.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet4c_nnet/decode
steps/decode_nnet_cpu.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet4c_nnet/decode_ug
)
#!/bin/bash
#
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.
# Takes no arguments.
tmpdir=data/local/tmp
[ ! -f $tmpdir/G.txt ] && echo "No such file $tmpdir/G.txt" && exit 1;
. ./path.sh || exit 1; # for KALDI_ROOT
cp -rT data/lang data/lang_ug
rm -rf data/lang_ug/tmp
# Build a unigram G.fst directly from the training transcripts: one arc per word
# at state 0, with cost -log(count / total), plus a final cost covering the
# sentence-end probability.
cat data/train/text | \
perl -e 'while(<>) { @A = split; shift @A; foreach $w(@A) { $tot_count++; $count{$w}++; } $n_sent++; }
$tot_count += $n_sent;
foreach $k (keys %count) { $p = $count{$k} / $tot_count; $cost = -log($p); print "0 0 $k $k $cost\n"; }
$final_cost = -log($n_sent / $tot_count);
print "0 $final_cost\n"; ' | \
fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt --keep_isymbols=false \
--keep_osymbols=false > data/lang_ug/G.fst || exit 1;
# Checking that G is stochastic [note, it wouldn't be for an Arpa]
fstisstochastic data/lang_ug/G.fst || echo Error: G is not stochastic
# Checking that G.fst is determinizable.
fstdeterminize data/lang_ug/G.fst /dev/null || echo Error determinizing G.
# Checking that L_disambig.fst is determinizable.
fstdeterminize data/lang_ug/L_disambig.fst /dev/null || echo Error determinizing L.
# Checking that disambiguated lexicon times G is determinizable
fsttablecompose data/lang_ug/L_disambig.fst data/lang_ug/G.fst | \
fstdeterminize >/dev/null || echo Error
# Checking that LG is stochastic:
fsttablecompose data/lang_ug/L.fst data/lang_ug/G.fst | \
fstisstochastic || echo Error: LG is not stochastic.
# Checking that L_disambig.G is stochastic:
fsttablecompose data/lang_ug/L_disambig.fst data/lang_ug/G.fst | \
fstisstochastic || echo Error: L_disambig.G is not stochastic.
echo "Succeeded preparing grammar for RM."
#!/bin/bash
# This example runs on top of "raw-fMLLR" features:
local/nnet2/run_4a.sh
# This one is on top of filter-bank features, with only CMN.
local/nnet2/run_4b.sh
# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c.sh
#!/bin/bash
. cmd.sh
# result: exp/tri4a1_nnet/decode/wer_2:%WER 1.69 [ 212 / 12533, 26 ins, 44 del, 142 sub ]
( steps/train_nnet_cpu.sh --num-epochs 20 \
--num-epochs-extra 10 --add-layers-period 1 \
--mix-up 4000 --num-iters-final 5 --shrink-interval 3 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--num-parameters 750000 \
data/train data/lang exp/tri3b_ali exp/tri4a1_nnet
steps/decode_nnet_cpu.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_nnet/decode )
# using conf/decode.config as we need much larger beams for RM.
steps/make_denlats_nnet_cpu.sh --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_denlats
steps/train_nnet_cpu_mmi.sh --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_a
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_a/decode
(
steps/train_nnet_cpu_mmi.sh --initial-learning-rate 0.0005 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_b
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_b/decode
)&
# Get WER on training data before MMI.
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_nnet/decode_train
# WER on tri3b as a baseline; we want to see how it compares to tri3b_mmi.
steps/decode.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri3b/decode_train
steps/decode.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri3b_mmi/decode_train
(
steps/train_nnet_cpu_mmi.sh --boost 0.1 --initial-learning-rate 0.0005 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_c
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_c/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_c/decode_train
)&
(
steps/train_nnet_cpu_mmi.sh --E 0.5 --boost 0.1 --initial-learning-rate 0.0005 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_d
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_d/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_d/decode_train
)&
(
steps/train_nnet_cpu_mmi.sh --E 0.5 --boost 0.1 --initial-learning-rate 0.001 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_e
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_e/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_e/decode_train
)&
( # _e2 is as _e, but 2 epochs per EBW iter.
steps/train_nnet_cpu_mmi.sh --epochs-per-ebw-iter 2 --E 0.5 --boost 0.1 --initial-learning-rate 0.001 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_e2
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_e2/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_e2/decode_train
)&
( # With E = 0.0 it was terrible. WER is 12.5%
steps/train_nnet_cpu_mmi.sh --E 0.0 --boost 0.1 --initial-learning-rate 0.001 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_f
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_f/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_f/decode_train
)&
(
steps/train_nnet_cpu_mmi.sh --E 0.25 --boost 0.1 --initial-learning-rate 0.001 \
--minibatch-size 128 --cmd "$decode_cmd" --transform-dir exp/tri3b_ali \
data/train data/lang exp/tri4a1_nnet exp/tri4a1_nnet exp/tri4a1_denlats exp/tri4a1_mmi_g
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 20 \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/tri4a1_mmi_g/decode
# WER on training data
steps/decode_nnet_cpu.sh --cmd "$decode_cmd" --nj 8 \
--config conf/decode.config --transform-dir exp/tri3b \
exp/tri3b/graph data/train exp/tri4a1_mmi_g/decode_train
)&
#!/bin/bash
. cmd.sh
steps/align_raw_fmllr.sh --nj 8 --cmd "$train_cmd" --use-graphs true \
data/train data/lang exp/tri2b exp/tri2b_ali_raw
steps/train_raw_sat.sh 1800 9000 data/train data/lang exp/tri2b_ali_raw exp/tri3c || exit 1;
utils/mkgraph.sh data/lang exp/tri3c exp/tri3c/graph
utils/mkgraph.sh data/lang_ug exp/tri3c exp/tri3c/graph_ug
steps/decode_raw_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/tri3c/graph data/test exp/tri3c/decode
steps/decode_raw_fmllr.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/tri3c/graph_ug data/test exp/tri3c/decode_ug
steps/decode_raw_fmllr.sh --use-normal-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/tri3c/graph data/test exp/tri3c/decode_2fmllr
steps/decode_raw_fmllr.sh --use-normal-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/tri3c/graph_ug data/test exp/tri3c/decode_2fmllr_ug
steps/align_raw_fmllr.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri3c exp/tri3c_ali
## SGMM on top of LDA+MLLT+SAT features.
## No -- this wasn't working, because the scripts don't support raw-fMLLR.
if [ ! -f exp/ubm4c/final.mdl ]; then
steps/train_ubm.sh --silence-weight 0.5 --cmd "$train_cmd" 400 data/train data/lang exp/tri3c_ali exp/ubm4c || exit 1;
fi
steps/train_sgmm2.sh --cmd "$train_cmd" 5000 7000 data/train data/lang exp/tri3c_ali exp/ubm4c/final.ubm exp/sgmm2_4c || exit 1;
utils/mkgraph.sh data/lang exp/sgmm2_4c exp/sgmm2_4c/graph || exit 1;
utils/mkgraph.sh data/lang_ug exp/sgmm2_4c exp/sgmm2_4c/graph_ug || exit 1;
steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
--transform-dir exp/tri3c/decode exp/sgmm2_4c/graph data/test exp/sgmm2_4c/decode || exit 1;
steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
--transform-dir exp/tri3c/decode_ug exp/sgmm2_4c/graph_ug data/test exp/sgmm2_4c/decode_ug || exit 1;
steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
--transform-dir exp/tri3c/decode exp/sgmm2_4c/graph data/test exp/sgmm2_4c/decode_fmllr || exit 1;
exit 0;
# (# get scaled-by-30 versions of the vecs to be used for nnet training.
# . path.sh
# mkdir -p exp/sgmm2_4c_x30
# cat exp/sgmm2_4c/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/vecs.1
# mkdir -p exp/sgmm2_4c_x30/decode
# cat exp/sgmm2_4c/decode/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode/vecs.1
# mkdir -p exp/sgmm2_4c_x30/decode_ug
# cat exp/sgmm2_4c/decode_ug/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode_ug/vecs.1
# )
# exit 0;
# ##
# steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
# exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode || exit 1;
# steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
# exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode_fmllr || exit 1;
#!/bin/bash
# Multilingual setup for SGMMs.
# Caution: this is just a stub, intended to show some others what to do; it
# is not functional yet.
# We treat the WSJ setup as the "other language"-- in fact it's the same language,
# of course, but we treat the phones there as a distinct set.
# The only important thing is that the WSJ data has the same sample rate as the
# RM data.
# add the prefix to all the words and phones: