Commit ccf383cd authored by Dan Povey

trunk: minor updates to nnet2-related scripts; adding multisplice example for RM.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4624 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent d6e17bd0
......@@ -12,8 +12,10 @@ for x in exp/*/decode_dev; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
%WER 25.12 [ 9832 / 39141, 1423 ins, 2471 del, 5938 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
%WER 22.86 [ 8947 / 39141, 1369 ins, 2302 del, 5276 sub ] exp/nnet2_online/nnet_a_online/decode_dev/wer_12
%WER 23.77 [ 9305 / 39141, 1312 ins, 2462 del, 5531 sub ] exp/nnet2_online/nnet_a_online/decode_dev_utt/wer_12
%WER 23.13 [ 9055 / 39141, 1466 ins, 2210 del, 5379 sub ] exp/nnet2_online/nnet_a_online/decode_dev_utt_offline/wer_11
# the following is with a multi-splice version of the nnet-online recipe
......@@ -23,4 +25,3 @@ for x in exp/*/decode_dev; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 22.25 [ 8710 / 39141, 1420 ins, 2139 del, 5151 sub ] exp/nnet2_online/nnet_ms_a_online/decode_dev_utt_offline/wer_11
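# (For reference, in these %WER lines the bracketed numbers are total-errors / reference-words
# plus the ins/del/sub breakdown, so WER = (ins + del + sub) / N; e.g. for the line above,
# (1420 + 2139 + 5151) / 39141 = 8710 / 39141 ≈ 22.25%.)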
......@@ -31,6 +31,7 @@ parallel_opts="-l gpu=1" # This is suitable for the CLSP network, you'll likely
if [ ! -f exp/$dir/final.mdl ]; then
# train_pnorm_simple2.sh dumps the egs in a more compact format to save disk space.
# note: 12 epochs is too many; it's taking a very long time.
steps/nnet2/train_pnorm_simple2.sh --stage $train_stage \
--num-epochs 12 \
--io-opts "-tc 10" \
......
......@@ -202,3 +202,14 @@ for x in exp/nnet2_online/nnet*/decode*; do grep WER $x/wer_* | utils/best_wer.s
%WER 10.70 [ 1341 / 12533, 122 ins, 221 del, 998 sub ] exp/nnet2_online/nnet_baseline/decode_ug/wer_10
# normal recipe:
%WER 2.27 [ 285 / 12533, 42 ins, 62 del, 181 sub ] exp/nnet2_online/nnet_a_online/decode/wer_5
%WER 2.28 [ 286 / 12533, 66 ins, 39 del, 181 sub ] exp/nnet2_online/nnet_a_online/decode_per_utt/wer_2
%WER 10.26 [ 1286 / 12533, 140 ins, 188 del, 958 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_10
%WER 10.45 [ 1310 / 12533, 106 ins, 241 del, 963 sub ] exp/nnet2_online/nnet_a_online/decode_ug_per_utt/wer_12
# multi-splice recipe:
%WER 2.31 [ 290 / 12533, 59 ins, 50 del, 181 sub ] exp/nnet2_online/nnet_ms_a_online/decode/wer_7
%WER 2.36 [ 296 / 12533, 63 ins, 54 del, 179 sub ] exp/nnet2_online/nnet_ms_a_online/decode_per_utt/wer_6
%WER 10.19 [ 1277 / 12533, 155 ins, 186 del, 936 sub ] exp/nnet2_online/nnet_ms_a_online/decode_ug/wer_13
%WER 10.18 [ 1276 / 12533, 150 ins, 183 del, 943 sub ] exp/nnet2_online/nnet_ms_a_online/decode_ug_per_utt/wer_13
......@@ -30,32 +30,12 @@ else
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
# use a smaller iVector dim (50) than the default (100) because RM has a very
# small amount of data.
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
--ivector-dim 50 \
data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
# stages 1 through 3 run in run_nnet2_common.sh.
if [ $stage -le 3 ]; then
# having a larger number of speakers is helpful for generalization, and for
# handling per-utterance decoding well (the iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
local/online/run_nnet2_common.sh --stage $stage || exit 1;
if [ $stage -le 4 ]; then
......@@ -135,10 +115,9 @@ exit 0;
# the experiment (with GPU)
#for x in exp/nnet2_online/nnet_gpu/decode*; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 2.27 [ 285 / 12533, 43 ins, 50 del, 192 sub ] exp/nnet2_online/nnet_gpu/decode/wer_4
%WER 10.40 [ 1303 / 12533, 133 ins, 200 del, 970 sub ] exp/nnet2_online/nnet_gpu/decode_ug/wer_11
#for x in exp/nnet2_online/nnet_a/decode*; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 2.20 [ 276 / 12533, 37 ins, 61 del, 178 sub ] exp/nnet2_online/nnet_a/decode/wer_5
%WER 10.22 [ 1281 / 12533, 143 ins, 193 del, 945 sub ] exp/nnet2_online/nnet_a/decode_ug/wer_10
# This is the baseline with spliced non-CMVN cepstra and no iVector input.
# The difference is pretty small on RM; I expect it to be more clear-cut on larger corpora.
......@@ -154,11 +133,13 @@ exit 0;
# This truly-online per-utterance decoding gives essentially the same WER as the offline
# decoding, which is as expected, since the features and decoding parameters are the same.
# for x in exp/nnet2_online/nnet_gpu_online/decode*utt; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 2.21 [ 277 / 12533, 45 ins, 48 del, 184 sub ] exp/nnet2_online/nnet_gpu_online/decode_per_utt/wer_4
%WER 10.27 [ 1287 / 12533, 142 ins, 186 del, 959 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug_per_utt/wer_10
%WER 2.28 [ 286 / 12533, 66 ins, 39 del, 181 sub ] exp/nnet2_online/nnet_a_online/decode_per_utt/wer_2
%WER 10.45 [ 1310 / 12533, 106 ins, 241 del, 963 sub ] exp/nnet2_online/nnet_a_online/decode_ug_per_utt/wer_12
# The following results are from online decoding, as above, but using previous utterances
# of the same speaker to refine the adaptation state.  It doesn't make much difference.
# for x in exp/nnet2_online/nnet_gpu_online/decode*; do grep WER $x/wer_* | utils/best_wer.sh; done | grep -v utt
%WER 2.20 [ 276 / 12533, 25 ins, 69 del, 182 sub ] exp/nnet2_online/nnet_gpu_online/decode/wer_8
%WER 10.14 [ 1271 / 12533, 127 ins, 198 del, 946 sub ] exp/nnet2_online/nnet_gpu_online/decode_ug/wer_11
%WER 2.27 [ 285 / 12533, 42 ins, 62 del, 181 sub ] exp/nnet2_online/nnet_a_online/decode/wer_5
%WER 10.26 [ 1286 / 12533, 140 ins, 188 del, 958 sub ] exp/nnet2_online/nnet_a_online/decode_ug/wer_10
#!/bin/bash
. cmd.sh
stage=1
use_gpu=true
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, then configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little slower.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
# use a smaller iVector dim (50) than the default (100) because RM has a very
# small amount of data.
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 4 \
--ivector-dim 50 \
data/train exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
# having a larger number of speakers is helpful for generalization, and for
# handling per-utterance decoding well (the iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
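# A rough sketch (purely illustrative, not part of the recipe) of what the
# --utts-per-spk-max 2 option above does to spk2utt: each speaker's utterance
# list is split into pseudo-speakers of at most 2 utterances each.  The real
# copy_data_dir.sh also fixes up utt2spk and the rest of the data directory.
# awk '{ g = 0;
#        for (i = 2; i <= NF; i += 2) {
#          g++;                                        # new pseudo-speaker
#          printf("%s-%d %s", $1, g, $i);              # e.g. "spkA-1 utt1"
#          if (i + 1 <= NF) printf(" %s", $(i + 1));   # second utt, if any
#          printf("\n");
#        } }' data/train/spk2utt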
#!/bin/bash
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_ms_a
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, then configure and make on a machine
where "nvcc" is installed.  Otherwise, call this script with --use-gpu false.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little slower.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
fi
# stages 1 through 3 run in run_nnet2_common.sh.
local/online/run_nnet2_common.sh --stage $stage || exit 1;
if [ $stage -le 4 ]; then
steps/nnet2/train_pnorm_multisplice2.sh --stage $train_stage \
--splice-indexes "layer0/-3:-2:-1:0:1:2:3 layer2/-2:2" \
--num-hidden-layers 3 \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs 25 \
--add-layers-period 1 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
--cmd "$decode_cmd" \
--pnorm-input-dim 800 \
--pnorm-output-dim 200 \
data/train data/lang exp/tri3b_ali $dir || exit 1;
fi
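# For reference, here is how we read the --splice-indexes option used above (a
# back-of-the-envelope calculation, not something the script prints): layer0
# splices frames t-3..t+3 of the input, and layer2 splices offsets {-2,+2} of
# its own input, so the network's total temporal context is 3 + 2 = 5 frames
# on each side, i.e. frames t-5..t+5.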
if [ $stage -le 5 ]; then
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
data/test exp/nnet2_online/extractor exp/nnet2_online/ivectors_test || exit 1;
fi
if [ $stage -le 6 ]; then
# Note: comparing the results of this with run_online_decoding_nnet2_baseline.sh,
# it's a bit worse, meaning the iVectors seem to hurt at this amount of data.
# However, experiments by Haihua Xu on WSJ (not yet checked in) show them helping
# nicely.  This setup seems to have too little data for them to work well, but it
# suffices to demonstrate the scripts.  We will likely modify it to add noise to the
# iVectors in training, which should tend to mitigate the overtraining.
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--online-ivector-dir exp/nnet2_online/ivectors_test \
exp/tri3b/graph data/test $dir/decode &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--online-ivector-dir exp/nnet2_online/ivectors_test \
exp/tri3b/graph_ug data/test $dir/decode_ug || exit 1;
wait
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
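# As a sketch, that invocation would look something like this (hypothetical;
# this setup uses the default MFCC features):
# steps/online/nnet2/prepare_online_decoding.sh --feature-type plp \
#   data/lang exp/nnet2_online/extractor "$dir" ${dir}_online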
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 8 ]; then
# do the actual online decoding with iVectors.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph data/test ${dir}_online/decode &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug || exit 1;
wait
fi
if [ $stage -le 9 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph data/test ${dir}_online/decode_per_utt &
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
--per-utt true \
exp/tri3b/graph_ug data/test ${dir}_online/decode_ug_per_utt || exit 1;
wait
fi
exit 0;
# See ../../RESULTS for results.  It's about the same as the non-multisplice
# recipe, but I'm not doing much tuning on RM... it has too little data
# for any of these DNN things to work really well.
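# To summarize the WERs once the decodes above have finished, the same kind of
# loop as in the RESULTS file should work, e.g.:
# for x in exp/nnet2_online/nnet_ms_a*/decode*; do grep WER $x/wer_* | utils/best_wer.sh; done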
......@@ -6,10 +6,15 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_perturbed
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
......@@ -21,14 +26,12 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu_perturbed
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little slower.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_perturbed
fi
......