Commit 6f598676 authored by Dan Povey

several nnet2-online changes: make it easier to get the feature-extraction options right in cross-system training; add the train_pnorm_simple.sh script (simplified learning-rate schedule and improved combination at the end; supersedes train_pnorm_fast.sh); modify the big-data online-nnet2 recipes to use 40-dimensional rather than 13-dimensional MFCCs as input (results will be added soon, but they are improved). Also modify filter_scp.pl to use a one-based, not zero-based, field index.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4493 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 011808dc
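For reference, the filter_scp.pl field index is now one-based: filtering on the second column uses -f 2 where it previously used -f 1, and the default (no -f) matches on the first column. A minimal sketch of the new behaviour, with made-up file names:

  # keep.list holds ids; data.txt has the id of interest in its second column
  printf 'utt1 spkA\nutt2 spkB\n' > data.txt
  echo spkA > keep.list
  utils/filter_scp.pl -f 2 keep.list data.txt   # keeps "utt1 spkA"
  utils/filter_scp.pl keep.list data.txt        # default -f 1: matches on the first column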
......@@ -96,7 +96,7 @@ while (( "$#" )); do
$cmd LMWT=$min_lmwt:$max_lmwt $targetdir/$kws/kws_filter.LMWT.log \
set -e';' set -o pipefail';' \
mkdir -p $targetdir/${kws}_LMWT';'\
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 1 $filter \> $targetdir/${kws}_LMWT/result || exit -1
cat $resultdir/${kws}_LMWT/'result.*' \| utils/filter_scp.pl -f 2 $filter \> $targetdir/${kws}_LMWT/result || exit -1
echo -e "\tWrite normalized..."
......
--window-type=hamming # disable Dan's window, use the standard
--use-energy=false # only fbank outputs
--sample-frequency=8000 # Cantonese is sampled at 8kHz
--low-freq=64 # typical setup from Frantisek Grezl
--high-freq=3800
--dither=1
--num-mel-bins=15 # 8kHz so we use 15 bins
--htk-compat=true # try to make it compatible with HTK
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--sample-frequency=8000 # Switchboard is sampled at 8kHz
--num-mel-bins=40 # similar to Google's setup.
--num-ceps=40 # there is no dimensionality reduction.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
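# The hires config above is consumed by compute-mfcc-feats; a minimal sketch of
# standalone use (paths here are hypothetical -- in the recipes this is wrapped
# by steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf):
compute-mfcc-feats --config=conf/mfcc_hires.conf \
  scp:data/train_hires/wav.scp ark,scp:mfcc/raw_mfcc_hires.ark,mfcc/raw_mfcc_hires.scp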
#!/bin/bash
. cmd.sh
......@@ -12,74 +11,99 @@ set -e
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
# assume use_gpu=true since it would be way too slow otherwise.
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_a_gpu
mkdir -p exp/nnet2_online
if [ $stage -le 1 ]; then
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
fi
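# Roughly what create_split_dir.pl sets up (a sketch, not copied from the
# script): $mfccdir/storage is created with numbered subdirectories that are
# symlinks spread over the /export/b0{1,2,3,4}/... paths, so make_mfcc.sh can
# write its archives across the four filesystems, e.g.
# ls -l $mfccdir/storage   # 1 -> /export/b01/..., 2 -> /export/b02/..., ...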
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
# the _a is in case I want to change the parameters.
dir=exp/nnet2_online/nnet_a_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_a
utils/copy_data_dir.sh data/train data/train_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
# want the 100k subset to exactly match train_100k, since we'll use its alignments.
awk '{print $1}' data/train_100k/utt2spk > uttlist
utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
rm uttlist
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
if [ $stage -le 3 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest
# subset. the input directory exp/nnet2_online/tri5a is only needed for
# the splice-opts and the LDA transform.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
if [ $stage -le 4 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100), so we don't use all of it;
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
if [ $stage -le 5 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
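# A sketch of what --utts-per-spk-max 2 does to utt2spk (the ids and the exact
# pseudo-speaker naming are illustrative, not taken from copy_data_dir.sh):
#   before:  utt1 spkA / utt2 spkA / utt3 spkA / utt4 spkA
#   after:   utt1 spkA-1 / utt2 spkA-1 / utt3 spkA-2 / utt4 spkA-2
# Each real speaker becomes several pseudo-speakers of at most two utterances,
# which gives the iVector extractor many more speakers to generalize over and
# better matches per-utterance decoding, where the iVector starts from zero.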
if [ $stage -le 4 ]; then
if [ $stage -le 6 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15 + 5) to (1 +
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
......@@ -94,30 +118,12 @@ if [ $stage -le 4 ]; then
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
data/train data/lang exp/tri5a $dir || exit 1;
data/train_hires data/lang exp/tri5a $dir || exit 1;
fi
if [ $stage -le 5 ]; then
# dump iVectors for the testing data.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
fi
if [ $stage -le 6 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding (the one with --per-utt true)
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_dev \
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 8 ]; then
......@@ -146,30 +152,3 @@ fi
exit 0;
#Baseline: GMM+SAT system.
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
# Baseline: p-norm system on top of fMLLR features.
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
# Our experiment, carrying forward the adaptation state between
# utterances of each speaker.
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
# Our experiment, with per-utterance decoding:
%WER 24.84 [ 9721 / 39141, 1445 ins, 2410 del, 5866 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
# below, with --max-chunks-at-once 3. The WER is slightly worse but I expect in general it will
# be slightly better, due to more iVector right context; this is likely just noise. The average
# latency was reduced vs. the baseline.
#%WER 24.92 [ 9753 / 39141, 1423 ins, 2429 del, 5901 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt_mc3/wer_11
# The following results (obtained after ./run_nnet2_discriminative.sh was run), show
# the effect of discriminative training. After 2 epochs, we reduce the WER from 23.58 to 22.07.
%WER 23.58 [ 9229 / 39141, 1382 ins, 2400 del, 5447 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_12
%WER 22.16 [ 8675 / 39141, 1522 ins, 1886 del, 5267 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch1/wer_13
%WER 22.07 [ 8637 / 39141, 1540 ins, 1873 del, 5224 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_smbr_epoch2/wer_13
......@@ -60,14 +60,12 @@ if [ $stage -le 3 ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
......@@ -83,7 +81,8 @@ if [ $stage -le 4 ]; then
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--num-epochs 4 --num-epochs-extra 1 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
......
#!/bin/bash
# This is to be run after run_nnet2.sh
# THIS IS NOT TESTED YET.
. cmd.sh
......@@ -43,7 +41,6 @@ set -e
nj=40
if [ $stage -le 1 ]; then
# the make_denlats job is always done on CPU not GPU, since in any case
# the graph search and lattice determinization takes quite a bit of CPU.
# note: it's the sub-split option that determines how many jobs actually
......@@ -51,7 +48,7 @@ if [ $stage -le 1 ]; then
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 40 --num-threads 6 --parallel-opts "-pe smp 6" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
data/train data/lang $srcdir ${srcdir}_denlats
data/train_hires data/lang $srcdir ${srcdir}_denlats
fi
if [ $stage -le 2 ]; then
......@@ -59,7 +56,7 @@ if [ $stage -le 2 ]; then
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--use-gpu $use_gpu_opt \
--nj $nj data/train data/lang ${srcdir} ${srcdir}_ali
--nj $nj data/train_hires data/lang ${srcdir} ${srcdir}_ali
fi
if [ $stage -le 3 ]; then
......@@ -72,22 +69,22 @@ if [ $stage -le 3 ]; then
# since we're using 4 disks.
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" --learning-rate 0.00001 \
--io-opts "-pe smp 10" \
--num-epochs 2 \
--num-epochs 4 \
--use-preconditioning $use_preconditioning \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--num-jobs-nnet 4 --num-threads $num_threads --parallel-opts "$gpu_opts" \
data/train data/lang \
data/train_hires data/lang \
${srcdir}_ali ${srcdir}_denlats ${srcdir}/final.mdl ${srcdir}_smbr
fi
if [ $stage -le 4 ]; then
# we'll do the decoding as 'online' decoding by using the existing
# _online directory but with extra models copied to it.
for epoch in 1 2; do
for epoch in 1 2 3 4; do
cp ${srcdir}_smbr/epoch${epoch}.mdl ${srcdir}_online/smbr_epoch${epoch}.mdl
done
for epoch in 1 2; do
for epoch in 1 2 3 4; do
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 --iter smbr_epoch${epoch} \
......@@ -95,5 +92,6 @@ if [ $stage -le 4 ]; then
done
fi
wait
# for results, see the end of run_nnet2.sh
......@@ -45,7 +45,7 @@ if [ $stage -le 2 ]; then
local/vad_split_utts_fix_data.pl $in_dir $dir;
fi
utils/filter_scp.pl -f 0 \
utils/filter_scp.pl \
<(echo "`awk < "$dir/segments" '{ print $2 }'`") $in_dir/wav.scp \
> $dir/wav.scp
......
......@@ -31,7 +31,7 @@ classes="ark:lid/remove_dialect.pl data/train/utt2lang \
# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
<(lid/remove_dialect.pl <(utils/filter_scp.pl -f 0 \
<(lid/remove_dialect.pl <(utils/filter_scp.pl \
exp/ivectors_train/ivector.scp data/train/utt2lang)) \
<(lid/remove_dialect.pl data/lre07/utt2lang) \
exp/ivectors_train/languages.txt \
......
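# Worked example of the rebalancing formula above (counts are hypothetical): a
# language with 200 test utterances and 800 training utterances gets a prior
# proportional to 200 / (200 + 800) = 0.2.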
......@@ -6,6 +6,9 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
......@@ -21,7 +24,6 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
......@@ -47,14 +49,17 @@ if [ $stage -le 2 ]; then
fi
if [ $stage -le 3 ]; then
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train data/train_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 4 \
--utts-per-spk-max 2 \
data/train exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
data/train_max2 exp/nnet2_online/extractor exp/nnet2_online/ivectors || exit 1;
fi
if [ $stage -le 4 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors \
......@@ -63,7 +68,8 @@ if [ $stage -le 4 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
......
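# A sketch of the simplified schedule we believe train_pnorm_simple.sh uses
# (our reading of the script, not copied from it): the learning rate decays
# exponentially from --initial-learning-rate to --final-learning-rate, e.g.
initial=0.02; final=0.004; num_iters=100; x=50
perl -e "print $initial * ($final/$initial) ** ($x/$num_iters), qq{\n};"  # ~0.0089 at the midpoint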
#!/bin/bash
# this is a baseline for run_online_decoding_nnet2.sh, without
# this is a baseline for ./run_nnet2.sh, without
# the iVectors, to see whether they make a difference.
. cmd.sh
......@@ -10,10 +10,14 @@
stage=1
train_stage=-10
use_gpu=true
dir=exp/nnet2_online/nnet_a_baseline
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
......@@ -25,19 +29,17 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu_baseline
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_baseline
fi
if [ $stage -le 1 ]; then
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
steps/nnet2/train_pnorm_simple.sh --stage $train_stage \
--splice-width 7 \
--feat-type raw \
--cmvn-opts "--norm-means=false --norm-vars=false" \
......@@ -45,7 +47,8 @@ if [ $stage -le 1 ]; then
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--num-jobs-nnet 4 \
--num-epochs-extra 10 --add-layers-period 1 \
--num-epochs 25 \
--add-layers-period 1 \
--num-hidden-layers 2 \
--mix-up 4000 \
--initial-learning-rate 0.02 --final-learning-rate 0.004 \
......@@ -82,4 +85,4 @@ if [ $stage -le 4 ]; then
wait
fi
# for results, see the end of ./run_online_decoding_nnet2.sh
# for results, see the end of ./run_nnet2.sh
......@@ -77,11 +77,13 @@ if [ $stage -le 5 ]; then
fi
# Below, setting --utts-per-spk-max to a noninteger helps to randomize the division
# of speakers into "fake-speakers" with about 2 utterances each, by randomly making
# some have 2 and some 3 utterances... this randomnes will be different in different
# some have 2 and some 3 utterances... this randomness will be different in different
# copies of the data.
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2.5 data/train_perturbed_mfcc \
data/train_perturbed_mfcc_max2.5
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
--utts-per-spk-max 2.5 \
data/train_perturbed_mfcc exp/nnet2_online/extractor $ivectordir || exit 1;
data/train_perturbed_mfcc_max2.5 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
......
......@@ -4,7 +4,7 @@
# the optional part local/online/run_online_decoding_nnet2.sh. It builds a
# neural net for online decoding on top of the network we previously trained on
# WSJ, by keeping everything but the last layer of that network and then
# training just the last layer on our data.
# training just the last layer on our data. We then train the whole thing.
stage=0
set -e
......@@ -26,35 +26,40 @@ EOF
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online_wsj/nnet_gpu
trainfeats=exp/nnet2_online_wsj/wsj_activations_train_gpu
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
# later we'll change the script to download the trained model from kaldi-asr.org.
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_gpu_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a_gpu
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online_wsj/nnet
dir=exp/nnet2_online_wsj/nnet_a
trainfeats=exp/nnet2_online_wsj/wsj_activations_train
srcdir=../../wsj/s5/exp/nnet2_online/nnet_a_online
# the following things are needed while training the combined model.
srcdir_orig=../../wsj/s5/exp/nnet2_online/nnet_a
ivector_src=../../wsj/s5/exp/nnet2_online/extractor
fi
if [ $stage -le 0 ]; then
echo "$0: dumping activations from WSJ model"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $trainfeats/feats/storage ]; then
# this shows how you can split the data across multiple file-systems; it's optional.
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$date/s5/$trainfeats/feats/storage \
$trainfeats/feats/storage
fi
steps/online/nnet2/dump_nnet_activations.sh --cmd "$train_cmd" --nj 30 \
data/train $srcdir $trainfeats
fi
if [ $stage -le 1 ]; then
echo "$0: training 0-hidden-layer model on top of WSJ activations"
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
fi
steps/nnet2/retrain_fast.sh --stage $train_stage \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
......@@ -71,9 +76,6 @@ if [ $stage -le 2 ]; then
steps/online/nnet2/prepare_online_decoding_retrain.sh $srcdir $dir ${dir}_online
fi
# Note: at this point it might be possible to further train the combined model
# by doing backprop through all of it. We haven't implemented this yet.
if [ $stage -le 3 ]; then
# do online decoding with the combined model.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 \
......@@ -98,7 +100,7 @@ fi
## the model on this dataset. First we need to create a combined version of the
## model.
if [ $stage -le 5 ]; then
steps/nnet2/create_appended_model.sh $srcdir_orig $dir ${dir}_combined_init
steps/nnet2/create_appended_model.sh $srcdir $dir ${dir}_combined_init
# Set the learning rate in this initial value to our guess of a suitable value.
# note: we initially tried 0.005, and this gave us WERs of (1.40, 1.48, 7.24, 7.70) vs.
......@@ -107,31 +109,20 @@ if [ $stage -le 5 ]; then
nnet-am-copy --learning-rate=$initial_learning_rate ${dir}_combined_init/final.mdl ${dir}_combined_init/final.mdl
fi
# In order to train the combined model, we'll need to dump iVectors.
if [ $stage -le 6 ]; then
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 10 \
--utts-per-spk-max 2 \
data/train $ivector_src exp/nnet2_online_wsj/ivectors || exit 1;
fi
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d ${dir}_combined/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{1,2,3,4}/$USER/kaldi-data/rm-$(date +'%m_%d_%H_%M')/s5/${dir}_combined/egs/storage \
${dir}_combined/egs/storage
fi
if [ $stage -le 7 ]; then
# assume left and right context of model are identical.
splice_width=$(nnet-am-info exp/nnet2_online_wsj/nnet_gpu_combined_init/final.mdl | grep '^left-context' | awk '{print $2}') || exit 1;
# Note: in general the get_egs.sh script would get things like the LDA matrix
# from exp/tri3b_ali, which would be the wrong thing to do as we want to get
# them from the original model dir. In this case we're using raw MFCC
# features so it's not an issue. But in general we'd probably have to create
# a temporary dir and copy or link both the alignments and feature-related
# things to it.
steps/nnet2/get_egs.sh --cmd "$train_cmd" \
--feat-type raw --cmvn-opts "--norm-means=false --norm-vars=false" \
--online-ivector-dir exp/nnet2_online_wsj/ivectors \
--num-jobs-nnet 4 --splice-width $splice_width \
data/train data/lang exp/tri3b_ali ${dir}_combined
# This version of the get_egs.sh script does the feature extraction and iVector
# extraction in a single binary, reading the config, as part of the script.
steps/online/nnet2/get_egs.sh --cmd "$train_cmd" --num-jobs-nnet 4 \
data/train exp/tri3b_ali ${dir}_online ${dir}_combined
fi
if [ $stage -le 8 ]; then
if [ $stage -le 7 ]; then
steps/nnet2/train_more.sh --learning-rate-factor 0.1 --cmd "$train_cmd" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
......@@ -139,15 +130,15 @@ if [ $stage -le 8 ]; then
${dir}_combined_init/final.mdl ${dir}_combined/egs ${dir}_combined
fi
if [ $stage -le 9 ]; then
if [ $stage -le 8 ]; then
# Create an online-decoding dir corresponding to what we just trained above.
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang $ivector_src \
steps/online/nnet2/prepare_online_decoding.sh data/lang $srcdir/ivector_extractor \
${dir}_combined ${dir}_combined_online || exit 1;
fi