Commit 7b84e0d3 authored by Dan Povey

trunk: BIG UPDATE: merging sandbox/online back to trunk.

       Involves adding new online-decoding code, plus some refactoring of existing code
       (e.g., some small changes to the interfaces of the decoders; renaming GauPost to GaussPost).
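       (If downstream code still uses the old name, a one-line rename such as
        "sed -i 's/GauPost/GaussPost/g' your_file.cc" should be enough to adapt it;
        the file name here is just a placeholder.)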


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4361 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parents 44a301cf 2e20cebb
@@ -8,3 +8,6 @@ for x in exp/*/decode_dev; do grep WER $x/wer_* | utils/best_wer.sh; done
%WER 31.13 [ 12184 / 39141, 1939 ins, 2584 del, 7661 sub ] exp/tri5a_0.1/decode_dev/wer_12
%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
%WER 25.12 [ 9832 / 39141, 1423 ins, 2471 del, 5938 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
#!/bin/bash
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
# the _a is in case I want to change the parameters.
dir=exp/nnet2_online/nnet_a_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_a
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
fi
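# (Note, not run by this script: data/train_30k is assumed to have been created
# earlier in the recipe; such a subset is typically made with something along
# the lines of
#   utils/subset_data_dir.sh data/train 30000 data/train_30k
# -- check the top-level run.sh for the exact command that was used.)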
if [ $stage -le 2 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100), so we don't use all of it;
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 60 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
fi
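# (Example only, not run here: the pairing behavior mentioned above is controlled
# by the --utts-per-spk-max option; e.g. to treat every utterance as its own
# "speaker" you could in principle run
#   steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 60 \
#     --utts-per-spk-max 1 data/train exp/nnet2_online/extractor $ivectordir
# -- check the script's usage message before relying on this.)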
if [ $stage -le 4 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "-tc 12" \
--num-jobs-nnet 6 \
--num-hidden-layers 4 \
--mix-up 12000 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3500 \
--pnorm-output-dim 350 \
data/train data/lang exp/tri5a $dir || exit 1;
fi
if [ $stage -le 5 ]; then
# dump iVectors for the testing data.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
fi
if [ $stage -le 6 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding.
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_dev \
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
fi
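# (Illustration only: per the comment above, a PLP-based setup would call the same
# script with the extra option, e.g.
#   steps/online/nnet2/prepare_online_decoding.sh --feature-type plp \
#     data/lang exp/nnet2_online/extractor "$dir" ${dir}_online
# this recipe uses MFCC features, so the option is not given here.)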
if [ $stage -le 8 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
exp/tri5a/graph data/dev ${dir}_online/decode_dev || exit 1;
fi
if [ $stage -le 9 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true \
exp/tri5a/graph data/dev ${dir}_online/decode_dev_utt || exit 1;
fi
exit 0;
#Baseline: GMM+SAT system.
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
# Baseline: p-norm system on top of fMLLR features.
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
# Our experiment, with per-utterance decoding:
#%WER 25.12 [ 9832 / 39141, 1423 ins, 2471 del, 5938 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
# Our experiment, carrying forward the adaptation state between
# utterances of each speaker.
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
#!/bin/bash
# This script, run_nnet2_b.sh, is the same as run_nnet2.sh except that it trains
# a larger network, with 5 instead of 4 hidden layers and p-norm (input,output)
# dims of 4k/400 instead of 3.5k/350.
# You can run it with --stage 4 if you've already run run_nnet2.sh,
# since the iVector extractor is the same.
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_b_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet_b
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
# To train a diagonal UBM we don't need very much data, so use the smallest subset.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_30k 512 exp/tri5a exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100), so we don't use all of it;
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 3 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$ivectordir $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we
# train the system on. This version of the iVector-extraction script
# pairs the utterances into twos (by default, see --utts-per-spk-max option)
# and treats each of these pairs as one speaker.
# Note that these are extracted 'online'.
steps/online/nnet2/extract_ivectors_online2.sh --cmd "$train_cmd" --nj 60 \
data/train exp/nnet2_online/extractor $ivectordir || exit 1;
fi
if [ $stage -le 4 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
fi
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
# 1). The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/train_pnorm_fast.sh --stage $train_stage \
--num-epochs 3 --num-epochs-extra 1 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "-tc 12" \
--num-jobs-nnet 6 \
--num-hidden-layers 5 \
--mix-up 12000 \
--initial-learning-rate 0.01 --final-learning-rate 0.001 \
--cmd "$decode_cmd" \
--pnorm-input-dim 4000 \
--pnorm-output-dim 400 \
data/train data/lang exp/tri5a $dir || exit 1;
fi
if [ $stage -le 5 ]; then
# dump iVectors for the testing data.
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \
data/dev exp/nnet2_online/extractor exp/nnet2_online/ivectors_dev || exit 1;
fi
if [ $stage -le 6 ]; then
# this does offline decoding that should give about the same results as the
# real online decoding.
steps/nnet2/decode.sh --nj 30 --cmd "$decode_cmd" --config conf/decode.config \
--online-ivector-dir exp/nnet2_online/ivectors_dev \
exp/tri5a/graph data/dev $dir/decode_dev || exit 1;
fi
if [ $stage -le 7 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh data/lang exp/nnet2_online/extractor \
"$dir" ${dir}_online || exit 1;
fi
if [ $stage -le 8 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
exp/tri5a/graph data/dev ${dir}_online/decode_dev || exit 1;
fi
if [ $stage -le 9 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
--per-utt true \
exp/tri5a/graph data/dev ${dir}_online/decode_dev_utt || exit 1;
fi
exit 0;
#Baseline: GMM+SAT system.
#%WER 31.07 [ 12163 / 39141, 1869 ins, 2705 del, 7589 sub ] exp/tri5a/decode_dev/wer_13
# Baseline: p-norm system on top of fMLLR features.
#%WER 23.66 [ 9259 / 39141, 1495 ins, 2432 del, 5332 sub ] exp/nnet6c4_gpu/decode_dev/wer_11
# Our experiment, with per-utterance decoding:
#%WER 25.12 [ 9832 / 39141, 1423 ins, 2471 del, 5938 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev_utt/wer_11
# Our experiment, carrying forward the adaptation state between
# utterances of each speaker.
#%WER 23.79 [ 9311 / 39141, 1499 ins, 2277 del, 5535 sub ] exp/nnet2_online/nnet_a_gpu_online/decode_dev/wer_11
export KALDI_ROOT=`pwd`/../../..
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$PWD:$PATH
export LC_ALL=C
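# (Optional sanity check, not part of the original file; the binary name below is
# just an example of the newly added online2 binaries and assumes they have been
# built:
#   command -v online2-wav-gmm-latgen-faster >/dev/null || echo "online2 binaries not on PATH"
# )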
@@ -175,3 +175,5 @@ steps/train_sat.sh --cmd "$train_cmd" \
# local/run_for_spkid.sh
# local/run_nnet2.sh
# local/online/run_nnet2.sh
@@ -102,15 +102,22 @@ exit 0
%WER 1.68 [ 211 / 12533, 20 ins, 53 del, 138 sub ] exp/nnet4b/decode/wer_5
%WER 8.96 [ 1123 / 12533, 97 ins, 166 del, 860 sub ] exp/nnet4b/decode_ug/wer_8
%WER 1.72 [ 216 / 12533, 25 ins, 38 del, 153 sub ] exp/nnet4b_gpu/decode/wer_4
%WER 8.34 [ 1045 / 12533, 94 ins, 146 del, 805 sub ] exp/nnet4b_gpu/decode_ug/wer_10
%WER 1.91 [ 240 / 12533, 20 ins, 59 del, 161 sub ] exp/nnet4b_gpu/decode/wer_7
%WER 8.41 [ 1054 / 12533, 80 ins, 166 del, 808 sub ] exp/nnet4b_gpu/decode_ug/wer_10
# When I ran this before I got the following; the difference is
# probably just random.
# %WER 1.72 [ 216 / 12533, 25 ins, 38 del, 153 sub ] exp/nnet4b_gpu/decode/wer_4
# %WER 8.34 [ 1045 / 12533, 94 ins, 146 del, 805 sub ] exp/nnet4b_gpu/decode_ug/wer_10
# This is another unadapted setup:
%WER 1.93 [ 242 / 12533, 40 ins, 44 del, 158 sub ] exp/nnet4b2_gpu/decode/wer_3
%WER 9.08 [ 1138 / 12533, 89 ins, 182 del, 867 sub ] exp/nnet4b2_gpu/decode_ug/wer_9
%WER 1.80 [ 226 / 12533, 29 ins, 44 del, 153 sub ] exp/nnet4c/decode/wer_4
%WER 8.49 [ 1064 / 12533, 80 ins, 175 del, 809 sub ] exp/nnet4c/decode_ug/wer_11
%WER 1.80 [ 226 / 12533, 23 ins, 52 del, 151 sub ] exp/nnet4c_gpu/decode/wer_5
%WER 8.64 [ 1083 / 12533, 93 ins, 169 del, 821 sub ] exp/nnet4c_gpu/decode_ug/wer_10
%WER 1.68 [ 211 / 12533, 29 ins, 39 del, 143 sub ] exp/nnet4d/decode/wer_4
%WER 8.40 [ 1053 / 12533, 101 ins, 153 del, 799 sub ] exp/nnet4d/decode_ug/wer_10
@@ -178,3 +185,19 @@ exit 0
%WER 1.53 [ 192 / 12533, 23 ins, 30 del, 139 sub ] exp/combine_sgmm2_4a_3b_fmmic5/decode/wer_4
%WER 1.47 [ 184 / 12533, 23 ins, 27 del, 134 sub ] exp/combine_sgmm2_4a_mmi_3b_fmmic5/decode/wer_4
# Some things relating to nnet2 online decoding.
for x in exp/nnet2_online/nnet*/decode*; do grep WER $x/wer_* | utils/best_wer.sh ; done
%WER 2.75 [ 345 / 12533, 43 ins, 81 del, 221 sub ] exp/nnet2_online/nnet/decode/wer_7
%WER 10.94 [ 1371 / 12533, 133 ins, 220 del, 1018 sub ] exp/nnet2_online/nnet/decode_ug/wer_11
# The script for this is not checked in; it's p-norm with 800/160 instead of 1000/200.
%WER 2.58 [ 323 / 12533, 38 ins, 81 del, 204 sub ] exp/nnet2_online/nnet2b/decode/wer_6
%WER 10.72 [ 1344 / 12533, 124 ins, 234 del, 986 sub ] exp/nnet2_online/nnet2b/decode_ug/wer_10
# This is the baseline for the nnet+ivector decoding, with no iVector. This is
# better than with the iVector, i.e. the iVector is not working. I assume this
# is due to overtraining. I plan to try this on a larger setup.
%WER 2.30 [ 288 / 12533, 44 ins, 51 del, 193 sub ] exp/nnet2_online/nnet_baseline/decode/wer_4
%WER 10.70 [ 1341 / 12533, 122 ins, 221 del, 998 sub ] exp/nnet2_online/nnet_baseline/decode_ug/wer_10
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
--nccf-ballast-online=true # helps for online operation.
#!/bin/bash
# This script demonstrates discriminative training of p-norm neural nets.
# It's on top of run_4d_gpu.sh, which uses adapted 40-dimensional features.
# This version of the script uses GPUs. We distinguish it by putting "_gpu"
# at the end of the directory name.
gpu_opts="-l gpu=1,hostname=g*" # This is suitable for the CLSP network;
# you'll likely have to change it. We'll use it later on, in the training
# (it's not used in denlat creation).
. ./cmd.sh
. ./path.sh
! cuda-compiled && cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
# The denominator lattice creation currently doesn't use GPUs.
# Note: we specify 1G each for mem_free and ram_free, which is per
# thread... it will likely be less than the default. We increase the beams relative
# to the defaults; this is just for this RM setup, where the default beams would
# likely generate very thin lattices. Note: the transform-dir is important to
# specify, since this system is on top of fMLLR features.
nj=$(cat exp/tri3b_ali/num_jobs)
dir=nnet4d_gpu
steps/nnet2/make_denlats.sh --cmd "$decode_cmd -l mem_free=1G,ram_free=1G" \
--nj $nj --sub-split 20 --num-threads 6 --parallel-opts "-pe smp 6" \
--beam 20.0 --lattice-beam 10.0 \
--transform-dir exp/tri3b_ali \
data/train data/lang exp/$dir exp/${dir}_denlats
steps/nnet2/align.sh --cmd "$decode_cmd $gpu_opts" --use-gpu yes \
--transform-dir exp/tri3b_ali \
--nj $nj data/train data/lang exp/$dir exp/${dir}_ali
steps/nnet2/train_discriminative.sh --cmd "$decode_cmd" \
--num-jobs-nnet 2 --transform-dir exp/tri3b_ali \
--num-threads 1 --parallel-opts "$gpu_opts" data/train data/lang \
exp/${dir}_ali exp/${dir}_denlats exp/$dir/final.mdl exp/nnet5d_mpe_gpu
for epoch in 1 2 3 4; do
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode \
exp/tri3b/graph data/test exp/nnet5d_mpe_gpu/decode_epoch$epoch &
steps/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 --iter epoch$epoch \
--transform-dir exp/tri3b/decode_ug \
exp/tri3b/graph_ug data/test exp/nnet5d_mpe_gpu/decode_ug_epoch$epoch &
done
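# (Example, not part of the original script: the decodes above run in the
# background, so you could wait for them and then summarize the WERs with the
# same grep | best_wer.sh pattern used in the RESULTS files, e.g.
#   wait
#   for x in exp/nnet5d_mpe_gpu/decode*epoch*; do grep WER $x/wer_* | utils/best_wer.sh; done
# )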
exit 0;
# The following are some test commands that I ran in order to verify that
# the neural-net splitting and excising code was working as intended.
# (
# acoustic_scale=0.1
# for criterion in smbr mmi mpfe; do
# for drop_frames in true false; do
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --excise=true exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:exp/tri3b_ali/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/$dir_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/$dir_denlats/lat.1.gz|' "ark:|nnet-combine-egs-discriminative ark:- ark:1.egs"
# nnet-get-egs-discriminative --drop-frames=$drop_frames --criterion=$criterion --split=false --excise=false exp/tri5c_mpe/0.mdl 'ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:data/train/split8/1/utt2spk scp:data/train/split8/1/cmvn.scp "scp:head -n 40 data/train/split8/1/feats.scp|" ark:- | splice-feats --left-context=3 --right-context=3 ark:- ark:- | transform-feats exp/tri5c_mpe/final.mat ark:- ark:- | transform-feats --utt2spk=ark:data/train/split8/1/utt2spk ark:exp/tri3b_ali/trans.1 ark:- ark:- |' 'ark,s,cs:gunzip -c exp/$dir_ali/ali.1.gz |' 'ark,s,cs:gunzip -c exp/$dir_denlats/lat.1.gz|' ark:2.egs
# nnet-compare-hash-discriminative --acoustic-scale=$acoustic_scale --drop-frames=$drop_frames --criterion=$criterion exp/$dir/final.mdl ark:1.egs ark:2.egs || exit 1;
# done
# done
# )
#!/bin/bash
. cmd.sh
steps/online/prepare_online_decoding.sh --cmd "$train_cmd" data/train data/lang \
exp/tri3b exp/tri3b_mmi/final.mdl exp/tri3b_online/ || exit 1;
# Below is the basic online decoding. There is no endpointing being done: the utterances
# are supplied as .wav files. And the speaker information is known, so we can use adaptation
# info from previous utterances of the same speaker. It's like an application where
# we have push-to-talk and push-to-finish, and it has been told who the speaker is.
# The reason it's "online" is that internally, it processes the .wav file sequentially
# as if you were capturing it from an audio stream, so that when you get to the end of the file
# it is ready with the decoded output, with very little latency.
steps/online/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode
# Below is online decoding with endpointing-- but the endpointing is just at the end of the
# utterance, not the beginning. It's like a dialog system over the phone where, when it's your
# turn to speak, it waits till you've finished saying something and then does something. The
# endpoint detection is configurable in various ways (not demonstrated here), but it's not separate
# from the speech recognition; it uses the traceback of the decoder itself to endpoint (based on
# whether it's silence, and so on).
steps/online/decode.sh --do-endpointing true \
--config conf/decode.config --cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode_endpointing
# Below is like the "basic online decoding" above, except we treat each utterance separately and
# do not "carry forward" the speaker adaptation state from the previous utterance.
steps/online/decode.sh --per-utt true --config conf/decode.config \
--cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode_per_utt
# grep WER exp/tri3b_online/decode/wer_* | utils/best_wer.sh
# %WER 2.06 [ 258 / 12533, 29 ins, 46 del, 183 sub ] exp/tri3b_online/decode/wer_10
# grep WER exp/tri3b_online/decode_endpointing/wer_* | utils/best_wer.sh
# %WER 2.07 [ 260 / 12533, 33 ins, 46 del, 181 sub ] exp/tri3b_online/decode_endpointing/wer_10
# Treating each one as a separate utterance, we get this:
# grep WER exp/tri3b_online/decode_per_utt/wer_* | utils/best_wer.sh
# %WER 2.37 [ 297 / 12533, 41 ins, 56 del, 200 sub ] exp/tri3b_online/decode_per_utt/wer_9
# The baseline WER is:
# %WER 1.92 [ 241 / 12533, 28 ins, 39 del, 174 sub ] exp/tri3b_mmi/decode/wer_4
# You can ignore the following; these were from when I was debugging a difference between the
# online and non-online decoding; the commands may be useful as examples.
# cat exp/tri3b_online/decode/log/decode.*.log | grep _ | grep -v LOG | grep -v gz | sort > foo
# cat exp/tri3b_online/decode_endpointing/log/decode.*.log | grep _ | grep -v LOG | grep -v gz | sort > bar
# diff foo bar
#gunzip -c exp/tri3b_online/decode/lat.*.gz | lattice-1best ark:- ark:- | lattice-copy ark:- ark:- | nbest-to-linear ark:- ark,t:- | grep rkm05_st0619_oct87 | show-alignments data/lang/phones.txt exp/tri3b/final.mdl ark:-
#gunzip -c exp/tri3b_online/decode_endpointing/lat.*.gz | lattice-1best ark:- ark:- | lattice-copy ark:- ark:- | nbest-to-linear ark:- ark,t:- | grep rkm05_st0619_oct87 | show-alignments data/lang/phones.txt exp/tri3b/final.mdl ark:-
# gunzip -c exp/tri3b_online/decode_endpointing/lat.*.gz | lattice-copy ark:- ark:- | lattice-to-fst ark:- "scp,p,t:echo rkm05_st0619_oct87 -|" | utils/int2sym.pl -f 3- data/lang/words.txt
#!/bin/bash
. cmd.sh
steps/online/prepare_online_decoding.sh --add-pitch true --cmd "$train_cmd" data/train data/lang \
exp/tri3b exp/tri3b_mmi/final.mdl exp/tri3b_online/ || exit 1;
steps/online/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode
steps/online/decode.sh --do-endpointing true \
--config conf/decode.config --cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode_endpointing
steps/online/decode.sh --per-utt true --config conf/decode.config \
--cmd "$decode_cmd" --nj 20 exp/tri3b/graph \
data/test exp/tri3b_online/decode_per_utt
# grep WER exp/tri3b_online/decode/wer_* | utils/best_wer.sh
# %WER 3.20 [ 401 / 12533, 67 ins, 50 del, 284 sub ] exp/tri3b_online/decode/wer_13
# grep WER exp/tri3b_online/decode_endpointing/wer_* | utils/best_wer.sh
# %WER 3.21 [ 402 / 12533, 73 ins, 48 del, 281 sub ] exp/tri3b_online/decode_endpointing/wer_13
# Treating each one as a separate utterance, we get this:
# grep WER exp/tri3b_online/decode_per_utt/wer_* | utils/best_wer.sh
# %WER 3.62 [ 454 / 12533, 80 ins, 58 del, 316 sub ] exp/tri3b_online/decode_per_utt/wer_13
# The baseline WER is:
# %WER 2.11 [ 265 / 12533, 44 ins, 40 del, 181 sub ] exp/tri3b_mmi/decode/wer_7
# You can ignore the following; these were from when I was debugging a difference between the
# online and non-online decoding; the commands may be useful as examples.
# cat exp/tri3b_online/decode/log/decode.*.log | grep _ | grep -v LOG | grep -v gz | sort > foo
# cat exp/tri3b_online/decode_endpointing/log/decode.*.log | grep _ | grep -v LOG | grep -v gz | sort > bar
# diff foo bar
#gunzip -c exp/tri3b_online/decode/lat.*.gz | lattice-1best ark:- ark:- | lattice-copy ark:- ark:- | nbest-to-linear ark:- ark,t:- | grep rkm05_st0619_oct87 | show-alignments data/lang/phones.txt exp/tri3b/final.mdl ark:-
#gunzip -c exp/tri3b_online/decode_endpointing/lat.*.gz | lattice-1best ark:- ark:- | lattice-copy ark:- ark:- | nbest-to-linear ark:- ark,t:- | grep rkm05_st0619_oct87 | show-alignments data/lang/phones.txt exp/tri3b/final.mdl ark:-
# gunzip -c exp/tri3b_online/decode_endpointing/lat.*.gz | lattice-copy ark:- ark:- | lattice-to-fst ark:- "scp,p,t:echo rkm05_st0619_oct87 -|" | utils/int2sym.pl -f 3- data/lang/words.txt
#!/bin/bash
. cmd.sh
stage=1
train_stage=-10
use_gpu=true
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
EOF
fi
parallel_opts="-l gpu=1"
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_gpu
else
# Use 4 nnet jobs just like run_4d_gpu.sh so the results should be
# almost the same, but this may be a little bit slow.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
dir=exp/nnet2_online/nnet
fi
if [ $stage -le 1 ]; then
mkdir -p exp/nnet2_online
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 10 --num-frames 200000 \
data/train 256 exp/tri3b exp/nnet2_online/diag_ubm
fi
if [ $stage -le 2 ]; then
# use a smaller iVector dim (50) than the default (100) because RM has a very