Commit 43af891e authored by Vimal Manohar's avatar Vimal Manohar
Browse files

trunk: egs/gale_arabic: Adding online nnet2 recipe for Gale Arabic system

git-svn-id: 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 9668d086
decode_dnn.config
\ No newline at end of file
# config for high-resolution MFCC features, intended for neural network training.
# Note: we keep all cepstra, so it has the same info as filterbank features,
# but MFCC is more easily compressible (because less correlated) which is why
# we prefer this method.
--use-energy=false # use average of log energy, not energy.
--low-freq=40 # low cutoff frequency for mel bins
--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
# configuration file for apply-cmvn-online, used in the script ../local/online/
## This config is given by conf/ to the program compute-and-process-kaldi-pitch-feats,
## and is copied by steps/online/nnet2/ and similar scripts, to be given
## to programs like online2-wav-nnet2-latgen-faster.
## The program compute-and-process-kaldi-pitch-feats will use it to compute pitch features that
## are the same as those which will be generated in online decoding; this enables us to train
## in a way that's compatible with online decoding.
## most of these options relate to the post-processing rather than the pitch
## extraction itself.
--add-raw-log-pitch=true ## this is intended for input to neural nets, so our
## approach is "throw everything in and see what
## sticks".
--normalization-right-context=75 # We're removing almost all the right-context
# for the normalization. The reason why we
# include a small nonzero right-context (of
# just 0.1 second) is that by adding a little
# latency to the computation, it enables us to
# get a more accurate estimate of the pitch on
# the frame we're currently computing the
# normalized pitch of. We know for the current
# frame that we will have at least 10 frames to
# the right, and those extra 10 frames will
# increase the quality of the Viterbi
# backtrace.
# Note: our changes to the (left,right) context
# from the defaults of (75,75) to (100,10) will
# almost certainly worsen results, but will
# reduce latency.
--frames-per-chunk=10 ## relates to offline simulation of online decoding; 1
## would be equivalent to getting in samples one by
## one.
--simulate-first-pass-online=true ## this makes the online-pitch-extraction code
## output the 'first-pass' features, which
## are less accurate than the final ones, and
## which are the only features the neural-net
## decoding would ever see (since we can't
## afford to do lattice rescoring in the
## neural-net code).
--delay=0 ## We delay all the pitch information by 5 frames. This is almost
## certainly not helpful, but it helps to reduce the overall latency
## added by the pitch computation, from 10 (given by
## --normalization-right-context) to 10 - 5 = 5.
# Copyright 2014 Vimal Manohar
# This is our online neural net build for Gale system
. ./
. ./utils/
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false
parallel_opts="-l gpu=1"
# the _a is in case I want to change the parameters.
# Use 4 nnet jobs just like so the results should be
# almost the same, but this may be a little bit slow.
parallel_opts="-pe smp $num_threads"
if [ $stage -le 0 ]; then
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
if [[ $(hostname -f) == * ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/ /export/b0{1,2,3,4}/$USER/kaldi-data/egs/gale-$date/s5/$mfccdir/storage $mfccdir/storage || exit 1
utils/ data/train data/train_hires || exit 1
steps/ --nj $train_nj --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
steps/ data/train_hires exp/make_hires/train $mfccdir || exit 1;
if [ $stage -le 1 ]; then
# we'll use the features with just MFCC, no pitch, to train the iVector
# extractor on. Check that we're using 40-dim features so the command line is correct.
! grep 'num-ceps=40' conf/mfcc_hires.conf >/dev/null && \
echo "Change the script if you change conf/mfcc_hires.conf" && exit 1;
steps/ --nj 5 --cmd "$train_cmd" 0-39 data/train_hires \
data/train_hires_mfcconly exp/nnet2_online/select_hires_train $mfccdir || exit 1
steps/ data/train_hires_mfcconly exp/nnet2_online/select_hires_train $mfccdir || exit 1
# Make a subset of about 1/3 of the data.
utils/ data/train_hires_mfcconly 100000 \
data/train_hires_mfcconly_100k || exit 1
# make a corresponding subset of normal-dimensional-MFCC training data.
utils/ --utt-list <(awk '{print $1}' data/train_hires_mfcconly_100k/utt2spk) \
data/train data/train_100k || exit 1
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. First align the data of the 100k subset using
# the tri3b system and normal MFCC features, so we have alignments to build our
# system on hires MFCCs on top of.
steps/ --nj $train_nj --cmd "$train_cmd" \
data/train_100k data/lang exp/tri3b exp/tri3b_ali_100k || exit 1;
# Build a small LDA+MLLT system on top of the hires MFCC features, just
# because we need the transform. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
steps/ --cmd "$train_cmd" --num-iters 13 --realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_hires_mfcconly_100k data/lang exp/tri3b_ali_100k exp/nnet2_online/tri4a || exit 1
if [ $stage -le 3 ]; then
# Train a diagonal UBM. The input directory exp/nnet2_online/tri4a is only
# needed for the splice-opts and the LDA+MLLT transform.
steps/online/nnet2/ --cmd "$train_cmd" --nj $train_nj --num-frames 400000 \
data/train_hires_mfcconly_100k 512 exp/nnet2_online/tri4a exp/nnet2_online/diag_ubm || exit 1
if [ $stage -le 4 ]; then
# train an iVector extractor on all the mfcconly data. Note: although we use
# only 10 jobs, each job uses 16 processes in total.
steps/online/nnet2/ --cmd "$train_cmd" --nj 10 \
data/train_hires_mfcconly exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
if [ $stage -le 5 ]; then
# extract iVectors for the training data.
if [[ $(hostname -f) == * ]] && [ ! -d $ivectordir/storage ]; then # this shows how you can split across multiple file-systems.
utils/ /export/b0{1,2,3,4}/$USER/kaldi-data/egs/gale/s5/$ivectordir/storage $ivectordir/storage || exit 1
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/ --utts-per-spk-max 2 data/train_hires_mfcconly data/train_hires_mfcconly_max2 || exit 1
steps/online/nnet2/ --cmd "$train_cmd" --nj $train_nj \
data/train_hires_mfcconly_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
if [ $stage -le 6 ]; then
# this shows how you can split across multiple file-systems.
if [[ $(hostname -f) == * ]] && [ ! -d $dir/egs/storage ]; then
utils/ /export/b0{1,2,3,4}/$USER/kaldi-online/egs/bolt/s5/$dir/egs $dir/egs/storage || exit 1
# Because we have a lot of data here and we don't want the training to take
# too long, we reduce the number of epochs from the defaults (15) to (8).
# The option "--io-opts '-tc 12'" is to have more than the default number
# (5) of jobs dumping the egs to disk; this is OK since we're splitting our
# data across four filesystems for speed.
steps/nnet2/ --stage $train_stage \
--num-epochs 8 \
--samples-per-iter 400000 \
--splice-width 7 --feat-type raw \
--online-ivector-dir exp/nnet2_online/ivectors_train \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "-tc 12" \
--num-jobs-nnet 6 \
--num-hidden-layers 4 \
--mix-up 12000 \
--initial-learning-rate 0.06 --final-learning-rate 0.006 \
--cmd "$decode_cmd" \
--pnorm-input-dim 3000 \
--pnorm-output-dim 300 \
data/train_hires data/lang exp/tri3b $dir || exit 1;
if [ $stage -le 7 ]; then
steps/online/nnet2/ --mfcc-config conf/mfcc_hires.conf \
--add-pitch true data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
if [ $stage -le 8 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
steps/online/nnet2/ --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
exp/tri3b/graph data/test ${dir}_online/decode_test || exit 1;
if [ $stage -le 9 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
steps/online/nnet2/ --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
--per-utt true \
exp/tri3b/graph data/test ${dir}_online/decode_test_utt || exit 1;
if [ $stage -le 10 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information, but looks to the end
# of the utterance while computing the iVector.
steps/online/nnet2/ --config conf/decode.config --cmd "$decode_cmd" --nj $decode_nj \
--per-utt true --online false \
exp/tri3b/graph data/test ${dir}_online/decode_test_utt_offline || exit 1;
exit 0;
