Commit 43594ae3 authored by Vijayaditya Peddinti

egs/ami: Added TDNN recipe for AMI IHM task.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5218 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 1cb64585
@@ -3,6 +3,7 @@
for x in exp/ihm/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
# sclite / asclite:
for x in exp/ihm/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done 2>/dev/null
exit 0
dev
@@ -18,3 +19,13 @@ exp/ihm/tri4a/decode_eval_ami_fsh.o3g.kn.pr1-7/ascore_13/eval_o4.ctm.filt.dtl:Pe
exp/ihm/tri4a_mmi_b0.1/decode_eval_3.mdl_ami_fsh.o3g.kn.pr1-7/ascore_12/eval_o4.ctm.filt.dtl:Percent Total Error = 31.7% (28518)
# TDNN results
for x in exp/$mic/nnet2_online/*/decode*; do [ -d $x ] && [[ $x =~ "$1" ]] && grep Sum $x/ascore_*/*.sys | utils/best_wer.sh; done
#dev
%WER 25.0 | 13098 94483 | 78.3 12.0 9.6 3.4 25.0 57.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev/ascore_13/dev_hires.ctm.filt.sys
%WER 25.3 | 13098 94468 | 78.5 12.7 8.8 3.8 25.3 57.9 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt/ascore_12/dev_hires.ctm.filt.sys
%WER 25.0 | 13098 94476 | 78.5 12.4 9.1 3.6 25.0 58.0 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_dev_utt_offline/ascore_13/dev_hires.ctm.filt.sys
#eval
%WER 25.9 | 12643 89971 | 77.2 14.2 8.6 3.2 25.9 56.4 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval/ascore_12/eval_hires.ctm.filt.sys
%WER 26.0 | 12643 89976 | 77.1 14.7 8.2 3.2 26.0 55.7 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt/ascore_12/eval_hires.ctm.filt.sys
%WER 25.8 | 12643 89978 | 77.6 14.6 7.8 3.4 25.8 55.8 | exp/ihm/nnet2_online/nnet_ms_sp_online/decode_eval_utt_offline/ascore_11/eval_hires.ctm.filt.sys
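# Key to the decode directory names (see the stages of the training script below):
# decode_* = online decoding carrying speaker information forward;
# decode_*_utt = per-utterance decoding; decode_*_utt_offline = per-utterance
# decoding with the iVector estimated over the whole utterance (--online false).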
#!/bin/bash
# this script contains some common (shared) parts of the run_nnet*.sh scripts.
stage=0
mic=ihm
num_threads_ubm=32
set -e
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
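# Note: the variables above (stage, mic, num_threads_ubm) can be overridden on the
# command line via parse_options.sh, e.g. (hypothetical invocation):
#   local/online/run_nnet2_common.sh --stage 3 --mic ihm --num-threads-ubm 16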
if [ $stage -le 1 ]; then
# Create high-resolution MFCC features (with 40 cepstra instead of 13).
# This shows how you can split data across multiple file systems; we'll split the
# MFCC dir across multiple locations. Be careful, if you have multiple copies of
# Kaldi checked out and run the same recipe, not to let them overwrite each other.
mfccdir=mfcc_${mic}_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage
fi
for datadir in train dev eval; do
utils/copy_data_dir.sh data/$mic/$datadir data/$mic/${datadir}_hires
if [ "$datadir" == "train" ]; then
dir=data/$mic/train_hires
cat $dir/wav.scp | python -c "
import sys, os, subprocess, re, random
scale_low = 1.0/8
scale_high = 2.0
for line in sys.stdin.readlines():
  if len(line.strip()) == 0:
    continue
  print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
"| sort -k1,1 -u > $dir/wav.scp_scaled || exit 1;
mv $dir/wav.scp $dir/wav.scp_nonorm
mv $dir/wav.scp_scaled $dir/wav.scp
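# At this point the original list is saved as wav.scp_nonorm, and each entry in the
# new wav.scp is piped through sox with a random gain drawn uniformly from
# [1/8, 2.0] (volume perturbation). Note that the inline snippet above uses a
# Python 2 print statement.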
fi
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/$mic/${datadir}_hires exp/make_${mic}_hires/$datadir $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$mic/${datadir}_hires exp/make_${mic}_hires/$datadir $mfccdir || exit 1;
done
fi
if [ $stage -le 2 ]; then
# Train a system just for its LDA+MLLT transform. We use --num-iters 13
# because after we get the transform (12th iter is the last), any further
# training is pointless.
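# The positional arguments below are <num-leaves> (5000), <tot-gauss> (10000),
# <data>, <lang>, <alignment-dir> and <output-dir>, following the usual
# train_lda_mllt.sh convention.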
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--realign-iters "" \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/$mic/train_hires data/lang \
exp/$mic/tri4a_ali exp/$mic/nnet2_online/tri5
fi
if [ $stage -le 3 ]; then
mkdir -p exp/nnet2_online
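# Train a 512-Gaussian diagonal UBM on a ~700k-frame subset of the data; this UBM
# is the input to the iVector extractor training in the next stage.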
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 700000 \
--num-threads $num_threads_ubm \
data/$mic/train_hires 512 exp/$mic/nnet2_online/tri5 exp/$mic/nnet2_online/diag_ubm
fi
if [ $stage -le 4 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100)
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/$mic/train_hires exp/$mic/nnet2_online/diag_ubm exp/$mic/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 5 ]; then
ivectordir=exp/$mic/nnet2_online/ivectors_train_hires
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/ami-$mic-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage
fi
# We extract iVectors on all the train data, which will be what we train the
# system on. With --utts-per-spk-max 2, the script pairs the utterances
# into twos, and treats each of these pairs as one speaker. Note that these
# are extracted 'online'.
# Having a larger number of speakers is helpful for generalization, and for
# handling per-utterance decoding well (the iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/train_hires data/$mic/train_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
data/$mic/train_hires_max2 exp/$mic/nnet2_online/extractor exp/$mic/nnet2_online/ivectors_train_hires || exit 1;
fi
exit 0;
#!/bin/bash
# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
# 2014 Tom Ko
# 2014 Vijay Peddinti
# Apache 2.0
# This example script demonstrates how speed perturbation of the data helps nnet training in the AMI IHM setup.
. ./cmd.sh
set -e
stage=1
train_stage=-10
use_gpu=true
splice_indexes="layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3"
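# Roughly, each "layerN/offsets" entry lists the frame offsets that are spliced
# together at the input of that layer; e.g. layer0/-2:-1:0:1:2 splices 5
# consecutive frames at the network input (this splicing is what makes the
# network a TDNN).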
common_egs_dir=
has_fisher=true
mic=ihm
nj=70
affix=
num_threads_ubm=32
. ./path.sh
. ./utils/parse_options.sh
if $use_gpu; then
if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed. Otherwise, call this script with --use-gpu false.
EOF
fi
parallel_opts="--gpu 1"
num_threads=1
minibatch_size=512
# (use the --affix option to distinguish runs with different parameters.)
else
# Without a GPU we train with multi-threaded CPU jobs; the results should be
# almost the same, but training may be quite a bit slower.
num_threads=16
minibatch_size=128
parallel_opts="-pe smp $num_threads"
fi
dir=exp/$mic/nnet2_online/nnet_ms_sp${affix:+_$affix}
final_lm=`cat data/local/lm/final_lm`
LM=$final_lm.pr1-7
graph_dir=exp/$mic/tri4a/graph_${LM}
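# ${affix:+_$affix} appends "_<affix>" only when --affix is given; final_lm holds
# the LM name written during data preparation (e.g. ami_fsh.o3g.kn, as in the
# RESULTS above), and .pr1-7 selects its pruned variant, which the tri4a decoding
# graph was built with.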
# Run the common stages of training, including training the iVector extractor
local/online/run_nnet2_common.sh --stage $stage --mic $mic \
--num-threads-ubm $num_threads_ubm || exit 1;
if [ $stage -le 6 ]; then
# Although the nnet will be trained on high-resolution data, we still have to perturb the normal (low-resolution) data to get the alignments.
# _sp stands for speed-perturbed
utils/perturb_data_dir_speed.sh 0.9 data/$mic/train data/$mic/temp1
utils/perturb_data_dir_speed.sh 1.0 data/$mic/train data/$mic/temp2
utils/perturb_data_dir_speed.sh 1.1 data/$mic/train data/$mic/temp3
utils/combine_data.sh --extra-files utt2uniq data/$mic/train_sp data/$mic/temp1 data/$mic/temp2 data/$mic/temp3
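# utt2uniq maps each speed-perturbed utterance id back to its original utterance,
# so downstream scripts can keep the perturbed copies of an utterance together
# (e.g. when holding out validation data).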
rm -r data/$mic/temp1 data/$mic/temp2 data/$mic/temp3
mfccdir=mfcc_${mic}_perturbed
for x in train_sp; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj \
data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_mfcc/$x $mfccdir || exit 1;
done
utils/fix_data_dir.sh data/$mic/train_sp
fi
if [ $stage -le 7 ]; then
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
data/$mic/train_sp data/lang exp/$mic/tri4a exp/$mic/tri4a_sp_ali || exit 1
fi
if [ $stage -le 8 ]; then
# Now perturb the high-resolution data.
utils/perturb_data_dir_speed.sh 0.9 data/$mic/train_hires data/temp1
utils/perturb_data_dir_speed.sh 1.0 data/$mic/train_hires data/temp2
utils/perturb_data_dir_speed.sh 1.1 data/$mic/train_hires data/temp3
utils/combine_data.sh --extra-files utt2uniq data/$mic/train_hires_sp data/temp1 data/temp2 data/temp3
rm -r data/temp1 data/temp2 data/temp3
mfccdir=mfcc_${mic}_perturbed
for x in train_hires_sp; do
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --mfcc-config conf/mfcc_hires.conf \
data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/$mic/$x exp/make_${mic}_hires/$x $mfccdir || exit 1;
done
utils/fix_data_dir.sh data/$mic/train_hires_sp
fi
if [ $stage -le 9 ]; then
# We extract iVectors on all the train data, which will be what we
# train the system on.
# Having a larger number of speakers is helpful for generalization, and for
# handling per-utterance decoding well (the iVector starts at zero).
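# Note: the output directory name below (ivectors_train_hires_sp2) has to match
# the --online-ivector-dir passed to the training stage that follows.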
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/$mic/train_hires_sp data/$mic/train_hires_sp_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj $nj \
data/$mic/train_hires_sp_max2 exp/$mic/nnet2_online/extractor exp/$mic/nnet2_online/ivectors_train_hires_sp2 || exit 1;
fi
if [ $stage -le 10 ]; then
steps/nnet2/train_multisplice_accel2_fix.sh --stage $train_stage \
--num-epochs 3 --num-jobs-initial 2 --num-jobs-final 12 \
--num-hidden-layers 6 --splice-indexes "$splice_indexes" \
--feat-type raw \
--online-ivector-dir exp/$mic/nnet2_online/ivectors_train_hires_sp2 \
--cmvn-opts "--norm-means=false --norm-vars=false" \
--num-threads "$num_threads" \
--minibatch-size "$minibatch_size" \
--parallel-opts "$parallel_opts" \
--io-opts "--max-jobs-run 12" \
--add-layers-period 1 \
--initial-effective-lrate 0.0015 --final-effective-lrate 0.00015 \
--cmd "$decode_cmd" \
--egs-dir "$common_egs_dir" \
--pnorm-input-dim 950 \
--pnorm-output-dim 950 \
data/$mic/train_hires_sp data/lang exp/$mic/tri4a_sp_ali $dir || exit 1;
fi
if [ $stage -le 11 ]; then
# If this setup used PLP features, we'd have to give the option --feature-type plp
# to the script below.
steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
data/lang exp/$mic/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi
wait;
if [ $stage -le 12 ]; then
# do the actual online decoding with iVectors, carrying info forward from
# previous utterances of the same speaker.
for decode_set in dev eval; do
(
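# one decoding job per speaker: count the unique speaker ids in utt2spk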
num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
decode_dir=${dir}_online/decode_${decode_set}
steps/online/nnet2/decode.sh --config conf/decode.conf --cmd "$decode_cmd" --nj $num_jobs \
$graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1;
) &
done
fi
if [ $stage -le 13 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information.
for decode_set in dev eval; do
(
num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
decode_dir=${dir}_online/decode_${decode_set}_utt
steps/online/nnet2/decode.sh --config conf/decode.conf --cmd "$decode_cmd" --nj $num_jobs \
--per-utt true $graph_dir data/$mic/${decode_set}_hires $decode_dir || exit 1;
) &
done
fi
if [ $stage -le 14 ]; then
# this version of the decoding treats each utterance separately
# without carrying forward speaker information, but looks to the end
# of the utterance while computing the iVector (--online false)
for decode_set in dev eval; do
(
num_jobs=`cat data/$mic/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l`
decode_dir=${dir}_online/decode_${decode_set}_utt_offline
steps/online/nnet2/decode.sh --config conf/decode.conf --cmd "$decode_cmd" --nj $num_jobs \
--per-utt true --online false $graph_dir data/$mic/${decode_set}_hires \
$decode_dir || exit 1;
) &
done
fi
wait;
exit 0;
@@ -172,4 +172,9 @@ if [ $stage -le 12 ]; then
local/nnet/run_dnn.sh $mic
fi
# TDNN training.
if [ $stage -le 13 ]; then
local/online/run_nnet2_ms_perturbed.sh --mic $mic
fi
echo "Done!" echo "Done!"