Commit 189c7741 authored by Karel Vesely

trunk,nnet1: updating the 'blocksoftmax' example (renamed from 'multisoftmax'),

- 'blocksoftmax' is the better name, since it matches the name of the output component that is used,
- we use the multitask objective function, so we can see the performance on the individual tasks (see the sketch below the commit metadata),
- also adding some older LSTM results on tedlium; we are not actively working on this recipe now,



git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5222 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent e8ada3c8
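For orientation before the diff: the renamed recipe (egs/rm/s5/local/nnet/run_blocksoftmax.sh) trains a single DNN whose output layer is a '<BlockSoftmax>' with one block per task (RM and WSJ), and the per-block cross-entropies are combined by the 'multitask' objective of nnet-train-frmshuff. The sketch below only condenses options that appear in the hunks further down; treating $output_dim as the sum of the two block dimensions is an assumption here, since that definition falls outside the shown context.

# Minimal sketch of the '<BlockSoftmax>' + 'multitask' wiring, condensed from the recipe hunks below,
# (variable definitions outside the shown hunks are assumptions, not new functionality):
ali1_dim=$(hmm-info ${gmm}_ali/final.mdl | grep pdfs | awk '{ print $NF }')  # task 1: RM pdfs,
ali2_dim=$(hmm-info ${wsj_ali}/final.mdl | grep pdfs | awk '{ print $NF }')  # task 2: WSJ pdfs,
output_dim=$((ali1_dim + ali2_dim))  # assumed: one softmax block per task, concatenated,
objw1=1; objw2=0.1  # lower weight for the auxiliary 'wsj' task,

$cuda_cmd $dir/log/train_nnet.log \
  steps/nnet/train.sh \
    --labels "scp:$dir/pasted_post.scp" --num-tgt $output_dim \
    --proto-opts "--block-softmax-dims='$ali1_dim:$ali2_dim'" \
    --train-tool "nnet-train-frmshuff --objective-function 'multitask,xent,$ali1_dim,$objw1,xent,$ali2_dim,$objw2'" \
    --learn-rate 0.008 \
    ${train_tr90_wsj} ${train}_cv10 lang-dummy ali-dummy ali-dummy $dir

The objective-function string is a list of (loss-type, dimension, weight) triples, one per block, which is what lets the training log report the performance of each task separately.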
@@ -20,8 +20,7 @@ cuda_cmd="queue.pl -l arch=*64 -l gpu=1"
# BUT cluster:
host=$(hostname -f)
if [ ${host#*.} == "fit.vutbr.cz" ]; then
if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then
queue="all.q@@blade,all.q@@speech"
gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*"
storage="matylda5"
@@ -3,7 +3,7 @@
# Copyright 2012-2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains DNN with <MultiSoftmax> output on top of FBANK features.
# This example script trains DNN with <BlockSoftmax> output on top of FBANK features.
# The network is trained on RM and WSJ84 simultaneously.
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
@@ -11,10 +11,10 @@
. ./path.sh ## Source the tools/utils (import the queue.pl)
dev=data-fbank-multisoftmax/test
train=data-fbank-multisoftmax/train
wsj=data-fbank-multisoftmax/wsj
train_tr90_wsj=data-fbank-multisoftmax/train_tr90_wsj
dev=data-fbank-blocksoftmax/test
train=data-fbank-blocksoftmax/train
wsj=data-fbank-blocksoftmax/wsj
train_tr90_wsj=data-fbank-blocksoftmax/train_tr90_wsj
dev_original=data/test
train_original=data/train
@@ -36,33 +36,33 @@ set -x
# Make the FBANK features,
if [ $stage -le 0 ]; then
# Make datadir copies,
utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp 2>/dev/null
utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp 2>/dev/null
utils/copy_data_dir.sh --utt-prefix wsj --spk-prefix wsj $wsj_original $wsj || exit 1; rm $wsj/{cmvn,feats}.scp 2>/dev/null
utils/copy_data_dir.sh $dev_original $dev; rm $dev/{cmvn,feats}.scp 2>/dev/null
utils/copy_data_dir.sh $train_original $train; rm $train/{cmvn,feats}.scp 2>/dev/null
utils/copy_data_dir.sh --utt-prefix wsj_ --spk-prefix wsj_ $wsj_original $wsj; rm $wsj/{cmvn,feats}.scp 2>/dev/null
# Feature extraction,
# Dev set,
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$dev $dev/log $dev/data || exit 1;
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
$dev $dev/log $dev/data
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data
# Training set,
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
$train $train/log $train/data || exit 1;
steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
$train $train/log $train/data
steps/compute_cmvn_stats.sh $train $train/log $train/data
# Wsj,
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
$wsj $wsj/log $wsj/data || exit 1;
steps/compute_cmvn_stats.sh $wsj $wsj/log $wsj/data || exit 1;
$wsj $wsj/log $wsj/data
steps/compute_cmvn_stats.sh $wsj $wsj/log $wsj/data
# Split the rm training set,
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10 || exit 1
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
# Merge-in the wsj set with train-set,
utils/combine_data.sh $train_tr90_wsj ${train}_tr90 $wsj || exit 1
utils/combine_data.sh $train_tr90_wsj ${train}_tr90 $wsj
fi
# Prepare the merged targets,
dir=exp/dnn4e-fbank_multisoftmax
dir=exp/dnn4e-fbank_blocksoftmax
ali1_dim=$(hmm-info ${gmm}_ali/final.mdl | grep pdfs | awk '{ print $NF }')
ali2_dim=$(hmm-info ${wsj_ali}/final.mdl | grep pdfs | awk '{ print $NF }')
#
@@ -73,25 +73,26 @@ ali1_dir=${gmm}_ali
#
if [ $stage -le 1 ]; then
mkdir -p $dir/log
copy-int-vector "ark:gzcat ${wsj_ali}/ali.*.gz |" ark,t:- | awk -v prefix=wsj '{ $1=prefix $1; print; }' | \
gzip -c >$dir/ali_wsj.gz # Mapping keys at wsj alignment,
# Mapping keys in wsj alignment to have prefix 'wsj_',
copy-int-vector "ark:gzcat ${wsj_ali}/ali.*.gz |" ark,t:- | awk -v prefix=wsj_ '{ $1=prefix $1; print; }' | \
gzip -c >$dir/ali_wsj.gz
# Store posteriors to disk, indexed by 'scp',
# Store single-stream posteriors to disk, indexed by 'scp' for pasting w/o caching,
ali-to-pdf ${gmm}_ali/final.mdl "ark:gzcat ${gmm}_ali/ali.*.gz |" ark:- | \
ali-to-post ark:- ark,scp:$dir/post1.ark,$dir/post1.scp
ali-to-pdf ${wsj_ali}/final.mdl "ark:gzcat $dir/ali_wsj.gz |" ark:- | \
ali-to-post ark:- ark,scp:$dir/post2.ark,$dir/post2.scp
# Paste the posteriors from the 'scp' inputs,
featlen="ark:feat-to-len 'scp:cat $train/feats.scp $wsj/feats.scp |' ark,t:- |"
post1=scp:$dir/post1.scp
post2=scp:$dir/post2.scp
paste-post --allow-partial=true "$featlen" $ali1_dim:$ali2_dim "$post1" "$post2" \
paste-post --allow-partial=true "$featlen" $ali1_dim:$ali2_dim \
scp:$dir/post1.scp scp:$dir/post2.scp \
ark,scp:$dir/pasted_post.ark,$dir/pasted_post.scp 2>$dir/log/paste_post.log
fi
# Train <MultiSoftmax> system,
# Train NN with '<BlockSoftmax>' output, using the 'MultiTask' objective function,
objw1=1; objw2=0.1; # we'll use lower weight for 'wsj' data,
if [ $stage -le 2 ]; then
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh \
@@ -99,23 +100,24 @@ if [ $stage -le 2 ]; then
--delta-opts "--delta-order=2" --splice 5 \
--labels "scp:$dir/pasted_post.scp" --num-tgt $output_dim \
--proto-opts "--block-softmax-dims='$ali1_dim:$ali2_dim'" \
--train-tool "nnet-train-frmshuff --objective-function 'multitask,xent,$ali1_dim,$objw1,xent,$ali2_dim,$objw2'" \
--learn-rate 0.008 \
${train_tr90_wsj} ${train}_cv10 lang-dummy ali-dummy ali-dummy $dir || exit 1;
${train_tr90_wsj} ${train}_cv10 lang-dummy ali-dummy ali-dummy $dir
# Create files used in decoding, missing due to --labels use,
analyze-counts --binary=false "$ali1_pdf" $dir/ali_train_pdf.counts || exit 1
copy-transition-model --binary=false $ali1_dir/final.mdl $dir/final.mdl || exit 1
cp $ali1_dir/tree $dir/tree || exit 1
# Rebuild network, <MultiSoftmax> is removed, and neurons from 1st block are selected,
analyze-counts --binary=false "$ali1_pdf" $dir/ali_train_pdf.counts
copy-transition-model --binary=false $ali1_dir/final.mdl $dir/final.mdl
cp $ali1_dir/tree $dir/tree
# Rebuild network, <BlockSoftmax> is removed, and neurons from 1st block are selected,
nnet-concat "nnet-copy --remove-last-components=1 $dir/final.nnet - |" \
"echo '<Copy> <InputDim> $output_dim <OutputDim> $ali1_dim <BuildVector> 1:$ali1_dim </BuildVector>' | nnet-initialize - - |" \
$dir/final.nnet.lang1 || exit 1
# Decode (reuse HCLG graph)
$dir/final.nnet.lang1
# Decode (reuse HCLG graph),
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--nnet $dir/final.nnet.lang1 \
$gmm/graph $dev $dir/decode || exit 1;
$gmm/graph $dev $dir/decode
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
--nnet $dir/final.nnet.lang1 \
$gmm/graph_ug $dev $dir/decode_ug || exit 1;
$gmm/graph_ug $dev $dir/decode_ug
fi
exit 0
@@ -137,12 +139,12 @@ if [ $stage -le 3 ]; then
--cmvn-opts "--norm-means=true --norm-vars=true" \
--delta-opts "--delta-order=2" --splice 5 \
--learn-rate 0.008 \
${train}_tr90 ${train}_cv10 data/lang ${gmm}_ali ${gmm}_ali $dir || exit 1;
${train}_tr90 ${train}_cv10 data/lang ${gmm}_ali ${gmm}_ali $dir
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
$gmm/graph $dev $dir/decode
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph_ug $dev $dir/decode_ug || exit 1;
$gmm/graph_ug $dev $dir/decode_ug
fi
echo Success
@@ -4,7 +4,7 @@ filter_regexp=.
[ $# -ge 1 ] && filter_regexp=$1
for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null
for x in exp/{mono,tri,sgmm,nnet,dnn}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
for x in exp/{mono,tri,sgmm,nnet,dnn,lstm}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp
exit 0
# Results from Nikolay, using kaldi scoring:
@@ -103,7 +103,18 @@ exit 0
%WER 17.6 | 1155 27512 | 85.9 11.1 3.0 3.5 17.6 87.8 | -0.210 | exp/nnet2_online/nnet_ms_sp_online/decode_test_utt/score_12/ctm.filt.filt.sys
%WER 17.2 | 1155 27512 | 86.5 10.8 2.7 3.7 17.2 87.4 | -0.236 | exp/nnet2_online/nnet_ms_sp_online/decode_test_utt_offline/score_11/ctm.filt.filt.sys
# new dict, lm,
%WER 13.3 | 1155 27512 | 88.7 8.7 2.6 2.0 13.3 81.5 | -0.097 | exp/nnet2_online/nnet_ms_sp/decode_test/score_10/ctm.filt.filt.sys
%WER 13.2 | 1155 27512 | 88.6 8.5 2.8 1.9 13.2 81.6 | -0.102 | exp/nnet2_online/nnet_ms_sp_online/decode_test/score_11/ctm.filt.filt.sys
%WER 13.6 | 1155 27512 | 88.5 8.9 2.6 2.1 13.6 82.7 | -0.095 | exp/nnet2_online/nnet_ms_sp_online/decode_test_utt/score_10/ctm.filt.filt.sys
### LSTM vs. DNN ###
# DNN on MFCC-fMLLR
%WER 19.1 | 1155 27512 | 84.4 12.1 3.5 3.5 19.1 90.0 | -0.025 | exp/dnn4_pretrain-dbn_dnn/decode_test/score_12/ctm.filt.filt.sys
# DNN on FBANK-pitch (pitch compensates for the degradation from not having fMLLR),
%WER 19.2 | 1155 27512 | 84.4 12.3 3.3 3.6 19.2 89.2 | -0.021 | exp/dnn4d-fbank_pretrain-dbn_dnn/decode_test/score_12/ctm.filt.filt.sys
# LSTM
%WER 20.3 | 1155 27512 | 83.3 13.2 3.5 3.6 20.3 90.7 | -0.176 | exp/lstm4f_ClipGradient5_lrate1e-4/decode_test/score_11/ctm.filt.filt.sys
# 2xLSTM
TODO...
@@ -51,7 +51,7 @@ if [ $stage -le 1 ]; then
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
steps/nnet/decode.sh --nj 11 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode_test || exit 1;
fi
@@ -94,7 +94,7 @@ Besides the DNN recipe, there are also other example scripts which can be handy:
- CNN : egs/rm/s5/local/nnet/run_cnn.sh , (CNN = Convolutional Neural Network, <a href="www.cs.toronto.edu/~asamir/papers/icassp13_cnn.pdf">see paper</a>, we have 1D convolution on frequency axis)
- Autoencoder training : egs/timit/s5/local/nnet/run_autoencoder.sh
- Tandem system : egs/swbd/s5c/local/nnet/run_dnn_tandem_uc.sh , (uc = Universal context network, <a href="http://www.fit.vutbr.cz/research/groups/speech/publi/2011/vesely_asru2011_00042.pdf">see paper</a>)
- Multilingual/Multitask : egs/rm/s5/local/nnet/run_multisoftmax.sh, (Network with <BlockSoftmax> output trained on RM and WSJ, same C++ design as was used in <a href="http://www.fit.vutbr.cz/research/groups/speech/publi/2012/vesely_slt2012_0000336.pdf">SLT2012 paper</a>)
- Multilingual/Multitask : egs/rm/s5/local/nnet/run_blocksoftmax.sh, (Network with <BlockSoftmax> output trained on RM and WSJ, same C++ design as was used in <a href="http://www.fit.vutbr.cz/research/groups/speech/publi/2012/vesely_slt2012_0000336.pdf">SLT2012 paper</a>)
<hr><!-- #################################################################################################################### -->
@@ -59,16 +59,16 @@ inline void CountCorrectFramesWeighted(const CuArray<T> &v1,
void Xent::Eval(const VectorBase<BaseFloat> &frame_weights,
const CuMatrixBase<BaseFloat> &net_out,
const CuMatrixBase<BaseFloat> &target,
const CuMatrixBase<BaseFloat> &targets,
CuMatrix<BaseFloat> *diff) {
// check inputs,
KALDI_ASSERT(net_out.NumCols() == target.NumCols());
KALDI_ASSERT(net_out.NumRows() == target.NumRows());
KALDI_ASSERT(net_out.NumCols() == targets.NumCols());
KALDI_ASSERT(net_out.NumRows() == targets.NumRows());
KALDI_ASSERT(net_out.NumRows() == frame_weights.Dim());
KALDI_ASSERT(KALDI_ISFINITE(frame_weights.Sum()));
KALDI_ASSERT(KALDI_ISFINITE(net_out.Sum()));
KALDI_ASSERT(KALDI_ISFINITE(target.Sum()));
KALDI_ASSERT(KALDI_ISFINITE(targets.Sum()));
double num_frames = frame_weights.Sum();
KALDI_ASSERT(num_frames >= 0.0);
@@ -76,30 +76,38 @@ void Xent::Eval(const VectorBase<BaseFloat> &frame_weights,
// get frame_weights to GPU,
frame_weights_ = frame_weights;
// There may be frames for which the sum of targets is zero.
// This happens in multi-lingual training when the frame
// has target class in the softmax of another language.
// We 'switch-off' such frames by masking the 'frame_weights_',
target_sum_.Resize(targets.NumRows());
target_sum_.AddColSumMat(1.0, targets, 0.0);
frame_weights_.MulElements(target_sum_);
// compute derivative wrt. activations of last layer of neurons,
*diff = net_out;
diff->AddMat(-1.0, target);
diff->AddMat(-1.0, targets);
diff->MulRowsVec(frame_weights_); // weighting,
// evaluate the frame-level classification,
double correct;
net_out.FindRowMaxId(&max_id_out_); // find max in nn-output
target.FindRowMaxId(&max_id_tgt_); // find max in targets
targets.FindRowMaxId(&max_id_tgt_); // find max in targets
CountCorrectFramesWeighted(max_id_out_, max_id_tgt_, frame_weights, &correct);
// calculate cross_entropy (in GPU),
xentropy_aux_ = net_out; // y
xentropy_aux_.Add(1e-20); // avoid log(0)
xentropy_aux_.ApplyLog(); // log(y)
xentropy_aux_.MulElements(target); // t*log(y)
xentropy_aux_.MulElements(targets); // t*log(y)
xentropy_aux_.MulRowsVec(frame_weights_); // w*t*log(y)
double cross_entropy = -xentropy_aux_.Sum();
// calculate entropy (in GPU),
entropy_aux_ = target; // t
entropy_aux_ = targets; // t
entropy_aux_.Add(1e-20); // avoid log(0)
entropy_aux_.ApplyLog(); // log(t)
entropy_aux_.MulElements(target); // t*log(t)
entropy_aux_.MulElements(targets); // t*log(t)
entropy_aux_.MulRowsVec(frame_weights_); // w*t*log(t)
double entropy = -entropy_aux_.Sum();
@@ -96,6 +96,7 @@ class Xent : public LossItf {
// weighting buffer,
CuVector<BaseFloat> frame_weights_;
CuVector<BaseFloat> target_sum_;
// loss computation buffers
CuMatrix<BaseFloat> tgt_mat_;