Commit 780afbb7 authored by Daniel Povey

Merge pull request #74 from naxingyu/update-hkust

update hkust recipe with more neural net scripts and add relevant results
parents 5526c218 435af6f4
@@ -7,6 +7,17 @@
exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 sub ]
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
exp/sgmm2_5a/decode/cer_11:%WER 43.44 [ 24391 / 56154, 2646 ins, 4066 del, 17679 sub ]
# nnet1 results
exp/dnn5b_pretrain-dbn_dnn/decode/cer_10:%WER 39.42 [ 22134 / 56154, 2507 ins, 3730 del, 15897 sub ]
exp/dnn5b_pretrain-dbn_dnn_smbr/decode/cer_11:%WER 36.50 [ 20499 / 56154, 1915 ins, 3312 del, 15272 sub ]
exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats/decode/cer_12:%WER 35.93 [ 20177 / 56154, 1949 ins, 3188 del, 15040 sub ]
exp/cnn5c/decode/cer_10:%WER 40.13 [ 22536 / 56154, 2329 ins, 3962 del, 16245 sub ]
exp/cnn5c_pretrain-dbn_dnn/decode/cer_10:%WER 38.80 [ 21790 / 56154, 2470 ins, 3582 del, 15738 sub ]
exp/lstm5e/decode/cer_10:%WER 37.61 [ 21121 / 56154, 1829 ins, 3941 del, 15351 sub ]
# nnet2 results
exp/nnet2_5d/decode/cer_10:%WER 38.59 [ 21669 / 56154, 2498 ins, 3581 del, 15590 sub ]
# ConvNet with 2 convolutional layers and 2 ReLU layers
exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
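# These CER figures can be collected from a finished run with the scoring
# loop at the end of run.sh:
#   for x in exp/*/decode; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done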
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh ## Source the tools/utils (import the queue.pl)
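## For reference, cmd.sh typically just exports the parallelization
## wrappers used below, e.g. (adjust for your cluster):
##   export train_cmd=run.pl    # or queue.pl with options, for a grid
##   export decode_cmd=run.pl
##   export cuda_cmd=run.pl     # should dispatch to a machine with a GPU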
dev=data_fbank/dev
train=data_fbank/train
dev_original=data/dev
train_original=data/train
gmm=exp/tri5a
stage=0
. utils/parse_options.sh || exit 1;
# Make the FBANK features
if [ $stage -le 0 ]; then
# Dev set
utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$dev $dev/log $dev/data || exit 1;
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
# Training set
utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$train $train/log $train/data || exit 1;
steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
# Split the training set
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
fi
# Run the CNN pre-training.
if [ $stage -le 1 ]; then
dir=exp/cnn5c
ali=${gmm}_ali
# Train
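# (Option semantics, stated as an assumption: '--patch-dim1 7' is the
# convolutional patch size along the frequency axis, and '--pitch-dim 3'
# marks the 3 appended pitch features so they bypass the convolution.)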
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh \
--cmvn-opts "--norm-means=true --norm-vars=true" \
--delta-opts "--delta-order=2" --splice 5 \
--network-type cnn1d --cnn-proto-opts "--patch-dim1 7 --pitch-dim 3" \
--hid-layers 2 --learn-rate 0.008 \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
fi
# Pre-train a stack of RBMs on top of the convolutional layers (2 layers, 2000 units)
if [ $stage -le 2 ]; then
dir=exp/cnn5c_pretrain-dbn
transf_cnn=exp/cnn5c/final.feature_transform_cnn # transform with convolutional layers
# Train
$cuda_cmd $dir/log/pretrain_dbn.log \
steps/nnet/pretrain_dbn.sh --nn-depth 2 --hid-dim 2000 --rbm-iter 1 \
--feature-transform $transf_cnn --input-vis-type bern \
--param-stddev-first 0.05 --param-stddev 0.05 \
$train $dir || exit 1
fi
# Re-align using CNN
if [ $stage -le 3 ]; then
dir=exp/cnn5c
steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \
$train data/lang $dir ${dir}_ali || exit 1
fi
# Train the DNN optimizing cross-entropy.
if [ $stage -le 4 ]; then
dir=exp/cnn5c_pretrain-dbn_dnn; [ ! -d $dir ] && mkdir -p $dir/log;
ali=exp/cnn5c_ali
feature_transform=exp/cnn5c/final.feature_transform
feature_transform_dbn=exp/cnn5c_pretrain-dbn/final.feature_transform
dbn=exp/cnn5c_pretrain-dbn/2.dbn
cnn_dbn=$dir/cnn_dbn.nnet
{ # Concatenate CNN layers and DBN,
num_components=$(nnet-info $feature_transform | grep -m1 num-components | awk '{print $2;}')
nnet-concat "nnet-copy --remove-first-layers=$num_components $feature_transform_dbn - |" $dbn $cnn_dbn \
2>$dir/log/concat_cnn_dbn.log || exit 1
}
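# Optional sanity check (commented out): list the components of the
# combined network to confirm the CNN layers precede the DBN layers.
# nnet-info $cnn_dbn | head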
# Train
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --feature-transform $feature_transform --dbn $cnn_dbn --hid-layers 0 \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
fi
# Sequence training using the sMBR criterion; we do stochastic GD
# with per-utterance updates. For RM a good acwt is 0.2 (for WSJ maybe 0.1).
dir=exp/cnn5c_pretrain-dbn_dnn_smbr
srcdir=exp/cnn5c_pretrain-dbn_dnn
acwt=0.1
# First we generate lattices and alignments:
if [ $stage -le 5 ]; then
steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \
$train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
$train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
# Re-train the DNN by 2 iterations of sMBR
if [ $stage -le 6 ]; then
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 2 --acwt $acwt --do-smbr true \
$train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode
for ITER in 1 2; do
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt $acwt --nnet $dir/${ITER}.nnet \
$gmm/graph $dev $dir/decode_it${ITER} || exit 1;
done
fi
# Re-generate lattices, run 4 more sMBR iterations
dir=exp/cnn5c_pretrain-dbn_dnn_smbr_i1lats
srcdir=exp/cnn5c_pretrain-dbn_dnn_smbr
if [ $stage -le 7 ]; then
steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \
$train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 10 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
$train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 8 ]; then
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \
$train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode
for ITER in 1 2 3 4; do
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt $acwt --nnet $dir/${ITER}.nnet \
$gmm/graph $dev $dir/decode_it${ITER} || exit 1;
done
fi
echo Success
exit 0
#!/bin/bash
# Copyright 2012-2014 Brno University of Technology (Author: Karel Vesely)
# 2014 Guoguo Chen
# Apache 2.0
# This example script trains a DNN on top of fMLLR features.
# The training is done in 3 stages,
#
# 1) RBM pre-training:
# in this unsupervised stage we train a stack of RBMs,
# a good starting point for frame cross-entropy training.
# 2) frame cross-entropy training:
# the objective is to classify frames to correct pdfs.
# 3) sequence-training optimizing sMBR:
# the objective is to emphasize state-sequences with better
# frame accuracy w.r.t. reference alignment.
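# In terms of this script's stages: stage 0 = fMLLR feature preparation,
# stage 1 = RBM pre-training, stage 2 = cross-entropy training, and
# stages 3-6 = sMBR sequence training.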
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh ## Source the tools/utils (import the queue.pl)
# Config:
gmmdir=exp/tri5a
data_fmllr=data-fmllr-tri5a
stage=0 # resume training with --stage=N
# End of config.
. utils/parse_options.sh || exit 1;
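# For example, to skip straight to cross-entropy training on already-prepared
# features (the script is invoked from run.sh as local/nnet/run_dnn.sh):
#   local/nnet/run_dnn.sh --stage 2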
#
if [ $stage -le 0 ]; then
# Store fMLLR features, so we can train on them easily,
# dev
dir=$data_fmllr/dev
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
--transform-dir $gmmdir/decode \
$dir data/dev $gmmdir $dir/log $dir/data || exit 1
# train
dir=$data_fmllr/train
steps/nnet/make_fmllr_feats.sh --nj 10 --cmd "$train_cmd" \
--transform-dir ${gmmdir}_ali \
$dir data/train $gmmdir $dir/log $dir/data || exit 1
# split the data : 90% train 10% cross-validation (held-out)
utils/subset_data_dir_tr_cv.sh $dir ${dir}_tr90 ${dir}_cv10 || exit 1
fi
if [ $stage -le 1 ]; then
# Pre-train DBN, i.e. a stack of RBMs
dir=exp/dnn5b_pretrain-dbn
(tail --pid=$$ -F $dir/log/pretrain_dbn.log 2>/dev/null)& # forward log
$cuda_cmd $dir/log/pretrain_dbn.log \
steps/nnet/pretrain_dbn.sh --rbm-iter 1 --nn-depth 4 --hid-dim 2000 \
$data_fmllr/train $dir || exit 1;
fi
if [ $stage -le 2 ]; then
# Train the DNN optimizing per-frame cross-entropy.
dir=exp/dnn5b_pretrain-dbn_dnn
ali=${gmmdir}_ali
feature_transform=exp/dnn5b_pretrain-dbn/final.feature_transform
dbn=exp/dnn5b_pretrain-dbn/4.dbn
(tail --pid=$$ -F $dir/log/train_nnet.log 2>/dev/null)& # forward log
# Train
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --feature-transform $feature_transform --dbn $dbn --hid-layers 0 --learn-rate 0.008 \
$data_fmllr/train_tr90 $data_fmllr/train_cv10 data/lang $ali $ali $dir || exit 1;
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt 0.1 \
$gmmdir/graph $data_fmllr/dev \
$dir/decode || exit 1;
fi
# Sequence training using the sMBR criterion; we do stochastic GD
# with per-utterance updates. We use acwt 0.1, which is usually a good value.
# Lattices are re-generated after 1st epoch, to get faster convergence.
dir=exp/dnn5b_pretrain-dbn_dnn_smbr
srcdir=exp/dnn5b_pretrain-dbn_dnn
acwt=0.1
if [ $stage -le 3 ]; then
# First we generate lattices and alignments:
steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 10 --sub-split 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 4 ]; then
# Re-train the DNN by 1 iteration of sMBR
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 1 --acwt $acwt --do-smbr true \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1; do
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev \
$dir/decode || exit 1;
done
fi
# Re-generate lattices, run 2 more sMBR iterations
dir=exp/dnn5b_pretrain-dbn_dnn_smbr_i1lats
srcdir=exp/dnn5b_pretrain-dbn_dnn_smbr
acwt=0.0909
if [ $stage -le 5 ]; then
# First we generate lattices and alignments:
#steps/nnet/align.sh --nj 10 --cmd "$train_cmd" \
# $data_fmllr/train data/lang $srcdir ${srcdir}_ali || exit 1;
steps/nnet/make_denlats.sh --nj 10 --sub-split 20 --cmd "$decode_cmd" --config conf/decode_dnn.config \
--acwt $acwt $data_fmllr/train data/lang $srcdir ${srcdir}_denlats || exit 1;
fi
if [ $stage -le 6 ]; then
# Re-train the DNN by 2 iterations of sMBR
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 2 --acwt $acwt --do-smbr true \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2; do
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev \
$dir/decode || exit 1;
done
fi
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
#!/bin/bash
# Copyright 2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains an LSTM network on FBANK features.
# The LSTM code comes from Yiayu DU and Wei Li, thanks!
. ./cmd.sh
. ./path.sh
dev=data_fbank/dev
train=data_fbank/train
dev_original=data/dev
train_original=data/train
gmm=exp/tri5a
stage=0
. utils/parse_options.sh || exit 1;
# Make the FBANK features
[ ! -e $dev ] && if [ $stage -le 0 ]; then
# Dev set
utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$dev $dev/log $dev/data || exit 1;
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
# Training set
utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$train $train/log $train/data || exit 1;
steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
# Split the training set
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
fi
if [ $stage -le 1 ]; then
# Train the DNN optimizing per-frame cross-entropy.
dir=exp/lstm5e
ali=${gmm}_ali
# Train
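# (Option semantics, to the best of my understanding: '--num-stream=4'
# trains on 4 utterance streams in parallel, and '--targets-delay=5'
# shifts the targets by 5 frames so the LSTM sees some future context.)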
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --network-type lstm --learn-rate 0.0001 \
--cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \
--train-opts "--momentum 0.9 --halving-factor 0.5" \
--delta-opts "--delta-order=2" \
--train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \
--proto-opts "--num-cells 2000 --num-recurrent 750 --num-layers 1 --clip-gradient 5.0" \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode with the trigram language model.
steps/nnet/decode.sh --nj 10 --cmd "$decode_cmd" \
--config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
fi
# TODO : sequence training,
echo Success
exit 0
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
#!/bin/bash
# This runs on the full training set (with duplicates removed), with p-norm
# units, on top of fMLLR features, on GPU.
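# Background: a p-norm unit groups its inputs and emits one value per group,
#   y_j = ( sum_{i in group j} |x_i|^p )^(1/p).
# With the settings below, --pnorm-input-dim 4000 / --pnorm-output-dim 400
# gives groups of size 10, and --p 2 makes each unit a group-wise L2 norm.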
temp_dir=
dir=exp/nnet2_5d
stage=-5
. ./cmd.sh
. ./path.sh
. utils/parse_options.sh
parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll
# likely have to change it.
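# ('--gpu 1' asks the queueing wrapper, e.g. queue.pl, for one GPU slot
# per job.)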
(
if [ ! -f $dir/final.mdl ]; then
steps/nnet2/train_pnorm_accel2.sh --parallel-opts "$parallel_opts" \
--cmd "$decode_cmd" --stage $stage \
--num-threads 1 --minibatch-size 512 \
--mix-up 20000 --samples-per-iter 300000 \
--num-epochs 15 \
--initial-effective-lrate 0.005 --final-effective-lrate 0.0005 \
--num-jobs-initial 3 --num-jobs-final 8 --num-hidden-layers 4 --splice-width 5 \
--pnorm-input-dim 4000 --pnorm-output-dim 400 --p 2 \
data/train data/lang exp/tri5a_ali $dir || exit 1;
fi
steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
--config conf/decode.config \
--transform-dir exp/tri5a/decode \
exp/tri5a/graph data/dev \
$dir/decode || exit 1;
)
@@ -145,6 +145,26 @@
steps/decode.sh --cmd "$decode_cmd" --nj 10 --config conf/decode.config \
  --transform-dir exp/tri5a/decode \
  exp/tri5a/graph data/dev exp/tri5a_mpe/decode || exit 1 ;
# SGMM system [sgmm2_5a]
steps/train_ubm.sh --cmd "$train_cmd" \
900 data/train data/lang exp/tri5a_ali exp/ubm5a || exit 1;
steps/train_sgmm2.sh --cmd "$train_cmd" \
14000 35000 data/train data/lang exp/tri5a_ali \
exp/ubm5a/final.ubm exp/sgmm2_5a || exit 1;
utils/mkgraph.sh data/lang_test exp/sgmm2_5a exp/sgmm2_5a/graph || exit 1;
steps/decode_sgmm2.sh --nj 10 --cmd "$decode_cmd" --config conf/decode.config \
--transform-dir exp/tri5a/decode \
exp/sgmm2_5a/graph data/dev exp/sgmm2_5a/decode || exit 1;
# nnet1 dnn
local/nnet/run_dnn.sh
# nnet2
local/nnet2/run_5d.sh
local/nnet2/run_convnet.sh
# getting results (see RESULTS file)
for x in exp/*/decode; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null
for x in exp/*/decode; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null