Commit 757064ad authored by Dan Povey's avatar Dan Povey
Browse files

trunk: some script changes, fixes and extensions RE multisplice. still no results posted.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4585 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 1198ecdb
......@@ -27,68 +27,10 @@ minibatch_size=512
dir=exp/nnet2_online/nnet_a_gpu
mkdir -p exp/nnet2_online
if [ $stage -le 1 ]; then
# this shows how you can split across multiple file-systems. we'll split the
# MFCC dir across multiple locations. You might want to be careful here, if you
# have multiple copies of Kaldi checked out and run the same recipe, not to let
# them overwrite each other.
mfccdir=mfcc
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
date=$(date +'%m_%d_%H_%M')
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage $mfccdir/storage
fi
utils/copy_data_dir.sh data/train data/train_hires
steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
--cmd "$train_cmd" data/train_hires exp/make_hires/train $mfccdir || exit 1;
steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train $mfccdir || exit 1;
utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
# want the 100k subset to exactly match train_100k, since we'll use its alignments.
awk '{print $1}' data/train_100k/utt2spk > uttlist
utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
rm uttlist
fi
if [ $stage -le 2 ]; then
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
--splice-opts "--left-context=3 --right-context=3" \
5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
fi
if [ $stage -le 3 ]; then
# To train a diagonal UBM we don't need very much data, so use the smallest
# subset. the input directory exp/nnet2_online/tri5a is only needed for
# the splice-opts and the LDA transform.
steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
fi
if [ $stage -le 4 ]; then
# iVector extractors can in general be sensitive to the amount of data, but
# this one has a fairly small dim (defaults to 100) so we don't use all of it,
# we use just the 100k subset (about one sixteenth of the data).
steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi
if [ $stage -le 5 ]; then
ivectordir=exp/nnet2_online/ivectors_train
if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage $ivectordir/storage
fi
# having a larger number of speakers is helpful for generalization, and to
# handle per-utterance decoding well (iVector starts at zero).
steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
data/train_hires_max2 exp/nnet2_online/extractor $ivectordir || exit 1;
fi
# Stages 1 through 5 are done in run_nnet2_common.sh,
# so it can be shared with other similar scripts.
local/online/run_nnet2_common.sh --stage $stage
if [ $stage -le 6 ]; then
if [ $USER == dpovey ]; then # this shows how you can split across multiple file-systems.
......
#!/bin/bash
# Make the high-resolution MFCC features and build the online iVector
# extractor (stages 1-5 of the online-nnet2 Fisher English recipe).
# Usage: local/online/run_nnet2_common.sh [--stage N]
# Requires: cmd.sh (defines $train_cmd), path.sh, data/train, exp/tri4a.

stage=1

set -e
. ./cmd.sh   # defines $train_cmd.  (The original sourced cmd.sh twice; once suffices.)
. ./path.sh
. ./utils/parse_options.sh

mkdir -p exp/nnet2_online

if [ $stage -le 1 ]; then
  # this shows how you can split across multiple file-systems.  we'll split the
  # MFCC dir across multiple locations.  You might want to be careful here, if you
  # have multiple copies of Kaldi checked out and run the same recipe, not to let
  # them overwrite each other.
  mfccdir=mfcc
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d "$mfccdir/storage" ]; then
    date=$(date +'%m_%d_%H_%M')
    # the brace expansion below must stay unquoted so it expands to four paths.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english-$date/s5/$mfccdir/storage "$mfccdir/storage"
  fi
  utils/copy_data_dir.sh data/train data/train_hires
  steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
    --cmd "$train_cmd" data/train_hires exp/make_hires/train "$mfccdir" || exit 1;
  steps/compute_cmvn_stats.sh data/train_hires exp/make_hires/train "$mfccdir" || exit 1;
  utils/subset_data_dir.sh data/train_hires 30000 data/train_hires_30k
  # want the 100k subset to exactly match train_100k, since we'll use its alignments.
  awk '{print $1}' data/train_100k/utt2spk > uttlist
  utils/subset_data_dir.sh --utt-list uttlist data/train_hires data/train_hires_100k
  rm uttlist
fi

if [ $stage -le 2 ]; then
  # We need to build a small system just because we need the LDA+MLLT transform
  # to train the diag-UBM on top of.  We use --num-iters 13 because after we get
  # the transform (12th iter is the last), any further training is pointless.
  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
    --splice-opts "--left-context=3 --right-context=3" \
    5000 10000 data/train_hires_100k data/lang exp/tri4a exp/nnet2_online/tri5a
fi

if [ $stage -le 3 ]; then
  # To train a diagonal UBM we don't need very much data, so use the smallest
  # subset.  the input directory exp/nnet2_online/tri5a is only needed for
  # the splice-opts and the LDA transform.
  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 400000 \
    data/train_hires_30k 512 exp/nnet2_online/tri5a exp/nnet2_online/diag_ubm
fi

if [ $stage -le 4 ]; then
  # iVector extractors can in general be sensitive to the amount of data, but
  # this one has a fairly small dim (defaults to 100) so we don't use all of it,
  # we use just the 100k subset (about one sixteenth of the data).
  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
    data/train_hires_100k exp/nnet2_online/diag_ubm exp/nnet2_online/extractor || exit 1;
fi

if [ $stage -le 5 ]; then
  ivectordir=exp/nnet2_online/ivectors_train
  if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then # this shows how you can split across multiple file-systems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/fisher_english/s5/$ivectordir/storage "$ivectordir/storage"
  fi
  # having a larger number of speakers is helpful for generalization, and to
  # handle per-utterance decoding well (iVector starts at zero).
  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/train_hires data/train_hires_max2
  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 60 \
    data/train_hires_max2 exp/nnet2_online/extractor "$ivectordir" || exit 1;
fi
#!/bin/bash
# Train a multi-splice ("ms") pnorm nnet2 model with online iVectors on
# Fisher English, then build an online-decoding setup and decode the dev set.
# Feature and iVector preparation (stages 1-5) is delegated to
# local/online/run_nnet2_common.sh so it can be shared with similar scripts.

stage=1
train_stage=-10
use_gpu=true      # declared for the command line; this recipe assumes a GPU
                  # (see the cuda-compiled check below).
common_egs_dir=   # set via --common-egs-dir <dir> to re-use examples dumped by
                  # a previous run.  Must be declared before parse_options.sh,
                  # otherwise that option would be rejected (and $common_egs_dir
                  # below would be undefined).

set -e
. ./cmd.sh   # defines $train_cmd and $decode_cmd.  (The original sourced cmd.sh twice.)
. ./path.sh
. ./utils/parse_options.sh

# assume use_gpu=true since it would be way too slow otherwise.
if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi
parallel_opts="-l gpu=1"   # queue option requesting one GPU per job
num_threads=1
minibatch_size=512
dir=exp/nnet2_online/nnet_ms_a
mkdir -p exp/nnet2_online
splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-1:3"

# Stages 1 through 5 are done in run_nnet2_common.sh,
# so it can be shared with other similar scripts.
local/online/run_nnet2_common.sh --stage $stage

if [ $stage -le 6 ]; then
  if [[ "$USER" == dpovey ]]; then # this shows how you can split across multiple file-systems.
    utils/create_split_dir.pl /export/b0{1,2,3,4}/dpovey/kaldi-online/egs/fisher_english/s5/$dir/egs $dir/egs/storage
  fi
  # Because we have a lot of data here and we don't want the training to take
  # too long, we reduce the number of epochs from the defaults (15 + 5) to (3 +
  # 1). The option "--io-opts '-tc 12'" is to have more than the default number
  # (5) of jobs dumping the egs to disk; this is OK since we're splitting our
  # data across four filesystems for speed.
  # NOTE(review): the comment above looks stale -- --num-epochs below is 5 and
  # no --io-opts is passed; kept verbatim pending confirmation.
  steps/nnet2/train_pnorm_multisplice.sh --stage $train_stage \
    --splice-indexes "$splice_indexes" \
    --feat-type raw \
    --online-ivector-dir exp/nnet2_online/ivectors_train \
    --cmvn-opts "--norm-means=false --norm-vars=false" \
    --num-threads "$num_threads" \
    --minibatch-size "$minibatch_size" \
    --parallel-opts "$parallel_opts" \
    --num-jobs-nnet 6 \
    --num-epochs 5 \
    --num-hidden-layers 4 \
    --mix-up 12000 \
    --initial-learning-rate 0.01 --final-learning-rate 0.001 \
    --cmd "$decode_cmd" \
    --egs-dir "$common_egs_dir" \
    --pnorm-input-dim 3500 \
    --pnorm-output-dim 350 \
    data/train_hires data/lang exp/tri5a $dir || exit 1;
fi

if [ $stage -le 7 ]; then
  # Build the fully-online decoding setup (MFCC + iVector extraction config).
  steps/online/nnet2/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
    data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1;
fi

if [ $stage -le 8 ]; then
  # do the actual online decoding with iVectors, carrying info forward from
  # previous utterances of the same speaker.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
    exp/tri5a/graph data/dev ${dir}_online/decode_dev || exit 1;
fi

if [ $stage -le 9 ]; then
  # this version of the decoding treats each utterance separately
  # without carrying forward speaker information.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
    --per-utt true \
    exp/tri5a/graph data/dev ${dir}_online/decode_dev_utt || exit 1;
fi

if [ $stage -le 10 ]; then
  # this version of the decoding treats each utterance separately
  # without carrying forward speaker information, but looks to the end
  # of the utterance while computing the iVector.
  steps/online/nnet2/decode.sh --config conf/decode.config --cmd "$decode_cmd" --nj 30 \
    --per-utt true --online false \
    exp/tri5a/graph data/dev ${dir}_online/decode_dev_utt_offline || exit 1;
fi
exit 0;
#!/bin/bash
. cmd.sh
. ./cmd.sh
set -e
stage=1
train_stage=-10
use_gpu=true
nnet2_online=nnet2_online_ms
splice_inds="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-3:3"
splice_indexes="layer0/-4:-3:-2:-1:0:1:2:3:4 layer2/-5:-3:3"
common_egs_dir=
. cmd.sh
. ./path.sh
. ./utils/parse_options.sh
......@@ -35,6 +33,7 @@ else
fi
dir=exp/$nnet2_online/nnet_a
mkdir -p exp/$nnet2_online
if [ $stage -le 1 ]; then
mfccdir=mfcc_hires
......@@ -67,6 +66,11 @@ if [ $stage -le 1 ]; then
n=$[`cat data/train/segments | wc -l` - 4000]
utils/subset_data_dir.sh --last data/train_hires $n data/train_hires_nodev ;
# Take the first 30k utterances (about 1/8th of the data) this will be used
# for the diagubm training
utils/subset_data_dir.sh --first data/train_nodev 30000 data/train_hires_30k
local/remove_dup_utts.sh 200 data/train_hires_30k data/train_hires_30k_nodup # 33hr
# create a 100k subset for the lda+mllt training
utils/subset_data_dir.sh --first data/train_hires_nodev 100000 data/train_hires_100k;
local/remove_dup_utts.sh 200 data/train_hires_100k data/train_hires_100k_nodup;
......@@ -76,7 +80,6 @@ if [ $stage -le 1 ]; then
fi
if [ $stage -le 2 ]; then
mkdir -p exp/$nnet2_online
# We need to build a small system just because we need the LDA+MLLT transform
# to train the diag-UBM on top of. We use --num-iters 13 because after we get
# the transform (12th iter is the last), any further training is pointless.
......@@ -126,7 +129,7 @@ if [ $stage -le 6 ]; then
# want to demonstrate the capability of doing real-time decoding, and if the
# network was too bug we wouldn't be able to decode in real-time using a CPU.
steps/nnet2/train_pnorm_multisplice.sh --stage $train_stage \
--splice-indexes "$splice_inds" \
--splice-indexes "$splice_indexes" \
--feat-type raw \
--online-ivector-dir exp/$nnet2_online/ivectors_train_nodup2 \
--cmvn-opts "--norm-means=false --norm-vars=false" \
......@@ -201,11 +204,8 @@ if [ $stage -le 11 ]; then
done
fi
exit 0;
# get results on Dev with this command:
for x in exp/$nnet2_online/nnet_a/decode_train_dev_sw1_*; do grep WER $x/wer_* | utils/best_wer.sh; done
# and results on eval2000 with this command:
......
......@@ -55,6 +55,7 @@ def parse_splice_string(splice_string):
return [contexts, ' nnet_left_context={0};\n nnet_right_context={1}\n first_left_context={2};\n first_right_context={3}\n'.format(abs(max_left_context), abs(max_right_context), abs(first_left_context), abs(first_right_context) )]
def create_config_files(output_dir, params):
pnorm_p = 2
pnorm_input_dim = params.pnorm_input_dim
pnorm_output_dim = params.pnorm_output_dim
contexts, context_variables = parse_splice_string(params.splice_indexes)
......@@ -74,7 +75,7 @@ def create_config_files(output_dir, params):
nnet_config = ["SpliceComponent input-dim={0} context={1} const-component-dim={2}".format(params.total_input_dim, contexts[0], params.ivector_dim),
"FixedAffineComponent matrix={0}".format(params.lda_mat),
"AffineComponentPreconditionedOnline input-dim={0} output-dim={1} {2} learning-rate={3} param-stddev={4} bias-stddev={5}".format(params.lda_dim, pnorm_input_dim, params.online_preconditioning_opts, params.initial_learning_rate, stddev, params.bias_stddev),
"PnormComponent input-dim={0} output-dim={1} p={2}".format(pnorm_input_dim, pnorm_output_dim, params.pnorm_p),
"PnormComponent input-dim={0} output-dim={1} p={2}".format(pnorm_input_dim, pnorm_output_dim, pnorm_p),
"NormalizeComponent dim={0}".format(pnorm_output_dim),
"AffineComponentPreconditionedOnline input-dim={0} output-dim={1} {2} learning-rate={3} param-stddev=0 bias-stddev=0".format(pnorm_output_dim, params.num_targets, params.online_preconditioning_opts, params.initial_learning_rate),
"SoftmaxComponent dim={0}".format(params.num_targets)]
......@@ -95,7 +96,7 @@ def create_config_files(output_dir, params):
# Add the hidden layer, which is a composition of an affine component, pnorm component and normalization component
lines.append("AffineComponentPreconditionedOnline input-dim=%d output-dim=%d %s learning-rate=%f param-stddev=%f bias-stddev=%f"
% ( pnorm_output_dim*context_len, pnorm_input_dim, params.online_preconditioning_opts, params.initial_learning_rate, stddev, params.bias_stddev))
lines.append("PnormComponent input-dim=%d output-dim=%d p=%d" % (pnorm_input_dim, pnorm_output_dim, params.pnorm_p))
lines.append("PnormComponent input-dim=%d output-dim=%d p=%d" % (pnorm_input_dim, pnorm_output_dim, pnorm_p))
lines.append("NormalizeComponent dim={0}".format(pnorm_output_dim))
out_file = open("{0}/hidden_{1}.config".format(output_dir, i), 'w')
out_file.write("\n".join(lines))
......@@ -112,7 +113,6 @@ if __name__ == "__main__":
parser.add_argument('--lda-dim', type=str, help='dimension of the lda output')
parser.add_argument('--pnorm-input-dim', type=int, help='dimension of input to pnorm layer')
parser.add_argument('--pnorm-output-dim', type=int, help='dimension of output of pnorm layer')
parser.add_argument('--pnorm-p', type=int, help='type of norm in the p-norm component')
parser.add_argument('--online-preconditioning-opts', type=str, help='extra options for the AffineComponentPreconditionedOnline component')
parser.add_argument('--initial-learning-rate', type=float, help='')
parser.add_argument('--num-targets', type=int, help='#targets for the neural network ')
......
......@@ -33,7 +33,6 @@ final_learning_rate=0.004
bias_stddev=0.5
pnorm_input_dim=3000
pnorm_output_dim=300
p=2
minibatch_size=128 # by default use a smallish minibatch size for neural net
# training; this controls instability which would otherwise
# be a problem with multi-threaded update.
......@@ -255,7 +254,6 @@ if [ $stage -le -2 ]; then
--lda-dim $lda_dim \
--pnorm-input-dim $pnorm_input_dim \
--pnorm-output-dim $pnorm_output_dim \
--pnorm-p $p \
--online-preconditioning-opts "$online_preconditioning_opts" \
--initial-learning-rate $initial_learning_rate \
--bias-stddev $bias_stddev \
......
......@@ -178,5 +178,5 @@ while [ $x -lt $num_iters ]; do
x=$[$x+1]
done
rm $dir/final.ie 2>/dev/null
ln -s $x.ie $dir/final.ie
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment