Commit abad6ae2 authored by Dan Povey's avatar Dan Povey
Browse files

sandbox/dan: various minor fixes/additions to new nnet scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/sandbox/dan2@2780 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent d8f88fe8
......@@ -3,11 +3,10 @@
. cmd.sh
# This examples runs on top of "raw-fMLLR" features.
# This example runs on top of "raw-fMLLR" features.
( steps/nnet2/train_tanh.sh --splice-width 7 \
--egs-dir exp/tri4a1xa32_nnet/egs \
--softmax-learning-rate-factor 0.5 --cleanup false \
--cleanup false \
--initial-learning-rate 0.08 --final-learning-rate 0.008 \
--num-hidden-layers 2 \
--num-epochs-extra 10 --add-layers-period 1 \
......
......@@ -9,7 +9,7 @@
set -e
if true; then
if false; then
featdir=`pwd`/mfcc/nnet4b; mkdir -p $featdir
mkdir -p conf/nnet4b
all_fbankdirs=""
......@@ -43,10 +43,18 @@ if true; then
steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" \
data/nnet4b/train_mfcc_all data/lang exp/tri3b exp/tri3b_ali_nnet4b
# In the combined filterbank directory, create a file utt2uniq which maps
# our extended utterance-ids to "unique utterances". This enables the
# script steps/nnet2/get_egs.sh to hold out data in a more proper way.
#
# Each output line has the form "<extended-utt-id> <stripped-utt-id>":
#  - awk keeps only the first field (the utterance-id) of every utt2spk line;
#  - perl drops the last character of the line (the newline written by awk),
#    saves the id in $utt, deletes the first match of
#    "<run of digits/dots/dashes>-<run of digits/dots/dashes>-" from the
#    working copy, and prints the saved id followed by the stripped one.
# NOTE(review): the substitution is not anchored to the start of the id;
# presumably the part it removes is the prefix added when the extended ids
# were generated -- confirm against the code that creates those ids.
cat data/nnet4b/train_fbank_all/utt2spk | awk '{print $1;}' | \
perl -ane ' chop; $utt = $_; s/[-0-9\.]+-[-0-9\.]+-//; print "$utt $_\n"; ' \
> data/nnet4b/train_fbank_all/utt2uniq
fi
( steps/nnet2/train_block.sh \
( steps/nnet2/train_block.sh --stage -3 \
--bias-stddev 0.5 --splice-width 7 --egs-opts "--feat-type raw" \
--softmax-learning-rate-factor 0.5 --cleanup false \
--initial-learning-rate 0.04 --final-learning-rate 0.004 \
......
......@@ -4,3 +4,10 @@
# This example runs on top of "raw-fMLLR" features:
local/nnet2/run_4a.sh
# This one is on top of filter-bank features, with only CMN.
local/nnet2/run_4b.sh
# This one is on top of 40-dim + fMLLR features
local/nnet2/run_4c.sh
......@@ -44,23 +44,28 @@ steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
--transform-dir exp/tri3c/decode exp/sgmm2_4c/graph data/test exp/sgmm2_4c/decode_fmllr || exit 1;
(# get scaled-by-30 versions of the vecs to be used for nnet training.
mkdir -p exp/sgmm2_4c_x30
cat exp/sgmm2_4c/vecs.* | copy-vector ark:- ark,t:- | \
awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/vecs.1
mkdir -p exp/sgmm2_4c_x30/decode
cat exp/sgmm2_4c/decode/vecs.* | copy-vector ark:- ark,t:- | \
awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode/vecs.1
mkdir -p exp/sgmm2_4c_x30/decode_ug
cat exp/sgmm2_4c/decode_ug/vecs.* | copy-vector ark:- ark,t:- | \
awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode_ug/vecs.1
)
exit 0;
##
steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode || exit 1;
steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode_fmllr || exit 1;
# (# get scaled-by-30 versions of the vecs to be used for nnet training.
# . path.sh
# mkdir -p exp/sgmm2_4c_x30
# cat exp/sgmm2_4c/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/vecs.1
# mkdir -p exp/sgmm2_4c_x30/decode
# cat exp/sgmm2_4c/decode/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode/vecs.1
# mkdir -p exp/sgmm2_4c_x30/decode_ug
# cat exp/sgmm2_4c/decode_ug/vecs.* | copy-vector ark:- ark,t:- | \
# awk -v scale=30.0 '{printf("%s [ ", $1); for (n=3;n<NF;n++) { printf("%f ", scale*$n); } print "]"; }' > exp/sgmm2_4c_x30/decode_ug/vecs.1
# )
# exit 0;
# ##
# steps/decode_sgmm2.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
# exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode || exit 1;
# steps/decode_sgmm2.sh --use-fmllr true --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
# exp/sgmm2_4c.no_transform/graph data/test exp/sgmm2_4c.no_transform/decode_fmllr || exit 1;
......@@ -2,6 +2,7 @@
. cmd.sh
# call the next line with the directory where the RM data is
# (the argument below is just an example). This should contain
# subdirectories named as follows:
......@@ -36,6 +37,7 @@ steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test $featdir
utils/subset_data_dir.sh data/train 1000 data/train.1k || exit 1;
steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono || exit 1;
#show-transitions data/lang/phones.txt exp/tri2a/final.mdl exp/tri2a/final.occs | perl -e 'while(<>) { if (m/ sil /) { $l = <>; $l =~ m/pdf = (\d+)/|| die "bad line $l"; $tot += $1; }} print "Total silence count $tot\n";'
......@@ -44,6 +46,9 @@ steps/train_mono.sh --nj 4 --cmd "$train_cmd" data/train.1k data/lang exp/mono
utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph
steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/mono/graph data/test exp/mono/decode
......@@ -81,6 +86,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \
--splice-opts "--left-context=3 --right-context=3" \
1800 9000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
utils/mkgraph.sh data/lang exp/tri2b exp/tri2b/graph
steps/decode.sh --config conf/decode.config --nj 20 --cmd "$decode_cmd" \
exp/tri2b/graph data/test exp/tri2b/decode
......@@ -186,5 +192,6 @@ done
local/run_sgmm2.sh
#local/run_sgmm2x.sh
# you can do:
# The following script depends on local/run_raw_fmllr.sh having been run.
#
# local/run_nnet_cpu.sh
......@@ -84,6 +84,18 @@ cp $alidir/tree $dir
# Get list of validation utterances.
awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \
> $dir/valid_uttlist || exit 1;
# If the data directory provides utt2uniq, extend valid_uttlist so that it
# contains every utterance sharing a "unique" id with an utterance that was
# already selected for validation (see the messages echoed below).
# Reads:   $data/utt2uniq, $dir/valid_uttlist
# Writes:  $dir/valid_uttlist (replaced, sorted, one utterance-id per line)
# Temporary files $dir/uniq2utt and $dir/valid_uttlist.tmp are removed at the
# end.  Each step exits with status 1 on failure, so that a broken or
# truncated valid_uttlist is not silently used by the following stages.
if [ -f "$data/utt2uniq" ]; then
  echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
  echo "include all perturbed versions of the same 'real' utterances."
  mv "$dir/valid_uttlist" "$dir/valid_uttlist.tmp" || exit 1;
  # Build the reverse map (written to uniq2utt) from utt2uniq.
  utils/utt2spk_to_spk2utt.pl "$data/utt2uniq" > "$dir/uniq2utt" || exit 1;
  # utterance-ids -> unique ids (deduplicated) -> the entries of uniq2utt for
  # those ids, split into one field per line and sorted.
  utils/apply_map.pl "$data/utt2uniq" < "$dir/valid_uttlist.tmp" | \
    sort -u | utils/apply_map.pl "$dir/uniq2utt" | \
    awk '{for(n=1;n<=NF;n++) print $n;}' | sort > "$dir/valid_uttlist" || exit 1;
  rm "$dir/uniq2utt" "$dir/valid_uttlist.tmp"
fi
awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \
head -$num_utts_subset > $dir/train_subset_uttlist || exit 1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment