Commit 9bd10e67 authored by Karel Vesely

trunk,nnet,scripts :

- steps/nnet/train_scheduler.sh : add option --keep-lr-iters to delay learning-rate halving when requested
- utils/nnet/make_nnet_proto.py : add max-norm support
- timit/s5/local/score*.sh : faster scoring
- tedlium/s5/local/run_dnn.sh : bugfix, decode only once (the final iteration) after the sMBR step



git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4323 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 44e344ce
......@@ -131,15 +131,15 @@ if [ $stage -le 6 ]; then
steps/nnet/train_mpe.sh --cmd "$cuda_cmd" --num-iters 4 --acwt $acwt --do-smbr true \
$data_fmllr/train data/lang $srcdir ${srcdir}_ali ${srcdir}_denlats $dir || exit 1
# Decode (reuse HCLG graph)
for ITER in 1 2 3 4; do
for ITER in 4; do
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/dev $dir/decode_dev || exit 1;
$gmmdir/graph $data_fmllr/dev $dir/decode_dev_it$ITER || exit 1;
steps/nnet/decode.sh --nj $decode_nj --cmd "$decode_cmd" --config conf/decode_dnn.config \
--num-threads 3 --parallel-opts "-pe smp 4" \
--nnet $dir/${ITER}.nnet --acwt $acwt \
$gmmdir/graph $data_fmllr/test $dir/decode_test || exit 1;
$gmmdir/graph $data_fmllr/test $dir/decode_test_it$ITER || exit 1;
done
fi
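The loop change above is the bugfix mentioned in the commit message: previously every sMBR iteration was decoded into the same decode_dev / decode_test directory, so the results overwrote each other; now only the final nnet is decoded and each decode directory carries an _it$ITER suffix. A minimal sketch of the resulting naming, with a made-up experiment directory:

  # Illustration only -- shows the directory names the changed loop produces.
  dir=exp/dnn4_pretrain-dbn_dnn_smbr        # hypothetical sMBR experiment dir
  for ITER in 4; do
    echo "decode $dir/${ITER}.nnet -> $dir/decode_dev_it$ITER and $dir/decode_test_it$ITER"
  done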
......
......@@ -27,6 +27,7 @@ lang_or_graph=$2
dir=$3
phonemap="conf/phones.60-48-39.map"
nj=$(cat $dir/num_jobs)
symtab=$lang_or_graph/words.txt
......@@ -40,16 +41,20 @@ mkdir -p $dir/scoring/log
cat $data/text | local/timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 > $dir/scoring/test_filt.txt
# Get the phone-sequence on the best-path:
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
for LMWT in $(seq $min_lmwt $max_lmwt); do
$cmd JOB=1:$nj $dir/scoring/log/best_path_basic.$LMWT.JOB.log \
lattice-best-path --lm-scale=$LMWT --word-symbol-table=$symtab --verbose=2 \
"ark:gunzip -c $dir/lat.JOB.gz|" ark,t:$dir/scoring/$LMWT.JOB.tra || exit 1;
cat $dir/scoring/$LMWT.*.tra | sort > $dir/scoring/$LMWT.tra
rm $dir/scoring/$LMWT.*.tra
done
# Map hypothesis to 39 phone classes:
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score_basic.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
utils/int2sym.pl -f 2- $symtab \| \
local/timit_norm_trans.pl -i - -m $phonemap -from 48 -to 39 \| \
compute-wer --text --mode=all \
ark:$dir/scoring/test_filt.txt ark,p:- $dir/scoring/stats_LMWT ">&" $dir/wer_LMWT || exit 1;
ark:$dir/scoring/test_filt.txt ark,p:- $dir/scoring/wer_stats_LMWT ">&" $dir/wer_LMWT || exit 1;
exit 0;
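The best-path step above is where the faster scoring comes from: instead of a single lattice-best-path job per LMWT reading every lattice archive, the work is now split across the decoder's nj lattice archives, and the per-job transcripts are concatenated and sorted afterwards. For readers unfamiliar with the $cmd JOB=1:$nj idiom, here is a toy illustration assuming the standard utils/run.pl wrapper (the log path and command are made up):

  # run.pl expands JOB over 1..4, runs the command once per value, and writes one log per job.
  utils/run.pl JOB=1:4 /tmp/demo.JOB.log echo "hello from job JOB"
  grep -h "hello from job" /tmp/demo.*.log   # collect the per-job outputs, as the script does with the .tra files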
......@@ -8,6 +8,7 @@ cmd=run.pl
stage=0
min_lmwt=1
max_lmwt=10
mbr_scale=1.0
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
......@@ -54,7 +55,7 @@ if [ $stage -le 0 ]; then
for LMWT in $(seq $min_lmwt $max_lmwt); do
$cmd JOB=1:$nj $dir/scoring/log/best_path.$LMWT.JOB.log \
lattice-align-phones $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
lattice-to-ctm-conf --inv-acoustic-scale=$LMWT ark:- $dir/scoring/$LMWT.JOB.ctm || exit 1;
lattice-to-ctm-conf --acoustic-scale=$(bc <<<"scale=8; 1/$LMWT*$mbr_scale") --lm-scale=$mbr_scale ark:- $dir/scoring/$LMWT.JOB.ctm || exit 1;
cat $dir/scoring/$LMWT.*.ctm | sort > $dir/scoring/$LMWT.ctm
rm $dir/scoring/$LMWT.*.ctm
done
......@@ -72,7 +73,7 @@ fi
# Score the set...
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cp $dir/scoring/stm_39phn $dir/score_LMWT '&&' cp $dir/scoring/LMWT.ctm_39phn $dir/score_LMWT/ctm_39phn '&&' \
cp $dir/scoring/stm_39phn $dir/score_LMWT/stm_39phn '&&' cp $dir/scoring/LMWT.ctm_39phn $dir/score_LMWT/ctm_39phn '&&' \
$hubscr -p $hubdir -V -l english -h hub5 -g $dir/scoring/glm_39phn -r $dir/score_LMWT/stm_39phn $dir/score_LMWT/ctm_39phn || exit 1;
exit 0;
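Replacing --inv-acoustic-scale=$LMWT with an explicit --acoustic-scale and --lm-scale lets the new mbr_scale knob scale both scores jointly, the intent being to control how peaked the lattice posteriors are for the MBR decode and confidences, while keeping the LM-to-acoustic ratio at LMWT; with the default mbr_scale=1.0 the behaviour is identical to before. The inline bc call just computes mbr_scale/LMWT, e.g.:

  # What the inline expression evaluates to for a typical LMWT (values illustrative):
  LMWT=10; mbr_scale=1.0
  bc <<<"scale=8; 1/$LMWT*$mbr_scale"    # prints .10000000, i.e. mbr_scale / LMWT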
......@@ -20,6 +20,7 @@ feature_transform=
# learn rate scheduling
max_iters=20
min_iters=
keep_lr_iters=0
#start_halving_inc=0.5
#end_halving_inc=0.1
start_halving_impr=0.01
......@@ -123,9 +124,10 @@ for iter in $(seq -w $max_iters); do
# accept or reject new parameters (based on objective function)
loss_prev=$loss
if [ "1" == "$(awk "BEGIN{print($loss_new<$loss);}")" ]; then
if [ 1 == $(bc <<< "$loss_new < $loss") -o $iter -le $keep_lr_iters ]; then
loss=$loss_new
mlp_best=$dir/nnet/${mlp_base}_iter${iter}_learnrate${learn_rate}_tr$(printf "%.4f" $tr_loss)_cv$(printf "%.4f" $loss_new)
[ $iter -le $keep_lr_iters ] && mlp_best=${mlp_best}_keep-lr-iters-$keep_lr_iters
mv $mlp_next $mlp_best
echo "nnet accepted ($(basename $mlp_best))"
echo $mlp_best > $dir/.mlp_best
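The acceptance test switches from awk to bc and additionally accepts the new parameters unconditionally while the iteration is still within keep_lr_iters. bc prints 1 when a comparison holds and 0 otherwise, which is what makes the [ ... ] test work; a self-contained example with made-up loss values:

  # bc as a floating-point comparator: 1 if true, 0 if false.
  loss=1.3042; loss_new=1.2971; iter=2; keep_lr_iters=3
  if [ 1 == $(bc <<< "$loss_new < $loss") -o $iter -le $keep_lr_iters ]; then
    echo "nnet accepted"
  fi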
......@@ -137,27 +139,31 @@ for iter in $(seq -w $max_iters); do
# create .done file as a mark that iteration is over
touch $dir/.done_iter$iter
# no learn-rate halving yet, while still within the first keep_lr_iters iterations
[ $iter -le $keep_lr_iters ] && continue
# stopping criterion
if [[ "1" == "$halving" && "1" == "$(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev < $end_halving_impr)}")" ]]; then
rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev")
if [ 1 == $halving -a 1 == $(bc <<< "$rel_impr < $end_halving_impr") ]; then
if [[ "$min_iters" != "" ]]; then
if [ $min_iters -gt $iter ]; then
echo we were supposed to finish, but we continue, min_iters : $min_iters
echo we were supposed to finish, but we continue as min_iters : $min_iters
continue
fi
fi
echo finished, too small rel. improvement $(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev)}")
echo finished, too small rel. improvement $rel_impr
break
fi
# start annealing when improvement is low
if [ "1" == "$(awk "BEGIN{print(($loss_prev-$loss)/$loss_prev < $start_halving_impr)}")" ]; then
if [ 1 == $(bc <<< "$rel_impr < $start_halving_impr") ]; then
halving=1
echo $halving >$dir/.halving
fi
# do annealing
if [ "1" == "$halving" ]; then
if [ 1 == $halving ]; then
learn_rate=$(awk "BEGIN{print($learn_rate*$halving_factor)}")
echo $learn_rate >$dir/.learn_rate
fi
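With the relative improvement now computed once into rel_impr, the stopping test (against end_halving_impr) and the halving trigger (against start_halving_impr, default 0.01 per the option block above) use the same bc idiom. A worked example with made-up losses:

  # Relative CV improvement and the halving trigger (loss values invented):
  loss_prev=1.3042; loss=1.2971; start_halving_impr=0.01
  rel_impr=$(bc <<< "scale=10; ($loss_prev-$loss)/$loss_prev")
  echo "rel_impr=$rel_impr"                                    # about 0.0054, i.e. below 0.01
  [ 1 == $(bc <<< "$rel_impr < $start_halving_impr") ] && echo "start halving the learn-rate"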
......
......@@ -48,7 +48,9 @@ parser.add_option('--no-glorot-scaled-stddev', dest='with_glorot', help='Generat
parser.add_option('--no-smaller-input-weights', dest='smaller_input_weights',
help='Disable 1/12 reduction of stddev in input layer [default: %default]',
action='store_false', default=True);
parser.add_option('--max-norm', dest='max_norm',
help='Max radius (L2 norm) of the per-neuron weight vectors (longer weights get shrunk; not applied to the last layer, 0.0 = disable) [default: %default]',
default=0.0, type='float');
(o,args) = parser.parse_args()
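The new flag simply threads a <MaxNorm> value into the generated AffineTransform lines (see the hunks below); a non-zero value asks nnet1 training to shrink any neuron's weight vector whose L2 norm exceeds it, and the default 0.0 leaves the old behaviour untouched. A hypothetical invocation, with made-up dimensions and output path, assuming the script's usual <feat-dim> <num-leaves> <num-hid-layers> <num-hid-neurons> positional order:

  # Generate a 6-hidden-layer prototype with max-norm 15.0 on the hidden layers (numbers illustrative):
  utils/nnet/make_nnet_proto.py --max-norm 15.0 440 1909 6 2048 > nnet.proto
  grep -c "<MaxNorm>" nnet.proto    # one per hidden-layer AffineTransform, none on the output layer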
......@@ -90,9 +92,9 @@ if num_hid_layers == 0 and o.bottleneck_dim != 0:
(feat_dim, o.bottleneck_dim, \
(o.param_stddev_factor * Glorot(feat_dim, o.bottleneck_dim) * 0.75 ), 0.1)
# 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f" % \
(o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
(o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1)
(o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm)
print "%s <InputDim> %d <OutputDim> %d" % (o.activation_type, num_hid_neurons, num_hid_neurons) # Non-linearity
# Last AffineTransform (10x smaller learning rate on bias)
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
......@@ -123,19 +125,19 @@ assert(num_hid_layers > 0)
print "<NnetProto>"
# First AffineTransform
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f" % \
(feat_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
(o.param_stddev_factor * Glorot(feat_dim, num_hid_neurons) * \
(math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0)))
(math.sqrt(1.0/12.0) if o.smaller_input_weights else 1.0)), o.max_norm)
# stddev(U[0,1]) = sqrt(1/12); reduce the stddev of the weights,
# as the dynamic range of the input data is larger than that of a Sigmoid.
print "%s <InputDim> %d <OutputDim> %d" % (o.activation_type, num_hid_neurons, num_hid_neurons)
# Internal AffineTransforms
for i in range(num_hid_layers-1):
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f" % \
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <MaxNorm> %f" % \
(num_hid_neurons, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
(o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)))
(o.param_stddev_factor * Glorot(num_hid_neurons, num_hid_neurons)), o.max_norm)
print "%s <InputDim> %d <OutputDim> %d" % (o.activation_type, num_hid_neurons, num_hid_neurons)
# Optionally add bottleneck
......@@ -146,9 +148,9 @@ if o.bottleneck_dim != 0:
(num_hid_neurons, o.bottleneck_dim, \
(o.param_stddev_factor * Glorot(num_hid_neurons, o.bottleneck_dim) * 0.75 ), 0.1)
# 25% smaller stddev -> smaller gradient in prev. layer, 10x smaller learning rate for weights & biases
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f" % \
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> %f <BiasRange> %f <ParamStddev> %f <LearnRateCoef> %f <BiasLearnRateCoef> %f <MaxNorm> %f" % \
(o.bottleneck_dim, num_hid_neurons, o.hid_bias_mean, o.hid_bias_range, \
(o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1)
(o.param_stddev_factor * Glorot(o.bottleneck_dim, num_hid_neurons) * 0.75 ), 0.1, 0.1, o.max_norm)
print "%s <InputDim> %d <OutputDim> %d" % (o.activation_type, num_hid_neurons, num_hid_neurons)
# Last AffineTransform (10x smaller learning rate on bias)
......
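For reference, with --max-norm 15.0 an internal hidden-layer line emitted by the modified script has the shape below (every number here is made up purely for illustration):

  # Shape of an emitted hidden-layer component line; values are illustrative.
  echo "<AffineTransform> <InputDim> 2048 <OutputDim> 2048 <BiasMean> -2.0 <BiasRange> 4.0 <ParamStddev> 0.037 <MaxNorm> 15.0"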