Commit 740545b9 authored by Karel Vesely

trunk,nnet1 : adding BLSTM code from Ni Chongjia (I2R), Thanks!

- it works, but it needs further analysis; so far it has not outperformed the LSTM.



git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5183 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 16b211cb
@@ -251,5 +251,7 @@ for x in exp/nnet2_online_wsj/nnet_ms_a_smbr_0.00005/1/decode_*; do grep WER $x/
# LSTM result
for x in exp/lstm4f/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
%WER 2.04 [ 256 / 12533, 18 ins, 60 del, 178 sub ] exp/lstm4f_c512_r200_c512_r200_lr0.0001_mmt0.9_clip50/decode/wer_4_0.5
# BLSTM result
%WER 2.09 [ 262 / 12533, 25 ins, 69 del, 168 sub ] exp/blstm4g/decode/wer_4_0.0
#!/bin/bash
# Copyright 2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains a BLSTM network on FBANK features.
# The BLSTM code comes from Ni Chongjia (I2R), thanks!
# TODO: this BLSTM code still needs to solve the problem of how to determine
# the history for the 'backward' recurrence. Currently, the state from the
# 1st frame of the previous mini-batch (20 frames) is taken.
#
# A more sensible approach would be single-stream training with
# per-utterance updates, but the results were worse.
#
. ./cmd.sh
. ./path.sh
dev=data-fbank/test
train=data-fbank/train
dev_original=data/test
train_original=data/train
gmm=exp/tri3b
stage=0
. utils/parse_options.sh || exit 1;
# Make the FBANK features
[ ! -e $dev ] && if [ $stage -le 0 ]; then
# Dev set
utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$dev $dev/log $dev/data || exit 1;
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
# Training set
utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
$train $train/log $train/data || exit 1;
steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
# Split the training set
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
fi
if [ $stage -le 1 ]; then
# Train the BLSTM optimizing per-frame cross-entropy.
dir=exp/blstm4g
ali=${gmm}_ali
# Train
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --network-type blstm --learn-rate 0.0001 \
--cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \
--train-opts "--momentum 0.9 --halving-factor 0.5" \
--train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=0" \
--proto-opts "--num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 50.0" \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph_ug $dev $dir/decode_ug || exit 1;
fi
# TODO : sequence training,
echo Success
exit 0
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
@@ -47,13 +47,13 @@ if [ $stage -le 1 ]; then
--cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \
--train-opts "--momentum 0.9 --halving-factor 0.5" \
--train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \
--proto-opts "--num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 50.0" \
--proto-opts "--num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 5.0" \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph)
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
-  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
+  steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph_ug $dev $dir/decode_ug || exit 1;
fi
@@ -380,6 +380,10 @@ if [[ -z "$nnet_init" && -z "$nnet_proto" ]]; then
utils/nnet/make_lstm_proto.py $proto_opts \
$num_fea $num_tgt >$nnet_proto || exit 1
;;
blstm)
utils/nnet/make_blstm_proto.py $proto_opts \
$num_fea $num_tgt >$nnet_proto || exit 1
;;
*) echo "Unknown : --network_type $network_type" && exit 1;
esac
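For orientation, this branch only forwards the --proto-opts string from the recipe, so with the BLSTM run script above the generator is effectively invoked as sketched below (the 40-dim input and 3456 output leaves are hypothetical placeholders; train.sh derives the real $num_fea and $num_tgt from the features and the alignment):
# hedged sketch of the effective call, values are illustrative
utils/nnet/make_blstm_proto.py --num-cells 512 --num-recurrent 200 --num-layers 2 --clip-gradient 50.0 \
  40 3456 >blstm.proto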
#!/usr/bin/env python
# Copyright 2015 Brno University of Technology (author: Karel Vesely)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Generated Nnet prototype, to be initialized by 'nnet-initialize'.
import sys
###
### Parse options
###
from optparse import OptionParser
usage="%prog [options] <feat-dim> <num-leaves> >nnet-proto-file"
parser = OptionParser(usage)
#
parser.add_option('--num-cells', dest='num_cells', type='int', default=800,
help='Number of LSTM cells [default: %default]');
parser.add_option('--num-recurrent', dest='num_recurrent', type='int', default=512,
help='Number of LSTM recurrent units [default: %default]');
parser.add_option('--num-layers', dest='num_layers', type='int', default=2,
help='Number of LSTM layers [default: %default]');
parser.add_option('--lstm-stddev-factor', dest='lstm_stddev_factor', type='float', default=0.01,
help='Standard deviation of initialization [default: %default]');
parser.add_option('--param-stddev-factor', dest='param_stddev_factor', type='float', default=0.04,
help='Standard deviation in output layer [default: %default]');
parser.add_option('--clip-gradient', dest='clip_gradient', type='float', default=5.0,
help='Clipping constant applied to gradients [default: %default]');
#
(o,args) = parser.parse_args()
if len(args) != 2 :
  parser.print_help()
  sys.exit(1)
(feat_dim, num_leaves) = map(int,args);
# Original prototype from Jiayu,
#<NnetProto>
#<Transmit> <InputDim> 40 <OutputDim> 40
#<LstmProjectedStreams> <InputDim> 40 <OutputDim> 512 <CellDim> 800 <ParamScale> 0.01 <NumStream> 4
#<AffineTransform> <InputDim> 512 <OutputDim> 8000 <BiasMean> 0.000000 <BiasRange> 0.000000 <ParamStddev> 0.04
#<Softmax> <InputDim> 8000 <OutputDim> 8000
#</NnetProto>
print "<NnetProto>"
# normally we won't use more than 2 layers of LSTM
if o.num_layers == 1:
  print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
    (feat_dim, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
elif o.num_layers == 2:
  print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
    (feat_dim, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
  print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
    (o.num_recurrent, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
else:
  sys.stderr.write("make_blstm_proto.py ERROR: more than 2 layers of BLSTM, not supported yet.\n")
  sys.exit(1)
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0 <ParamStddev> %f" % \
(o.num_recurrent, num_leaves, o.param_stddev_factor)
print "<Softmax> <InputDim> %d <OutputDim> %d" % \
(num_leaves, num_leaves)
print "</NnetProto>"
[diff collapsed, not shown]
@@ -37,6 +37,7 @@
#include "nnet/nnet-max-pooling-2d-component.h"
#include "nnet/nnet-lstm-projected-streams.h"
#include "nnet/nnet-blstm-projected-streams.h"
#include "nnet/nnet-sentence-averaging-component.h"
#include "nnet/nnet-frame-pooling-component.h"
@@ -53,6 +54,7 @@ const struct Component::key_value Component::kMarkerMap[] = {
{ Component::kConvolutionalComponent,"<ConvolutionalComponent>"},
{ Component::kConvolutional2DComponent,"<Convolutional2DComponent>"},
{ Component::kLstmProjectedStreams,"<LstmProjectedStreams>"},
{ Component::kBLstmProjectedStreams,"<BLstmProjectedStreams>"},
{ Component::kSoftmax,"<Softmax>" },
{ Component::kBlockSoftmax,"<BlockSoftmax>" },
{ Component::kSigmoid,"<Sigmoid>" },
@@ -117,6 +119,9 @@ Component* Component::NewComponentOfType(ComponentType comp_type,
case Component::kLstmProjectedStreams :
ans = new LstmProjectedStreams(input_dim, output_dim);
break;
case Component::kBLstmProjectedStreams :
ans = new BLstmProjectedStreams(input_dim, output_dim);
break;
case Component::kSoftmax :
ans = new Softmax(input_dim, output_dim);
break;
@@ -54,6 +54,7 @@ class Component {
kConvolutionalComponent,
kConvolutional2DComponent,
kLstmProjectedStreams,
kBLstmProjectedStreams,
kActivationFunction = 0x0200,
kSoftmax,
@@ -24,7 +24,7 @@
#include "nnet/nnet-affine-transform.h"
#include "nnet/nnet-various.h"
#include "nnet/nnet-lstm-projected-streams.h"
#include "nnet/nnet-blstm-projected-streams.h"
namespace kaldi {
namespace nnet1 {
@@ -357,6 +357,10 @@ void Nnet::ResetLstmStreams(const std::vector<int32> &stream_reset_flag) {
LstmProjectedStreams& comp = dynamic_cast<LstmProjectedStreams&>(GetComponent(c));
comp.ResetLstmStreams(stream_reset_flag);
}
if (GetComponent(c).GetType() == Component::kBLstmProjectedStreams) {
BLstmProjectedStreams& comp = dynamic_cast<BLstmProjectedStreams&>(GetComponent(c));
comp.ResetLstmStreams(stream_reset_flag);
}
}
}
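As a usage note, here is a hedged C++ sketch of how a multi-stream trainer can drive this hook when one of its parallel streams switches to a new utterance; the function and variable names are illustrative, not copied from nnet-train-lstm-streams:
// Hedged sketch: reset the recurrent history of every (B)LSTM component
// for the single stream that just started a new utterance.
#include <vector>
#include "nnet/nnet-nnet.h"  // kaldi::nnet1::Nnet

void OnNewUtterance(kaldi::nnet1::Nnet &nnet, kaldi::int32 stream, kaldi::int32 num_streams) {
  std::vector<kaldi::int32> stream_reset_flag(num_streams, 0);  // 0 = keep history
  stream_reset_flag[stream] = 1;                                // 1 = reset this stream
  nnet.ResetLstmStreams(stream_reset_flag);  // dispatched to Lstm/BLstmProjectedStreams
}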