Commit 36ae27bb authored by Karel Vesely's avatar Karel Vesely
Browse files

trunk,nnet1 : adding lstm example (needs to be tuned)



git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4862 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 61542bc7
......@@ -42,7 +42,7 @@ if [ $stage -le 1 ]; then
steps/nnet/train.sh \
--cmvn-opts "--norm-means=true --norm-vars=true" \
--delta-opts "--delta-order=2" --splice 5 \
--prepend-cnn-type cnn1d --cnn-proto-opts "--patch-dim1 8 --pitch-dim 3" \
--network-type cnn1d --cnn-proto-opts "--patch-dim1 8 --pitch-dim 3" \
--hid-layers 2 --learn-rate 0.008 \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode
......
#!/bin/bash
# Copyright 2015 Brno University of Technology (Author: Karel Vesely)
# Apache 2.0
# This example script trains a LSTM network on FBANK features.
# The LSTM code comes from Jiayu DU, and Wei Li, thanks!
# Source the cluster-command wrappers ($train_cmd, $decode_cmd, $cuda_cmd) and PATH setup.
. ./cmd.sh
. ./path.sh
# FBANK data dirs (created below) and the original MFCC data dirs they are copied from.
dev=data-fbank/test
train=data-fbank/train
dev_original=data/test
train_original=data/train
gmm=exp/tri3b # GMM system providing alignments and the decoding graphs
stage=0 # resume point; stages below this number are skipped
. utils/parse_options.sh || exit 1;
# Make the FBANK features
# NOTE(review): despite the header comment, these are FBANK+pitch features
# (make_fbank_pitch.sh). The '[ ! -e $dev ] &&' guard skips this whole stage
# once the dev FBANK dir already exists.
[ ! -e $dev ] && if [ $stage -le 0 ]; then
# Dev set
utils/copy_data_dir.sh $dev_original $dev || exit 1; rm $dev/{cmvn,feats}.scp
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd" \
$dev $dev/log $dev/data || exit 1;
steps/compute_cmvn_stats.sh $dev $dev/log $dev/data || exit 1;
# Training set
utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
# '-tc 10' presumably caps concurrent queue tasks at 10 — TODO confirm for the local queue setup,
steps/make_fbank_pitch.sh --nj 10 --cmd "$train_cmd -tc 10" \
$train $train/log $train/data || exit 1;
steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
# Split the training set (90% train / 10% cross-validation, held out by speaker)
utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 $train ${train}_tr90 ${train}_cv10
fi
if [ $stage -le 1 ]; then
# Train the DNN optimizing per-frame cross-entropy.
dir=exp/lstm4f
ali=${gmm}_ali
# Train: LSTM network, multi-stream training (4 parallel streams, targets delayed by 5 frames),
# per-speaker CMVN on plain (un-spliced) features,
$cuda_cmd $dir/log/train_nnet.log \
steps/nnet/train.sh --network-type lstm --learn-rate 0.00001 \
--cmvn-opts "--norm-means=true --norm-vars=true" --feat-type plain --splice 0 \
--train-opts "--momentum 0.9 --halving-factor 0.8" \
--train-tool "nnet-train-lstm-streams --num-stream=4 --targets-delay=5" \
${train}_tr90 ${train}_cv10 data/lang $ali $ali $dir || exit 1;
# Decode (reuse HCLG graph from the GMM system)
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph $dev $dir/decode || exit 1;
steps/nnet/decode.sh --nj 20 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt 0.1 \
$gmm/graph_ug $dev $dir/decode_ug || exit 1;
fi
# TODO : sequence training,
echo Success
exit 0
# Getting results [see RESULTS file]
# for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done
......@@ -11,7 +11,7 @@ mlp_init= # select initialized MLP (override initialization)
mlp_proto= # select network prototype (initialize it)
proto_opts= # non-default options for 'make_nnet_proto.py'
feature_transform= # provide feature transform (=splice,rescaling,...) (don't build new one)
prepend_cnn_type=none # (none,cnn1d,cnn2d) create nnet with convolutional layers
network_type=dnn # (dnn,cnn1d,cnn2d,lstm) select type of neural network
cnn_proto_opts= # extra options for 'make_cnn_proto.py'
#
hid_layers=4 # nr. of hidden layers (prior to sotfmax or bottleneck)
......@@ -337,8 +337,8 @@ if [[ -z "$mlp_init" && -z "$mlp_proto" ]]; then
# make network prototype
mlp_proto=$dir/nnet.proto
echo "Genrating network prototype $mlp_proto"
case "$prepend_cnn_type" in
none)
case "$network_type" in
dnn)
utils/nnet/make_nnet_proto.py $proto_opts \
${bn_dim:+ --bottleneck-dim=$bn_dim} \
$num_fea $num_tgt $hid_layers $hid_dim >$mlp_proto || exit 1
......@@ -358,7 +358,11 @@ if [[ -z "$mlp_init" && -z "$mlp_proto" ]]; then
cnn2d)
#TODO, to be filled by Vijay...
;;
*) echo "Unknown 'prepend-cnn' value $prepend_cnn" && exit 1;
lstm)
utils/nnet/make_lstm_proto.py $proto_opts \
$num_fea $num_tgt >$mlp_proto || exit 1
;;
*) echo "Unknown : --network_type $network_type" && exit 1;
esac
# initialize
......@@ -366,7 +370,7 @@ if [[ -z "$mlp_init" && -z "$mlp_proto" ]]; then
echo "Initializing $mlp_proto -> $mlp_init"
nnet-initialize $mlp_proto $mlp_init 2>$log || { cat $log; exit 1; }
#optionally prepend dbn to the initialization
# optionally prepend dbn to the initialization
if [ ! -z $dbn ]; then
mlp_init_old=$mlp_init; mlp_init=$dir/nnet_$(basename $dbn)_dnn.init
nnet-concat $dbn $mlp_init_old $mlp_init || exit 1
......
#!/bin/env python
# Copyright 2015 Brno University of Technology (author: Karel Vesely)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Generated Nnet prototype, to be initialized by 'nnet-initialize'.
import sys
###
### Parse options
###
from optparse import OptionParser
usage="%prog [options] <feat-dim> <num-leaves> >nnet-proto-file"
parser = OptionParser(usage)
#
parser.add_option('--num-cells', dest='num_cells', type='int', default=800,
help='Number of LSTM cells [default: %default]');
parser.add_option('--num-recurrent', dest='num_recurrent', type='int', default=512,
help='Number of LSTM recurrent units [default: %default]');
parser.add_option('--lstm-stddev-factor', dest='lstm_stddev_factor', type='float', default=0.01,
help='Standard deviation of initialization [default: %default]');
parser.add_option('--param-stddev-factor', dest='param_stddev_factor', type='float', default=0.04,
help='Standard deviation in output layer [default: %default]');
#
(o,args) = parser.parse_args()
if len(args) != 2 :
parser.print_help()
sys.exit(1)
(feat_dim, num_leaves) = map(int,args);
# Original prototype from Jiayu,
#<NnetProto>
#<Transmit> <InputDim> 40 <OutputDim> 40
#<LstmProjectedStreams> <InputDim> 40 <OutputDim> 512 <CellDim> 800 <ParamScale> 0.01 <NumStream> 4
#<AffineTransform> <InputDim> 512 <OutputDim> 8000 <BiasMean> 0.000000 <BiasRange> 0.000000 <ParamStddev> 0.04
#<Softmax> <InputDim> 8000 <OutputDim> 8000
#</NnetProto>
print "<NnetProto>"
print "<LstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f" % \
(feat_dim, o.num_recurrent, o.num_cells, o.lstm_stddev_factor)
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0 <ParamStddev> %f" % \
(o.num_recurrent, num_leaves, o.param_stddev_factor)
print "<Softmax> <InputDim> %d <OutputDim> %d" % \
(num_leaves, num_leaves)
print "</NnetProto>"
......@@ -473,7 +473,7 @@ void MatrixBase<Real>::AddMatDiagVec(
if (num_rows_ == 0) return;
for (MatrixIndexT i = 0; i < num_rows; i++){
for(MatrixIndexT j = 0; j < num_cols; j ++ ){
data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride+j];
data[i*stride + j] += alpha * vdata[j] * Mdata[i*M_row_stride + j*M_col_stride];
}
}
}
......
......@@ -80,8 +80,6 @@ public:
ReadToken(is, false, &token);
if (token == "<CellDim>")
ReadBasicType(is, false, &ncell_);
else if (token == "<NumStream>")
ReadBasicType(is, false, &nstream_);
//else if (token == "<DropoutRate>")
// ReadBasicType(is, false, &dropout_rate_);
else if (token == "<ParamScale>")
......@@ -92,8 +90,6 @@ public:
is >> std::ws;
}
prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
// init weight and bias (Uniform)
w_gifo_x_.Resize(4*ncell_, input_dim_, kUndefined);
w_gifo_r_.Resize(4*ncell_, nrecur_, kUndefined);
......@@ -129,8 +125,6 @@ public:
void ReadData(std::istream &is, bool binary) {
ExpectToken(is, binary, "<CellDim>");
ReadBasicType(is, binary, &ncell_);
ExpectToken(is, binary, "<NumStream>");
ReadBasicType(is, binary, &nstream_);
//ExpectToken(is, binary, "<DropoutRate>");
//ReadBasicType(is, binary, &dropout_rate_);
......@@ -144,8 +138,6 @@ public:
w_r_m_.Read(is, binary);
prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
// init delta buffers
w_gifo_x_corr_.Resize(4*ncell_, input_dim_, kSetZero);
w_gifo_r_corr_.Resize(4*ncell_, nrecur_, kSetZero);
......@@ -161,8 +153,6 @@ public:
void WriteData(std::ostream &os, bool binary) const {
WriteToken(os, binary, "<CellDim>");
WriteBasicType(os, binary, ncell_);
WriteToken(os, binary, "<NumStream>");
WriteBasicType(os, binary, nstream_);
//WriteToken(os, binary, "<DropoutRate>");
//WriteBasicType(os, binary, dropout_rate_);
......@@ -239,6 +229,12 @@ public:
}
void ResetLstmStreams(const std::vector<int32> &stream_reset_flag) {
// allocate prev_nnet_state_ if not done yet,
if (nstream_ == 0) {
// Karel: we just got number of streams! (before the 1st batch comes)
nstream_ = stream_reset_flag.size();
prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
}
// reset flag: 1 - reset stream network state
KALDI_ASSERT(prev_nnet_state_.NumRows() == stream_reset_flag.size());
for (int s = 0; s < stream_reset_flag.size(); s++) {
......@@ -251,6 +247,12 @@ public:
void PropagateFnc(const CuMatrixBase<BaseFloat> &in, CuMatrixBase<BaseFloat> *out) {
int DEBUG = 0;
if (nstream_ == 0) {
nstream_ = 1; // Karel: we are in nnet-forward, so 1 stream,
prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
}
KALDI_ASSERT(nstream_ > 0);
KALDI_ASSERT(in.NumRows() % nstream_ == 0);
int32 T = in.NumRows() / nstream_;
int32 S = nstream_;
......
......@@ -10,6 +10,7 @@ BINFILES = nnet-train-frmshuff \
nnet-train-perutt \
nnet-train-mmi-sequential \
nnet-train-mpe-sequential \
nnet-train-lstm-streams \
rbm-train-cd1-frmshuff rbm-convert-to-nnet \
nnet-forward nnet-copy nnet-info nnet-concat \
transf-to-nnet cmvn-to-nnet nnet-initialize \
......
// nnetbin/nnet-train-lstm-streams.cc
// Copyright 2014 Jiayu DU (Jerry), Wei Li
// Copyright 2015 Brno University of Technology (Author: Karel Vesely)
// 2014 Jiayu DU (Jerry), Wei Li
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -58,11 +59,13 @@ int main(int argc, char *argv[]) {
std::string objective_function = "xent";
po.Register("objective-function", &objective_function, "Objective function : xent|mse");
/*
int32 length_tolerance = 5;
po.Register("length-tolerance", &length_tolerance, "Allowed length difference of features/targets (frames)");
std::string frame_weights;
po.Register("frame-weights", &frame_weights, "Per-frame weights to scale gradients (frame selection/weighting).");
*/
std::string use_gpu="yes";
po.Register("use-gpu", &use_gpu, "yes|no|optional, only has effect if compiled with CUDA");
......@@ -86,6 +89,7 @@ int main(int argc, char *argv[]) {
rnd_opts.Register(&po);
bool randomize = false;
po.Register("randomize", &randomize, "Dummy option, for compatibility...");
//
po.Read(argc, argv);
......@@ -126,10 +130,13 @@ int main(int argc, char *argv[]) {
SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
RandomAccessPosteriorReader target_reader(targets_rspecifier);
/*
RandomAccessBaseFloatVectorReader weights_reader;
if (frame_weights != "") {
weights_reader.Open(frame_weights);
}
*/
RandomizerMask randomizer_mask(rnd_opts);
MatrixRandomizer feature_randomizer(rnd_opts);
......@@ -173,12 +180,12 @@ int main(int argc, char *argv[]) {
keys[s] = feature_reader.Key();
const Matrix<BaseFloat> &mat = feature_reader.Value();
{ // apply optional feature transform,
// Karel: feature transform may contain <Splice> which copies frames
// on sentence boundaries. It is better to apply feature transform
// to whole sentences.
// Karel: feature transform may contain <Splice> which does clone
// frames on sentence boundaries. It is better to apply feature
// transform to whole sentences.
nnet_transf.Feedforward(CuMatrix<BaseFloat>(mat), &feat_transf);
feats[s].Resize(feat_transf.NumRows(), feat_transf.NumCols())
feat_transf.CopyToMat(feats[s]);
feats[s].Resize(feat_transf.NumRows(), feat_transf.NumCols());
feat_transf.CopyToMat(&feats[s]);
}
if (!target_reader.HasKey(keys[s])) {
KALDI_WARN << keys[s] << ", missing targets";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment