Commit 13c78d3b authored by Karel Vesely's avatar Karel Vesely

Merge pull request #66 from nichongjia/blstm

blstm remove bug
parents 9c257c5a 7afae3f8
......@@ -58,17 +58,17 @@ print "<NnetProto>"
# normally we won't use more than 2 layers of LSTM
if o.num_layers == 1:
print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
(feat_dim, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
(feat_dim, 2*o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
elif o.num_layers == 2:
print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
(feat_dim, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
(feat_dim, 2*o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
print "<BLstmProjectedStreams> <InputDim> %d <OutputDim> %d <CellDim> %s <ParamScale> %f <ClipGradient> %f" % \
(o.num_recurrent, o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
(2*o.num_recurrent, 2*o.num_recurrent, o.num_cells, o.lstm_stddev_factor, o.clip_gradient)
else:
sys.stderr.write("make_lstm_proto.py ERROR: more than 2 layers of LSTM, not supported yet.\n")
sys.exit(1)
print "<AffineTransform> <InputDim> %d <OutputDim> %d <BiasMean> 0.0 <BiasRange> 0.0 <ParamStddev> %f" % \
(o.num_recurrent, num_leaves, o.param_stddev_factor)
(2*o.num_recurrent, num_leaves, o.param_stddev_factor)
print "<Softmax> <InputDim> %d <OutputDim> %d" % \
(num_leaves, num_leaves)
print "</NnetProto>"
......
// nnet/nnet-lstm-projected-streams.h
// nnet/nnet-blstm-projected-streams.h
// Copyright 2014 Jiayu DU (Jerry), Wei Li
// Copyright 2015 Chongjia Ni
......@@ -49,7 +49,7 @@ class BLstmProjectedStreams : public UpdatableComponent {
BLstmProjectedStreams(int32 input_dim, int32 output_dim) :
UpdatableComponent(input_dim, output_dim),
ncell_(0),
nrecur_(output_dim),
nrecur_(static_cast<int32>(output_dim/2)),
nstream_(0),
clip_gradient_(0.0)
//, dropout_rate_(0.0)
......@@ -75,7 +75,12 @@ class BLstmProjectedStreams : public UpdatableComponent {
v = tmp;
}
void InitData(std::istream &is) {
/// set the utterance length used for parallel training
void SetSeqLengths(const std::vector<int32> &sequence_lengths) {
sequence_lengths_ = sequence_lengths;
}
void InitData(const std::istream &is) {
// define options
float param_scale = 0.02;
// parse config
......@@ -86,7 +91,7 @@ class BLstmProjectedStreams : public UpdatableComponent {
ReadBasicType(is, false, &ncell_);
else if (token == "<ClipGradient>")
ReadBasicType(is, false, &clip_gradient_);
//else if (token == "<DropoutRate>")
// else if (token == "<DropoutRate>")
// ReadBasicType(is, false, &dropout_rate_);
else if (token == "<ParamScale>")
ReadBasicType(is, false, &param_scale);
......@@ -121,7 +126,6 @@ class BLstmProjectedStreams : public UpdatableComponent {
InitVecParam(f_bias_, param_scale);
InitVecParam(b_bias_, param_scale);
// This is for the input gate, forget gate and output gate connected with the previous cell
// forward direction
f_peephole_i_c_.Resize(ncell_, kUndefined);
f_peephole_f_c_.Resize(ncell_, kUndefined);
......@@ -174,8 +178,8 @@ class BLstmProjectedStreams : public UpdatableComponent {
ReadBasicType(is, binary, &ncell_);
ExpectToken(is, binary, "<ClipGradient>");
ReadBasicType(is, binary, &clip_gradient_);
//ExpectToken(is, binary, "<DropoutRate>");
//ReadBasicType(is, binary, &dropout_rate_);
// ExpectToken(is, binary, "<DropoutRate>");
// ReadBasicType(is, binary, &dropout_rate_);
// reading parameters corresponding to forward direction
f_w_gifo_x_.Read(is, binary);
......@@ -439,52 +443,9 @@ class BLstmProjectedStreams : public UpdatableComponent {
"\n B_DR " + MomentStatistics(B_DR);
}
void ResetLstmStreams(const std::vector<int32> &stream_reset_flag) {
// allocate f_prev_nnet_state_, b_prev_nnet_state_ if not done yet,
if (nstream_ == 0) {
// Karel: we just got number of streams! (before the 1st batch comes)
nstream_ = stream_reset_flag.size();
// forward direction
f_prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
// backward direction
b_prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
KALDI_LOG << "Running training with " << nstream_ << " streams.";
}
// reset flag: 1 - reset stream network state
KALDI_ASSERT(f_prev_nnet_state_.NumRows() == stream_reset_flag.size());
KALDI_ASSERT(b_prev_nnet_state_.NumRows() == stream_reset_flag.size());
for (int s = 0; s < stream_reset_flag.size(); s++) {
if (stream_reset_flag[s] == 1) {
// forward direction
f_prev_nnet_state_.Row(s).SetZero();
// backward direction
b_prev_nnet_state_.Row(s).SetZero();
}
}
}
void PropagateFnc(const CuMatrixBase<BaseFloat> &in, CuMatrixBase<BaseFloat> *out) {
int DEBUG = 0;
static bool do_stream_reset = false;
if (nstream_ == 0) {
do_stream_reset = true;
nstream_ = 1; // Karel: we are in nnet-forward, so we will use 1 stream,
// forward direction
f_prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
// backward direction
b_prev_nnet_state_.Resize(nstream_, 7*ncell_ + 1*nrecur_, kSetZero);
KALDI_LOG << "Running nnet-forward with per-utterance BLSTM-state reset";
}
if (do_stream_reset) {
// resetting the forward and backward streams
f_prev_nnet_state_.SetZero();
b_prev_nnet_state_.SetZero();
}
KALDI_ASSERT(nstream_ > 0);
int32 nstream_ = sequence_lengths_.size();
KALDI_ASSERT(in.NumRows() % nstream_ == 0);
int32 T = in.NumRows() / nstream_;
int32 S = nstream_;
......@@ -492,12 +453,8 @@ class BLstmProjectedStreams : public UpdatableComponent {
// 0:forward pass history, [1, T]:current sequence, T+1:dummy
// forward direction
f_propagate_buf_.Resize((T+2)*S, 7 * ncell_ + nrecur_, kSetZero);
f_propagate_buf_.RowRange(0*S,S).CopyFromMat(f_prev_nnet_state_);
// backward direction
b_propagate_buf_.Resize((T+2)*S, 7 * ncell_ + nrecur_, kSetZero);
// for the backward direction, we initialize it at (T+1) frame
b_propagate_buf_.RowRange((T+1)*S,S).CopyFromMat(b_prev_nnet_state_);
// disassembling forward-pass forward-propagation buffer into different neurons,
CuSubMatrix<BaseFloat> F_YG(f_propagate_buf_.ColRange(0*ncell_, ncell_));
......@@ -525,32 +482,33 @@ class BLstmProjectedStreams : public UpdatableComponent {
// forward direction
// x -> g, i, f, o, not recurrent, do it all in once
F_YGIFO.RowRange(1*S,T*S).AddMatMat(1.0, in, kNoTrans, f_w_gifo_x_, kTrans, 0.0);
F_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, f_w_gifo_x_, kTrans, 0.0);
// bias -> g, i, f, o
F_YGIFO.RowRange(1*S,T*S).AddVecToRows(1.0, f_bias_);
F_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, f_bias_);
for (int t = 1; t <= T; t++) {
// multistream buffers for current time-step
CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_gifo(F_YGIFO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_all(f_propagate_buf_.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_gifo(F_YGIFO.RowRange(t*S, S));
// r(t-1) -> g, i, f, o
y_gifo.AddMatMat(1.0, F_YR.RowRange((t-1)*S,S), kNoTrans, f_w_gifo_r_, kTrans, 1.0);
y_gifo.AddMatMat(1.0, F_YR.RowRange((t-1)*S, S), kNoTrans, f_w_gifo_r_, kTrans, 1.0);
// c(t-1) -> i(t) via peephole
y_i.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S,S), kNoTrans, f_peephole_i_c_, 1.0);
y_i.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);
// c(t-1) -> f(t) via peephole
y_f.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S,S), kNoTrans, f_peephole_f_c_, 1.0);
y_f.AddMatDiagVec(1.0, F_YC.RowRange((t-1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);
// i, f sigmoid squashing
y_i.Sigmoid(y_i);
......@@ -563,7 +521,7 @@ class BLstmProjectedStreams : public UpdatableComponent {
y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
// c(t-1) -> c(t) via forget-gate
y_c.AddMatMatElements(1.0, F_YC.RowRange((t-1)*S,S), y_f, 1.0);
y_c.AddMatMatElements(1.0, F_YC.RowRange((t-1)*S, S), y_f, 1.0);
y_c.ApplyFloor(-50); // optional clipping of cell activation
y_c.ApplyCeiling(50); // google paper Interspeech2014: LSTM for LVCSR
......@@ -583,6 +541,12 @@ class BLstmProjectedStreams : public UpdatableComponent {
// m -> r
y_r.AddMatMat(1.0, y_m, kNoTrans, f_w_r_m_, kTrans, 0.0);
// set zeros
// for (int s = 0; s < S; s++) {
// if (t > sequence_lengths_[s])
// y_all.Row(s).SetZero();
// }
if (DEBUG) {
std::cerr << "forward direction forward-pass frame " << t << "\n";
std::cerr << "activation of g: " << y_g;
......@@ -597,43 +561,43 @@ class BLstmProjectedStreams : public UpdatableComponent {
}
// backward direction
B_YGIFO.RowRange(1*S,T*S).AddMatMat(1.0, in, kNoTrans, b_w_gifo_x_, kTrans, 0.0);
B_YGIFO.RowRange(1*S, T*S).AddMatMat(1.0, in, kNoTrans, b_w_gifo_x_, kTrans, 0.0);
//// LSTM forward dropout
//// Google paper 2014: Recurrent Neural Network Regularization
//// by Wojciech Zaremba, Ilya Sutskever, Oriol Vinyals
//if (dropout_rate_ != 0.0) {
// if (dropout_rate_ != 0.0) {
// dropout_mask_.Resize(in.NumRows(), 4*ncell_, kUndefined);
// dropout_mask_.SetRandUniform(); // [0,1]
// dropout_mask_.Add(-dropout_rate_); // [-dropout_rate, 1-dropout_rate_],
// dropout_mask_.ApplyHeaviside(); // -tive -> 0.0, +tive -> 1.0
// YGIFO.RowRange(1*S,T*S).MulElements(dropout_mask_);
//}
// }
// bias -> g, i, f, o
B_YGIFO.RowRange(1*S,T*S).AddVecToRows(1.0, b_bias_);
B_YGIFO.RowRange(1*S, T*S).AddVecToRows(1.0, b_bias_);
// backward direction, from T to 1, t--
for (int t = T; t >= 1; t--) {
// multistream buffers for current time-step
CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_gifo(B_YGIFO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_all(b_propagate_buf_.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_gifo(B_YGIFO.RowRange(t*S, S));
// r(t+1) -> g, i, f, o
y_gifo.AddMatMat(1.0, B_YR.RowRange((t+1)*S,S), kNoTrans, b_w_gifo_r_, kTrans, 1.0);
y_gifo.AddMatMat(1.0, B_YR.RowRange((t+1)*S, S), kNoTrans, b_w_gifo_r_, kTrans, 1.0);
// c(t+1) -> i(t) via peephole
y_i.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S,S), kNoTrans, b_peephole_i_c_, 1.0);
y_i.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);
// c(t+1) -> f(t) via peephole
y_f.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S,S), kNoTrans, b_peephole_f_c_, 1.0);
y_f.AddMatDiagVec(1.0, B_YC.RowRange((t+1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);
// i, f sigmoid squashing
y_i.Sigmoid(y_i);
......@@ -646,7 +610,7 @@ class BLstmProjectedStreams : public UpdatableComponent {
y_c.AddMatMatElements(1.0, y_g, y_i, 0.0);
// c(t+1) -> c(t) via forget-gate
y_c.AddMatMatElements(1.0, B_YC.RowRange((t+1)*S,S), y_f, 1.0);
y_c.AddMatMatElements(1.0, B_YC.RowRange((t+1)*S, S), y_f, 1.0);
y_c.ApplyFloor(-50); // optional clipping of cell activation
y_c.ApplyCeiling(50); // google paper Interspeech2014: LSTM for LVCSR
......@@ -666,6 +630,11 @@ class BLstmProjectedStreams : public UpdatableComponent {
// m -> r
y_r.AddMatMat(1.0, y_m, kNoTrans, b_w_r_m_, kTrans, 0.0);
for (int s = 0; s < S; s++) {
if (t > sequence_lengths_[s])
y_all.Row(s).SetZero();
}
if (DEBUG) {
std::cerr << "backward direction forward-pass frame " << t << "\n";
std::cerr << "activation of g: " << y_g;
......@@ -679,26 +648,21 @@ class BLstmProjectedStreams : public UpdatableComponent {
}
}
// According to definition of BLSTM, for output YR of BLSTM, YR should be F_YR + B_YR
CuSubMatrix<BaseFloat> YR(F_YR.RowRange(1*S,T*S));
YR.AddMat(1.0,B_YR.RowRange(1*S,T*S));
CuMatrix<BaseFloat> YR_FB;
YR_FB.Resize((T+2)*S, 2 * nrecur_, kSetZero);
// forward part
YR_FB.ColRange(0, nrecur_).CopyFromMat(f_propagate_buf_.ColRange(7*ncell_, nrecur_));
// backward part
YR_FB.ColRange(nrecur_, nrecur_).CopyFromMat(b_propagate_buf_.ColRange(7*ncell_, nrecur_));
// recurrent projection layer is also feed-forward as BLSTM output
out->CopyFromMat(YR);
// now the last frame state becomes previous network state for next batch
f_prev_nnet_state_.CopyFromMat(f_propagate_buf_.RowRange(T*S,S));
// now the last frame (that is, the first frame) becomes the previous network state for the next batch
b_prev_nnet_state_.CopyFromMat(b_propagate_buf_.RowRange(1*S,S));
out->CopyFromMat(YR_FB.RowRange(1*S, T*S));
}
void BackpropagateFnc(const CuMatrixBase<BaseFloat> &in, const CuMatrixBase<BaseFloat> &out,
const CuMatrixBase<BaseFloat> &out_diff, CuMatrixBase<BaseFloat> *in_diff) {
int DEBUG = 0;
// the number of sequences to be processed in parallel
int32 nstream_ = sequence_lengths_.size();
int32 T = in.NumRows() / nstream_;
int32 S = nstream_;
// disassembling forward-pass forward-propagation buffer into different neurons,
......@@ -727,31 +691,31 @@ class BLstmProjectedStreams : public UpdatableComponent {
CuSubMatrix<BaseFloat> F_DGIFO(f_backpropagate_buf_.ColRange(0, 4*ncell_));
// projection layer to BLSTM output is not recurrent, so backprop it all in once
F_DR.RowRange(1*S,T*S).CopyFromMat(out_diff);
F_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(0, nrecur_));
for (int t = T; t >= 1; t--) {
CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_g(F_DG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_i(F_DI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_f(F_DF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_o(F_DO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_c(F_DC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_h(F_DH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_m(F_DM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_r(F_DR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_g(F_YG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_i(F_YI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_f(F_YF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_o(F_YO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_c(F_YC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_h(F_YH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_m(F_YM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_r(F_YR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_g(F_DG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_i(F_DI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_f(F_DF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_o(F_DO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_c(F_DC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_h(F_DH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_m(F_DM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_r(F_DR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_all(f_backpropagate_buf_.RowRange(t*S, S));
// r
// Version 1 (precise gradients):
// backprop error from g(t+1), i(t+1), f(t+1), o(t+1) to r(t)
d_r.AddMatMat(1.0, F_DGIFO.RowRange((t+1)*S,S), kNoTrans, f_w_gifo_r_, kNoTrans, 1.0);
d_r.AddMatMat(1.0, F_DGIFO.RowRange((t+1)*S, S), kNoTrans, f_w_gifo_r_, kNoTrans, 1.0);
/*
// Version 2 (Alex Graves' PhD dissertation):
......@@ -785,13 +749,13 @@ class BLstmProjectedStreams : public UpdatableComponent {
// 4. diff from f(t+1) (via peephole)
// 5. diff from o(t) (via peephole, not recurrent)
d_c.AddMat(1.0, d_h);
d_c.AddMatMatElements(1.0, F_DC.RowRange((t+1)*S,S), F_YF.RowRange((t+1)*S,S), 1.0);
d_c.AddMatDiagVec(1.0, F_DI.RowRange((t+1)*S,S), kNoTrans, f_peephole_i_c_, 1.0);
d_c.AddMatDiagVec(1.0, F_DF.RowRange((t+1)*S,S), kNoTrans, f_peephole_f_c_, 1.0);
d_c.AddMatMatElements(1.0, F_DC.RowRange((t+1)*S, S), F_YF.RowRange((t+1)*S, S), 1.0);
d_c.AddMatDiagVec(1.0, F_DI.RowRange((t+1)*S, S), kNoTrans, f_peephole_i_c_, 1.0);
d_c.AddMatDiagVec(1.0, F_DF.RowRange((t+1)*S, S), kNoTrans, f_peephole_f_c_, 1.0);
d_c.AddMatDiagVec(1.0, d_o , kNoTrans, f_peephole_o_c_, 1.0);
// f
d_f.AddMatMatElements(1.0, d_c, F_YC.RowRange((t-1)*S,S), 0.0);
d_f.AddMatMatElements(1.0, d_c, F_YC.RowRange((t-1)*S, S), 0.0);
d_f.DiffSigmoid(y_f, d_f);
// i
......@@ -838,35 +802,35 @@ class BLstmProjectedStreams : public UpdatableComponent {
CuSubMatrix<BaseFloat> B_DH(b_backpropagate_buf_.ColRange(5*ncell_, ncell_));
CuSubMatrix<BaseFloat> B_DM(b_backpropagate_buf_.ColRange(6*ncell_, ncell_));
CuSubMatrix<BaseFloat> B_DR(b_backpropagate_buf_.ColRange(7*ncell_, nrecur_));
CuSubMatrix<BaseFloat> B_DGIFO(b_backpropagate_buf_.ColRange(0, 4*ncell_));
// projection layer to BLSTM output is not recurrent, so backprop it all in once
B_DR.RowRange(1*S,T*S).CopyFromMat(out_diff);
B_DR.RowRange(1*S, T*S).CopyFromMat(out_diff.ColRange(nrecur_, nrecur_));
for (int t = 1; t <= T; t++) {
CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_g(B_DG.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_i(B_DI.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_f(B_DF.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_o(B_DO.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_c(B_DC.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_h(B_DH.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_m(B_DM.RowRange(t*S,S));
CuSubMatrix<BaseFloat> d_r(B_DR.RowRange(t*S,S));
CuSubMatrix<BaseFloat> y_g(B_YG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_i(B_YI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_f(B_YF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_o(B_YO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_c(B_YC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_h(B_YH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_m(B_YM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> y_r(B_YR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_g(B_DG.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_i(B_DI.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_f(B_DF.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_o(B_DO.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_c(B_DC.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_h(B_DH.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_m(B_DM.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_r(B_DR.RowRange(t*S, S));
CuSubMatrix<BaseFloat> d_all(b_backpropagate_buf_.RowRange(t*S, S));
// r
// Version 1 (precise gradients):
// backprop error from g(t-1), i(t-1), f(t-1), o(t-1) to r(t)
d_r.AddMatMat(1.0, B_DGIFO.RowRange((t-1)*S,S), kNoTrans, b_w_gifo_r_, kNoTrans, 1.0);
d_r.AddMatMat(1.0, B_DGIFO.RowRange((t-1)*S, S), kNoTrans, b_w_gifo_r_, kNoTrans, 1.0);
/*
// Version 2 (Alex Graves' PhD dissertation):
......@@ -899,13 +863,13 @@ class BLstmProjectedStreams : public UpdatableComponent {
// 4. diff from f(t+1) (via peephole)
// 5. diff from o(t) (via peephole, not recurrent)
d_c.AddMat(1.0, d_h);
d_c.AddMatMatElements(1.0, B_DC.RowRange((t-1)*S,S), B_YF.RowRange((t-1)*S,S), 1.0);
d_c.AddMatDiagVec(1.0, B_DI.RowRange((t-1)*S,S), kNoTrans, b_peephole_i_c_, 1.0);
d_c.AddMatDiagVec(1.0, B_DF.RowRange((t-1)*S,S), kNoTrans, b_peephole_f_c_, 1.0);
d_c.AddMatMatElements(1.0, B_DC.RowRange((t-1)*S, S), B_YF.RowRange((t-1)*S, S), 1.0);
d_c.AddMatDiagVec(1.0, B_DI.RowRange((t-1)*S, S), kNoTrans, b_peephole_i_c_, 1.0);
d_c.AddMatDiagVec(1.0, B_DF.RowRange((t-1)*S, S), kNoTrans, b_peephole_f_c_, 1.0);
d_c.AddMatDiagVec(1.0, d_o , kNoTrans, b_peephole_o_c_, 1.0);
// f
d_f.AddMatMatElements(1.0, d_c, B_YC.RowRange((t-1)*S,S), 0.0);
d_f.AddMatMatElements(1.0, d_c, B_YC.RowRange((t-1)*S, S), 0.0);
d_f.DiffSigmoid(y_f, d_f);
// i
......@@ -932,13 +896,12 @@ class BLstmProjectedStreams : public UpdatableComponent {
// g,i,f,o -> x, do it all in once
// forward direction difference
in_diff->AddMatMat(1.0, F_DGIFO.RowRange(1*S,T*S), kNoTrans, f_w_gifo_x_, kNoTrans, 0.0);
in_diff->AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kNoTrans, f_w_gifo_x_, kNoTrans, 0.0);
// backward direction difference
in_diff->AddMatMat(1.0, B_DGIFO.RowRange(1*S,T*S), kNoTrans, b_w_gifo_x_, kNoTrans, 1.0);
in_diff->AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kNoTrans, b_w_gifo_x_, kNoTrans, 1.0);
// backward pass dropout
//if (dropout_rate_ != 0.0) {
// if (dropout_rate_ != 0.0) {
// in_diff->MulElements(dropout_mask_);
//}
......@@ -947,26 +910,26 @@ class BLstmProjectedStreams : public UpdatableComponent {
// forward direction
// weight x -> g, i, f, o
f_w_gifo_x_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S,T*S), kTrans,
f_w_gifo_x_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
in, kNoTrans, mmt);
// recurrent weight r -> g, i, f, o
f_w_gifo_r_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S,T*S), kTrans,
F_YR.RowRange(0*S,T*S), kNoTrans, mmt);
f_w_gifo_r_corr_.AddMatMat(1.0, F_DGIFO.RowRange(1*S, T*S), kTrans,
F_YR.RowRange(0*S, T*S), kNoTrans, mmt);
// bias of g, i, f, o
f_bias_corr_.AddRowSumMat(1.0, F_DGIFO.RowRange(1*S,T*S), mmt);
f_bias_corr_.AddRowSumMat(1.0, F_DGIFO.RowRange(1*S, T*S), mmt);
// recurrent peephole c -> i
f_peephole_i_c_corr_.AddDiagMatMat(1.0, F_DI.RowRange(1*S,T*S), kTrans,
F_YC.RowRange(0*S,T*S), kNoTrans, mmt);
f_peephole_i_c_corr_.AddDiagMatMat(1.0, F_DI.RowRange(1*S, T*S), kTrans,
F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
// recurrent peephole c -> f
f_peephole_f_c_corr_.AddDiagMatMat(1.0, F_DF.RowRange(1*S,T*S), kTrans,
F_YC.RowRange(0*S,T*S), kNoTrans, mmt);
f_peephole_f_c_corr_.AddDiagMatMat(1.0, F_DF.RowRange(1*S, T*S), kTrans,
F_YC.RowRange(0*S, T*S), kNoTrans, mmt);
// peephole c -> o
f_peephole_o_c_corr_.AddDiagMatMat(1.0, F_DO.RowRange(1*S,T*S), kTrans,
F_YC.RowRange(1*S,T*S), kNoTrans, mmt);
f_peephole_o_c_corr_.AddDiagMatMat(1.0, F_DO.RowRange(1*S, T*S), kTrans,
F_YC.RowRange(1*S, T*S), kNoTrans, mmt);
f_w_r_m_corr_.AddMatMat(1.0, F_DR.RowRange(1*S,T*S), kTrans,
F_YM.RowRange(1*S,T*S), kNoTrans, mmt);
f_w_r_m_corr_.AddMatMat(1.0, F_DR.RowRange(1*S, T*S), kTrans,
F_YM.RowRange(1*S, T*S), kNoTrans, mmt);
// apply the gradient clipping for forwardpass gradients
if (clip_gradient_ > 0.0) {
......@@ -988,25 +951,25 @@ class BLstmProjectedStreams : public UpdatableComponent {
// backward direction backpropagate
// weight x -> g, i, f, o
b_w_gifo_x_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S,T*S), kTrans, in, kNoTrans, mmt);
b_w_gifo_x_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans, in, kNoTrans, mmt);
// recurrent weight r -> g, i, f, o
b_w_gifo_r_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S,T*S), kTrans,
B_YR.RowRange(0*S,T*S) , kNoTrans, mmt);
b_w_gifo_r_corr_.AddMatMat(1.0, B_DGIFO.RowRange(1*S, T*S), kTrans,
B_YR.RowRange(0*S, T*S) , kNoTrans, mmt);
// bias of g, i, f, o
b_bias_corr_.AddRowSumMat(1.0, B_DGIFO.RowRange(1*S,T*S), mmt);
// recurrent peephole c -> i, c(t+1) --> i ##commented by chongjia
b_peephole_i_c_corr_.AddDiagMatMat(1.0, B_DI.RowRange(1*S,T*S), kTrans,
B_YC.RowRange(2*S,T*S), kNoTrans, mmt);
// recurrent peephole c -> f, c(t+1) --> f ###commented by chongjia
b_peephole_f_c_corr_.AddDiagMatMat(1.0, B_DF.RowRange(1*S,T*S), kTrans,
B_YC.RowRange(2*S,T*S), kNoTrans, mmt);
b_bias_corr_.AddRowSumMat(1.0, B_DGIFO.RowRange(1*S, T*S), mmt);
// recurrent peephole c -> i, c(t+1) --> i
b_peephole_i_c_corr_.AddDiagMatMat(1.0, B_DI.RowRange(1*S, T*S), kTrans,
B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
// recurrent peephole c -> f, c(t+1) --> f
b_peephole_f_c_corr_.AddDiagMatMat(1.0, B_DF.RowRange(1*S, T*S), kTrans,
B_YC.RowRange(2*S, T*S), kNoTrans, mmt);
// peephole c -> o
b_peephole_o_c_corr_.AddDiagMatMat(1.0, B_DO.RowRange(1*S,T*S), kTrans,
B_YC.RowRange(1*S,T*S), kNoTrans, mmt);
b_peephole_o_c_corr_.AddDiagMatMat(1.0, B_DO.RowRange(1*S, T*S), kTrans,
B_YC.RowRange(1*S, T*S), kNoTrans, mmt);
b_w_r_m_corr_.AddMatMat(1.0, B_DR.RowRange(1*S,T*S), kTrans,
B_YM.RowRange(1*S,T*S), kNoTrans, mmt);
b_w_r_m_corr_.AddMatMat(1.0, B_DR.RowRange(1*S, T*S), kTrans,
B_YM.RowRange(1*S, T*S), kNoTrans, mmt);
// apply the gradient clipping for backwardpass gradients
if (clip_gradient_ > 0.0) {
......@@ -1083,16 +1046,14 @@ class BLstmProjectedStreams : public UpdatableComponent {
int32 ncell_; ///< the number of cell blocks
int32 nrecur_; ///< recurrent projection layer dim
int32 nstream_;
CuMatrix<BaseFloat> f_prev_nnet_state_;
CuMatrix<BaseFloat> b_prev_nnet_state_;
std::vector<int32> sequence_lengths_;
// gradient-clipping value,
BaseFloat clip_gradient_;
// non-recurrent dropout
//BaseFloat dropout_rate_;
//CuMatrix<BaseFloat> dropout_mask_;
// BaseFloat dropout_rate_;
// CuMatrix<BaseFloat> dropout_mask_;
// feed-forward connections: from x to [g, i, f, o]
// forward direction
......@@ -1160,7 +1121,7 @@ class BLstmProjectedStreams : public UpdatableComponent {
CuMatrix<BaseFloat> b_backpropagate_buf_;
};
} // namespace nnet1
} // namespace kaldi
} // namespace nnet1
} // namespace kaldi
#endif
......@@ -120,7 +120,7 @@ Component* Component::NewComponentOfType(ComponentType comp_type,
ans = new LstmProjectedStreams(input_dim, output_dim);
break;
case Component::kBLstmProjectedStreams :
ans = new BLstmProjectedStreams(input_dim, output_dim);
ans = new BLstmProjectedStreams(input_dim, output_dim);
break;
case Component::kSoftmax :
ans = new Softmax(input_dim, output_dim);
......
......@@ -91,7 +91,7 @@ class Component {
/// Convert component type to marker
static const char* TypeToMarker(ComponentType t);
/// Convert marker to component type (case insensitive)
static ComponentType MarkerToType(const std::string &s);
static ComponentType MarkerToType(const std::string &s);
/// General interface of a component
public:
......
......@@ -32,7 +32,7 @@ namespace nnet1 {
Nnet::Nnet(const Nnet& other) {
// copy the components
for(int32 i=0; i<other.NumComponents(); i++) {
for(int32 i = 0; i < other.NumComponents(); i++) {
components_.push_back(other.GetComponent(i).Copy());
}
// create empty buffers
......@@ -40,13 +40,13 @@ Nnet::Nnet(const Nnet& other) {
backpropagate_buf_.resize(NumComponents()+1);
// copy train opts
SetTrainOptions(other.opts_);
Check();
Check();
}
Nnet & Nnet::operator = (const Nnet& other) {
Destroy();
// copy the components
for(int32 i=0; i<other.NumComponents(); i++) {
for(int32 i = 0; i < other.NumComponents(); i++) {
components_.push_back(other.GetComponent(i).Copy());
}
// create empty buffers
......@@ -356,15 +356,19 @@ void Nnet::ResetLstmStreams(const std::vector<int32> &stream_reset_flag) {
if (GetComponent(c).GetType() == Component::kLstmProjectedStreams) {
LstmProjectedStreams& comp = dynamic_cast<LstmProjectedStreams&>(GetComponent(c));
comp.ResetLstmStreams(stream_reset_flag);
}
}
}
}
void Nnet::SetSeqLengths(const std::vector<int32> &sequence_lengths) {
for (int32 c=0; c < NumComponents(); c++) {
if (GetComponent(c).GetType() == Component::kBLstmProjectedStreams) {
BLstmProjectedStreams& comp = dynamic_cast<BLstmProjectedStreams&>(GetComponent(c));
comp.ResetLstmStreams(stream_reset_flag);
comp.SetSeqLengths(sequence_lengths);
}
}
}