Commit 885586f9 authored by naxingyu

add Maxpooling component and example script

parent d773ab9e
#!/bin/bash
# 2015 Xingyu Na
# This runs on the full training set, using ConvNet setup with
# Sigmoid affine layers, on top of fbank features, on GPU.
temp_dir=
dir=exp/nnet2_convnet
stage=-5
train_original=data/train
train=data-fb/train
parallel_opts="--gpu 1" # This is suitable for the CLSP network; you'll
                        # likely have to change it.

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh
# Make the FBANK features
if [ $stage -le -5 ]; then
  # Dev set
  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1; rm data-fb/dev/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  # Training set
  utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    $train $train/log $train/data || exit 1;
  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
fi
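
# A note on the ConvNet options used below (my reading of the new components,
# not spelled out in this script): --patch-dim1 is the frequency-axis size of
# each convolution patch, and --pool-size is the number of adjacent conv
# patches that the new MaxpoolingComponent maxes over; the pooling is
# non-overlapping.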
(
  if [ ! -f $dir/final.mdl ]; then
    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
      --cmd "$decode_cmd" --stage $stage \
      --num-threads 1 --minibatch-size 512 \
      --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 15 --delta-order 2 \
      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
      --num-jobs-initial 3 --num-jobs-final 8 --num-hidden-layers 4 --splice-width 5 \
      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
      --num-filters2 256 --patch-dim2 4 \
      $train data/lang exp/tri5a_ali $dir || exit 1;
  fi

  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
    --config conf/decode.config \
    exp/tri5a/graph data-fb/dev \
    $dir/decode || exit 1;
)
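
# Illustrative usage (script path hypothetical; utils/parse_options.sh lets
# the variables defined above be overridden from the command line):
#   local/nnet2/run_convnet.sh --stage -5 --dir exp/nnet2_convnet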
@@ -84,7 +84,12 @@ fi
splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
case $feat_type in
-  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
+  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+    if [ -f $srcdir/delta_order ]; then
+      delta_order=`cat $srcdir/delta_order 2>/dev/null`
+      feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
+    fi
+    ;;
  lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
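
# For illustration (job/path values hypothetical): if the model was trained
# with --delta-order 2, the training dir is expected to contain a file
# "delta_order" holding "2", and the "raw" branch above expands to a
# pipeline like:
#   ark,s,cs:apply-cmvn --utt2spk=ark:$sdata/1/utt2spk scp:$sdata/1/cmvn.scp \
#     scp:$sdata/1/feats.scp ark:- | add-deltas --delta-order=2 ark:- ark:- |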
@@ -307,6 +307,31 @@ void UnitTestPnormComponent() {
  }
}

void UnitTestMaxpoolingComponent() {
  // We're testing that the gradients are computed correctly;
  // the component has no trainable parameters, so only the
  // input derivatives are checked.
  for (int32 i = 0; i < 5; i++) {
    int32 pool_stride = 5 + Rand() % 10,
        pool_size = 2 + Rand() % 3,
        num_pools = 1 + Rand() % 10;
    int32 output_dim = num_pools * pool_stride;
    int32 num_patches = num_pools * pool_size;
    int32 input_dim = pool_stride * num_patches;
    MaxpoolingComponent component(input_dim, output_dim,
                                  pool_size, pool_stride);
    UnitTestGenericComponentInternal(component);
  }

  {
    MaxpoolingComponent component;
    component.InitFromString("input-dim=192 output-dim=64 pool-size=3 pool-stride=16");
    UnitTestGenericComponentInternal(component);
  }
}
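
// A quick check of the InitFromString() test above, matching the asserts in
// MaxpoolingComponent::Init(): num_patches = 192 / 16 = 12, num_pools =
// 12 / 3 = 4, and output_dim = 4 * 16 = 64, as specified.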
void UnitTestAffineComponent() {
@@ -850,6 +875,7 @@ int main() {
  UnitTestSpliceComponent();
  UnitTestMaxoutComponent();
  UnitTestPnormComponent();
+  UnitTestMaxpoolingComponent();
  UnitTestGenericComponent<NormalizeComponent>();
  UnitTestSigmoidComponent();
  UnitTestAffineComponent();
@@ -104,6 +104,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
    ans = new AdditiveNoiseComponent();
  } else if (component_type == "ConvolutionComponent") {
    ans = new ConvolutionComponent();
+  } else if (component_type == "MaxpoolingComponent") {
+    ans = new MaxpoolingComponent();
  }
  return ans;
}
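
// With this factory entry, a saved model containing "<MaxpoolingComponent>"
// can be deserialized: Component::ReadNew() reads the type token, strips the
// angle brackets, constructs the object via NewComponentOfType(), then calls
// its Read(). (My summary of the surrounding nnet2 code, not part of this
// diff.)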
@@ -3905,12 +3907,12 @@ void ConvolutionComponent::Backprop(const ChunkInfo &in_info,
                                    const CuMatrixBase<BaseFloat> &out_deriv,
                                    Component *to_update_in,
                                    CuMatrix<BaseFloat> *in_deriv) const {
-  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);
+  in_deriv->Resize(out_deriv.NumRows(), InputDim());
  ConvolutionComponent *to_update = dynamic_cast<ConvolutionComponent*>(to_update_in);
  int32 num_splice = InputDim() / patch_stride_;
  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
  int32 num_filters = filter_params_.NumRows();
-  int32 num_frames = in_value.NumRows();
+  int32 num_frames = out_deriv.NumRows();
  int32 filter_dim = filter_params_.NumCols();

  /** Buffer for backpropagation:
@@ -4112,5 +4114,138 @@ void ConvolutionComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
  bias_params_.AddVec(learning_rate_, bias_grad);
}
void MaxpoolingComponent::Init(int32 input_dim, int32 output_dim,
                               int32 pool_size, int32 pool_stride) {
  input_dim_ = input_dim;
  output_dim_ = output_dim;
  pool_size_ = pool_size;
  pool_stride_ = pool_stride;

  // sanity checks:
  // number of patches
  KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
  int32 num_patches = input_dim_ / pool_stride_;
  // number of pools
  KALDI_ASSERT(num_patches % pool_size_ == 0);
  int32 num_pools = num_patches / pool_size_;
  // check output dim
  KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
}
void MaxpoolingComponent::InitFromString(std::string args) {
  std::string orig_args(args);
  int32 input_dim = 0;
  int32 output_dim = 0;
  int32 pool_size = -1, pool_stride = -1;
  bool ok = true;

  ok = ok && ParseFromString("input-dim", &args, &input_dim);
  ok = ok && ParseFromString("output-dim", &args, &output_dim);
  ok = ok && ParseFromString("pool-size", &args, &pool_size);
  ok = ok && ParseFromString("pool-stride", &args, &pool_stride);

  KALDI_LOG << output_dim << " " << input_dim << " " << ok;
  KALDI_LOG << "Pool: " << pool_size << " "
            << pool_stride << " " << ok;
  if (!ok || !args.empty() || output_dim <= 0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << orig_args << "\"";
  Init(input_dim, output_dim, pool_size, pool_stride);
}
void MaxpoolingComponent::Propagate(const ChunkInfo &in_info,
                                    const ChunkInfo &out_info,
                                    const CuMatrixBase<BaseFloat> &in,
                                    CuMatrixBase<BaseFloat> *out) const {
  in_info.CheckSize(in);
  out_info.CheckSize(*out);
  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());

  int32 num_patches = input_dim_ / pool_stride_;
  int32 num_pools = num_patches / pool_size_;

  // do the max-pooling
  for (int32 q = 0; q < num_pools; q++) {
    // get output buffer of the pool
    CuSubMatrix<BaseFloat> pool(out->ColRange(q * pool_stride_, pool_stride_));
    pool.Set(-1e20); // reset to a large negative value
    for (int32 r = 0; r < pool_size_; r++) {
      // column-by-column block comparison within the pool
      int32 p = r + q * pool_size_;
      pool.Max(in.ColRange(p * pool_stride_, pool_stride_));
    }
  }
}
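
// Illustration of Propagate() with small (hypothetical) dims: pool_stride_ = 2,
// pool_size_ = 2, input_dim_ = 8, so num_patches = 4, num_pools = 2 and
// output_dim_ = 4. Pool q = 0 writes the element-wise max of input column
// blocks [0,2) and [2,4) into output columns [0,2); pool q = 1 maxes input
// blocks [4,6) and [6,8) into output columns [2,4).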
void MaxpoolingComponent::Backprop(const ChunkInfo &, // in_info,
                                   const ChunkInfo &, // out_info,
                                   const CuMatrixBase<BaseFloat> &in_value,
                                   const CuMatrixBase<BaseFloat> &out_value,
                                   const CuMatrixBase<BaseFloat> &out_deriv,
                                   Component *to_update,
                                   CuMatrix<BaseFloat> *in_deriv) const {
  int32 num_patches = input_dim_ / pool_stride_;
  int32 num_pools = num_patches / pool_size_;
  std::vector<int32> patch_summands(num_patches, 0);
  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);

  for (int32 q = 0; q < num_pools; q++) {
    for (int32 r = 0; r < pool_size_; r++) {
      int32 p = r + q * pool_size_;
      CuSubMatrix<BaseFloat> in_p(in_value.ColRange(p * pool_stride_, pool_stride_));
      CuSubMatrix<BaseFloat> out_q(out_value.ColRange(q * pool_stride_, pool_stride_));
      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
      CuMatrix<BaseFloat> src(out_deriv.ColRange(q * pool_stride_, pool_stride_));
      // zero-out mask
      CuMatrix<BaseFloat> mask;
      in_p.EqualElementMask(out_q, &mask);
      src.MulElements(mask);
      tgt.AddMat(1.0, src);
      // count how many pools summed into this patch
      patch_summands[p] += 1;
    }
  }

  // scale in_deriv of overlapping pools
  for (int32 p = 0; p < num_patches; p++) {
    CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
    KALDI_ASSERT(patch_summands[p] > 0);
    tgt.Scale(1.0 / patch_summands[p]);
  }
}
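
// Note on the backprop above (my reading, not from the diff):
// EqualElementMask() sets mask(i,j) = 1.0 exactly where in_p equals the
// pooled output out_q, i.e. at the winning (max) elements, so out_deriv is
// routed back only to those positions. And since p = q * pool_size_ + r
// enumerates each patch exactly once, patch_summands[p] is always 1 for this
// non-overlapping pooling; the final scaling loop only matters if overlapping
// pools were ever added.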
void MaxpoolingComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<MaxpoolingComponent>", "<InputDim>");
  ReadBasicType(is, binary, &input_dim_);
  ExpectToken(is, binary, "<OutputDim>");
  ReadBasicType(is, binary, &output_dim_);
  ExpectToken(is, binary, "<PoolSize>");
  ReadBasicType(is, binary, &pool_size_);
  ExpectToken(is, binary, "<PoolStride>");
  ReadBasicType(is, binary, &pool_stride_);
  ExpectToken(is, binary, "</MaxpoolingComponent>");
}

void MaxpoolingComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<MaxpoolingComponent>");
  WriteToken(os, binary, "<InputDim>");
  WriteBasicType(os, binary, input_dim_);
  WriteToken(os, binary, "<OutputDim>");
  WriteBasicType(os, binary, output_dim_);
  WriteToken(os, binary, "<PoolSize>");
  WriteBasicType(os, binary, pool_size_);
  WriteToken(os, binary, "<PoolStride>");
  WriteBasicType(os, binary, pool_stride_);
  WriteToken(os, binary, "</MaxpoolingComponent>");
}
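
// For reference (my illustration, values from the unit test): in text mode
// the tokens above serialize as
//   <MaxpoolingComponent> <InputDim> 192 <OutputDim> 64 <PoolSize> 3 <PoolStride> 16 </MaxpoolingComponent>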
std::string MaxpoolingComponent::Info() const {
  std::stringstream stream;
  stream << Type() << ", input-dim = " << input_dim_
         << ", output-dim = " << output_dim_
         << ", pool-size = " << pool_size_
         << ", pool-stride = " << pool_stride_;
  return stream.str();
}
} // namespace nnet2
} // namespace kaldi
@@ -448,6 +448,59 @@ class MaxoutComponent: public Component {
  int32 output_dim_;
};
/**
 * MaxpoolingComponent:
 * The input/output matrices are split into submatrices of width 'pool_stride_'.
 * The pooling is done over the 3rd axis of this set of 2-D matrices.
 * Our pooling does not support overlaps, which simplifies the
 * implementation (and was not helpful for Ossama).
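 *
 * For example (illustrative numbers only): with pool_stride_ = 16,
 * pool_size_ = 3 and input_dim_ = 192, the input is viewed as 12
 * submatrices of width 16, which are max-pooled in groups of 3 into
 * 4 output submatrices, giving output_dim_ = 64.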
*/
class MaxpoolingComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim,
            int32 pool_size, int32 pool_stride);
  explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
                               int32 pool_size, int32 pool_stride) {
    Init(input_dim, output_dim, pool_size, pool_stride);
  }
  MaxpoolingComponent(): input_dim_(0), output_dim_(0),
                         pool_size_(0), pool_stride_(0) { }
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  virtual void InitFromString(std::string args);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  using Component::Propagate; // to avoid name hiding
  virtual void Propagate(const ChunkInfo &in_info,
                         const ChunkInfo &out_info,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const ChunkInfo &in_info,
                        const ChunkInfo &out_info,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, // out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual Component* Copy() const {
    return new MaxpoolingComponent(input_dim_, output_dim_,
                                   pool_size_, pool_stride_);
  }
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;

 protected:
  int32 input_dim_;
  int32 output_dim_;
  int32 pool_size_;
  int32 pool_stride_;
};
class PnormComponent: public Component {
 public:
  void Init(int32 input_dim, int32 output_dim, BaseFloat p);
@@ -1613,6 +1666,36 @@ class AdditiveNoiseComponent: public RandomComponent {
  BaseFloat stddev_;
};
/**
 * ConvolutionComponent implements convolution over the frequency axis.
 * We assume the input features are spliced, i.e. each frame is in
 * fact a set of stacked frames, from which we can form patches that
 * span several frequency bands and the whole time axis. A patch is the
 * instance of a filter on a group of frequency bands and the whole time
 * axis. Shifts of the filter generate the patches.
 *
 * The convolution is done over the whole frequency axis with the same
 * filter coefficients, i.e. we don't use separate filters for different
 * 'regions' of the frequency axis.
 *
 * In order to have a fast implementation, the filters are
 * represented in vectorized form, where each rectangular filter
 * corresponds to a row in a matrix in which all the filters are
 * stored. The features are then re-shaped into a set of matrices, where
 * one matrix corresponds to a single patch-position, at which all the
 * filters get applied.
 *
 * The type of convolution is controlled by hyperparameters:
 *   patch_dim_    ... frequency-axis size of the patch
 *   patch_step_   ... size of the shift in the convolution
 *   patch_stride_ ... shift for the 2nd dim of a patch
 *                     (i.e. frame length before splicing)
 *
 * Because the convolution reuses the same weights repeatedly,
 * the final gradient is a sum of all the position-specific
 * gradients (the sum was found to work better than averaging).
*
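 * A worked example of the patch arithmetic (numbers are illustrative, not
 * from this diff): with patch_stride_ = 40 (40 frequency bands per frame),
 * patch_dim_ = 7 and patch_step_ = 1, each filter sees 7 consecutive bands
 * and is shifted one band at a time, so the code's formula gives
 * num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_
 *             = 1 + (40 - 7) / 1 = 34.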
*/
class ConvolutionComponent: public UpdatableComponent {
 public:
  ConvolutionComponent();
@@ -1636,7 +1719,7 @@ class ConvolutionComponent: public UpdatableComponent {
  std::string Info() const;
  void InitFromString(std::string args);
  std::string Type() const { return "ConvolutionComponent"; }
-  bool BackpropNeedsInput() const { return true; }
+  bool BackpropNeedsInput() const { return false; }
  bool BackpropNeedsOutput() const { return false; }
  using Component::Propagate; // to avoid name hiding
  void Propagate(const ChunkInfo &in_info,
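
// (Note, my reading of the diff: the flip to "return false" lines up with the
// Backprop() change above, which now sizes in_deriv from out_deriv.NumRows()
// and InputDim() rather than from in_value.)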