Commit 9d4b994f authored by naxingyu's avatar naxingyu

add more doc

parent 885586f9
......@@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
# ConvNet with 2 convolutional layers and 2 ReLU layers
exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
#!/bin/bash
# 2015 Xingyu Na
# This runs on the full training set, using ConvNet setup with
# Sigmoid affine layers, on top of fbank features, on GPU.
# This script runs on the full training set, using ConvNet setup on top of
# fbank features, on GPU. The ConvNet has four hidden layers, two convolutional
# layers and two affine transform layers with ReLU nonlinearity.
# Convolutional layer [1]:
# convolution1d, input feature dim is 36, filter dim is 7, output dim is
# 30, 128 filters are used
# maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
# Convolutional layer [2]:
# convolution1d, input feature dim is 10, filter dim is 4, output dim is
# 7, 256 filters are used
# Affine transform layers [3-4]:
# affine transform with ReLU nonlinearity.
temp_dir=
dir=exp/nnet2_convnet
......@@ -16,7 +26,7 @@ train=data-fb/train
. utils/parse_options.sh
parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll
# likely have to change it.
# likely have to change it.
# Make the FBANK features
if [ $stage -le -5 ]; then
......@@ -40,7 +50,7 @@ fi
--mix-up 20000 --samples-per-iter 300000 \
--num-epochs 15 --delta-order 2 \
--initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
--num-jobs-initial 3 --num-jobs-final 8 --num-hidden-layers 4 --splice-width 5 \
--num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
--hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
--num-filters2 256 --patch-dim2 4 \
$train data/lang exp/tri5a_ali $dir || exit 1;
......
......@@ -4,10 +4,14 @@
# 2013 Xiaohui Zhang
# 2013 Guoguo Chen
# 2014 Vimal Manohar
# 2015 Xingyu Na
# Apache 2.0.
# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh
# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh. It prototypes
# the training of a ConvNet. The ConvNet is composed of 4 layers. The first layer
# is a Convolutional1d component plus a Maxpooling component. The second layer
# is a single Convolutional1d component. The third and fourth layers are affine
# components with ReLU nonlinearities. Due to non-squashing output, normalize
# component is applied to all four layers.
# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
# suffix is because they both use the "new" egs format, created by
......@@ -61,8 +65,7 @@ shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of
# affect each others' gradients.
add_layers_period=2 # by default, add new layers every 2 iterations.
num_hidden_layers=3
stage=-4
stage=-3
splice_width=4 # meaning +- 4 frames on each side for second LDA
left_context= # if set, overrides splice-width
......@@ -129,7 +132,6 @@ if [ $# != 4 ]; then
echo " --initial-effective-lrate <lrate|0.02> # effective learning rate at start of training,"
echo " # actual learning-rate is this time num-jobs."
echo " --final-effective-lrate <lrate|0.004> # effective learning rate at end of training."
echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs"
echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers"
echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer,"
echo " # per context-dependent state. Try a number several times #states."
......@@ -148,7 +150,6 @@ if [ $# != 4 ]; then
echo " # process."
echo " --splice-width <width|4> # Number of frames on each side to append for feature input"
echo " # (note: we splice processed, typically 40-dimensional frames"
echo " --lda-dim <dim|250> # Dimension to reduce spliced features to with LDA"
echo " --realign-epochs <list-of-epochs|\"\"> # A list of space-separated epoch indices the beginning of which"
echo " # realignment is to be done"
echo " --align-cmd (utils/run.pl|utils/queue.pl <queue opts>) # passed to align.sh"
......@@ -156,6 +157,15 @@ if [ $# != 4 ]; then
echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment"
echo " --stage <stage|-4> # Used to run a partially-completed training process from somewhere in"
echo " # the middle."
echo "ConvNet configurations"
echo " --num-filters1 <num-filters1|128> # number of filters in the first convolutional layer."
echo " --patch-step1 <patch-step1|1> # patch step of the first convolutional layer."
echo " --patch-dim1 <patch-dim1|7> # dim of convolutional kernel in the first layer."
echo " # (note: (feat-dim - patch-dim1) % patch-step1 should be 0.)"
echo " --pool-size <pool-size|3> # size of pooling after the first convolutional layer."
echo " # (note: (feat-dim - patch-dim1 + 1) % pool-size should be 0.)"
echo " --num-filters2 <num-filters2|256> # number of filters in the second convolutional layer."
echo " --patch-dim2 <patch-dim2|4> # dim of convolutional kernel in the second layer."
exit 1;
......@@ -266,7 +276,7 @@ if [ $stage -le -2 ]; then
stddev=`perl -e "print 1.0/sqrt($hidden_dim);"`
cat >$dir/nnet.config <<EOF
SpliceComponent input-dim=$delta_feat_dim left-context=$left_context right-context=$right_context
ConvolutionComponent input-dim=$tot_input_dim output-dim=$conv_out_dim1 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim1 patch-step=$patch_step1 patch-stride=$feat_dim
Convolutional1dComponent input-dim=$tot_input_dim output-dim=$conv_out_dim1 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim1 patch-step=$patch_step1 patch-stride=$feat_dim
MaxpoolingComponent input-dim=$conv_out_dim1 output-dim=$pool_out_dim pool-size=$pool_size pool-stride=$num_filters1
NormalizeComponent dim=$pool_out_dim
AffineComponentPreconditionedOnline input-dim=$pool_out_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
......@@ -274,7 +284,7 @@ SoftmaxComponent dim=$num_leaves
EOF
cat >$dir/replace.1.config <<EOF
ConvolutionComponent input-dim=$pool_out_dim output-dim=$conv_out_dim2 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim2 patch-step=$patch_step2 patch-stride=$patch_stride2
Convolutional1dComponent input-dim=$pool_out_dim output-dim=$conv_out_dim2 learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev patch-dim=$patch_dim2 patch-step=$patch_step2 patch-stride=$patch_stride2
NormalizeComponent dim=$conv_out_dim2
AffineComponentPreconditionedOnline input-dim=$conv_out_dim2 output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
......@@ -282,7 +292,8 @@ EOF
cat >$dir/replace.2.config <<EOF
AffineComponentPreconditionedOnline input-dim=$conv_out_dim2 output-dim=$hidden_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
SigmoidComponent dim=$hidden_dim
RectifiedLinearComponent dim=$hidden_dim
NormalizeComponent dim=$hidden_dim
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
......@@ -291,7 +302,8 @@ EOF
# single hidden layer; we need this to add new layers.
cat >$dir/replace.3.config <<EOF
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$hidden_dim $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=$stddev bias-stddev=$bias_stddev
SigmoidComponent dim=$hidden_dim
RectifiedLinearComponent dim=$hidden_dim
NormalizeComponent dim=$hidden_dim
AffineComponentPreconditionedOnline input-dim=$hidden_dim output-dim=$num_leaves $online_preconditioning_opts learning-rate=$initial_lrate param-stddev=0 bias-stddev=0
SoftmaxComponent dim=$num_leaves
EOF
......
......@@ -362,7 +362,7 @@ void UnitTestAffineComponent() {
}
}
void UnitTestConvolutionComponent() {
void UnitTestConvolutional1dComponent() {
BaseFloat learning_rate = 0.01,
param_stddev = 0.1, bias_stddev = 1.0;
int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
......@@ -372,7 +372,7 @@ void UnitTestConvolutionComponent() {
int32 filter_dim = patch_dim * num_splice;
int32 output_dim = num_patches * num_filters;
{
ConvolutionComponent component;
Convolutional1dComponent component;
if (Rand() % 2 == 0) {
component.Init(learning_rate, input_dim, output_dim,
patch_dim, patch_step, patch_stride,
......@@ -394,7 +394,7 @@ void UnitTestConvolutionComponent() {
}
{
const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
ConvolutionComponent component;
Convolutional1dComponent component;
component.InitFromString(str);
UnitTestGenericComponentInternal(component);
}
......@@ -890,7 +890,7 @@ int main() {
UnitTestFixedBiasComponent();
UnitTestAffineComponentPreconditioned();
UnitTestAffineComponentPreconditionedOnline();
UnitTestConvolutionComponent();
UnitTestConvolutional1dComponent();
UnitTestDropoutComponent();
UnitTestAdditiveNoiseComponent();
UnitTestParsing();
......
This diff is collapsed.
......@@ -450,8 +450,18 @@ class MaxoutComponent: public Component {
/**
* MaxPoolingComponent :
* Maxpooling component was first used in ConvNets for selecting a representative
* activation in an area. It inspired the Maxout nonlinearity.
*
* The input/output matrices are split to submatrices with width 'pool_stride_'.
* The pooling is done over 3rd axis, of the set of 2d matrices.
* For instance, a minibatch of 512 frames is propagated by a convolutional
* layer, resulting in a 512 x 3840 input matrix for MaxpoolingComponent,
* which is composed of 128 feature maps for each frame (128 x 30). If you want
* a 3-to-1 maxpooling on each feature map, set 'pool_stride_' and 'pool_size_'
* as 128 and 3 respectively. Maxpooling component would create an output
* matrix of 512 x 1280. The 30 input neurons are grouped by a group size of 3, and
* the maximum in a group is selected, creating a smaller feature map of 10.
*
* Our pooling does not support overlaps, which simplifies the
* implementation (and was not helpful for Ossama).
*/
......@@ -1667,7 +1677,7 @@ class AdditiveNoiseComponent: public RandomComponent {
};
/**
* ConvolutionComponent implements convolution over frequency axis.
* Convolutional1dComponent implements convolution over frequency axis.
* We assume the input features are spliced, i.e. each frame is in
* fact a set of stacked frames, where we can form patches which span
* over several frequency bands and whole time axis. A patch is the
......@@ -1676,7 +1686,10 @@ class AdditiveNoiseComponent: public RandomComponent {
*
* The convolution is done over whole axis with same filter
* coefficients, i.e. we don't use separate filters for different
* 'regions' of frequency axis.
* 'regions' of frequency axis. Due to convolution, the same weights are
* used repeatedly; the final gradient is a sum of all
* position-specific gradients (the sum was found better than
* averaging).
*
* In order to have a fast implementations, the filters are
* represented in vectorized form, where each rectangular filter
......@@ -1690,21 +1703,34 @@ class AdditiveNoiseComponent: public RandomComponent {
* patch_step_ ... size of shift in the convolution
* patch_stride_ ... shift for 2nd dim of a patch
* (i.e. frame length before splicing)
*
* Due to convolution the same weights are used repeatedly,
* the final gradient is a sum of all position-specific
* gradients (the sum was found better than averaging).
* For instance, for a convolutional component after raw input,
* if the input is 36-dim fbank feature with delta of order 2
* and spliced using +/- 5 frames of contexts, the convolutional
* component takes the input as a 36 x 33 image. The patch_stride_
* should be configured 36. If patch_step_ and patch_dim_ are
* configured 1 and 7, the Convolutional1dComponent creates a
* 2D filter of 7 x 33, such that the convolution is actually done
* only along the frequency axis. Specifically, the convolutional
* output along the frequency axis is (36 - 7) / 1 + 1 = 30, and
* the convolutional output along the temporal axis is 33 - 33 + 1 = 1,
* resulting in an output image of 30 x 1, which is called a feature map
* in ConvNet. Then if the output-dim is set to 3840, the constructor
* would know there should be 3840 / 30 = 128 distinct filters,
* which will create 128 feature maps of 30 x 1 for one frame of
* input. The feature maps are vectorized as a 3840-dim row vector
* in the output matrix of this component. For details on propagation
* of Convolutional1dComponent, check the function definition.
*
*/
class ConvolutionComponent: public UpdatableComponent {
class Convolutional1dComponent: public UpdatableComponent {
public:
ConvolutionComponent();
Convolutional1dComponent();
// constructor using another component
ConvolutionComponent(const ConvolutionComponent &component);
Convolutional1dComponent(const Convolutional1dComponent &component);
// constructor using parameters
ConvolutionComponent(const CuMatrixBase<BaseFloat> &filter_params,
const CuVectorBase<BaseFloat> &bias_params,
BaseFloat learning_rate);
Convolutional1dComponent(const CuMatrixBase<BaseFloat> &filter_params,
const CuVectorBase<BaseFloat> &bias_params,
BaseFloat learning_rate);
int32 InputDim() const;
int32 OutputDim() const;
......@@ -1718,7 +1744,7 @@ class ConvolutionComponent: public UpdatableComponent {
void Resize(int32 input_dim, int32 output_dim);
std::string Info() const;
void InitFromString(std::string args);
std::string Type() const { return "ConvolutionComponent"; }
std::string Type() const { return "Convolutional1dComponent"; }
bool BackpropNeedsInput() const { return false; }
bool BackpropNeedsOutput() const { return false; }
using Component::Propagate; // to avoid name hiding
......@@ -1754,7 +1780,7 @@ class ConvolutionComponent: public UpdatableComponent {
int32 patch_step_;
int32 patch_stride_;
const ConvolutionComponent &operator = (const ConvolutionComponent &other); // Disallow.
const Convolutional1dComponent &operator = (const Convolutional1dComponent &other); // Disallow.
CuMatrix<BaseFloat> filter_params_;
CuVector<BaseFloat> bias_params_;
bool is_gradient_;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment