Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Abdelwahab HEBA
kaldi_2015
Commits
885586f9
Commit
885586f9
authored
Aug 03, 2015
by
naxingyu
Browse files
add Maxpooling component and example script
parent
d773ab9e
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
968 additions
and
4 deletions
+968
-4
egs/hkust/s5/local/nnet2/run_convnet.sh
egs/hkust/s5/local/nnet2/run_convnet.sh
+53
-0
egs/wsj/s5/steps/nnet2/decode.sh
egs/wsj/s5/steps/nnet2/decode.sh
+6
-1
egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
+662
-0
src/nnet2/nnet-component-test.cc
src/nnet2/nnet-component-test.cc
+26
-0
src/nnet2/nnet-component.cc
src/nnet2/nnet-component.cc
+137
-2
src/nnet2/nnet-component.h
src/nnet2/nnet-component.h
+84
-1
No files found.
egs/hkust/s5/local/nnet2/run_convnet.sh
0 → 100755
View file @
885586f9
#!/bin/bash

# 2015 Xingyu Na
# This runs on the full training set, using ConvNet setup with
# Sigmoid affine layers, on top of fbank features, on GPU.

temp_dir=
dir=exp/nnet2_convnet
stage=-5
train_original=data/train
train=data-fb/train

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

parallel_opts="--gpu 1"  # This is suitable for the CLSP network, you'll
                         # likely have to change it.

# Make the FBANK features
if [ $stage -le -5 ]; then
  # Dev set
  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1;
  # BUGFIX: remove the stale scp files copied from data/dev (this used to
  # say "$train/{cmvn,feats}.scp", which doesn't exist yet at this point
  # and left data-fb/dev pointing at the original MFCC features).
  rm data-fb/dev/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;

  # Training set
  utils/copy_data_dir.sh $train_original $train || exit 1;
  rm $train/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    $train $train/log $train/data || exit 1;
  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
fi

(
  if [ ! -f $dir/final.mdl ]; then
    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
      --cmd "$decode_cmd" --stage $stage \
      --num-threads 1 --minibatch-size 512 \
      --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 15 --delta-order 2 \
      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
      --num-jobs-initial 3 --num-jobs-final 8 --num-hidden-layers 4 --splice-width 5 \
      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
      --num-filters2 256 --patch-dim2 4 \
      $train data/lang exp/tri5a_ali $dir || exit 1;
  fi

  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
    --config conf/decode.config \
    exp/tri5a/graph data-fb/dev \
    $dir/decode || exit 1;
)
egs/wsj/s5/steps/nnet2/decode.sh
View file @
885586f9
...
...
@@ -84,7 +84,12 @@ fi
splice_opts
=
`
cat
$srcdir
/splice_opts 2>/dev/null
`
case
$feat_type
in
raw
)
feats
=
"ark,s,cs:apply-cmvn
$cmvn_opts
--utt2spk=ark:
$sdata
/JOB/utt2spk scp:
$sdata
/JOB/cmvn.scp scp:
$sdata
/JOB/feats.scp ark:- |"
;;
raw
)
feats
=
"ark,s,cs:apply-cmvn
$cmvn_opts
--utt2spk=ark:
$sdata
/JOB/utt2spk scp:
$sdata
/JOB/cmvn.scp scp:
$sdata
/JOB/feats.scp ark:- |"
if
[
-f
$srcdir
/delta_order
]
;
then
delta_order
=
`
cat
$srcdir
/delta_order 2>/dev/null
`
feats
=
"
$feats
add-deltas --delta-order=
$delta_order
ark:- ark:- |"
fi
;;
lda
)
feats
=
"ark,s,cs:apply-cmvn
$cmvn_opts
--utt2spk=ark:
$sdata
/JOB/utt2spk scp:
$sdata
/JOB/cmvn.scp scp:
$sdata
/JOB/feats.scp ark:- | splice-feats
$splice_opts
ark:- ark:- | transform-feats
$srcdir
/final.mat ark:- ark:- |"
;;
*
)
echo
"
$0
: invalid feature type
$feat_type
"
&&
exit
1
;
...
...
egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
0 → 100755
View file @
885586f9
This diff is collapsed.
Click to expand it.
src/nnet2/nnet-component-test.cc
View file @
885586f9
...
...
@@ -307,6 +307,31 @@ void UnitTestPnormComponent() {
}
}
void
UnitTestMaxpoolingComponent
()
{
// works if it has an initializer from int,
// e.g. tanh, sigmoid.
// We're testing that the gradients are computed correctly:
// the input gradients and the model gradients.
for
(
int32
i
=
0
;
i
<
5
;
i
++
)
{
int32
pool_stride
=
5
+
Rand
()
%
10
,
pool_size
=
2
+
Rand
()
%
3
,
num_pools
=
1
+
Rand
()
%
10
;
int32
output_dim
=
num_pools
*
pool_stride
;
int32
num_patches
=
num_pools
*
pool_size
;
int32
input_dim
=
pool_stride
*
num_patches
;
MaxpoolingComponent
component
(
input_dim
,
output_dim
,
pool_size
,
pool_stride
);
UnitTestGenericComponentInternal
(
component
);
}
{
MaxpoolingComponent
component
;
component
.
InitFromString
(
"input-dim=192 output-dim=64 pool-size=3 pool-stride=16"
);
UnitTestGenericComponentInternal
(
component
);
}
}
void
UnitTestAffineComponent
()
{
...
...
@@ -850,6 +875,7 @@ int main() {
UnitTestSpliceComponent
();
UnitTestMaxoutComponent
();
UnitTestPnormComponent
();
UnitTestMaxpoolingComponent
();
UnitTestGenericComponent
<
NormalizeComponent
>
();
UnitTestSigmoidComponent
();
UnitTestAffineComponent
();
...
...
src/nnet2/nnet-component.cc
View file @
885586f9
...
...
@@ -104,6 +104,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
ans
=
new
AdditiveNoiseComponent
();
}
else
if
(
component_type
==
"ConvolutionComponent"
)
{
ans
=
new
ConvolutionComponent
();
}
else
if
(
component_type
==
"MaxpoolingComponent"
)
{
ans
=
new
MaxpoolingComponent
();
}
return
ans
;
}
...
...
@@ -3905,12 +3907,12 @@ void ConvolutionComponent::Backprop(const ChunkInfo &in_info,
const
CuMatrixBase
<
BaseFloat
>
&
out_deriv
,
Component
*
to_update_in
,
CuMatrix
<
BaseFloat
>
*
in_deriv
)
const
{
in_deriv
->
Resize
(
in_value
.
NumRows
(),
in_value
.
NumCols
(),
kSetZero
);
in_deriv
->
Resize
(
out_deriv
.
NumRows
(),
InputDim
()
);
ConvolutionComponent
*
to_update
=
dynamic_cast
<
ConvolutionComponent
*>
(
to_update_in
);
int32
num_splice
=
InputDim
()
/
patch_stride_
;
int32
num_patches
=
1
+
(
patch_stride_
-
patch_dim_
)
/
patch_step_
;
int32
num_filters
=
filter_params_
.
NumRows
();
int32
num_frames
=
in_value
.
NumRows
();
int32
num_frames
=
out_deriv
.
NumRows
();
int32
filter_dim
=
filter_params_
.
NumCols
();
/** Buffer for backpropagation:
...
...
@@ -4112,5 +4114,138 @@ void ConvolutionComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
bias_params_
.
AddVec
(
learning_rate_
,
bias_grad
);
}
// Configure the pooling geometry and check that it is self-consistent:
// the input splits evenly into patches of width pool_stride, the patches
// group evenly into pools of pool_size patches, and the output holds one
// patch-sized block per pool.
void MaxpoolingComponent::Init(int32 input_dim, int32 output_dim,
                               int32 pool_size, int32 pool_stride) {
  input_dim_ = input_dim;
  output_dim_ = output_dim;
  pool_size_ = pool_size;
  pool_stride_ = pool_stride;

  // Sanity checks on the geometry.
  KALDI_ASSERT(input_dim_ % pool_stride_ == 0);
  const int32 num_patches = input_dim_ / pool_stride_;
  KALDI_ASSERT(num_patches % pool_size_ == 0);
  const int32 num_pools = num_patches / pool_size_;
  KALDI_ASSERT(output_dim_ == num_pools * pool_stride_);
}
// Initialize from a config line, e.g.:
//   "input-dim=192 output-dim=64 pool-size=3 pool-stride=16"
// All four parameters are required; any unparsed text or a non-positive
// output-dim is an error.
void MaxpoolingComponent::InitFromString(std::string args) {
  std::string orig_args(args);  // keep the original for the error message.
  int32 input_dim = 0;
  int32 output_dim = 0;
  int32 pool_size = -1, pool_stride = -1;
  bool ok = true;

  ok = ok && ParseFromString("input-dim", &args, &input_dim);
  ok = ok && ParseFromString("output-dim", &args, &output_dim);
  ok = ok && ParseFromString("pool-size", &args, &pool_size);
  ok = ok && ParseFromString("pool-stride", &args, &pool_stride);

  // NOTE(review): two leftover debug KALDI_LOG statements (dumping the raw
  // parsed values) were removed here; they logged on every component
  // construction and carried no information not already in Info().
  if (!ok || !args.empty() || output_dim <= 0)
    KALDI_ERR << "Invalid initializer for layer of type "
              << Type() << ": \"" << orig_args << "\"";

  Init(input_dim, output_dim, pool_size, pool_stride);
}
// Forward pass of non-overlapping max-pooling.
// The input row is viewed as num_patches consecutive blocks of width
// pool_stride_; each group of pool_size_ consecutive blocks forms one
// pool, and the output is the element-wise max over the blocks of each
// pool (so output width = num_pools * pool_stride_).
void MaxpoolingComponent::Propagate(const ChunkInfo &in_info,
                                    const ChunkInfo &out_info,
                                    const CuMatrixBase<BaseFloat> &in,
                                    CuMatrixBase<BaseFloat> *out) const {
  in_info.CheckSize(in);
  out_info.CheckSize(*out);
  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());

  int32 num_patches = input_dim_ / pool_stride_;
  int32 num_pools = num_patches / pool_size_;

  // do the max-pooling
  for (int32 q = 0; q < num_pools; q++) {
    // get output buffer of the pool (a sub-matrix view, no copy)
    CuSubMatrix<BaseFloat> pool(out->ColRange(q * pool_stride_, pool_stride_));
    pool.Set(-1e20); // reset to a large negative value so any input wins the max
    for (int32 r = 0; r < pool_size_; r++) {
      // col-by-col block comparison: fold the p-th input block into the pool
      int32 p = r + q * pool_size_;
      pool.Max(in.ColRange(p * pool_stride_, pool_stride_));
    }
  }
}
// Backward pass of max-pooling: each output derivative is routed back to
// the input position(s) that achieved the max, using an equality mask
// between the input block and the pooled output block.
// patch_summands counts how many pools each input patch participated in,
// so the derivative can be averaged if patches were shared (with the
// current non-overlapping geometry each count is exactly 1, and the final
// scaling is a no-op — kept for safety / future overlap support).
void MaxpoolingComponent::Backprop(const ChunkInfo &, // in_info,
                                   const ChunkInfo &, // out_info,
                                   const CuMatrixBase<BaseFloat> &in_value,
                                   const CuMatrixBase<BaseFloat> &out_value,
                                   const CuMatrixBase<BaseFloat> &out_deriv,
                                   Component *to_update,
                                   CuMatrix<BaseFloat> *in_deriv) const {
  int32 num_patches = input_dim_ / pool_stride_;
  int32 num_pools = num_patches / pool_size_;
  std::vector<int32> patch_summands(num_patches, 0);
  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);

  for (int32 q = 0; q < num_pools; q++) {
    for (int32 r = 0; r < pool_size_; r++) {
      int32 p = r + q * pool_size_;  // index of the input patch in pool q
      CuSubMatrix<BaseFloat> in_p(in_value.ColRange(p * pool_stride_, pool_stride_));
      CuSubMatrix<BaseFloat> out_q(out_value.ColRange(q * pool_stride_, pool_stride_));
      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
      // src is a real copy (not a view) because it is modified below.
      CuMatrix<BaseFloat> src(out_deriv.ColRange(q * pool_stride_, pool_stride_));
      // zero-out mask: 1 where in_p equals the pooled max, 0 elsewhere
      CuMatrix<BaseFloat> mask;
      in_p.EqualElementMask(out_q, &mask);
      src.MulElements(mask);
      tgt.AddMat(1.0, src);
      // summed deriv info: this patch received one more contribution
      patch_summands[p] += 1;
    }
  }

  // scale in_deriv of overlaped pools
  for (int32 p = 0; p < num_patches; p++) {
    CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * pool_stride_, pool_stride_));
    KALDI_ASSERT(patch_summands[p] > 0);
    tgt.Scale(1.0 / patch_summands[p]);
  }
}
// Deserialize the component. The token order here defines the on-disk
// format and must stay in sync with Write() below.
void MaxpoolingComponent::Read(std::istream &is, bool binary) {
  ExpectOneOrTwoTokens(is, binary, "<MaxpoolingComponent>", "<InputDim>");
  ReadBasicType(is, binary, &input_dim_);
  ExpectToken(is, binary, "<OutputDim>");
  ReadBasicType(is, binary, &output_dim_);
  ExpectToken(is, binary, "<PoolSize>");
  ReadBasicType(is, binary, &pool_size_);
  ExpectToken(is, binary, "<PoolStride>");
  ReadBasicType(is, binary, &pool_stride_);
  ExpectToken(is, binary, "</MaxpoolingComponent>");
}
// Serialize the component; token order must mirror Read() above.
void MaxpoolingComponent::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<MaxpoolingComponent>");
  WriteToken(os, binary, "<InputDim>");
  WriteBasicType(os, binary, input_dim_);
  WriteToken(os, binary, "<OutputDim>");
  WriteBasicType(os, binary, output_dim_);
  WriteToken(os, binary, "<PoolSize>");
  WriteBasicType(os, binary, pool_size_);
  WriteToken(os, binary, "<PoolStride>");
  WriteBasicType(os, binary, pool_stride_);
  WriteToken(os, binary, "</MaxpoolingComponent>");
}
// Human-readable one-line summary of the component's configuration.
std::string MaxpoolingComponent::Info() const {
  std::ostringstream os;
  os << Type()
     << ", input-dim = " << input_dim_
     << ", output-dim = " << output_dim_
     << ", pool-size = " << pool_size_
     << ", pool-stride = " << pool_stride_;
  return os.str();
}
}
// namespace nnet2
}
// namespace kaldi
src/nnet2/nnet-component.h
View file @
885586f9
...
...
@@ -448,6 +448,59 @@ class MaxoutComponent: public Component {
int32
output_dim_
;
};
/**
 * MaxpoolingComponent :
 * The input/output matrices are split to submatrices with width 'pool_stride_'.
 * The pooling is done over 3rd axis, of the set of 2d matrices.
 * Our pooling does not support overlaps, which simplifies the
 * implementation (and was not helpful for Ossama).
 */
class MaxpoolingComponent: public Component {
 public:
  // Sets the dimensions and checks they form a consistent pooling
  // geometry (asserts on violation).
  void Init(int32 input_dim, int32 output_dim,
            int32 pool_size, int32 pool_stride);
  explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
                               int32 pool_size, int32 pool_stride) {
    Init(input_dim, output_dim, pool_size, pool_stride);
  }
  // Default constructor leaves the component un-configured; call Init(),
  // InitFromString() or Read() before use.
  MaxpoolingComponent(): input_dim_(0), output_dim_(0),
                         pool_size_(0), pool_stride_(0) { }
  virtual std::string Type() const { return "MaxpoolingComponent"; }
  // Accepts: input-dim, output-dim, pool-size, pool-stride (all required).
  virtual void InitFromString(std::string args);
  virtual int32 InputDim() const { return input_dim_; }
  virtual int32 OutputDim() const { return output_dim_; }
  using Component::Propagate; // to avoid name hiding
  virtual void Propagate(const ChunkInfo &in_info,
                         const ChunkInfo &out_info,
                         const CuMatrixBase<BaseFloat> &in,
                         CuMatrixBase<BaseFloat> *out) const;
  virtual void Backprop(const ChunkInfo &in_info,
                        const ChunkInfo &out_info,
                        const CuMatrixBase<BaseFloat> &in_value,
                        const CuMatrixBase<BaseFloat> &, //out_value,
                        const CuMatrixBase<BaseFloat> &out_deriv,
                        Component *to_update, // may be identical to "this".
                        CuMatrix<BaseFloat> *in_deriv) const;
  // Backprop needs the input (to locate the max elements) and the output
  // (to build the equality mask against the pooled values).
  virtual bool BackpropNeedsInput() const { return true; }
  virtual bool BackpropNeedsOutput() const { return true; }
  virtual Component* Copy() const {
    return new MaxpoolingComponent(input_dim_, output_dim_,
                                   pool_size_, pool_stride_);
  }
  virtual void Read(std::istream &is, bool binary); // This Read function
  // requires that the Component has the correct type.
  /// Write component to stream
  virtual void Write(std::ostream &os, bool binary) const;
  virtual std::string Info() const;
 protected:
  int32 input_dim_;    // total input feature dimension
  int32 output_dim_;   // total output dimension (num_pools * pool_stride_)
  int32 pool_size_;    // number of patches pooled together
  int32 pool_stride_;  // width of one patch / one output block
};
class
PnormComponent
:
public
Component
{
public:
void
Init
(
int32
input_dim
,
int32
output_dim
,
BaseFloat
p
);
...
...
@@ -1613,6 +1666,36 @@ class AdditiveNoiseComponent: public RandomComponent {
BaseFloat
stddev_
;
};
/**
* ConvolutionComponent implements convolution over frequency axis.
* We assume the input features are spliced, i.e. each frame is in
* fact a set of stacked frames, where we can form patches which span
* over several frequency bands and whole time axis. A patch is the
* instance of a filter on a group of frequency bands and whole time
* axis. Shifts of the filter generate patches.
*
* The convolution is done over whole axis with same filter
* coefficients, i.e. we don't use separate filters for different
* 'regions' of frequency axis.
*
* In order to have a fast implementations, the filters are
* represented in vectorized form, where each rectangular filter
* corresponds to a row in a matrix, where all the filters are
* stored. The features are then re-shaped to a set of matrices, where
* one matrix corresponds to single patch-position, where all the
* filters get applied.
*
* The type of convolution is controlled by hyperparameters:
* patch_dim_ ... frequency axis size of the patch
* patch_step_ ... size of shift in the convolution
* patch_stride_ ... shift for 2nd dim of a patch
* (i.e. frame length before splicing)
*
* Due to convolution, the same weights are used repeatedly, so
* the final gradient is a sum of all position-specific
* gradients (the sum was found better than averaging).
*
*/
class
ConvolutionComponent
:
public
UpdatableComponent
{
public:
ConvolutionComponent
();
...
...
@@ -1636,7 +1719,7 @@ class ConvolutionComponent: public UpdatableComponent {
std
::
string
Info
()
const
;
void
InitFromString
(
std
::
string
args
);
std
::
string
Type
()
const
{
return
"ConvolutionComponent"
;
}
bool
BackpropNeedsInput
()
const
{
return
tru
e
;
}
bool
BackpropNeedsInput
()
const
{
return
fals
e
;
}
bool
BackpropNeedsOutput
()
const
{
return
false
;
}
using
Component
::
Propagate
;
// to avoid name hiding
void
Propagate
(
const
ChunkInfo
&
in_info
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment