Commit 5526c218 authored by Daniel Povey's avatar Daniel Povey
Browse files

Merge pull request #47 from naxingyu/convolution-nnet2

add Convolution component in nnet2
parents c105c63b 372a1505
...@@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su ...@@ -7,3 +7,6 @@ exp/tri5a/decode/cer_13:%WER 49.67 [ 27891 / 56154, 2877 ins, 4538 del, 20476 su
exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ] exp/tri5a_mce/decode/cer_11:%WER 44.74 [ 25125 / 56154, 2112 ins, 4108 del, 18905 sub ]
exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ] exp/tri5a_mmi_b0.1/decode/cer_11:%WER 44.24 [ 24840 / 56154, 2060 ins, 4118 del, 18662 sub ]
exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ] exp/tri5a_mpe/decode/cer_12:%WER 44.96 [ 25247 / 56154, 2233 ins, 4174 del, 18840 sub ]
# ConvNet with 2 convolutional layers and 2 ReLU layers
exp/nnet2_convnet/decode/cer_10:%WER 40.73 [ 22873 / 56154, 2609 ins, 3712 del, 16552 sub ]
#!/bin/bash

# 2015  Xingyu Na
# This script runs on the full training set, using ConvNet setup on top of
# fbank features, on GPU. The ConvNet has four hidden layers, two convolutional
# layers and two affine transform layers with ReLU nonlinearity.
# Convolutional layer [1]:
#   convolution1d, input feature dim is 36, filter dim is 7, output dim is
#   30, 128 filters are used
#   maxpooling, 3-to-1 maxpooling, input dim is 30, output dim is 10
# Convolutional layer [2]:
#   convolution1d, input feature dim is 10, filter dim is 4, output dim is
#   7, 256 filters are used
# Affine transform layers [3-4]:
#   affine transform with ReLU nonlinearity.

temp_dir=
dir=exp/nnet2_convnet
stage=-5
train_original=data/train
train=data-fb/train
parallel_opts="--gpu 1" # This is suitable for the CLSP network, you'll
                        # likely have to change it.  (Declared before
                        # parse_options.sh so it can be overridden on the
                        # command line, like the other options above.)

. ./cmd.sh
. ./path.sh
. utils/parse_options.sh

# Make the FBANK features.
if [ $stage -le -5 ]; then
  # Dev set.
  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1;
  # BUGFIX: this previously removed $train/{cmvn,feats}.scp, i.e. the
  # (not-yet-created) *training* copies, leaving stale dev scp files in place.
  rm data-fb/dev/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
  # Training set.
  utils/copy_data_dir.sh $train_original $train || exit 1;
  rm $train/{cmvn,feats}.scp
  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
    $train $train/log $train/data || exit 1;
  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
fi

(
  # Train the ConvNet unless a final model already exists (lets the script be
  # re-run to do just the decoding).
  if [ ! -f $dir/final.mdl ]; then
    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
      --cmd "$decode_cmd" --stage $stage \
      --num-threads 1 --minibatch-size 512 \
      --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 15 --delta-order 2 \
      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
      --num-jobs-initial 3 --num-jobs-final 8 --splice-width 5 \
      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
      --num-filters2 256 --patch-dim2 4 \
      $train data/lang exp/tri5a_ali $dir || exit 1;
  fi

  # Decode the dev set with the trained model.
  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
    --config conf/decode.config \
    exp/tri5a/graph data-fb/dev \
    $dir/decode || exit 1;
)
...@@ -84,7 +84,12 @@ fi ...@@ -84,7 +84,12 @@ fi
splice_opts=`cat $srcdir/splice_opts 2>/dev/null` splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
case $feat_type in case $feat_type in
raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";; raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
if [ -f $srcdir/delta_order ]; then
delta_order=`cat $srcdir/delta_order 2>/dev/null`
feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
fi
;;
lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
;; ;;
*) echo "$0: invalid feature type $feat_type" && exit 1; *) echo "$0: invalid feature type $feat_type" && exit 1;
......
This diff is collapsed.
...@@ -62,6 +62,7 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include ...@@ -62,6 +62,7 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d); void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d); void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
...@@ -190,6 +191,7 @@ void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool inclu ...@@ -190,6 +191,7 @@ void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool inclu
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d); void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride); void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d); void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d); void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
......
...@@ -1259,6 +1259,25 @@ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reor ...@@ -1259,6 +1259,25 @@ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reor
} }
} }
// For each (row i, column j) of dst: if reorder[j] >= 0, adds element
// (i, reorder[j]) of src into dst(i, j); reorder[j] == -1 means "skip
// column j".  Expects a 2D launch covering (dst_dim.rows, dst_dim.cols).
template<typename Real>
__global__
static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Note: in this kernel, the x dimension corresponds to rows and the y to
  // columns, as it will be going forward.
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // row
  int j = blockIdx.y * blockDim.y + threadIdx.y;  // column
  if (i < dst_dim.rows && j < dst_dim.cols) {
    int index = reorder[j],
        dst_index = i * dst_dim.stride + j;
    if (index >= 0) {
      // Reuse 'index' rather than re-reading reorder[j]; the original issued
      // a second (uncached) global load for the same value.
      dst[dst_index] += src[i * src_stride + index];
    }
  }
}
template<typename Real> template<typename Real>
__global__ __global__
static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
...@@ -2024,6 +2043,10 @@ void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const Matri ...@@ -2024,6 +2043,10 @@ void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const Matri
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride); _copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
} }
// Host wrapper launching the _add_cols kernel, float instantiation.
// Gr/Bl give the 2D grid/block; see _add_cols for argument semantics.
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
_add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
_copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride); _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
} }
...@@ -2445,6 +2468,10 @@ void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const Mat ...@@ -2445,6 +2468,10 @@ void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const Mat
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride); _copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
} }
// Host wrapper launching the _add_cols kernel, double instantiation.
// Gr/Bl give the 2D grid/block; see _add_cols for argument semantics.
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
_add_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
_copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride); _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
} }
......
...@@ -92,6 +92,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, ...@@ -92,6 +92,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride); cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
} }
// Type-dispatch overload: lets templated CuMatrix code call cuda_add_cols
// uniformly; forwards to the float kernel wrapper.
inline void cuda_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
} }
...@@ -259,6 +262,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val ...@@ -259,6 +262,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride); cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
} }
// Type-dispatch overload: lets templated CuMatrix code call cuda_add_cols
// uniformly; forwards to the double kernel wrapper.
inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) { inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride); cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
} }
......
...@@ -509,6 +509,36 @@ static void UnitTestCuMatrixCopyCols() { ...@@ -509,6 +509,36 @@ static void UnitTestCuMatrixCopyCols() {
} }
template<typename Real>
static void UnitTestCuMatrixAddCols() {
for (MatrixIndexT p = 0; p < 2; p++) {
MatrixIndexT num_cols1 = 10 + Rand() % 10,
num_cols2 = 10 + Rand() % 10,
num_rows = 10 + Rand() % 10;
CuMatrix<Real> M(num_rows, num_cols1);
M.SetRandn();
CuMatrix<Real> N(num_rows, num_cols2), O(num_rows, num_cols2);
std::vector<int32> reorder(num_cols2);
for (int32 i = 0; i < num_cols2; i++)
reorder[i] = -1 + (Rand() % (num_cols1 + 1));
if (Rand() % 2 == 0) {
N.AddCols(M, reorder);
} else {
CuArray<int32> cuda_reorder(reorder);
N.AddCols(M, cuda_reorder);
}
for (int32 i = 0; i < num_rows; i++)
for (int32 j = 0; j < num_cols2; j++)
if (reorder[j] < 0) O(i, j) = 0;
else O(i, j) = M(i, reorder[j]);
AssertEqual(N, O);
}
}
template<typename Real> template<typename Real>
static void UnitTestCuMatrixApplyFloor() { static void UnitTestCuMatrixApplyFloor() {
...@@ -2093,6 +2123,7 @@ template<typename Real> void CudaMatrixUnitTest() { ...@@ -2093,6 +2123,7 @@ template<typename Real> void CudaMatrixUnitTest() {
UnitTestCuMatrixCopyFromTp<Real>(); UnitTestCuMatrixCopyFromTp<Real>();
UnitTestCuMatrixAddMatTp<Real>(); UnitTestCuMatrixAddMatTp<Real>();
UnitTestCuMatrixCopyCols<Real>(); UnitTestCuMatrixCopyCols<Real>();
UnitTestCuMatrixAddCols<Real>();
UnitTestCuMatrixSumColumnRanges<Real>(); UnitTestCuMatrixSumColumnRanges<Real>();
UnitTestCuMatrixCopyRows<Real>(); UnitTestCuMatrixCopyRows<Real>();
UnitTestCuMatrixCopyRowsFromVec<Real>(); UnitTestCuMatrixCopyRowsFromVec<Real>();
......
...@@ -1960,6 +1960,56 @@ void CuMatrixBase<Real>::CopyCols(const CuMatrixBase<Real> &src, ...@@ -1960,6 +1960,56 @@ void CuMatrixBase<Real>::CopyCols(const CuMatrixBase<Real> &src,
} }
} }
// Adds column reorder[c] of src into column c of *this; reorder[c] == -1
// leaves column c unchanged.  On the GPU path this copies the index vector
// to the device and launches the _add_cols kernel; the CPU fallback defers
// to MatrixBase::AddCols.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
KALDI_ASSERT(static_cast<MatrixIndexT>(reorder.size()) == NumCols());
KALDI_ASSERT(NumRows() == src.NumRows());
#ifdef KALDI_PARANOID
// Bounds-check the indices only in paranoid builds; -1 is the "skip" value.
MatrixIndexT src_cols = src.NumCols();
for (size_t i = 0; i < reorder.size(); i++)
KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_cols);
#endif
// Host-to-device copy of the index vector for the kernel.
CuArray<MatrixIndexT> cuda_reorder(reorder);
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
// This kernel, as it is newer has the (x,y) dims as (rows,cols).
dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), cuda_reorder.Data(), Dim(), src.Stride());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
Mat().AddCols(src.Mat(), reorder);
}
}
// Version of AddCols taking the index vector as a CuArray already resident
// on the device, so no host-to-device copy is needed on the GPU path.  The
// CPU fallback copies the indices back to a std::vector and defers to
// MatrixBase::AddCols.
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &reorder) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
KALDI_ASSERT(reorder.Dim() == NumCols());
KALDI_ASSERT(NumRows() == src.NumRows());
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
// This kernel, as it is newer has the (x,y) dims as (rows,cols).
dim3 dimGrid(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
cuda_add_cols(dimGrid, dimBlock, data_, src.Data(), reorder.Data(), Dim(), src.Stride());
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
// CPU path: the indices live on the device, so fetch them back first.
std::vector<MatrixIndexT> reorder_cpu;
reorder.CopyToVec(&reorder_cpu);
Mat().AddCols(src.Mat(), reorder_cpu);
}
}
template<typename Real> template<typename Real>
void CuMatrixBase<Real>::CopyRows(const CuMatrixBase<Real> &src, void CuMatrixBase<Real>::CopyRows(const CuMatrixBase<Real> &src,
......
...@@ -98,6 +98,18 @@ class CuMatrixBase { ...@@ -98,6 +98,18 @@ class CuMatrixBase {
void CopyCols(const CuMatrixBase<Real> &src, void CopyCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices); const CuArray<MatrixIndexT> &indices);
/// For each column c, adds column indices[c] of src to column c of *this.
/// As a special case, if indices[c] == -1, column c is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Version of AddCols that takes a CuArray argument (avoids the
/// host-to-device copy of the indices when they already live on the GPU).
void AddCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);
/// Copies row r from row indices[r] of src. /// Copies row r from row indices[r] of src.
/// As a special case, if indexes[i] <== -1, sets row i to zero /// As a special case, if indexes[i] <== -1, sets row i to zero
......
...@@ -2566,6 +2566,34 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src, ...@@ -2566,6 +2566,34 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
} }
} }
// CPU implementation of AddCols: for every row r, adds src(r, indices[c])
// into (*this)(r, c); a negative index means column c is left unchanged.
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices) {
KALDI_ASSERT(NumRows() == src.NumRows());
KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
// Cache dims/strides and walk raw row pointers to keep the inner loop tight.
MatrixIndexT num_rows = num_rows_, num_cols = num_cols_,
this_stride = stride_, src_stride = src.stride_;
Real *this_data = this->data_;
const Real *src_data = src.data_;
#ifdef KALDI_PARANOID
// Bounds-check the indices only in paranoid builds; -1 is the "skip" value.
MatrixIndexT src_cols = src.NumCols();
for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
iter != indices.end(); ++iter)
KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
#endif
// For the sake of memory locality we do this row by row, rather
// than doing it column-wise with one vector-add (axpy-style) per column.
for (MatrixIndexT r = 0; r < num_rows; r++, this_data += this_stride, src_data += src_stride) {
const MatrixIndexT *index_ptr = &(indices[0]);
for (MatrixIndexT c = 0; c < num_cols; c++, index_ptr++) {
if (*index_ptr >= 0)
this_data[c] += src_data[*index_ptr];
}
}
}
template<typename Real> template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src, void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices) { const std::vector<MatrixIndexT> &indices) {
......
...@@ -284,6 +284,14 @@ class MatrixBase { ...@@ -284,6 +284,14 @@ class MatrixBase {
void CopyRows(const MatrixBase<Real> &src, void CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices); const std::vector<MatrixIndexT> &indices);
/// For each column c, adds column indices[c] of src to column c of *this.
/// As a special case, if indices[c] == -1, column c is left unchanged.
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this->NumRows().
void AddCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Applies floor to all matrix elements /// Applies floor to all matrix elements
void ApplyFloor(Real floor_val); void ApplyFloor(Real floor_val);
......
...@@ -307,6 +307,31 @@ void UnitTestPnormComponent() { ...@@ -307,6 +307,31 @@ void UnitTestPnormComponent() {
} }
} }
// Tests MaxpoolingComponent over random non-overlapping pooling geometries,
// plus one fixed string-initialized configuration.  The generic helper checks
// that the gradients (input gradients and model gradients) are computed
// correctly.  (The previous comment about "an initializer from int, e.g.
// tanh, sigmoid" was copied from another test and did not apply here.)
void UnitTestMaxpoolingComponent() {
for (int32 i = 0; i < 5; i++) {
// Geometry: input_dim = pool_stride * num_pools * pool_size, and
// output_dim = num_pools * pool_stride (each group of pool_size
// activations collapses to its maximum).
int32 pool_stride = 5 + Rand() % 10,
pool_size = 2 + Rand() % 3,
num_pools = 1 + Rand() % 10;
int32 output_dim = num_pools * pool_stride;
int32 num_patches = num_pools * pool_size;
int32 input_dim = pool_stride * num_patches;
MaxpoolingComponent component(input_dim, output_dim,
pool_size, pool_stride);
UnitTestGenericComponentInternal(component);
}
{
// Fixed configuration via the string initializer (192 = 3 * 64 inputs
// pooled 3-to-1 with stride 16).
MaxpoolingComponent component;
component.InitFromString("input-dim=192 output-dim=64 pool-size=3 pool-stride=16");
UnitTestGenericComponentInternal(component);
}
}
void UnitTestAffineComponent() { void UnitTestAffineComponent() {
...@@ -337,6 +362,44 @@ void UnitTestAffineComponent() { ...@@ -337,6 +362,44 @@ void UnitTestAffineComponent() {
} }
} }
// Tests Convolutional1dComponent via the generic gradient checker, covering
// three initialization paths: direct Init with hyper-parameters, Init from a
// filter-parameter matrix written to disk, and InitFromString.
void UnitTestConvolutional1dComponent() {
BaseFloat learning_rate = 0.01,
param_stddev = 0.1, bias_stddev = 1.0;
// Geometry: num_patches positions of a patch_dim filter sliding with
// patch_step over patch_stride frequency bins, across num_splice splices.
int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
int32 input_dim = patch_stride * num_splice;
int32 filter_dim = patch_dim * num_splice;
int32 output_dim = num_patches * num_filters;
{
Convolutional1dComponent component;
if (Rand() % 2 == 0) {
// Path 1: random parameters drawn inside Init.
component.Init(learning_rate, input_dim, output_dim,
patch_dim, patch_step, patch_stride,
param_stddev, bias_stddev);
} else {
// Path 2: first set the hyper-parameters, then overwrite the
// parameters by re-initializing from a matrix written to disk
// (one row per filter; last column is the bias).
// initialize the hyper-parameters
component.Init(learning_rate, input_dim, output_dim,
patch_dim, patch_step, patch_stride,
param_stddev, bias_stddev);
Matrix<BaseFloat> mat(num_filters, filter_dim + 1);
mat.SetRandn();
mat.Scale(param_stddev);
WriteKaldiObject(mat, "tmpf", true);
// NOTE(review): presumably the sleep lets the write settle before the
// read-back; unclear whether it is actually required — confirm.
Sleep(0.5);
component.Init(learning_rate, "tmpf");
unlink("tmpf");
}
UnitTestGenericComponentInternal(component);
}
{
// Path 3: string-based initialization.
const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
Convolutional1dComponent component;
component.InitFromString(str);
UnitTestGenericComponentInternal(component);
}
}
void UnitTestDropoutComponent() { void UnitTestDropoutComponent() {
// We're testing that the gradients are computed correctly: // We're testing that the gradients are computed correctly:
// the input gradients and the model gradients. // the input gradients and the model gradients.
...@@ -812,6 +875,7 @@ int main() { ...@@ -812,6 +875,7 @@ int main() {
UnitTestSpliceComponent(); UnitTestSpliceComponent();
UnitTestMaxoutComponent(); UnitTestMaxoutComponent();
UnitTestPnormComponent(); UnitTestPnormComponent();
UnitTestMaxpoolingComponent();
UnitTestGenericComponent<NormalizeComponent>(); UnitTestGenericComponent<NormalizeComponent>();
UnitTestSigmoidComponent(); UnitTestSigmoidComponent();
UnitTestAffineComponent(); UnitTestAffineComponent();
...@@ -826,6 +890,7 @@ int main() { ...@@ -826,6 +890,7 @@ int main() {
UnitTestFixedBiasComponent(); UnitTestFixedBiasComponent();
UnitTestAffineComponentPreconditioned(); UnitTestAffineComponentPreconditioned();
UnitTestAffineComponentPreconditionedOnline(); UnitTestAffineComponentPreconditionedOnline();
UnitTestConvolutional1dComponent();
UnitTestDropoutComponent(); UnitTestDropoutComponent();
UnitTestAdditiveNoiseComponent(); UnitTestAdditiveNoiseComponent();
UnitTestParsing(); UnitTestParsing();
......
This diff is collapsed.
...@@ -448,6 +448,69 @@ class MaxoutComponent: public Component { ...@@ -448,6 +448,69 @@ class MaxoutComponent: public Component {
int32 output_dim_; int32 output_dim_;
}; };
/**
* MaxPoolingComponent :
* Maxpooling component was firstly used in ConvNet for selecting an representative
* activation in an area. It inspired Maxout nonlinearity.
*
* The input/output matrices are split to submatrices with width 'pool_stride_'.
* For instance, a minibatch of 512 frames is propagated by a convolutional
* layer, resulting in a 512 x 3840 input matrix for MaxpoolingComponent,
* which is composed of 128 feature maps for each frame (128 x 30). If you want
* a 3-to-1 maxpooling on each feature map, set 'pool_stride_' and 'pool_size_'
* as 128 and 3 respectively. Maxpooling component would create an output
* matrix of 512 x 1280. The 30 input neurons are grouped by a group size of 3, and
* the maximum in a group is selected, creating a smaller feature map of 10.
*
* Our pooling does not supports overlaps, which simplifies the
* implementation (and was not helpful for Ossama).
*/
class MaxpoolingComponent: public Component {
public:
void Init(int32 input_dim, int32 output_dim,
int32 pool_size, int32 pool_stride);
explicit MaxpoolingComponent(int32 input_dim, int32 output_dim,
int32 pool_size, int32 pool_stride) {
Init(input_dim, output_dim, pool_size, pool_stride);
}
MaxpoolingComponent(): input_dim_(0), output_dim_(0),
pool_size_(0), pool_stride_(0) { }
virtual std::string Type() const { return "MaxpoolingComponent"; }
virtual void InitFromString(std::string args);
virtual int32 InputDim() const { return input_dim_; }
virtual int32 OutputDim() const { return output_dim_; }
using Component::Propagate; // to avoid name hiding
virtual void Propagate(const ChunkInfo &in_info,
const ChunkInfo &out_info,
const CuMatrixBase<BaseFloat> &in,
CuMatrixBase<BaseFloat> *out) const;
virtual void Backprop(const ChunkInfo &in_info,
const ChunkInfo &out_info,
const CuMatrixBase<BaseFloat> &in_value,
const CuMatrixBase<BaseFloat> &, //out_value,
const CuMatrixBase<BaseFloat> &out_deriv,
Component *to_update, // may be identical to "this".
CuMatrix<BaseFloat> *in_deriv) const;
virtual bool BackpropNeedsInput() const { return true; }
virtual bool BackpropNeedsOutput() const { return true; }
virtual Component* Copy() const {