Commit c3f5a1da authored by naxingyu's avatar naxingyu

efficient convolutional Propagate and Backprop

parent 9d4b994f
......@@ -62,6 +62,7 @@ void cudaF_apply_pow_abs(dim3 Gr, dim3 Bl, float* mat, float power, bool include
void cudaF_apply_heaviside(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
void cudaF_apply_floor(dim3 Gr, dim3 Bl, float* mat, float floor_val, MatrixDim d);
void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d);
void cudaF_set_diag(int Gr, int Bl, float* mat, float value, MatrixDim d);
......@@ -190,6 +191,7 @@ void cudaD_apply_pow_abs(dim3 Gr, dim3 Bl, double* mat, double power, bool inclu
void cudaD_apply_heaviside(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
void cudaD_apply_floor(dim3 Gr, dim3 Bl, double* mat, double floor_val, MatrixDim d);
void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride);
void cudaD_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val, MatrixDim d);
void cudaD_set_diag(int Gr, int Bl, double* mat, double value, MatrixDim d);
......
......@@ -1259,6 +1259,25 @@ static void _copy_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reor
}
}
template<typename Real>
__global__
static void _add_cols(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // For each element of dst: dst(i, j) += src(i, reorder[j]).
  // A negative reorder[j] means "skip column j" (dst left unchanged there).
  // Note: in this kernel, the x dimension corresponds to rows and the y to
  // columns, as it will be going forward.
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // row index
  int j = blockIdx.y * blockDim.y + threadIdx.y;  // column index
  if (i < dst_dim.rows && j < dst_dim.cols) {
    int index = reorder[j],
        dst_index = i * dst_dim.stride + j;
    if (index >= 0) {
      // Reuse the cached "index" rather than re-reading reorder[j]; the
      // original code performed a second, redundant global-memory load here.
      int src_index = i * src_stride + index;
      dst[dst_index] += src[src_index];
    }
  }
}
template<typename Real>
__global__
static void _copy_rows(Real* dst, const Real *src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
......@@ -2024,6 +2043,10 @@ void cudaF_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const Matri
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaF_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Single-precision launcher for the _add_cols kernel; the template
  // argument is spelled out explicitly rather than deduced from the pointers.
  _add_cols<float><<<Gr, Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaF_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // C-callable launcher: runs the _copy_rows kernel template (float
  // instantiation) with the caller-supplied grid/block configuration.
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
......@@ -2445,6 +2468,10 @@ void cudaD_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const Mat
_copy_cols<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaD_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Double-precision launcher for the _add_cols kernel; the template
  // argument is spelled out explicitly rather than deduced from the pointers.
  _add_cols<double><<<Gr, Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
void cudaD_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // C-callable launcher: runs the _copy_rows kernel template (double
  // instantiation) with the caller-supplied grid/block configuration.
  _copy_rows<<<Gr,Bl>>>(dst, src, reorder, dst_dim, src_stride);
}
......
......@@ -92,6 +92,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the float case to cudaF_copy_cols.
  cudaF_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the float case to cudaF_add_cols.
  cudaF_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, float* dst, const float* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the float case to cudaF_copy_rows.
  cudaF_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
......@@ -259,6 +262,9 @@ inline void cuda_apply_ceiling(dim3 Gr, dim3 Bl, double* mat, double ceiling_val
inline void cuda_copy_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the double case to cudaD_copy_cols.
  cudaD_copy_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_add_cols(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the double case to cudaD_add_cols.
  cudaD_add_cols(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
inline void cuda_copy_rows(dim3 Gr, dim3 Bl, double* dst, const double* src, const MatrixIndexT_cuda* reorder, MatrixDim dst_dim, int src_stride) {
  // Precision-dispatch overload: forwards the double case to cudaD_copy_rows.
  cudaD_copy_rows(Gr, Bl, dst, src, reorder, dst_dim, src_stride);
}
......
......@@ -509,6 +509,36 @@ static void UnitTestCuMatrixCopyCols() {
}
template<typename Real>
static void UnitTestCuMatrixAddCols() {
  // Tests CuMatrixBase::AddCols(): dst(i, j) += src(i, reorder[j]), where a
  // negative reorder[j] means column j is left unchanged.
  for (MatrixIndexT p = 0; p < 2; p++) {
    MatrixIndexT num_cols1 = 10 + Rand() % 10,
        num_cols2 = 10 + Rand() % 10,
        num_rows = 10 + Rand() % 10;
    CuMatrix<Real> M(num_rows, num_cols1);
    M.SetRandn();
    CuMatrix<Real> N(num_rows, num_cols2), O(num_rows, num_cols2);
    // Start from a nonzero destination so the test actually exercises
    // accumulation: with N zero (as before), AddCols would be
    // indistinguishable from CopyCols.
    N.SetRandn();
    O.CopyFromMat(N);
    std::vector<int32> reorder(num_cols2);
    for (int32 i = 0; i < num_cols2; i++)
      reorder[i] = -1 + (Rand() % (num_cols1 + 1));  // in [-1, num_cols1-1]
    if (Rand() % 2 == 0) {  // exercise both the vector and CuArray overloads
      N.AddCols(M, reorder);
    } else {
      CuArray<int32> cuda_reorder(reorder);
      N.AddCols(M, cuda_reorder);
    }
    // Compute the expected result in O by accumulating element-wise.
    for (int32 i = 0; i < num_rows; i++)
      for (int32 j = 0; j < num_cols2; j++)
        if (reorder[j] >= 0)
          O(i, j) = Real(O(i, j)) + Real(M(i, reorder[j]));
    AssertEqual(N, O);
  }
}
template<typename Real>
static void UnitTestCuMatrixApplyFloor() {
......@@ -2093,6 +2123,7 @@ template<typename Real> void CudaMatrixUnitTest() {
UnitTestCuMatrixCopyFromTp<Real>();
UnitTestCuMatrixAddMatTp<Real>();
UnitTestCuMatrixCopyCols<Real>();
UnitTestCuMatrixAddCols<Real>();
UnitTestCuMatrixSumColumnRanges<Real>();
UnitTestCuMatrixCopyRows<Real>();
UnitTestCuMatrixCopyRowsFromVec<Real>();
......
......@@ -1960,6 +1960,56 @@ void CuMatrixBase<Real>::CopyCols(const CuMatrixBase<Real> &src,
}
}
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const std::vector<MatrixIndexT> &reorder) {
  // For each column c of *this, adds column reorder[c] of src into it;
  // a negative reorder[c] leaves column c untouched (the kernel skips it).
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(static_cast<MatrixIndexT>(reorder.size()) == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
#ifdef KALDI_PARANOID
    MatrixIndexT src_cols = src.NumCols();
    for (size_t i = 0; i < reorder.size(); i++)
      KALDI_ASSERT(reorder[i] >= -1 && reorder[i] < src_cols);
#endif
    CuArray<MatrixIndexT> reorder_gpu(reorder);  // push the index map to the device
    Timer timer;
    dim3 block_dim(CU2DBLOCK, CU2DBLOCK);
    // This kernel is newer-style: the x grid dimension covers rows and the
    // y dimension covers columns.
    dim3 grid_dim(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(grid_dim, block_dim, data_, src.Data(), reorder_gpu.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, timer.Elapsed());
  } else
#endif
  {
    // CPU fallback: delegate to the MatrixBase implementation.
    Mat().AddCols(src.Mat(), reorder);
  }
}
template<typename Real>
void CuMatrixBase<Real>::AddCols(const CuMatrixBase<Real> &src,
                                 const CuArray<MatrixIndexT> &reorder) {
  // CuArray variant of AddCols: the index map already lives on the device,
  // so the GPU path needs no host-to-device copy of "reorder".
#if HAVE_CUDA == 1
  if (CuDevice::Instantiate().Enabled()) {
    KALDI_ASSERT(reorder.Dim() == NumCols());
    KALDI_ASSERT(NumRows() == src.NumRows());
    Timer timer;
    dim3 block_dim(CU2DBLOCK, CU2DBLOCK);
    // This kernel is newer-style: the x grid dimension covers rows and the
    // y dimension covers columns.
    dim3 grid_dim(n_blocks(NumRows(), CU2DBLOCK), n_blocks(NumCols(), CU2DBLOCK));
    cuda_add_cols(grid_dim, block_dim, data_, src.Data(), reorder.Data(), Dim(), src.Stride());
    CU_SAFE_CALL(cudaGetLastError());
    CuDevice::Instantiate().AccuProfile(__func__, timer.Elapsed());
  } else
#endif
  {
    // CPU fallback: pull the indexes back to the host and delegate to the
    // MatrixBase implementation.
    std::vector<MatrixIndexT> reorder_cpu;
    reorder.CopyToVec(&reorder_cpu);
    Mat().AddCols(src.Mat(), reorder_cpu);
  }
}
template<typename Real>
void CuMatrixBase<Real>::CopyRows(const CuMatrixBase<Real> &src,
......
......@@ -99,6 +99,18 @@ class CuMatrixBase {
const CuArray<MatrixIndexT> &indices);
/// Add column indices[r] of src to column r.
/// As a special case, if indexes[i] == -1, skip column i
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows()
void AddCols(const CuMatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Version of AddCols that takes CuArray argument.
void AddCols(const CuMatrixBase<Real> &src,
const CuArray<MatrixIndexT> &indices);
/// Copies row r from row indices[r] of src.
/// As a special case, if indexes[i] == -1, sets row i to zero
/// "reorder".size() must equal this->NumRows(),
......
......@@ -2566,6 +2566,34 @@ void MatrixBase<Real>::CopyCols(const MatrixBase<Real> &src,
}
}
template<typename Real>
void MatrixBase<Real>::AddCols(const MatrixBase<Real> &src,
                               const std::vector<MatrixIndexT> &indices) {
  // For each column c of *this, adds column indices[c] of src into it:
  // (*this)(r, c) += src(r, indices[c]); indices[c] == -1 skips column c.
  KALDI_ASSERT(NumRows() == src.NumRows());
  KALDI_ASSERT(NumCols() == static_cast<MatrixIndexT>(indices.size()));
#ifdef KALDI_PARANOID
  MatrixIndexT src_cols = src.NumCols();
  for (std::vector<MatrixIndexT>::const_iterator iter = indices.begin();
       iter != indices.end(); ++iter)
    KALDI_ASSERT(*iter >= -1 && *iter < src_cols);
#endif
  // For the sake of memory locality we traverse row by row, rather than
  // doing it column-wise using cublas_Xcopy.
  for (MatrixIndexT r = 0; r < num_rows_; r++) {
    Real *dst_row = this->data_ + r * stride_;
    const Real *src_row = src.data_ + r * src.stride_;
    for (MatrixIndexT c = 0; c < num_cols_; c++) {
      MatrixIndexT idx = indices[c];
      if (idx >= 0)
        dst_row[c] += src_row[idx];
    }
  }
}
template<typename Real>
void MatrixBase<Real>::CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices) {
......
......@@ -284,6 +284,14 @@ class MatrixBase {
void CopyRows(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Add column indices[r] of src to column r.
/// As a special case, if indexes[i] == -1, skip column i
/// indices.size() must equal this->NumCols(),
/// all elements of "indices" must be in [-1, src.NumCols()-1],
/// and src.NumRows() must equal this.NumRows()
void AddCols(const MatrixBase<Real> &src,
const std::vector<MatrixIndexT> &indices);
/// Applies floor to all matrix elements
void ApplyFloor(Real floor_val);
......
......@@ -3872,39 +3872,33 @@ void Convolutional1dComponent::Propagate(const ChunkInfo &in_info,
int32 filter_dim = filter_params_.NumCols();
/** Buffer of reshaped inputs:
* 1row = vectorized rectangular feature patch,
* 1row = vectorized rectangular feature patches
* 1col = dim over speech frames,
* std::vector-dim = patch-position
*/
std::vector<CuMatrix<BaseFloat> > vectorized_feature_patches_;
// prepare the buffers
if (vectorized_feature_patches_.size() == 0) {
vectorized_feature_patches_.resize(num_patches);
}
patches_.Resize(num_frames, filter_dim * num_patches, kSetZero);
// column_map is indexed by the column-index of "patches",
// and the value is the corresponding column-index of "in".
std::vector<int32> column_map;
// vectorize the inputs
for (int32 p = 0; p < num_patches; p++) {
vectorized_feature_patches_[p].Resize(num_frames, filter_dim, kSetZero);
// build-up a column selection mask:
std::vector<int32> column_mask;
// build-up a column selection map:
for (int32 s = 0; s < num_splice; s++) {
for (int32 d = 0; d < patch_dim_; d++) {
column_mask.push_back(p * patch_step_ + s * patch_stride_ + d);
column_map.push_back(p * patch_step_ + s * patch_stride_ + d);
}
}
KALDI_ASSERT(column_mask.size() == filter_dim);
// select the columns
vectorized_feature_patches_[p].CopyCols(in, column_mask);
}
KALDI_ASSERT(column_map.size() == filter_dim * num_patches);
patches_.CopyCols(in, column_map);
// compute filter activations
for (int32 p = 0; p < num_patches; p++) {
CuSubMatrix<BaseFloat> tgt(out->ColRange(p * num_filters, num_filters));
CuSubMatrix<BaseFloat> patch(patches_.ColRange(p * filter_dim, filter_dim));
tgt.AddVecToRows(1.0, bias_params_, 0.0); // add bias
// apply all filters
tgt.AddMatMat(1.0, vectorized_feature_patches_[p], kNoTrans,
filter_params_, kTrans, 1.0);
tgt.AddMatMat(1.0, patch, kNoTrans, filter_params_, kTrans, 1.0);
}
}
......@@ -3940,34 +3934,37 @@ void Convolutional1dComponent::Backprop(const ChunkInfo &in_info,
int32 filter_dim = filter_params_.NumCols();
/** Buffer for backpropagation:
* derivatives in the domain of 'vectorized_feature_patches_',
* 1row = vectorized rectangular feature patch,
* derivatives in the domain of 'patches_',
* 1row = vectorized rectangular feature patches,
* 1col = dim over speech frames,
* std::vector-dim = patch-position
*/
std::vector<CuMatrix<BaseFloat> > feature_patch_diffs_;
feature_patch_diffs_.resize(num_patches);
CuMatrix<BaseFloat> patches_deriv;
patches_deriv.Resize(num_frames, filter_dim * num_patches, kSetZero);
// backpropagate to vector of matrices
// (corresponding to position of a filter)
for (int32 p = 0; p < num_patches; p++) {
feature_patch_diffs_[p].Resize(num_frames, filter_dim, kSetZero); // reset
CuSubMatrix<BaseFloat> patch_deriv(patches_deriv.ColRange(p * filter_dim, filter_dim));
CuSubMatrix<BaseFloat> out_deriv_patch(out_deriv.ColRange(p * num_filters,
num_filters));
feature_patch_diffs_[p].AddMatMat(1.0, out_deriv_patch, kNoTrans,
patch_deriv.AddMatMat(1.0, out_deriv_patch, kNoTrans,
filter_params_, kNoTrans, 0.0);
}
// sum the derivatives into in_deriv, we will compensate #summands
for (int32 p = 0; p < num_patches; p++) {
// rearranged_column_map is indexed by the column-index c of "in",
// and contains either some column-index d of "patches" such that
// column_map[d] = c, or -1.
std::vector<int32> rearranged_column_map(InputDim(), -1);
for (int32 s = 0; s < num_splice; s++) {
CuSubMatrix<BaseFloat> src(feature_patch_diffs_[p].ColRange(s * patch_dim_,
patch_dim_));
CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * patch_step_ + s * patch_stride_,
patch_dim_));
tgt.AddMat(1.0, src); // sum
for (int32 d = 0; d < patch_dim_; d++) {
rearranged_column_map[p * patch_step_ + s * patch_stride_ + d] =
p * filter_dim + s * patch_dim_ + d;
}
}
in_deriv->AddCols(patches_deriv, rearranged_column_map);
}
if (to_update != NULL) {
// Next update the model (must do this 2nd so the derivatives we propagate
......@@ -4090,32 +4087,22 @@ void Convolutional1dComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
CuMatrix<BaseFloat> filters_grad;
CuVector<BaseFloat> bias_grad;
/** Buffer of reshaped inputs:
* 1row = vectorized rectangular feature patch,
* 1col = dim over speech frames,
* std::vector-dim = patch-position
*/
std::vector<CuMatrix<BaseFloat> > vectorized_feature_patches_;
// prepare the buffers
if (vectorized_feature_patches_.size() == 0) {
vectorized_feature_patches_.resize(num_patches);
}
// vectorize the inputs
// prepare the buffers if necessary
// (necessary to pass the component test)
if (patches_.NumRows() == 0) {
std::vector<int32> column_map;
patches_.Resize(num_frames, filter_dim * num_patches, kSetZero);
for (int32 p = 0; p < num_patches; p++) {
vectorized_feature_patches_[p].Resize(num_frames, filter_dim, kSetZero);
// build-up a column selection mask:
std::vector<int32> column_mask;
for (int32 s = 0; s < num_splice; s++) {
for (int32 d = 0; d < patch_dim_; d++) {
column_mask.push_back(p * patch_step_ + s * patch_stride_ + d);
column_map.push_back(p * patch_step_ + s * patch_stride_ + d);
}
}
}
KALDI_ASSERT(column_mask.size() == filter_dim);
// select the columns
vectorized_feature_patches_[p].CopyCols(in_value, column_mask);
KALDI_ASSERT(column_map.size() == filter_dim * num_patches);
patches_.CopyCols(in_value, column_map);
}
KALDI_ASSERT(patches_.NumCols() == filter_dim * num_patches);
//
// calculate the gradient
......@@ -4126,8 +4113,8 @@ void Convolutional1dComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
for (int32 p = 0; p < num_patches; p++) { // sum
CuSubMatrix<BaseFloat> diff_patch(out_deriv.ColRange(p * num_filters,
num_filters));
filters_grad.AddMatMat(1.0, diff_patch, kTrans, vectorized_feature_patches_[p],
kNoTrans, 1.0);
CuSubMatrix<BaseFloat> patch(patches_.ColRange(p * filter_dim, filter_dim));
filters_grad.AddMatMat(1.0, diff_patch, kTrans, patch, kNoTrans, 1.0);
bias_grad.AddRowSumMat(1.0, diff_patch, 1.0);
}
......
......@@ -1783,6 +1783,7 @@ class Convolutional1dComponent: public UpdatableComponent {
const Convolutional1dComponent &operator = (const Convolutional1dComponent &other); // Disallow.
CuMatrix<BaseFloat> filter_params_;
CuVector<BaseFloat> bias_params_;
mutable CuMatrix<BaseFloat> patches_;
bool is_gradient_;
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment