Commit dc20fb65 authored by Daniel Povey's avatar Daniel Povey
Browse files

Merge pull request #30 from naxingyu/maxout-gpu-merge

update Maxout GPU-CPU and test code
parents 60c25096 87337b60
...@@ -80,6 +80,7 @@ void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, Matrix ...@@ -80,6 +80,7 @@ void cudaF_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, Matrix
void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d); void cudaF_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d);
void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size); void cudaF_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power); void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power);
void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size);
void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d); void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *vec_div, MatrixDim d);
void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans); void cudaF_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans);
void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaF_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c);
...@@ -131,6 +132,7 @@ void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, Matrix ...@@ -131,6 +132,7 @@ void cudaF_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, Matrix
void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_log_softmax_reduce(size_t Gr, size_t Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power); void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power);
void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size);
void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride); void cudaF_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride);
void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride); void cudaF_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride);
...@@ -206,6 +208,7 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, Matr ...@@ -206,6 +208,7 @@ void cudaD_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, Matr
void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d); void cudaD_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d);
void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size); void cudaD_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power); void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power);
void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size);
void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d); void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *vec_div, MatrixDim d);
void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans); void cudaD_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans);
void cudaD_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c); void cudaD_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c);
...@@ -260,6 +263,7 @@ void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, Matr ...@@ -260,6 +263,7 @@ void cudaD_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, Matr
void cudaD_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_log_softmax_reduce(size_t Gr, size_t Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power); void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power);
void cudaD_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size);
void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride); void cudaD_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride);
void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride); void cudaD_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride);
......
...@@ -474,6 +474,24 @@ static void _calc_pnorm_deriv(Real *deriv, const Real *vec, const Real *norm, ...@@ -474,6 +474,24 @@ static void _calc_pnorm_deriv(Real *deriv, const Real *vec, const Real *norm,
} }
} }
/// deriv is the derivative we will output; vec is the input we're computing
/// the group max on, "maxv" is the previously computed group max.
template<typename Real>
__global__
static void _calc_group_max_deriv(Real *deriv, const Real *vec, const Real *maxv,
MatrixDim d, int src_stride, int group_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (j < d.rows && i < d.cols ) {
int dst_index = i + j * d.stride,
src_index = i / group_size + j * src_stride;
Real vec_element = vec[dst_index], // this is the element of the original vector.
max_element = maxv[src_index]; // this is the max value
Real ans = (max_element == vec_element ? 1.0 : 0.0);
deriv[dst_index] = ans;
}
}
/// Set each element to y = (x == orig ? changed : x). /// Set each element to y = (x == orig ? changed : x).
template<typename Real> template<typename Real>
__global__ __global__
...@@ -1521,6 +1539,27 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride, ...@@ -1521,6 +1539,27 @@ static void _group_pnorm(Real *y, const Real *x, MatrixDim d, int src_stride,
} }
} }
} }
template<typename Real>
__global__
static void _group_max(Real *y, const Real *x, MatrixDim d, int src_stride,
int group_size) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (j < d.rows && i < d.cols) {
int dst_index = i + j * d.stride;
int src_begin_index = i * group_size + j * src_stride;
Real max_value = -1e20;
int src_end_index = src_begin_index + group_size;
for (int src_index = src_begin_index; src_index < src_end_index;
src_index ++) {
if (!isnan(x[src_index]) && x[src_index] > max_value)
max_value = x[src_index];
}
y[dst_index] = max_value;
}
}
/* /*
* cu:: * cu::
*/ */
...@@ -2060,6 +2099,12 @@ void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, ...@@ -2060,6 +2099,12 @@ void cudaF_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1,
_calc_pnorm_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size, power); _calc_pnorm_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size, power);
} }
void cudaF_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1,
const float *x2, MatrixDim d, int src_stride,
int group_size) {
_calc_group_max_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size);
}
void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* vec_div, MatrixDim d) { void cudaF_div_rows_vec(dim3 Gr, dim3 Bl, float* mat, const float* vec_div, MatrixDim d) {
_div_rows_vec<<<Gr,Bl>>>(mat, vec_div, d); _div_rows_vec<<<Gr,Bl>>>(mat, vec_div, d);
} }
...@@ -2234,6 +2279,10 @@ void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, ...@@ -2234,6 +2279,10 @@ void cudaF_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d,
_group_pnorm<<<Gr,Bl>>>(y, x, d, src_stride, group_size, power); _group_pnorm<<<Gr,Bl>>>(y, x, d, src_stride, group_size, power);
} }
void cudaF_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) {
_group_max<<<Gr,Bl>>>(y, x, d, src_stride, group_size);
}
void cudaF_sigmoid (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) { void cudaF_sigmoid (dim3 Gr, dim3 Bl, float* y, const float* x, MatrixDim d, int src_stride) {
_sigmoid<<<Gr,Bl>>>(y, x, d, src_stride); _sigmoid<<<Gr,Bl>>>(y, x, d, src_stride);
} }
...@@ -2471,6 +2520,12 @@ void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1, ...@@ -2471,6 +2520,12 @@ void cudaD_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1,
_calc_pnorm_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size, power); _calc_pnorm_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size, power);
} }
void cudaD_calc_group_max_deriv(dim3 Gr, dim3 Bl, double*y, const double* x1,
const double* x2, MatrixDim d, int src_stride,
int group_size) {
_calc_group_max_deriv<<<Gr,Bl>>>(y, x1, x2, d, src_stride, group_size);
}
void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* vec_div, MatrixDim d) { void cudaD_div_rows_vec(dim3 Gr, dim3 Bl, double* mat, const double* vec_div, MatrixDim d) {
_div_rows_vec<<<Gr,Bl>>>(mat, vec_div, d); _div_rows_vec<<<Gr,Bl>>>(mat, vec_div, d);
} }
...@@ -2645,6 +2700,11 @@ void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d ...@@ -2645,6 +2700,11 @@ void cudaD_group_pnorm(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d
_group_pnorm<<<Gr,Bl>>>(y, x, d, src_stride, group_size, power); _group_pnorm<<<Gr,Bl>>>(y, x, d, src_stride, group_size, power);
} }
void cudaD_group_max(dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d,
int src_stride, int group_size) {
_group_max<<<Gr,Bl>>>(y, x, d, src_stride, group_size);
}
void cudaD_sigmoid (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) { void cudaD_sigmoid (dim3 Gr, dim3 Bl, double* y, const double* x, MatrixDim d, int src_stride) {
_sigmoid<<<Gr,Bl>>>(y, x, d, src_stride); _sigmoid<<<Gr,Bl>>>(y, x, d, src_stride);
} }
......
...@@ -116,6 +116,7 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, ...@@ -116,6 +116,7 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale,
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); } inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, float *mat, const float *scale, MatrixDim d) { cudaF_mul_rows_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); } inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size, float power) {cudaF_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, float *y, const float *x1, const float *x2, MatrixDim d, int src_stride, int group_size) {cudaF_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); }
inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, float alpha, const float *src, float *dst, MatrixDim d, int src_stride, int A_trans) { cudaF_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); }
inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaF_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const float *A, const float *B, const float *C, float *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaF_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); }
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, float alpha, const float *col, float beta, float *dst, MatrixDim d) { cudaF_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
...@@ -182,6 +183,7 @@ inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_dat ...@@ -182,6 +183,7 @@ inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_dat
*/ */
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_soft_hinge(Gr,Bl,y,x,d,src_stride); } inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_soft_hinge(Gr,Bl,y,x,d,src_stride); }
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);} inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size, float power) { cudaF_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power);}
inline void cuda_group_max(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride, int group_size) { cudaF_group_max(Gr, Bl, y, x, d, src_stride, group_size);}
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_sigmoid(Gr,Bl,y,x,d,src_stride); } inline void cuda_sigmoid(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_sigmoid(Gr,Bl,y,x,d,src_stride); }
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, float *eout, const float *e, const float *y, MatrixDim d, int e_stride, int y_stride) { cudaF_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); }
inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, float *y, const float *x, MatrixDim d, int src_stride) { cudaF_tanh(Gr,Bl,y,x,d,src_stride); }
...@@ -281,6 +283,7 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale ...@@ -281,6 +283,7 @@ inline void cuda_mul_cols_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale
inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); } inline void cuda_mul_rows_vec(dim3 Gr, dim3 Bl, double *mat, const double *scale, MatrixDim d) { cudaD_mul_rows_vec(Gr,Bl,mat,scale,d); }
inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); } inline void cuda_mul_rows_group_mat(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_mul_rows_group_mat(Gr, Bl, y, x, d, src_stride, group_size); }
inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); } inline void cuda_calc_pnorm_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size, double power) {cudaD_calc_pnorm_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size, power); }
inline void cuda_calc_group_max_deriv(dim3 Gr, dim3 Bl, double *y, const double *x1, const double *x2, MatrixDim d, int src_stride, int group_size) {cudaD_calc_group_max_deriv(Gr, Bl, y, x1, x2, d, src_stride, group_size); }
inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); } inline void cuda_add_mat(dim3 Gr, dim3 Bl, double alpha, const double *src, double *dst, MatrixDim d, int src_stride, int A_trans) { cudaD_add_mat(Gr,Bl,alpha,src,dst,d,src_stride, A_trans); }
inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaD_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); } inline void cuda_add_mat_mat_div_mat(dim3 Gr, dim3 Bl, const double *A, const double *B, const double *C, double *dst, MatrixDim d, int stride_a, int stride_b, int stride_c) { cudaD_add_mat_mat_div_mat(Gr,Bl,A,B,C,dst,d,stride_a,stride_b,stride_c); }
inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); } inline void cuda_add_vec_to_cols(dim3 Gr, dim3 Bl, double alpha, const double *col, double beta, double *dst, MatrixDim d) { cudaD_add_vec_to_cols(Gr,Bl,alpha,col,beta,dst,d); }
...@@ -343,6 +346,7 @@ inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_dat ...@@ -343,6 +346,7 @@ inline void cuda_block_add_mat_mat(dim3 Gr, dim3 Bl, CuBlockMatrixData *B_cu_dat
*/ */
inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); } inline void cuda_soft_hinge(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_soft_hinge(Gr,Bl,y,x,d,src_stride); }
inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); } inline void cuda_group_pnorm(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size, double power) { cudaD_group_pnorm(Gr, Bl, y, x, d, src_stride, group_size, power); }
inline void cuda_group_max(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride, int group_size) { cudaD_group_max(Gr, Bl, y, x, d, src_stride, group_size); }
inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); } inline void cuda_sigmoid(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_sigmoid(Gr,Bl,y,x,d,src_stride); }
inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); } inline void cuda_diff_sigmoid(dim3 Gr, dim3 Bl, double *eout, const double *e, const double *y, MatrixDim d, int e_stride, int y_stride) { cudaD_diff_sigmoid(Gr,Bl,eout,e,y,d,e_stride,y_stride); }
inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); } inline void cuda_tanh(dim3 Gr, dim3 Bl, double *y, const double *x, MatrixDim d, int src_stride) { cudaD_tanh(Gr,Bl,y,x,d,src_stride); }
......
...@@ -311,6 +311,39 @@ template<typename Real> void TestCuMatrixGroupPnormDeriv(int32 dim) { ...@@ -311,6 +311,39 @@ template<typename Real> void TestCuMatrixGroupPnormDeriv(int32 dim) {
<< dim << ", speed was " << gflops << " gigaflops."; << dim << ", speed was " << gflops << " gigaflops.";
} }
template<typename Real> void TestCuMatrixGroupMax(int32 dim) {
BaseFloat time_in_secs = 0.025;
int32 group_size = 4;
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
M.SetRandn();
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++)
N.GroupMax(M);
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuMatrix::GroupMax" << NameOf<Real>() << ", for dim = "
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixGroupMaxDeriv(int32 dim) {
BaseFloat time_in_secs = 0.025;
int32 group_size = 4;
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size), O(dim, dim);
M.SetRandn();
N.GroupMax(M);
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++)
O.GroupMaxDeriv(M, N);
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuMatrix::GroupMaxDeriv" << NameOf<Real>() << ", for dim = "
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) { template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
for (int32 n = 0; n < 2; n++) { for (int32 n = 0; n < 2; n++) {
...@@ -517,6 +550,10 @@ template<typename Real> void CudaMatrixSpeedTest() { ...@@ -517,6 +550,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
TestCuMatrixGroupPnorm<Real>(sizes[s]); TestCuMatrixGroupPnorm<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) for (int32 s = 0; s < ns; s++)
TestCuMatrixGroupPnormDeriv<Real>(sizes[s]); TestCuMatrixGroupPnormDeriv<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixGroupMax<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixGroupMaxDeriv<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) for (int32 s = 0; s < ns; s++)
TestCuMatrixTraceMatMat<Real>(sizes[s]); TestCuMatrixTraceMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) for (int32 s = 0; s < ns; s++)
......
...@@ -266,6 +266,27 @@ static void UnitTestCuMatrixGroupPnorm() { ...@@ -266,6 +266,27 @@ static void UnitTestCuMatrixGroupPnorm() {
} }
} }
template<typename Real>
static void UnitTestCuMatrixGroupMax() {
int32 M = 100 + Rand() % 200, N = 100 + Rand() % 200;
// M = 256; N = 256;
for (int32 K = 5; K < 7; K++) {
int32 N_src = N * K;
Matrix<Real> H_src(M, N_src);
H_src.SetRandn();
if (rand () % 2 == 0)
H_src.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to
// do derivatives.
Matrix<Real> H(M, N);
H.GroupMax(H_src);
CuMatrix<Real> D(H_src);
CuMatrix<Real> E(M, N);
E.GroupMax(D);
Matrix<Real> H2(E);
AssertEqual(H,H2);
}
}
template<typename Real> template<typename Real>
static void UnitTestCuMatrixSet() { static void UnitTestCuMatrixSet() {
for (int32 i = 0; i < 2; i++) { for (int32 i = 0; i < 2; i++) {
...@@ -686,6 +707,39 @@ static void UnitTestCuMatrixGroupPnormDeriv() { ...@@ -686,6 +707,39 @@ static void UnitTestCuMatrixGroupPnormDeriv() {
AssertEqual(Hr,Hr2); AssertEqual(Hr,Hr2);
} }
template<typename Real>
static void UnitTestCuMatrixGroupMaxDeriv() {
int32 dimM = 100 + Rand() % 200, dimNs = 100 + Rand() % 200;
int32 group_size = 1 + Rand() % 10;
// int32 dimM = 256, dimNs = 2;
// int32 group_size = 2;
int32 dimN = group_size * dimNs;
Matrix<Real> Hm(dimM, dimN);
Matrix<Real> Hr(dimM, dimN);
Matrix<Real> Hs(dimM, dimNs);
Hs.SetRandn();
if (rand () % 2 == 0)
Hm.ApplyFloor(0.0); // will put some zeros in the matrix.. harder to
// do derivatives.
Hs.GroupMax(Hm);
CuMatrix<Real> Dm(dimM, dimN);
CuMatrix<Real> Dr(dimM, dimN);
CuMatrix<Real> Ds(dimM, dimNs);
Dm.CopyFromMat(Hm);
Dr.CopyFromMat(Hr);
Ds.CopyFromMat(Hs);
// KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ;
Dr.GroupMaxDeriv(Dm, Ds);
Hr.GroupMaxDeriv(Hm, Hs);
// KALDI_LOG << "Hr " << Hr << " Dr " << Dr << "Ds" << Ds << " Hs " << Hs ;
Matrix<Real> Hr2(dimM, dimN);
Dr.CopyToMat(&Hr2);
AssertEqual(Hr,Hr2);
}
template<typename Real> static void UnitTestCuMatrixAddDiagVecMat() { template<typename Real> static void UnitTestCuMatrixAddDiagVecMat() {
for (int p = 0; p < 4; p++) { for (int p = 0; p < 4; p++) {
MatrixIndexT dimM = 100 + Rand() % 255, dimN = 100 + Rand() % 255; MatrixIndexT dimM = 100 + Rand() % 255, dimN = 100 + Rand() % 255;
...@@ -2072,6 +2126,8 @@ template<typename Real> void CudaMatrixUnitTest() { ...@@ -2072,6 +2126,8 @@ template<typename Real> void CudaMatrixUnitTest() {
UnitTestCuDiffSigmoid<Real>(); UnitTestCuDiffSigmoid<Real>();
UnitTestCuMatrixGroupPnorm<Real>(); UnitTestCuMatrixGroupPnorm<Real>();
UnitTestCuMatrixGroupPnormDeriv<Real>(); UnitTestCuMatrixGroupPnormDeriv<Real>();
UnitTestCuMatrixGroupMax<Real>();
UnitTestCuMatrixGroupMaxDeriv<Real>();
UnitTestCuMatrixMulRowsVec<Real>(); UnitTestCuMatrixMulRowsVec<Real>();
UnitTestCuMatrixMulRowsGroupMat<Real>(); UnitTestCuMatrixMulRowsGroupMat<Real>();
UnitTestCuFindRowMaxId<Real>(); UnitTestCuFindRowMaxId<Real>();
......
...@@ -759,6 +759,32 @@ void CuMatrixBase<Real>::GroupPnormDeriv(const CuMatrixBase<Real> &src1, ...@@ -759,6 +759,32 @@ void CuMatrixBase<Real>::GroupPnormDeriv(const CuMatrixBase<Real> &src1,
} }
} }
template<typename Real>
void CuMatrixBase<Real>::GroupMaxDeriv(const CuMatrixBase<Real> &src1,
const CuMatrixBase<Real> &src2) {
KALDI_ASSERT(src2.NumCols() > 0);
int group_size = this->NumCols() / src2.NumCols();
KALDI_ASSERT(this->NumCols() == src2.NumCols() * group_size);
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(NumCols(), CU2DBLOCK), n_blocks(NumRows(), CU2DBLOCK));
cuda_calc_group_max_deriv(dimGrid, dimBlock, this->data_,
src1.Data(), src2.Data(), Dim(),
src2.Stride(), group_size);
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
Mat().GroupMaxDeriv(src1.Mat(), src2.Mat());
}
}
template<typename Real> template<typename Real>
void CuMatrixBase<Real>::DivRowsVec(const CuVectorBase<Real> &div) { void CuMatrixBase<Real>::DivRowsVec(const CuVectorBase<Real> &div) {
#if HAVE_CUDA == 1 #if HAVE_CUDA == 1
...@@ -1137,6 +1163,27 @@ void CuMatrixBase<Real>::GroupPnorm(const CuMatrixBase<Real> &src, Real power) { ...@@ -1137,6 +1163,27 @@ void CuMatrixBase<Real>::GroupPnorm(const CuMatrixBase<Real> &src, Real power) {
} }
} }
template<typename Real>
void CuMatrixBase<Real>::GroupMax(const CuMatrixBase<Real> &src) {
int group_size = src.NumCols() / this->NumCols();
KALDI_ASSERT(src.NumCols() == this->NumCols() * group_size &&
this->NumRows() == src.NumRows());
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
dim3 dimGrid(n_blocks(src.NumCols(), CU2DBLOCK), n_blocks(src.NumRows(), CU2DBLOCK));
cuda_group_max(dimGrid, dimBlock, this->data_, src.data_,
this->Dim(), src.Stride(), group_size);
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
} else
#endif
{
Mat().GroupMax(src.Mat());
}
}
/* /*
Think of sv_labels as a Matrix, denoting the "correct" label of each frame to Think of sv_labels as a Matrix, denoting the "correct" label of each frame to
each phone-state; it's very likely to contain a LOT of zeros each phone-state; it's very likely to contain a LOT of zeros
......
...@@ -197,6 +197,22 @@ class CuMatrixBase { ...@@ -197,6 +197,22 @@ class CuMatrixBase {
/// "output-elem" is whichever element of output depends on that input element. /// "output-elem" is whichever element of output depends on that input element.
void GroupPnormDeriv(const CuMatrixBase<Real> &input, void GroupPnormDeriv(const CuMatrixBase<Real> &input,
const CuMatrixBase<Real> &output, Real power); const CuMatrixBase<Real> &output, Real power);
/// Apply the function y(i) = (max_{j = i*G}^{(i+1)*G-1} x_j
/// where G = x.NumCols() / y.NumCols() must be an integer.
/// [note: y corresponds to *this and x to src, so
/// src.NumCols() / this->NumCols() must be an integer.
void GroupMax(const CuMatrixBase<Real> &src);
/// Calculate derivatives for the GroupMax function above, where
/// "input" is the input to the GroupMax function above (i.e. the "src" variable),
/// and "output" is the result of the computation (i.e. the "this" of that function
/// call), and *this must have the same dimension as "input". Each element
/// of *this will be set to 1 if the corresponding input equals the output of
/// the group, and 0 otherwise. The equals the function derivative where it is
/// defined (it's not defined where multiple inputs in the group are equal to the output).
void GroupMaxDeriv(const CuMatrixBase<Real> &input,
const CuMatrixBase<Real> &output);
/// Compute the hyperbolic tangent (tanh) function; element by element, /// Compute the hyperbolic tangent (tanh) function; element by element,
/// *this = tanh(src). /// *this = tanh(src).
......
...@@ -1110,6 +1110,26 @@ void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input, ...@@ -1110,6 +1110,26 @@ void MatrixBase<Real>::GroupPnormDeriv(const MatrixBase<Real> &input,
} }
} }
template<typename Real>
void MatrixBase<Real>::GroupMaxDeriv(const MatrixBase<Real> &input,
const MatrixBase<Real> &output) {
KALDI_ASSERT(input.NumCols() == this->NumCols() &&
input.NumRows() == this->NumRows());
KALDI_ASSERT(this->NumCols() % output.NumCols() == 0 &&
this->NumRows() == output.NumRows());
int group_size = this->NumCols() / output.NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
for (MatrixIndexT i = 0; i < num_rows; i++) {
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real input_val = input(i, j);
Real output_val = output(i, j / group_size);
(*this)(i, j) = (input_val == output_val ? 1 : 0);
}
}
}
template<typename Real> // scales each column by scale[i]. template<typename Real> // scales each column by scale[i].
void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) { void MatrixBase<Real>::MulColsVec(const VectorBase<Real> &scale) {
KALDI_ASSERT(scale.Dim() == num_cols_); KALDI_ASSERT(scale.Dim() == num_cols_);
...@@ -2499,6 +2519,26 @@ void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) { ...@@ -2499,6 +2519,26 @@ void MatrixBase<Real>::GroupPnorm(const MatrixBase<Real> &src, Real power) {
(*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power); (*this)(i, j) = src.Row(i).Range(j * group_size, group_size).Norm(power);
} }
template<typename Real>
void MatrixBase<Real>::GroupMax(const MatrixBase<Real> &src) {
KALDI_ASSERT(src.NumCols() % this->NumCols() == 0 &&
src.NumRows() == this->NumRows());
int group_size = src.NumCols() / this->NumCols(),
num_rows = this->NumRows(), num_cols = this->NumCols();
for (MatrixIndexT i = 0; i < num_rows; i++) {
const Real *src_row_data = src.RowData(i);
for (MatrixIndexT j = 0; j < num_cols; j++) {
Real max_val = -1e20;
for (MatrixIndexT k = 0; k < group_size; k++) {
Real src_data = src_row_data[j * group_size + k];
if (src_data > max_val)
max_val = src_data;
}
(*this)(i, j) = max_val;
}