Commit b1dc7709 authored by Dan Povey

trunk: changes to Dan's neural net setup, with new preconditioning method...

trunk: changes to Dan's neural net setup, with new preconditioning method (speed roughly doubled if you use train_pnorm_online.sh, which uses the new preconditioning method).  Various bug-fixes, optimizations and cleanups in matrix code, cuda-matrix code and thread code.  Still tuning this, so recipes are not checked in yet.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4077 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent b03ef028
@@ -99,8 +99,6 @@ void cudaF_vec_min(const float* v, float* value, int dim);
void cudaF_vec_max(const float* v, float* value, int dim);
void cudaF_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value);
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim);
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
int N_col_stride, int threads_per_element, float beta);
@@ -229,8 +227,6 @@ void cudaD_vec_min(const double* v, double* value, int dim);
void cudaD_vec_max(const double* v, double* value, int dim);
void cudaD_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value);
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim);
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
int N_col_stride, int threads_per_element, double beta);
......
@@ -178,8 +178,8 @@ static void _add_diag_vec_mat(Real alpha, Real *mat, MatrixDim mat_dim,
// Note from Dan: in this kernel, we make the x dimension correspond to the
// row index and y to the column index. That was not always the case for
// earlier kernels written by others.
int i = blockIdx.x * blockDim.x + threadIdx.x; // row index
int j = blockIdx.y * blockDim.y + threadIdx.y; // column index
int i = blockIdx.y * blockDim.y + threadIdx.y; // row index
int j = blockIdx.x * blockDim.x + threadIdx.x; // column index
int index = i * mat_dim.stride + j,
index2 = i * mat2_row_stride + j * mat2_col_stride;
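The likely rationale for the swap (not spelled out in the patch, but standard CUDA practice): with row-major storage, element (i, j) lives at mat[i * mat_dim.stride + j], and threads within a warp have consecutive threadIdx.x, so mapping x to the column index makes a warp touch adjacent addresses:
// New mapping: consecutive threads in a warp -> consecutive j -> adjacent
// addresses within one row, i.e. a coalesced global-memory access.
int i = blockIdx.y * blockDim.y + threadIdx.y;  // row index
int j = blockIdx.x * blockDim.x + threadIdx.x;  // column index
Real val = mat[i * mat_dim.stride + j];         // contiguous across the warp
// Old mapping (x -> row index): consecutive threads hit different rows,
// stride elements apart, which cannot coalesce.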
@@ -809,47 +809,13 @@ static void _trace_mat_mat_trans(const Real* A, const Real* B, MatrixDim dA, int
}
template<typename Real>
__global__
static void _add_diag_mat(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < dim) {
Real sum = 0.0;
for (int32_cuda j = 0; j < dmat.cols; j++) {
int32_cuda index = j + i * dmat.stride;
sum += mat[index] * mat[index];
}
v[i] = beta * v[i] + alpha * sum;
}
}
template<typename Real>
__global__
static void _add_diag_mat_trans(Real alpha, Real* v, const Real* mat, Real beta, MatrixDim dmat, int dim) {
int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
// if (blockIdx.y > 0) return;
if (i < dim) {
Real sum = 0.0;
for (int32_cuda j = 0; j < dmat.rows; j++) {
int32_cuda index = i + j * dmat.stride;
sum += mat[index] * mat[index];
}
v[i] = beta * v[i] + alpha * sum;
}
}
// Adds diag(M N) to v, where M and N are matrices. We supply row_stride and
// col_stride arguments for M and N, and swapping them allows us to transpose
// those matrices. Note: we imagine row-major indexing here, just like Kaldi
// and CBLAS (but unlike CUBLAS).
// This kernel expects the blockDim to be (CU1DBLOCK, 1) and the
// gridDim times CU1DBLOCK to be at least num-rows-of-v, but if the gridDim
// times CU1DBLOCK is larger than that, it will make good use of the
// extra threads. Note: for best efficiency, the gridDim should be approximately
// (num-rows-of-v / CU1DBLOCK) times a power of 2.
// gridDim times CU1DBLOCK to be at least num-rows-of-v * threads_per_element.
// threads_per_element should be a power of 2.
template<typename Real>
__global__
static void _add_diag_mat_mat(
@@ -862,7 +828,7 @@ static void _add_diag_mat_mat(
__shared__ Real temp_data[CU1DBLOCK];
int i = blockIdx.x * blockDim.x + threadIdx.x;
int v_idx = i / threads_per_element, // v_ids is the index into v that we are supposed to
int v_idx = i / threads_per_element, // v_idx is the index into v that we are supposed to
sub_idx = i % threads_per_element; // add to; 0 <= sub_idx < threads_per_element tells
// us which block of elements we sum up.
if (v_idx >= v_dim) return;
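For reference, here is a minimal CPU sketch of what _add_diag_mat_mat computes, following the comment above (the function name is hypothetical, not part of the patch):
template<typename Real>
static void AddDiagMatMatRef(Real alpha, Real *v, int v_dim,
                             const Real *M, int M_cols,
                             int M_row_stride, int M_col_stride,
                             const Real *N, int N_row_stride,
                             int N_col_stride, Real beta) {
  // v[i] = beta * v[i] + alpha * diag(M N)[i], where
  // diag(M N)[i] = sum_j M(i, j) * N(j, i).  Passing swapped
  // (row_stride, col_stride) for M or N gives the transposed version
  // without any data movement.
  for (int i = 0; i < v_dim; i++) {
    Real sum = 0.0;
    for (int j = 0; j < M_cols; j++)
      sum += M[i * M_row_stride + j * M_col_stride] *
             N[j * N_row_stride + i * N_col_stride];
    v[i] = beta * v[i] + alpha * sum;
  }
}
On the GPU, each of the threads_per_element threads handles one slice of the j loop, and the partial sums are combined through the temp_data shared-memory buffer.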
@@ -2150,13 +2116,6 @@ void cudaF_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_str
_trace_mat_mat<float,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
}
void cudaF_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
}
void cudaF_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) {
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
}
void cudaF_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
@@ -2571,14 +2530,6 @@ void cudaD_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_s
_trace_mat_mat<double,2> <<<2,CU1DBLOCK>>>(A,B,dA,B_stride,value);
}
void cudaD_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
_add_diag_mat_trans<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
}
void cudaD_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) {
_add_diag_mat<<<Gr,Bl>>>(alpha,v,mat,beta,dmat,dim);
}
void cudaD_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
int N_col_stride, int threads_per_element, double beta) {
......
@@ -141,14 +141,12 @@ inline void cuda_vec_min(const float* v, float* value, int dim) { cudaF_vec_min(
inline void cuda_vec_max(const float* v, float* value, int dim) { cudaF_vec_max(v,value,dim); }
inline void cuda_trace_mat_mat_trans(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat_trans(A,B,dA,B_stride,value); }
inline void cuda_trace_mat_mat(const float* A, const float* B, MatrixDim dA, int B_stride, float* value) { cudaF_trace_mat_mat(A,B,dA,B_stride,value); }
inline void cuda_add_diag_mat_trans(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_diag_mat_mat(int Gr, int Bl, float alpha, float* v, int v_dim, const float* M,
int M_cols, int M_row_stride, int M_col_stride, const float *N, int N_row_stride,
int N_col_stride, int threads_per_element, float beta) {
cudaF_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
N_col_stride, threads_per_element, beta);
}
inline void cuda_add_diag_mat(int Gr, int Bl, float alpha, float* v, const float* mat, float beta, MatrixDim dmat, int dim) { cudaF_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_vec_vec(int Gr, int Bl, float alpha, float* v, const float* x, const float* y, float beta, int dim) { cudaF_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
inline void cuda_copy_col_from_mat(int Gr, int Bl, float* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const float* mat, MatrixDim dmat, int dim) { cudaF_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
@@ -311,14 +309,12 @@ inline void cuda_vec_min(const double* v, double* value, int dim) { cudaD_vec_mi
inline void cuda_vec_max(const double* v, double* value, int dim) { cudaD_vec_max(v,value,dim); }
inline void cuda_trace_mat_mat_trans(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat_trans(A,B,dA,B_stride,value); }
inline void cuda_trace_mat_mat(const double* A, const double* B, MatrixDim dA, int B_stride, double* value) { cudaD_trace_mat_mat(A,B,dA,B_stride,value); }
inline void cuda_add_diag_mat_trans(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat_trans(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_diag_mat_mat(int Gr, int Bl, double alpha, double* v, int v_dim, const double* M,
int M_cols, int M_row_stride, int M_col_stride, const double *N, int N_row_stride,
int N_col_stride, int threads_per_element, double beta) {
cudaD_add_diag_mat_mat(Gr, Bl, alpha, v, v_dim, M, M_cols, M_row_stride, M_col_stride, N, N_row_stride,
N_col_stride, threads_per_element, beta);
}
inline void cuda_add_diag_mat(int Gr, int Bl, double alpha, double* v, const double* mat, double beta, MatrixDim dmat, int dim) { cudaD_add_diag_mat(Gr,Bl,alpha,v,mat,beta,dmat,dim); }
inline void cuda_add_vec_vec(int Gr, int Bl, double alpha, double* v, const double* x, const double* y, double beta, int dim) { cudaD_add_vec_vec(Gr,Bl,alpha,v,x,y,beta,dim); }
inline void cuda_copy_col_from_mat(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat(Gr,Bl,v,col,mat,dmat,dim); }
inline void cuda_copy_col_from_mat_df(int Gr, int Bl, double* v, int col, const double* mat, MatrixDim dmat, int dim) { cudaD_copy_col_from_mat_df(Gr,Bl,v,col,mat,dmat,dim); }
......
@@ -57,6 +57,26 @@ template<typename Real> void TestCuMatrixMatMat(int32 dim) {
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixAddDiagVecMat(int32 dim, MatrixTransposeType trans) {
BaseFloat time_in_secs = 0.015;
CuMatrix<Real> M(dim, dim), N(dim, dim);
CuVector<Real> v(dim);
M.SetRandn();
v.SetRandn();
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++)
N.AddDiagVecMat(1.0, v, M, trans, 0.0);
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuMatrix::AddDiagVecMat" << NameOf<Real>()
<< (trans == kTrans ? "[trans]" : "[no-trans]")
<< ", for dim = " << dim << ", speed was "
<< gflops << " gigaflops.";
}
template<typename Real> void TestSymInvertPosDef(int32 dim) {
BaseFloat time_in_secs = 0.025;
@@ -222,7 +242,7 @@ template<typename Real> void TestCuMatrixMulRowsGroupMat(int32 dim) {
template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
BaseFloat time_in_secs = 0.025;
CuMatrix<Real> M(256, dim), N(256, dim);
CuMatrix<Real> M(dim, dim), N(dim, dim);
M.SetRandn();
N.SetRandn();
Timer tim;
@@ -237,6 +257,42 @@ template<typename Real> void TestCuMatrixSoftmax(int32 dim) {
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixGroupPnorm(int32 dim) {
BaseFloat time_in_secs = 0.025;
int32 group_size = 4;
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size);
M.SetRandn();
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++)
N.GroupPnorm(M, 2.0);
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuMatrix::GroupPnorm" << NameOf<Real>() << ", for dim = "
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixGroupPnormDeriv(int32 dim) {
BaseFloat time_in_secs = 0.025;
int32 group_size = 4;
CuMatrix<Real> M(dim, dim), N(dim, dim / group_size), O(dim, dim);
M.SetRandn();
N.GroupPnorm(M, 2.0);
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++)
O.GroupPnormDeriv(M, N, 2.0);
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuMatrix::GroupPnormDeriv" << NameOf<Real>() << ", for dim = "
<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuMatrixTraceMatMat(int32 dim) {
for (int32 n = 0; n < 2; n++) {
MatrixTransposeType trans = (n == 0 ? kNoTrans : kTrans);
@@ -388,6 +444,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
int32 ns = sizes.size();
for (int32 s = 0; s < ns; s++)
TestCuMatrixMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) {
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kNoTrans);
TestCuMatrixAddDiagVecMat<Real>(sizes[s], kTrans);
}
for (int32 s = 0; s < ns; s++)
TestSymInvertPosDef<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
@@ -402,6 +462,10 @@ template<typename Real> void CudaMatrixSpeedTest() {
TestCuMatrixMulRowsGroupMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixSoftmax<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixGroupPnorm<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixGroupPnormDeriv<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuMatrixTraceMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
......
@@ -988,13 +988,13 @@ void CuMatrixBase<Real>::AddDiagVecMat(
Timer tim;
dim3 dimBlock(CU2DBLOCK, CU2DBLOCK);
// Caution, this dimGrid is not the same way around as much of the other
// code: going forward, I want to use the (rows, cols) order.
dim3 dimGrid(n_blocks(num_rows_, CU2DBLOCK), n_blocks(num_cols_, CU2DBLOCK));
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
dim3 dimGrid(n_blocks(num_cols_, CU2DBLOCK),
n_blocks(num_rows_, CU2DBLOCK));
MatrixIndexT M_row_stride = M.Stride(), M_col_stride = 1;
if (transM == kTrans)
std::swap(M_row_stride, M_col_stride);
cuda_add_diag_vec_mat(dimGrid, dimBlock, alpha, data_, Dim(),
v.Data(), M.Data(), M_row_stride, M_col_stride, beta);
CU_SAFE_CALL(cudaGetLastError());
......
@@ -119,6 +119,8 @@ class CuMatrixBase {
const CuMatrixBase<Real> &B,
MatrixTransposeType trans);
/// Adds "value" to the diagonal elements of the matrix. The matrix
/// *this does not have to be square.
void AddToDiag(Real value);
/// Dimensions
@@ -183,6 +185,8 @@ class CuMatrixBase {
/// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(pow))^(1 / pow),
/// where G = x.NumCols() / y.NumCols() must be an integer.
/// [Note: y corresponds to *this and x to src, so
/// src.NumCols() / this->NumCols() must be an integer.]
void GroupPnorm(const CuMatrixBase<Real> &src, Real pow);
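A scalar sketch of the formula above, for a single row (the helper name is hypothetical; the std::abs follows the usual p-norm convention, whereas the comment writes x_j^(pow) directly):
// Needs <cmath>.  G = x_cols / y_cols consecutive entries of x feed each y[i].
static void GroupPnormRow(const float *x, int x_cols,
                          float *y, int y_cols, float power) {
  int G = x_cols / y_cols;  // must divide exactly
  for (int i = 0; i < y_cols; i++) {
    float sum = 0.0f;
    for (int j = i * G; j < (i + 1) * G; j++)
      sum += std::pow(std::abs(x[j]), power);
    y[i] = std::pow(sum, 1.0f / power);
  }
}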
/// Calculate derivatives for the GroupPnorm function above...
......
@@ -41,7 +41,7 @@ std::string NameOf() {
template<typename Real>
static void UnitTestCuSpMatrixInvert(int32 dim) {
BaseFloat time_in_secs = 0.5;
BaseFloat time_in_secs = 0.01;
int32 iter = 0;
Timer tim;
CuSpMatrix<Real> A(dim);
@@ -82,7 +82,7 @@ static void UnitTestCuSpMatrixInvert(int32 dim) {
template<typename Real>
static void UnitTestCuSpMatrixCopyFromMat(int32 dim, SpCopyType copy_type) {
BaseFloat time_in_secs = 0.1;
BaseFloat time_in_secs = 0.05;
int32 iter = 0;
Timer tim;
CuMatrix<Real> A(dim, dim);
......
@@ -96,23 +96,50 @@ template<typename Real> void TestCuVectorVecVecOne(int32 dim) {
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim,
MatrixTransposeType transN,
MatrixTransposeType transO) {
BaseFloat time_in_secs = 0.05;
CuVector<Real> v(dim);
v.SetRandn();
CuMatrix<Real> N(dim, dim), O(dim, dim);
N.SetRandn(); O.SetRandn();
N.SetRandn();
O.SetRandn();
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++) {
v.AddDiagMatMat(1.0, N, kNoTrans, O, kNoTrans, 1.0);
v.AddDiagMatMat(1.0, N, transN, O, transO, 1.0);
}
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>() << ", for dim = "
KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf<Real>()
<< (transN == kNoTrans ? "[no-trans],":"[trans],")
<< (transO == kNoTrans ? "[no-trans],":"[trans],")
<< " for dim = "<< dim << ", speed was " << gflops << " gigaflops.";
}
template<typename Real> void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) {
BaseFloat time_in_secs = 0.05;
CuVector<Real> v(dim);
v.SetRandn();
CuMatrix<Real> N(dim, dim);
N.SetRandn();
Timer tim;
int32 iter = 0;
for (;tim.Elapsed() < time_in_secs; iter++) {
v.AddDiagMat2(1.0, N, trans, 0.0);
}
BaseFloat fdim = dim;
BaseFloat gflops = (fdim * fdim * iter) / (tim.Elapsed() * 1.0e+09);
KALDI_LOG << "For CuVector::AddDiagMat2" << NameOf<Real>()
<< (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = "
<< dim << ", speed was " << gflops << " gigaflops.";
}
@@ -121,25 +148,27 @@ template<typename Real> void TestCuVectorAddDiagMatMat(int32 dim) {
template<typename Real> void CudaVectorSpeedTest() {
std::vector<int32> sizes;
sizes.push_back(16);
sizes.push_back(32);
sizes.push_back(64);
sizes.push_back(128);
sizes.push_back(256);
sizes.push_back(1024);
int32 ns = sizes.size();
for (int32 s = 0; s < ns; s++)
TestCuVectorSoftmax<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuVectorSum<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++)
TestCuVectorVecVecOne<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) {
TestCuVectorSoftmax<Real>(sizes[s]);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorSum<Real>(sizes[s]);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorVecVecOne<Real>(sizes[s]);
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kNoTrans);
TestCuVectorAddDiagMatMat<Real>(sizes[s], kNoTrans, kTrans);
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kNoTrans);
TestCuVectorAddDiagMatMat<Real>(sizes[s], kTrans, kTrans);
}
for (int32 s = 0; s < ns; s++) {
TestCuVectorAddDiagMatMat<Real>(sizes[s]);
for (int32 s = 0; s < ns; s++) {
TestCuVectorAddDiagMat2<Real>(sizes[s], kNoTrans);
TestCuVectorAddDiagMat2<Real>(sizes[s], kTrans);
}
}
......
@@ -472,22 +472,14 @@ void CuVectorBase<Real>::AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
MatrixTransposeType trans, Real beta) {
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
int dimBlock(CU1DBLOCK);
int dimGrid(n_blocks(dim_,CU2DBLOCK));
if (trans == kNoTrans) {
cuda_add_diag_mat(dimGrid, dimBlock, alpha, data_, M.Data(), beta, M.Dim(), dim_);
} else {
cuda_add_diag_mat_trans(dimGrid, dimBlock, alpha, data_, M.Data(), beta, M.Dim(), dim_);
}
CU_SAFE_CALL(cudaGetLastError());
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans);
this->AddDiagMatMat(alpha, M, trans,
M, other_trans, beta);
} else
#endif
{
Vec().AddDiagMat2(alpha, M.Mat(), trans, beta);
}
}
}
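Why this rewrite is equivalent: diag(M M^T)[i] = sum_j M(i, j) * M^T(j, i) = sum_j M(i, j)^2, which is exactly the per-row sum of squares that the deleted _add_diag_mat kernel computed, and diag(M^T M)[i] gives the per-column sums of squares for the kTrans case. Schematically:
// AddDiagMat2(alpha, M, kNoTrans, beta) == AddDiagMatMat(alpha, M, kNoTrans, M, kTrans,   beta)
// AddDiagMat2(alpha, M, kTrans,   beta) == AddDiagMatMat(alpha, M, kTrans,   M, kNoTrans, beta)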
template<typename Real>
@@ -507,16 +499,22 @@ void CuVectorBase<Real>::AddDiagMatMat(
if (transM == kTrans) std::swap(M_row_stride, M_col_stride);
MatrixIndexT N_row_stride = N.Stride(), N_col_stride = 1;
if (transN == kTrans) std::swap(N_row_stride, N_col_stride);
// This kernel can take a variable grid dimension, it makes use
// of the extra threads by partitioning each vector-vector dot
// product into multiple pieces.
int dimBlock(CU1DBLOCK);
int dimGrid(n_blocks(dim,CU1DBLOCK));
int threads_per_element = 1;
while (M_col_dim > 10 * threads_per_element && dimGrid < 32 && threads_per_element < 256) {
// dimGridLimit may be any power of two between 1 and 256 inclusive; it was
// determined empirically based on speed tests.
int dimGridLimit = (transM == kNoTrans && transN == kTrans ? 64 :
(transM == kTrans && transN == kNoTrans ? 16 : 32));
while (M_col_dim > 10 * threads_per_element &&
dimGrid < dimGridLimit && threads_per_element < 256) {
threads_per_element *= 2;
dimGrid = n_blocks(dim * threads_per_element, CU1DBLOCK);
}
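A worked trace of this sizing loop (a sketch assuming CU1DBLOCK == 256): take dim == 512, M_col_dim == 512, transM == kNoTrans, transN == kTrans, so dimGridLimit == 64:
// start:  threads_per_element = 1,  dimGrid = n_blocks(512, 256)      = 2
// pass 1: threads_per_element = 2,  dimGrid = n_blocks(512 * 2, 256)  = 4
//   ... keeps doubling while M_col_dim > 10 * threads_per_element,
//       dimGrid < dimGridLimit, and threads_per_element < 256 ...
// pass 5: threads_per_element = 32, dimGrid = n_blocks(512 * 32, 256) = 64
// The loop then stops (dimGrid == dimGridLimit): each length-512 dot product
// is split among 32 threads (16 terms each), launched as 64 blocks of 256.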
......
@@ -34,3 +34,17 @@ ssh danielpovey,kaldi@shell.sourceforge.net create
echo 'cd /home/project-web/kaldi/htdocs/; rm -rf html;
tar -xzf html.tar.gz; for x in html/*; do mv $x .; done ' \
| ssh danielpovey,kaldi@shell.sourceforge.net bash
# You could uncomment and run the lines below as an example of how to figure out
# the number of posts to the Kaldi forums on Sourceforge, per month.
#curl 'http://sourceforge.net/p/kaldi/discussion/stats_data?forum=&begin=2011-04-14&end=2014-06-13' > foo
#cat foo | perl -ane ' s/.*://; @A = split("]");
# foreach $a(@A){ $a =~ s/[,\[]//g; print "$a\n"; }' | \
# perl -e 'while(<>) { @A = split; if (@A == 2) { ($date, $count) = @A; $date /= 1000;
# @date_array = gmtime $date; $month = $date_array[4]; $year = 1900 + $date_array[5]; $count{$year. " " .sprintf("%02d", $month+1)} += $count; }}
# foreach $k (sort keys %count) { print "$k $count{$k}\n"; } '
# I added figures that I manually excerpted from https://sourceforge.net/p/kaldi/mailman/kaldi-users/?viewmonth=201203
# and https://sourceforge.net/p/kaldi/mailman/kaldi-developers/?viewmonth=201203
# This is June 13, 2014, 6:11pm; check my email.
@@ -1373,6 +1373,15 @@ double IvectorStats::UpdatePrior(
return ans;
}
IvectorStats::IvectorStats (const IvectorStats &other):
config_(other.config_), tot_auxf_(other.tot_auxf_), gamma_(other.gamma_),
Y_(other.Y_), R_(other.R_), R_num_cached_(other.R_num_cached_),
R_gamma_cache_(other.R_gamma_cache_),
R_ivec_scatter_cache_(other.R_ivec_scatter_cache_),
Q_(other.Q_), G_(other.G_), S_(other.S_), num_ivectors_(other.num_ivectors_),
ivector_sum_(other.ivector_sum_), ivector_scatter_(other.ivector_scatter_) {
}
} // namespace kaldi
@@ -368,9 +368,9 @@ class IvectorStats {
IvectorExtractor *extractor) const;
double AuxfPerFrame() { return tot_auxf_ / gamma_.Sum(); }
// Note: we allow the default assignment and copy operators
// because they do what we want.
// Copy constructor.
explicit IvectorStats (const IvectorStats &other);
protected:
friend class IvectorExtractorUpdateProjectionClass;
friend class IvectorExtractorUpdateWeightClass;
@@ -525,6 +525,9 @@ class IvectorStats {
/// Second-order stats for the iVectors. Needed for prior re-estimation.
SpMatrix<double> ivector_scatter_;
private:
IvectorStats &operator = (const IvectorStats &other); // Disallow.
};
......
@@ -246,7 +246,7 @@ template<typename Real> static void UnitTestAddSp() {
}
template<typename Real, typename OtherReal>
static void UnitTestSpAddVec() {
static void UnitTestSpAddDiagVec() {
for (MatrixIndexT i = 0;i< 10;i++) {
BaseFloat alpha = (i<5 ? 1.0 : 0.5);
MatrixIndexT dimM = 10+rand()%10;
@@ -255,7 +255,7 @@ static void UnitTestSpAddVec() {
SpMatrix<Real> T(S);
Vector<OtherReal> v(dimM);
InitRand(&v);
S.AddVec(alpha, v);
S.AddDiagVec(alpha, v);
for (MatrixIndexT i = 0; i < dimM; i++)
T(i, i) += alpha * v(i);
AssertEqual(S, T);
@@ -1514,19 +1514,35 @@ static void UnitTestTridiagonalize() {
}
for (MatrixIndexT i = 0; i < 4; i++) {
MatrixIndexT dim = 40 + rand() % 4;
// We happened to find out that a 16x16 matrix of 27's causes problems for
// Tridiagonalize.
if (i == 0 || i == 1)
dim = 16;
SpMatrix<Real> S(dim), S2(dim), R(dim), S3(dim);
Matrix<Real> Q(dim, dim);
InitRand(&S);
if (i == 0 || i == 1) {
Matrix<Real> temp(dim, dim);
if (i == 0)
temp.Set(27.0);
else
temp.Set(-1.61558713e-27);
S.CopyFromMat(temp);
}
SpMatrix<Real> T(S);
T.Tridiagonalize(&Q);
KALDI_LOG << "S trace " << S.Trace() << ", T trace " << T.Trace();
//KALDI_LOG << S << "\n" << T;
// KALDI_LOG << S << "\n" << T;
AssertEqual(S.Trace(), T.Trace());
// Also test Trace().
Real ans = 0.0;
for (MatrixIndexT j = 0; j < dim; j++) ans += T(j, j);
AssertEqual(ans, T.Trace());
AssertEqual(T.LogDet(), S.LogDet());
if (S.LogDet() > -50.0) {
// Don't check logdet equality if the original logdet is very negative; the
// matrix could be singular.
AssertEqual(T.LogDet(), S.LogDet());
}
R.AddMat2(1.0, Q, kNoTrans, 0.0);
KALDI_LOG << "Non-unit-ness of R is " << NonUnitness(R);
KALDI_ASSERT(R.IsUnit(0.01)); // Check Q is orthogonal.
@@ -2854,7 +2870,7 @@ template<typename Real> static void UnitTestSymAddMat2() {
T2.CopyFromMat(M2);
Matrix<Real> X1(T1), X2(T2); // so we can test equality.
AssertEqual(X1, X2);