Commit a8c505f9 authored by Dan Povey's avatar Dan Povey
Browse files

trunk: some improvements to the cuda-matrix library: improve speed of...

trunk: some improvements to the cuda-matrix library: improve speed of TraceMatMat; better GPU-time logging from cu-device.cc.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4553 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 4e97dff3
......@@ -321,13 +321,21 @@ void CuDevice::PrintProfile() {
os << "-----\n[cudevice profile]\n";
std::map<std::string, double>::iterator it;
std::vector<std::pair<double, std::string> > pairs;
for(it = profile_map_.begin(); it != profile_map_.end(); ++it)
pairs.push_back(std::make_pair(it->second, it->first));
double total_time = 0.0;
for(it = profile_map_.begin(); it != profile_map_.end(); ++it) {
std::string function_name = it->first;
double elapsed_time = it->second;
total_time += elapsed_time;
pairs.push_back(std::make_pair(elapsed_time, function_name));
}
// display from shortest to longest time, so tail will show the longest
// times at the end.
std::sort(pairs.begin(), pairs.end());
size_t max_print = 15, start_pos = (pairs.size() <= max_print ?
0 : pairs.size() - max_print);
for (size_t i = start_pos; i < pairs.size(); i++)
os << pairs[i].second << "\t" << pairs[i].first << "s\n";
os << "Total GPU time:\t" << total_time << "s (may involve some double-counting)\n";
os << "-----";
KALDI_LOG << os.str();
PrintMemoryUsage();
......
......@@ -1482,21 +1482,36 @@ Real TraceMatMat(const CuMatrixBase<Real> &A,
Real result = 0;
#if HAVE_CUDA == 1
if (CuDevice::Instantiate().Enabled()) {
Timer tim;
// the sizes of result_vec must match what we
// call the kernels with, in cu-kernels.cu
CuVector<Real> result_vec(trans == kTrans ? 4 : 2, kUndefined);
if (trans == kNoTrans) {
KALDI_ASSERT(A.NumRows() == B.NumCols() && A.NumCols() == B.NumRows());
cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data());
} else {
KALDI_ASSERT(A.NumRows() == B.NumRows() && A.NumCols() == B.NumCols());
cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data());
}
CU_SAFE_CALL(cudaGetLastError());
Vector<Real> result_cpu(result_vec); // copying from CUDA faster than summing in CUDA.
result = result_cpu.Sum();
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
if (A.NumRows() * A.NumCols() > 16384) {
// This version, in which we don't use a special-purpose kernel but
// instead do AddDiagMatMat on a temporary vector and return its sum,
// seems to be faster for larger matrices. The cutoff is approximate and
// we only looked at the time on square matrices, which
// is what we test in cu-matrix-speed-test.cc.
CuVector<Real> sum_vec(A.NumRows());
sum_vec.AddDiagMatMat(1.0, A, kNoTrans,
B, trans, 0.0);
return sum_vec.Sum();
} else {
Timer tim;
// the sizes of result_vec must match what we
// call the kernels with, in cu-kernels.cu
CuVector<Real> result_vec(trans == kTrans ? 4 : 2, kUndefined);
if (trans == kNoTrans) {
cuda_trace_mat_mat(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data());
} else {
cuda_trace_mat_mat_trans(A.Data(), B.Data(), A.Dim(), B.Stride(), result_vec.Data());
}
CU_SAFE_CALL(cudaGetLastError());
Vector<Real> result_cpu(result_vec); // copying from CUDA faster than summing in CUDA.
result = result_cpu.Sum();
CuDevice::Instantiate().AccuProfile(__func__, tim.Elapsed());
}
} else
#endif
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment