Commit 26e5d788 authored by Jan Trmal's avatar Jan Trmal
Browse files

(trunk) Making the auto-selection of code more persistent to select at least...

(trunk) Making the auto-selection of code more persistent to select at least _some_ devices and be more verbose about the possible problems 

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@5108 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent fb569bf6
...@@ -99,7 +99,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { ...@@ -99,7 +99,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) {
// or default gpu_id=0. In the case with no free GPUs a context cannot be created // or default gpu_id=0. In the case with no free GPUs a context cannot be created
// (compute-exclusive mode). // (compute-exclusive mode).
// //
e = cudaThreadSynchronize(); //<< CUDA context gets created here. e = cudaThreadSynchronize(); // << CUDA context gets created here.
if (use_gpu != "wait") { if (use_gpu != "wait") {
if (e != cudaSuccess) { if (e != cudaSuccess) {
...@@ -109,7 +109,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { ...@@ -109,7 +109,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) {
<< " seconds."; << " seconds.";
sleep(sec_sleep); sleep(sec_sleep);
cudaGetLastError(); // reset the error state cudaGetLastError(); // reset the error state
e = cudaThreadSynchronize(); //<< 2nd trial to get CUDA context. e = cudaThreadSynchronize(); // << 2nd trial to get CUDA context.
if (e != cudaSuccess) { if (e != cudaSuccess) {
if (use_gpu == "yes") { if (use_gpu == "yes") {
KALDI_CUDA_ERR(e, "Failed to create CUDA context, no more unused GPUs?"); KALDI_CUDA_ERR(e, "Failed to create CUDA context, no more unused GPUs?");
...@@ -183,7 +183,7 @@ void CuDevice::FinalizeActiveGpu() { ...@@ -183,7 +183,7 @@ void CuDevice::FinalizeActiveGpu() {
KALDI_CUDA_ERR(e, "Failed to get device-id of active device."); KALDI_CUDA_ERR(e, "Failed to get device-id of active device.");
} }
// Remember the id of active GPU // Remember the id of active GPU
active_gpu_id_ = act_gpu_id; //CuDevice::Enabled() is true from now on active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on
// Initialize the CUBLAS // Initialize the CUBLAS
CU_SAFE_CALL(cublasInit()); CU_SAFE_CALL(cublasInit());
...@@ -240,7 +240,7 @@ bool CuDevice::IsComputeExclusive() { ...@@ -240,7 +240,7 @@ bool CuDevice::IsComputeExclusive() {
default : default :
// The computation mode is not compute-exclusive, // The computation mode is not compute-exclusive,
// in this case we release the GPU context... // in this case we release the GPU context...
e = cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset e = cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset
if(e != cudaSuccess) { if(e != cudaSuccess) {
KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU"); KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU");
} }
...@@ -248,40 +248,58 @@ bool CuDevice::IsComputeExclusive() { ...@@ -248,40 +248,58 @@ bool CuDevice::IsComputeExclusive() {
} }
} }
template<typename TA, typename TB>
bool greater_pair(const std::pair<TA, TB> &left, const std::pair<TA, TB>& right) {
return left.second > right.second;
}
bool CuDevice::SelectGpuIdAuto() { bool CuDevice::SelectGpuIdAuto() {
// Check that we have at least one gpu // Check that we have at least one gpu
cudaError_t e;
int32 n_gpu = 0; int32 n_gpu = 0;
cudaGetDeviceCount(&n_gpu); e = cudaGetDeviceCount(&n_gpu);
if(n_gpu == 0) { if(n_gpu == 0) {
KALDI_WARN << "No CUDA devices found"; KALDI_WARN << "No CUDA devices found";
if (e != cudaSuccess) {
KALDI_WARN << "cudaGetDeviceCount() returned " << e
<<", meaning: \"" << cudaGetErrorString(e) << "\"";
}
return false; return false;
} }
// The GPU is selected according to maximal free memory ratio // The GPU is selected according to maximal free memory ratio
std::vector<float> free_mem_ratio(n_gpu+1, 0.0); std::vector< std::pair<int, float> > free_mem_ratio(n_gpu);
// Get ratios of memory use, if possible // Get ratios of memory use, if possible
KALDI_LOG << "Selecting from " << n_gpu << " GPUs"; KALDI_LOG << "Selecting from " << n_gpu << " GPUs";
for(int32 n = 0; n < n_gpu; n++) { for(int32 n = 0; n < n_gpu; n++) {
int32 ret = cudaSetDevice(n); int32 ret = cudaSetDevice(n);
switch(ret) { switch(ret) {
case cudaSuccess : { case cudaSuccess : {
//create the CUDA context for the thread // create the CUDA context for the thread
cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize
//get GPU name // get GPU name
char name[128]; char name[128];
DeviceGetName(name,128,n); DeviceGetName(name,128,n);
//get GPU memory stats // get GPU memory stats
int64 free, total; int64 free, total;
std::string mem_stats; std::string mem_stats;
mem_stats = GetFreeMemory(&free, &total); mem_stats = GetFreeMemory(&free, &total);
//log // log
KALDI_LOG << "cudaSetDevice(" << n << "): " KALDI_LOG << "cudaSetDevice(" << n << "): "
<< name << "\t" << mem_stats; << name << "\t" << mem_stats;
//store the free/total ratio
free_mem_ratio[n] = free/(float)total; // We have seen that in some cases GetFreeMemory returns zero
//destroy the CUDA context for the thread // That will produce nan after division, which might confuse
cudaThreadExit(); //deprecated, but for legacy reason not cudaDeviceReset // the sorting routine. Or maybe not, but let's keep it clean
if (total <= 0) {
KALDI_LOG << "Total memory reported for device " << n << " is zero (or less).";
}
float mem_ratio = total > 0 ? free/(float)total : 0;
free_mem_ratio[n] = std::make_pair(n, mem_ratio);
// destroy the CUDA context for the thread
cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset
} break; } break;
#if (CUDA_VERSION > 3020) #if (CUDA_VERSION > 3020)
...@@ -300,24 +318,43 @@ bool CuDevice::SelectGpuIdAuto() { ...@@ -300,24 +318,43 @@ bool CuDevice::SelectGpuIdAuto() {
<< cudaGetErrorString((cudaError_t)ret); << cudaGetErrorString((cudaError_t)ret);
} }
} }
//find GPU with max free memory // find GPU with max free memory
int32 max_id=0; int32 max_id=0;
for(int32 n=1; n<free_mem_ratio.size(); n++) { std::sort(free_mem_ratio.begin(), free_mem_ratio.end(),
if(free_mem_ratio[n] > free_mem_ratio[max_id]) max_id=n; greater_pair<int, float>);
} // the free_mem_ratio should be bigger than zero
//the free_mem_ratio should be bigger than zero KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0);
KALDI_ASSERT(free_mem_ratio[max_id] > 0.0);
float dev_id;
//finally select the GPU float mem_ratio;
KALDI_LOG << "Selected device: " << max_id << " (automatically)"; do {
CU_SAFE_CALL(cudaSetDevice(max_id)); // try to select the GPU in the best to worst order
//create the context // Note we have to check the return codes manually, as the CU_SAFE_CALL
cudaError_t e; // contains call to KALDI_ERR (which will cause the program to abort)
e = cudaThreadSynchronize(); //deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) { dev_id = free_mem_ratio[max_id].first;
KALDI_WARN << "Failed to create CUDA context on a GPU."; mem_ratio = free_mem_ratio[max_id].second;
KALDI_LOG << "Trying to select device: " << dev_id << " (automatically), mem_ratio: " << mem_ratio;
e = cudaSetDevice(dev_id);
if(e != cudaSuccess) {
KALDI_WARN << "Cannot select this device: return code " << e
<< ", Error message: \"" << cudaGetErrorString(e) << "\"";
} else {
e = cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize
if(e != cudaSuccess) {
KALDI_WARN << "Cannot select this device: return code " << e
<< ", Error message: \"" << cudaGetErrorString(e) << "\"";
}
}
max_id++;
} while ((e != cudaSuccess) && (max_id < free_mem_ratio.size()));
if (e != cudaSuccess) {
KALDI_WARN << "Failed to (automatically) select any device";
return false; return false;
} }
KALDI_LOG << "Success selecting device " << dev_id << " free mem ratio: " << mem_ratio;
return true; return true;
} }
...@@ -368,23 +405,23 @@ void CuDevice::PrintProfile() { ...@@ -368,23 +405,23 @@ void CuDevice::PrintProfile() {
std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
// WARNING! the CUDA API is inconsistent accross versions! // WARNING! the CUDA API is inconsistent accross versions!
#if (CUDA_VERSION >= 3020) #if (CUDA_VERSION >= 3020)
//define the function signature type // define the function signature type
size_t mem_free, mem_total; size_t mem_free, mem_total;
#else #else
unsigned int mem_free, mem_total; unsigned int mem_free, mem_total;
#endif #endif
{ {
//we will load the cuMemGetInfo dynamically from libcuda.so // we will load the cuMemGetInfo dynamically from libcuda.so
//cuMemGetInfo(&mem_free, &mem_total); // cuMemGetInfo(&mem_free, &mem_total);
//pre-fill ``safe'' values that will not cause problems // pre-fill ``safe'' values that will not cause problems
mem_free = 1; mem_total = 1; mem_free = 1; mem_total = 1;
//open libcuda.so // open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY); void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda) { if(NULL == libcuda) {
KALDI_WARN << "cannot open libcuda.so"; KALDI_WARN << "cannot open libcuda.so";
} else { } else {
//define the function signature type // define the function signature type
//and get the symbol // and get the symbol
#if (CUDA_VERSION >= 3020) #if (CUDA_VERSION >= 3020)
typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
...@@ -395,10 +432,10 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { ...@@ -395,10 +432,10 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
if(NULL == dl_cuMemGetInfo) { if(NULL == dl_cuMemGetInfo) {
KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so";
} else { } else {
//call the function // call the function
dl_cuMemGetInfo(&mem_free, &mem_total); dl_cuMemGetInfo(&mem_free, &mem_total);
} }
//close the library // close the library
dlclose(libcuda); dlclose(libcuda);
} }
} }
...@@ -416,24 +453,24 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { ...@@ -416,24 +453,24 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) const {
void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
//prefill with something reasonable // prefill with something reasonable
strncpy(name,"Unknown GPU",len); strncpy(name,"Unknown GPU",len);
//open libcuda.so // open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY); void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda) { if(NULL == libcuda) {
KALDI_WARN << "cannot open libcuda.so"; KALDI_WARN << "cannot open libcuda.so";
} else { } else {
//define the function signature type // define the function signature type
typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice); typedef CUresult (*cu_fun_ptr)(char*,int,CUdevice);
//get the symbol // get the symbol
cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName"); cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda,"cuDeviceGetName");
if(NULL == cuDeviceGetName_ptr) { if(NULL == cuDeviceGetName_ptr) {
KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so"; KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so";
} else { } else {
//call the function // call the function
cuDeviceGetName_ptr(name, len, dev); cuDeviceGetName_ptr(name, len, dev);
} }
//close the library // close the library
dlclose(libcuda); dlclose(libcuda);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment