MPI running issue using NVIDIA MPS Service on Multi-GPU nodes
I am having a problem running MPI code on a multi-GPU node with the NVIDIA MPS Service.
The system I am using has 2 K80 GPUs (4 GPUs in total).
Basically, I first set the GPU compute mode to exclusive_process:
nvidia-smi -c 3
Then I start the MPS service:
nvidia-cuda-mps-control -d
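As a sanity check, the compute mode each visible device reports can be queried through the CUDA runtime; the short snippet below is only an illustrative sketch (it simply prints the mode set by nvidia-smi, it is not part of my test program):
#include <stdio.h>
#include "cuda_runtime.h"

int main(void)
{
    int dev_cnt = 0;
    cudaGetDeviceCount(&dev_cnt);
    for (int dev = 0; dev < dev_cnt; ++dev) {
        int mode = -1;
        /* cudaDevAttrComputeMode reports the mode set with `nvidia-smi -c` */
        cudaDeviceGetAttribute(&mode, cudaDevAttrComputeMode, dev);
        printf("device %d: compute mode = %d (exclusive_process = %d)\n",
               dev, mode, (int)cudaComputeModeExclusiveProcess);
    }
    return 0;
}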
When I increase the number of processes and run my code, I get the following error:
all CUDA-capable devices are busy or unavailable
Here is an example.
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "mpi.h"

#define __SIZE__ 1024

int main(int argc, char **argv)
{
    cudaError_t cuda_err = cudaSuccess;
    void *dev_buf;

    MPI_Init(&argc, &argv);

    int my_rank = -1;
    int dev_cnt = 0;
    int dev_id = -1;

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    cuda_err = cudaGetDeviceCount(&dev_cnt);
    if (cuda_err != cudaSuccess)
        printf("cudaGET Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));

    /* map ranks to GPUs round-robin */
    dev_id = my_rank % dev_cnt;
    printf("myrank=%d dev_cnt=%d, dev_id=%d\n", my_rank, dev_cnt, dev_id);

    cuda_err = cudaSetDevice(dev_id);
    if (cuda_err != cudaSuccess)
        printf("cudaSet Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));

    /* a small allocation is enough to force context creation on the selected device */
    cuda_err = cudaMalloc((void **) &dev_buf, __SIZE__);
    if (cuda_err != cudaSuccess)
        printf("cudaMalloc Error--on rank %d %s\n", my_rank, cudaGetErrorString(cuda_err));
    else
        printf("cudaMalloc Success++, %d \n", my_rank);

    MPI_Finalize();
    return 0;
}
Here is the output with 12 processes:
#mpirun -n 12 -hostfile hosts ./hq_test
myrank=0 dev_cnt=4, dev_id=0
myrank=1 dev_cnt=4, dev_id=1
myrank=2 dev_cnt=4, dev_id=2
myrank=3 dev_cnt=4, dev_id=3
myrank=4 dev_cnt=4, dev_id=0
myrank=5 dev_cnt=4, dev_id=1
myrank=6 dev_cnt=4, dev_id=2
myrank=7 dev_cnt=4, dev_id=3
myrank=8 dev_cnt=4, dev_id=0
myrank=9 dev_cnt=4, dev_id=1
myrank=10 dev_cnt=4, dev_id=2
myrank=11 dev_cnt=4, dev_id=3
cudaMalloc Success++, 8
cudaMalloc Success++, 10
cudaMalloc Success++, 0
cudaMalloc Success++, 1
cudaMalloc Success++, 3
cudaMalloc Success++, 7
cudaMalloc Success++, 9
cudaMalloc Success++, 6
cudaMalloc Success++, 4
cudaMalloc Success++, 2
cudaMalloc Success++, 5
cudaMalloc Success++, 11
Here is the output with 14 processes:
#mpirun -n 14 -hostfile hosts ./hq_test
myrank=0 dev_cnt=4, dev_id=0
myrank=1 dev_cnt=4, dev_id=1
myrank=2 dev_cnt=4, dev_id=2
myrank=3 dev_cnt=4, dev_id=3
myrank=4 dev_cnt=4, dev_id=0
myrank=5 dev_cnt=4, dev_id=1
myrank=6 dev_cnt=4, dev_id=2
myrank=7 dev_cnt=4, dev_id=3
myrank=8 dev_cnt=4, dev_id=0
myrank=9 dev_cnt=4, dev_id=1
myrank=10 dev_cnt=4, dev_id=2
myrank=11 dev_cnt=4, dev_id=3
myrank=12 dev_cnt=4, dev_id=0
myrank=13 dev_cnt=4, dev_id=1
cudaMalloc Success++, 11
cudaMalloc Success++, 3
cudaMalloc Success++, 7
cudaMalloc Success++, 2
cudaMalloc Success++, 10
cudaMalloc Success++, 6
cudaMalloc Success++, 1
cudaMalloc Success++, 8
cudaMalloc Error--on rank 13 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 5 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 9 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 4 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 0 all CUDA-capable devices are busy or unavailable
cudaMalloc Error--on rank 12 all CUDA-capable devices are busy or unavailable
Note: I have already tried changing the CUDA_DEVICE_MAX_CONNECTIONS value, but it did not help.
I would appreciate it if you could share your thoughts on this.
Per the cross posting here, the server log in this case indicates a known issue which is covered in section 4.4 of the MPS documentation (MPS server log):
Memory allocation API calls (including context creation) may fail with the following
message in the server log: MPS Server failed to create/open SHM segment.
Comments: This is most likely due to exhausting the file descriptor limit on your
system. Check the maximum number of open file descriptors allowed on your
system and increase if necessary. We recommend setting it to 16384 and higher.
Typically this information can be checked via the command 'ulimit -n'; refer to your
operating system instructions on how to change the limit.
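For reference, the limit in question is the per-process open-file-descriptor limit reported by ulimit -n; in practice it is the limit inherited by the MPS control daemon (and the MPI processes) that needs raising, typically via ulimit -n in the launching shell or via limits.conf. The sketch below is only an illustration of checking and raising that limit programmatically with the POSIX getrlimit/setrlimit calls; raise_fd_limit is a hypothetical helper name, and 16384 follows the recommendation quoted above:
#include <stdio.h>
#include <sys/resource.h>

/* Check the open-file-descriptor limit (the value `ulimit -n` reports) and
 * try to raise the soft limit toward `wanted`; raising it above the hard
 * limit would require elevated privileges. Illustrative helper only. */
static void raise_fd_limit(rlim_t wanted)
{
    struct rlimit rl;
    if (getrlimit(RLIMIT_NOFILE, &rl) != 0) {
        perror("getrlimit");
        return;
    }
    printf("open files: soft=%lu hard=%lu\n",
           (unsigned long)rl.rlim_cur, (unsigned long)rl.rlim_max);
    if (rl.rlim_cur < wanted) {
        rl.rlim_cur = (wanted <= rl.rlim_max) ? wanted : rl.rlim_max;
        if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
            perror("setrlimit");
    }
}

int main(void)
{
    raise_fd_limit(16384);  /* value recommended in MPS documentation section 4.4 */
    return 0;
}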