从每个主机线程启动一个 CUDA 流,每个流会同时运行吗?
Launching a CUDA stream from each host thread, will each stream run concurrently?
通过搜索我了解到 CUDA 支持从每个主机线程启动 CUDA 流。我的问题是:当我只使用一个线程时,测试需要 180 秒才能完成;然后我用了三个线程,测试却用了 430 秒。它们为什么没有同时运行?
我的GPU是Tesla K20c
下面是我的简化代码,它切断了一些变量定义和输出数据保存等,
/*
 * Entry point: creates one CUDA stream per worker thread so the two
 * encoder instances can (in principle) overlap on the device.
 *
 * Returns 0 on success, 1 if a worker thread could not be created.
 */
int main()
{
    cudaSetDevice(0);
    // NOTE(review): cudaSetDeviceFlags must run before a context is created
    // on the device. cudaSetDevice alone does not create the context, so this
    // ordering should be legal — but confirm no earlier runtime call exists.
    cudaSetDeviceFlags(cudaDeviceBlockingSync);

    cudaStream_t stream1;
    cudaStream_t stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    int ret;
    pthread_t id_1, id_2;
    // Bug fix: the original stored pthread_create's result in `ret` but never
    // checked it; on failure, pthread_join would be called on an
    // indeterminate thread id (undefined behavior).
    ret = pthread_create(&id_1, NULL, thread_1, &stream1);
    if (ret != 0) {
        fprintf(stderr, "pthread_create(id_1) failed: %d\n", ret);
        return 1;
    }
    ret = pthread_create(&id_2, NULL, thread_1, &stream2);
    if (ret != 0) {
        fprintf(stderr, "pthread_create(id_2) failed: %d\n", ret);
        pthread_join(id_1, NULL);   // still reap the first worker
        return 1;
    }

    pthread_join(id_1, NULL);
    pthread_join(id_2, NULL);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 0;
}
/*
 * Worker thread body. `streamno` is a pointer to this thread's
 * cudaStream_t; its address is also appended to the output file name so
 * the two threads never write to the same file.
 */
void* thread_1(void *streamno)
{
    char speechInFileName[1024] = "data/ori_in.bin";
    char bitOutFileName[1024] = "data/enc_out.bin";
    // Make the output file name unique per thread by appending the address
    // of this thread's stream object ("%p" prints at most ~18 chars, so a
    // small buffer suffices and the strcat cannot overflow bitOutFileName).
    char buf[32];
    snprintf(buf, sizeof(buf), "%p", streamno);
    strcat(bitOutFileName, buf);

    // Raise the device stack size limit (the encoder kernel is deeply
    // recursive / stack-hungry).
    // NOTE(review): cudaDeviceSetLimit is DEVICE-wide, not per-thread, and
    // can implicitly synchronize the device — calling it from both worker
    // threads is redundant and may itself serialize the streams; consider
    // moving it to main() before the threads start.
    size_t pvalue = 60 * 1024;
    // Bug fix: the original compared only against cudaErrorInvalidValue,
    // silently swallowing every other possible failure.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitStackSize, pvalue);
    if (err != cudaSuccess)
        cout << "cudaDeviceSetLimit failed: " << cudaGetErrorString(err) << endl;

    Encoder_main(3, speechInFileName, bitOutFileName, (cudaStream_t*)streamno);
    pthread_exit(0);
}
/*
 * Runs the encoder loop on the given stream: uploads the encoder state,
 * then for each frame read from file uploads the input, launches the
 * (single-threaded) encoder kernel, and downloads the results.
 *
 * NOTE(review): all host buffers used with cudaMemcpyAsync here (`psEnc`,
 * `in`, `nBytes`, `payload`, ...) must be pinned (cudaMallocHost) for the
 * copies to actually be asynchronous; with pageable memory the runtime
 * falls back to synchronous copies, which defeats inter-stream overlap —
 * a likely cause of the poor concurrency observed. Confirm how they are
 * allocated (definitions are outside this snippet).
 *
 * Returns 0.
 */
int Encoder_main(int argc, char speechInFileName[], char bitOutFileName[], cudaStream_t *stream)
{
    // Device-side copies of the encoder state and per-frame buffers.
    void *d_psEnc;
    cudaMalloc(&d_psEnc, encSizeBytes);
    cudaMemcpyAsync(d_psEnc, psEnc, encSizeBytes, cudaMemcpyHostToDevice, *stream);

    SKP_SILK_SDK_EncControlStruct *d_encControl; // Struct for input to encoder
    cudaMalloc(&d_encControl, sizeof(SKP_SILK_SDK_EncControlStruct));
    cudaMemcpyAsync(d_encControl, &encControl, sizeof(SKP_SILK_SDK_EncControlStruct), cudaMemcpyHostToDevice, *stream);

    SKP_int16 *d_in;
    cudaMalloc(&d_in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16));
    SKP_int16 *d_nBytes;
    cudaMalloc(&d_nBytes, sizeof(SKP_int16));
    SKP_int32 *d_ret;
    cudaMalloc(&d_ret, sizeof(SKP_int32));
    SKP_uint8 *d_payload;
    cudaMalloc(&d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES);

    while (1) {
        /* Read input from file; stop on a short (final/partial) frame. */
        counter = fread(in, sizeof(SKP_int16), (frameSizeReadFromFile_ms * API_fs_Hz) / 1000, speechInFile);
        if ((SKP_int)counter < ((frameSizeReadFromFile_ms * API_fs_Hz) / 1000)) {
            break;
        }

        /* max payload size */
        nBytes = MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES;
        cudaMemcpyAsync(d_nBytes, &nBytes, sizeof(SKP_int16), cudaMemcpyHostToDevice, *stream);
        // Bug fix: the original passed
        //   cudaMemcpyAsync(d_in, in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES,
        //                   cudaMemcpyHostToDevice * sizeof(SKP_int16), *stream);
        // i.e. the byte count dropped the sizeof() factor and the transfer
        // kind was the nonsensical product `cudaMemcpyHostToDevice * sizeof(...)`.
        cudaMemcpyAsync(d_in, in,
                        FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16),
                        cudaMemcpyHostToDevice, *stream);

        // Single-thread launch: the encoder runs as one sequential device
        // thread, so two such kernels from different streams can co-reside
        // on the GPU but each uses almost no parallel resources.
        encoder_kernel <<<1, 1, 0, *stream>>>(d_psEnc, d_encControl, d_in, (SKP_int16)counter, d_payload, d_nBytes, d_ret);

        cudaMemcpyAsync(&nBytes, d_nBytes, sizeof(SKP_int16), cudaMemcpyDeviceToHost, *stream);
        cudaMemcpyAsync(&ret, d_ret, sizeof(ret), cudaMemcpyDeviceToHost, *stream);
        cudaMemcpyAsync(payload, d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES, cudaMemcpyDeviceToHost, *stream);
        // The host reads nBytes/ret/payload after this point, so a per-frame
        // sync on THIS stream (not the whole device) is required.
        cudaStreamSynchronize(*stream);
    }

    cudaFree(d_psEnc);
    cudaFree(d_encControl);
    cudaFree(d_in);
    cudaFree(d_nBytes);
    cudaFree(d_ret);
    cudaFree(d_payload);
    return 0;
}
encoder_kernel是语音编码器函数。
感谢 Robert 和 Jez 的建议!我将代码改为只打开两个流,并使用可视化分析器来显示时间线。从图像中看,这两个流有时并发运行,但大多数时候没有!你能告诉我为什么吗?谢谢!
一个线程需要180s,三个线程需要430s。 430/180 = ~2.4。那不是三倍长,表明您具有一定的并发性。您能否做得比这更好取决于每个线程所做工作的细节。
通常,弄清楚实际发生了什么的最好方法是通过 NVIDIA Visual Profiler 对应用程序进行性能分析。您可以从可视化分析器界面运行它,也可以用命令行 nvprof 生成分析器输出。它会显示每个 CUDA API 调用以及所有拷贝和内核执行,并按流和线程分开展示,因此可以很清楚地看到发生了什么。
通过搜索我了解到 CUDA 支持从每个主机线程启动 CUDA 流。我的问题是:当我只使用一个线程时,测试需要 180 秒才能完成;然后我用了三个线程,测试却用了 430 秒。它们为什么没有同时运行?
我的GPU是Tesla K20c
下面是我的简化代码,它切断了一些变量定义和输出数据保存等,
/*
 * Entry point: creates one CUDA stream per worker thread so the two
 * encoder instances can (in principle) overlap on the device.
 *
 * Returns 0 on success, 1 if a worker thread could not be created.
 */
int main()
{
    cudaSetDevice(0);
    // NOTE(review): cudaSetDeviceFlags must run before a context is created
    // on the device. cudaSetDevice alone does not create the context, so this
    // ordering should be legal — but confirm no earlier runtime call exists.
    cudaSetDeviceFlags(cudaDeviceBlockingSync);

    cudaStream_t stream1;
    cudaStream_t stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    int ret;
    pthread_t id_1, id_2;
    // Bug fix: the original stored pthread_create's result in `ret` but never
    // checked it; on failure, pthread_join would be called on an
    // indeterminate thread id (undefined behavior).
    ret = pthread_create(&id_1, NULL, thread_1, &stream1);
    if (ret != 0) {
        fprintf(stderr, "pthread_create(id_1) failed: %d\n", ret);
        return 1;
    }
    ret = pthread_create(&id_2, NULL, thread_1, &stream2);
    if (ret != 0) {
        fprintf(stderr, "pthread_create(id_2) failed: %d\n", ret);
        pthread_join(id_1, NULL);   // still reap the first worker
        return 1;
    }

    pthread_join(id_1, NULL);
    pthread_join(id_2, NULL);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 0;
}
/*
 * Worker thread body. `streamno` is a pointer to this thread's
 * cudaStream_t; its address is also appended to the output file name so
 * the two threads never write to the same file.
 */
void* thread_1(void *streamno)
{
    char speechInFileName[1024] = "data/ori_in.bin";
    char bitOutFileName[1024] = "data/enc_out.bin";
    // Make the output file name unique per thread by appending the address
    // of this thread's stream object ("%p" prints at most ~18 chars, so a
    // small buffer suffices and the strcat cannot overflow bitOutFileName).
    char buf[32];
    snprintf(buf, sizeof(buf), "%p", streamno);
    strcat(bitOutFileName, buf);

    // Raise the device stack size limit (the encoder kernel is deeply
    // recursive / stack-hungry).
    // NOTE(review): cudaDeviceSetLimit is DEVICE-wide, not per-thread, and
    // can implicitly synchronize the device — calling it from both worker
    // threads is redundant and may itself serialize the streams; consider
    // moving it to main() before the threads start.
    size_t pvalue = 60 * 1024;
    // Bug fix: the original compared only against cudaErrorInvalidValue,
    // silently swallowing every other possible failure.
    cudaError_t err = cudaDeviceSetLimit(cudaLimitStackSize, pvalue);
    if (err != cudaSuccess)
        cout << "cudaDeviceSetLimit failed: " << cudaGetErrorString(err) << endl;

    Encoder_main(3, speechInFileName, bitOutFileName, (cudaStream_t*)streamno);
    pthread_exit(0);
}
/*
 * Runs the encoder loop on the given stream: uploads the encoder state,
 * then for each frame read from file uploads the input, launches the
 * (single-threaded) encoder kernel, and downloads the results.
 *
 * NOTE(review): all host buffers used with cudaMemcpyAsync here (`psEnc`,
 * `in`, `nBytes`, `payload`, ...) must be pinned (cudaMallocHost) for the
 * copies to actually be asynchronous; with pageable memory the runtime
 * falls back to synchronous copies, which defeats inter-stream overlap —
 * a likely cause of the poor concurrency observed. Confirm how they are
 * allocated (definitions are outside this snippet).
 *
 * Returns 0.
 */
int Encoder_main(int argc, char speechInFileName[], char bitOutFileName[], cudaStream_t *stream)
{
    // Device-side copies of the encoder state and per-frame buffers.
    void *d_psEnc;
    cudaMalloc(&d_psEnc, encSizeBytes);
    cudaMemcpyAsync(d_psEnc, psEnc, encSizeBytes, cudaMemcpyHostToDevice, *stream);

    SKP_SILK_SDK_EncControlStruct *d_encControl; // Struct for input to encoder
    cudaMalloc(&d_encControl, sizeof(SKP_SILK_SDK_EncControlStruct));
    cudaMemcpyAsync(d_encControl, &encControl, sizeof(SKP_SILK_SDK_EncControlStruct), cudaMemcpyHostToDevice, *stream);

    SKP_int16 *d_in;
    cudaMalloc(&d_in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16));
    SKP_int16 *d_nBytes;
    cudaMalloc(&d_nBytes, sizeof(SKP_int16));
    SKP_int32 *d_ret;
    cudaMalloc(&d_ret, sizeof(SKP_int32));
    SKP_uint8 *d_payload;
    cudaMalloc(&d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES);

    while (1) {
        /* Read input from file; stop on a short (final/partial) frame. */
        counter = fread(in, sizeof(SKP_int16), (frameSizeReadFromFile_ms * API_fs_Hz) / 1000, speechInFile);
        if ((SKP_int)counter < ((frameSizeReadFromFile_ms * API_fs_Hz) / 1000)) {
            break;
        }

        /* max payload size */
        nBytes = MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES;
        cudaMemcpyAsync(d_nBytes, &nBytes, sizeof(SKP_int16), cudaMemcpyHostToDevice, *stream);
        // Bug fix: the original passed
        //   cudaMemcpyAsync(d_in, in, FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES,
        //                   cudaMemcpyHostToDevice * sizeof(SKP_int16), *stream);
        // i.e. the byte count dropped the sizeof() factor and the transfer
        // kind was the nonsensical product `cudaMemcpyHostToDevice * sizeof(...)`.
        cudaMemcpyAsync(d_in, in,
                        FRAME_LENGTH_MS * MAX_API_FS_KHZ * MAX_INPUT_FRAMES * sizeof(SKP_int16),
                        cudaMemcpyHostToDevice, *stream);

        // Single-thread launch: the encoder runs as one sequential device
        // thread, so two such kernels from different streams can co-reside
        // on the GPU but each uses almost no parallel resources.
        encoder_kernel <<<1, 1, 0, *stream>>>(d_psEnc, d_encControl, d_in, (SKP_int16)counter, d_payload, d_nBytes, d_ret);

        cudaMemcpyAsync(&nBytes, d_nBytes, sizeof(SKP_int16), cudaMemcpyDeviceToHost, *stream);
        cudaMemcpyAsync(&ret, d_ret, sizeof(ret), cudaMemcpyDeviceToHost, *stream);
        cudaMemcpyAsync(payload, d_payload, MAX_BYTES_PER_FRAME * MAX_INPUT_FRAMES, cudaMemcpyDeviceToHost, *stream);
        // The host reads nBytes/ret/payload after this point, so a per-frame
        // sync on THIS stream (not the whole device) is required.
        cudaStreamSynchronize(*stream);
    }

    cudaFree(d_psEnc);
    cudaFree(d_encControl);
    cudaFree(d_in);
    cudaFree(d_nBytes);
    cudaFree(d_ret);
    cudaFree(d_payload);
    return 0;
}
encoder_kernel是语音编码器函数。
感谢 Robert 和 Jez 的建议!我将代码改为只打开两个流,并使用可视化分析器来显示时间线。从图像中看,这两个流有时并发运行,但大多数时候没有!你能告诉我为什么吗?谢谢!
一个线程需要180s,三个线程需要430s。 430/180 = ~2.4。那不是三倍长,表明您具有一定的并发性。您能否做得比这更好取决于每个线程所做工作的细节。
通常,弄清楚实际发生了什么的最好方法是通过 NVIDIA Visual Profiler 对应用程序进行性能分析。您可以从可视化分析器界面运行它,也可以用命令行 nvprof 生成分析器输出。它会显示每个 CUDA API 调用以及所有拷贝和内核执行,并按流和线程分开展示,因此可以很清楚地看到发生了什么。