为什么 Hyper-Q 在我的 cc5.2 硬件上选择性地重叠异步 HtoD 和 DtoH 传输?
Why does Hyper-Q selectively overlap async HtoD and DtoH transfer on my cc5.2 hardware?
有一个 old Parallel ForAll blog post 演示了如何使用流和异步 memcpys 在内核和 memcpys 之间以及 HtoD 和 DtoH memcpys 之间生成重叠。所以我 运行 在我的 GTX Titan X 上给出了完整的异步示例,结果如下:
如您所见,当 HtoD、Kernel 和 DtoH 在单个循环中被背对背调用时,HtoD 和 DtoH t运行sfer 之间没有任何重叠。但是在三个循环中分别调用时,HtoD和DtoH之间存在重叠。
如果 Hyper-Q 做了它声称要做的事情,那么在循环启动的第一个版本中也应该有 HtoD 和 DtoH 重叠(就像 Tesla K20c 的情况一样)。据我了解,在支持 Hyper-Q 的计算能力为 3.5 及以上的设备中,用户不必再担心定制启动顺序。
我还 运行 CUDA 7.0 simpleHyperQ
样本。 CUDA_DEVICE_MAX_CONNECTIONS
设置为 32,我可以获得 32 个并发内核 运行,因此 Hyper-Q 在这种情况下可以正常工作。
我是64位Windows8.1,驱动版本353.06,CUDA 7.0,使用Visual Studio2013编译,目标x64平台发布模式,代码生成属性为compute_52,sm_52
。 CUDA_DEVICE_MAX_CONNECTIONS
设置为足够大的 32。
由于我无法 post 更多链接,下面 post 编辑了异步示例的完整代码(稍作修改)。
// Copyright 2012 NVIDIA Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <stdio.h>
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void kernel(float *a, int offset)
{
int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s*s + c*c);
}
float maxError(float *a, int n)
{
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
if (error > maxE) maxE = error;
}
return maxE;
}
int main(int argc, char **argv)
{
_putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");
const int blockSize = 256, nStreams = 4;
const int n = 4 * 1024 * blockSize * nStreams;
const int streamSize = n / nStreams;
const int streamBytes = streamSize * sizeof(float);
const int bytes = n * sizeof(float);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda(cudaGetDeviceProperties(&prop, devId));
printf("Device : %s\n", prop.name);
checkCuda(cudaSetDevice(devId));
// allocate pinned host memory and device memory
float *a, *d_a;
checkCuda(cudaMallocHost((void**)&a, bytes)); // host pinned
checkCuda(cudaMalloc((void**)&d_a, bytes)); // device
float ms; // elapsed time in milliseconds
// create events and streams
cudaEvent_t startEvent, stopEvent, dummyEvent;
cudaStream_t stream[nStreams];
checkCuda(cudaEventCreate(&startEvent));
checkCuda(cudaEventCreate(&stopEvent));
checkCuda(cudaEventCreate(&dummyEvent));
for (int i = 0; i < nStreams; ++i)
checkCuda(cudaStreamCreate(&stream[i]));
// baseline case - sequential transfer and execute
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
kernel << <n / blockSize, blockSize >> >(d_a, 0);
checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for sequential transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 1: loop over {copy, kernel, copy}
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]));
kernel << <streamSize / blockSize, blockSize, 0, stream[i] >> >(d_a, offset);
checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]));
}
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 2:
// loop over copy, loop over kernel, loop over copy
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]));
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
kernel << <streamSize / blockSize, blockSize, 0, stream[i] >> >(d_a, offset);
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]));
}
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// cleanup
checkCuda(cudaEventDestroy(startEvent));
checkCuda(cudaEventDestroy(stopEvent));
checkCuda(cudaEventDestroy(dummyEvent));
for (int i = 0; i < nStreams; ++i)
checkCuda(cudaStreamDestroy(stream[i]));
cudaFree(d_a);
cudaFreeHost(a);
cudaDeviceReset();
return 0;
}
您所观察到的可能是 运行 Windows WDDM 平台上代码的产物。 WDDM 子系统有很多其他平台不会受到影响的延迟,因此为了提高整体性能,CUDA WDDM 驱动程序执行命令批处理。这可能会干扰并发操作和命令重叠的预期顺序或时间,并且可能就是您在这里看到的。
解决方案是使用 Windows TCC 驱动程序,这需要受支持的 Telsa 或 Quadro 卡,或者更改为非 WDDM 平台,例如 Linux。后者似乎解决了本例的问题
有一个 old Parallel ForAll blog post 演示了如何使用流和异步 memcpys 在内核和 memcpys 之间以及 HtoD 和 DtoH memcpys 之间生成重叠。所以我 运行 在我的 GTX Titan X 上给出了完整的异步示例,结果如下:
如您所见,当 HtoD、Kernel 和 DtoH 在单个循环中被背对背调用时,HtoD 和 DtoH t运行sfer 之间没有任何重叠。但是在三个循环中分别调用时,HtoD和DtoH之间存在重叠。
如果 Hyper-Q 做了它声称要做的事情,那么在循环启动的第一个版本中也应该有 HtoD 和 DtoH 重叠(就像 Tesla K20c 的情况一样)。据我了解,在支持 Hyper-Q 的计算能力为 3.5 及以上的设备中,用户不必再担心定制启动顺序。
我还 运行 CUDA 7.0 simpleHyperQ
样本。 CUDA_DEVICE_MAX_CONNECTIONS
设置为 32,我可以获得 32 个并发内核 运行,因此 Hyper-Q 在这种情况下可以正常工作。
我是64位Windows8.1,驱动版本353.06,CUDA 7.0,使用Visual Studio2013编译,目标x64平台发布模式,代码生成属性为compute_52,sm_52
。 CUDA_DEVICE_MAX_CONNECTIONS
设置为足够大的 32。
由于我无法 post 更多链接,下面 post 编辑了异步示例的完整代码(稍作修改)。
// Copyright 2012 NVIDIA Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
#include <stdio.h>
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void kernel(float *a, int offset)
{
int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s*s + c*c);
}
float maxError(float *a, int n)
{
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
if (error > maxE) maxE = error;
}
return maxE;
}
int main(int argc, char **argv)
{
_putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");
const int blockSize = 256, nStreams = 4;
const int n = 4 * 1024 * blockSize * nStreams;
const int streamSize = n / nStreams;
const int streamBytes = streamSize * sizeof(float);
const int bytes = n * sizeof(float);
int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
cudaDeviceProp prop;
checkCuda(cudaGetDeviceProperties(&prop, devId));
printf("Device : %s\n", prop.name);
checkCuda(cudaSetDevice(devId));
// allocate pinned host memory and device memory
float *a, *d_a;
checkCuda(cudaMallocHost((void**)&a, bytes)); // host pinned
checkCuda(cudaMalloc((void**)&d_a, bytes)); // device
float ms; // elapsed time in milliseconds
// create events and streams
cudaEvent_t startEvent, stopEvent, dummyEvent;
cudaStream_t stream[nStreams];
checkCuda(cudaEventCreate(&startEvent));
checkCuda(cudaEventCreate(&stopEvent));
checkCuda(cudaEventCreate(&dummyEvent));
for (int i = 0; i < nStreams; ++i)
checkCuda(cudaStreamCreate(&stream[i]));
// baseline case - sequential transfer and execute
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
kernel << <n / blockSize, blockSize >> >(d_a, 0);
checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for sequential transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 1: loop over {copy, kernel, copy}
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < nStreams; ++i) {
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]));
kernel << <streamSize / blockSize, blockSize, 0, stream[i] >> >(d_a, offset);
checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]));
}
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// asynchronous version 2:
// loop over copy, loop over kernel, loop over copy
memset(a, 0, bytes);
checkCuda(cudaEventRecord(startEvent, 0));
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
streamBytes, cudaMemcpyHostToDevice,
stream[i]));
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
kernel << <streamSize / blockSize, blockSize, 0, stream[i] >> >(d_a, offset);
}
for (int i = 0; i < nStreams; ++i)
{
int offset = i * streamSize;
checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
streamBytes, cudaMemcpyDeviceToHost,
stream[i]));
}
checkCuda(cudaEventRecord(stopEvent, 0));
checkCuda(cudaEventSynchronize(stopEvent));
checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
printf(" max error: %e\n", maxError(a, n));
// cleanup
checkCuda(cudaEventDestroy(startEvent));
checkCuda(cudaEventDestroy(stopEvent));
checkCuda(cudaEventDestroy(dummyEvent));
for (int i = 0; i < nStreams; ++i)
checkCuda(cudaStreamDestroy(stream[i]));
cudaFree(d_a);
cudaFreeHost(a);
cudaDeviceReset();
return 0;
}
您所观察到的可能是 运行 Windows WDDM 平台上代码的产物。 WDDM 子系统有很多其他平台不会受到影响的延迟,因此为了提高整体性能,CUDA WDDM 驱动程序执行命令批处理。这可能会干扰并发操作和命令重叠的预期顺序或时间,并且可能就是您在这里看到的。
解决方案是使用 Windows TCC 驱动程序,这需要受支持的 Telsa 或 Quadro 卡,或者更改为非 WDDM 平台,例如 Linux。后者似乎解决了本例的问题