CUDA:将设备函数作为参数传递给全局函数
CUDA: Pass device function as an argument to global function
如何制作这样的作品?
#define Eval(x, y, func) {y = func(x);}
__global__ void Evaluate(double *xs, double *ys, int N, double f(double))
{
int tid = threadIdx.x;
if (tid < N)
Eval(xs[tid], ys[tid], f);
}
然后在主函数里面我有
double *xs_d, *ys_d;
double *xs_h, *ys_h;
xs_h = (double *) malloc(sizeof(double) * 256);
ys_h = (double *) malloc(sizeof(double) * 256);
cudaMalloc((void **)&xs_d, sizeof(double) * 256);
cudaMalloc((void **)&ys_d, sizeof(double) * 256);
for (int i = 0; i < 256; i++)
{
xs_h[i] = (double)i;
}
HANDLE_ERROR(cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice));
Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sin);
cudaDeviceSynchronize();
HANDLE_ERROR(cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost));
最后一行失败。到目前为止,我已经看到了这样的解决方案 How to pass device function as an input argument to host-side function? 但它们使用了 __device__
函数,主机无法更改或访问这些函数(例如 main
)函数。例如,我不能将 __device__ int *fptrf1 = (int *)f1;
(取自示例)放在 main 中。是否有可能以某种方式具有这种灵活性?
For example, I cannot put __device__ int *fptrf1 = (int *)f1;
(taken from the example) inside main
. Is it possible to somehow have this flexibility?
一种可能的方法是使用 lambda:
$ cat t151.cu
#define Eval(x, y, func) {y = func(x);}
template <typename F>
__global__ void Evaluate(double *xs, double *ys, int N, F f)
{
int tid = threadIdx.x;
if (tid < N)
Eval(xs[tid], ys[tid], f);
}
int main(){
double *xs_d, *ys_d;
double *xs_h, *ys_h;
xs_h = (double *) malloc(sizeof(double) * 256);
ys_h = (double *) malloc(sizeof(double) * 256);
cudaMalloc((void **)&xs_d, sizeof(double) * 256);
cudaMalloc((void **)&ys_d, sizeof(double) * 256);
for (int i = 0; i < 256; i++)
{
xs_h[i] = (double)i;
}
cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice);
auto Sinlambda = [] __host__ __device__ (double v) {return sin(v);};
Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sinlambda);
cudaDeviceSynchronize();
cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost);
}
$ nvcc -o t151 t151.cu -std=c++11 --extended-lambda
$ cuda-memcheck ./t151
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
(CUDA 11.3)
对于各种设备函数指针的使用,此 链接到许多示例。
如何制作这样的作品?
#define Eval(x, y, func) {y = func(x);}
__global__ void Evaluate(double *xs, double *ys, int N, double f(double))
{
int tid = threadIdx.x;
if (tid < N)
Eval(xs[tid], ys[tid], f);
}
然后在主函数里面我有
double *xs_d, *ys_d;
double *xs_h, *ys_h;
xs_h = (double *) malloc(sizeof(double) * 256);
ys_h = (double *) malloc(sizeof(double) * 256);
cudaMalloc((void **)&xs_d, sizeof(double) * 256);
cudaMalloc((void **)&ys_d, sizeof(double) * 256);
for (int i = 0; i < 256; i++)
{
xs_h[i] = (double)i;
}
HANDLE_ERROR(cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice));
Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sin);
cudaDeviceSynchronize();
HANDLE_ERROR(cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost));
最后一行失败。到目前为止,我已经看到了这样的解决方案 How to pass device function as an input argument to host-side function? 但它们使用了 __device__
函数,主机无法更改或访问这些函数(例如 main
)函数。例如,我不能将 __device__ int *fptrf1 = (int *)f1;
(取自示例)放在 main 中。是否有可能以某种方式具有这种灵活性?
For example, I cannot put
__device__ int *fptrf1 = (int *)f1;
(taken from the example) insidemain
. Is it possible to somehow have this flexibility?
一种可能的方法是使用 lambda:
$ cat t151.cu
#define Eval(x, y, func) {y = func(x);}
template <typename F>
__global__ void Evaluate(double *xs, double *ys, int N, F f)
{
int tid = threadIdx.x;
if (tid < N)
Eval(xs[tid], ys[tid], f);
}
int main(){
double *xs_d, *ys_d;
double *xs_h, *ys_h;
xs_h = (double *) malloc(sizeof(double) * 256);
ys_h = (double *) malloc(sizeof(double) * 256);
cudaMalloc((void **)&xs_d, sizeof(double) * 256);
cudaMalloc((void **)&ys_d, sizeof(double) * 256);
for (int i = 0; i < 256; i++)
{
xs_h[i] = (double)i;
}
cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice);
auto Sinlambda = [] __host__ __device__ (double v) {return sin(v);};
Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sinlambda);
cudaDeviceSynchronize();
cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost);
}
$ nvcc -o t151 t151.cu -std=c++11 --extended-lambda
$ cuda-memcheck ./t151
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
(CUDA 11.3)
对于各种设备函数指针的使用,此