无法打印从 CUDA 内核返回的值
Cannot print values returned from CUDA kernel
我正在研究在同一节点交换 V1 和 V3 的变量。但是,我无法将值 16 和 31 初始化为数组。这可能是我犯的一个愚蠢的错误,但我已经花了一个小时来调试我的代码。它只在 printf
输出中打印每个数组的“0”。
谁能找出我代码中的错误?这是我的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#define threads 1024 //define the number of thread to use
#define blocks 4 //define the number of blocks to use
//Kernal Function
__global__ void Initial(double* V1, double* V3, int NX, int NY)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx <= NX * NY)
{
V1[idx] = 16;
V3[idx] = 31;
}
}
__global__ void Add2D(double* V1 , double* V3, int NX, int NY)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx <= NX * NY)
{
double tmp = V1[idx] ;
V1[idx] = V3[idx + NX];
V3[idx + NX] = tmp;
}
}
int main(void) {
//number of nodes
int NX = 5;
int NY = 5;
int N = NY * NX;
size_t bytes = NX * NY * sizeof(double); // define the memory size which needs to use in this application
//declare V1 in host
double* hos_V1 ;
double* hos_V3 ;
double* dev_V1 ;
double* dev_V3 ;
// hos_V1 = new double[N]; // allocate storage for VL array
// hos_V3 = new double[N];
hos_V1 = (double*)malloc(bytes);
hos_V3 = (double*)malloc(bytes);
// dev_V1 = new double[N]; // allocate storage for VL array
// dev_V3 = new double[N];
cudaMalloc((void**)&dev_V1, bytes);
cudaMalloc((void**)&dev_V3, bytes);
Initial <<< blocks , threads >>> (dev_V1, dev_V3, NX, NY);
cudaMemcpy(hos_V1, dev_V1, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(hos_V3, dev_V3, bytes, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int x = 0; x <= NX * NY; x++)
{
printf("V1[%d] = %f \n", x, &hos_V1[x]);
printf("V3[%d] = %f \n", x, &hos_V3[x]);
}
printf("-----------------------------\n");
Add2D <<< blocks, threads >> > (dev_V1, dev_V3, NX, NY);
cudaThreadSynchronize(); //Sync CPU and GPU to start the timer
cudaMemcpy(hos_V1, dev_V1, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(hos_V3, dev_V3, bytes, cudaMemcpyDeviceToHost);
for (int x = 0; x <= NX*NY; x++)
{
printf("V1[%d] = %f \n", x, &hos_V1[x]);
printf("V3[%d] = %f \n", x, &hos_V3[x]);
}
// free the memory allocated on the GPU
cudaFree(dev_V1);
cudaFree(dev_V3);
return 0;
}
您正在打印 &hos_V1[x] 和 &hos_V3[x],而不是 hos_V1[x] 和 hos_V3[x]。我确定您想实际打印数组的内容。
for (int x = 0; x <= NX * NY; x++)
{
printf("V1[%d] = %f \n", x, hos_V1[x]);
printf("V3[%d] = %f \n", x, hos_V3[x]);
}
在编译时启用警告可能对您有用。 NVCC(或 gcc,因为它是主机代码)向我建议您可能正在打印错误的东西。
edit:as 其他人指出,您提供的代码还有更多“系统性”问题。
我正在研究在同一节点交换 V1 和 V3 的变量。但是,我无法将值 16 和 31 初始化为数组。这可能是我犯的一个愚蠢的错误,但我已经花了一个小时来调试我的代码。它只在 printf
输出中打印每个数组的“0”。
谁能找出我代码中的错误?这是我的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#define threads 1024 //define the number of thread to use
#define blocks 4 //define the number of blocks to use
//Kernal Function
__global__ void Initial(double* V1, double* V3, int NX, int NY)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx <= NX * NY)
{
V1[idx] = 16;
V3[idx] = 31;
}
}
__global__ void Add2D(double* V1 , double* V3, int NX, int NY)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx <= NX * NY)
{
double tmp = V1[idx] ;
V1[idx] = V3[idx + NX];
V3[idx + NX] = tmp;
}
}
int main(void) {
//number of nodes
int NX = 5;
int NY = 5;
int N = NY * NX;
size_t bytes = NX * NY * sizeof(double); // define the memory size which needs to use in this application
//declare V1 in host
double* hos_V1 ;
double* hos_V3 ;
double* dev_V1 ;
double* dev_V3 ;
// hos_V1 = new double[N]; // allocate storage for VL array
// hos_V3 = new double[N];
hos_V1 = (double*)malloc(bytes);
hos_V3 = (double*)malloc(bytes);
// dev_V1 = new double[N]; // allocate storage for VL array
// dev_V3 = new double[N];
cudaMalloc((void**)&dev_V1, bytes);
cudaMalloc((void**)&dev_V3, bytes);
Initial <<< blocks , threads >>> (dev_V1, dev_V3, NX, NY);
cudaMemcpy(hos_V1, dev_V1, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(hos_V3, dev_V3, bytes, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
for (int x = 0; x <= NX * NY; x++)
{
printf("V1[%d] = %f \n", x, &hos_V1[x]);
printf("V3[%d] = %f \n", x, &hos_V3[x]);
}
printf("-----------------------------\n");
Add2D <<< blocks, threads >> > (dev_V1, dev_V3, NX, NY);
cudaThreadSynchronize(); //Sync CPU and GPU to start the timer
cudaMemcpy(hos_V1, dev_V1, bytes, cudaMemcpyDeviceToHost);
cudaMemcpy(hos_V3, dev_V3, bytes, cudaMemcpyDeviceToHost);
for (int x = 0; x <= NX*NY; x++)
{
printf("V1[%d] = %f \n", x, &hos_V1[x]);
printf("V3[%d] = %f \n", x, &hos_V3[x]);
}
// free the memory allocated on the GPU
cudaFree(dev_V1);
cudaFree(dev_V3);
return 0;
}
您正在打印 &hos_V1[x] 和 &hos_V3[x],而不是 hos_V1[x] 和 hos_V3[x]。我确定您想实际打印数组的内容。
for (int x = 0; x <= NX * NY; x++)
{
printf("V1[%d] = %f \n", x, hos_V1[x]);
printf("V3[%d] = %f \n", x, hos_V3[x]);
}
在编译时启用警告可能对您有用。 NVCC(或 gcc,因为它是主机代码)向我建议您可能正在打印错误的东西。 edit:as 其他人指出,您提供的代码还有更多“系统性”问题。