CUDA Visual Studio 错误 "The command (a really long command) exited with code 255"

CUDA Visual Studio error "The command (a really long command) exited with code 255"

我正在尝试学习 cuda 并将我当前的项目转换为使用它,但我收到此错误:

Error MSB3721 The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin\nvcc.exe" -gencode=arch=compute_52,code="sm_52,compute_52" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio19\Community\VC\Tools\MSVC.26.28801\bin\HostX86\x64" -x cu -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -G --keep-dir x64\Debug -maxrregcount=0 --machine 64 --compile -cudart static -g -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc142.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\cudaMain.cu.obj "C:\Users[my usr name]\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2\cudaMain.cu"" exited with code 255.

我正在使用 .cuh 文件,我该如何声明 类 它给了我一条警告,说“属性不适用于实体”。我是否需要标记我还收到一条警告,告诉我“警告 C26812 枚举类型 'cudaError' 未限定范围。更喜欢 'enum class' 而不是 'enum' (Enum.3)。”

.cuh 文件

#pragma once
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string>
#include <vector>
#include <algorithm>
#include <ctime>
#pragma warning(disable : 4996)
#include <cuda_runtime.h>
#include "device_launch_parameters.h"

using namespace std;

static unsigned const int maxCircuitSizeG = 200;
static const int inputSizeG = 16;
static const int outputSizeG = 9;
static const short childParentAmountG = 10;

__host__ __device__ class Gate {
public:
    char type;
    int children[childParentAmountG];
    int parents[childParentAmountG];
    bool output;
    __host__ __device__ Gate();
};
__host__ __device__ class Circuit {
public:
    Gate gates[maxCircuitSizeG];
    bool inputs[inputSizeG];
    bool outputs[outputSizeG];
    double score;
    unsigned int averageCounter; // up to 4,294,967,295
    int size;
    __host__ __device__ Circuit();
};

__host__ __device__ unsigned int randumb(void);
unsigned int randumb2(void);

__host__ __device__ bool IsBoolInParents(Gate gate, Gate circuit[], bool boolToFind);
__host__ __device__ bool XORgateOutput(Gate gate, Gate circuit[]);
__host__ __device__ bool IsIntInArr(int arr[], int arrSize, int num);
bool IsShortInArr(short arr[], short arrSize, short num);
bool IsIntInVector(vector<int> vec, int num);
__host__ __device__ bool PushInt(int arr[], int arrSize, int num);
bool PushShort(short arr[], short arrSize, short num);
__host__ __device__ int CountCircuitSize(Gate circuit[]);
int CountCircuitSize2(Gate circuit[]);
__host__ __device__ void RemoveIntAndShiftArr(int arr[], int arrSize, int indexToRemove);
void RemoveShortAndShiftArr(short arr[], short arrSize, short indexToRemove);
int IntPow(int num, int exponent);
int BinaryToDecimal(bool bits[], int byteSize, bool firstIsMostSignificant);
string IntToString(int num);
void DecimalToBinary(int n, bool byte[], int byteSize);
void ShiftBinary(bool byte[], int byteSize, bool shiftLeft, int shiftAmount);
string BinaryToString(bool byte[], int byteSize);
__host__ __device__ void CopyGate(Gate& to, Gate from);

__host__ __device__ void RandomGateType(Gate circuit[], int circuitIndex);
__host__ __device__ void DestroyGate(Gate circuit[], int indexToRemove, int circuitSize);
__host__ __device__ void CleanCircuit(Gate circuit[], int circuitSize);
__host__ __device__ void AddChild(Gate circuit[], int circuitSize, int index);
__host__ __device__ void AddParent(Gate circuit[], int circuitSize, int index);
__host__ __device__ void CreateGate(Circuit& circuit, int indexToAdd, int circuitSize);
__host__ __device__ void CreateGate2(Circuit& circuit, int indexToAdd, int circuitSize);
__host__ __device__ void SafeFixCircuit(Gate circuit[], int circuitSize);
__host__ __device__ bool GateOutput(Gate gate, Gate circuit[]);
__host__ __device__ void Process(Circuit& circuit);
__host__ __device__ void ProcessFromCharArr(Circuit& circuit, char arr[]);
__host__ __device__ void RandomCircuit(Circuit& circuit, int circuitSize, int startingChildParentAmount);
__host__ __device__ void RemoveChild(Gate circuit[], int circuitSize, int index);
__host__ __device__ void Mutate(Circuit& circuit, int growChance, int shrinkChance, int grow, int shrink, int rate, int intensity);

void CreateAdderCircuit(Gate circuit[]);

string CircuitToString(Circuit circuit);
void SaveCircuit(string path, Circuit circuit, int circuitSize);
void FileToCircuit(string path, Gate circuit[]);

__host__ __device__ void InitRndPop(Circuit population[], int popSize, int startCircuitSize, int startChildParentAmount);
void InitPopFromFile(Circuit population[], int popSize, string path);

vector<string> MakeRndSample(int sampleSize, string path);

__host__ __device__ void Score3(Circuit& circuit, char arr[]);

__host__ __device__ void CopyCircuit(Circuit from, Circuit& to);
__host__ __device__ void CopyCircuitToPopulation(Circuit circuit, Circuit population[], unsigned short populationSize);
void CopyCircuit2(Circuit from, Circuit& to);
void CopyCircuitsToPop(vector<int> circuitsIndexes, Circuit population[], unsigned short populationSize);

__host__ __device__ void ScoreAverageFromArray(char arr[], int arrSize, Circuit& circuit);

__host__ __device__ void RandomBruteForceImproveFromArray(Circuit& circuit, char arr[], unsigned int arrSize, unsigned int maxSearch);

void fileToCharArr(char arr[], int size, string path);

带有定义的 .cu 太大而无法包含但是 none 其中有 host device他们不需要那个吗?

主 .cu 文件

#include "LogicSimCuda.cuh"
#include <stdio.h>
cudaError_t improveCircuitPopWithCuda(Circuit* circuit, char arr[], int arrSize, unsigned int size);

__global__ void addKernel(Circuit *circuit, char arr[], int arrSize, const int maxSearch)
{
    int i = threadIdx.x;
    RandomBruteForceImproveFromArray(circuit[i], arr, arrSize, maxSearch);
}

int main()
{
    const int populationSize = 1024;
    Circuit *population = new Circuit[populationSize];
    InitPopFromFile(population, populationSize, "C:/Users/voidm/Documents/LogicSimProjectGIT/LogicSim/Circuits/day2/Sun_Dec_20_12_54_59_2020.txt");
    unsigned const int fileSize = (inputSizeG + outputSizeG + 1) * 65536;
    char* trainingArr = new char[fileSize];
    fileToCharArr(trainingArr, fileSize,"C:/Users/voidm/Documents/LogicSimProjectGIT/LogicSim/src/Eight-Bit-Adder-Data.txt" );

    cudaError_t cudaStatus = improveCircuitPopWithCuda(population, trainingArr, fileSize, populationSize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "improveCircuitPopWithCuda failed!");
        return 1;
    }

    std::cout << population[0].score;


    return 0;

}

cudaError_t improveCircuitPopWithCuda(Circuit* circuitPop, char arr[], int arrSize, unsigned int size)
{
    Circuit *dev_circuit;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers .
    cudaStatus = cudaMalloc((void**)&dev_circuit, size * sizeof(Circuit));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_circuit, circuitPop, size * sizeof(Circuit), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel <<<1, size >>> (dev_circuit, arr, arrSize, 1000000);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(circuitPop, dev_circuit, size * sizeof(Circuit), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_circuit);

    return cudaStatus;
}
    ```

here is the output

>1>------ Build started: Project: LogicGateMachineLearning_V2, Configuration: Debug x64 ------
1>Compiling CUDA source file cudaMain.cu...
1>Compiling CUDA source file LogicSimCuda.cu...
1>
1>C:\Users\voidm\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin\nvcc.exe" -gencode=arch=compute_52,code=\"sm_52,compute_52\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio19\Community\VC\Tools\MSVC.26.28801\bin\HostX86\x64" -x cu   -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include"  -G   --keep-dir x64\Debug  -maxrregcount=0  --machine 64 --compile -cudart static  -g  -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc142.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\LogicSimCuda.cu.obj "C:\Users\voidm\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2\LogicSimCuda.cu"
1>
1>C:\Users\voidm\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin\nvcc.exe" -gencode=arch=compute_52,code=\"sm_52,compute_52\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio19\Community\VC\Tools\MSVC.26.28801\bin\HostX86\x64" -x cu   -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include"  -G   --keep-dir x64\Debug  -maxrregcount=0  --machine 64 --compile -cudart static  -g  -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc142.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\cudaMain.cu.obj "C:\Users\voidm\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2\cudaMain.cu"
1>ptxas fatal   : Unresolved extern function '_Z32RandomBruteForceImproveFromArrayR7CircuitPcjj'
1>cudaMain.cu
1>C:\Program Files (x86)\Microsoft Visual Studio19\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations\CUDA 11.2.targets(785,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin\nvcc.exe" -gencode=arch=compute_52,code=\"sm_52,compute_52\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio19\Community\VC\Tools\MSVC.26.28801\bin\HostX86\x64" -x cu   -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include"  -G   --keep-dir x64\Debug  -maxrregcount=0  --machine 64 --compile -cudart static  -g  -D_DEBUG -D_CONSOLE -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Fdx64\Debug\vc142.pdb /FS /Zi /RTC1 /MDd " -o x64\Debug\cudaMain.cu.obj "C:\Users\voidm\source\repos\LogicGateMachineLearning_V2_Solution\LogicGateMachineLearning_V2\cudaMain.cu"" exited with code 255.
1>Done building project "LogicGateMachineLearning_V2.vcxproj" -- FAILED.
1>LogicSimCuda.cu
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

这个:

__host__ __device__ class Gate {
public:
    char type;
    int children[childParentAmountG];
    int parents[childParentAmountG];
    bool output;
    __host__ __device__ Gate();
};

是非法的。执行 space 说明符(因此 __host____device__)适用于函数和变量的声明和定义,而不适用于类型。正确的 class 声明只是

class Gate {
    public:
        char type;
        int children[childParentAmountG];
        int parents[childParentAmountG];
        bool output;
        __host__ __device__ Gate();
    };

您的代码中很可能还有其他问题,但没有看到实际的编译错误日志,也没有热情地浏览问题中转储的所有代码,这就是您确定的一个编译错误的根源。