当 cudaDeviceSynchronize() 出现错误 30 时？

Question

我正在使用 nvidia 9500 gt 在 cuda 6 上尝试我的第一个内核。
当我运行 cudaDeviceSynchronize() 时，它 returns 错误 30.
当我尝试使用 Nsight 调试内核时，我无法查看变量的值，无论是在本地还是在监视列表中。代码在 vs2010 中编译没有问题，
这是代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <vector>
#include <iostream>
#include <sstream>
#include <fstream>
#include <cstdlib>

cudaError_t convert_csv_file_monowave_numbers(std::vector<double>& bidPrice,    
    std::vector<double>& askPrice);

//1st kernel converts bid and ask price data to price direction data
__global__ void bid_ask_price_data_direction_data(const double *bidPrice, const double *askPrice, int *d_bidPrice_direction,int *d_askPrice_direction,int size)
{
    int tid = ((blockIdx.x * blockDim.x) + threadIdx.x)+1;  
    if(tid<=size)
    {
        //for bid prices
        bool ipred_bid = (bidPrice[tid] > bidPrice[tid-1]);
        bool ipred_bid2 = (bidPrice[tid] < bidPrice[tid-1]);
        bool ipred_bid3 = (bidPrice[tid] == bidPrice[tid-1]);
        if (ipred_bid) 
        {
            d_bidPrice_direction[tid]=1;//UP is 1 ,DOWN is 0,SIDEWAY IS 2
        }
        if (ipred_bid2) 
        {
            d_bidPrice_direction[tid]=0;//UP is 1 ,DOWN is 0,SIDEWAY IS 2       
        }
        if (ipred_bid3) 
        {
            d_bidPrice_direction[tid]=2;//UP is 1 ,DOWN is 0,SIDEWAY IS 2       
        }   
    
        //for ask prices
        bool ipred_ask = (askPrice[tid] > askPrice[tid-1]);
        bool ipred_ask2 = (askPrice[tid] < askPrice[tid-1]);
        bool ipred_ask3 = (askPrice[tid] == askPrice[tid-1]);
        if (ipred_ask) 
        {
            d_askPrice_direction[tid]=1;//UP is 1 ,DOWN is 0,SIDEWAY IS 2
        }
        if (ipred_ask2) 
        {
            d_askPrice_direction[tid]=0;//UP is 1 ,DOWN is 0,SIDEWAY IS 2       
        }
        if (ipred_ask3) 
        {
            d_askPrice_direction[tid]=2;//UP is 1 ,DOWN is 0,SIDEWAY IS 2       
        }
    }
    __syncthreads();
}

//2nd kernel converts bid and ask price direction data to number of ticks in each direction step_zero"step zero means that any direction with one tick is represented by 0 and direction with more than one tick has its first tick 0 then each tick is 1"
__device__ bool d_iteration=false;

__global__ void bid_ask_direction_data_num_ticks_step_zero(int *d_bidPrice_direction,int *d_askPrice_direction,int *d_bidPrice_num_ticks_step_0,int *d_askPrice_num_ticks_step_0)
{
    /////////////take care we did not make sideway condition yet////////////

    //the following line is to make sure that we start dealing with arrays from array[1] with tid=1 as tid-1=0 .Then we make specefic code for tid=0 
    int tid = ((blockIdx.x * blockDim.x) + threadIdx.x)+1;
    
    //the next line is used to record the status of iteration of the next code to calculate the number of ticks.this variable will be set by all threads then we use iteration_count to determine the number of times of iteration which is used in calculating number of ticks
    //d_iteration=false;
    
    //the following line is to make sure that we start with tid=1 as tid-1=0 .Then we make specefic code for tid=0 
    //if (tid!=0)
    //{
        //now convert up,down,sideway to number of ticks in each monowave
        bool ipred_bidPrice_direction_ticks = ((d_bidPrice_direction[tid] == d_bidPrice_direction[tid-1])|| (d_bidPrice_direction[tid]==2));//here we try to start manipulating sideway???????????????????????????????????????????
        if (ipred_bidPrice_direction_ticks) 
        {
            d_bidPrice_num_ticks_step_0[tid]=1;
            d_iteration=true;
        }
        if (!ipred_bidPrice_direction_ticks) 
        {
            d_bidPrice_num_ticks_step_0[tid]=0;
        }
        
        //now convert up,down,sideway to number of ticks in each monowave
        bool ipred_askPrice_direction_ticks = (d_askPrice_direction[tid] == d_askPrice_direction[tid-1]);
        if (ipred_askPrice_direction_ticks) 
        {
            d_askPrice_num_ticks_step_0[tid]=1;
            d_iteration=true;
        }
        if (!ipred_askPrice_direction_ticks) 
        {
            d_askPrice_num_ticks_step_0[tid]=0;
        }
        __syncthreads();
    //}
}

//3rd kernel converts bid and ask number of ticks in each direction step_zero"step zero means that any direction with one tick is represented by 0 and direction with more than one tick has its first tick 0 then each tick is 1"
__device__ int d_iteration_count=0;
__global__ void bid_ask_num_ticks_step_zero_further_steps(int *d_bidPrice_num_ticks_step_0, int *d_askPrice_num_ticks_step_0, int *d_intermediate)
{
    int tid = ((blockIdx.x * blockDim.x) + threadIdx.x)+1;

    //bool ipred_bidPrice_num_ticks_step_0 = d_bidPrice_num_ticks_step_0[tid];
    if (d_bidPrice_num_ticks_step_0[tid] == 0) 
    {
        d_intermediate[tid]=0;
        //d_iteration=true;
    }
    else if (d_bidPrice_num_ticks_step_0[tid] != 0) 
    {
        if ((d_bidPrice_num_ticks_step_0[tid] == 1) && (d_bidPrice_num_ticks_step_0[tid-1] == 0)) 
        {
            d_intermediate[tid]=1;
            //d_iteration=true;
        }
        else if (d_bidPrice_num_ticks_step_0[tid] == d_bidPrice_num_ticks_step_0[tid-1]) 
        {
            d_intermediate[tid]=((d_bidPrice_num_ticks_step_0[tid-1] + d_bidPrice_num_ticks_step_0[tid])-(d_iteration_count - 1));
        }
    }
    __syncthreads();

    //bool ipred_askPrice_num_ticks_step_0 = d_askPrice_num_ticks_step_0[tid];
    if (d_askPrice_num_ticks_step_0[tid] == 0) 
    {
        d_intermediate[tid]=0;
        //d_iteration=true;
    }
    else if (d_askPrice_num_ticks_step_0[tid] != 0) 
    {
        if ((d_askPrice_num_ticks_step_0[tid] == 1) && (d_askPrice_num_ticks_step_0[tid-1] == 0)) 
        {
            d_intermediate[tid]=1;
            //d_iteration=true;
        }
        else if (d_askPrice_num_ticks_step_0[tid] == d_askPrice_num_ticks_step_0[tid-1]) 
        {

            d_intermediate[tid]=((d_askPrice_num_ticks_step_0[tid-1] + d_askPrice_num_ticks_step_0[tid])-(d_iteration_count - 1));
        }
    }
    __syncthreads();
}

int main()
{  
    //FIRST we get the data from csv file to work with it
    std::string path ="GBPJPY_2020_08_30_.csv";
    std::ifstream data(path);
    std::string line;

    // Declare data storage  
    std::vector<long long> tickTime;
    std::vector<double> bidPrice;
    std::vector<double> askPrice;
    std::vector<double> bidVolume;
    std::vector<double> askVolume;
    
    int lineCounter={0};
    while(std::getline(data,line))
    {
        int cellCounter={0};

        std::stringstream lineStream(line);
        std::string cell;
                
        while(std::getline(lineStream,cell,','))
        {            
            switch (cellCounter)  
            {
                case 0:
                    // code to be executed if
                    // expression is equal to constant1;
                    //tickTime.push_back(::atof(cell.c_str());std::stod(s);
                    tickTime.push_back(std::stoll(cell));

                    break;
                case 1:
                    // code to be executed if
                    // expression is equal to constant1;
                    //tickTime.push_back(::atof(cell.c_str());std::stod(s);
                    //tickTime.push_back(std::stod(cell));

                    break;
                case 2:
                    // code to be executed if
                    // expression is equal to constant2;
                    //bidPrice.push_back(::atof(cell.c_str());
                    bidPrice.push_back(std::stod(cell));

                    break;

                case 3:
                    // code to be executed if
                    // expression is equal to constant1;
                    //askPrice.push_back(::atof(cell.c_str());
                    bidVolume.push_back(std::stod(cell));


                    break;

                case 4:
                    // code to be executed if
                    // expression is equal to constant2;
                    //bidVolume.push_back(::atof(cell.c_str());
                    askPrice.push_back(std::stod(cell));

                    break;
                case 5:
                    // code to be executed if
                    // expression is equal to constant2;
                    //askVolume.push_back(::atof(cell.c_str());
                    askVolume.push_back(std::stod(cell));

                    break;
                default:
                    // code to be executed if
                    // expression doesn't match any constant
                    throw;
            }

            //increment cellCounter at end so first element is 0 not 1
            ++cellCounter;
        }       
        ++lineCounter;
    }
    
    //SECOND start the helper function which is the main target of this program
    // convert csv file to monowave file.
    cudaError_t cudaStatus = convert_csv_file_monowave_numbers(//tickTime,
        bidPrice,// bidVolume,
        askPrice//,askVolume);
        );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "convert_csv_file_monowave_numbers failed!");
        return 1;
    }

    //THIRD calculate monowaves serially on CPU

    
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t convert_csv_file_monowave_numbers(std::vector<double>& bidPrice,    
    std::vector<double>& askPrice)
{
    //these pointers are used to store csv data on gpu   
    double *d_bidPrice=0;   
    double *d_askPrice=0;    
    
    //these pointers are used to store results of converting csv data to monowave requirements      
    int *d_bidPrice_direction=0;   
    int *d_askPrice_direction=0;
    int *d_bidPrice_num_ticks_step_0=0;   
    int *d_askPrice_num_ticks_step_0=0; 
    
    int *d_intermediate_1=0;    
    int *d_intermediate_2=0; 
    cudaError_t cudaStatus;

    //these pointers will be used to test output
    int *h_bidPrice_direction=0;   
    int *h_askPrice_direction=0;
    int *h_bidPrice_num_ticks_step_0=0;   
    int *h_askPrice_num_ticks_step_0=0;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for two vectors (two inputs)    .
    
    cudaStatus = cudaMalloc((void**)&d_bidPrice, bidPrice.size()*sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_askPrice, askPrice.size()*sizeof(double));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    

    // Allocate GPU buffers for four resulting arrays (four output)    .
    
    cudaStatus = cudaMalloc((void**)&d_bidPrice_direction, bidPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_askPrice_direction, askPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_bidPrice_num_ticks_step_0, bidPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_askPrice_num_ticks_step_0, askPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    

    // Allocate GPU buffers for two intra kernel iteration storing arrays (two intermediate buffers )    .
    cudaStatus = cudaMalloc((void**)&d_intermediate_1, bidPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&d_intermediate_2, bidPrice.size()*sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.  
    cudaStatus = cudaMemcpy(d_bidPrice, bidPrice.data(), bidPrice.size()*sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(d_askPrice, askPrice.data(), askPrice.size()*sizeof(double), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }   
    
    // set up data size
    int size = bidPrice.size();
    int blocksize = 32;
    // set up execution configuration
    dim3 block (blocksize,1);
    dim3 grid ((size+block.x-1)/block.x,1);
    printf("Execution Configure (block %d grid %d)\n",block.x, grid.x);
        
    //launch first kernel
    bid_ask_price_data_direction_data<<<grid, block>>>(bidPrice.data(),askPrice.data() , d_bidPrice_direction,d_askPrice_direction,size);
    ////////////////////////////////////////////////////////////////////////////
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "bid_ask_price_data_direction_data launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching bid_ask_price_data_direction_data!\n", cudaStatus);
        goto Error;
    }   

    // Copy output vector from GPU buffer to host memory.   
    cudaStatus = cudaMemcpy(h_bidPrice_direction, d_bidPrice_direction, bidPrice.size() * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(h_askPrice_direction, d_askPrice_direction, askPrice.size() * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    //show the bid and ask prices and Price_direction
    for(int i=0;i < askPrice.size();i++)
    {
        std::cout << bidPrice.data()[i] <<h_bidPrice_direction[i] << askPrice.data()[i] << h_askPrice_direction[i] << std::endl;
    }
    
    //launch 2nd kernel
    bid_ask_direction_data_num_ticks_step_zero<<<grid, block>>>(d_bidPrice_direction,d_askPrice_direction,d_bidPrice_num_ticks_step_0,d_askPrice_num_ticks_step_0);
    ////////////////////////////////////////////////////////////////////////////
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "bid_ask_direction_data_num_ticks_step_zero launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching bid_ask_direction_data_num_ticks_step_zero!\n", cudaStatus);
        goto Error;
    }
    /////////////////////////////////////////////////////////////////////////////
    
    // Copy output vector from GPU buffer to host memory.    
    cudaStatus = cudaMemcpy(h_bidPrice_num_ticks_step_0, d_bidPrice_num_ticks_step_0, bidPrice.size() * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(h_askPrice_num_ticks_step_0, h_askPrice_num_ticks_step_0, askPrice.size() * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    
    //prepare for 3rd kernel
    bool h_iteration=false;
    int h_iteration_count=0;
    cudaStatus = cudaMemcpyFromSymbol(&h_iteration, "d_iteration", sizeof(h_iteration), 0, cudaMemcpyDeviceToHost);
     if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyFromSymbol failed!");
        goto Error;
    }
    printf("iteration: %d\n", h_iteration);
    
    //determine from value of h_iteration if we will make loop which launch kernel to convert bid and ask number of ticks_step_0 to further steps to calculate total number of ticks in each monowave
    if(h_iteration)
    {
        //this indicate that iteration is true and we need to start kernel to convert step_0 to further steps
        //h_iteration_count++;
        while(h_iteration)
        {
            h_iteration_count++;

            //launch 3rd kernel
            cudaMemcpyFromSymbol(&d_iteration_count, "h_iteration_count", sizeof(h_iteration_count), 0, cudaMemcpyHostToDevice);
            //we need to determine if iteration count is odd or even, to determine which d_intermediate will be used
            if(h_iteration_count % 2 !=0)
            {
                //iteration count is odd. So we use d_intermediate_1 

            }
            else if(h_iteration_count % 2 ==0)
            {
                //iteration count is even. So we use d_intermediate_2

            }
        }

    }
    else if(!h_iteration)
    {

    }
    ////////////////////////////////////////////////////////////////////////////
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "bid_ask_price_data_direction_data launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching bid_ask_price_data_direction_data!\n", cudaStatus);
        goto Error;
    }
    /////////////////////////////////////////////////////////////////////////////
    /*
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    */
    /////////////////////////////////////////////////////////////////////////////
Error:
    
    cudaFree(d_bidPrice);
    cudaFree(d_askPrice);
   
    cudaFree(d_bidPrice_direction);
    cudaFree(d_askPrice_direction);
    
    cudaFree(d_bidPrice_num_ticks_step_0);
    cudaFree(d_askPrice_num_ticks_step_0);

    cudaFree(d_intermediate_1);
    cudaFree(d_intermediate_2);


    cudaDeviceReset();

    return cudaStatus;
}

我做错了什么？我该如何调试这段代码？

Answer 1

我发现了错误。我用主机向量指针而不是设备指针启动内核。

当 cudaDeviceSynchronize() 出现错误 30 时？

when cudaDeviceSynchronize(), I get error 30?

cuda