memory location error: thrust::stable_sort when using big array and user-defined comparison operator

I am running this code, which uses thrust stable_sort with a user-defined comparison operator to sort a large number of IPs. The code works for arrays of fewer than 50000 IPs, but I get a memory error for larger arrays. Here is the code I am using:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <device_functions.h>
template<typename T>
struct vector_less
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;
    __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
        if (lhs[0] == rhs[0])
            if (lhs[1] == rhs[1])
                if (lhs[2] == rhs[2])
                    return lhs[3] < rhs[3];
                else
                    return lhs[2] < rhs[2];
            else
                return lhs[1] < rhs[1];
        else
            return lhs[0] < rhs[0];
    }
}; 

__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        dev_sorted_Ips[thread] = new unsigned char[4];
        dev_sorted_Ips[thread][0] = ip_b1[thread];
        dev_sorted_Ips[thread][1] = ip_b2[thread];
        dev_sorted_Ips[thread][2] = ip_b3[thread];
        dev_sorted_Ips[thread][3] = ip_b4[thread];
    }

}


int main()
{
    const int size = 1000000;

    unsigned char * ip_b1 = new unsigned char[size];
    unsigned char * ip_b2 = new unsigned char[size];
    unsigned char * ip_b3 = new unsigned char[size];
    unsigned char * ip_b4 = new unsigned char[size];

    unsigned char * dev_ip_b1;
    unsigned char * dev_ip_b2;
    unsigned char * dev_ip_b3;
    unsigned char * dev_ip_b4;

    unsigned char ** dev_sortedIps;

    for (int i = 0; i < size; i++)
    {
        ip_b1[i] = rand() % 240;
        ip_b2[i] = rand() % 240;
        ip_b3[i] = rand() % 240;
        ip_b4[i] = rand() % 240;
    }

    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    int resetThreads = size;
    int resetBlocks = 1;
    if (size > 1024)
    {
        resetThreads = 1024;
        resetBlocks = size / 1024;
        if (size % 1024 > 0)
            resetBlocks++;
    }

    prepare_ips_list<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);



    thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
    thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
        goto Error;
    }

    return 0;

Error:
    cudaFree(dev_ip_b1);
    cudaFree(dev_ip_b2);
    cudaFree(dev_ip_b3);
    cudaFree(dev_ip_b4);
    cudaFree(dev_sortedIps);
}

The error I get is: Microsoft C++ exception: thrust::system::system_error at memory location

How can this be solved for large arrays? Should I use another technique to implement this sort, such as dividing into parts, sorting the parts, and then merging?

The proximal issue is that the size of the device heap available for in-kernel allocations with new or malloc is limited. This limit can be raised. Please read the documentation.
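
For reference, the limit is raised with cudaDeviceSetLimit, which must be called before any kernel that uses in-kernel new/malloc has run. A minimal sketch (the 128 MB figure is only an illustrative assumption; size it to your actual allocation needs):

size_t heapBytes = 128 * 1024 * 1024;   // example only: 128 MB device heap
cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);
if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(err));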

Some other suggestions:

  1. You are not doing any error checking after your kernel (before the first thrust call). You should do error checking on the kernel; then you would discover that it is your kernel that is failing, and thrust is merely reporting the error for you. That avoids confusion. Any time you are having trouble with a CUDA code, do rigorous, proper cuda error checking (a minimal sketch follows this list).

  2. As good practice, at least for debugging purposes, it's not a bad idea to test whether any pointer returned by new or malloc is NULL. This is how the API informs you that an allocation failure occurred.
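
A minimal sketch of the error checking in point 1, using a small helper macro of my own naming (gpuErrchk is an assumption, not part of the original code):

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit((int)code);
    }
}

// usage: after the kernel launch and before the first thrust call
prepare_ips_list<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);
gpuErrchk(cudaGetLastError());        // catches launch/configuration errors
gpuErrchk(cudaDeviceSynchronize());   // catches errors raised while the kernel runs

Point 2 (testing the pointer returned by in-kernel new for NULL) is shown in the demo code below via the assert(0) line in prepare_ips_list.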

The code below demonstrates a possible workaround for the proximal issue by scaling the device heap to the input size. It also demonstrates possible ways to address the other two suggestions:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <device_functions.h>
#include <assert.h>

template<typename T>
struct vector_less
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;
    __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {
        if (lhs[0] == rhs[0])
            if (lhs[1] == rhs[1])
                if (lhs[2] == rhs[2])
                    return lhs[3] < rhs[3];
                else
                    return lhs[2] < rhs[2];
            else
                return lhs[1] < rhs[1];
        else
            return lhs[0] < rhs[0];
    }
};

__global__ void prepare_ips_list(unsigned char ** dev_sorted_Ips, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        dev_sorted_Ips[thread] = new unsigned char[4];
        if (dev_sorted_Ips[thread] == NULL) assert(0);
        dev_sorted_Ips[thread][0] = ip_b1[thread];
        dev_sorted_Ips[thread][1] = ip_b2[thread];
        dev_sorted_Ips[thread][2] = ip_b3[thread];
        dev_sorted_Ips[thread][3] = ip_b4[thread];
    }

}


int main(int argc, char *argv[])
{

    int size = 50000;
    if (argc > 1) size = atoi(argv[1]);
    int chunks = size/50000 + 1;
    cudaError_t cudaStatus;
    cudaStatus = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 8000000 * chunks);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "set device heap limit failed!");
    }
    unsigned char * ip_b1 = new unsigned char[size];
    unsigned char * ip_b2 = new unsigned char[size];
    unsigned char * ip_b3 = new unsigned char[size];
    unsigned char * ip_b4 = new unsigned char[size];

    unsigned char * dev_ip_b1;
    unsigned char * dev_ip_b2;
    unsigned char * dev_ip_b3;
    unsigned char * dev_ip_b4;

    unsigned char ** dev_sortedIps;

    for (int i = 0; i < size; i++)
    {
        ip_b1[i] = rand() % 240;
        ip_b2[i] = rand() % 240;
        ip_b3[i] = rand() % 240;
        ip_b4[i] = rand() % 240;
    }

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    }

    cudaStatus = cudaMalloc((void**)&dev_ip_b1, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b1, ip_b1, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b2, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b2, ip_b2, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b3, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b3, ip_b3, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaStatus = cudaMalloc((void**)&dev_ip_b4, size * sizeof(unsigned char));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMemcpy(dev_ip_b4, ip_b4, size * sizeof(unsigned char), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }

    cudaStatus = cudaMalloc((void**)&dev_sortedIps, size * sizeof(unsigned char *));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    int resetThreads = size;
    int resetBlocks = 1;
    if (size > 1024)
    {
        resetThreads = 1024;
        resetBlocks = size / 1024;
        if (size % 1024 > 0)
            resetBlocks++;
    }

    prepare_ips_list<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);

    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf(" kernel fail\n");
        exit(0);
    }

    thrust::device_ptr<unsigned char *> sorted_list_ptr1(dev_sortedIps);
    thrust::stable_sort(sorted_list_ptr1, sorted_list_ptr1 + size, vector_less<unsigned char *>());

    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching !\n", cudaStatus);
    }

    return 0;

}

Note that you can test various sizes by passing the desired size as a command-line parameter. I tested up to 1000000 and it seemed to work fine. Eventually, for a large enough problem, you will run out of GPU memory. You haven't indicated which GPU you have.

I've removed the goto statements because I was working on linux (you've apparently switched back to windows). I would suggest coming up with an error handling flow other than goto, if for no other reason than that goto creates difficulties with thrust constructs.

Also note that in-kernel new or malloc is somewhat "slow". You could probably speed this up considerably for larger sizes by doing the necessary allocation up front, with a single cudaMalloc call of the appropriate size. Unfortunately this is complicated by your use of the double-pointer array dev_sorted_Ips. I would suggest flattening it into a single-pointer array, allocating the necessary size once via cudaMalloc, and doing the necessary array index arithmetic in the kernel to make it work (sketched below). If you profile this code, you will find that for the larger cases (e.g. size = 1000000) the vast majority of the execution time is consumed by the prepare_ips_list kernel, not the sort operation. So that is where to focus your efforts at performance improvement.
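
A minimal sketch of that flattening idea, keeping the pointer array so the existing vector_less comparator and the thrust::stable_sort call are unchanged (the names prepare_ips_list_flat and dev_flat are my own):

__global__ void prepare_ips_list_flat(unsigned char ** dev_sorted_Ips, unsigned char * dev_flat, unsigned char * ip_b1, unsigned char * ip_b2, unsigned char * ip_b3, unsigned char * ip_b4, unsigned int searchedIpsSize)
{
    int thread = threadIdx.x + blockIdx.x * blockDim.x;
    if (thread < searchedIpsSize)
    {
        // each IP occupies 4 consecutive bytes of one large, pre-allocated buffer,
        // so no per-thread in-kernel new is required
        unsigned char * row = dev_flat + 4 * thread;
        row[0] = ip_b1[thread];
        row[1] = ip_b2[thread];
        row[2] = ip_b3[thread];
        row[3] = ip_b4[thread];
        dev_sorted_Ips[thread] = row;   // the pointer sort works exactly as before
    }
}

// host side: one up-front allocation replaces 'size' in-kernel allocations
unsigned char * dev_flat;
cudaStatus = cudaMalloc((void**)&dev_flat, size * 4 * sizeof(unsigned char));
prepare_ips_list_flat<<<resetBlocks, resetThreads>>>(dev_sortedIps, dev_flat, dev_ip_b1, dev_ip_b2, dev_ip_b3, dev_ip_b4, size);

With this arrangement the cudaDeviceSetLimit call is no longer needed, since nothing allocates from the in-kernel heap.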