没有 for 循环的 CUDA 数组过滤内核

Question

我有一个大数组 A，有 size_A 行和 6 列。我将检查每一行的第三个元素，如果它不为零，则将该行复制到另一个数组 B。我可以在不使用 for 循环的情况下获得 B 条目的索引吗，请看下面的代码？

我可能需要以某种方式定义 b_ptr 以使其成为静态的（类似于我们在 C 中所拥有的），但我认为这是不允许的。

__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
    /*B and size_B are the outputs*/
    int b_ptr = 0;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x > size_A) return;
    for (int i = 0; i < size_A; i++)
    {
        if (A[x + 3] != 0)
       {
            B[b_ptr] = A[x + 0];
            B[b_ptr + 1] = A[x + 1];
            B[b_ptr + 2] = A[x + 2];
            B[b_ptr + 3] = A[x + 3];
            B[b_ptr + 4] = A[x + 4];
            B[b_ptr + 5] = A[x + 5];
            b_ptr += 6;
            *size_B = *size_B + 1;
        }
    }
}

Answer 1

诀窍是启动与数组中的元素一样多的线程。如果我们假设 tid（从您的 x 重命名）的范围从 0 到 size_A * 6，那么我们可以完全删除循环。我们确实需要首先确定必须复制哪些行，因此引入了共享数组filter。假设您可以将 int[size_A] 放入单个块的内存中，并拥有与条目一样多的线程，您可以使用以下代码，并提示如果 size_A 足够大需要多个线程，您可以如何执行此操作块。

__global__ void filtering_kernel(float *A, const int size_A, const int W,
                                   float *B, int *size_B) {
    // We use this to store whether a given row is filtered,
    // and then scan this array to tell us how densely packed B is.
    extern __shared__ int filter[];
    
    // Assuming 1 block
    const int tid = threadIdx.x;
    const int offset = 0;
    // Multiblock difference
    // tid = threadIdx.x
    // offset = blockIdx.x * blockDim.x;
    
    // Guard to ensure we are not out of range
    if (offset + tid >= size_A * W)
      return;
    const int row = tid / W;
    const int col = tid % W;
    // NOTE: You have 3 in your sample code, but the third column is 2
    const int mid = (W - 1)/2;

    // Dedicate one thread per row to check
    // whether we should filter
    if (tid < size_A) {
      // A boolean will be either 1 or 0
      // Whatever filter criterion you want.
      filter[tid] = A[offset + tid * W + mid] == 0;
    }

    // We then need to run a scan to get the cumulative sum
    // of the filtered with a dedicated thread. If we consider
    // good rows (g) and bad rows (b), for gggbbggbbggg we expect
    // 1,2,3,3,3,4,5,5,5,6,7,8
    for (int i = 1; i < size_A; i <<= 1) {
      if (tid < size_A && tid >= i) {
        filter[tid] += filter[tid - i];
      }
      __syncthreads();
    }
    __syncthreads();

    // We should then only copy if the cumulative sum increases
    // And handle for the case of the first row
    // Note: If you are thread limited, you can do multiple copies here.
    if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
      B[offset + W * (filter[row] - 1) + col] = A[tid];
    }
    // Also set the expected size for B
    if (tid == 0) {
      *size_B = filter[size_A - 1];
      printf("size_B %d\n", *size_B);
      // Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
    }
    // TODO: For multiple blocks, we still need to densely pack B. (see below)
}

继续：按原样，filtered 需要在内核中共享，因此这仅适用于单个块。对于多个块，我会过滤每个块的一部分 B（即保留上面的代码，更改我注意到的地方），记录过滤了多少 size_B 现在是一个数组，累加和 size_B，然后就地复制 B 使其更密集（或使用 size_B 从设备下载每个部分的密集部分）。

根据评论，调用代码：

int example(const float *arr, const size_t size_A, const size_t W ) {
    float *d_A;
    float *d_B;
    cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
    cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
    cudaMemset(d_B, 0, size_A * W * sizeof(float));
    int *size_B;
    cudaMalloc((void **)&size_B, sizeof(int));


    cudaMemset(size_B, 0, sizeof(int));
    cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
    filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
                                                              size_B);
    cudaDeviceSynchronize();
    printf("Error %s \n", cudaGetLastError());

    int result;
    cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
    printf("Error %s \n", cudaGetLastError());

    return result;
}

然后我们可以使用 GTEST 测试：

TEST(FILTER, ROW6) {
    size_t size_A = 100;
    size_t W = 6;
    float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
    int expected = 0;
    for (int i = 0; i < size_A * W; i++) {
      arr[i] = i % 4;
      if (i % W == 2 && arr[i] == 0)
        expected++;
    }
    printf("Expected: %d\n", expected);

    const int result = drt::example(arr, size_A, W);
    ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}

Answer 2

这个问题很复杂，无法用 CUDA 一步完成，您无法搜索所需的行并将它们放入数组中 B 希望它们的顺序正确，因为 CUDA 内核不一定按顺序检查行。但是，有一个多步骤解决方案可以解决问题。首先，您将运行一个内核，它将在第三列中定位零，顺便说一句，其索引是 2 而不是 3，然后在数组 [=13= 中用 1 的值标记这些行].之后，一个简单的 for 循环将计算这些位置并将它们存储在另一个数组 Ind 中。最后，第二个内核会将所需的行从 A 复制到 B.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>

__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);

int main()
{
    int i, size_A, size_B;
    size_t size;
    int* P, * d_P, * Ind, * d_I;
    float* A, * d_A, * B, * d_B;
    size_A = ..;  // specify number of rows of A
    A = new float[size_A * 6];
    // input values of array A
    ...
    P = new int[size_A];
    for (i = 0; i < size_A; i++)
        P[i] = 0;
    size = (uint64_t)size_A * 6 * sizeof(float);
    cudaMalloc(&d_A, size);
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    size = (uint64_t)size_A * sizeof(int);
    cudaMalloc(&d_P, size);
    cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
    get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
    cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
    size_B = 0;
    for (i = 0; i < size_A; i++)
        if (P[i] == 1)
            Ind[size_B++] = i;
    Ind = new int[size_A];
    size = (uint64_t)size_B * sizeof(int);
    cudaMalloc(&d_I, size);
    cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
    B = new float[size_B * 6];
    size = (uint64_t)size_B * 6 * sizeof(float);
    cudaMalloc(&d_B, size);
    dim3 dimBlock(170, 6);  // to copy the full row at the same time, 6 * 170 < 1024
    dim3 dimGrid((int)ceil(size_B / 170.0), 1);
    filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
    cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}

__global__ void get_indeces(float* A, int* P, int size_A)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < size_A && A[x * 6 + 2] == 0)   // if you want to use return, it should be "if (x >= size_A) return;"
        P[x] = 1;
}

__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
    int i;
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = threadIdx.y;
    if (x < size_B)
        B[x * 6 + y] = A[Ind[x] * 6 + y];
}

没有 for 循环的 CUDA 数组过滤内核

CUDA array filtering kernel without a for loop

c++

cuda