没有 for 循环的 CUDA 数组过滤内核
CUDA array filtering kernel without a for loop
我有一个大数组 A
,有 size_A
行和 6 列。我将检查每一行的第三个元素,如果它不为零,则将该行复制到另一个数组 B
。我可以在不使用 for
循环的情况下获得 B
条目的索引吗,请看下面的代码?
我可能需要以某种方式定义 b_ptr
以使其成为静态的(类似于我们在 C 中所拥有的),但我认为这是不允许的。
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
诀窍是启动与数组中的元素一样多的线程。如果我们假设 tid
(从您的 x
重命名)的范围从 0
到 size_A * 6
,那么我们可以完全删除循环。我们确实需要首先确定必须复制哪些行,因此引入了共享数组filter
。假设您可以将 int[size_A]
放入单个块的内存中,并拥有与条目一样多的线程,您可以使用以下代码,并提示如果 size_A
足够大需要多个线程,您可以如何执行此操作块。
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
继续:按原样,filtered
需要在内核中共享,因此这仅适用于单个块。对于多个块,我会过滤每个块的一部分 B(即保留上面的代码,更改我注意到的地方),记录过滤了多少 size_B
现在是一个数组,累加和 size_B
,然后就地复制 B 使其更密集(或使用 size_B
从设备下载每个部分的密集部分)。
根据评论,调用代码:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
然后我们可以使用 GTEST 测试:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
这个问题很复杂,无法用 CUDA 一步完成,您无法搜索所需的行并将它们放入数组中 B
希望它们的顺序正确,因为 CUDA 内核不一定按顺序检查行。但是,有一个多步骤解决方案可以解决问题。首先,您将 运行 一个内核,它将在第三列中定位零,顺便说一句,其索引是 2 而不是 3,然后在数组 [=13= 中用 1
的值标记这些行].之后,一个简单的 for
循环将计算这些位置并将它们存储在另一个数组 Ind
中。最后,第二个内核会将所需的行从 A
复制到 B
.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}
我有一个大数组 A
,有 size_A
行和 6 列。我将检查每一行的第三个元素,如果它不为零,则将该行复制到另一个数组 B
。我可以在不使用 for
循环的情况下获得 B
条目的索引吗,请看下面的代码?
我可能需要以某种方式定义 b_ptr
以使其成为静态的(类似于我们在 C 中所拥有的),但我认为这是不允许的。
__global__ void filtering_kernel(float* A, int size_A, float* B, float* size_B)
{
/*B and size_B are the outputs*/
int b_ptr = 0;
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x > size_A) return;
for (int i = 0; i < size_A; i++)
{
if (A[x + 3] != 0)
{
B[b_ptr] = A[x + 0];
B[b_ptr + 1] = A[x + 1];
B[b_ptr + 2] = A[x + 2];
B[b_ptr + 3] = A[x + 3];
B[b_ptr + 4] = A[x + 4];
B[b_ptr + 5] = A[x + 5];
b_ptr += 6;
*size_B = *size_B + 1;
}
}
}
诀窍是启动与数组中的元素一样多的线程。如果我们假设 tid
(从您的 x
重命名)的范围从 0
到 size_A * 6
,那么我们可以完全删除循环。我们确实需要首先确定必须复制哪些行,因此引入了共享数组filter
。假设您可以将 int[size_A]
放入单个块的内存中,并拥有与条目一样多的线程,您可以使用以下代码,并提示如果 size_A
足够大需要多个线程,您可以如何执行此操作块。
__global__ void filtering_kernel(float *A, const int size_A, const int W,
float *B, int *size_B) {
// We use this to store whether a given row is filtered,
// and then scan this array to tell us how densely packed B is.
extern __shared__ int filter[];
// Assuming 1 block
const int tid = threadIdx.x;
const int offset = 0;
// Multiblock difference
// tid = threadIdx.x
// offset = blockIdx.x * blockDim.x;
// Guard to ensure we are not out of range
if (offset + tid >= size_A * W)
return;
const int row = tid / W;
const int col = tid % W;
// NOTE: You have 3 in your sample code, but the third column is 2
const int mid = (W - 1)/2;
// Dedicate one thread per row to check
// whether we should filter
if (tid < size_A) {
// A boolean will be either 1 or 0
// Whatever filter criterion you want.
filter[tid] = A[offset + tid * W + mid] == 0;
}
// We then need to run a scan to get the cumulative sum
// of the filtered with a dedicated thread. If we consider
// good rows (g) and bad rows (b), for gggbbggbbggg we expect
// 1,2,3,3,3,4,5,5,5,6,7,8
for (int i = 1; i < size_A; i <<= 1) {
if (tid < size_A && tid >= i) {
filter[tid] += filter[tid - i];
}
__syncthreads();
}
__syncthreads();
// We should then only copy if the cumulative sum increases
// And handle for the case of the first row
// Note: If you are thread limited, you can do multiple copies here.
if ((row == 0 && filter[row]) || (row > 0 && filter[row] > filter[row - 1])) {
B[offset + W * (filter[row] - 1) + col] = A[tid];
}
// Also set the expected size for B
if (tid == 0) {
*size_B = filter[size_A - 1];
printf("size_B %d\n", *size_B);
// Multiple blocks: size_B[blockIdx.x] = filtered[size_A - 1];
}
// TODO: For multiple blocks, we still need to densely pack B. (see below)
}
继续:按原样,filtered
需要在内核中共享,因此这仅适用于单个块。对于多个块,我会过滤每个块的一部分 B(即保留上面的代码,更改我注意到的地方),记录过滤了多少 size_B
现在是一个数组,累加和 size_B
,然后就地复制 B 使其更密集(或使用 size_B
从设备下载每个部分的密集部分)。
根据评论,调用代码:
int example(const float *arr, const size_t size_A, const size_t W ) {
float *d_A;
float *d_B;
cudaMalloc((void **)&d_A, size_A * W * sizeof(float));
cudaMalloc((void **)&d_B, size_A * W * sizeof(float));
cudaMemset(d_B, 0, size_A * W * sizeof(float));
int *size_B;
cudaMalloc((void **)&size_B, sizeof(int));
cudaMemset(size_B, 0, sizeof(int));
cudaMemcpy(d_A, arr, size_A * W * sizeof(float), cudaMemcpyHostToDevice);
filtering_kernel<<<1, W * size_A, size_A * sizeof(int)>>>(d_A, size_A, W, d_B,
size_B);
cudaDeviceSynchronize();
printf("Error %s \n", cudaGetLastError());
int result;
cudaMemcpy(&result, size_B, sizeof(int), cudaMemcpyDeviceToHost);
printf("Error %s \n", cudaGetLastError());
return result;
}
然后我们可以使用 GTEST 测试:
TEST(FILTER, ROW6) {
size_t size_A = 100;
size_t W = 6;
float *arr = (float *)malloc(sizeof(float) * size_A * W); // initialize arr
int expected = 0;
for (int i = 0; i < size_A * W; i++) {
arr[i] = i % 4;
if (i % W == 2 && arr[i] == 0)
expected++;
}
printf("Expected: %d\n", expected);
const int result = drt::example(arr, size_A, W);
ASSERT_EQ(result, expected) << "Filter Kernel does not work.";
}
这个问题很复杂,无法用 CUDA 一步完成,您无法搜索所需的行并将它们放入数组中 B
希望它们的顺序正确,因为 CUDA 内核不一定按顺序检查行。但是,有一个多步骤解决方案可以解决问题。首先,您将 运行 一个内核,它将在第三列中定位零,顺便说一句,其索引是 2 而不是 3,然后在数组 [=13= 中用 1
的值标记这些行].之后,一个简单的 for
循环将计算这些位置并将它们存储在另一个数组 Ind
中。最后,第二个内核会将所需的行从 A
复制到 B
.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
__global__ void get_indeces(float* A, int* P, int size_A);
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B);
int main()
{
int i, size_A, size_B;
size_t size;
int* P, * d_P, * Ind, * d_I;
float* A, * d_A, * B, * d_B;
size_A = ..; // specify number of rows of A
A = new float[size_A * 6];
// input values of array A
...
P = new int[size_A];
for (i = 0; i < size_A; i++)
P[i] = 0;
size = (uint64_t)size_A * 6 * sizeof(float);
cudaMalloc(&d_A, size);
cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
size = (uint64_t)size_A * sizeof(int);
cudaMalloc(&d_P, size);
cudaMemcpy(d_P, P, size, cudaMemcpyHostToDevice);
get_indeces<<<(int)ceil(size_A / 1024.0), 1024>>>(d_A, d_P, size_A);
cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);
size_B = 0;
for (i = 0; i < size_A; i++)
if (P[i] == 1)
Ind[size_B++] = i;
Ind = new int[size_A];
size = (uint64_t)size_B * sizeof(int);
cudaMalloc(&d_I, size);
cudaMemcpy(d_I, Ind, size, cudaMemcpyHostToDevice);
B = new float[size_B * 6];
size = (uint64_t)size_B * 6 * sizeof(float);
cudaMalloc(&d_B, size);
dim3 dimBlock(170, 6); // to copy the full row at the same time, 6 * 170 < 1024
dim3 dimGrid((int)ceil(size_B / 170.0), 1);
filtering_kernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_I, size_B);
cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost);
}
__global__ void get_indeces(float* A, int* P, int size_A)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < size_A && A[x * 6 + 2] == 0) // if you want to use return, it should be "if (x >= size_A) return;"
P[x] = 1;
}
__global__ void filtering_kernel(float* A, float* B, int* Ind, int size_B)
{
int i;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = threadIdx.y;
if (x < size_B)
B[x * 6 + y] = A[Ind[x] * 6 + y];
}