Row-wise/column-wise 使用 CUDA 对矩阵进行运算

Question

我对 CUDA 编程比较陌生。我已经了解其编程模型，并且已经编写了一些基本内核。我知道如何将内核应用于矩阵（存储为一维数组）的每个元素，但现在我想弄清楚如何对输入矩阵的每一行/每一列应用相同的操作。

假设我有一个 MxN 矩阵和一个长度为 N 的向量。我想将该向量加到矩阵的每一行（也可以是任何其他数学运算）。此类操作的串行（CPU）代码为：

// Serial (CPU) reference: for every column c and every row r, add V[c] to the
// matrix element addressed as M[r * rows + c].
// NOTE: the row stride used here is `rows`; the corrected statement shown
// under "Edit #2" of this question uses `columns` (M[r * columns + c]).
for (int c = 0; c < columns; c++) 
{
    for (int r = 0; r < rows; r++)
    {
        M[r * rows + c] += V[c];
    }
}

现在执行此操作的 CUDA 代码应该非常简单：我应该生成与元素一样多的 cuda 线程并应用此内核：

// Adds one entry of `vector` to each of the first `size` elements of `matrix`.
// Every thread handles the single flat element index computed from its block
// and thread coordinates; threads whose index is >= size do nothing.
__global__ void kernel(const unsigned int size, float* matrix, const float* vector)
{
    // get the current element index for the thread
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size)
    {
        // sum the current element with the vector entry selected by the
        // thread's position inside its block (threadIdx.x), not by idx
        matrix[idx] += vector[threadIdx.x];
    }
}

它可以运行，但结果不正确。实际上，如果我在内核完成工作后对矩阵进行转置，结果就是正确的。不幸的是，我不知道为什么会这样。你能帮我解决这个问题吗？提前致谢。

编辑 #1

我使用以下方式启动内核：

// 1D launch: 64 threads per block, with the block count rounded up so that
// the grid covers all M * N elements.
int block_size = 64;
int grid_size = (M * N + block_size - 1) / block_size;
kernel<<<grid_size, block_size>>>(M * N, matrix, vector);

编辑 #2

我按照@RobertCrovella 的建议通过修复 CPU 代码解决了问题：

// Corrected CPU indexing: the row index is scaled by `columns`.
M[r * columns + c] += V[c];

它应该匹配外部 for，即在列之上。

Answer 1

问题中显示的内核可以在不修改的情况下用于将向量与矩阵的每一行相加（假设是 c 风格的行优先存储），但要受到某些限制。一个演示是here。

该方法的主要限制是最大向量长度以及可处理的矩阵宽度等于每个块的最大线程数，在当前支持 CUDA 7 的 GPU 上为 1024。

我们可以通过对向量索引稍作修改，并将行宽（列数）作为参数传递给内核来消除该限制。经过此修改，我们应该能够处理任意大小的矩阵（和向量）。

编辑：根据讨论/评论，OP 想知道如何处理行优先或列优先的底层存储。以下示例使用模板化内核，通过模板参数 select 选择行优先或列优先的底层存储；同时还展示了一种可能的 CUBLAS 方法，使用 <T>ger（此处为 cublasSger）执行将向量加到矩阵每一行的操作：

$ cat t712.cu
#include <iostream>
#include <cublas_v2.h>

#define ROWS 20
#define COLS 10

#define nTPB 64

#define ROW_MAJOR 0
#define COL_MAJOR 1

// Adds vector[c] to every element in column c of a height x width matrix,
// i.e. adds the same length-`width` vector to each matrix row, in place.
//
// Template parameters:
//   select - ROW_MAJOR or COL_MAJOR: how `matrix` is laid out in memory.
//   T      - element type.
// Parameters:
//   height - number of matrix rows.
//   width  - number of matrix columns (and length of `vector`).
//   matrix - height*width elements, updated in place.
//   vector - `width` elements, only read.
// Launch: one thread per matrix element along x; threads whose flat index is
// past the last element do nothing.
template <int select, typename T>
__global__ void vec_mat_row_add(const unsigned int height, const unsigned int width, T* matrix, const T* vector)
{
    // Index arithmetic is done in size_t so that neither the flat thread index
    // nor height*width wraps around in 32-bit unsigned arithmetic.
    const size_t idx   = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t count = (size_t)height * width;
    if (idx < count)
    {
        if (select == ROW_MAJOR)
            matrix[idx] += vector[idx % width];   // row-major: column = idx mod width
        else // COL_MAJOR
            matrix[idx] += vector[idx / height];  // column-major: column = idx div height
    }
}

// Prints a ROWS x COLS host matrix one logical row per line, each value
// followed by a space. `col_major` selects whether element (i, j) is read from
// flat index j*ROWS+i (column-major) or i*COLS+j (row-major).
static void print_result(const float *h_mat, bool col_major)
{
  for (int i = 0; i < ROWS; i++){
    for (int j = 0; j < COLS; j++)
      std::cout << h_mat[col_major ? (j*ROWS+i) : (i*COLS+j)] << " ";
    std::cout << std::endl;}
}

// Demo driver: adds the vector 0,1,2,... to every row of a zeroed ROWS x COLS
// matrix three ways (row-major kernel, column-major kernel, cuBLAS Sger) and
// prints each result. Returns 0 on success, 1 if a cuBLAS call fails.
// CUDA runtime return codes are not checked here; every buffer and the cuBLAS
// handle are released before returning, on the failure path as well.
int main(){

  const unsigned int msz = ROWS*COLS*sizeof(float);   // matrix bytes
  const unsigned int vsz = COLS*sizeof(float);        // row-vector bytes
  const unsigned int osz = ROWS*sizeof(float);        // ones-vector bytes
  const unsigned int nblocks = (ROWS*COLS + nTPB -1)/nTPB;
  int rc = 0;

  float *h_mat  = (float *)malloc(msz);
  float *h_vec  = (float *)malloc(vsz);
  float *h_ones = (float *)malloc(osz);
  // Start from null so the cudaFree calls at the end are harmless even if an
  // allocation did not succeed.
  float *d_mat = 0, *d_vec = 0, *d_ones = 0;
  cudaMalloc(&d_mat, msz);
  cudaMalloc(&d_vec, vsz);
  cudaMalloc(&d_ones, osz);

  for (int i=0; i<COLS; i++) h_vec[i] = i; // set vector to 0,1,2, ...
  for (int i=0; i<ROWS; i++) h_ones[i] = 1.0f;
  cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice);
  cudaMemcpy(d_ones, h_ones, osz, cudaMemcpyHostToDevice);

  // test row-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<ROW_MAJOR><<<nblocks, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Row-major result: " << std::endl;
  print_result(h_mat, false);

  // test column-major case
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  vec_mat_row_add<COL_MAJOR><<<nblocks, nTPB>>>(ROWS, COLS, d_mat, d_vec);
  cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
  std::cout << "Column-major result: " << std::endl;
  print_result(h_mat, true);

  // test CUBLAS, doing matrix-vector add using <T>ger:
  // d_mat += alpha * d_ones * d_vec^T, with d_mat stored column-major (lda = ROWS)
  cudaMemset(d_mat, 0, msz); // set matrix to zero
  cublasHandle_t ch;
  cublasStatus_t stat = cublasCreate(&ch);
  const bool have_handle = (stat == CUBLAS_STATUS_SUCCESS);
  float alpha = 1.0f;
  if (have_handle)
    stat = cublasSger(ch, ROWS, COLS, &alpha, d_ones, 1, d_vec, 1, d_mat, ROWS);
  if (stat != CUBLAS_STATUS_SUCCESS) {
    std::cout << "CUBLAS error: " << (int)stat << std::endl;
    rc = 1;
  } else {
    cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost);
    std::cout << "CUBLAS Column-major result: " << std::endl;
    print_result(h_mat, true);
  }

  // Release everything acquired above.
  if (have_handle) cublasDestroy(ch);
  cudaFree(d_ones);
  cudaFree(d_vec);
  cudaFree(d_mat);
  free(h_ones);
  free(h_vec);
  free(h_mat);

  return rc;
}
$ nvcc -o t712 t712.cu -lcublas
$ ./t712
Row-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
CUBLAS Column-major result:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
$

为了简洁起见，我没有包含适当的 CUDA 错误检查（proper CUDA error checking），但每当您的 CUDA 代码遇到问题时，加上它总是一个好主意。作为一种简便的替代办法，您可以使用 cuda-memcheck 运行您的代码，快速检查是否存在任何 CUDA 错误。

请注意，我们希望所有 3 个打印输出都相同，因为这实际上是显示矩阵的正确方式，无论底层存储是行优先还是列优先。在处理显示输出的 for 循环中考虑了底层存储的差异。

Answer 2

Robert Crovella 已经回答了这个问题，并提供了使用显式 CUDA 内核和 cuBLAS 的示例。

我发现为了将来的参考，展示一个关于如何使用 CUDA Thrust 执行行式或列式操作的例子很有用。特别是，我关注两个问题：

将同一个列向量加到矩阵的每一列；
将同一个行向量加到矩阵的每一行。

thrust::transform 的通用性使得可以将下面的示例推广到除求和之外的元素运算（例如，乘法、除法、减法等）。

#include <cstdio>
#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/equal.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/random.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/unique.h>

using namespace thrust::placeholders;

/*************************************/
/* CONVERT LINEAR INDEX TO ROW INDEX */
/*************************************/
// Functor that maps a flat index i to i / Ncols, i.e. the row that index i
// falls in when consecutive groups of Ncols indices form one row.
template <typename T>
struct linear_index_to_row_index {

    // The nested typedefs that the thrust::unary_function<T,T> base class
    // used to provide, declared directly so the functor does not depend on
    // that base (deprecated in recent Thrust releases).
    typedef T argument_type;
    typedef T result_type;

    T Ncols; // --- Number of columns

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}

    // const: the call does not modify the functor, so it is also usable
    // through a const object or reference.
    __host__ __device__ T operator()(T i) const { return i / Ncols; }
};

/********/
/* MAIN */
/********/

// Prints a row-major Nrows x Ncols matrix held in host memory, one bracketed
// row per line.
static void print_host_matrix(const thrust::host_vector<float> &h_m, int Nrows, int Ncols)
{
    for (int i = 0; i < Nrows; i++) {
        std::cout << "[ ";
        for (int j = 0; j < Ncols; j++)
            std::cout << h_m[i * Ncols + j] << " ";
        std::cout << "]\n";
    }
}

// Demo driver: builds a random row-major Nrows x Ncols matrix plus a column
// vector (Nrows entries) and a row vector (Ncols entries), then uses
// thrust::transform to add the column vector to every matrix column and the
// row vector to every matrix row, printing the inputs and both results.
int main()
{
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/

    const int Nrows = 10;           // --- Number of rows
    const int Ncols =  3;           // --- Number of columns

    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist1(0, 100);

    // --- Random uniform integer distribution between 1 and 4
    thrust::uniform_int_distribution<int> dist2(1, 4);

    // --- Matrix, column vector and row vector are filled on the host and then
    //     copied to the device in one transfer each, instead of assigning the
    //     device elements one at a time from host code.
    thrust::host_vector<float> h_matrix(Nrows * Ncols);
    for (size_t i = 0; i < h_matrix.size(); i++) h_matrix[i] = (float)dist1(rng);
    thrust::device_vector<float> d_matrix(h_matrix);

    thrust::host_vector<float> h_column(Nrows);
    for (size_t i = 0; i < h_column.size(); i++) h_column[i] = (float)dist2(rng);
    thrust::device_vector<float> d_column(h_column);

    thrust::host_vector<float> h_row(Ncols);
    for (size_t i = 0; i < h_row.size(); i++) h_row[i] = (float)dist2(rng);
    thrust::device_vector<float> d_row(h_row);

    std::cout << "\n\nOriginal matrix\n";
    print_host_matrix(h_matrix, Nrows, Ncols);

    std::cout << "\n\nColumn vector\n";
    for (int i = 0; i < Nrows; i++) std::cout << h_column[i] << "\n";

    std::cout << "\n\nRow vector\n";
    for (int i = 0; i < Ncols; i++) std::cout << h_row[i] << " ";

    /*******************************************************/
    /* ADDING THE SAME COLUMN VECTOR TO ALL MATRIX COLUMNS */
    /*******************************************************/

    // Element idx lies in row idx / Ncols, so it receives d_column[idx / Ncols].
    thrust::device_vector<float> d_matrix2(d_matrix.size());

    thrust::transform(d_matrix.begin(), d_matrix.end(),
                      thrust::make_permutation_iterator(
                                d_column.begin(),
                                thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Ncols))),
                      d_matrix2.begin(),
                      thrust::plus<float>());

    thrust::host_vector<float> h_matrix2(d_matrix2);
    std::cout << "\n\nColumn + Matrix -> Result matrix\n";
    print_host_matrix(h_matrix2, Nrows, Ncols);

    /*************************************************/
    /* ADDING THE SAME ROW VECTOR TO ALL MATRIX ROWS */
    /*************************************************/

    // Element idx lies in column idx % Ncols, so it receives d_row[idx % Ncols].
    // The matrix is read and written in its natural order; only the small row
    // vector is accessed through an index map.
    thrust::device_vector<float> d_matrix3(d_matrix.size());

    thrust::transform(d_matrix.begin(), d_matrix.end(),
                      thrust::make_permutation_iterator(
                                d_row.begin(),
                                thrust::make_transform_iterator(thrust::make_counting_iterator(0), _1 % Ncols)),
                      d_matrix3.begin(),
                      thrust::plus<float>());

    thrust::host_vector<float> h_matrix3(d_matrix3);
    std::cout << "\n\nRow + Matrix -> Result matrix\n";
    print_host_matrix(h_matrix3, Nrows, Ncols);

    return 0;
}

Row-wise/column-wise 使用 CUDA 对矩阵进行运算

Row-wise/column-wise operations on matrices with CUDA

c

c++

cuda

matrix