How to use cusparseXcoo2csr in cuSparse to convert from coo to csc?
The cuSparse documentation states that cusparseXcoo2csr "can also be used to convert the array containing the uncompressed column indices (corresponding to COO format) into an array of column pointers (corresponding to CSC format)".
However, I cannot find a way to reproduce this. See the minimal example below:
CMakeLists.txt
cmake_minimum_required(VERSION 3.11)
project(sample)
find_package(CUDA REQUIRED)
add_executable(${PROJECT_NAME} main.cpp)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_14)
target_include_directories(${PROJECT_NAME} SYSTEM PUBLIC ${CUDA_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cusparse_LIBRARY})
main.cpp
#include <iostream>
#include <vector>
#include <cuda_runtime_api.h>
#include <cusparse_v2.h>
int main(){
// using the matrix as shown in https://docs.nvidia.com/cuda/cusparse/index.html#coo-format
// 1 4 0 0 0
// 0 2 3 0 0
// 5 0 0 7 8
// 0 0 9 0 6
std::vector<int> row{0, 0, 1, 1, 2, 2, 2, 3, 3};
std::vector<int> col{0, 1, 1, 2, 0, 3, 4, 2, 4};
std::vector<double> val{1, 4, 2, 3, 5, 7, 8, 9, 6};
int *d_row;
int *d_col;
double *d_val;
cudaMalloc(reinterpret_cast<void **>(&d_row), row.size() * sizeof(int));
cudaMalloc(reinterpret_cast<void **>(&d_col), col.size() * sizeof(int));
cudaMalloc(reinterpret_cast<void **>(&d_val), val.size() * sizeof(double));
cudaMemcpy(d_row, row.data(), sizeof(int) * row.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_col, col.data(), sizeof(int) * col.size(), cudaMemcpyHostToDevice);
cudaMemcpy(d_val, val.data(), sizeof(double) * val.size(), cudaMemcpyHostToDevice);
cusparseHandle_t handle;
cusparseCreate(&handle);
cusparseMatDescr_t descr;
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
cusparseMatDescr_t descr_out;
cusparseCreateMatDescr(&descr_out);
cusparseSetMatType(descr_out, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr_out, CUSPARSE_INDEX_BASE_ZERO);
int *d_row_csr;
cudaMalloc(reinterpret_cast<void **>(&d_row_csr), (4 + 1) * sizeof(int));
cusparseXcoo2csr(handle, d_row, 9, 4, d_row_csr, CUSPARSE_INDEX_BASE_ZERO);
std::vector<int> row_csr(4 + 1);
cudaMemcpy(row_csr.data(), d_row_csr, sizeof(int) * (4 + 1), cudaMemcpyDeviceToHost);
std::cout << "row" << std::endl;
for (int i : row_csr){
std::cout << i << std::endl; // prints 0 2 4 7 9 as expected
}
// however when I try to compress the column the same way...
int *d_col_csc;
cudaMalloc(reinterpret_cast<void **>(&d_col_csc), (5 + 1) * sizeof(int));
cusparseXcoo2csr(handle, d_col, 9, 5, d_col_csc, CUSPARSE_INDEX_BASE_ZERO);
std::vector<int> col_csc(5 + 1);
cudaMemcpy(col_csc.data(), d_col_csc, sizeof(int) * (5 + 1), cudaMemcpyDeviceToHost);
std::cout << "col" << std::endl;
for (int i : col_csc){
std::cout << i << std::endl; // prints 0 5 3 8 6 9, shouldn't it be 0 2 4 6 7 9?
}
return 0;
}
As you can see, the conversion from COO to CSC comes out wrong for some reason. For now I work around this by calling cusparseXcoo2csr to convert COO to CSR, and then calling cusparseDcsr2csc to convert the CSR intermediate to CSC, as sketched below. That is extra computation, so I would like to know how to convert COO directly to CSC with cusparseXcoo2csr, as the documentation suggests.
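For reference, this is roughly what my current workaround looks like (a sketch using the same legacy API as above; d_row_csr is the CSR row-pointer array already built by cusparseXcoo2csr in the snippet):
double *d_val_csc;
int *d_row_csc;
int *d_col_ptr;
cudaMalloc(reinterpret_cast<void **>(&d_val_csc), 9 * sizeof(double));
cudaMalloc(reinterpret_cast<void **>(&d_row_csc), 9 * sizeof(int));
cudaMalloc(reinterpret_cast<void **>(&d_col_ptr), (5 + 1) * sizeof(int));
// transpose CSR (d_val, d_row_csr, d_col) into CSC (d_val_csc, d_row_csc, d_col_ptr)
cusparseDcsr2csc(handle, 4, 5, 9,
                 d_val, d_row_csr, d_col,
                 d_val_csc, d_row_csc, d_col_ptr,
                 CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO);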
Compressing COO row coordinates into row pointers and compressing COO column coordinates into column pointers are fundamentally the same operation (essentially a keyed prefix sum). Although this is not explicitly documented, cusparseXcoo2csr requires the input coordinate array to be sorted.
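For sorted input, the whole operation boils down to a per-key histogram followed by an exclusive prefix sum. Here is a host-side sketch of what the compression amounts to (my own illustration, not cuSparse code):
#include <vector>
// Compress sorted COO indices into n+1 pointers: ptr[i] is where key i begins.
std::vector<int> compress(const std::vector<int> &sorted_idx, int n) {
    std::vector<int> ptr(n + 1, 0);
    for (int k : sorted_idx) ++ptr[k + 1];            // histogram per key
    for (int i = 0; i < n; ++i) ptr[i + 1] += ptr[i]; // exclusive prefix sum
    return ptr;
}
// compress({0,0,1,1,2,2,2,3,3}, 4) yields {0,2,4,7,9}   (your CSR row pointers)
// compress({0,0,1,1,2,2,3,4,4}, 5) yields {0,2,4,6,7,9} (the CSC column pointers)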
In your example the CSR conversion works because your coordinates happen to be sorted in row order, while the CSC conversion fails because the column coordinates are not sorted in column order. If you reorder the input so that the column indices are sorted, the conversion works, as sketched below.
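A minimal sketch of that fix, assuming the legacy sort and gather helpers (cusparseXcoosortByColumn, cusparseDgthr) that ship alongside cusparseXcoo2csr; it would go in your main.cpp before the second cusparseXcoo2csr call:
size_t buffer_size = 0;
void *d_buffer;
int *d_perm;
double *d_val_sorted;
cusparseXcoosort_bufferSizeExt(handle, 4, 5, 9, d_row, d_col, &buffer_size);
cudaMalloc(&d_buffer, buffer_size);
cudaMalloc(reinterpret_cast<void **>(&d_perm), 9 * sizeof(int));
cudaMalloc(reinterpret_cast<void **>(&d_val_sorted), 9 * sizeof(double));
cusparseCreateIdentityPermutation(handle, 9, d_perm);
// sort d_col in place (d_row is permuted to match); d_perm records the reordering
cusparseXcoosortByColumn(handle, 4, 5, 9, d_row, d_col, d_perm, d_buffer);
// apply the same permutation to the values: d_val_sorted[i] = d_val[d_perm[i]]
cusparseDgthr(handle, 9, d_val, d_val_sorted, d_perm, CUSPARSE_INDEX_BASE_ZERO);
// the column indices are now 0 0 1 1 2 2 3 4 4, so the same call succeeds:
cusparseXcoo2csr(handle, d_col, 9, 5, d_col_csc, CUSPARSE_INDEX_BASE_ZERO);
// d_col_csc now holds 0 2 4 6 7 9
Note that on newer toolkits the generic API (e.g. cusparseCsr2cscEx2) is the supported route, but within the legacy API your code already uses, sort-then-compress is the direct COO-to-CSC path.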