Ctypes function not found
I'm trying to use ctypes to run some CUDA code from Python. After compiling and loading the .so file, I get an error telling me that the CUDA function doesn't exist. I tried the same approach with a plain C example before, and it worked. Is there a problem with how I'm compiling?
The CUDA code:
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
    // runs for each (col, row) pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[row * C.width + col] = tmpVal;
}

void mMul( Matrix *A, Matrix *B, Matrix *C ){
    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width = A->width;
    d_A.height = A->height;
    size_t sizeA = A->width * A->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width = B->width;
    d_B.height = B->height;
    size_t sizeB = B->width * B->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width = C->width;
    d_C.height = C->height;
    size_t sizeC = C->width * C->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);
    // calling the kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy the results from matrix C back to the host
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
Then I compile it into Sequential_Cuda_Python.so:
nvcc --shared --compiler-options '-fPIC' -o Sequential_Cuda_Python.so Sequential_Cuda_Python.cu
The Python ctypes code:
import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *

class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

libc = CDLL("./Sequential_Cuda_Python.so")
libc.mMul.argtypes = [POINTER(Matrix), POINTER(Matrix), POINTER(Matrix)]
The error; it looks like the function is not found:
Traceback (most recent call last):
File "cuda_arr.py", line 17, in <module>
libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
File "/usr/lib/python3.8/ctypes/__init__.py", line 386, in __getattr__
func = self.__getitem__(name)
File "/usr/lib/python3.8/ctypes/__init__.py", line 391, in __getitem__
func = self._FuncPtr((name_or_ordinal, self))
AttributeError: ... /Sequential_Cuda_Python.so: undefined symbol: mMul
As noted in the comments, you need extern "C". C++ (and, by extension, CUDA) does something called name mangling: the compiler encodes a function's parameter types into its exported symbol name to support overloading, so the library no longer contains a symbol literally named mMul. Try the following, with and without extern "C":
readelf --symbols Sequential_Cuda_Python.so | grep mMul
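What you should expect to see is roughly the following (a sketch, not verbatim output; the exact mangled name depends on the toolchain, this one follows the Itanium C++ ABI used by GCC/Clang):

$ readelf --symbols Sequential_Cuda_Python.so | grep mMul    # without extern "C"
... _Z4mMulP6MatrixS0_S0_
$ readelf --symbols Sequential_Cuda_Python.so | grep mMul    # with extern "C"
... mMul

ctypes looks the function up by its plain C name, so only the second case works. Wrapping the definition in extern "C" suppresses mangling for it: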
#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){
    // runs for each (col, row) pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[row * C.width + col] = tmpVal;
}

extern "C" {

void mMul( Matrix *A, Matrix *B, Matrix *C ){
    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width = A->width;
    d_A.height = A->height;
    size_t sizeA = A->width * A->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width = B->width;
    d_B.height = B->height;
    size_t sizeB = B->width * B->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width = C->width;
    d_C.height = C->height;
    size_t sizeC = C->width * C->height * sizeof(float);
    // dynamically allocate cuda memory for the elements array
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);
    // calling the kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy the results from matrix C back to the host
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

}
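With the extern "C" version compiled exactly as before, the lookup succeeds and mMul can be called from Python. A minimal end-to-end sketch (the to_matrix helper and the test sizes are my own illustration; it assumes C-contiguous float32 arrays and dimensions that are multiples of BLOCK_SIZE, since the kernel has no bounds check):

import numpy as np
from ctypes import CDLL, POINTER, Structure, byref, c_float, c_int

class Matrix(Structure):
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

def to_matrix(arr):
    # Hypothetical helper: wrap a C-contiguous float32 array in the struct.
    height, width = arr.shape
    return Matrix(width, height, arr.ctypes.data_as(POINTER(c_float)))

libc = CDLL("./Sequential_Cuda_Python.so")
libc.mMul.argtypes = [POINTER(Matrix), POINTER(Matrix), POINTER(Matrix)]
libc.mMul.restype = None

n = 2 * 16  # a multiple of BLOCK_SIZE, since the kernel has no bounds check
a = np.random.rand(n, n).astype(np.float32)
b = np.random.rand(n, n).astype(np.float32)
c = np.zeros((n, n), dtype=np.float32)

A, B, C = to_matrix(a), to_matrix(b), to_matrix(c)
libc.mMul(byref(A), byref(B), byref(C))

print(np.allclose(c, a @ b, atol=1e-3))  # expect True

Keeping a, b and c in scope ensures the numpy buffers outlive the call, since the Matrix structs only hold raw pointers into them.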