使用 cuModuleLoad 从 ELF 二进制文件中获取当前模块(来自 argv[0])
Using cuModuleLoad to get current Module from ELF binary (from argv[0])
情况:
我正在尝试使用 cuModuleLoad 加载当前二进制文件 (ELF) 的嵌入式 cubin(和 PTX),但它一直出错,错误代码为 200。我的问题是,如果 cubin 嵌入到最终二进制文件中,为什么我不能使用 cuModuleLoad 动态加载自己?它在我编译一个单独的 fatbinary 时起作用,但在我加载一个单独的 PTX 模块时不起作用,当然当我尝试加载最终二进制文件时 (a.out)。我有几个原因要加载我将放弃的当前可执行文件,以免偏离主题。我也在寻找一种无需使用实用工具(或系统调用)即可维护单个文件的解决方法。
在Linux中:
#include "cuda.h"
#include <cstdio>
#include <iostream>
using clock_value_t = long long;
__device__ void test( )
{
printf("Testing... : \n");
}
__device__ void sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
extern "C" __global__ void hello_world( )
{
printf("Hello World from Device\n");
sleep( 1e9 );
test();
}
int main(int argc, char * argv[])
{
std::cout << argv[0] << std::endl;
// Initialize input vectors ...
//Initialize
cuInit(0);
// Get number of devices supporting CUDA
int deviceCount = 0;
cuDeviceGetCount(&deviceCount);
if (deviceCount == 0)
{
printf("There is no device supporting CUDA.\n");
exit (0);
}
else std::cout << "Number of device is "<< deviceCount << std::endl;
// Get handle for device 0
CUdevice cuDevice;
cuDeviceGet(&cuDevice, 0);
// Create context
CUcontext cuContext;
int ret = cuCtxCreate(&cuContext, 0, cuDevice);
if( ret != CUDA_SUCCESS )
std::cout << "Could not create context on device 0" << std::endl;
// Create module from binary file
CUmodule cuModule;
ret = cuModuleLoad(&cuModule, argv[0]); // <---errors HERE
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
return -1;
}
}
如果我必须使用单独的文件或实用程序来动态提取 cubins 或 PTX,我会感到很沮丧。无论如何-提前感谢你们的见解。
找到解决办法。简而言之:
- fopen( argv[0] )
- mmap(文件)
- 阅读 ELF headers 并找到“.nv_fatbin”部分
- 解析“.nv_fatbin”对齐字节序列“50 ed 55 ba 01 00 10 00”
- 找到你要cuModuleGetFunction的全局方法相关的cubin
- 调用 cuModuleLoadFatBinary 的基地址为 .nv_fatbin + 特定的 cubin 偏移量。
- 使用 cuModuleGetFunction 获取函数
- 最后调用 cuLaunchKernel
参考下面草率的代码:
int main(int argc, char * argv[])
{
std::cout << "Hello World from Host" << std::endl;
std::cout << argv[0] << std::endl;
void * start_ptr =NULL;
struct stat sb;
size_t sz =0;
//read_elf_header( argv[0] );
// Either Elf64_Ehdr or Elf32_Ehdr depending on architecture.
ElfW(Ehdr) elf_header;
ElfW(Shdr) header;
std::cout << "opening elf file" << std::endl;
FILE* file = fopen(argv[0], "rb");
int fd = fileno( file );
if (fd < 0)
{
printf("Could not open file for memory mapping, fd = %i\n", errno);
exit(1);
}
std::cout << "getting file size" << std::endl;
if (fstat(fd, &sb) == -1) // To obtain file size
printf("Could not find fstat");
sz = sb.st_size;
std::cout << "Mapping file to memory : " << sz << std::endl;
start_ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
//check if valid elf
bool b = elf_is_elf64( file );
fseek( file, 0, SEEK_SET );
std::cout << "is ELF file : " << b << std::endl;
if( b)
{
std::cout << "Found valid ELF file" << std::endl;
//get ELF_Header
b = elf64_get_elf_header(file, &elf_header);
fseek( file, 0, SEEK_SET );
if( b )
{
std::cout << "-Found valid ELF Header" << std::endl;
b = elf64_get_section_header_by_name(file, (const Elf64_Ehdr *) &elf_header, ".nv_fatbin", &header);
fseek( file, 0, SEEK_SET );
if( b )
{
std::cout << "Found fatbin section" << std::endl;
cuInit(0);
// Get number of devices supporting CUDA
int deviceCount = 0;
cuDeviceGetCount(&deviceCount);
if (deviceCount == 0)
{
printf("There is no device supporting CUDA.\n");
exit (0);
}
else std::cout << "Number of device is "<< deviceCount << std::endl;
// Get handle for device 0
CUdevice cuDevice;
cuDeviceGet(&cuDevice, 0);
// Create context
CUcontext cuContext;
int ret = cuCtxCreate(&cuContext, 0, cuDevice);
if( ret != CUDA_SUCCESS )
std::cout << "Could not create context on device 0" << std::endl;
// Create module from binary file
CUmodule cuModule;
std::cout << "sh_addr = " << header.sh_addr << std::endl;
unsigned long long offset = header.sh_addr;
unsigned long long cuOffset = _find_cubin_offset( header, start_ptr, offset, "_Z11hello_worldv");
const void * fatbin = &((unsigned char *) start_ptr)[cuOffset];
std:: cout << "fat bin = " << fatbin << std::endl;
ret = cuModuleLoadFatBinary(&cuModule, fatbin );
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
}
CUfunction khw;
//ret = cuModuleGetFunction(&khw, cuModule, "hello_world");
ret = cuModuleGetFunction(&khw, cuModule, "_Z11hello_worldv");
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to get hello_world from " << argv[0] << " : " << ret << std::endl;
}
else ret = cuLaunchKernel(khw, 1, 1, 1, 1, 1, 1, 0, 0, NULL, 0);
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to launch : hello_world " << std::endl;
}
ret = cuModuleUnload(cuModule);
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to unload self fatbin : " << argv[0] << std::endl;
return -1;
}
if (cudaDeviceSynchronize() != cudaSuccess)
{
printf ("Cuda call failed\n");
}
//unmap sutff
munmap(start_ptr, sz);
return 0;
}
}
}
fclose(file);
return 0;
情况: 我正在尝试使用 cuModuleLoad 加载当前二进制文件 (ELF) 的嵌入式 cubin(和 PTX),但它一直出错,错误代码为 200。我的问题是,如果 cubin 嵌入到最终二进制文件中,为什么我不能使用 cuModuleLoad 动态加载自己?它在我编译一个单独的 fatbinary 时起作用,但在我加载一个单独的 PTX 模块时不起作用,当然当我尝试加载最终二进制文件时 (a.out)。我有几个原因要加载我将放弃的当前可执行文件,以免偏离主题。我也在寻找一种无需使用实用工具(或系统调用)即可维护单个文件的解决方法。
在Linux中:
#include "cuda.h"
#include <cstdio>
#include <iostream>
using clock_value_t = long long;
__device__ void test( )
{
printf("Testing... : \n");
}
__device__ void sleep(clock_value_t sleep_cycles)
{
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
extern "C" __global__ void hello_world( )
{
printf("Hello World from Device\n");
sleep( 1e9 );
test();
}
int main(int argc, char * argv[])
{
std::cout << argv[0] << std::endl;
// Initialize input vectors ...
//Initialize
cuInit(0);
// Get number of devices supporting CUDA
int deviceCount = 0;
cuDeviceGetCount(&deviceCount);
if (deviceCount == 0)
{
printf("There is no device supporting CUDA.\n");
exit (0);
}
else std::cout << "Number of device is "<< deviceCount << std::endl;
// Get handle for device 0
CUdevice cuDevice;
cuDeviceGet(&cuDevice, 0);
// Create context
CUcontext cuContext;
int ret = cuCtxCreate(&cuContext, 0, cuDevice);
if( ret != CUDA_SUCCESS )
std::cout << "Could not create context on device 0" << std::endl;
// Create module from binary file
CUmodule cuModule;
ret = cuModuleLoad(&cuModule, argv[0]); // <---errors HERE
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
return -1;
}
}
如果我必须使用单独的文件或实用程序来动态提取 cubins 或 PTX,我会感到很沮丧。无论如何-提前感谢你们的见解。
找到解决办法。简而言之:
- fopen( argv[0] )
- mmap(文件)
- 阅读 ELF headers 并找到“.nv_fatbin”部分
- 解析“.nv_fatbin”对齐字节序列“50 ed 55 ba 01 00 10 00”
- 找到你要cuModuleGetFunction的全局方法相关的cubin
- 调用 cuModuleLoadFatBinary 的基地址为 .nv_fatbin + 特定的 cubin 偏移量。
- 使用 cuModuleGetFunction 获取函数
- 最后调用 cuLaunchKernel
参考下面草率的代码:
int main(int argc, char * argv[])
{
std::cout << "Hello World from Host" << std::endl;
std::cout << argv[0] << std::endl;
void * start_ptr =NULL;
struct stat sb;
size_t sz =0;
//read_elf_header( argv[0] );
// Either Elf64_Ehdr or Elf32_Ehdr depending on architecture.
ElfW(Ehdr) elf_header;
ElfW(Shdr) header;
std::cout << "opening elf file" << std::endl;
FILE* file = fopen(argv[0], "rb");
int fd = fileno( file );
if (fd < 0)
{
printf("Could not open file for memory mapping, fd = %i\n", errno);
exit(1);
}
std::cout << "getting file size" << std::endl;
if (fstat(fd, &sb) == -1) // To obtain file size
printf("Could not find fstat");
sz = sb.st_size;
std::cout << "Mapping file to memory : " << sz << std::endl;
start_ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
//check if valid elf
bool b = elf_is_elf64( file );
fseek( file, 0, SEEK_SET );
std::cout << "is ELF file : " << b << std::endl;
if( b)
{
std::cout << "Found valid ELF file" << std::endl;
//get ELF_Header
b = elf64_get_elf_header(file, &elf_header);
fseek( file, 0, SEEK_SET );
if( b )
{
std::cout << "-Found valid ELF Header" << std::endl;
b = elf64_get_section_header_by_name(file, (const Elf64_Ehdr *) &elf_header, ".nv_fatbin", &header);
fseek( file, 0, SEEK_SET );
if( b )
{
std::cout << "Found fatbin section" << std::endl;
cuInit(0);
// Get number of devices supporting CUDA
int deviceCount = 0;
cuDeviceGetCount(&deviceCount);
if (deviceCount == 0)
{
printf("There is no device supporting CUDA.\n");
exit (0);
}
else std::cout << "Number of device is "<< deviceCount << std::endl;
// Get handle for device 0
CUdevice cuDevice;
cuDeviceGet(&cuDevice, 0);
// Create context
CUcontext cuContext;
int ret = cuCtxCreate(&cuContext, 0, cuDevice);
if( ret != CUDA_SUCCESS )
std::cout << "Could not create context on device 0" << std::endl;
// Create module from binary file
CUmodule cuModule;
std::cout << "sh_addr = " << header.sh_addr << std::endl;
unsigned long long offset = header.sh_addr;
unsigned long long cuOffset = _find_cubin_offset( header, start_ptr, offset, "_Z11hello_worldv");
const void * fatbin = &((unsigned char *) start_ptr)[cuOffset];
std:: cout << "fat bin = " << fatbin << std::endl;
ret = cuModuleLoadFatBinary(&cuModule, fatbin );
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
}
CUfunction khw;
//ret = cuModuleGetFunction(&khw, cuModule, "hello_world");
ret = cuModuleGetFunction(&khw, cuModule, "_Z11hello_worldv");
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to get hello_world from " << argv[0] << " : " << ret << std::endl;
}
else ret = cuLaunchKernel(khw, 1, 1, 1, 1, 1, 1, 0, 0, NULL, 0);
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to launch : hello_world " << std::endl;
}
ret = cuModuleUnload(cuModule);
if( ret != CUDA_SUCCESS )
{
std::cout << "Failed to unload self fatbin : " << argv[0] << std::endl;
return -1;
}
if (cudaDeviceSynchronize() != cudaSuccess)
{
printf ("Cuda call failed\n");
}
//unmap sutff
munmap(start_ptr, sz);
return 0;
}
}
}
fclose(file);
return 0;