使用 cuModuleLoad 从 ELF 二进制文件中获取当前模块(来自 argv[0])

Using cuModuleLoad to get current Module from ELF binary (from argv[0])

情况: 我正在尝试使用 cuModuleLoad 加载当前二进制文件 (ELF) 的嵌入式 cubin(和 PTX),但它一直出错,错误代码为 200。我的问题是,如果 cubin 嵌入到最终二进制文件中,为什么我不能使用 cuModuleLoad 动态加载自己?它在我编译一个单独的 fatbinary 时起作用,但在我加载一个单独的 PTX 模块时不起作用,当然当我尝试加载最终二进制文件时 (a.out)。我有几个原因要加载我将放弃的当前可执行文件,以免偏离主题。我也在寻找一种无需使用实用工具(或系统调用)即可维护单个文件的解决方法。

在Linux中:

#include "cuda.h"
#include <cstdio>
#include <iostream>

using clock_value_t = long long;

__device__ void test(  )
{
  printf("Testing... : \n");
}

__device__ void sleep(clock_value_t sleep_cycles)
{
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

extern "C" __global__ void hello_world(  )
{
  printf("Hello World from Device\n");
  sleep( 1e9 );
  test();
}


int main(int argc, char * argv[])
{

  std::cout << argv[0] << std::endl;

  // Initialize input vectors    ...
  //Initialize
  cuInit(0);
  // Get number of devices supporting CUDA
  int deviceCount = 0;
  cuDeviceGetCount(&deviceCount);

  if (deviceCount == 0)
  {
    printf("There is no device supporting CUDA.\n");
    exit (0);
  }
  else std::cout << "Number of device is "<< deviceCount << std::endl;

  // Get handle for device 0
  CUdevice cuDevice;
  cuDeviceGet(&cuDevice, 0);
  // Create context
  CUcontext cuContext;
  int ret = cuCtxCreate(&cuContext, 0, cuDevice);

  if( ret != CUDA_SUCCESS )
          std::cout << "Could not create context on device 0" << std::endl;

  // Create module from binary file
  CUmodule cuModule;

  ret = cuModuleLoad(&cuModule, argv[0]); // <---errors HERE
  
  if( ret != CUDA_SUCCESS )
  {
    std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
    return -1;
  }
}

如果我必须使用单独的文件或实用程序来动态提取 cubins 或 PTX,我会感到很沮丧。无论如何-提前感谢你们的见解。

找到解决办法。简而言之:

  1. fopen( argv[0] )
  2. mmap(文件)
  3. 阅读 ELF headers 并找到“.nv_fatbin”部分
  4. 解析“.nv_fatbin”对齐字节序列“50 ed 55 ba 01 00 10 00”
  5. 找到你要cuModuleGetFunction的全局方法相关的cubin
  6. 调用 cuModuleLoadFatBinary 的基地址为 .nv_fatbin + 特定的 cubin 偏移量。
  7. 使用 cuModuleGetFunction 获取函数
  8. 最后调用 cuLaunchKernel

参考下面草率的代码:

int main(int argc, char * argv[])
{
  std::cout << "Hello World from Host" << std::endl;
  std::cout << argv[0] << std::endl;
  void * start_ptr =NULL;
  struct stat sb;
  size_t sz =0;

  //read_elf_header( argv[0] );
  // Either Elf64_Ehdr or Elf32_Ehdr depending on architecture.
  ElfW(Ehdr) elf_header;
  ElfW(Shdr) header;

  std::cout << "opening elf file" << std::endl;
  FILE* file = fopen(argv[0], "rb");

  int fd = fileno( file );

  if (fd < 0)
  {
    printf("Could not open file for memory mapping, fd = %i\n", errno);
    exit(1);
  }

  std::cout << "getting file size" << std::endl;
  if (fstat(fd, &sb) == -1)          // To obtain file size
    printf("Could not find fstat");
  sz = sb.st_size;

  std::cout << "Mapping file to memory : " << sz << std::endl;
  start_ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

  //check if valid elf
  bool b = elf_is_elf64( file );
  fseek( file, 0, SEEK_SET );
  std::cout << "is ELF file : " << b << std::endl;
  if( b)
  {
    std::cout << "Found valid ELF file" << std::endl;
    //get ELF_Header
    b = elf64_get_elf_header(file, &elf_header);
    fseek( file, 0, SEEK_SET );

    if( b )
    {
      std::cout << "-Found valid ELF Header" << std::endl;
      b = elf64_get_section_header_by_name(file, (const Elf64_Ehdr *) &elf_header, ".nv_fatbin", &header);
      fseek( file, 0, SEEK_SET );

      if( b )
      {
        std::cout << "Found fatbin section" << std::endl;
        cuInit(0);
        // Get number of devices supporting CUDA
        int deviceCount = 0;
        cuDeviceGetCount(&deviceCount);

        if (deviceCount == 0)
        {
          printf("There is no device supporting CUDA.\n");
          exit (0);
        }
        else std::cout << "Number of device is "<< deviceCount << std::endl;

        // Get handle for device 0
        CUdevice cuDevice;
        cuDeviceGet(&cuDevice, 0);
        // Create context
        CUcontext cuContext;
        int ret = cuCtxCreate(&cuContext, 0, cuDevice);
        if( ret != CUDA_SUCCESS )
          std::cout << "Could not create context on device 0" << std::endl;
        // Create module from binary file
        CUmodule cuModule;
        std::cout << "sh_addr = " <<  header.sh_addr << std::endl;
        unsigned long long offset = header.sh_addr;
        
        unsigned long long cuOffset = _find_cubin_offset( header, start_ptr, offset, "_Z11hello_worldv");

        const void * fatbin = &((unsigned char *) start_ptr)[cuOffset];
        
        std:: cout << "fat bin = " << fatbin << std::endl;

        ret = cuModuleLoadFatBinary(&cuModule, fatbin );

        if( ret != CUDA_SUCCESS )
        {
          std::cout << "Failed to load self fatbin : " << argv[0] << " : " << ret<< std::endl;
        }

        CUfunction khw;
        //ret = cuModuleGetFunction(&khw, cuModule, "hello_world");
        ret = cuModuleGetFunction(&khw, cuModule, "_Z11hello_worldv");
        if( ret != CUDA_SUCCESS )
        {
          std::cout << "Failed to get hello_world from " << argv[0] << " : " << ret <<  std::endl;
        }
        else ret = cuLaunchKernel(khw, 1, 1, 1, 1, 1, 1, 0, 0, NULL, 0);

        if( ret != CUDA_SUCCESS )
        {
          std::cout << "Failed to launch : hello_world "  << std::endl;
        }

        ret = cuModuleUnload(cuModule);

        if( ret != CUDA_SUCCESS )
        {
          std::cout << "Failed to unload self fatbin : " << argv[0] << std::endl;
          return -1;
        }

        if (cudaDeviceSynchronize() != cudaSuccess)
        {
          printf ("Cuda call failed\n");
        }

        //unmap sutff
        munmap(start_ptr, sz);
        return 0;
      }
    }

  }

  fclose(file);

  return 0;