Simple Thrust code performs about half as fast as my naive cuda kernel. Am I using Thrust wrong?

I'm new to CUDA and Thrust, but my impression was that Thrust, used properly, is supposed to offer better performance than naively written CUDA kernels. Am I using Thrust in a sub-optimal way? Below is a complete, minimal example that takes an array u of length N+2 and, for each i between 1 and N, computes the average 0.5*(u[i-1] + u[i+1]) and puts the result in uNew[i]. (uNew[0] is set to u[0] and uNew[N+1] to u[N+1], so that the boundary terms do not change.) The code repeats this averaging operation many times to get reasonable times for a timing test. On my hardware, the Thrust computation takes roughly twice as long as the raw-kernel version. Is there a way to improve my Thrust code?

#include <iostream>
#include <boost/timer.hpp>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/for_each.h>
#include <thrust/fill.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>

typedef double numtype;

// Functor that averages the left and right neighbours of an interior point
template <typename T> class NeighborAverageFunctor{
 int N;
public:
 NeighborAverageFunctor(int _N) : N(_N) {}

 template <typename Tuple>
 __host__ __device__ void operator()(Tuple t){
  T uL = thrust::get<0>(t); // u[i-1]
  T uR = thrust::get<1>(t); // u[i+1]

  thrust::get<2>(t) = 0.5*(uL + uR); // uNew[i]
 }

 int getN(){
  return N;
 }
};

template <typename T> void thrust_sweep(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
 int N = op.getN();
 // Zip (u[i-1], u[i+1], uNew[i]) for i = 1..N and apply the averaging functor
 thrust::for_each(
  thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)),
  thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)),
  op);
 // Propagate boundary values without changing them
 uNew[0] = u[0];
 uNew[N+1] = u[N+1];
}


// Set u[0] = 1 and every other entry to 0 on the device
template <typename T> __global__ void initialization_kernel(int n, T* u){
 const int i = blockIdx.x * blockDim.x + threadIdx.x;
 if(i < n+2){
  if(i == 0){
   u[i] = 1.0;
  }
  else{
   u[i] = 0.0;
  }
 }
}

template <typename T> __global__ void sweep_kernel(int n, T /* hSquared, unused */, T* u, T* uNew){
 const int i = blockDim.x * blockIdx.x + threadIdx.x;
 if (i >= 1 && i < n-1){
  uNew[i] = 0.5*(u[i+1] + u[i-1]);
 }
 else if(i == 0 || i == n-1){ // copy the boundary cells through unchanged
  uNew[i] = u[i];
 }
}

int main(void){
 int sweeps = 2000;
 int N = 4096*2048;
 numtype h = 1.0/N;
 numtype hSquared = pow(h, 2);

 NeighborAverageFunctor<numtype> op(N);

 thrust::device_ptr<numtype> u_d = thrust::device_malloc<numtype>(N+2);
 thrust::device_ptr<numtype> uNew_d = thrust::device_malloc<numtype>(N+2);
 thrust::device_ptr<numtype> uTemp_d;

 thrust::fill(u_d, u_d + (N+2), 0.0);
 u_d[0] = 1.0;

 boost::timer timer1;

 for(int k = 0; k < sweeps; k++){
  thrust_sweep<numtype>(u_d, uNew_d, op);
  uTemp_d = u_d;
  u_d = uNew_d;
  uNew_d = uTemp_d;
 }

 cudaDeviceSynchronize(); // launches are asynchronous; wait for them before reading the timer
 double thrust_time = timer1.elapsed();

 thrust::host_vector<numtype> u_h(N+2);
 thrust::copy(u_d, u_d + N+2, u_h.begin());
 for(int i = 0; i < 10; i++){
  std::cout << u_h[i] << " ";
 }
 std::cout << std::endl;

 thrust::device_free(u_d);
 thrust::device_free(uNew_d);

 numtype * u_raw_d, * uNew_raw_d, * uTemp_raw_d;
 cudaMalloc(&u_raw_d, (N+2)*sizeof(numtype));
 cudaMalloc(&uNew_raw_d, (N+2)*sizeof(numtype));

 numtype * u_raw_h = (numtype*)malloc((N+2)*sizeof(numtype));

 int block_size = 256;
 int grid_size = ((N+2) + block_size - 1) / block_size;

 initialization_kernel<numtype><<<grid_size, block_size>>>(N, u_raw_d);

 boost::timer timer2;

 for(int k = 0; k < sweeps; k++){
  sweep_kernel<numtype><<<grid_size, block_size>>>(N+2, hSquared, u_raw_d, uNew_raw_d);
  uTemp_raw_d = u_raw_d;
  u_raw_d = uNew_raw_d;
  uNew_raw_d = uTemp_raw_d;
 }

 cudaDeviceSynchronize(); // wait for all queued kernels before reading the timer
 double raw_time = timer2.elapsed();

 cudaMemcpy(u_raw_h, u_raw_d, (N+2)*sizeof(numtype), cudaMemcpyDeviceToHost);

 for(int i = 0; i < 10; i++){
  std::cout << u_raw_h[i] << " ";
 }
 std::cout << std::endl;

 std::cout << "Thrust: " << thrust_time << " s" << std::endl;
 std::cout << "Raw: " << raw_time << " s" << std::endl;

 free(u_raw_h);

 cudaFree(u_raw_d);
 cudaFree(uNew_raw_d);

 return 0;
}

Based on my testing, these lines:

uNew[0] = u[0];
uNew[N+1] = u[N+1];

are killing your Thrust performance relative to the kernel approach. When I eliminate them, the results do not appear to be any different. Whereas the kernel handles the boundary cases in-kernel, the Thrust code performs the boundary handling with a very expensive method: cudaMemcpy operations, under the hood.
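
To make the cost concrete, here is roughly what each of those assignments expands to (a sketch of the mechanism as I understand it, not Thrust's literal internals; u_raw and uNew_raw are hypothetical raw pointers standing in for the device_ptrs, and numtype is the typedef from the code above):

// Hypothetical expansion of "uNew[0] = u[0];" through thrust::device_ptr.
// Reading u[0] pulls one element back to the host, and writing uNew[0]
// pushes it out again -- two tiny, synchronous transfers per assignment.
numtype tmp;
cudaMemcpy(&tmp, u_raw, sizeof(numtype), cudaMemcpyDeviceToHost);
cudaMemcpy(uNew_raw, &tmp, sizeof(numtype), cudaMemcpyHostToDevice);

Each such transfer forces a full device synchronization, so the otherwise asynchronous stream of for_each launches stalls on every sweep.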

Since the Thrust functor never actually writes to the boundary positions, it is sufficient to write those values once, rather than in every loop iteration.
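
For example, something like the following (a sketch that reuses the definitions from the code above; thrust_sweep_interior is my name, not from the original post): drop the per-sweep boundary assignments and seed the boundary cells of both buffers once before the loop. Both buffers need seeding because the pointer swap alternates which one is read.

// Sweep only the interior points; no boundary traffic per iteration
template <typename T> void thrust_sweep_interior(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
 int N = op.getN();
 thrust::for_each(
  thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)),
  thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)),
  op); // writes uNew[1..N] only
}

// In main(), before the timing loop: seed the boundaries of BOTH buffers once
u_d[0] = 1.0;   uNew_d[0] = 1.0;
u_d[N+1] = 0.0; uNew_d[N+1] = 0.0;

for(int k = 0; k < sweeps; k++){
 thrust_sweep_interior<numtype>(u_d, uNew_d, op);
 uTemp_d = u_d; // swap the pointers as before; no data moves
 u_d = uNew_d;
 uNew_d = uTemp_d;
}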

You can speed up your Thrust performance significantly by handling the boundary cases better.