Titan RTX 上的双精度和单精度矩阵乘法基准测试
Matrix multiplication benchmarking on Titan RTX with double and single precisions
我想了解我们的 GPU 工作站的单精度和双精度之间的性能差异。
我们的工作站配备了两个 TITAN RTX GPU,但我 运行 是单个 Titan RTX 的基准测试。
我正在用 cublas 矩阵-矩阵乘法测试性能。我乘以由随机浮点数或双精度数组成的 8192x8192 矩阵。为了确保我这边没有错误,我还在 Python 使用 cupy 库重复了这个过程,结果非常相似。
测试结果是浮点数每 1 次乘法约 75 毫秒,双精度数约 2,000 毫秒。
如果我有一个较旧的 GPU,这将很有意义,因为 75*32 = 2,400~2000,这样我的双精度性能将比 [=31= 预期的低 ~32 倍] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions.
但是,我的 GPU 具有 7.5 的计算能力,因此我预计双精度的性能下降仅为 2 倍。
其他信息:Ubuntu 18 LTS,nvcc 10.2,驱动程序 440.82。
这是 CUDA 代码:
#include <iostream>
#include <chrono>
#include <string>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <device_functions.h>
#include <sstream>
#include <time.h>
unsigned long mix(unsigned long a, unsigned long b, unsigned long c)
{
a=a-b; a=a-c; a=a^(c >> 13);
b=b-c; b=b-a; b=b^(a << 8);
c=c-a; c=c-b; c=c^(b >> 13);
a=a-b; a=a-c; a=a^(c >> 12);
b=b-c; b=b-a; b=b^(a << 16);
c=c-a; c=c-b; c=c^(b >> 5);
a=a-b; a=a-c; a=a^(c >> 3);
b=b-c; b=b-a; b=b^(a << 10);
c=c-a; c=c-b; c=c^(b >> 15);
return c;
}
using namespace std;
int main()
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
cudaDeviceProp deviceProp;
cublasStatus_t err;
cudaGetDeviceProperties(&deviceProp, 0);
printf("Detected %d devices \n", deviceCount);
printf("Device %d has compute capability %d.%d:\n\t maxshmem %d. \n\t maxthreads per block %d. \n\t max threads dim %d. %d. %d.\n ", 0,
deviceProp.major, deviceProp.minor, deviceProp.sharedMemPerBlock, deviceProp.maxThreadsPerBlock, deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
cudaEvent_t start_d, stop_d;
cudaEventCreate(&start_d);
cudaEventCreate(&stop_d);
//RND insicialization
unsigned long seed = mix(clock(), time(NULL), 0);
srand(seed);
int N=8192;
int Nloops=2;
int memsize=N*N*sizeof(double);
double *a = (double *)malloc(memsize);
double *b = (double *)malloc(memsize);
double *c = (double *)malloc(memsize);
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++){
a[i*N+j]=((double)rand() / RAND_MAX);
b[i*N+j]=((double)rand() / RAND_MAX);
}
double *a_d, *b_d, *c_d;
cudaMalloc((void **)&a_d, memsize);
cudaMalloc((void **)&b_d, memsize);
cudaMalloc((void **)&c_d, memsize);
cudaMemcpy(a_d, a, memsize, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b, memsize, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha=1.0;
double beta=0.0;
auto start = chrono::steady_clock::now();
clock_t start1;
start1 = clock();
cudaEventRecord(start_d);
if (cudaGetLastError() != cudaSuccess)
printf("%s \n",cudaGetErrorString(cudaGetLastError()));
for (int i=0; i<Nloops; i++)
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N,N,N,&alpha,a_d,N,b_d,N,&beta,c_d,N);
cudaEventRecord(stop_d);
cudaDeviceSynchronize();
auto end = chrono::steady_clock::now();
start1 = clock() - start1;
cudaEventSynchronize(stop_d);
cublasDestroy(handle);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start_d, stop_d);
std::cout << "Cuda event " << milliseconds /Nloops << " ms" <<endl;
std::cout << " time elapsed " << start1 / (double)CLOCKS_PER_SEC /Nloops << '\n';
cout << "time elapsed for 1 multiplication: " << ((double)chrono::duration_cast<chrono::microseconds>(end-start).count() )/(Nloops*1000.0)<< " milliseconds" <<endl;
free(a); free(b); free(c);
cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
}
这是产生一致结果的 python 代码:
import cupy as cp
import time
iterations = 2
a = cp.random.rand(8192,8192).astype(cp.float64)
b = cp.random.rand(8192,8192).astype(cp.float64)
def ab(a,b,iterations):
for i in range(iterations):
cp.matmul(a,b,out=None)
ab(a,b,1) # warm up
cp.cuda.Device(0).synchronize()
t1 = time.time()
ab(a,b,iterations)
cp.cuda.Device(0).synchronize()
t2 = time.time()
total = (t2-t1)/iterations
print(total)
好的,我找到了答案。在我 link 我的问题中的那个 table 中,有一个脚注说对于计算能力 7.5(这里就是这种情况),性能是 2,但不是 32,对于浮点数是64,这意味着 multiplication-addition 双精度运算比浮点运算慢 32 倍。
如果 float 和 double 问题都完全 arithmetic-bound,我预计减速约为 32。实际上,减速稍微小一些(2000/75 ~ 27),这可能是浮动问题的结果bandwidth-bound,或者可能与其他事情有关。
我想了解我们的 GPU 工作站的单精度和双精度之间的性能差异。
我们的工作站配备了两个 TITAN RTX GPU,但我 运行 是单个 Titan RTX 的基准测试。 我正在用 cublas 矩阵-矩阵乘法测试性能。我乘以由随机浮点数或双精度数组成的 8192x8192 矩阵。为了确保我这边没有错误,我还在 Python 使用 cupy 库重复了这个过程,结果非常相似。
测试结果是浮点数每 1 次乘法约 75 毫秒,双精度数约 2,000 毫秒。
如果我有一个较旧的 GPU,这将很有意义,因为 75*32 = 2,400~2000,这样我的双精度性能将比 [=31= 预期的低 ~32 倍] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions.
但是,我的 GPU 具有 7.5 的计算能力,因此我预计双精度的性能下降仅为 2 倍。
其他信息:Ubuntu 18 LTS,nvcc 10.2,驱动程序 440.82。
这是 CUDA 代码:
#include <iostream>
#include <chrono>
#include <string>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <math.h>
#include <stdio.h>
#include <cuda.h>
#include <device_functions.h>
#include <sstream>
#include <time.h>
unsigned long mix(unsigned long a, unsigned long b, unsigned long c)
{
a=a-b; a=a-c; a=a^(c >> 13);
b=b-c; b=b-a; b=b^(a << 8);
c=c-a; c=c-b; c=c^(b >> 13);
a=a-b; a=a-c; a=a^(c >> 12);
b=b-c; b=b-a; b=b^(a << 16);
c=c-a; c=c-b; c=c^(b >> 5);
a=a-b; a=a-c; a=a^(c >> 3);
b=b-c; b=b-a; b=b^(a << 10);
c=c-a; c=c-b; c=c^(b >> 15);
return c;
}
using namespace std;
int main()
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
cudaDeviceProp deviceProp;
cublasStatus_t err;
cudaGetDeviceProperties(&deviceProp, 0);
printf("Detected %d devices \n", deviceCount);
printf("Device %d has compute capability %d.%d:\n\t maxshmem %d. \n\t maxthreads per block %d. \n\t max threads dim %d. %d. %d.\n ", 0,
deviceProp.major, deviceProp.minor, deviceProp.sharedMemPerBlock, deviceProp.maxThreadsPerBlock, deviceProp.maxThreadsDim[0],
deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
cudaEvent_t start_d, stop_d;
cudaEventCreate(&start_d);
cudaEventCreate(&stop_d);
//RND insicialization
unsigned long seed = mix(clock(), time(NULL), 0);
srand(seed);
int N=8192;
int Nloops=2;
int memsize=N*N*sizeof(double);
double *a = (double *)malloc(memsize);
double *b = (double *)malloc(memsize);
double *c = (double *)malloc(memsize);
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++){
a[i*N+j]=((double)rand() / RAND_MAX);
b[i*N+j]=((double)rand() / RAND_MAX);
}
double *a_d, *b_d, *c_d;
cudaMalloc((void **)&a_d, memsize);
cudaMalloc((void **)&b_d, memsize);
cudaMalloc((void **)&c_d, memsize);
cudaMemcpy(a_d, a, memsize, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b, memsize, cudaMemcpyHostToDevice);
cublasHandle_t handle;
cublasCreate(&handle);
double alpha=1.0;
double beta=0.0;
auto start = chrono::steady_clock::now();
clock_t start1;
start1 = clock();
cudaEventRecord(start_d);
if (cudaGetLastError() != cudaSuccess)
printf("%s \n",cudaGetErrorString(cudaGetLastError()));
for (int i=0; i<Nloops; i++)
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N,N,N,&alpha,a_d,N,b_d,N,&beta,c_d,N);
cudaEventRecord(stop_d);
cudaDeviceSynchronize();
auto end = chrono::steady_clock::now();
start1 = clock() - start1;
cudaEventSynchronize(stop_d);
cublasDestroy(handle);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start_d, stop_d);
std::cout << "Cuda event " << milliseconds /Nloops << " ms" <<endl;
std::cout << " time elapsed " << start1 / (double)CLOCKS_PER_SEC /Nloops << '\n';
cout << "time elapsed for 1 multiplication: " << ((double)chrono::duration_cast<chrono::microseconds>(end-start).count() )/(Nloops*1000.0)<< " milliseconds" <<endl;
free(a); free(b); free(c);
cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
}
这是产生一致结果的 python 代码:
import cupy as cp
import time
iterations = 2
a = cp.random.rand(8192,8192).astype(cp.float64)
b = cp.random.rand(8192,8192).astype(cp.float64)
def ab(a,b,iterations):
for i in range(iterations):
cp.matmul(a,b,out=None)
ab(a,b,1) # warm up
cp.cuda.Device(0).synchronize()
t1 = time.time()
ab(a,b,iterations)
cp.cuda.Device(0).synchronize()
t2 = time.time()
total = (t2-t1)/iterations
print(total)
好的,我找到了答案。在我 link 我的问题中的那个 table 中,有一个脚注说对于计算能力 7.5(这里就是这种情况),性能是 2,但不是 32,对于浮点数是64,这意味着 multiplication-addition 双精度运算比浮点运算慢 32 倍。
如果 float 和 double 问题都完全 arithmetic-bound,我预计减速约为 32。实际上,减速稍微小一些(2000/75 ~ 27),这可能是浮动问题的结果bandwidth-bound,或者可能与其他事情有关。