PyCUDA 填充 np.array 太慢
PyCUDA fills np.array too slow
我想用这段代码绘制实心三角形:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 10
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, int* canvas_shape, float *z_buffer, int x, int y, float z, unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, int *canvas_shape, float *z_buffer, float *ll, float *rl, float *lu, float *ru, unsigned char *color, int height, int min_x, int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y / height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = (float)(global_thread_x - (left_x - min_x)) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio17\Community\VC\Tools\MSVC.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.InOut(canvas) # unsigned char *canvas
param_canvas_shape = cuda.In(np.array(canvas.shape, dtype=np.int32)) # int *canvas_shape
param_z_buffer = cuda.InOut(z_buffer) # float *z_buffer
param_ll = cuda.In(np.array([100, 200, frames_count], dtype=np.float32)) # float *ll
param_rl = cuda.In(np.array([500, 200, frames_count], dtype=np.float32)) # float *rl
param_lu = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *lu
param_ru = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *ru
param_color = cuda.In(np.array(color, dtype=np.uint8)) # unsigned char *color
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
使用 executions_per_frame = 1
(对于 1 次迭代,C 函数将被调用 1 次)我得到了 ~100 fps,使用 executions_per_frame = 10
- ~30 fps。它看起来不像它应该的那样富有成效。我错过了什么?
此外,这对特定任务有好处吗?
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
pycuda_draw_triangle(..., block=block_dim, grid=grid_dim)
或者只是
pycuda_draw_triangle(..., block=(1, 1, 1), grid=(fragment_width, fragment_height))
Python3.6.9,CUDA 10.0,RTX 2060
UPD:
我通过将 cuda.In()
和 cuda.InOut()
替换为 cuda.mem_alloc()
,设法在 executions_per_frame = 10
上将性能提高到 150 fps,但现在 CPU 的使用率接近 30 %。我们还能做得更好吗?
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio17\Community\VC\Tools\MSVC.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
canvas_width, canvas_height = 1000, 800
param_canvas = cuda.mem_alloc(canvas_width * canvas_height * 3) # unsigned char *canvas
param_canvas_shape = cuda.mem_alloc(12) # int *canvas_shape
param_z_buffer = cuda.mem_alloc(canvas_width * canvas_height * 4) # float *z_buffer
param_ll = cuda.mem_alloc(12) # float *ll
param_rl = cuda.mem_alloc(12) # float *rl
param_lu = cuda.mem_alloc(12) # float *lu
param_ru = cuda.mem_alloc(12) # float *ru
param_color = cuda.mem_alloc(3) # unsigned char *color
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
cuda.memcpy_htod(param_canvas, canvas)
cuda.memcpy_htod(param_canvas_shape, np.array(canvas.shape, dtype=np.int32))
cuda.memcpy_htod(param_z_buffer, z_buffer)
cuda.memcpy_htod(param_ll, np.array([100, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_rl, np.array([500, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_lu, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_ru, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_color, np.array(color, dtype=np.uint8))
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
cuda.memcpy_dtoh(canvas, param_canvas)
cuda.memcpy_dtoh(z_buffer, param_z_buffer)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
这里的基本原则是您希望从性能循环中获取所有不必要的东西。您对性能的定义是 fps,因此您希望从 while
循环中取出不必要的所有内容。
性能的最大限制因素是循环开销 - 一些必须完成的工作是 "independent" 您为 executions_per_frame
设置的 "independent"。
在不求助于分析器的情况下,我们可以获得一些开销的估计值以及 executions_per_frame
每次迭代的贡献。我们将在 executions_per_frame
的两个不同值下测量整体性能 (fps),然后求解 2 个变量中的 2 个方程(开销 c
和每次迭代成本 x
):
1/fps (milliseconds per frame) = c + ix
我的 GPU 比你的 RTX2060 慢一点,所以当我 运行 你的原始代码有两个不同的 executions_per_frame
(i
) 值 1 和 10 时,我观察到:
i=1: 80 fps = 12 ms/frame
i=10: 11 fps = 90 ms/frame
因此我们的 2 个等式是:
c + (1)x = 12
c + (10)x = 90
求解,我们有 c
= 3ms 和 x
= 9ms。因此,每帧有一些 "fixed" 约 3 毫秒的开销,并且每个 executions_per_frame
的迭代有一些约 9 毫秒的可变开销。我们绝对可以攻击的是 x
数字(太大了),但我们可能不会在 c
数字上取得什么进展。
造成您最初问题的一个重要因素是 pycuda .In
、.Out
和 .InOut
指定每个内核启动 要完成的数据移动.这意味着每次您在 for 循环中启动内核时,您都在以这种方式移动指定的数据。这几乎肯定不是您的算法所必需的。
因此,让我们重构代码以删除该特征并进行另一次测量。下面是在 linux 上被隐藏到 运行 的代码(因为那是我工作的地方——看起来你可能在 windows 上)并且还做了这些事情:
有点琐碎,但我已经将您的内核内除法运算 height
转换为 1/height
的乘法运算。由于您将 height
作为内核参数传递,并且仅将其用于该 1 操作,因此我重构为传递 1/height
并使其成为乘法。不是很重要。
重构(删除)您对 .In
和 .InOut
的所有使用,以使用 cuda.mem_alloc
和 cuda.memcpy_XXXX
做类似的事情。
我已经将一些数据移动(零)转换为使用 cuda.memset_XXXX
。它比移动数据更快。
我在时序关键循环中移动了一些操作。
重要的是,我不会将 z-buffer 移回主机。如果您需要它(您显示的代码不需要它),您将不得不把它加回来,这会对性能有所影响。
这是我重构的代码:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 100
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, const int* canvas_shape, float *z_buffer, int x, int y, float z, const unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, const int *canvas_shape, float *z_buffer, const float *ll, const float *rl, const float *lu, const float *ru, const unsigned char *color, const float height, const int min_x, const int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y * height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = ((float)(global_thread_x - (left_x - min_x))) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
pycuda_src_module = SourceModule(pycuda_code)
# pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
pycuda_draw_triangle = pycuda_src_module.get_function("draw_triangle")
time_start, frames_count, fps = time.time(), 0, 0
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
# B G R
color = [200, 0, 100]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.mem_alloc(canvas.nbytes) # unsigned char *canvas
canvas_shape = np.array(canvas.shape, dtype=np.int32)
param_canvas_shape = cuda.mem_alloc(canvas_shape.nbytes) # int *canvas_shape
cuda.memcpy_htod(param_canvas_shape, canvas_shape)
param_z_buffer = cuda.mem_alloc(z_buffer.nbytes) # float *z_buffer
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
param_rl = cuda.mem_alloc(param_ll_h.nbytes)
param_lu = cuda.mem_alloc(param_ll_h.nbytes)
param_ru = cuda.mem_alloc(param_ll_h.nbytes)
param_ll = cuda.mem_alloc(param_ll_h.nbytes)
color_h = np.array(color, dtype=np.uint8)
param_color = cuda.mem_alloc(color_h.nbytes)
cuda.memcpy_htod(param_color, color_h)
while True:
key = cv2.waitKey(1)
if key == 27:
break
cuda.memset_d8(param_canvas, 0, canvas.nbytes)
cuda.memset_d8(param_z_buffer, 0, z_buffer.nbytes)
cuda.memcpy_htod(param_ll, param_ll_h)
cuda.memcpy_htod(param_rl, param_rl_h)
cuda.memcpy_htod(param_lu, param_lu_h)
cuda.memcpy_htod(param_ru, param_ru_h)
param_height = np.float32(1.0/fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
fps = frames_count / (time.time() - time_start)
cuda.memcpy_dtoh(canvas, param_canvas)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
此代码 运行 速度相当快,因此我们可以 运行 在 10 次迭代和 100 次迭代时对测量进行计时,而不是像以前那样进行 1 次和 10 次迭代。在 100 次迭代时,我得到大约 60fps,在 10 次迭代时,我得到大约 80fps。 (在 1 次迭代中,我仍然只能达到 85 fps 左右)。做同样的运算:
c + (10)x = 12ms
c + (100)x = 16ms
所以 x
= 4/90 = 0.05 毫秒和 c
= 11 毫秒。 (这两种情况之间的精确等价是不必要的。我们正在建模一些可能不是完全线性的东西,这是一个粗略的模型)。重点是我们大大降低了每 executions_per_frame
次迭代的成本,同时对每帧的固定开销几乎没有改进。
所以如果你的目标是每帧执行多次,这将是一个重要的方法。如果您的目标真的只是每帧执行 1 次,这并没有多大帮助。
例如,通过此更改,cv2.imshow
操作可能需要几毫秒,在这种情况下,最终将成为性能的限制因素(我不知道,只是猜测).为了取得进一步的进展,此时的建议是仔细分析 while
循环中发生的事情,以查看每帧成本在哪里。
我想用这段代码绘制实心三角形:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 10
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, int* canvas_shape, float *z_buffer, int x, int y, float z, unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, int *canvas_shape, float *z_buffer, float *ll, float *rl, float *lu, float *ru, unsigned char *color, int height, int min_x, int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y / height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = (float)(global_thread_x - (left_x - min_x)) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio17\Community\VC\Tools\MSVC.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.InOut(canvas) # unsigned char *canvas
param_canvas_shape = cuda.In(np.array(canvas.shape, dtype=np.int32)) # int *canvas_shape
param_z_buffer = cuda.InOut(z_buffer) # float *z_buffer
param_ll = cuda.In(np.array([100, 200, frames_count], dtype=np.float32)) # float *ll
param_rl = cuda.In(np.array([500, 200, frames_count], dtype=np.float32)) # float *rl
param_lu = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *lu
param_ru = cuda.In(np.array([400, 500, frames_count], dtype=np.float32)) # float *ru
param_color = cuda.In(np.array(color, dtype=np.uint8)) # unsigned char *color
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
使用 executions_per_frame = 1
(对于 1 次迭代,C 函数将被调用 1 次)我得到了 ~100 fps,使用 executions_per_frame = 10
- ~30 fps。它看起来不像它应该的那样富有成效。我错过了什么?
此外,这对特定任务有好处吗?
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
pycuda_draw_triangle(..., block=block_dim, grid=grid_dim)
或者只是
pycuda_draw_triangle(..., block=(1, 1, 1), grid=(fragment_width, fragment_height))
Python3.6.9,CUDA 10.0,RTX 2060
UPD:
我通过将 cuda.In()
和 cuda.InOut()
替换为 cuda.mem_alloc()
,设法在 executions_per_frame = 10
上将性能提高到 150 fps,但现在 CPU 的使用率接近 30 %。我们还能做得更好吗?
if __name__ == '__main__':
if (os.system("cl.exe")):
os.environ['PATH'] += ';' + r"C:\Program Files\Microsoft Visual Studio17\Community\VC\Tools\MSVC.16.27023\bin\Hostx64\x64"
if (os.system("cl.exe")):
raise RuntimeError("cl.exe still not found")
pycuda_src_module = SourceModule(pycuda_code, no_extern_c=True)
pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
canvas_width, canvas_height = 1000, 800
param_canvas = cuda.mem_alloc(canvas_width * canvas_height * 3) # unsigned char *canvas
param_canvas_shape = cuda.mem_alloc(12) # int *canvas_shape
param_z_buffer = cuda.mem_alloc(canvas_width * canvas_height * 4) # float *z_buffer
param_ll = cuda.mem_alloc(12) # float *ll
param_rl = cuda.mem_alloc(12) # float *rl
param_lu = cuda.mem_alloc(12) # float *lu
param_ru = cuda.mem_alloc(12) # float *ru
param_color = cuda.mem_alloc(3) # unsigned char *color
time_start, frames_count, fps = time.time(), 0, 0
while True:
key = cv2.waitKeyEx(1)
if key == 27:
break
fragment_width, fragment_height = 400, 300
color = [0, 0, 200]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
cuda.memcpy_htod(param_canvas, canvas)
cuda.memcpy_htod(param_canvas_shape, np.array(canvas.shape, dtype=np.int32))
cuda.memcpy_htod(param_z_buffer, z_buffer)
cuda.memcpy_htod(param_ll, np.array([100, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_rl, np.array([500, 200, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_lu, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_ru, np.array([400, 500, frames_count], dtype=np.float32))
cuda.memcpy_htod(param_color, np.array(color, dtype=np.uint8))
param_height = np.int32(fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
cuda.memcpy_dtoh(canvas, param_canvas)
cuda.memcpy_dtoh(z_buffer, param_z_buffer)
frames_count += 1
fps = frames_count / (time.time() - time_start)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
这里的基本原则是您希望从性能循环中获取所有不必要的东西。您对性能的定义是 fps,因此您希望从 while
循环中取出不必要的所有内容。
性能的最大限制因素是循环开销 - 一些必须完成的工作是 "independent" 您为 executions_per_frame
设置的 "independent"。
在不求助于分析器的情况下,我们可以获得一些开销的估计值以及 executions_per_frame
每次迭代的贡献。我们将在 executions_per_frame
的两个不同值下测量整体性能 (fps),然后求解 2 个变量中的 2 个方程(开销 c
和每次迭代成本 x
):
1/fps (milliseconds per frame) = c + ix
我的 GPU 比你的 RTX2060 慢一点,所以当我 运行 你的原始代码有两个不同的 executions_per_frame
(i
) 值 1 和 10 时,我观察到:
i=1: 80 fps = 12 ms/frame
i=10: 11 fps = 90 ms/frame
因此我们的 2 个等式是:
c + (1)x = 12
c + (10)x = 90
求解,我们有 c
= 3ms 和 x
= 9ms。因此,每帧有一些 "fixed" 约 3 毫秒的开销,并且每个 executions_per_frame
的迭代有一些约 9 毫秒的可变开销。我们绝对可以攻击的是 x
数字(太大了),但我们可能不会在 c
数字上取得什么进展。
造成您最初问题的一个重要因素是 pycuda .In
、.Out
和 .InOut
指定每个内核启动 要完成的数据移动.这意味着每次您在 for 循环中启动内核时,您都在以这种方式移动指定的数据。这几乎肯定不是您的算法所必需的。
因此,让我们重构代码以删除该特征并进行另一次测量。下面是在 linux 上被隐藏到 运行 的代码(因为那是我工作的地方——看起来你可能在 windows 上)并且还做了这些事情:
有点琐碎,但我已经将您的内核内除法运算
height
转换为1/height
的乘法运算。由于您将height
作为内核参数传递,并且仅将其用于该 1 操作,因此我重构为传递1/height
并使其成为乘法。不是很重要。重构(删除)您对
.In
和.InOut
的所有使用,以使用cuda.mem_alloc
和cuda.memcpy_XXXX
做类似的事情。我已经将一些数据移动(零)转换为使用
cuda.memset_XXXX
。它比移动数据更快。我在时序关键循环中移动了一些操作。
重要的是,我不会将 z-buffer 移回主机。如果您需要它(您显示的代码不需要它),您将不得不把它加回来,这会对性能有所影响。
这是我重构的代码:
import cv2
import numpy as np
import os
import time
import math
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
executions_per_frame = 100
pycuda_code = """
__device__ void set_pixel_3d(unsigned char *canvas, const int* canvas_shape, float *z_buffer, int x, int y, float z, const unsigned char *color) {
int index = y * canvas_shape[1] + x;
if (z > z_buffer[index]) {
z_buffer[index] = z;
for (int i = 0; i < canvas_shape[2]; ++i) {
canvas[index * canvas_shape[2] + i] = color[i];
}
}
}
// l/r - left/right
// l/u - lower/upper
__global__ void draw_triangle(unsigned char *canvas, const int *canvas_shape, float *z_buffer, const float *ll, const float *rl, const float *lu, const float *ru, const unsigned char *color, const float height, const int min_x, const int min_y) {
int global_thread_x = threadIdx.x + blockIdx.x * blockDim.x;
int global_thread_y = threadIdx.y + blockIdx.y * blockDim.y;
float k1 = (float)global_thread_y * height;
int left_x = (int)(ll[0] + (lu[0] - ll[0]) * k1);
int right_x = (int)(rl[0] + (ru[0] - rl[0]) * k1);
float left_z = ll[2] + (lu[2] - ll[2]) * k1;
float right_z = rl[2] + (ru[2] - rl[2]) * k1;
int actual_x = min_x + global_thread_x;
if (left_x != right_x && left_x <= actual_x && actual_x <= right_x) {
int actual_y = min_y + global_thread_y;
float k2 = ((float)(global_thread_x - (left_x - min_x))) / (right_x - left_x);
float actual_z = left_z + (right_z - left_z) * k2;
set_pixel_3d(canvas, canvas_shape, z_buffer, actual_x, actual_y, actual_z, color);
}
}
"""
if __name__ == '__main__':
pycuda_src_module = SourceModule(pycuda_code)
# pycuda_draw_triangle = pycuda_src_module.get_function("_Z13draw_trianglePhPiPfS1_S1_S1_S1_S_iii")
pycuda_draw_triangle = pycuda_src_module.get_function("draw_triangle")
time_start, frames_count, fps = time.time(), 0, 0
canvas_width, canvas_height = 1000, 800
canvas = np.zeros((canvas_height, canvas_width, 3), dtype=np.uint8)
z_buffer = np.zeros((canvas_height, canvas_width), dtype=np.float32)
fragment_width, fragment_height = 400, 300
# B G R
color = [200, 0, 100]
block_side = 32
block_dim = (block_side, block_side, 1)
grid_dim = (math.ceil(fragment_width / block_side), math.ceil(fragment_height / block_side))
param_canvas = cuda.mem_alloc(canvas.nbytes) # unsigned char *canvas
canvas_shape = np.array(canvas.shape, dtype=np.int32)
param_canvas_shape = cuda.mem_alloc(canvas_shape.nbytes) # int *canvas_shape
cuda.memcpy_htod(param_canvas_shape, canvas_shape)
param_z_buffer = cuda.mem_alloc(z_buffer.nbytes) # float *z_buffer
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
param_rl = cuda.mem_alloc(param_ll_h.nbytes)
param_lu = cuda.mem_alloc(param_ll_h.nbytes)
param_ru = cuda.mem_alloc(param_ll_h.nbytes)
param_ll = cuda.mem_alloc(param_ll_h.nbytes)
color_h = np.array(color, dtype=np.uint8)
param_color = cuda.mem_alloc(color_h.nbytes)
cuda.memcpy_htod(param_color, color_h)
while True:
key = cv2.waitKey(1)
if key == 27:
break
cuda.memset_d8(param_canvas, 0, canvas.nbytes)
cuda.memset_d8(param_z_buffer, 0, z_buffer.nbytes)
cuda.memcpy_htod(param_ll, param_ll_h)
cuda.memcpy_htod(param_rl, param_rl_h)
cuda.memcpy_htod(param_lu, param_lu_h)
cuda.memcpy_htod(param_ru, param_ru_h)
param_height = np.float32(1.0/fragment_height) # int height
param_min_x = np.int32(100) # int min_x
param_min_y = np.int32(200) # int min_y
for i in range(executions_per_frame):
pycuda_draw_triangle(param_canvas, param_canvas_shape,
param_z_buffer, param_ll, param_rl, param_lu, param_ru,
param_color, param_height, param_min_x, param_min_y,
block=block_dim, grid=grid_dim)
frames_count += 1
param_ll_h = np.array([100, 200, frames_count], dtype=np.float32)
param_rl_h = np.array([500, 200, frames_count], dtype=np.float32)
param_lu_h = np.array([400, 500, frames_count], dtype=np.float32)
param_ru_h = np.array([400, 500, frames_count], dtype=np.float32)
fps = frames_count / (time.time() - time_start)
cuda.memcpy_dtoh(canvas, param_canvas)
cv2.putText(canvas, "fps={:0.2f}".format(fps), (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))
cv2.imshow('Scene', canvas)
cv2.destroyAllWindows()
此代码 运行 速度相当快,因此我们可以 运行 在 10 次迭代和 100 次迭代时对测量进行计时,而不是像以前那样进行 1 次和 10 次迭代。在 100 次迭代时,我得到大约 60fps,在 10 次迭代时,我得到大约 80fps。 (在 1 次迭代中,我仍然只能达到 85 fps 左右)。做同样的运算:
c + (10)x = 12ms
c + (100)x = 16ms
所以 x
= 4/90 = 0.05 毫秒和 c
= 11 毫秒。 (这两种情况之间的精确等价是不必要的。我们正在建模一些可能不是完全线性的东西,这是一个粗略的模型)。重点是我们大大降低了每 executions_per_frame
次迭代的成本,同时对每帧的固定开销几乎没有改进。
所以如果你的目标是每帧执行多次,这将是一个重要的方法。如果您的目标真的只是每帧执行 1 次,这并没有多大帮助。
例如,通过此更改,cv2.imshow
操作可能需要几毫秒,在这种情况下,最终将成为性能的限制因素(我不知道,只是猜测).为了取得进一步的进展,此时的建议是仔细分析 while
循环中发生的事情,以查看每帧成本在哪里。