Python:运行 嵌套循环,二维移动 window,并行
Python: Running nested loop, 2D moving window, in Parallel
我处理地形数据。对于一个特定的问题,我在 Python 中编写了一个函数,它使用特定大小的移动 window 来压缩矩阵(高程网格)。然后我必须对此 window 进行分析,并将单元格设置在 window 中心的结果值。
我的最终输出是一个与我的原始矩阵大小相同的矩阵,该矩阵已根据我的分析进行了更改。这个问题在小范围内 运行 需要 11 个小时,所以我认为并行化内部循环会加速事情的进展。 或者,也可能有一个巧妙的矢量化解决方案...
看下面我的函数,DEM
是一个二维numpy数组,w
是window的大小。
def RMSH_det(DEM, w):
import numpy as np
from scipy import signal
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
# nw=(w*2)**2
# x = np.arange(0,nw)
for i in np.arange(w+1,nrows-w):
for j in np.arange(w+1,ncols-w):
d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))
win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]
if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan
else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return(rms)
我搜索了 SO/SE 我的问题的解决方案并遇到了许多嵌套 for 循环的例子并试图 运行 它们并行。我一直在努力调整我的代码以匹配示例,并希望得到一些帮助。这个问题的解决方案将帮助我使用我拥有的其他几个移动 window 函数。
到目前为止,我已经将内部循环移动到它自己的函数中,可以从外部循环中调用它:
def inLoop(i, w, DEM,rms,ncols):
for j in np.arange(w+1,ncols-w):
d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))
win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]
if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan
else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return(rms)
但我不确定使用需要输入到内部循环中的必要变量对 Pool 调用进行编码的正确方法。请参阅下面的外循环:
for i in np.arange(w+1,nrows-w):
number_of_workers = 8
with Pool(number_of_workers) as p:
#call the pool
p.starmap(inLoop, [i, w, DEM, rms, ncols])
剩余问题:
这段代码甚至可以通过并行化来优化吗?
如何成功存储并行嵌套 for 循环的结果?
Q : This problem takes 11 hours to run on a small area,... stay tuned, we can and we'll get under 20 [min] !!
提供了适当的解释,为此我感谢 O/P 作者:
# DEM.shape = [nrows, ncols] = [ 1355, 1165 ]
# DEM.dtype = float32
# .flags = C_CONTIGUOUS : True
# F_CONTIGUOUS : False
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True
# WRITEBACKIFCOPY : False
# UPDATEIFCOPY : False
我尝试检查代码并设置一个更高效代码的模型,然后再将所有流行的和现成的使用 numpy + numba
类固醇,临时 numpy
-only 结果在 的 [100,100]
DEM-grid 样本上有效
~ 6 [s]
在所述内核 window 宽度 w = 10
相同,对于 [200,200]
DEM 网格,采用 ~ 36 [s]
- 显然,缩放比例为 ~ O( N^2 )
相同,对于 [1000,1000]
DEM 网格,采用 ~ 1077 [s] ~ 17.6 [min]
哇!
[1000,1000]
DEM-grid 上的一个字段 .jit
试验目前正在测试中,将在完成后更新 post + 一旦 numba.jit()
代码将享有运行进一步加速的结果
到目前为止,很有希望,不是吗?
如果你现在 @morrismc 测试你的原样代码,在 [100,100]
矩阵上,我们已经可以猜出主体 speedup[=96= 的实现范围],甚至在 运行ning 测试完成之前。
>>> pass; import numpy as np
>>> from zmq import Stopwatch; clk = Stopwatch()
>>>
>>> size = 100; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
6492192 [us]
NumOf_np.nan-s was 0
>>> size = 200; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
35650629 [us]
NumOf_np.nan-s was 0
>>> size = 1000; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
1058702889 [us]
NumOf_np.nan-s was 0
所有这些都在 scipy
1.2.1 上,因此没有 1.3.1 的好处可能进一步加速
A numba.jit()
LLVM 编译代码。糟糕,慢了?
numba.jit()
-加速显示大约 200 [ms]
更糟 运行time on [100,100]
DEM 网格,已指定签名(因此此处未产生临时分析成本)和 nogil = True
('0.43.1+0.g8dabe7abe.dirty' 不是最新的,还)
猜猜这里没有更多收获,没有将游戏移动到已编译的 Cython
区域,但大约有几十分钟而不是几十小时, Alea Iacta Est - 只是 numpy
智能矢量化代码规则!
结语:
如果原始算法是正确的(并且在源代码中留下一些疑问以进行任何进一步的改进工作),任何尝试 运行 其他形式的 [PARALLEL]
code-execution-flow 在这里无济于事( kernel-windows[w,w] 是 DEM 网格内存布局的非常小且不连续的区域,memory-I/O 成本是这里 运行 时间预算的主要部分,一些更好的索引可能会提高缓存行的重用,但总体努力远远超出预算,因为 的目标从 ~ 11 [hrs]
下降到大约 ~ 6 [hrs]
比成功达到 还多 ~ 20 [min]
运行 次[1300,1100]
float32 DEM 网格
代码保持原样(非 PEP-8),因为 [DOC.me], [TEST.me]
和 [=72] 的所有附加教学价值=][PERF.me]
QA 阶段,所以所有类型的 PEP-isto-evangelisators 都接受 O/P 作者的观点,左全屏宽度布局,以便允许理解 WHY 并改进代码,如果去掉注释,代码将失去 her/his 进一步提高代码性能的方法。谢谢
@jit( [ "int32( float32[:,:], int32, float32[:,:] )", ], nogil = True ) # numba.__version__ '0.43.1+0.g8dabe7abe.dirty'
def RMSH_det_jit( DEMf32, w, rmsRESULTf32 ): # pre-allocate rmsRESULTf32[:,:] externally
#import numpy as np
#from scipy import signal
#
# [nrows, ncols] = np.shape( DEM ) # avoid ~ [ 1355, 1165 ]
# # DEM.dtype = float32
# # .flags = C_CONTIGUOUS : True
# # F_CONTIGUOUS : False
# # OWNDATA : True
# # WRITEABLE : True
# # ALIGNED : True
# # WRITEBACKIFCOPY : False
# # UPDATEIFCOPY : False
#
rmsRESULTf32[:,:] = np.nan # .STO[:,:] np.nan-s, using in-place assignment into the by-ref passed, externally pre-allocated np.ndarray
dtdWIN = np.ones( ( 2 * w - 1, # .ALLOC once, re-use 1M+ times
2 * w - 1 ) )
a_div_by_nz_minus1 = 1. / ( dtdWIN.size - 1 ) # .SET float CONST with about a ~1M+ re-use
a_num_of_NaNs = 0 # .SET i4 bonus value, ret'd as a side-effect of the signature ...
# rms = DEM*np.nan # avoid ( pre-alloc rmsRESULTf32 ) externally create and pass a right-sized, empty array to store all results
# nw = ( w * 2 )**2
# x = np.arange( 0, nw )
# 11..1344
#or i in np.arange( w+1, nrows-w ): # w ~ 10 -> [11:1344, 11:1154]
for i in np.arange( w+1, DEMf32.shape[0]-w ): # ??? never touches DEM-row/column[0]?? or off-by-one indexing error ???
fromI = i - w # .UPD ALAP
tillI = i + w - 1 # .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
# 11..1154
#or j in np.arange( w+1, ncols-w ):
for j in np.arange( w+1, DEMf32.shape[1]-w ):
fromJ = j - w # .UPD ALAP
tillJ = j + w - 1 # .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
# 1..1334:21..1354 # ??? never touches first/last DEM-row/column??
# d1 = np.int64( np.arange( i-w, i+w ) ) # AVOID: 1M+ times allocated, yet never consumed, but their edge values
# d2 = np.int64( np.arange( j-w, j+w ) ) # AVOID: 1M+ times allocated, yet never consumed, but their edge values
# win = DEM[ d1[0]:d1[-1], # AVOID: while a .view-only, no need to 1M+ times instantiate a "kernel"-win(dow] ( this will create a np.view into the original DEM, not a copy ! )
# d2[0]:d2[-1] # ?.or.? NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
# ] # ?.or.? NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
dtdWIN[:,:] = DEMf32[fromI:tillI, fromJ:tillJ] # NOT a .view-only, but a .copy() re-populated into a just once and only once pre-allocated dtdWIN, via an in-place copy
#f np.max( np.isnan( win ) ) == 1: # AVOID: 1M+ times full-range scan, while any first np.nan decides the game and no need to scan "the rest"
if np.any( np.isnan( dtdWIN ) ): # "density" of np.nan-s determine, if this is a good idea to pre-store
a_num_of_NaNs += 1 # .INC
continue # .NOP/LOOP from here, already pre-stored np.nan-s for this case
# rms[i,j] = np.nan # DUP ( already stored in initialisation ... )
else:
#in = signal.detrend( win, type = 'linear' ) # REALLY?: in-place modification of DEM-matrix ???
dtdWIN = signal.detrend( dtdWIN, type = 'linear' ) # in scipy-v1.3.1+ can mod in-place, overwrite_data = True ) # REMOVE OLS-fit-linear trend
dtdWIN = signal.detrend( dtdWIN, type = 'constant' ) # in scipy-v1.3.1+ can mod in-place, overwrite_data = True ) # REMOVE mean
#z = np.reshape( win, -1 ) # AVOID:~1M+ re-counting constant value, known from w directly
#nz = np.size( z ) # AVOID:~1M+ re-counting constant value, known from w directly
#rootms = np.sqrt( 1 / ( nz - 1 ) * np.sum( ( z - np.mean( z ) )**2 ) )
#rms[i,j] = rootms
rmsRESULTf32[i,j] = np.sqrt( a_div_by_nz_minus1 # .STO a "scaled"
* np.dot( dtdWIN,
dtdWIN.T
).sum()
# np.sum( ( dtdWIN # SUM of
# # - dtdWIN.mean() # mean-removed ( ALREADY done via scipy.signal.detrend( 'const' ) above )
# )**2 # SQUARES
# )
) # ROOT
return( a_num_of_NaNs ) # ret i4
使用 Numba 的解决方案
在某些情况下,如果您使用的所有功能都受支持,这很容易做到。在您的代码中,win = signal.detrend(win, type = 'linear')
是您必须在 Numba 中实现的部分,因为不支持此功能。
在 Numba 中实施去趋势
如果您查看 detrend 的 source-code,并提取与您的问题相关的部分,它可能如下所示:
@nb.njit()
def detrend(w):
Npts=w.shape[0]
A=np.empty((Npts,2),dtype=w.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
coef, resids, rank, s = np.linalg.lstsq(A, w.T)
out=w.T- np.dot(A, coef)
return out.T
我还为 np.max(np.isnan(win)) == 1
实施了更快的解决方案
@nb.njit()
def isnan(win):
for i in range(win.shape[0]):
for j in range(win.shape[1]):
if np.isnan(win[i,j]):
return True
return False
主要功能
因为我在这里使用了 Numba,所以并行化非常简单,只是外层循环的 prange 和
import numpy as np
import numba as nb
@nb.njit(parallel=True)
def RMSH_det_nb(DEM, w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
for i in nb.prange(w+1,nrows-w):
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend(win)
z = win.flatten()
nz = z.size
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return rms
计时(小例子)
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb(DEM, w) #approx. 55 times faster
#29 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
更大数组的时间
w = 10
DEM=np.random.rand(1355, 1165).astype(np.float32)
%timeit res2=RMSH_det_nb(DEM, w)
#6.63 s ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[编辑] 使用正态方程的实现
此方法的数值精度较低。虽然这个解决方案要快得多。
@nb.njit()
def isnan(win):
for i in range(win.shape[0]):
for j in range(win.shape[1]):
if win[i,j]==np.nan:
return True
return False
@nb.njit()
def detrend(w):
Npts=w.shape[0]
A=np.empty((Npts,2),dtype=w.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
coef, resids, rank, s = np.linalg.lstsq(A, w.T)
out=w.T- np.dot(A, coef)
return out.T
@nb.njit()
def detrend_2(w,T1,A):
T2=np.dot(A.T,w.T)
coef=np.linalg.solve(T1,T2)
out=w.T- np.dot(A, coef)
return out.T
@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq(DEM,w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
Npts=w*2-1
A=np.empty((Npts,2),dtype=DEM.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
T1=np.dot(A.T,A)
nz = Npts**2
for i in nb.prange(w+1,nrows-w):
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend_2(win,T1,A)
rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
rms[i,j] = rootms
return rms
计时
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb_normal_eq(DEM,w)
#7.97 ms ± 89.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
使用正规方程的优化解
重复使用临时数组以避免昂贵的内存分配,并使用矩阵乘法的自定义实现。这仅适用于非常小的矩阵,在大多数其他情况下 np.dot (sgeemm) 会快很多。
@nb.njit()
def matmult_2(A,B,out):
for j in range(B.shape[1]):
acc1=nb.float32(0)
acc2=nb.float32(0)
for k in range(B.shape[0]):
acc1+=A[0,k]*B[k,j]
acc2+=A[1,k]*B[k,j]
out[0,j]=acc1
out[1,j]=acc2
return out
@nb.njit(fastmath=True)
def matmult_mod(A,B,w,out):
for j in range(B.shape[1]):
for i in range(A.shape[0]):
acc=nb.float32(0)
acc+=A[i,0]*B[0,j]+A[i,1]*B[1,j]
out[j,i]=acc-w[j,i]
return out
@nb.njit()
def detrend_2_opt(w,T1,A,Tempvar_1,Tempvar_2):
T2=matmult_2(A.T,w.T,Tempvar_1)
coef=np.linalg.solve(T1,T2)
return matmult_mod(A, coef,w,Tempvar_2)
@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq_opt(DEM,w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
Npts=w*2-1
A=np.empty((Npts,2),dtype=DEM.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
T1=np.dot(A.T,A)
nz = Npts**2
for i in nb.prange(w+1,nrows-w):
Tempvar_1=np.empty((2,Npts),dtype=DEM.dtype)
Tempvar_2=np.empty((Npts,Npts),dtype=DEM.dtype)
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend_2_opt(win,T1,A,Tempvar_1,Tempvar_2)
rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
rms[i,j] = rootms
return rms
计时
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb_normal_eq_opt(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb_normal_eq_opt(DEM,w)
#4.66 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
isnan 时间安排
这个函数完全是另一种实现。如果 NaN 恰好位于数组的开头,则速度要快得多,但无论如何,即使没有,也有一些加速。我用小数组(大约 window 大小)和 @user3666197 建议的大数组对它进行了基准测试。
case_1=np.full((20,20),np.nan)
case_2=np.full((20,20),0.)
case_2[10,10]=np.nan
case_3=np.full((20,20),0.)
case_4 = np.full( ( int( 1E4 ), int( 1E4 ) ),np.nan)
case_5 = np.ones( ( int( 1E4 ), int( 1E4 ) ) )
%timeit np.any(np.isnan(case_1))
%timeit np.any(np.isnan(case_2))
%timeit np.any(np.isnan(case_3))
%timeit np.any(np.isnan(case_4))
%timeit np.any(np.isnan(case_5))
#2.75 µs ± 73.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#2.75 µs ± 46.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#2.76 µs ± 32.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#81.3 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
#86.7 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit isnan(case_1)
%timeit isnan(case_2)
%timeit isnan(case_3)
%timeit isnan(case_4)
%timeit isnan(case_5)
#244 ns ± 5.02 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#357 ns ± 1.07 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#475 ns ± 9.28 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#235 ns ± 0.933 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#58.8 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
我处理地形数据。对于一个特定的问题,我在 Python 中编写了一个函数,它使用特定大小的移动 window 来压缩矩阵(高程网格)。然后我必须对此 window 进行分析,并将单元格设置在 window 中心的结果值。
我的最终输出是一个与我的原始矩阵大小相同的矩阵,该矩阵已根据我的分析进行了更改。这个问题在小范围内 运行 需要 11 个小时,所以我认为并行化内部循环会加速事情的进展。 或者,也可能有一个巧妙的矢量化解决方案...
看下面我的函数,DEM
是一个二维numpy数组,w
是window的大小。
def RMSH_det(DEM, w):
import numpy as np
from scipy import signal
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
# nw=(w*2)**2
# x = np.arange(0,nw)
for i in np.arange(w+1,nrows-w):
for j in np.arange(w+1,ncols-w):
d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))
win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]
if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan
else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return(rms)
我搜索了 SO/SE 我的问题的解决方案并遇到了许多嵌套 for 循环的例子并试图 运行 它们并行。我一直在努力调整我的代码以匹配示例,并希望得到一些帮助。这个问题的解决方案将帮助我使用我拥有的其他几个移动 window 函数。
到目前为止,我已经将内部循环移动到它自己的函数中,可以从外部循环中调用它:
def inLoop(i, w, DEM,rms,ncols):
for j in np.arange(w+1,ncols-w):
d1 = np.int64(np.arange(i-w,i+w))
d2 = np.int64(np.arange(j-w,j+w))
win = DEM[d1[0]:d1[-1],d2[0]:d2[-1]]
if np.max(np.isnan(win)) == 1:
rms[i,j] = np.nan
else:
win = signal.detrend(win, type = 'linear')
z = np.reshape(win,-1)
nz = np.size(z)
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return(rms)
但我不确定使用需要输入到内部循环中的必要变量对 Pool 调用进行编码的正确方法。请参阅下面的外循环:
for i in np.arange(w+1,nrows-w):
number_of_workers = 8
with Pool(number_of_workers) as p:
#call the pool
p.starmap(inLoop, [i, w, DEM, rms, ncols])
剩余问题:
这段代码甚至可以通过并行化来优化吗?
如何成功存储并行嵌套 for 循环的结果?
Q : This problem takes 11 hours to run on a small area,... stay tuned, we can and we'll get under 20 [min] !!
提供了适当的解释,为此我感谢 O/P 作者:
# DEM.shape = [nrows, ncols] = [ 1355, 1165 ]
# DEM.dtype = float32
# .flags = C_CONTIGUOUS : True
# F_CONTIGUOUS : False
# OWNDATA : True
# WRITEABLE : True
# ALIGNED : True
# WRITEBACKIFCOPY : False
# UPDATEIFCOPY : False
我尝试检查代码并设置一个更高效代码的模型,然后再将所有流行的和现成的使用 numpy + numba
类固醇,临时 numpy
-only 结果在 的 [100,100]
DEM-grid 样本上有效
~ 6 [s]
在所述内核 window 宽度 w = 10
相同,对于 [200,200]
DEM 网格,采用 ~ 36 [s]
- 显然,缩放比例为 ~ O( N^2 )
相同,对于 [1000,1000]
DEM 网格,采用 ~ 1077 [s] ~ 17.6 [min]
哇!
[1000,1000]
DEM-grid 上的一个字段 .jit
试验目前正在测试中,将在完成后更新 post + 一旦 numba.jit()
代码将享有运行进一步加速的结果
到目前为止,很有希望,不是吗?
如果你现在 @morrismc 测试你的原样代码,在 [100,100]
矩阵上,我们已经可以猜出主体 speedup[=96= 的实现范围],甚至在 运行ning 测试完成之前。
>>> pass; import numpy as np
>>> from zmq import Stopwatch; clk = Stopwatch()
>>>
>>> size = 100; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
6492192 [us]
NumOf_np.nan-s was 0
>>> size = 200; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
35650629 [us]
NumOf_np.nan-s was 0
>>> size = 1000; demF32 = np.random.random( ( size, size ) ).astype( np.float32 ); resF32 = demF32.copy(); clk.start(); _ = RMSH_det( demF32, 10, resF32 ); t = clk.stop(); print( "{1:>13d} [us]\nNumOf_np.nan-s was {0:d}".format( _, t ) )
1058702889 [us]
NumOf_np.nan-s was 0
所有这些都在 scipy
1.2.1 上,因此没有 1.3.1 的好处可能进一步加速
A numba.jit()
LLVM 编译代码。糟糕,慢了?
numba.jit()
-加速显示大约 200 [ms]
更糟 运行time on [100,100]
DEM 网格,已指定签名(因此此处未产生临时分析成本)和 nogil = True
('0.43.1+0.g8dabe7abe.dirty' 不是最新的,还)
猜猜这里没有更多收获,没有将游戏移动到已编译的 Cython
区域,但大约有几十分钟而不是几十小时, Alea Iacta Est - 只是 numpy
智能矢量化代码规则!
结语:
如果原始算法是正确的(并且在源代码中留下一些疑问以进行任何进一步的改进工作),任何尝试 运行 其他形式的 [PARALLEL]
code-execution-flow 在这里无济于事( kernel-windows[w,w] 是 DEM 网格内存布局的非常小且不连续的区域,memory-I/O 成本是这里 运行 时间预算的主要部分,一些更好的索引可能会提高缓存行的重用,但总体努力远远超出预算,因为 的目标从 ~ 11 [hrs]
下降到大约 ~ 6 [hrs]
比成功达到 还多 ~ 20 [min]
运行 次[1300,1100]
float32 DEM 网格
代码保持原样(非 PEP-8),因为 [DOC.me], [TEST.me]
和 [=72] 的所有附加教学价值=][PERF.me]
QA 阶段,所以所有类型的 PEP-isto-evangelisators 都接受 O/P 作者的观点,左全屏宽度布局,以便允许理解 WHY 并改进代码,如果去掉注释,代码将失去 her/his 进一步提高代码性能的方法。谢谢
@jit( [ "int32( float32[:,:], int32, float32[:,:] )", ], nogil = True ) # numba.__version__ '0.43.1+0.g8dabe7abe.dirty'
def RMSH_det_jit( DEMf32, w, rmsRESULTf32 ): # pre-allocate rmsRESULTf32[:,:] externally
#import numpy as np
#from scipy import signal
#
# [nrows, ncols] = np.shape( DEM ) # avoid ~ [ 1355, 1165 ]
# # DEM.dtype = float32
# # .flags = C_CONTIGUOUS : True
# # F_CONTIGUOUS : False
# # OWNDATA : True
# # WRITEABLE : True
# # ALIGNED : True
# # WRITEBACKIFCOPY : False
# # UPDATEIFCOPY : False
#
rmsRESULTf32[:,:] = np.nan # .STO[:,:] np.nan-s, using in-place assignment into the by-ref passed, externally pre-allocated np.ndarray
dtdWIN = np.ones( ( 2 * w - 1, # .ALLOC once, re-use 1M+ times
2 * w - 1 ) )
a_div_by_nz_minus1 = 1. / ( dtdWIN.size - 1 ) # .SET float CONST with about a ~1M+ re-use
a_num_of_NaNs = 0 # .SET i4 bonus value, ret'd as a side-effect of the signature ...
# rms = DEM*np.nan # avoid ( pre-alloc rmsRESULTf32 ) externally create and pass a right-sized, empty array to store all results
# nw = ( w * 2 )**2
# x = np.arange( 0, nw )
# 11..1344
#or i in np.arange( w+1, nrows-w ): # w ~ 10 -> [11:1344, 11:1154]
for i in np.arange( w+1, DEMf32.shape[0]-w ): # ??? never touches DEM-row/column[0]?? or off-by-one indexing error ???
fromI = i - w # .UPD ALAP
tillI = i + w - 1 # .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
# 11..1154
#or j in np.arange( w+1, ncols-w ):
for j in np.arange( w+1, DEMf32.shape[1]-w ):
fromJ = j - w # .UPD ALAP
tillJ = j + w - 1 # .UPD ALAP upper bound index excluded ( this is how a code in [ np.arange(...)[0]:np.arange(...)[-1] ] works )
# 1..1334:21..1354 # ??? never touches first/last DEM-row/column??
# d1 = np.int64( np.arange( i-w, i+w ) ) # AVOID: 1M+ times allocated, yet never consumed, but their edge values
# d2 = np.int64( np.arange( j-w, j+w ) ) # AVOID: 1M+ times allocated, yet never consumed, but their edge values
# win = DEM[ d1[0]:d1[-1], # AVOID: while a .view-only, no need to 1M+ times instantiate a "kernel"-win(dow] ( this will create a np.view into the original DEM, not a copy ! )
# d2[0]:d2[-1] # ?.or.? NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
# ] # ?.or.? NOT a .view-only, but a new .copy() instantiated, so as to call .detrend() w/o in-place modifying DEMf32 ???
dtdWIN[:,:] = DEMf32[fromI:tillI, fromJ:tillJ] # NOT a .view-only, but a .copy() re-populated into a just once and only once pre-allocated dtdWIN, via an in-place copy
#f np.max( np.isnan( win ) ) == 1: # AVOID: 1M+ times full-range scan, while any first np.nan decides the game and no need to scan "the rest"
if np.any( np.isnan( dtdWIN ) ): # "density" of np.nan-s determine, if this is a good idea to pre-store
a_num_of_NaNs += 1 # .INC
continue # .NOP/LOOP from here, already pre-stored np.nan-s for this case
# rms[i,j] = np.nan # DUP ( already stored in initialisation ... )
else:
#in = signal.detrend( win, type = 'linear' ) # REALLY?: in-place modification of DEM-matrix ???
dtdWIN = signal.detrend( dtdWIN, type = 'linear' ) # in scipy-v1.3.1+ can mod in-place, overwrite_data = True ) # REMOVE OLS-fit-linear trend
dtdWIN = signal.detrend( dtdWIN, type = 'constant' ) # in scipy-v1.3.1+ can mod in-place, overwrite_data = True ) # REMOVE mean
#z = np.reshape( win, -1 ) # AVOID:~1M+ re-counting constant value, known from w directly
#nz = np.size( z ) # AVOID:~1M+ re-counting constant value, known from w directly
#rootms = np.sqrt( 1 / ( nz - 1 ) * np.sum( ( z - np.mean( z ) )**2 ) )
#rms[i,j] = rootms
rmsRESULTf32[i,j] = np.sqrt( a_div_by_nz_minus1 # .STO a "scaled"
* np.dot( dtdWIN,
dtdWIN.T
).sum()
# np.sum( ( dtdWIN # SUM of
# # - dtdWIN.mean() # mean-removed ( ALREADY done via scipy.signal.detrend( 'const' ) above )
# )**2 # SQUARES
# )
) # ROOT
return( a_num_of_NaNs ) # ret i4
使用 Numba 的解决方案
在某些情况下,如果您使用的所有功能都受支持,这很容易做到。在您的代码中,win = signal.detrend(win, type = 'linear')
是您必须在 Numba 中实现的部分,因为不支持此功能。
在 Numba 中实施去趋势
如果您查看 detrend 的 source-code,并提取与您的问题相关的部分,它可能如下所示:
@nb.njit()
def detrend(w):
Npts=w.shape[0]
A=np.empty((Npts,2),dtype=w.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
coef, resids, rank, s = np.linalg.lstsq(A, w.T)
out=w.T- np.dot(A, coef)
return out.T
我还为 np.max(np.isnan(win)) == 1
@nb.njit()
def isnan(win):
for i in range(win.shape[0]):
for j in range(win.shape[1]):
if np.isnan(win[i,j]):
return True
return False
主要功能
因为我在这里使用了 Numba,所以并行化非常简单,只是外层循环的 prange 和
import numpy as np
import numba as nb
@nb.njit(parallel=True)
def RMSH_det_nb(DEM, w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
for i in nb.prange(w+1,nrows-w):
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend(win)
z = win.flatten()
nz = z.size
rootms = np.sqrt(1 / (nz - 1) * np.sum((z-np.mean(z))**2))
rms[i,j] = rootms
return rms
计时(小例子)
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb(DEM, w) #approx. 55 times faster
#29 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
更大数组的时间
w = 10
DEM=np.random.rand(1355, 1165).astype(np.float32)
%timeit res2=RMSH_det_nb(DEM, w)
#6.63 s ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[编辑] 使用正态方程的实现
此方法的数值精度较低。虽然这个解决方案要快得多。
@nb.njit()
def isnan(win):
for i in range(win.shape[0]):
for j in range(win.shape[1]):
if win[i,j]==np.nan:
return True
return False
@nb.njit()
def detrend(w):
Npts=w.shape[0]
A=np.empty((Npts,2),dtype=w.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
coef, resids, rank, s = np.linalg.lstsq(A, w.T)
out=w.T- np.dot(A, coef)
return out.T
@nb.njit()
def detrend_2(w,T1,A):
T2=np.dot(A.T,w.T)
coef=np.linalg.solve(T1,T2)
out=w.T- np.dot(A, coef)
return out.T
@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq(DEM,w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
Npts=w*2-1
A=np.empty((Npts,2),dtype=DEM.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
T1=np.dot(A.T,A)
nz = Npts**2
for i in nb.prange(w+1,nrows-w):
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend_2(win,T1,A)
rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
rms[i,j] = rootms
return rms
计时
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb_normal_eq(DEM,w)
#7.97 ms ± 89.4 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
使用正规方程的优化解
重复使用临时数组以避免昂贵的内存分配,并使用矩阵乘法的自定义实现。这仅适用于非常小的矩阵,在大多数其他情况下 np.dot (sgeemm) 会快很多。
@nb.njit()
def matmult_2(A,B,out):
for j in range(B.shape[1]):
acc1=nb.float32(0)
acc2=nb.float32(0)
for k in range(B.shape[0]):
acc1+=A[0,k]*B[k,j]
acc2+=A[1,k]*B[k,j]
out[0,j]=acc1
out[1,j]=acc2
return out
@nb.njit(fastmath=True)
def matmult_mod(A,B,w,out):
for j in range(B.shape[1]):
for i in range(A.shape[0]):
acc=nb.float32(0)
acc+=A[i,0]*B[0,j]+A[i,1]*B[1,j]
out[j,i]=acc-w[j,i]
return out
@nb.njit()
def detrend_2_opt(w,T1,A,Tempvar_1,Tempvar_2):
T2=matmult_2(A.T,w.T,Tempvar_1)
coef=np.linalg.solve(T1,T2)
return matmult_mod(A, coef,w,Tempvar_2)
@nb.njit(parallel=True)
def RMSH_det_nb_normal_eq_opt(DEM,w):
[nrows, ncols] = np.shape(DEM)
#create an empty array to store result
rms = DEM*np.nan
Npts=w*2-1
A=np.empty((Npts,2),dtype=DEM.dtype)
for i in range(Npts):
A[i,0]=1.*(i+1) / Npts
A[i,1]=1.
T1=np.dot(A.T,A)
nz = Npts**2
for i in nb.prange(w+1,nrows-w):
Tempvar_1=np.empty((2,Npts),dtype=DEM.dtype)
Tempvar_2=np.empty((Npts,Npts),dtype=DEM.dtype)
for j in range(w+1,ncols-w):
win = DEM[i-w:i+w-1,j-w:j+w-1]
if isnan(win):
rms[i,j] = np.nan
else:
win = detrend_2_opt(win,T1,A,Tempvar_1,Tempvar_2)
rootms = np.sqrt(1 / (nz - 1) * np.sum((win-np.mean(win))**2))
rms[i,j] = rootms
return rms
计时
w = 10
DEM=np.random.rand(100, 100).astype(np.float32)
res1=RMSH_det(DEM, w)
res2=RMSH_det_nb_normal_eq_opt(DEM, w)
print(np.allclose(res1,res2,equal_nan=True))
#True
%timeit res1=RMSH_det(DEM, w)
#1.59 s ± 72 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=RMSH_det_nb_normal_eq_opt(DEM,w)
#4.66 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
isnan 时间安排
这个函数完全是另一种实现。如果 NaN 恰好位于数组的开头,则速度要快得多,但无论如何,即使没有,也有一些加速。我用小数组(大约 window 大小)和 @user3666197 建议的大数组对它进行了基准测试。
case_1=np.full((20,20),np.nan)
case_2=np.full((20,20),0.)
case_2[10,10]=np.nan
case_3=np.full((20,20),0.)
case_4 = np.full( ( int( 1E4 ), int( 1E4 ) ),np.nan)
case_5 = np.ones( ( int( 1E4 ), int( 1E4 ) ) )
%timeit np.any(np.isnan(case_1))
%timeit np.any(np.isnan(case_2))
%timeit np.any(np.isnan(case_3))
%timeit np.any(np.isnan(case_4))
%timeit np.any(np.isnan(case_5))
#2.75 µs ± 73.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#2.75 µs ± 46.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#2.76 µs ± 32.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
#81.3 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
#86.7 ms ± 2.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit isnan(case_1)
%timeit isnan(case_2)
%timeit isnan(case_3)
%timeit isnan(case_4)
%timeit isnan(case_5)
#244 ns ± 5.02 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#357 ns ± 1.07 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#475 ns ± 9.28 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#235 ns ± 0.933 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
#58.8 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)