Cython对代码的优化
Cython optimization of the code
我正在努力用 Cython 提高我的 python 粒子跟踪代码的性能。
这是我的纯 Python 代码:
from scipy.integrate import odeint
import numpy as np
from numpy import sqrt, pi, sin, cos
from time import time as Time
import multiprocessing as mp
from functools import partial
cLight = 299792458.
Dim = 6
class Integrator:
def __init__(self, ring):
self.ring = ring
def equations(self, X, s):
dXds = np.zeros(Dim)
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]**2 - self.ring.particle.mass**2 - X[1]**2 - X[3]**2)
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
def odeSolve(self, X0, sRange):
sol = odeint(self.equations, X0, sRange)
return sol
class Ring:
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
def getEMField(self, pos, time):
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x**2 + y**2)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
break
return E, B
class Particle:
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
显然,最耗时的过程是积分 ODE,在 class Integrator 中定义为 odeSolve() 和 equations()。此外,class Ring 中的 getEMField() 方法在求解过程中被调用的次数与 equations() 方法一样多。
我尝试使用 Cython 获得显着的加速(至少 10 倍~20 倍),但通过以下 Cython 脚本我只获得了 ~1.5 倍的加速:
import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pi, sin, cos
from scipy.integrate import odeint
from time import time as Time
import multiprocessing as mp
from functools import partial
cdef double cLight = 299792458.
cdef int Dim = 6
@cython.boundscheck(False)
cdef class Integrator:
cdef Ring ring
def __init__(self, ring):
self.ring = ring
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] equations(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X,
double s):
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] dXds = np.zeros(Dim)
cdef double h, p_s, dtds, gamma
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] beta, E, B
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]*X[5] - self.ring.particle.mass*self.ring.particle.mass - X[1]*X[1] - X[3]*X[3])
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] odeSolve(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X0,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] sRange):
sol = odeint(self.equations, X0, sRange)
return sol
@cython.boundscheck(False)
cdef class Ring:
cdef Particle particle
cdef double ringRadius
cdef double magicB0
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
cpdef tuple getEMField(self,
list pos,
double time):
cdef double x, y, s
cdef double theta, r, rn, arg, k2, k10
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] E, B
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x*x + y*y)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
#E = np.array( [ k2*x/0.045, -k2*y/0.045, 0] )
break
return E, B
cdef class Particle:
cdef double mass
cdef double charge
cdef double gm2
cdef double magicMomentum
cdef double magicEnergy
cdef double magicGamma
cdef double magicBeta
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
#nParticles = 5
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
我应该怎么做才能让 Cython 发挥最大的作用?
(我尝试使用 Numba 而不是 Cython,实际上 Numba 带来的性能提升是巨大的(大约 20 倍加速)。但是我很难将 Numba 用于 python class 个实例,因此我决定使用 Cython 而不是 Numba)。
供参考,以下是其编译时的cython注解:
这是一个非常不完整的答案,因为我没有分析或计时任何东西,甚至没有检查它是否给出了相同的答案。然而,这里有一些减少 Cython 生成的 Python 代码量的建议:
添加@cython.cdivision(True)
编译指令。这意味着 ZeroDivisionError
不会在浮点数除法上产生,你会得到一个 NaN
值。 (仅当您不想引发错误时才这样做)。
将 p_s = np.sqrt(...)
更改为 p_s = sqrt(...)
。这将删除仅对单个值进行操作的 numpy 调用。您似乎在其他地方做过,所以我不知道您为什么错过了这一行。
尽可能使用固定大小的 C 数组而不是 numpy 数组:
cdef double beta[3]
# ...
beta[0] = X[1]/X[5]
beta[1] = X[3]/X[5]
beta[2] = p_s/X[5]
如果大小在编译时已知(并且相当小)并且您不想 return 它,则可以执行此操作。这避免了调用 np.zeros
和一些后续的类型检查以将其分配给类型化的 numpy 数组。我认为 beta
是唯一可以做到这一点的地方。
np.angle( complex(x/r, y/r) )
可以用 atan2(y/r, x/r)
替换(使用 libc.math
中的 atan2
。你也可以用 r
去掉除法
cdef int i
有助于使您的 for
循环在 getEMField
中更快(Cython 通常擅长自动获取循环变量的类型,但似乎失败了这里)
我怀疑逐个元素分配 E
比整个数组更快:
E[0] = k2*x/0.045 + k10*rn**9*cos(9*arg)
E[1] = -k2*y/0.045 -k10*rn**9*sin(9*arg)
指定类型如 list
和 tuple
没有太大价值,它实际上可能会使代码稍微慢一些(因为它会浪费时间检查类型) .
一个更大的变化是将 E
和 B
作为指针传递给 GetEMField
而不是使用分配它们 np.zeros
。这将使您可以将它们分配为 equations
(cdef double E[3]
) 中的静态 C 数组。缺点是 GetEMField
必须是 cdef
,所以不再可以从 Python 调用(但如果你愿意,你也可以创建一个 Python 可调用包装函数)。
我正在努力用 Cython 提高我的 python 粒子跟踪代码的性能。
这是我的纯 Python 代码:
from scipy.integrate import odeint
import numpy as np
from numpy import sqrt, pi, sin, cos
from time import time as Time
import multiprocessing as mp
from functools import partial
cLight = 299792458.
Dim = 6
class Integrator:
def __init__(self, ring):
self.ring = ring
def equations(self, X, s):
dXds = np.zeros(Dim)
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]**2 - self.ring.particle.mass**2 - X[1]**2 - X[3]**2)
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
def odeSolve(self, X0, sRange):
sol = odeint(self.equations, X0, sRange)
return sol
class Ring:
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
def getEMField(self, pos, time):
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x**2 + y**2)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
break
return E, B
class Particle:
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
显然,最耗时的过程是积分 ODE,在 class Integrator 中定义为 odeSolve() 和 equations()。此外,class Ring 中的 getEMField() 方法在求解过程中被调用的次数与 equations() 方法一样多。 我尝试使用 Cython 获得显着的加速(至少 10 倍~20 倍),但通过以下 Cython 脚本我只获得了 ~1.5 倍的加速:
import cython
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, pi, sin, cos
from scipy.integrate import odeint
from time import time as Time
import multiprocessing as mp
from functools import partial
cdef double cLight = 299792458.
cdef int Dim = 6
@cython.boundscheck(False)
cdef class Integrator:
cdef Ring ring
def __init__(self, ring):
self.ring = ring
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] equations(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X,
double s):
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] dXds = np.zeros(Dim)
cdef double h, p_s, dtds, gamma
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] beta, E, B
E, B = self.ring.getEMField( [X[0], X[2], s], X[4] )
h = 1 + X[0]/self.ring.ringRadius
p_s = np.sqrt(X[5]*X[5] - self.ring.particle.mass*self.ring.particle.mass - X[1]*X[1] - X[3]*X[3])
dtds = h*X[5]/p_s
gamma = X[5]/self.ring.particle.mass
beta = np.array( [X[1], X[3], p_s] ) / X[5]
dXds[0] = dtds*beta[0]
dXds[2] = dtds*beta[1]
dXds[1] = p_s/self.ring.ringRadius + self.ring.particle.charge*(dtds*E[0] + dXds[2]*B[2] - h*B[1])
dXds[3] = self.ring.particle.charge*(dtds*E[1] + h*B[0] - dXds[0]*B[2])
dXds[4] = dtds
dXds[5] = self.ring.particle.charge*(dXds[0]*E[0] + dXds[2]*E[1] + h*E[2])
return dXds
cpdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] odeSolve(self,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] X0,
np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] sRange):
sol = odeint(self.equations, X0, sRange)
return sol
@cython.boundscheck(False)
cdef class Ring:
cdef Particle particle
cdef double ringRadius
cdef double magicB0
def __init__(self, particle):
self.particle = particle
self.ringRadius = 7.112
self.magicB0 = self.particle.magicMomentum/self.ringRadius
cpdef tuple getEMField(self,
list pos,
double time):
cdef double x, y, s
cdef double theta, r, rn, arg, k2, k10
cdef np.ndarray[np.double_t, ndim=1, negative_indices=False, mode="c"] E, B
x, y, s = pos
theta = (s/self.ringRadius*180/pi) % 360
r = sqrt(x*x + y*y)
arg = 0 if r == 0 else np.angle( complex(x/r, y/r) )
rn = r/0.045
k2 = 37*24e3
k10 = -4*24e3
E = np.zeros(3)
B = np.array( [ 0, self.magicB0, 0 ] )
for i in range(4):
if ((21.9+90*i < theta < 34.9+90*i or 38.9+90*i < theta < 64.9+90*i) and (-0.05 < x < 0.05 and -0.05 < y < 0.05)):
E = np.array( [ k2*x/0.045 + k10*rn**9*cos(9*arg), -k2*y/0.045 -k10*rn**9*sin(9*arg), 0] )
#E = np.array( [ k2*x/0.045, -k2*y/0.045, 0] )
break
return E, B
cdef class Particle:
cdef double mass
cdef double charge
cdef double gm2
cdef double magicMomentum
cdef double magicEnergy
cdef double magicGamma
cdef double magicBeta
def __init__(self):
self.mass = 105.65837e6
self.charge = 1.
self.gm2 = 0.001165921
self.magicMomentum = self.mass/sqrt(self.gm2)
self.magicEnergy = sqrt(self.magicMomentum**2 + self.mass**2)
self.magicGamma = self.magicEnergy/self.mass
self.magicBeta = self.magicMomentum/(self.magicGamma*self.mass)
def runSimulation(nParticles, tEnd):
particle = Particle()
ring = Ring(particle)
integrator = Integrator(ring)
#nParticles = 5
Xs = np.array( [ np.array( [45e-3*(np.random.rand()-0.5)*2, 0, 0, 0, 0, particle.magicEnergy] ) for i in range(nParticles) ] )
sRange = np.arange(0, tEnd, 1e-9)*particle.magicBeta*cLight
ode = partial(integrator.odeSolve, sRange=sRange)
t1 = Time()
pool = mp.Pool()
sol = np.array(pool.map(ode, Xs))
t2 = Time()
print ("%.3f sec" %(t2-t1))
return t2-t1
我应该怎么做才能让 Cython 发挥最大的作用? (我尝试使用 Numba 而不是 Cython,实际上 Numba 带来的性能提升是巨大的(大约 20 倍加速)。但是我很难将 Numba 用于 python class 个实例,因此我决定使用 Cython 而不是 Numba)。
供参考,以下是其编译时的cython注解:
这是一个非常不完整的答案,因为我没有分析或计时任何东西,甚至没有检查它是否给出了相同的答案。然而,这里有一些减少 Cython 生成的 Python 代码量的建议:
添加
@cython.cdivision(True)
编译指令。这意味着ZeroDivisionError
不会在浮点数除法上产生,你会得到一个NaN
值。 (仅当您不想引发错误时才这样做)。将
p_s = np.sqrt(...)
更改为p_s = sqrt(...)
。这将删除仅对单个值进行操作的 numpy 调用。您似乎在其他地方做过,所以我不知道您为什么错过了这一行。尽可能使用固定大小的 C 数组而不是 numpy 数组:
cdef double beta[3] # ... beta[0] = X[1]/X[5] beta[1] = X[3]/X[5] beta[2] = p_s/X[5]
如果大小在编译时已知(并且相当小)并且您不想 return 它,则可以执行此操作。这避免了调用
np.zeros
和一些后续的类型检查以将其分配给类型化的 numpy 数组。我认为beta
是唯一可以做到这一点的地方。np.angle( complex(x/r, y/r) )
可以用atan2(y/r, x/r)
替换(使用libc.math
中的atan2
。你也可以用r
去掉除法cdef int i
有助于使您的for
循环在getEMField
中更快(Cython 通常擅长自动获取循环变量的类型,但似乎失败了这里)我怀疑逐个元素分配
E
比整个数组更快:E[0] = k2*x/0.045 + k10*rn**9*cos(9*arg) E[1] = -k2*y/0.045 -k10*rn**9*sin(9*arg)
指定类型如
list
和tuple
没有太大价值,它实际上可能会使代码稍微慢一些(因为它会浪费时间检查类型) .一个更大的变化是将
E
和B
作为指针传递给GetEMField
而不是使用分配它们np.zeros
。这将使您可以将它们分配为equations
(cdef double E[3]
) 中的静态 C 数组。缺点是GetEMField
必须是cdef
,所以不再可以从 Python 调用(但如果你愿意,你也可以创建一个 Python 可调用包装函数)。