有没有办法用 numpy 为数组中 n 行的所有组合计算一些东西(简单的情况是所有对,即 n = 2)
Is there a way, with numpy, to compute something for all combinations of n lines in an array (the simple case being all pairs i.e. n=2)
我目前正在 python 使用 Runge-Kutta 方法进行微分方程系统数值积分,范围是(如标题所述)行星轨道模拟。
我正在研究(比较)加速计算的不同方法,目前我已经尝试使用非常高效的 C 模块,我想尝试使用 numpy
在这个计算中,我需要计算每对行星的相互吸引力。目前,我正在这样做:
import numpy as np
def grav(px, py, M, ax, ay):
G = 6.67408*10**-2 # m³/s²T
for b in range(1, len(px)):
# computing the distance between body #b and all previous
dx = px[b] - px[:b]
dy = py[b] - py[:b]
d2 = dx*dx+dy*dy
# computing acceleration undergone by b from all previous
ax[b] = -sum(M[:b]*G * dx * d2**(-1.5))
ay[b] = -sum(M[:b]*G * dy * d2**(-1.5))
# adding for each previous, acceleration undergone by from b
ax[:b] += M[b]*G * dx * d2**(-1.5)
ay[:b] += M[b]*G * dy * d2**(-1.5)
# input data
system_px = np.array([1., 2., 3., 4., 5., 6., 7., 9., 4., 0.])
system_py = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
system_M = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
# outout array
system_ax = np.zeros(len(system_px))
system_ay = np.zeros(len(system_px))
grav(system_px, system_py, system_M, system_ax, system_ay)
for i in range(len(system_px)):
print('body {} mass = {}(ton), position = {}(m), '
'acceleration = ({:8.4f}, {:8.4f})(m/s²)'.format(i, system_M[i],
(system_px[i], system_py[i]), system_ax[i], system_ay[i]))
我想知道是否会有一些非常通用的更 «numpythonic» 的方法来做到这一点,它可以应用于 n 行的每个子集。
借助@norok2 获得的信息,我能够在没有循环的情况下获得更快的解决方案,并部分(即仅针对 n=2)回答问题,但不能同时回答这两个问题.回复问题的解大约慢了10倍:
import numpy as np
def grav_fast(p, M):
G = 6.67408*10**-2 # m³/s²T
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1)
#or return (M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
# (both are equivalent because d is symetric)
def grav_reply(p, M):
G = 6.67408*10**-2 # m³/s²T
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
# input data
system_p = np.array([[ 1., 2., 3., 4., 5., 6., 7., 9., 4., 0.],
[ 3., 5., 1., 2., 4., 5., 6., 3., 5., 8.]])
system_M = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
# output array
l = len(system_p[0])
system_a = np.zeros(shape=(2, l))
for test in 'grav_fast', 'grav_reply':
print('\ntesting '+test)
system_a = eval(test+'(system_p, system_M)')
for i in range(l):
print('body {} mass = {}(ton), position = {}(m), '
'acceleration = [{:8.4f} {:8.4f}](m/s²)'.format(i,
system_M[i], system_p[:, i], system_a[0, i], system_a[1, i]))
grav_fast
并没有真正回答这个问题,因为它进行了两倍的计算,并且还为被自身吸引的物体(这导致除以零)进行了计算,但是对于一个小系统,它是仍然比 python 的循环快得多(收支平衡是大约 600 个物体)。另一方面,如果 np.tril
旨在避免进行不需要的计算,则 grav_reply
可能是有效的,但情况似乎并非如此:ipython
的特定测试表明改变 np.tril
(或 np.triu
)中的极限对角线并没有显着改变执行时间。
In [1]: import numpy as np
In [2]: import random
In [3]: a = np.array([[random.randint(10, 99)
....: for _ in range(5)]
....: for _ in range(5)])
In [4]: %timeit np.dot(a, a)
1000000 loops, best of 3: 1.35 µs per loop
In [5]: %timeit np.tril(np.dot(a, a), 0)
100000 loops, best of 3: 17.3 µs per loop
In [6]: %timeit np.tril(np.dot(a, a), -2)
100000 loops, best of 3: 16.5 µs per loop
In [7]: a = np.array([[random.randint(10, 99)
....: for _ in range(100)]
....: for _ in range(100)])
In [8]: %timeit np.tril(a*a, 0)
10000 loops, best of 3: 56.3 µs per loop
In [9]: %timeit np.tril(a*a, -20)
10000 loops, best of 3: 61 µs per loop
In [10]: %timeit np.tril(a*a, 20)
10000 loops, best of 3: 54.7 µs per loop
In [11]: %timeit np.tril(a*a, 60)
10000 loops, best of 3: 54.5 µs per loop
编辑:这是每个算法的 performance/size 图
编辑:这是我写的最后一个基准测试代码:
import numpy as np
import time
import random
from matplotlib import pyplot as plt
from grav_c import grav_c, grav2_c
from numba import jit, njit
import datetime
G = 6.67408*10**-8 # m³/s²T
def grav2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * G * d2**(-1.5) * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * G * d2**(-1.5) * d
return a
grav2_jit = jit(grav2)
def grav(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * G / np.sqrt(d2) / d2 * d).sum(axis=1)
## a[:, b] = -(M[:b] * G * d2**(-1.5) * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * G / np.sqrt(d2) / d2 * d
## a[:, :b] += M[b] * G * d2**(-1.5) * d
return a
grav_jit = jit(grav)
def grav2_optim1(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
VVV = G * d2**(-1.5)
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * VVV * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * VVV * d
return a
grav2_optim1_jit = jit(grav2_optim1)
def grav_optim1(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
VVV = G / np.sqrt(d2) / d2
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * VVV * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * VVV * d
return a
grav_optim1_jit = jit(grav_optim1)
def grav2_optim2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
XXX = G * d * d2**(-1.5)
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[None, :b] * XXX).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * XXX
return a
grav2_optim2_jit = jit(grav2_optim2)
def grav_optim2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
XXX = G * d / np.sqrt(d2) / d2
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[None, :b] * XXX).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * XXX
return a
grav_optim2_jit = jit(grav_optim2)
def grav2_vect(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1)
grav2_vect_jit = jit(grav2_vect)
def grav_vect(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=1)
grav_vect_jit = jit(grav_vect)
# the grav*_vect_bis functions are equivalent to the grav*_vect functions because d is symetric
def grav2_vect_bis(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (-M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
grav2_vect_bis_jit = jit(grav2_vect_bis)
def grav_vect_bis(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (-M[None, None, :]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=2)
grav_vect_bis_jit = jit(grav_vect_bis)
def grav2_tril(p, M):
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
grav2_tril_jit = jit(grav2_tril)
def grav_tril(p, M):
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=2)
grav_tril_jit = jit(grav_tril)
testslist = [
('grav_vect', 'c'), ('grav2_vect', 'c--'), ('grav_vect_jit', 'c:'), ('grav2_vect_jit', 'c-.'),
('grav_vect_bis', 'm'), ('grav2_vect_bis', 'm--'), ('grav_vect_bis_jit', 'm:'), ('grav2_vect_bis_jit', 'm-.'),
('grav_tril', 'y'), ('grav2_tril', 'y--'), ('grav_tril_jit', 'y:'), ('grav2_tril_jit', 'y-.'),
('grav', 'r'), ('grav2', 'r--'), ('grav_jit', 'r:'), ('grav2_jit', 'r-.'),
('grav_optim1', 'g'), ('grav2_optim1', 'g--'), ('grav_optim1_jit', 'g:'), ('grav2_optim1_jit', 'g-.'),
('grav_optim2', 'b'), ('grav2_optim2', 'b--'), ('grav_optim2_jit', 'b:'), ('grav2_optim2_jit', 'b-.'),
('grav_c', 'k'),('grav2_c', 'k--')]
class ScaleType() : pass
class LinScale(ScaleType) : pass
class LogScale(ScaleType) : pass
attempts = 8
scaletype = LogScale
scalelen = 200
scalestart = 2
scalestop = 400
# input data (Multiple datasets to repeat the tests on different data)
randlist = lambda x : [float(random.randint(10000, 99999)) for _ in range(x)]
try:
# data_file = "Here you can give an npz file name to load some presaved data.npz"
with np.load(data_file) as data:
testslist = data['testslist']
N = data['N']
timings = data['timings']
perform = data['perform']
miny = data['miny']
except NameError:
L = scalestop-scalestart
if scalelen > L:
N = np.arange(scalestart, scalestop+1, 1)
elif scaletype == LinScale:
Q = L//(scalelen-1)
R = L%(scalelen-1)
N = np.array([i for r in (range(scalestart, scalestart+Q*(scalelen-1-R), Q),
range(scalestart+Q*(scalelen-1-R), scalestop+1, Q+1)) for i in r])
elif scaletype == LogScale:
X = scalestart
G = scalestop/scalestart
I = scalelen-1
while True:
NX = I*np.log(I/np.log(G)/scalestart)/np.log(G)
if NX-X < 0.0001: break
X = NX
L0 = int(scalestart*np.power(G, X/I))
G = scalestop/(scalestart+L0)
I = scalelen-1-L0
a1 = np.array(range(I))
N = np.concatenate((range(scalestart, scalestart+L0, 1),
scalestart+L0-1+np.cumsum((0.+(scalestart+L0)*(np.exp(np.log(G)*(a1+1)/I) - np.exp(np.log(G)*a1/I))).astype(int)),
[scalestop]))
print(N)
l = len(N)
timings = np.full(l, 9999999., dtype=[(test[0], np.float64) for test in testslist])
perform = np.full(l, 9999999., dtype=[(test[0], np.float64) for test in testslist])
miny = 9999999.
accum = 0 # This is to prevent system to perform unwanted optimisations
for j in range(attempts):
for i in range(l):
L = N[i]
system_p = [np.array([randlist(L), randlist(L)]) for _ in range(100)]
system_M = [np.array( randlist(L)) for _ in range(100)]
for test in testslist:
timeref = -time.time()
system_a = eval(test[0]+'(system_p[0], system_M[0])')
accum += system_a[0, 0]
count = 1
while time.time()+timeref<0.001:
for count in range(count+1, 10*count+1):
system_a = eval(test[0]+'(system_p[count%100], system_M[count%100])')
timeref += time.time()
## print(count)
timings[test[0]][i] = min(timings[test[0]][i], timeref/count)
val = timings[test[0]][i]/(N[i]*(N[i]-1)/2)
perform[test[0]][i] = val
miny = min(val, miny)
if i%10==9: print(j, end='', flush=True)
print(flush=True)
filename = "example grav, Whosebug "+str(datetime.datetime.now())+".npz"
print("saving data to", filename)
np.savez(filename, testslist=testslist, N=N, timings=timings, perform=perform, miny=miny)
ymin = 10**(np.floor(np.log10(miny)))
if (5*ymin<=miny): ymin *= 5
elif (2*ymin<=miny): ymin *= 2
print('ymin = {}, miny = {}\n'.format(ymin, miny))
figa, ax = plt.subplots(figsize=(24, 12))
for test in testslist:
ax.plot(N, timings[test[0]], test[1], label=test[0])
ax.set_title('numpy compared timings')
plt.xlabel('N (system size)')
plt.ylabel('timings (msec)')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(0., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figb, bx = plt.subplots(figsize=(24, 12))
for test in testslist:
bx.plot(N, timings[test[0]], test[1], label=test[0])
bx.set_title('numpy compared timings')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('N (system size)')
plt.ylabel('timings (msec)')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(0., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figc, cx = plt.subplots(figsize=(24, 12))
for test in testslist:
cx.plot(N, perform[test[0]], test[1], label=test[0])
plt.ylim(0, 20*ymin)
cx.set_title('numpy compared performance')
plt.xlabel('N (system size)')
plt.ylabel('performance (msec)/N²')
plt.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figd, dx = plt.subplots(figsize=(24, 12))
for test in testslist:
dx.plot(N, perform[test[0]], test[1], label=test[0])
dx.set_title('numpy compared performance')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('N (system size)')
plt.ylabel('performance (msec)/N²')
plt.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
plt.show()
有了它的 C 模块
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <numpy/arrayobject.h>
#define G 6.67408E-8L
void * failure(PyObject *type, const char *message) {
PyErr_SetString(type, message);
return NULL;
}
void * success(PyObject *var){
Py_INCREF(var);
return var;
}
static PyObject *
Py_grav_c(PyObject *self, PyObject *args)
{
PyArrayObject *p, *M;
PyObject *a;
int i, j, k;
double *pq0, *pq1, *Mq0, *Mq1, *aq0, *aq1, *p0, *p1, *a0, *a1;
if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &p, &PyArray_Type, &M))
return failure(PyExc_RuntimeError, "Failed to parse parameters.");
if (PyArray_DESCR(p)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for p array.");
if (PyArray_DESCR(M)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for M array.");
if (PyArray_NDIM(p)!=2)
return failure(PyExc_TypeError, "p must be a 2 dimensionnal array.");
if (PyArray_NDIM(M)!=1)
return failure(PyExc_TypeError, "M must be a 1 dimensionnal array.");
int K = PyArray_DIM(p, 0); // Number of dimensions you want
int L = PyArray_DIM(p, 1); // Number of bodies in the system
int S0 = PyArray_STRIDE(p, 0); // Normally, the arrays should be contiguous
int S1 = PyArray_STRIDE(p, 1); // But since they provide this Stride info
int SM = PyArray_STRIDE(M, 0); // I supposed they might not be (alignment)
if (PyArray_DIM(M, 0) != L)
return failure(PyExc_TypeError,
"P and M must have the same number of bodies.");
a = PyArray_NewLikeArray(p, NPY_ANYORDER, NULL, 0);
if (a == NULL)
return failure(PyExc_RuntimeError, "Failed to create output array.");
PyArray_FILLWBYTE(a, 0);
// For all bodies except first which has no previous body
for (i = 1,
pq0 = (double *)(PyArray_DATA(p)+S1),
Mq0 = (double *)(PyArray_DATA(M)+SM),
aq0 = (double *)(PyArray_DATA(a)+S1);
i < L;
i++,
*(void **)&pq0 += S1,
*(void **)&Mq0 += SM,
*(void **)&aq0 += S1
) {
// For all previous bodies
for (j = 0,
pq1 = (double *)PyArray_DATA(p),
Mq1 = (double *)PyArray_DATA(M),
aq1 = (double *)PyArray_DATA(a);
j < i;
j++,
*(void **)&pq1 += S1,
*(void **)&Mq1 += SM,
*(void **)&aq1 += S1
) {
// For all dimensions calculate deltas
long double d[K], d2 = 0, VVV, M0xVVV, M1xVVV;
for (k = 0,
p0 = pq0,
p1 = pq1;
k<K;
k++,
*(void **)&p0 += S0,
*(void **)&p1 += S0) {
d[k] = *p1 - *p0;
}
// calculate Hypotenuse squared
for (k = 0, d2 = 0; k<K; k++) {
d2 += d[k]*d[k];
}
// calculate interm. results once for each bodies pair (optimization)
VVV = G;
#define LIM 1
// if (d2<LIM) d2=LIM; // Variation on collision case
if (d2>0) VVV /= d2*sqrt(d2);
M0xVVV = *Mq0 * VVV; // anonymous intermediate result
M1xVVV = *Mq1 * VVV; // anonymous intermediate result
// For all dimensions calculate component of acceleration
for (k = 0,
a0 = aq0,
a1 = aq1;
k<K;
k++,
*(void **)&a0 += S0,
*(void **)&a1 += S0) {
*a0 += M1xVVV*d[k];
*a1 -= M0xVVV*d[k];
}
}
}
/* clean up and return the result */
return success(a);
}
static PyObject *
Py_grav2_c(PyObject *self, PyObject *args)
{
PyArrayObject *p, *M;
PyObject *a;
int i, j, k;
double *pq0, *pq1, *Mq0, *Mq1, *aq0, *aq1, *p0, *p1, *a0, *a1;
if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &p, &PyArray_Type, &M))
return failure(PyExc_RuntimeError, "Failed to parse parameters.");
if (PyArray_DESCR(p)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for p array.");
if (PyArray_DESCR(M)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for M array.");
if (PyArray_NDIM(p)!=2)
return failure(PyExc_TypeError, "p must be a 2 dimensionnal array.");
if (PyArray_NDIM(M)!=1)
return failure(PyExc_TypeError, "M must be a 1 dimensionnal array.");
int K = PyArray_DIM(p, 0); // Number of dimensions you want
int L = PyArray_DIM(p, 1); // Number of bodies in the system
int S0 = PyArray_STRIDE(p, 0); // Normally, the arrays should be contiguous
int S1 = PyArray_STRIDE(p, 1); // But since they provide this Stride info
int SM = PyArray_STRIDE(M, 0); // I supposed they might not be (alignment)
if (PyArray_DIM(M, 0) != L)
return failure(PyExc_TypeError,
"P and M must have the same number of bodies.");
a = PyArray_NewLikeArray(p, NPY_ANYORDER, NULL, 0);
if (a == NULL)
return failure(PyExc_RuntimeError, "Failed to create output array.");
PyArray_FILLWBYTE(a, 0);
// For all bodies except first which has no previous body
for (i = 1,
pq0 = (double *)(PyArray_DATA(p)+S1),
Mq0 = (double *)(PyArray_DATA(M)+SM),
aq0 = (double *)(PyArray_DATA(a)+S1);
i < L;
i++,
*(void **)&pq0 += S1,
*(void **)&Mq0 += SM,
*(void **)&aq0 += S1
) {
// For all previous bodies
for (j = 0,
pq1 = (double *)PyArray_DATA(p),
Mq1 = (double *)PyArray_DATA(M),
aq1 = (double *)PyArray_DATA(a);
j < i;
j++,
*(void **)&pq1 += S1,
*(void **)&Mq1 += SM,
*(void **)&aq1 += S1
) {
// For all dimensions calculate deltas
long double d[K], d2 = 0, VVV, M0xVVV, M1xVVV;
for (k = 0,
p0 = pq0,
p1 = pq1;
k<K;
k++,
*(void **)&p0 += S0,
*(void **)&p1 += S0) {
d[k] = *p1 - *p0;
}
// calculate Hypotenuse squared
for (k = 0, d2 = 0; k<K; k++) {
d2 += d[k]*d[k];
}
// calculate interm. results once for each bodies pair (optimization)
VVV = G;
#define LIM 1
// if (d2<LIM) d2=LIM; // Variation on collision case
if (d2>0) VVV *= pow(d2, -1.5);
M0xVVV = *Mq0 * VVV; // anonymous intermediate result
M1xVVV = *Mq1 * VVV; // anonymous intermediate result
// For all dimensions calculate component of acceleration
for (k = 0,
a0 = aq0,
a1 = aq1;
k<K;
k++,
*(void **)&a0 += S0,
*(void **)&a1 += S0) {
*a0 += M1xVVV*d[k];
*a1 -= M0xVVV*d[k];
}
}
}
/* clean up and return the result */
return success(a);
}
// exported functions list
static PyMethodDef grav_c_Methods[] = {
{"grav_c", Py_grav_c, METH_VARARGS, "grav_c(p, M)\n"
"\n"
"grav_c takes the positions and masses of m bodies in Newtonian attraction in a n dimensionnal universe,\n"
"and returns the accelerations each body undergoes.\n"
"input data take the for of a row of fload64 for each dimension of the position (in p) and one row for the masses.\n"
"It returns and array of the same shape as p for the accelerations."},
{"grav2_c", Py_grav2_c, METH_VARARGS, "grav_c(p, M)\n"
"\n"
"grav_c takes the positions and masses of m bodies in Newtonian attraction in a n dimensionnal universe,\n"
"and returns the accelerations each body undergoes.\n"
"input data take the for of a row of fload64 for each dimension of the position (in p) and one row for the masses.\n"
"It returns and array of the same shape as p for the accelerations."},
{NULL, NULL, 0, NULL} // pour terminer la liste.
};
static char grav_c_doc[] = "Compute attractions between n bodies.";
static struct PyModuleDef grav_c_module = {
PyModuleDef_HEAD_INIT,
"grav_c", /* name of module */
grav_c_doc, /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module,
or -1 if the module keeps state in global variables. */
grav_c_Methods
};
PyMODINIT_FUNC
PyInit_grav_c(void)
{
// I don't understand why yet, but the program segfaults without this.
import_array();
return PyModule_Create(&grav_c_module);
}
我猜你无法获得比 grav_vectorized()
更好的 NumPythonic 表达式,正如已经注意到的那样,它增加了计算复杂度和内存,略小于 2 倍。
但是,如果您追求速度并且仍想留在 Python 一圈,
对于这个特定的应用程序,问题似乎在于细节和输入大小。
具体来说,至少在我的机器上,时间似乎在每次迭代中都由分数幂支配,并将其结果保存在临时变量中以加快速度。
归根结底,如果您使用带有冗余优化 (grav_optim_jit()
) 的原始代码的 Numba JIT 版本,您几乎总是最优的。
对于非常小的输入,Cython 版本 (grav_loop_cython()
) 是最快的,但是一旦大小稍有增长 (N > ~100
),更好优化的 Numba 代码 (grav_orig_jit()
) 就会接管讲台。对于更大的输入,NumPy-optimized 但仍然是 n (n + 1) / 2
(在 space 维度中向量化)(grav_iter()
)成为 runner-up,尽管在small-N
政权。
请注意,完全矢量化版本 (grav_vectorized()
) 对于小的 N
表现相当好,但一旦输入大小增加就会出现不足。
另请注意,grav_iter_jit()
基本上不受 Numba JIT 的影响(可能稍微减慢),因为核心优化是在 NumPy 方面。
通过使用 -O3 -march=native
选项编译,Cython 版本可能会变得更快,但我还没有尝试过。
以下是上述内容的详细内容。
这是我测试的代码:
import numpy as np
import numba as nb
G = 6.67408e-2
MAX_SIZE = 1000
MAX_MASS = 1000
原始代码的稍微完善的版本
def grav_orig(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
a_arr[0, i] = -np.sum(m_arr[:i] * dx_arr * G * d2_arr**(-1.5))
a_arr[1, i] = -np.sum(m_arr[:i] * dy_arr * G * d2_arr**(-1.5))
a_arr[0, :i] += m_arr[i] * dx_arr * G * d2_arr**(-1.5)
a_arr[1, :i] += m_arr[i] * dy_arr * G * d2_arr**(-1.5)
return a_arr
优化版
def grav_optim(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
temp = G * d2_arr**(-1.5)
temp_x = dx_arr * temp
temp_y = dy_arr * temp
a_arr[0, i] = -np.sum(m_arr[:i] * temp_x)
a_arr[1, i] = -np.sum(m_arr[:i] * temp_y)
a_arr[0, :i] += m_arr[i] * temp_x
a_arr[1, :i] += m_arr[i] * temp_y
return a_arr
对应的 Numba JITted 版本:
grav_optim_jit = nb.jit(grav_optim, nopython=True, nogil=True)
与原始方法类似但沿空间维度矢量化的方法:
def grav_iter(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
d_arr = x_arr[:, i:i + 1] - x_arr[:, :i]
d2_arr = np.sum(d_arr ** 2, axis=0)
temp = G * d_arr * d2_arr[None, :]**(-1.5)
a_arr[:, i] = -np.sum(m_arr[None, :i] * temp, axis=-1)
a_arr[:, :i] += m_arr[None, i] * temp
return a_arr
对应的 Numba JITted 版本:
grav_iter_jit = nb.jit(grav_iter)
完全矢量化版本:
def grav_vectorized(x_arr, m_arr, G=G):
d_arr = x_arr[:, :, None] - x_arr[:, None, :]
d2_arr = np.sum(d_arr ** 2, axis=0)
d2_arr[d2_arr == 0] = 1
return np.sum((m_arr[None, :, None] * G * d_arr * d2_arr[None, ...]**(-1.5)), axis=1)
Cythonized 版本
%%cython -a
#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
import numpy as np
cimport cython
cimport numpy as np
DTYPE = np.double
ctypedef np.double_t DTYPE_t
cdef DTYPE_t G = 6.67408e-2
def grav_optim_cython(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
temp = G * d2_arr**(-1.5)
temp_x = dx_arr * temp
temp_y = dy_arr * temp
a_arr[0, i] = -np.sum(m_arr[:i] * temp_x)
a_arr[1, i] = -np.sum(m_arr[:i] * temp_y)
a_arr[0, :i] += m_arr[i] * temp_x
a_arr[1, :i] += m_arr[i] * temp_y
return a_arr
Cythonized loop-explicit 版本
%load_ext Cython
%%cython -a
#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
import numpy as np
cimport cython
cimport numpy as np
DTYPE = np.double
ctypedef np.double_t DTYPE_t
cdef DTYPE_t G = 6.67408e-2
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def grav_loop_cython(
np.ndarray[DTYPE_t, ndim=2] x_arr,
np.ndarray[DTYPE_t, ndim=1] m_arr,
DTYPE_t G=G):
cdef int ndim = x_arr.shape[0]
cdef int n_points = x_arr.shape[1]
cdef np.ndarray[DTYPE_t, ndim=2] a_arr = np.zeros((ndim, n_points), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=2] dx_arr = np.zeros((ndim, n_points - 1), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=1] d2_arr = np.zeros((n_points - 1), dtype=DTYPE)
cdef DTYPE_t temp
for j in range(1, n_points):
# compute the pair-wise differences
for jj in range(j):
for i in range(ndim):
dx_arr[i, jj] = x_arr[i, j] - x_arr[i, jj]
# compute the pair-wise squared Euclidean distances
for jj in range(j):
d2_arr[jj] = 0
for i in range(ndim):
d2_arr[jj] += dx_arr[i, jj] ** 2.0
for i in range(ndim):
for jj in range(j):
temp = G * dx_arr[i, jj] * d2_arr[jj] ** -1.5
a_arr[i, j] -= (m_arr[jj] * temp)
a_arr[i, jj] += (m_arr[j] * temp)
return a_arr
时间
我用以下代码为所有这些计时:
funcs = (
grav_orig,
grav_optim,
grav_optim_jit,
grav_optim_cython,
grav_iter,
grav_iter_jit,
grav_loop_cython,
grav_vectorized,
)
Ns = np.geomspace(1e1, 6e3, 16).astype(int)
timings = np.zeros((len(funcs), len(Ns)))
np.random.seed(0)
for i, N in enumerate(Ns):
print('N: ', N)
x_arr = np.random.random((2, N)) * MAX_SIZE
m_arr = np.random.random((N,)) * MAX_MASS
for j, func in enumerate(funcs):
test_result = np.all(np.isclose(grav_orig(x_arr, m_arr), func(x_arr, m_arr)))
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
print(f'{func_name:20s} {test_result} ', end='')
t = %timeit -o func(x_arr, m_arr)
timings[j, i] = t.best
print()
数量:
# N: 10
# grav_orig True 501 µs ± 37.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 358 µs ± 20 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 17.4 µs ± 491 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# grav_optim_cython True 383 µs ± 21.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 371 µs ± 40.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 481 µs ± 42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 12.6 µs ± 250 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# grav_vectorized True 41.3 µs ± 2.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 15
# grav_orig True 769 µs ± 81.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 540 µs ± 24.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 29.2 µs ± 431 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 547 µs ± 24.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 602 µs ± 42.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 750 µs ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 22.9 µs ± 738 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 58.1 µs ± 2.71 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 23
# grav_orig True 1.11 ms ± 55.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 788 µs ± 9.89 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 53 µs ± 290 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 825 µs ± 17.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 875 µs ± 78.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 1.05 ms ± 70.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 49.2 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 89.6 µs ± 4.89 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 35
# grav_orig True 1.87 ms ± 94 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 1.35 ms ± 95.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 111 µs ± 4.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 1.36 ms ± 96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 1.31 ms ± 83.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 1.54 ms ± 49.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 109 µs ± 1.89 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 159 µs ± 3.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 55
# grav_orig True 3.13 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 2.05 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 237 µs ± 7.39 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 2.07 ms ± 54.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 2.37 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 2.87 ms ± 86.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 263 µs ± 5.05 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 326 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 84
# grav_orig True 4.97 ms ± 72.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 3.5 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 484 µs ± 4.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 3.26 ms ± 76.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 3.87 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 4.81 ms ± 267 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 645 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 805 µs ± 22.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 129
# grav_orig True 9.22 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 6.14 ms ± 315 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 1.07 ms ± 19.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 5.93 ms ± 411 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 6.85 ms ± 651 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 7.68 ms ± 523 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 1.55 ms ± 28.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 1.8 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 197
# grav_orig True 17.4 ms ± 374 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 9.57 ms ± 248 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 2.32 ms ± 30.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 9.95 ms ± 284 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 9.87 ms ± 660 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 12.3 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 3.68 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 4.03 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# N: 303
# grav_orig True 31.9 ms ± 532 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 15.9 ms ± 56.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 5.36 ms ± 21.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 16.5 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 17.1 ms ± 834 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 18 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 8.38 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 10.6 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# N: 464
# grav_orig True 70.7 ms ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 31.7 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 12.2 ms ± 53.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 29.3 ms ± 646 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 28.3 ms ± 316 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 32.5 ms ± 737 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 19.7 ms ± 52.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 27.8 ms ± 214 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 711
# grav_orig True 126 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 52.7 ms ± 452 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 27.1 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 54.1 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 54.8 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 60.6 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 46.8 ms ± 755 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_vectorized True 67.2 ms ± 3.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 1089
# grav_orig True 306 ms ± 31.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 108 ms ± 2.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 61.5 ms ± 606 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 103 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 110 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 114 ms ± 5.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 107 ms ± 1.87 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_vectorized True 152 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 1669
# grav_orig True 567 ms ± 6.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 201 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 141 ms ± 271 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 207 ms ± 5.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 210 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 223 ms ± 8.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 252 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 365 ms ± 4.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 2557
# grav_orig True 1.28 s ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 418 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 339 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 432 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 452 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 470 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 605 ms ± 7.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 817 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 3916
# grav_orig True 2.83 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 900 ms ± 25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 778 ms ± 4.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 894 ms ± 19.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 951 ms ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 991 ms ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 1.41 s ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 1.88 s ± 22.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 6000
# grav_orig True 6.77 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 1.95 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 1.84 s ± 4.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 2.01 s ± 47.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 2.28 s ± 79.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 2.27 s ± 31.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 3.26 s ± 43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 4.32 s ± 9.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
这里是生成绘图的代码:
import matplotlib as mpl
import matplotlib.pyplot as plt
subplot_shape = (1, 2)
fig, axs = plt.subplots(
*subplot_shape, squeeze=False,
figsize=(8 * subplot_shape[1], 6 * subplot_shape[0]))
ax = axs[0, 0]
ax.set_title('Small-N Regime')
ax.set_xlabel('N / #')
ax.set_ylabel('timing / ms')
small_Ns = tuple(N for N in Ns if N < 1000)
for j, func in enumerate(funcs):
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
ax.plot(small_Ns, timings[j, :len(small_Ns)] * 1e3, label=func_name)
ax.legend()
ax = axs[0, 1]
ax.set_title('Full N Range')
ax.set_xlabel('N / #')
ax.set_ylabel('timing / ms')
for j, func in enumerate(funcs):
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
ax.plot(Ns, timings[j] * 1e3, label=func_name)
ax.legend()
plt.show()
编辑(朱莉娅)
只是为了好玩,我在 Julia 中实现了相同的代码(虽然我不是这方面的专家),但是对于他们吹嘘速度的所有时间来说,时间安排非常具有欺骗性。
using Random
G = 6.67408e-2
MAX_SIZE = 1000
MAX_MASS = 1000
function grav(
x_arr::Array{Float64,2},
m_arr::Array{Float64,2},
g::Float64=G)::Array{Float64,2}
n_dim, n_points = size(x_arr)
a_arr = zeros(size(x_arr))
for i in 2:n_points
d_arr = x_arr[:, i:i] .- x_arr[:, 1:i - 1]
d2_arr = sum(d_arr .^ 2, dims=1)
temp = G .* d_arr .* (d2_arr .^ -1.5)
a_arr[:, i] = -sum(m_arr[:, 1:i - 1] .* temp, dims=2)
a_arr[:, 1:i - 1] += m_arr[:, i - 1] .* temp
end
return a_arr
end
N = 10
x_arr = rand(Float64, 2, N) * MAX_SIZE
m_arr = rand(Float64, 1, N) * MAX_MASS
@time grav(x_arr, m_arr)
# 0.000111 seconds (329 allocations: 23.375 KiB)
N = 6000
x_arr = rand(Float64, 2, N) * MAX_SIZE
m_arr = rand(Float64, 1, N) * MAX_MASS
@time grav(x_arr, m_arr)
# 4.112578 seconds (269.17 k allocations: 2.426 GiB, 1.93% gc time)
# BenchmarkTools.Trial:
# memory estimate: 2.43 GiB
# allocs estimate: 269167
# --------------
# minimum time: 4.096 s (3.31% GC)
# median time: 4.169 s (2.50% GC)
# mean time: 4.169 s (2.50% GC)
# maximum time: 4.243 s (1.72% GC)
# --------------
# samples: 2
# evals/sample: 1
Numba 方法
无需太多操作即可大大加快您的代码速度。
- 加入不必要的循环(所有向量化命令都是循环)
- 做一些数学运算:
d2**(-1.5)
是一个非常昂贵的操作,可以用 d2 = 1./(np.sqrt(d2)*d2)
代替
- 安装 Intel SVML 以更快地实现
sin,sqrt,pow...
等功能
代码
import numba as nb
import numpy as np
def grav(px, py, M):
G = 6.67408*10**-2 # m³/s²T
nPoints=px.shape[0]
ax=np.zeros(nPoints,dtype=np.float64)
ay=np.zeros(nPoints,dtype=np.float64)
for b in range(1, px.shape[0]):
# computing the distance between body #b and all previous
dx = px[b] - px[:b]
dy = py[b] - py[:b]
d2 = dx*dx+dy*dy
# computing acceleration undergone by b from all previous
ax[b] = -np.sum(M[:b]*G * dx * d2**(-1.5))
ay[b] = -np.sum(M[:b]*G * dy * d2**(-1.5))
# adding for each previous, acceleration undergone by from b
ax[:b] += M[b]*G * dx * d2**(-1.5)
ay[:b] += M[b]*G * dy * d2**(-1.5)
return ax,ay
@nb.njit(fastmath=True,error_model="numpy")
def grav_2(px, py, M):
G = 6.67408*10**-2 # m³/s²T
nPoints=px.shape[0]
ax=np.zeros(nPoints,dtype=np.float64)
ay=np.zeros(nPoints,dtype=np.float64)
for b in range(1, nPoints):
sum_x=0.
sum_y=0.
for i in range(0,b):
# computing the distance between body #b and all previous
dx = px[b] - px[i]
dy = py[b] - py[i]
d2 = (dx*dx+dy*dy)
#Much less costly than d2 = d2**(-1.5)
d2 = 1./(np.sqrt(d2)*d2)
# computing acceleration undergone by b from all previous
sum_x += M[i]*G * dx * d2
sum_y += M[i]*G * dy * d2
# adding for each previous, acceleration undergone by from b
ax[i] += M[b]*G * dx * d2
ay[i] += M[b]*G * dy * d2
ax[b]=(-1)*sum_x
ay[b]=(-1)*sum_y
return ax,ay
时间
N=10
%timeit res=grav(px, py, M)
212 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit res=grav_2(px, py, M)
1.29 µs ± 7.16 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
N=100
%timeit res=grav(px, py, M)
2.86 ms ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit res=grav_2(px, py, M)
18.9 µs ± 37.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
N=1000
%timeit res=grav(px, py, M)
86.5 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res=grav_2(px, py, M)
1.79 ms ± 13.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
N=10_000
%timeit res=grav(px, py, M)
6.28 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res=grav_2(px, py, M)
180 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
N=50_000
#Take a more advandced algorithm
#Small particles doesn't significantly interact with other particles
#which are far away (KD-tree based approaches?)
我目前正在 python 使用 Runge-Kutta 方法进行微分方程系统数值积分,范围是(如标题所述)行星轨道模拟。
我正在研究(比较)加速计算的不同方法,目前我已经尝试使用非常高效的 C 模块,我想尝试使用 numpy
在这个计算中,我需要计算每对行星的相互吸引力。目前,我正在这样做:
import numpy as np
def grav(px, py, M, ax, ay):
G = 6.67408*10**-2 # m³/s²T
for b in range(1, len(px)):
# computing the distance between body #b and all previous
dx = px[b] - px[:b]
dy = py[b] - py[:b]
d2 = dx*dx+dy*dy
# computing acceleration undergone by b from all previous
ax[b] = -sum(M[:b]*G * dx * d2**(-1.5))
ay[b] = -sum(M[:b]*G * dy * d2**(-1.5))
# adding for each previous, acceleration undergone by from b
ax[:b] += M[b]*G * dx * d2**(-1.5)
ay[:b] += M[b]*G * dy * d2**(-1.5)
# input data
system_px = np.array([1., 2., 3., 4., 5., 6., 7., 9., 4., 0.])
system_py = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
system_M = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
# outout array
system_ax = np.zeros(len(system_px))
system_ay = np.zeros(len(system_px))
grav(system_px, system_py, system_M, system_ax, system_ay)
for i in range(len(system_px)):
print('body {} mass = {}(ton), position = {}(m), '
'acceleration = ({:8.4f}, {:8.4f})(m/s²)'.format(i, system_M[i],
(system_px[i], system_py[i]), system_ax[i], system_ay[i]))
我想知道是否会有一些非常通用的更 «numpythonic» 的方法来做到这一点,它可以应用于 n 行的每个子集。
借助@norok2 获得的信息,我能够在没有循环的情况下获得更快的解决方案,并部分(即仅针对 n=2)回答问题,但不能同时回答这两个问题.回复问题的解大约慢了10倍:
import numpy as np
def grav_fast(p, M):
G = 6.67408*10**-2 # m³/s²T
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1)
#or return (M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
# (both are equivalent because d is symetric)
def grav_reply(p, M):
G = 6.67408*10**-2 # m³/s²T
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
# input data
system_p = np.array([[ 1., 2., 3., 4., 5., 6., 7., 9., 4., 0.],
[ 3., 5., 1., 2., 4., 5., 6., 3., 5., 8.]])
system_M = np.array([3., 5., 1., 2., 4., 5., 6., 3., 5., 8.])
# output array
l = len(system_p[0])
system_a = np.zeros(shape=(2, l))
for test in 'grav_fast', 'grav_reply':
print('\ntesting '+test)
system_a = eval(test+'(system_p, system_M)')
for i in range(l):
print('body {} mass = {}(ton), position = {}(m), '
'acceleration = [{:8.4f} {:8.4f}](m/s²)'.format(i,
system_M[i], system_p[:, i], system_a[0, i], system_a[1, i]))
grav_fast
并没有真正回答这个问题,因为它进行了两倍的计算,并且还为被自身吸引的物体(这导致除以零)进行了计算,但是对于一个小系统,它是仍然比 python 的循环快得多(收支平衡是大约 600 个物体)。另一方面,如果 np.tril
旨在避免进行不需要的计算,则 grav_reply
可能是有效的,但情况似乎并非如此:ipython
的特定测试表明改变 np.tril
(或 np.triu
)中的极限对角线并没有显着改变执行时间。
In [1]: import numpy as np
In [2]: import random
In [3]: a = np.array([[random.randint(10, 99)
....: for _ in range(5)]
....: for _ in range(5)])
In [4]: %timeit np.dot(a, a)
1000000 loops, best of 3: 1.35 µs per loop
In [5]: %timeit np.tril(np.dot(a, a), 0)
100000 loops, best of 3: 17.3 µs per loop
In [6]: %timeit np.tril(np.dot(a, a), -2)
100000 loops, best of 3: 16.5 µs per loop
In [7]: a = np.array([[random.randint(10, 99)
....: for _ in range(100)]
....: for _ in range(100)])
In [8]: %timeit np.tril(a*a, 0)
10000 loops, best of 3: 56.3 µs per loop
In [9]: %timeit np.tril(a*a, -20)
10000 loops, best of 3: 61 µs per loop
In [10]: %timeit np.tril(a*a, 20)
10000 loops, best of 3: 54.7 µs per loop
In [11]: %timeit np.tril(a*a, 60)
10000 loops, best of 3: 54.5 µs per loop
编辑:这是每个算法的 performance/size 图
编辑:这是我写的最后一个基准测试代码:
import numpy as np
import time
import random
from matplotlib import pyplot as plt
from grav_c import grav_c, grav2_c
from numba import jit, njit
import datetime
G = 6.67408*10**-8 # m³/s²T
def grav2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * G * d2**(-1.5) * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * G * d2**(-1.5) * d
return a
grav2_jit = jit(grav2)
def grav(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * G / np.sqrt(d2) / d2 * d).sum(axis=1)
## a[:, b] = -(M[:b] * G * d2**(-1.5) * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * G / np.sqrt(d2) / d2 * d
## a[:, :b] += M[b] * G * d2**(-1.5) * d
return a
grav_jit = jit(grav)
def grav2_optim1(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
VVV = G * d2**(-1.5)
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * VVV * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * VVV * d
return a
grav2_optim1_jit = jit(grav2_optim1)
def grav_optim1(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
VVV = G / np.sqrt(d2) / d2
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[:b] * VVV * d).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * VVV * d
return a
grav_optim1_jit = jit(grav_optim1)
def grav2_optim2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
XXX = G * d * d2**(-1.5)
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[None, :b] * XXX).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * XXX
return a
grav2_optim2_jit = jit(grav2_optim2)
def grav_optim2(p, M):
l = len(p[0])
a = np.empty(shape=(2, l))
a[:, 0] = 0
for b in range(1, l):
# computing the distance between body #b and all previous
d = p[:, b:b+1] - p[:, :b]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
XXX = G * d / np.sqrt(d2) / d2
# computing Newton formula : acceleration undergone by b from all previous
a[:, b] = -(M[None, :b] * XXX).sum(axis=1)
# computing Newton formula : adding for each previous, acceleration undergone by from b
a[:, :b] += M[b] * XXX
return a
grav_optim2_jit = jit(grav_optim2)
def grav2_vect(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1)
grav2_vect_jit = jit(grav2_vect)
def grav_vect(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (M[None, :, None]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=1)
grav_vect_jit = jit(grav_vect)
# the grav*_vect_bis functions are equivalent to the grav*_vect functions because d is symetric
def grav2_vect_bis(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (-M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
grav2_vect_bis_jit = jit(grav2_vect_bis)
def grav_vect_bis(p, M):
d = p[:, :, None] - p[:, None, :]
d2 = (d*d).sum(axis=0)
d2[d2==0] = 1
return (-M[None, None, :]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=2)
grav_vect_bis_jit = jit(grav_vect_bis)
def grav2_tril(p, M):
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d*(d2**(-1.5))[None, :, :]).sum(axis=2)
grav2_tril_jit = jit(grav2_tril)
def grav_tril(p, M):
d = np.tril(p[:, :, None] - p[:, None, :], -1)
d2 = np.tril((d*d).sum(axis=0), -1)
d2[d2==0] = 1
return (M[None, :, None]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=1) - \
(M[None, None, :]*G*d/(np.sqrt(d2)*d2)[None, :, :]).sum(axis=2)
grav_tril_jit = jit(grav_tril)
testslist = [
('grav_vect', 'c'), ('grav2_vect', 'c--'), ('grav_vect_jit', 'c:'), ('grav2_vect_jit', 'c-.'),
('grav_vect_bis', 'm'), ('grav2_vect_bis', 'm--'), ('grav_vect_bis_jit', 'm:'), ('grav2_vect_bis_jit', 'm-.'),
('grav_tril', 'y'), ('grav2_tril', 'y--'), ('grav_tril_jit', 'y:'), ('grav2_tril_jit', 'y-.'),
('grav', 'r'), ('grav2', 'r--'), ('grav_jit', 'r:'), ('grav2_jit', 'r-.'),
('grav_optim1', 'g'), ('grav2_optim1', 'g--'), ('grav_optim1_jit', 'g:'), ('grav2_optim1_jit', 'g-.'),
('grav_optim2', 'b'), ('grav2_optim2', 'b--'), ('grav_optim2_jit', 'b:'), ('grav2_optim2_jit', 'b-.'),
('grav_c', 'k'),('grav2_c', 'k--')]
class ScaleType() : pass
class LinScale(ScaleType) : pass
class LogScale(ScaleType) : pass
attempts = 8
scaletype = LogScale
scalelen = 200
scalestart = 2
scalestop = 400
# input data (Multiple datasets to repeat the tests on different data)
randlist = lambda x : [float(random.randint(10000, 99999)) for _ in range(x)]
try:
# data_file = "Here you can give an npz file name to load some presaved data.npz"
with np.load(data_file) as data:
testslist = data['testslist']
N = data['N']
timings = data['timings']
perform = data['perform']
miny = data['miny']
except NameError:
L = scalestop-scalestart
if scalelen > L:
N = np.arange(scalestart, scalestop+1, 1)
elif scaletype == LinScale:
Q = L//(scalelen-1)
R = L%(scalelen-1)
N = np.array([i for r in (range(scalestart, scalestart+Q*(scalelen-1-R), Q),
range(scalestart+Q*(scalelen-1-R), scalestop+1, Q+1)) for i in r])
elif scaletype == LogScale:
X = scalestart
G = scalestop/scalestart
I = scalelen-1
while True:
NX = I*np.log(I/np.log(G)/scalestart)/np.log(G)
if NX-X < 0.0001: break
X = NX
L0 = int(scalestart*np.power(G, X/I))
G = scalestop/(scalestart+L0)
I = scalelen-1-L0
a1 = np.array(range(I))
N = np.concatenate((range(scalestart, scalestart+L0, 1),
scalestart+L0-1+np.cumsum((0.+(scalestart+L0)*(np.exp(np.log(G)*(a1+1)/I) - np.exp(np.log(G)*a1/I))).astype(int)),
[scalestop]))
print(N)
l = len(N)
timings = np.full(l, 9999999., dtype=[(test[0], np.float64) for test in testslist])
perform = np.full(l, 9999999., dtype=[(test[0], np.float64) for test in testslist])
miny = 9999999.
accum = 0 # This is to prevent system to perform unwanted optimisations
for j in range(attempts):
for i in range(l):
L = N[i]
system_p = [np.array([randlist(L), randlist(L)]) for _ in range(100)]
system_M = [np.array( randlist(L)) for _ in range(100)]
for test in testslist:
timeref = -time.time()
system_a = eval(test[0]+'(system_p[0], system_M[0])')
accum += system_a[0, 0]
count = 1
while time.time()+timeref<0.001:
for count in range(count+1, 10*count+1):
system_a = eval(test[0]+'(system_p[count%100], system_M[count%100])')
timeref += time.time()
## print(count)
timings[test[0]][i] = min(timings[test[0]][i], timeref/count)
val = timings[test[0]][i]/(N[i]*(N[i]-1)/2)
perform[test[0]][i] = val
miny = min(val, miny)
if i%10==9: print(j, end='', flush=True)
print(flush=True)
filename = "example grav, Whosebug "+str(datetime.datetime.now())+".npz"
print("saving data to", filename)
np.savez(filename, testslist=testslist, N=N, timings=timings, perform=perform, miny=miny)
ymin = 10**(np.floor(np.log10(miny)))
if (5*ymin<=miny): ymin *= 5
elif (2*ymin<=miny): ymin *= 2
print('ymin = {}, miny = {}\n'.format(ymin, miny))
figa, ax = plt.subplots(figsize=(24, 12))
for test in testslist:
ax.plot(N, timings[test[0]], test[1], label=test[0])
ax.set_title('numpy compared timings')
plt.xlabel('N (system size)')
plt.ylabel('timings (msec)')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(0., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figb, bx = plt.subplots(figsize=(24, 12))
for test in testslist:
bx.plot(N, timings[test[0]], test[1], label=test[0])
bx.set_title('numpy compared timings')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('N (system size)')
plt.ylabel('timings (msec)')
plt.grid(True)
plt.legend(loc='upper left', bbox_to_anchor=(0., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figc, cx = plt.subplots(figsize=(24, 12))
for test in testslist:
cx.plot(N, perform[test[0]], test[1], label=test[0])
plt.ylim(0, 20*ymin)
cx.set_title('numpy compared performance')
plt.xlabel('N (system size)')
plt.ylabel('performance (msec)/N²')
plt.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
figd, dx = plt.subplots(figsize=(24, 12))
for test in testslist:
dx.plot(N, perform[test[0]], test[1], label=test[0])
dx.set_title('numpy compared performance')
plt.yscale('log')
plt.xscale('log')
plt.xlabel('N (system size)')
plt.ylabel('performance (msec)/N²')
plt.grid(True)
plt.legend(loc='upper right', bbox_to_anchor=(1., 1), shadow=True, ncol=7)
plt.subplots_adjust(left=0.05, bottom=0.06, right=0.98, top=0.98)
plt.show()
有了它的 C 模块
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <numpy/arrayobject.h>
#define G 6.67408E-8L
void * failure(PyObject *type, const char *message) {
PyErr_SetString(type, message);
return NULL;
}
void * success(PyObject *var){
Py_INCREF(var);
return var;
}
static PyObject *
Py_grav_c(PyObject *self, PyObject *args)
{
PyArrayObject *p, *M;
PyObject *a;
int i, j, k;
double *pq0, *pq1, *Mq0, *Mq1, *aq0, *aq1, *p0, *p1, *a0, *a1;
if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &p, &PyArray_Type, &M))
return failure(PyExc_RuntimeError, "Failed to parse parameters.");
if (PyArray_DESCR(p)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for p array.");
if (PyArray_DESCR(M)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for M array.");
if (PyArray_NDIM(p)!=2)
return failure(PyExc_TypeError, "p must be a 2 dimensionnal array.");
if (PyArray_NDIM(M)!=1)
return failure(PyExc_TypeError, "M must be a 1 dimensionnal array.");
int K = PyArray_DIM(p, 0); // Number of dimensions you want
int L = PyArray_DIM(p, 1); // Number of bodies in the system
int S0 = PyArray_STRIDE(p, 0); // Normally, the arrays should be contiguous
int S1 = PyArray_STRIDE(p, 1); // But since they provide this Stride info
int SM = PyArray_STRIDE(M, 0); // I supposed they might not be (alignment)
if (PyArray_DIM(M, 0) != L)
return failure(PyExc_TypeError,
"P and M must have the same number of bodies.");
a = PyArray_NewLikeArray(p, NPY_ANYORDER, NULL, 0);
if (a == NULL)
return failure(PyExc_RuntimeError, "Failed to create output array.");
PyArray_FILLWBYTE(a, 0);
// For all bodies except first which has no previous body
for (i = 1,
pq0 = (double *)(PyArray_DATA(p)+S1),
Mq0 = (double *)(PyArray_DATA(M)+SM),
aq0 = (double *)(PyArray_DATA(a)+S1);
i < L;
i++,
*(void **)&pq0 += S1,
*(void **)&Mq0 += SM,
*(void **)&aq0 += S1
) {
// For all previous bodies
for (j = 0,
pq1 = (double *)PyArray_DATA(p),
Mq1 = (double *)PyArray_DATA(M),
aq1 = (double *)PyArray_DATA(a);
j < i;
j++,
*(void **)&pq1 += S1,
*(void **)&Mq1 += SM,
*(void **)&aq1 += S1
) {
// For all dimensions calculate deltas
long double d[K], d2 = 0, VVV, M0xVVV, M1xVVV;
for (k = 0,
p0 = pq0,
p1 = pq1;
k<K;
k++,
*(void **)&p0 += S0,
*(void **)&p1 += S0) {
d[k] = *p1 - *p0;
}
// calculate Hypotenuse squared
for (k = 0, d2 = 0; k<K; k++) {
d2 += d[k]*d[k];
}
// calculate interm. results once for each bodies pair (optimization)
VVV = G;
#define LIM 1
// if (d2<LIM) d2=LIM; // Variation on collision case
if (d2>0) VVV /= d2*sqrt(d2);
M0xVVV = *Mq0 * VVV; // anonymous intermediate result
M1xVVV = *Mq1 * VVV; // anonymous intermediate result
// For all dimensions calculate component of acceleration
for (k = 0,
a0 = aq0,
a1 = aq1;
k<K;
k++,
*(void **)&a0 += S0,
*(void **)&a1 += S0) {
*a0 += M1xVVV*d[k];
*a1 -= M0xVVV*d[k];
}
}
}
/* clean up and return the result */
return success(a);
}
static PyObject *
Py_grav2_c(PyObject *self, PyObject *args)
{
PyArrayObject *p, *M;
PyObject *a;
int i, j, k;
double *pq0, *pq1, *Mq0, *Mq1, *aq0, *aq1, *p0, *p1, *a0, *a1;
if (!PyArg_ParseTuple(args, "O!O!", &PyArray_Type, &p, &PyArray_Type, &M))
return failure(PyExc_RuntimeError, "Failed to parse parameters.");
if (PyArray_DESCR(p)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for p array.");
if (PyArray_DESCR(M)->type_num != NPY_DOUBLE)
return failure(PyExc_TypeError, "Type np.float64 expected for M array.");
if (PyArray_NDIM(p)!=2)
return failure(PyExc_TypeError, "p must be a 2 dimensionnal array.");
if (PyArray_NDIM(M)!=1)
return failure(PyExc_TypeError, "M must be a 1 dimensionnal array.");
int K = PyArray_DIM(p, 0); // Number of dimensions you want
int L = PyArray_DIM(p, 1); // Number of bodies in the system
int S0 = PyArray_STRIDE(p, 0); // Normally, the arrays should be contiguous
int S1 = PyArray_STRIDE(p, 1); // But since they provide this Stride info
int SM = PyArray_STRIDE(M, 0); // I supposed they might not be (alignment)
if (PyArray_DIM(M, 0) != L)
return failure(PyExc_TypeError,
"P and M must have the same number of bodies.");
a = PyArray_NewLikeArray(p, NPY_ANYORDER, NULL, 0);
if (a == NULL)
return failure(PyExc_RuntimeError, "Failed to create output array.");
PyArray_FILLWBYTE(a, 0);
// For all bodies except first which has no previous body
for (i = 1,
pq0 = (double *)(PyArray_DATA(p)+S1),
Mq0 = (double *)(PyArray_DATA(M)+SM),
aq0 = (double *)(PyArray_DATA(a)+S1);
i < L;
i++,
*(void **)&pq0 += S1,
*(void **)&Mq0 += SM,
*(void **)&aq0 += S1
) {
// For all previous bodies
for (j = 0,
pq1 = (double *)PyArray_DATA(p),
Mq1 = (double *)PyArray_DATA(M),
aq1 = (double *)PyArray_DATA(a);
j < i;
j++,
*(void **)&pq1 += S1,
*(void **)&Mq1 += SM,
*(void **)&aq1 += S1
) {
// For all dimensions calculate deltas
long double d[K], d2 = 0, VVV, M0xVVV, M1xVVV;
for (k = 0,
p0 = pq0,
p1 = pq1;
k<K;
k++,
*(void **)&p0 += S0,
*(void **)&p1 += S0) {
d[k] = *p1 - *p0;
}
// calculate Hypotenuse squared
for (k = 0, d2 = 0; k<K; k++) {
d2 += d[k]*d[k];
}
// calculate interm. results once for each bodies pair (optimization)
VVV = G;
#define LIM 1
// if (d2<LIM) d2=LIM; // Variation on collision case
if (d2>0) VVV *= pow(d2, -1.5);
M0xVVV = *Mq0 * VVV; // anonymous intermediate result
M1xVVV = *Mq1 * VVV; // anonymous intermediate result
// For all dimensions calculate component of acceleration
for (k = 0,
a0 = aq0,
a1 = aq1;
k<K;
k++,
*(void **)&a0 += S0,
*(void **)&a1 += S0) {
*a0 += M1xVVV*d[k];
*a1 -= M0xVVV*d[k];
}
}
}
/* clean up and return the result */
return success(a);
}
// exported functions list
static PyMethodDef grav_c_Methods[] = {
{"grav_c", Py_grav_c, METH_VARARGS, "grav_c(p, M)\n"
"\n"
"grav_c takes the positions and masses of m bodies in Newtonian attraction in a n dimensionnal universe,\n"
"and returns the accelerations each body undergoes.\n"
"input data take the for of a row of fload64 for each dimension of the position (in p) and one row for the masses.\n"
"It returns and array of the same shape as p for the accelerations."},
{"grav2_c", Py_grav2_c, METH_VARARGS, "grav_c(p, M)\n"
"\n"
"grav_c takes the positions and masses of m bodies in Newtonian attraction in a n dimensionnal universe,\n"
"and returns the accelerations each body undergoes.\n"
"input data take the for of a row of fload64 for each dimension of the position (in p) and one row for the masses.\n"
"It returns and array of the same shape as p for the accelerations."},
{NULL, NULL, 0, NULL} // pour terminer la liste.
};
static char grav_c_doc[] = "Compute attractions between n bodies.";
static struct PyModuleDef grav_c_module = {
PyModuleDef_HEAD_INIT,
"grav_c", /* name of module */
grav_c_doc, /* module documentation, may be NULL */
-1, /* size of per-interpreter state of the module,
or -1 if the module keeps state in global variables. */
grav_c_Methods
};
PyMODINIT_FUNC
PyInit_grav_c(void)
{
// I don't understand why yet, but the program segfaults without this.
import_array();
return PyModule_Create(&grav_c_module);
}
我猜你无法获得比 grav_vectorized()
更好的 NumPythonic 表达式,正如已经注意到的那样,它增加了计算复杂度和内存,略小于 2 倍。
但是,如果您追求速度并且仍想留在 Python 一圈, 对于这个特定的应用程序,问题似乎在于细节和输入大小。 具体来说,至少在我的机器上,时间似乎在每次迭代中都由分数幂支配,并将其结果保存在临时变量中以加快速度。
归根结底,如果您使用带有冗余优化 (grav_optim_jit()
) 的原始代码的 Numba JIT 版本,您几乎总是最优的。
对于非常小的输入,Cython 版本 (grav_loop_cython()
) 是最快的,但是一旦大小稍有增长 (N > ~100
),更好优化的 Numba 代码 (grav_orig_jit()
) 就会接管讲台。对于更大的输入,NumPy-optimized 但仍然是 n (n + 1) / 2
(在 space 维度中向量化)(grav_iter()
)成为 runner-up,尽管在small-N
政权。
请注意,完全矢量化版本 (grav_vectorized()
) 对于小的 N
表现相当好,但一旦输入大小增加就会出现不足。
另请注意,grav_iter_jit()
基本上不受 Numba JIT 的影响(可能稍微减慢),因为核心优化是在 NumPy 方面。
通过使用 -O3 -march=native
选项编译,Cython 版本可能会变得更快,但我还没有尝试过。
以下是上述内容的详细内容。
这是我测试的代码:
import numpy as np
import numba as nb
G = 6.67408e-2
MAX_SIZE = 1000
MAX_MASS = 1000
原始代码的稍微完善的版本
def grav_orig(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
a_arr[0, i] = -np.sum(m_arr[:i] * dx_arr * G * d2_arr**(-1.5))
a_arr[1, i] = -np.sum(m_arr[:i] * dy_arr * G * d2_arr**(-1.5))
a_arr[0, :i] += m_arr[i] * dx_arr * G * d2_arr**(-1.5)
a_arr[1, :i] += m_arr[i] * dy_arr * G * d2_arr**(-1.5)
return a_arr
优化版
def grav_optim(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
temp = G * d2_arr**(-1.5)
temp_x = dx_arr * temp
temp_y = dy_arr * temp
a_arr[0, i] = -np.sum(m_arr[:i] * temp_x)
a_arr[1, i] = -np.sum(m_arr[:i] * temp_y)
a_arr[0, :i] += m_arr[i] * temp_x
a_arr[1, :i] += m_arr[i] * temp_y
return a_arr
对应的 Numba JITted 版本:
grav_optim_jit = nb.jit(grav_optim, nopython=True, nogil=True)
与原始方法类似但沿空间维度矢量化的方法:
def grav_iter(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
d_arr = x_arr[:, i:i + 1] - x_arr[:, :i]
d2_arr = np.sum(d_arr ** 2, axis=0)
temp = G * d_arr * d2_arr[None, :]**(-1.5)
a_arr[:, i] = -np.sum(m_arr[None, :i] * temp, axis=-1)
a_arr[:, :i] += m_arr[None, i] * temp
return a_arr
对应的 Numba JITted 版本:
grav_iter_jit = nb.jit(grav_iter)
完全矢量化版本:
def grav_vectorized(x_arr, m_arr, G=G):
d_arr = x_arr[:, :, None] - x_arr[:, None, :]
d2_arr = np.sum(d_arr ** 2, axis=0)
d2_arr[d2_arr == 0] = 1
return np.sum((m_arr[None, :, None] * G * d_arr * d2_arr[None, ...]**(-1.5)), axis=1)
Cythonized 版本
%%cython -a
#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
import numpy as np
cimport cython
cimport numpy as np
DTYPE = np.double
ctypedef np.double_t DTYPE_t
cdef DTYPE_t G = 6.67408e-2
def grav_optim_cython(x_arr, m_arr, G=G):
n_dim, n_points = x_arr.shape
a_arr = np.zeros_like(x_arr, dtype=np.float64)
for i in range(1, n_points):
dx_arr = x_arr[0, i] - x_arr[0, :i]
dy_arr = x_arr[1, i] - x_arr[1, :i]
d2_arr = dx_arr ** 2 + dy_arr ** 2
temp = G * d2_arr**(-1.5)
temp_x = dx_arr * temp
temp_y = dy_arr * temp
a_arr[0, i] = -np.sum(m_arr[:i] * temp_x)
a_arr[1, i] = -np.sum(m_arr[:i] * temp_y)
a_arr[0, :i] += m_arr[i] * temp_x
a_arr[1, :i] += m_arr[i] * temp_y
return a_arr
Cythonized loop-explicit 版本
%load_ext Cython
%%cython -a
#cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
import numpy as np
cimport cython
cimport numpy as np
DTYPE = np.double
ctypedef np.double_t DTYPE_t
cdef DTYPE_t G = 6.67408e-2
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def grav_loop_cython(
np.ndarray[DTYPE_t, ndim=2] x_arr,
np.ndarray[DTYPE_t, ndim=1] m_arr,
DTYPE_t G=G):
cdef int ndim = x_arr.shape[0]
cdef int n_points = x_arr.shape[1]
cdef np.ndarray[DTYPE_t, ndim=2] a_arr = np.zeros((ndim, n_points), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=2] dx_arr = np.zeros((ndim, n_points - 1), dtype=DTYPE)
cdef np.ndarray[DTYPE_t, ndim=1] d2_arr = np.zeros((n_points - 1), dtype=DTYPE)
cdef DTYPE_t temp
for j in range(1, n_points):
# compute the pair-wise differences
for jj in range(j):
for i in range(ndim):
dx_arr[i, jj] = x_arr[i, j] - x_arr[i, jj]
# compute the pair-wise squared Euclidean distances
for jj in range(j):
d2_arr[jj] = 0
for i in range(ndim):
d2_arr[jj] += dx_arr[i, jj] ** 2.0
for i in range(ndim):
for jj in range(j):
temp = G * dx_arr[i, jj] * d2_arr[jj] ** -1.5
a_arr[i, j] -= (m_arr[jj] * temp)
a_arr[i, jj] += (m_arr[j] * temp)
return a_arr
时间
我用以下代码为所有这些计时:
funcs = (
grav_orig,
grav_optim,
grav_optim_jit,
grav_optim_cython,
grav_iter,
grav_iter_jit,
grav_loop_cython,
grav_vectorized,
)
Ns = np.geomspace(1e1, 6e3, 16).astype(int)
timings = np.zeros((len(funcs), len(Ns)))
np.random.seed(0)
for i, N in enumerate(Ns):
print('N: ', N)
x_arr = np.random.random((2, N)) * MAX_SIZE
m_arr = np.random.random((N,)) * MAX_MASS
for j, func in enumerate(funcs):
test_result = np.all(np.isclose(grav_orig(x_arr, m_arr), func(x_arr, m_arr)))
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
print(f'{func_name:20s} {test_result} ', end='')
t = %timeit -o func(x_arr, m_arr)
timings[j, i] = t.best
print()
数量:
# N: 10
# grav_orig True 501 µs ± 37.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 358 µs ± 20 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 17.4 µs ± 491 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# grav_optim_cython True 383 µs ± 21.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 371 µs ± 40.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 481 µs ± 42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 12.6 µs ± 250 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# grav_vectorized True 41.3 µs ± 2.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 15
# grav_orig True 769 µs ± 81.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 540 µs ± 24.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 29.2 µs ± 431 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 547 µs ± 24.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 602 µs ± 42.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 750 µs ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 22.9 µs ± 738 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 58.1 µs ± 2.71 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 23
# grav_orig True 1.11 ms ± 55.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim True 788 µs ± 9.89 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 53 µs ± 290 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 825 µs ± 17.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 875 µs ± 78.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 1.05 ms ± 70.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 49.2 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 89.6 µs ± 4.89 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 35
# grav_orig True 1.87 ms ± 94 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 1.35 ms ± 95.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_jit True 111 µs ± 4.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_optim_cython True 1.36 ms ± 96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter True 1.31 ms ± 83.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_iter_jit True 1.54 ms ± 49.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_loop_cython True 109 µs ± 1.89 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# grav_vectorized True 159 µs ± 3.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# N: 55
# grav_orig True 3.13 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 2.05 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 237 µs ± 7.39 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 2.07 ms ± 54.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 2.37 ms ± 36.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 2.87 ms ± 86.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 263 µs ± 5.05 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 326 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 84
# grav_orig True 4.97 ms ± 72.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 3.5 ms ± 241 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 484 µs ± 4.96 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 3.26 ms ± 76.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 3.87 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 4.81 ms ± 267 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 645 µs ± 11.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 805 µs ± 22.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 129
# grav_orig True 9.22 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 6.14 ms ± 315 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 1.07 ms ± 19.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_optim_cython True 5.93 ms ± 411 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 6.85 ms ± 651 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 7.68 ms ± 523 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 1.55 ms ± 28.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# grav_vectorized True 1.8 ms ± 39.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# N: 197
# grav_orig True 17.4 ms ± 374 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim True 9.57 ms ± 248 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 2.32 ms ± 30.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 9.95 ms ± 284 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 9.87 ms ± 660 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 12.3 ms ± 1.03 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 3.68 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 4.03 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# N: 303
# grav_orig True 31.9 ms ± 532 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 15.9 ms ± 56.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_jit True 5.36 ms ± 21.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 16.5 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter True 17.1 ms ± 834 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_iter_jit True 18 ms ± 247 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_loop_cython True 8.38 ms ± 37 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 10.6 ms ± 177 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# N: 464
# grav_orig True 70.7 ms ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 31.7 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 12.2 ms ± 53.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_optim_cython True 29.3 ms ± 646 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 28.3 ms ± 316 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 32.5 ms ± 737 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 19.7 ms ± 52.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# grav_vectorized True 27.8 ms ± 214 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 711
# grav_orig True 126 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim True 52.7 ms ± 452 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 27.1 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 54.1 ms ± 623 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 54.8 ms ± 2.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 60.6 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 46.8 ms ± 755 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_vectorized True 67.2 ms ± 3.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 1089
# grav_orig True 306 ms ± 31.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 108 ms ± 2.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_jit True 61.5 ms ± 606 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 103 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter True 110 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_iter_jit True 114 ms ± 5.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_loop_cython True 107 ms ± 1.87 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_vectorized True 152 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# N: 1669
# grav_orig True 567 ms ± 6.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 201 ms ± 4.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 141 ms ± 271 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# grav_optim_cython True 207 ms ± 5.49 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 210 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 223 ms ± 8.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 252 ms ± 2.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 365 ms ± 4.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 2557
# grav_orig True 1.28 s ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 418 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 339 ms ± 2.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 432 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 452 ms ± 12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 470 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 605 ms ± 7.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 817 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 3916
# grav_orig True 2.83 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 900 ms ± 25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 778 ms ± 4.23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 894 ms ± 19.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 951 ms ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 991 ms ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 1.41 s ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 1.88 s ± 22.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# N: 6000
# grav_orig True 6.77 s ± 171 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim True 1.95 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_jit True 1.84 s ± 4.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_optim_cython True 2.01 s ± 47.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter True 2.28 s ± 79.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_iter_jit True 2.27 s ± 31.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_loop_cython True 3.26 s ± 43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# grav_vectorized True 4.32 s ± 9.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
这里是生成绘图的代码:
import matplotlib as mpl
import matplotlib.pyplot as plt
subplot_shape = (1, 2)
fig, axs = plt.subplots(
*subplot_shape, squeeze=False,
figsize=(8 * subplot_shape[1], 6 * subplot_shape[0]))
ax = axs[0, 0]
ax.set_title('Small-N Regime')
ax.set_xlabel('N / #')
ax.set_ylabel('timing / ms')
small_Ns = tuple(N for N in Ns if N < 1000)
for j, func in enumerate(funcs):
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
ax.plot(small_Ns, timings[j, :len(small_Ns)] * 1e3, label=func_name)
ax.legend()
ax = axs[0, 1]
ax.set_title('Full N Range')
ax.set_xlabel('N / #')
ax.set_ylabel('timing / ms')
for j, func in enumerate(funcs):
func_name = func.__name__ + ('_jit' if '__numba__' in dir(func) else '')
ax.plot(Ns, timings[j] * 1e3, label=func_name)
ax.legend()
plt.show()
编辑(朱莉娅)
只是为了好玩,我在 Julia 中实现了相同的代码(虽然我不是这方面的专家),但是对于他们吹嘘速度的所有时间来说,时间安排非常具有欺骗性。
using Random
G = 6.67408e-2
MAX_SIZE = 1000
MAX_MASS = 1000
function grav(
x_arr::Array{Float64,2},
m_arr::Array{Float64,2},
g::Float64=G)::Array{Float64,2}
n_dim, n_points = size(x_arr)
a_arr = zeros(size(x_arr))
for i in 2:n_points
d_arr = x_arr[:, i:i] .- x_arr[:, 1:i - 1]
d2_arr = sum(d_arr .^ 2, dims=1)
temp = G .* d_arr .* (d2_arr .^ -1.5)
a_arr[:, i] = -sum(m_arr[:, 1:i - 1] .* temp, dims=2)
a_arr[:, 1:i - 1] += m_arr[:, i - 1] .* temp
end
return a_arr
end
N = 10
x_arr = rand(Float64, 2, N) * MAX_SIZE
m_arr = rand(Float64, 1, N) * MAX_MASS
@time grav(x_arr, m_arr)
# 0.000111 seconds (329 allocations: 23.375 KiB)
N = 6000
x_arr = rand(Float64, 2, N) * MAX_SIZE
m_arr = rand(Float64, 1, N) * MAX_MASS
@time grav(x_arr, m_arr)
# 4.112578 seconds (269.17 k allocations: 2.426 GiB, 1.93% gc time)
# BenchmarkTools.Trial:
# memory estimate: 2.43 GiB
# allocs estimate: 269167
# --------------
# minimum time: 4.096 s (3.31% GC)
# median time: 4.169 s (2.50% GC)
# mean time: 4.169 s (2.50% GC)
# maximum time: 4.243 s (1.72% GC)
# --------------
# samples: 2
# evals/sample: 1
Numba 方法
无需太多操作即可大大加快您的代码速度。
- 加入不必要的循环(所有向量化命令都是循环)
- 做一些数学运算:
d2**(-1.5)
是一个非常昂贵的操作,可以用d2 = 1./(np.sqrt(d2)*d2)
代替
- 安装 Intel SVML 以更快地实现
sin,sqrt,pow...
等功能
代码
import numba as nb
import numpy as np
def grav(px, py, M):
G = 6.67408*10**-2 # m³/s²T
nPoints=px.shape[0]
ax=np.zeros(nPoints,dtype=np.float64)
ay=np.zeros(nPoints,dtype=np.float64)
for b in range(1, px.shape[0]):
# computing the distance between body #b and all previous
dx = px[b] - px[:b]
dy = py[b] - py[:b]
d2 = dx*dx+dy*dy
# computing acceleration undergone by b from all previous
ax[b] = -np.sum(M[:b]*G * dx * d2**(-1.5))
ay[b] = -np.sum(M[:b]*G * dy * d2**(-1.5))
# adding for each previous, acceleration undergone by from b
ax[:b] += M[b]*G * dx * d2**(-1.5)
ay[:b] += M[b]*G * dy * d2**(-1.5)
return ax,ay
@nb.njit(fastmath=True,error_model="numpy")
def grav_2(px, py, M):
G = 6.67408*10**-2 # m³/s²T
nPoints=px.shape[0]
ax=np.zeros(nPoints,dtype=np.float64)
ay=np.zeros(nPoints,dtype=np.float64)
for b in range(1, nPoints):
sum_x=0.
sum_y=0.
for i in range(0,b):
# computing the distance between body #b and all previous
dx = px[b] - px[i]
dy = py[b] - py[i]
d2 = (dx*dx+dy*dy)
#Much less costly than d2 = d2**(-1.5)
d2 = 1./(np.sqrt(d2)*d2)
# computing acceleration undergone by b from all previous
sum_x += M[i]*G * dx * d2
sum_y += M[i]*G * dy * d2
# adding for each previous, acceleration undergone by from b
ax[i] += M[b]*G * dx * d2
ay[i] += M[b]*G * dy * d2
ax[b]=(-1)*sum_x
ay[b]=(-1)*sum_y
return ax,ay
时间
N=10
%timeit res=grav(px, py, M)
212 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit res=grav_2(px, py, M)
1.29 µs ± 7.16 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
N=100
%timeit res=grav(px, py, M)
2.86 ms ± 41.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit res=grav_2(px, py, M)
18.9 µs ± 37.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
N=1000
%timeit res=grav(px, py, M)
86.5 ms ± 448 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res=grav_2(px, py, M)
1.79 ms ± 13.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
N=10_000
%timeit res=grav(px, py, M)
6.28 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res=grav_2(px, py, M)
180 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
N=50_000
#Take a more advandced algorithm
#Small particles doesn't significantly interact with other particles
#which are far away (KD-tree based approaches?)