I am trying to use Numba to speed up some computations. Calling the nnleapfrog_integrate function below causes a segmentation fault that crashes the Python process. The function works if the parallel=True flag is removed from its jit decorator, but then it runs single-threaded. I want this function to run as fast as possible, so I would like it to run multi-threaded and use all of the CPU cores.
from numba import jit, prange
import numpy as np

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True, parallel=True)
def nnleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    # Snapshot buffers: one snapshot every r_steps integration steps
    pos_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N, 3))
    for s in range(i_steps):
        # Kick-drift-kick leapfrog step
        vel += acc * dt / 2.0
        pos += vel * dt
        # Pairwise gravitational accelerations; this is the loop I want parallelized
        for i in prange(N):
            acc[i, 0] = 0
            acc[i, 1] = 0
            acc[i, 2] = 0
            for j in range(N):
                dx = pos[j, 0] - pos[i, 0]
                dy = pos[j, 1] - pos[i, 1]
                dz = pos[j, 2] - pos[i, 2]
                inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
                acc[i, 0] += G * (dx * inv_r3) * mass[j]
                acc[i, 1] += G * (dy * inv_r3) * mass[j]
                acc[i, 2] += G * (dz * inv_r3) * mass[j]
        vel += acc * dt / 2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600 * 24 * 30          # simulate 30 days with a 60 s time step
i_steps = int(t_max / dt)       # number of integration steps
r_steps = int(3600 * 24 / dt)   # record one snapshot per simulated day
r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
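In case the environment matters, this is roughly how I would check the Numba version and thread configuration (assuming a recent enough Numba that numba.get_num_threads() exists; numba.threading_layer() only reports the active backend after a parallel-compiled function has actually run):

import numba
print(numba.__version__)               # Numba release in use
print(numba.config.NUMBA_NUM_THREADS)  # maximum number of threads available to Numba
print(numba.get_num_threads())         # threads currently configured for parallel regions
# numba.threading_layer() would show which backend (tbb / omp / workqueue) is active,
# but it can only be queried after a parallel-compiled function has run successfully.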
Here is what I have already tried. Since only the for i in prange(N): loop is suitable for parallelization, I split it out into a separate function, getAcc, which carries the parallel=True flag and does use all of the CPU cores.
from numba import jit, prange
import numpy as np

@jit('f8[:, ::1](f8[:, ::1], f8[::1], f8, f8)', nopython=True, parallel=True)
def getAcc(pos, mass, G, softening):
    # Pairwise gravitational accelerations, parallelized over the target bodies
    N = pos.shape[0]
    a = np.zeros((N, 3))
    for i in prange(N):
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
            a[i, 0] += G * (dx * inv_r3) * mass[j]
            a[i, 1] += G * (dy * inv_r3) * mass[j]
            a[i, 2] += G * (dz * inv_r3) * mass[j]
    return a

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True)
def nleapfrog_integrate(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    data_idx = 0
    acc = getAcc(pos, mass, G, softening)
    for i in range(i_steps):
        # Kick-drift-kick leapfrog step, delegating the acceleration to getAcc
        vel += acc * dt / 2.0
        pos += vel * dt
        acc = getAcc(pos, mass, G, softening)
        vel += acc * dt / 2.0
        if i % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data

N = 10
dt = 60
pos = np.random.rand(N, 3)
vel = np.random.rand(N, 3)
m = np.random.rand(N)
softening = 1e3
G = 6.67430e-11
t_max = 3600 * 24 * 30
i_steps = int(t_max / dt)
r_steps = int(3600 * 24 / dt)
r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
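One caveat when comparing the two versions: both functions modify pos and vel in place, so each call needs its own copy of the initial state (and, since the parallel build crashes, nnleapfrog_integrate here means the build without parallel=True). A sketch of such a check:

pos_a, vel_a = pos.copy(), vel.copy()
pos_b, vel_b = pos.copy(), vel.copy()
r_a, v_a = nnleapfrog_integrate(pos_a, vel_a, m, i_steps, r_steps, dt, G, softening)
r_b, v_b = nleapfrog_integrate(pos_b, vel_b, m, i_steps, r_steps, dt, G, softening)
# The two versions handle the very first half-kick differently (acc starts at zeros in
# nnleapfrog_integrate but is computed up front in nleapfrog_integrate), so small
# differences between the trajectories are expected.
print(np.abs(r_a - r_b).max(), np.abs(v_a - v_b).max())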
However, this turned out to be more than 3x slower than the single-threaded version of the original function with that loop inlined.
In [4]: %timeit r_i, v_i = nleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
8.51 s ± 46.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [5]: %timeit r_i, v_i = nnleapfrog_integrate(pos, vel, m, i_steps, r_steps, dt, G, softening)
2.53 s ± 18.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
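Part of the gap is presumably the fresh np.zeros((N, 3)) allocation inside getAcc plus the function-call boundary on every step. One variation would be to write into a preallocated array instead; getAcc_into is just a name I made up for this sketch, and I have not measured whether it closes the gap:

# Hypothetical variant of getAcc that fills a caller-provided array instead of
# allocating a new one on every call (untested; for illustration only).
@jit('void(f8[:, ::1], f8[::1], f8, f8, f8[:, ::1])', nopython=True, parallel=True)
def getAcc_into(pos, mass, G, softening, a):
    N = pos.shape[0]
    for i in prange(N):
        a[i, 0] = 0.0
        a[i, 1] = 0.0
        a[i, 2] = 0.0
        for j in range(N):
            dx = pos[j, 0] - pos[i, 0]
            dy = pos[j, 1] - pos[i, 1]
            dz = pos[j, 2] - pos[i, 2]
            inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
            a[i, 0] += G * (dx * inv_r3) * mass[j]
            a[i, 1] += G * (dy * inv_r3) * mass[j]
            a[i, 2] += G * (dz * inv_r3) * mass[j]
# The integrator would then preallocate acc = np.zeros((N, 3)) once and call
# getAcc_into(pos, mass, G, softening, acc) each step instead of acc = getAcc(...).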
So, to get the best performance, I need the original function, with the for i in prange(N): loop inlined, to run multi-threaded. How can I make that work without the segmentation fault?
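To illustrate the kind of structure I am after, here is a sketch that keeps the prange loop inline in the integrator (nnleapfrog_integrate_v2 is just a name for the sketch). It also writes the kick and drift updates element-wise inside prange loops instead of as whole-array expressions, so that every write stays within one row per iteration; I do not know whether that detail matters for the crash.

@jit('Tuple((f8[:,:,::1],f8[:,:,::1]))(f8[:,::1], f8[:,::1], f8[::1], i8, i8, i8, f8, f8)', nopython=True, parallel=True)
def nnleapfrog_integrate_v2(pos, vel, mass, i_steps, r_steps, dt, G, softening):
    N = pos.shape[0]
    pos_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    vel_data = np.zeros((int(np.ceil(i_steps / r_steps)), N, 3))
    data_idx = 0
    acc = np.zeros((N, 3))
    for s in range(i_steps):
        # First half-kick and drift, written per body
        for i in prange(N):
            for k in range(3):
                vel[i, k] += acc[i, k] * dt / 2.0
                pos[i, k] += vel[i, k] * dt
        # Pairwise accelerations, kept inline in the integrator
        for i in prange(N):
            acc[i, 0] = 0.0
            acc[i, 1] = 0.0
            acc[i, 2] = 0.0
            for j in range(N):
                dx = pos[j, 0] - pos[i, 0]
                dy = pos[j, 1] - pos[i, 1]
                dz = pos[j, 2] - pos[i, 2]
                inv_r3 = (dx**2 + dy**2 + dz**2 + softening**2)**(-1.5)
                acc[i, 0] += G * (dx * inv_r3) * mass[j]
                acc[i, 1] += G * (dy * inv_r3) * mass[j]
                acc[i, 2] += G * (dz * inv_r3) * mass[j]
        # Second half-kick
        for i in prange(N):
            for k in range(3):
                vel[i, k] += acc[i, k] * dt / 2.0
        if s % r_steps == 0:
            pos_data[data_idx] = pos
            vel_data[data_idx] = vel
            data_idx += 1
    return pos_data, vel_data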