内存泄漏(cython + numpy)
memory leak (cython + numpy)
我正在努力寻找这段代码中的漏洞所在
kullback.pyx
import numpy as np
cimport numpy as np
from libcpp.vector cimport vector
import scipy.stats as st
import matplotlib.pyplot as plt
cdef vector[double] minmax(double i, dict a):
cdef double minmax
cdef vector[double] out
try:
minmax= min(list(filter(lambda x: x > i, a.keys())))
except ValueError:
minmax = min(a.keys())
cdef double maxmin
try:
maxmin = max(list(filter(lambda x: x < i, a.keys())))
except ValueError:
maxmin = max(a.keys())
out.push_back(minmax)
out.push_back(maxmin)
return out
def KullbackLeibler(args):
cdef np.ndarray[np.double_t, ndim = 1] psample = args[0]
cdef np.ndarray[np.double_t, ndim = 1] qsample = args[1]
cdef int n = args[2]
a = plt.hist(psample, bins = n)
cdef np.ndarray[np.double_t, ndim = 1] ax = a[1]
cdef np.ndarray[np.double_t, ndim = 1] ay = a[0]
b = plt.hist(qsample, bins = ax)
adict = dict(zip(ax, ay))
ax = ax[:-1]
cdef np.ndarray[np.double_t, ndim = 1] bx = b[1]
cdef np.ndarray[np.double_t, ndim = 1] by = b[0]
bdict = dict(zip(bx, by))
bx = bx[:-1]
cdef vector[double] kl
cdef int N = np.sum(ay)
cdef int i
cdef double p_minmax, p_maxmin, q_minmax, q_maxmin
cdef double KL
for i in range(len(psample)):
ptmp = minmax(psample[i], adict)
p_minmax = ptmp[0]
p_maxmin = ptmp[1]
qtmp = minmax(psample[i], bdict)
q_minmax = qtmp[0]
q_maxmin = qtmp[1]
pdensity = adict[p_maxmin]/ N
qdensity = np.max([bdict[q_maxmin]/ N, 10e-20])
KL = pdensity * np.log(pdensity/qdensity)
kl.push_back(KL)
cdef double res = np.sum(kl)
del args, psample, qsample, ax, ay, bx, by, adict, bdict
return res
这里是我启动的主要平台
main.py
import kullback as klcy #@unresolvedimport
import datetime
import numpy as np
import pathos.pools as pp
import objgraph
np.random.seed(10)
ncore = 4
pool = pp.ProcessPool(ncore)
KL = []
for i in range(2500):
time1 = datetime.datetime.now()
n = 500
x = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
y = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
data = np.array(list(zip(x,y,[n/10]*ncore)))
kl = pool.map(klcy.KullbackLeibler, data)
time2 = datetime.datetime.now()
print(i, time2 - time1, sep = " ")
print(objgraph.show_growth())
KL.append(kl)
函数KullbackLeibler
以两个数组和一个整数作为输入
我已经尝试过的:
使用 objgraph 来识别增长的对象,不幸的是它似乎不适用于 C 定义的数组(它仅标识我将结果附加为增长的列表)
删除pyx函数末尾的所有数组
尝试在 pyx 文件和主文件中放置一个 gc.collect()
调用,但没有任何改变
内存消耗随着迭代次数和每次迭代所需的时间(从0.6s到4s以上)线性增长。这是我第一次尝试使用 cython,任何建议都会有用。
问题与数组无关。我没有关闭 matplotlib 图
a = plt.hist(psample, bins = n)
b = plt.hist(qsample, bins = ax)
即使我没有显示它们,它们仍然被绘制,消耗了之后从未释放的内存。感谢@DavidW 在评论中让我注意到。
我正在努力寻找这段代码中的漏洞所在
kullback.pyx
import numpy as np
cimport numpy as np
from libcpp.vector cimport vector
import scipy.stats as st
import matplotlib.pyplot as plt
cdef vector[double] minmax(double i, dict a):
cdef double minmax
cdef vector[double] out
try:
minmax= min(list(filter(lambda x: x > i, a.keys())))
except ValueError:
minmax = min(a.keys())
cdef double maxmin
try:
maxmin = max(list(filter(lambda x: x < i, a.keys())))
except ValueError:
maxmin = max(a.keys())
out.push_back(minmax)
out.push_back(maxmin)
return out
def KullbackLeibler(args):
cdef np.ndarray[np.double_t, ndim = 1] psample = args[0]
cdef np.ndarray[np.double_t, ndim = 1] qsample = args[1]
cdef int n = args[2]
a = plt.hist(psample, bins = n)
cdef np.ndarray[np.double_t, ndim = 1] ax = a[1]
cdef np.ndarray[np.double_t, ndim = 1] ay = a[0]
b = plt.hist(qsample, bins = ax)
adict = dict(zip(ax, ay))
ax = ax[:-1]
cdef np.ndarray[np.double_t, ndim = 1] bx = b[1]
cdef np.ndarray[np.double_t, ndim = 1] by = b[0]
bdict = dict(zip(bx, by))
bx = bx[:-1]
cdef vector[double] kl
cdef int N = np.sum(ay)
cdef int i
cdef double p_minmax, p_maxmin, q_minmax, q_maxmin
cdef double KL
for i in range(len(psample)):
ptmp = minmax(psample[i], adict)
p_minmax = ptmp[0]
p_maxmin = ptmp[1]
qtmp = minmax(psample[i], bdict)
q_minmax = qtmp[0]
q_maxmin = qtmp[1]
pdensity = adict[p_maxmin]/ N
qdensity = np.max([bdict[q_maxmin]/ N, 10e-20])
KL = pdensity * np.log(pdensity/qdensity)
kl.push_back(KL)
cdef double res = np.sum(kl)
del args, psample, qsample, ax, ay, bx, by, adict, bdict
return res
这里是我启动的主要平台
main.py
import kullback as klcy #@unresolvedimport
import datetime
import numpy as np
import pathos.pools as pp
import objgraph
np.random.seed(10)
ncore = 4
pool = pp.ProcessPool(ncore)
KL = []
for i in range(2500):
time1 = datetime.datetime.now()
n = 500
x = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
y = [np.random.normal(size = n, scale = 1) for j in range(ncore)]
data = np.array(list(zip(x,y,[n/10]*ncore)))
kl = pool.map(klcy.KullbackLeibler, data)
time2 = datetime.datetime.now()
print(i, time2 - time1, sep = " ")
print(objgraph.show_growth())
KL.append(kl)
函数KullbackLeibler
以两个数组和一个整数作为输入
我已经尝试过的:
使用 objgraph 来识别增长的对象,不幸的是它似乎不适用于 C 定义的数组(它仅标识我将结果附加为增长的列表)
删除pyx函数末尾的所有数组
尝试在 pyx 文件和主文件中放置一个
gc.collect()
调用,但没有任何改变
内存消耗随着迭代次数和每次迭代所需的时间(从0.6s到4s以上)线性增长。这是我第一次尝试使用 cython,任何建议都会有用。
问题与数组无关。我没有关闭 matplotlib 图
a = plt.hist(psample, bins = n)
b = plt.hist(qsample, bins = ax)
即使我没有显示它们,它们仍然被绘制,消耗了之后从未释放的内存。感谢@DavidW 在评论中让我注意到。