最佳传输代码(Scipy 线性规划优化)需要更长的时间
Optimal transport code (Scipy linear programming optimization) takes much longer time
我一直在尝试计算两个一维高斯分布之间的 Wasserstein 距离,均值为 0.0 和 4.0,方差分别为 9.0 和 16.0。我使用了 scipy.linprog.optimize 模块并使用了以下 link 中所述的“内点”方法
https://yetanothermathprogrammingconsultant.blogspot.com/2019/10/scipy-linear-programming-large-but-easy.html。
然而,它需要超过 17 个小时,并且仍然(我的代码是)运行 运行 解决 300 x 300 LP 矩阵问题(即)300 个源节点和 300 个目标节点。但是,该文件表示可以用 1000 个源节点和 1000 个目标节点来解决问题。(即)可以用 1,000,000(一百万)个决定性变量来解决 LP 问题。我的代码有什么问题?为什么需要这么长时间?我们是否需要大内存(或集群)来解决此类问题?
我的代码
from datetime import datetime
start_time = datetime.now()
from scipy.optimize import linprog
import scipy
#Initializing the LP matrix
Piprob=np.zeros(500*500).reshape(500,500)
def Piprobmin(Krv,rhoi,rhoj):
r1=np.shape(Krv)[0]
r2=np.shape(Krv)[1]
print("r1,r2",r1,r2)
#Computing the LP Matrix which has just two ones in each column
pmat=np.zeros((r1+r2)*(r1*r2)).reshape((r1+r2),(r1*r2))
for i in range(r1+r2):
for j in range(r1*r2):
if((i<r1) and (j<((i+1)*r2)) and (j>=(i*r2))):
pmat[i][j]=1
if(i>=r1):
for k in range(r1*r2):
if j==(i-r1)+(k*r2):
pmat[i][j]=1
#flattening the cost matrix into one dimensional array
krvf=Krv.flatten()
tempr=np.append(rhoi,rhoj)
Xv=[] #Creating list for joint probability matrix elements
res = scipy.optimize.linprog(c=krvf,method='interior-point',A_eq=pmat,b_eq=tempr,options=
{'sparse':True, 'disp':True})
print("res=\n",res)
wv=res.fun
for l1 in range(r1*r2):
Xv.append(res.x[l1])
Yv=np.array(Xv)
Yv=Yv.reshape(r1,r2)
#returning Yv-joint probability and ,Wv-minimized wasserstein distance
return Yv,wv
Piprob,W=Piprobmin(K,result1,result2) #K-cost function matrix,result1 is the first
#marginal,result2 is the second marginal
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
成本函数的大小为 300 X 300 大小,每个边际有 300 个点(总共 600 个约束)。我验证了我的成本函数是对称的和非负的。每个边际总和为 1,因为它们只是概率。
在blog post中多次使用稀疏这个词。不无道理。将 A 矩阵存储为稀疏矩阵极为重要。否则,你将无法处理大问题。博文post详细讨论了运输LP矩阵对内存要求的差异,所以这一点应该很难错过。
下面是一些示例代码,说明如何使用 scipy.optimize.linprog 设置具有 1000 个源节点和 1000 个目标节点的交通模型。同样,LP 矩阵有 2,000 行和 1,000,000 列并且存储稀疏。
import numpy as np
import scipy as sp
import scipy.sparse as sparse
import scipy.optimize as opt
from memory_profiler import profile
def GenerateData(M,N):
np.random.seed(123)
# form objective function
c = np.random.uniform(0,10,(M,N))
# demand, supply
s = np.random.uniform(0,15,M)
d = np.random.uniform(0,10,N)
assert np.sum(d) <= np.sum(s), "supply too small"
#print('c',c)
#print('s',s)
#print('d',d)
return {'c':c, 's':s, 'd':d, 'n':N, 'm':M}
def FormLPData(data):
rhs = np.append(data['s'],-data['d'])
# form A
# column (i,j)=n*i+j has two nonzeroes:
# 1 at row i with rhs supply(i)
# 1 at row N+j with rhs demand(j)
N = data['n']
M = data['m']
NZ = 2*N*M
irow = np.zeros(NZ, dtype=int)
jcol = np.zeros(NZ, dtype=int)
value = np.zeros(NZ)
for i in range(N):
for j in range(M):
k = M*i+j
k1 = 2*k
k2 = k1+1
irow[k1] = i
jcol[k1] = k
value[k1] = 1.0
irow[k2] = N+j
jcol[k2] = k
value[k2] = -1.0
A = sparse.coo_matrix((value, (irow, jcol)))
#print('A',A)
#print('rhs',rhs)
return {'A':A,'rhs':rhs}
@profile
def run():
# dimensions
M = 1000 # sources
N = 1000 # destinations
data = GenerateData(M,N)
lpdata = FormLPData(data)
res = opt.linprog(c=np.reshape(data['c'],M*N),A_ub=lpdata['A'],b_ub=lpdata['rhs'],options={'sparse':True, 'disp':True})
if __name__ == '__main__':
run()
所以看起来你完全错过了关于博客的全部要点 post。
我一直在尝试计算两个一维高斯分布之间的 Wasserstein 距离,均值为 0.0 和 4.0,方差分别为 9.0 和 16.0。我使用了 scipy.linprog.optimize 模块并使用了以下 link 中所述的“内点”方法 https://yetanothermathprogrammingconsultant.blogspot.com/2019/10/scipy-linear-programming-large-but-easy.html。 然而,它需要超过 17 个小时,并且仍然(我的代码是)运行 运行 解决 300 x 300 LP 矩阵问题(即)300 个源节点和 300 个目标节点。但是,该文件表示可以用 1000 个源节点和 1000 个目标节点来解决问题。(即)可以用 1,000,000(一百万)个决定性变量来解决 LP 问题。我的代码有什么问题?为什么需要这么长时间?我们是否需要大内存(或集群)来解决此类问题? 我的代码
from datetime import datetime
start_time = datetime.now()
from scipy.optimize import linprog
import scipy
#Initializing the LP matrix
Piprob=np.zeros(500*500).reshape(500,500)
def Piprobmin(Krv,rhoi,rhoj):
r1=np.shape(Krv)[0]
r2=np.shape(Krv)[1]
print("r1,r2",r1,r2)
#Computing the LP Matrix which has just two ones in each column
pmat=np.zeros((r1+r2)*(r1*r2)).reshape((r1+r2),(r1*r2))
for i in range(r1+r2):
for j in range(r1*r2):
if((i<r1) and (j<((i+1)*r2)) and (j>=(i*r2))):
pmat[i][j]=1
if(i>=r1):
for k in range(r1*r2):
if j==(i-r1)+(k*r2):
pmat[i][j]=1
#flattening the cost matrix into one dimensional array
krvf=Krv.flatten()
tempr=np.append(rhoi,rhoj)
Xv=[] #Creating list for joint probability matrix elements
res = scipy.optimize.linprog(c=krvf,method='interior-point',A_eq=pmat,b_eq=tempr,options=
{'sparse':True, 'disp':True})
print("res=\n",res)
wv=res.fun
for l1 in range(r1*r2):
Xv.append(res.x[l1])
Yv=np.array(Xv)
Yv=Yv.reshape(r1,r2)
#returning Yv-joint probability and ,Wv-minimized wasserstein distance
return Yv,wv
Piprob,W=Piprobmin(K,result1,result2) #K-cost function matrix,result1 is the first
#marginal,result2 is the second marginal
end_time = datetime.now()
print('Duration: {}'.format(end_time - start_time))
成本函数的大小为 300 X 300 大小,每个边际有 300 个点(总共 600 个约束)。我验证了我的成本函数是对称的和非负的。每个边际总和为 1,因为它们只是概率。
在blog post中多次使用稀疏这个词。不无道理。将 A 矩阵存储为稀疏矩阵极为重要。否则,你将无法处理大问题。博文post详细讨论了运输LP矩阵对内存要求的差异,所以这一点应该很难错过。
下面是一些示例代码,说明如何使用 scipy.optimize.linprog 设置具有 1000 个源节点和 1000 个目标节点的交通模型。同样,LP 矩阵有 2,000 行和 1,000,000 列并且存储稀疏。
import numpy as np
import scipy as sp
import scipy.sparse as sparse
import scipy.optimize as opt
from memory_profiler import profile
def GenerateData(M,N):
np.random.seed(123)
# form objective function
c = np.random.uniform(0,10,(M,N))
# demand, supply
s = np.random.uniform(0,15,M)
d = np.random.uniform(0,10,N)
assert np.sum(d) <= np.sum(s), "supply too small"
#print('c',c)
#print('s',s)
#print('d',d)
return {'c':c, 's':s, 'd':d, 'n':N, 'm':M}
def FormLPData(data):
rhs = np.append(data['s'],-data['d'])
# form A
# column (i,j)=n*i+j has two nonzeroes:
# 1 at row i with rhs supply(i)
# 1 at row N+j with rhs demand(j)
N = data['n']
M = data['m']
NZ = 2*N*M
irow = np.zeros(NZ, dtype=int)
jcol = np.zeros(NZ, dtype=int)
value = np.zeros(NZ)
for i in range(N):
for j in range(M):
k = M*i+j
k1 = 2*k
k2 = k1+1
irow[k1] = i
jcol[k1] = k
value[k1] = 1.0
irow[k2] = N+j
jcol[k2] = k
value[k2] = -1.0
A = sparse.coo_matrix((value, (irow, jcol)))
#print('A',A)
#print('rhs',rhs)
return {'A':A,'rhs':rhs}
@profile
def run():
# dimensions
M = 1000 # sources
N = 1000 # destinations
data = GenerateData(M,N)
lpdata = FormLPData(data)
res = opt.linprog(c=np.reshape(data['c'],M*N),A_ub=lpdata['A'],b_ub=lpdata['rhs'],options={'sparse':True, 'disp':True})
if __name__ == '__main__':
run()
所以看起来你完全错过了关于博客的全部要点 post。