Multiprocessing with a Python application to shave running time that is currently 36 hours
I am currently working on a data mining project that builds an 18000x18000 similarity matrix.
Below are the two methods that construct the matrix:
import time

def CreateSimilarityMatrix(dbSubsetData, distancePairsList):
    global matrix
    matrix = [[0.0 for y in range(dbSubsetData.shape[0])] for x in range(dbSubsetData.shape[0])]
    for i in range(len(dbSubsetData)):  # record1
        SimilarityArray = []
        start = time.time()
        for j in range(i + 1, len(dbSubsetData)):  # record2
            Similarity = GetDistanceBetweenTwoRecords(dbSubsetData, i, j, distancePairsList)
            # The similarities are all very small numbers, which might be why the preference value
            # needs to be so precise. Multiply by a scalar 10 to give the values more range.
            matrix[i][j] = Similarity * 10.0
            matrix[j][i] = Similarity * 10.0
        end = time.time()
    return matrix


def GetDistanceBetweenTwoRecords(dbSubsetData, i, j, distancePairsList):
    Record1 = dbSubsetData.iloc[i]
    Record2 = dbSubsetData.iloc[j]
    columns = dbSubsetData.columns
    distancer = 0.0  # numeric contribution
    distancec = 0.0  # categorical (string) contribution
    for i in range(len(Record1)):
        columnName = columns[i]
        Record1Value = Record1[i]
        Record2Value = Record2[i]
        if Record1Value != Record2Value:
            ob = distancePairsList[distancePairsDict[columnName] - 1]
            if ob.attributeType == "String":
                strValue = Record1Value + ":" + Record2Value
                strValue2 = Record2Value + ":" + Record1Value
                if strValue in ob.distancePairs:
                    val = (ob.distancePairs[strValue]) ** 2
                    val = val * -1
                    distancec = distancec + val
                elif strValue2 in ob.distancePairs:
                    val = (ob.distancePairs[strValue2]) ** 2
                    val = val * -1
                    distancec = distancec + val
            elif ob.attributeType == "Number":
                val = ((Record1Value - Record2Value) * ob.getSignificance()) ** 2
                val = val * -1
                distancer = distancer + val
    distance = distancer + distancec
    return distance
Each iteration loops 18000x19 times (18000 for each row, 19 for each attribute). The total number of iterations is (18000x18000x19)/2, since the matrix is symmetric and I only have to fill half of it. This will take roughly 36 hours to finish, and obviously I would like to shrink that time frame.
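As a rough sanity check on the scale (the per-pair cost below is just the average implied by those numbers, not a separate measurement):

pairs = 18000 * 18000 // 2   # symmetric, so only half the matrix: 162,000,000 record pairs
steps = pairs * 19           # about 3.1 billion attribute comparisons
seconds = 36 * 3600          # the observed ~36 hours
print(seconds / pairs)       # ~0.0008, i.e. roughly 0.8 ms spent per pair of records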
I figured multiprocessing was the trick. Since each row independently generates its numbers and fits them into the matrix, I should be able to run CreateSimilarityMatrix with multiprocessing. So I created this in the function that will create my processes:
from multiprocessing import Process

matrix = [[0.0 for y in range(SubsetDBNormalizedAttributes.shape[0])] for x in range(SubsetDBNormalizedAttributes.shape[0])]

if __name__ == '__main__':
    procs = []
    for i in range(4):
        proc = Process(target=CreateSimilarityMatrix, args=(SubsetDBNormalizedAttributes, distancePairsList, i, 4))
        procs.append(proc)
        proc.start()
        proc.join()
CreateSimilarityMatrix is now changed to:
def CreateSimilarityMatrix(dbSubsetData, distancePairsList, counter=0, iteration=1):
    global Matrix
    for i in range(counter, len(dbSubsetData), iteration):  # record1
        SimilarityArray = []
        start = time.time()
        for j in range(i + 1, len(dbSubsetData)):  # record2
            Similarity = GetDistanceBetweenTwoRecords(dbSubsetData, i, j, distancePairsList)
            # print("Similarity Between Records", i, ":", j, " is ", Similarity)
            # The similarities are all very small numbers, which might be why the preference value
            # needs to be so precise. Multiply by a scalar 10 to give the values more range.
            Matrix[i][j] = Similarity * 10.0
            Matrix[j][i] = Similarity * 10.0
        end = time.time()
        print("Iteration", i, "took", end - start, "(s)")
Right now this is s-l-o-w. Really slow. It takes a few minutes to start one process, and then it takes a few more minutes to start the next one. I thought these were supposed to run concurrently? Am I applying the processes incorrectly?
If you are using CPython, there is something called the Global Interpreter Lock (GIL), which makes it difficult for multithreading to actually make things faster; instead, it can slow them down considerably.
If you are working with matrices, use numpy; it is definitely a lot faster than regular Python.
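To make that concrete, here is a minimal sketch of the numpy idea for the numeric columns only, assuming the attribute values can be pulled into a float array; numeric_columns and significances are hypothetical stand-ins for the 19 attributes and their getSignificance() weights, and the string-pair lookups from distancePairsList would still need separate handling:

import numpy as np

# Sketch only: vectorize the numeric part of the distance.
# numeric_columns / significances are assumed names, not from the code above.
values = dbSubsetData[numeric_columns].to_numpy(dtype=float)   # shape (n, k)
weights = np.asarray(significances, dtype=float)               # one weight per column

n = values.shape[0]
matrix = np.zeros((n, n))
for i in range(n):
    # All pairwise differences between record i and records i+1..n-1 in one shot.
    diffs = (values[i + 1:] - values[i]) * weights             # shape (n-i-1, k)
    row = -np.sum(diffs ** 2, axis=1) * 10.0                   # same -(diff * significance)**2 * 10 scaling
    matrix[i, i + 1:] = row
    matrix[i + 1:, i] = row

Even with the outer Python loop kept, replacing the per-element .iloc lookups in the inner loop with whole-row numpy operations should remove most of the per-pair overhead.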