在条件列上乘以两个 pandas 数据帧的最佳方法
Best way to multiply two pandas dataframes on conditional columns
我有 df1,需要新增一列,其值为 df1.Amount 与 df2 中某一列的乘积,而具体使用 df2 的哪一列取决于该行 df1.Q 的取值。
我下面的解决方案在 n 较小时可行,但当 n = 100000 时,它比在 Excel 中使用嵌套公式的做法耗时更长。
import pandas as pd
import numpy as np
import time
def iRw(Q, T):
    """Return the weight stored in the module-level ``df2`` for labels ``Q`` and ``T``.

    Args:
        Q: Label selecting the weight column: 'q1', 'q2' and 'q3' read
            column 'R', 'q4' reads column 'L', any other value reads 'H'.
        T: Label matched against the 'Tenor' column of ``df2`` to pick the row.

    Returns:
        The selected weight as a Python ``float``.

    Raises:
        ValueError: If ``T`` does not match exactly one row of ``df2``.
    """
    if Q in ('q1', 'q2', 'q3'):
        vol = 'R'
    elif Q == 'q4':
        vol = 'L'
    else:
        vol = 'H'
    # Bracket access is required: ``df2.T`` is the DataFrame transpose,
    # never a column, and the key column of df2 is named 'Tenor'.
    matches = df2.loc[df2['Tenor'] == T, vol]
    # ``.item()`` extracts the single matching value explicitly instead of
    # relying on the deprecated float() conversion of a 1-d array.
    return float(matches.values.item())
# Benchmark data: n rows of random (Q, T, Amount) values plus a weight
# table with one row per entry of T.
n = 10000
Q = ['q1', 'q2', 'q3', 'q4', 'q5']
T = ['t1', 't2', 't3', 't4', 't5']
df1 = pd.DataFrame({
    'Id': list(range(n)),
    'Q': [Q[np.random.randint(len(Q))] for _ in range(n)],
    'T': [T[np.random.randint(len(T))] for _ in range(n)],
    'Amount': np.random.randn(n),
})
df2 = pd.DataFrame({
    'Tenor': T,
    'R': np.random.randn(len(T)),
    'L': np.random.randn(len(T)),
    'H': np.random.randn(len(T)),
})

# Time the row-by-row product. Row values are read with x['T'] rather than
# x.T because Series.T is the transpose property (the Series itself), so
# attribute access never reaches the 'T' entry.
tic = time.perf_counter()
df1['Product'] = df1.apply(
    lambda x: x['Amount'] * iRw(x['Q'], x['T']), axis=1
)
toc = time.perf_counter()
print(toc - tic)
有人可以推荐一种更快的方法来缩短上述处理时间吗?
提前致谢
使用 `lookup`,并配合由 `map` 创建的辅助 Series:
# Q labels q1-q3 map to df2 column 'R'; q4 maps to 'L'.
d1 = {label: 'R' for label in ('q1', 'q2', 'q3')}
# Start from the q4 entry so the printed key order stays q4, q1, q2, q3.
d = {'q4': 'L'}
d.update(d1)
print(d)
{'q4': 'L', 'q1': 'R', 'q2': 'R', 'q3': 'R'}
# Helper Series: the df2 column name each row should use. Q labels that are
# not keys of `d` fall back to 'H'.
g = df1['Q'].map(d).fillna('H')

# DataFrame.lookup was removed in pandas 2.0, so the label-based pick is done
# with positional NumPy indexing. df2's key column is named 'Tenor'.
weights = df2.set_index('Tenor')
row_pos = weights.index.get_indexer(df1['T'])
col_pos = weights.columns.get_indexer(g)
# get_indexer marks unknown labels with -1, which would silently select the
# last row/column; fail loudly instead, as lookup did.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('df1 contains a T or column label that is missing from df2')
df1['Product1'] = weights.to_numpy()[row_pos, col_pos] * df1['Amount']
print(df1.head(5))
Id Q T Amount Product Product1
0 0 q4 t5 -0.220341 0.145460 0.145460
1 1 q5 t1 -1.495181 -1.450221 -1.450221
2 2 q4 t3 -2.233968 0.368787 0.368787
3 3 q3 t4 1.859870 -0.785868 -0.785868
4 4 q2 t1 0.349115 0.067482 0.067482
详情:
print (g.head(5))
0 L
1 H
2 L
3 R
4 R
Name: Q, dtype: object
设置:
# Reproducible sample data: seed NumPy's global RNG before any draw.
np.random.seed(456)
n = 10000
Q = ['q1', 'q2', 'q3', 'q4', 'q5']
T = ['t1', 't2', 't3', 't4', 't5']

# Keyword arguments are evaluated left to right, so the random stream is
# consumed in the order Q, T, Amount and then R, L, H.
df1 = pd.DataFrame(dict(
    Id=list(range(n)),
    Q=[Q[np.random.randint(len(Q))] for _ in range(n)],
    T=[T[np.random.randint(len(T))] for _ in range(n)],
    Amount=np.random.randn(n),
))
df2 = pd.DataFrame(dict(
    Tenor=T,
    R=np.random.randn(len(T)),
    L=np.random.randn(len(T)),
    H=np.random.randn(len(T)),
))
我有 df1,需要新增一列,其值为 df1.Amount 与 df2 中某一列的乘积,而具体使用 df2 的哪一列取决于该行 df1.Q 的取值。
我下面的解决方案在 n 较小时可行,但当 n = 100000 时,它比在 Excel 中使用嵌套公式的做法耗时更长。
import pandas as pd
import numpy as np
import time
def iRw(Q, T):
    """Return the weight stored in the module-level ``df2`` for labels ``Q`` and ``T``.

    Args:
        Q: Label selecting the weight column: 'q1', 'q2' and 'q3' read
            column 'R', 'q4' reads column 'L', any other value reads 'H'.
        T: Label matched against the 'Tenor' column of ``df2`` to pick the row.

    Returns:
        The selected weight as a Python ``float``.

    Raises:
        ValueError: If ``T`` does not match exactly one row of ``df2``.
    """
    if Q in ('q1', 'q2', 'q3'):
        vol = 'R'
    elif Q == 'q4':
        vol = 'L'
    else:
        vol = 'H'
    # Bracket access is required: ``df2.T`` is the DataFrame transpose,
    # never a column, and the key column of df2 is named 'Tenor'.
    matches = df2.loc[df2['Tenor'] == T, vol]
    # ``.item()`` extracts the single matching value explicitly instead of
    # relying on the deprecated float() conversion of a 1-d array.
    return float(matches.values.item())
# Benchmark data: n rows of random (Q, T, Amount) values plus a weight
# table with one row per entry of T.
n = 10000
Q = ['q1', 'q2', 'q3', 'q4', 'q5']
T = ['t1', 't2', 't3', 't4', 't5']
df1 = pd.DataFrame({
    'Id': list(range(n)),
    'Q': [Q[np.random.randint(len(Q))] for _ in range(n)],
    'T': [T[np.random.randint(len(T))] for _ in range(n)],
    'Amount': np.random.randn(n),
})
df2 = pd.DataFrame({
    'Tenor': T,
    'R': np.random.randn(len(T)),
    'L': np.random.randn(len(T)),
    'H': np.random.randn(len(T)),
})

# Time the row-by-row product. Row values are read with x['T'] rather than
# x.T because Series.T is the transpose property (the Series itself), so
# attribute access never reaches the 'T' entry.
tic = time.perf_counter()
df1['Product'] = df1.apply(
    lambda x: x['Amount'] * iRw(x['Q'], x['T']), axis=1
)
toc = time.perf_counter()
print(toc - tic)
有人可以推荐一种更快的方法来缩短上述处理时间吗?
提前致谢
使用 `lookup`,并配合由 `map` 创建的辅助 Series:
# Q labels q1-q3 map to df2 column 'R'; q4 maps to 'L'.
d1 = {label: 'R' for label in ('q1', 'q2', 'q3')}
# Start from the q4 entry so the printed key order stays q4, q1, q2, q3.
d = {'q4': 'L'}
d.update(d1)
print(d)
{'q4': 'L', 'q1': 'R', 'q2': 'R', 'q3': 'R'}
# Helper Series: the df2 column name each row should use. Q labels that are
# not keys of `d` fall back to 'H'.
g = df1['Q'].map(d).fillna('H')

# DataFrame.lookup was removed in pandas 2.0, so the label-based pick is done
# with positional NumPy indexing. df2's key column is named 'Tenor'.
weights = df2.set_index('Tenor')
row_pos = weights.index.get_indexer(df1['T'])
col_pos = weights.columns.get_indexer(g)
# get_indexer marks unknown labels with -1, which would silently select the
# last row/column; fail loudly instead, as lookup did.
if (row_pos < 0).any() or (col_pos < 0).any():
    raise KeyError('df1 contains a T or column label that is missing from df2')
df1['Product1'] = weights.to_numpy()[row_pos, col_pos] * df1['Amount']
print(df1.head(5))
Id Q T Amount Product Product1
0 0 q4 t5 -0.220341 0.145460 0.145460
1 1 q5 t1 -1.495181 -1.450221 -1.450221
2 2 q4 t3 -2.233968 0.368787 0.368787
3 3 q3 t4 1.859870 -0.785868 -0.785868
4 4 q2 t1 0.349115 0.067482 0.067482
详情:
print (g.head(5))
0 L
1 H
2 L
3 R
4 R
Name: Q, dtype: object
设置:
# Reproducible sample data: seed NumPy's global RNG before any draw.
np.random.seed(456)
n = 10000
Q = ['q1', 'q2', 'q3', 'q4', 'q5']
T = ['t1', 't2', 't3', 't4', 't5']

# Keyword arguments are evaluated left to right, so the random stream is
# consumed in the order Q, T, Amount and then R, L, H.
df1 = pd.DataFrame(dict(
    Id=list(range(n)),
    Q=[Q[np.random.randint(len(Q))] for _ in range(n)],
    T=[T[np.random.randint(len(T))] for _ in range(n)],
    Amount=np.random.randn(n),
))
df2 = pd.DataFrame(dict(
    Tenor=T,
    R=np.random.randn(len(T)),
    L=np.random.randn(len(T)),
    H=np.random.randn(len(T)),
))