在 pandas 中高效使用 iterrows 的方法(另一种方式)
Effective way to use iterrows in pandas (another way)
这是我在论坛上的第一个问题。感谢您的帮助!
我基于 df.iterrows() 编写了嵌套的 for 循环,但执行起来非常耗时。我需要按照下面描述的条件逐一检查所有单元格,把一个数据帧中的值赋给另一个数据帧。能帮我提高它的效率吗?(多进程、apply 方法、向量化或其他方式?)
将不胜感激! :)
示例数据:
import pandas as pd
import numpy as np
# Query intervals: one row per subsequence with its genome start/end.
d1 = {
    'geno_start': [60, 1120, 1660],
    'geno_end': [90, 1150, 1690],
    'original_subseq': ['AAATGCCTGAACCTTGGAATTGGA'] * 3,
}
# Reference intervals: left/right genome coordinates labelled with a protein_ID.
d2 = {
    'most_left_coordinate_genome': [56, 1120, 1655],
    'most_right_coordinate_genome': [88, 1150, 1690],
    'protein_ID': ['XYZ_1', 'XYZ_2', 'XYZ_3'],
}
# protein_ID starts out as an all-NaN placeholder column on the query frame.
df_1 = pd.DataFrame(d1).assign(protein_ID=np.nan)
df_2 = pd.DataFrame(d2)
def match_ranges(df1: pd.DataFrame, df2: pd.DataFrame, tolerance: int = 30) -> None:
    """Label each interval in ``df1`` with the ``protein_ID`` of a matching ``df2`` range.

    A ``df1`` row (``geno_start``, ``geno_end``) matches a ``df2`` row
    (``most_left_coordinate_genome``, ``most_right_coordinate_genome``) when
    any of these holds:

    * it starts at or after the left edge and ends at or before the right edge;
    * it ends at or before the right edge and its start is strictly less than
      ``tolerance`` away from the left edge;
    * it starts at or after the left edge and its end is strictly less than
      ``tolerance`` away from the right edge.

    When several ``df2`` rows match the same ``df1`` row, the last one in
    ``df2`` row order wins, which is what the nested-loop version produced by
    overwriting earlier hits.

    The pairwise comparison is done with NumPy broadcasting rather than nested
    ``iterrows`` loops, and the result is written back as a whole column.
    That avoids chained ``df1[col].iloc[i] = ...`` assignment (unreliable, and
    a no-op under copy-on-write) and works for any ``df1`` index, not only the
    default ``RangeIndex``.

    Note: this builds boolean arrays of shape ``(len(df1), len(df2))``; for
    very large inputs, call it on slices of ``df1``.

    Args:
        df1: Frame with ``geno_start`` and ``geno_end`` columns. Its
            ``protein_ID`` column is created (all NaN) if missing and is
            updated in place.
        df2: Frame with ``most_left_coordinate_genome``,
            ``most_right_coordinate_genome`` and ``protein_ID`` columns.
        tolerance: Maximum (exclusive) distance allowed between an interval
            edge and the corresponding range edge for a near match.

    Returns:
        None. ``df1`` is modified in place; rows without a match keep their
        previous ``protein_ID`` value.
    """
    if 'protein_ID' not in df1.columns:
        df1['protein_ID'] = np.nan
    if df1.empty or df2.empty:
        return  # nothing to compare; argmax below needs a non-empty axis

    # Column vectors for df1, row vectors for df2 -> (len(df1), len(df2)) grids.
    start = df1['geno_start'].to_numpy()[:, None]
    end = df1['geno_end'].to_numpy()[:, None]
    left = df2['most_left_coordinate_genome'].to_numpy()[None, :]
    right = df2['most_right_coordinate_genome'].to_numpy()[None, :]

    starts_inside = start >= left
    ends_inside = end <= right
    start_near = np.abs(start - left) < tolerance
    end_near = np.abs(end - right) < tolerance
    matches = (
        (starts_inside & ends_inside)
        | (start_near & ends_inside)
        | (starts_inside & end_near)
    )

    has_match = matches.any(axis=1)
    # argmax on the column-reversed grid finds the first True from the right,
    # i.e. the LAST matching df2 row, preserving the loop's overwrite order.
    last_match = matches.shape[1] - 1 - matches[:, ::-1].argmax(axis=1)

    ids = df2['protein_ID'].to_numpy(dtype=object)
    # Work on an object copy so string IDs can replace NaN placeholders
    # without an incompatible-dtype assignment into a float column.
    result = df1['protein_ID'].astype(object).to_numpy(copy=True)
    result[has_match] = ids[last_match[has_match]]
    df1['protein_ID'] = result
# Run the matcher; it writes the matched protein_ID values into df_1.
match_ranges(df_1, df_2)
Desired output:
下面的方法把两层 for 循环减少为一层。为了缩短代码行宽,我重命名了几列。
首先,创建数据框:
import pandas as pd
# Query intervals (genome start/end) for three identical subsequences.
d1 = {
    'geno_start': [60, 1120, 1660],
    'geno_end': [90, 1150, 1690],
    'original_subseq': ['AAATGCCTGAACCTTGGAATTGGA'] * 3,
}
# Reference ranges with shortened column names: left/right edge and protein ID.
d2 = {
    'left': [56, 1120, 1655],
    'right': [88, 1150, 1690],
    'protein_ID': ['XYZ_1', 'XYZ_2', 'XYZ_3'],
}
# '?' marks rows that have not been assigned a protein or a rule yet.
df_1 = pd.DataFrame(d1).assign(protein_ID='?', rule='?')
df_2 = pd.DataFrame(d2)
其次,填充第一个数据框(即包含基因组起点 geno_start 和终点 geno_end 的那个)中的 protein_ID 列:
# For each df_1 interval, test all df_2 ranges at once (one boolean mask per
# rule), try the rules in priority order, and take the first df_2 row that
# satisfies the first rule with any hit.  Each mask is computed only once.
for g in df_1.itertuples():
    starts_inside = df_2['left'] <= g.geno_start
    ends_inside = g.geno_end <= df_2['right']
    rules = (
        # Rule A: left-most <= geno start < geno end <= right-most
        #   LM-----------------------RM    left- and right-most
        #        GS-----------GE           genome start, end
        ('Rule A', starts_inside & ends_inside),
        # Rule B: geno start before left-most
        #        LM-----------------RM
        #   GS-----------------GE
        # The one-sided "< 30" is enough here: any range with left <= start
        # that also contains the end would already have matched Rule A.
        ('Rule B', (df_2['left'] - g.geno_start < 30) & ends_inside),
        # Rule C: geno end after right-most
        #   LM-----------------RM
        #        GS-----------------GE
        # Same reasoning as Rule B, mirrored for the right edge.
        ('Rule C', starts_inside & (g.geno_end - df_2['right'] < 30)),
    )
    for rule_name, mask in rules:
        if mask.any():
            df_1.at[g.Index, 'protein_ID'] = df_2.loc[mask, 'protein_ID'].values[0]
            df_1.at[g.Index, 'rule'] = rule_name
            break
    # Rows that satisfy no rule are left unchanged.

print(df_1)
geno_start geno_end original_subseq protein_ID rule
0 60 90 AAATGCCTGAACCTTGGAATTGGA XYZ_1 Rule C
1 1120 1150 AAATGCCTGAACCTTGGAATTGGA XYZ_2 Rule A
2 1660 1690 AAATGCCTGAACCTTGGAATTGGA XYZ_3 Rule A
这是我在论坛上的第一个问题。感谢您的帮助!
我基于 df.iterrows() 编写了嵌套的 for 循环,但执行起来非常耗时。我需要按照下面描述的条件逐一检查所有单元格,把一个数据帧中的值赋给另一个数据帧。能帮我提高它的效率吗?(多进程、apply 方法、向量化或其他方式?)将不胜感激! :)
示例数据:
import pandas as pd
import numpy as np
# Query intervals: one row per subsequence with its genome start/end.
d1 = {
    'geno_start': [60, 1120, 1660],
    'geno_end': [90, 1150, 1690],
    'original_subseq': ['AAATGCCTGAACCTTGGAATTGGA'] * 3,
}
# Reference intervals: left/right genome coordinates labelled with a protein_ID.
d2 = {
    'most_left_coordinate_genome': [56, 1120, 1655],
    'most_right_coordinate_genome': [88, 1150, 1690],
    'protein_ID': ['XYZ_1', 'XYZ_2', 'XYZ_3'],
}
# protein_ID starts out as an all-NaN placeholder column on the query frame.
df_1 = pd.DataFrame(d1).assign(protein_ID=np.nan)
df_2 = pd.DataFrame(d2)
def match_ranges(df1: pd.DataFrame, df2: pd.DataFrame, tolerance: int = 30) -> None:
    """Label each interval in ``df1`` with the ``protein_ID`` of a matching ``df2`` range.

    A ``df1`` row (``geno_start``, ``geno_end``) matches a ``df2`` row
    (``most_left_coordinate_genome``, ``most_right_coordinate_genome``) when
    any of these holds:

    * it starts at or after the left edge and ends at or before the right edge;
    * it ends at or before the right edge and its start is strictly less than
      ``tolerance`` away from the left edge;
    * it starts at or after the left edge and its end is strictly less than
      ``tolerance`` away from the right edge.

    When several ``df2`` rows match the same ``df1`` row, the last one in
    ``df2`` row order wins, which is what the nested-loop version produced by
    overwriting earlier hits.

    The pairwise comparison is done with NumPy broadcasting rather than nested
    ``iterrows`` loops, and the result is written back as a whole column.
    That avoids chained ``df1[col].iloc[i] = ...`` assignment (unreliable, and
    a no-op under copy-on-write) and works for any ``df1`` index, not only the
    default ``RangeIndex``.

    Note: this builds boolean arrays of shape ``(len(df1), len(df2))``; for
    very large inputs, call it on slices of ``df1``.

    Args:
        df1: Frame with ``geno_start`` and ``geno_end`` columns. Its
            ``protein_ID`` column is created (all NaN) if missing and is
            updated in place.
        df2: Frame with ``most_left_coordinate_genome``,
            ``most_right_coordinate_genome`` and ``protein_ID`` columns.
        tolerance: Maximum (exclusive) distance allowed between an interval
            edge and the corresponding range edge for a near match.

    Returns:
        None. ``df1`` is modified in place; rows without a match keep their
        previous ``protein_ID`` value.
    """
    if 'protein_ID' not in df1.columns:
        df1['protein_ID'] = np.nan
    if df1.empty or df2.empty:
        return  # nothing to compare; argmax below needs a non-empty axis

    # Column vectors for df1, row vectors for df2 -> (len(df1), len(df2)) grids.
    start = df1['geno_start'].to_numpy()[:, None]
    end = df1['geno_end'].to_numpy()[:, None]
    left = df2['most_left_coordinate_genome'].to_numpy()[None, :]
    right = df2['most_right_coordinate_genome'].to_numpy()[None, :]

    starts_inside = start >= left
    ends_inside = end <= right
    start_near = np.abs(start - left) < tolerance
    end_near = np.abs(end - right) < tolerance
    matches = (
        (starts_inside & ends_inside)
        | (start_near & ends_inside)
        | (starts_inside & end_near)
    )

    has_match = matches.any(axis=1)
    # argmax on the column-reversed grid finds the first True from the right,
    # i.e. the LAST matching df2 row, preserving the loop's overwrite order.
    last_match = matches.shape[1] - 1 - matches[:, ::-1].argmax(axis=1)

    ids = df2['protein_ID'].to_numpy(dtype=object)
    # Work on an object copy so string IDs can replace NaN placeholders
    # without an incompatible-dtype assignment into a float column.
    result = df1['protein_ID'].astype(object).to_numpy(copy=True)
    result[has_match] = ids[last_match[has_match]]
    df1['protein_ID'] = result
# Run the matcher; it writes the matched protein_ID values into df_1.
match_ranges(df_1, df_2)
Desired output:
下面的方法把两层 for 循环减少为一层。为了缩短代码行宽,我重命名了几列。
首先,创建数据框:
import pandas as pd
# Query intervals (genome start/end) for three identical subsequences.
d1 = {
    'geno_start': [60, 1120, 1660],
    'geno_end': [90, 1150, 1690],
    'original_subseq': ['AAATGCCTGAACCTTGGAATTGGA'] * 3,
}
# Reference ranges with shortened column names: left/right edge and protein ID.
d2 = {
    'left': [56, 1120, 1655],
    'right': [88, 1150, 1690],
    'protein_ID': ['XYZ_1', 'XYZ_2', 'XYZ_3'],
}
# '?' marks rows that have not been assigned a protein or a rule yet.
df_1 = pd.DataFrame(d1).assign(protein_ID='?', rule='?')
df_2 = pd.DataFrame(d2)
其次,填充第一个数据框(即包含基因组起点 geno_start 和终点 geno_end 的那个)中的 protein_ID 列:
# For each df_1 interval, test all df_2 ranges at once (one boolean mask per
# rule), try the rules in priority order, and take the first df_2 row that
# satisfies the first rule with any hit.  Each mask is computed only once.
for g in df_1.itertuples():
    starts_inside = df_2['left'] <= g.geno_start
    ends_inside = g.geno_end <= df_2['right']
    rules = (
        # Rule A: left-most <= geno start < geno end <= right-most
        #   LM-----------------------RM    left- and right-most
        #        GS-----------GE           genome start, end
        ('Rule A', starts_inside & ends_inside),
        # Rule B: geno start before left-most
        #        LM-----------------RM
        #   GS-----------------GE
        # The one-sided "< 30" is enough here: any range with left <= start
        # that also contains the end would already have matched Rule A.
        ('Rule B', (df_2['left'] - g.geno_start < 30) & ends_inside),
        # Rule C: geno end after right-most
        #   LM-----------------RM
        #        GS-----------------GE
        # Same reasoning as Rule B, mirrored for the right edge.
        ('Rule C', starts_inside & (g.geno_end - df_2['right'] < 30)),
    )
    for rule_name, mask in rules:
        if mask.any():
            df_1.at[g.Index, 'protein_ID'] = df_2.loc[mask, 'protein_ID'].values[0]
            df_1.at[g.Index, 'rule'] = rule_name
            break
    # Rows that satisfy no rule are left unchanged.

print(df_1)
geno_start geno_end original_subseq protein_ID rule
0 60 90 AAATGCCTGAACCTTGGAATTGGA XYZ_1 Rule C
1 1120 1150 AAATGCCTGAACCTTGGAATTGGA XYZ_2 Rule A
2 1660 1690 AAATGCCTGAACCTTGGAATTGGA XYZ_3 Rule A