python pandas 多列 VlookUp
VlookUp in multiple columns with python pandas
有一个 CSV 文件,有 1000 个条目,
它有大约 50 列,其中
P1, P2, P3, ..... P10 are email columns in different pattern formats
他们都有不同的电子邮件
然后有一列以 Email 为表头,它包含所有正确的电子邮件,但顺序与各行并不对应(也就是说,它不像 P1、P2、…、P10 那样与同一行的其他值相对应——P1 到 P10 都属于同一个人并且在同一行)。所以我们要做的是:在这 10 列(P1、P2、…、P10,其中某一种格式会匹配)中查找 Email 列里的电子邮件(这是最终正确的电子邮件),类似 Excel 的 VLOOKUP。
所以我们需要的是:一旦在任意一个 P 列中找到匹配的电子邮件,就把它存入一个名为 FinalEmail 的新列;保留该行的其他字段,并删除其余 9 个电子邮件格式列(P 列)和 Email 列,只保留包含匹配电子邮件的 FinalEmail 列。
我在下面分享我的代码,它没有按照我使用的预期逻辑工作,因为它缺少近 50% 的匹配项,所以我正在寻找更好的工作方法或修复当前代码逻辑
我正在共享即将到来的 csv 输入和输出文件
查看输入文件中的 10+1 列
我的代码
import pandas as pd
import numpy as np
def lowercase_strings(df):
    """Return a copy of *df* with every string cell lower-cased.

    Non-string cells (numbers, NaN, ...) are passed through unchanged.
    """
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return df.apply(lambda column: column.map(_lower))


def build_final_email(df, pattern_cols, email_col="Email",
                      out_col="FinalEmail"):
    """Pick, per row, the first pattern column whose value is a known email.

    Args:
        df: Input frame holding the pattern columns and ``email_col``.
        pattern_cols: Candidate column names, checked in the given order.
        email_col: Column holding the valid emails; its row order does not
            have to correspond to the rows of ``df``.
        out_col: Name of the column that receives the matched email.

    Returns:
        A new frame with the rows that matched, grouped by the pattern
        column that matched (first column's matches first).  It keeps the
        original index and every column except ``pattern_cols`` and
        ``email_col``, plus ``out_col``.  Rows without any match are left
        out, and no helper columns are added.
    """
    pattern_cols = list(pattern_cols)
    # Drop missing values from the lookup: Series.isin treats NaN as equal
    # to NaN, so a blank pattern cell would otherwise "match" a blank email
    # cell and block the row from being matched by a later pattern column.
    known_emails = set(df[email_col].dropna())
    others = df.drop(columns=pattern_cols + [email_col])

    unmatched = np.ones(len(df), dtype=bool)
    pieces = []
    for col in pattern_cols:
        hit = unmatched & df[col].isin(known_emails).to_numpy()
        if hit.any():
            piece = others.loc[hit].copy()
            piece[out_col] = df.loc[hit, col].to_numpy()
            pieces.append(piece)
        # A row is claimed by the first column that matches it.
        unmatched &= ~hit

    if not pieces:
        return others.iloc[:0].assign(**{out_col: pd.Series(dtype=object)})
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(pieces)


def main():
    """Prompt for a CSV file, match the emails and write 'Output - <name>'."""
    from pathlib import Path

    fname = input("Enter File name: ")
    df = lowercase_strings(pd.read_csv(fname))
    final = build_final_email(df, [f"P{i}" for i in range(1, 11)])
    print(len(final), len(final.drop_duplicates()))
    # Prefix only the file name so a path such as "data/in.csv" still works.
    source = Path(fname)
    final.to_csv(source.with_name('Output - ' + source.name))


if __name__ == "__main__":
    main()
Project Folder with relevant files on Gdrive
我也在分享来自 csv 的 10 行示例数据
Sample 10 rows as data from Input CSV
Output of above Sample 10 rows CSV
Sample 100 rows as data from Input CSV
我发布的逻辑/脚本本身没有问题:我在其他多个输入文件上运行了同一个脚本,结果都符合预期;只有我发布的这个输入文件没有得到我期望的输出,不过这个输出在逻辑上仍然说得通。
有一个 CSV 文件,有 1000 个条目,
它有大约 50 列,其中
P1, P2, P3, ..... P10 are email columns in different pattern formats
他们都有不同的电子邮件
然后有一列以 Email 为表头,它包含所有正确的电子邮件,但顺序与各行并不对应(也就是说,它不像 P1、P2、…、P10 那样与同一行的其他值相对应——P1 到 P10 都属于同一个人并且在同一行)。所以我们要做的是:在这 10 列(P1、P2、…、P10,其中某一种格式会匹配)中查找 Email 列里的电子邮件(这是最终正确的电子邮件),类似 Excel 的 VLOOKUP。
所以我们需要的是:一旦在任意一个 P 列中找到匹配的电子邮件,就把它存入一个名为 FinalEmail 的新列;保留该行的其他字段,并删除其余 9 个电子邮件格式列(P 列)和 Email 列,只保留包含匹配电子邮件的 FinalEmail 列。
我在下面分享我的代码,它没有按照我使用的预期逻辑工作,因为它缺少近 50% 的匹配项,所以我正在寻找更好的工作方法或修复当前代码逻辑
我正在共享即将到来的 csv 输入和输出文件
查看输入文件中的 10+1 列
我的代码
import pandas as pd
import numpy as np
def lowercase_strings(df):
    """Return a copy of *df* with every string cell lower-cased.

    Non-string cells (numbers, NaN, ...) are passed through unchanged.
    """
    def _lower(value):
        return value.lower() if isinstance(value, str) else value

    # Column-wise Series.map instead of DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return df.apply(lambda column: column.map(_lower))


def build_final_email(df, pattern_cols, email_col="Email",
                      out_col="FinalEmail"):
    """Pick, per row, the first pattern column whose value is a known email.

    Args:
        df: Input frame holding the pattern columns and ``email_col``.
        pattern_cols: Candidate column names, checked in the given order.
        email_col: Column holding the valid emails; its row order does not
            have to correspond to the rows of ``df``.
        out_col: Name of the column that receives the matched email.

    Returns:
        A new frame with the rows that matched, grouped by the pattern
        column that matched (first column's matches first).  It keeps the
        original index and every column except ``pattern_cols`` and
        ``email_col``, plus ``out_col``.  Rows without any match are left
        out, and no helper columns are added.
    """
    pattern_cols = list(pattern_cols)
    # Drop missing values from the lookup: Series.isin treats NaN as equal
    # to NaN, so a blank pattern cell would otherwise "match" a blank email
    # cell and block the row from being matched by a later pattern column.
    known_emails = set(df[email_col].dropna())
    others = df.drop(columns=pattern_cols + [email_col])

    unmatched = np.ones(len(df), dtype=bool)
    pieces = []
    for col in pattern_cols:
        hit = unmatched & df[col].isin(known_emails).to_numpy()
        if hit.any():
            piece = others.loc[hit].copy()
            piece[out_col] = df.loc[hit, col].to_numpy()
            pieces.append(piece)
        # A row is claimed by the first column that matches it.
        unmatched &= ~hit

    if not pieces:
        return others.iloc[:0].assign(**{out_col: pd.Series(dtype=object)})
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat(pieces)


def main():
    """Prompt for a CSV file, match the emails and write 'Output - <name>'."""
    from pathlib import Path

    fname = input("Enter File name: ")
    df = lowercase_strings(pd.read_csv(fname))
    final = build_final_email(df, [f"P{i}" for i in range(1, 11)])
    print(len(final), len(final.drop_duplicates()))
    # Prefix only the file name so a path such as "data/in.csv" still works.
    source = Path(fname)
    final.to_csv(source.with_name('Output - ' + source.name))


if __name__ == "__main__":
    main()
Project Folder with relevant files on Gdrive
我也在分享来自 csv 的 10 行示例数据
Sample 10 rows as data from Input CSV
Output of above Sample 10 rows CSV
Sample 100 rows as data from Input CSV
我发布的逻辑/脚本本身没有问题:我在其他多个输入文件上运行了同一个脚本,结果都符合预期;只有我发布的这个输入文件没有得到我期望的输出,不过这个输出在逻辑上仍然说得通。