如何创建名称相似度列?
How to create a name similarity column?
我有一个示例数据框
# Build the sample frame one column at a time; match_score starts out all-NaN.
pid = [1, 2, 3, 4, 5]
name = ['abc', 'def', 'bca', 'fed', 'pqr']
match_score = [np.nan] * len(pid)
sample_df = pd.DataFrame({'pid': pid, 'name': name, 'match_score': match_score})
sample_df
pid
name
match_score
1
abc
NaN
2
def
NaN
3
bca
NaN
4
fed
NaN
5
pqr
NaN
还有一种名字相似度计算方法
from difflib import SequenceMatcher
SequenceMatcher(None, "abc", "bca").ratio()
>>> 0.666
我怎样才能把 SequenceMatcher 方法应用到 sample_df 的每一行(把每一行的 name 与其他所有行的 name 逐一比较,并取最高分),
从而得到如下结果:
from difflib import SequenceMatcher
# comparing row1 with row2
print(SequenceMatcher(None, "abc", "def").ratio())
>>> 0.0
# comparing row1 with row3
print(SequenceMatcher(None, "abc", "bca").ratio())
>>> 0.66
# comparing row1 with row4
print(SequenceMatcher(None, "abc", "fed").ratio())
>>> 0.0
# comparing row1 with row5
print(SequenceMatcher(None, "abc", "pqr").ratio())
>>> 0.0
# Highest score for abc was 0.666
pid
name
match_score
1
abc
0.666
2
def
NaN
3
bca
NaN
4
fed
NaN
5
pqr
NaN
# comparing row2 with row1
print(SequenceMatcher(None, "def", "abc").ratio())
>>> 0.0
# comparing row2 with row3
print(SequenceMatcher(None, "def", "bca").ratio())
>>> 0.0
# comparing row2 with row4
print(SequenceMatcher(None, "def", "fed").ratio())
>>> 0.33
# comparing row2 with row5
print(SequenceMatcher(None, "def", "pqr").ratio())
>>> 0.0
# Highest score for def was 0.333
pid
name
match_score
1
abc
0.666
2
def
0.33
3
bca
NaN
4
fed
NaN
5
pqr
NaN
以此类推:
pid
name
match_score
1
abc
0.666
2
def
0.333
3
bca
0.666
4
fed
0.333
5
pqr
0.000
可以使用 fuzzywuzzy
来得到您需要的结果。
我也是刚入门,所以我的方法很可能不是最好的,但对您提供的数据是有效的:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz

# Rebuild the sample frame from the question.
pid = [1, 2, 3, 4, 5]
name = ['abc', 'def', 'bca', 'fed', 'pqr']
match_score = [np.nan, np.nan, np.nan, np.nan, np.nan]
sample_df = pd.DataFrame(zip(pid, name, match_score), columns=['pid', 'name', 'match_score'])
sample_df.drop('match_score', axis=1, inplace=True)  # dropping the column; the merge below recreates it

unique_names = sample_df['name'].unique().tolist()

# Score every name against the list of candidates. process.extract yields
# (candidate, score) tuples, so each row is (name, name_compare, match_score).
match_score = [(x,) + i
               for x in unique_names
               for i in process.extract(x, unique_names, scorer=fuzz.token_sort_ratio)]
similarity_df = pd.DataFrame(match_score, columns=['name', 'name_compare', 'match_score'])

# Remove self-comparisons by name. Filtering on score == 100 would also
# discard two *different* names that happen to score 100 against each other.
similarity_df = similarity_df[similarity_df['name'] != similarity_df['name_compare']]
# Names with no similarity to anything are dropped here and therefore come
# out of the merge as NaN; append .fillna(0) to the result to report 0.0.
similarity_df = similarity_df[similarity_df['match_score'] != 0]
# Keep only the best score per name so the merge cannot duplicate rows when
# a name partially matches several other names.
best_scores = similarity_df.groupby('name', as_index=False)['match_score'].max()

# A left merge keeps every row of sample_df in its original order.
sample_df = sample_df.merge(best_scores, on='name', how='left')
sample_df['match_score'] = sample_df['match_score'] / 100  # fuzzywuzzy scores are 0-100
print(sample_df)
输出:
pid name match_score
0 1 abc 0.67
1 2 def 0.33
2 3 bca 0.67
3 4 fed 0.33
4 5 pqr NaN
我运行了两层循环:外层循环和内层循环。
抱歉,代码里没有加注释——我一加注释,Python 就报缩进错误。
我先把最大值设为 0,然后把每次计算出的相似度与它比较,同时检查两个被比较的字符串是否不同。如果两个条件(字符串不相同、相似度大于最大值)都成立,就用 loc
把该值赋给 match_score
列
from difflib import SequenceMatcher

# For every row, compare its name with the name of every row and store the
# highest SequenceMatcher ratio among the differing names in match_score.
length_df = len(sample_df)
for outer_index in range(length_df):
    out_value = sample_df.iloc[outer_index]['name']  # invariant for the inner loop
    best_ratio = 0  # highest ratio seen so far for this row
    for inner_index in range(length_df):
        inn_value = sample_df.iloc[inner_index]['name']
        value_ratio = SequenceMatcher(None, out_value, inn_value).ratio()
        # Skip identical strings (which includes the row itself) and accept
        # only a ratio that beats the best one found so far.
        if out_value != inn_value and value_ratio > best_ratio:
            # Raise the threshold; otherwise a later, lower non-zero ratio
            # would overwrite the real maximum.
            best_ratio = value_ratio
            # iloc above is positional, so translate the position into the
            # matching index label before using the label-based loc.
            sample_df.loc[sample_df.index[outer_index], 'match_score'] = value_ratio
##
##
##
##
我有一个示例数据框
# Build the sample frame one column at a time; match_score starts out all-NaN.
pid = [1, 2, 3, 4, 5]
name = ['abc', 'def', 'bca', 'fed', 'pqr']
match_score = [np.nan] * len(pid)
sample_df = pd.DataFrame({'pid': pid, 'name': name, 'match_score': match_score})
sample_df
pid | name | match_score |
---|---|---|
1 | abc | NaN |
2 | def | NaN |
3 | bca | NaN |
4 | fed | NaN |
5 | pqr | NaN |
还有一种名字相似度计算方法
from difflib import SequenceMatcher
SequenceMatcher(None, "abc", "bca").ratio()
>>> 0.666
我怎样才能把 SequenceMatcher 方法应用到 sample_df 的每一行(把每一行的 name 与其他所有行的 name 逐一比较,并取最高分),
从而得到如下结果:
from difflib import SequenceMatcher
# comparing row1 with row2
print(SequenceMatcher(None, "abc", "def").ratio())
>>> 0.0
# comparing row1 with row3
print(SequenceMatcher(None, "abc", "bca").ratio())
>>> 0.66
# comparing row1 with row4
print(SequenceMatcher(None, "abc", "fed").ratio())
>>> 0.0
# comparing row1 with row5
print(SequenceMatcher(None, "abc", "pqr").ratio())
>>> 0.0
# Highest score for abc was 0.666
pid | name | match_score |
---|---|---|
1 | abc | 0.666 |
2 | def | NaN |
3 | bca | NaN |
4 | fed | NaN |
5 | pqr | NaN |
# comparing row2 with row1
print(SequenceMatcher(None, "def", "abc").ratio())
>>> 0.0
# comparing row2 with row3
print(SequenceMatcher(None, "def", "bca").ratio())
>>> 0.0
# comparing row2 with row4
print(SequenceMatcher(None, "def", "fed").ratio())
>>> 0.33
# comparing row2 with row5
print(SequenceMatcher(None, "def", "pqr").ratio())
>>> 0.0
# Highest score for def was 0.333
pid | name | match_score |
---|---|---|
1 | abc | 0.666 |
2 | def | 0.33 |
3 | bca | NaN |
4 | fed | NaN |
5 | pqr | NaN |
以此类推:
pid | name | match_score |
---|---|---|
1 | abc | 0.666 |
2 | def | 0.333 |
3 | bca | 0.666 |
4 | fed | 0.333 |
5 | pqr | 0.000 |
可以使用 fuzzywuzzy
来得到您需要的结果。
我也是刚入门,所以我的方法很可能不是最好的,但对您提供的数据是有效的:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz

# Rebuild the sample frame from the question.
pid = [1, 2, 3, 4, 5]
name = ['abc', 'def', 'bca', 'fed', 'pqr']
match_score = [np.nan, np.nan, np.nan, np.nan, np.nan]
sample_df = pd.DataFrame(zip(pid, name, match_score), columns=['pid', 'name', 'match_score'])
sample_df.drop('match_score', axis=1, inplace=True)  # dropping the column; the merge below recreates it

unique_names = sample_df['name'].unique().tolist()

# Score every name against the list of candidates. process.extract yields
# (candidate, score) tuples, so each row is (name, name_compare, match_score).
match_score = [(x,) + i
               for x in unique_names
               for i in process.extract(x, unique_names, scorer=fuzz.token_sort_ratio)]
similarity_df = pd.DataFrame(match_score, columns=['name', 'name_compare', 'match_score'])

# Remove self-comparisons by name. Filtering on score == 100 would also
# discard two *different* names that happen to score 100 against each other.
similarity_df = similarity_df[similarity_df['name'] != similarity_df['name_compare']]
# Names with no similarity to anything are dropped here and therefore come
# out of the merge as NaN; append .fillna(0) to the result to report 0.0.
similarity_df = similarity_df[similarity_df['match_score'] != 0]
# Keep only the best score per name so the merge cannot duplicate rows when
# a name partially matches several other names.
best_scores = similarity_df.groupby('name', as_index=False)['match_score'].max()

# A left merge keeps every row of sample_df in its original order.
sample_df = sample_df.merge(best_scores, on='name', how='left')
sample_df['match_score'] = sample_df['match_score'] / 100  # fuzzywuzzy scores are 0-100
print(sample_df)
输出:
pid name match_score
0 1 abc 0.67
1 2 def 0.33
2 3 bca 0.67
3 4 fed 0.33
4 5 pqr NaN
我运行了两层循环:外层循环和内层循环。
抱歉,代码里没有加注释——我一加注释,Python 就报缩进错误。
我先把最大值设为 0,然后把每次计算出的相似度与它比较,同时检查两个被比较的字符串是否不同。如果两个条件(字符串不相同、相似度大于最大值)都成立,就用 loc
把该值赋给 match_score
列
from difflib import SequenceMatcher

# For every row, compare its name with the name of every row and store the
# highest SequenceMatcher ratio among the differing names in match_score.
length_df = len(sample_df)
for outer_index in range(length_df):
    out_value = sample_df.iloc[outer_index]['name']  # invariant for the inner loop
    best_ratio = 0  # highest ratio seen so far for this row
    for inner_index in range(length_df):
        inn_value = sample_df.iloc[inner_index]['name']
        value_ratio = SequenceMatcher(None, out_value, inn_value).ratio()
        # Skip identical strings (which includes the row itself) and accept
        # only a ratio that beats the best one found so far.
        if out_value != inn_value and value_ratio > best_ratio:
            # Raise the threshold; otherwise a later, lower non-zero ratio
            # would overwrite the real maximum.
            best_ratio = value_ratio
            # iloc above is positional, so translate the position into the
            # matching index label before using the label-based loc.
            sample_df.loc[sample_df.index[outer_index], 'match_score'] = value_ratio
##
##
##
##