
check if string values of a column in a dataframe start with string elements of a tuple (other than str.startswith)

I have a pandas dataframe column containing random values ("457645", "458762496", "1113423453", ...) and I need to check whether those values start with any element of the tuple ("323", "229", "111").

In this case, "1113423453" should be a match (it starts with "111").

I have tried df[column].str.startswith(tuple), which works fine; but on a large amount of data (2M dataframe rows and 3K tuple elements) it becomes much slower (about 28 s) than on 10K rows with the same 3K tuple elements (1.47 s).

Is there a more efficient way to do this, if possible?

Since startswith() is not optimized for a large number of prefix strings and just searches them linearly, it may be more efficient to use a binary search here. For that, the prefixes have to be sorted.

from bisect import bisect_right

prefixes = sorted(prefix_tuple)  # the tuple of prefixes, e.g. ("323", "229", "111"); bisect needs it sorted
# the candidate prefix is the sorted element immediately before the value's insertion point
df[column].apply(lambda value: value.startswith(prefixes[bisect_right(prefixes, value) - 1]))
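
A quick sanity check with the sample data from the question (the dataframe and column names here are only illustrative):

import pandas as pd
from bisect import bisect_right

df = pd.DataFrame({'column': ["457645", "458762496", "1113423453"]})
prefixes = sorted(("323", "229", "111"))
mask = df['column'].apply(lambda value: value.startswith(prefixes[bisect_right(prefixes, value) - 1]))
print(df[mask])  # only "1113423453" matches (prefix "111")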

Is it possible to extract the prefix into a new column of the dataframe?

Yes, e.g. with this function:

def startwiths(value):
    # nearest candidate found by binary search; returns None if it does not actually match
    prefix = prefixes[bisect_right(prefixes, value) - 1]
    if value.startswith(prefix):
        return prefix

df['new column'] = df[column].apply(startwiths)
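
Continuing the illustrative snippet from above, the extracted prefix ends up next to each value, with None where nothing matched:

df['new column'] = df['column'].apply(startwiths)
print(df)
# roughly:
#        column new column
# 0      457645       None
# 1   458762496       None
# 2  1113423453        111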

Armali's solution only works for strings of the same length. If you have strings of varying length, you have to group them by length and then apply Armali's algorithm per group. It is still much faster than the built-in solution on large dataframes.
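
To see why equal lengths matter, here is a minimal counterexample (values chosen purely for illustration): a short prefix can be "shadowed" by a longer prefix that sorts between it and the value, so the binary search lands on the wrong neighbour.

from bisect import bisect_right

prefixes = sorted(["1", "115"])   # mixed lengths
value = "1299"                    # starts with "1", but sorts after "115"
candidate = prefixes[bisect_right(prefixes, value) - 1]
print(candidate)                    # '115'
print(value.startswith(candidate))  # False -- the match on "1" is missed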

import pandas as pd
import random
from pandas._testing import rands_array
from bisect import bisect_right

# create random strings: string_A with fixed length 10, string_B truncated to a random length of 2-5
def zufallsdaten(anz):
    result = pd.DataFrame()
    result['string_A'] = rands_array(10, anz)
    result['string_B'] = rands_array(10, anz)
    def bearbeite_element(skalar):
        l = random.randint(2, 5)
        return skalar[0:l]
    result['string_B'] = result['string_B'].apply(bearbeite_element)
    return result

# create data to search in
manystrings = pd.DataFrame(zufallsdaten(1000000)['string_A'])

# create data to search
search_me   = pd.DataFrame(zufallsdaten(100000)['string_B'].drop_duplicates())



# Fast startswith alternative. Finds the longest/shortest matching fragment and writes it into the field foundfieldname.
# If find_identical is False, strings that are identical to a search string do not count as matches.
def fast_startswith(df, searchfieldname, foundfieldname, searchseries, find_longest=True, find_identical=True):

    # startswith alternative; works only if all strings in searchme have the same length.
    # Also returns the matching fragment.
    def startwiths(data, searchme, find_identical):
        prefix = searchme[bisect_right(searchme, data) - 1]
        if ((data != prefix) or find_identical) and data.startswith(prefix):
            return prefix

    search = pd.DataFrame(searchseries)
    search.columns = ['searchstring']
    search['len'] = search.searchstring.str.len()
    grouped = search.groupby('len')
    # process the length groups so that the preferred match (longest or shortest) is applied last and wins
    lengroups = grouped.agg(list).reset_index().sort_values('len', ascending=find_longest)
    result = df.copy()
    result[foundfieldname] = None
    for index, row in lengroups.iterrows():
        result[foundfieldname].update(result[searchfieldname].apply(startwiths, searchme=sorted(row.searchstring), find_identical=find_identical))
    return result
    
    

def fast_startswith2(df, searchfieldname, foundfieldname, searchseries):

    # startswith alternative; works only if all strings in searchme have the same length.
    # Also returns the matching fragment.
    def startwiths(data, searchme):
        prefix = searchme[bisect_right(searchme, data) - 1]
        if data.startswith(prefix):
            return prefix

    def grouped_startswith(searchme, data):
        # fills the result column in place as a side effect of groupby.apply
        data[foundfieldname].update(data[searchfieldname].apply(startwiths, searchme=sorted(searchme.searchstring)))
        return list(searchme.searchstring)

    search = pd.DataFrame(searchseries)
    search.columns = ['searchstring']
    search['len'] = search.searchstring.str.len()
    grouped = search.groupby('len')
    result = df.copy()
    result[foundfieldname] = None
    grouped.apply(grouped_startswith, data=result)
    return result



%%time 
mask = manystrings.string_A.str.startswith(tuple(search_me.string_B))
result0 = manystrings[mask]
# result0: built-in startswith
# Wall time: 1min 6s 



%%time
df = fast_startswith(manystrings, 'string_A', 'found', search_me.string_B) 
mask = df.found.notnull()
result1 = df[mask]   
#print( result0.shape[0],   result1.shape[0])
assert result0.shape[0] == result1.shape[0]

# result1: iterate through groups of strings with same length.
# also returns the matching fragment
# Wall time: 6.33 s



%%time
df = fast_startswith2(manystrings, 'string_A', 'found', search_me.string_B) 
mask = df.found.notnull()
result2 = df[mask]    
#print( result0.shape[0],   result2.shape[0])
assert result0.shape[0] == result2.shape[0]

# result2: apply fast startswith method on groups of strings with same length
# also returns the matching fragment
# Wall time: 5.94 s



# differences between the two methods? They may occur if you use find_longest=False
result = pd.merge(result1,result2, on='string_A', suffixes=('_1','_2'))
mask = (result.found_1 != result.found_2)
result[mask]



# search the search strings against themselves; find_identical=False prevents trivial self-matches
df = fast_startswith(search_me, 'string_B', 'found', search_me.string_B, find_identical=False) 
mask = df.found.notnull()
df[mask]