使用正则表达式从字符串中提取测量维度和数字
Extract measurement dimensions and numbers from a string using regex
import re
# Characters that remove_punctuation() drops from its input.  The apostrophe
# and the backslash are escaped so that the whole run is one valid string
# literal; unescaped, the apostrophe ended the string early and the line did
# not parse.
punctuation = '!"#$%&\'*()+,-:;<=>?@[\\]^_{|}~`'
# Sample description mixing prose with dimension tokens and plain numbers.
# NOTE(review): train_new is not created in this snippet -- presumably an
# existing pandas DataFrame; confirm where it comes from.
train_new['priceDescription'] = '''His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates'''
def remove_punctuation(text):
    """Reduce *text* to digits, unit letters and dots, then strip punctuation.

    Steps, in order:

    1. every character other than 0-9, upper-case M, X, C, L and "." is
       replaced by a space (this includes newlines);
    2. if the result contains "MM", the whole string is replaced by "";
    3. what is left is lower-cased;
    4. characters listed in the module-level ``punctuation`` string are
       removed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # The class is case-sensitive, so lower-case "cm"/"mm"/"x" are blanked too.
    text = re.sub("[^0-9MXCL.]", " ", text)
    # After step 1 there are no newlines left, so ".*" spans the entire
    # string and any input containing "MM" collapses to "".
    # NOTE(review): this throws away every description that has a millimetre
    # value, which looks at odds with extracting dimensions -- confirm intent.
    text = re.sub(r".*(MM).*", r"", text)
    text = text.lower()
    # Character-by-character filter against the punctuation set.
    return ''.join(char for char in text if char not in punctuation)
# Overwrite each description with its cleaned form; apply() calls
# remove_punctuation once per row.
train_new['priceDescription'] = train_new['priceDescription'].apply(remove_punctuation)
# Total count of pieces obtained by splitting every row on single spaces.
# The value is computed but neither stored nor printed here.
train_new['priceDescription'].apply(lambda desc: len(desc.split(' '))).sum()
# Show the updated data.
print(train_new)
我只想从上面的字符串中提取尺寸和数字,例如 35CMx56cm、L458905、L36MM、23、475MMx3.9cm、34MM、25CM
试试这个:
代码
import re
import pandas as pd  # pd is used below but the snippet only imported re

# Sample description mixing prose with dimension tokens and plain numbers.
text = '''His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates'''
# One-row frame holding that same description, built from `text` so the long
# literal is written only once.
df = pd.DataFrame({'priceDescription': [text]})
def _dimension_regex():
    """Build the compiled pattern used by remove_punctuation()."""
    # A short number, optionally with a decimal point (".5", "12", "12.5").
    # Raw strings keep "\." and "\d" as regex escapes instead of relying on
    # Python passing unknown string escapes through.
    x = r"(?:\.\d{1,2}|\d{1,4}\.?\d{0,2}|\d{5}\.?\d?|\d{6}\.?)"
    # Separator between two measurements: "x" or "by", optional single spaces.
    by = r"(?: )?(?:by|x)(?: )?"
    # Unit suffix.  "s?" makes the plural reachable; with the singular listed
    # first in a plain alternation the trailing "s" was never consumed.
    cm = r"(?:mm|cm|millimeters?|centimeters?|MM|CM)"
    # One measurement with a unit.
    x_cm = "(?:" + x + r" *(?:to|\-) *" + cm + "|" + x + cm + ")"
    # Two measurements, units optional on either side.
    xy_cm = ("(?:" + x + cm + by + x + cm + "|" + x + by + x + cm + "|"
             + x + cm + by + x + "|" + x + by + x + ")")
    # Three measurements.
    xyz_cm = ("(?:" + x + cm + by + x + cm + by + x + cm + "|"
              + x + by + x + by + x + cm + "|" + x + by + x + by + x + ")")
    # "L"-prefixed number, with or without a unit (e.g. L36MM, L458905).
    xyz2_cm = "(?:" + "L" + x + cm + "|" + "L" + x + ")"
    # Longest shapes first so a shorter alternative cannot pre-empt them.
    return re.compile("{}|{}|{}|{}|{}".format(xyz_cm, xy_cm, x_cm, xyz2_cm, x))


# Compiled once at import time instead of on every call.
_DIMENSION_RE = _dimension_regex()


def remove_punctuation(data):
    """Return every dimension, L-code and bare number found in *data*.

    Args:
        data: the string to scan.

    Returns:
        A list of the matched substrings in the order they occur, e.g.
        ``['L36MM', '35CMx56cm', '23']``; empty when nothing matches.
    """
    return _DIMENSION_RE.findall(data)
# Replace each description with whatever remove_punctuation returns for it;
# apply() invokes the function once per row.
df['priceDescription'] = df['priceDescription'].apply(remove_punctuation)
这似乎有效:
import re
s = "His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates"

# A number such as "23", "3.9" or ".5".  At least one digit is required, so
# the fragment can never match the empty string (which previously let a bare
# "x" or a bare "cm" count as a hit).
_NUM = r"(?:\d+(?:\.\d+)?|\.\d+)"
# Centimetre / millimetre suffix; case is ignored through re.I below.
_UNIT = r"(?:cm|mm)"

# L then some digits and optional cm/mm
lstart = re.compile(r"\bL+\d+" + _UNIT + r"?", re.I)
# 56mm, 33cm, etc..  Lookarounds check for surrounding whitespace without
# consuming it, so neighbouring tokens and string edges still match.
cm_mm_alone = re.compile(r"(?<!\S)" + _NUM + _UNIT + r"(?!\S)", re.I)
# 475MMx3.9, etc...  A number is required on both sides of the "x".
x_by_y = re.compile(_NUM + _UNIT + "?x" + _NUM + _UNIT + "?", re.I)
# 23, etc..  Whole whitespace-delimited integers only.
digit_alone = re.compile(r"(?<!\S)\d+(?!\S)", re.I)

patterns = [lstart, cm_mm_alone, x_by_y, digit_alone]
# Results are grouped by pattern, not by position in the text.
matches = []
for pattern in patterns:
    # No capture groups are used, so findall() yields the matched strings.
    m = pattern.findall(s)
    matches.extend(m)
print(matches)
这是输出:
['L36MM', 'L458905', '35CM', '35MM', '69cm', '3.9MM', '35CMx56cm', '56x34', '2.3x50cm', '475MMx3.9cm', '23']
import re
# Characters that remove_punctuation() drops from its input.  The apostrophe
# and the backslash are escaped so that the whole run is one valid string
# literal; unescaped, the apostrophe ended the string early and the line did
# not parse.
punctuation = '!"#$%&\'*()+,-:;<=>?@[\\]^_{|}~`'
# Sample description mixing prose with dimension tokens and plain numbers.
# NOTE(review): train_new is not created in this snippet -- presumably an
# existing pandas DataFrame; confirm where it comes from.
train_new['priceDescription'] = '''His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates'''
def remove_punctuation(text):
    """Reduce *text* to digits, unit letters and dots, then strip punctuation.

    Steps, in order:

    1. every character other than 0-9, upper-case M, X, C, L and "." is
       replaced by a space (this includes newlines);
    2. if the result contains "MM", the whole string is replaced by "";
    3. what is left is lower-cased;
    4. characters listed in the module-level ``punctuation`` string are
       removed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # The class is case-sensitive, so lower-case "cm"/"mm"/"x" are blanked too.
    text = re.sub("[^0-9MXCL.]", " ", text)
    # After step 1 there are no newlines left, so ".*" spans the entire
    # string and any input containing "MM" collapses to "".
    # NOTE(review): this throws away every description that has a millimetre
    # value, which looks at odds with extracting dimensions -- confirm intent.
    text = re.sub(r".*(MM).*", r"", text)
    text = text.lower()
    # Character-by-character filter against the punctuation set.
    return ''.join(char for char in text if char not in punctuation)
# Overwrite each description with its cleaned form; apply() calls
# remove_punctuation once per row.
train_new['priceDescription'] = train_new['priceDescription'].apply(remove_punctuation)
# Total count of pieces obtained by splitting every row on single spaces.
# The value is computed but neither stored nor printed here.
train_new['priceDescription'].apply(lambda desc: len(desc.split(' '))).sum()
# Show the updated data.
print(train_new)
我只想从上面的字符串中提取尺寸和数字,例如 35CMx56cm、L458905、L36MM、23、475MMx3.9cm、34MM、25CM
试试这个:
代码
import re
import pandas as pd  # pd is used below but the snippet only imported re

# Sample description mixing prose with dimension tokens and plain numbers.
text = '''His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates'''
# One-row frame holding that same description, built from `text` so the long
# literal is written only once.
df = pd.DataFrame({'priceDescription': [text]})
def _dimension_regex():
    """Build the compiled pattern used by remove_punctuation()."""
    # A short number, optionally with a decimal point (".5", "12", "12.5").
    # Raw strings keep "\." and "\d" as regex escapes instead of relying on
    # Python passing unknown string escapes through.
    x = r"(?:\.\d{1,2}|\d{1,4}\.?\d{0,2}|\d{5}\.?\d?|\d{6}\.?)"
    # Separator between two measurements: "x" or "by", optional single spaces.
    by = r"(?: )?(?:by|x)(?: )?"
    # Unit suffix.  "s?" makes the plural reachable; with the singular listed
    # first in a plain alternation the trailing "s" was never consumed.
    cm = r"(?:mm|cm|millimeters?|centimeters?|MM|CM)"
    # One measurement with a unit.
    x_cm = "(?:" + x + r" *(?:to|\-) *" + cm + "|" + x + cm + ")"
    # Two measurements, units optional on either side.
    xy_cm = ("(?:" + x + cm + by + x + cm + "|" + x + by + x + cm + "|"
             + x + cm + by + x + "|" + x + by + x + ")")
    # Three measurements.
    xyz_cm = ("(?:" + x + cm + by + x + cm + by + x + cm + "|"
              + x + by + x + by + x + cm + "|" + x + by + x + by + x + ")")
    # "L"-prefixed number, with or without a unit (e.g. L36MM, L458905).
    xyz2_cm = "(?:" + "L" + x + cm + "|" + "L" + x + ")"
    # Longest shapes first so a shorter alternative cannot pre-empt them.
    return re.compile("{}|{}|{}|{}|{}".format(xyz_cm, xy_cm, x_cm, xyz2_cm, x))


# Compiled once at import time instead of on every call.
_DIMENSION_RE = _dimension_regex()


def remove_punctuation(data):
    """Return every dimension, L-code and bare number found in *data*.

    Args:
        data: the string to scan.

    Returns:
        A list of the matched substrings in the order they occur, e.g.
        ``['L36MM', '35CMx56cm', '23']``; empty when nothing matches.
    """
    return _DIMENSION_RE.findall(data)
# Replace each description with whatever remove_punctuation returns for it;
# apply() invokes the function once per row.
df['priceDescription'] = df['priceDescription'].apply(remove_punctuation)
这似乎有效:
import re
s = "His speech talked L36MM of the setbacks in 35CMx56cm life, including death, and L458905 how being aware of death 35CM can 56x34 help you make better 35MM choices in life. At 69cm the time, Jobs was dying 34/67 of pancreatic cancer, and 23 his inspirational words 2.3x50cm 3.9MM on the importance of acquiring 475MMx3.9cm knowledge and following your dreams was the best life lesson he could bestow upon the graduates"

# A number such as "23", "3.9" or ".5".  At least one digit is required, so
# the fragment can never match the empty string (which previously let a bare
# "x" or a bare "cm" count as a hit).
_NUM = r"(?:\d+(?:\.\d+)?|\.\d+)"
# Centimetre / millimetre suffix; case is ignored through re.I below.
_UNIT = r"(?:cm|mm)"

# L then some digits and optional cm/mm
lstart = re.compile(r"\bL+\d+" + _UNIT + r"?", re.I)
# 56mm, 33cm, etc..  Lookarounds check for surrounding whitespace without
# consuming it, so neighbouring tokens and string edges still match.
cm_mm_alone = re.compile(r"(?<!\S)" + _NUM + _UNIT + r"(?!\S)", re.I)
# 475MMx3.9, etc...  A number is required on both sides of the "x".
x_by_y = re.compile(_NUM + _UNIT + "?x" + _NUM + _UNIT + "?", re.I)
# 23, etc..  Whole whitespace-delimited integers only.
digit_alone = re.compile(r"(?<!\S)\d+(?!\S)", re.I)

patterns = [lstart, cm_mm_alone, x_by_y, digit_alone]
# Results are grouped by pattern, not by position in the text.
matches = []
for pattern in patterns:
    # No capture groups are used, so findall() yields the matched strings.
    m = pattern.findall(s)
    matches.extend(m)
print(matches)
这是输出:
['L36MM', 'L458905', '35CM', '35MM', '69cm', '3.9MM', '35CMx56cm', '56x34', '2.3x50cm', '475MMx3.9cm', '23']