删除非字母数字但保留标点符号
Remove non alphanumeric but preserve punctuation
我正在调用 Gmail API 来获取电子邮件的标题(Subject)。有些标题包含非字母数字字符,例如表情符号、弯引号(“ 和 ’)等(例如:“\u201cEthnographic”)。同时我需要保留单词末尾的标点符号:例如“你好!”中的感叹号需要保留。我看过许多关于如何去除非字母数字(non-alphanumeric)字符的代码示例,但都无法实现我想要的效果。如有任何建议,不胜感激。
# Call the API and get the emails.
# NOTE: json.dumps() escapes non-ASCII characters by default
# (ensure_ascii=True), which is why the dumped text shows sequences such as
# "\u201c" instead of the characters themselves.
M = json.dumps(message)
temp = message['messages'][0]['payload']
num_found = 0
# Get the subject of the first message. Default to an empty string so the
# json.dumps() call below cannot raise NameError when the payload carries no
# "Subject" header.
subject = ''
for header in temp['headers']:
    if header['name'] == 'Subject':
        subject = header['value']
        break
# S contains patterns like "\u201cEthnographic ..."
# or "u2b50\ufe0f best of .."
S = json.dumps(subject)
你看过 Python 的 emoji 包了吗?
参考:emoji package documentation
import emoji
def emoji_free(input):
    """Return *input* without the words that contain an emoji.

    The text is split on whitespace, every word holding at least one emoji
    character is dropped, and the remaining words are re-joined with single
    spaces (the original spacing is therefore not preserved). Punctuation
    attached to the kept words is left untouched.

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing keyword callers keep working.

    Returns:
        The remaining words joined by single spaces.
    """
    # ``emoji.UNICODE_EMOJI`` was a flat {emoji: name} dict before emoji 1.0,
    # became a per-language dict in 1.x, and was removed in 2.0 in favour of
    # ``emoji.EMOJI_DATA``. Use whichever table the installed release offers.
    known_emoji = getattr(emoji, 'EMOJI_DATA', None)
    if known_emoji is None:
        legacy_table = emoji.UNICODE_EMOJI
        # 1.x nests the table under a language key; older releases do not.
        known_emoji = legacy_table.get('en', legacy_table)

    kept_words = [
        word for word in input.split()
        if not any(char in known_emoji for char in word)
    ]
    return ' '.join(kept_words)
# Example 1: run the cleaner on a plain sentence.
emoji_message = 'This is an emoji and the code is designed to remove emojis from a string.'
clean_message = emoji_free(emoji_message)
print(clean_message)
# Expected output:
# This is an emoji and the code is designed to remove emojis from a string.

# Example 2: a sentence containing a star (U+2B50) and a smiling face (U+263A).
emoji_message = 'You are a bright \u2b50 with a smiling face \u263A'
print(emoji_message)
# Expected output:
# You are a bright ⭐ with a smiling face ☺

# The same sentence after passing through the cleaner.
clean_message = emoji_free(emoji_message)
print(clean_message)
# Expected output:
# You are a bright with a smiling face
下面是另一种方法:用正则表达式删除与表情符号相关的 Unicode 字符。
import re
# Matches runs of emoji and emoji-related code points (zero-width joiner,
# variation selector) so they can be stripped with
# ``emoji_pattern.sub('', text)``.
# The enclosed-character entries are listed individually on purpose: a single
# \U000024C2-\U0001F251 range also covers CJK ideographs, kana, Hangul and
# full-width punctuation, which would delete ordinary text.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U000025AA-\U000025FE"  # geometric shapes used as emoji
    "\U0000303D"  # part alternation mark
    "\U00003297\U00003299"  # circled ideographs (congratulation / secret)
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u200d"  # zero-width joiner
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"  # variation selector-16
    "]+",
    flags=re.UNICODE,
)
# Build a sample message that ends with a star emoji (U+2B50).
emoji_message = 'You are a bright \u2b50'
# Show the message unchanged.
print(emoji_message)
# Expected output:
# You are a bright ⭐

# Show the message again with every pattern match replaced by nothing.
print(re.sub(emoji_pattern, '', emoji_message))
# Expected output:
# You are a bright
我正在调用 Gmail API 来获取电子邮件的标题(Subject)。有些标题包含非字母数字字符,例如表情符号、弯引号(“ 和 ’)等(例如:“\u201cEthnographic”)。同时我需要保留单词末尾的标点符号:例如“你好!”中的感叹号需要保留。我看过许多关于如何去除非字母数字(non-alphanumeric)字符的代码示例,但都无法实现我想要的效果。如有任何建议,不胜感激。
# Call the API and get the emails.
# NOTE: json.dumps() escapes non-ASCII characters by default
# (ensure_ascii=True), which is why the dumped text shows sequences such as
# "\u201c" instead of the characters themselves.
M = json.dumps(message)
temp = message['messages'][0]['payload']
num_found = 0
# Get the subject of the first message. Default to an empty string so the
# json.dumps() call below cannot raise NameError when the payload carries no
# "Subject" header.
subject = ''
for header in temp['headers']:
    if header['name'] == 'Subject':
        subject = header['value']
        break
# S contains patterns like "\u201cEthnographic ..."
# or "u2b50\ufe0f best of .."
S = json.dumps(subject)
你看过 Python 的 emoji 包了吗?
参考:emoji package documentation
import emoji
def emoji_free(input):
    """Return *input* without the words that contain an emoji.

    The text is split on whitespace, every word holding at least one emoji
    character is dropped, and the remaining words are re-joined with single
    spaces (the original spacing is therefore not preserved). Punctuation
    attached to the kept words is left untouched.

    Args:
        input: The text to clean. The name shadows the builtin but is kept
            so that existing keyword callers keep working.

    Returns:
        The remaining words joined by single spaces.
    """
    # ``emoji.UNICODE_EMOJI`` was a flat {emoji: name} dict before emoji 1.0,
    # became a per-language dict in 1.x, and was removed in 2.0 in favour of
    # ``emoji.EMOJI_DATA``. Use whichever table the installed release offers.
    known_emoji = getattr(emoji, 'EMOJI_DATA', None)
    if known_emoji is None:
        legacy_table = emoji.UNICODE_EMOJI
        # 1.x nests the table under a language key; older releases do not.
        known_emoji = legacy_table.get('en', legacy_table)

    kept_words = [
        word for word in input.split()
        if not any(char in known_emoji for char in word)
    ]
    return ' '.join(kept_words)
# Example 1: run the cleaner on a plain sentence.
emoji_message = 'This is an emoji and the code is designed to remove emojis from a string.'
clean_message = emoji_free(emoji_message)
print(clean_message)
# Expected output:
# This is an emoji and the code is designed to remove emojis from a string.

# Example 2: a sentence containing a star (U+2B50) and a smiling face (U+263A).
emoji_message = 'You are a bright \u2b50 with a smiling face \u263A'
print(emoji_message)
# Expected output:
# You are a bright ⭐ with a smiling face ☺

# The same sentence after passing through the cleaner.
clean_message = emoji_free(emoji_message)
print(clean_message)
# Expected output:
# You are a bright with a smiling face
下面是另一种方法:用正则表达式删除与表情符号相关的 Unicode 字符。
import re
# Matches runs of emoji and emoji-related code points (zero-width joiner,
# variation selector) so they can be stripped with
# ``emoji_pattern.sub('', text)``.
# The enclosed-character entries are listed individually on purpose: a single
# \U000024C2-\U0001F251 range also covers CJK ideographs, kana, Hangul and
# full-width punctuation, which would delete ordinary text.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U000025AA-\U000025FE"  # geometric shapes used as emoji
    "\U0000303D"  # part alternation mark
    "\U00003297\U00003299"  # circled ideographs (congratulation / secret)
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane character
    "\u200d"  # zero-width joiner
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"  # variation selector-16
    "]+",
    flags=re.UNICODE,
)
# Build a sample message that ends with a star emoji (U+2B50).
emoji_message = 'You are a bright \u2b50'
# Show the message unchanged.
print(emoji_message)
# Expected output:
# You are a bright ⭐

# Show the message again with every pattern match replaced by nothing.
print(re.sub(emoji_pattern, '', emoji_message))
# Expected output:
# You are a bright