清理推文,什么都不显示
Cleaning tweets, nothing is displayed
我正在尝试运行下面的代码来清理 txt 文件中的一组推文
我也在命令行上定义了参数,但似乎没有任何输出
知道我做错了什么吗?
下面是代码:
代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag
def clean(path, filename):
    """Clean the tweets in ``path`` and write the unique results to disk.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.

    Args:
        path: Path of the raw tweet file to read.
        filename: Name of the input file; it is stripped and appended to
            the module-level ``CLEANED_DATA`` prefix to build the output
            path.

    Returns:
        The path of the cleaned output file.
    """
    filename = CLEANED_DATA + filename.strip()
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(path, 'rb') as source, open(filename, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                target.write((new_line + '\n').encode('ascii'))
    return filename
# Command-line driver: argv[1] is the directory holding the raw tweet files,
# argv[2] is the prefix clean() prepends to every output file name.
if len(sys.argv) < 3:
    sys.exit('usage: python %s DATA_FOLDER CLEANED_DATA' % sys.argv[0])
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
# Walk DATA_FOLDER recursively so files in sub-folders are cleaned too.
for (root, dirs, files) in os.walk(DATA_FOLDER):
    for name in files:
        absolute_path = os.path.join(root, name)
        # Skip the macOS Finder metadata file; it is not tweet data.
        if os.path.isfile(absolute_path) and name != '.DS_Store':
            filename = clean(absolute_path, name)
文件:Project3.txt
{"created_at":"Tue Oct 04 17:16:30 +0000 2016","id":783355126945722368,"id_str":"783355126945722368","text":"RT @Jacquiecharles: USAID providing 0,000 in initial assistance for humanitarian partners (not GOH) to rapidly provide critical relief.\u2026","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name": "Jacquiecharles","name":"Jacqueline Charles","id":15360434,"id_str":"15360434","indices":[3,18]}], "urls":[]},"metadata":{"iso_language_code":"en","result_type":"recent"},"source":"Twitter for iPhone<\/a>","in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user": {"id":635031678,"id_str":"635031678","name":"Tracie Hamilton","screen_name":"TracieHamilton8","location":" ","description":"Leaning & depending on Him everyday","url":null,"entities":{"description":{"urls":[]}},"protected":false,"followers_count":1929,"friends_count":715,"listed_count":63,"created_at":"Fri Jul 13 23:39:46 +0000 2012","favourites_count":27603,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":17433,"lang":"en" ,"contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_link_color":" 0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"has_extended_profile" 
:false,"default_profile":true,"default_profile_image":false,"following":false,"follow_request_sent":false,"notifications":false},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Tue Oct 04 01:27:02 +0000 2016","id":783116185726357504,"id_str":"783116185726357504"
============================================= ===========================
您的代码有几个问题:
- 您正在 'for line in open' 语句中对输入文件进行硬编码
- 您的输出文件名不会是 clean.txt,而是 clean.txt<文件名1>、clean.txt<文件名2>……目录中的每个文件都会各生成一个输出文件
- 有一些奇怪的缩进
- 你贴出的 JSON 全部在同一行上,所以整行都被去除标点的语句删掉了
您传入的是文件名,但代码却试图根据它遍历操作系统的文件系统。你应该这样传递参数:
python clean.py DIR_NAME CLEAN_FILE
修复缩进并美化 JSON 后,我得到正确的输出:
def clean(path, filename):
    """Clean the tweets in ``path`` and write the unique results to disk.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.  The output path is printed before
    processing starts.

    Args:
        path: Path of the raw tweet file to read.
        filename: Name of the input file; it is stripped and appended to
            the module-level ``CLEANED_DATA`` prefix to build the output
            path.

    Returns:
        The path of the cleaned output file.
    """
    filename = CLEANED_DATA + filename.strip()
    # Function-call form prints the same text on Python 2 and Python 3.
    print(filename)
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(path, 'rb') as source, open(filename, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                # Terminate each tweet with a newline so they stay separate.
                target.write((new_line + '\n').encode('ascii'))
    return filename
这是一个完整的工作版本:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import string
def clean(inputDir, outputFile):
    """Clean ``project3.json`` from ``inputDir`` and write unique tweets.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.

    Args:
        inputDir: Directory that contains ``project3.json``.
        outputFile: Path of the file the cleaned tweets are written to.

    Returns:
        ``outputFile``, unchanged.
    """
    # Join properly so the path is right with or without a trailing slash.
    source_path = os.path.join(inputDir, 'project3.json')
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(source_path, 'rb') as source, open(outputFile, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                # Terminate each tweet with a newline so they stay separate.
                target.write((new_line + '\n').encode('ascii'))
    return outputFile
# Command-line driver: argv[1] is the directory containing project3.json,
# argv[2] is the file the cleaned tweets are written to.
if len(sys.argv) < 3:
    sys.exit('usage: python %s inputDirectory outputFileName' % sys.argv[0])
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
clean(DATA_FOLDER, CLEANED_DATA)
您通过以下方式调用它:
python clean.py inputDirectory outputFileName
我正在尝试运行下面的代码来清理 txt 文件中的一组推文
我也在命令行上定义了参数,但似乎没有任何输出
知道我做错了什么吗?
下面是代码:
代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import pos_tag
def clean(path, filename):
    """Clean the tweets in ``path`` and write the unique results to disk.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.

    Args:
        path: Path of the raw tweet file to read.
        filename: Name of the input file; it is stripped and appended to
            the module-level ``CLEANED_DATA`` prefix to build the output
            path.

    Returns:
        The path of the cleaned output file.
    """
    filename = CLEANED_DATA + filename.strip()
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(path, 'rb') as source, open(filename, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                target.write((new_line + '\n').encode('ascii'))
    return filename
# Command-line driver: argv[1] is the directory holding the raw tweet files,
# argv[2] is the prefix clean() prepends to every output file name.
if len(sys.argv) < 3:
    sys.exit('usage: python %s DATA_FOLDER CLEANED_DATA' % sys.argv[0])
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
# Walk DATA_FOLDER recursively so files in sub-folders are cleaned too.
for (root, dirs, files) in os.walk(DATA_FOLDER):
    for name in files:
        absolute_path = os.path.join(root, name)
        # Skip the macOS Finder metadata file; it is not tweet data.
        if os.path.isfile(absolute_path) and name != '.DS_Store':
            filename = clean(absolute_path, name)
文件:Project3.txt
{"created_at":"Tue Oct 04 17:16:30 +0000 2016","id":783355126945722368,"id_str":"783355126945722368","text":"RT @Jacquiecharles: USAID providing 0,000 in initial assistance for humanitarian partners (not GOH) to rapidly provide critical relief.\u2026","truncated":false,"entities":{"hashtags":[],"symbols":[],"user_mentions":[{"screen_name": "Jacquiecharles","name":"Jacqueline Charles","id":15360434,"id_str":"15360434","indices":[3,18]}], "urls":[]},"metadata":{"iso_language_code":"en","result_type":"recent"},"source":"Twitter for iPhone<\/a>","in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user": {"id":635031678,"id_str":"635031678","name":"Tracie Hamilton","screen_name":"TracieHamilton8","location":" ","description":"Leaning & depending on Him everyday","url":null,"entities":{"description":{"urls":[]}},"protected":false,"followers_count":1929,"friends_count":715,"listed_count":63,"created_at":"Fri Jul 13 23:39:46 +0000 2012","favourites_count":27603,"utc_offset":null,"time_zone":null,"geo_enabled":true,"verified":false,"statuses_count":17433,"lang":"en" ,"contributors_enabled":false,"is_translator":false,"is_translation_enabled":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/575645183288610817\/5vJNgPld_normal.jpeg","profile_link_color":" 0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"has_extended_profile" 
:false,"default_profile":true,"default_profile_image":false,"following":false,"follow_request_sent":false,"notifications":false},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Tue Oct 04 01:27:02 +0000 2016","id":783116185726357504,"id_str":"783116185726357504"
============================================= ===========================
您的代码有几个问题:
- 您正在 'for line in open' 语句中对输入文件进行硬编码
- 您的输出文件名不会是 clean.txt,而是 clean.txt<文件名1>、clean.txt<文件名2>……目录中的每个文件都会各生成一个输出文件
- 有一些奇怪的缩进
- 你贴出的 JSON 全部在同一行上,所以整行都被去除标点的语句删掉了
您传入的是文件名,但代码却试图根据它遍历操作系统的文件系统。你应该这样传递参数:
python clean.py DIR_NAME CLEAN_FILE
修复缩进并美化 JSON 后,我得到正确的输出:
def clean(path, filename):
    """Clean the tweets in ``path`` and write the unique results to disk.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.  The output path is printed before
    processing starts.

    Args:
        path: Path of the raw tweet file to read.
        filename: Name of the input file; it is stripped and appended to
            the module-level ``CLEANED_DATA`` prefix to build the output
            path.

    Returns:
        The path of the cleaned output file.
    """
    filename = CLEANED_DATA + filename.strip()
    # Function-call form prints the same text on Python 2 and Python 3.
    print(filename)
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(path, 'rb') as source, open(filename, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                # Terminate each tweet with a newline so they stay separate.
                target.write((new_line + '\n').encode('ascii'))
    return filename
这是一个完整的工作版本:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import os
import re
import string
def clean(inputDir, outputFile):
    """Clean ``project3.json`` from ``inputDir`` and write unique tweets.

    Every input line is treated as one tweet.  The characters ``. , " !``
    and ``:`` are removed, a leading ``RT`` marker and anything from an
    ``http(s)://`` link to the end of the line are dropped, non-printable
    and non-ASCII characters are discarded, and ``@mentions``, ``#hashtags``
    and bare punctuation tokens are skipped.  Each distinct cleaned tweet
    is written once, one per line.

    Args:
        inputDir: Directory that contains ``project3.json``.
        outputFile: Path of the file the cleaned tweets are written to.

    Returns:
        ``outputFile``, unchanged.
    """
    # Join properly so the path is right with or without a trailing slash.
    source_path = os.path.join(inputDir, 'project3.json')
    seen = set()
    # ``with`` closes (and flushes) both handles even if a line raises.
    with open(source_path, 'rb') as source, open(outputFile, 'wb') as target:
        for raw in source:
            # Decode once so the str regexes below work on Python 2 and 3;
            # non-ASCII bytes are dropped here instead of by the filter.
            line = raw.decode('ascii', 'ignore')
            line = re.sub(r'[.,"!]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)
            line = re.sub(r'https?:\/\/.*[\r\n]*', '', line,
                          flags=re.MULTILINE)
            line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
            line = ''.join(ch for ch in line if ch in string.printable)
            # ``not in string.punctuation`` is a substring test, so runs
            # such as "()" are dropped as well as single punctuation marks.
            words = [word for word in line.split()
                     if not word.startswith('@')
                     and not word.startswith('#')
                     and word not in string.punctuation]
            # Keep the historical format: every word is followed by a space.
            new_line = ''.join(word + ' ' for word in words)
            if new_line in seen:
                continue
            seen.add(new_line)
            if new_line.strip():
                # Terminate each tweet with a newline so they stay separate.
                target.write((new_line + '\n').encode('ascii'))
    return outputFile
# Command-line driver: argv[1] is the directory containing project3.json,
# argv[2] is the file the cleaned tweets are written to.
if len(sys.argv) < 3:
    sys.exit('usage: python %s inputDirectory outputFileName' % sys.argv[0])
DATA_FOLDER = sys.argv[1]
CLEANED_DATA = sys.argv[2]
clean(DATA_FOLDER, CLEANED_DATA)
您通过以下方式调用它:
python clean.py inputDirectory outputFileName