抓取数据推特时语法无效?
invalid syntax when crawl data twitter?
当我运行此代码用于抓取数据twitter
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import re, csv
def html2csv(fData, fHasil, full=True):
urlPattern=re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
print('Loading Data: ', flush = True)
Tweets, Username, waktu, replies, retweets, likes, Language, urlStatus = [], [], [], [], [], [], [], []
soup = bs(open(fData,encoding='utf-8', errors = 'ignore', mode='r'),'html.parser')
data = soup.find_all('li', class_= 'stream-item')
for i,t in tqdm(enumerate(data)):
T = t.find_all('p',class_='TweetTextSize')[0] # Loading tweet
Tweets.append(bs(str(T),'html.parser').text)
U = t.find_all('span',class_='username')
Username.append(bs(str(U[0]),'html.parser').text)
T = t.find_all('a',class_='tweet-timestamp')[0]# Loading Time
waktu.append(bs(str(T),'html.parser').text)
RP = t.find_all('span',class_='ProfileTweet-actionCountForAria')[0]# Loading reply, retweet & Likes
replies.append(int((bs(str(RP), "lxml").text.split()[0]).replace('.','').replace(',','')))
RT = t.find_all('span',class_='ProfileTweet-actionCountForAria')[1]
RT = int((bs(str(RT), "lxml").text.split()[0]).replace('.','').replace(',',''))
retweets.append(RT)
L = t.find_all('span',class_='ProfileTweet-actionCountForAria')[2]
likes.append(int((bs(str(L), "lxml").text.split()[0]).replace('.','').replace(',','')))
try:# Loading Bahasa
L = t.find_all('span',class_='tweet-language')
Language.append(bs(str(L[0]), "lxml").text)
except:
Language.append('')
url = str(t.find_all('small',class_='time')[0])
try:
url = re.findall(urlPattern,url)[0]
except:
try:
mulai, akhir = url.find('href="/')+len('href="/'), url.find('" title=')
url = 'https://twitter.com/' + url[mulai:akhir]
except:
url = ''
urlStatus.append(url)
print('Saving Data to "%s" ' %fHasil, flush = True)
dfile = open(fHasil, 'w', encoding='utf-8', newline='')
if full:
dfile.write('Time, Username, Tweet, Replies, Retweets, Likes, Language, urlStatus\n')
with dfile:
writer = csv.writer(dfile)
for i,t in enumerate(Tweets):
writer.writerow([waktu[i],Username[i],t,replies[i],retweets[i],likes[i],Language[i],urlStatus[i]])
else:
with dfile:
writer = csv.writer(dfile)
for i,t in enumerate(Tweets):
writer.writerow([Username[i],t])
dfile.close()
print('All Finished', flush = True)
我遇到了这个错误
File "<ipython-input-4-4a19b18dc90d>", line 27
except:
^
SyntaxError: invalid syntax
}
在Python中,缩进用于分隔代码块。这与许多其他使用大括号 {} 来分隔块的语言不同,例如 Java、Javascript 和 C。因此,Python 用户必须密切注意何时以及他们如何缩进代码,因为空格很重要。
当 Python 遇到程序缩进问题时,它会引发称为 IndentationError 或 TabError 的异常。
在你的情况下,问题是这样的:
try:
print(x)
except: # wrong indentation
print("An exception occurred")
您可以像这样简单地修复它:
try:
print(x)
except: # correct, try and catch stay at the same level
print("An exception occurred")
希望这对您有所帮助。祝你好运。
当我运行此代码用于抓取数据twitter
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
import re, csv
def html2csv(fData, fHasil, full=True):
urlPattern=re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
print('Loading Data: ', flush = True)
Tweets, Username, waktu, replies, retweets, likes, Language, urlStatus = [], [], [], [], [], [], [], []
soup = bs(open(fData,encoding='utf-8', errors = 'ignore', mode='r'),'html.parser')
data = soup.find_all('li', class_= 'stream-item')
for i,t in tqdm(enumerate(data)):
T = t.find_all('p',class_='TweetTextSize')[0] # Loading tweet
Tweets.append(bs(str(T),'html.parser').text)
U = t.find_all('span',class_='username')
Username.append(bs(str(U[0]),'html.parser').text)
T = t.find_all('a',class_='tweet-timestamp')[0]# Loading Time
waktu.append(bs(str(T),'html.parser').text)
RP = t.find_all('span',class_='ProfileTweet-actionCountForAria')[0]# Loading reply, retweet & Likes
replies.append(int((bs(str(RP), "lxml").text.split()[0]).replace('.','').replace(',','')))
RT = t.find_all('span',class_='ProfileTweet-actionCountForAria')[1]
RT = int((bs(str(RT), "lxml").text.split()[0]).replace('.','').replace(',',''))
retweets.append(RT)
L = t.find_all('span',class_='ProfileTweet-actionCountForAria')[2]
likes.append(int((bs(str(L), "lxml").text.split()[0]).replace('.','').replace(',','')))
try:# Loading Bahasa L = t.find_all('span',class_='tweet-language') Language.append(bs(str(L[0]), "lxml").text) except: Language.append('') url = str(t.find_all('small',class_='time')[0]) try: url = re.findall(urlPattern,url)[0] except: try: mulai, akhir = url.find('href="/')+len('href="/'), url.find('" title=') url = 'https://twitter.com/' + url[mulai:akhir] except: url = '' urlStatus.append(url) print('Saving Data to "%s" ' %fHasil, flush = True) dfile = open(fHasil, 'w', encoding='utf-8', newline='') if full: dfile.write('Time, Username, Tweet, Replies, Retweets, Likes, Language, urlStatus\n') with dfile: writer = csv.writer(dfile) for i,t in enumerate(Tweets): writer.writerow([waktu[i],Username[i],t,replies[i],retweets[i],likes[i],Language[i],urlStatus[i]]) else: with dfile: writer = csv.writer(dfile) for i,t in enumerate(Tweets): writer.writerow([Username[i],t]) dfile.close() print('All Finished', flush = True)
我遇到了这个错误
}File "<ipython-input-4-4a19b18dc90d>", line 27 except: ^ SyntaxError: invalid syntax
在Python中,缩进用于分隔代码块。这与许多其他使用大括号 {} 来分隔块的语言不同,例如 Java、Javascript 和 C。因此,Python 用户必须密切注意何时以及他们如何缩进代码,因为空格很重要。
当 Python 遇到程序缩进问题时,它会引发称为 IndentationError 或 TabError 的异常。
在你的情况下,问题是这样的:
try:
print(x)
except: # wrong indentation
print("An exception occurred")
您可以像这样简单地修复它:
try:
print(x)
except: # correct, try and catch stay at the same level
print("An exception occurred")
希望这对您有所帮助。祝你好运。