Python Web Crawler: How to make the crawler work better
I am new to Python and have recently been writing a web crawler to collect data from a website for my project. The data set is large and takes a long time to collect, and I have run into some problems along the way:
502 Bad Gateway - what can I do about it?
I use try... except... to handle exceptions, but the program still seems to get interrupted by them. How can I handle all the exceptions without stopping the program?
Thank you very much for your help!!!
Here is my code: (Python 2.7, BeautifulSoup 4.3.2, XlsxWriter 0.8.7)
# coding: utf-8
import urllib2
import urllib
import re
from bs4 import BeautifulSoup
import urlparse
import xlsxwriter
import traceback

def open_with_retries(url):
    attempts = 5
    for attempt in range(attempts):
        try:
            return opener.open(url)
        except:
            if attempt == attempts - 1:
                raise

workbook = xlsxwriter.Workbook('Artist_Art_B.xlsx')
worksheet = workbook.add_worksheet()
ro = 0
co = 0

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]

# First web
response = open_with_retries(unicode("http://amma.artron.net/artronindex_artist.php"))
content = response.read()
pattern = re.compile(u'<li><a href="artronindex_pic.php(.*?) title="(.*?)".*?</a></li>', re.S)
items = re.findall(pattern, content)

for item in items:
    # Second
    try:
        res2 = open_with_retries(str("http://amma.artron.net/artronindex_pic.php?artist=" + item[1]))
        soup = BeautifulSoup(res2.read())
        tables = soup.find_all('tbody')
        if len(tables) > 0:
            table = tables[0]
            rows = table.findChildren('tr')
            print item[1]
            for row in rows:
                links = row.find_all('a', href=True)
                for link in links:
                    url = link['href']
                    parsed = urlparse.urlparse(url)
                    sort = urlparse.parse_qs(parsed.query)['sort'][0]
                    labe = urlparse.parse_qs(parsed.query)['labe'][0]
                    f = {'sort': sort, 'labe': labe}
                    later = urllib.urlencode(f)
                    # Third
                    res3 = open_with_retries(str("http://amma.artron.net/artronindex_auctionseason.php?name=" + item[1] + "&" + later))
                    soup2 = BeautifulSoup(res3.read())
                    ttables = soup2.findChildren('tbody')
                    if len(tables) > 0:
                        ttable = ttables[0]
                        rrows = ttable.findChildren('tr')
                        for rrow in rrows:
                            ccells = rrow.findChildren('td')
                            for ccell in ccells:
                                vvalue = unicode(ccell.string)
                                worksheet.write(ro, co, vvalue)
                                co = co + 1
                                print vvalue
                            ro = ro + 1
                            co = 0
    except Exception:
        traceback.print_exc()

workbook.close()
(1) My guess is that the 502 is a transient error that can be resolved by retrying the request. Replace the calls to opener.open with calls to this function:

def open_with_retries(url):
    attempts = 5
    for attempt in range(attempts):
        try:
            return opener.open(url)
        except:
            if attempt == attempts - 1:
                raise
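
If the server returns 502 because it is temporarily overloaded, retrying immediately may just fail again, so pausing briefly between attempts often helps. Below is a minimal sketch of that idea; the time.sleep call, the one-second delay, and narrowing the bare except to urllib2.HTTPError are my own assumptions, not part of the original answer:

import time

def open_with_retries(url, attempts=5, delay=1):
    for attempt in range(attempts):
        try:
            return opener.open(url)
        except urllib2.HTTPError:
            # On the last attempt, re-raise so the caller's except block
            # can log the failure; otherwise wait briefly and try again.
            if attempt == attempts - 1:
                raise
            time.sleep(delay)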
(2) You have already made sure the program will not stop completely with this code:

for item in items:
    try:
        ...
    except ValueError:
        ...

although that only works when the exception raised is a ValueError. The loop then continues to the next item and keeps running. You can apply the same pattern to the for link in links loop so that as many links within each item as possible get processed; a minimal sketch follows below.
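
Here is one way to write that nested pattern, reusing the names from the posted script (opener, items, open_with_retries, worksheet and so on); the inner except Exception and the traceback logging are one reasonable choice, not something the answer above spells out:

for item in items:
    try:
        # Fetch and parse the artist page as in the posted code.
        res2 = open_with_retries(str("http://amma.artron.net/artronindex_pic.php?artist=" + item[1]))
        soup = BeautifulSoup(res2.read())
        tables = soup.find_all('tbody')
        if len(tables) > 0:
            rows = tables[0].findChildren('tr')
            for row in rows:
                for link in row.find_all('a', href=True):
                    try:
                        # Everything that can fail for a single link lives in this
                        # inner try, so one bad link only skips itself.
                        parsed = urlparse.urlparse(link['href'])
                        qs = urlparse.parse_qs(parsed.query)
                        later = urllib.urlencode({'sort': qs['sort'][0], 'labe': qs['labe'][0]})
                        res3 = open_with_retries(str("http://amma.artron.net/artronindex_auctionseason.php?name=" + item[1] + "&" + later))
                        # ... write the rows of res3 into the worksheet as before ...
                    except Exception:
                        traceback.print_exc()  # log and move on to the next link
    except Exception:
        traceback.print_exc()  # log and move on to the next item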