Python crawler problems when using aiohttp
I'm a beginner with web spiders, and I've been quite confused while using aiohttp recently.
Here is my code:
import asyncio
import os

import aiofiles
import aiohttp
from bs4 import BeautifulSoup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1',
          'Referer': 'https://www.mzitu.com/',
          'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
          'Accept-Encoding': 'gzip',
          }
class MZiTu(object):
    def __init__(self):
        self.timeout = 5
        self.file_path = 'D:\mzitu'
        self.common_page_url = 'https://www.mzitu.com/page/'
        self.total_page_num = 0
        self.end_album_num = 0
        self.session = None

    async def start(self):
        async with aiohttp.ClientSession(headers=header) as mzt.session:
            for page in range(1, self.total_page_num+1):
                await self.crawlAlbum(self.common_page_url, page)

    async def crawlAlbum(self, common_url, page_num):
        page_url = self.common_page_url + str(page_num)
        async with self.session.get(page_url, timeout=self.timeout) as resp:
            html = await resp.text()
            bsop = BeautifulSoup(html, 'lxml')
            album_items = bsop.find('ul', {'id': 'pins'}).findAll('li')
            for item in album_items:
                try:
                    album_title = item.find('img').attrs['alt']
                    album_url = item.find('a').attrs['href']
                    if not os.path.exists(os.path.join(self.file_path, album_title)):
                        os.mkdir(os.path.join(self.file_path, album_title))
                    os.chdir(os.path.join(self.file_path, album_title))
                    await self.crawlImgs(album_url)
                except:
                    continue

    async def crawlImgs(self, album_url):
        self.end_album_num = await self.getAlbumTotalNum(album_url)
        for i in range(1, self.end_album_num+1):
            img_page_url = album_url + str(i)
            async with self.session.get(img_page_url, timeout=self.timeout) as resq:
                html = await resq.text()
                bsop = BeautifulSoup(html, 'lxml')
                try:
                    img_url = bsop.find('div', {'class': 'main-image'}).find('img').attrs['src']
                    await self.downloadImg(i, img_url)
                except:
                    continue

    async def getAlbumTotalNum(self, album_url):
        async with self.session.get(album_url, timeout=self.timeout) as resq:
            html = await resq.text()
            bsop = BeautifulSoup(html, 'lxml')
            total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
            return total_num

    async def downloadImg(self, index, img_url):
        async with self.session.get(img_url, timeout=self.timeout) as resq:
            content = await resq.read()
            async with aiofiles.open(str(index)+'.jpg', 'wb') as f:
                await f.write(content)


if __name__ == "__main__":
    mzt = MZiTu()
    mzt.total_page_num = 2
    loop = asyncio.get_event_loop()
    to_do = [mzt.start()]
    wait_future = asyncio.wait(to_do)
    loop.run_until_complete(wait_future)
    loop.close()
My code returns right at the first line below. Why? I'm very confused:
async def getAlbumTotalNum(self, album_url):
    async with self.session.get(album_url, timeout=self.timeout) as resq:
        html = await resq.text()
        bsop = BeautifulSoup(html, 'lxml')
        total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
        return total_num
I can't find any errors in my program. I'm so lost. If there are any learning resources for aiohttp and asyncio, please point me to them; this feels really hard.
The first issue is that you are using pokemon exception handling: you really do not want to catch them all. Catch specific exceptions, only, or at the very least catch only Exception, and make sure to re-raise asyncio.CancelledError (you don't want to block task cancellation), and log or print the exceptions that are raised so you can further clean up your handler. As a quick fix, I replaced your try:... except: continue blocks with:
try:
    # ...
except asyncio.CancelledError:
    raise
except Exception:
    traceback.print_exc()
    continue
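For example, applied to the loop body in crawlAlbum, that change would look roughly like this (a sketch based on the code in the question, not a drop-in file):

for item in album_items:
    try:
        album_title = item.find('img').attrs['alt']
        album_url = item.find('a').attrs['href']
        if not os.path.exists(os.path.join(self.file_path, album_title)):
            os.mkdir(os.path.join(self.file_path, album_title))
        os.chdir(os.path.join(self.file_path, album_title))
        await self.crawlImgs(album_url)
    except asyncio.CancelledError:
        raise  # never swallow task cancellation
    except Exception:
        traceback.print_exc()  # show what actually went wrong
        continue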
I also added import traceback at the top. When you then run your code, you see why the code is failing:
Traceback (most recent call last):
  File "test.py", line 43, in crawlAlbum
    await self.crawlImgs(album_url)
  File "test.py", line 51, in crawlImgs
    self.end_album_num = await self.getAlbumTotalNum(album_url)
  File "test.py", line 72, in getAlbumTotalNum
    total_num = int(bsop.find('div', {'class': 'nav-links'}).findAll('a', {'class': 'page-numbers'})[-2].text)
AttributeError: 'NoneType' object has no attribute 'findAll'
Either the way the site marks up its links has changed, or the site uses JavaScript to alter the DOM in the browser after the HTML is loaded. Either way, using a blanket except: clause without logging the error hides such problems from you and makes them really hard to debug.
I would at the very least add some logging to record which URL the code was trying to parse when the exception occurred, so that you can reproduce the issue in an interactive, non-async setting and try different approaches to parsing the page.
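As a minimal sketch of that (the logger setup and message wording are my own additions, not part of the original code), the image-page handler in crawlImgs could become:

import logging

logger = logging.getLogger(__name__)

# inside crawlImgs(), replacing the bare except:
try:
    img_url = bsop.find('div', {'class': 'main-image'}).find('img').attrs['src']
    await self.downloadImg(i, img_url)
except asyncio.CancelledError:
    raise
except Exception:
    # record the URL that failed so the page can be fetched and inspected interactively later
    logger.exception('Failed to parse %s', img_page_url)
    continue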
Rather than using .find() and .findAll() calls, use a CSS selector to find the correct elements:
links = bsop.select(f'div.pagenavi a[href^="{album_url}"] span')
return 1 if len(links) < 3 else int(links[-2].string)
The above uses the current URL to restrict the search to those specific span elements whose parent a element has an href attribute whose value starts with the current page URL.
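Put together, a sketch of getAlbumTotalNum using that selector (the rest of the method kept as in the question) would be:

async def getAlbumTotalNum(self, album_url):
    async with self.session.get(album_url, timeout=self.timeout) as resp:
        html = await resp.text()
        bsop = BeautifulSoup(html, 'lxml')
        # spans inside pagination links that point back into this album
        links = bsop.select(f'div.pagenavi a[href^="{album_url}"] span')
        # fewer than 3 matches means there is no "last page" number to read
        return 1 if len(links) < 3 else int(links[-2].string)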
Note that the above is not the only problem. Once it is fixed, the next exception is:
Traceback (most recent call last):
  File "test.py", line 59, in crawlImgs
    img_url = bsop.find('div', {'class': 'main-image'}).find('img').attrs['src']
AttributeError: 'NoneType' object has no attribute 'find'
That one is actually caused by your incorrect URL handling for albums, which assumes that they always end in /. Correct that handling:
async def crawlImgs(self, album_url):
    end_album_num = await self.getAlbumTotalNum(album_url)
    if album_url[-1] != '/':
        album_url += '/'
    for i in range(1, end_album_num + 1):
        img_page_url = album_url + str(i)
        # ...
You do not want to set album_num as an attribute on self! Class instance state is shared between tasks, and while you are not actually creating multiple tasks in your code yet (it is all one sequential task at the moment), you want to avoid mutating shared state.
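The corrected crawlImgs above already keeps the count in a local variable. The reason this matters becomes obvious the moment you do make the crawl concurrent, for example with asyncio.gather (a hypothetical next step with a made-up album_urls list, not part of the original code):

# Hypothetical: crawl several albums concurrently. The calls now run interleaved,
# so a value stored on self (like self.end_album_num) would be overwritten by
# whichever task fetched its album page last, while a local end_album_num stays
# private to each call.
await asyncio.gather(*(self.crawlImgs(url) for url in album_urls))

Until you get there, passing values as arguments and return values, as the corrected method does, keeps each coroutine self-contained.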