使用另一个 url 登录后抓取页面
Crawl pages after login using another url
我对scrapy了解有限。使用此代码,我可以在特定论坛中进行登录。现在我需要在登录后设置另一个url:
https://forum.xxx.com/threads/topic-name/page-300
我想自动抓取范围在 300-360 之间的页面。特别是具有此 class messageText
的所有元素
我该怎么做?
import scrapy
class LoginSpider(scrapy.Spider):
    """Log in to the forum, then continue crawling with the authenticated session."""

    name = 'xxx.com'
    start_urls = ['https://forum.xxx.com/login/login']

    def parse(self, response):
        # Submit the login form pre-filled from the login page; Scrapy keeps
        # the session cookie for all subsequent requests.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'login': 'xxx', 'register': '0', 'password': 'xxxxx', 'cookie_check': '0'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # check login succeed before going on
        # NOTE: response.body is bytes in Python 3, so the needle must be a
        # bytes literal — `str in bytes` raises TypeError.
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
登录后只生成您需要的请求数:
from scrapy import Request
def after_login(self, response):
    """After a successful login, request every thread page from 300 to 360.

    Yields one :class:`Request` per page, each parsed by ``parse_page``.
    """
    # check login succeed before going on
    # response.body is bytes in Python 3: compare against a bytes literal,
    # otherwise `str in bytes` raises TypeError.
    if b"authentication failed" in response.body:
        self.logger.error("Login failed")
        return
    # The question asks for pages 300-360 inclusive; range's upper bound is
    # exclusive, so it must be 361.
    for page in range(300, 361):
        url = 'https://forum.xxx.com/threads/topic-name/page-{}'.format(page)
        yield Request(url, self.parse_page)
def parse_page(self, response):
    """Parse one thread page: yield the text of every element with class ``messageText``.

    The original stub had a comment-only body (a SyntaxError in Python);
    this implements the extraction the question asks for.
    """
    for message in response.css('.messageText'):
        # string(.) concatenates all descendant text nodes of the element.
        yield {'text': message.xpath('string(.)').get()}
我对scrapy了解有限。使用此代码,我可以在特定论坛中进行登录。现在我需要在登录后设置另一个url:
https://forum.xxx.com/threads/topic-name/page-300
我想自动抓取范围在 300-360 之间的页面。特别是具有此 class messageText
的所有元素。我该怎么做?
import scrapy
class LoginSpider(scrapy.Spider):
    """Log in to the forum, then continue crawling with the authenticated session."""

    name = 'xxx.com'
    start_urls = ['https://forum.xxx.com/login/login']

    def parse(self, response):
        # Submit the login form pre-filled from the login page; Scrapy keeps
        # the session cookie for all subsequent requests.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'login': 'xxx', 'register': '0', 'password': 'xxxxx', 'cookie_check': '0'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # check login succeed before going on
        # NOTE: response.body is bytes in Python 3, so the needle must be a
        # bytes literal — `str in bytes` raises TypeError.
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
登录后只生成您需要的请求数:
from scrapy import Request
def after_login(self, response):
    """After a successful login, request every thread page from 300 to 360.

    Yields one :class:`Request` per page, each parsed by ``parse_page``.
    """
    # check login succeed before going on
    # response.body is bytes in Python 3: compare against a bytes literal,
    # otherwise `str in bytes` raises TypeError.
    if b"authentication failed" in response.body:
        self.logger.error("Login failed")
        return
    # The question asks for pages 300-360 inclusive; range's upper bound is
    # exclusive, so it must be 361.
    for page in range(300, 361):
        url = 'https://forum.xxx.com/threads/topic-name/page-{}'.format(page)
        yield Request(url, self.parse_page)
def parse_page(self, response):
    """Parse one thread page: yield the text of every element with class ``messageText``.

    The original stub had a comment-only body (a SyntaxError in Python);
    this implements the extraction the question asks for.
    """
    for message in response.css('.messageText'):
        # string(.) concatenates all descendant text nodes of the element.
        yield {'text': message.xpath('string(.)').get()}