Trying / Probing urls with scapy Request
I would like to know whether it is possible to use a scrapy Request to check the validity of URLs before doing the actual processing of a page (the URLs are not known beforehand, but the different patterns in which they appear can be tested).
Sample code that fails is given below. (The retries variable is used for simplicity; the test condition could just as well be something like if response.status != 200.)
The code fails because at the end of the second callback (parse_page2), control does not return to the first callback (parse_page1), even though a new Request is issued with parse_page1 as its callback.
Why is this so?
I am aware of the urllib2-based solution here; I just want to know whether this can be done strictly within Scrapy.
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request

class MySpider(CrawlSpider):
    name = 'alexa'
    allowed_domains = ['alexa.com']
    start_urls = ['http://www.alexa.com']
    retries = 0

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('topsites', ))),

        # Extract links matching 'topsites' and parse them with parse_page1
        Rule(LinkExtractor(allow=('topsites', )), callback='parse_page1'),
    )

    def parse_page1(self, response):
        if self.retries < 5:
            self.retries += 1
            print 'Retries in 1: ', self.retries
            return scrapy.Request("http://www.alexa.com/siteieekeknfo/google.com",
                                  meta={'dont_merge_cookies': True,
                                        'dont_redirect': False,
                                        "handle_httpstatus_list": [301, 302, 303, 404]},
                                  callback=self.parse_page2)
        else:
            print "Finished in 1"

    def parse_page2(self, response):
        if self.retries < 5:
            self.retries += 1
            print 'Retries in 2: ', self.retries
            return scrapy.Request("http://www.alexa.com/siteieekeknfo/google.com",
                                  meta={'dont_merge_cookies': True,
                                        'dont_redirect': False,
                                        "handle_httpstatus_list": [301, 302, 303, 404]},
                                  callback=self.parse_page1)
        else:
            print "Finished in 2"
The crawl output is pasted here.
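A likely explanation for why control never comes back to parse_page1 (hedged, since the pasted crawl log is not reproduced here) is Scrapy's duplicate-request filter: both callbacks re-request the exact same URL, and the scheduler silently drops any request whose fingerprint it has already seen unless dont_filter=True is passed. A minimal sketch of that idea, using a hypothetical ProbeSpider but the same deliberately broken URL as above:

import scrapy

class ProbeSpider(scrapy.Spider):
    # Sketch only: re-request one (broken) URL a few times to show that
    # dont_filter=True is what keeps the repeat requests from being dropped.
    name = 'probe'
    probe_url = 'http://www.alexa.com/siteieekeknfo/google.com'
    retries = 0

    def start_requests(self):
        yield scrapy.Request(
            self.probe_url,
            meta={'handle_httpstatus_list': [301, 302, 303, 404]},
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info('Got status %s', response.status)
        if self.retries < 5 and response.status != 200:
            self.retries += 1
            # Without dont_filter=True this repeat request to the same URL
            # would be discarded by the scheduler as a duplicate.
            yield scrapy.Request(
                response.url,
                meta={'handle_httpstatus_list': [301, 302, 303, 404]},
                dont_filter=True,
                callback=self.parse,
            )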
A recursive callback seems to work:
import random
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request

class MySpider(CrawlSpider):
    name = 'alexa'
    allowed_domains = ['alexa.com']
    start_urls = ['http://www.alexa.com']

    rules = (
        Rule(LinkExtractor(allow=('topsites', )), callback='parse_page1'),
    )

    _retries = 0
    _random_urls = [
        'http://www.alexa.com/probablydoesnotexist',
        'http://www.alexa.com/neitherdoesthis',
        'http://www.alexa.com/siteinfo/google.com'
    ]

    def parse_page1(self, response):
        print "Got status: ", response.status
        if self._retries == 0 or response.status != 200:
            self._retries += 1
            print 'Retries in 1: ', self._retries
            return Request(random.choice(self._random_urls),
                           meta={"handle_httpstatus_list": [301, 302, 303, 404]},
                           callback=self.parse_page1)
        else:
            print "Exiting"