Trying / Probing URLs with scrapy Request

I would like to know whether it is possible to check the validity of URLs with a scrapy Request before doing the actual processing of a page (the URLs are not known in advance, but the different patterns in which they appear can be tested). Example code that fails is shown below. (The retries variable is used for simplicity; the test condition could just as well be something like if response.status != 200.)

The code fails because at the end of the second callback (parse_page2) control never returns to the first callback (parse_page1), even though a new request is issued with parse_page1 as its callback. Why is that? I am aware of a urllib2-based solution here; I just want to find out whether this can be done strictly within scrapy.

import scrapy

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class MySpider(CrawlSpider):
    name = 'alexa'
    allowed_domains = ['alexa.com']
    start_urls = ['http://www.alexa.com']
    retries = 0
    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('topsites', ))),

        # Extract links matching 'item.php' and parse them with the spider's method parse_item
        Rule(LinkExtractor(allow=('topsites', )), callback='parse_page1'),
    )

    def parse_page1(self, response):
        if self.retries < 5:
            self.retries += 1
            print('Retries in 1: ', self.retries)
            return scrapy.Request("http://www.alexa.com/siteieekeknfo/google.com",
                                  meta={'dont_merge_cookies': True,
                                        'dont_redirect': False,
                                        'handle_httpstatus_list': [301, 302, 303, 404]},
                                  callback=self.parse_page2)
        else:
            print("Finished in 1")

    def parse_page2(self, response):
        if self.retries < 5:
            self.retries += 1
            print('Retries in 2: ', self.retries)
            return scrapy.Request("http://www.alexa.com/siteieekeknfo/google.com",
                                  meta={'dont_merge_cookies': True,
                                        'dont_redirect': False,
                                        'handle_httpstatus_list': [301, 302, 303, 404]},
                                  callback=self.parse_page1)
        else:
            print("Finished in 2")

The crawl results are pasted here.
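
For clarity, the overall flow I am after looks roughly like the following (a minimal sketch; ProbeSpider, probe_candidates and check_probe are hypothetical names used only for illustration, not code from my spider):

import scrapy

class ProbeSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only: probe candidate URL
    # patterns first, and only treat a page as valid once a probe returns 200.
    name = 'probe_sketch'
    start_urls = ['http://www.alexa.com']

    # Hypothetical candidate patterns for the page that is really wanted.
    probe_candidates = [
        'http://www.alexa.com/siteieekeknfo/google.com',
        'http://www.alexa.com/siteinfo/google.com',
    ]

    def parse(self, response):
        yield self.probe(self.probe_candidates)

    def probe(self, candidates):
        # Accept 404 and redirect statuses so that check_probe sees them
        # itself instead of the response being discarded by HttpErrorMiddleware.
        return scrapy.Request(candidates[0],
                              meta={'handle_httpstatus_list': [301, 302, 303, 404],
                                    'remaining': candidates[1:]},
                              callback=self.check_probe)

    def check_probe(self, response):
        if response.status == 200:
            # The URL pattern is valid; the actual processing would go here.
            self.logger.info('Valid URL: %s', response.url)
        elif response.meta['remaining']:
            # This pattern failed; try the next candidate.
            yield self.probe(response.meta['remaining'])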

A recursive callback seems to work:

import random

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request

class MySpider(CrawlSpider):
    name = 'alexa'
    allowed_domains = ['alexa.com']
    start_urls = ['http://www.alexa.com']    
    rules = (
        Rule(LinkExtractor(allow=('topsites', )), callback='parse_page1'),
    )

    _retries = 0

    _random_urls = [
        'http://www.alexa.com/probablydoesnotexist',
        'http://www.alexa.com/neitherdoesthis',
        'http://www.alexa.com/siteinfo/google.com'
    ]

    def parse_page1(self, response):
        print("Got status: ", response.status)
        if self._retries == 0 or response.status != 200:
            self._retries += 1
            print('Retries in 1: ', self._retries)
            return Request(random.choice(self._random_urls),
                           meta={"handle_httpstatus_list": [301, 302, 303, 404]},
                           callback=self.parse_page1)
        else:
            print("Exiting")