扩展每个蜘蛛的 scrapy 设置值

Extend scrapy settings value per spider

假设我们要为特定蜘蛛添加特定项目管道。为了遵守 DRY 原则,我只想从设置中访问当前管道,添加我的特定管道并将结果设置回 spider 的设置。

我们无法通过 custom_settings class 属性完成此操作。即使通过 from_crawler 设置也不起作用:

# NOTE: this is the failing attempt from the question. By the time
# from_crawler() runs, the crawler's Settings object has been frozen,
# so the setdict() call below raises:
#   TypeError: Trying to modify an immutable Settings object
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    # Tries to merge one extra pipeline into the existing ITEM_PIPELINES
    # mapping at 'spider' priority -- fails on the immutable settings.
    crawler.settings.setdict({'ITEM_PIPELINES':
                                  {**dict(crawler.settings.getdict('ITEM_PIPELINES')),
                                   'myscrapers.pipelines.CustomPipeline': 11}
                              }, priority='spider')
    # NOTE(review): super().from_crawler(cls, ...) passes cls explicitly,
    # but super() already binds the class -- this would be a second bug
    # even if the settings were mutable.
    return super().from_crawler(cls, crawler, *args, **kwargs)

这会导致以下错误:

TypeError: Trying to modify an immutable Settings object

我们如何在蜘蛛级别正确扩展 scrapy 中的设置值?

您可以在进程(CrawlerProcess)级别设置这些设置:

import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    """Minimal demo spider: scrapes the title and price of one product page."""

    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        """Extract the two fields of interest and emit a single item."""
        yield ExampleItem(
            title=response.xpath('//h3/text()').get(),
            price=response.xpath('//div[@class="card-body"]/h4/text()').get(),
        )


class ExampleItem(scrapy.Item):
    # Container for the two fields scraped by ExampleSpider.
    title = scrapy.Field()  # product title text from the <h3> element
    price = scrapy.Field()  # raw price string; pipelines parse a float out of it


class ItemPipeline1:
    """Pipeline that flags items priced below 15 and drops price-less items."""

    def process_item(self, item, spider):
        """Parse the item's price string and print a note for cheap items.

        Raises DropItem when the price is missing or cannot be parsed.
        """
        adapter = ItemAdapter(item)
        # .get() avoids a KeyError when the field was never populated;
        # the original adapter['price'] crashed instead of dropping.
        price = adapter.get('price')
        if not price:
            raise DropItem(f"Missing price in {item}")
        match = re.search(r'\d+\.\d+', price)
        if match is None:
            # Previously this crashed with IndexError on findall()[0];
            # treat an unparseable price the same as a missing one.
            raise DropItem(f"Missing price in {item}")
        if float(match.group()) < 15:
            print('Cheap enough')
        return item


class ItemPipeline2:
    """Pipeline that flags items priced above 10 and drops price-less items."""

    def process_item(self, item, spider):
        """Parse the item's price string and print a note for pricey items.

        Raises DropItem when the price is missing or cannot be parsed.
        """
        adapter = ItemAdapter(item)
        # .get() avoids a KeyError when the field was never populated;
        # the original adapter['price'] crashed instead of dropping.
        price = adapter.get('price')
        if not price:
            raise DropItem(f"Missing price in {item}")
        match = re.search(r'\d+\.\d+', price)
        if match is None:
            # Previously this crashed with IndexError on findall()[0];
            # treat an unparseable price the same as a missing one.
            raise DropItem(f"Missing price in {item}")
        if float(match.group()) > 10:
            print('Too expensive')
        return item


if __name__ == "__main__":
    spider_name = 'exampleSpider'

    # Start from the project settings and override the user agent.
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    # First crawl is scheduled with ItemPipeline1 active.
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spider_name)

    # Mutating the mapping before the second crawl() gives that run a
    # different pipeline set -- each crawl picks up the settings as they
    # are at the moment it is scheduled.
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spider_name)

    # Run both scheduled crawls in the same reactor.
    process.start()

但是如果你真的想在蜘蛛内部完成所有这些你可以覆盖“update_settings”方法:

import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    """Demo spider that chooses one of two pipeline configs via a class flag."""

    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        """Merge one of the two pipeline configs at 'spider' priority.

        The class attribute ``is_pipeline_1`` selects which config wins;
        it defaults to True when the attribute is unset.
        """
        if getattr(cls, 'is_pipeline_1', True):
            chosen = getattr(cls, 'custom_settings1', None)
        else:
            chosen = getattr(cls, 'custom_settings2', None)
        settings.setdict(chosen or {}, priority='spider')

    def parse(self, response):
        """Extract the two fields of interest and emit a single item."""
        yield ExampleItem(
            title=response.xpath('//h3/text()').get(),
            price=response.xpath('//div[@class="card-body"]/h4/text()').get(),
        )


class ExampleItem(scrapy.Item):
    # Container for the two fields scraped by ExampleSpider.
    title = scrapy.Field()  # product title text from the <h3> element
    price = scrapy.Field()  # raw price string; pipelines parse a float out of it


class ItemPipeline1:
    """Pipeline that flags items priced below 15 and drops price-less items."""

    def process_item(self, item, spider):
        """Parse the item's price string and print a note for cheap items.

        Raises DropItem when the price is missing or cannot be parsed.
        """
        adapter = ItemAdapter(item)
        # .get() avoids a KeyError when the field was never populated;
        # the original adapter['price'] crashed instead of dropping.
        price = adapter.get('price')
        if not price:
            raise DropItem(f"Missing price in {item}")
        match = re.search(r'\d+\.\d+', price)
        if match is None:
            # Previously this crashed with IndexError on findall()[0];
            # treat an unparseable price the same as a missing one.
            raise DropItem(f"Missing price in {item}")
        if float(match.group()) < 15:
            print('Cheap enough')
        return item


class ItemPipeline2:
    """Pipeline that flags items priced above 10 and drops price-less items."""

    def process_item(self, item, spider):
        """Parse the item's price string and print a note for pricey items.

        Raises DropItem when the price is missing or cannot be parsed.
        """
        adapter = ItemAdapter(item)
        # .get() avoids a KeyError when the field was never populated;
        # the original adapter['price'] crashed instead of dropping.
        price = adapter.get('price')
        if not price:
            raise DropItem(f"Missing price in {item}")
        match = re.search(r'\d+\.\d+', price)
        if match is None:
            # Previously this crashed with IndexError on findall()[0];
            # treat an unparseable price the same as a missing one.
            raise DropItem(f"Missing price in {item}")
        if float(match.group()) > 10:
            print('Too expensive')
        return item


if __name__ == "__main__":
    # Note: the original assigned spidername = 'exampleSpider' but never
    # used it -- the crawls below pass the spider class directly.
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)

    # Flip the class flag before each crawl; update_settings() reads it
    # when the crawler for that run is created.
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)

    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)

    # Run both scheduled crawls in the same reactor.
    process.start()

但老实说,我认为第一种方法更好...