scrapy:嵌套的 ItemLoader 生成包含 n(n-1) 个字段的输出

scrapy: nested ItemLoader generates outputs containing n(n-1) fields

用例

我正在抓取一个网站 https://www.leseshopsbelges.be,它是电子商店的目录

我想先建立一个商店列表,每个商店包括:类别(category)、名称(name)和网址(url)。

问题

当我运行 scrapy 时,每条输出都包含了之前已存储的所有 'name' 和 'url':

[
{"category": ["Animaux"], "name": ["Sherkane"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle", "Colonel Gustave"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/', 'https://www.leseshopsbelges.be/shop/colonel-gustave/']"},

{"category": ["Alimentation"], "name": ["Hobbyscuit"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/']"},
{"category": ["Alimentation"], "name": ["Hobbyscuit", "Authentique Store"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/', 'https://www.leseshopsbelges.be/shop/authentique-store/']"}


]

源代码:home_page_categories_spider_item.py

from ..items import LeseshopsbelgesItem
import logging
import scrapy
import os
from scrapy.loader import ItemLoader
from parsel import Selector
from pathlib import Path
# Name of the output directory; it is created (with any missing parents) as
# soon as this module is imported, and an existing directory is left as is.
output_directory = 'output'
Path(output_directory).mkdir(parents=True, exist_ok=True)

# Module-level logger, configured to let INFO and higher records through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class HomePageCategoriesSpider(scrapy.Spider):

    """Crawl https://www.leseshopsbelges.be/ and emit the shops it lists.

    ``parse`` reads the category buttons of the home page and follows each
    of them; ``parse_shop_in_object`` then loads the shops found on the
    followed page.

    NOTE(review): ``parse_shop_in_object`` loads every shop of a category
    into the one item instance received through ``meta``, so each yielded
    item also carries the names and URLs of the shops loaded before it.
    """

    name = "home_page_categories_item"

    def start_requests(self):
        """Yield the initial request for the home page, handled by ``parse``."""
        urls = [
            'https://www.leseshopsbelges.be/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Follow every category button of the home page.

        For each button (e.g. alimentation, animaux, art & culture...) an
        item holding only ``category`` is loaded from the button text, and
        the button's link is followed with that item attached to the
        request ``meta`` under ``'shop_item'``.
        """
        # Anchors matching the CSS classes used to select the category buttons.
        categories=response.css("a.et_pb_button.et_pb_bg_layout_light")

        for category in categories: 

            loader = ItemLoader(
                item=LeseshopsbelgesItem(),
                selector=category,
                response=response
            )

            # The button text becomes the 'category' field of the item.
            loader.add_css(
                'category', "::text")
            shop_item = loader.load_item()


            """get the name of shops on the home page
            """
            # First href of the button, or None when it has no href.
            shop_page_links = category.css(
                '::attr(href)'
            ).get()

            # A single item object per category travels with the request.
            yield response.follow(shop_page_links, self.parse_shop_in_object, meta={'shop_item': shop_item})

    def parse_shop_in_object(self, response):
        """Load name and URL of each ``h2.entry-title`` shop on the page.

        Nothing is yielded when the item received through ``meta`` has no
        populated field.
        """
        shop_item = response.meta['shop_item']
        self.log(f'shop_item: {shop_item}')
        # Skip pages whose item is empty (presumably a button whose text was
        # rejected as a category by the item's input processor -- confirm).
        if len(shop_item) > 0:
            for shop in response.css("h2.entry-title"):
                # NOTE(review): the same shop_item is handed to every loader,
                # so 'name' and 'url' keep accumulating across iterations.
                loader = ItemLoader(item=shop_item, selector=shop)
                loader.add_css('name', 'a::text')
                loader.add_css('url', 'a::attr(href)')
                yield loader.load_item()


源代码:items.py

import scrapy

from itemloaders.processors import MapCompose, TakeFirst

# Labels that must not be treated as a shop category.
THIS_IS_NOT_A_CATEGORY = [
    'Je veux référencer mon e-shop',
    'Référencer un e-shop',
    'Découvrez notre spot !',
]


def keep_good_categories(text):
    """Return the category label of *text*, or None for a non-category label.

    Only the part of *text* before the first ``' | '`` separator is kept;
    if that part is listed in ``THIS_IS_NOT_A_CATEGORY`` the result is None.
    """
    label, _, _ = text.partition(' | ')
    return None if label in THIS_IS_NOT_A_CATEGORY else label


class LeseshopsbelgesItem(scrapy.Item):
    """Item with the three fields collected for a shop: category, name, url."""

    # Each incoming category value is passed through keep_good_categories.
    category = scrapy.Field(input_processor=MapCompose(keep_good_categories))
    name = scrapy.Field()
    # NOTE(review): serializer=str presumably explains why the exported url
    # appears as the string form of a list in the sample output -- confirm.
    url = scrapy.Field(serializer=str)

不要以这种方式在多个 ItemLoader 之间复用同一个 item。只需从 parse 向 parse_shop_in_object 传递类别名称(而不是已加载的 item 对象),并为每个商店创建一个新的 item:

def parse(self, response):
    """Follow every category button found on the home page.

    Only the category name (plain text) is handed to
    ``parse_shop_in_object`` through the request ``meta``; no item object
    is shared between callbacks.

    Yields one request per category button that has a link, e.g.
    alimentation, animaux, art & culture...
    """
    categories = response.css("a.et_pb_button.et_pb_bg_layout_light")

    for category in categories:
        category_name = category.css('::text').get()

        # Link to the page listing the shops of this category.
        shop_page_link = category.css('::attr(href)').get()
        if shop_page_link is None:
            # response.follow() does not accept a None URL, so a button
            # without href is skipped instead of aborting the callback.
            continue

        yield response.follow(
            shop_page_link,
            self.parse_shop_in_object,
            meta={'category_name': category_name},
        )

def parse_shop_in_object(self, response):
    """Yield one freshly built item per shop listed on a category page.

    The category name is read from ``response.meta['category_name']``.
    Every ``h2.entry-title`` element gets its own ``LeseshopsbelgesItem``,
    so values never accumulate from one shop to the next.

    Nothing is yielded when the category name does not survive the item's
    input processing (missing text, or a label that is not a category).
    """
    category_name = response.meta['category_name']
    for shop in response.css("h2.entry-title"):
        # A new item per shop: reusing one item across loaders would make
        # each loader start from the values collected by the previous ones.
        loader = ItemLoader(item=LeseshopsbelgesItem(), selector=shop)
        loader.add_value('category', category_name)
        if not loader.get_collected_values('category'):
            # The category is the same for the whole page, so a rejected
            # name means no shop of this page should be emitted.
            return
        loader.add_css('name', 'a::text')
        loader.add_css('url', 'a::attr(href)')
        yield loader.load_item()