scrapy:嵌套的 ItemLoader 生成包含 n(n-1) 个字段的输出
scrapy: nested ItemLoader generates outputs containing n(n-1) fields
用例
我正在抓取一个网站 https://www.leseshopsbelges.be,它是电子商店的目录
我想建立第一个商店列表,包括:
- 商店在“leseshopsbelges.be”网站上的 url
- 网站名称
- 网站类别
问题
当我运行 scrapy 时,每条输出记录都包含之前已存储的所有 'name' 和 'url':
[
{"category": ["Animaux"], "name": ["Sherkane"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle", "Colonel Gustave"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/', 'https://www.leseshopsbelges.be/shop/colonel-gustave/']"},
{"category": ["Alimentation"], "name": ["Hobbyscuit"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/']"},
{"category": ["Alimentation"], "name": ["Hobbyscuit", "Authentique Store"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/', 'https://www.leseshopsbelges.be/shop/authentique-store/']"}
]
源代码:home_page_categories_spider_item.py
from ..items import LeseshopsbelgesItem
import logging
import scrapy
import os
from scrapy.loader import ItemLoader
from parsel import Selector
from pathlib import Path
# Name of the directory created below (a relative path).
output_directory = 'output'
# Runs at import time: create the directory together with any missing
# parents; exist_ok=True makes this a no-op when it already exists.
Path(output_directory).mkdir(parents=True, exist_ok=True)
# Module-level logger, configured to emit INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class HomePageCategoriesSpider(scrapy.Spider):
    """Collect the shops listed under each category of https://www.leseshopsbelges.be/."""

    name = "home_page_categories_item"

    def start_requests(self):
        """Yield the initial request for the home page, handled by ``parse``."""
        home_pages = ('https://www.leseshopsbelges.be/',)
        yield from (
            scrapy.Request(url=page, callback=self.parse)
            for page in home_pages
        )

    def parse(self, response):
        """Follow every category button found on the home page.

        For each button a ``LeseshopsbelgesItem`` holding only the category
        is loaded and handed to ``parse_shop_in_object`` through the request
        ``meta`` under the key ``'shop_item'``.
        """
        for button in response.css("a.et_pb_button.et_pb_bg_layout_light"):
            category_loader = ItemLoader(
                item=LeseshopsbelgesItem(),
                selector=button,
                response=response,
            )
            category_loader.add_css('category', "::text")
            category_item = category_loader.load_item()

            # href of the category button; its response is processed by
            # parse_shop_in_object.
            target = button.css('::attr(href)').get()
            yield response.follow(
                target,
                callback=self.parse_shop_in_object,
                meta={'shop_item': category_item},
            )

    def parse_shop_in_object(self, response):
        """Yield one loaded item per ``h2.entry-title`` element of the page.

        Nothing is yielded when the item received through ``response.meta``
        has no populated field.

        NOTE: every loader created below wraps that one received item, so
        the 'name' and 'url' values accumulate on it from one shop to the
        next, and each yielded result also carries the earlier shops.
        """
        shop_item = response.meta['shop_item']
        self.log(f'shop_item: {shop_item}')
        # Guard clause; assumes the loop was nested under the original
        # ``if len(shop_item) > 0`` (source indentation was lost).
        if len(shop_item) == 0:
            return
        for heading in response.css("h2.entry-title"):
            shop_loader = ItemLoader(item=shop_item, selector=heading)
            shop_loader.add_css('name', 'a::text')
            shop_loader.add_css('url', 'a::attr(href)')
            yield shop_loader.load_item()
源代码:items.py
import scrapy
from itemloaders.processors import MapCompose, TakeFirst
# Button labels that share the category selector but are not categories.
THIS_IS_NOT_A_CATEGORY = [
    'Je veux référencer mon e-shop',
    'Référencer un e-shop',
    'Découvrez notre spot !',
]


def keep_good_categories(text):
    """Return the category label of *text*, or None for a non-category label.

    Only the part before the first ``' | '`` separator is kept; the result
    is None when that part is listed in ``THIS_IS_NOT_A_CATEGORY``.
    """
    label, _, _ = text.partition(' | ')
    return None if label in THIS_IS_NOT_A_CATEGORY else label
class LeseshopsbelgesItem(scrapy.Item):
    """Item describing one shop: its category, name and url."""

    # Each incoming category value goes through keep_good_categories.
    category = scrapy.Field(
        input_processor=MapCompose(keep_good_categories),
    )
    name = scrapy.Field()
    # serializer=str: the exported url is the str() of the stored value,
    # which is why a list shows up as "['https://...']" in the output.
    url = scrapy.Field(serializer=str)
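不要以这种方式重复使用同一个 item:循环中创建的每个 ItemLoader 都包装了同一个 item 实例,因此 'name' 和 'url' 的值会在各个商店之间不断累积。
只需把类别名称(而不是 ItemLoader 加载得到的 item 对象)从 `parse` 传递到 `parse_shop_in_object`,并为每个商店新建一个 item: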
def parse(self, response):
    """Follow each category link of the home page (e.g. alimentation, animaux, art & culture...).

    Only the category name is forwarded, through the request ``meta`` under
    the key ``'category_name'``, so that ``parse_shop_in_object`` can build
    a fresh item for every shop.
    """
    for category in response.css("a.et_pb_button.et_pb_bg_layout_light"):
        shop_page_link = category.css('::attr(href)').get()
        if shop_page_link is None:
            # response.follow() raises ValueError for a None url, which
            # would abort this generator and drop the remaining categories.
            continue
        category_name = category.css('::text').get()
        yield response.follow(
            shop_page_link,
            self.parse_shop_in_object,
            meta={'category_name': category_name},
        )
def parse_shop_in_object(self, response):
    """Yield one freshly built item per ``h2.entry-title`` element of the page.

    The category name is read from ``response.meta['category_name']``. A
    new ``LeseshopsbelgesItem`` is created for every shop, so values never
    leak from one shop into the next.

    Nothing is yielded when the category is missing or is discarded by the
    ``category`` field's input processor.
    """
    category_name = response.meta['category_name']
    for shop in response.css("h2.entry-title"):
        loader = ItemLoader(item=LeseshopsbelgesItem(), selector=shop)
        loader.add_value('category', category_name)
        if not loader.get_collected_values('category'):
            # No category survived input processing; the outcome is the
            # same for every shop of this page, so stop here.
            return
        loader.add_css('name', 'a::text')
        loader.add_css('url', 'a::attr(href)')
        yield loader.load_item()
用例
我正在抓取一个网站 https://www.leseshopsbelges.be,它是电子商店的目录
我想建立第一个商店列表,包括:
- 商店在“leseshopsbelges.be”网站上的 url
- 网站名称
- 网站类别
问题
当我运行 scrapy 时,每条输出记录都包含之前已存储的所有 'name' 和 'url':
[
{"category": ["Animaux"], "name": ["Sherkane"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/']"},
{"category": ["Animaux"], "name": ["Sherkane", "Little Joe\u2019s Factory", "AnimaStyle", "Colonel Gustave"], "url": "['https://www.leseshopsbelges.be/shop/sherkane/', 'https://www.leseshopsbelges.be/shop/little-joes-factory/', 'https://www.leseshopsbelges.be/shop/animastyle/', 'https://www.leseshopsbelges.be/shop/colonel-gustave/']"},
{"category": ["Alimentation"], "name": ["Hobbyscuit"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/']"},
{"category": ["Alimentation"], "name": ["Hobbyscuit", "Authentique Store"], "url": "['https://www.leseshopsbelges.be/shop/hobbyscuit/', 'https://www.leseshopsbelges.be/shop/authentique-store/']"}
]
源代码:home_page_categories_spider_item.py
from ..items import LeseshopsbelgesItem
import logging
import scrapy
import os
from scrapy.loader import ItemLoader
from parsel import Selector
from pathlib import Path
# Name of the directory created below (a relative path).
output_directory = 'output'
# Runs at import time: create the directory together with any missing
# parents; exist_ok=True makes this a no-op when it already exists.
Path(output_directory).mkdir(parents=True, exist_ok=True)
# Module-level logger, configured to emit INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class HomePageCategoriesSpider(scrapy.Spider):
    """Collect the shops listed under each category of https://www.leseshopsbelges.be/."""

    name = "home_page_categories_item"

    def start_requests(self):
        """Yield the initial request for the home page, handled by ``parse``."""
        home_pages = ('https://www.leseshopsbelges.be/',)
        yield from (
            scrapy.Request(url=page, callback=self.parse)
            for page in home_pages
        )

    def parse(self, response):
        """Follow every category button found on the home page.

        For each button a ``LeseshopsbelgesItem`` holding only the category
        is loaded and handed to ``parse_shop_in_object`` through the request
        ``meta`` under the key ``'shop_item'``.
        """
        for button in response.css("a.et_pb_button.et_pb_bg_layout_light"):
            category_loader = ItemLoader(
                item=LeseshopsbelgesItem(),
                selector=button,
                response=response,
            )
            category_loader.add_css('category', "::text")
            category_item = category_loader.load_item()

            # href of the category button; its response is processed by
            # parse_shop_in_object.
            target = button.css('::attr(href)').get()
            yield response.follow(
                target,
                callback=self.parse_shop_in_object,
                meta={'shop_item': category_item},
            )

    def parse_shop_in_object(self, response):
        """Yield one loaded item per ``h2.entry-title`` element of the page.

        Nothing is yielded when the item received through ``response.meta``
        has no populated field.

        NOTE: every loader created below wraps that one received item, so
        the 'name' and 'url' values accumulate on it from one shop to the
        next, and each yielded result also carries the earlier shops.
        """
        shop_item = response.meta['shop_item']
        self.log(f'shop_item: {shop_item}')
        # Guard clause; assumes the loop was nested under the original
        # ``if len(shop_item) > 0`` (source indentation was lost).
        if len(shop_item) == 0:
            return
        for heading in response.css("h2.entry-title"):
            shop_loader = ItemLoader(item=shop_item, selector=heading)
            shop_loader.add_css('name', 'a::text')
            shop_loader.add_css('url', 'a::attr(href)')
            yield shop_loader.load_item()
源代码:items.py
import scrapy
from itemloaders.processors import MapCompose, TakeFirst
# Button labels that share the category selector but are not categories.
THIS_IS_NOT_A_CATEGORY = [
    'Je veux référencer mon e-shop',
    'Référencer un e-shop',
    'Découvrez notre spot !',
]


def keep_good_categories(text):
    """Return the category label of *text*, or None for a non-category label.

    Only the part before the first ``' | '`` separator is kept; the result
    is None when that part is listed in ``THIS_IS_NOT_A_CATEGORY``.
    """
    label, _, _ = text.partition(' | ')
    return None if label in THIS_IS_NOT_A_CATEGORY else label
class LeseshopsbelgesItem(scrapy.Item):
    """Item describing one shop: its category, name and url."""

    # Each incoming category value goes through keep_good_categories.
    category = scrapy.Field(
        input_processor=MapCompose(keep_good_categories),
    )
    name = scrapy.Field()
    # serializer=str: the exported url is the str() of the stored value,
    # which is why a list shows up as "['https://...']" in the output.
    url = scrapy.Field(serializer=str)
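不要以这种方式重复使用同一个 item:循环中创建的每个 ItemLoader 都包装了同一个 item 实例,因此 'name' 和 'url' 的值会在各个商店之间不断累积。
只需把类别名称(而不是 ItemLoader 加载得到的 item 对象)从 `parse` 传递到 `parse_shop_in_object`,并为每个商店新建一个 item: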
def parse(self, response):
    """Follow each category link of the home page (e.g. alimentation, animaux, art & culture...).

    Only the category name is forwarded, through the request ``meta`` under
    the key ``'category_name'``, so that ``parse_shop_in_object`` can build
    a fresh item for every shop.
    """
    for category in response.css("a.et_pb_button.et_pb_bg_layout_light"):
        shop_page_link = category.css('::attr(href)').get()
        if shop_page_link is None:
            # response.follow() raises ValueError for a None url, which
            # would abort this generator and drop the remaining categories.
            continue
        category_name = category.css('::text').get()
        yield response.follow(
            shop_page_link,
            self.parse_shop_in_object,
            meta={'category_name': category_name},
        )
def parse_shop_in_object(self, response):
    """Yield one freshly built item per ``h2.entry-title`` element of the page.

    The category name is read from ``response.meta['category_name']``. A
    new ``LeseshopsbelgesItem`` is created for every shop, so values never
    leak from one shop into the next.

    Nothing is yielded when the category is missing or is discarded by the
    ``category`` field's input processor.
    """
    category_name = response.meta['category_name']
    for shop in response.css("h2.entry-title"):
        loader = ItemLoader(item=LeseshopsbelgesItem(), selector=shop)
        loader.add_value('category', category_name)
        if not loader.get_collected_values('category'):
            # No category survived input processing; the outcome is the
            # same for every shop of this page, so stop here.
            return
        loader.add_css('name', 'a::text')
        loader.add_css('url', 'a::attr(href)')
        yield loader.load_item()