用 scrapy 解释回调和 cb_kwargs

Interpreting callbacks and cb_kwargs with scrapy

我即将达到 scrapy 的一个个人里程碑。目的是正确理解 callback 和 cb_kwargs。我已经无数次阅读文档,但我通过查看代码、动手实践和讲解学得最好。

我有一个示例爬虫,目的是抓取书名、价格并进入每个书页并提取单条信息。我也在尝试了解如何在接下来的几页中正确获取信息,我知道这取决于对回调操作的理解。

当我运行我的脚本时,它只为第一页返回结果。我如何获得其余页面的结果?

这是我的刮板:

# Item container for one scraped book.
# NOTE(review): `Field`, `TakeFirst` (and `scrapy`) are used without visible
# imports in this snippet — presumably imported elsewhere in the real file.
class BooksItem(scrapy.Item):
    # TakeFirst() collapses each field's extracted list to its first value.
    items = Field(output_processor = TakeFirst())
    price = Field(output_processor = TakeFirst())
    availability = Field(output_processor = TakeFirst())

# Spider for books.toscrape.com: collects title/price from listing pages and
# availability from each book's detail page.
class BookSpider(scrapy.Spider):
    name = "books"
    start_urls = ['https://books.toscrape.com']

    # NOTE(review): Scrapy looks for `start_requests` (plural); this method
    # named `start_request` is never invoked by the framework.  It also reads
    # `self.start_url`, which does not exist — the attribute above is
    # `start_urls`.  The spider still works only because Scrapy's built-in
    # default start_requests() consumes `start_urls` itself.
    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url, 
                callback = self.parse)

    def parse(self, response):
        # NOTE(review): this xpath selects the single page-level container
        # div, not the individual book cards — so the loop below runs exactly
        # once per page instead of once per book.
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
        for books in data:
            loader = ItemLoader(BooksItem(), selector = books)
            loader.add_xpath('items','.//article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price','.//p[@class="price_color"]//text()')
            
            # NOTE(review): `.get()` returns only the FIRST matching href in
            # the container, so only one detail page is ever requested here.
            # The one-element-list wrapper makes the for loop redundant.
            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback = self.parse_book,
                    # The partially-filled loader travels to parse_book as a
                    # keyword argument via cb_kwargs.
                    cb_kwargs = {'loader':loader})

        # Pagination: follow the "next" link, re-entering parse() for the
        # next listing page.  The for-over-one-element-list is unnecessary.
        for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)


    # Receives the ItemLoader built in parse() (via cb_kwargs) and completes
    # the item with data scraped from the book's detail page.
    def parse_book(self, response, loader):
        # NOTE(review): on this site the first text node of the availability
        # <p> is whitespace (the <i> icon precedes the text), so .get() here
        # yields "\n        " rather than the visible message.
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        

        loader.add_value('availability', book_quote)
        yield loader.load_item()

我认为问题出在我尝试抓取接下来几页的部分。我尝试了使用以下方法的替代方法:

# Second attempt: threading a page counter through cb_kwargs.
# NOTE(review): the request passes cb_kwargs={'page_count': 0}, but `parse`
# below declares its extra parameter as `next_page` while the body reads
# `page_count` — the parameter name must match the cb_kwargs key, which is
# exactly what the TypeError quoted below is complaining about.
# (Indentation here is as pasted by the asker; the snippet is illustrative,
# not runnable — note the `...` placeholders.)
def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url, 
                callback = self.parse,
                cb_kwargs = {'page_count':0}
)

def parse(self, response, next_page):
    # Intended to stop crawling after three pages.
    if page_count > 3:
        return
...
...
    page_count += 1    
    # Propagate the incremented counter to the next page's parse() call.
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})

但是,我用这种方法得到了以下错误:

TypeError: parse() missing 1 required positional argument: 'page_cntr'

  1. 应该是 start_requests(复数),函数内部应该是 self.start_urls。

  2. get() 只会返回第一个结果;你想要的是用 getall() 返回一个列表。

  3. “next_page”部分不需要 for 循环,这不是错误,只是不必要。

  4. 在 for url in books.xpath 这一行中,你会把每个 url 取到两次;同样不是错误,但仍然……

  5. 这里 data = response.xpath('//div[@class = "col-sm-8 col-md-9"]') 你并没有逐本选中书籍,而是选中了整个书籍容器;你可以用 len(data.getall()) == 1 来验证这一点。

  6. book_quote = response.xpath('//p[@class="instock availability"]//text()').get() 将返回 \n,查看页面源代码以找出原因(提示:'i' 标签)。

将您的代码与此进行比较,看看我更改了什么:

import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class BooksItem(scrapy.Item):
    # One scraped book.  TakeFirst() collapses each field's extracted list
    # of values down to its first element.
    items = Field(output_processor=TakeFirst())         # book title
    price = Field(output_processor=TakeFirst())         # listed price text
    availability = Field(output_processor=TakeFirst())  # stock message from the detail page


class BookSpider(scrapy.Spider):
    """Crawl books.toscrape.com: title and price from each listing page,
    availability from each book's detail page, following pagination."""

    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        # Written out explicitly for the question's sake; Scrapy's default
        # start_requests() would do exactly this with start_urls.
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        """Build a partial item per book, request its detail page, and
        follow the "next" pagination link back into this callback."""
        # //li selects the individual book cards inside the container div
        # (the container itself matches only once per page).
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            # h3/a holds the single link to the book's detail page; using
            # './/a' would also match the cover-image link and duplicate it.
            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    # Hand the partially-filled loader to the detail-page
                    # callback as a keyword argument.
                    cb_kwargs={'loader': loader})

        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        """Complete the item started in parse() with the availability text
        scraped from the book's detail page."""
        # option 1: the text node following the <i> icon holds the visible
        # availability message.  Guard against a missing match — .get()
        # returns None when the xpath matches nothing, and None.strip()
        # would raise AttributeError.
        book_quote = response.xpath('//p[@class="instock availability"]/i/following-sibling::text()').get()
        if book_quote is not None:
            loader.add_value('availability', book_quote.strip())

        # option 2 (equivalent): join all text nodes inside the <p>, then strip:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        yield loader.load_item()