Scrapy 和下一页

Question

我有以下代码：

import scrapy
from datetime import datetime, timedelta

class TigerOffer(scrapy.Item):
    """Item holding the data extracted for one listing on the search page."""

    # Value of the listing anchor's data-lurker_list_id attribute.
    product_id = scrapy.Field()
    # href attribute of the listing anchor.
    url = scrapy.Field()
    # title attribute of the listing anchor.
    title = scrapy.Field()
    # Text of a <span> inside the listing anchor (presumably the price label).
    price = scrapy.Field()
    # title attribute of a <span> inside the listing anchor.
    city = scrapy.Field()
    # Declared, but never assigned by the spider's parse() in this file.
    state = scrapy.Field()
    # Declared, but never assigned by the spider's parse() in this file.
    post_date = scrapy.Field()
    # Text of a <p> inside the listing anchor.
    post_time = scrapy.Field()
    # src attribute of the <img> inside the listing anchor's thumbnail <div>.
    thumb_url = scrapy.Field()
    # Value of the listing anchor's data-lurker_is_featured attribute.
    is_featured = scrapy.Field()
    # Value of the listing anchor's data-lurker_list_position attribute.
    list_position = scrapy.Field()

class TigerOfferSpider(scrapy.Spider):
    """Spider for the sp.olx.com.br search results matching "tiger 800".

    This is the version from the question: it yields items for the first
    page but, as reported by the author, does not advance to the next page.
    """

    name = 'tigeroffs'
    # Per-spider settings: a desktop Chromium user agent plus an item limit.
    custom_settings = {
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',    
    # Asks Scrapy to close the spider once 30 items have been scraped.
    'CLOSESPIDER_ITEMCOUNT': 30  
    }

    # NOTE(review): this entry is a full URL including the scheme, while
    # Scrapy documents allowed_domains as a list of bare domain names --
    # confirm whether the offsite filter drops follow-up requests because
    # of it.
    allowed_domains = ['https://sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield a TigerOffer per <li> of the results list, then a request
        for the next page when a link for it is found."""
        # NOTE(review): a single item instance is created here and is
        # re-populated and re-yielded on every loop iteration.
        offerItem = TigerOffer()
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        
        for offer in offerList: 
            # Every field is read from (or from below) the <a> element that
            # is a direct child of the <li>; .get() returns None on no match.
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()

            yield offerItem

        # Positional XPath to the pagination link's href; it depends on the
        # exact nesting of <div>s under #listing-main-content-slot.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').extract_first()
        if next_page_url is not None:
            # urljoin resolves the href against the current page URL. No
            # callback is passed (Scrapy then falls back to parse).
            yield scrapy.Request(response.urljoin(next_page_url))

问题是，我无法跳转到下一页。变量 next_page_url 有值，并且不是 None。我一直在寻找答案，但什么也没找到。

有谁知道怎么弄的吗？

非常感谢。

Answer 1

主要问题

无法导航到下一页的根本原因是代码中的 'CLOSESPIDER_ITEMCOUNT': 30。

说明

在代码中加入 'CLOSESPIDER_ITEMCOUNT': 30 这一行，相当于告诉 Scrapy：当已抓取的 item 数量达到 30 时就关闭爬虫。由于 Scrapy 是异步运行的，它会把已经在处理中的 item 抓取完，但不会再处理之后的任何新请求。因此，爬虫关闭之后，yield scrapy.Request(response.urljoin(next_page_url)) 所产生的请求就不会被执行。

修改后的代码（除了去掉 'CLOSESPIDER_ITEMCOUNT': 30 之外，还把 allowed_domains 改成了不带协议前缀的域名 'sp.olx.com.br'）

import scrapy
from datetime import datetime, timedelta

class TigerOffer(scrapy.Item):
    """Item holding the data extracted for one listing on the search page."""

    # Value of the listing anchor's data-lurker_list_id attribute.
    product_id = scrapy.Field()
    # href attribute of the listing anchor.
    url = scrapy.Field()
    # title attribute of the listing anchor.
    title = scrapy.Field()
    # Text of a <span> inside the listing anchor (presumably the price label).
    price = scrapy.Field()
    # title attribute of a <span> inside the listing anchor.
    city = scrapy.Field()
    # Declared, but never assigned by the spider's parse() in this file.
    state = scrapy.Field()
    # Declared, but never assigned by the spider's parse() in this file.
    post_date = scrapy.Field()
    # Text of a <p> inside the listing anchor.
    post_time = scrapy.Field()
    # src attribute of the <img> inside the listing anchor's thumbnail <div>.
    thumb_url = scrapy.Field()
    # Value of the listing anchor's data-lurker_is_featured attribute.
    is_featured = scrapy.Field()
    # Value of the listing anchor's data-lurker_list_position attribute.
    list_position = scrapy.Field()

class TigerOfferSpider(scrapy.Spider):
    """Crawl the sp.olx.com.br search results for "tiger 800", page by page.

    Every ``<li>`` of the results list is turned into a ``TigerOffer``. Once
    a page has been consumed, its "next page" link (when one is found) is
    queued with ``parse`` as the callback, so the crawl continues until no
    further link is present.
    """

    name = 'tigeroffs'
    # CLOSESPIDER_ITEMCOUNT is deliberately not set: it closes the spider
    # after the given number of items, which cuts pagination short.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    }

    # Bare domain names only; Scrapy does not accept URLs in this list.
    allowed_domains = ['sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Extract the offers of one results page and follow pagination.

        Args:
            response: the downloaded search-results page.

        Yields:
            One ``TigerOffer`` per ``<li>`` of the results list (fields are
            ``None`` when the corresponding node is missing), followed by a
            ``scrapy.Request`` for the next page if a link to it is found.
        """
        for offer in response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li"):
            # All fields live on, or below, the same anchor element, so it
            # is selected once instead of once per field.
            link = offer.xpath("a[@class='fnmrjs-0 fyjObc']")

            # Build a fresh item for every offer. Re-using one instance
            # would hand the same mutable object to the engine repeatedly,
            # so later iterations could overwrite items already yielded.
            item = TigerOffer()
            item["product_id"] = link.xpath("@data-lurker_list_id").get()
            item["url"] = link.xpath("@href").get()
            item["title"] = link.xpath("@title").get()
            item["price"] = link.xpath(".//span[@class='sc-ifAKCX eoKYee']/text()").get()
            item["post_time"] = link.xpath(".//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            item["city"] = link.xpath(".//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            item["thumb_url"] = link.xpath(".//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            item["is_featured"] = link.xpath("@data-lurker_is_featured").get()
            item["list_position"] = link.xpath("@data-lurker_list_position").get()

            yield item

        # NOTE: this positional XPath depends on the exact <div> nesting of
        # the page and will stop matching if the layout changes.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').get()
        if next_page_url is not None:
            # The href may be relative; scrapy.Request needs an absolute
            # URL, so resolve it against the current page first.
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

Scrapy 和下一页

Scrapy and next pages

python

scrapy

web-scraping