Scrapy spider 只抓取了 2 页

Scrapy spider scrape only 2 pages

当我运行这段代码时,蜘蛛只抓取了 2 个页面就停止了,不会继续转到下一页。

我尝试了多种不同的修改方法,但始终无法进入第三页。

# -*- coding: utf-8 -*-
import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    """Spider that walks vivareal.com.br listing pages starting at ``start_urls``.

    ``parse`` emits the items found on a page and then requests the page
    referenced by the "Próxima página" link, if one is present.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield this page's items, then a request for the next page.

        The next-page link is looked up before the items are emitted; when no
        such link exists the generator simply ends after the items.
        """
        next_link = response.xpath('//a[@title="Próxima página"]/@href')
        yield from self.scrape(response)

        if not next_link:
            return

        next_url = response.urljoin(next_link.extract_first())
        print("Found url: {}".format(next_url))
        yield scrapy.Request(next_url, callback=self.parse)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card on the page.

        Only the ``description`` field is filled in; the remaining fields are
        still commented out below.
        """
        title_xpath = './/h2/span[@class="property-card__title js-cardLink js-card-title"]/text()'
        cards = response.xpath('//article[@class="property-card__container js-property-card"]/..')

        for card in cards:
            listing = RealstatedataItem()
            listing['description'] = card.xpath(title_xpath).extract_first()

            # Fields not extracted yet:
            #listing['address'] = card.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #listing['prop_area'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #listing['prop_rooms'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #listing['prop_bath'] = card.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #listing['prop_parking'] = card.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #listing['price_rent'] = card.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #listing['price_cond'] = card.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #listing['realstate_name'] = card.xpath('.//picture/img/@alt').extract_first()

            yield listing

将 'path' 中的“#”替换为“?”(请注意,“下一页”按钮不起作用):

import scrapy
from realstatedata.items import RealstatedataItem

class RsdataSpider(scrapy.Spider):
    """Crawl vivareal.com.br listing pages, following the "Próxima página" link.

    The next-page link on the page is a fragment reference such as
    ``#pagina=2``; ``parse`` turns it into a query string (``?pagina=2``)
    before requesting it.
    """

    name = 'rsdata'
    allowed_domains = ['vivareal.com.br']
    # NOTE(review): '#preco-ate=2000' is a URL fragment as well; presumably it
    # is not sent to the server, so confirm the price filter is really applied.
    start_urls = ['https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/#preco-ate=2000']

    def parse(self, response):
        """Yield the items of this page, then a request for the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            Every item produced by ``scrape``, followed by one
            ``scrapy.Request`` for the next page when a "Próxima página"
            link is found.
        """
        nextpageurl = response.xpath('//a[contains(@title,"Próxima página")]/@href')
        yield from self.scrape(response)

        if nextpageurl:
            path = nextpageurl.extract_first()
            # Got #pagina=2   =>    Replace with ?pagina=2
            # Only a leading '#' is rewritten; any other href form is joined
            # unchanged instead of losing its first character.
            if path.startswith('#'):
                path = '?' + path[1:]
            nextpage = response.urljoin(path)
            print("Found url: {}".format(nextpage))
            # No callback is passed, so Scrapy falls back to its default
            # callback (``parse``).
            yield scrapy.Request(nextpage)

    def scrape(self, response):
        """Yield one ``RealstatedataItem`` per property card on the page.

        Only ``description`` is populated; the other fields are still
        commented out below.
        """
        for resource in response.xpath('//article[@class="property-card__container js-property-card"]/..'):

            item = RealstatedataItem()

            item['description'] = resource.xpath('.//h2/span[@class="property-card__title js-cardLink js-card-title"]/text()').extract_first()
            #item['address'] = resource.xpath('.//span[@class="property-card__address"]/text()').extract_first()
            #item['prop_area'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area"]/text()').extract_first()
            #item['prop_rooms'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_bath'] = resource.xpath('.//span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['prop_parking'] = resource.xpath('.//ul/li[4]/span[@class="property-card__detail-value js-property-card-value"]/text()').extract_first()
            #item['price_rent'] = resource.xpath('.//p[@style="display: block;"]/text()').extract_first()
            #item['price_cond'] = resource.xpath('.//strong[@class="js-condo-rice"]/text()').extract_first()
            #item['realstate_name'] = resource.xpath('.//picture/img/@alt').extract_first()

            yield item

部分输出:

{'description': '  Apartamento com 2 Quartos para Aluguel, 82m²  '}
{'description': '  Apartamento com 4 Quartos à Venda/Aluguel 280m²  '}
{'description': '  Apartamento com 2 Quartos para Aluguel, 70m²  '}
{'description': '  Apartamento com 3 Quartos para Aluguel, 113m²  '}
{'description': '  Apartamento com 2 Quartos para Venda/Aluguel 50m²  '}
{'description': '  Apartamento com 2 Quartos para Venda/Aluguel 50m²  '}
Found url: https://www.vivareal.com.br/aluguel/sp/sao-jose-dos-campos/apartamento_residencial/?pagina=27