scrapy 的下一页问题 python (json)

Next page issues with scrapy python (json)

我正在尝试在 class 内从列表中逐个传入邮政编码,但效果不佳。start_urls 按预期包含 sa1、sa2、sa3,但传入 def 的始终只有最后一个 'sa3',因此 next_page 也只得到 'sa3'。

这是我的代码:

# Question code, kept verbatim. NOTE(review): this does not run as posted —
# "Class" is capitalized (the Python keyword is lowercase "class"), and the
# class-body for-loop below rebinds start_urls and redefines parse on every
# iteration, so only the last postcode ('sa3') survives.
Class OnthemarketSpider(scrapy.Spider):
    name = 'onthemarket'
    allowed_domains = ['onthemarket.com']

    # Tuple of three strings; the parentheses around each literal are no-ops.
    postcodes = ('sa1'), ('sa2'), ('sa3')
    for postcode in postcodes:


        # BUG: rebound (not appended) each iteration -> only the 'sa3' URL remains.
        start_urls = [f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid']

        # BUG: redefined each iteration; only the last definition is kept.
        def parse(self, response):
            data = json.loads(response.body)
            properties = data.get('properties')
            for property in properties:
                yield {
                    'id': property.get('id'),
                    'price': property.get('price'),
                    'title': property.get('property-title'),
                    'url': response.urljoin(property.get('property-link'))
                }

            pages = int(100 / 23)  # == 4
            # BUG: the class-body loop variable keeps only its final value,
            # so self.postcode is always 'sa3'.
            postcode = self.postcode

            for number in range(1, pages +1):
                next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
                yield scrapy.Request(next_page, callback=self.parse)

如果可能的话,我想达到这个结果。

This is start URL:  ['https://www.domainname-id=sa1&view=grid']
This is next page:  https://www.domainname-id=sa1&page=1&view=grid
This is next page:  https://www.domainname-id=sa1&page=2&view=grid
This is next page:  https://www.domainname-id=sa1&page=3&view=grid
This is start URL:  ['https://www.domainname-id=sa2&view=grid']
This is next page:  https://www.domainname-id=sa2&page=1&view=grid
This is next page:  https://www.domainname-id=sa2&page=2&view=grid
This is next page:  https://www.domainname-id=sa2&page=3&view=grid
This is start URL:  ['https://www.domainname-id=sa3&view=grid']
This is next page:  https://www.domainname-id=sa3&page=1&view=grid
This is next page:  https://www.domainname-id=sa3&page=2&view=grid
This is next page:  https://www.domainname-id=sa3&page=3&view=grid

感谢您的宝贵时间。

您在循环中反复创建并覆盖 start_urls 列表,因此最终只剩下最后一个 URL。应当改为向同一个列表追加:

# One start URL per postcode; a comprehension builds the whole list at
# once instead of appending inside an explicit loop, so nothing gets
# overwritten.
start_urls = [
    f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid'
    for postcode in postcodes
]

编辑:

完整代码:

import scrapy
import json


class OnthemarketSpider(scrapy.Spider):
    """Scrape for-sale listings for several postcodes.

    One start URL is queued per postcode; ``parse`` yields the items of a
    result page and then schedules that postcode's remaining pages.
    """

    name = 'onthemarket'
    allowed_domains = ['onthemarket.com']

    postcodes = ('sa1'), ('sa2'), ('sa3')
    start_urls = []

    for postcode in postcodes:
        start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')

    def parse(self, response):
        """Yield one item per property, then requests for the next pages.

        The postcode is recovered from ``response.url`` rather than from
        ``self.postcode``: the class-body loop variable keeps only its
        final value, so ``self.postcode`` would always be 'sa3' and every
        postcode would paginate with the wrong location-id.
        """
        data = json.loads(response.body)
        # ``or ()`` guards against a payload that has no 'properties' key;
        # ``prop`` avoids shadowing the builtin ``property``.
        for prop in data.get('properties') or ():
            yield {
                'id': prop.get('id'),
                'price': prop.get('price'),
                'title': prop.get('property-title'),
                'url': response.urljoin(prop.get('property-link'))
            }

        # Recover location-id from the URL that produced this response.
        query = response.url.split('?', 1)[-1]
        params = dict(pair.split('=', 1) for pair in query.split('&') if '=' in pair)
        postcode = params.get('location-id')

        pages = int(100 / 23)  # == 4; TODO confirm real page count against the site
        for number in range(1, pages + 1):
            next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
            # NOTE(review): re-scheduled pages are assumed to be dropped by
            # Scrapy's default request dedupe filter — confirm it is enabled.
            yield scrapy.Request(next_page, callback=self.parse)

编辑 2:

import scrapy
import json


# "Edit 2" — an intermediate version that is still buggy on purpose; the
# comments below mark the two remaining problems.
class OnthemarketSpider(scrapy.Spider):
    name = 'onthemarket'
    allowed_domains = ['onthemarket.com']

    postcodes = ('sa1'), ('sa2'), ('sa3')
    start_urls = []

    for postcode in postcodes:
        start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')

    def parse(self, response):
        data = json.loads(response.body)
        properties = data.get('properties')
        for property in properties:
            yield {
                'id': property.get('id'),
                'price': property.get('price'),
                'title': property.get('property-title'),
                'url': response.urljoin(property.get('property-link'))
            }

        # pages = int(100 / 23)
        pages = 4   # int(100/23) = 4
        # BUG: the class-body loop variable keeps only its final value.
        postcode = self.postcode    # always 'sa3'

        # BUG: once parse handles a scheduled page, response.url already ends
        # in "&page=N", so this appends a second page parameter
        # ("...&page=2&page=3"). Edit 3 fixes this with re.sub.
        for number in range(1, pages + 1):
            next_page = f'{response.url}&page={number}'
            yield scrapy.Request(next_page, callback=self.parse)

编辑 3:

import scrapy
import json
import re


class OnthemarketSpider(scrapy.Spider):
    """Final version: every start URL carries an explicit ``page=1`` so
    pagination can rewrite the page number in place with a regex, keeping
    each postcode's pages independent of the others."""

    name = 'onthemarket'
    allowed_domains = ['onthemarket.com']

    postcodes = ('sa1'), ('sa2'), ('sa3')
    start_urls = []

    for postcode in postcodes:
        start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid&page=1')

    def parse(self, response):
        """Yield one item per property, then schedule pages 1..4 of the
        same postcode by swapping the page parameter in ``response.url``."""
        data = json.loads(response.body)
        # ``or ()`` guards against a payload without a 'properties' key;
        # ``prop`` avoids shadowing the builtin ``property``.
        for prop in data.get('properties') or ():
            yield {
                'id': prop.get('id'),
                'price': prop.get('price'),
                'title': prop.get('property-title'),
                'url': response.urljoin(prop.get('property-link'))
            }

        pages = 4   # int(100/23) = 4; TODO confirm real page count against the site
        for number in range(1, pages + 1):
            # Replace the existing page parameter instead of appending one,
            # so the location-id of this response's URL is preserved.
            next_page = re.sub(r'page=\d+', f'page={number}', response.url)
            yield scrapy.Request(next_page, callback=self.parse)