scrapy 的下一页问题 python (json)
Next page issues with scrapy python (json)
我正在尝试从列表中输入邮政编码,但效果不佳(在 class 内)。 Start_urls 按预期采用 sa1、sa2、sa3,但仅在 def 中传递 'sa3'(最后一个),而 next_pages 仅获得 'sa3'.
这是我的代码:
Class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
for postcode in postcodes:
start_urls = [f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid']
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = int(100 / 23)
postcode = self.postcode
for number in range(1, pages +1):
next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
yield scrapy.Request(next_page, callback=self.parse)
如果可能的话,我想达到这个结果。
This is start URL: ['https://www.domainname-id=sa1&view=grid']
This is next page: https://www.domainname-id=sa1&page=1&view=grid
This is next page: https://www.domainname-id=sa1&page=2&view=grid
This is next page: https://www.domainname-id=sa1&page=3&view=grid
This is start URL: ['https://www.domainname-id=sa2&view=grid']
This is next page: https://www.domainname-id=sa2&page=1&view=grid
This is next page: https://www.domainname-id=sa2&page=2&view=grid
This is next page: https://www.domainname-id=sa2&page=3&view=grid
This is start URL: ['https://www.domainname-id=sa3&view=grid']
This is next page: https://www.domainname-id=sa3&page=1&view=grid
This is next page: https://www.domainname-id=sa3&page=2&view=grid
This is next page: https://www.domainname-id=sa3&page=3&view=grid
感谢您的宝贵时间。
您创建 start_urls
列表并一次又一次地覆盖它,因此您只得到最后一个 url。相反,您需要附加到它:
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
编辑:
完整代码:
import scrapy
import json
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = int(100 / 23)
postcode = self.postcode
for number in range(1, pages + 1):
next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
yield scrapy.Request(next_page, callback=self.parse)
编辑 2:
import scrapy
import json
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
# pages = int(100 / 23)
pages = 4 # int(100/23) = 4
postcode = self.postcode # always 'sa3'
for number in range(1, pages + 1):
next_page = f'{response.url}&page={number}'
yield scrapy.Request(next_page, callback=self.parse)
编辑 3:
import scrapy
import json
import re
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid&page=1')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = 4 # int(100/23) = 4
for number in range(1, pages + 1):
next_page = re.sub(r'page=\d+', f'page={number}', response.url)
yield scrapy.Request(next_page, callback=self.parse)
我正在尝试从列表中输入邮政编码,但效果不佳(在 class 内)。 Start_urls 按预期采用 sa1、sa2、sa3,但仅在 def 中传递 'sa3'(最后一个),而 next_pages 仅获得 'sa3'.
这是我的代码:
Class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
for postcode in postcodes:
start_urls = [f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid']
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = int(100 / 23)
postcode = self.postcode
for number in range(1, pages +1):
next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
yield scrapy.Request(next_page, callback=self.parse)
如果可能的话,我想达到这个结果。
This is start URL: ['https://www.domainname-id=sa1&view=grid']
This is next page: https://www.domainname-id=sa1&page=1&view=grid
This is next page: https://www.domainname-id=sa1&page=2&view=grid
This is next page: https://www.domainname-id=sa1&page=3&view=grid
This is start URL: ['https://www.domainname-id=sa2&view=grid']
This is next page: https://www.domainname-id=sa2&page=1&view=grid
This is next page: https://www.domainname-id=sa2&page=2&view=grid
This is next page: https://www.domainname-id=sa2&page=3&view=grid
This is start URL: ['https://www.domainname-id=sa3&view=grid']
This is next page: https://www.domainname-id=sa3&page=1&view=grid
This is next page: https://www.domainname-id=sa3&page=2&view=grid
This is next page: https://www.domainname-id=sa3&page=3&view=grid
感谢您的宝贵时间。
您创建 start_urls
列表并一次又一次地覆盖它,因此您只得到最后一个 url。相反,您需要附加到它:
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
编辑:
完整代码:
import scrapy
import json
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = int(100 / 23)
postcode = self.postcode
for number in range(1, pages + 1):
next_page = f"https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&page={number}&sort-field=keywords&under-offer=true&view=grid"
yield scrapy.Request(next_page, callback=self.parse)
编辑 2:
import scrapy
import json
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
# pages = int(100 / 23)
pages = 4 # int(100/23) = 4
postcode = self.postcode # always 'sa3'
for number in range(1, pages + 1):
next_page = f'{response.url}&page={number}'
yield scrapy.Request(next_page, callback=self.parse)
编辑 3:
import scrapy
import json
import re
class OnthemarketSpider(scrapy.Spider):
name = 'onthemarket'
allowed_domains = ['onthemarket.com']
postcodes = ('sa1'), ('sa2'), ('sa3')
start_urls = []
for postcode in postcodes:
start_urls.append(f'https://www.onthemarket.com/async/search/properties/?search-type=for-sale&location-id={postcode}&sort-field=keywords&under-offer=true&view=grid&page=1')
def parse(self, response):
data = json.loads(response.body)
properties = data.get('properties')
for property in properties:
yield {
'id': property.get('id'),
'price': property.get('price'),
'title': property.get('property-title'),
'url': response.urljoin(property.get('property-link'))
}
pages = 4 # int(100/23) = 4
for number in range(1, pages + 1):
next_page = re.sub(r'page=\d+', f'page={number}', response.url)
yield scrapy.Request(next_page, callback=self.parse)