Scrapy 和下一页
Scrapy and next pages
我有以下代码:
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Item holding the data scraped for one listing on the results page."""

    product_id = scrapy.Field()     # anchor @data-lurker_list_id
    url = scrapy.Field()            # anchor @href
    title = scrapy.Field()          # anchor @title
    price = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()          # declared, but not filled by the spider shown here
    post_date = scrapy.Field()      # declared, but not filled by the spider shown here
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()      # @src of the listing's <img>
    is_featured = scrapy.Field()    # anchor @data-lurker_is_featured
    list_position = scrapy.Field()  # anchor @data-lurker_list_position
class TigerOfferSpider(scrapy.Spider):
    """Spider from the question: scrape the listings returned by the start URL.

    This is the version the question asks about, so the settings that the
    discussion below identifies as problematic are intentionally left as is.
    """

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
        # The answer below names this item limit as the reason the
        # next-page request is never processed.
        'CLOSESPIDER_ITEMCOUNT': 30
    }
    # NOTE(review): this entry carries the URL scheme; the corrected version
    # below uses the bare domain 'sp.olx.com.br' instead.
    allowed_domains = ['https://sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield a TigerOffer for each listing row, then request the next page.

        Args:
            response: the downloaded results page.

        Yields:
            The populated item once per ``<li>`` in the results list, followed
            by a ``scrapy.Request`` for the next page when a link is found.
        """
        offerItem = TigerOffer()
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offerList:
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()
            yield offerItem
        # Positional XPath to the pagination link; it depends on the page's
        # current layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
问题是,我无法转到下一页。
变量 next_page_url
有值,并且不是 None。
我一直在寻找一些答案,但找不到任何答案。
有谁知道怎么弄的吗?
非常感谢。
主要问题
无法导航到下一页的根本原因是代码中的 'CLOSESPIDER_ITEMCOUNT': 30。
说明
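在代码中加入 'CLOSESPIDER_ITEMCOUNT': 30 这一设置后,当已抓取的项目数达到 30 时,Scrapy 会向蜘蛛发出关闭信号。由于 Scrapy 是异步运行的,它会把已经在处理中的请求和项目处理完,但不会再处理此后新产生的请求。因此,蜘蛛一旦开始关闭,yield scrapy.Request(response.urljoin(next_page_url)) 产生的下一页请求就不会被执行。
另外,allowed_domains 只应填写域名(例如 'sp.olx.com.br'),不应带有 'https://' 前缀;下面修改后的代码也一并做了这一更正。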
修改后的代码
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Item holding the data scraped for one listing on the results page."""

    product_id = scrapy.Field()     # anchor @data-lurker_list_id
    url = scrapy.Field()            # anchor @href
    title = scrapy.Field()          # anchor @title
    price = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()          # declared, but not filled by the spider shown here
    post_date = scrapy.Field()      # declared, but not filled by the spider shown here
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()      # @src of the listing's <img>
    is_featured = scrapy.Field()    # anchor @data-lurker_is_featured
    list_position = scrapy.Field()  # anchor @data-lurker_list_position
class TigerOfferSpider(scrapy.Spider):
    """Scrape the listings returned by the start URL and follow pagination."""

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    }
    # Bare domain name only (no URL scheme).
    allowed_domains = ['sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield one TigerOffer per listing row, then request the next page.

        Args:
            response: the downloaded results page.

        Yields:
            A new ``TigerOffer`` for every ``<li>`` in the results list,
            followed by a ``scrapy.Request`` for the next page (handled by
            this same method) when a pagination link is found.
        """
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offerList:
            # Build a fresh item per listing: re-yielding one shared, mutated
            # item would let later rows overwrite data that earlier yields
            # still reference.
            offerItem = TigerOffer()
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()
            yield offerItem

        # Positional XPath to the pagination link; it depends on the page's
        # current layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').get()
        if next_page_url is not None:
            # urljoin leaves an absolute href unchanged and resolves a
            # relative one against the current page, so Request always
            # receives a URL with a scheme.
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)
我有以下代码:
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Item holding the data scraped for one listing on the results page."""

    product_id = scrapy.Field()     # anchor @data-lurker_list_id
    url = scrapy.Field()            # anchor @href
    title = scrapy.Field()          # anchor @title
    price = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()          # declared, but not filled by the spider shown here
    post_date = scrapy.Field()      # declared, but not filled by the spider shown here
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()      # @src of the listing's <img>
    is_featured = scrapy.Field()    # anchor @data-lurker_is_featured
    list_position = scrapy.Field()  # anchor @data-lurker_list_position
class TigerOfferSpider(scrapy.Spider):
    """Spider from the question: scrape the listings returned by the start URL.

    This is the version the question asks about, so the settings that the
    discussion below identifies as problematic are intentionally left as is.
    """

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
        # The answer below names this item limit as the reason the
        # next-page request is never processed.
        'CLOSESPIDER_ITEMCOUNT': 30
    }
    # NOTE(review): this entry carries the URL scheme; the corrected version
    # below uses the bare domain 'sp.olx.com.br' instead.
    allowed_domains = ['https://sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield a TigerOffer for each listing row, then request the next page.

        Args:
            response: the downloaded results page.

        Yields:
            The populated item once per ``<li>`` in the results list, followed
            by a ``scrapy.Request`` for the next page when a link is found.
        """
        offerItem = TigerOffer()
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offerList:
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()
            yield offerItem
        # Positional XPath to the pagination link; it depends on the page's
        # current layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
问题是,我无法转到下一页。
变量 next_page_url
有值,并且不是 None。
我一直在寻找一些答案,但找不到任何答案。
有谁知道怎么弄的吗?
非常感谢。
主要问题
无法导航到下一页的根本原因是代码中的 'CLOSESPIDER_ITEMCOUNT': 30。
说明
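在代码中加入 'CLOSESPIDER_ITEMCOUNT': 30 这一设置后,当已抓取的项目数达到 30 时,Scrapy 会向蜘蛛发出关闭信号。由于 Scrapy 是异步运行的,它会把已经在处理中的请求和项目处理完,但不会再处理此后新产生的请求。因此,蜘蛛一旦开始关闭,yield scrapy.Request(response.urljoin(next_page_url)) 产生的下一页请求就不会被执行。
另外,allowed_domains 只应填写域名(例如 'sp.olx.com.br'),不应带有 'https://' 前缀;下面修改后的代码也一并做了这一更正。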
修改后的代码
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Item holding the data scraped for one listing on the results page."""

    product_id = scrapy.Field()     # anchor @data-lurker_list_id
    url = scrapy.Field()            # anchor @href
    title = scrapy.Field()          # anchor @title
    price = scrapy.Field()
    city = scrapy.Field()
    state = scrapy.Field()          # declared, but not filled by the spider shown here
    post_date = scrapy.Field()      # declared, but not filled by the spider shown here
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()      # @src of the listing's <img>
    is_featured = scrapy.Field()    # anchor @data-lurker_is_featured
    list_position = scrapy.Field()  # anchor @data-lurker_list_position
class TigerOfferSpider(scrapy.Spider):
    """Scrape the listings returned by the start URL and follow pagination."""

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    }
    # Bare domain name only (no URL scheme).
    allowed_domains = ['sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield one TigerOffer per listing row, then request the next page.

        Args:
            response: the downloaded results page.

        Yields:
            A new ``TigerOffer`` for every ``<li>`` in the results list,
            followed by a ``scrapy.Request`` for the next page (handled by
            this same method) when a pagination link is found.
        """
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offerList:
            # Build a fresh item per listing: re-yielding one shared, mutated
            # item would let later rows overwrite data that earlier yields
            # still reference.
            offerItem = TigerOffer()
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()
            yield offerItem

        # Positional XPath to the pagination link; it depends on the page's
        # current layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').get()
        if next_page_url is not None:
            # urljoin leaves an absolute href unchanged and resolves a
            # relative one against the current page, so Request always
            # receives a URL with a scheme.
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)