Scrapy 只从每页的第一项收集信息,为什么?
Scrapy collecting information only from the first item on each page, why?
我有以下蜘蛛,但它只收集每个页面上的第一个项目。
谁能给我解释一下为什么?我找不到我的错误。
import scrapy
class PerfumesSpider(scrapy.Spider):
name = 'perfumes'
allowed_domains = ['www.fragrancenet.com']
start_urls = ['https://www.fragrancenet.com/fragrances']
def parse(self, response):
for perfumes in response.xpath("//div[@id='resultSet']"):
#nome = perfumes.xpath(".//span[@class='brand-name']/text()").get(),
link = perfumes.xpath(".//p[@class='desc']/a/@href").get()
yield response.follow(url=link, callback=self.parse_produto, meta={'link' : link})
next_page = response.xpath("//a[@data-rel='next']/@href").get()
if next_page:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_produto(self, response):
link = response.request.meta['link']
for produto in response.xpath("//div[@class='topZone cf']"):
yield{
'link': link,
'gender': produto.xpath(".//span[@class='genderBar desktop']/span/span/text()").get(),
'name': produto.xpath(".//span[@class='productTitle']/text()").get(),
'year': produto.xpath(".//ul[@class='notes cf']/li[3]/span[2]/text()").get(),
'brand': produto.xpath("normalize-space(.//p[@class='uDesigner']/a/text())").get(),
'size': produto.xpath(".//span[@class='sr-only']/text()").getall(),
'price': produto.xpath(".//div[@class='pricing']/text()").getall(),
'discount': produto.xpath(".//div[@class='fnet-offer']/a/span/span/text()").get(),
}
如果有人能帮助我,我将不胜感激
您需要 select div 的内部结果集,尝试将其更改为:
for perfumes in response.xpath("//div[@id='resultSet']"):
像这样(我不确定 xpath 代码,你必须仔细检查):
for perfumes in response.xpath("//div[@id='resultSet']//div[@class='resultItem']"):
我有以下蜘蛛,但它只收集每个页面上的第一个项目。
谁能给我解释一下为什么?我找不到我的错误。
import scrapy
class PerfumesSpider(scrapy.Spider):
name = 'perfumes'
allowed_domains = ['www.fragrancenet.com']
start_urls = ['https://www.fragrancenet.com/fragrances']
def parse(self, response):
for perfumes in response.xpath("//div[@id='resultSet']"):
#nome = perfumes.xpath(".//span[@class='brand-name']/text()").get(),
link = perfumes.xpath(".//p[@class='desc']/a/@href").get()
yield response.follow(url=link, callback=self.parse_produto, meta={'link' : link})
next_page = response.xpath("//a[@data-rel='next']/@href").get()
if next_page:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_produto(self, response):
link = response.request.meta['link']
for produto in response.xpath("//div[@class='topZone cf']"):
yield{
'link': link,
'gender': produto.xpath(".//span[@class='genderBar desktop']/span/span/text()").get(),
'name': produto.xpath(".//span[@class='productTitle']/text()").get(),
'year': produto.xpath(".//ul[@class='notes cf']/li[3]/span[2]/text()").get(),
'brand': produto.xpath("normalize-space(.//p[@class='uDesigner']/a/text())").get(),
'size': produto.xpath(".//span[@class='sr-only']/text()").getall(),
'price': produto.xpath(".//div[@class='pricing']/text()").getall(),
'discount': produto.xpath(".//div[@class='fnet-offer']/a/span/span/text()").get(),
}
如果有人能帮助我,我将不胜感激
您需要 select div 的内部结果集,尝试将其更改为:
for perfumes in response.xpath("//div[@id='resultSet']"):
像这样(我不确定 xpath 代码,你必须仔细检查):
for perfumes in response.xpath("//div[@id='resultSet']//div[@class='resultItem']"):