如何使用 "if" 来处理两个或更多的 xpath?
How to use "if" to handle with two xpath or more?
我正在用下面的 Python 代码练习网页抓取。
但是其中一个数据有两个 xpath,我想知道是否有一种方法可以使用“if”条件来捕获这两个数据,但我不知道如何将其插入到我的代码中。谁能指导我?
例如,如果其中一个xpath为null,则肯定是另一个。不知道解释的好不好,如果我有a和b,如果a为null则b.
'vlr_atual'可分别为:
product.xpath(".//span[@id='priceblock_ourprice']/text()").get()
product.xpath(".//span[@id='priceblock_saleprice']/text()").get()
import scrapy
import datetime
class ProductsSpider(scrapy.Spider):
    """Spider that walks an Amazon BR listing and scrapes each product page.

    ``parse`` handles the listing pages (product links and pagination);
    ``parse_details`` handles a single product page and yields one dict per
    ``div#dp`` container found on it.
    """

    name = 'products'
    allowed_domains = ['www.amazon.com.br']
    start_urls = ['https://www.amazon.com.br/s?i=computers&bbn=16339926011&rh=n%3A16364756011&fs=true&qid=1615634908&ref=sr_pg_1']

    def parse(self, response):
        """Yield a request per product card, then a request for the next page.

        The badge text of each card (``selo``) is forwarded to
        ``parse_details`` through the request ``meta``.
        """
        for produto in response.xpath("//div[@class='a-section a-spacing-medium']"):
            selo = produto.xpath(".//span[@class='a-badge-text']/text()").get()
            href = produto.xpath(".//h2/a/@href").get()
            if href is None:
                # Joining a missing href with the base URL gives back the
                # listing URL itself, so skip cards without a product link.
                continue
            # response.follow resolves relative hrefs against the response URL.
            yield response.follow(url=href, callback=self.parse_details, meta={'selo': selo})

        # Test the raw href, not the joined URL: a joined URL is always truthy
        # (it falls back to the current page URL), so it cannot signal
        # "no next page".
        next_href = response.xpath("//li[@class='a-last']/a/@href").get()
        if next_href:
            yield response.follow(url=next_href, callback=self.parse)

    def parse_details(self, response):
        """Yield one dict of scraped fields per ``div#dp`` on a product page.

        ``vlr_atual`` is read from ``priceblock_ourprice`` and falls back to
        ``priceblock_saleprice`` when the first span is absent.
        """
        selo = response.request.meta['selo']
        for produto in response.xpath("//div[@id='dp']"):
            vlr_atual = produto.xpath(".//span[@id='priceblock_ourprice']/text()").get()
            if vlr_atual is None:
                # The price lives in one of two spans; try the second one.
                vlr_atual = produto.xpath(".//span[@id='priceblock_saleprice']/text()").get()
            yield {
                'data': datetime.datetime.now().strftime("%Y%m%d"),
                'selo': selo,
                'nome': produto.xpath("normalize-space(.//span[@id='productTitle']/text())").get(),
                'vlr_atual': vlr_atual,
                'estoque': produto.xpath("normalize-space(.//select[@name='quantity']/option[last()]/text())").get(),
                'ean': produto.xpath("normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())").get(),
            }
您可以使用 or 运算符在两个值之间进行选择,它会优先返回第一个值:
>>> a="www.example.com"
>>> b="www.example2.com"
>>> a or b
'www.example.com'
>>> a=None
>>> a or b
'www.example2.com'
>>>
其工作原理是:如果 a 的真值("truth" value)为真,则 a or b 返回 a,否则返回 b。
所以你可以做到
product.xpath (".//span[@id='priceblock_ourprice']/text()").get() or product.xpath (".//span[@id='priceblock_saleprice']/text()").get()
编辑
你也可以把它封装成自己的函数,像这样
def get_vlr_atual(product, default=None):
    """Return the first price text found on *product*, or *default*.

    Each XPath in the candidate list is tried in order; the first one whose
    ``.get()`` result is not ``None`` wins. To support another price location,
    append its XPath to the list.

    Args:
        product: Object exposing ``xpath(query).get()`` (e.g. a selector).
        default: Value returned when none of the XPaths matches.
    """
    lst_xpaths = [
        ".//span[@id='priceblock_ourprice']/text()",
        ".//span[@id='priceblock_saleprice']/text()",
    ]
    # Iterate the list defined above; a mismatched name here raises NameError.
    for path in lst_xpaths:
        result = product.xpath(path).get()
        if result is not None:
            return result
    return default
这与之前的做法基本相同,但可以很容易地扩展到任意多个 xpath;如果所有 xpath 都没有匹配,就返回一个方便的默认值。
用法也很简单,例如:
...
{...
'vlr_atual': get_vlr_atual(product),
...
}
...
来点非常简单的东西怎么样:
def parse_details(self, response):
    """Yield one dict of scraped fields per ``div#dp`` on a product page.

    ``vlr_atual`` takes the ``priceblock_ourprice`` text when that span is
    present and the ``priceblock_saleprice`` text otherwise.
    """
    selo = response.request.meta['selo']
    for produto in response.xpath("//div[@id='dp']"):
        # Read both candidate price spans up front, then pick one below.
        regular_price = produto.xpath(".//span[@id='priceblock_ourprice']/text()").get()
        sale_price = produto.xpath(".//span[@id='priceblock_saleprice']/text()").get()
        yield {
            'data': datetime.datetime.now().strftime("%Y%m%d"),
            'selo': selo,
            'nome': produto.xpath("normalize-space(.//span[@id='productTitle']/text())").get(),
            'vlr_atual': regular_price if regular_price is not None else sale_price,
            'estoque': produto.xpath("normalize-space(.//select[@name='quantity']/option[last()]/text())").get(),
            'ean': produto.xpath("normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())").get(),
        }
我强烈建议您使用 Item Loaders。这样您就可以在一个地方自动处理选定的字段:取第一个非空值、合并多个结果等。
首先在 items.py 中定义你的 Product 项,并为字段指定 TakeFirst 输出处理器:
class ProductItem(scrapy.Item):
    """Item declaring the product fields filled in by the spider."""

    data = scrapy.Field()
    selo = scrapy.Field()
    # NOTE(review): TakeFirst is not imported in this snippet; it has to be
    # imported from the item-loader processors module -- confirm the path for
    # the Scrapy version in use.
    # Presumably keeps the first non-empty value collected for the field.
    vlr_atual = scrapy.Field(output_processor=TakeFirst())
接下来在你的爬虫(spider)中使用它,并记得用 yield 返回 l.load_item() 的结果:
from scrapy.loader import ItemLoader
....
for produto in response.xpath("//div[@id='dp']"):
l = ItemLoader(item=ProductItem(), selector=produto)
l.add_value('data', datetime.datetime.now().strftime("%Y%m%d"))
l.add_xpath("vlr_atual", ".//span[@id='priceblock_ourprice']/text()")
l.add_xpath("vlr_atual", ".//span[@id='priceblock_saleprice']/text()")
...
l.load_item()
我正在用下面的 Python 代码练习网页抓取。
但是其中一个数据有两个 xpath,我想知道是否有一种方法可以使用“if”条件来捕获这两个数据,但我不知道如何将其插入到我的代码中。谁能指导我?
例如,如果其中一个xpath为null,则肯定是另一个。不知道解释的好不好,如果我有a和b,如果a为null则b.
'vlr_atual'可分别为:
product.xpath(".//span[@id='priceblock_ourprice']/text()").get()
product.xpath(".//span[@id='priceblock_saleprice']/text()").get()
import scrapy
import datetime
class ProductsSpider(scrapy.Spider):
    """Spider that walks an Amazon BR listing and scrapes each product page.

    ``parse`` handles the listing pages (product links and pagination);
    ``parse_details`` handles a single product page and yields one dict per
    ``div#dp`` container found on it.
    """

    name = 'products'
    allowed_domains = ['www.amazon.com.br']
    start_urls = ['https://www.amazon.com.br/s?i=computers&bbn=16339926011&rh=n%3A16364756011&fs=true&qid=1615634908&ref=sr_pg_1']

    def parse(self, response):
        """Yield a request per product card, then a request for the next page.

        The badge text of each card (``selo``) is forwarded to
        ``parse_details`` through the request ``meta``.
        """
        for produto in response.xpath("//div[@class='a-section a-spacing-medium']"):
            selo = produto.xpath(".//span[@class='a-badge-text']/text()").get()
            href = produto.xpath(".//h2/a/@href").get()
            if href is None:
                # Joining a missing href with the base URL gives back the
                # listing URL itself, so skip cards without a product link.
                continue
            # response.follow resolves relative hrefs against the response URL.
            yield response.follow(url=href, callback=self.parse_details, meta={'selo': selo})

        # Test the raw href, not the joined URL: a joined URL is always truthy
        # (it falls back to the current page URL), so it cannot signal
        # "no next page".
        next_href = response.xpath("//li[@class='a-last']/a/@href").get()
        if next_href:
            yield response.follow(url=next_href, callback=self.parse)

    def parse_details(self, response):
        """Yield one dict of scraped fields per ``div#dp`` on a product page.

        ``vlr_atual`` is read from ``priceblock_ourprice`` and falls back to
        ``priceblock_saleprice`` when the first span is absent.
        """
        selo = response.request.meta['selo']
        for produto in response.xpath("//div[@id='dp']"):
            vlr_atual = produto.xpath(".//span[@id='priceblock_ourprice']/text()").get()
            if vlr_atual is None:
                # The price lives in one of two spans; try the second one.
                vlr_atual = produto.xpath(".//span[@id='priceblock_saleprice']/text()").get()
            yield {
                'data': datetime.datetime.now().strftime("%Y%m%d"),
                'selo': selo,
                'nome': produto.xpath("normalize-space(.//span[@id='productTitle']/text())").get(),
                'vlr_atual': vlr_atual,
                'estoque': produto.xpath("normalize-space(.//select[@name='quantity']/option[last()]/text())").get(),
                'ean': produto.xpath("normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())").get(),
            }
您可以使用 or 运算符在两个值之间进行选择,它会优先返回第一个值:
>>> a="www.example.com"
>>> b="www.example2.com"
>>> a or b
'www.example.com'
>>> a=None
>>> a or b
'www.example2.com'
>>>
其工作原理是:如果 a 的真值("truth" value)为真,则 a or b 返回 a,否则返回 b。
所以你可以做到
product.xpath (".//span[@id='priceblock_ourprice']/text()").get() or product.xpath (".//span[@id='priceblock_saleprice']/text()").get()
编辑
你也可以把它封装成自己的函数,像这样
def get_vlr_atual(product, default=None):
    """Return the first price text found on *product*, or *default*.

    Each XPath in the candidate list is tried in order; the first one whose
    ``.get()`` result is not ``None`` wins. To support another price location,
    append its XPath to the list.

    Args:
        product: Object exposing ``xpath(query).get()`` (e.g. a selector).
        default: Value returned when none of the XPaths matches.
    """
    lst_xpaths = [
        ".//span[@id='priceblock_ourprice']/text()",
        ".//span[@id='priceblock_saleprice']/text()",
    ]
    # Iterate the list defined above; a mismatched name here raises NameError.
    for path in lst_xpaths:
        result = product.xpath(path).get()
        if result is not None:
            return result
    return default
这与之前的做法基本相同,但可以很容易地扩展到任意多个 xpath;如果所有 xpath 都没有匹配,就返回一个方便的默认值。
用法也很简单,例如:
...
{...
'vlr_atual': get_vlr_atual(product),
...
}
...
来点非常简单的东西怎么样:
def parse_details(self, response):
    """Yield one dict of scraped fields per ``div#dp`` on a product page.

    ``vlr_atual`` takes the ``priceblock_ourprice`` text when that span is
    present and the ``priceblock_saleprice`` text otherwise.
    """
    selo = response.request.meta['selo']
    for produto in response.xpath("//div[@id='dp']"):
        # Read both candidate price spans up front, then pick one below.
        regular_price = produto.xpath(".//span[@id='priceblock_ourprice']/text()").get()
        sale_price = produto.xpath(".//span[@id='priceblock_saleprice']/text()").get()
        yield {
            'data': datetime.datetime.now().strftime("%Y%m%d"),
            'selo': selo,
            'nome': produto.xpath("normalize-space(.//span[@id='productTitle']/text())").get(),
            'vlr_atual': regular_price if regular_price is not None else sale_price,
            'estoque': produto.xpath("normalize-space(.//select[@name='quantity']/option[last()]/text())").get(),
            'ean': produto.xpath("normalize-space(.//table[@id='productDetails_techSpec_section_1']//tr[last()]/td/text())").get(),
        }
我强烈建议您使用 Item Loaders。这样您就可以在一个地方自动处理选定的字段:取第一个非空值、合并多个结果等。
首先在 items.py 中定义你的 Product 项,并为字段指定 TakeFirst 输出处理器:
class ProductItem(scrapy.Item):
    """Item declaring the product fields filled in by the spider."""

    data = scrapy.Field()
    selo = scrapy.Field()
    # NOTE(review): TakeFirst is not imported in this snippet; it has to be
    # imported from the item-loader processors module -- confirm the path for
    # the Scrapy version in use.
    # Presumably keeps the first non-empty value collected for the field.
    vlr_atual = scrapy.Field(output_processor=TakeFirst())
接下来在你的爬虫(spider)中使用它,并记得用 yield 返回 l.load_item() 的结果:
from scrapy.loader import ItemLoader
....
for produto in response.xpath("//div[@id='dp']"):
l = ItemLoader(item=ProductItem(), selector=produto)
l.add_value('data', datetime.datetime.now().strftime("%Y%m%d"))
l.add_xpath("vlr_atual", ".//span[@id='priceblock_ourprice']/text()")
l.add_xpath("vlr_atual", ".//span[@id='priceblock_saleprice']/text()")
...
l.load_item()