在 Scrapy 中更改 request.url 以抓取 json 文件然后 return 到原始请求
Change request.url in Scrapy to crawl a json file then return to the original request
我正在从产品页面 url 抓取数据(产品名称、价格等),但后端还有一个包含相关信息的 json 文件,我也想抓取它。简而言之:我想先把请求的 url 改成该 json 文件的地址,抓取完之后再返回原始的 url,以便继续抓取。
- 产品url:https://www2.hm.com/hu_hu/productpage.0906822002.html
- 相关 json url(可以在“网络”选项卡中找到,我将其存储在名为 availability_url 的变量中):
https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json
在实际 yield 条目之前,把可用性数据先保存到变量中非常重要,因为在代码末尾检查颜色之前,我必须返回到原始 url:
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Scrape discounted dresses from the Hungarian H&M sale listing.

    For each product page, name/price data is pulled out of the inline
    Tealium <script> blobs with regexes, and every other colour variant
    of the product is queued for the same parsing.
    """

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Follow every product link found on the listing page."""
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response):
        # BUG FIX: Scrapy invokes callbacks with `response` only; the extra
        # `request` parameter in the original signature made every callback
        # fail with a missing-argument TypeError.
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]
        # Prices sit inside JS array literals; grab the first digit run.
        current_price = int(re.findall(r'\d+', re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:  # only export discounted items
            yield {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
        # Queue every other colour variant of this product. The original
        # `color_index <= color_count` guard was always true (the counter
        # never exceeds the list length), so it is dropped.
        additional_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(additional_colors) > 1:
            for color in additional_colors:
                link = 'https://www2.hm.com' + color
                yield scrapy.Request(link, self.parse_item)
所以总结一下:我想把刮掉的 url 从
https://www2.hm.com/hu_hu/productpage.0906822002.html
至 https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json
然后 return 回到
https://www2.hm.com/hu_hu/productpage.0906822002.html 这样我的抓取工具就可以继续工作了。
可以这样做:如果在提取完所有商品数据之后再发起 json 请求,就不必再返回到原来的回调函数。(颜色变体的请求仍会被创建,因为我们只是 yield 新请求,并没有用 return 中断函数。)
试试这是否适合你:
import json
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Scrape discounted dresses from the Hungarian H&M sale listing.

    After the item dict is assembled from the product page, a follow-up
    request is made to the availability JSON endpoint; the item travels
    along in `meta` and is yielded from `parse_availability` once the
    JSON response arrives.
    """

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Follow every product link found on the listing page."""
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response):
        # BUG FIX: Scrapy invokes callbacks with `response` only; the extra
        # `request` parameter in the original signature made every callback
        # fail with a missing-argument TypeError.
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]
        # Prices sit inside JS array literals; grab the first digit run.
        current_price = int(re.findall(r'\d+', re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:  # only export discounted items
            item = {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
            # Instead of yielding the item directly, fetch the availability
            # JSON and carry the item along in meta; parse_availability
            # yields the final item. (availability_url is always a non-empty
            # string here, so the original `if availability_url:` guard was
            # redundant.)
            yield scrapy.Request(
                url=availability_url,
                callback=self.parse_availability,
                meta={'item': item},
            )
        # Queue every other colour variant of this product. The original
        # `color_index <= color_count` guard was always true (the counter
        # never exceeds the list length), so it is dropped.
        additional_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(additional_colors) > 1:
            for color in additional_colors:
                link = 'https://www2.hm.com' + color
                yield scrapy.Request(link, self.parse_item)

    def parse_availability(self, response):
        """Merge availability JSON into the item carried in meta, then yield it."""
        item = response.meta.get('item')
        json_data = json.loads(response.body)
        # do something with json data here and add it to item
        yield item
我正在从产品页面 url 抓取数据(产品名称、价格等),但后端还有一个包含相关信息的 json 文件,我也想抓取它。简而言之:我想先把请求的 url 改成该 json 文件的地址,抓取完之后再返回原始的 url,以便继续抓取。
- 产品url:https://www2.hm.com/hu_hu/productpage.0906822002.html
- 相关 json url(可以在“网络”选项卡中找到,我将其存储在名为 availability_url 的变量中): https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json
在实际 yield 条目之前,把可用性数据先保存到变量中非常重要,因为在代码末尾检查颜色之前,我必须返回到原始 url:
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Scrape discounted dresses from the Hungarian H&M sale listing.

    For each product page, name/price data is pulled out of the inline
    Tealium <script> blobs with regexes, and every other colour variant
    of the product is queued for the same parsing.
    """

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Follow every product link found on the listing page."""
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response):
        # BUG FIX: Scrapy invokes callbacks with `response` only; the extra
        # `request` parameter in the original signature made every callback
        # fail with a missing-argument TypeError.
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]
        # Prices sit inside JS array literals; grab the first digit run.
        current_price = int(re.findall(r'\d+', re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:  # only export discounted items
            yield {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
        # Queue every other colour variant of this product. The original
        # `color_index <= color_count` guard was always true (the counter
        # never exceeds the list length), so it is dropped.
        additional_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(additional_colors) > 1:
            for color in additional_colors:
                link = 'https://www2.hm.com' + color
                yield scrapy.Request(link, self.parse_item)
所以总结一下:我想把刮掉的 url 从 https://www2.hm.com/hu_hu/productpage.0906822002.html 至 https://www2.hm.com/hmwebservices/service/product/hu/availability/0906822.json 然后 return 回到 https://www2.hm.com/hu_hu/productpage.0906822002.html 这样我的抓取工具就可以继续工作了。
可以这样做:如果在提取完所有商品数据之后再发起 json 请求,就不必再返回到原来的回调函数。(颜色变体的请求仍会被创建,因为我们只是 yield 新请求,并没有用 return 中断函数。)试试这是否适合你:
import json
import scrapy
import re
class HMSpider(scrapy.Spider):
    """Scrape discounted dresses from the Hungarian H&M sale listing.

    After the item dict is assembled from the product page, a follow-up
    request is made to the availability JSON endpoint; the item travels
    along in `meta` and is yielded from `parse_availability` once the
    JSON response arrives.
    """

    name = 'hm'
    start_urls = ['https://www2.hm.com/hu_hu/learazas/noi/dresses.html']
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Follow every product link found on the listing page."""
        items = response.css('h3.item-heading a::attr(href)').getall()
        for item in items:
            link = 'https://www2.hm.com' + item
            yield scrapy.Request(link, self.parse_item)

    def parse_item(self, response):
        # BUG FIX: Scrapy invokes callbacks with `response` only; the extra
        # `request` parameter in the original signature made every callback
        # fail with a missing-argument TypeError.
        page_source_data = response.xpath('//div[@class= "tealiumProductviewtag productview parbase"]//text()')[0].get()
        data = response.css('div.product.parbase script::text').get()
        base_product_code = re.findall(r"ancestorProductCode = '(.*)';", data)[0]
        detailed_product_code = re.findall(r"articleCode':'(.*)', 'baseProductCode", data)[0]
        # Prices sit inside JS array literals; grab the first digit run.
        current_price = int(re.findall(r'\d+', re.findall(r'product_list_price : \["(.*?)\],', page_source_data)[0])[0])
        original_price = int(re.findall(r'\d+', re.findall(r'product_original_price : \[(.*?)\],', page_source_data)[0])[0])
        availability_url = 'https://www2.hm.com/hmwebservices/service/product/hu/availability/' + base_product_code + ".json"
        info_url = "https://tags.tiqcdn.com/dle/hm/hdl/" + detailed_product_code + ".json"
        if current_price != original_price:  # only export discounted items
            item = {
                'product_name': re.findall('(?<= ).*$', response.css('section.name-price h1.primary.product-item-headline::text').get())[0],
                'vendor': 'H&M',
                'current_price': int(current_price),
                'original_price': int(original_price),
                'discount_percent': 100 - round((current_price / original_price) * 100),
                'colors': response.css('li.list-item a::attr(title)').getall(),
                'link': response.request.url,
                # rating
                # reviews
            }
            # Instead of yielding the item directly, fetch the availability
            # JSON and carry the item along in meta; parse_availability
            # yields the final item. (availability_url is always a non-empty
            # string here, so the original `if availability_url:` guard was
            # redundant.)
            yield scrapy.Request(
                url=availability_url,
                callback=self.parse_availability,
                meta={'item': item},
            )
        # Queue every other colour variant of this product. The original
        # `color_index <= color_count` guard was always true (the counter
        # never exceeds the list length), so it is dropped.
        additional_colors = response.css('div.mini-slider li.list-item a::attr(href)').getall()
        if len(additional_colors) > 1:
            for color in additional_colors:
                link = 'https://www2.hm.com' + color
                yield scrapy.Request(link, self.parse_item)

    def parse_availability(self, response):
        """Merge availability JSON into the item carried in meta, then yield it."""
        item = response.meta.get('item')
        json_data = json.loads(response.body)
        # do something with json data here and add it to item
        yield item