使用 Scrapy 传递请求
Passing requests with Scrapy
我正在尝试根据 URL 中的 brand 编号用 Scrapy 发送请求,然后从返回信息的页面中提取产品 ID,并继续遍历后续页面以获取更多产品 ID。
我可以发送请求并解析产品数据,但是不确定是否应该定义一个函数来获取下一页的游标(cursor)。
这是我的代码:
class DepopItem(scrapy.Item):
    """Item holding one scraped Depop product.

    Every field is declared with a ``TakeFirst`` output processor.
    """

    # Declared on the item; the spider in this snippet only fills
    # ``brand`` and ``ID``.
    brands = Field(output_processor=TakeFirst())
    ID = Field(output_processor=TakeFirst())
    brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
    """Spider as posted in the question: one products request per brand.

    NOTE(review): only the indentation and the ``start_urls`` query string
    were repaired. The control flow is intentionally left as posted, because
    the text following this snippet quotes the error it raises:
    ``create_product_request`` calls ``.json()`` on its argument, while both
    call sites hand it the integer ``brand``.
    """

    name = 'depop'
    # NOTE(review): ``start_requests`` is overridden below and never reads
    # this list.
    start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance']
    brands = [1596]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def start_requests(self, cursor=''):
        """Yield the first products request for every brand in ``brands``."""
        for brand in self.brands:
            # ``brand`` is an int here, but the helper treats its argument as
            # a response object -> AttributeError on ``.json()``.
            for item in self.create_product_request(brand):
                yield item
            yield scrapy.FormRequest(
                url='https://webapi.depop.com/api/v2/search/products/',
                method='GET',
                formdata={
                    'brands': str(brand),
                    'cursor': cursor,
                    'itemsPerPage': '24',
                    'country': 'gb',
                    'currency': 'GBP',
                    'sort': 'relevance'
                },
                cb_kwargs={'brand': brand}
            )

    def parse(self, response, brand):
        """Emit one item per product, then try to continue from the cursor."""
        # load stuff
        for item in response.json().get('products'):
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', item.get('id'))
            yield loader.load_item()
        cursor = response.json()['meta'].get('cursor')
        if cursor:
            # NOTE(review): two arguments are passed, but the method below
            # accepts only ``response``.
            for item in self.create_product_request(brand, cursor):
                yield item

    def create_product_request(self, response):
        """Yield the ``meta.cursor`` value read from ``response.json()``."""
        test = response.json()['meta'].get('cursor')
        yield test
我收到以下错误:
AttributeError: 'int' object has no attribute 'json'
预期输出:
{"brand": 1596, "ID": 273027529}
{"brand": 1596, "ID": 274115361}
{"brand": 1596, "ID": 270641301}
{"brand": 1596, "ID": 274505678}
{"brand": 1596, "ID": 262857014}
{"brand": 1596, "ID": 270088589}
{"brand": 1596, "ID": 208498028}
{"brand": 1596, "ID": 270426792}
{"brand": 1596, "ID": 274483351}
{"brand": 1596, "ID": 274109923}
{"brand": 1596, "ID": 273424157}
..
..
..
`start_requests` 在任何请求发出之前就已经运行,因此此时还没有可以读取游标的响应。
您可以在回调函数中递归地处理分页。
import scrapy
from scrapy.loader import ItemLoader
from scrapy import Field
from scrapy.loader.processors import TakeFirst
class DepopItem(scrapy.Item):
    """Item holding one scraped Depop product.

    Every field is declared with a ``TakeFirst`` output processor.
    """

    # Declared on the item; the spider in this snippet only fills
    # ``brand`` and ``ID``.
    brands = Field(output_processor=TakeFirst())
    ID = Field(output_processor=TakeFirst())
    brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
    """Collect product IDs per brand from the Depop search API.

    ``parse`` follows the cursor while the API reports ``hasMore`` and, for
    every page it sees, schedules one brand-filtered request per entry in
    ``brands``. ``parse_brand`` turns each returned product into a
    ``DepopItem``.
    """

    name = 'depop'
    # Single definition of the endpoint shared by every request below.
    products_url = 'https://webapi.depop.com/api/v2/search/products/'
    start_urls = [products_url]
    brands = [1596]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def parse(self, response):
        """Schedule the next page and one brand-filtered request per brand.

        Args:
            response: JSON response from the products endpoint.

        Yields:
            A follow-up request handled by ``parse`` while more pages exist,
            then one request per brand handled by ``parse_brand``.
        """
        # Tolerate a payload without ``meta`` / ``cursor`` instead of raising
        # KeyError.
        meta = response.json().get('meta') or {}
        cursor = meta.get('cursor')

        # pagination: no callback is given, so Scrapy routes the response
        # back to ``parse``.
        if cursor and meta.get('hasMore'):
            yield scrapy.FormRequest(
                url=self.products_url,
                method='GET',
                formdata={'cursor': cursor},
            )

        for brand in self.brands:
            # Keys are inserted in the original order so the generated query
            # string is unchanged whenever a cursor is present.
            params = {'brands': str(brand)}
            if cursor:
                params['cursor'] = cursor
            params.update({
                'itemsPerPage': '24',
                'country': 'gb',
                'currency': 'GBP',
                'sort': 'relevance',
            })
            yield scrapy.FormRequest(
                url=self.products_url,
                method='GET',
                formdata=params,
                cb_kwargs={'brand': brand},
                callback=self.parse_brand,
            )

    def parse_brand(self, response, brand):
        """Yield one ``DepopItem`` per product in a brand-filtered response.

        Args:
            response: JSON response from the products endpoint.
            brand: Brand number forwarded through ``cb_kwargs``.
        """
        # ``or []`` keeps a payload without ``products`` from raising
        # TypeError during iteration.
        for product in response.json().get('products') or []:
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', product.get('id'))
            yield loader.load_item()
输出:
{'ID': 245137362, 'brand': 1596}
{'ID': 244263081, 'brand': 1596}
{'ID': 242128472, 'brand': 1596}
{'ID': 239929000, 'brand': 1596}
...
...
...
顺便说一句,建议使用轮换代理或类似手段:我因为“请求过多”被封禁了 10 分钟。
我正在尝试根据 URL 中的 brand 编号用 Scrapy 发送请求,然后从返回信息的页面中提取产品 ID,并继续遍历后续页面以获取更多产品 ID。
我可以发送请求并解析产品数据,但是不确定是否应该定义一个函数来获取下一页的游标(cursor)。
这是我的代码:
class DepopItem(scrapy.Item):
    """Item holding one scraped Depop product.

    Every field is declared with a ``TakeFirst`` output processor.
    """

    # Declared on the item; the spider in this snippet only fills
    # ``brand`` and ``ID``.
    brands = Field(output_processor=TakeFirst())
    ID = Field(output_processor=TakeFirst())
    brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
    """Spider as posted in the question: one products request per brand.

    NOTE(review): only the indentation and the ``start_urls`` query string
    were repaired. The control flow is intentionally left as posted, because
    the text following this snippet quotes the error it raises:
    ``create_product_request`` calls ``.json()`` on its argument, while both
    call sites hand it the integer ``brand``.
    """

    name = 'depop'
    # NOTE(review): ``start_requests`` is overridden below and never reads
    # this list.
    start_urls = ['https://webapi.depop.com/api/v2/search/filters/aggregates/?brands=1596&itemsPerPage=24&country=gb&currency=GBP&sort=relevance']
    brands = [1596]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def start_requests(self, cursor=''):
        """Yield the first products request for every brand in ``brands``."""
        for brand in self.brands:
            # ``brand`` is an int here, but the helper treats its argument as
            # a response object -> AttributeError on ``.json()``.
            for item in self.create_product_request(brand):
                yield item
            yield scrapy.FormRequest(
                url='https://webapi.depop.com/api/v2/search/products/',
                method='GET',
                formdata={
                    'brands': str(brand),
                    'cursor': cursor,
                    'itemsPerPage': '24',
                    'country': 'gb',
                    'currency': 'GBP',
                    'sort': 'relevance'
                },
                cb_kwargs={'brand': brand}
            )

    def parse(self, response, brand):
        """Emit one item per product, then try to continue from the cursor."""
        # load stuff
        for item in response.json().get('products'):
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', item.get('id'))
            yield loader.load_item()
        cursor = response.json()['meta'].get('cursor')
        if cursor:
            # NOTE(review): two arguments are passed, but the method below
            # accepts only ``response``.
            for item in self.create_product_request(brand, cursor):
                yield item

    def create_product_request(self, response):
        """Yield the ``meta.cursor`` value read from ``response.json()``."""
        test = response.json()['meta'].get('cursor')
        yield test
我收到以下错误:
AttributeError: 'int' object has no attribute 'json'
预期输出:
{"brand": 1596, "ID": 273027529}
{"brand": 1596, "ID": 274115361}
{"brand": 1596, "ID": 270641301}
{"brand": 1596, "ID": 274505678}
{"brand": 1596, "ID": 262857014}
{"brand": 1596, "ID": 270088589}
{"brand": 1596, "ID": 208498028}
{"brand": 1596, "ID": 270426792}
{"brand": 1596, "ID": 274483351}
{"brand": 1596, "ID": 274109923}
{"brand": 1596, "ID": 273424157}
..
..
..
`start_requests` 在任何请求发出之前就已经运行,因此此时还没有可以读取游标的响应。
您可以在回调函数中递归地处理分页。
import scrapy
from scrapy.loader import ItemLoader
from scrapy import Field
from scrapy.loader.processors import TakeFirst
class DepopItem(scrapy.Item):
    """Item holding one scraped Depop product.

    Every field is declared with a ``TakeFirst`` output processor.
    """

    # Declared on the item; the spider in this snippet only fills
    # ``brand`` and ``ID``.
    brands = Field(output_processor=TakeFirst())
    ID = Field(output_processor=TakeFirst())
    brand = Field(output_processor=TakeFirst())
class DepopSpider(scrapy.Spider):
    """Collect product IDs per brand from the Depop search API.

    ``parse`` follows the cursor while the API reports ``hasMore`` and, for
    every page it sees, schedules one brand-filtered request per entry in
    ``brands``. ``parse_brand`` turns each returned product into a
    ``DepopItem``.
    """

    name = 'depop'
    # Single definition of the endpoint shared by every request below.
    products_url = 'https://webapi.depop.com/api/v2/search/products/'
    start_urls = [products_url]
    brands = [1596]
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

    def parse(self, response):
        """Schedule the next page and one brand-filtered request per brand.

        Args:
            response: JSON response from the products endpoint.

        Yields:
            A follow-up request handled by ``parse`` while more pages exist,
            then one request per brand handled by ``parse_brand``.
        """
        # Tolerate a payload without ``meta`` / ``cursor`` instead of raising
        # KeyError.
        meta = response.json().get('meta') or {}
        cursor = meta.get('cursor')

        # pagination: no callback is given, so Scrapy routes the response
        # back to ``parse``.
        if cursor and meta.get('hasMore'):
            yield scrapy.FormRequest(
                url=self.products_url,
                method='GET',
                formdata={'cursor': cursor},
            )

        for brand in self.brands:
            # Keys are inserted in the original order so the generated query
            # string is unchanged whenever a cursor is present.
            params = {'brands': str(brand)}
            if cursor:
                params['cursor'] = cursor
            params.update({
                'itemsPerPage': '24',
                'country': 'gb',
                'currency': 'GBP',
                'sort': 'relevance',
            })
            yield scrapy.FormRequest(
                url=self.products_url,
                method='GET',
                formdata=params,
                cb_kwargs={'brand': brand},
                callback=self.parse_brand,
            )

    def parse_brand(self, response, brand):
        """Yield one ``DepopItem`` per product in a brand-filtered response.

        Args:
            response: JSON response from the products endpoint.
            brand: Brand number forwarded through ``cb_kwargs``.
        """
        # ``or []`` keeps a payload without ``products`` from raising
        # TypeError during iteration.
        for product in response.json().get('products') or []:
            loader = ItemLoader(DepopItem())
            loader.add_value('brand', brand)
            loader.add_value('ID', product.get('id'))
            yield loader.load_item()
输出:
{'ID': 245137362, 'brand': 1596}
{'ID': 244263081, 'brand': 1596}
{'ID': 242128472, 'brand': 1596}
{'ID': 239929000, 'brand': 1596}
...
...
...
顺便说一句,建议使用轮换代理或类似手段:我因为“请求过多”被封禁了 10 分钟。