Scrapy:使用 post 请求的分页不起作用
Scrapy: pagination with post request doesn't work
我正在尝试从该网站提取:https://www.mrlodge.de/wohnungen/
蜘蛛工作没有任何错误,但它不执行我通过有效负载传递的分页。我只返回同一页面。
我尝试使用 json 库来配置我的负载,但是负载本身不在 json 中。
请帮忙。
import scrapy
class MrlodgeSpiderSpider(scrapy.Spider):
name = 'mrlodge_spider'
def start_requests(self):
for pageNumber in range(1,10):
s = """mrl_ft%5Bfd%5D%5Bdate_from%5D=&mrl_ft%5Bfd%5D%5Brent_from%5D=1000&mr\
l_ft%5Bfd%5D%5Brent_to%5D=8500&mrl_ft%5Bfd%5D%5Bpersons%5D=1&mrl_ft%5Bfd\
%5D%5Bkids%5D=0&mrl_ft%5Bfd%5D%5Brooms_from%5D=1&mrl_ft%5Bfd%5D%5Brooms_to\
%5D=9&mrl_ft%5Bfd%5D%5Barea_from%5D=20&mrl_ft%5Bfd%5D%5Barea_to%5D=480&\
mrl_ft%5Bfd%5D%5Bsterm%5D=&mrl_ft%5Bfd%5D%5Bradius%5D=50&mrl_ft%5Bfd\
%5D%5Bmvv%5D=&mrl_ft%5Bfd%5D%5Bobjecttype_cb%5D%5B%5D=w&mrl_ft%5B\
fd%5D%5Bobjecttype_cb%5D%5B%5D=h&mrl_ft%5Bpage%5D={}""".format(pageNumber)
payload = '{'+s+'}'
print(payload)
yield scrapy.Request(url='https://www.mrlodge.de/wohnungen/', method='POST',
body=payload, headers={'content-type': 'application/json'})
def parse(self, response):
for apartment in response.xpath("//div[@class='mrl-ft-results mrlobject-list']/div"):
yield {
'info': apartment.xpath(".//div[@class='obj-smallinfo']/text()").get()
}
我看到 headers 是错误的和不完整的。我看到 headers 如下:
Host: www.mrlodge.de
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:67.0) Gecko/20100101 Firefox/67.0
Accept: application/json, text/javascript, */*; q=0.01
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://www.mrlodge.de/wohnungen/
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
X-Requested-With: XMLHttpRequest
Content-Length: 468
Connection: keep-alive
Cookie: fe_typo_user=cbf861eb412b7182ccf72aa5ca87c932; uac=true
Pragma: no-cache
Cache-Control: no-cache
Content-Type
也不一样
以下方法对我有用。
import json
headers = {
'Pragma': 'no-cache',
'Origin': 'https://www.mrlodge.de',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-GB,en;q=0.9,nl-BE;q=0.8,nl;q=0.7,ro-RO;q=0.6,ro;q=0.5,en-US;q=0.4',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Cache-Control': 'no-cache',
'X-Requested-With': 'XMLHttpRequest',
'Connection': 'keep-alive',
'Referer': 'https://www.mrlodge.de/wohnungen/',
}
data = [
('mrl_ft[fd][date_from]', ''),
('mrl_ft[fd][rent_from]', '900'),
('mrl_ft[fd][rent_to]', '8500'),
('mrl_ft[fd][persons]', '1'),
('mrl_ft[fd][kids]', '0'),
('mrl_ft[fd][rooms_from]', '1'),
('mrl_ft[fd][rooms_to]', '9'),
('mrl_ft[fd][area_from]', '20'),
('mrl_ft[fd][area_to]', '480'),
('mrl_ft[fd][sterm]', ''),
('mrl_ft[fd][radius]', '50'),
('mrl_ft[fd][mvv]', ''),
('mrl_ft[fd][objecttype_cb][]', 'w'),
('mrl_ft[fd][objecttype_cb][]', 'h'),
('mrl_ft[page]', '2'),
]
yield Request(url, method='POST', headers=headers, body=json.dumps(data))
当然,您必须动态设置数据中的某些值 - 例如 mrl_ft[page] - 以遍历所有页面。
我正在尝试从该网站提取:https://www.mrlodge.de/wohnungen/ 蜘蛛工作没有任何错误,但它不执行我通过有效负载传递的分页。我只返回同一页面。 我尝试使用 json 库来配置我的负载,但是负载本身不在 json 中。 请帮忙。
import scrapy
class MrlodgeSpiderSpider(scrapy.Spider):
name = 'mrlodge_spider'
def start_requests(self):
for pageNumber in range(1,10):
s = """mrl_ft%5Bfd%5D%5Bdate_from%5D=&mrl_ft%5Bfd%5D%5Brent_from%5D=1000&mr\
l_ft%5Bfd%5D%5Brent_to%5D=8500&mrl_ft%5Bfd%5D%5Bpersons%5D=1&mrl_ft%5Bfd\
%5D%5Bkids%5D=0&mrl_ft%5Bfd%5D%5Brooms_from%5D=1&mrl_ft%5Bfd%5D%5Brooms_to\
%5D=9&mrl_ft%5Bfd%5D%5Barea_from%5D=20&mrl_ft%5Bfd%5D%5Barea_to%5D=480&\
mrl_ft%5Bfd%5D%5Bsterm%5D=&mrl_ft%5Bfd%5D%5Bradius%5D=50&mrl_ft%5Bfd\
%5D%5Bmvv%5D=&mrl_ft%5Bfd%5D%5Bobjecttype_cb%5D%5B%5D=w&mrl_ft%5B\
fd%5D%5Bobjecttype_cb%5D%5B%5D=h&mrl_ft%5Bpage%5D={}""".format(pageNumber)
payload = '{'+s+'}'
print(payload)
yield scrapy.Request(url='https://www.mrlodge.de/wohnungen/', method='POST',
body=payload, headers={'content-type': 'application/json'})
def parse(self, response):
for apartment in response.xpath("//div[@class='mrl-ft-results mrlobject-list']/div"):
yield {
'info': apartment.xpath(".//div[@class='obj-smallinfo']/text()").get()
}
我看到 headers 是错误的和不完整的。我看到 headers 如下:
Host: www.mrlodge.de
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:67.0) Gecko/20100101 Firefox/67.0
Accept: application/json, text/javascript, */*; q=0.01
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
Referer: https://www.mrlodge.de/wohnungen/
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
X-Requested-With: XMLHttpRequest
Content-Length: 468
Connection: keep-alive
Cookie: fe_typo_user=cbf861eb412b7182ccf72aa5ca87c932; uac=true
Pragma: no-cache
Cache-Control: no-cache
Content-Type
也不一样
以下方法对我有用。
import json
headers = {
'Pragma': 'no-cache',
'Origin': 'https://www.mrlodge.de',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'en-GB,en;q=0.9,nl-BE;q=0.8,nl;q=0.7,ro-RO;q=0.6,ro;q=0.5,en-US;q=0.4',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Cache-Control': 'no-cache',
'X-Requested-With': 'XMLHttpRequest',
'Connection': 'keep-alive',
'Referer': 'https://www.mrlodge.de/wohnungen/',
}
data = [
('mrl_ft[fd][date_from]', ''),
('mrl_ft[fd][rent_from]', '900'),
('mrl_ft[fd][rent_to]', '8500'),
('mrl_ft[fd][persons]', '1'),
('mrl_ft[fd][kids]', '0'),
('mrl_ft[fd][rooms_from]', '1'),
('mrl_ft[fd][rooms_to]', '9'),
('mrl_ft[fd][area_from]', '20'),
('mrl_ft[fd][area_to]', '480'),
('mrl_ft[fd][sterm]', ''),
('mrl_ft[fd][radius]', '50'),
('mrl_ft[fd][mvv]', ''),
('mrl_ft[fd][objecttype_cb][]', 'w'),
('mrl_ft[fd][objecttype_cb][]', 'h'),
('mrl_ft[page]', '2'),
]
yield Request(url, method='POST', headers=headers, body=json.dumps(data))
当然,您必须动态设置数据中的某些值 - 例如 mrl_ft[page] - 以遍历所有页面。