How can I get data from input without a form in Scrapy
I'm writing a web scraper in Python Scrapy that is supposed to get data from the inputs on this website.
When I select a state on the left side of the site, it sends a POST request.
The POST request (selected state: "Alaska"):
{
    "CurrentPage": "1",
    "SearchType": "org",
    "GroupExemption": "",
    "AffiliateOrgName": "",
    "RelatedOrgName": "",
    "RelatedOrgEin": "",
    "RelationType": "",
    "RelatedOrgs": "",
    "SelectedCityNav[]": "",
    "SelectedCountyNav[]": "",
    "Eins": "",
    "ul": "",
    "PCSSubjectCodes[]": "",
    "PeoplePCSSubjectCodes[]": "",
    "PCSPopulationCodes[]": "",
    "AutoSelectTaxonomyFacet": "",
    "AutoSelectTaxonomyText": "",
    "Keywords": "",
    "State": "Alaska",
    "City": "",
    "PeopleZip": "",
    "PeopleZipRadius": "Zip+Only",
    "PeopleCity": "",
    "PeopleRevenueRangeLow": "0",
    "PeopleRevenueRangeHigh": "max",
    "PeopleAssetsRangeLow": "0",
    "PeopleAssetsRangeHigh": "max",
    "Sort": ""
}
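Although the browser's dev tools display this payload as JSON, it is actually sent as url-encoded form data (hence the application/x-www-form-urlencoded Content-Type set in the spider below). As a quick illustration of the wire format, encoding a couple of these fields with the standard library gives:

from urllib.parse import urlencode

# A dict of form fields becomes a query-string-style request body.
print(urlencode({"State": "Alaska", "CurrentPage": "1"}))
# -> State=Alaska&CurrentPage=1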
The problem is that there is no form, only inputs, and I don't know how to handle that. I'm using scrapy.http.Request to send the POST request. When I run my spider, it returns the site's HTML, but with no state selected.
My spider:
import urllib.parse

import scrapy
from scrapy.http import Request
from scrapy.utils.response import open_in_browser


class NonprofitSpider(scrapy.Spider):
    name = 'nonprofit'

    def parse(self, response):
        url = 'https://www.guidestar.org/search'  # or maybe 'https://www.guidestar.org/search/SubmitSearch'?
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }
        return Request(
            url=url,
            method='POST',
            headers=headers,
            body=urllib.parse.urlencode(data),
            callback=self.start
        )

    def start(self, response):
        print('response', response)
        open_in_browser(response)
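As an aside, Scrapy can send such a POST without any <form> element on the page: FormRequest (despite its name) just url-encodes a formdata dict and sets the form Content-Type header itself. A minimal sketch with a trimmed-down payload:

from scrapy import FormRequest

# Trimmed version of the payload above, only to show the call shape.
data = {"CurrentPage": "1", "SearchType": "org", "State": "Alaska"}

# No <form> needed: FormRequest url-encodes `formdata` and sets the
# application/x-www-form-urlencoded header automatically.
request = FormRequest(
    url='https://www.guidestar.org/search/SubmitSearch',
    formdata=data,
)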
After recreating the request, you can parse the data from the JSON response:
import urllib.parse

import scrapy
from scrapy.http import Request


class NonprofitSpider(scrapy.Spider):
    name = 'nonprofit'
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):
        url = 'https://www.guidestar.org/search/SubmitSearch'
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }
        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }
        return Request(
            url=url,
            method='POST',
            headers=headers,
            body=urllib.parse.urlencode(data),
            callback=self.start
        )

    def start(self, response):
        print('response', response)
        json_data = response.json()
        for element in json_data['Hits']:
            yield {
                'OrgName': element['OrgName'],
                'Ein': element['Ein'],
                # ... and so on
            }
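(Note: response.json() requires Scrapy 2.2+; on older versions, json.loads(response.text) does the same.) Since the payload carries a CurrentPage field, the remaining result pages can be fetched by resubmitting the same body with the page number bumped. A sketch of a hypothetical helper for that, assuming the endpoint simply returns an empty Hits list past the last page:

import urllib.parse

from scrapy.http import Request

SEARCH_URL = 'https://www.guidestar.org/search/SubmitSearch'
HEADERS = {'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}

def next_page_request(data, page, callback):
    # Re-send the same form body with CurrentPage bumped to `page`.
    payload = dict(data, CurrentPage=str(page))
    return Request(
        url=SEARCH_URL,
        method='POST',
        headers=HEADERS,
        body=urllib.parse.urlencode(payload),
        callback=callback,
        cb_kwargs={'data': data, 'page': page},
    )

The callback would then yield its items and, while Hits is non-empty, yield next_page_request(data, page + 1, self.start) to queue the following page.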