如何在 Scrapy 中没有表单的情况下从输入中获取数据

How can I get data from input without a form in Scrapy

我正在用 Python Scrapy 写一个网络爬虫,它应该从该网站(原文为链接)获取输入数据。

当我在网站左侧选择一个州时,它会发送一个 POST 请求。 POST 请求(选择的州:“阿拉斯加”):

{
    "CurrentPage": "1",
    "SearchType": "org",
    "GroupExemption": "",
    "AffiliateOrgName": "",
    "RelatedOrgName": "",
    "RelatedOrgEin": "",
    "RelationType": "",
    "RelatedOrgs": "",
    "SelectedCityNav[]": "",
    "SelectedCountyNav[]": "",
    "Eins": "",
    "ul": "",
    "PCSSubjectCodes[]": "",
    "PeoplePCSSubjectCodes[]": "",
    "PCSPopulationCodes[]": "",
    "AutoSelectTaxonomyFacet": "",
    "AutoSelectTaxonomyText": "",
    "Keywords": "",
    "State": "Alaska",
    "City": "",
    "PeopleZip": "",
    "PeopleZipRadius": "Zip+Only",
    "PeopleCity": "",
    "PeopleRevenueRangeLow": "0",
    "PeopleRevenueRangeHigh": "max",
    "PeopleAssetsRangeLow": "0",
    "PeopleAssetsRangeHigh": "max",
    "Sort": ""
}

问题是页面上没有表单(form),只有输入框(input),我不知道如何处理。我正在使用 scrapy.http.Request 发送 POST 请求。当我运行我的爬虫时,它返回了网站的 HTML,但其中没有选中任何州。我的爬虫代码如下:

import urllib
import scrapy
from scrapy.http import Request
from scrapy.utils.response import open_in_browser


class NonprofitSpider(scrapy.Spider):
    """Replicate the POST that guidestar.org sends when a state is
    selected in the left-hand search panel.

    NOTE(review): the original snippet defined neither start_urls nor
    start_requests(), so the engine had nothing to schedule and parse()
    was never invoked — a start_urls entry is added so the spider runs.
    """

    name = 'nonprofit'
    # Without an initial URL, Scrapy never calls parse().
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):
        """Issue the state-filter search POST and hand the reply to start()."""

        url = 'https://www.guidestar.org/search' # or maybe 'https://www.guidestar.org/search/SubmitSearch'?

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

        # Payload copied from the browser's POST for State=Alaska.
        # "[=11=]" in the original text was a markup placeholder that
        # swallowed the literal value "0" — restored here.
        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }

        # FormRequest url-encodes the dict itself and defaults to POST,
        # so there is no need to build the body by hand with
        # urllib.parse.urlencode().
        return scrapy.FormRequest(
            url=url,
            headers=headers,
            formdata=data,
            callback=self.start
        )

    def start(self, response):
        # NOTE(review): Scrapy >= 2.13 reserves Spider.start() as the new
        # entry point; consider renaming this callback — verify against
        # the project's Scrapy version.
        print('response', response)
        open_in_browser(response)

重新创建请求后,您可以解析 json 文件中的数据

import urllib
import scrapy
from scrapy.http import Request


class NonprofitSpider(scrapy.Spider):
    """Replicate the site's state-selection POST, then read the JSON reply.

    /search/SubmitSearch answers with JSON, so the results can be parsed
    with response.json() instead of scraping HTML.
    """

    name = 'nonprofit'
    start_urls = ['https://www.guidestar.org/search']

    def parse(self, response):
        """POST the Alaska search and route the JSON reply to start()."""
        url = 'https://www.guidestar.org/search/SubmitSearch'

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

        # Payload copied from the browser devtools for State=Alaska.
        # "[=10=]" in the original text was a markup placeholder that
        # swallowed the literal value "0" — restored here.
        data = {
            "CurrentPage": "1",
            "SearchType": "org",
            "GroupExemption": "",
            "AffiliateOrgName": "",
            "RelatedOrgName": "",
            "RelatedOrgEin": "",
            "RelationType": "",
            "RelatedOrgs": "",
            "SelectedCityNav[]": "",
            "SelectedCountyNav[]": "",
            "Eins": "",
            "ul": "",
            "PCSSubjectCodes[]": "",
            "PeoplePCSSubjectCodes[]": "",
            "PCSPopulationCodes[]": "",
            "AutoSelectTaxonomyFacet": "",
            "AutoSelectTaxonomyText": "",
            "Keywords": "",
            "State": "Alaska",
            "City": "",
            "PeopleZip": "",
            "PeopleZipRadius": "Zip+Only",
            "PeopleCity": "",
            "PeopleRevenueRangeLow": "0",
            "PeopleRevenueRangeHigh": "max",
            "PeopleAssetsRangeLow": "0",
            "PeopleAssetsRangeHigh": "max",
            "Sort": ""
        }

        # FormRequest url-encodes the payload itself and defaults to
        # POST; no manual urllib.parse.urlencode() body is needed.
        return scrapy.FormRequest(
            url=url,
            headers=headers,
            formdata=data,
            callback=self.start
        )

    def start(self, response):
        # NOTE(review): Scrapy >= 2.13 reserves Spider.start() as the new
        # entry point; consider renaming this callback — verify against
        # the project's Scrapy version.
        print('response', response)
        json_data = response.json()
        for element in json_data['Hits']:
            OrgName = element['OrgName']
            Ein = element['Ein']
            # ...
            # ...
            # ...
            # ...
            # and so on