Scrapy & ASPX site - why does it only loop over first page?
I'm trying to extract data from this site: https://inform.alabama.gov/employeesearch.aspx. Currently, when I run this code, it just pulls data from "Page$1" over and over; the loop never actually advances the query. This is my first script in Python and Scrapy, so I'm sure I'm missing something basic here...
What is the correct way to take the response from one request and pass data extracted from it into the next request?
The site also has no "Next" button, so I'm using the num_pages variable to set the maximum. Open to suggestions on how to make that dynamic.
from scrapy import FormRequest, Spider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError


class EmployeesSpider(Spider):
    name = 'employees'
    start_urls = ['https://inform.alabama.gov/employeesearch.aspx']
    num_pages = 5  # 661

    def parse(self, response):
        for i in range(1, self.num_pages):
            yield FormRequest(
                url='https://inform.alabama.gov/employeesearch.aspx',
                method="POST",
                dont_filter=True,
                headers={
                    "authority": "inform.alabama.gov",
                    "cache-control": "max-age=0",
                    "upgrade-insecure-requests": "1",
                    "origin": "https://inform.alabama.gov",
                    "content-type": "application/x-www-form-urlencoded",
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
                    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                    "sec-fetch-site": "same-origin",
                    "sec-fetch-mode": "navigate",
                    "sec-fetch-user": "?1",
                    "sec-fetch-dest": "document",
                    "referer": "https://inform.alabama.gov/employeesearch.aspx",
                    "accept-language": "en-US,en;q=0.9"
                },
                formdata=self.get_formdata(response, i),
                callback=self.parse_results,
                errback=self.failure)

    def get_formdata(self, response, page_num):
        eventargument = 'Page%24' + str(page_num)
        viewstate = response.css('input#__VIEWSTATE::attr(value)').get()
        if viewstate is None:
            viewstate = ''
        viewstategen = response.css('input#__VIEWSTATEGENERATOR::attr(value)').get()
        if viewstategen is None:
            viewstategen = ''
        eventvalidation = response.css('input#__EVENTVALIDATION::attr(value)').get()
        if eventvalidation is None:
            eventvalidation = ''
        formdata = {
            '__EVENTTARGET': 'ctl00%24ContentPlaceHolder1%24GridView1',
            '__EVENTARGUMENT': eventargument,
            '__VIEWSTATE': viewstate,
            '__VIEWSTATEGENERATOR': viewstategen,
            '__EVENTVALIDATION': eventvalidation,
            'ctl00%24ContentPlaceHolder1%24txt_FirstName': '',
            'ctl00%24ContentPlaceHolder1%24txt_LastName': '',
            'ctl00%24ContentPlaceHolder1%24ddl_Agency': 'Not+Selected',
            'ctl00%24ContentPlaceHolder1%24txt_Phone': '',
        }
        return formdata

    def parse_results(self, response):
        for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
            yield {
                'name': employee.xpath('./td[1]/text()').get(),
                'email': employee.xpath('./td[1]/span/a/text()').get(),
                'org': employee.xpath('./td[2]/text()').get(),
                'phone': employee.xpath('./td[3]/span/a/text()').get(),
            }

    def failure(self, failure):
        # log all failures
        self.logger.error(repr(failure))
        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.error('DNSLookupError on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
You can pass information from one callback to the next by using cb_kwargs (or the legacy meta): https://docs.scrapy.org/en/latest/topics/request-response.html
from scrapy import Request

def start_requests(self):
    # dict_map is a placeholder for whatever mapping holds the extra
    # data you want handed to the callback
    for url in self.start_urls:
        yield Request(
            url,
            cb_kwargs={'additional_arguments': dict_map[url]}
        )

def parse(self, response, additional_arguments):
    # additional_arguments is now available alongside the response
    pass
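Applied to this spider, the same mechanism answers the "pass data extracted from one response into the next request" part of the question directly. Here is a minimal sketch (reusing get_formdata, num_pages and failure from the code above): chain the pages sequentially, so every POST is built from the fresh __VIEWSTATE / __EVENTVALIDATION values of the response just received, with the page counter carried in cb_kwargs:

def parse_results(self, response, page=1):
    # Extract the rows of the current page as before.
    for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
        yield {
            'name': employee.xpath('./td[1]/text()').get(),
            'email': employee.xpath('./td[1]/span/a/text()').get(),
            'org': employee.xpath('./td[2]/text()').get(),
            'phone': employee.xpath('./td[3]/span/a/text()').get(),
        }
    # Build the next POST from *this* response, so the hidden form
    # fields are always current, and pass the counter along.
    if page < self.num_pages:
        yield FormRequest(
            url=response.url,
            method="POST",
            dont_filter=True,
            formdata=self.get_formdata(response, page + 1),
            callback=self.parse_results,
            cb_kwargs={'page': page + 1},
            errback=self.failure)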
You need to send the search form request first (__EVENTTARGET=ctl00$ContentPlaceHolder1$btn_Search) and only then iterate over the result pages (__EVENTTARGET=ctl00$ContentPlaceHolder1$GridView1):
def parse(self, response):
    # Submit the search form itself first; only after that does the
    # grid contain results that can be paged.
    formdata = self.get_formdata(response, 0)
    formdata['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$btn_Search'
    formdata['__EVENTARGUMENT'] = ''
    yield FormRequest(
        url='https://inform.alabama.gov/employeesearch.aspx',
        method="POST",
        dont_filter=True,
        formdata=formdata,
        callback=self.perform_search,
        errback=self.failure)

def perform_search(self, response):
    # Page 1 of the results is already in this response.
    for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
        yield {
            'name': employee.xpath('./td[1]/text()').get(),
            'email': employee.xpath('./td[1]/span/a/text()').get(),
            'org': employee.xpath('./td[2]/text()').get(),
            'phone': employee.xpath('./td[3]/span/a/text()').get(),
        }
    # Download search pages starting from #2
    for i in range(2, self.num_pages):
        formdata = self.get_formdata(response, i)
        yield FormRequest(
            url='https://inform.alabama.gov/employeesearch.aspx',
            method="POST",
            dont_filter=True,
            formdata=formdata,
            callback=self.parse_results,
            cb_kwargs={
                'page': i,
            },
            errback=self.failure)

def get_formdata(self, response, page_num):
    eventargument = 'Page$' + str(page_num)
    viewstate = response.css('input#__VIEWSTATE::attr(value)').get()
    if viewstate is None:
        viewstate = ''
    viewstategen = response.css('input#__VIEWSTATEGENERATOR::attr(value)').get()
    if viewstategen is None:
        viewstategen = ''
    eventvalidation = response.css('input#__EVENTVALIDATION::attr(value)').get()
    if eventvalidation is None:
        eventvalidation = ''
    formdata = {
        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$GridView1',
        '__EVENTARGUMENT': eventargument,
        '__VIEWSTATE': viewstate,
        '__VIEWSTATEGENERATOR': viewstategen,
        '__EVENTVALIDATION': eventvalidation,
        # FormRequest URL-encodes the body itself, so use the raw
        # control names ("$", not "%24") and raw values here.
        'ctl00$ContentPlaceHolder1$txt_FirstName': '',
        'ctl00$ContentPlaceHolder1$txt_LastName': '',
        'ctl00$ContentPlaceHolder1$ddl_Agency': 'Not Selected',
        'ctl00$ContentPlaceHolder1$txt_Phone': '',
    }
    return formdata

def parse_results(self, response, page):
    # with open(f'Samples/Page_{page}.htm', 'wb') as f:
    #     f.write(response.body)
    for employee in response.xpath('//*[@id="ContentPlaceHolder1_GridView1"]//tr'):
        yield {
            'name': employee.xpath('./td[1]/text()').get(),
            'email': employee.xpath('./td[1]/span/a/text()').get(),
            'org': employee.xpath('./td[2]/text()').get(),
            'phone': employee.xpath('./td[3]/span/a/text()').get(),
        }
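As for making num_pages dynamic: the grid's pager links all call __doPostBack with a "Page$N" argument, so a sketch like the following (an assumption: the pager sits in the last row of the GridView and its numeric links reach the highest visible page) can read the count off a results page instead of hard-coding 661:

def get_num_pages(self, response):
    # Collect every "Page$N" argument from the pager links and take
    # the largest N; fall back to 1 when there is no pager at all.
    pages = response.xpath(
        '//*[@id="ContentPlaceHolder1_GridView1"]//tr[last()]//a/@href'
    ).re(r'Page\$(\d+)')
    return max((int(p) for p in pages), default=1)

perform_search could then use self.get_num_pages(response) in place of self.num_pages. Be aware that ASP.NET pagers often truncate long page lists with "..." links, so on a 661-page result set this returns only the highest page number currently visible; you would re-read it as you advance through the pages.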