XML 使用动态密钥抓取网站
XML web scraping a website with dynamic key
我一直通过 Excel 使用 IE 抓取此站点,但最近 IE 的速度既不稳定又很慢。我的列表通常有 500 到 1000 条左右,所以我必须通宵运行宏。最近宏开始挂起,因此我决定第一次尝试使用 MSXML2。
该网站不需要身份验证,但页面中带有动态变化的隐藏输入字段。
我的做法是:先用 GET 请求获取页面并提取动态密钥,然后尝试用 POST 把输入数据发送到站点。但我一直收到服务器错误/运行时错误。我尝试过不同的请求头(header)选项,但仍然得不到结果页面。我也尝试过使用 MSXML2.ServerXMLHTTP。我的方向对吗?
' Fetches the real-estate-tax search page with GET, pulls two hidden input values
' out of the returned HTML, POSTs them back together with a hard-coded BRT number,
' and writes the POST response into a new Internet Explorer window.
' Early-bound: "New MSXML2.XMLHTTP" and "MSHTML.HTMLDocument" need project references
' to the corresponding type libraries.
Sub test_66()
' Declared without a type, so this is a Variant holding the XMLHTTP object.
Dim oXML_get
'Dim oXML_post
' NOTE(review): n1 and postUrl are declared here but never used below.
Dim sendText As String, s2 As String, n1 As Integer, postUrl As String, sHTML As String, s1 As String
' Instantiate MSXML2
Set oXML_get = New MSXML2.XMLHTTP
' Third argument False = synchronous request; send blocks until the response arrives.
oXML_get.Open "GET", "http://www.phila.gov/revenue/realestatetax/default.aspx", False
oXML_get.setRequestHeader "Accept", "text/html;charset=UTF-8"
oXML_get.setRequestHeader "Accept-Encoding", "identity"
oXML_get.setRequestHeader "Accept-Charset", "UTF-8" 'Connection keep -alive
' NOTE(review): the value below contains a space ("keep -alive"); the usual token is "keep-alive".
oXML_get.setRequestHeader "Connection", "keep -alive"
oXML_get.send
sHTML = oXML_get.responseText
'Debug.Print sHTML
' Load the response markup into an in-memory HTML document so it can be queried via the DOM.
Dim hDOC As MSHTML.HTMLDocument
Set hDOC = New MSHTML.HTMLDocument
hDOC.body.innerHTML = sHTML
' Takes the <input> elements at zero-based positions 2 and 3 - presumably __VIEWSTATE and
' __EVENTVALIDATION, judging by how s1/s2 are used below; TODO confirm against the page.
' Only "/" is percent-encoded here; any other reserved character in the values
' (e.g. "+" or "=") is sent as-is.
s1 = Replace(hDOC.getElementsByTagName("input").Item(2).Value, "/", "%2F")
s2 = Replace(hDOC.getElementsByTagName("input").Item(3).Value, "/", "%2F")
' Builds the form body from four fields: the two hidden values, the BRT number text box
' (hard-coded 043185500) and the submit button.
' NOTE(review): the button value ends with a raw ">>" that is not percent-encoded.
sendText = "__VIEWSTATE=" & s1 & "&__EVENTVALIDATION=" & s2 & "&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=043185500&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=%20>>"
Debug.Print sendText '"__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=" & s1 & "__EVENTVALIDATION=" & s2 &
' The same XMLHTTP object is re-opened for the POST to the same URL.
oXML_get.Open "POST", "http://www.phila.gov/revenue/realestatetax/default.aspx", False
oXML_get.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
oXML_get.setRequestHeader "Accept", "text/html;charset=UTF-8"
oXML_get.setRequestHeader "Accept-Encoding", "identity"
oXML_get.setRequestHeader "Accept-Charset", "UTF-8" 'Connection keep -alive
'oXML_get.setRequestHeader "Connection", "keep -alive"
oXML_get.send (sendText)
' Show the raw POST response in a visible IE window.
' NOTE(review): document.Write is called right after navigate with no wait for the
' browser to finish loading about:blank - confirm the document is ready at that point.
Dim objIE As Object: Set objIE = CreateObject("InternetExplorer.Application")
objIE.navigate "about:blank"
objIE.Visible = True
objIE.document.Write oXML_get.responseText
End Sub
这是我收到的运行时错误消息....
Server Error in '/revenue/RealEstateTax' Application.
<!-- Web.Config Configuration File -->
<configuration>
<system.web>
<customErrors mode="Off"/>
</system.web>
</configuration>
我已经通过 Firefox 网页上的 Web 表单提交了相同的搜索请求。之后我打开开发者工具 F12,网络选项卡,点击最后一个 POST 请求,打开参数部分,这里是已提交参数的屏幕截图:
原始表格数据:
__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwULLTEyNDQ4MDU4OTkPZBYCZg9kFgICAw9kFgICDQ9kFgYCAQ9kFgICAw9kFgICAQ8QZBAVARUxNzAwIFNQUklORyBHQVJERU4gU1QVARUxNzAwIFNQUklORyBHQVJERU4gU1QUKwMBZxYBZmQCBQ8PFgIeBFRleHQFHFBsZWFzZSBhZGQgYWRkcmVzcyB0byBsb29rdXBkZAINDw8WAh4HVmlzaWJsZWhkFgoCAQ88KwAKAQAPFgQeC18hRGF0YUJvdW5kZx4LXyFJdGVtQ291bnRmZGQCAw9kFgICBQ8PFgIeF0VuYWJsZUFqYXhTa2luUmVuZGVyaW5naGRkAgUPFCsAAg8WAh8EaGQQFgJmAgEWAg8WBB4LTmF2aWdhdGVVcmwFJC4uL0ZlZWRiYWNrRm9ybS5hc3B4P0JydE5vPTc3MjUzNDcwMB8EaGQPFgQfBQUdfi9QREZzL1BheW1lbnRfQWdyZWVtZW50cy5wZGYfBGhkDxYCZmYWAQVxVGVsZXJpay5XZWIuVUkuUmFkV2luZG93LCBUZWxlcmlrLldlYi5VSSwgVmVyc2lvbj0yMDEwLjEuNTE5LjQwLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPTEyMWZhZTc4MTY1YmEzZDQWBGYPDxYEHwUFJC4uL0ZlZWRiYWNrRm9ybS5hc3B4P0JydE5vPTc3MjUzNDcwMB8EaGRkAgEPDxYEHwUFHX4vUERGcy9QYXltZW50X0FncmVlbWVudHMucGRmHwRoZGQCBw88KwARAgAPFgQfAmcfA2ZkARAWABYAFgBkAgkPFgIeBXZhbHVlBQk3NzI1MzQ3MDBkGAIFQWN0bDAwJEJvZHlDb250ZW50UGxhY2VIb2xkZXIkR2V0VGF4SW5mb0NvbnRyb2wkZ3JkUGF5bWVudHNIaXN0b3J5DzwrAAwBCGZkBTJjdGwwMCRCb2R5Q29udGVudFBsYWNlSG9sZGVyJEdldFRheEluZm9Db250cm9sJGZybQ9nZD9K5t7genscvOsiNrdPkxL0VHWCYSsS%2FK3EZTRu3h3w&__EVENTVALIDATION=%2FwEWBQKkrNCPCgLRzsWTBwLlpIbACAKV6q2KDQKIvdHyCawQaHbBYSHV%2B%2FVvyLUTUY%2BhSsmbpTvj0W4ycfOa1RCO&ctl00%24BodyContentPlaceHolder%24SearchByAddressControl%24txtLookup=by+Property+Address&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=043185500&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=+%3E%3E
注意共有 7 个参数,它们都应进行 URL 编码。我对您的代码稍作修改,并添加了一些请求头。以下代码在我这里可以正常运行:
Option Explicit
' Submits one BRT-number search to the real-estate-tax page without driving a browser:
'   1. GET the page and read the two hidden fields (__VIEWSTATE, __EVENTVALIDATION)
'      whose values change on every load.
'   2. POST all seven form fields, URL-encoded, back to the same URL.
'   3. Show the returned HTML in an Internet Explorer window.
' Raises a run-time error (vbObjectError + 513/514) when the GET does not return
' HTTP 200 or when a hidden field cannot be found in the page.
Sub test_66()
    Const PAGE_URL As String = "http://www.phila.gov/revenue/realestatetax/default.aspx"
    Const READYSTATE_COMPLETE As Long = 4

    Dim s1 As String
    Dim s2 As String
    Dim sResp As String
    Dim sBRTNumber As String
    Dim sFormData As String

    ' Step 1: fetch the page synchronously (third Open argument = False).
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", PAGE_URL, False
        .setRequestHeader "Accept", "text/html;charset=UTF-8"
        .setRequestHeader "Accept-Encoding", "identity"
        .setRequestHeader "Accept-Charset", "UTF-8"
        .setRequestHeader "Connection", "keep-alive"
        .send
        ' Fail early with a readable message instead of a later "Subscript out of range".
        If .Status <> 200 Then
            Err.Raise vbObjectError + 513, "test_66", _
                "GET " & PAGE_URL & " failed: HTTP " & .Status & " " & .statusText
        End If
        sResp = .responseText
    End With

    ' Step 2: extract the dynamic hidden values and percent-encode them in full
    ' (they contain characters such as "/", "+" and "=").
    s1 = EncodeUriComponent(GetHiddenFieldValue(sResp, "__VIEWSTATE"))
    s2 = EncodeUriComponent(GetHiddenFieldValue(sResp, "__EVENTVALIDATION"))

    sBRTNumber = "043185500"
    ' All seven fields the form submits, already URL-encoded, joined with "&".
    sFormData = Join(Array( _
        "__EVENTTARGET=", _
        "__EVENTARGUMENT=", _
        "__VIEWSTATE=" & s1, _
        "__EVENTVALIDATION=" & s2, _
        "ctl00%24BodyContentPlaceHolder%24SearchByAddressControl%24txtLookup=by+Property+Address", _
        "ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=" & sBRTNumber, _
        "ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=+%3E%3E" _
        ), "&")

    ' Step 3: post the form back to the same URL.
    With CreateObject("MSXML2.XMLHTTP")
        .Open "POST", PAGE_URL, False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .setRequestHeader "Accept", "text/html;charset=UTF-8"
        .setRequestHeader "Accept-Encoding", "identity"
        .setRequestHeader "Accept-Charset", "UTF-8"
        .setRequestHeader "Connection", "keep-alive"
        .setRequestHeader "Host", "www.phila.gov"
        .setRequestHeader "Origin", "http://www.phila.gov"
        .setRequestHeader "Referer", PAGE_URL
        .send (sFormData)
        ' The response is still displayed on failure (a server error page is useful
        ' for debugging), but the status is logged so the failure is not silent.
        If .Status <> 200 Then
            Debug.Print "POST returned HTTP " & .Status & " " & .statusText
        End If
        sResp = .responseText
    End With

    ' Step 4: display the result.
    With CreateObject("InternetExplorer.Application")
        .navigate "about:blank"
        ' navigate is asynchronous: wait until the blank page has loaded before
        ' touching .document, otherwise the write can fail or be discarded.
        Do While .Busy Or .readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop
        .Visible = True
        .document.write sResp
        ' Close the document stream so the browser finishes rendering.
        .document.Close
    End With
End Sub

' Returns the text of the value attribute that follows id="<sFieldId>" value=" in sHtml
' (everything up to the next double quote). The match is case-sensitive and expects
' the id and value attributes to be adjacent, exactly as written in the marker.
' Raises vbObjectError + 514 when the marker is not present in sHtml.
Private Function GetHiddenFieldValue(ByVal sHtml As String, ByVal sFieldId As String) As String
    Dim sMarker As String
    Dim aParts As Variant

    sMarker = "id=""" & sFieldId & """ value="""
    aParts = Split(sHtml, sMarker, 2)
    ' Split yields a single element when the marker is absent.
    If UBound(aParts) < 1 Then
        Err.Raise vbObjectError + 514, "GetHiddenFieldValue", _
            "Hidden field '" & sFieldId & "' was not found in the page."
    End If
    ' Keep only the part before the closing quote of the value attribute.
    GetHiddenFieldValue = Split(aParts(1), """", 2)(0)
End Function
' Percent-encodes strText by delegating to JScript's encodeURIComponent.
' A hidden "htmlfile" document hosts a one-line JScript wrapper named encode;
' the document is created on the first call and reused afterwards.
Function EncodeUriComponent(strText As String) As String
    ' Static: keeps its value between calls, so the script host is built only once.
    Static oScriptDoc As Object

    If oScriptDoc Is Nothing Then
        Set oScriptDoc = CreateObject("htmlfile")
        With oScriptDoc.parentWindow
            ' Define the wrapper function inside the document's script engine.
            .execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
        End With
    End If

    With oScriptDoc.parentWindow
        EncodeUriComponent = .encode(strText)
    End With
End Function
这是 IE 窗口的输出:
我一直通过 Excel 使用 IE 抓取此站点,但最近 IE 的速度既不稳定又很慢。我的列表通常有 500 到 1000 条左右,所以我必须通宵运行宏。最近宏开始挂起,因此我决定第一次尝试使用 MSXML2。
该网站不需要身份验证,但页面中带有动态变化的隐藏输入字段。
我的做法是:先用 GET 请求获取页面并提取动态密钥,然后尝试用 POST 把输入数据发送到站点。但我一直收到服务器错误/运行时错误。我尝试过不同的请求头(header)选项,但仍然得不到结果页面。我也尝试过使用 MSXML2.ServerXMLHTTP。我的方向对吗?
' Fetches the real-estate-tax search page with GET, pulls two hidden input values
' out of the returned HTML, POSTs them back together with a hard-coded BRT number,
' and writes the POST response into a new Internet Explorer window.
' Early-bound: "New MSXML2.XMLHTTP" and "MSHTML.HTMLDocument" need project references
' to the corresponding type libraries.
Sub test_66()
' Declared without a type, so this is a Variant holding the XMLHTTP object.
Dim oXML_get
'Dim oXML_post
' NOTE(review): n1 and postUrl are declared here but never used below.
Dim sendText As String, s2 As String, n1 As Integer, postUrl As String, sHTML As String, s1 As String
' Instantiate MSXML2
Set oXML_get = New MSXML2.XMLHTTP
' Third argument False = synchronous request; send blocks until the response arrives.
oXML_get.Open "GET", "http://www.phila.gov/revenue/realestatetax/default.aspx", False
oXML_get.setRequestHeader "Accept", "text/html;charset=UTF-8"
oXML_get.setRequestHeader "Accept-Encoding", "identity"
oXML_get.setRequestHeader "Accept-Charset", "UTF-8" 'Connection keep -alive
' NOTE(review): the value below contains a space ("keep -alive"); the usual token is "keep-alive".
oXML_get.setRequestHeader "Connection", "keep -alive"
oXML_get.send
sHTML = oXML_get.responseText
'Debug.Print sHTML
' Load the response markup into an in-memory HTML document so it can be queried via the DOM.
Dim hDOC As MSHTML.HTMLDocument
Set hDOC = New MSHTML.HTMLDocument
hDOC.body.innerHTML = sHTML
' Takes the <input> elements at zero-based positions 2 and 3 - presumably __VIEWSTATE and
' __EVENTVALIDATION, judging by how s1/s2 are used below; TODO confirm against the page.
' Only "/" is percent-encoded here; any other reserved character in the values
' (e.g. "+" or "=") is sent as-is.
s1 = Replace(hDOC.getElementsByTagName("input").Item(2).Value, "/", "%2F")
s2 = Replace(hDOC.getElementsByTagName("input").Item(3).Value, "/", "%2F")
' Builds the form body from four fields: the two hidden values, the BRT number text box
' (hard-coded 043185500) and the submit button.
' NOTE(review): the button value ends with a raw ">>" that is not percent-encoded.
sendText = "__VIEWSTATE=" & s1 & "&__EVENTVALIDATION=" & s2 & "&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=043185500&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=%20>>"
Debug.Print sendText '"__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=" & s1 & "__EVENTVALIDATION=" & s2 &
' The same XMLHTTP object is re-opened for the POST to the same URL.
oXML_get.Open "POST", "http://www.phila.gov/revenue/realestatetax/default.aspx", False
oXML_get.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
oXML_get.setRequestHeader "Accept", "text/html;charset=UTF-8"
oXML_get.setRequestHeader "Accept-Encoding", "identity"
oXML_get.setRequestHeader "Accept-Charset", "UTF-8" 'Connection keep -alive
'oXML_get.setRequestHeader "Connection", "keep -alive"
oXML_get.send (sendText)
' Show the raw POST response in a visible IE window.
' NOTE(review): document.Write is called right after navigate with no wait for the
' browser to finish loading about:blank - confirm the document is ready at that point.
Dim objIE As Object: Set objIE = CreateObject("InternetExplorer.Application")
objIE.navigate "about:blank"
objIE.Visible = True
objIE.document.Write oXML_get.responseText
End Sub
这是我收到的运行时错误消息....
Server Error in '/revenue/RealEstateTax' Application.
<!-- Web.Config Configuration File -->
<configuration>
<system.web>
<customErrors mode="Off"/>
</system.web>
</configuration>
我已经通过 Firefox 网页上的 Web 表单提交了相同的搜索请求。之后我打开开发者工具 F12,网络选项卡,点击最后一个 POST 请求,打开参数部分,这里是已提交参数的屏幕截图:
原始表格数据:
__EVENTTARGET=&__EVENTARGUMENT=&__VIEWSTATE=%2FwEPDwULLTEyNDQ4MDU4OTkPZBYCZg9kFgICAw9kFgICDQ9kFgYCAQ9kFgICAw9kFgICAQ8QZBAVARUxNzAwIFNQUklORyBHQVJERU4gU1QVARUxNzAwIFNQUklORyBHQVJERU4gU1QUKwMBZxYBZmQCBQ8PFgIeBFRleHQFHFBsZWFzZSBhZGQgYWRkcmVzcyB0byBsb29rdXBkZAINDw8WAh4HVmlzaWJsZWhkFgoCAQ88KwAKAQAPFgQeC18hRGF0YUJvdW5kZx4LXyFJdGVtQ291bnRmZGQCAw9kFgICBQ8PFgIeF0VuYWJsZUFqYXhTa2luUmVuZGVyaW5naGRkAgUPFCsAAg8WAh8EaGQQFgJmAgEWAg8WBB4LTmF2aWdhdGVVcmwFJC4uL0ZlZWRiYWNrRm9ybS5hc3B4P0JydE5vPTc3MjUzNDcwMB8EaGQPFgQfBQUdfi9QREZzL1BheW1lbnRfQWdyZWVtZW50cy5wZGYfBGhkDxYCZmYWAQVxVGVsZXJpay5XZWIuVUkuUmFkV2luZG93LCBUZWxlcmlrLldlYi5VSSwgVmVyc2lvbj0yMDEwLjEuNTE5LjQwLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPTEyMWZhZTc4MTY1YmEzZDQWBGYPDxYEHwUFJC4uL0ZlZWRiYWNrRm9ybS5hc3B4P0JydE5vPTc3MjUzNDcwMB8EaGRkAgEPDxYEHwUFHX4vUERGcy9QYXltZW50X0FncmVlbWVudHMucGRmHwRoZGQCBw88KwARAgAPFgQfAmcfA2ZkARAWABYAFgBkAgkPFgIeBXZhbHVlBQk3NzI1MzQ3MDBkGAIFQWN0bDAwJEJvZHlDb250ZW50UGxhY2VIb2xkZXIkR2V0VGF4SW5mb0NvbnRyb2wkZ3JkUGF5bWVudHNIaXN0b3J5DzwrAAwBCGZkBTJjdGwwMCRCb2R5Q29udGVudFBsYWNlSG9sZGVyJEdldFRheEluZm9Db250cm9sJGZybQ9nZD9K5t7genscvOsiNrdPkxL0VHWCYSsS%2FK3EZTRu3h3w&__EVENTVALIDATION=%2FwEWBQKkrNCPCgLRzsWTBwLlpIbACAKV6q2KDQKIvdHyCawQaHbBYSHV%2B%2FVvyLUTUY%2BhSsmbpTvj0W4ycfOa1RCO&ctl00%24BodyContentPlaceHolder%24SearchByAddressControl%24txtLookup=by+Property+Address&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=043185500&ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=+%3E%3E
注意共有 7 个参数,它们都应进行 URL 编码。我对您的代码稍作修改,并添加了一些请求头。以下代码在我这里可以正常运行:
Option Explicit
' Submits one BRT-number search to the real-estate-tax page without driving a browser:
'   1. GET the page and read the two hidden fields (__VIEWSTATE, __EVENTVALIDATION)
'      whose values change on every load.
'   2. POST all seven form fields, URL-encoded, back to the same URL.
'   3. Show the returned HTML in an Internet Explorer window.
' Raises a run-time error (vbObjectError + 513/514) when the GET does not return
' HTTP 200 or when a hidden field cannot be found in the page.
Sub test_66()
    Const PAGE_URL As String = "http://www.phila.gov/revenue/realestatetax/default.aspx"
    Const READYSTATE_COMPLETE As Long = 4

    Dim s1 As String
    Dim s2 As String
    Dim sResp As String
    Dim sBRTNumber As String
    Dim sFormData As String

    ' Step 1: fetch the page synchronously (third Open argument = False).
    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", PAGE_URL, False
        .setRequestHeader "Accept", "text/html;charset=UTF-8"
        .setRequestHeader "Accept-Encoding", "identity"
        .setRequestHeader "Accept-Charset", "UTF-8"
        .setRequestHeader "Connection", "keep-alive"
        .send
        ' Fail early with a readable message instead of a later "Subscript out of range".
        If .Status <> 200 Then
            Err.Raise vbObjectError + 513, "test_66", _
                "GET " & PAGE_URL & " failed: HTTP " & .Status & " " & .statusText
        End If
        sResp = .responseText
    End With

    ' Step 2: extract the dynamic hidden values and percent-encode them in full
    ' (they contain characters such as "/", "+" and "=").
    s1 = EncodeUriComponent(GetHiddenFieldValue(sResp, "__VIEWSTATE"))
    s2 = EncodeUriComponent(GetHiddenFieldValue(sResp, "__EVENTVALIDATION"))

    sBRTNumber = "043185500"
    ' All seven fields the form submits, already URL-encoded, joined with "&".
    sFormData = Join(Array( _
        "__EVENTTARGET=", _
        "__EVENTARGUMENT=", _
        "__VIEWSTATE=" & s1, _
        "__EVENTVALIDATION=" & s2, _
        "ctl00%24BodyContentPlaceHolder%24SearchByAddressControl%24txtLookup=by+Property+Address", _
        "ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24txtTaxInfo=" & sBRTNumber, _
        "ctl00%24BodyContentPlaceHolder%24SearchByBRTControl%24btnTaxByBRT=+%3E%3E" _
        ), "&")

    ' Step 3: post the form back to the same URL.
    With CreateObject("MSXML2.XMLHTTP")
        .Open "POST", PAGE_URL, False
        .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
        .setRequestHeader "Accept", "text/html;charset=UTF-8"
        .setRequestHeader "Accept-Encoding", "identity"
        .setRequestHeader "Accept-Charset", "UTF-8"
        .setRequestHeader "Connection", "keep-alive"
        .setRequestHeader "Host", "www.phila.gov"
        .setRequestHeader "Origin", "http://www.phila.gov"
        .setRequestHeader "Referer", PAGE_URL
        .send (sFormData)
        ' The response is still displayed on failure (a server error page is useful
        ' for debugging), but the status is logged so the failure is not silent.
        If .Status <> 200 Then
            Debug.Print "POST returned HTTP " & .Status & " " & .statusText
        End If
        sResp = .responseText
    End With

    ' Step 4: display the result.
    With CreateObject("InternetExplorer.Application")
        .navigate "about:blank"
        ' navigate is asynchronous: wait until the blank page has loaded before
        ' touching .document, otherwise the write can fail or be discarded.
        Do While .Busy Or .readyState <> READYSTATE_COMPLETE
            DoEvents
        Loop
        .Visible = True
        .document.write sResp
        ' Close the document stream so the browser finishes rendering.
        .document.Close
    End With
End Sub

' Returns the text of the value attribute that follows id="<sFieldId>" value=" in sHtml
' (everything up to the next double quote). The match is case-sensitive and expects
' the id and value attributes to be adjacent, exactly as written in the marker.
' Raises vbObjectError + 514 when the marker is not present in sHtml.
Private Function GetHiddenFieldValue(ByVal sHtml As String, ByVal sFieldId As String) As String
    Dim sMarker As String
    Dim aParts As Variant

    sMarker = "id=""" & sFieldId & """ value="""
    aParts = Split(sHtml, sMarker, 2)
    ' Split yields a single element when the marker is absent.
    If UBound(aParts) < 1 Then
        Err.Raise vbObjectError + 514, "GetHiddenFieldValue", _
            "Hidden field '" & sFieldId & "' was not found in the page."
    End If
    ' Keep only the part before the closing quote of the value attribute.
    GetHiddenFieldValue = Split(aParts(1), """", 2)(0)
End Function
' Percent-encodes strText by delegating to JScript's encodeURIComponent.
' A hidden "htmlfile" document hosts a one-line JScript wrapper named encode;
' the document is created on the first call and reused afterwards.
Function EncodeUriComponent(strText As String) As String
    ' Static: keeps its value between calls, so the script host is built only once.
    Static oScriptDoc As Object

    If oScriptDoc Is Nothing Then
        Set oScriptDoc = CreateObject("htmlfile")
        With oScriptDoc.parentWindow
            ' Define the wrapper function inside the document's script engine.
            .execScript "function encode(s) {return encodeURIComponent(s)}", "jscript"
        End With
    End If

    With oScriptDoc.parentWindow
        EncodeUriComponent = .encode(strText)
    End With
End Function
这是 IE 窗口的输出: