如何从网站的搜索栏中抓取所有可能的结果
How to scrape all possible results from a search bar of a website
这是我的第一个网页抓取任务。我的任务是抓取一个网站
这是一个包含丹麦律师姓名的网站。我的困难是我只能根据我在搜索栏中输入的特定名称查询来检索名称。有没有我可以用来抓取网站包含的所有名称的在线网络工具?到目前为止,我已经使用了 Import.io 之类的工具,但没有成功。我对所有这些是如何工作的感到非常困惑。
请向下滚动到更新 2
该网站强制您输入至少一个搜索参数,因此您可以循环遍历 Arbejdsområde
列表的所有项目,对每个项目提出请求。这是示例,展示了如何在 Excel VBA 中完成(打开 VBE,创建标准模块,粘贴代码和 运行 Test()
):
Option Explicit
Sub Test()
Dim sResponse As String
Dim oItems As Object
Dim vItem
Dim aData
Dim sContent As String
Dim lPage As Long
Dim i As Long
Dim j As Long
' Retrieve search page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
' Extract work areas items
ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$AreaSelect", oItems
oItems.Remove oItems.Keys()(0)
sContent = ""
' Process each work area item
For Each vItem In oItems.Items()
Debug.Print "Item [" & vItem & "]"
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page [" & lPage & "]"
' Retrieve result page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&a=" & vItem & "&p=" & lPage, "", "", "", sResponse
' Extract result table
ParseResponse _
"<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aData, _
False
' Store parsed table
sContent = sContent & aData(0)
Debug.Print vbTab & "Parsed " & Len(sContent)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
Next
' Extract data from the whole content
ParseResponse _
"<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
sContent, _
aData, _
False
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = GetInnerText((aData(i, j)))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
Output2DArray .Cells(1, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
Dim aHeader
'With CreateObject("MSXML2.ServerXMLHTTP")
'.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
With CreateObject("MSXML2.XMLHTTP")
.Open sMethod, sUrl, False ' , "u051772", "fy17janr"
If IsArray(aSetHeaders) Then
For Each aHeader In aSetHeaders
.SetRequestHeader aHeader(0), aHeader(1)
Next
End If
.Send (sFormData)
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub
Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
Dim aTmp0
Dim vItem
' Escape RegEx special characters
For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
sName = Replace(sName, vItem, "\" & vItem)
Next
' Extract the whole <select> for parameter
ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
' Extract each parameter <option>
ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
' Put each parameter and value into dictionary
Set oOptions = CreateObject("Scripting.Dictionary")
For Each vItem In aTmp0
oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
Next
End Sub
Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
Dim oMatch
Dim aTmp0()
Dim sSubMatch
If Not (IsArray(aData) And bAppend) Then aData = Array()
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
If oMatch.SubMatches.Count = 1 Then
PushItem aData, oMatch.SubMatches(0)
Else
aTmp0 = Array()
For Each sSubMatch In oMatch.SubMatches
PushItem aTmp0, sSubMatch
Next
PushItem aData, aTmp0
End If
Next
End With
End Sub
Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
If Not (IsArray(aData) And bAppend) Then aData = Array()
ReDim Preserve aData(UBound(aData) + 1)
aData(UBound(aData)) = vItem
End Sub
Function GetInnerText(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
GetInnerText = oDiv.innerText
End Function
Function Denestify(aRows)
Dim aData()
Dim aItems()
Dim i As Long
Dim j As Long
If UBound(aRows) = -1 Then Exit Function
ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
For j = 0 To UBound(aRows)
If IsArray(aRows(j)) Then
aItems = aRows(j)
For i = 0 To UBound(aItems)
If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
aData(j + 1, i + 1) = aItems(i)
Next
Else
aData(j + 1, 1) = aRows(j)
End If
Next
Denestify = aData
End Function
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
第一次检索所有数据需要几分钟(之后再次启动时,所有请求都从缓存中加载,这使得处理速度显着加快,从您需要的服务器获取最新数据to clean up the cache in IE settings).我的输出如下:
通常不建议将 RegEx 用于 HTML 解析,因此 there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor.
顺便说一句,还有另一个使用类似方法的答案:, , 3 and 。
更新
以上建议的抓取是基于解析由Arbejdsområde
参数过滤的搜索结果,事实证明,实际返回的结果是不准确的。那些有multiply Arbejdsområder
的律师在结果中出现multiply times,而有空Arbejdsområder
的律师根本不在结果中。
可用于此类抓取的另一个参数而不是 Arbejdsområde
是 Retskreds
。所有律师记录都包含地址,而且只有一个地址,因此结果完整且不包含重复项。请注意,一位律师可以与多个办公室相关,因此结果中将有多个记录。
有允许在循环内抓取每个条目的详细信息的代码:
Option Explicit
Sub Test()
Dim sResponse As String
Dim oItems As Object
Dim vKey
Dim sItem As String
Dim aTmp
Dim aData
Dim lPage As Long
Dim i As Long
Dim j As Long
' Retrieve search page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
' Extract Retskreds items
ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$CourtSelect", oItems
oItems.Remove oItems.Keys()(0)
i = 0
' Process each Retskreds item
For Each vKey In oItems
sItem = oItems(vKey)
Debug.Print "Area " & sItem & " " & vKey
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page " & lPage
' Retrieve results page
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&c=" & sItem & "&p=" & lPage, "", "", "", sResponse
' Extract table
ParseResponse _
"<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aTmp, _
False
' Extract data from the table
ParseResponse _
"<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
aTmp(0), _
aData, _
True
' Add Retskreds name
For i = i To UBound(aData)
aTmp = aData(i)
PushItem aTmp, vKey
aData(i) = aTmp
Next
Debug.Print vbTab & "Parsed " & UBound(aData)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
Next
' Retrieve detailed info for each entry
For i = 0 To UBound(aData)
aTmp = aData(i)
' Retrieve details page
aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
' Extract details
XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
ParseResponse _
DecodeUriComponent( _
"Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
"Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
"F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
"E-mail\: [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?" & _
"Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
sResponse, _
aTmp, _
True, _
False
aTmp(9) = StrReverse(aTmp(9))
aData(i) = aTmp
Debug.Print vbTab & "Details " & i
DoEvents
Next
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), _
Array("URL", _
"Navn", _
"Firma", _
DecodeUriComponent("Arbejdsomr%C3%A5der"), _
DecodeUriComponent("Retskreds"), _
DecodeUriComponent("Beskikkelses%C3%A5r"), _
DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
DecodeUriComponent("M%C3%B8deret for landsret"), _
DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
"E-mail", _
"Mobiltlf." _
)
Output2DArray .Cells(2, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
Dim aHeader
'With CreateObject("MSXML2.ServerXMLHTTP")
'.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
With CreateObject("MSXML2.XMLHTTP")
.Open sMethod, sUrl, False
If IsArray(aSetHeaders) Then
For Each aHeader In aSetHeaders
.SetRequestHeader aHeader(0), aHeader(1)
Next
End If
.Send (sFormData)
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub
Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
Dim aTmp0
Dim vItem
' Escape RegEx special characters
For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
sName = Replace(sName, vItem, "\" & vItem)
Next
' Extract the whole <select> for parameter
ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
' Extract each parameter <option>
ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
' Put each parameter and value into dictionary
Set oOptions = CreateObject("Scripting.Dictionary")
For Each vItem In aTmp0
oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
Next
End Sub
Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
Dim oMatch
Dim aTmp0()
Dim sSubMatch
If Not (IsArray(aData) And bAppend) Then aData = Array()
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
If oMatch.SubMatches.Count = 1 Then
PushItem aData, oMatch.SubMatches(0)
Else
If bNestSubMatches Then
aTmp0 = Array()
For Each sSubMatch In oMatch.SubMatches
PushItem aTmp0, sSubMatch
Next
PushItem aData, aTmp0
Else
For Each sSubMatch In oMatch.SubMatches
PushItem aData, sSubMatch
Next
End If
End If
Next
End With
End Sub
Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
If Not (IsArray(aData) And bAppend) Then aData = Array()
ReDim Preserve aData(UBound(aData) + 1)
aData(UBound(aData)) = vItem
End Sub
Function DecodeUriComponent(sEncoded As String) As String
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.parentWindow.execScript "function decode(s) {return decodeURIComponent(s)}", "jscript"
End If
DecodeUriComponent = objHtmlfile.parentWindow.decode(sEncoded)
End Function
Function GetInnerText(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
GetInnerText = oDiv.innerText
End Function
Function Denestify(aRows)
Dim aData()
Dim aItems()
Dim i As Long
Dim j As Long
If UBound(aRows) = -1 Then Exit Function
ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
For j = 0 To UBound(aRows)
If IsArray(aRows(j)) Then
aItems = aRows(j)
For i = 0 To UBound(aItems)
If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
aData(j + 1, i + 1) = aItems(i)
Next
Else
aData(j + 1, 1) = aRows(j)
End If
Next
Denestify = aData
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
With oDstRng
.Parent.Select
With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = sFormat
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = sFormat
.Value = aCells
End With
End With
End Sub
4689名律师共有4896条条目:
更新 2
似乎获得了完整的列表,您可以使用 set </code> (space) 作为 <code>Firma
参数进行搜索:http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20,有目前有 6511 个条目。 Sub Test()
解析结果的代码应更改为如下所示:
Option Explicit
Sub Test()
Dim sResponse As String
Dim aTmp
Dim aData
Dim lPage As Long
Dim i As Long
Dim j As Long
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page " & lPage
' Retrieve results page
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20&p=" & lPage, "", "", "", sResponse
' Extract table
ParseResponse _
"<table\b[^>]*?id=""ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aTmp, _
False
' Extract data from the table
ParseResponse _
"<tr.*?onclick=""location.href='(.*?)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
aTmp(0), _
aData, _
True
Debug.Print vbTab & "Parsed " & (UBound(aData) + 1)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
' Retrieve detailed info for each entry
For i = 0 To UBound(aData)
aTmp = aData(i)
' Retrieve details page
aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
' Extract details
Do
XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
If InStr(sResponse, "<title>Runtime Error</title>") = 0 Then Exit Do
DoEvents
Loop
ParseResponse _
DecodeUriComponent( _
"Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
"Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
"(:?F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?)?" & _
"M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
"(:?E-mail [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?)?" & _
"Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
sResponse, _
aTmp, _
True, _
False
aTmp(8) = StrReverse(aTmp(8))
aData(i) = aTmp
Debug.Print vbTab & "Details " & i
DoEvents
Next
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), _
Array("URL", _
"Navn", _
"Firma", _
DecodeUriComponent("Arbejdsomr%C3%A5der"), _
DecodeUriComponent("Beskikkelses%C3%A5r"), _
DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
DecodeUriComponent("M%C3%B8deret for landsret"), _
DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
"E-mail", _
"Mobiltlf." _
)
Output2DArray .Cells(2, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub
这是我的第一个网页抓取任务。我的任务是抓取一个网站
这是一个包含丹麦律师姓名的网站。我的困难是我只能根据我在搜索栏中输入的特定名称查询来检索名称。有没有我可以用来抓取网站包含的所有名称的在线网络工具?到目前为止,我已经使用了 Import.io 之类的工具,但没有成功。我对所有这些是如何工作的感到非常困惑。
请向下滚动到更新 2
该网站强制您输入至少一个搜索参数,因此您可以循环遍历 Arbejdsområde
列表的所有项目,对每个项目提出请求。这是示例,展示了如何在 Excel VBA 中完成(打开 VBE,创建标准模块,粘贴代码和 运行 Test()
):
Option Explicit
Sub Test()
Dim sResponse As String
Dim oItems As Object
Dim vItem
Dim aData
Dim sContent As String
Dim lPage As Long
Dim i As Long
Dim j As Long
' Retrieve search page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
' Extract work areas items
ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$AreaSelect", oItems
oItems.Remove oItems.Keys()(0)
sContent = ""
' Process each work area item
For Each vItem In oItems.Items()
Debug.Print "Item [" & vItem & "]"
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page [" & lPage & "]"
' Retrieve result page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&a=" & vItem & "&p=" & lPage, "", "", "", sResponse
' Extract result table
ParseResponse _
"<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aData, _
False
' Store parsed table
sContent = sContent & aData(0)
Debug.Print vbTab & "Parsed " & Len(sContent)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
Next
' Extract data from the whole content
ParseResponse _
"<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
sContent, _
aData, _
False
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = GetInnerText((aData(i, j)))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
Output2DArray .Cells(1, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
Dim aHeader
'With CreateObject("MSXML2.ServerXMLHTTP")
'.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
With CreateObject("MSXML2.XMLHTTP")
.Open sMethod, sUrl, False ' , "u051772", "fy17janr"
If IsArray(aSetHeaders) Then
For Each aHeader In aSetHeaders
.SetRequestHeader aHeader(0), aHeader(1)
Next
End If
.Send (sFormData)
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub
Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
Dim aTmp0
Dim vItem
' Escape RegEx special characters
For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
sName = Replace(sName, vItem, "\" & vItem)
Next
' Extract the whole <select> for parameter
ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
' Extract each parameter <option>
ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
' Put each parameter and value into dictionary
Set oOptions = CreateObject("Scripting.Dictionary")
For Each vItem In aTmp0
oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
Next
End Sub
Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
Dim oMatch
Dim aTmp0()
Dim sSubMatch
If Not (IsArray(aData) And bAppend) Then aData = Array()
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
If oMatch.SubMatches.Count = 1 Then
PushItem aData, oMatch.SubMatches(0)
Else
aTmp0 = Array()
For Each sSubMatch In oMatch.SubMatches
PushItem aTmp0, sSubMatch
Next
PushItem aData, aTmp0
End If
Next
End With
End Sub
Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
If Not (IsArray(aData) And bAppend) Then aData = Array()
ReDim Preserve aData(UBound(aData) + 1)
aData(UBound(aData)) = vItem
End Sub
Function GetInnerText(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
GetInnerText = oDiv.innerText
End Function
Function Denestify(aRows)
Dim aData()
Dim aItems()
Dim i As Long
Dim j As Long
If UBound(aRows) = -1 Then Exit Function
ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
For j = 0 To UBound(aRows)
If IsArray(aRows(j)) Then
aItems = aRows(j)
For i = 0 To UBound(aItems)
If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
aData(j + 1, i + 1) = aItems(i)
Next
Else
aData(j + 1, 1) = aRows(j)
End If
Next
Denestify = aData
End Function
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
第一次检索所有数据需要几分钟(之后再次启动时,所有请求都从缓存中加载,这使得处理速度显着加快,从您需要的服务器获取最新数据to clean up the cache in IE settings).我的输出如下:
通常不建议将 RegEx 用于 HTML 解析,因此 there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor.
顺便说一句,还有另一个使用类似方法的答案:
更新
以上建议的抓取是基于解析由Arbejdsområde
参数过滤的搜索结果,事实证明,实际返回的结果是不准确的。那些有multiply Arbejdsområder
的律师在结果中出现multiply times,而有空Arbejdsområder
的律师根本不在结果中。
可用于此类抓取的另一个参数而不是 Arbejdsområde
是 Retskreds
。所有律师记录都包含地址,而且只有一个地址,因此结果完整且不包含重复项。请注意,一位律师可以与多个办公室相关,因此结果中将有多个记录。
有允许在循环内抓取每个条目的详细信息的代码:
Option Explicit
Sub Test()
Dim sResponse As String
Dim oItems As Object
Dim vKey
Dim sItem As String
Dim aTmp
Dim aData
Dim lPage As Long
Dim i As Long
Dim j As Long
' Retrieve search page HTML content
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
' Extract Retskreds items
ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$CourtSelect", oItems
oItems.Remove oItems.Keys()(0)
i = 0
' Process each Retskreds item
For Each vKey In oItems
sItem = oItems(vKey)
Debug.Print "Area " & sItem & " " & vKey
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page " & lPage
' Retrieve results page
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&c=" & sItem & "&p=" & lPage, "", "", "", sResponse
' Extract table
ParseResponse _
"<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aTmp, _
False
' Extract data from the table
ParseResponse _
"<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
aTmp(0), _
aData, _
True
' Add Retskreds name
For i = i To UBound(aData)
aTmp = aData(i)
PushItem aTmp, vKey
aData(i) = aTmp
Next
Debug.Print vbTab & "Parsed " & UBound(aData)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
Next
' Retrieve detailed info for each entry
For i = 0 To UBound(aData)
aTmp = aData(i)
' Retrieve details page
aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
' Extract details
XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
ParseResponse _
DecodeUriComponent( _
"Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
"Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
"F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
"E-mail\: [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?" & _
"Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
sResponse, _
aTmp, _
True, _
False
aTmp(9) = StrReverse(aTmp(9))
aData(i) = aTmp
Debug.Print vbTab & "Details " & i
DoEvents
Next
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), _
Array("URL", _
"Navn", _
"Firma", _
DecodeUriComponent("Arbejdsomr%C3%A5der"), _
DecodeUriComponent("Retskreds"), _
DecodeUriComponent("Beskikkelses%C3%A5r"), _
DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
DecodeUriComponent("M%C3%B8deret for landsret"), _
DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
"E-mail", _
"Mobiltlf." _
)
Output2DArray .Cells(2, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)
Dim aHeader
'With CreateObject("MSXML2.ServerXMLHTTP")
'.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
With CreateObject("MSXML2.XMLHTTP")
.Open sMethod, sUrl, False
If IsArray(aSetHeaders) Then
For Each aHeader In aSetHeaders
.SetRequestHeader aHeader(0), aHeader(1)
Next
End If
.Send (sFormData)
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub
Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
Dim aTmp0
Dim vItem
' Escape RegEx special characters
For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
sName = Replace(sName, vItem, "\" & vItem)
Next
' Extract the whole <select> for parameter
ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
' Extract each parameter <option>
ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
' Put each parameter and value into dictionary
Set oOptions = CreateObject("Scripting.Dictionary")
For Each vItem In aTmp0
oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
Next
End Sub
Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
Dim oMatch
Dim aTmp0()
Dim sSubMatch
If Not (IsArray(aData) And bAppend) Then aData = Array()
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
If oMatch.SubMatches.Count = 1 Then
PushItem aData, oMatch.SubMatches(0)
Else
If bNestSubMatches Then
aTmp0 = Array()
For Each sSubMatch In oMatch.SubMatches
PushItem aTmp0, sSubMatch
Next
PushItem aData, aTmp0
Else
For Each sSubMatch In oMatch.SubMatches
PushItem aData, sSubMatch
Next
End If
End If
Next
End With
End Sub
Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
If Not (IsArray(aData) And bAppend) Then aData = Array()
ReDim Preserve aData(UBound(aData) + 1)
aData(UBound(aData)) = vItem
End Sub
Function DecodeUriComponent(sEncoded As String) As String
Static objHtmlfile As Object
If objHtmlfile Is Nothing Then
Set objHtmlfile = CreateObject("htmlfile")
objHtmlfile.parentWindow.execScript "function decode(s) {return decodeURIComponent(s)}", "jscript"
End If
DecodeUriComponent = objHtmlfile.parentWindow.decode(sEncoded)
End Function
Function GetInnerText(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
GetInnerText = oDiv.innerText
End Function
Function Denestify(aRows)
Dim aData()
Dim aItems()
Dim i As Long
Dim j As Long
If UBound(aRows) = -1 Then Exit Function
ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
For j = 0 To UBound(aRows)
If IsArray(aRows(j)) Then
aItems = aRows(j)
For i = 0 To UBound(aItems)
If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
aData(j + 1, i + 1) = aItems(i)
Next
Else
aData(j + 1, 1) = aRows(j)
End If
Next
Denestify = aData
End Function
Sub OutputArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
With oDstRng
.Parent.Select
With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
.NumberFormat = sFormat
.Value = aCells
End With
End With
End Sub
Sub Output2DArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = sFormat
.Value = aCells
End With
End With
End Sub
4689名律师共有4896条条目:
更新 2
似乎获得了完整的列表,您可以使用 set </code> (space) 作为 <code>Firma
参数进行搜索:http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20,有目前有 6511 个条目。 Sub Test()
解析结果的代码应更改为如下所示:
Option Explicit
Sub Test()
Dim sResponse As String
Dim aTmp
Dim aData
Dim lPage As Long
Dim i As Long
Dim j As Long
lPage = 0
' Process each results page
Do
Debug.Print vbTab & "Page " & lPage
' Retrieve results page
XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20&p=" & lPage, "", "", "", sResponse
' Extract table
ParseResponse _
"<table\b[^>]*?id=""ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
sResponse, _
aTmp, _
False
' Extract data from the table
ParseResponse _
"<tr.*?onclick=""location.href='(.*?)'"">\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
"</tr>", _
aTmp(0), _
aData, _
True
Debug.Print vbTab & "Parsed " & (UBound(aData) + 1)
lPage = lPage + 1
DoEvents
Loop Until InStr(sResponse, "<a class=""next""") = 0
' Retrieve detailed info for each entry
For i = 0 To UBound(aData)
aTmp = aData(i)
' Retrieve details page
aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
' Extract details
Do
XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
If InStr(sResponse, "<title>Runtime Error</title>") = 0 Then Exit Do
DoEvents
Loop
ParseResponse _
DecodeUriComponent( _
"Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
"Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
"(:?F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?)?" & _
"M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
"M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
"(:?E-mail [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?)?" & _
"Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
sResponse, _
aTmp, _
True, _
False
aTmp(8) = StrReverse(aTmp(8))
aData(i) = aTmp
Debug.Print vbTab & "Details " & i
DoEvents
Next
' Rebuild nested arrays to 2d array for output
aData = Denestify(aData)
' Decode HTML
For i = 1 To UBound(aData, 1)
For j = 2 To 4
aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
Next
Next
' Output
With ThisWorkbook.Sheets(1)
.Cells.Delete
OutputArray .Cells(1, 1), _
Array("URL", _
"Navn", _
"Firma", _
DecodeUriComponent("Arbejdsomr%C3%A5der"), _
DecodeUriComponent("Beskikkelses%C3%A5r"), _
DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
DecodeUriComponent("M%C3%B8deret for landsret"), _
DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
"E-mail", _
"Mobiltlf." _
)
Output2DArray .Cells(2, 1), aData
.Columns.AutoFit
.Rows.AutoFit
End With
MsgBox "Completed"
End Sub