使用 VBA 进行网页抓取
Web scraping using VBA
我想从此URL中提取数据。
我想从10张名片中提取标题、手机号码和地址。
这是我试过但没有成功的一些代码。
Public Sub GetValueFromBrowser()
On Error Resume Next
Dim Sn As Integer
Dim ie As Object
Dim url As String
Dim Doc As HTMLDocument
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
For Sn = 1 To 1
url = Sheets("Infos").Range("C" & Sn).Value
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = 0
.navigate url
While .Busy Or .readyState <> 4
DoEvents
Wend
End With
Set Doc = ie.document
Set elements = Doc.getElementsByClassName(" col-sm-5 col-xs-8 store-details sp-detail paddingR0")
Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "lng_cont_name" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = Doc.getElementsByClassName("Store-Name")(count).innerText
Cells(erow, 2) = Doc.getElementsByClassName("cont_fl_addr")(count).innerText
count = count + 1
End If
Next element
If Val(Left(Sn, 2)) = 99 Then
ActiveWorkbook.Save
End If
Next Sn
End Sub
电话号码并不容易,因为我认为它们是故意设置得难以抓取的。我找到了一种方法来破译 CSS 伪 ::before 元素内容中的值。地址和标题很简单 CSS 选择。
我已经在 python here.
中编写了一个更清晰的脚本
那么,代码的各个部分是如何工作的?
标题:
Set titles = .querySelectorAll(".jcn [title]")
我将标题定位为具有 title
属性和 parent jcn
class 属性的元素。 "."
表示 class 选择器,"[]"
表示属性选择器,中间的 " "
是后代组合器。
querySelectorAll
document
returns 页面上所有匹配元素的 nodeList
方法,即 10 个标题。
地址:
Set addresses = .querySelectorAll(".desk-add.jaddt")
地址以其 class 属性 desk-add jaddt
为目标。由于不允许使用复合 class 名称,因此必须使用额外的 "."
替换名称中的白色 space。
电话号码(解密storesTextToDecipher
内的内容):
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
这就是魔法发生的地方。这些数字无法直接通过 DOM 获得,因为它们是伪元素内容。
如果你检查相关的 HTML 你会发现一系列伪 ::before elements。 VBA 没有公开应用伪选择器来尝试在页面的 CSS 中获取此信息的机制。
您看到的实际上是一系列 span 元素,每个元素都有一个以 mobilesv
开头的 class 属性。这些元素位于 class col-sm-5 col-xs-8 store-details sp-detail paddingR0
的单个 parent 元素中(再次注意复合 class 名称)。
我最初收集了所有 parent 元素的 nodeList
。
返回元素示例:
每个 parent 元素都包含构成电话号码字符串字符的 class 名称(以 mobilesv
开头)元素。字符串中有些字符是数字,有些则代表+()-
。注意:class 名称中的 2|3 个字母字符串,在 icon-
之后,例如dc
, fe
.
例如页面的第一个搜索结果,对于电话号码中的初始号码9
:
此伪元素/电话字符的实际 CSS 内容可以在 CSS 样式中观察到:
注意 class 名称和伪元素选择器之前:.icon-ji:before
并且内容是d010
.
长话短说....您可以提取 icon-
之后的 2 或 3 个字母,即 ji
在这种情况下,以及 d0
之后的数字字符串,即 10
在这种情况下,并使用这两位信息来破译电话号码。此信息在响应中可用:
查看与左侧电话字符串的 class 名称相关联的相同 2/3 字母字符串,以及右侧的内容说明。稍微算一算,右边的数字比电话号码大1,即class,如网页图片所示。我只是创建了一个字典,然后通过解析 html.
的这一部分将 2/3 字母缩写映射到电话号码
当遍历 storesTextToDecipher
时,我使用这本词典从 class 姓名中匹配的 2/3 字母缩写中破译实际电话号码。
VBA:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
html.body.innerHTML = sResponse
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
With html
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
End With
For i = 0 To titles.Length - 1
Debug.Print "title: " & titles.item(i).innerText
Debug.Print "address: " & addresses.item(i).innerText
Debug.Print GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
Next
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
示例输出:
编辑:所有页面结果
由于您现在想要超过 10 个,因此以下使用预期的页面结果计数(NUMBER_RESULTS_ON_PAGE)
从页面收集信息。它滚动页面直到电话号码的预期数量(这应该是唯一的) 找到,或者 MAX_WAIT_SEC
被命中。这意味着您避免了无限循环,并且如果您期望不同的数字,则可以设置您的预期结果计数。它确实依赖于每个商店都有一个列出的电话号码 - 这似乎是相当合理的假设。
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, resultCountDict As Object, cipherDict As Object, t As Date
Const MAX_WAIT_SEC As Long = 300 'wait 5 minutes max before exiting loop to get all results
Const NUMBER_RESULTS_ON_PAGE As Long = 80
Const URL = "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3"
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
Application.ScreenUpdating = True
Set resultCountDict = CreateObject("Scripting.Dictionary")
Set cipherDict = GetCipherDict(URL)
With IE
.Visible = True
.Navigate2 URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
t = Timer
Do
DoEvents
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
Dim telNumber As String, i As Long
For i = 0 To titles.Length - 1
telNumber = GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
If Not resultCountDict.Exists(telNumber) Then
resultCountDict.Add telNumber, Array(titles.item(i).innerText, addresses.item(i).innerText, telNumber)
End If
Next
.parentWindow.execScript "window.scrollBy(0, window.innerHeight);", "javascript"
While IE.Busy Or IE.readyState < 4: DoEvents: Wend
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop Until resultCountDict.Count = NUMBER_RESULTS_ON_PAGE
End With
.Quit
End With
Dim key As Variant, rowCounter As Long
rowCounter = 1
With ThisWorkbook.Worksheets("Sheet1")
For Each key In resultCountDict.keys
.Cells(rowCounter, 1).Resize(1, 3) = resultCountDict(key)
rowCounter = rowCounter + 1
Next
End With
Application.ScreenUpdating = True
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
Public Function GetCipherDict(ByVal URL As String) As Object
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
Set GetCipherDict = cipherDict
End Function
编辑:
顶部有多个数字的版本(请注意,如果您发出太多请求或速度太快,服务器将为您提供随机页面):
Option Explicit
Public Sub GetDetails()
Dim re As Object, decodeDict As Object, i As Long
Dim html As MSHTML.htmlDocument, responseText As String, keys(), values()
Set decodeDict = CreateObject("Scripting.Dictionary")
Set re = CreateObject("vbscript.regexp")
Set html = New MSHTML.htmlDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/chengalpattu/Oasis-Pharma-Near-Saraswathi-Children-School-Revathypuram-Urapakkam/9999PXX44-XX44-181123145524-X8G7_BZDET", False
.setRequestHeader "User-Agent", "Mozilla/4.0"
.send
responseText = .responseText
html.body.innerHTML = responseText
End With
keys = GetMatches(re, responseText, "-(\w+):before")
If UBound(keys) = 0 Then Exit Sub
values = GetMatches(re, responseText, "9d0(\d+)", True)
For i = LBound(values) To UBound(values)
decodeDict(keys(i)) = values(i)
Next
Dim itemsToDecode()
decodeDict(keys(UBound(keys))) = "+"
itemsToDecode = GetValuesToDecode(html)
PrintNumbers re, html, itemsToDecode, decodeDict
End Sub
Public Function GetMatches(ByVal re As Object, ByVal inputString As String, ByVal sPattern As String, Optional ByVal numeric = False, Optional ByVal spanSearch = False) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With re
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .Test(inputString) Then
Set matches = .Execute(inputString)
ReDim arrMatches(0 To matches.Count - 1)
For Each iMatch In matches
If numeric Then
arrMatches(i) = iMatch.SubMatches.item(0) - 1
Else
If spanSearch Then
arrMatches(i) = iMatch
Else
arrMatches(i) = iMatch.SubMatches.item(0)
End If
End If
i = i + 1
Next iMatch
Else
ReDim arrMatches(0)
arrMatches(0) = vbNullString
End If
End With
GetMatches = arrMatches
End Function
Public Function GetValuesToDecode(ByVal html As MSHTML.htmlDocument) As Variant
Dim i As Long, elements As Object, results(), Class As String
Set elements = html.querySelectorAll(".telCntct span[class*='icon']")
ReDim results(elements.Length - 1)
For i = 0 To elements.Length - 1
Class = elements.item(i).className
results(i) = Right$(Class, Len(Class) - InStrRev(Class, "-"))
Next
GetValuesToDecode = results
End Function
Public Sub PrintNumbers(ByVal re As Object, ByVal html As htmlDocument, ByVal itemsToDecode As Variant, ByVal decodeDict As Object)
Dim output As String, i As Long
For i = LBound(itemsToDecode) To UBound(itemsToDecode)
output = output & decodeDict(itemsToDecode(i))
Next
Dim htmlToSearch As String, groups As Variant, startPos As Long, oldStartPos As Long
htmlToSearch = html.querySelector(".telCntct").outerHTML
groups = GetMatches(re, htmlToSearch, "mobilesv|,", False, True)
startPos = 1
Dim totalNumbers As Long
For i = LBound(groups) To UBound(groups)
If InStr(groups(i), ",") > 0 Then
totalNumbers = totalNumbers + 1
Debug.Print Mid$(output, startPos, IIf(startPos = 1, i, i - startPos))
startPos = i + 1
End If
Next
If totalNumbers = 1 Then Debug.Print Right$(output, Len(output) - startPos - 1)
End Sub
我想从此URL中提取数据。
我想从10张名片中提取标题、手机号码和地址。
这是我试过但没有成功的一些代码。
Public Sub GetValueFromBrowser()
On Error Resume Next
Dim Sn As Integer
Dim ie As Object
Dim url As String
Dim Doc As HTMLDocument
Dim element As IHTMLElement
Dim elements As IHTMLElementCollection
For Sn = 1 To 1
url = Sheets("Infos").Range("C" & Sn).Value
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = 0
.navigate url
While .Busy Or .readyState <> 4
DoEvents
Wend
End With
Set Doc = ie.document
Set elements = Doc.getElementsByClassName(" col-sm-5 col-xs-8 store-details sp-detail paddingR0")
Dim count As Long
Dim erow As Long
count = 0
For Each element In elements
If element.className = "lng_cont_name" Then
erow = Sheet1.Cells(Rows.count, 1).End(xlUp).Offset(1, 0).Row
Cells(erow, 1) = Doc.getElementsByClassName("Store-Name")(count).innerText
Cells(erow, 2) = Doc.getElementsByClassName("cont_fl_addr")(count).innerText
count = count + 1
End If
Next element
If Val(Left(Sn, 2)) = 99 Then
ActiveWorkbook.Save
End If
Next Sn
End Sub
电话号码并不容易,因为我认为它们是故意设置得难以抓取的。我找到了一种方法来破译 CSS 伪 ::before 元素内容中的值。地址和标题很简单 CSS 选择。
我已经在 python here.
中编写了一个更清晰的脚本那么,代码的各个部分是如何工作的?
标题:
Set titles = .querySelectorAll(".jcn [title]")
我将标题定位为具有 title
属性和 parent jcn
class 属性的元素。 "."
表示 class 选择器,"[]"
表示属性选择器,中间的 " "
是后代组合器。
querySelectorAll
document
returns 页面上所有匹配元素的 nodeList
方法,即 10 个标题。
地址:
Set addresses = .querySelectorAll(".desk-add.jaddt")
地址以其 class 属性 desk-add jaddt
为目标。由于不允许使用复合 class 名称,因此必须使用额外的 "."
替换名称中的白色 space。
电话号码(解密storesTextToDecipher
内的内容):
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
这就是魔法发生的地方。这些数字无法直接通过 DOM 获得,因为它们是伪元素内容。
如果你检查相关的 HTML 你会发现一系列伪 ::before elements。 VBA 没有公开应用伪选择器来尝试在页面的 CSS 中获取此信息的机制。
您看到的实际上是一系列 span 元素,每个元素都有一个以 mobilesv
开头的 class 属性。这些元素位于 class col-sm-5 col-xs-8 store-details sp-detail paddingR0
的单个 parent 元素中(再次注意复合 class 名称)。
我最初收集了所有 parent 元素的 nodeList
。
返回元素示例:
每个 parent 元素都包含构成电话号码字符串字符的 class 名称(以 mobilesv
开头)元素。字符串中有些字符是数字,有些则代表+()-
。注意:class 名称中的 2|3 个字母字符串,在 icon-
之后,例如dc
, fe
.
例如页面的第一个搜索结果,对于电话号码中的初始号码9
:
此伪元素/电话字符的实际 CSS 内容可以在 CSS 样式中观察到:
注意 class 名称和伪元素选择器之前:.icon-ji:before
并且内容是d010
.
长话短说....您可以提取 icon-
之后的 2 或 3 个字母,即 ji
在这种情况下,以及 d0
之后的数字字符串,即 10
在这种情况下,并使用这两位信息来破译电话号码。此信息在响应中可用:
查看与左侧电话字符串的 class 名称相关联的相同 2/3 字母字符串,以及右侧的内容说明。稍微算一算,右边的数字比电话号码大1,即class,如网页图片所示。我只是创建了一个字典,然后通过解析 html.
的这一部分将 2/3 字母缩写映射到电话号码当遍历 storesTextToDecipher
时,我使用这本词典从 class 姓名中匹配的 2/3 字母缩写中破译实际电话号码。
VBA:
Option Explicit
Public Sub GetInfo()
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3", False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
html.body.innerHTML = sResponse
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
With html
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
End With
For i = 0 To titles.Length - 1
Debug.Print "title: " & titles.item(i).innerText
Debug.Print "address: " & addresses.item(i).innerText
Debug.Print GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
Next
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
示例输出:
编辑:所有页面结果
由于您现在想要超过 10 个,因此以下使用预期的页面结果计数(NUMBER_RESULTS_ON_PAGE)
从页面收集信息。它滚动页面直到电话号码的预期数量(这应该是唯一的) 找到,或者 MAX_WAIT_SEC
被命中。这意味着您避免了无限循环,并且如果您期望不同的数字,则可以设置您的预期结果计数。它确实依赖于每个商店都有一个列出的电话号码 - 这似乎是相当合理的假设。
Option Explicit
Public Sub GetInfo()
Dim IE As New InternetExplorer, resultCountDict As Object, cipherDict As Object, t As Date
Const MAX_WAIT_SEC As Long = 300 'wait 5 minutes max before exiting loop to get all results
Const NUMBER_RESULTS_ON_PAGE As Long = 80
Const URL = "https://www.justdial.com/Ahmedabad/Beauty-Parlours/page-3"
Dim titles As Object, addresses As Object, storesTextToDecipher As Object
Application.ScreenUpdating = True
Set resultCountDict = CreateObject("Scripting.Dictionary")
Set cipherDict = GetCipherDict(URL)
With IE
.Visible = True
.Navigate2 URL
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
t = Timer
Do
DoEvents
Set titles = .querySelectorAll(".jcn [title]")
Set addresses = .querySelectorAll(".desk-add.jaddt")
Set storesTextToDecipher = .querySelectorAll(".col-sm-5.col-xs-8.store-details.sp-detail.paddingR0")
Dim telNumber As String, i As Long
For i = 0 To titles.Length - 1
telNumber = GetStoreNumber(storesTextToDecipher.item(i), cipherDict)
If Not resultCountDict.Exists(telNumber) Then
resultCountDict.Add telNumber, Array(titles.item(i).innerText, addresses.item(i).innerText, telNumber)
End If
Next
.parentWindow.execScript "window.scrollBy(0, window.innerHeight);", "javascript"
While IE.Busy Or IE.readyState < 4: DoEvents: Wend
If Timer - t > MAX_WAIT_SEC Then Exit Do
Loop Until resultCountDict.Count = NUMBER_RESULTS_ON_PAGE
End With
.Quit
End With
Dim key As Variant, rowCounter As Long
rowCounter = 1
With ThisWorkbook.Worksheets("Sheet1")
For Each key In resultCountDict.keys
.Cells(rowCounter, 1).Resize(1, 3) = resultCountDict(key)
rowCounter = rowCounter + 1
Next
End With
Application.ScreenUpdating = True
End Sub
Public Function GetStoreNumber(ByVal storeInfo As Object, cipherDict As Object) As String
Dim html2 As HTMLDocument, elems As Object, telNumber As String, j As Long
Set html2 = New HTMLDocument
html2.body.innerHTML = storeInfo.innerHTML
Set elems = html2.querySelectorAll("b span")
For j = 0 To elems.Length - 1
On Error Resume Next
If cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString)) < 10 Then
telNumber = telNumber & cipherDict(Replace$(elems.item(j).className, "mobilesv icon-", vbNullString))
End If
On Error GoTo 0
Next
GetStoreNumber = telNumber
End Function
Public Function GetCipherDict(ByVal URL As String) As Object
Dim sResponse As String, html As HTMLDocument
Set html = New HTMLDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", URL, False
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
Dim cipherKey As String, cipherDict As Object
Set cipherDict = CreateObject("Scripting.Dictionary")
cipherKey = Split(Split(sResponse, "smoothing:grayscale}.icon-")(1), ".mobilesv")(0)
cipherKey = Replace$(cipherKey, ":before{content:""d", Chr$(32))
Dim arr() As String, tempArr() As String, i As Long, j As Long
arr = Split(cipherKey, """}.icon-")
For i = LBound(arr) To UBound(arr)
tempArr = Split(arr(i), Chr$(32))
cipherDict(tempArr(0)) = Replace$(tempArr(1), """}", vbNullString) - 1 'needs adjustment
Next
Set GetCipherDict = cipherDict
End Function
编辑:
顶部有多个数字的版本(请注意,如果您发出太多请求或速度太快,服务器将为您提供随机页面):
Option Explicit
Public Sub GetDetails()
Dim re As Object, decodeDict As Object, i As Long
Dim html As MSHTML.htmlDocument, responseText As String, keys(), values()
Set decodeDict = CreateObject("Scripting.Dictionary")
Set re = CreateObject("vbscript.regexp")
Set html = New MSHTML.htmlDocument
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.justdial.com/chengalpattu/Oasis-Pharma-Near-Saraswathi-Children-School-Revathypuram-Urapakkam/9999PXX44-XX44-181123145524-X8G7_BZDET", False
.setRequestHeader "User-Agent", "Mozilla/4.0"
.send
responseText = .responseText
html.body.innerHTML = responseText
End With
keys = GetMatches(re, responseText, "-(\w+):before")
If UBound(keys) = 0 Then Exit Sub
values = GetMatches(re, responseText, "9d0(\d+)", True)
For i = LBound(values) To UBound(values)
decodeDict(keys(i)) = values(i)
Next
Dim itemsToDecode()
decodeDict(keys(UBound(keys))) = "+"
itemsToDecode = GetValuesToDecode(html)
PrintNumbers re, html, itemsToDecode, decodeDict
End Sub
Public Function GetMatches(ByVal re As Object, ByVal inputString As String, ByVal sPattern As String, Optional ByVal numeric = False, Optional ByVal spanSearch = False) As Variant
Dim matches As Object, iMatch As Object, s As String, arrMatches(), i As Long
With re
.Global = True
.MultiLine = True
.IgnoreCase = True
.Pattern = sPattern
If .Test(inputString) Then
Set matches = .Execute(inputString)
ReDim arrMatches(0 To matches.Count - 1)
For Each iMatch In matches
If numeric Then
arrMatches(i) = iMatch.SubMatches.item(0) - 1
Else
If spanSearch Then
arrMatches(i) = iMatch
Else
arrMatches(i) = iMatch.SubMatches.item(0)
End If
End If
i = i + 1
Next iMatch
Else
ReDim arrMatches(0)
arrMatches(0) = vbNullString
End If
End With
GetMatches = arrMatches
End Function
Public Function GetValuesToDecode(ByVal html As MSHTML.htmlDocument) As Variant
Dim i As Long, elements As Object, results(), Class As String
Set elements = html.querySelectorAll(".telCntct span[class*='icon']")
ReDim results(elements.Length - 1)
For i = 0 To elements.Length - 1
Class = elements.item(i).className
results(i) = Right$(Class, Len(Class) - InStrRev(Class, "-"))
Next
GetValuesToDecode = results
End Function
Public Sub PrintNumbers(ByVal re As Object, ByVal html As htmlDocument, ByVal itemsToDecode As Variant, ByVal decodeDict As Object)
Dim output As String, i As Long
For i = LBound(itemsToDecode) To UBound(itemsToDecode)
output = output & decodeDict(itemsToDecode(i))
Next
Dim htmlToSearch As String, groups As Variant, startPos As Long, oldStartPos As Long
htmlToSearch = html.querySelector(".telCntct").outerHTML
groups = GetMatches(re, htmlToSearch, "mobilesv|,", False, True)
startPos = 1
Dim totalNumbers As Long
For i = LBound(groups) To UBound(groups)
If InStr(groups(i), ",") > 0 Then
totalNumbers = totalNumbers + 1
Debug.Print Mid$(output, startPos, IIf(startPos = 1, i, i - startPos))
startPos = i + 1
End If
Next
If totalNumbers = 1 Then Debug.Print Right$(output, Len(output) - startPos - 1)
End Sub