通过 Class 和标签名称抓取元素
Web Scraping Elements By Class & Tag name
我正在尝试从下面提到的网站复制数据,我需要所有单元的尺寸、价格、便利设施、特价和预订信息。我编写了下面的代码,但无法正确复制这些元素。首先,只处理了三个元素,而且结果是重复的;其次,我没有得到便利设施和预订的结果。有人可以帮忙看一下吗?
' Scrapes every unit row of the facility page (via Internet Explorer automation)
' and writes size, amenities, special offer, price and reserve text to the
' "Unit Data" worksheet, one worksheet row per unit row on the page.
'
' Fixes compared with the earlier version:
'   * each field is read from the CURRENT row, not from the enclosing table
'     (reading from the table always returned the first row's values);
'   * all ".unitinfo" rows are visited, not only those that also carry "even";
'   * the result array is sized from the number of rows actually visited;
'   * amenities are taken from the title attribute of the icon <div>s, because
'     those elements have no inner text.
Sub text()
    Dim ie As New InternetExplorer, ws As Worksheet
    Dim headers(), results()
    Dim unitRows As Object, unitRow As Object
    Dim rowCount As Long, r As Long

    Set ws = ThisWorkbook.Worksheets("Unit Data")
    headers = Array("size", "features", "Specials", "Price", "Reserve")

    With ie
        .Visible = True
        .Navigate2 "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955"
        While .Busy Or .readyState < 4: DoEvents: Wend
        Sheets("Unit Data").Select

        ' One node per unit row, across all size tabs.
        Set unitRows = .document.querySelectorAll(".units_table .unitinfo")
        rowCount = unitRows.Length

        ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

        If rowCount > 0 Then
            ReDim results(1 To rowCount, 1 To UBound(headers) + 1)
            For r = 1 To rowCount
                Set unitRow = unitRows.item(r - 1)
                results(r, 1) = UnitRowText(unitRow, "size secondary-color-text")
                results(r, 2) = UnitRowAmenities(unitRow)
                results(r, 3) = UnitRowText(unitRow, "offer1")
                results(r, 4) = UnitRowText(unitRow, "rate_text primary-color-text rate_text--clear")
                results(r, 5) = UnitRowText(unitRow, "reserve")
            Next r
            ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
        End If

        .Quit
    End With

    Worksheets("Unit Data").Range("A:G").Columns.AutoFit
End Sub

' Returns the trimmed innerText of the first descendant of unitRow that has the
' given class name(s), or an empty string when no such descendant exists.
Private Function UnitRowText(ByVal unitRow As Object, ByVal classNames As String) As String
    Dim hits As Object
    Set hits = unitRow.getElementsByClassName(classNames)
    If hits.Length > 0 Then UnitRowText = Trim$(hits(0).innerText)
End Function

' Returns the amenity names of a unit row as a comma-separated list, read from
' the title attribute of each <div> inside the row's "amenities" cell.
Private Function UnitRowAmenities(ByVal unitRow As Object) As String
    Dim cells As Object, icons As Object, k As Long, joined As String
    Set cells = unitRow.getElementsByClassName("amenities")
    If cells.Length = 0 Then Exit Function

    Set icons = cells(0).getElementsByTagName("div")
    For k = 0 To icons.Length - 1
        If Len(icons(k).Title) > 0 Then
            If Len(joined) > 0 Then joined = joined & ", "
            joined = joined & icons(k).Title
        End If
    Next k
    UnitRowAmenities = joined
End Function
你可以试试使用 jQuery 的 get 方法,如下所示:
// Fetch the page at 'url' and visit every <li> nested inside a <ul> of the
// returned markup. The callback keyword and its parameter must be the literal
// `function (data)`; the body refers to `data`.
$.get( 'url', function(data) {
    // Loop through elements
    $(data).find("ul").find("li").each( function(){
        var text = $(this).text(); // text content of the current <li>
    } )
});
这是一种方法:
' Downloads the facility page with WinHTTP, parses it into an HTMLDocument and
' copies the "units_small_units" table to Sheet1 cell by cell.
' Amenity cells have no inner text, so their value is built from the title
' attribute of each child <div>. Cells with class "features" are skipped
' (their column is left blank).
' Requires references: Microsoft HTML Object Library, WinHTTP Services 5.1.
Sub test()
    Dim req As New WinHttpRequest
    Dim doc As New HTMLDocument
    Dim targetTable As HTMLTable
    Dim tableRow As HTMLTableRow
    Dim tableCell As HTMLTableCell
    Dim element As HTMLDivElement
    Dim sht As Worksheet
    Dim amenitiesString As String
    Dim i As Long
    Dim j As Long

    Set sht = ThisWorkbook.Worksheets("Sheet1")

    With req
        .Open "GET", "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955", False
        .send
        ' Do not try to parse an error page.
        If .Status <> 200 Then Exit Sub
        doc.body.innerHTML = .responseText
    End With

    Set targetTable = doc.getElementById("units_small_units") 'You can use units_medium_units or units_large_units to get the info from the other tabs
    ' getElementById yields Nothing when the id is absent; iterating .Rows would then fail.
    If targetTable Is Nothing Then Exit Sub

    i = 0
    For Each tableRow In targetTable.Rows
        i = i + 1
        j = 0
        For Each tableCell In tableRow.Cells
            j = j + 1
            If tableCell.className = "amenities" And tableCell.innerText <> "Amenities" Then
                amenitiesString = ""
                For Each element In tableCell.getElementsByTagName("div")
                    amenitiesString = amenitiesString & element.Title & ","
                Next element
                ' Drop the separator left after the last amenity.
                If Len(amenitiesString) > 0 Then
                    amenitiesString = Left$(amenitiesString, Len(amenitiesString) - 1)
                End If
                sht.Cells(i, j).Value = amenitiesString
            ElseIf tableCell.className <> "features" Then
                sht.Cells(i, j).Value = tableCell.innerText
            End If
        Next tableCell
    Next tableRow
End Sub
我正在使用 HTTP 请求而不是 Internet Explorer 来获取 HTML。除此之外,我认为您可以了解如何访问您想要的元素。
这是结果的屏幕截图。
呈现方式有点简陋,但您应该明白意思了 :-P
基本上是这样的:
listing.getElementsByClassName("amenities")(0).innerText
会return一个空白,因为这些元素中没有内部文本。该信息由脚本生成,但也可以在 div
元素的 title
中找到。
使用的参考文献:
Microsoft HTML Object Library
和 WinHTTP Services Version 5.1
tl;dr;
提前为这个回答的长度(向某些人)道歉,但我想借此作为一个教学机会,
详细说明其中到底发生了什么。
我使用的总体方法与您的代码相同:找到一个 css 选择器来隔离行(尽管在不同的选项卡中,小、中、大实际上仍然全部显示在页面上):
Set listings = html.querySelectorAll(".unitinfo")
上面生成了行。和以前一样,我们将其转储到一个新的 HTMLDocument
中,以便我们可以利用 querySelector/querySelectorAll
方法。
行数:
让我们看一下我们正在检索的第一行html。后续章节将以此行作为案例研究来讨论如何检索信息:
5x5</TD> <TD class=features>
<DIV id=a5x5-1 class="icon a5x5">
<DIV class=img><IMG src="about:/core/resources/images/units/5x5_icon.png"></DIV>
<DIV class=display>
<P>More Information</P></DIV></DIV>
<SCRIPT type=text/javascript>
// Refine Search
//
$(function() {
$("#a5x5-1").tooltip({
track: false,
delay: 0,
showURL: false,
left: 5,
top: 5,
bodyHandler: function () {
return " <div class=\"tooltip\"> <div class=\"tooltop\"></div> <div class=\"toolmid clearfix\"> <div class=\"toolcontent\"> <div style=\"text-align:center;width:100%\"> <img alt=\"5 x 5 storage unit\" src=\"/core/resources/images/units/5x5.png\" /> </div> <div class=\"display\">5 x 5</div> <div class=\"description\">Think of it like a standard closet. Approximately 25 square feet, this space is perfect for about a dozen boxes, a desk and chair, and a bicycle.</div> </div> <div class=\"clearfix\"></div> </div> <div class=\"toolfoot\"></div> <div class=\"clearfix\"></div> </div> "}
});
});
</SCRIPT>
</TD><TD class=rates>
<DIV class="discount_price secondary-color-text standard_price--left">
<DIV class=price_text>Web Rate: </DIV>
<DIV class="rate_text primary-color-text rate_text--clear">$39.00 </DIV></DIV>
<SCRIPT>
$( document ).ready(function() {
$('.units_table tr.unitinfo').each(function(index, el) {
if ($(this).find('.standard_price').length != '' && $(this).find('.discount_price').length != '') {
$(this).parents('.units_table').addClass('both');
$(this).addClass('also-both');
$(this).find('.rates').addClass('rates_two_column');
}
});
});
</SCRIPT>
</TD><TD class=amenities>
<DIV title="Temperature Controlled" class="amenity_icon icon_climate"></DIV>
<DIV title="Interior Storage" class="amenity_icon icon_interior"></DIV>
<DIV title="Ground Floor" class="amenity_icon icon_ground_floor"></DIV></TD><TD class=offers>
<DIV class=offer1>Call for Specials </DIV>
<DIV class=offer2></DIV></TD><TD class=reserve><A id=5x5:39:00000000 class="facility_call_to_reserve cta_call primary-color primary-hover" href="about:blank#" rel=nofollow>Call </A></TD>
我们将要处理的每一行在 html2
变量中都有类似的 html。如果您有疑问,请查看上面显示的函数中的 javascript:
$('.units_table tr.unitinfo').each(function(index, el)
它使用相同的选择器(但也指定了父元素 table class 和元素类型 (tr
))。基本上,table.
中的每一行都会调用该函数
尺寸:
出于某种原因,开头的 td
标签被删除了(我认为我已经看到它缺少父 <table>
标签)所以为了大小,而不是通过 class 抓取,我正在寻找结束标记的开始并将字符串提取到那里。我通过将 Instr 给出的 return 值(其中 < 在字符串中找到)-1 传递给 Left$
(类型)函数来做到这一点。
results(r, 1) = Left$(html2.body.innerHTML, InStr(html2.body.innerHTML, "<") - 1)
这个returns 5x5
.
描述:
描述列由我们在上面看到的函数填充(记住它应用于每一行)
这个位 - $("#a5x5-1").tooltip
- 告诉它目标位置,然后函数的 return 语句提供 html 有一个 div
,用 class description
,包含我们想要的文字。由于我们没有使用浏览器,而我使用的是 64 位 Windows,我无法执行此脚本,但我可以使用 Split
提取位于 "description\">
之后、
其关联 div 的结束标签(</div>)
开头之前的字符串(即描述):
results(r, 2) = Split(Split(html2.querySelector("SCRIPT").innerHTML, """description\"">")(1), "</div>")(0)
这个returns:
"Think of it like a standard closet. Approximately 25 square feet, this space is perfect for about a dozen boxes, a desk and chair, and a bicycle."
费率类型和价格:
这些很简单,使用 class 名称作为目标:
results(r, 3) = Replace$(html2.querySelector(".price_text").innerText, ":", vbNullString)
results(r, 4) = Trim$(html2.querySelector(".rate_text").innerText)
返回(分别)
Web Rate,
$39.00
便利设施:
这是事情有点棘手的地方。
让我们重新检查上面显示的 html,对于这一行,它与便利设施有关:
<TD class=amenities>
<DIV title="Temperature Controlled" class="amenity_icon icon_climate"></DIV>
<DIV title="Interior Storage" class="amenity_icon icon_interior"></DIV>
<DIV title="Ground Floor" class="amenity_icon icon_ground_floor"></DIV></TD>
我们可以看到父 td
有一个 amenities
的 class,其子 div
元素具有复合 class 名称;后者在每种情况下都用作便利设施类型的标识符,例如icon_climate
.
当您将鼠标悬停在这些上时,页面上会显示工具提示信息:
我们可以在实际页面的 html 中跟踪此工具提示的位置:
当您将鼠标悬停在不同的便利设施上时,此内容会更新。
长话短说 (他在页面的一半处说!),此内容正在从服务器上的 php 文件更新。我们可以请求该文件并构建一个映射每个设施的 class 名称的字典,例如amenity_icon icon_climate
(当转换为 .amenity_icon.icon_climate
的适当 css 选择器时,复合 classes 需要将“ ”替换为“.”)到相关描述。您可以探索 php 文件 here.
php 文件:
让我们只看文件的开头,以剖析什么是重复模式的基本单位:
function LoadTooltips() {
$(".units_table .amenity_icon.icon_climate").tooltip({
track: false,
delay: 0,
showURL: false,
left: -126,
top: -100,
bodyHandler: function () {
return "<div class=\"sidebar_tooltip\"><h4>Temperature Controlled</h4><p>Units are heated and/or cooled. See manager for details.</p></div>"
}
});
负责更新工具提示的函数是LoadTooltips
。 CSS class 选择器用于定位每个图标:
$(".units_table .amenity_icon.icon_climate").tooltip
并且我们有 bodyhandler 指定 return 文本:
bodyHandler: function () {
return "<div class=\"sidebar_tooltip\"><h4>Temperature Controlled</h4><p>Units are heated and/or cooled. See manager for details.</p></div>"
我们有三位有用的信息出现在重复的组中。元素的 class 名称选择器、简短描述和详细描述,例如
.amenity_icon.icon_climate
:我们使用它来将 php 文件描述映射到我们行中便利设施图标的 class 名称。 CSS 选择器
Temperature Controlled
;在工具提示功能 return 文本的 h4
标签内。 简短描述
Units are heated and/or cooled. See manager for details.
;在工具提示功能 return 文本的 p
标签内。 详细说明
我写了 2 个函数,GetMatches
和 GetAmenitiesDescriptions
,它们使用正则表达式为每个图标提取所有重复项,return 一个字典,其中有 css 选择器作为键,短 description : long description
作为值。
当我收集每一行中的所有图标时:
Set icons = html2.querySelectorAll(".amenity_icon")
我根据图标的 class 名称使用词典 return 工具提示描述
For icon = 0 To icons.Length - 1 'use class name of amenity to look up description
amenitiesInfo(icon) = amenitiesDescriptions("." & Replace$(icons.item(icon).className, Chr$(32), "."))
Next
然后我用 vbNewLine
加入描述以确保输出在输出单元格中的不同行上。
您可以探索正则表达式 here。
正则表达式使用 |
(或)语法,所以我 return 将所有匹配的模式放在一个列表中。
arr = GetMatches(re, s, "(\.amenity_icon\..*)""|<h4>(.*)<\/h4>|<p>(.*)<\/p>")
因为我想要不同的子匹配(0,1 或 2 a.k.a css class 选择器,短描述,长描述)我使用 Select Case i mod 3
,与计数器变量 i
,以提取适当的子匹配项。
php 文件中映射的匹配示例:
特价:
我们回到 class 选择器。 Offer2
未填充,因此您可以删除。
results(r, 6) = html2.querySelector(".offer1").innerText
results(r, 7) = html2.querySelector(".offer2").innerText
returns(分别):
"Call for Specials",空字符串
结束语:
因此,上面的内容带您浏览了一行。它只是冲洗并在所有行的循环中重复。为了提高效率,将数据添加到数组中,results
;然后一次性写入 Sheet1
。我可以看到一些小的改进,但这很快。
VBA:
Option Explicit
' Downloads the facility page, extracts one record per ".unitinfo" row
' (size, tooltip description, rate type, price, amenity descriptions, offers)
' into an array, and writes headers plus the array to Sheet1 in one go.
'
' Each row's inner HTML is loaded into a scratch HTMLDocument (html2) so that
' querySelector / querySelectorAll can be used on it.
'
' Robustness changes compared with the earlier version:
'   * no rows on the page -> only the headers are written (ReDim 1 To 0 would raise);
'   * a row without amenity icons no longer raises on ReDim (0 To -1);
'   * missing nodes / missing "<" / missing description marker leave the cell
'     blank instead of raising;
'   * an amenity class missing from the lookup falls back to the icon's title.
' Requires reference: Microsoft HTML Object Library.
Public Sub GetInfo()
    Const URL As String = "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955"
    Dim ws As Worksheet, html As HTMLDocument, html2 As HTMLDocument
    Dim amenitiesDescriptions As Object, listings As Object, icons As Object, scriptNode As Object
    Dim headers(), results(), amenitiesInfo(), parts() As String
    Dim rowCount As Long, r As Long, item As Long, icon As Long, ltPos As Long
    Dim rowHtml As String, key As String

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set html = New HTMLDocument
    Set html2 = New HTMLDocument
    Set amenitiesDescriptions = GetAmenitiesDescriptions

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With

    headers = Array("Size", "Description", "RateType", "Price", "Amenities", "Offer1", "Offer2")
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    Set listings = html.querySelectorAll(".unitinfo")
    rowCount = listings.Length
    If rowCount = 0 Then Exit Sub

    ReDim results(1 To rowCount, 1 To UBound(headers) + 1)

    For item = 0 To rowCount - 1
        r = item + 1
        html2.body.innerHTML = listings.item(item).innerHTML
        rowHtml = html2.body.innerHTML

        ' Size: the text before the first tag (the opening <td> is lost when the
        ' row is re-parsed outside a table).
        ltPos = InStr(rowHtml, "<")
        If ltPos > 0 Then results(r, 1) = Left$(rowHtml, ltPos - 1)

        ' Description: lives inside the tooltip script as an escaped HTML string.
        Set scriptNode = html2.querySelector("SCRIPT")
        If Not scriptNode Is Nothing Then
            parts = Split(scriptNode.innerHTML, """description\"">")
            If UBound(parts) >= 1 Then
                If Len(parts(1)) > 0 Then results(r, 2) = Split(parts(1), "</div>")(0)
            End If
        End If

        results(r, 3) = Replace$(QueryText(html2, ".price_text"), ":", vbNullString)
        results(r, 4) = Trim$(QueryText(html2, ".rate_text"))

        ' Amenities: map each icon's compound class to its tooltip description.
        Set icons = html2.querySelectorAll(".amenity_icon")
        If icons.Length > 0 Then
            ReDim amenitiesInfo(0 To icons.Length - 1)
            For icon = 0 To icons.Length - 1 'use class name of amenity to look up description
                key = "." & Replace$(icons.item(icon).className, Chr$(32), ".")
                If amenitiesDescriptions.Exists(key) Then
                    amenitiesInfo(icon) = amenitiesDescriptions(key)
                Else
                    amenitiesInfo(icon) = icons.item(icon).Title
                End If
            Next icon
            ' One amenity per line inside the output cell.
            results(r, 5) = Join$(amenitiesInfo, vbNewLine)
        Else
            results(r, 5) = vbNullString
        End If

        results(r, 6) = QueryText(html2, ".offer1")
        results(r, 7) = QueryText(html2, ".offer2")
    Next item

    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

' Returns the innerText of the first node matching cssSelector in doc, or an
' empty string when nothing matches.
Private Function QueryText(ByVal doc As HTMLDocument, ByVal cssSelector As String) As String
    Dim node As Object
    Set node = doc.querySelector(cssSelector)
    If Not node Is Nothing Then QueryText = node.innerText
End Function
' Retrieves the amenity tooltip definitions from the site's common.tooltip.php
' script and returns a Scripting.Dictionary keyed by the CSS class selector
' (e.g. ".amenity_icon.icon_climate") whose value is
' "<short description>: <long description>".
'
' GetMatches returns a flat array of (selector, short, long) triples. Only
' complete triples are consumed, so a match count that is not a multiple of 3
' (including the 1-element "no match" array) yields fewer or zero entries
' instead of a subscript-out-of-range error.
Public Function GetAmenitiesDescriptions() As Object 'retrieve amenities descriptions from php file on server
    Dim s As String, dict As Object, re As Object, i As Long, arr() 'keys based on classname, short desc, full desc
    ' view regex here: https://regex101.com/r/bII5AL/1
    Set dict = CreateObject("Scripting.Dictionary")
    Set re = CreateObject("vbscript.regexp")

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.safeandsecureselfstorage.com/core/resources/js/src/common.tooltip.php", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        s = .responseText
    End With

    arr = GetMatches(re, s, "(\.amenity_icon\..*)""|<h4>(.*)<\/h4>|<p>(.*)<\/p>")

    ' Stop at the last index that still has two successors.
    For i = LBound(arr) To UBound(arr) - 2 Step 3 'build up lookup dictionary for amenities descriptions
        dict(arr(i)) = arr(i + 1) & ": " & arr(i + 2)
    Next i

    Set GetAmenitiesDescriptions = dict
End Function
' Runs sPattern over inputString (global, multiline, case-sensitive) and returns
' a zero-based Variant array with one entry per match.
' The n-th match contributes its submatch number (n Mod 3), i.e. matches are
' expected to cycle through capture groups 0, 1, 2 in order.
' When nothing matches, a single-element array holding vbNullString is returned.
Public Function GetMatches(ByVal re As Object, inputString As String, ByVal sPattern As String) As Variant
    Dim found As Object, hit As Object, collected(), idx As Long

    re.Global = True
    re.MultiLine = True
    re.IgnoreCase = False
    re.Pattern = sPattern

    ' Guard clause: no match at all.
    If Not re.test(inputString) Then
        ReDim collected(0)
        collected(0) = vbNullString
        GetMatches = collected
        Exit Function
    End If

    Set found = re.Execute(inputString)
    ReDim collected(0 To found.Count - 1)

    For Each hit In found
        ' Position in the 3-way cycle selects which capture group to keep.
        collected(idx) = hit.SubMatches.item(idx Mod 3)
        idx = idx + 1
    Next hit

    GetMatches = collected
End Function
输出:
参考资料(VBE > 工具 > 参考资料):
- 微软HTML对象库
我正在尝试从下面提到的网站复制数据,我需要所有单元的尺寸、价格、便利设施、特价和预订信息。我编写了下面的代码,但无法正确复制这些元素。首先,只处理了三个元素,而且结果是重复的;其次,我没有得到便利设施和预订的结果。有人可以帮忙看一下吗?
' Scrapes every unit row of the facility page (via Internet Explorer automation)
' and writes size, amenities, special offer, price and reserve text to the
' "Unit Data" worksheet, one worksheet row per unit row on the page.
'
' Fixes compared with the earlier version:
'   * each field is read from the CURRENT row, not from the enclosing table
'     (reading from the table always returned the first row's values);
'   * all ".unitinfo" rows are visited, not only those that also carry "even";
'   * the result array is sized from the number of rows actually visited;
'   * amenities are taken from the title attribute of the icon <div>s, because
'     those elements have no inner text.
Sub text()
    Dim ie As New InternetExplorer, ws As Worksheet
    Dim headers(), results()
    Dim unitRows As Object, unitRow As Object
    Dim rowCount As Long, r As Long

    Set ws = ThisWorkbook.Worksheets("Unit Data")
    headers = Array("size", "features", "Specials", "Price", "Reserve")

    With ie
        .Visible = True
        .Navigate2 "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955"
        While .Busy Or .readyState < 4: DoEvents: Wend
        Sheets("Unit Data").Select

        ' One node per unit row, across all size tabs.
        Set unitRows = .document.querySelectorAll(".units_table .unitinfo")
        rowCount = unitRows.Length

        ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

        If rowCount > 0 Then
            ReDim results(1 To rowCount, 1 To UBound(headers) + 1)
            For r = 1 To rowCount
                Set unitRow = unitRows.item(r - 1)
                results(r, 1) = UnitRowText(unitRow, "size secondary-color-text")
                results(r, 2) = UnitRowAmenities(unitRow)
                results(r, 3) = UnitRowText(unitRow, "offer1")
                results(r, 4) = UnitRowText(unitRow, "rate_text primary-color-text rate_text--clear")
                results(r, 5) = UnitRowText(unitRow, "reserve")
            Next r
            ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
        End If

        .Quit
    End With

    Worksheets("Unit Data").Range("A:G").Columns.AutoFit
End Sub

' Returns the trimmed innerText of the first descendant of unitRow that has the
' given class name(s), or an empty string when no such descendant exists.
Private Function UnitRowText(ByVal unitRow As Object, ByVal classNames As String) As String
    Dim hits As Object
    Set hits = unitRow.getElementsByClassName(classNames)
    If hits.Length > 0 Then UnitRowText = Trim$(hits(0).innerText)
End Function

' Returns the amenity names of a unit row as a comma-separated list, read from
' the title attribute of each <div> inside the row's "amenities" cell.
Private Function UnitRowAmenities(ByVal unitRow As Object) As String
    Dim cells As Object, icons As Object, k As Long, joined As String
    Set cells = unitRow.getElementsByClassName("amenities")
    If cells.Length = 0 Then Exit Function

    Set icons = cells(0).getElementsByTagName("div")
    For k = 0 To icons.Length - 1
        If Len(icons(k).Title) > 0 Then
            If Len(joined) > 0 Then joined = joined & ", "
            joined = joined & icons(k).Title
        End If
    Next k
    UnitRowAmenities = joined
End Function
你可以试试使用 jQuery 的 get 方法,如下所示:
// Fetch the page at 'url' and visit every <li> nested inside a <ul> of the
// returned markup. The callback keyword and its parameter must be the literal
// `function (data)`; the body refers to `data`.
$.get( 'url', function(data) {
    // Loop through elements
    $(data).find("ul").find("li").each( function(){
        var text = $(this).text(); // text content of the current <li>
    } )
});
这是一种方法:
' Downloads the facility page with WinHTTP, parses it into an HTMLDocument and
' copies the "units_small_units" table to Sheet1 cell by cell.
' Amenity cells have no inner text, so their value is built from the title
' attribute of each child <div>. Cells with class "features" are skipped
' (their column is left blank).
' Requires references: Microsoft HTML Object Library, WinHTTP Services 5.1.
Sub test()
    Dim req As New WinHttpRequest
    Dim doc As New HTMLDocument
    Dim targetTable As HTMLTable
    Dim tableRow As HTMLTableRow
    Dim tableCell As HTMLTableCell
    Dim element As HTMLDivElement
    Dim sht As Worksheet
    Dim amenitiesString As String
    Dim i As Long
    Dim j As Long

    Set sht = ThisWorkbook.Worksheets("Sheet1")

    With req
        .Open "GET", "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955", False
        .send
        ' Do not try to parse an error page.
        If .Status <> 200 Then Exit Sub
        doc.body.innerHTML = .responseText
    End With

    Set targetTable = doc.getElementById("units_small_units") 'You can use units_medium_units or units_large_units to get the info from the other tabs
    ' getElementById yields Nothing when the id is absent; iterating .Rows would then fail.
    If targetTable Is Nothing Then Exit Sub

    i = 0
    For Each tableRow In targetTable.Rows
        i = i + 1
        j = 0
        For Each tableCell In tableRow.Cells
            j = j + 1
            If tableCell.className = "amenities" And tableCell.innerText <> "Amenities" Then
                amenitiesString = ""
                For Each element In tableCell.getElementsByTagName("div")
                    amenitiesString = amenitiesString & element.Title & ","
                Next element
                ' Drop the separator left after the last amenity.
                If Len(amenitiesString) > 0 Then
                    amenitiesString = Left$(amenitiesString, Len(amenitiesString) - 1)
                End If
                sht.Cells(i, j).Value = amenitiesString
            ElseIf tableCell.className <> "features" Then
                sht.Cells(i, j).Value = tableCell.innerText
            End If
        Next tableCell
    Next tableRow
End Sub
我正在使用 HTTP 请求而不是 Internet Explorer 来获取 HTML。除此之外,我认为您可以了解如何访问您想要的元素。
这是结果的屏幕截图。
呈现方式有点简陋,但您应该明白意思了 :-P
基本上是这样的:
listing.getElementsByClassName("amenities")(0).innerText
会return一个空白,因为这些元素中没有内部文本。该信息由脚本生成,但也可以在 div
元素的 title
中找到。
使用的参考文献:
Microsoft HTML Object Library
和 WinHTTP Services Version 5.1
tl;dr;
提前为这个回答的长度(向某些人)道歉,但我想借此作为一个教学机会,详细说明其中到底发生了什么。
我使用的总体方法与您的代码相同:找到一个 css 选择器来隔离行(尽管在不同的选项卡中,小、中、大实际上仍然全部显示在页面上):
Set listings = html.querySelectorAll(".unitinfo")
上面生成了行。和以前一样,我们将其转储到一个新的 HTMLDocument
中,以便我们可以利用 querySelector/querySelectorAll
方法。
行数:
让我们看一下我们正在检索的第一行html。后续章节将以此行作为案例研究来讨论如何检索信息:
5x5</TD> <TD class=features>
<DIV id=a5x5-1 class="icon a5x5">
<DIV class=img><IMG src="about:/core/resources/images/units/5x5_icon.png"></DIV>
<DIV class=display>
<P>More Information</P></DIV></DIV>
<SCRIPT type=text/javascript>
// Refine Search
//
$(function() {
$("#a5x5-1").tooltip({
track: false,
delay: 0,
showURL: false,
left: 5,
top: 5,
bodyHandler: function () {
return " <div class=\"tooltip\"> <div class=\"tooltop\"></div> <div class=\"toolmid clearfix\"> <div class=\"toolcontent\"> <div style=\"text-align:center;width:100%\"> <img alt=\"5 x 5 storage unit\" src=\"/core/resources/images/units/5x5.png\" /> </div> <div class=\"display\">5 x 5</div> <div class=\"description\">Think of it like a standard closet. Approximately 25 square feet, this space is perfect for about a dozen boxes, a desk and chair, and a bicycle.</div> </div> <div class=\"clearfix\"></div> </div> <div class=\"toolfoot\"></div> <div class=\"clearfix\"></div> </div> "}
});
});
</SCRIPT>
</TD><TD class=rates>
<DIV class="discount_price secondary-color-text standard_price--left">
<DIV class=price_text>Web Rate: </DIV>
<DIV class="rate_text primary-color-text rate_text--clear">$39.00 </DIV></DIV>
<SCRIPT>
$( document ).ready(function() {
$('.units_table tr.unitinfo').each(function(index, el) {
if ($(this).find('.standard_price').length != '' && $(this).find('.discount_price').length != '') {
$(this).parents('.units_table').addClass('both');
$(this).addClass('also-both');
$(this).find('.rates').addClass('rates_two_column');
}
});
});
</SCRIPT>
</TD><TD class=amenities>
<DIV title="Temperature Controlled" class="amenity_icon icon_climate"></DIV>
<DIV title="Interior Storage" class="amenity_icon icon_interior"></DIV>
<DIV title="Ground Floor" class="amenity_icon icon_ground_floor"></DIV></TD><TD class=offers>
<DIV class=offer1>Call for Specials </DIV>
<DIV class=offer2></DIV></TD><TD class=reserve><A id=5x5:39:00000000 class="facility_call_to_reserve cta_call primary-color primary-hover" href="about:blank#" rel=nofollow>Call </A></TD>
我们将要处理的每一行在 html2
变量中都有类似的 html。如果您有疑问,请查看上面显示的函数中的 javascript:
$('.units_table tr.unitinfo').each(function(index, el)
它使用相同的选择器(但也指定了父元素 table class 和元素类型 (tr
))。基本上,table 中的每一行都会调用该函数。
尺寸:
出于某种原因,开头的 td
标签被删除了(我认为我已经看到它缺少父 <table>
标签)所以为了大小,而不是通过 class 抓取,我正在寻找结束标记的开始并将字符串提取到那里。我通过将 Instr 给出的 return 值(其中 < 在字符串中找到)-1 传递给 Left$
(类型)函数来做到这一点。
results(r, 1) = Left$(html2.body.innerHTML, InStr(html2.body.innerHTML, "<") - 1)
这个returns 5x5
.
描述:
描述列由我们在上面看到的函数填充(记住它应用于每一行)
这个位 - $("#a5x5-1").tooltip
- 告诉它目标位置,然后函数的 return 语句提供 html 有一个 div
,用 class description
,包含我们想要的文字。由于我们没有使用浏览器,而我使用的是 64 位 Windows,我无法执行此脚本,但我可以使用 Split
提取位于 "description\">
之后、
其关联 div 的结束标签(</div>)
开头之前的字符串(即描述):
results(r, 2) = Split(Split(html2.querySelector("SCRIPT").innerHTML, """description\"">")(1), "</div>")(0)
这个returns:
"Think of it like a standard closet. Approximately 25 square feet, this space is perfect for about a dozen boxes, a desk and chair, and a bicycle."
费率类型和价格:
这些很简单,使用 class 名称作为目标:
results(r, 3) = Replace$(html2.querySelector(".price_text").innerText, ":", vbNullString)
results(r, 4) = Trim$(html2.querySelector(".rate_text").innerText)
返回(分别)
Web Rate, $39.00
便利设施:
这是事情有点棘手的地方。
让我们重新检查上面显示的 html,对于这一行,它与便利设施有关:
<TD class=amenities>
<DIV title="Temperature Controlled" class="amenity_icon icon_climate"></DIV>
<DIV title="Interior Storage" class="amenity_icon icon_interior"></DIV>
<DIV title="Ground Floor" class="amenity_icon icon_ground_floor"></DIV></TD>
我们可以看到父 td
有一个 amenities
的 class,其子 div
元素具有复合 class 名称;后者在每种情况下都用作便利设施类型的标识符,例如icon_climate
.
当您将鼠标悬停在这些上时,页面上会显示工具提示信息:
我们可以在实际页面的 html 中跟踪此工具提示的位置:
当您将鼠标悬停在不同的便利设施上时,此内容会更新。
长话短说 (他在页面的一半处说!),此内容正在从服务器上的 php 文件更新。我们可以请求该文件并构建一个映射每个设施的 class 名称的字典,例如amenity_icon icon_climate
(当转换为 .amenity_icon.icon_climate
的适当 css 选择器时,复合 classes 需要将“ ”替换为“.”)到相关描述。您可以探索 php 文件 here.
php 文件:
让我们只看文件的开头,以剖析什么是重复模式的基本单位:
function LoadTooltips() {
$(".units_table .amenity_icon.icon_climate").tooltip({
track: false,
delay: 0,
showURL: false,
left: -126,
top: -100,
bodyHandler: function () {
return "<div class=\"sidebar_tooltip\"><h4>Temperature Controlled</h4><p>Units are heated and/or cooled. See manager for details.</p></div>"
}
});
负责更新工具提示的函数是LoadTooltips
。 CSS class 选择器用于定位每个图标:
$(".units_table .amenity_icon.icon_climate").tooltip
并且我们有 bodyhandler 指定 return 文本:
bodyHandler: function () {
return "<div class=\"sidebar_tooltip\"><h4>Temperature Controlled</h4><p>Units are heated and/or cooled. See manager for details.</p></div>"
我们有三位有用的信息出现在重复的组中。元素的 class 名称选择器、简短描述和详细描述,例如
.amenity_icon.icon_climate
:我们使用它来将 php 文件描述映射到我们行中便利设施图标的 class 名称。 CSS 选择器Temperature Controlled
;在工具提示功能 return 文本的h4
标签内。 简短描述Units are heated and/or cooled. See manager for details.
;在工具提示功能 return 文本的p
标签内。 详细说明
我写了 2 个函数,GetMatches
和 GetAmenitiesDescriptions
,它们使用正则表达式为每个图标提取所有重复项,return 一个字典,其中有 css 选择器作为键,短 description : long description
作为值。
当我收集每一行中的所有图标时:
Set icons = html2.querySelectorAll(".amenity_icon")
我根据图标的 class 名称使用词典 return 工具提示描述
For icon = 0 To icons.Length - 1 'use class name of amenity to look up description
amenitiesInfo(icon) = amenitiesDescriptions("." & Replace$(icons.item(icon).className, Chr$(32), "."))
Next
然后我用 vbNewLine
加入描述以确保输出在输出单元格中的不同行上。
您可以探索正则表达式 here。
正则表达式使用 |
(或)语法,所以我 return 将所有匹配的模式放在一个列表中。
arr = GetMatches(re, s, "(\.amenity_icon\..*)""|<h4>(.*)<\/h4>|<p>(.*)<\/p>")
因为我想要不同的子匹配(0,1 或 2 a.k.a css class 选择器,短描述,长描述)我使用 Select Case i mod 3
,与计数器变量 i
,以提取适当的子匹配项。
php 文件中映射的匹配示例:
特价:
我们回到 class 选择器。 Offer2
未填充,因此您可以删除。
results(r, 6) = html2.querySelector(".offer1").innerText
results(r, 7) = html2.querySelector(".offer2").innerText
returns(分别):
"Call for Specials",空字符串
结束语:
因此,上面的内容带您浏览了一行。它只是冲洗并在所有行的循环中重复。为了提高效率,将数据添加到数组中,results
;然后一次性写入 Sheet1
。我可以看到一些小的改进,但这很快。
VBA:
Option Explicit
' Downloads the facility page, extracts one record per ".unitinfo" row
' (size, tooltip description, rate type, price, amenity descriptions, offers)
' into an array, and writes headers plus the array to Sheet1 in one go.
'
' Each row's inner HTML is loaded into a scratch HTMLDocument (html2) so that
' querySelector / querySelectorAll can be used on it.
'
' Robustness changes compared with the earlier version:
'   * no rows on the page -> only the headers are written (ReDim 1 To 0 would raise);
'   * a row without amenity icons no longer raises on ReDim (0 To -1);
'   * missing nodes / missing "<" / missing description marker leave the cell
'     blank instead of raising;
'   * an amenity class missing from the lookup falls back to the icon's title.
' Requires reference: Microsoft HTML Object Library.
Public Sub GetInfo()
    Const URL As String = "https://www.safeandsecureselfstorage.com/self-storage-lake-villa-il-86955"
    Dim ws As Worksheet, html As HTMLDocument, html2 As HTMLDocument
    Dim amenitiesDescriptions As Object, listings As Object, icons As Object, scriptNode As Object
    Dim headers(), results(), amenitiesInfo(), parts() As String
    Dim rowCount As Long, r As Long, item As Long, icon As Long, ltPos As Long
    Dim rowHtml As String, key As String

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set html = New HTMLDocument
    Set html2 = New HTMLDocument
    Set amenitiesDescriptions = GetAmenitiesDescriptions

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", URL, False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        html.body.innerHTML = .responseText
    End With

    headers = Array("Size", "Description", "RateType", "Price", "Amenities", "Offer1", "Offer2")
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    Set listings = html.querySelectorAll(".unitinfo")
    rowCount = listings.Length
    If rowCount = 0 Then Exit Sub

    ReDim results(1 To rowCount, 1 To UBound(headers) + 1)

    For item = 0 To rowCount - 1
        r = item + 1
        html2.body.innerHTML = listings.item(item).innerHTML
        rowHtml = html2.body.innerHTML

        ' Size: the text before the first tag (the opening <td> is lost when the
        ' row is re-parsed outside a table).
        ltPos = InStr(rowHtml, "<")
        If ltPos > 0 Then results(r, 1) = Left$(rowHtml, ltPos - 1)

        ' Description: lives inside the tooltip script as an escaped HTML string.
        Set scriptNode = html2.querySelector("SCRIPT")
        If Not scriptNode Is Nothing Then
            parts = Split(scriptNode.innerHTML, """description\"">")
            If UBound(parts) >= 1 Then
                If Len(parts(1)) > 0 Then results(r, 2) = Split(parts(1), "</div>")(0)
            End If
        End If

        results(r, 3) = Replace$(QueryText(html2, ".price_text"), ":", vbNullString)
        results(r, 4) = Trim$(QueryText(html2, ".rate_text"))

        ' Amenities: map each icon's compound class to its tooltip description.
        Set icons = html2.querySelectorAll(".amenity_icon")
        If icons.Length > 0 Then
            ReDim amenitiesInfo(0 To icons.Length - 1)
            For icon = 0 To icons.Length - 1 'use class name of amenity to look up description
                key = "." & Replace$(icons.item(icon).className, Chr$(32), ".")
                If amenitiesDescriptions.Exists(key) Then
                    amenitiesInfo(icon) = amenitiesDescriptions(key)
                Else
                    amenitiesInfo(icon) = icons.item(icon).Title
                End If
            Next icon
            ' One amenity per line inside the output cell.
            results(r, 5) = Join$(amenitiesInfo, vbNewLine)
        Else
            results(r, 5) = vbNullString
        End If

        results(r, 6) = QueryText(html2, ".offer1")
        results(r, 7) = QueryText(html2, ".offer2")
    Next item

    ws.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

' Returns the innerText of the first node matching cssSelector in doc, or an
' empty string when nothing matches.
Private Function QueryText(ByVal doc As HTMLDocument, ByVal cssSelector As String) As String
    Dim node As Object
    Set node = doc.querySelector(cssSelector)
    If Not node Is Nothing Then QueryText = node.innerText
End Function
' Retrieves the amenity tooltip definitions from the site's common.tooltip.php
' script and returns a Scripting.Dictionary keyed by the CSS class selector
' (e.g. ".amenity_icon.icon_climate") whose value is
' "<short description>: <long description>".
'
' GetMatches returns a flat array of (selector, short, long) triples. Only
' complete triples are consumed, so a match count that is not a multiple of 3
' (including the 1-element "no match" array) yields fewer or zero entries
' instead of a subscript-out-of-range error.
Public Function GetAmenitiesDescriptions() As Object 'retrieve amenities descriptions from php file on server
    Dim s As String, dict As Object, re As Object, i As Long, arr() 'keys based on classname, short desc, full desc
    ' view regex here: https://regex101.com/r/bII5AL/1
    Set dict = CreateObject("Scripting.Dictionary")
    Set re = CreateObject("vbscript.regexp")

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", "https://www.safeandsecureselfstorage.com/core/resources/js/src/common.tooltip.php", False
        .setRequestHeader "User-Agent", "Mozilla/5.0"
        .send
        s = .responseText
    End With

    arr = GetMatches(re, s, "(\.amenity_icon\..*)""|<h4>(.*)<\/h4>|<p>(.*)<\/p>")

    ' Stop at the last index that still has two successors.
    For i = LBound(arr) To UBound(arr) - 2 Step 3 'build up lookup dictionary for amenities descriptions
        dict(arr(i)) = arr(i + 1) & ": " & arr(i + 2)
    Next i

    Set GetAmenitiesDescriptions = dict
End Function
' Runs sPattern over inputString (global, multiline, case-sensitive) and returns
' a zero-based Variant array with one entry per match.
' The n-th match contributes its submatch number (n Mod 3), i.e. matches are
' expected to cycle through capture groups 0, 1, 2 in order.
' When nothing matches, a single-element array holding vbNullString is returned.
Public Function GetMatches(ByVal re As Object, inputString As String, ByVal sPattern As String) As Variant
    Dim found As Object, hit As Object, collected(), idx As Long

    re.Global = True
    re.MultiLine = True
    re.IgnoreCase = False
    re.Pattern = sPattern

    ' Guard clause: no match at all.
    If Not re.test(inputString) Then
        ReDim collected(0)
        collected(0) = vbNullString
        GetMatches = collected
        Exit Function
    End If

    Set found = re.Execute(inputString)
    ReDim collected(0 To found.Count - 1)

    For Each hit In found
        ' Position in the 3-way cycle selects which capture group to keep.
        collected(idx) = hit.SubMatches.item(idx Mod 3)
        idx = idx + 1
    Next hit

    GetMatches = collected
End Function
输出:
参考资料(VBE > 工具 > 参考资料):
- 微软HTML对象库