VBA 跨多个网页抓取
VBA Scraping across multiple webpages
所以,我有以下代码来从网站上抓取数据并且它工作没有任何问题。
我的 "issue" 现在我需要 运行 代码遍历多个网页,因为我正在抓取的网站有一个分页脚本。
例如:单个页面有 48 条记录,但在大多数情况下该页面有 200 多条记录,但它们被细分为 3/4 页。
我的代码:
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False
.send
html.body.innerHTML = .responseText
End With
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
更新
我试过在 With
之前添加这个 For n = 1 To 2
,它有效,但我需要知道确切的页数,所以这不是很有帮助..
用结果数除以每页的结果数,计算出有多少页。然后循环将适当的页码连接到 url
Option Explicit
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
Const RESULTS_PER_PAGE As Long = 48
Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", START_URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
Dim numPages As Long, numResults As Long, arr() As String
arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))
numResults = arr(UBound(arr))
numPages = 1
If numResults > RESULTS_PER_PAGE Then
numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)
End If
For i = 1 To numPages
If i > 1 Then
.Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
End If
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Next
End With
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
想想@AhmedAu 所说的,如果页面已正确加载,看起来也是获取页数的好方法是简单地使用:
numPages = html.querySelectorAll("[data-page]").Length
所以,我有以下代码来从网站上抓取数据并且它工作没有任何问题。
我的 "issue" 现在我需要 运行 代码遍历多个网页,因为我正在抓取的网站有一个分页脚本。
例如:单个页面有 48 条记录,但在大多数情况下该页面有 200 多条记录,但它们被细分为 3/4 页。
我的代码:
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=100", False
.send
html.body.innerHTML = .responseText
End With
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
更新
我试过在 With
之前添加这个 For n = 1 To 2
,它有效,但我需要知道确切的页数,所以这不是很有帮助..
用结果数除以每页的结果数,计算出有多少页。然后循环将适当的页码连接到 url
Option Explicit
Public Sub Roupa()
Dim data As Object, i As Long, html As HTMLDocument, r As Long, c As Long, item As Object, div As Object
Set html = New HTMLDocument '<== VBE > Tools > References > Microsoft HTML Object Library
Const RESULTS_PER_PAGE As Long = 48
Const START_URL As String = "https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1"
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", START_URL, False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
Dim numPages As Long, numResults As Long, arr() As String
arr = Split(html.querySelector(".w-filters__element").innerText, Chr$(32))
numResults = arr(UBound(arr))
numPages = 1
If numResults > RESULTS_PER_PAGE Then
numPages = Application.RoundUp(numResults / RESULTS_PER_PAGE, 0)
End If
For i = 1 To numPages
If i > 1 Then
.Open "GET", Replace$("https://www.worten.pt/grandes-eletrodomesticos/maquinas-de-roupa/maquinas-de-roupa-ver-todos-marca-BALAY-e-BOSCH-e-SIEMENS?per_page=" & RESULTS_PER_PAGE & "&page=1", "page=1", "page=" & i), False
.setRequestHeader "User-Agent", "Mozilla/5.0"
.send
html.body.innerHTML = .responseText
End If
Set data = html.getElementsByClassName("w-product__content")
For Each item In data
r = r + 1: c = 1
For Each div In item.getElementsByTagName("div")
With ThisWorkbook.Worksheets("Roupa")
.Cells(r, c) = div.innerText
End With
c = c + 1
Next
Next
Next
End With
Sheets("Roupa").Range("A:A,C:C,F:F,G:G,H:H,I:I").EntireColumn.Delete
End Sub
想想@AhmedAu 所说的,如果页面已正确加载,看起来也是获取页数的好方法是简单地使用:
numPages = html.querySelectorAll("[data-page]").Length