使用 Excel VBA 从网站上抓取文本
Scrape text from a website using Excel VBA
我发现这个 article 解释了如何使用 Excel VBA 从网站上抓取某些标签。
下面的代码从它找到的第一个 <p>
标签获取内容:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.title
On Error GoTo err_clear
Cells(i, 3) = doc.GetElementsByTagName("p")(0).innerText
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
我想让抓取工具获取网页上 <p>
标记内的所有内容。所以我想缺少某种 foreach
功能。
如何收集来自多个 <p>
标签的内容?
更新
工作代码!
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
counter = counter + 1
Cells(i, counter + 2).Value = Cells(counter + 1).Value & el.innerText
Next el
counter = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 10)).Columns.AutoFit
Next i
End Sub
你就快完成了! doc.GetElementsByTagName("p")
returns HTMLParagraphElement
个对象的集合,您使用 doc.GetElementsByTagName("p")(0)
访问了第一个条目。正如您提到的,For Each
循环可以让您依次访问每个循环:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
Cells(i, 3).Value = Cells(i, 3).Value & ", " & el.innerText
Next el
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
如果只是需要获取纯文本的网页内容,这段代码更简洁
Function WEBSITE_TEXT(Destination As String) As String
' Requires a reference to Microsoft XML, v6.0
' Draws on the Whosebug answer at bit.ly/parseXML
Dim myRequest As XMLHTTP60
Dim myDomDoc As DOMDocument60
' Check and clean inputs
On Error GoTo exitRoute
If Destination = "" Then
WEBSITE_TEXT = ""
Exit Function
End If
' Read the XML data from the Google Maps API
Set myRequest = New XMLHTTP60
myRequest.Open "GET", Destination, False
myRequest.send
' Parse HTML content
Dim html As New HTMLDocument
Dim text As String
html.body.innerHTML = myRequest.responseText
' Return the website content
text = html.body.innerText
If Not html Is Nothing Then WEBSITE_TEXT = text
exitRoute:
' Tidy up
text = ""
Set myRequest = Nothing
End Function
我发现这个 article 解释了如何使用 Excel VBA 从网站上抓取某些标签。
下面的代码从它找到的第一个 <p>
标签获取内容:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.title
On Error GoTo err_clear
Cells(i, 3) = doc.GetElementsByTagName("p")(0).innerText
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
我想让抓取工具获取网页上 <p>
标记内的所有内容。所以我想缺少某种 foreach
功能。
如何收集来自多个 <p>
标签的内容?
更新 工作代码!
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
counter = counter + 1
Cells(i, counter + 2).Value = Cells(counter + 1).Value & el.innerText
Next el
counter = 0
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 10)).Columns.AutoFit
Next i
End Sub
你就快完成了! doc.GetElementsByTagName("p")
returns HTMLParagraphElement
个对象的集合,您使用 doc.GetElementsByTagName("p")(0)
访问了第一个条目。正如您提到的,For Each
循环可以让您依次访问每个循环:
Sub get_title_header()
Dim wb As Object
Dim doc As Object
Dim sURL As String
Dim lastrow As Long
Dim i As Integer
lastrow = Sheet1.Cells(Rows.Count, "A").End(xlUp).Row
For i = 2 To lastrow
Set wb = CreateObject("internetExplorer.Application")
sURL = Cells(i, 1)
wb.navigate sURL
wb.Visible = True
While wb.Busy
DoEvents
Wend
'HTML document
Set doc = wb.document
Cells(i, 2) = doc.Title
On Error GoTo err_clear
Dim el As Object
For Each el In doc.GetElementsByTagName("p")
Cells(i, 3).Value = Cells(i, 3).Value & ", " & el.innerText
Next el
err_clear:
If Err <> 0 Then
Err.Clear
Resume Next
End If
wb.Quit
Range(Cells(i, 1), Cells(i, 3)).Columns.AutoFit
Next i
End Sub
如果只是需要获取纯文本的网页内容,这段代码更简洁
Function WEBSITE_TEXT(Destination As String) As String
' Requires a reference to Microsoft XML, v6.0
' Draws on the Whosebug answer at bit.ly/parseXML
Dim myRequest As XMLHTTP60
Dim myDomDoc As DOMDocument60
' Check and clean inputs
On Error GoTo exitRoute
If Destination = "" Then
WEBSITE_TEXT = ""
Exit Function
End If
' Read the XML data from the Google Maps API
Set myRequest = New XMLHTTP60
myRequest.Open "GET", Destination, False
myRequest.send
' Parse HTML content
Dim html As New HTMLDocument
Dim text As String
html.body.innerHTML = myRequest.responseText
' Return the website content
text = html.body.innerText
If Not html Is Nothing Then WEBSITE_TEXT = text
exitRoute:
' Tidy up
text = ""
Set myRequest = Nothing
End Function