VBA 无法使用 .getElementsByTag() 或 .getElementByID() 从 HTML 获取数据

VBA cannot get data from HTML with .getElementsByTag() nor .getElementByID()

我当前的项目需要从 HTML 源代码中提取数据。具体来说,我正在查看此网站上的交通碰撞事故(crash)案例:

https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID=112007272

我想通过查找特定 tags/IDs 的 .innertext 从 HTML 收集所有相关数据。

到目前为止我的代码:

Sub ExtractData()
    ' Opens the case search page, clicks the "Search" button, then visits every
    ' case hyperlink in the result list and prints the innerHTML of each <tbody>
    ' of the case page to the Immediate window.
    '
    ' Two Internet Explorer windows are used: objIE holds the list of cases,
    ' objIEdata loads one case at a time. Both are visible for debugging only.
    Const SEARCH_URL As String = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Index"
    Const READY_TIMEOUT_SECONDS As Long = 30

    Dim objIE As Object, objIEdata As Object
    Dim candidate As Object, caseLink As Object, untermenue As Object
    Dim caseText As String, searchClicked As Boolean
    Dim errNumber As Long, errSource As String, errDescription As String

    On Error GoTo Fail

    If Not LoadPage(objIE, SEARCH_URL) Then
        Err.Raise vbObjectError + 513, "ExtractData", "Could not load " & SEARCH_URL
    End If

    ' Look for the Search button; report a miss once, after all buttons were checked.
    For Each candidate In objIE.document.getElementsByTagName("button")
        If Trim$(candidate.innerText) = "Search" Then
            candidate.Click
            searchClicked = True
            Exit For
        End If
    Next

    If Not searchClicked Then
        MsgBox "Search Button was not found. Please improve code!"
        GoTo CleanExit
    End If

    ' Give the click a moment to start the request, then wait for the result list.
    Application.Wait (Now + TimeValue("0:00:02"))
    If Not WaitUntilReady(objIE, READY_TIMEOUT_SECONDS) Then
        Err.Raise vbObjectError + 514, "ExtractData", "The search results did not finish loading."
    End If

    ' All hyperlinks are <a> elements; case links have the text form xxxx-xx-xxx
    ' (11 characters, exactly two dashes).
    For Each caseLink In objIE.document.getElementsByTagName("a")
        caseText = caseLink.innerText
        If UBound(Split(caseText, "-")) = 2 And Len(caseText) = 11 Then
            Debug.Print caseText

            ' The second IE window is used for each case.
            If LoadPage(objIEdata, caseLink.href) Then
                ' NOTE: the method is getElementsByTagName (plural). It returns a
                ' collection that is searched through the whole document, so there
                ' is no need to walk down parent by parent.
                For Each untermenue In objIEdata.document.getElementsByTagName("tbody")
                    Debug.Print untermenue.innerHTML
                Next
            Else
                Debug.Print "Skipped (page did not load): " & caseLink.href
            End If

            Application.Wait (Now + TimeValue("0:00:02"))
        End If
    Next

CleanExit:
    ' Always close both browser windows, even after an error.
    On Error Resume Next
    If Not objIE Is Nothing Then objIE.Quit
    If Not objIEdata Is Nothing Then objIEdata.Quit
    On Error GoTo 0
    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Sub

Fail:
    ' Remember the error, clean up, then re-raise it to the caller.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanExit
End Sub

Private Function NewBrowser() As Object
    ' Creates a visible Internet Explorer window with the fixed debug position/size.
    Dim browser As Object
    Set browser = CreateObject("InternetExplorer.Application")
    browser.Top = 0
    browser.Left = 0
    browser.Width = 1600
    browser.Height = 900
    browser.Visible = True 'We can see IE
    Set NewBrowser = browser
End Function

Private Function WaitUntilReady(ByVal browser As Object, ByVal timeoutSeconds As Long) As Boolean
    ' Pumps events until the browser is idle with readyState 4 (complete).
    ' Returns False if that does not happen within timeoutSeconds.
    Dim deadline As Date
    deadline = DateAdd("s", timeoutSeconds, Now)
    Do
        DoEvents
        If Not browser.Busy And browser.readyState = 4 Then
            WaitUntilReady = True
            Exit Function
        End If
    Loop While Now < deadline
End Function

Private Function LoadPage(ByRef browser As Object, ByVal url As String) As Boolean
    ' Navigates browser to url and waits for the page to complete.
    ' If browser is Nothing a new window is created. On a navigation error or
    ' timeout the window is closed and a fresh one is tried, at most
    ' MAX_ATTEMPTS times in total. Returns True once a page has loaded.
    Const MAX_ATTEMPTS As Long = 3
    Const TIMEOUT_SECONDS As Long = 30
    Dim attempt As Long, ready As Boolean

    For attempt = 1 To MAX_ATTEMPTS
        If browser Is Nothing Then Set browser = NewBrowser()

        ready = False
        On Error Resume Next
        browser.navigate url
        If Err.Number = 0 Then ready = WaitUntilReady(browser, TIMEOUT_SECONDS)
        If Not ready Then
            ' Discard the broken window so the next attempt starts clean.
            browser.Quit
            Set browser = Nothing
        End If
        On Error GoTo 0

        If ready Then
            LoadPage = True
            Exit Function
        End If
    Next
End Function

请注意,IE 的可见性只是出于调试原因。

让我困惑的部分是

Set register = objIEdata.document.getElementByTagName("tbody").

如果我按标签名 "tbody" 查找,变量 register 返回为空;如果我按 ID "bodyMain" 查找,也会发生同样的情况。不幸的是,我不熟悉 HTML,也不清楚 VBA 如何与 HTML 文档交互。我的印象是,只要元素有 ID,我就可以通过 ID 直接访问它,但这似乎行不通。

我是否需要自己逐层遍历 HTML 的各个"分支"(branches),还是代码应该能够直接找到任意 ID,即使它位于某个子元素(child)之中?

非常感谢

你的问题涉及的范围很大,所以我只提供一些指导和起步代码。我的代码应该能写出所有表格,但你需要自己再调整,才能得到想要的格式。代码中关于如何高效选择元素的逻辑已经足够多,应该会有所帮助。* 由于时间限制,我还没有测试用 class 循环遍历所有检索到的 ID 的版本,但已经测试了单个案例以及所有 ID 的检索。


获取初始案例链接和 ID:

我可能会使用一个函数,让它返回一个包含链接和 ID 的数组。提取出 ID 之后,就可以把它们传给我在下面展示的 XMLHTTP 请求。

URL 是 https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search

Public Function GetLinksAndIds(ByVal URL) As Variant
    ' Opens URL in Internet Explorer, clicks the element with id "btnSubmit1",
    ' and collects every element whose href contains "CaseID=".
    '
    ' Returns a 1-based, two-column Variant array with one row per link:
    '   column 1 = the link's href, column 2 = the href with the case URL prefix
    '   removed (i.e. the case id).
    ' Raises an error if no case link is found. The IE window is always closed,
    ' also when an error occurs.
    Const CASE_URL_PREFIX As String = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID="

    Dim ie As InternetExplorer, i As Long
    Dim caseLinks As Object, linkUrl As String
    Dim linksAndIds()
    Dim errNumber As Long, errSource As String, errDescription As String

    Set ie = New InternetExplorer
    On Error GoTo Fail

    With ie
        .Visible = True
        .navigate2 URL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
    End With

    ' ReDim (1 To 0) is illegal, so report an empty result explicitly.
    If caseLinks.Length = 0 Then
        Err.Raise vbObjectError + 513, "GetLinksAndIds", "No links containing 'CaseID=' were found at " & URL
    End If

    ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
    For i = 0 To caseLinks.Length - 1
        ' Read href explicitly instead of relying on the element's default property.
        linkUrl = caseLinks.item(i).href
        linksAndIds(i + 1, 1) = linkUrl
        linksAndIds(i + 1, 2) = Replace$(linkUrl, CASE_URL_PREFIX, vbNullString)
    Next

    GetLinksAndIds = linksAndIds

CleanExit:
    On Error Resume Next
    ie.Quit
    On Error GoTo 0
    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Function

Fail:
    ' Remember the error so it can be re-raised after the browser is closed.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanExit
End Function

示例 return 值:


对于每种情况 - 使用 XMLHTTP:

我倾向于避免使用 IE,而改用 XMLHTTP 请求(使用经过 URL 编码的查询字符串,它会返回通过"打印"选项得到的、可读性更好的页面版本)。虽然我这里是用 CSS 选择器进行解析的,但你也可以把响应读入 MSXML2.DOMDocument60 并使用 XPath 进行查询。你可以把 caseid 拼接到 URL 中。

Option Explicit
Public Sub GetTables()
    ' Fetches the "Crash Overview - Summary" page of a single case with a
    ' synchronous XMLHTTP GET, parses the response into an HTMLDocument and
    ' pastes every <table> it contains onto Sheet1 via the clipboard, each one
    ' starting two rows below the last used row.
    Const CASE_URL As String = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=112007272&year=&fullimage=false" '<==concatenate caseid into URL

    Dim target As Worksheet, pasteBuffer As Object
    Dim request As Object, pageText As String
    Dim parsed As HTMLDocument, tableNodes As Object, tableIndex As Long

    Set target = ThisWorkbook.Worksheets("Sheet1")
    ' Late-bound clipboard helper object, created from its class id.
    Set pasteBuffer = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")

    Set request = CreateObject("MSXML2.XMLHTTP")
    request.Open "GET", CASE_URL, False
    ' A date in the past is sent as If-Modified-Since with the request.
    request.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
    request.send
    pageText = request.responseText
    Set request = Nothing

    Set parsed = New HTMLDocument
    parsed.body.innerHTML = pageText
    Set tableNodes = parsed.querySelectorAll("table")

    tableIndex = 0
    Do While tableIndex < tableNodes.Length
        ' Copy the table markup to the clipboard and paste it onto the sheet.
        pasteBuffer.SetText tableNodes.item(tableIndex).outerHTML
        pasteBuffer.PutInClipboard
        target.Cells(LastRow(target) + 2, 1).PasteSpecial
        tableIndex = tableIndex + 1
    Loop
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm '<< Function below modified from here

Public Function LastRow(ByVal sh As Worksheet) As Long
    ' Returns the row number of the bottom-most cell on sh that contains
    ' anything (searching formulas), or 0 when nothing is found or the
    ' search fails.
    Dim hit As Range

    ' Searching backwards from A1 by rows makes the first match the last used row.
    On Error Resume Next
    Set hit = sh.Cells.Find(What:="*", After:=sh.Range("A1"), _
                            LookIn:=xlFormulas, Lookat:=xlPart, _
                            SearchOrder:=xlByRows, SearchDirection:=xlPrevious, _
                            MatchCase:=False)
    On Error GoTo 0

    If Not hit Is Nothing Then LastRow = hit.Row
End Function

使用 class 来保存 xmlhttp 对象可能会是什么样子(未测试):

Class clsHTTP:

Option Explicit

' Late-bound MSXML2.XMLHTTP object, created once per class instance and
' reused by every GetString call.
Private http As Object

Private Sub Class_Initialize()
    Set http = CreateObject("MSXML2.XMLHTTP")
End Sub

Public Function GetString(ByVal URL As String) As String
    ' Sends a synchronous GET request to URL and returns the response body as text.
    With http
        .Open "GET", URL, False
        .send
        ' Assign to the function name; without this the function always returns "".
        GetString = .responseText
    End With
End Function

标准模块1:

Option Explicit
Public Sub GetTables()
    ' Retrieves all case links/ids with GetLinksAndIds, downloads the
    ' "Crash Overview - Summary" page of every case through a shared clsHTTP
    ' object, and pastes each <table> of each page onto Sheet1 via the
    ' clipboard, two rows below the last used row.
    Dim html As HTMLDocument, clipboard As Object, ws As Worksheet
    Dim initialLinksURL As String, http As clsHTTP, i As Long, j As Long, newURL As String
    Dim linksAndIds(), tables As Object

    Set http = New clsHTTP
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
    Set html = New HTMLDocument
    initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"

    linksAndIds = GetLinksAndIds(initialLinksURL)

    ' Rows (dimension 1) are the cases; dimension 2 only holds the two columns
    ' link / id, so the loop must run over dimension 1.
    For i = LBound(linksAndIds, 1) To UBound(linksAndIds, 1)

        newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
        html.body.innerHTML = http.GetString(newURL)

        Set tables = html.querySelectorAll("table")

        For j = 0 To tables.Length - 1
            clipboard.SetText tables.item(j).outerHTML
            clipboard.PutInClipboard
            ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
        Next
    Next
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm

Public Function LastRow(ByVal sh As Worksheet) As Long
    ' Returns the row number of the bottom-most cell on sh that contains
    ' anything (searching formulas), or 0 when nothing is found or the
    ' search fails.
    Dim hit As Range

    ' Searching backwards from A1 by rows makes the first match the last used row.
    On Error Resume Next
    Set hit = sh.Cells.Find(What:="*", After:=sh.Range("A1"), _
                            LookIn:=xlFormulas, Lookat:=xlPart, _
                            SearchOrder:=xlByRows, SearchDirection:=xlPrevious, _
                            MatchCase:=False)
    On Error GoTo 0

    If Not hit Is Nothing Then LastRow = hit.Row
End Function

Public Function GetLinksAndIds(ByVal URL) As Variant
    ' Opens URL in Internet Explorer, clicks the element with id "btnSubmit1",
    ' and collects every element whose href contains "CaseID=".
    '
    ' Returns a 1-based, two-column Variant array with one row per link:
    '   column 1 = the link's href, column 2 = the href with the case URL prefix
    '   removed (i.e. the case id).
    ' Raises an error if no case link is found. The IE window is always closed,
    ' also when an error occurs.
    Const CASE_URL_PREFIX As String = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID="

    Dim ie As InternetExplorer, i As Long
    Dim caseLinks As Object, linkUrl As String
    Dim linksAndIds()
    Dim errNumber As Long, errSource As String, errDescription As String

    Set ie = New InternetExplorer
    On Error GoTo Fail

    With ie
        .Visible = True
        .navigate URL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")
    End With

    ' ReDim (1 To 0) is illegal, so report an empty result explicitly.
    If caseLinks.Length = 0 Then
        Err.Raise vbObjectError + 513, "GetLinksAndIds", "No links containing 'CaseID=' were found at " & URL
    End If

    ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
    For i = 0 To caseLinks.Length - 1
        ' Read href explicitly instead of relying on the element's default property.
        linkUrl = caseLinks.item(i).href
        linksAndIds(i + 1, 1) = linkUrl
        linksAndIds(i + 1, 2) = Replace$(linkUrl, CASE_URL_PREFIX, vbNullString)
    Next

    GetLinksAndIds = linksAndIds

CleanExit:
    On Error Resume Next
    ie.Quit
    On Error GoTo 0
    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Function

Fail:
    ' Remember the error so it can be re-raised after the browser is closed.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanExit
End Function

完全使用 Internet Explorer 的方案:

Option Explicit

Public Sub GetTables()
    ' Internet Explorer only variant: loads the case search page, clicks the
    ' element with id "btnSubmit1", collects every link whose href contains
    ' "CaseID=", then navigates to the "Crash Overview - Summary" page of each
    ' case and pastes every <table> on it onto Sheet1 via the clipboard,
    ' two rows below the last used row.
    '
    ' Only the first MAX_CASES cases are processed (the limit the original code
    ' used while testing); raise the constant to process more.
    ' The IE window is always closed, also when an error occurs.
    Const MAX_CASES As Long = 2
    Const CASE_URL_PREFIX As String = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?xsl=main.xsl&CaseID="

    Dim clipboard As Object, ws As Worksheet
    Dim initialLinksURL As String, i As Long, j As Long, newURL As String
    Dim ie As InternetExplorer, caseLinks As Object, tables As Object
    Dim linksAndIds(), linkUrl As String, lastCase As Long
    Dim errNumber As Long, errSource As String, errDescription As String

    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
    initialLinksURL = "https://crashviewer.nhtsa.dot.gov/LegacyCDS/Search"

    Set ie = New InternetExplorer
    On Error GoTo Fail

    With ie
        .Visible = True
        .Navigate2 initialLinksURL

        While .Busy Or .readyState < 4: DoEvents: Wend

        .document.getElementById("btnSubmit1").Click

        While .Busy Or .readyState < 4: DoEvents: Wend

        Set caseLinks = .document.querySelectorAll("[href*='CaseID=']")

        ' ReDim (1 To 0) is illegal, so report an empty result explicitly.
        If caseLinks.Length = 0 Then
            Err.Raise vbObjectError + 513, "GetTables", "No links containing 'CaseID=' were found at " & initialLinksURL
        End If

        ReDim linksAndIds(1 To caseLinks.Length, 1 To 2)
        For i = 0 To caseLinks.Length - 1
            ' Read href explicitly instead of relying on the element's default property.
            linkUrl = caseLinks.item(i).href
            linksAndIds(i + 1, 1) = linkUrl
            linksAndIds(i + 1, 2) = Replace$(linkUrl, CASE_URL_PREFIX, vbNullString)
        Next

        ' Rows (dimension 1) are the cases. Never run past the number of cases
        ' actually found, even when it is smaller than MAX_CASES.
        lastCase = UBound(linksAndIds, 1)
        If lastCase > MAX_CASES Then lastCase = MAX_CASES

        For i = LBound(linksAndIds, 1) To lastCase

            newURL = "https://crashviewer.nhtsa.dot.gov/nass-cds/CaseForm.aspx?ViewPage&xsl=Case.xsl&tab=Crash&form=CaseForm&baseNode=&vehnum=-1&occnum=-1&pos=-1&pos2=-1&websrc=true&title=Crash%20Overview%20-%20Summary&caseid=" & linksAndIds(i, 2) & "&year=&fullimage=false"
            .Navigate2 newURL

            While .Busy Or .readyState < 4: DoEvents: Wend

            Set tables = .document.querySelectorAll("table")

            For j = 0 To tables.Length - 1
                clipboard.SetText tables.item(j).outerHTML
                clipboard.PutInClipboard
                ws.Cells(LastRow(ws) + 2, 1).PasteSpecial
            Next
        Next
    End With

CleanExit:
    On Error Resume Next
    ie.Quit
    On Error GoTo 0
    If errNumber <> 0 Then Err.Raise errNumber, errSource, errDescription
    Exit Sub

Fail:
    ' Remember the error so it can be re-raised after the browser is closed.
    errNumber = Err.Number
    errSource = Err.Source
    errDescription = Err.Description
    Resume CleanExit
End Sub

'https://www.rondebruin.nl/win/s9/win005.htm

Public Function LastRow(ByVal sh As Worksheet) As Long
    ' Returns the row number of the bottom-most cell on sh that contains
    ' anything (searching formulas), or 0 when nothing is found or the
    ' search fails.
    Dim hit As Range

    ' Searching backwards from A1 by rows makes the first match the last used row.
    On Error Resume Next
    Set hit = sh.Cells.Find(What:="*", After:=sh.Range("A1"), _
                            LookIn:=xlFormulas, Lookat:=xlPart, _
                            SearchOrder:=xlByRows, SearchDirection:=xlPrevious, _
                            MatchCase:=False)
    On Error GoTo 0

    If Not hit Is Nothing Then LastRow = hit.Row
End Function