将链接存储在计算机内存中而不是 excel 工作表中

Question

我正在尝试创建 web-scraper aka web-crawler 从网站下载 PDF 文件。我想将所有 PDF 文件下载到 C:\temp\。我目前有 link 到 Excel sheet A1:A17 的子页面。

他们被抓去 Excel 使用此代码工作sheet：

Sub GetAllLinks()

Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object

Set internet = CreateObject("InternetExplorer.Application")

internet.Visible = False

internet.navigate ("https://www.nordicwater.com/products/waste-water/")

    Do While internet.Busy
      DoEvents
    Loop
    Do Until internet.readyState = READYSTATE_COMPLETE

    DoEvents

    Loop

        Set internetdata = internet.document

        Set internetlink = internetdata.getElementsByTagName("a")

        i = 1

        For Each internetinnerlink In internetlink


            If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then

            ActiveSheet.Cells(i, 1) = internetinnerlink.href

            i = i + 1

            Else
            End If


Next internetinnerlink

End Sub

文件下载代码：

Sub DownloadFiles()

    Dim xHttp: Set xHttp = CreateObject("Microsoft.XMLHTTP")
    Dim bStrm
    Dim hDoc As MSHTML.HTMLDocument
    Dim hAnchor As MSHTML.HTMLAnchorElement
    Dim sPath As String
    Dim i As Long
    Dim wholeURL
    Dim link
    Dim range

    range = ThisWorkbook.Worksheets("Sheet1").range("A1:A17")

    wholeURL = "URL URL URL"

    sPath = "C:\temp\"

    For Each link In range

    'Get the directory listing
    xHttp.Open "GET", link
    xHttp.send

    'Wait for the page to load
    Do Until xHttp.readyState = 4
        DoEvents
    Loop

    'Put the page in an HTML document
    Set hDoc = New MSHTML.HTMLDocument
    hDoc.body.innerHTML = xHttp.responseText

    'Loop through the hyperlinks on the directory listing
    For i = 0 To hDoc.getElementsByTagName("a").Length - 1
        Set hAnchor = hDoc.getElementsByTagName("a").Item(i)

        'test the pathname to see if it matches your pattern
        If hAnchor.pathname Like "*.pdf" Then

                Debug.Print wholeURL & hAnchor.pathname

                xHttp.Open "GET", wholeURL & hAnchor.pathname, False
                xHttp.send

                Set bStrm = CreateObject("Adodb.Stream")

                With bStrm
                    .Type = 1 '//binary
                    .Open
                    .write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & hAnchor.pathname), 2 '//overwrite
                End With

                Set bStrm = Nothing

        End If

    Next i

    Next

End Sub

从 url:

获取文件名的函数

Function getName(pf)
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function

wholeURL = "URL URL URL":

A1:A17:

等等

如何将这些代码连接在一起，这样就不需要将 Excel Worksheet 用作 link 数据库，而是将 link 存储在计算机内存中？

编辑：

Sub DownloadFiles()
    Dim xHttp       As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
    Dim hDoc        As MSHTML.HTMLDocument
    Dim Anchors     As Object
    Dim Anchor      As Variant
    Dim sPath       As String
    Dim wholeURL    As String

    Dim internet As InternetExplorer
    Dim internetdata As HTMLDocument
    Dim internetlink As Object
    Dim internetinnerlink As Object
    Dim arrLinks As Variant
    Dim sLink As String
    Dim iLinkCount As Integer
    Dim iCounter As Integer
    Dim sLinks As String

    Set internet = CreateObject("InternetExplorer.Application")
    internet.Visible = False
    internet.navigate ("https://www.nordicwater.com/products/waste-water/")

        Do While internet.Busy
          DoEvents
        Loop
        Do Until internet.readyState = READYSTATE_COMPLETE
            DoEvents
        Loop

        Set internetdata = internet.document
        Set internetlink = internetdata.getElementsByTagName("a")

        i = 1

        For Each internetinnerlink In internetlink
            If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then

                sLinks = sLinks & internetinnerlink.href & vbCrLf
                i = i + 1

            Else
            End If

    ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks

    Next internetinnerlink

    wholeURL = "https://www.nordicwater.com/"
    sPath = "C:\temp\"

    arrLinks = Split(p_sLinks, vbCrLf)
    iLinkCount = UBound(arrLinks) + 1

    For iCounter = 1 To iLinkCount
    sLink = arrLinks(iCounter - 1)
        'Get the directory listing
        xHttp.Open "GET", sLink
        xHttp.send

        'Wait for the page to load
        Do Until xHttp.readyState = 4
            DoEvents
        Loop

        'Put the page in an HTML document
        Set hDoc = New MSHTML.HTMLDocument
        hDoc.body.innerHTML = xHttp.responseText

        'Loop through the hyperlinks on the directory listing
        Set Anchors = hDoc.getElementsByTagName("a")

        For Each Anchor In Anchors

            'test the pathname to see if it matches your pattern
            If Anchor.pathname Like "*.pdf" Then

                xHttp.Open "GET", wholeURL & Anchor.pathname, False
                xHttp.send

                With CreateObject("Adodb.Stream")
                    .Type = 1
                    .Open
                    .write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
                End With

            End If

        Next

    Next

End Sub
Function getName(pf As String) As String
    getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function

Answer 1

代码中有几个错误，我已在下面更正。您需要创建一个新的 ADODB.Stream 对象，或者确保关闭之前的对象。此外，您应该尽可能强类型化变量。我清理了几个地方。

Function getName(pf As String) As String
    getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function


Sub DownloadFiles()
    Dim xHttp       As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
    Dim hDoc        As MSHTML.HTMLDocument
    Dim Anchors     As Object
    Dim Anchor      As Variant
    Dim sPath       As String
    Dim wholeURL    As String
    Dim link        As range
    Dim range       As range

    Set range = ThisWorkbook.Worksheets("Sheet1").range("A1:A5")
    wholeURL = "https://www.nordicwater.com/"
    sPath = "C:\temp\"

    For Each link In range
        'Get the directory listing
        xHttp.Open "GET", link
        xHttp.send

        'Wait for the page to load
        Do Until xHttp.readyState = 4
            DoEvents
        Loop

        'Put the page in an HTML document
        Set hDoc = New MSHTML.HTMLDocument
        hDoc.body.innerHTML = xHttp.responseText

        'Loop through the hyperlinks on the directory listing
        Set Anchors = hDoc.getElementsByTagName("a")

        For Each Anchor In Anchors

            'test the pathname to see if it matches your pattern
            If Anchor.pathname Like "*.pdf" Then

                xHttp.Open "GET", wholeURL & Anchor.pathname, False
                xHttp.send

                With CreateObject("Adodb.Stream")
                    .Type = 1
                    .Open
                    .Write xHttp.responseBody
                    .SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
                End With

            End If

        Next

    Next

End Sub

Answer 2

您可以将所有 link 存储在一个字符串中，用 vbCrLf 分隔它们，然后使用 Split(yourstring, vbCrLf) 得到一个 link 数组。这样你就不需要在 Excel 中运行或者至少你不需要使用 Excel 单元格。

为此创建一个字符串变量，例如 sLinks。然后，在您的第一个循环中，替换

ActiveSheet.Cells(i, 1) = internetinnerlink.href

和

sLinks = sLinks & internetinnerlink.href & vbCrLf

完成后，您就消除了将 link 存储在 Excel 中的麻烦。然后您可以将此字符串作为参数传递给您的 DownloadFiles sub:

Sub DownloadFiles(p_sLinks)
    Dim arrLinks As Variant
    Dim sLink As String
    Dim iLinkCount As Integer
    Dim iCounter As Integer

    arrLinks = Split(p_sLinks, vbCrLf)
    iLinkCount = UBound(arrLinks) + 1

    For iCounter = 1 to iLinkCount
        sLink = arrLinks(iCounter - 1)
        ' Process sLink here
    Next

End Sub

您可以将此代码与现有的 DownloadFiles 子程序合并，用 For iCounter = 1 to iLinkCount 替换 For Each link In range 循环，将循环中的代码放入这个新循环中并使用 sLink 作为link 来处理而不是从 Excel.

中读取它

您可以将您的一些代码分解为 Sub，以便于阅读和故障排除：

Sub DownloadFile(p_sURL, p_sLocalPath)
    Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")

    xHttp.Open "GET", p_sURL, False
    xHttp.send

    With CreateObject("Adodb.Stream")
        .Type = 1
        .Open
        .write xHttp.responseBody
        .SaveToFile p_sLocalPath & getName(p_sURL), 2 ' //overwrite
    End With

End Sub

将链接存储在计算机内存中而不是 excel 工作表中

Store links in Computer memory instead of excel worksheet

excel

vba

web-crawler

web-scraping