将链接存储在计算机内存中而不是 excel 工作表中
Store links in Computer memory instead of excel worksheet
我正在尝试创建 web-scraper aka web-crawler 从网站下载 PDF 文件。我想将所有 PDF 文件下载到 C:\temp\
。我目前有 link 到 Excel sheet A1:A17
的子页面。
他们被抓去 Excel 使用此代码工作sheet:
Sub GetAllLinks()
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then
ActiveSheet.Cells(i, 1) = internetinnerlink.href
i = i + 1
Else
End If
Next internetinnerlink
End Sub
文件下载代码:
Sub DownloadFiles()
Dim xHttp: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim bStrm
Dim hDoc As MSHTML.HTMLDocument
Dim hAnchor As MSHTML.HTMLAnchorElement
Dim sPath As String
Dim i As Long
Dim wholeURL
Dim link
Dim range
range = ThisWorkbook.Worksheets("Sheet1").range("A1:A17")
wholeURL = "URL URL URL"
sPath = "C:\temp\"
For Each link In range
'Get the directory listing
xHttp.Open "GET", link
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
For i = 0 To hDoc.getElementsByTagName("a").Length - 1
Set hAnchor = hDoc.getElementsByTagName("a").Item(i)
'test the pathname to see if it matches your pattern
If hAnchor.pathname Like "*.pdf" Then
Debug.Print wholeURL & hAnchor.pathname
xHttp.Open "GET", wholeURL & hAnchor.pathname, False
xHttp.send
Set bStrm = CreateObject("Adodb.Stream")
With bStrm
.Type = 1 '//binary
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & hAnchor.pathname), 2 '//overwrite
End With
Set bStrm = Nothing
End If
Next i
Next
End Sub
从 url:
获取文件名的函数
Function getName(pf)
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
wholeURL = "URL URL URL":
A1:A17:
等等
如何将这些代码连接在一起,这样就不需要将 Excel Worksheet 用作 link 数据库,而是将 link 存储在计算机内存中?
编辑:
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
Dim sLinks As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then
sLinks = sLinks & internetinnerlink.href & vbCrLf
i = i + 1
Else
End If
ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks
Next internetinnerlink
wholeURL = "https://www.nordicwater.com/"
sPath = "C:\temp\"
arrLinks = Split(p_sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 To iLinkCount
sLink = arrLinks(iCounter - 1)
'Get the directory listing
xHttp.Open "GET", sLink
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
代码中有几个错误,我已在下面更正。您需要创建一个新的 ADODB.Stream
对象,或者确保关闭之前的对象。此外,您应该尽可能强类型化变量。我清理了几个地方。
Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim link As range
Dim range As range
Set range = ThisWorkbook.Worksheets("Sheet1").range("A1:A5")
wholeURL = "https://www.nordicwater.com/"
sPath = "C:\temp\"
For Each link In range
'Get the directory listing
xHttp.Open "GET", link
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.Write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
您可以将所有 link 存储在一个字符串中,用 vbCrLf
分隔它们,然后使用 Split(yourstring, vbCrLf)
得到一个 link 数组。这样你就不需要在 Excel 中 运行 或者至少你不需要使用 Excel 单元格。
为此创建一个字符串变量,例如 sLinks
。然后,在您的第一个循环中,替换
ActiveSheet.Cells(i, 1) = internetinnerlink.href
和
sLinks = sLinks & internetinnerlink.href & vbCrLf
完成后,您就消除了将 link 存储在 Excel 中的麻烦。然后您可以将此字符串作为参数传递给您的 DownloadFiles
sub:
Sub DownloadFiles(p_sLinks)
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
arrLinks = Split(p_sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 to iLinkCount
sLink = arrLinks(iCounter - 1)
' Process sLink here
Next
End Sub
您可以将此代码与现有的 DownloadFiles 子程序合并,用 For iCounter = 1 to iLinkCount
替换 For Each link In range
循环,将循环中的代码放入这个新循环中并使用 sLink
作为link 来处理而不是从 Excel.
中读取它
您可以将您的一些代码分解为 Sub,以便于阅读和故障排除:
Sub DownloadFile(p_sURL, p_sLocalPath)
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
xHttp.Open "GET", p_sURL, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile p_sLocalPath & getName(p_sURL), 2 ' //overwrite
End With
End Sub
我正在尝试创建 web-scraper aka web-crawler 从网站下载 PDF 文件。我想将所有 PDF 文件下载到 C:\temp\
。我目前有 link 到 Excel sheet A1:A17
的子页面。
他们被抓去 Excel 使用此代码工作sheet:
Sub GetAllLinks()
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then
ActiveSheet.Cells(i, 1) = internetinnerlink.href
i = i + 1
Else
End If
Next internetinnerlink
End Sub
文件下载代码:
Sub DownloadFiles()
Dim xHttp: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim bStrm
Dim hDoc As MSHTML.HTMLDocument
Dim hAnchor As MSHTML.HTMLAnchorElement
Dim sPath As String
Dim i As Long
Dim wholeURL
Dim link
Dim range
range = ThisWorkbook.Worksheets("Sheet1").range("A1:A17")
wholeURL = "URL URL URL"
sPath = "C:\temp\"
For Each link In range
'Get the directory listing
xHttp.Open "GET", link
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
For i = 0 To hDoc.getElementsByTagName("a").Length - 1
Set hAnchor = hDoc.getElementsByTagName("a").Item(i)
'test the pathname to see if it matches your pattern
If hAnchor.pathname Like "*.pdf" Then
Debug.Print wholeURL & hAnchor.pathname
xHttp.Open "GET", wholeURL & hAnchor.pathname, False
xHttp.send
Set bStrm = CreateObject("Adodb.Stream")
With bStrm
.Type = 1 '//binary
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & hAnchor.pathname), 2 '//overwrite
End With
Set bStrm = Nothing
End If
Next i
Next
End Sub
从 url:
获取文件名的函数Function getName(pf)
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
wholeURL = "URL URL URL":
A1:A17:
等等
如何将这些代码连接在一起,这样就不需要将 Excel Worksheet 用作 link 数据库,而是将 link 存储在计算机内存中?
编辑:
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
Dim sLinks As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.nordicwater.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.nordicwater.com/product/" Then
sLinks = sLinks & internetinnerlink.href & vbCrLf
i = i + 1
Else
End If
ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks
Next internetinnerlink
wholeURL = "https://www.nordicwater.com/"
sPath = "C:\temp\"
arrLinks = Split(p_sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 To iLinkCount
sLink = arrLinks(iCounter - 1)
'Get the directory listing
xHttp.Open "GET", sLink
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
代码中有几个错误,我已在下面更正。您需要创建一个新的 ADODB.Stream
对象,或者确保关闭之前的对象。此外,您应该尽可能强类型化变量。我清理了几个地方。
Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim link As range
Dim range As range
Set range = ThisWorkbook.Worksheets("Sheet1").range("A1:A5")
wholeURL = "https://www.nordicwater.com/"
sPath = "C:\temp\"
For Each link In range
'Get the directory listing
xHttp.Open "GET", link
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.Write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
您可以将所有 link 存储在一个字符串中,用 vbCrLf
分隔它们,然后使用 Split(yourstring, vbCrLf)
得到一个 link 数组。这样你就不需要在 Excel 中 运行 或者至少你不需要使用 Excel 单元格。
为此创建一个字符串变量,例如 sLinks
。然后,在您的第一个循环中,替换
ActiveSheet.Cells(i, 1) = internetinnerlink.href
和
sLinks = sLinks & internetinnerlink.href & vbCrLf
完成后,您就消除了将 link 存储在 Excel 中的麻烦。然后您可以将此字符串作为参数传递给您的 DownloadFiles
sub:
Sub DownloadFiles(p_sLinks)
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
arrLinks = Split(p_sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 to iLinkCount
sLink = arrLinks(iCounter - 1)
' Process sLink here
Next
End Sub
您可以将此代码与现有的 DownloadFiles 子程序合并,用 For iCounter = 1 to iLinkCount
替换 For Each link In range
循环,将循环中的代码放入这个新循环中并使用 sLink
作为link 来处理而不是从 Excel.
您可以将您的一些代码分解为 Sub,以便于阅读和故障排除:
Sub DownloadFile(p_sURL, p_sLocalPath)
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
xHttp.Open "GET", p_sURL, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile p_sLocalPath & getName(p_sURL), 2 ' //overwrite
End With
End Sub