循环链接并下载 PDF
Loop through links and download PDF's
我有一个代码已经在这里有一段时间了,有不同类型的问题。这越来越接近它的最终版本。但是现在我有一个问题,代码中有错误并且部分功能不正确。
想法是通过 links 并获取 PDF 文件。链接存储在 sLinks
中,请参阅注释行“检查 links 是否存储在 sLinks 中”。代码继续运行,文件存储在 C:\temp\
中,但是在文件夹中有 12 个 PDF 后,我收到错误消息,调试器指向 xHttp.Open "GET", sLink
.
我查看了 PDF 文件,看起来所有文件都已下载...因为有些内容在好几页上都是相同的,而且至少有两页上有一份政策 PDF。这就是为什么有 17 个 link 和 12 个文件的原因。不管怎样,它为什么会抛出错误?
可能是什么问题?
这是我的代码:
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
Dim sLinks As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.webpage.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.webpage.com/product/" Then
sLinks = sLinks & internetinnerlink.href & vbCrLf
i = i + 1
Else
End If
ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks ' Check that links are stored in sLinks
Next internetinnerlink
wholeURL = "https://www.webpage.com/"
sPath = "C:\temp\"
arrLinks = Split(sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 To iLinkCount
sLink = arrLinks(iCounter - 1)
'Get the directory listing
xHttp.Open "GET", sLink ' DEBUGGER IS POINTING HERE
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
从 link:
构建文件名的函数
Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
编辑:
我已经解决了第一个问题。 arrLinks = Split(p_sLinks, vbCrLf)
改为 arrLinks = Split(sLinks, vbCrLf)
是应该的。现在我面临另一个问题。
链接编辑到 www.webpage.com
我会在调用 HTTP GET 之前添加一个 If Len(sLink) > 0
检查。
这一行有问题:
sLinks = sLinks & internetinnerlink.href & vbCrLf
它将在 sLinks 列表的末尾添加一个额外的 vbCrLf
。应该是:
If sLinks <> "" Then sLinks = sLinks & vbCrLf
sLinks = sLinks & internetinnerlink.href
这样最后一个link
之后就不会有vbCrLf
了
我有一个代码已经在这里有一段时间了,有不同类型的问题。这越来越接近它的最终版本。但是现在我有一个问题,代码中有错误并且部分功能不正确。
想法是通过 links 并获取 PDF 文件。链接存储在 sLinks
中,请参阅注释行“检查 links 是否存储在 sLinks 中”。代码继续运行,文件存储在 C:\temp\
中,但是在文件夹中有 12 个 PDF 后,我收到错误消息,调试器指向 xHttp.Open "GET", sLink
.
我查看了 PDF 文件,看起来所有文件都已下载...因为有些内容在好几页上都是相同的,而且至少有两页上有一份政策 PDF。这就是为什么有 17 个 link 和 12 个文件的原因。不管怎样,它为什么会抛出错误?
可能是什么问题?
这是我的代码:
Sub DownloadFiles()
Dim xHttp As Object: Set xHttp = CreateObject("Microsoft.XMLHTTP")
Dim hDoc As MSHTML.HTMLDocument
Dim Anchors As Object
Dim Anchor As Variant
Dim sPath As String
Dim wholeURL As String
Dim internet As InternetExplorer
Dim internetdata As HTMLDocument
Dim internetlink As Object
Dim internetinnerlink As Object
Dim arrLinks As Variant
Dim sLink As String
Dim iLinkCount As Integer
Dim iCounter As Integer
Dim sLinks As String
Set internet = CreateObject("InternetExplorer.Application")
internet.Visible = False
internet.navigate ("https://www.webpage.com/products/waste-water/")
Do While internet.Busy
DoEvents
Loop
Do Until internet.readyState = READYSTATE_COMPLETE
DoEvents
Loop
Set internetdata = internet.document
Set internetlink = internetdata.getElementsByTagName("a")
i = 1
For Each internetinnerlink In internetlink
If Left$(internetinnerlink, 36) = "https://www.webpage.com/product/" Then
sLinks = sLinks & internetinnerlink.href & vbCrLf
i = i + 1
Else
End If
ThisWorkbook.Worksheets("Sheet1").range("B1").Value = sLinks ' Check that links are stored in sLinks
Next internetinnerlink
wholeURL = "https://www.webpage.com/"
sPath = "C:\temp\"
arrLinks = Split(sLinks, vbCrLf)
iLinkCount = UBound(arrLinks) + 1
For iCounter = 1 To iLinkCount
sLink = arrLinks(iCounter - 1)
'Get the directory listing
xHttp.Open "GET", sLink ' DEBUGGER IS POINTING HERE
xHttp.send
'Wait for the page to load
Do Until xHttp.readyState = 4
DoEvents
Loop
'Put the page in an HTML document
Set hDoc = New MSHTML.HTMLDocument
hDoc.body.innerHTML = xHttp.responseText
'Loop through the hyperlinks on the directory listing
Set Anchors = hDoc.getElementsByTagName("a")
For Each Anchor In Anchors
'test the pathname to see if it matches your pattern
If Anchor.pathname Like "*.pdf" Then
xHttp.Open "GET", wholeURL & Anchor.pathname, False
xHttp.send
With CreateObject("Adodb.Stream")
.Type = 1
.Open
.write xHttp.responseBody
.SaveToFile sPath & getName(wholeURL & Anchor.pathname), 2 '//overwrite
End With
End If
Next
Next
End Sub
从 link:
构建文件名的函数Function getName(pf As String) As String
getName = Split(pf, "/")(UBound(Split(pf, "/")))
End Function
编辑:
我已经解决了第一个问题。 arrLinks = Split(p_sLinks, vbCrLf)
改为 arrLinks = Split(sLinks, vbCrLf)
是应该的。现在我面临另一个问题。
链接编辑到 www.webpage.com
我会在调用 HTTP GET 之前添加一个 If Len(sLink) > 0
检查。
这一行有问题:
sLinks = sLinks & internetinnerlink.href & vbCrLf
它将在 sLinks 列表的末尾添加一个额外的 vbCrLf
。应该是:
If sLinks <> "" Then sLinks = sLinks & vbCrLf
sLinks = sLinks & internetinnerlink.href
这样最后一个link
之后就不会有vbCrLf
了