从网页下载PDF文件
Download PDF files from webpage
我正在尝试从网站下载文件。我目前的解决方案似乎有效,但有些事情我不明白。
第一个问题出在这个 XPath:
//div[@class='large-4 medium-4 columns']//a
页面上还有其他 class 为 large-4 medium-4 columns 的 div,所以我得到了一些不需要的链接。如何去掉它们?我只需要包含 /products/ 的页面。
第二个问题是没有任何文件被下载到 C:\temp\,我猜问题出在这个 XPath:
//div[@class='large-6 medium-8 columns large-centered']/a[string-length(@href)>0]
但是问题到底出在哪里?
代码中的 "xxx" 是实际网站链接的占位符。代码如下:
Imports HtmlAgilityPack
''' <summary>
''' Console scraper: collects the product-page links from a listing page and
''' downloads every file linked from each product page into <c>C:\temp\</c>.
''' </summary>
Module Module1

    ''' <summary>Folder the downloaded files are written to.</summary>
    Private Const DownloadFolder As String = "C:\temp\"

    ''' <summary>
    ''' Entry point. Loads the listing page, prints the product links it finds,
    ''' waits for a key press, then downloads the files linked from each product page.
    ''' </summary>
    Sub Main()
        Dim mainUrl As String = "xxx"
        Dim baseUri As New Uri(mainUrl)
        Dim web As New HtmlWeb()
        Dim htmlDoc As HtmlDocument = web.Load(mainUrl) ' Load the listing page

        ' Only anchors whose href contains "/products/" are wanted; filtering in the
        ' XPath also drops the unrelated "large-4 medium-4 columns" blocks on the page.
        Dim srcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//div[@class='large-4 medium-4 columns']//a[contains(@href, '/products/')]")

        ' SelectNodes yields Nothing (not an empty collection) when nothing matches.
        If srcs Is Nothing Then
            Console.WriteLine("No product links found on " & mainUrl)
            Console.Read()
            Return
        End If

        Dim listLinks As New List(Of String)
        For Each src As HtmlNode In srcs
            ' Resolve relative hrefs against the listing page so they can be loaded later.
            Dim productUri As Uri = Nothing
            If Not Uri.TryCreate(baseUri, src.GetAttributeValue("href", ""), productUri) Then
                Continue For
            End If

            ' The same product is often linked more than once (image + title); keep one.
            If Not listLinks.Contains(productUri.AbsoluteUri) Then
                listLinks.Add(productUri.AbsoluteUri)
                Console.WriteLine(productUri.AbsoluteUri)
            End If
        Next
        Console.Read()

        ' DownloadFile does not create the target folder.
        System.IO.Directory.CreateDirectory(DownloadFolder)

        For Each productLink As String In listLinks
            Dim prodDoc As HtmlDocument = web.Load(productLink)

            ' NOTE(review): "/a" only matches anchors that are direct children of the div,
            ' and @class must equal the whole attribute value exactly. If nothing is
            ' downloaded, verify this expression against the product page markup.
            Dim scrapedsrcs As HtmlNodeCollection = prodDoc.DocumentNode.SelectNodes("//div[@class='large-6 medium-8 columns large-centered']/a[string-length(@href)>0]")
            If scrapedsrcs Is Nothing Then
                ' Make the "nothing matched" case visible instead of silently skipping it.
                Console.WriteLine("No download links matched on " & productLink)
                Continue For
            End If

            Dim productUri As New Uri(productLink)
            For Each scrapedlink As HtmlNode In scrapedsrcs
                Dim fileUri As Uri = Nothing
                If Not Uri.TryCreate(productUri, scrapedlink.GetAttributeValue("href", ""), fileUri) Then
                    Continue For
                End If

                ' Skip mailto:, javascript: and similar non-downloadable links.
                If fileUri.Scheme <> Uri.UriSchemeHttp AndAlso fileUri.Scheme <> Uri.UriSchemeHttps Then
                    Continue For
                End If

                ' LocalPath excludes the query string and fragment, so the file name stays valid.
                Dim fileName As String = System.IO.Path.GetFileName(fileUri.LocalPath)
                If fileName.Length = 0 Then
                    Continue For
                End If

                ' Append ".pdf" only when it is missing, to avoid "name.pdf.pdf".
                If Not fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) Then
                    fileName &= ".pdf"
                End If

                ' Full overload: no credentials, no progress UI, 100 s timeout, overwrite = True
                ' so that re-running the scraper does not fail on files that already exist.
                My.Computer.Network.DownloadFile(fileUri.AbsoluteUri, System.IO.Path.Combine(DownloadFolder, fileName), "", "", False, 100000, True)
            Next
        Next
        Console.Read()
        ' End of scraping
    End Sub

End Module
编辑:
好的,第一个 XPath 应该是:
//div[@class='row inset1 productItem padb1 padt1']/div[@class='large-4 medium-4 columns']//a
下面的代码会把小册子下载到应用程序运行时所在的文件夹:
' Visit every product page linked from the listing page and save each linked
' brochure into the current working directory.
Dim htmlDoc As HtmlDocument = New HtmlWeb().Load("https://webpage.com")
Dim ProductListPage As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//div[@class='productContain padb6']//div[@class='large-4 medium-4 columns']/a")

' SelectNodes yields Nothing (not an empty collection) when nothing matches.
If ProductListPage IsNot Nothing Then
    ' One client is reused for all downloads and disposed when the loop is done.
    Using DLClient As New WebClient
        For Each src As HtmlNode In ProductListPage
            Dim productUrl As String = src.GetAttributeValue("href", "")
            If productUrl.Length = 0 Then
                Continue For
            End If

            ' A separate document variable keeps the listing page intact while iterating it.
            Dim productDoc As HtmlDocument = New HtmlWeb().Load(productUrl)
            Dim LinkTester As HtmlNodeCollection = productDoc.DocumentNode.SelectNodes("//div[@class='row padt6 padb4']//a")
            If LinkTester Is Nothing Then
                Continue For
            End If

            For Each dllink As HtmlNode In LinkTester
                Dim LinkURL As String = dllink.GetAttributeValue("href", "")
                If LinkURL.Length = 0 Then
                    Continue For
                End If
                Console.WriteLine(LinkURL)

                ' "+ 1" drops the leading "/" so the target is ".\name" rather than ".\/name".
                Dim ExtractFilename As String = LinkURL.Substring(LinkURL.LastIndexOf("/") + 1)
                If ExtractFilename.Length = 0 Then
                    Continue For
                End If

                ' Synchronous download: DownloadFileAsync returned immediately, so the program
                ' could end before the transfers finished and failures went unnoticed.
                DLClient.DownloadFile(New Uri(LinkURL), ".\" & ExtractFilename)
            Next
        Next
    End Using
End If
我正在尝试从网站下载文件。我目前的解决方案似乎有效,但有些事情我不明白。
第一个问题出在这个 XPath:
//div[@class='large-4 medium-4 columns']//a
页面上还有其他 class 为 large-4 medium-4 columns 的 div,所以我得到了一些不需要的链接。如何去掉它们?我只需要包含 /products/ 的页面。
第二个问题是没有任何文件被下载到 C:\temp\,我猜问题出在这个 XPath:
//div[@class='large-6 medium-8 columns large-centered']/a[string-length(@href)>0]
但是问题到底出在哪里?
代码中的 "xxx" 是实际网站链接的占位符。代码如下:
Imports HtmlAgilityPack
''' <summary>
''' Console scraper: collects the product-page links from a listing page and
''' downloads every file linked from each product page into <c>C:\temp\</c>.
''' </summary>
Module Module1

    ''' <summary>Folder the downloaded files are written to.</summary>
    Private Const DownloadFolder As String = "C:\temp\"

    ''' <summary>
    ''' Entry point. Loads the listing page, prints the product links it finds,
    ''' waits for a key press, then downloads the files linked from each product page.
    ''' </summary>
    Sub Main()
        Dim mainUrl As String = "xxx"
        Dim baseUri As New Uri(mainUrl)
        Dim web As New HtmlWeb()
        Dim htmlDoc As HtmlDocument = web.Load(mainUrl) ' Load the listing page

        ' Only anchors whose href contains "/products/" are wanted; filtering in the
        ' XPath also drops the unrelated "large-4 medium-4 columns" blocks on the page.
        Dim srcs As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//div[@class='large-4 medium-4 columns']//a[contains(@href, '/products/')]")

        ' SelectNodes yields Nothing (not an empty collection) when nothing matches.
        If srcs Is Nothing Then
            Console.WriteLine("No product links found on " & mainUrl)
            Console.Read()
            Return
        End If

        Dim listLinks As New List(Of String)
        For Each src As HtmlNode In srcs
            ' Resolve relative hrefs against the listing page so they can be loaded later.
            Dim productUri As Uri = Nothing
            If Not Uri.TryCreate(baseUri, src.GetAttributeValue("href", ""), productUri) Then
                Continue For
            End If

            ' The same product is often linked more than once (image + title); keep one.
            If Not listLinks.Contains(productUri.AbsoluteUri) Then
                listLinks.Add(productUri.AbsoluteUri)
                Console.WriteLine(productUri.AbsoluteUri)
            End If
        Next
        Console.Read()

        ' DownloadFile does not create the target folder.
        System.IO.Directory.CreateDirectory(DownloadFolder)

        For Each productLink As String In listLinks
            Dim prodDoc As HtmlDocument = web.Load(productLink)

            ' NOTE(review): "/a" only matches anchors that are direct children of the div,
            ' and @class must equal the whole attribute value exactly. If nothing is
            ' downloaded, verify this expression against the product page markup.
            Dim scrapedsrcs As HtmlNodeCollection = prodDoc.DocumentNode.SelectNodes("//div[@class='large-6 medium-8 columns large-centered']/a[string-length(@href)>0]")
            If scrapedsrcs Is Nothing Then
                ' Make the "nothing matched" case visible instead of silently skipping it.
                Console.WriteLine("No download links matched on " & productLink)
                Continue For
            End If

            Dim productUri As New Uri(productLink)
            For Each scrapedlink As HtmlNode In scrapedsrcs
                Dim fileUri As Uri = Nothing
                If Not Uri.TryCreate(productUri, scrapedlink.GetAttributeValue("href", ""), fileUri) Then
                    Continue For
                End If

                ' Skip mailto:, javascript: and similar non-downloadable links.
                If fileUri.Scheme <> Uri.UriSchemeHttp AndAlso fileUri.Scheme <> Uri.UriSchemeHttps Then
                    Continue For
                End If

                ' LocalPath excludes the query string and fragment, so the file name stays valid.
                Dim fileName As String = System.IO.Path.GetFileName(fileUri.LocalPath)
                If fileName.Length = 0 Then
                    Continue For
                End If

                ' Append ".pdf" only when it is missing, to avoid "name.pdf.pdf".
                If Not fileName.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) Then
                    fileName &= ".pdf"
                End If

                ' Full overload: no credentials, no progress UI, 100 s timeout, overwrite = True
                ' so that re-running the scraper does not fail on files that already exist.
                My.Computer.Network.DownloadFile(fileUri.AbsoluteUri, System.IO.Path.Combine(DownloadFolder, fileName), "", "", False, 100000, True)
            Next
        Next
        Console.Read()
        ' End of scraping
    End Sub

End Module
编辑:
好的,第一个 XPath 应该是:
//div[@class='row inset1 productItem padb1 padt1']/div[@class='large-4 medium-4 columns']//a
下面的代码会把小册子下载到应用程序运行时所在的文件夹:
' Visit every product page linked from the listing page and save each linked
' brochure into the current working directory.
Dim htmlDoc As HtmlDocument = New HtmlWeb().Load("https://webpage.com")
Dim ProductListPage As HtmlNodeCollection = htmlDoc.DocumentNode.SelectNodes("//div[@class='productContain padb6']//div[@class='large-4 medium-4 columns']/a")

' SelectNodes yields Nothing (not an empty collection) when nothing matches.
If ProductListPage IsNot Nothing Then
    ' One client is reused for all downloads and disposed when the loop is done.
    Using DLClient As New WebClient
        For Each src As HtmlNode In ProductListPage
            Dim productUrl As String = src.GetAttributeValue("href", "")
            If productUrl.Length = 0 Then
                Continue For
            End If

            ' A separate document variable keeps the listing page intact while iterating it.
            Dim productDoc As HtmlDocument = New HtmlWeb().Load(productUrl)
            Dim LinkTester As HtmlNodeCollection = productDoc.DocumentNode.SelectNodes("//div[@class='row padt6 padb4']//a")
            If LinkTester Is Nothing Then
                Continue For
            End If

            For Each dllink As HtmlNode In LinkTester
                Dim LinkURL As String = dllink.GetAttributeValue("href", "")
                If LinkURL.Length = 0 Then
                    Continue For
                End If
                Console.WriteLine(LinkURL)

                ' "+ 1" drops the leading "/" so the target is ".\name" rather than ".\/name".
                Dim ExtractFilename As String = LinkURL.Substring(LinkURL.LastIndexOf("/") + 1)
                If ExtractFilename.Length = 0 Then
                    Continue For
                End If

                ' Synchronous download: DownloadFileAsync returned immediately, so the program
                ' could end before the transfers finished and failures went unnoticed.
                DLClient.DownloadFile(New Uri(LinkURL), ".\" & ExtractFilename)
            Next
        Next
    End Using
End If