How to download a full webpage with a Python script?
Currently I have a script that can only download the HTML of a given page.

Now I want to download all the files of a web page, including the HTML, CSS, JS and image files (the same as we get with ctrl-s on any website).

My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have gone through many similar questions, but they all only download the HTML.
Try the Python library Scrapy. You can program Scrapy to scan a website recursively by downloading its pages, scanning them, and following the links it finds:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
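For example, here is a minimal sketch of such a spider. The class name PageSpider, the spider name "page" and the DEPTH_LIMIT of 2 are my own choices for illustration, not something the library prescribes:

import scrapy

class PageSpider(scrapy.Spider):
    # hypothetical names chosen for this sketch
    name = "page"
    start_urls = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
    custom_settings = {"DEPTH_LIMIT": 2}  # stop recursing after two levels of links

    def parse(self, response):
        # save the raw HTML of every page the spider visits
        filename = (response.url.rstrip("/").split("/")[-1] or "index") + ".html"
        with open(filename, "wb") as f:
            f.write(response.body)
        # queue every <a href> on the page for crawling
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)

You can run it with scrapy runspider spider.py, or with scrapy crawl page inside a Scrapy project.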
The following implementation lets you get the sub-HTML websites. It can be developed further to get the other files you need (one possible follow-up step is sketched after the Python 3 version below). I set the depth variable for you so you can choose the maximum number of sub_websites you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. Hope this saves someone some time:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin

def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url

pagelist = ["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print(urls)
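As one possible version of the "further development" mentioned above, here is a hedged sketch that simply saves every URL collected by crawl() to disk. The save_all name, the hashed filenames and the target folder are my own choices for illustration:

import hashlib
import os
import urllib.request

def save_all(urls, folder="downloaded_pages"):
    os.makedirs(folder, exist_ok=True)
    for url in urls:
        # derive a stable filename from the URL (illustrative naming scheme)
        name = hashlib.md5(url.encode("utf-8")).hexdigest() + ".html"
        try:
            urllib.request.urlretrieve(url, os.path.join(folder, name))
        except Exception as exc:
            print("Could not save %s: %s" % (url, exc))

save_all(urls)  # urls as returned by crawl() above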
You can do this easily with the simple Python library pywebcopy.

For the current version, 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
Your download_folder will contain the html, css and js. It works exactly like the original site.
Using Python 3+, Requests and other standard libraries.

The function savePage receives a requests.Response and the pagefilename where to save it. It:

- Saves the pagefilename.html in the current folder.
- Downloads the javascripts, css and images based on the tags script, link and img, and saves them in a folder named pagefilename_files.
- Prints any exception on sys.stderr and returns a BeautifulSoup object.
- Requires the Requests session to be a global variable, unless someone writes cleaner code here for us (a sketch that passes the session as a parameter follows the example at the end of this answer).

You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename + '_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename + '.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example: saving the Google page and its contents (the google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
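And here is the "cleaner" variant hinted at in the bullet list above, as a sketch only: the same logic with the session passed in explicitly instead of read from a global. The names save_assets and save_page, the html.parser choice and the UTF-8 encoding are my own assumptions, not part of the answer above:

import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def save_assets(session, pagefolder, url, soup, tag='img', inner='src'):
    # download every asset referenced by <tag inner=...> and rewrite the reference locally
    os.makedirs(pagefolder, exist_ok=True)
    for res in soup.find_all(tag):
        if not res.get(inner):
            continue
        try:
            fileurl = urljoin(url, res[inner])
            filename = os.path.basename(res[inner])
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):
                with open(filepath, 'wb') as f:
                    f.write(session.get(fileurl).content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def save_page(session, url, pagefilename='page'):
    # fetch the page, localize img/link/script references, write the rewritten HTML
    soup = BeautifulSoup(session.get(url).text, 'html.parser')
    pagefolder = pagefilename + '_files'
    for tag, inner in (('img', 'src'), ('link', 'href'), ('script', 'src')):
        soup = save_assets(session, pagefolder, url, soup, tag, inner)
    with open(pagefilename + '.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
    return soup

with requests.Session() as s:
    save_page(s, 'https://www.google.com', 'google')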