使用 python 编辑 html 文件
Edit html file using python
我已经下载了网页内容(css、js 和图片)
现在我想编辑下载的 HTML 文件以提供图像、js 和 css 的绝对路径。
例如,脚本需要找到 'src' 属性,其值必须是绝对路径(包含域名),而不是相对路径(不包含域名)。
从:/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js更改为https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js并保存为index2.html
到目前为止,这是我的代码:
import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract
# Download a web page together with the scripts, stylesheets and images it
# references, and store everything (plus three URL lists) in `dir_path`.

# Local folder that receives the URL lists, the assets and the saved page.
dir_path = r"\Documents\python\public_html"
# Path.mkdir() returns None, so keep the Path object itself in `directory`
# instead of the result of the mkdir() call.
directory = Path(dir_path)
directory.mkdir(parents=True, exist_ok=True)

# URL of the web page you want to extract
url = "https://es.sopranodesign.com/sei/login.do?customerId=270"

# initialize a session
session = requests.Session()
# set the User-agent as a regular browser
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"

# get the HTML content
html = session.get(url).content
# parse HTML using beautiful soup
soup = bs(html, "html.parser")


def _absolute_urls(tag_name, attribute):
    """Return the absolute URL of every `tag_name` tag with a non-empty `attribute`.

    Relative values are resolved against the page `url` with urljoin().
    """
    return [
        urljoin(url, tag.attrs.get(attribute))
        for tag in soup.find_all(tag_name)
        if tag.attrs.get(attribute)
    ]


def _write_url_list(list_name, urls):
    """Write `urls`, one per line, to the file `list_name` inside `dir_path`."""
    with open(os.path.join(dir_path, list_name), "w") as f:
        for item in urls:
            print(item, file=f)


def _download(asset_url, binary=False):
    """Fetch `asset_url` and save it in `dir_path`.

    The local file name is the last component of the URL *path*; the query
    string is dropped because characters such as '?' are not valid in file
    names on every platform. With `binary=True` the raw response bytes are
    written; otherwise the decoded text is written as UTF-8.

    A failure on one asset is reported and skipped so that the remaining
    assets are still downloaded.
    """
    file_name = os.path.basename(urllib.parse.urlparse(asset_url).path)
    if not file_name:
        # e.g. a URL ending in "/": there is nothing to name the file after.
        print("Skipping (no file name in URL):", asset_url)
        return
    target = os.path.join(dir_path, file_name)
    try:
        response = requests.get(asset_url)
        if binary:
            with open(target, "wb") as f:
                f.write(response.content)
        else:
            with open(target, "w", encoding="utf-8") as f:
                f.write(response.text)
    except (requests.RequestException, OSError) as err:
        print("Could not download", asset_url, "->", err)


# get the JavaScript files
script_files = _absolute_urls("script", "src")
# get the CSS files (every <link href=...>, so this may include non-CSS links)
css_files = _absolute_urls("link", "href")
# get the images files
image_files = _absolute_urls("img", "src")

print("Total script files in the page:", len(script_files))
print("Total CSS files in the page:", len(css_files))
print("Total images files in the page:", len(image_files))

# write file links into files
javascript_filename = "javascript_files.txt"
_write_url_list(javascript_filename, script_files)
css_filename = "css_files.txt"
_write_url_list(css_filename, css_files)
image_filename = "image_files.txt"
_write_url_list(image_filename, image_files)

# download the assets themselves
for js_file in script_files:
    _download(js_file)
for css_file in css_files:
    _download(css_file)
for image_file in image_files:
    # Images are binary data; writing them as decoded text would corrupt them.
    _download(image_file, binary=True)

# Registered domain name without subdomain or suffix. Attribute access keeps
# working regardless of how many fields the extract() result has.
domain = extract(url).domain
print(domain)

# Fetch the page once more and save it as "<domain>test2.do.html".
with urllib.request.urlopen(url) as response:
    webContent = response.read().decode('UTF-8')
html_filename = domain + "test2.do.html"
# Explicit encoding: the platform default may not be able to encode the page.
with open(os.path.join(dir_path, html_filename), 'w', encoding="utf-8") as f:
    f.write(webContent)
您可以根据我提供的 link 将其作为属性重新分配给 bs4 对象:
例如:
# Replace each <script> tag's relative 'src' with its absolute URL, printing
# the tag before and after the change. Assigning to script['src'] modifies the
# parsed `soup` in place.
for script in soup.find_all("script"):
    src = script.attrs.get("src")
    # only tags that actually have a 'src' attribute are rewritten
    if src:
        # Original
        print('Original')
        print(script)
        script_url = urljoin(url, src)
        script_files.append(script_url)
        script['src'] = script_url
        # Now changed
        print('New/Changed')
        print(script)
输出:
Original
<script src="/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
New/Changed
<script src="https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
您可以在 html 中看到它,它已更改(这是在第一次迭代之后。它将继续对其余部分执行此操作。)
我已经下载了网页内容(css、js 和图片)
现在我想编辑下载的 HTML 文件以提供图像、js 和 css 的绝对路径。
例如,脚本需要找到 'src' 属性,其值必须是绝对路径(包含域名),而不是相对路径(不包含域名)。
从:/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js更改为https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js并保存为index2.html
到目前为止,这是我的代码:
import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract
# Download a web page together with the scripts, stylesheets and images it
# references, and store everything (plus three URL lists) in `dir_path`.

# Local folder that receives the URL lists, the assets and the saved page.
dir_path = r"\Documents\python\public_html"
# Path.mkdir() returns None, so keep the Path object itself in `directory`
# instead of the result of the mkdir() call.
directory = Path(dir_path)
directory.mkdir(parents=True, exist_ok=True)

# URL of the web page you want to extract
url = "https://es.sopranodesign.com/sei/login.do?customerId=270"

# initialize a session
session = requests.Session()
# set the User-agent as a regular browser
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"

# get the HTML content
html = session.get(url).content
# parse HTML using beautiful soup
soup = bs(html, "html.parser")


def _absolute_urls(tag_name, attribute):
    """Return the absolute URL of every `tag_name` tag with a non-empty `attribute`.

    Relative values are resolved against the page `url` with urljoin().
    """
    return [
        urljoin(url, tag.attrs.get(attribute))
        for tag in soup.find_all(tag_name)
        if tag.attrs.get(attribute)
    ]


def _write_url_list(list_name, urls):
    """Write `urls`, one per line, to the file `list_name` inside `dir_path`."""
    with open(os.path.join(dir_path, list_name), "w") as f:
        for item in urls:
            print(item, file=f)


def _download(asset_url, binary=False):
    """Fetch `asset_url` and save it in `dir_path`.

    The local file name is the last component of the URL *path*; the query
    string is dropped because characters such as '?' are not valid in file
    names on every platform. With `binary=True` the raw response bytes are
    written; otherwise the decoded text is written as UTF-8.

    A failure on one asset is reported and skipped so that the remaining
    assets are still downloaded.
    """
    file_name = os.path.basename(urllib.parse.urlparse(asset_url).path)
    if not file_name:
        # e.g. a URL ending in "/": there is nothing to name the file after.
        print("Skipping (no file name in URL):", asset_url)
        return
    target = os.path.join(dir_path, file_name)
    try:
        response = requests.get(asset_url)
        if binary:
            with open(target, "wb") as f:
                f.write(response.content)
        else:
            with open(target, "w", encoding="utf-8") as f:
                f.write(response.text)
    except (requests.RequestException, OSError) as err:
        print("Could not download", asset_url, "->", err)


# get the JavaScript files
script_files = _absolute_urls("script", "src")
# get the CSS files (every <link href=...>, so this may include non-CSS links)
css_files = _absolute_urls("link", "href")
# get the images files
image_files = _absolute_urls("img", "src")

print("Total script files in the page:", len(script_files))
print("Total CSS files in the page:", len(css_files))
print("Total images files in the page:", len(image_files))

# write file links into files
javascript_filename = "javascript_files.txt"
_write_url_list(javascript_filename, script_files)
css_filename = "css_files.txt"
_write_url_list(css_filename, css_files)
image_filename = "image_files.txt"
_write_url_list(image_filename, image_files)

# download the assets themselves
for js_file in script_files:
    _download(js_file)
for css_file in css_files:
    _download(css_file)
for image_file in image_files:
    # Images are binary data; writing them as decoded text would corrupt them.
    _download(image_file, binary=True)

# Registered domain name without subdomain or suffix. Attribute access keeps
# working regardless of how many fields the extract() result has.
domain = extract(url).domain
print(domain)

# Fetch the page once more and save it as "<domain>test2.do.html".
with urllib.request.urlopen(url) as response:
    webContent = response.read().decode('UTF-8')
html_filename = domain + "test2.do.html"
# Explicit encoding: the platform default may not be able to encode the page.
with open(os.path.join(dir_path, html_filename), 'w', encoding="utf-8") as f:
    f.write(webContent)
您可以根据我提供的 link 将其作为属性重新分配给 bs4 对象:
例如:
# Replace each <script> tag's relative 'src' with its absolute URL, printing
# the tag before and after the change. Assigning to script['src'] modifies the
# parsed `soup` in place.
for script in soup.find_all("script"):
    src = script.attrs.get("src")
    # only tags that actually have a 'src' attribute are rewritten
    if src:
        # Original
        print('Original')
        print(script)
        script_url = urljoin(url, src)
        script_files.append(script_url)
        script['src'] = script_url
        # Now changed
        print('New/Changed')
        print(script)
输出:
Original
<script src="/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
New/Changed
<script src="https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
您可以在 html 中看到它,它已更改(这是在第一次迭代之后。它将继续对其余部分执行此操作。)