使用 python 编辑 html 文件

Question

我已经抓取（下载）了网页内容（css、js 和图片）

现在我想编辑下载的 HTML 文件以提供图像、js 和 css 的绝对路径。

例如，脚本需要找到源地址 'src'，它必须是绝对路径（包含域名），而不是相对路径（不包含域名）。

从：/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js更改为https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js并保存为index2.html

到目前为止，这是我的代码：

import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract

# Directory that receives every downloaded file.
# Path.mkdir() returns None, so the Path object is bound to `directory` first
# and the directory is created in a separate step; otherwise `directory`
# would end up holding None instead of the path.
dir_path = r"\Documents\python\public_html"
directory = Path(dir_path)
directory.mkdir(parents=True, exist_ok=True)

# Page whose markup and linked assets are going to be extracted.
url = "https://es.sopranodesign.com/sei/login.do?customerId=270"

# One HTTP session is used for the page request; it identifies itself with a
# regular desktop-browser User-Agent string.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
})

# Raw bytes of the page body.
html = session.get(url).content

# Parse the markup with the parser bundled with Python.
soup = bs(html, "html.parser")

# Absolute URLs of all external scripts: every <script> tag that carries a
# non-empty 'src' is resolved against the page URL.
script_files = [
    urljoin(url, tag.attrs.get("src"))
    for tag in soup.find_all("script")
    if tag.attrs.get("src")
]

# Absolute URLs of all <link> targets that carry a non-empty 'href'.
# NOTE(review): this collects every <link>, not only rel="stylesheet" ones.
css_files = [
    urljoin(url, tag.attrs.get("href"))
    for tag in soup.find_all("link")
    if tag.attrs.get("href")
]

# Absolute URLs of all images that carry a non-empty 'src'.
image_files = [
    urljoin(url, tag.attrs.get("src"))
    for tag in soup.find_all("img")
    if tag.attrs.get("src")
]

# Summary of what was found on the page.
print("Total script files in the page:", len(script_files))
print("Total CSS files in the page:", len(css_files))
print("Total images files in the page:", len(image_files))

# Names of the text files that list the collected links, one URL per line.
javascript_filename = "javascript_files.txt"
css_filename = "css_files.txt"
image_filename = "image_files.txt"

# Each list is written to its own file inside the download directory,
# in the order scripts, stylesheets, images.
for list_name, links in (
    (javascript_filename, script_files),
    (css_filename, css_files),
    (image_filename, image_files),
):
    with open(os.path.join(dir_path, list_name), "w") as f:
        f.writelines(f"{link}\n" for link in links)
        
        
def _save_asset(asset_url, binary=False):
    """Download one asset and store it in the download directory.

    The file is named after the last path segment of ``asset_url``.

    Args:
        asset_url: Absolute URL of the asset to fetch.
        binary: When true the response bytes are written unchanged (needed
            for images); otherwise the decoded text is written as UTF-8.

    A failed request or a failed write is reported on stdout and skipped, so
    one broken asset does not stop the remaining downloads.
    """
    target = os.path.join(dir_path, os.path.basename(asset_url))
    try:
        response = requests.get(asset_url)
        if binary:
            # Images are not text: decoding and re-encoding them corrupts
            # the data, so the raw bytes are written as-is.
            with open(target, "wb") as out:
                out.write(response.content)
        else:
            with open(target, "w", encoding="utf-8") as out:
                out.write(response.text)
    except (requests.RequestException, OSError) as exc:
        print(f"Could not save {asset_url}: {exc}")


# Scripts and stylesheets are saved as text, images as raw bytes.
for js_file in script_files:
    _save_asset(js_file)

for css_file in css_files:
    _save_asset(css_file)

for image_file in image_files:
    _save_asset(image_file, binary=True)
        
# Split the page URL into subdomain / domain / suffix; only the domain part
# is used, as the prefix of the saved HTML file name.
# Attribute access is used instead of unpacking the result directly.
# NOTE(review): newer tldextract releases reportedly changed the number of
# fields in the result, which would break three-way unpacking; confirm.
url_parts = extract(url)
tsd, td, tsu = url_parts.subdomain, url_parts.domain, url_parts.suffix
domain = td
print(domain)

# Fetch the page once more with urllib and store it next to the assets.
# Both the HTTP response and the output file are closed by their `with`
# blocks; the file is written as UTF-8 to match how the body was decoded.
with urllib.request.urlopen(url) as response:
    webContent = response.read().decode('UTF-8')
html_filename = domain + "test2.do.html"
with open(os.path.join(dir_path, html_filename), 'w', encoding='utf-8') as f:
    f.write(webContent)

Answer 1

您可以按照我提供的链接中的做法，把新的 URL 作为属性重新赋值给 bs4 对象：

例如：

for script in soup.find_all("script"):
    relative_src = script.attrs.get("src")
    # Tags without a 'src' attribute are left untouched.
    if not relative_src:
        continue

    # Show the tag as it appears in the downloaded page.
    print('Original')
    print(script)

    # Resolve the reference against the page URL and remember it.
    script_url = urljoin(url, relative_src)
    script_files.append(script_url)

    # Assigning to the tag's attribute replaces the 'src' value on the
    # parsed tag itself.
    script['src'] = script_url

    # Show the same tag after the change.
    print('New/Changed')
    print(script)

输出:

Original
<script src="/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>

New/Changed
<script src="https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>

您可以在 html 中看到它，它已更改（这是在第一次迭代之后。它将继续对其余部分执行此操作。）

使用 python 编辑 html 文件

Edit html file using python

python

beautifulsoup

web-scraping