从 html 页面中删除所有样式、脚本和 html 标签
Remove all style, scripts, and html tags from an html page
这是我目前的情况:
from bs4 import BeautifulSoup
def cleanme(html):
    """Return the visible text of *html* with all <script> elements removed."""
    # Explicit parser: omitting it raises bs4's GuessedAtParserWarning and can
    # produce different trees depending on which parsers are installed.
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script"]):
        script.extract()  # drop the tag and its contents from the tree
    return soup.get_text()

testhtml = "<!DOCTYPE HTML>\n<head>\n<title>THIS IS AN EXAMPLE </title><style>.call {font-family:Arial;}</style><script>getit</script><body>I need this text captured<h1>And this</h1></body>"
cleaned = cleanme(testhtml)
print(cleaned)
正在删除脚本
看来你快搞定了。您还需要删除 html 标签和 css 样式代码。这是我的解决方案(我更新了函数):
def cleanMe(html):
    """Strip scripts, styles, and all html tags; return tidied plain text."""
    soup = BeautifulSoup(html, "html.parser")
    # remove all javascript and stylesheet code
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each: split on a DOUBLE space so normal
    # single-space word separation survives (the original split(" ") put every
    # single word on its own line, contradicting this comment's intent)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)
如果你想要一个快速而肮脏的解决方案,你可以使用:
re.sub(r'<[^>]*?>', '', value)
相当于 php 中的 strip_tags。
是你想要的吗?
您可以使用 decompose
to completely remove the tags from the document and stripped_strings
生成器来检索标签内容。
def clean_me(html):
    """Drop script/style elements and return the remaining text, space-joined."""
    # Explicit parser avoids GuessedAtParserWarning and parser-dependent output.
    soup = BeautifulSoup(html, "html.parser")
    for s in soup(['script', 'style']):
        s.decompose()  # remove the tag and its children entirely
    return ' '.join(soup.stripped_strings)
>>> clean_me(testhtml)
'THIS IS AN EXAMPLE I need this text captured And this'
以干净的方式删除指定的标签和注释(comment 节点)。感谢 Kim Hyesung 提供的这段代码。
from bs4 import BeautifulSoup
from bs4 import Comment
def cleanMe(html):
    """Remove script/style/meta/noscript tags and all comments; return the soup."""
    soup = BeautifulSoup(html, "html5lib")
    # Plain loops for the side effect: a list comprehension built only to be
    # thrown away obscures intent. One find_all covers all four tag names.
    for tag in soup.find_all(['script', 'style', 'meta', 'noscript']):
        tag.extract()
    # Comments are text nodes, not tags, so they need their own pass.
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup
改用lxml:
# Requirements: pip install lxml
import lxml.html.clean
def cleanme(content):
    """Strip every tag and all style information from *content*, returning bare text."""
    # NOTE(review): lxml.html.clean was split into the separate lxml_html_clean
    # package in recent lxml releases — confirm against the installed version.
    tag_stripper = lxml.html.clean.Cleaner(
        allow_tags=[''],
        remove_unknown_tags=False,
        style=True,
    )
    document = lxml.html.document_fromstring(content)
    stripped = tag_stripper.clean_html(document)
    return stripped.text_content().strip()

testhtml = "<!DOCTYPE HTML>\n<head>\n<title>THIS IS AN EXAMPLE </title><style>.call {font-family:Arial;}</style><script>getit</script><body>I need this text captured<h1>And this</h1></body>"
cleaned = cleanme(testhtml)
print(cleaned)
另一种实现方式:如果需要提取大量文本,可以看看 selectolax,它比 lxml 快得多。
def clean_me(html):
    """Print and return the body text of *html* with script/style removed.

    Returns None when the document has no <body>.
    """
    soup = BeautifulSoup(html, 'lxml')
    body = soup.body
    if body is None:
        return None
    # removing everything besides text
    for tag in body.select('script'):
        tag.decompose()
    for tag in body.select('style'):
        tag.decompose()
    plain_text = body.get_text(separator='\n').strip()
    print(plain_text)
    return plain_text  # bug fix: the original printed but returned None

# Bug fix: the original ended with a bare clean_me() call, which raises
# TypeError (missing the required `html` argument). Call it with input:
# clean_me(testhtml)
这是我目前的情况:
from bs4 import BeautifulSoup
def cleanme(html):
    """Return the visible text of *html* with all <script> elements removed."""
    # Explicit parser: omitting it raises bs4's GuessedAtParserWarning and can
    # produce different trees depending on which parsers are installed.
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script"]):
        script.extract()  # drop the tag and its contents from the tree
    return soup.get_text()

testhtml = "<!DOCTYPE HTML>\n<head>\n<title>THIS IS AN EXAMPLE </title><style>.call {font-family:Arial;}</style><script>getit</script><body>I need this text captured<h1>And this</h1></body>"
cleaned = cleanme(testhtml)
print(cleaned)
正在删除脚本
看来你快搞定了。您还需要删除 html 标签和 css 样式代码。这是我的解决方案(我更新了函数):
def cleanMe(html):
    """Strip scripts, styles, and all html tags; return tidied plain text."""
    soup = BeautifulSoup(html, "html.parser")
    # remove all javascript and stylesheet code
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each: split on a DOUBLE space so normal
    # single-space word separation survives (the original split(" ") put every
    # single word on its own line, contradicting this comment's intent)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    return '\n'.join(chunk for chunk in chunks if chunk)
如果你想要一个快速而肮脏的解决方案,你可以使用:
re.sub(r'<[^>]*?>', '', value)
相当于 php 中的 strip_tags。 是你想要的吗?
您可以使用 decompose
to completely remove the tags from the document and stripped_strings
生成器来检索标签内容。
def clean_me(html):
    """Drop script/style elements and return the remaining text, space-joined."""
    # Explicit parser avoids GuessedAtParserWarning and parser-dependent output.
    soup = BeautifulSoup(html, "html.parser")
    for s in soup(['script', 'style']):
        s.decompose()  # remove the tag and its children entirely
    return ' '.join(soup.stripped_strings)
>>> clean_me(testhtml)
'THIS IS AN EXAMPLE I need this text captured And this'
以干净的方式删除指定的标签和注释(comment 节点)。感谢 Kim Hyesung 提供的这段代码。
from bs4 import BeautifulSoup
from bs4 import Comment
def cleanMe(html):
    """Remove script/style/meta/noscript tags and all comments; return the soup."""
    soup = BeautifulSoup(html, "html5lib")
    # Plain loops for the side effect: a list comprehension built only to be
    # thrown away obscures intent. One find_all covers all four tag names.
    for tag in soup.find_all(['script', 'style', 'meta', 'noscript']):
        tag.extract()
    # Comments are text nodes, not tags, so they need their own pass.
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    return soup
改用lxml:
# Requirements: pip install lxml
import lxml.html.clean
def cleanme(content):
    """Strip every tag and all style information from *content*, returning bare text."""
    # NOTE(review): lxml.html.clean was split into the separate lxml_html_clean
    # package in recent lxml releases — confirm against the installed version.
    tag_stripper = lxml.html.clean.Cleaner(
        allow_tags=[''],
        remove_unknown_tags=False,
        style=True,
    )
    document = lxml.html.document_fromstring(content)
    stripped = tag_stripper.clean_html(document)
    return stripped.text_content().strip()

testhtml = "<!DOCTYPE HTML>\n<head>\n<title>THIS IS AN EXAMPLE </title><style>.call {font-family:Arial;}</style><script>getit</script><body>I need this text captured<h1>And this</h1></body>"
cleaned = cleanme(testhtml)
print(cleaned)
另一种实现方式:如果需要提取大量文本,可以看看 selectolax,它比 lxml 快得多。
def clean_me(html):
    """Print and return the body text of *html* with script/style removed.

    Returns None when the document has no <body>.
    """
    soup = BeautifulSoup(html, 'lxml')
    body = soup.body
    if body is None:
        return None
    # removing everything besides text
    for tag in body.select('script'):
        tag.decompose()
    for tag in body.select('style'):
        tag.decompose()
    plain_text = body.get_text(separator='\n').strip()
    print(plain_text)
    return plain_text  # bug fix: the original printed but returned None

# Bug fix: the original ended with a bare clean_me() call, which raises
# TypeError (missing the required `html` argument). Call it with input:
# clean_me(testhtml)