Extract HTML into JSON with Python BeautifulSoup
Question
I'm trying to parse chunks of HTML and store the relevant data in a JSON object, but I'm struggling with the way BeautifulSoup handles child tags, which conflicts with my particular requirements.
Example input:
<p>Here's a paragraph</p>
<ul>
<li>With a list</li>
<li>
<ul>
<li>And a nested list</li>
<li>Within it that has some <strong>bold text</strong></li>
</ul>
</li>
</ul>
Desired output:
[
    {
        "type":"p",
        "content":"Here's a paragraph"
    },
    {
        "type":"ul",
        "content":[
            {
                "type":"li",
                "content":"With a list"
            },
            {
                "type":"li",
                "content":[
                    {
                        "type":"ul",
                        "content":[
                            {
                                "type":"li",
                                "content":"And a nested list"
                            },
                            {
                                "type":"li",
                                "content":"Within it that has some bold text"
                            }
                        ]
                    }
                ]
            }
        ]
    }
]
My attempt
This is my best attempt so far:
from bs4 import BeautifulSoup
import json

def process(html):
    content = []
    soup = BeautifulSoup(html, 'html.parser')
    elements = soup.descendants
    for element in elements:
        if str(element).strip() not in [' ', '']:
            if element.name in ['p']:#, 'ul', 'ol', 'li']:
                content.append({
                    'type':element.name,
                    'content':element.find(text=True, recursive=False)
                })
            elif element.name in ['ul', 'ol']:
                parent = {
                    'type':element.name,
                    'content':[]
                }
                for child in element.children:
                    if child != '\n':
                        if child.find(text=True, recursive=False) != '\n':
                            parent['content'].append({
                                'type':child.name,
                                'content':child.find(text=True, recursive=False)
                            })
                            content.append(parent)
    print(json.dumps(content, indent=4))

if __name__ == '__main__':
    original = '''<p>Here's a paragraph</p>
<ul>
<li>With a list</li>
<li>
<ul>
<li>And a nested list</li>
<li>Within it that has some <strong>bold text</strong></li>
</ul>
</li>
</ul>
'''
    process(original)
which produces the following output:
[
    {
        "type": "p",
        "content": "Here's a paragraph"
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "With a list"
            }
        ]
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "And a nested list"
            },
            {
                "type": "li",
                "content": "Within it that has some "
            }
        ]
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "And a nested list"
            },
            {
                "type": "li",
                "content": "Within it that has some "
            }
        ]
    }
]
You can see I have three problems:
- the inner list appears twice
- the inner list is not nested within its parent list
- the text contained in the <strong> tag is lost
I know what I'm doing to the HTML is a bit odd, but any suggestions on how to fix these three points?
This is not a beautifulsoup solution - but perhaps it would be easier with an event-based parser, e.g. lxml.etree.iterparse()
You can register for start/end (open tag/close tag) events, which is a useful way to handle parent/child nesting.
import io, json, lxml.etree

def process(html):
    # convert html str into fileobj for iterparse
    html = io.BytesIO(html.encode('utf-8'))
    parser = lxml.etree.iterparse(
        html, events=('start', 'end'), html=True)
    root = None
    parents = []
    for event, tag in parser:
        if event == 'start':
            content = []
            if tag.text and tag.text.strip():
                content.append(tag.text.strip())
            child = dict(type=tag.tag, content=content)
            parents.append(child)
            if not root:
                root = child
        else:
            # close </tag> - point child to parent
            if len(parents) > 1:
                parent, child = parents[-2:]
                parent['content'].append(child)
                child = parents.pop()
                content = child['content']
                # unwrap 1 element lists that contain a text only node
                if len(content) == 1 and isinstance(content[0], str):
                    child['content'] = content.pop()
                    # If the previous element is also a text only node
                    # join text together and "discard" the "dict"
                    if len(parent['content']) > 1 and \
                            isinstance(parent['content'][-2], str):
                        parent['content'][-2] += ' ' + child['content']
                        parent['content'].pop()
    #root = root['content'][0]['content']
    print(json.dumps(root, indent=4))
iterparse adds <html><body> tags - if you want to exclude them, you can do root = root['content'][0]['content'] or similar.
Output:
{
    "type": "html",
    "content": [
        {
            "type": "body",
            "content": [
                {
                    "type": "p",
                    "content": "Here's a paragraph"
                },
                {
                    "type": "ul",
                    "content": [
                        {
                            "type": "li",
                            "content": "With a list"
                        },
                        {
                            "type": "li",
                            "content": [
                                {
                                    "type": "ul",
                                    "content": [
                                        {
                                            "type": "li",
                                            "content": "And a nested list"
                                        },
                                        {
                                            "type": "li",
                                            "content": "Within it that has some bold text"
                                        }
                                    ]
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}
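If you do want to stay within BeautifulSoup, one way to address all three points is a recursive walk over each tag's children: recurse whenever a tag contains nested block-level tags, and otherwise flatten the tag (including inline children such as <strong>) into a single string with get_text(). The sketch below is only illustrative - BLOCK_TAGS, to_json and process_html are names chosen here, and which tags count as "block" is an assumption:

from bs4 import BeautifulSoup, Tag
import json

# assumption: these are the only tags kept as structural JSON nodes
BLOCK_TAGS = {'p', 'ul', 'ol', 'li'}

def to_json(tag):
    # structural children only; inline tags like <strong> are flattened below
    block_children = [c for c in tag.children
                      if isinstance(c, Tag) and c.name in BLOCK_TAGS]
    if block_children:
        # recurse, so a nested <ul> stays inside its parent <li>
        content = [to_json(c) for c in block_children]
    else:
        # leaf: join the tag's own text and its inline children into one string
        content = tag.get_text(' ', strip=True)
    return {'type': tag.name, 'content': content}

def process_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    top_level = [c for c in soup.children
                 if isinstance(c, Tag) and c.name in BLOCK_TAGS]
    print(json.dumps([to_json(c) for c in top_level], indent=4))

Because each tag is visited exactly once from its parent (rather than via descendants), nothing should be emitted twice, and get_text() keeps the bold text; on the sample input this should reproduce the desired output shown above, though any top-level tags outside BLOCK_TAGS would simply be dropped.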