PHP 从 DOM 创建 header 标签的递归列表
PHP create recursive list of header tags from DOM
我想解析一些 HTML 以根据该文档中的标题创建嵌套导航。
我正在尝试创建这样的数组:
[
'name' => 'section 1',
'number' => '1',
'level' => 1,
'children' => [
[
'name' => 'sub section 1',
'number' => '1.1',
'level' => 2,
'children' => []
],
[
'name' => 'sub section 2',
'number' => '1.2',
'level' => 2,
'children' => []
]
],
]
因此,如果文档中某个 H2 之后紧跟着一个 H3,代码应当能够解析出这种关系,并为每一层连续的标题创建一个包含子元素的嵌套数组。
我想它需要做几件主要的事情:
- 获取所有标题
- 递归循环(一个H2之后的H3应该是数组中的一个child)
- 创建章节编号,例如 1.1.1 或 1.1.2
这是我提取标题的代码:
// Parse the markup held in $html into a DOM tree. The two libxml flags ask the
// parser not to add implied <html>/<body> elements and not to add a default doctype.
$dom = new \DomDocument();
$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
// Extract the heading structure: select every h1-h6 element in the document.
// The result is a flat node list; any nesting still has to be derived from the heading levels.
$xpath = new \DomXPath($dom);
$headings = $xpath->query('//h1|//h2|//h3|//h4|//h5|//h6');
我已经尝试创建一个递归函数,但我不确定让它工作的最佳方法
这段代码很难充分测试,因为结果取决于 HTML 的复杂程度以及您所使用的具体页面。此外,由于代码做的事情比较多,逐一解释会很长,具体的工作原理就留给您自己研究。其中的 XPath 表达式是参考 "XPath select all elements between two specific elements" 这个问题写出来的,用于选取两个标签之间的内容。测试用的源文件(test.html)如下:
<html>
<head>
</head>
<body>
<h2>Header 1</h2>
<h2>Header 2</h2>
<h3>Header 2.1</h3>
<h4>Header 2.1.1</h4>
<h2>Header 3</h2>
<h3>Header 3.1</h3>
</body>
</html>
实际代码是...
/**
 * Build a nested outline of the h{$level} headings and the headings below them.
 *
 * Every entry is an array with the keys "name" (heading text), "number"
 * (dotted section number such as "2.1.1") and "level". A "children" key that
 * holds entries of the same shape is added only when sub-headings exist.
 *
 * A heading is attached to the nearest preceding heading that is exactly one
 * level shallower. Matching counts the shallower-or-equal headings that
 * precede a node anywhere in the document, so each heading is claimed by one
 * parent only, even when several sections nest to the same depth. A heading
 * that skips a level (for example an h4 placed directly under an h2) has no
 * matching parent and is left out of the result.
 *
 * @param int          $level    Heading level to collect (1-6).
 * @param \DOMXPath    $xpath    XPath object bound to the parsed document.
 * @param \DOMDocument $dom      Not read by this function; kept so existing calls keep working.
 * @param int          $position Number of headings of levels 1..$level-1 that must precede
 *                               a match; the default 0 selects the headings that appear
 *                               before any shallower heading.
 * @param string       $number   Section number of the parent heading ('' at the top).
 * @return array List of outline entries in document order.
 */
function extractH ( $level, $xpath, $dom, $position = 0, $number = '' ) {
    $output = [];
    $depth = (int) $level;
    if ( $depth < 1 || $depth > 6 ) {
        // HTML defines h1-h6 only; this also ends the recursion below h6.
        return $output;
    }
    $position = (int) $position;

    // Builds "self::h1 or self::h2 ..." for every level up to $maxLevel.
    $levelTests = static function ( $maxLevel ) {
        $tests = [];
        for ( $l = 1; $l <= $maxLevel; $l++ ) {
            $tests[] = "self::h{$l}";
        }
        return implode(" or ", $tests);
    };

    if ( $depth === 1 ) {
        // Nothing is shallower than h1, so only position 0 can match.
        if ( $position !== 0 ) {
            return $output;
        }
        $query = "//h1";
    } else {
        $query = "//h{$depth}[count(preceding::*[" . $levelTests($depth - 1) . "])={$position}]";
    }

    $headings = $xpath->query($query);
    foreach ( $headings as $key => $heading ) {
        $sectionNumber = ltrim($number.".".($key+1), ".");
        $newOutput = ["name" => $heading->nodeValue,
            "number" => $sectionNumber,
            "level" => $level
        ];
        // Document-wide ordinal of this heading among all headings of its own
        // level or shallower. A direct child is preceded by exactly this many.
        $ordinal = (int) $xpath->evaluate("count(preceding::*[" . $levelTests($depth) . "])", $heading) + 1;
        $children = extractH($depth + 1, $xpath, $dom, $ordinal, $sectionNumber);
        if ( !empty($children) ) {
            $newOutput["children"] = $children;
        }
        $output[] = $newOutput;
    }
    return $output;
}
// Load the sample document and print its heading outline.
$html = file_get_contents("test.html");
if ( $html === false ) {
    // Fail with a clear message instead of handing `false` to loadHTML().
    throw new \RuntimeException("Unable to read test.html");
}
$dom = new \DomDocument();
$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
$xpath = new \DomXPath($dom);
// The sample starts at <h2> (it has no <h1>), so the outline is built from level 2.
$output = extractH(2, $xpath, $dom);
print_r($output);
调用 extractH() 时只需要几个参数。由于示例 HTML 以 h2 标签开头(没有 h1),因此第一个参数为 2;其后依次是要使用的 XPath 对象和 DomDocument 对象。
接受的答案不适用于我这样的结构:
<h2>a</h2>
<h3>aa</h3>
<h4>aaa</h4>
<h5>aaaa</h5>
<h6>aaaaa</h6>
<h2>b</h2>
<h2>c</h2>
<h3>ca</h3>
<h3>cb</h3>
<h3>cc</h3>
<h2>d</h2>
<h3>da</h3>
<h4>daa</h4>
<h5>daaa</h5>
<h6>daaaa</h6>
"d" 部分中的树正在替换为 "a" 部分中的树
这个解决方案对我有用
/**
 * Builds a nested table of contents from the headings of an HTML fragment.
 *
 * The parser keeps no state between calls, so one instance can process any
 * number of documents.
 */
class Parser {
    /**
     * Parse $text and return its outline, starting at the h2 level.
     *
     * Each entry has the keys 'value' (heading text), 'level' and 'children'
     * (a list of entries of the same shape, empty when there are none).
     *
     * @param string $text UTF-8 encoded HTML.
     * @return array Outline entries in document order; empty for blank input.
     */
    public function generate(string $text): array {
        if (trim($text) === '') {
            // loadHTML() rejects an empty source, and there is nothing to outline anyway.
            return [];
        }
        $dom = new DOMDocument('1.0', 'utf-8');
        // Non-ASCII characters are turned into numeric entities so the HTML
        // parser cannot misread the encoding. The parser is left to add the
        // implied <html>/<body> wrapper so that top-level headings of a
        // fragment are siblings under <body>.
        $dom->loadHTML(mb_encode_numericentity($text, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
        $xpath = new DOMXPath($dom);
        return $this->extractHeadings(2, $xpath, $dom);
    }

    /**
     * Collect the h{$level} headings that belong to one parent, with their subtrees.
     *
     * A heading belongs to the nearest preceding heading that is exactly one
     * level shallower. A heading that skips a level has no matching parent
     * and is left out.
     *
     * @param int         $level    Heading level to collect (1-6).
     * @param DOMXPath    $xpath    XPath object bound to $dom.
     * @param DOMDocument $dom      Parsed document (not read here; kept for the call shape).
     * @param int         $position Number of headings of levels 1..$level-1 that must
     *                              precede a match.
     * @return array
     */
    private function extractHeadings($level, DOMXPath $xpath, DOMDocument $dom, $position = 0) {
        $result = [];
        $level = (int) $level;
        if ($level < 1 || $level > 6) {
            // HTML defines h1-h6 only; this also ends the recursion below h6.
            return $result;
        }
        $position = (int) $position;

        if ($level === 1) {
            // Nothing is shallower than h1, so only position 0 can match.
            if ($position !== 0) {
                return $result;
            }
            $query = '//h1';
        } else {
            $query = "//h{$level}[count(preceding::*[" . $this->levelTests($level - 1) . "])={$position}]";
        }

        foreach ($xpath->query($query) as $heading) {
            // Document-wide ordinal of this heading among all headings of its
            // own level or shallower. A direct child is preceded by exactly this many.
            $ordinal = (int) $xpath->evaluate('count(preceding::*[' . $this->levelTests($level) . '])', $heading) + 1;
            $result[] = [
                'value' => $heading->nodeValue,
                'level' => $level,
                'children' => $this->extractHeadings($level + 1, $xpath, $dom, $ordinal),
            ];
        }
        return $result;
    }

    /**
     * Build the XPath test "self::h1 or self::h2 ..." for levels 1..$maxLevel.
     */
    private function levelTests($maxLevel) {
        $tests = [];
        for ($l = 1; $l <= $maxLevel; $l++) {
            $tests[] = "self::h{$l}";
        }
        return implode(' or ', $tests);
    }
}
// Sample fragment with four h2 sections: "a" and "d" each nest down to h6,
// "b" has no sub-headings and "c" has three h3 children.
$text = "
<h2>a</h2>
<h3>aa</h3>
<h4>aaa</h4>
<h5>aaaa</h5>
<h6>aaaaa</h6>
<h2>b</h2>
<h2>c</h2>
<h3>ca</h3>
<h3>cb</h3>
<h3>cc</h3>
<h2>d</h2>
<h3>da</h3>
<h4>daa</h4>
<h5>daaa</h5>
<h6>daaaa</h6>
";
// Build the outline for the sample. The returned tree is not stored or printed here.
$parser = new Parser();
$parser->generate($text);
不过,这个方案仍然要求文档中的标题按顺序排列。
我想解析一些 HTML 以根据该文档中的标题创建嵌套导航。
我正在尝试创建这样的数组:
[
'name' => 'section 1',
'number' => '1',
'level' => 1,
'children' => [
[
'name' => 'sub section 1',
'number' => '1.1',
'level' => 2,
'children' => []
],
[
'name' => 'sub section 2',
'number' => '1.2',
'level' => 2,
'children' => []
]
],
]
因此,如果文档中某个 H2 之后紧跟着一个 H3,代码应当能够解析出这种关系,并为每一层连续的标题创建一个包含子元素的嵌套数组。
我想它需要做几件主要的事情:
- 获取所有标题
- 递归循环(一个H2之后的H3应该是数组中的一个child)
- 创建章节编号,例如 1.1.1 或 1.1.2
这是我提取标题的代码:
// Parse the markup held in $html into a DOM tree. The two libxml flags ask the
// parser not to add implied <html>/<body> elements and not to add a default doctype.
$dom = new \DomDocument();
$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
// Extract the heading structure: select every h1-h6 element in the document.
// The result is a flat node list; any nesting still has to be derived from the heading levels.
$xpath = new \DomXPath($dom);
$headings = $xpath->query('//h1|//h2|//h3|//h4|//h5|//h6');
我已经尝试创建一个递归函数,但我不确定让它工作的最佳方法
这段代码很难充分测试,因为结果取决于 HTML 的复杂程度以及您所使用的具体页面。此外,由于代码做的事情比较多,逐一解释会很长,具体的工作原理就留给您自己研究。其中的 XPath 表达式是参考 "XPath select all elements between two specific elements" 这个问题写出来的,用于选取两个标签之间的内容。测试用的源文件(test.html)如下:
<html>
<head>
</head>
<body>
<h2>Header 1</h2>
<h2>Header 2</h2>
<h3>Header 2.1</h3>
<h4>Header 2.1.1</h4>
<h2>Header 3</h2>
<h3>Header 3.1</h3>
</body>
</html>
实际代码是...
/**
 * Build a nested outline of the h{$level} headings and the headings below them.
 *
 * Every entry is an array with the keys "name" (heading text), "number"
 * (dotted section number such as "2.1.1") and "level". A "children" key that
 * holds entries of the same shape is added only when sub-headings exist.
 *
 * A heading is attached to the nearest preceding heading that is exactly one
 * level shallower. Matching counts the shallower-or-equal headings that
 * precede a node anywhere in the document, so each heading is claimed by one
 * parent only, even when several sections nest to the same depth. A heading
 * that skips a level (for example an h4 placed directly under an h2) has no
 * matching parent and is left out of the result.
 *
 * @param int          $level    Heading level to collect (1-6).
 * @param \DOMXPath    $xpath    XPath object bound to the parsed document.
 * @param \DOMDocument $dom      Not read by this function; kept so existing calls keep working.
 * @param int          $position Number of headings of levels 1..$level-1 that must precede
 *                               a match; the default 0 selects the headings that appear
 *                               before any shallower heading.
 * @param string       $number   Section number of the parent heading ('' at the top).
 * @return array List of outline entries in document order.
 */
function extractH ( $level, $xpath, $dom, $position = 0, $number = '' ) {
    $output = [];
    $depth = (int) $level;
    if ( $depth < 1 || $depth > 6 ) {
        // HTML defines h1-h6 only; this also ends the recursion below h6.
        return $output;
    }
    $position = (int) $position;

    // Builds "self::h1 or self::h2 ..." for every level up to $maxLevel.
    $levelTests = static function ( $maxLevel ) {
        $tests = [];
        for ( $l = 1; $l <= $maxLevel; $l++ ) {
            $tests[] = "self::h{$l}";
        }
        return implode(" or ", $tests);
    };

    if ( $depth === 1 ) {
        // Nothing is shallower than h1, so only position 0 can match.
        if ( $position !== 0 ) {
            return $output;
        }
        $query = "//h1";
    } else {
        $query = "//h{$depth}[count(preceding::*[" . $levelTests($depth - 1) . "])={$position}]";
    }

    $headings = $xpath->query($query);
    foreach ( $headings as $key => $heading ) {
        $sectionNumber = ltrim($number.".".($key+1), ".");
        $newOutput = ["name" => $heading->nodeValue,
            "number" => $sectionNumber,
            "level" => $level
        ];
        // Document-wide ordinal of this heading among all headings of its own
        // level or shallower. A direct child is preceded by exactly this many.
        $ordinal = (int) $xpath->evaluate("count(preceding::*[" . $levelTests($depth) . "])", $heading) + 1;
        $children = extractH($depth + 1, $xpath, $dom, $ordinal, $sectionNumber);
        if ( !empty($children) ) {
            $newOutput["children"] = $children;
        }
        $output[] = $newOutput;
    }
    return $output;
}
// Load the sample document and print its heading outline.
$html = file_get_contents("test.html");
if ( $html === false ) {
    // Fail with a clear message instead of handing `false` to loadHTML().
    throw new \RuntimeException("Unable to read test.html");
}
$dom = new \DomDocument();
$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
$xpath = new \DomXPath($dom);
// The sample starts at <h2> (it has no <h1>), so the outline is built from level 2.
$output = extractH(2, $xpath, $dom);
print_r($output);
调用 extractH() 时只需要几个参数。由于示例 HTML 以 h2 标签开头(没有 h1),因此第一个参数为 2;其后依次是要使用的 XPath 对象和 DomDocument 对象。
接受的答案不适用于我这样的结构:
<h2>a</h2>
<h3>aa</h3>
<h4>aaa</h4>
<h5>aaaa</h5>
<h6>aaaaa</h6>
<h2>b</h2>
<h2>c</h2>
<h3>ca</h3>
<h3>cb</h3>
<h3>cc</h3>
<h2>d</h2>
<h3>da</h3>
<h4>daa</h4>
<h5>daaa</h5>
<h6>daaaa</h6>
"d" 部分中的树正在替换为 "a" 部分中的树
这个解决方案对我有用
/**
 * Builds a nested table of contents from the headings of an HTML fragment.
 *
 * The parser keeps no state between calls, so one instance can process any
 * number of documents.
 */
class Parser {
    /**
     * Parse $text and return its outline, starting at the h2 level.
     *
     * Each entry has the keys 'value' (heading text), 'level' and 'children'
     * (a list of entries of the same shape, empty when there are none).
     *
     * @param string $text UTF-8 encoded HTML.
     * @return array Outline entries in document order; empty for blank input.
     */
    public function generate(string $text): array {
        if (trim($text) === '') {
            // loadHTML() rejects an empty source, and there is nothing to outline anyway.
            return [];
        }
        $dom = new DOMDocument('1.0', 'utf-8');
        // Non-ASCII characters are turned into numeric entities so the HTML
        // parser cannot misread the encoding. The parser is left to add the
        // implied <html>/<body> wrapper so that top-level headings of a
        // fragment are siblings under <body>.
        $dom->loadHTML(mb_encode_numericentity($text, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
        $xpath = new DOMXPath($dom);
        return $this->extractHeadings(2, $xpath, $dom);
    }

    /**
     * Collect the h{$level} headings that belong to one parent, with their subtrees.
     *
     * A heading belongs to the nearest preceding heading that is exactly one
     * level shallower. A heading that skips a level has no matching parent
     * and is left out.
     *
     * @param int         $level    Heading level to collect (1-6).
     * @param DOMXPath    $xpath    XPath object bound to $dom.
     * @param DOMDocument $dom      Parsed document (not read here; kept for the call shape).
     * @param int         $position Number of headings of levels 1..$level-1 that must
     *                              precede a match.
     * @return array
     */
    private function extractHeadings($level, DOMXPath $xpath, DOMDocument $dom, $position = 0) {
        $result = [];
        $level = (int) $level;
        if ($level < 1 || $level > 6) {
            // HTML defines h1-h6 only; this also ends the recursion below h6.
            return $result;
        }
        $position = (int) $position;

        if ($level === 1) {
            // Nothing is shallower than h1, so only position 0 can match.
            if ($position !== 0) {
                return $result;
            }
            $query = '//h1';
        } else {
            $query = "//h{$level}[count(preceding::*[" . $this->levelTests($level - 1) . "])={$position}]";
        }

        foreach ($xpath->query($query) as $heading) {
            // Document-wide ordinal of this heading among all headings of its
            // own level or shallower. A direct child is preceded by exactly this many.
            $ordinal = (int) $xpath->evaluate('count(preceding::*[' . $this->levelTests($level) . '])', $heading) + 1;
            $result[] = [
                'value' => $heading->nodeValue,
                'level' => $level,
                'children' => $this->extractHeadings($level + 1, $xpath, $dom, $ordinal),
            ];
        }
        return $result;
    }

    /**
     * Build the XPath test "self::h1 or self::h2 ..." for levels 1..$maxLevel.
     */
    private function levelTests($maxLevel) {
        $tests = [];
        for ($l = 1; $l <= $maxLevel; $l++) {
            $tests[] = "self::h{$l}";
        }
        return implode(' or ', $tests);
    }
}
// Sample fragment with four h2 sections: "a" and "d" each nest down to h6,
// "b" has no sub-headings and "c" has three h3 children.
$text = "
<h2>a</h2>
<h3>aa</h3>
<h4>aaa</h4>
<h5>aaaa</h5>
<h6>aaaaa</h6>
<h2>b</h2>
<h2>c</h2>
<h3>ca</h3>
<h3>cb</h3>
<h3>cc</h3>
<h2>d</h2>
<h3>da</h3>
<h4>daa</h4>
<h5>daaa</h5>
<h6>daaaa</h6>
";
// Build the outline for the sample. The returned tree is not stored or printed here.
$parser = new Parser();
$parser->generate($text);
不过,这个方案仍然要求文档中的标题按顺序排列。