为什么 org.w3c.dom 会错误地解析我的 XML?
Why is org.w3c.dom parsing my xml wrong?
解析下面这段 XML 之后:
<html>
<body>
<a>
<div>
<span>foo</span>
</div>
</a>
</body>
</html>
用 javax.xml.xpath 查询得到的 org.w3c.dom 文档却显示:div 是 a 的父节点,而 a 是 span 的父节点。
为什么会这样,我该如何正确解析这个xml?
这是我正在使用的代码,然后是用于创建 Document 对象的方法,然后是代码的输出。
// Minimal reproduction: build a small document, locate the <span> by its text,
// and print the ancestry that the resulting DOM reports.
String myxml = ""
        + "<html>"
        + "<body>"
        + "<a>"
        + "<div>"
        + "<span>foo</span>"
        + "</div>"
        + "</a>"
        + "</body>"
        + "</html>";
Document doc = HttpDownloadUtilities.getWebpageDocument_fromSource(myxml);
XPath xPath = XPathFactory.newInstance().newXPath();

// Find the element whose text is exactly "foo" and print its parent chain.
Node node = (Node) xPath.compile("//*[text() = 'foo']").evaluate(doc, XPathConstants.NODE);
System.out.println(" node tag: " + node.getNodeName());
System.out.println(" parent tag: " + node.getParentNode().getNodeName());
System.out.println("grandparent tag: " + node.getParentNode().getParentNode().getNodeName());

// Dump every element together with its first child, when it has one.
Set<Node> nodes = H.getSet((NodeList) xPath.compile("//*").evaluate(doc, XPathConstants.NODESET));
for (Node n : nodes) {
    System.out.println();
    if (n == null) {
        // Previously a null entry was silently skipped via a swallowed exception;
        // keep that tolerance explicitly. NOTE(review): confirm H.getSet can yield null.
        continue;
    }
    System.out.println("node: " + n.getNodeName());

    // A childless node has no first child; check for null instead of relying on
    // a swallowed NullPointerException.
    Node firstChild = n.getFirstChild();
    if (firstChild != null) {
        System.out.println("child: " + firstChild.getNodeName());
    }
}
这里是创建文档对象的方法:
/**
 * Cleans the given HTML source with HtmlCleaner and converts the result into a
 * W3C DOM {@link Document}.
 *
 * <p>The input is passed through an HTML cleaner rather than a strict XML parser,
 * so the resulting tree is whatever the cleaner produces and may not mirror the
 * nesting of the input markup.
 *
 * @param source the HTML markup to clean and convert
 * @return the DOM document, or {@code null} if the DOM could not be created
 *         because of a {@link ParserConfigurationException}
 * @throws InterruptedException declared for caller compatibility
 * @throws IOException declared for caller compatibility
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    // Use the cleaner and properties configured above. Creating a fresh
    // HtmlCleaner / CleanerProperties here would silently drop those settings.
    TagNode tagNode = cleaner.clean(source);
    try {
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        // Existing contract: report the failure and hand back null.
        ex.printStackTrace();
        return null;
    }
}
输出:
node tag: span
parent tag: a
grandparent tag: div
node: html
child: head
node: head
node: body
child: html
node: html
child: body
node: body
child: a
node: a
node: div
child: a
node: a
child: span
node: span
child: #text
很可能是 HTML 解析器(HtmlCleaner)修复了它认为无效的 HTML:a 标签内部不允许出现 div 标签。等您拿到 Document 对象时,HTML 已经被解析并修复过了。如果输入确实是格式良好的 XML,请改用 XML 解析器(例如 DocumentBuilder.parse)来构建文档,这样才能保留原有的嵌套结构。
解析下面这段 XML 之后:
<html>
<body>
<a>
<div>
<span>foo</span>
</div>
</a>
</body>
</html>
用 javax.xml.xpath 查询得到的 org.w3c.dom 文档却显示:div 是 a 的父节点,而 a 是 span 的父节点。
为什么会这样,我该如何正确解析这个xml?
这是我正在使用的代码,然后是用于创建 Document 对象的方法,然后是代码的输出。
// Minimal reproduction: build a small document, locate the <span> by its text,
// and print the ancestry that the resulting DOM reports.
String myxml = ""
        + "<html>"
        + "<body>"
        + "<a>"
        + "<div>"
        + "<span>foo</span>"
        + "</div>"
        + "</a>"
        + "</body>"
        + "</html>";
Document doc = HttpDownloadUtilities.getWebpageDocument_fromSource(myxml);
XPath xPath = XPathFactory.newInstance().newXPath();

// Find the element whose text is exactly "foo" and print its parent chain.
Node node = (Node) xPath.compile("//*[text() = 'foo']").evaluate(doc, XPathConstants.NODE);
System.out.println(" node tag: " + node.getNodeName());
System.out.println(" parent tag: " + node.getParentNode().getNodeName());
System.out.println("grandparent tag: " + node.getParentNode().getParentNode().getNodeName());

// Dump every element together with its first child, when it has one.
Set<Node> nodes = H.getSet((NodeList) xPath.compile("//*").evaluate(doc, XPathConstants.NODESET));
for (Node n : nodes) {
    System.out.println();
    if (n == null) {
        // Previously a null entry was silently skipped via a swallowed exception;
        // keep that tolerance explicitly. NOTE(review): confirm H.getSet can yield null.
        continue;
    }
    System.out.println("node: " + n.getNodeName());

    // A childless node has no first child; check for null instead of relying on
    // a swallowed NullPointerException.
    Node firstChild = n.getFirstChild();
    if (firstChild != null) {
        System.out.println("child: " + firstChild.getNodeName());
    }
}
这里是创建文档对象的方法:
/**
 * Cleans the given HTML source with HtmlCleaner and converts the result into a
 * W3C DOM {@link Document}.
 *
 * <p>The input is passed through an HTML cleaner rather than a strict XML parser,
 * so the resulting tree is whatever the cleaner produces and may not mirror the
 * nesting of the input markup.
 *
 * @param source the HTML markup to clean and convert
 * @return the DOM document, or {@code null} if the DOM could not be created
 *         because of a {@link ParserConfigurationException}
 * @throws InterruptedException declared for caller compatibility
 * @throws IOException declared for caller compatibility
 */
public static Document getWebpageDocument_fromSource(String source) throws InterruptedException, IOException {
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);
    props.setRecognizeUnicodeChars(true);
    props.setOmitComments(true);

    // Use the cleaner and properties configured above. Creating a fresh
    // HtmlCleaner / CleanerProperties here would silently drop those settings.
    TagNode tagNode = cleaner.clean(source);
    try {
        return new DomSerializer(props).createDOM(tagNode);
    } catch (ParserConfigurationException ex) {
        // Existing contract: report the failure and hand back null.
        ex.printStackTrace();
        return null;
    }
}
输出:
node tag: span
parent tag: a
grandparent tag: div
node: html
child: head
node: head
node: body
child: html
node: html
child: body
node: body
child: a
node: a
node: div
child: a
node: a
child: span
node: span
child: #text
很可能是 HTML 解析器(HtmlCleaner)修复了它认为无效的 HTML:a 标签内部不允许出现 div 标签。等您拿到 Document 对象时,HTML 已经被解析并修复过了。如果输入确实是格式良好的 XML,请改用 XML 解析器(例如 DocumentBuilder.parse)来构建文档,这样才能保留原有的嵌套结构。