如何使用 HTMLDocument 查找属性?
How to find an attribute using HTMLDocument?
可能 HTML 的术语与 XML 的术语不同,但这里有一个 HTML 文档,可以从中检索属性。这里的属性 a1、a2、a3 是 Body 标签的一部分。
<html>
<head>
Hello World
</head>
<body a1="ABC" a2="3974" a3="A1B2"> <------These attributes
<H1>Start Here<H1>
<p>This is the body</p>
</body>
</html>
使用下面的文件解析上面的HTML文件。
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
BufferedReader br = new BufferedReader(reader );
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
// Parse
ElementIterator iterator = new ElementIterator(htmlDoc);
Element element;
while ((element = iterator.next()) != null)
{
System.out.println("Element : " + element);
AttributeSet attributes = element.getAttributes();
Object name = attributes.getAttribute(StyleConstants.NameAttribute);
if ((name instanceof HTML.Tag))
//&& ((name == HTML.Tag.H1) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3)))
{
// Build up content text as it may be within multiple elements
StringBuffer text = new StringBuffer();
int count = element.getElementCount();
for (int i = 0; i < count; i++) {
Element child = element.getElement(i);
AttributeSet childAttributes = child.getAttributes();
System.out.println("Element : " + child);
System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
int startOffset = child.getStartOffset();
int endOffset = child.getEndOffset();
int length = endOffset - startOffset;
text.append(htmlDoc.getText(startOffset, length));
}
}
}
System.exit(0);
}
}
输出在这里。
Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false <-----expected true here.
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1
期望“a1 存在”检查应该返回 true 一次,但它没有。
最终将搜索所有 3 (a1, a2, a3)。
上面的代码是正确的实现还是在 HTML 解析器中不可行?
我不知道 HtmlKit
但你可以使用正则表达式获得类似的结果
public static void main(String[] args) throws UnirestException {
String html = "<html>\r\n" +
" <head>\r\n" +
" Hello World\r\n" +
" </head>\r\n" +
" <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
" <H1>Start Here<H1>\r\n" +
" <p>This is the body</p>\r\n" +
" </body>\r\n" +
"</html>";
Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
Matcher matcher = regexBodyPattern.matcher(html);
while(matcher.find()) {
String bodyTag = matcher.group();
Pattern regexBodyAttrPattern = Pattern.compile("(\S*)=(\\"\w*\\")", Pattern.MULTILINE);
Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
while(attrMatcher.find()) {
System.out.println("Key :: "+attrMatcher.group(1)+" , Value "+attrMatcher.group(2));
}
}
}
输出
Key :: a1 , Value "ABC"
Key :: a2 , Value "3974"
Key :: a3 , Value "A1B2"
也许这会有所帮助:
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
class AttributeHTML
{
public static void main(String[] args)
{
EditorKit kit = new HTMLEditorKit();
Document doc = kit.createDefaultDocument();
// The Document class does not yet handle charset's properly.
doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
try
{
// Create a reader on the HTML content.
Reader rd = getReader(args[0]);
// Parse the HTML.
kit.read(rd, doc, 0);
// Iterate through the elements of the HTML document.
ElementIterator it = new ElementIterator(doc);
Element elem = null;
while ( (elem = it.next()) != null )
{
if (elem.getName().equals("body"))
{
AttributeSet as = elem.getAttributes();
Enumeration enum1 = as.getAttributeNames();
while( enum1.hasMoreElements() )
{
Object name = enum1.nextElement();
Object value = as.getAttribute( name );
System.out.println( "\t" + name + " : " + value );
}
}
}
}
catch (Exception e)
{
e.printStackTrace();
}
System.exit(1);
}
// Returns a reader on the HTML data. If 'uri' begins
// with "http:", it's treated as a URL; otherwise,
// it's assumed to be a local filename.
static Reader getReader(String uri)
throws IOException
{
// Retrieve from Internet.
if (uri.startsWith("http:"))
{
URLConnection conn = new URL(uri).openConnection();
return new InputStreamReader(conn.getInputStream());
}
// Retrieve from file.
else
{
return new FileReader(uri);
}
}
}
测试使用:
java AttributeHTML yourFile.html
要检索属性,您可以提供自己的 ParserCallback
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest2
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("d:/temp/Example.html");
BufferedReader br = new BufferedReader(reader);
System.out.println(HTMLParserTest2.extractTagsAttributes(br));
// output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
System.exit(0);
}
public static List<String> extractTagsAttributes(Reader r) throws IOException {
final ArrayList<String> list = new ArrayList<String>();
ParserDelegator parserDelegator = new ParserDelegator();
ParserCallback parserCallback = new ParserCallback() {
@Override
public void handleText(final char[] data, final int pos) { }
@Override
public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
Enumeration<?> e=attribute.getAttributeNames();
while(e.hasMoreElements()) {
Object name=e.nextElement();
Object value=attribute.getAttribute(name);
list.add(tag.toString() + "-" + name + "=" +value);
}
}
@Override
public void handleEndTag(Tag t, final int pos) { }
@Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
@Override
public void handleComment(final char[] data, final int pos) { }
@Override
public void handleError(final java.lang.String errMsg, final int pos) { }
};
parserDelegator.parse(r, parserCallback, true);
return list;
}
}
可能 HTML 的术语与 XML 的术语不同,但这里有一个 HTML 文档,可以从中检索属性。这里的属性 a1、a2、a3 是 Body 标签的一部分。
<html>
<head>
Hello World
</head>
<body a1="ABC" a2="3974" a3="A1B2"> <------These attributes
<H1>Start Here<H1>
<p>This is the body</p>
</body>
</html>
使用下面的文件解析上面的HTML文件。
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.Reader;
import javax.swing.text.AttributeSet;
import javax.swing.text.Element;
import javax.swing.text.ElementIterator;
import javax.swing.text.StyleConstants;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("C:/Downloads/DeleteMe/Example1.html");
BufferedReader br = new BufferedReader(reader );
HTMLEditorKit htmlKit = new HTMLEditorKit();
HTMLDocument htmlDoc = (HTMLDocument) htmlKit.createDefaultDocument();
HTMLEditorKit.Parser parser = new ParserDelegator();
HTMLEditorKit.ParserCallback callback = htmlDoc.getReader(0);
parser.parse(br, callback, true);
// Parse
ElementIterator iterator = new ElementIterator(htmlDoc);
Element element;
while ((element = iterator.next()) != null)
{
System.out.println("Element : " + element);
AttributeSet attributes = element.getAttributes();
Object name = attributes.getAttribute(StyleConstants.NameAttribute);
if ((name instanceof HTML.Tag))
//&& ((name == HTML.Tag.H1) || (name == HTML.Tag.H2) || (name == HTML.Tag.H3)))
{
// Build up content text as it may be within multiple elements
StringBuffer text = new StringBuffer();
int count = element.getElementCount();
for (int i = 0; i < count; i++) {
Element child = element.getElement(i);
AttributeSet childAttributes = child.getAttributes();
System.out.println("Element : " + child);
System.out.println(" Attribute count : " + childAttributes.getAttributeCount());
System.out.println(" a1 exists : " + childAttributes.isDefined("a1"));
int startOffset = child.getStartOffset();
int endOffset = child.getEndOffset();
int length = endOffset - startOffset;
text.append(htmlDoc.getText(startOffset, length));
}
}
}
System.exit(0);
}
}
输出在这里。
Element : BranchElement(html) 0,1
Element : BranchElement(body) 0,1
Attribute count : 1
a1 exists : false <-----expected true here.
Element : BranchElement(body) 0,1
Element : BranchElement(p) 0,1
Attribute count : 3
a1 exists : false
Element : BranchElement(p) 0,1
Element : LeafElement(content) 0,1
Attribute count : 1
a1 exists : false
Element : LeafElement(content) 0,1
期望“a1 存在”检查应该返回 true 一次,但它没有。 最终将搜索所有 3 (a1, a2, a3)。
上面的代码是正确的实现还是在 HTML 解析器中不可行?
我不知道 HtmlKit
但你可以使用正则表达式获得类似的结果
public static void main(String[] args) throws UnirestException {
String html = "<html>\r\n" +
" <head>\r\n" +
" Hello World\r\n" +
" </head>\r\n" +
" <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" +
" <H1>Start Here<H1>\r\n" +
" <p>This is the body</p>\r\n" +
" </body>\r\n" +
"</html>";
Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
Matcher matcher = regexBodyPattern.matcher(html);
while(matcher.find()) {
String bodyTag = matcher.group();
Pattern regexBodyAttrPattern = Pattern.compile("(\S*)=(\\"\w*\\")", Pattern.MULTILINE);
Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
while(attrMatcher.find()) {
System.out.println("Key :: "+attrMatcher.group(1)+" , Value "+attrMatcher.group(2));
}
}
}
输出
Key :: a1 , Value "ABC"
Key :: a2 , Value "3974"
Key :: a3 , Value "A1B2"
也许这会有所帮助:
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
class AttributeHTML
{
public static void main(String[] args)
{
EditorKit kit = new HTMLEditorKit();
Document doc = kit.createDefaultDocument();
// The Document class does not yet handle charset's properly.
doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
try
{
// Create a reader on the HTML content.
Reader rd = getReader(args[0]);
// Parse the HTML.
kit.read(rd, doc, 0);
// Iterate through the elements of the HTML document.
ElementIterator it = new ElementIterator(doc);
Element elem = null;
while ( (elem = it.next()) != null )
{
if (elem.getName().equals("body"))
{
AttributeSet as = elem.getAttributes();
Enumeration enum1 = as.getAttributeNames();
while( enum1.hasMoreElements() )
{
Object name = enum1.nextElement();
Object value = as.getAttribute( name );
System.out.println( "\t" + name + " : " + value );
}
}
}
}
catch (Exception e)
{
e.printStackTrace();
}
System.exit(1);
}
// Returns a reader on the HTML data. If 'uri' begins
// with "http:", it's treated as a URL; otherwise,
// it's assumed to be a local filename.
static Reader getReader(String uri)
throws IOException
{
// Retrieve from Internet.
if (uri.startsWith("http:"))
{
URLConnection conn = new URL(uri).openConnection();
return new InputStreamReader(conn.getInputStream());
}
// Retrieve from file.
else
{
return new FileReader(uri);
}
}
}
测试使用:
java AttributeHTML yourFile.html
要检索属性,您可以提供自己的 ParserCallback
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest2
{
public static void main(String args[]) throws Exception {
Reader reader = new FileReader("d:/temp/Example.html");
BufferedReader br = new BufferedReader(reader);
System.out.println(HTMLParserTest2.extractTagsAttributes(br));
// output : [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
System.exit(0);
}
public static List<String> extractTagsAttributes(Reader r) throws IOException {
final ArrayList<String> list = new ArrayList<String>();
ParserDelegator parserDelegator = new ParserDelegator();
ParserCallback parserCallback = new ParserCallback() {
@Override
public void handleText(final char[] data, final int pos) { }
@Override
public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
Enumeration<?> e=attribute.getAttributeNames();
while(e.hasMoreElements()) {
Object name=e.nextElement();
Object value=attribute.getAttribute(name);
list.add(tag.toString() + "-" + name + "=" +value);
}
}
@Override
public void handleEndTag(Tag t, final int pos) { }
@Override
public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
@Override
public void handleComment(final char[] data, final int pos) { }
@Override
public void handleError(final java.lang.String errMsg, final int pos) { }
};
parserDelegator.parse(r, parserCallback, true);
return list;
}
}