正则表达式匹配与重叠符号冲突
Regex Matching Conflict with Overlapping Symbol
我正在尝试匹配所有包含符号 <
或 >
的标记,但存在一些冲突。特别是,我的标记是 <
、>
、</
、/>
,以及以 <!--
开头并以 -->
结尾的注释。
我的正则表达式如下:
// Regular-expression sources for the token kinds. add() wraps each one as
// "^(" + regex + ")" before compiling it.
// NOTE(review): "<" is a prefix of both "</" and "<!--", and tokenize() stops at the
// first pattern that matches, so the longer patterns have to be tried before this one.
String LTHAN = "<";
// A single closing angle bracket.
String GTHAN = ">";
// The two characters that open a closing tag.
String LTHAN_SLASH = "</";
// The two characters that end a self-closing tag.
String GTHAN_SLASH = "/>";
// A comment from "<!--" to "-->".
// NOTE(review): ".*" is greedy, so it runs to the LAST "-->" it can reach, and without
// DOTALL "." does not match line terminators.
String COMMENT = "<!--.*-->";
我通过使用通用方法将它们添加到列表来编译它们:
/**
 * Registers a token pattern.
 *
 * <p>The expression is wrapped in a group and prefixed with {@code ^} before it is compiled,
 * and the compiled pattern is stored together with the token code in {@code tokenInfos}.
 *
 * @param regex regular-expression source for the token
 * @param token integer code stored alongside the compiled pattern
 */
public void add(String regex, int token) {
    String anchored = "^(" + regex + ")";
    Pattern compiled = Pattern.compile(anchored);
    TokenInfo entry = new TokenInfo(compiled, token);
    tokenInfos.add(entry);
}
这是我的 TokenInfo 类的定义:
/**
 * Pairs a compiled pattern with the integer code of the token it recognises.
 * Both fields are assigned once in the constructor.
 */
private class TokenInfo {
    /** Integer code identifying the kind of token. */
    public final int token;
    /** Compiled pattern used to recognise the token. */
    public final Pattern regex;

    /**
     * @param regex compiled pattern for the token
     * @param token integer code for the token
     */
    public TokenInfo(Pattern regex, int token) {
        // The implicit superclass constructor call is all that is needed here.
        this.token = token;
        this.regex = regex;
    }
}
我按如下方式进行匹配并生成标记列表:
/**
 * Splits {@code str} into tokens and stores them in {@code tokens}, discarding any
 * tokens from an earlier call.
 *
 * <p>At each step the entries of {@code tokenInfos} are tried in iteration order; the first
 * pattern that matches wins, its text is removed from the input, and scanning continues
 * with what is left.
 *
 * @param str the text to tokenize
 * @throws IllegalArgumentException if no pattern matches the remaining input
 */
public void tokenize(String str) {
    String s = str;
    tokens.clear();
    while (!s.isEmpty()) {
        boolean match = false;
        for (TokenInfo info : tokenInfos) {
            Matcher m = info.regex.matcher(s);
            if (m.find()) {
                match = true;
                String tok = m.group().trim();
                tokens.add(new Token(info.token, tok));
                s = m.replaceFirst("");
                break;
            }
        }
        // Without this check, input that no pattern recognises would never shrink
        // and the loop would spin forever.
        if (!match) {
            throw new IllegalArgumentException("Unexpected character in input: " + s);
        }
    }
}
读取并显示:
// Read the whole input file into memory, tokenize it, and print every token.
try {
    StringBuilder sb = new StringBuilder();
    // Backslashes have to be doubled inside a Java string literal.
    File inputFile = new File("C:\\Users\\Me\\Documents\\input files\\example.xml");
    // try-with-resources closes the reader even when reading fails part-way through.
    try (BufferedReader br = new BufferedReader(new FileReader(inputFile))) {
        String curLine;
        // readLine() returns null at end of input.
        while ((curLine = br.readLine()) != null) {
            sb.append(curLine);
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    tokenizer.tokenize(sb.toString());
    for (Tokenizer.Token tok : tokenizer.getTokens()) {
        System.out.println("" + tok.token + " " + tok.sequence);
    }
} catch (Exception e) {
    System.out.println(e.getMessage());
}
}
示例输入:
<!-- Sample input file with incomplete recipe -->
<recipe name="bread" prep_time="5 mins" cook_time="3 hours">
<title>Basic bread</title>
<ingredient amount="3" unit="cups">Flour</ingredient>
<instructions>
<step>Mix all ingredients together.</step>
</instructions>
</recipe>
但是,输出的标记列表将 <
和 /
(包括其后的任何字符)识别为单独的标记,这意味着它似乎永远无法识别标记 </
和/>
。注释也存在同样的问题。这是我的正则表达式的问题吗?为什么它不能识别模式 </
和 />
?
希望我的问题表述清楚。如有必要,我很乐意提供更多细节或示例。
问题:
- 您的初始正则表达式
^(<)
只要求待匹配的文本以 <
开头——整个输入字符串正是如此,以 </ 或 <!-- 开头的文本也同样满足。因此只要该模式先被尝试,它就会抢先匹配掉这一个字符,更具体的模式永远得不到机会。所以你必须修复它。
- 如果要把整个标签(不含文本内容,如 Basic bread、Mix all ingredients together.)视为一个标记,那么每种标签就应该由单个正则表达式整体匹配。
解决方案
尝试将正则表达式更改为以下内容:
- 对于单个标签 -
<[^>]*>
- 对于单个结束标签 -
</[^>]*>
- 对于注释 - <!--.*-->(原模式已经可用;若输入中可能出现多个注释,建议改用非贪婪形式 <!--.*?-->)
示例程序
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstration tokenizer that extracts XML-like tags and comments from a document.
 *
 * <p>Text between tags is not treated as a token; it is skipped.
 */
public class RegexTest {
    /** Registered token patterns, in registration order. */
    private static final ArrayList<TokenInfo> tokenInfoList = new ArrayList<>();
    /** Tokens produced by the most recent call to {@link #tokenize(String)}. */
    private static final ArrayList<String> tokensList = new ArrayList<>();

    /**
     * Registers a token pattern.
     *
     * @param regex regular-expression source for the token
     * @param token integer code stored alongside the compiled pattern
     */
    public static void add(String regex, int token) {
        tokenInfoList.add(new TokenInfo(Pattern.compile(regex), token));
    }

    static {
        // (?s) lets a comment span line breaks; the reluctant .*? stops at the first "-->"
        // instead of running on to the last one in the document.
        String COMMENT = "(?s)<!--.*?-->";
        String LTHAN_SLASH = "</[^>]*>";
        String LTHAN = "<[^>]*>";
        // Most specific pattern first: when several patterns match at the same position the
        // earliest registered one wins, and the generic tag pattern also matches closing
        // tags and (the start of) comments.
        add(COMMENT, 5);
        add(LTHAN_SLASH, 3);
        add(LTHAN, 1);
    }

    /** Pairs a compiled pattern with the integer code of the token it recognises. */
    private static class TokenInfo {
        public final Pattern regex;
        public final int token;

        public TokenInfo(Pattern regex, int token) {
            this.regex = regex;
            this.token = token;
        }
    }

    /**
     * Scans {@code str} from left to right and records every token found, replacing the
     * result of any earlier call.
     *
     * <p>At each step the match that starts earliest wins; ties go to the pattern that was
     * registered first. Everything before that match is skipped, on the assumption that text
     * nodes are not tokens. Scanning stops when no pattern matches the rest of the input.
     *
     * @param str the document text
     */
    public static void tokenize(String str) {
        tokensList.clear();
        final int length = str.length();
        int pos = 0;
        while (pos < length) {
            Matcher best = null;
            for (TokenInfo info : tokenInfoList) {
                Matcher m = info.regex.matcher(str);
                m.region(pos, length);
                boolean found = m.find();
                // A zero-length match cannot advance the scan, so look past it.
                while (found && m.end() == m.start()) {
                    found = m.find();
                }
                // Strict '<' keeps the earlier-registered pattern on a tie.
                if (found && (best == null || m.start() < best.start())) {
                    best = m;
                }
            }
            if (best == null) {
                // Only non-token text is left.
                break;
            }
            tokensList.add(best.group().trim());
            pos = best.end();
        }
    }

    /**
     * Returns a copy of the tokens found by the most recent {@link #tokenize(String)} call.
     *
     * @return the tokens in document order; empty if nothing was found
     */
    public static java.util.List<String> getTokens() {
        return new ArrayList<>(tokensList);
    }

    /**
     * Reads {@code recipe.xml} from the working directory, tokenizes it and prints one
     * token per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when reading fails part-way through.
        try (BufferedReader br = new BufferedReader(new FileReader(new File("recipe.xml")))) {
            String curLine;
            while ((curLine = br.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that text on
                // adjacent lines is not glued together.
                sb.append(curLine).append('\n');
            }
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
        tokenize(sb.toString());
        for (String eachToken : tokensList) {
            System.out.println(eachToken);
        }
    }
}
参考资料
http://www.regular-expressions.info/ 是学习正则表达式的好资源。
我正在尝试匹配所有包含符号 <
或 >
的标记,但存在一些冲突。特别是,我的标记是 <
、>
、</
、/>
,以及以 <!--
开头并以 -->
结尾的注释。
我的正则表达式如下:
// Regular-expression sources for the token kinds. add() wraps each one as
// "^(" + regex + ")" before compiling it.
// NOTE(review): "<" is a prefix of both "</" and "<!--", and tokenize() stops at the
// first pattern that matches, so the longer patterns have to be tried before this one.
String LTHAN = "<";
// A single closing angle bracket.
String GTHAN = ">";
// The two characters that open a closing tag.
String LTHAN_SLASH = "</";
// The two characters that end a self-closing tag.
String GTHAN_SLASH = "/>";
// A comment from "<!--" to "-->".
// NOTE(review): ".*" is greedy, so it runs to the LAST "-->" it can reach, and without
// DOTALL "." does not match line terminators.
String COMMENT = "<!--.*-->";
我通过使用通用方法将它们添加到列表来编译它们:
/**
 * Registers a token pattern.
 *
 * <p>The expression is wrapped in a group and prefixed with {@code ^} before it is compiled,
 * and the compiled pattern is stored together with the token code in {@code tokenInfos}.
 *
 * @param regex regular-expression source for the token
 * @param token integer code stored alongside the compiled pattern
 */
public void add(String regex, int token) {
    String anchored = "^(" + regex + ")";
    Pattern compiled = Pattern.compile(anchored);
    TokenInfo entry = new TokenInfo(compiled, token);
    tokenInfos.add(entry);
}
这是我的 TokenInfo 类的定义:
/**
 * Pairs a compiled pattern with the integer code of the token it recognises.
 * Both fields are assigned once in the constructor.
 */
private class TokenInfo {
    /** Integer code identifying the kind of token. */
    public final int token;
    /** Compiled pattern used to recognise the token. */
    public final Pattern regex;

    /**
     * @param regex compiled pattern for the token
     * @param token integer code for the token
     */
    public TokenInfo(Pattern regex, int token) {
        // The implicit superclass constructor call is all that is needed here.
        this.token = token;
        this.regex = regex;
    }
}
我按如下方式进行匹配并生成标记列表:
/**
 * Splits {@code str} into tokens and stores them in {@code tokens}, discarding any
 * tokens from an earlier call.
 *
 * <p>At each step the entries of {@code tokenInfos} are tried in iteration order; the first
 * pattern that matches wins, its text is removed from the input, and scanning continues
 * with what is left.
 *
 * @param str the text to tokenize
 * @throws IllegalArgumentException if no pattern matches the remaining input
 */
public void tokenize(String str) {
    String s = str;
    tokens.clear();
    while (!s.isEmpty()) {
        boolean match = false;
        for (TokenInfo info : tokenInfos) {
            Matcher m = info.regex.matcher(s);
            if (m.find()) {
                match = true;
                String tok = m.group().trim();
                tokens.add(new Token(info.token, tok));
                s = m.replaceFirst("");
                break;
            }
        }
        // Without this check, input that no pattern recognises would never shrink
        // and the loop would spin forever.
        if (!match) {
            throw new IllegalArgumentException("Unexpected character in input: " + s);
        }
    }
}
读取并显示:
// Read the whole input file into memory, tokenize it, and print every token.
try {
    StringBuilder sb = new StringBuilder();
    // Backslashes have to be doubled inside a Java string literal.
    File inputFile = new File("C:\\Users\\Me\\Documents\\input files\\example.xml");
    // try-with-resources closes the reader even when reading fails part-way through.
    try (BufferedReader br = new BufferedReader(new FileReader(inputFile))) {
        String curLine;
        // readLine() returns null at end of input.
        while ((curLine = br.readLine()) != null) {
            sb.append(curLine);
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    tokenizer.tokenize(sb.toString());
    for (Tokenizer.Token tok : tokenizer.getTokens()) {
        System.out.println("" + tok.token + " " + tok.sequence);
    }
} catch (Exception e) {
    System.out.println(e.getMessage());
}
}
示例输入:
<!-- Sample input file with incomplete recipe -->
<recipe name="bread" prep_time="5 mins" cook_time="3 hours">
<title>Basic bread</title>
<ingredient amount="3" unit="cups">Flour</ingredient>
<instructions>
<step>Mix all ingredients together.</step>
</instructions>
</recipe>
但是,输出的标记列表将 <
和 /
(包括其后的任何字符)识别为单独的标记,这意味着它似乎永远无法识别标记 </
和/>
。注释也存在同样的问题。这是我的正则表达式的问题吗?为什么它不能识别模式 </
和 />
?
希望我的问题表述清楚。如有必要,我很乐意提供更多细节或示例。
问题:
- 您的初始正则表达式
^(<)
只要求待匹配的文本以 <
开头——整个输入字符串正是如此,以 </ 或 <!-- 开头的文本也同样满足。因此只要该模式先被尝试,它就会抢先匹配掉这一个字符,更具体的模式永远得不到机会。所以你必须修复它。
- 如果要把整个标签(不含文本内容,如 Basic bread、Mix all ingredients together.)视为一个标记,那么每种标签就应该由单个正则表达式整体匹配。
解决方案
尝试将正则表达式更改为以下内容:
- 对于单个标签 -
<[^>]*>
- 对于单个结束标签 -
</[^>]*>
- 对于注释 - <!--.*-->(原模式已经可用;若输入中可能出现多个注释,建议改用非贪婪形式 <!--.*?-->)
示例程序
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demonstration tokenizer that extracts XML-like tags and comments from a document.
 *
 * <p>Text between tags is not treated as a token; it is skipped.
 */
public class RegexTest {
    /** Registered token patterns, in registration order. */
    private static final ArrayList<TokenInfo> tokenInfoList = new ArrayList<>();
    /** Tokens produced by the most recent call to {@link #tokenize(String)}. */
    private static final ArrayList<String> tokensList = new ArrayList<>();

    /**
     * Registers a token pattern.
     *
     * @param regex regular-expression source for the token
     * @param token integer code stored alongside the compiled pattern
     */
    public static void add(String regex, int token) {
        tokenInfoList.add(new TokenInfo(Pattern.compile(regex), token));
    }

    static {
        // (?s) lets a comment span line breaks; the reluctant .*? stops at the first "-->"
        // instead of running on to the last one in the document.
        String COMMENT = "(?s)<!--.*?-->";
        String LTHAN_SLASH = "</[^>]*>";
        String LTHAN = "<[^>]*>";
        // Most specific pattern first: when several patterns match at the same position the
        // earliest registered one wins, and the generic tag pattern also matches closing
        // tags and (the start of) comments.
        add(COMMENT, 5);
        add(LTHAN_SLASH, 3);
        add(LTHAN, 1);
    }

    /** Pairs a compiled pattern with the integer code of the token it recognises. */
    private static class TokenInfo {
        public final Pattern regex;
        public final int token;

        public TokenInfo(Pattern regex, int token) {
            this.regex = regex;
            this.token = token;
        }
    }

    /**
     * Scans {@code str} from left to right and records every token found, replacing the
     * result of any earlier call.
     *
     * <p>At each step the match that starts earliest wins; ties go to the pattern that was
     * registered first. Everything before that match is skipped, on the assumption that text
     * nodes are not tokens. Scanning stops when no pattern matches the rest of the input.
     *
     * @param str the document text
     */
    public static void tokenize(String str) {
        tokensList.clear();
        final int length = str.length();
        int pos = 0;
        while (pos < length) {
            Matcher best = null;
            for (TokenInfo info : tokenInfoList) {
                Matcher m = info.regex.matcher(str);
                m.region(pos, length);
                boolean found = m.find();
                // A zero-length match cannot advance the scan, so look past it.
                while (found && m.end() == m.start()) {
                    found = m.find();
                }
                // Strict '<' keeps the earlier-registered pattern on a tie.
                if (found && (best == null || m.start() < best.start())) {
                    best = m;
                }
            }
            if (best == null) {
                // Only non-token text is left.
                break;
            }
            tokensList.add(best.group().trim());
            pos = best.end();
        }
    }

    /**
     * Returns a copy of the tokens found by the most recent {@link #tokenize(String)} call.
     *
     * @return the tokens in document order; empty if nothing was found
     */
    public static java.util.List<String> getTokens() {
        return new ArrayList<>(tokensList);
    }

    /**
     * Reads {@code recipe.xml} from the working directory, tokenizes it and prints one
     * token per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when reading fails part-way through.
        try (BufferedReader br = new BufferedReader(new FileReader(new File("recipe.xml")))) {
            String curLine;
            while ((curLine = br.readLine()) != null) {
                // readLine() strips the line terminator; put one back so that text on
                // adjacent lines is not glued together.
                sb.append(curLine).append('\n');
            }
        } catch (IOException e) {
            System.out.println(e.getMessage());
        }
        tokenize(sb.toString());
        for (String eachToken : tokensList) {
            System.out.println(eachToken);
        }
    }
}
参考资料
http://www.regular-expressions.info/ 是学习正则表达式的好资源。