c++11 正则表达式和 GCC
c++11 regexp and GCC
根据https://gcc.gnu.org/onlinedocs/libstdc++/manual/status.html#status.iso.2011,C++11标准的正则表达式引擎应该在GCC中完成。现在,有人可以向我解释为什么这个简单的例子
#include <iostream>
#include <string>
#include <regex>
using namespace std;
int main ()
{
string string_array[] = {"http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"};
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for(int i=0; i<3; i++)
{
smatch sm;
regex_match (string_array[i],sm,e);
for (unsigned i=0; i<sm.size(); ++i)
{
cout << "[" << sm[i] << "] ";
}
cout << endl;
}
return 0;
}
结果是这个输出(例如注意第二行错误解析的端口号,但似乎有很多错误)
[http://www.cplusplus.com/reference/regex/regex_match/] [http] [//] [www.cplusplus.com/reference/regex] [] [regex_match/] [] []
[tcp://192.168.2.1:1234/hello/how/are/you] [tcp] [//] [192.168.2.1:1234/hello/how/are/you] [] [] [] []
[https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1] [https] [//] [mail.google.com/mail/u/0/?tab=wm] [] [] [] [inbox/15178022db56df29?projector=1]
而其 python 对应
import re
string_array = ["http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"]
e = re.compile("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for i in range(len(string_array)):
m = e.match(string_array[i])
print(m.groups())
正确打印这个?
('http', '//', 'www.cplusplus.com', None, 'reference/regex/regex_match/', None, None)
('tcp', '//', '192.168.2.1', '1234', 'hello/how/are/you', None, None)
('https', '//', 'mail.google.com', None, 'mail/u/0/', 'tab=wm', 'inbox/15178022db56df29?projector=1')
我在 archlinux 上使用 gcc 5.3.0
编辑:
我将程序更改为这个,检查正则表达式 syntax_option_type 标志
#include <iostream>
#include <string>
#include <regex>
using namespace std;
int main ()
{
string string_array[] = {"http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"};
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for(int i=0; i<3; i++)
{
smatch sm;
cout << "match: " <<regex_match (string_array[i],sm,e) << endl;
for (unsigned i=0; i<sm.size(); ++i)
{
cout << "[" << sm[i].str() << "] ";
}
}
cout << endl;
switch(e.flags())
{
case regex_constants::basic:
cout << "POSIX syntax was used" << endl;
break;
case regex_constants::awk:
cout << "POSIX awk syntax was used" << endl;
break;
case regex_constants::ECMAScript:
cout << "ECMA syntax was used" << endl;
break;
case regex_constants::egrep:
cout << "POSIX egrep syntax was used" << endl;
break;
}
return 0;
}
令人惊讶的是我最后得到了
match: 1
[http://www.cplusplus.com/reference/regex/regex_match/] [http] [//] [www.cplusplus.com/reference/regex] [] [regex_match/] [] [] match: 1
[tcp://192.168.2.1:1234/hello/how/are/you] [tcp] [//] [192.168.2.1:1234/hello/how/are/you] [] [] [] [] match: 1
[https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1] [https] [//] [mail.google.com/mail/u/0/?tab=wm] [] [] [] [inbox/15178022db56df29?projector=1]
ECMA syntax was used
这似乎真的是一个编译器错误..
这里有两个问题:
- Match object
groups()
vs. match_results
区别
- 字符 类 内的字符不应转义
Python中的match object groups()
returns所有子匹配(捕获的子串)以第1组开头:
Return a tuple containing all the subgroups of the match, from 1 up to however many groups are in the pattern.
match_results
从第 0 组开始枚举所有组(整场比赛):
If successful, it is not empty and contains a series of sub_match objects: the first sub_match element corresponds to the entire match, and, if the regex expression contained sub-expressions to be matched (i.e., parentheses-delimited groups), their corresponding sub-matches are stored as successive sub_match elements in the match_results
object.
第 3 组中的 \
没有转义连字符,它被忽略,因此在 .
和 A
之间创建一个范围(= [.-A]
在正常的正则表达式中世界)。
您不能在 POSIX 正则表达式 中的字符 类 内使用转义符号,这将被视为错误。将连字符放在末尾以避免需要对其进行转义(如 [0-9.A-Za-z-]+
)。
所以,在Python中,使用
e = re.compile("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.A-Za-z-]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
在 C++ 中:
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.A-Za-z-]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
// ...
for(int i=0; i<3; i++)
{
smatch sm;
regex_match (string_array[i],sm,e);
for (unsigned i=1; i<sm.size(); ++i) // Here, start with the second element
{
cout << "[" << sm[i] << "] ";
}
cout << endl;
}
比较Python demo and C++ demo.
正则表达式工作正常。
改变
cout << "[" << sm[i] << "] ";
和
cout << "[" << sm[i].str() << "] ";
您会看到正确的结果。
根据https://gcc.gnu.org/onlinedocs/libstdc++/manual/status.html#status.iso.2011,C++11标准的正则表达式引擎应该在GCC中完成。现在,有人可以向我解释为什么这个简单的例子
#include <iostream>
#include <string>
#include <regex>
using namespace std;
int main ()
{
string string_array[] = {"http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"};
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for(int i=0; i<3; i++)
{
smatch sm;
regex_match (string_array[i],sm,e);
for (unsigned i=0; i<sm.size(); ++i)
{
cout << "[" << sm[i] << "] ";
}
cout << endl;
}
return 0;
}
结果是这个输出(例如注意第二行错误解析的端口号,但似乎有很多错误)
[http://www.cplusplus.com/reference/regex/regex_match/] [http] [//] [www.cplusplus.com/reference/regex] [] [regex_match/] [] []
[tcp://192.168.2.1:1234/hello/how/are/you] [tcp] [//] [192.168.2.1:1234/hello/how/are/you] [] [] [] []
[https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1] [https] [//] [mail.google.com/mail/u/0/?tab=wm] [] [] [] [inbox/15178022db56df29?projector=1]
而其 python 对应
import re
string_array = ["http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"]
e = re.compile("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for i in range(len(string_array)):
m = e.match(string_array[i])
print(m.groups())
正确打印这个?
('http', '//', 'www.cplusplus.com', None, 'reference/regex/regex_match/', None, None)
('tcp', '//', '192.168.2.1', '1234', 'hello/how/are/you', None, None)
('https', '//', 'mail.google.com', None, 'mail/u/0/', 'tab=wm', 'inbox/15178022db56df29?projector=1')
我在 archlinux 上使用 gcc 5.3.0
编辑:
我将程序更改为这个,检查正则表达式 syntax_option_type 标志
#include <iostream>
#include <string>
#include <regex>
using namespace std;
int main ()
{
string string_array[] = {"http://www.cplusplus.com/reference/regex/regex_match/",
"tcp://192.168.2.1:1234/hello/how/are/you",
"https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1"};
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.\-A-Za-z]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
for(int i=0; i<3; i++)
{
smatch sm;
cout << "match: " <<regex_match (string_array[i],sm,e) << endl;
for (unsigned i=0; i<sm.size(); ++i)
{
cout << "[" << sm[i].str() << "] ";
}
}
cout << endl;
switch(e.flags())
{
case regex_constants::basic:
cout << "POSIX syntax was used" << endl;
break;
case regex_constants::awk:
cout << "POSIX awk syntax was used" << endl;
break;
case regex_constants::ECMAScript:
cout << "ECMA syntax was used" << endl;
break;
case regex_constants::egrep:
cout << "POSIX egrep syntax was used" << endl;
break;
}
return 0;
}
令人惊讶的是我最后得到了
match: 1
[http://www.cplusplus.com/reference/regex/regex_match/] [http] [//] [www.cplusplus.com/reference/regex] [] [regex_match/] [] [] match: 1
[tcp://192.168.2.1:1234/hello/how/are/you] [tcp] [//] [192.168.2.1:1234/hello/how/are/you] [] [] [] [] match: 1
[https://mail.google.com/mail/u/0/?tab=wm#inbox/15178022db56df29?projector=1] [https] [//] [mail.google.com/mail/u/0/?tab=wm] [] [] [] [inbox/15178022db56df29?projector=1]
ECMA syntax was used
这似乎真的是一个编译器错误..
这里有两个问题:
- Match object
groups()
vs.match_results
区别 - 字符 类 内的字符不应转义
Python中的match object groups()
returns所有子匹配(捕获的子串)以第1组开头:
Return a tuple containing all the subgroups of the match, from 1 up to however many groups are in the pattern.
match_results
从第 0 组开始枚举所有组(整场比赛):
If successful, it is not empty and contains a series of sub_match objects: the first sub_match element corresponds to the entire match, and, if the regex expression contained sub-expressions to be matched (i.e., parentheses-delimited groups), their corresponding sub-matches are stored as successive sub_match elements in the
match_results
object.
第 3 组中的 \
没有转义连字符,它被忽略,因此在 .
和 A
之间创建一个范围(= [.-A]
在正常的正则表达式中世界)。
您不能在 POSIX 正则表达式 中的字符 类 内使用转义符号,这将被视为错误。将连字符放在末尾以避免需要对其进行转义(如 [0-9.A-Za-z-]+
)。
所以,在Python中,使用
e = re.compile("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.A-Za-z-]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
在 C++ 中:
regex e("^(?:([A-Za-z]+):)?(\/{0,3})([0-9.A-Za-z-]+)(?::(\d+))?(?:\/([^?#]*))?(?:\?([^#]*))?(?:#(.*))?$");
// ...
for(int i=0; i<3; i++)
{
smatch sm;
regex_match (string_array[i],sm,e);
for (unsigned i=1; i<sm.size(); ++i) // Here, start with the second element
{
cout << "[" << sm[i] << "] ";
}
cout << endl;
}
比较Python demo and C++ demo.
正则表达式工作正常。
改变
cout << "[" << sm[i] << "] ";
和
cout << "[" << sm[i].str() << "] ";
您会看到正确的结果。