如何根据空格标记特殊字符(< > | & 等)
How to tokenize special characters depending on whitespace (< > | & etc.)
我发现几年前完成的一个项目 here 可以进行一些简单的命令行解析。虽然我真的很喜欢它的功能,但它不支持解析特殊字符,例如 <、>、& 等。我继续尝试添加一些功能来专门解析这些字符,方法是添加一些与现有代码相同的条件用于查找空格、转义字符和引号:
bool _isQuote(char c) {
if (c == '\"')
return true;
else if (c == '\'')
return true;
return false;
}
bool _isEscape(char c) {
if (c == '\')
return true;
return false;
}
bool _isWhitespace(char c) {
if (c == ' ')
return true;
else if(c == '\t')
return true;
return false;
}
.
.
.
我添加的内容:
bool _isLeftCarrot(char c) {
if (c == '<')
return true;
return false;
}
bool _isRightCarrot(char c) {
if (c == '>')
return true;
return false;
}
其余特殊字符依此类推。
我也尝试了与 parse
方法中现有代码相同的方法:
std::list<string> parse(const std::string& args) {
std::stringstream ain(args); // iterates over the input string
ain >> std::noskipws; // ensures not to skip whitespace
std::list<std::string> oargs; // list of strings where we will store the tokens
std::stringstream currentArg("");
currentArg >> std::noskipws;
// current state
enum State {
InArg, // scanning the string currently
InArgQuote, // scanning the string that started with a quote currently
OutOfArg // not scanning the string currently
};
State currentState = OutOfArg;
char currentQuoteChar = '[=12=]'; // used to differentiate between ' and "
// ex. "sample'text"
char c;
std::stringstream ss;
std::string s;
// iterate character by character through input string
while(!ain.eof() && (ain >> c)) {
// if current character is a quote
if(_isQuote(c)) {
switch(currentState) {
case OutOfArg:
currentArg.str(std::string());
case InArg:
currentState = InArgQuote;
currentQuoteChar = c;
break;
case InArgQuote:
if (c == currentQuoteChar)
currentState = InArg;
else
currentArg << c;
break;
}
}
// if current character is whitespace
else if (_isWhitespace(c)) {
switch(currentState) {
case InArg:
oargs.push_back(currentArg.str());
currentState = OutOfArg;
break;
case InArgQuote:
currentArg << c;
break;
case OutOfArg:
// nothing
break;
}
}
// if current character is escape character
else if (_isEscape(c)) {
switch(currentState) {
case OutOfArg:
currentArg.str(std::string());
currentState = InArg;
case InArg:
case InArgQuote:
if (ain.eof())
{
currentArg << c;
throw(std::runtime_error("Found Escape Character at end of file."));
}
else {
char c1 = c;
ain >> c;
if (c != '\"')
currentArg << c1;
ain.unget();
ain >> c;
currentArg << c;
}
break;
}
}
我在parse
方法中添加的内容:
// if current character is left carrot (<)
else if(_isLeftCarrot(c)) {
// convert from char to string and push onto list
ss << c;
ss >> s;
oargs.push_back(s);
}
// if current character is right carrot (>)
else if(_isRightCarrot(c)) {
ss << c;
ss >> s;
oargs.push_back(s);
}
.
.
.
else {
switch(currentState) {
case InArg:
case InArgQuote:
currentArg << c;
break;
case OutOfArg:
currentArg.str(std::string());
currentArg << c;
currentState = InArg;
break;
}
}
}
if (currentState == InArg) {
oargs.push_back(currentArg.str());
s.clear();
}
else if (currentState == InArgQuote)
throw(std::runtime_error("Starting quote has no ending quote."));
return oargs;
}
parse
将 return 标记的字符串列表。
但是,当特殊字符附加到输入的末尾时,我 运行 遇到了特定测试用例的问题。例如,输入
foo-bar&
将 return 这个列表:[{&},{foo-bar}]
而不是我想要的:[{foo-bar},{&}]
我正在努力解决这个问题。我是 C++ 的新手,所以任何建议和一些解释都会很有帮助。
当您处理其中一个角色时,您需要做与原始代码在遇到 space 时所做的相同的事情。您需要查看 currentState
,然后如果您在一个参数的中间,请保存当前参数(并重置它,因为您不再在一个参数中)。
我发现几年前完成的一个项目 here 可以进行一些简单的命令行解析。虽然我真的很喜欢它的功能,但它不支持解析特殊字符,例如 <、>、& 等。我继续尝试添加一些功能来专门解析这些字符,方法是添加一些与现有代码相同的条件用于查找空格、转义字符和引号:
bool _isQuote(char c) {
if (c == '\"')
return true;
else if (c == '\'')
return true;
return false;
}
bool _isEscape(char c) {
if (c == '\')
return true;
return false;
}
bool _isWhitespace(char c) {
if (c == ' ')
return true;
else if(c == '\t')
return true;
return false;
}
.
.
.
我添加的内容:
bool _isLeftCarrot(char c) {
if (c == '<')
return true;
return false;
}
bool _isRightCarrot(char c) {
if (c == '>')
return true;
return false;
}
其余特殊字符依此类推。
我也尝试了与 parse
方法中现有代码相同的方法:
std::list<string> parse(const std::string& args) {
std::stringstream ain(args); // iterates over the input string
ain >> std::noskipws; // ensures not to skip whitespace
std::list<std::string> oargs; // list of strings where we will store the tokens
std::stringstream currentArg("");
currentArg >> std::noskipws;
// current state
enum State {
InArg, // scanning the string currently
InArgQuote, // scanning the string that started with a quote currently
OutOfArg // not scanning the string currently
};
State currentState = OutOfArg;
char currentQuoteChar = '[=12=]'; // used to differentiate between ' and "
// ex. "sample'text"
char c;
std::stringstream ss;
std::string s;
// iterate character by character through input string
while(!ain.eof() && (ain >> c)) {
// if current character is a quote
if(_isQuote(c)) {
switch(currentState) {
case OutOfArg:
currentArg.str(std::string());
case InArg:
currentState = InArgQuote;
currentQuoteChar = c;
break;
case InArgQuote:
if (c == currentQuoteChar)
currentState = InArg;
else
currentArg << c;
break;
}
}
// if current character is whitespace
else if (_isWhitespace(c)) {
switch(currentState) {
case InArg:
oargs.push_back(currentArg.str());
currentState = OutOfArg;
break;
case InArgQuote:
currentArg << c;
break;
case OutOfArg:
// nothing
break;
}
}
// if current character is escape character
else if (_isEscape(c)) {
switch(currentState) {
case OutOfArg:
currentArg.str(std::string());
currentState = InArg;
case InArg:
case InArgQuote:
if (ain.eof())
{
currentArg << c;
throw(std::runtime_error("Found Escape Character at end of file."));
}
else {
char c1 = c;
ain >> c;
if (c != '\"')
currentArg << c1;
ain.unget();
ain >> c;
currentArg << c;
}
break;
}
}
我在parse
方法中添加的内容:
// if current character is left carrot (<)
else if(_isLeftCarrot(c)) {
// convert from char to string and push onto list
ss << c;
ss >> s;
oargs.push_back(s);
}
// if current character is right carrot (>)
else if(_isRightCarrot(c)) {
ss << c;
ss >> s;
oargs.push_back(s);
}
.
.
.
else {
switch(currentState) {
case InArg:
case InArgQuote:
currentArg << c;
break;
case OutOfArg:
currentArg.str(std::string());
currentArg << c;
currentState = InArg;
break;
}
}
}
if (currentState == InArg) {
oargs.push_back(currentArg.str());
s.clear();
}
else if (currentState == InArgQuote)
throw(std::runtime_error("Starting quote has no ending quote."));
return oargs;
}
parse
将 return 标记的字符串列表。
但是,当特殊字符附加到输入的末尾时,我 运行 遇到了特定测试用例的问题。例如,输入
foo-bar&
将 return 这个列表:[{&},{foo-bar}]
而不是我想要的:[{foo-bar},{&}]
我正在努力解决这个问题。我是 C++ 的新手,所以任何建议和一些解释都会很有帮助。
当您处理其中一个角色时,您需要做与原始代码在遇到 space 时所做的相同的事情。您需要查看 currentState
,然后如果您在一个参数的中间,请保存当前参数(并重置它,因为您不再在一个参数中)。