如何调试 ANTLR4 语法的多余输入(extraneous input)/输入不匹配(mismatched input)错误

How to debug ANTLR4 grammar extraneous / mismatched input error

我想解析如下所示的规则手册(rulebook)文件 "demo.rb":

rulebook Titanic-Normalization {
  version 1

  meta {
    description "Test"
    source "my-rules.xslx"
    user "joltie"
  }

  rule remove-first-line {
    description "Removes first line when offset is zero"
    when(present(offset) && offset == 0) then {
      filter-row-if-true true;
    }
  }
}

我编写了如下所示的 ANTLR4 语法文件 Rulebook.g4。目前它基本能够解析 *.rb 文件,但在遇到 "expression" / "statement" 规则时会报出意料之外的错误。

grammar Rulebook;

rulebookStatement
    :   KWRulebook
        (GeneralIdentifier | Identifier)
        '{'
        KWVersion
        VersionConstant
        metaStatement
        (ruleStatement)+
        '}'
    ;

metaStatement
    :   KWMeta
        '{'
        KWDescription
        StringLiteral
        KWSource
        StringLiteral
        KWUser
        StringLiteral
        '}'
    ;

ruleStatement
    :   KWRule
        (GeneralIdentifier | Identifier)
        '{'
        KWDescription
        StringLiteral
        whenThenStatement
        '}'
    ;

whenThenStatement
    :   KWWhen '(' expression ')'
        KWThen '{' statement '}'
    ;

primaryExpression
    :   GeneralIdentifier
    |   Identifier
    |   StringLiteral+
    |   '(' expression ')'
    ;

postfixExpression
    :   primaryExpression
    |   postfixExpression '[' expression ']'
    |   postfixExpression '(' argumentExpressionList? ')'
    |   postfixExpression '.' Identifier
    |   postfixExpression '->' Identifier
    |   postfixExpression '++'
    |   postfixExpression '--'
    ;

argumentExpressionList
    :   assignmentExpression
    |   argumentExpressionList ',' assignmentExpression
    ;

unaryExpression
    :   postfixExpression
    |   '++' unaryExpression
    |   '--' unaryExpression
    |   unaryOperator castExpression
    ;

unaryOperator
    :   '&' | '*' | '+' | '-' | '~' | '!'
    ;

castExpression
    :   unaryExpression
    |   DigitSequence // for
    ;

multiplicativeExpression
    :   castExpression
    |   multiplicativeExpression '*' castExpression
    |   multiplicativeExpression '/' castExpression
    |   multiplicativeExpression '%' castExpression
    ;

additiveExpression
    :   multiplicativeExpression
    |   additiveExpression '+' multiplicativeExpression
    |   additiveExpression '-' multiplicativeExpression
    ;

shiftExpression
    :   additiveExpression
    |   shiftExpression '<<' additiveExpression
    |   shiftExpression '>>' additiveExpression
    ;

relationalExpression
    :   shiftExpression
    |   relationalExpression '<' shiftExpression
    |   relationalExpression '>' shiftExpression
    |   relationalExpression '<=' shiftExpression
    |   relationalExpression '>=' shiftExpression
    ;

equalityExpression
    :   relationalExpression
    |   equalityExpression '==' relationalExpression
    |   equalityExpression '!=' relationalExpression
    ;

andExpression
    :   equalityExpression
    |   andExpression '&' equalityExpression
    ;

exclusiveOrExpression
    :   andExpression
    |   exclusiveOrExpression '^' andExpression
    ;

inclusiveOrExpression
    :   exclusiveOrExpression
    |   inclusiveOrExpression '|' exclusiveOrExpression
    ;

logicalAndExpression
    :   inclusiveOrExpression
    |   logicalAndExpression '&&' inclusiveOrExpression
    ;

logicalOrExpression
    :   logicalAndExpression
    |   logicalOrExpression '||' logicalAndExpression
    ;

conditionalExpression
    :   logicalOrExpression ('?' expression ':' conditionalExpression)?
    ;

assignmentExpression
    :   conditionalExpression
    |   unaryExpression assignmentOperator assignmentExpression
    |   DigitSequence // for
    ;

assignmentOperator
    :   '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='
    ;

expression
    :   assignmentExpression
    |   expression ',' assignmentExpression
    ;

statement
    :   expressionStatement
    ;

expressionStatement
    :   expression+ ';'
    ;


KWRulebook: 'rulebook';
KWVersion: 'version';
KWMeta: 'meta';
KWDescription: 'description';
KWSource: 'source';
KWUser: 'user';
KWRule: 'rule';
KWWhen: 'when';
KWThen: 'then';
KWTrue: 'true';
KWFalse: 'false';

fragment
LeftParen : '(';

fragment
RightParen : ')';

fragment
LeftBracket : '[';

fragment
RightBracket : ']';

fragment
LeftBrace : '{';

fragment
RightBrace : '}';


Identifier
    :   IdentifierNondigit
        (   IdentifierNondigit
        |   Digit
        )*
    ;

GeneralIdentifier
    :   Identifier
        ('-' Identifier)+
    ;

fragment
IdentifierNondigit
    :   Nondigit
    //|   // other implementation-defined characters...
    ;

// Dotted version number such as `1` or `1.2.3`.
// NOTE: a bare digit run like `0` matches both this rule and DigitSequence
// with the same length. ANTLR breaks such ties in favour of the lexer rule
// defined first, so the lexer emits VersionConstant here and never
// DigitSequence.
VersionConstant
    :   DigitSequence ('.' DigitSequence)*
    ;

DigitSequence
    :   Digit+
    ;

fragment
Nondigit
    :   [a-zA-Z_]
    ;

fragment
Digit
    :   [0-9]
    ;

StringLiteral
    :   '"' SCharSequence? '"'
    |   '\'' SCharSequence? '\''
    ;

fragment
SCharSequence
    :   SChar+
    ;

// One character inside a string literal.
// The backslash has to be written as `\\` *before* the `\r` / `\n` escapes:
// with only two backslashes the set would exclude the letter `r`, and the
// continuation alternatives would match a backslash followed by a letter.
fragment
SChar
    :   ~["\\\r\n]   // anything except quote, backslash, CR and LF
    |   '\\\n'       // Added line: backslash + LF line continuation
    |   '\\\r\n'     // Added line: backslash + CRLF line continuation
    ;

Whitespace
    :   [ \t]+
        -> skip
    ;

Newline
    :   (   '\r' '\n'?
        |   '\n'
        )
        -> skip
    ;

BlockComment
    :   '/*' .*? '*/'
        -> skip
    ;

LineComment
    :   '//' ~[\r\n]*
        -> skip
    ;

我用如下单元测试测试了规则手册解析器:

    public void testScanRulebookFile() throws IOException {
        String fileName = "C:\rulebooks\demo.rb";
        FileInputStream fis = new FileInputStream(fileName);
        // create a CharStream that reads from standard input
        CharStream input = CharStreams.fromStream(fis);

        // create a lexer that feeds off of input CharStream
        RulebookLexer lexer = new RulebookLexer(input);

        // create a buffer of tokens pulled from the lexer
        CommonTokenStream tokens = new CommonTokenStream(lexer);

        // create a parser that feeds off the tokens buffer
        RulebookParser parser = new RulebookParser(tokens);


        RulebookStatementContext context = parser.rulebookStatement();
//        WhenThenStatementContext context = parser.whenThenStatement();

        System.out.println(context.toStringTree(parser));

//      ParseTree tree = parser.getContext(); // begin parsing at init rule
//      System.out.println(tree.toStringTree(parser)); // print LISP-style tree
    }

对于上面的"demo.rb",解析器得到如下错误。我还将 RulebookStatementContext 打印为 toStringTree。

line 12:25 mismatched input '&&' expecting ')'
(rulebookStatement rulebook Titanic-Normalization { version 1 (metaStatement meta { description "Test" source "my-rules.xslx" user "joltie" }) (ruleStatement rule remove-first-line { description "Removes first line when offset is zero" (whenThenStatement when ( (expression (assignmentExpression (conditionalExpression (logicalOrExpression (logicalAndExpression (inclusiveOrExpression (exclusiveOrExpression (andExpression (equalityExpression (relationalExpression (shiftExpression (additiveExpression (multiplicativeExpression (castExpression (unaryExpression (postfixExpression (postfixExpression (primaryExpression present)) ( (argumentExpressionList (assignmentExpression (conditionalExpression (logicalOrExpression (logicalAndExpression (inclusiveOrExpression (exclusiveOrExpression (andExpression (equalityExpression (relationalExpression (shiftExpression (additiveExpression (multiplicativeExpression (castExpression (unaryExpression (postfixExpression (primaryExpression offset))))))))))))))))) ))))))))))))))))) && offset == 0 ) then { filter-row-if-true true ;) }) })

为了调试这个问题,我还编写了单元测试,用 "when (offset == 0) then {\n" + "filter-row-if-true true;\n" + "}\n" 这样的简短输入来测试。但它仍然出现如下错误:

line 1:16 mismatched input '0' expecting {'(', '++', '--', '&&', '&', '*', '+', '-', '~', '!', Identifier, GeneralIdentifier, DigitSequence, StringLiteral}
line 2:19 extraneous input 'true' expecting {'(', '++', '--', '&&', '&', '*', '+', '-', '~', '!', ';', Identifier, GeneralIdentifier, DigitSequence, StringLiteral}

经过两天的尝试,我仍然没有任何进展。这个问题写得比较长,希望有人能给我一些建议:如何调试 ANTLR4 语法的多余输入(extraneous input)/输入不匹配(mismatched input)错误。

我不知道是否有更高级的方法来调试语法/解析器(grammar/parser),但我通常是这样做的:

  1. 将导致问题的输入缩减到尽可能少的字符。

  2. 尽可能精简你的语法,同时保证它在相应的输入上仍然产生相同的错误(大多数情况下,这意味着复用原语法中的规则并尽量简化,为缩减后的输入编写一个最小语法)。

  3. 确保词法分析器正确地切分了输入(ANTLRWorks 中显示词法分析器输出的功能对此非常有用)。

  4. 查看解析树(ParseTree)。ANTLR 的 TestRig 可以以图形方式显示解析树(您可以通过 ANTLRWorks 或 ANTLR 的 TreeViewer 使用此功能),这样您就能看出解析器对输入的解释与您的预期在哪里不同。

  5. “手工”(by hand)进行解析。也就是说,拿着自己的语法,亲自一步一步地处理输入,在此过程中尽量不要运用任何逻辑、假设或已有知识等,而是像计算机一样严格按照语法来执行。对所采取的每一步都提出质疑(是否存在另一种匹配输入的方式),并始终尝试用不同于您期望的解析方式去匹配输入。

尝试修复最小语法中的错误,然后将解决方案迁移到您的真实语法中。

以下是为修复解析错误而更新后的 g4 文件:

grammar Rulebook;       

@header {
package com.someone.commons.rulebook.parser;
}

rulebookStatement
    :   KWRulebook
        (GeneralIdentifier | Identifier)
        '{'
        KWVersion
        VersionConstant
        metaStatement
        (ruleStatement)+
        '}'
    ;

metaStatement
    :   KWMeta
        '{'
        KWDescription
        StringLiteral
        KWSource
        StringLiteral
        KWUser
        StringLiteral
        '}'
    ;

ruleStatement
    :   KWRule
        (GeneralIdentifier | Identifier)
        '{'
        KWDescription
        StringLiteral
        whenThenStatement
        '}'
    ;

whenThenStatement
    :   KWWhen '(' expression ')'
        KWThen '{' (statement)* '}'
    ;

// Atoms of the expression grammar: identifiers, adjacent string literals,
// numeric literals, and parenthesised / bracketed sub-expressions.
//
// VersionConstant is accepted as a numeric literal because the lexer rule
// VersionConstant is defined before Constant and therefore wins every
// equal-length match: plain digit runs such as `0` or `12` and dotted forms
// such as `1.5` are emitted as VersionConstant. (`0` cannot be a Constant at
// all, since DecimalConstant requires a non-zero first digit.) Without this
// alternative an input like `offset == 0` is rejected.
primaryExpression
    :   GeneralIdentifier
    |   Identifier
    |   StringLiteral+
    |   Constant
    |   VersionConstant
    |   '(' expression ')'
    |   '[' expression ']'
    ;

postfixExpression
    :   primaryExpression
    |   postfixExpression '[' expression ']'
    |   postfixExpression '(' argumentExpressionList? ')'
    |   postfixExpression '.' Identifier
    |   postfixExpression '->' Identifier
    |   postfixExpression '++'
    |   postfixExpression '--'
    ;

argumentExpressionList
    :   assignmentExpression
    |   argumentExpressionList ',' assignmentExpression
    ;

unaryExpression
    :   postfixExpression
    |   '++' unaryExpression
    |   '--' unaryExpression
    |   unaryOperator castExpression
    ;

unaryOperator
    :   '&' | '*' | '+' | '-' | '~' | '!'
    ;

castExpression
    :   unaryExpression
    ;

multiplicativeExpression
    :   castExpression
    |   multiplicativeExpression '*' castExpression
    |   multiplicativeExpression '/' castExpression
    |   multiplicativeExpression '%' castExpression
    ;

additiveExpression
    :   multiplicativeExpression
    |   additiveExpression '+' multiplicativeExpression
    |   additiveExpression '-' multiplicativeExpression
    ;

shiftExpression
    :   additiveExpression
    |   shiftExpression '<<' additiveExpression
    |   shiftExpression '>>' additiveExpression
    ;

relationalExpression
    :   shiftExpression
    |   relationalExpression '<' shiftExpression
    |   relationalExpression '>' shiftExpression
    |   relationalExpression '<=' shiftExpression
    |   relationalExpression '>=' shiftExpression
    ;

equalityExpression
    :   relationalExpression
    |   equalityExpression '==' relationalExpression
    |   equalityExpression '!=' relationalExpression
    ;

andExpression
    :   equalityExpression
    |   andExpression '&' equalityExpression
    ;

exclusiveOrExpression
    :   andExpression
    |   exclusiveOrExpression '^' andExpression
    ;

inclusiveOrExpression
    :   exclusiveOrExpression
    |   inclusiveOrExpression '|' exclusiveOrExpression
    ;

logicalAndExpression
    :   inclusiveOrExpression
    |   logicalAndExpression '&&' inclusiveOrExpression
    ;

logicalOrExpression
    :   logicalAndExpression
    |   logicalOrExpression '||' logicalAndExpression
    ;

conditionalExpression
    :   logicalOrExpression ('?' expression? ':' conditionalExpression)?
    ;

assignmentExpression
    :   conditionalExpression
    |   unaryExpression assignmentOperator assignmentExpression
    ;

assignmentOperator
    :   '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='
    ;

expression
    :   assignmentExpression
    |   expression ',' assignmentExpression
    ;

statement
    :   expressionStatement
    ;

expressionStatement
    :   expression+ ';'
    ;


KWRulebook: 'rulebook';
KWVersion: 'version';
KWMeta: 'meta';
KWDescription: 'description';
KWSource: 'source';
KWUser: 'user';
KWRule: 'rule';
KWWhen: 'when';
KWThen: 'then';

Identifier
    :   IdentifierNondigit
        (   IdentifierNondigit
        |   Digit
        )*
    ;

// Identifier that may continue with '-', '.', letters/underscore or digits,
// e.g. `remove-first-line` or `Titanic-Normalization`.
// NOTE: because the tail group is `*`, every plain Identifier also matches
// this rule; Identifier is defined first, so it wins those equal-length ties.
// NOTE: the lexer prefers the longest match, so text such as `a.b` or
// `offset-1` is emitted as a single GeneralIdentifier token rather than as
// separate identifier / operator tokens.
GeneralIdentifier
    :   Identifier
        (   '-' 
        |   '.'
        |   IdentifierNondigit
        |   Digit
        )*
    ;

fragment
IdentifierNondigit
    :   Nondigit
    //|   // other implementation-defined characters...
    ;

VersionConstant
    :   DigitSequence ('.' DigitSequence)*
    ;

Constant
    :   IntegerConstant
    |   FloatingConstant
    ;

fragment
IntegerConstant
    :   DecimalConstant
    ;

fragment
DecimalConstant
    :   NonzeroDigit Digit*
    ;

fragment
FloatingConstant
    :   DecimalFloatingConstant
    ;

fragment
DecimalFloatingConstant
    :   FractionalConstant
    ;

fragment
FractionalConstant
    :   DigitSequence? '.' DigitSequence
    |   DigitSequence '.'
    ;

fragment
DigitSequence
    :   Digit+
    ;

fragment
Nondigit
    :   [a-zA-Z_]
    ;

fragment
Digit
    :   [0-9]
    ;

fragment
NonzeroDigit
    :   [1-9]
    ;

StringLiteral
    :   '"' SCharSequence? '"'
    |   '\'' SCharSequence? '\''
    ;

fragment
SCharSequence
    :   SChar+
    ;

// One character inside a string literal.
// The backslash has to be written as `\\` *before* the `\r` / `\n` escapes:
// with only two backslashes the set would exclude the letter `r`, and the
// continuation alternatives would match a backslash followed by a letter.
fragment
SChar
    :   ~["\\\r\n]   // anything except quote, backslash, CR and LF
    |   '\\\n'       // Added line: backslash + LF line continuation
    |   '\\\r\n'     // Added line: backslash + CRLF line continuation
    ;

Whitespace
    :   [ \t]+
        -> skip
    ;

Newline
    :   (   '\r' '\n'?
        |   '\n'
        )
        -> skip
    ;

BlockComment
    :   '/*' .*? '*/'
        -> skip
    ;

LineComment
    :   '//' ~[\r\n]*
        -> skip
    ;

除了 Raven 的回答之外,我还使用了适用于 IntelliJ 12+ 的 ANTLR 4 插件,它为我调试语法节省了很多精力。我的语法里有一个我一直没能找到的非常简单的错误(在浮点数规则中写了未转义的点 . 而不是 '.')。这个工具允许选择语法中的任意解析器规则,用输入对其进行测试,并以图形方式显示解析树。在我开始寻找调试语法的方法之前,我一直没有注意到它有这个非常有用的功能。强烈推荐。