如何根据以下内容减少解析器堆栈或 'unshift' 当前标记？

Question

给定以下语言描述为：

正式：(identifier operator identifier+)*
用简单的英语来说：零个或多个操作写成标识符（左值），然后是运算符，然后是一个或多个标识符（右值）

给定任意运算符 @:

，该语言中的一系列操作示例将是

A @ B C X @ Y

空格并不重要，也可以写得更清楚：

A @ B C
X @ Y

您将如何使用类似 yacc 的 LALR 解析器解析它？

到目前为止我尝试了什么

我知道如何解析 明确分隔的 操作，比如 A @ B C ; X @ Y 但我想知道解析上述输入是否可行以及如何解析。以下是使用 Flex/Bison.

的（非功能性）最小示例

lex.l:

%{
#include "y.tab.h"
%}

%option noyywrap
%option yylineno

%%
[a-zA-Z][a-zA-Z0-9_]*   { return ID; }
@                       { return OP; }
[ \t\r\n]+              ; /* ignore whitespace */
.                       { return ERROR; } /* any other character causes parse error */
%%

yacc.y:

%{
#include <stdio.h>

extern int yylineno;
void yyerror(const char *str);
int yylex();
%}

%define parse.lac full
%define parse.error verbose

%token ID OP ERROR
%left OP

%start opdefs

%%
opright:
       | opright ID
       ;

opdef: ID OP ID opright
     ;

opdefs:
      | opdefs opdef
      ;
%%

void yyerror(const char *str) {
    fprintf(stderr, "error@%d: %s\n", yylineno, str);
}

int main(int argc, char *argv[]) {
    yyparse();
}

构建：$ flex lex.l && yacc -d yacc.y --report=all --verbose && gcc lex.yy.c y.tab.c

问题：我无法让解析器不包含下一个lvalue标识符到第一个操作的 rvalue。

$ ./a.out
A @ B C X @ Y
error@1: syntax error, unexpected OP, expecting $end or ID

上面总是解析为：reduce(A @ B reduce(C X)) @ Y

我觉得我必须以某种方式在前瞻标记上设置一个条件，说明如果它是运算符，则不应移动最后一个标识符并且应减少当前堆栈：

A @ B C X @ Y
        ^ *    // ^: current, *: lookahead
-> reduce 'A @ B C' !
-> shift 'X' !

我尝试了所有类型的运算符优先级安排，但无法正常工作。

我愿意接受不适用于 Bison 的解决方案。

Answer 1

该语言的朴素语法是 LALR(2)，而 bison 不会生成 LALR(2) 解析器。

可以机械修改任何 LALR(2) 文法以生成具有兼容解析树的 LALR(1) 文法，但我不知道有任何自动工具可以做到这一点。

手动进行转换是可能的，但很烦人，但请注意，您需要调整操作以恢复正确的解析树：

%{
  typedef struct IdList  { char* id; struct IdList* next; };
  typedef struct Def     { char* lhs; IdList* rhs; };
  typedef struct DefList { Def* def; struct DefList* next; };
%}
union {
  Def*     def;
  DefList* defs;
  char*    id;
}
%type <def>  ophead
%type <defs> opdefs
%token <id>   ID

%%

prog  : opdefs        { ->def->rhs = IdList_reverse(->def->rhs);
                        DefList_show(DefList_reverse()); }
ophead: ID '@' ID     { $$ = Def_new();
                        $$->rhs = IdList_push($$->rhs, ); } 
opdefs: ophead        { $$ = DefList_push(NULL, ); }
      | opdefs ID     { ->def->rhs = IdList_push(->def->rhs, ); }
      | opdefs ophead { ->def->rhs = IdList_reverse(->def->rhs);
                        $$ = DefList_push(, ); }

具有讽刺意味的是，这个精确的问题是 bison 本身的一部分，因为产品不需要 ; 终止符。 Bison 使用自身来生成一个解析器，它在词法分析器中解决了这个问题，而不是像上面概述的那样跳过循环。在词法分析器中，一旦找到 ID，就会继续扫描直到下一个非空白字符。如果那是一个 :，那么词法分析器 return 就是一个 identifier-definition 标记；否则，非空白字符 returned 到输入流，普通 identifier 标记 returned.

这是在词法分析器中实现它的一种方法：

%x SEEK_AT
%%
  /* See below for explanation, if needed */
  static int deferred_eof = 0;
  if (deferred_eof) { deferred_eof = 0; return 0; }
[[:alpha:]][[:alnum:]_]*  yylval = strdup(yytext); BEGIN(SEEK_AT);
[[:space:]]+              ;                /* ignore whitespace */
   /* Could be other rules here */
.                         return *yytext;  /* Let the parser handle errors */

<SEEK_AT>{
  [[:space:]]+            ;                /* ignore whitespace */
  "@"                     BEGIN(INITIAL); return ID_AT;
  .                       BEGIN(INITIAL); yyless(0); return ID;
  <EOF>                   BEGIN(INITIAL); deferred_eof = 1; return ID;
}

在SEEK_AT开始条件中，我们只对@感兴趣。如果我们找到一个，那么 ID 就是 def 的开头，并且我们 return 是正确的令牌类型。如果我们找到其他任何东西（除了空格），我们使用 yyless 和 return ID 标记类型 return 字符到输入流。请注意，yylval 已在 ID 的初始扫描中设置，因此此处无需担心。

上述代码中唯一复杂的部分是 EOF 处理。一旦检测到 EOF，就不可能将其重新插入到输入流中，无论是 yyless 还是 unputc。让扫描器再次读取 EOF 也是不合法的。所以需要全面处理。不幸的是，在 SEEK_AT 开始条件下，完全处理 EOF 需要发送两个令牌：首先是已经检测到的 ID 令牌，然后是 yyparse 将识别为结束的 0输入。没有推送解析器，我们无法从单个扫描器操作发送两个令牌，因此我们需要注册已收到 EOF 的事实，并在下一次调用扫描器时检查它。

第一条规则之前的缩进代码被插入到 yylex 函数的顶部，因此它可以声明局部变量并在扫描开始之前做任何需要做的事情。正如所写，这个词法分析器是不可重入的，但它是可重启的，因为持久状态在 if (deferred_eof) 操作中被重置。要使其可重入，您只需将 deferred_eof 放入 yystate 结构中，而不是将其设为静态本地。

Answer 2

根据 rici 的有用评论和回答，这是我想出的：

lex.l:

%{
#include "y.tab.h"
%}

%option noyywrap
%option yylineno

%%
[a-zA-Z][a-zA-Z0-9_]*   { yylval.a = strdup(yytext); return ID; }
@                       { return OP; }
[ \t\r\n]+              ; /* ignore whitespace */
.                       { return ERROR; } /* any other character causes parse error */
%%

yacc.y:

%{
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

extern int yylineno;
void yyerror(const char *str);
int yylex();


#define STR_OP    " @ "
#define STR_SPACE " "

char *concat3(const char *, const char *, const char *);

struct oplist {
    char **ops;
    size_t capacity, count;
} my_oplist = { NULL, 0, 0 };

int oplist_append(struct oplist *, char *);
void oplist_clear(struct oplist *);
void oplist_dump(struct oplist *);
%}

%union {
    char *a;
}

%define parse.lac full
%define parse.error verbose

%token ID OP END ERROR

%start input

%%

opbase: ID OP ID {
         char *s = concat3($<a>1, STR_OP, $<a>3);
         free($<a>1);
         free($<a>3);
         assert(s && "opbase: allocation failed");

         $<a>$ = s;
     }
     ;

ops: opbase {
       $<a>$ = $<a>1;
   }
   | ops opbase {
       int r = oplist_append(&my_oplist, $<a>1);
       assert(r == 0 && "ops: allocation failed");

       $<a>$ = $<a>2;
   }
   | ops ID {
       char *s = concat3($<a>1, STR_SPACE, $<a>2);
       free($<a>1);
       free($<a>2);
       assert(s && "ops: allocation failed");

       $<a>$ = s;
   }
   ;

input: ops {
         int r = oplist_append(&my_oplist, $<a>1);
         assert(r == 0 && "input: allocation failed");
     }
     ;       
%%

char *concat3(const char *s1, const char *s2, const char *s3) {
    size_t len = strlen(s1) + strlen(s2) + strlen(s3);
    char *s = malloc(len + 1);
    if (!s)
        goto concat3__end;

    sprintf(s, "%s%s%s", s1, s2, s3);

concat3__end:
    return s;
}


int oplist_append(struct oplist *oplist, char *op) {
    if (oplist->count == oplist->capacity) {  
        char **ops = realloc(oplist->ops, (oplist->capacity + 32) * sizeof(char *));
        if (!ops)
            return 1;

        oplist->ops = ops;
        oplist->capacity += 32;
    } 

    oplist->ops[oplist->count++] = op;
    return 0;
}

void oplist_clear(struct oplist *oplist) {
    if (oplist->count > 0) {
        for (size_t i = 0; i < oplist->count; ++i)
            free(oplist->ops[i]);
        oplist->count = 0;
    }

    if (oplist->capacity > 0) {
        free(oplist->ops);
        oplist->capacity = 0;
    }
}

void oplist_dump(struct oplist *oplist) {
    for (size_t i = 0; i < oplist->count; ++i)
        printf("%2zu: '%s'\n", i, oplist->ops[i]);
}


void yyerror(const char *str) {
    fprintf(stderr, "error@%d: %s\n", yylineno, str);
}

int main(int argc, char *argv[]) {
    yyparse();

    oplist_dump(&my_oplist);
    oplist_clear(&my_oplist);
}

输出 A @ B C X @ Y:

 0: 'A @ B C'
 1: 'X @ Y'

如何根据以下内容减少解析器堆栈或 'unshift' 当前标记？

How to reduce parser stack or 'unshift' the current token depending on what follows?

compiler-construction

grammar

parsing

bison