Yacc 语法没有正确减少输入的最后一个文字

Question

这是一个为算术表达式生成3地址码的代码。

我面临的问题是我的语法能够正确读取输入直到最后一个文字。但无法减少 '\n'

之前的最后一个文字

示例：x = 1 + 2 * 3 - 4
1,2,3 被正确阅读。甚至减号。当它读取 4 时，它能够将语法减少 2 步。然后报错。

我已经把打印语句求助了。这是整个代码。读取最后一个文字时出现问题：

x -> IDEN | NUM | ....  
f -> x  
t -> f (This doesn't happen)

icg.y

%{
    #include<stdio.h>
    #include<stdlib.h>
    int temp_reg_num = 0;
    void print_three_address_code(char* text1,char oper,char* text2)
    {
        printf("t%d = %s %c %s\n",temp_reg_num,text1,oper,text2);
    }   
%}

%token IDEN, NUM, FLOAT 
%union {
    char* text;
}

%left '+' '-'
%left '*' '/'

%%
s : IDEN '=' e '\n' {
    printf("start\n");
    if(.text == NULL)
        printf("%s = t%d\n",.text,temp_reg_num-1);
    else
        printf("%s = %s\n",.text,.text);
    return 0;
}
;
e : e '+' t {
    printf("e -> e + t\n");
    char temp[5];
    print_three_address_code(.text,'+',.text);
    sprintf(temp,"t%d",temp_reg_num++);
    $$.text = copy_string(temp);
}
  | e '-' t {
    printf("e -> e - t\n");
    char temp[5];
    print_three_address_code(.text,'-',.text);
    sprintf(temp,"t%d",temp_reg_num++);
    $$.text = copy_string(temp);
}
  | t {
    printf("e -> t\n");
    $$.text = copy_string(.text);
}
;
t : t '*' f {
    printf("t -> t * f\n");
    char temp[5];
    print_three_address_code(.text,'*',.text);
    sprintf(temp,"t%d",temp_reg_num++);
    $$.text = copy_string(temp);
}
  | t '/' f {
    printf("t -> t / f\n");
    char temp[5];
    print_three_address_code(.text,'/',.text);
    sprintf(temp,"t%d",temp_reg_num++);
    $$.text = copy_string(temp);
}
  | f {
    printf("t -> f\n");
    $$.text = copy_string(.text);
}
;
f : f '^' x {
    printf("f -> f ^ x\n");
    char temp[5];
    print_three_address_code(.text,'^',.text);
    sprintf(temp,"t%d",temp_reg_num++);
    $$.text = copy_string(temp);
}
  | x {
    printf("f -> x\n");
    $$.text = copy_string(.text);
    printf("Why syntax error??");
}
;
x : '(' s ')' {}
      | NUM {
    printf("x -> NUM\n");
    $$.text = copy_string(.text);
}
      | IDEN {
    printf("x -> IDEN\n");
    $$.text = copy_string($$.text,.text);
}
      | FLOAT {
    printf("x -> FLOAT\n");
    $$.text = copy_string(.text);
}
      | '-' x {
    printf("x -> - x\n");
    $$.text = (char*)malloc(sizeof(char)*(strlen(.text)+2));
    $$.text[0] = '-';
    strcat($$.text,.text);
}
;
%%

main()
{
    printf("Enter your expression : ");
    yyparse();
}
yyerror()
{
    printf("Syntax Error\n");
}
yywrap()
{
    return 1;
}

icg.l

%{
    #include<stdio.h>
    #include<stdlib.h>
    #include"y.tab.h"
    char* copy_string(char* fromstring)  
    {
        char* tostring = (char*)malloc(sizeof(char)*(strlen(fromstring)+1));  
        strcpy(tostring,fromstring);  
        return tostring;  
    }
%}

iden [a-zA-Z_][a-zA-Z_]*  
%%  
[0-9]+ {  
    yylval.text = copy_string(yytext);  
    return NUM;  
}
[0-9]+[.][0-9]+ {  
    yylval.text = copy_string(yytext);  
    return FLOAT;  
}
{iden} {  
    yylval.text = copy_string(yytext);  
    return IDEN;  
}  
[+] {  
    return '+';  
}  
[-] {  
    return '-';  
}  
[*] {  
    return '*';  
}  
[/] {  
    return '/';  
}  
['^'] {  
    return '^';  
}  
[=] {  
    return '=';  
}  
'\n' {  
    return '\n';  
}  
. {}  
%%

Answer 1

由于这条规则，您遇到了语法错误：

'\n' { return '\n'; }

Flex 不认为 ' 有任何特殊之处，因此该规则将（仅）匹配三个字符序列 'ENTER '。这不会出现在您的输入中，因此需要 \n 的规则不会成功，并且当您提供 end-of-file 时（或者文件结束，如果您正在从文件中读取），然后你会得到一个语法错误。

所以你应该写的是：

\n { return '\n'; }

此外，

['^']

不只匹配一个^。它还匹配 '，因为如上所述，' 没有任何特殊意义，因此字符 class 由三个字符组成，其中一个重复。要精确匹配字符 ^，请使用 double-quotes（特殊字符）：

"^"

（但是，"\n" 将不起作用，除非您想要匹配的恰好是一个 \ 后跟一个 n。您想要的只是 \n .)

您实际上可以将所有这些单字符规则简化为一条规则：

[-+*/^()=\n]   { return yytext[0]; }

我包括了 ( 和 )，尽管它们不在您的 icg.l 文件中，因为您在语法中使用了它们。

但是，您使用它们的规则不正确：

x : '(' s ')' {}

因为 s 是一个赋值。这将允许

x = 1 + (y = 2 * 3) - 4

但不允许更正常

x = 1 + (2 * 3) - 4

你想要的是：

x : '(' e ')' { $$ = ; }

您的语法中还有其他各种错误，包括：

缺少 #include <string.h> header（对于 strlen）
缺少 yylex、yyerror 和 copy_string
main 和 yyerror 的原型不正确（或者更准确地说，是过时的）。 "Modern C"——在过去 30 年左右——更喜欢显式 return 类型，因为 C99 (1999) 需要它们。
copy_string 的一个实例，带有两个参数而不是一个参数，这是未定义的行为。（如果您为 copy_string 提供声明，它会被标记为错误。）

Berkeley yacc (byacc) 在没有投诉的情况下处理了您的 icg.y 文件，但 bison 即使在 yacc 兼容模式下也不会处理它。问题是你使用了 .text 并且你没有声明你的终端和 non-terminals 的标签。如果您要声明 union 类型，您应该为您的终端提供标签声明：

%token <text> NUM IDEN FLOAT

和您的non-terminals（或至少具有语义值的那些）：

%type <text> s e t f x

然后您可以从您的操作中删除所有 .text 选择器，因为 bison（甚至 byacc）将知道要使用哪个选择器。声明令牌标记使您的解析器 type-safe，或至少更少 type-unsafe，并且通常也更具可读性。

最后，您需要进行内存管理。您永远不会 free() 在 string_copy 中分配的任何字符串，因此您逐渐积累了相当多的泄漏内存。此外，在很多情况下，您会在不需要的情况下进行复制；例如，在单位规则中：

x : NUM { $$.text = copy_string(.text); }

副本完全没有必要，因为 </code> 即将从解析器堆栈中弹出，并且它引用的字符串将永远不会再次使用。所以你只是不必要地泄露了一份副本。会更干净</p> <pre><code>x : NUM { $$ = ; }

但这实际上是不必要的，因为它是默认操作。

更改单元制作不会阻止其他内存泄漏；在实际上对语义值做一些事情而不是传递它们的作品中，如果您不再需要它们，您仍然需要手动 free 复制的字符串（这似乎是所有情况你的语义动作）。或者，如果以后需要它们，您将需要弄清楚如何释放它们。

您可以使用 yacc 或 bison 的 built-in 跟踪功能，而不是在您的程序中塞满 printf，稍后您必须将其删除。当你运行 yacc 或添加

时，只需提供 -t 选项

#define YYDEBUG 1

到 icg.y 文件中的“%{...%}”块。然后修改你的main函数：

int main() {
  yydebug = 1;
  return yyparse();
}

这会准确地告诉您输入被解析时发生了什么，包括从您的扫描仪接收到哪些终端。

这是一个完整的示例，显示了一种内存管理方法。它基于您的代码，但进行了一定程度的重构。请注意，解析器从不复制字符串，但 emit_tac 会分配一个新字符串，并释放作为参数传递的字符串。（某些观察者可能认为这很粗鲁。）现代 flex 生成的扫描器需要在 main 中调用 yylex_destroy()，以释放词法分析器分配的资源。 bison-specific %destructor 可防止出现语法错误时发生内存泄漏：如果您不使用 bison，则必须针对该问题找到不同的解决方案。

icg.y

%{
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    int yylex();
    int yylex_destroy();
    void yyerror(const char* msg);

    int temp_reg_num = 0;
    char* emit_tac(char* text1, char oper, char* text2) {
        char* rv = malloc(8);
        snprintf(rv, 7, "t%d", temp_reg_num++);
        printf("%s = %s %c %s\n", rv, text1 ? text1 : "", oper, text2);
        free(text1); free(text2);
        return rv;
    }
%}

%union {
    char* text;
}
%token <text> IDEN NUMBER
%type <text> e t f x
%destructor { free($$); } <text>

%%
s : IDEN '=' e '\n' { printf("%s = %s\n", , );
                      free(); free();
                    }
e : e '+' t         { $$ = emit_tac(,'+',); }
  | e '-' t         { $$ = emit_tac(,'-',); }
  | t
t : t '*' f         { $$ = emit_tac(,'*',); }
  | t '/' f         { $$ = emit_tac(,'/',); }
  | f
f : f '^' x         { $$ = emit_tac(,'^',); }
  | x 
x : IDEN 
  | NUMBER
  | '-' x           { $$ = emit_tac(NULL, '-', ); }
  | '(' e ')'       { $$ = ; }
%%

int main() {
    int rc = yyparse();
    yylex_destroy();
    return rc;
}

void yyerror(const char* msg) {
    printf("%s\n", msg);
}

icg.l（需要弹性）

%{
    /* Change to y.tab.h if you use yacc */
    #include "icg.tab.h"
    char* copy_yytext() {
        char* tostring = malloc(yyleng + 1);
        memcpy(tostring, yytext, yyleng);
        tostring[yyleng] = 0;
        return tostring;  
    }
%}
%option noinput nounput noyywrap nodefault

%%

\n                           { return '\n'; }
[[:space:]]                  /* Ignore */
[[:digit:]]+(\.[[:digit]]*)? { yylval.text = copy_yytext(); return NUMBER;  }
[[:alpha:]_][[:alnum:]_]*    { yylval.text = copy_yytext(); return IDEN; } 
.                            { return yytext[0]; }

编译

$ bison -d icg.y
$ flex icg.l
$ gcc -Wall -o icg lex.yy.c icg.tab.c

用 valgr 测试nd 证明没有内存泄漏

$ valgrind --leak-check=full ./icg <<< ' x = 1 + 2 - 3 / 4 ^ 5 * ( 6 - 9 )'
==26225== Memcheck, a memory error detector
==26225== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
==26225== Using Valgrind-3.10.1 and LibVEX; rerun with -h for copyright info
==26225== Command: ./icg
==26225== 
t0 = 1 + 2
t1 = 4 ^ 5
t2 = 3 / t1
t3 = 6 - 9
t4 = t2 * t3
t5 = t0 - t4
x = t5
==26225== 
==26225== HEAP SUMMARY:
==26225==     in use at exit: 0 bytes in 0 blocks
==26225==   total heap usage: 17 allocs, 17 frees, 16,530 bytes allocated
==26225== 
==26225== All heap blocks were freed -- no leaks are possible
==26225== 
==26225== For counts of detected and suppressed errors, rerun with: -v
==26225== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)

Yacc 语法没有正确减少输入的最后一个文字

Yacc Grammar not reducing the last literal of the input correctly

grammar

yacc

lex

icg.y

icg.l（需要弹性）

编译

用 valgr 测试nd 证明没有内存泄漏