Translating Java to Python in the ANTLR4 Python3 target

I generated a Python target with ANTLR4 from the Python3.g4 grammar file in the ANTLR grammars repository. The generated Python3Lexer.py file contains Java code, which I need to convert to Python. Here are the two Java sections it emits; you can also find them in the Python3 grammar file itself:

// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();

// The stack that keeps track of the indentation level.
private java.util.Stack<Integer> indents = new java.util.Stack<>();

// The amount of opened braces, brackets and parenthesis.
private int opened = 0;

// The most recently produced token.
private Token lastToken = null;

@Override
public void emit(Token t) {
    super.setToken(t);
    tokens.offer(t);
}

@Override
public Token nextToken() {

    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if (_input.LA(1) == EOF && !this.indents.isEmpty()) {

        // Remove any trailing EOF tokens from our buffer.
        for (int i = tokens.size() - 1; i >= 0; i--) {
            if (tokens.get(i).getType() == EOF) {
                tokens.remove(i);
            }
        }

        // First emit an extra line break that serves as the end of the statement.
        this.emit(commonToken(Python3Parser.NEWLINE, "\n"));

        // Now emit as much DEDENT tokens as needed.
        while (!indents.isEmpty()) {
            this.emit(createDedent());
            indents.pop();
        }

        // Put the EOF back on the token stream.
        this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
    }

    Token next = super.nextToken();

    if (next.getChannel() == Token.DEFAULT_CHANNEL) {
        // Keep track of the last token on the default channel.
        this.lastToken = next;
    }

    return tokens.isEmpty() ? next : tokens.poll();
}

private Token createDedent() {
    CommonToken dedent = commonToken(Python3Parser.DEDENT, "");
    dedent.setLine(this.lastToken.getLine());
    return dedent;
}

private CommonToken commonToken(int type, String text) {
    int stop = this.getCharIndex() - 1;
    int start = text.isEmpty() ? stop : stop - text.length() + 1;
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}

static int getIndentationCount(String spaces) {

    int count = 0;

    for (char ch : spaces.toCharArray()) {
      switch (ch) {
        case '\t':
          count += 8 - (count % 8);
          break;
        default:
          // A normal space char.
          count++;
      }
    }

    return count;
}

boolean atStartOfInput() {
    return super.getCharPositionInLine() == 0 && super.getLine() == 1;
}

The second section comes from the NEWLINE lexer rule's embedded action:

String newLine = getText().replaceAll("[^\r\n\f]+", "");
String spaces = getText().replaceAll("[\r\n\f]+", "");
int next = _input.LA(1);

if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
   // If we're inside a list or on a blank line, ignore all indents,
   // dedents and line breaks.
   skip();
}
else {
   emit(commonToken(NEWLINE, newLine));

   int indent = getIndentationCount(spaces);
   int previous = indents.isEmpty() ? 0 : indents.peek();

   if (indent == previous) {
       // skip indents of the same size as the present indent-size
       skip();
   }
   else if (indent > previous) {
       indents.push(indent);
       emit(commonToken(Python3Parser.INDENT, spaces));
   }
   else {
       // Possibly emit more than 1 DEDENT token.
       while(!indents.isEmpty() && indents.peek() > indent) {
           this.emit(createDedent());
           indents.pop();
       }
   }
}

I translated these myself as follows:

# A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
tokens = deque()

# The stack that keeps track of the indentation level.
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks
indents = []

# The amount of opened braces, brackets and parenthesis.
opened = 0

# The most recently produced token.
lastToken = None

def emit(self, t):
  self._token = t
  self.tokens.append(t)

def nextToken(self):

  # Check if the end-of-file is ahead and there are still some DEDENTS expected.
  if self._input.LA(1) == Token.EOF and self.indents.size() != 0:

    # Remove any trailing EOF tokens from our buffer.
    for i in range(tokens.size() - 1, 0, -1):
      if self.tokens[i].getType() == Token.EOF:
        self.tokens.remove(i)

    # First emit an extra line break that serves as the end of the statement.
    self.emit(commonToken(Python3Parser.NEWLINE, "\n"))

    # Now emit as much DEDENT tokens as needed.
    while self.indents.size() != 0:
      self.emit(createDedent())
      self.indents.pop()

    # Put the EOF back on the token stream.
    self.emit(commonToken(Python3Parser.EOF, "<EOF>"))

  next = self.nextToken()

  if next.getChannel() == Token.DEFAULT_CHANNEL:
    # Keep track of the last token on the default channel.
    self.lastToken = next

  return next if self.tokens.size() == 0 else self.tokens.popleft()

def createDedent():
  dedent = commonToken(Python3Parser.DEDENT, "")
  dedent.setLine(self.lastToken.getLine())
  return dedent

def commonToken(self, type, text):
  stop = self.getCharIndex() - 1
  start = stop if text.size() == 0 else stop - text.size() + 1
  return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop)

def getIndentationCount(spaces):

  count = 0

  for ch in spaces:
    if ch == '\t':
        count += 8 - (count % 8)
        break
    else:
        # A normal space char.
        count = count + 1

  return count

def atStartOfInput(self):
  return self.getCharPositionInLine() == 0 and self.getLine() == 1

And my translation of the NEWLINE action:

newLine = getText().replaceAll("[^\r\n\f]+", "")
spaces = getText().replaceAll("[\r\n\f]+", "")
next = self._input.LA(1)

if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
    # If we're inside a list or on a blank line, ignore all indents,
    # dedents and line breaks.
    skip()
else:
    emit(commonToken(NEWLINE, newLine))

indent = getIndentationCount(spaces)
previous = 0 if indents.isEmpty() else indents.peek()

if indent == previous:
    # skip indents of the same size as the present indent-size
    skip()
elif indent > previous:
    indents.push(indent)
    emit(commonToken(Python3Parser.INDENT, spaces))
else:
    # Possibly emit more than 1 DEDENT token.
    while not indents.isEmpty() and indents.peek() > indent:
        self.emit(createDedent())
        indents.pop()

Here is my Python script that runs the ANTLR output, which now contains the Python snippets instead of the Java ones. It is run with the command python main.py test.py:

import sys
from antlr4 import *
from Python3Lexer import Python3Lexer
from Python3Parser import Python3Parser
from Python3Listener import Python3Listener

class FuncPrinter(Python3Listener):
  def enterFuncdef(self, ctx):
    print("Oh, a func")

def main(argv):
  input = FileStream(argv[1])
  lexer = Python3Lexer(input)
  stream = CommonTokenStream(lexer)
  parser = Python3Parser(stream)
  tree = parser.funcdef()

  printer = FuncPrinter()
  walker = ParseTreeWalker()
  walker.walk(printer, tree)

if __name__ == '__main__':
  main(sys.argv)

It errors out and prints the following traceback:

Traceback (most recent call last):
  File "main.py", line 24, in <module>
    main(sys.argv)
  File "main.py", line 17, in main
    tree = parser.parameters()
  File "...\antler-test\Python3Parser.py", line 1297, in parameters
    self.enterRule(localctx, 14, self.RULE_parameters)
  File "...\antler-test\antlr4\Parser.py", line 358, in enterRule
    self._ctx.start = self._input.LT(1)
  File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT
    self.lazyInit()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit
    self.setup()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup
    self.sync(0)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync
    fetched = self.fetch(n)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch
    t = self.tokenSource.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  [Previous line repeated 985 more times]
  File "...\antler-test\Python3Lexer.py", line 680, in nextToken
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
  File "...\antler-test\antlr4\InputStream.py", line 49, in LA
    if offset==0:
RecursionError: maximum recursion depth exceeded in comparison

The input file looks like this:

def fun1():
    return None

def fun2():
    return None

I'm not sure whether I translated the code into Python incorrectly, or whether the recursive algorithm is simply too much for Python, but I also don't know how to change the algorithm so that the nextToken method is iterative, since it is not tail-recursive. Maybe someone can figure it out? Or is there some other problem with what I'm doing?

Your Python code has:

next = self.nextToken()

but your Java code has:

Token next = super.nextToken();

Note that super is not the same as self. You probably meant:

next = super().nextToken()
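
To see why this recurses, here is a minimal standalone sketch (plain Python, not ANTLR-specific): calling self.nextToken() inside the override dispatches back to the override itself, while super().nextToken() delegates to the base class.

class Base:
    def nextToken(self):
        return "token"

class Broken(Base):
    def nextToken(self):
        # self.nextToken() resolves to this very method -> RecursionError,
        # which is exactly the traceback shown above.
        return self.nextToken()

class Fixed(Base):
    def nextToken(self):
        # super() skips this class and calls Base.nextToken().
        return super().nextToken()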

I have been working on this same topic for a few days.

It is not easy: the Python runtime does not expose exactly the same API as the Java runtime, and it is less widely used and rather incomplete.

I had to use a few workarounds, but it seems to work. Here is my code:

tokens { INDENT, DEDENT }

@lexer::members {

    # A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
    self.tokens = []

    # The stack that keeps track of the indentation level.
    self.indents = []

    # The amount of opened braces, brackets and parenthesis.
    self.opened = 0

    # The most recently produced token.
    self.last_token = None

def emitToken(self, t):
    super().emitToken(t)
    self.tokens.append(t)

def nextToken(self):
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0:
        # Remove any trailing EOF tokens from our buffer.
        while len(self.tokens) > 0 and self.tokens[-1].type == Token.EOF:
            del self.tokens[-1]

        # First emit an extra line break that serves as the end of the statement.
        self.emitToken(self.common_token(Python3Lexer.NEWLINE, "\n"))

        # Now emit as much DEDENT tokens as needed.
        while len(self.indents) != 0:
            self.emitToken(self.create_dedent())
            del self.indents[-1]

        # Put the EOF back on the token stream.
        self.emitToken(self.common_token(Token.EOF, "<EOF>"))

    next = super().nextToken()

    if next.channel == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.last_token = next

    if len(self.tokens) == 0:
        return next
    else:
        t = self.tokens[0]
        del self.tokens[0]
        return t

def create_dedent(self):
    from Python3Parser import Python3Parser
    dedent = self.common_token(Python3Parser.DEDENT, "")
    dedent.line = self.last_token.line
    return dedent

def common_token(self, _type,  text):
    from antlr4.Token import CommonToken
    stop = self.getCharIndex() - 1
    if len(self.text) == 0:
        start = stop
    else:
        start = stop - len(self.text) + 1
    return CommonToken(self._tokenFactorySourcePair, _type, Lexer.DEFAULT_TOKEN_CHANNEL, start, stop)

## Calculates the indentation of the provided spaces, taking the
## following rules into account:
##
## "Tabs are replaced (from left to right) by one to eight spaces
##  such that the total number of characters up to and including
##  the replacement is a multiple of eight [...]"
##
##  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
def getIndentationCount(self, spaces):
    count = 0
    for ch in spaces:
        if ch == '\t':
            count += 8 - (count % 8)
        else:
            count += 1
    return count

def atStartOfInput(self):
    return self._interp.column == 0 and self._interp.line == 1

}
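
As a quick sanity check of the tab handling, here is the same indentation logic as a standalone function (a sketch for testing only; the name indentation_count is mine):

def indentation_count(spaces):
    # A tab pads the count up to the next multiple of eight,
    # per the Python language reference quoted above.
    count = 0
    for ch in spaces:
        count += 8 - (count % 8) if ch == '\t' else 1
    return count

assert indentation_count('    ') == 4  # four plain spaces
assert indentation_count('  \t') == 8  # two spaces, tab pads to column 8
assert indentation_count('\t ') == 9   # tab to column 8, then one space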

And for the NEWLINE lexer rule:

NEWLINE
 : ( {self.atStartOfInput()}?   SPACES
   | ( '\r'? '\n' | '\r' | '\f' ) SPACES?
   )

   {
    import re
    from Python3Parser import Python3Parser
    new_line = re.sub(r"[^\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[^\r\n\f]+", "")
    spaces = re.sub(r"[\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[\r\n\f]+", "")
    next = self._input.LA(1)

    if self.opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
        self.skip()
    else:
        self.emitToken(self.common_token(self.NEWLINE, new_line))

        indent = self.getIndentationCount(spaces)
        if len(self.indents) == 0:
            previous = 0
        else:
            previous = self.indents[-1]

        if indent == previous:
            self.skip()
        elif indent > previous:
            self.indents.append(indent)
            self.emitToken(self.common_token(Python3Parser.INDENT, spaces))
        else:
            while len(self.indents) > 0 and self.indents[-1] > indent:
                self.emitToken(self.create_dedent())
                del self.indents[-1]

   };
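
A quick way to check that NEWLINE, INDENT and DEDENT tokens now come out as expected (a sketch; it assumes you have regenerated Python3Lexer.py from the modified grammar):

from antlr4 import InputStream, CommonTokenStream
from Python3Lexer import Python3Lexer

lexer = Python3Lexer(InputStream("def f():\n    return None\n"))
stream = CommonTokenStream(lexer)
stream.fill()  # drive the lexer all the way to EOF
for t in stream.tokens:
    print(t.type, repr(t.text))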

You also have to replace the rule name "str" throughout the file (with "string", for example), because str is the name of a built-in type in Python.
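
For example, assuming your copy of the grammar still has a rule named str (as the grammars-v4 Python3.g4 did at the time; double-check against your file), the rename looks like this:

// before: the generated code ends up shadowing Python's built-in str
str : STRING_LITERAL | BYTES_LITERAL ;

// after: rename the rule and every reference to it
string : STRING_LITERAL | BYTES_LITERAL ;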

I ran into the same problem. I could not quite get Alexandre's code working under Python 3 and had to modify it slightly: in the Python runtime, _input.LA(1) returns an integer code point (with EOF as -1), so it has to be converted with chr() before it is compared against character literals:

...
next = self._input.LA(1)
if next == Python3Parser.EOF:
    chr_next = -1
else:
    chr_next = chr( next )

if self.opened > 0 or chr_next == '\r' or chr_next == '\n' or chr_next == '\f' or chr_next == '#':
    self.skip()
...

You can also move all the imports to the lexer's header:

@lexer::header {
import re
from Python3Parser import Python3Parser
from antlr4.Token import CommonToken    
}
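
With the imports in the @lexer::header section they are emitted once at the top of the generated Python3Lexer.py, so they run at import time instead of every time the NEWLINE action fires.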