Translating Java to Python in the ANTLR4 Python3 target

I generated a Python target with ANTLR4 from the Python3.g4 grammar file in the ANTLR grammars repository. The generated Python3Lexer.py file contains Java code, which I need to convert to Python. Here are the two Java sections it emits; you can also find them in the Python3 grammar file itself:

// A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
private java.util.LinkedList<Token> tokens = new java.util.LinkedList<>();

// The stack that keeps track of the indentation level.
private java.util.Stack<Integer> indents = new java.util.Stack<>();

// The amount of opened braces, brackets and parenthesis.
private int opened = 0;

// The most recently produced token.
private Token lastToken = null;

@Override
public void emit(Token t) {
    super.setToken(t);
    tokens.offer(t);
}

@Override
public Token nextToken() {

    // Check if the end-of-file is ahead and there are still some DEDENTS expected.
    if (_input.LA(1) == EOF && !this.indents.isEmpty()) {

        // Remove any trailing EOF tokens from our buffer.
        for (int i = tokens.size() - 1; i >= 0; i--) {
            if (tokens.get(i).getType() == EOF) {
                tokens.remove(i);
            }
        }

        // First emit an extra line break that serves as the end of the statement.
        this.emit(commonToken(Python3Parser.NEWLINE, "\n"));

        // Now emit as much DEDENT tokens as needed.
        while (!indents.isEmpty()) {
            this.emit(createDedent());
            indents.pop();
        }

        // Put the EOF back on the token stream.
        this.emit(commonToken(Python3Parser.EOF, "<EOF>"));
    }

    Token next = super.nextToken();

    if (next.getChannel() == Token.DEFAULT_CHANNEL) {
        // Keep track of the last token on the default channel.
        this.lastToken = next;
    }

    return tokens.isEmpty() ? next : tokens.poll();
}

private Token createDedent() {
    CommonToken dedent = commonToken(Python3Parser.DEDENT, "");
    dedent.setLine(this.lastToken.getLine());
    return dedent;
}

private CommonToken commonToken(int type, String text) {
    int stop = this.getCharIndex() - 1;
    int start = text.isEmpty() ? stop : stop - text.length() + 1;
    return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
}

static int getIndentationCount(String spaces) {

    int count = 0;

    for (char ch : spaces.toCharArray()) {
      switch (ch) {
        case '\t':
          count += 8 - (count % 8);
          break;
        default:
          // A normal space char.
          count++;
      }
    }

    return count;
}

boolean atStartOfInput() {
    return super.getCharPositionInLine() == 0 && super.getLine() == 1;
}

The second section comes from the NEWLINE lexer rule's embedded action:

String newLine = getText().replaceAll("[^\r\n\f]+", "");
String spaces = getText().replaceAll("[\r\n\f]+", "");
int next = _input.LA(1);

if (opened > 0 || next == '\r' || next == '\n' || next == '\f' || next == '#') {
   // If we're inside a list or on a blank line, ignore all indents,
   // dedents and line breaks.
   skip();
}
else {
   emit(commonToken(NEWLINE, newLine));

   int indent = getIndentationCount(spaces);
   int previous = indents.isEmpty() ? 0 : indents.peek();

   if (indent == previous) {
       // skip indents of the same size as the present indent-size
       skip();
   }
   else if (indent > previous) {
       indents.push(indent);
       emit(commonToken(Python3Parser.INDENT, spaces));
   }
   else {
       // Possibly emit more than 1 DEDENT token.
       while(!indents.isEmpty() && indents.peek() > indent) {
           this.emit(createDedent());
           indents.pop();
       }
   }
}

I translated these myself as follows:

# A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
tokens = deque()

# The stack that keeps track of the indentation level.
# https://docs.python.org/3/tutorial/datastructures.html#using-lists-as-stacks
indents = []

# The amount of opened braces, brackets and parenthesis.
opened = 0

# The most recently produced token.
lastToken = None

def emit(self, t):
  self._token = t
  self.tokens.append(t)

def nextToken(self):

  # Check if the end-of-file is ahead and there are still some DEDENTS expected.
  if self._input.LA(1) == Token.EOF and self.indents.size() != 0:

    # Remove any trailing EOF tokens from our buffer.
    for i in range(tokens.size() - 1, 0, -1):
      if self.tokens[i].getType() == Token.EOF:
        self.tokens.remove(i)

    # First emit an extra line break that serves as the end of the statement.
    self.emit(commonToken(Python3Parser.NEWLINE, "\n"))

    # Now emit as much DEDENT tokens as needed.
    while self.indents.size() != 0:
      self.emit(createDedent())
      self.indents.pop()

    # Put the EOF back on the token stream.
    self.emit(commonToken(Python3Parser.EOF, "<EOF>"))

  next = self.nextToken()

  if next.getChannel() == Token.DEFAULT_CHANNEL:
    # Keep track of the last token on the default channel.
    self.lastToken = next

  return next if self.tokens.size() == 0 else self.tokens.popleft()

def createDedent():
  dedent = commonToken(Python3Parser.DEDENT, "")
  dedent.setLine(self.lastToken.getLine())
  return dedent

def commonToken(self, type, text):
  stop = self.getCharIndex() - 1
  start = stop if text.size() == 0 else stop - text.size() + 1
  return CommonToken(self._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop)

def getIndentationCount(spaces):

  count = 0

  for ch in spaces:
    if ch == '\t':
        count += 8 - (count % 8)
        break
    else:
        # A normal space char.
        count = count + 1

  return count

def atStartOfInput(self):
  return self.getCharPositionInLine() == 0 and self.getLine() == 1

And my translation of the NEWLINE action:

newLine = getText().replaceAll("[^\r\n\f]+", "")
spaces = getText().replaceAll("[\r\n\f]+", "")
next = self._input.LA(1)

if opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
    # If we're inside a list or on a blank line, ignore all indents,
    # dedents and line breaks.
    skip()
else:
    emit(commonToken(NEWLINE, newLine))

indent = getIndentationCount(spaces)
previous = 0 if indents.isEmpty() else indents.peek()

if indent == previous:
    # skip indents of the same size as the present indent-size
    skip()
elif indent > previous:
    indents.push(indent)
    emit(commonToken(Python3Parser.INDENT, spaces))
else:
    # Possibly emit more than 1 DEDENT token.
    while not indents.isEmpty() and indents.peek() > indent:
        self.emit(createDedent())
        indents.pop()

Here is my Python script that runs the ANTLR output, which now contains the Python snippets instead of the Java ones. It is run with the command python main.py test.py:

import sys
from antlr4 import *
from Python3Lexer import Python3Lexer
from Python3Parser import Python3Parser
from Python3Listener import Python3Listener

class FuncPrinter(Python3Listener):
  def enterFuncdef(self, ctx):
    print("Oh, a func")

def main(argv):
  input = FileStream(argv[1])
  lexer = Python3Lexer(input)
  stream = CommonTokenStream(lexer)
  parser = Python3Parser(stream)
  tree = parser.funcdef()

  printer = FuncPrinter()
  walker = ParseTreeWalker()
  walker.walk(printer, tree)

if __name__ == '__main__':
  main(sys.argv)

It errors out and prints the following traceback:

Traceback (most recent call last):
  File "main.py", line 24, in <module>
    main(sys.argv)
  File "main.py", line 17, in main
    tree = parser.parameters()
  File "...\antler-test\Python3Parser.py", line 1297, in parameters
    self.enterRule(localctx, 14, self.RULE_parameters)
  File "...\antler-test\antlr4\Parser.py", line 358, in enterRule
    self._ctx.start = self._input.LT(1)
  File "...\antler-test\antlr4\CommonTokenStream.py", line 61, in LT
    self.lazyInit()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 186, in lazyInit
    self.setup()
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 189, in setup
    self.sync(0)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 111, in sync
    fetched = self.fetch(n)
  File "...\antler-test\antlr4\BufferedTokenStream.py", line 123, in fetch
    t = self.tokenSource.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  File "...\antler-test\Python3Lexer.py", line 698, in nextToken
    next = self.nextToken()
  [Previous line repeated 985 more times]
  File "...\antler-test\Python3Lexer.py", line 680, in nextToken
    if self._input.LA(1) == Token.EOF and self.indents.size() != 0:
  File "...\antler-test\antlr4\InputStream.py", line 49, in LA
    if offset==0:
RecursionError: maximum recursion depth exceeded in comparison

The input file looks like this:

def fun1():
    return None

def fun2():
    return None

I'm not sure whether I translated the code into Python incorrectly, or whether the recursive algorithm is simply too much for Python, but I also don't know how to change the algorithm so that the nextToken method is iterative, since it is not tail-recursive. Maybe someone can figure it out? Or is there some other problem with what I'm doing?

Your Python code has:

next = self.nextToken()

but your Java code has:

Token next = super.nextToken();

Note that super is not the same as self. You probably meant:

next = super().nextToken()
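
To see why this recurses, here is a minimal standalone sketch (plain Python, not ANTLR-specific): calling self.nextToken() inside the override dispatches back to the override itself, while super().nextToken() delegates to the base class.

class Base:
    def nextToken(self):
        return "token"

class Broken(Base):
    def nextToken(self):
        # self.nextToken() resolves to this very method -> RecursionError,
        # which is exactly the traceback shown above.
        return self.nextToken()

class Fixed(Base):
    def nextToken(self):
        # super() skips this class and calls Base.nextToken().
        return super().nextToken()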

I have been working on this same topic for a few days.

It is not easy: the Python runtime does not expose exactly the same API as the Java runtime, and it is less widely used and rather incomplete.

I had to use a few workarounds, but it seems to work. Here is my code:

tokens { INDENT, DEDENT }

@lexer::members {

    # A queue where extra tokens are pushed on (see the NEWLINE lexer rule).
    self.tokens = []

    # The stack that keeps track of the indentation level.
    self.indents = []

    # The amount of opened braces, brackets and parenthesis.
    self.opened = 0

    # The most recently produced token.
    self.last_token = None

def emitToken(self, t):
    super().emitToken(t)
    self.tokens.append(t)

def nextToken(self):
    if self._input.LA(1) == Token.EOF and len(self.indents) > 0:
        # Remove any trailing EOF tokens from our buffer.
        while len(self.tokens) > 0 and self.tokens[-1].type == Token.EOF:
            del self.tokens[-1]

        # First emit an extra line break that serves as the end of the statement.
        self.emitToken(self.common_token(Python3Lexer.NEWLINE, "\n"))

        # Now emit as much DEDENT tokens as needed.
        while len(self.indents) != 0:
            self.emitToken(self.create_dedent())
            del self.indents[-1]

        # Put the EOF back on the token stream.
        self.emitToken(self.common_token(Token.EOF, "<EOF>"))

    next = super().nextToken()

    if next.channel == Token.DEFAULT_CHANNEL:
        # Keep track of the last token on the default channel.
        self.last_token = next

    if len(self.tokens) == 0:
        return next
    else:
        t = self.tokens[0]
        del self.tokens[0]
        return t

def create_dedent(self):
    from Python3Parser import Python3Parser
    dedent = self.common_token(Python3Parser.DEDENT, "")
    dedent.line = self.last_token.line
    return dedent

def common_token(self, _type,  text):
    from antlr4.Token import CommonToken
    stop = self.getCharIndex() - 1
    if len(self.text) == 0:
        start = stop
    else:
        start = stop - len(self.text) + 1
    return CommonToken(self._tokenFactorySourcePair, _type, Lexer.DEFAULT_TOKEN_CHANNEL, start, stop)

## Calculates the indentation of the provided spaces, taking the
## following rules into account:
##
## "Tabs are replaced (from left to right) by one to eight spaces
##  such that the total number of characters up to and including
##  the replacement is a multiple of eight [...]"
##
##  -- https://docs.python.org/3.1/reference/lexical_analysis.html#indentation
def getIndentationCount(self, spaces):
    count = 0
    for ch in spaces:
        if ch == '\t':
            count += 8 - (count % 8)
        else:
            count += 1
    return count

def atStartOfInput(self):
    return self._interp.column == 0 and self._interp.line == 1

}
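
As a quick sanity check of the tab handling, here is the same indentation logic as a standalone function (a sketch for testing only; the name indentation_count is mine):

def indentation_count(spaces):
    # A tab pads the count up to the next multiple of eight,
    # per the Python language reference quoted above.
    count = 0
    for ch in spaces:
        count += 8 - (count % 8) if ch == '\t' else 1
    return count

assert indentation_count('    ') == 4  # four plain spaces
assert indentation_count('  \t') == 8  # two spaces, tab pads to column 8
assert indentation_count('\t ') == 9   # tab to column 8, then one space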

And for the NEWLINE lexer rule:

NEWLINE
 : ( {self.atStartOfInput()}?   SPACES
   | ( '\r'? '\n' | '\r' | '\f' ) SPACES?
   )

   {
    import re
    from Python3Parser import Python3Parser
    new_line = re.sub(r"[^\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[^\r\n\f]+", "")
    spaces = re.sub(r"[\r\n\f]+", "", self._interp.getText(self._input)) #.replaceAll("[\r\n\f]+", "")
    next = self._input.LA(1)

    if self.opened > 0 or next == '\r' or next == '\n' or next == '\f' or next == '#':
        self.skip()
    else:
        self.emitToken(self.common_token(self.NEWLINE, new_line))

        indent = self.getIndentationCount(spaces)
        if len(self.indents) == 0:
            previous = 0
        else:
            previous = self.indents[-1]

        if indent == previous:
            self.skip()
        elif indent > previous:
            self.indents.append(indent)
            self.emitToken(self.common_token(Python3Parser.INDENT, spaces))
        else:
            while len(self.indents) > 0 and self.indents[-1] > indent:
                self.emitToken(self.create_dedent())
                del self.indents[-1]

   };
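
A quick way to check that NEWLINE, INDENT and DEDENT tokens now come out as expected (a sketch; it assumes you have regenerated Python3Lexer.py from the modified grammar):

from antlr4 import InputStream, CommonTokenStream
from Python3Lexer import Python3Lexer

lexer = Python3Lexer(InputStream("def f():\n    return None\n"))
stream = CommonTokenStream(lexer)
stream.fill()  # drive the lexer all the way to EOF
for t in stream.tokens:
    print(t.type, repr(t.text))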

You also have to replace the rule name "str" throughout the file (with "string", for example), because str is the name of a built-in type in Python.
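
For example, assuming your copy of the grammar still has a rule named str (as the grammars-v4 Python3.g4 did at the time; double-check against your file), the rename looks like this:

// before: the generated code ends up shadowing Python's built-in str
str : STRING_LITERAL | BYTES_LITERAL ;

// after: rename the rule and every reference to it
string : STRING_LITERAL | BYTES_LITERAL ;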

I ran into the same problem. I could not quite get Alexandre's code working under Python 3 and had to modify it slightly: in the Python runtime, _input.LA(1) returns an integer code point (with EOF as -1), so it has to be converted with chr() before it is compared against character literals:

...
next = self._input.LA(1)
if next == Python3Parser.EOF:
    chr_next = -1
else:
    chr_next = chr( next )

if self.opened > 0 or chr_next == '\r' or chr_next == '\n' or chr_next == '\f' or chr_next == '#':
    self.skip()
...

You can also move all the imports to the lexer's header:

@lexer::header {
import re
from Python3Parser import Python3Parser
from antlr4.Token import CommonToken    
}
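
With the imports in the @lexer::header section they are emitted once at the top of the generated Python3Lexer.py, so they run at import time instead of every time the NEWLINE action fires.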