当前位置: 代码迷 >> 综合 >> 词法分析器(Lexical Analyzer)【Python实现】
  详细解决方案

词法分析器(Lexical Analyzer)【Python实现】

热度:49   发布时间:2023-10-26 06:16:13.0

查看正文内容

from __future__ import print_function
import sys# following two must remain in the same ordertk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq, tk_Gtr, \
tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While, tk_Print,       \
tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma, tk_Ident,          \
tk_Integer, tk_String = range(31)all_syms = ["End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract","Op_negate", "Op_not", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal","Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or", "Keyword_if","Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen","RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier","Integer", "String"]# single character only symbols
symbols = {
     '{': tk_Lbrace, '}': tk_Rbrace, '(': tk_Lparen, ')': tk_Rparen, '+': tk_Add, '-': tk_Sub,'*': tk_Mul, '%': tk_Mod, ';': tk_Semi, ',': tk_Comma }key_words = {
    'if': tk_If, 'else': tk_Else, 'print': tk_Print, 'putc': tk_Putc, 'while': tk_While}the_ch = " "    # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None#*** show error and exit
def error(line, col, msg):print(line, col, msg)exit(1)#*** get the next character from the input
def next_ch():global the_ch, the_col, the_linethe_ch = input_file.read(1)the_col += 1if the_ch == '\n':the_line += 1the_col = 0return the_ch#*** 'x' - character constants
def char_lit(err_line, err_col):n = ord(next_ch())              # skip opening quoteif the_ch == '\'':error(err_line, err_col, "empty character constant")elif the_ch == '\\':next_ch()if the_ch == 'n':n = 10elif the_ch == '\\':n = ord('\\')else:error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))if next_ch() != '\'':error(err_line, err_col, "multi-character constant")next_ch()return tk_Integer, err_line, err_col, n#*** process divide or comments
def div_or_cmt(err_line, err_col):if next_ch() != '*':return tk_Div, err_line, err_col# comment foundnext_ch()while True:if the_ch == '*':if next_ch() == '/':next_ch()return gettok()elif len(the_ch) == 0:error(err_line, err_col, "EOF in comment")else:next_ch()#*** "string"
def string_lit(start, err_line, err_col):text = ""while next_ch() != start:if len(the_ch) == 0:error(err_line, err_col, "EOF while scanning string literal")if the_ch == '\n':error(err_line, err_col, "EOL while scanning string literal")text += the_chnext_ch()return tk_String, err_line, err_col, text#*** handle identifiers and integers
def ident_or_int(err_line, err_col):is_number = Truetext = ""while the_ch.isalnum() or the_ch == '_':text += the_chif not the_ch.isdigit():is_number = Falsenext_ch()if len(text) == 0:error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))if text[0].isdigit():if not is_number:error(err_line, err_col, "invalid number: %s" % (text))n = int(text)return tk_Integer, err_line, err_col, nif text in key_words:return key_words[text], err_line, err_colreturn tk_Ident, err_line, err_col, text#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):if next_ch() == expect:next_ch()return ifyes, err_line, err_colif ifno == tk_EOI:error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))return ifno, err_line, err_col#*** return the next token type
def gettok():while the_ch.isspace():next_ch()err_line = the_lineerr_col  = the_colif len(the_ch) == 0:    return tk_EOI, err_line, err_colelif the_ch == '/':     return div_or_cmt(err_line, err_col)elif the_ch == '\'':    return char_lit(err_line, err_col)elif the_ch == '<':     return follow('=', tk_Leq, tk_Lss,    err_line, err_col)elif the_ch == '>':     return follow('=', tk_Geq, tk_Gtr,    err_line, err_col)elif the_ch == '=':     return follow('=', tk_Eq,  tk_Assign, err_line, err_col)elif the_ch == '!':     return follow('=', tk_Neq, tk_Not,    err_line, err_col)elif the_ch == '&':     return follow('&', tk_And, tk_EOI,    err_line, err_col)elif the_ch == '|':     return follow('|', tk_Or,  tk_EOI,    err_line, err_col)elif the_ch == '"':     return string_lit(the_ch, err_line, err_col)elif the_ch in symbols:sym = symbols[the_ch]next_ch()return sym, err_line, err_colelse: return ident_or_int(err_line, err_col)#*** main driver
input_file = sys.stdin
if len(sys.argv) > 1:try:input_file = open(sys.argv[1], "r", 4096)except IOError as e:error(0, 0, "Can't open %s" % sys.argv[1])while True:t = gettok()tok  = t[0]line = t[1]col  = t[2]print("%5d %5d %-14s" % (line, col, all_syms[tok]), end='')if tok == tk_Integer:  print(" %5d" % (t[3]))elif tok == tk_Ident:  print(" %s" %   (t[3]))elif tok == tk_String: print(' "%s"' % (t[3]))else:                  print("")if tok == tk_EOI:break

输出(测试用例三)

    5     16   Keyword_print5     40   Op_subtract6     16   Keyword_putc6     40   Op_less7     16   Keyword_if7     40   Op_greater8     16   Keyword_else8     40   Op_lessequal9     16   Keyword_while9     40   Op_greaterequal10     16   LeftBrace10     40   Op_equal11     16   RightBrace11     40   Op_notequal12     16   LeftParen12     40   Op_and13     16   RightParen13     40   Op_or14     16   Op_subtract14     40   Semicolon15     16   Op_not15     40   Comma16     16   Op_multiply16     40   Op_assign17     16   Op_divide17     40   Integer             4218     16   Op_mod18     40   String          "String literal"19     16   Op_add19     40   Identifier      variable_name20     26   Integer             1021     26   Integer             9222     26   Integer             3223      1   End_of_input