Python版C语言词法分析器
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python版C语言词法分析器相关的知识,希望对你有一定的参考价值。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Hand-written lexer for a subset of C, with a small Tk front end.

The scanner works on a list of source lines (the text split on newlines)
and keeps its read position in the module globals ``current_line`` and
``current_row``.  ``lexer_analysis`` drives the scanner and returns one
formatted row per token; ``pre_interface`` builds the window that feeds it.
"""
import sys

try:
    # Python 2 module names, as used by the original script.
    from Tkinter import *
    from tkFont import *
    from FileDialog import *
except ImportError:
    try:
        # Python 3 renamed the Tk modules.
        from tkinter import *
        from tkinter.font import *
        from tkinter.filedialog import *
    except ImportError:
        # Tk is not installed: the scanner functions stay importable and
        # pre_interface() reports the problem when the GUI is requested.
        Tk = None

# Sentinel returned by getchar() at end of input.  It is a multi-character
# string, so str predicates such as isalpha() must never be trusted on it.
_EOF = 'SCANEOF'

KEYWORD_LIST = ['if', 'else', 'while', 'break', 'continue', 'for', 'double',
                'int', 'float', 'long', 'short', 'bool', 'switch', 'case',
                'return', 'void']

# '#' has a category code below, so it is accepted as a separator as well.
SEPARATOR_LIST = ['{', '}', '[', ']', '(', ')', '~', ',', ';', '.', '?', ':',
                  ' ', '#']

# Every entry must be a key of CATEGORY_DICT, and every proper prefix of a
# multi-character operator must itself be listed (the operator scanner
# extends the match one character at a time).
OPERATOR_LIST = ['+', '++', '-', '--', '+=', '-=', '*', '*=', '%', '%=', '->',
                 '|', '||', '|=', '/', '/=', '>', '<', '>=', '<=', '=', '==',
                 '!=', '!', '&', '&&', '&=', '^', '^=', '<<', '>>', '<<=',
                 '>>=']

# The original literal repeated the keys "register", "sizeof" and "char";
# only the last value of a repeated key survives in a dict literal, so the
# shadowed entries (260, 290, 297, 276) are dropped and the effective values
# are kept.
CATEGORY_DICT = {
    # KEYWORD
    "far": 257, "near": 258, "pascal": 259, "asm": 261, "cdecl": 262,
    "huge": 263, "auto": 264, "double": 265, "int": 266, "struct": 267,
    "break": 268, "else": 269, "long": 270, "switch": 271, "case": 272,
    "enum": 273, "register": 274, "typedef": 275, "extern": 277,
    "return": 278, "union": 279, "const": 280, "float": 281, "short": 282,
    "unsigned": 283, "continue": 284, "for": 285, "signed": 286, "void": 287,
    "default": 288, "goto": 289, "volatile": 291, "do": 292, "if": 293,
    "while": 294, "static": 295, "interrupt": 296, "NULL": 298,
    # SEPARATOR
    "{": 299, "}": 300, "[": 301, "]": 302, "(": 303, ")": 304, "~": 305,
    ",": 306, ";": 307, ".": 308, "#": 309, "?": 310, ":": 311,
    # OPERATOR
    "<<": 312, ">>": 313, "<": 314, "<=": 315, ">": 316, ">=": 317, "=": 318,
    "==": 319, "|": 320, "||": 321, "|=": 322, "^": 323, "^=": 324, "&": 325,
    "&&": 326, "&=": 327, "%": 328, "%=": 329, "+": 330, "++": 331, "+=": 332,
    "-": 333, "--": 334, "-=": 335, "->": 336, "/": 337, "/=": 338, "*": 339,
    "*=": 340, "!": 341, "!=": 342, "sizeof": 343, "<<=": 344, ">>=": 345,
    # literal / identifier classes
    "inum": 346, "int16": 347, "int8": 348, "char": 350, "string": 351,
    "bool": 352, "fnum": 353, "IDN": 354,
}

_OPERATORS = frozenset(OPERATOR_LIST)
_SEPARATORS = frozenset(SEPARATOR_LIST)
_DIGITS = '0123456789'

# Read position: index of the character most recently returned by getchar().
# Row -1 means "before the first character"; row == len(line) is the line
# break that follows the line.
current_row = -1
current_line = 0
# Next line of the result widget that error()/lexer() will write to.
out_line = 1


def getchar(input_str):
    """Advance the read position by one character and return that character.

    ``input_str`` is the list of source lines.  The break between two lines
    is returned as ``'\\n'`` so tokens cannot fuse across lines, blank lines
    are handled, and ``'SCANEOF'`` is returned once the end of the last line
    is reached (and on every later call).
    """
    global current_row
    global current_line
    if current_line >= len(input_str):
        return _EOF
    current_row += 1
    if current_row > len(input_str[current_line]):
        # The previous call returned this line's break; move to the next line.
        current_line += 1
        current_row = 0
        if current_line >= len(input_str):
            return _EOF
    text = input_str[current_line]
    if current_row < len(text):
        return text[current_row]
    if current_line == len(input_str) - 1:
        return _EOF
    return '\n'


def ungetchar(input_str):
    """Step the read position back by one so the next getchar() repeats.

    Returns the character now under the read position: ``'\\n'`` when that
    is a line break, ``''`` when the position is before the first character.
    """
    global current_row
    global current_line
    current_row -= 1
    if current_row < 0:
        if current_line == 0:
            current_row = -1
            return ''
        # Row 0 of this line was pushed back: rest on the break that ends
        # the previous line.
        current_line -= 1
        current_row = len(input_str[current_line])
        return '\n'
    text = input_str[current_line]
    return text[current_row] if current_row < len(text) else '\n'


def error(msg, line=None, row=None):
    """Write ``line:row: Error: msg`` to the ``analysis`` text widget.

    ``line`` and ``row`` are 1-based and default to the current read
    position.  The message goes on widget line ``out_line``, which is then
    incremented.
    """
    global out_line
    if line is None:
        line = current_line + 1
    if row is None:
        row = current_row + 1
    analysis.insert(str(out_line) + '.0',
                    str(line) + ':' + str(row) + ': Error: ' + msg)
    analysis.insert(str(out_line) + '.end', "\n")
    out_line = out_line + 1


def _is_digit(char):
    """Return True for a single ASCII decimal digit (never for the EOF mark)."""
    return len(char) == 1 and char in _DIGITS


def _ends_token(char):
    """Return True if ``char`` may directly follow a numeric literal."""
    return (char == _EOF or char.isspace()
            or char in _OPERATORS or char in _SEPARATORS)


def _scan_number(input_str, current_char):
    """Scan an integer, exponent or float literal starting at a digit."""
    int_value = 0
    while _is_digit(current_char):
        int_value = int_value * 10 + int(current_char)
        current_char = getchar(input_str)

    if current_char == 'e':
        power_value = str(int_value) + 'e'
        current_char = getchar(input_str)
        if current_char in ('+', '-'):
            power_value += current_char
            current_char = getchar(input_str)
        while _is_digit(current_char):
            power_value += current_char
            current_char = getchar(input_str)
        if not _ends_token(current_char):
            error('illigal const int value in power',
                  current_line + 1, current_row + 1)
            return ('', '', '')
        ungetchar(input_str)
        return ('INUM', power_value, CATEGORY_DICT['inum'])

    if current_char == '.':
        float_value = str(int_value) + '.'
        current_char = getchar(input_str)
        while _is_digit(current_char):
            float_value += current_char
            current_char = getchar(input_str)
        # '.' is a separator, so a second dot has to be rejected explicitly.
        if current_char == '.' or not _ends_token(current_char):
            error('illigal const float value',
                  current_line + 1, current_row + 1)
            return ('', '', '')
        ungetchar(input_str)
        return ('FNUM', float_value, CATEGORY_DICT['fnum'])

    if not _ends_token(current_char):
        error('illigal identifier', current_line + 1, current_row + 1)
        return ('', '', '')
    ungetchar(input_str)
    return ('INUM', int_value, CATEGORY_DICT['inum'])


def _scan_identifier(input_str, current_char):
    """Scan a keyword or identifier starting at a letter or underscore."""
    name = ''
    while current_char != _EOF and (current_char.isalpha()
                                    or current_char.isdigit()
                                    or current_char == '_'):
        name += current_char
        current_char = getchar(input_str)
    ungetchar(input_str)
    if name in KEYWORD_LIST:
        return (name, '', CATEGORY_DICT[name])
    return ('IDN', name, CATEGORY_DICT['IDN'])


def _scan_string(input_str):
    """Scan a string literal; the opening quote has already been consumed."""
    line = current_line + 1
    row = current_row + 1
    chars = []
    current_char = getchar(input_str)
    while current_char != '"':
        if current_char == _EOF:
            error('missing terminating \"', line, row)
            return (_EOF, '', '')
        chars.append(current_char)
        current_char = getchar(input_str)
    return ('STRING_LITERAL', ''.join(chars), CATEGORY_DICT['string'])


def _skip_block_comment(input_str, line, row):
    """Skip to the end of a block comment whose opener was consumed.

    ``line``/``row`` locate the opening slash for the error message.
    Returns None when the comment is closed, the EOF triple otherwise.
    """
    previous = ''
    while True:
        current_char = getchar(input_str)
        if current_char == _EOF:
            error('unteminated /* comment', line, row)
            return (_EOF, '', '')
        # Tracking the previous character lets '**/' close the comment too.
        if previous == '*' and current_char == '/':
            return None
        previous = current_char


def _scan_operator(input_str, first_char):
    """Scan the longest operator that starts with ``first_char``."""
    op = first_char
    while True:
        next_char = getchar(input_str)
        if next_char != _EOF and op + next_char in _OPERATORS:
            op += next_char
        else:
            ungetchar(input_str)
            break
    return ('OP', op, CATEGORY_DICT[op])


def scanner(input_str):
    """Read the next token from ``input_str`` at the current read position.

    Returns a ``(kind, value, category)`` triple, ``('SCANEOF', '', '')`` at
    end of input (or after an unterminated string/comment), ``('', '', '')``
    after a malformed number, and None for whitespace, comments and unknown
    characters.  Problems are reported through error().
    """
    current_char = getchar(input_str)
    if current_char == _EOF:
        return (_EOF, '', '')
    if current_char.strip() == '':
        return None
    if _is_digit(current_char):
        return _scan_number(input_str, current_char)
    if current_char.isalpha() or current_char == '_':
        return _scan_identifier(input_str, current_char)
    if current_char == '"':
        return _scan_string(input_str)
    if current_char == '/':
        line = current_line + 1
        row = current_row + 1
        if getchar(input_str) == '*':
            return _skip_block_comment(input_str, line, row)
        ungetchar(input_str)
        return _scan_operator(input_str, current_char)
    if current_char in _SEPARATORS:
        return ('SEP', current_char, CATEGORY_DICT[current_char])
    if current_char in _OPERATORS:
        return _scan_operator(input_str, current_char)
    error('unknown character: ' + current_char)
    return None


def fileloader():
    """Ask for a file and load its text into the ``code`` widget.

    The editor is left untouched when the dialog is cancelled.
    """
    global root
    fd = LoadFileDialog(root)
    filename = fd.go()
    if not filename:
        return
    with open(filename, "r") as fin:
        input_file = fin.read()
    # Replace the editor contents only after the file was read successfully.
    code.delete(1.0, END)
    code.insert(1.0, input_file)


def lexer_analysis(input_str):
    """Tokenize ``input_str`` (a list of lines) from the beginning.

    Returns a list with one ``kind<TAB><TAB>value<TAB><TAB>category`` string
    per tuple produced by scanner(), stopping at the EOF triple.
    """
    global current_row
    global current_line
    current_row = -1
    current_line = 0
    analysis_result = []
    while True:
        r = scanner(input_str)
        if r is not None:
            if r[0] == _EOF:
                break
            analysis_result.append(
                str(r[0]) + "\t\t" + str(r[1]) + "\t\t" + str(r[2]))
    return analysis_result


def lexer():
    """Tokenize the ``code`` widget's text and show it in ``analysis``.

    Error lines written by error() come first; the token rows follow.
    """
    global out_line
    analysis.delete(1.0, END)
    # error() shares this counter, so it restarts with every run.
    out_line = 1
    input_raw = code.get(1.0, END)
    input_str = input_raw.split("\n")
    result = lexer_analysis(input_str)
    for each in result:
        analysis.insert(str(out_line) + '.end', each)
        analysis.insert(str(out_line) + '.end', "\n")
        out_line = out_line + 1


def pre_interface():
    """Build the main window and run the Tk event loop.

    Sets the globals ``root``, ``code`` (source editor) and ``analysis``
    (result pane) that the other GUI functions use.
    """
    global root
    global code
    global analysis
    if Tk is None:
        raise RuntimeError('Tkinter is not available; cannot start the GUI')
    root = Tk()
    code = Text(root, width=60, height=20, font=15)
    analysis = Text(root, width=60, height=20, font=15)
    t = StringVar()
    t.set('Patrick的词法分析器')
    label = Label(root, textvariable=t, font=15)
    analyze_button = Button(root, text='词法分析', command=lexer, font=15)
    load = Button(root, text=' 载入代码 ', command=fileloader, font=15)
    root.title("LEXER")
    label.pack(side=TOP)
    analyze_button.pack(side=BOTTOM)
    load.pack(side=BOTTOM)
    code.pack(side=LEFT)
    analysis.pack(side=RIGHT)
    root.mainloop()


def main():
    """Start the graphical lexer."""
    pre_interface()


if __name__ == '__main__':
    main()