python 简单的基于正则表达式的词法分析器
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了python 简单的基于正则表达式的词法分析器相关的知识,希望对你有一定的参考价值。
'''Simple regex-based lexer
Refer to:
- http://stackoverflow.com/questions/133886/simple-regex-based-lexer-in-python
- https://gist.github.com/eliben/5797351
'''
import re
class Token(object):
    """A simple Token structure.

    Holds the token type, the matched text and the position in the input
    buffer at which the match started.
    """

    def __init__(self, type, val, pos):
        """Store the token's fields.

        type:
            The rule type associated with the pattern that matched.
        val:
            The text that was matched.
        pos:
            Index in the input buffer where the token starts.
        """
        # NOTE: `type` shadows the builtin, but the parameter name is part
        # of the public signature, so it is kept for keyword callers.
        self.type = type
        self.val = val
        self.pos = pos

    def __str__(self):
        """Return the human-readable form ``TYPE(value) at pos``."""
        return '%s(%s) at %s' % (self.type, self.val, self.pos)

    def __repr__(self):
        """Return an unambiguous form, useful when tokens sit in a list."""
        return 'Token(%r, %r, %r)' % (self.type, self.val, self.pos)
class LexerError(Exception):
    """Lexer error exception.

    pos:
        Position in the input line where the error occurred.
    """

    def __init__(self, pos):
        """Record the offending position.

        `pos` is forwarded to the base class so that ``args == (pos,)``,
        which keeps the exception reconstructible from its args.
        """
        super(LexerError, self).__init__(pos)
        self.pos = pos

    def __str__(self):
        """Return a descriptive message instead of the bare position."""
        return 'no lexer rule matches the input at position %s' % (self.pos,)
class Lexer(object):
    r"""A simple regex-based lexer/tokenizer.

    Build the lexer from a list of rules, hand it a buffer with input(),
    then pull tokens one at a time with token() or iterate over tokens().
    See below for an example of usage.
    """

    def __init__(self, rules, skip_whitespace=True):
        r"""Create a lexer.

        rules:
            A list of rules. Each rule is a `pattern, type`
            pair, where `pattern` is the regular expression used
            to recognize the token and `type` is the type
            of the token to return when it's recognized.
            Rules are tried in list order; the first one that
            matches wins.
        skip_whitespace:
            If True, whitespace (\s+) will be skipped and not
            reported by the lexer. Otherwise, you have to
            specify your rules for whitespace, or it will be
            flagged as an error.
        """
        self.rules = [(re.compile(pattern), token_type)
                      for pattern, token_type in rules]
        self.skip_whitespace = skip_whitespace
        # Searching for the next NON-whitespace character is how leading
        # whitespace gets skipped in token().
        self.re_ws_skip = re.compile(r'\S')
        # Start with an empty buffer so that token()/tokens() called before
        # input() report end-of-input instead of raising AttributeError.
        self.buf = ''
        self.pos = 0

    def input(self, buf):
        """Initialize the lexer with a buffer as input.

        Resets the current position to the start of `buf`.
        """
        self.buf = buf
        self.pos = 0

    def token(self):
        """Return the next token (a Token object) found in the
        input buffer. None is returned if the end of the
        buffer was reached.

        In case of a lexing error (the current chunk of the
        buffer matches no rule), a LexerError is raised with
        the position of the error.
        """
        if self.pos >= len(self.buf):
            return None

        if self.skip_whitespace:
            m = self.re_ws_skip.search(self.buf, self.pos)
            if m is None:
                # Only whitespace is left: treat it as end of input.
                return None
            self.pos = m.start()

        for token_regex, token_type in self.rules:
            m = token_regex.match(self.buf, self.pos)
            # An empty match would not advance the position, so the same
            # empty token would be returned forever; ignore such matches.
            if m and m.end() > self.pos:
                tok = Token(token_type, m.group(), self.pos)
                self.pos = m.end()
                return tok

        # if we're here, no rule matched
        raise LexerError(self.pos)

    def tokens(self):
        """Return an iterator over the tokens found in the buffer.

        Iteration stops when token() returns None; a LexerError raised by
        token() propagates to the caller.
        """
        while True:
            tok = self.token()
            if tok is None:
                break
            yield tok
if __name__ == '__main__':
    # Demo: tokenize a small arithmetic expression and print each token.
    rules = [
        (r'\d+', 'NUMBER'),
        # `\w*` (not `\w+`) so that one-character identifiers are accepted.
        (r'[a-zA-Z_]\w*', 'IDENTIFIER'),
        (r'\+', 'PLUS'),
        (r'\-', 'MINUS'),
        (r'\*', 'MULTIPLY'),
        (r'\/', 'DIVIDE'),
        (r'\(', 'LP'),
        (r'\)', 'RP'),
        (r'=', 'EQUALS'),
    ]

    lx = Lexer(rules, skip_whitespace=True)
    lx.input('erw = _abc + 12*(R4-623902) ')

    # Single-argument print(...) and `except ... as ...` behave the same on
    # Python 2.6+ and Python 3, so the demo runs on either interpreter.
    try:
        for tok in lx.tokens():
            print(tok)
    except LexerError as err:
        print('LexerError at position %s' % err.pos)
以上是关于python 简单的基于正则表达式的词法分析器的主要内容,如果未能解决你的问题,请参考以下文章