C词法分析器的Python简单实现
Posted petewell
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了C词法分析器的Python简单实现相关的知识,希望对你有一定的参考价值。
原文引用https://www.dazhuanlan.com/2019/08/25/5d62598fd13ee/
在学习编译原理的课程设计中,需要设计一个词法分许进程。于是尝试用Python来简单实现C语言词法分析器。其中其实并没有什么具体需要克服的难处,只要将部分的词法分析DFA设计好,实现起来思路便更清晰。
?
1、前言
C语言中我们需要提取出关键字,标识符,分隔符,运算符,不同数据类型的常量。其中标识符、标识符及分隔符的提取更简单,而运算符因为各运算符组合具有不同意义需详细分解,例如:>>、>=、>>=,常量更是最让人头疼的,例如:整型 八进制 00123u 十六进制 0x23fflu 、浮点型 123.434f 指数型 0123e-002。
词法分析器还需要能显示出行数、检测出出错位置,并跳过错误继续分析。另外我们还需要对单行注释、多行注释进行跳过,以及跳过暂时不能处理的宏定义、预编译。
2、部分有穷状态自动机实现
2.1、关键字及标识符
if prog[flag].isalpha() or prog[flag]==r‘_‘:#_字母开头 标识符和关键字 tmp=flag while prog[flag].isalnum() or prog[flag]==r‘_‘:flag+=1 #走到符合条件的最右边 fragment=str(reduce(lambda x,y:x+y,prog[tmp:flag])) if fragment in keyword:#判断为关键字,keyword为关键字list print("< "+fragment+",- >",end=‘ ‘) else:#判断为标识符 print("< IDENTIFY,"+fragment+" >",end=‘ ‘)
2.2、数字(最复杂的一个)
#prog 待分析串 flag标记当前位置 line当前行 befspace上一个换行位置
#skip(prog,flag) 出错后跳过无效字符
elif prog[flag].isdigit():#判断数字常量
if prog[flag] == ‘0’:#当开始字符为0
if prog[flag+1].lower()==’x’:#0x开头为十六进制
flag+=2
tmp=flag
while prog[flag].isdigit() or
(prog[flag].lower()>=’a’ and prog[flag].lower()<=’f’): #判断十六进制
flag+=1
numch=str(reduce(lambda x,y:x+y,prog[tmp:flag]))#拼接十六进制数字
num = str(int(numch, 16))#拼接为十进制数字
if prog[flag].lower()==’u’ :#判断是否有符号、长整,当前flag指向十六进制后一位
if prog[flag+1].lower()==’l’:#0x23ul
if prog[flag + 2].isalnum():#0x23uls报错,并跳过
print(“<ERROR,INT_UN_LONG_DEFINE,LINE “+str(line)+” “+str(flag-befspace)+” >”, end=’ ‘)
flag += 3
flag = skip(prog, flag)
else:
print(“< INT_UN_LONG,”+num+” >”,end=’ ‘)
flag+=2
elif prog[flag+1].isalnum():#0x23u2出错
print(“<ERROR,INT_UN_DEFINE,LINE “+str(line)+” “+str(flag-befspace)+” >”, end=’ ‘)
flag += 1
flag=skip(prog,flag)
else:
print(“< INT_UN,”+num+” >”,end=’ ‘)#
flag+=1
elif prog[flag].lower()==’l’:#同上
if prog[flag+1].lower()==’u’:
if prog[flag + 2].isalnum():
print(“< ERROR,INT_UN_LONG_DEFINE ,LINE “+str(line)+” “+str(flag-befspace)+” >”, end=’ ‘)
flag += 3
flag = skip(prog, flag)
else:
print(“< INT_UN_LONG,”+num+” >”,end=’ ‘)
flag+=2
elif prog[flag+1].isalnum():
print(“< ERROR,INT_LONG_DEFINE,LINE “+str(line)+str(flag-befspace)+” >”, end=’ ‘)
flag += 1
flag = skip(prog, flag)
else:
print(“< INT_LONG,”+num+” >”,end=’ ‘)
flag+=1
elif prog[flag].isalpha():#十六进制后接字符则报错
print(“< ERROR,INT_DEFINE,LINE “+str(line)+” “+str(flag-befspace)+” >”,end=’ ‘)
flag+=1
flag = skip(prog, flag)
else:
print(“< INT,” + num + “ >”, end=’ ‘)
pass
elif prog[flag+1]==’.’:#0.开头为浮点数
tmp=flag
flag+=2
haspro=False#是否有+、-、e
hasneg=False
haseE=False
while prog[flag].isdigit() or prog[flag].lower()==’e’ or prog[flag]==’-‘ or prog[flag]==’+’:
if (prog[flag]==’+’ or prog[flag]==’-‘) and not haseE:break#需满足一些条件
elif (prog[flag] == ‘+’ or prog[flag] == ‘-‘) and (haspro or hasneg):break
elif (prog[flag]==’+’ or prog[flag]==’-‘) and haseE:
if prog[flag-1].lower()!=’e’:break
else:
if prog[flag]==’+’:haspro=True
else:hasneg=True
elif prog[flag].lower()==’e’ and haseE:break
elif prog[flag]==’+’:haspro=True
elif prog[flag] ==’-‘:hasneg = True
elif prog[flag].lower()==’e’:haseE=True
flag+=1
befpron=str(reduce(lambda x,y:x+y,prog[tmp:flag]))#拼接
findpro=befpron.find(‘+’)
findneg=befpron.find(‘-‘)
findeE=befpron.lower().find(‘e’)
IsFLOAT=False
IsERROR=False
theskip=flag
if prog[flag].lower()==’f’:#0.23e+002f浮点数声明判断
if prog[flag+1].isalnum():#0.23e+002f3报错
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag+1- befspace) + “ >”,
end=’ ‘)
IsERROR=True
flag = skip(prog, flag+2)
else:
IsFLOAT=True
theskip=flag+1
elif prog[flag].isalnum():#同上
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag-befspace) + “ >”,
end=’ ‘)
IsERROR = True
flag = skip(prog, flag+1)
if not IsERROR:
if (hasneg or haspro) and haseE:#有符号指数型
pos=(findpro if haspro>hasneg else findneg)
if pos!=flag-tmp-1:#处理e+23
suffix=int(reduce(lambda x,y:str(int(x)10+int(y)),befpron[pos+1:]))
suffix=suffix if haspro else -suffix
num=str(float(str(reduce(lambda x,y:x+y,befpron[0:findeE])))10**suffix)
if IsFLOAT:
print(“< FLOAT,” + num + “ >”, end=’ ‘)
flag=theskip
else:print(“< DOUBLE,” + num + “ >”, end=’ ‘)
else:#0.12e+报错
print(“< ERROR,FLOAT_DEFINE,LINE “ + str(line) + “ “ + str(flag - befspace) + “ >”,
end=’ ‘)
flag = skip(prog, flag)
elif haseE and not (haspro or hasneg):#无符号指数e
if findeE!=flag-tmp-1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE+1:])))
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE])))
* 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag=theskip
else:print("< DOUBLE," + num + " >", end=‘ ‘)
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
flag = skip(prog, flag)
pass
else:#单纯的浮点数0.23
num=str(float(befpron))
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag = theskip
else:
print("< DOUBLE," + num + " >", end=‘ ‘)
elif prog[flag+1].isdigit():#浮点数和八进制均有可能
tmp=flag
flag+=2
haspoint=False#是否有小数点、符号、指数
haseE=False
haspro=False
hasneg=False
while prog[flag].isdigit() or prog[flag].lower()==‘e‘ or prog[flag]==‘-‘ or prog[flag]==‘+‘ or prog[flag]==‘.‘:
if (prog[flag]==‘+‘ or prog[flag]==‘-‘) and not haseE:break#需满足的一些条件
elif (prog[flag] == ‘+‘ or prog[flag] == ‘-‘) and (haspro or hasneg):
break
elif (prog[flag]==‘+‘ or prog[flag]==‘-‘) and haseE:
if prog[flag-1].lower()!=‘e‘:break
else:
if prog[flag]==‘+‘:haspro=True
else:hasneg=True
elif (haseE or haspoint) and prog[flag]==‘.‘:break
elif prog[flag].lower() == ‘e‘ and haseE:break
elif prog[flag]==‘.‘ and haspoint:break
elif prog[flag] == ‘+‘ :haspro = True
elif prog[flag] == ‘-‘:hasneg = True
elif prog[flag].lower() == ‘e‘ :haseE = True
elif prog[flag]==‘.‘ :haspoint=True
flag+=1
befpron = str(reduce(lambda x, y: x + y, prog[tmp:flag]))#拼接
findpro = befpron.find(‘+‘)
findneg = befpron.find(‘-‘)
findeE = befpron.lower().find(‘e‘)
findpoint = befpron.find(‘.‘)
IsFLOAT = False
IsERROR = False
theskip = flag
if (haspoint or haseE) and prog[flag].lower() == ‘f‘:#同上
if prog[flag + 1].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag + 1 - befspace) + " >",
end=‘ ‘)
IsERROR = True
flag = skip(prog, flag + 2)
else:
IsFLOAT = True
theskip = flag + 1
elif prog[flag].isalnum():
print("< ERROR,NUM_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
IsERROR = True
flag = skip(prog, flag + 1)
if not IsERROR:
if haseE and (haspro or hasneg):
pos = (findpro if haspro > hasneg else findneg)
if pos != flag - tmp - 1: # 处理e+23
suffix = int(reduce(lambda x, y: str(int(x) * 10 + int(y)), befpron[pos + 1:]))
suffix = suffix if haspro else -suffix
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag=theskip
else:print("< DOUBLE," + num + " >", end=‘ ‘)
else: # 0.12e+报错
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
flag = skip(prog, flag)
elif haseE :
if haseE!=flag-tmp-1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE + 1:])))
num=str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE])))*10**suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag=theskip
else:print("< DOUBLE," + num + " >", end=‘ ‘)
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
flag = skip(prog, flag)
elif haspoint:
if IsFLOAT:
print("< FLOAT," + str(float(befpron)) + " >", end=‘ ‘)
flag = theskip
else:
print("< DOUBLE," + str(float(befpron)) + " >", end=‘ ‘)
else:
isoctal=True
for char in befpron:
if int(char)>=8:isoctal=False
if not isoctal:
print("< ERROR,OCTAL_OUTOFRANG,LINE"+str(line)+" "+str(flag-befspace)+" >",end=‘ ‘)
else:
print("< INT,"+str(int(befpron,8))+">" ,end=‘ ‘)
elif prog[flag+1] in operetor or prog[flag+1] in delimiter:#简单的0
print("< INT,0 >",end=‘ ‘)
flag+=1
else:#其他字母或符号抛异常
flag+=2
print("< ERROR,NUM_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >",end=‘ ‘)
else:#非0开头可为浮点数或整数
haspoint=False#同上
haseE=False
haspro = False
hasneg = False
tmp=flag
while prog[flag].isdigit() or prog[flag].lower() == ‘e‘ or prog[flag] == ‘-‘ or prog[flag] == ‘+‘ or
prog[flag] == ‘.‘:
if (prog[flag] == ‘+‘ or prog[flag] == ‘-‘) and not haseE:break
elif (prog[flag] == ‘+‘ or prog[flag] == ‘-‘) and (haspro or hasneg):break
elif (prog[flag] == ‘+‘ or prog[flag] == ‘-‘) and haseE:
if prog[flag - 1].lower() != ‘e‘:
break
else:
if prog[flag] == ‘+‘:
haspro = True
else:
hasneg = True
elif (haseE or haspoint) and prog[flag] == ‘.‘:break
elif prog[flag].lower() == ‘e‘ and haseE:break
elif prog[flag] == ‘.‘ and haspoint:break
elif prog[flag] == ‘+‘:
haspro = True
elif prog[flag] == ‘-‘:
hasneg = True
elif prog[flag].lower() == ‘e‘:
haseE = True
elif prog[flag] == ‘.‘:
haspoint = True
flag += 1
befpron = str(reduce(lambda x, y: x + y, prog[tmp:flag]))
findpro = befpron.find(‘+‘)
findneg = befpron.find(‘-‘)
findeE = befpron.lower().find(‘e‘)
findpoint = befpron.find(‘.‘)
IsFLOAT = False
IsERROR = False
theskip = flag
if (haspoint or haseE) and prog[flag].lower() == ‘f‘:
if prog[flag + 1].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag + 1 - befspace) + " >",
end=‘ ‘)
IsERROR = True
flag = skip(prog, flag + 2)
else:
IsFLOAT = True
theskip = flag + 1
elif prog[flag].isalnum():
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
IsERROR = True
flag = skip(prog, flag + 1)
if not IsERROR:
if haseE and (haspro or hasneg):#有符号指数型
pos = (findpro if haspro > hasneg else findneg)
if pos != flag - tmp - 1: # 处理e+23
suffix = int(reduce(lambda x, y: str(int(x) * 10 + int(y)), befpron[pos + 1:]))
suffix = suffix if haspro else -suffix
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag = theskip
else:
print("< DOUBLE," + num + " >", end=‘ ‘)
else: # 0.12e+报错
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
flag = skip(prog, flag)
elif haseE :#指数型浮点数
if haseE != flag - tmp - 1:
suffix = int(str(reduce(lambda x, y: x + y, befpron[findeE + 1:])))
num = str(float(str(reduce(lambda x, y: x + y, befpron[0:findeE]))) * 10 ** suffix)
if IsFLOAT:
print("< FLOAT," + num + " >", end=‘ ‘)
flag = theskip
else:
print("< DOUBLE," + num + " >", end=‘ ‘)
else:
print("< ERROR,FLOAT_DEFINE,LINE " + str(line) + " " + str(flag - befspace) + " >",
end=‘ ‘)
flag = skip(prog, flag)
elif haspoint:#小数点浮点数
if IsFLOAT:
print("< FLOAT," + str(float(befpron)) + " >", end=‘ ‘)
flag = theskip
else:
print("< DOUBLE," + str(float(befpron)) + " >", end=‘ ‘)
else:#纯整数
num = str(int(befpron))
if prog[flag].lower() == ‘u‘:
if prog[flag + 1].lower() == ‘l‘:
if prog[flag + 2].isalnum():
print("<ERROR,INT_UN_LONG_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >", end=‘ ‘)
flag += 3
flag = skip(prog, flag)
else:
print("<INT_UN_LONG," + num + ">", end=‘ ‘)
flag += 2
elif prog[flag + 1].isalnum():
print("< ERROR,INT_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=‘ ‘)
flag += 1
flag = skip(prog, flag)
else:
print("<INT_UN," + num + ">", end=‘ ‘)
elif prog[flag].lower() == ‘l‘:
if prog[flag + 1].lower() == ‘u‘:
if prog[flag + 2].isalnum():
print("< ERROR,INT_UN_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=‘ ‘)
flag += 3
flag = skip(prog, flag)
else:
print("< INT_UN_LONG," + num + ">", end=‘ ‘)
flag += 2
elif prog[flag + 1].isalnum():
print("< ERROR,INT_LONG_DEFINE ,LINE"+str(line)+" "+str(flag-befspace)+" >", end=‘ ‘)
flag += 1
flag = skip(prog, flag)
else:
print("< INT_LONG," + num + " >", end=‘ ‘)
elif prog[flag].isalpha():
print("< ERROR,INT_DEFINE,LINE"+str(line)+" "+str(flag-befspace)+" >", end=‘ ‘)
flag = skip(prog, flag)
else:
print("< INT," + num + " >", end=‘ ‘)</pre>
边上也有同学想用Python 的正则表达式,直接将符合前言中的那几种分别表示出来,进行直接匹配。但是其中还是有些复杂的结构无法表示,并且容易出误判,而无法显示出错位置。
不得不说其实不少代码冗余,本文全部的代码实现,欢迎拍砖 https://github.com/single-wolf/show-me-the-code/blob/master/analyzer.py
最近也喜欢上Python ,分享个练手的有趣东西 https://github.com/Show-Me-the-Code/show-me-the-code
以上是关于C词法分析器的Python简单实现的主要内容,如果未能解决你的问题,请参考以下文章