Python实现的C语言词法分析
来源:互联网 发布:php项目开发文档 编辑:程序博客网 时间:2024/04/28 09:21
编译原理课上的一个实验是做一个编译器前端的词法分析器,我选择了用Python来写C语言的词法分析。
词法分析器的功能是输入源程序,输出单词符号。当初定义Token(单词种别,属性值)序列的时候,是将单词种别用数字来表示,后来再做语法分析的时候,发现用数字是不太合理的,所以又对单词的种别码进行了一番修改。
我的程序的总体思路是先对源程序进行一遍扫描,将多余的空格和注释去除,然后再读一遍已经进行过预处理的源程序,进行单词的识别,转换成二元组,保存到token文件中,并建立符号表对标识符进行管理,如果发现了错误,对其的位置和错误信息进行打印。
在对单词的识别部分,我采用了有穷自动机的理论来进行识别。这样就可以根据现在的状态和输入符号决定其后继行为。因此在对单词的识别中,我画了很多的状态图来识别不同的单词,如字符串、数字等等。状态图的绘制中,本来想用visio来画的,后来的后来觉得太麻烦了,还是用了最快的手画的方法。
图1.注释的状态转换图
图2.标识符的状态转换图
图3.字符串的状态转换图
图4.界符的状态转换图
图5.整常数、浮点常数的状态转换图
图6.字符常数的状态转换图
关于错误处理方面,对于词法分析阶段可能遇到的几种错误(如下图所示的四种),我对其中的前三种都进行了相应的处理。但是第三点做得不太好,对字符常数中可以出现的字符限制得有点过于严格,例如分号等在我的词法分析器中是不能在字符串中出现的。
图7.词法分析中的四种错误
测试程序如下,内包含主要的C语言的各种语句,含有少量的错误:
int main(){ int _a; char ch = 'f; floatb,centigrade,fahrj@enheit; char fd = '\n'; printf("please inputa); scanf("%d",&a); /*mycomment1or***2*/ printf("please inputb"); scanf("%f",&b); if (a==8.1.6) { centigrade=095*(b-32)/9; /*itismyc5435omment*/ printf("TheCentigrade is ",centigrade); /*mess/age*/ } else if (a!=0) { fahrenheit=(9/5.0)*b++32; /*mycontent*/ printf("TheFahrenheit is fahrenheit); /*hello****/ } return 0;}
运行结果如下图所示:
图8.测试程序的错误报告
这是用Python写的第一个稍微像点样的东西,所以很多地方写的不大好,代码结构也是有点混乱。总而言之,就是在这样的条件下把编译原理的第一次实验给写完了。接下来是我的水水的代码了。
# -*- coding: utf-8 -*-
'''
Created on 2012-10-18
@author: zouliping

Lexical analyser for a small subset of C.

Workflow: getMyProm() loads the source text, outOfComment() removes the
/* ... */ comments, and analysis() is then called repeatedly; every call
recognises one token and publishes it through the module globals _syn
(category code) and _value (lexeme).  Category codes that start with '@-'
are not tokens: '@-1' marks a line break, '@-2' ... '@-8' are lexical
errors.

Python-2-only constructs (print statement, string.letters, cmp) are avoided
so that the script also runs on Python 3.
'''
import string

# The 32 keywords of C.
_key = ("auto", "break", "case", "char", "const", "continue", "default", "do",
        "double", "else", "enum", "extern", "float", "for", "goto", "if",
        "int", "long", "register", "return", "short", "signed", "static",
        "sizeof", "struct", "switch", "typedef", "union", "unsigned", "void",
        "volatile", "while")
_abnormalChar = '@#$%^&*~'  # illegal characters that may turn up in an identifier
_syn = ''        # category code of the most recent token
_p = 0           # read position inside the text being analysed
_value = ''      # lexeme of the most recent token
_content = ''    # program text
# The three state variables below are no longer used by analysis(); they are
# kept (always 0) only so that code reading them keeps working.
_mstate = 0      # string-literal state
_cstate = 0      # character-literal state
_dstate = 0      # integer / floating-point state
_line = 1        # current line number of the program
_mysymbol = []   # symbol table

_blank = ' \t\r'                       # skipped between tokens
_twoCharOps = ('<=', '>=', '!=', '++', '--', '==', '&&', '||')
_oneCharOps = '<>!+-=&|*/;(){}[],'
_identChars = string.ascii_letters + string.digits + '_'
_numberChars = string.digits + '.' + string.ascii_letters
# Illegal characters that are absorbed into an identifier so that the whole
# word can be reported as one error.  Characters that this lexer knows as
# operators ('&', '*') are excluded, otherwise "a*b" would be rejected.
_identJunk = ''.join([c for c in _abnormalChar if c not in _oneCharOps])

# Error code -> (text printed before the lexeme, text printed after it).
_errorText = {
    '@-2': ('字符串 ', ' 不封闭! Error in line '),
    '@-3': ('数字 ', ' 错误,不能以0开头! Error in line '),
    '@-4': ('字符 ', ' 不封闭! Error in line '),
    '@-5': ('数字 ', ' 不合法! Error in line '),
    '@-6': ('标识符', ' 不能包含非法字符!Error in line '),
    '@-7': ('数字 ', ' 不合法,包含字母! Error in line '),
    '@-8': ('字符 ', ' 无法识别! Error in line '),
}


def outOfComment():
    '''Remove every /* ... */ comment from the global _content.

    Each comment is located by position, so identical comments appearing
    several times are removed one by one without touching other text.
    Line breaks that were inside a comment are kept so that the line
    numbers reported later still match the input file.  An unterminated
    comment is left in place.  String literals are not recognised here, so
    a "/*" inside a string is treated as a comment start.
    '''
    global _content
    text = _content
    kept = []
    pos = 0
    while True:
        start = text.find('/*', pos)
        if start == -1:
            break
        end = text.find('*/', start + 2)
        if end == -1:
            break
        kept.append(text[pos:start])
        kept.append('\n' * text.count('\n', start, end))
        pos = end + 2
    kept.append(text[pos:])
    _content = ''.join(kept)


def getMyProm(path=r'E://test.txt'):
    '''Append the program stored in the file *path* to the global _content.

    Leading white space of every line is dropped.  A line that consists of
    white space only is reduced to its line break instead of disappearing,
    so the number of lines is preserved.
    '''
    global _content
    pieces = [_content]
    with open(path, 'r') as source:
        for line in source:
            stripped = line.lstrip()
            if not stripped and line.endswith('\n'):
                stripped = '\n'
            pieces.append(stripped)
    _content = ''.join(pieces)


def _scanIdentifier(text, start):
    '''Scan letter(letter|digit)*; return (category code, end index).'''
    size = len(text)
    end = start
    while end < size and (text[end] in _identChars or text[end] in _identJunk):
        end += 1
    word = text[start:end]
    if any(c in _identJunk for c in word):
        return '@-6', end          # identifier contains an illegal character
    if word in _key:
        return word.upper(), end   # keyword
    return 'ID', end


def _scanString(text, start):
    '''Scan a string literal opened at *start*; return (code, end index).

    The literal ends at the first unescaped double quote.  Reaching a line
    break or the end of the text first yields '@-2' (string not closed).
    '''
    size = len(text)
    end = start + 1
    while end < size and text[end] != '\n':
        c = text[end]
        if c == '\\' and end + 1 < size and text[end + 1] != '\n':
            end += 2               # escaped character, e.g. \" or \\
            continue
        end += 1
        if c == '"':
            return 'STRING', end
    return '@-2', end


def _scanChar(text, start):
    '''Scan a character constant opened at *start*; return (code, end index).

    Accepted forms are 'x' and '\\x'.  Anything else, including an empty
    constant, yields '@-4'.
    '''
    size = len(text)
    end = start + 1
    hasBody = False
    if end < size and text[end] == '\\':
        hasBody = True
        if end + 1 < size and text[end + 1] != '\n':
            end += 2
        else:
            end += 1
    elif end < size and text[end] not in "'\n":
        hasBody = True
        end += 1
    if end < size and text[end] == "'":
        end += 1
        if hasBody:
            return 'CHARACTER', end
    return '@-4', end


def _scanNumber(text, start):
    '''Scan an integer or floating-point constant; return (code, end index).'''
    size = len(text)
    end = start
    while end < size and text[end] in _numberChars:
        end += 1
    lexeme = text[start:end]
    if any(c in string.ascii_letters for c in lexeme):
        return '@-7', end          # digits mixed with letters, e.g. 12AB56
    if len(lexeme) > 1 and lexeme[0] == '0' and lexeme[1] != '.':
        return '@-3', end          # number starts with 0
    dots = lexeme.count('.')
    if dots == 0:
        return 'DIGIT', end
    if dots == 1:
        return 'FRACTION', end
    return '@-5', end              # several dots, e.g. 1.2.3


def analysis(mystr):
    '''Recognise the next token of *mystr*, starting at the global _p.

    Results are published through module globals: _syn receives the
    category code, _value the lexeme and _p the position just behind the
    token.  Identifiers are additionally entered into the symbol table.
    Blanks, tabs and carriage returns before the token are skipped.

    Category codes: 'ID', the upper-cased keyword, 'STRING', 'CHARACTER',
    'DIGIT', 'FRACTION', the operator/delimiter itself, '@-1' for a line
    break, '@-2' ... '@-7' for the errors listed in _errorText and '@-8'
    for a character that starts no known token.  When only blanks (or
    nothing) are left, _syn and _value are both set to '' and _p is moved
    to the end of the text.
    '''
    global _p, _value, _syn
    size = len(mystr)
    pos = _p
    # Reset first so that no result of a previous call can leak through.
    _syn = ''
    _value = ''

    while pos < size and mystr[pos] in _blank:
        pos += 1
    if pos >= size:
        _p = size
        return

    ch = mystr[pos]
    if ch in string.ascii_letters or ch == '_':
        code, end = _scanIdentifier(mystr, pos)
    elif ch == '"':
        code, end = _scanString(mystr, pos)
    elif ch in string.digits:
        code, end = _scanNumber(mystr, pos)
    elif ch == "'":
        code, end = _scanChar(mystr, pos)
    elif mystr[pos:pos + 2] in _twoCharOps:
        code, end = mystr[pos:pos + 2], pos + 2
    elif ch in _oneCharOps:
        code, end = ch, pos + 1
    elif ch == '\n':
        code, end = '@-1', pos + 1
    else:
        code, end = '@-8', pos + 1

    _syn = code
    # A line break carries no lexeme.
    _value = '' if code == '@-1' else mystr[pos:end]
    _p = end
    if code == 'ID':
        inSymbolTable(_value)


def inSymbolTable(token):
    '''Enter *token* into the symbol table unless it is already there.'''
    if token not in _mysymbol:
        _mysymbol.append(token)


def main(source_path=r'E://test.txt', token_path=r'E://token.txt',
         symbol_path=r'E://symbol_table.txt'):
    '''Analyse *source_path* and write the token file and the symbol table.

    Lexical errors are printed together with the line they occur in; all
    other tokens are written to *token_path* as "<code>@<lexeme>".
    '''
    global _content, _p, _line
    # Start from a clean state so that main() may be called more than once.
    _content = ''
    _p = 0
    _line = 1
    del _mysymbol[:]

    getMyProm(source_path)
    outOfComment()

    with open(token_path, 'w') as tokenFile:
        while _p < len(_content):
            analysis(_content)
            if _syn == '':
                continue           # only blanks were left
            if _syn == '@-1':
                _line += 1         # count the lines of the program
            elif _syn in _errorText:
                before, after = _errorText[_syn]
                print(before + _value + after + str(_line))
            else:
                tokenFile.write(str(_syn) + '@' + _value + '\n')

    with open(symbol_path, 'w') as symbolTableFile:
        symbolTableFile.write('入口地址\t变量名\n')
        i = 0
        for symbolItem in _mysymbol:
            symbolTableFile.write(str(i) + '\t\t\t' + symbolItem + '\n')
            i += 1


if __name__ == '__main__':
    main()
- Python实现的C语言词法分析
- C语言实现词法分析
- c语言词法分析初试(C++实现)
- c语言 词法分析
- C语言,词法分析的“贪心法”
- C语言实现的词法分析器
- C语言实现的词法分析器
- c语言词法分析器的简单实现
- UCC--C语言词法分析
- C语言实现词法分析器
- C语言实现词法分析器
- C语言实现词法分析器
- 纯C实现的词法分析和lex实现的词法分析的对比
- 用java写的C语言词法分析
- 类C语言词法分析器的设计--c++实现
- c语言词法分析器的一个简单实现
- 一个简单词法分析器的C语言实现
- 用C语言实现简单的词法分析器
- OV9650驱动程序跟踪
- java
- C#程序设计(十九)----10个随机数在列表框中显示出来
- 软考完小结
- iOS 游戏开发教程资源
- Python实现的C语言词法分析
- iOS开发有用的连接大全
- 两道经典概率题的讲解与拓展。 隔板法
- 我们怎样组合使用 Scrum 和 XP(《硝烟中的Scrum和XP - 我们如何实施 Scrum》)
- android笔记
- SQL变量的使用
- POJ 1036 Gangsters DP好题 多解
- 基于mini2440的ov9650摄像头裸机测试
- isKindOfClass和isMemberOfClass的区别