c语言词法分析初试(C++实现)

来源:互联网 发布:知乎 中国美院象山校区 编辑:程序博客网 时间:2024/05/05 00:34

开篇

所谓词法分析,就是将源代码按照构词规则分解成一系列单词符号。单词是语言中具有独立意义的最小单位,包括关键字、标识符、运算符、界符和常量等。

分析

源程序分解的单词可以归为以下几类:

1. 关键字

c语言关键字有不少,这里只列举其中一部分:
for,while,do,continue,if,else,
char,int,double,return

2. 变量名

变量名的规则为:
由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为

letter = {a|b|...|z|A|B|...|Z}digit = {0|1|...|9}id = (letter|_ )(letter| _| digit)*

转化为NFA图为:
这里写图片描述

3. 数字常量

变量名的规则为:

由字母数字以及下划线组成,开头必须为下划线或字母
用正规表达式描述为

digit = {0|1|...|9}id = digit((digit)*|.(digit)*)(ℇ | (E|e)digit(digit)*)

转化为NFA图为:

这里写图片描述

4. 界符和运算符

()[]{};+-*/等等
本人这里只加入了+-和=,因为*/以及&^%等都是类似的,这里只是试验,不需要全部写。

思路

根据有限自动机的概念,将整个源代码分解的规则转化为一个一个状态,用switch或if/else将状态之间的转移描述出来即可。这里省略了将常量以及变量名插入到符号表返回指针的那一步。
简单一个伪代码描述这个过程

    初始化 状态=0    switch(状态):    {        case 0:            接收源码            if(符合1规则)                处理,状态=1            else if(符合2规则)                处理,状态=2            else                ...        case 1:        case 2:        case ...    }

代码

#include <iostream>#include <algorithm>#include <vector>#include <stack>#include <cstdlib>#include <cstdio>using namespace std;const int KEY_NUM = 11;const char* KEY_SET[] = {    "for",    "while",    "do",    "continue",    "if",    "else",    "char",    "int",    "double",    "float",    "return"};const char* FILE_NAME = "/Users/shiyi/abc.c";char strBuffer[1026];char strBox[200];void retract(char* &str){    str--;}char getChar(char* &str){    return *str++;}bool isNum(char ch){    if(ch >= '0' && ch <= '9')        return true;    return false;}bool isLetter(char ch){    if(ch >= 'a' && ch <= 'z')        return true;    if(ch >= 'A' && ch <= 'Z')        return true;    return false;}int findKey(char* ch){    for(int i=0; i<KEY_NUM; i++)    {        if(0 == strcmp(ch, KEY_SET[i]))            return i+1;    }    return 0;}void output(char* key, char* value){    printf("< %s, %s >\n", key, value);}//扫描void scanner(char* str){    int state = 0;    char ch = ' ';    int pos = 0;    while(ch != '\0')    {    switch (state) {        case 0:        {            ch = getChar(str);            switch (ch) {                case ' ':                    pos = 0;                    break;                case '[':                case ']':                case '(':                case ')':                case '{':                case '}': {                    char temp[2];                    temp[0] = ch;                    output(temp, "-");                    pos = 0;                    break;                }                case '\'': {                    state = 0;                    while((ch = getChar(str)) != '\'' && ch != '\0')                    {                        strBox[pos++] = ch;                    }                    if(ch == '\0')                    {                        //error                    }                    strBox[pos] = '\0';                    output("string", strBox);                    pos = 0;                    break;                }                case '"': {                    state = 0;                    while((ch = getChar(str)) != '"' && ch != '\0')                    {                        strBox[pos++] = ch;                    }                    if(ch == '\0')                    {                        ////                        output("error", strBox);                    }                    else                    {                        strBox[pos] = '\0';                        output("string", strBox);                        pos = 0;                    }                    break;                }                case '+': {                    state = 0;                    ch = getChar(str);                    switch (ch) {                        case '+':                            output("++", "-");                            pos = 0;                            break;                        case '=':                            output("+=", "-");                            pos = 0;                            break;                        default:                            retract(str);                            output("+", "-");                            pos = 0;                            break;                    }                }                case '-': {                    state = 0;                    ch = getChar(str);                    switch (ch) {                        case '-':                            output("--", "-");                            pos = 0;                            break;                        case '=':                            output("-=", "-");                            pos = 0;                            break;                        default:                            retract(str);                            output("-", "-");                            pos = 0;                            break;                    }                }                case '=': {                    state = 0;                    ch = getChar(str);                    switch (ch) {                        case '=':                            output("==", "-");                            pos = 0;                            break;                        default:                            retract(str);                            output("=", "-");                            pos = 0;                            break;                    }                    break;                }                default: {                    if(isNum(ch))                    {                        strBox[pos++] = ch;                        state = 2;                    }                    else if(isLetter(ch) || ch == '_')                    {                        strBox[pos++] = ch;                        state = 1;                    }                    break;                }            }            break;        }        case 1:        {            while(true)            {                ch = getChar(str);                if(isLetter(ch) || isNum(ch) || ch == '_')                    strBox[pos++] = ch;                else                {                    strBox[pos] = '\0';                    int id = findKey(strBox);                    if(id == 0)                        output("id", strBox);                    else                        output("key", strBox);                    retract(str);                    pos = 0;                    state = 0;                    break;                }            }            break;        }        case 2:        {            while(true)            {                ch = getChar(str);                if(isNum(ch))                    strBox[pos++] = ch;                else if(ch == '.')                {                    strBox[pos++] = ch;                    state = 3;                    break;                }                else if(ch == 'E' || ch == 'e')                {                    strBox[pos++] = ch;                    state = 4;                    break;                }                else                {                    strBox[pos] = '\0';                    output("number", strBox);                    retract(str);                    pos = 0;                    state = 0;                    break;                }            }            break;        }        case 3:        {            while(true)            {                ch = getChar(str);                if(isNum(ch))                    strBox[pos++] = ch;                else if(ch == 'E' || ch == 'e')                {                    strBox[pos++] = ch;                    state = 4;                }                else                {                    strBox[pos] = '\0';                    output("number", strBox);                    retract(str);                    pos = 0;                    state = 0;                    break;                }            }            break;        }        case 4:        {            while(true)            {                ch = getChar(str);                if(isNum(ch))                    strBox[pos++] = ch;                else                {                    strBox[pos] = '\0';                    output("number", strBox);                    retract(str);                    pos = 0;                    state = 0;                    break;                }            }            break;        }        default:            ////            output("error", strBox);            break;    }    }}int main(){    FILE* file = fopen(FILE_NAME, "r");    while(NULL != fgets(strBuffer, 1024, file))    {        scanner(strBuffer);    }    return 0;}

测试结果

源程序

int main(){    int a = 5;    for(int i=0; i<5; i++)      {        a++;    }    return 0;}

输出结果

< key, int >< id, main >< (, - >< ), - >< {, - >< key, int >< id, a >< =, - >< number, 5 >< key, for >< (, - >< key, int >< id, i >< =, - >< number, 0 >< id, i >< number, 5 >< id, i >< ++, - >< -, - >< =, - >< ), - >< {, - >< id, a >< ++, - >< -, - >< =, - >< }, - >< key, return >< number, 0 >< }, - >Program ended with exit code: 0
0 0