C语言实现的词法分析器

来源：互联网发布：nodejs mysql access 编辑：程序博客网时间：2024/04/28 20:21

昨天花了一整天的时间而且还参考老师发的代码才写好一个简陋的词法分析器，今天稍微记录一下吧。

这是编译原理课上的实验，这里先贴出题目要求：

字符集定义
1. <字符集>→ <字母>│<数字>│<单界符>
2. <字母>→ A│B│…│Z│a│b│…│z
3. <数字>→ 0│1│2│…│9
4. <单界符>→ +│-│*│/│=│<│>│(│)│[│]│:│.│;│,│'

单词集定义

5．<单词集>→ <保留字>│<双界符>│<标识符>│<常数>│<单界符>

6．<保留字>→and│array│begin│bool│call│case│char│constant│dim│do│else│end│false│for│if│input│integer│not│of│or│output│procedure│program│read│real│repeat│set│stop│then│to│true│until│var│while│write

7．<双界符>→ <>│<=│>=│:=│/*│*/│..

8．<标识符>→ <字母>│<标识符><数字>│<标识符><字母>

9．<常数>→ <整数>│<布尔常数>│<字符常数>

10．<整数>→ <数字>│<整数><数字>

11．<布尔常数>→true│false

12．<字符常数>→ ' 除{'}外的任意字符串'

测试程序与样板输出

测试程序1：程序名TEST1

and array begin bool call

case char constant dim do

else end false for if

input integer not of or

output procedure program read real

repeat set stop then to

true until var while write

abc 123 'EFG' ( ) * + , - . .. /

: := ; < <= <> = > >= [ ]

样板输出1：（要求在屏幕上显示）注：作为自身值的内容显示做了简化，便于检查。只有标识符和常数有显示，并且通过数字来区分他们的不同。

( 1 , - ) ( 2 , - ) (3 , - ) ( 4 , - ) ( 5 , - )

( 6 , - ) ( 7 , - ) (8 , - ) ( 9 , - ) (10 , - )

(11 , - ) (12 , -) (13 , - ) (14 , - ) (15 , - )

(16 , - ) (17 , -) (18 , - ) (19 , - ) (20 , - )

(21 , - ) (22 , -) (23 , - ) (24 , - ) (25 , - )

(26 , - ) (27 , -) (28 , - ) (29 , - ) (30 , - )

(31 , - ) (32 , -) (33 , - ) (34 , - ) (35 , - )

(36 , 1 ) (37 , 2) (38 , 3 ) (39 , - ) (40 , - )

(41 , - ) (43 , -) (44 , - ) (45 , - ) (46 , - )

(47 , - ) (48 , -) (50 , - ) (51 , - ) (52 , - )

(53 , - ) (54 , -) (55 , - ) (56 , - ) (57 , - )

(58 , - ) (59 , -) (60 , - )

测试程序2：程序名TEST2

program example2;

var A,B,C:integer;

X,Y:bool;

begin /* this is an example */

A:=B*C+37;

X:='ABC'

end.

样板输出2：（要求在屏幕上显示）

(23 , - ) (36 , 1) (52 , - ) (33 , - ) (36 , 2 )

(44 , - ) (36 , 3) (44 , - ) (36 , 4 ) (50 , - )

(17 , - ) (52 , -) (36 , 5 ) (44 , - ) (36 , 6 )

(50 , - ) ( 4 , -) (52 , - ) ( 3 , - ) (36 , 2 )

(51 , - ) (36 , 3) (41 , - ) (36 , 4 ) (43 , - )

(37 , 7 ) (52 , -) (36 , 5 ) (51 , - ) (38 , 8 )

(12 , - ) (46 , - )

接着来讲一下词法分析器，词法分析器是什么就不说了，主要说一下原理。

任何词法分析器都要满足一个原则，极大搜索原则，就是在符合词法定义的情况下进行超前搜索识别。

举个例子，“++”不会被分解为两个“+”运算符，而是被判定为自增运算符。

超前搜索识别的做法是每次超前读入一个字符，直到读入某个字符后当前单词不再满足词法定义为止，这时就完成了一个单词的识别，同时还要将这个多读入的字符退回缓冲区。

词法分析器的设计关键其实在于状态转换图，也就是课堂上老师讲的有穷自动机，转换图写好后，只要按流程编码就可以实现基本的分析器功能了。

图我懒得画了，就说说几个关键的地方，大家可以自己去画画

a.状态分3种：开始状态、中间状态跟终止状态。什么叫中间状态呢？就是说读入一个字符，不能马上判断该字符的种别，于是就进入了中间状态。

b.单词我这里主要分为四类：ID（保留字或标识符）、NUM（整数）、STRING（字符串常量）、SPECIAL（特殊字符），当然还有标记文件尾的ENDFILE跟非法字符ERROR。

这里采用一遍读取的方法，先从文件读取一段字符到缓冲区，然后从缓冲区逐个读出，根据当前状态以及转换图的流程，判定下一个状态，直至文件结束。

下面给出我写的代码

/*
 * A small table-driven lexical analyser.
 *
 * The program reads the file "./test4" line by line, splits it into tokens
 * with a hand-written state machine (getToken) and prints every token as a
 * pair "(category , value)".  Reserved words get the categories 1..35,
 * identifiers 36, integers 37, character constants 38 and the special
 * symbols 39..60.  Identifiers and constants additionally receive a running
 * number from a shared table; all other tokens print "-" as their value.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define FALSE 0
#define TRUE 1
#define MAXTOKENLEN 40   /* longest token text that is stored */
#define MAXRESERVED 35   /* number of reserved words */
#define MAXSPECIAL 23    /* slots in the special-symbol table */
#define MAXDEFINED 100   /* capacity of the identifier/constant table */
#define BUFLEN 256       /* size of the line buffer */

/* States of the scanner automaton. */
typedef enum
{
    /* start state */
    START,
    /* accepting state */
    DONE,
    /* intermediate states */
    INNUM, INSTRING, INID, INASSIGN, INCOMMENT, INOMIT, INLT, INGT
} StateType;

/* Token kinds. */
typedef enum
{
    /* end of input */
    ENDFILE,
    /* illegal character */
    ERROR,
    /* reserved word or identifier */
    ID,
    /* integer constant */
    NUM,
    /* character-string constant */
    STRING,
    /* special symbols */
    LPAREN, RPAREN, TIMES, PLUS, COMMA, MINUS, DOT, OMIT, OVER,
    COLON, ASSIGN, SEMI, LT, LEQT, NEQ, EQ, GT, GEQT, LBRAC, RBRAC
} TokenType;

TokenType currentToken;            /* kind of the most recently scanned token */
char tokenString[MAXTOKENLEN + 1]; /* text of the most recently scanned token */

/* Reserved words and their category codes. */
static const struct
{
    const char *word;
    int no;
} reservedWords[MAXRESERVED] = {
    {"and", 1}, {"array", 2}, {"begin", 3}, {"bool", 4}, {"call", 5},
    {"case", 6}, {"char", 7}, {"constant", 8}, {"dim", 9}, {"do", 10},
    {"else", 11}, {"end", 12}, {"false", 13}, {"for", 14}, {"if", 15},
    {"input", 16}, {"integer", 17}, {"not", 18}, {"of", 19}, {"or", 20},
    {"output", 21}, {"procedure", 22}, {"program", 23}, {"read", 24},
    {"real", 25}, {"repeat", 26}, {"set", 27}, {"stop", 28}, {"then", 29},
    {"to", 30}, {"true", 31}, {"until", 32}, {"var", 33}, {"while", 34},
    {"write", 35}
};

/*
 * Category codes of constants and special symbols.  Only 22 of the
 * MAXSPECIAL slots are initialised; the remaining slot is zero-filled,
 * i.e. {ENDFILE, 0}, which yields the same 0 that specialLookup returns
 * for "not found".
 */
static const struct
{
    TokenType tok;
    int no;
} tokens[MAXSPECIAL] = {
    {NUM, 37}, {STRING, 38}, {LPAREN, 39}, {RPAREN, 40}, {TIMES, 41},
    {PLUS, 43}, {COMMA, 44}, {MINUS, 45}, {DOT, 46}, {OMIT, 47},
    {OVER, 48}, {COLON, 50}, {ASSIGN, 51}, {SEMI, 52}, {LT, 53},
    {LEQT, 54}, {NEQ, 55}, {EQ, 56}, {GT, 57}, {GEQT, 58},
    {LBRAC, 59}, {RBRAC, 60}
};

/* Identifiers and constants seen so far, numbered in order of appearance. */
static struct
{
    char *var;
    int no;
} defined[MAXDEFINED];

static FILE *source;          /* input file */
static char lineBuf[BUFLEN];  /* current line */
static int linepos = 0;       /* read position inside lineBuf */
static int bufsize = 0;       /* number of characters in lineBuf */
static int EOF_flag = FALSE;  /* set once the end of the file was hit */
static int lineno;            /* number of lines read so far */
static int definedno;         /* number of entries used in defined[] */
static int expno;             /* pairs already printed on the current output row */

/*
 * Return the next input character, refilling the line buffer when it is
 * exhausted.  Returns EOF (and sets EOF_flag) when no more input exists.
 * The character is converted through unsigned char so that bytes >= 0x80
 * never become negative: a negative value other than EOF must not be
 * passed to the <ctype.h> functions, and byte 0xFF must not look like EOF.
 */
static int getNextChar(void)
{
    if (linepos >= bufsize) {
        lineno++;
        if (fgets(lineBuf, sizeof lineBuf, source) == NULL) {
            EOF_flag = TRUE;
            return EOF;
        }
        bufsize = (int)strlen(lineBuf);
        linepos = 0;
    }
    return (unsigned char)lineBuf[linepos++];
}

/* Push the last character back into the buffer (no-op once EOF was hit). */
void ungetNextChar(void)
{
    if (!EOF_flag)
        linepos--;
}

/*
 * Return the category code of the reserved word s, or 36 when s is not a
 * reserved word and is therefore an identifier.  tk is not used.
 */
int idLookup(TokenType tk, char *s)
{
    (void)tk;
    for (int i = 0; i < MAXRESERVED; i++) {
        if (strcmp(reservedWords[i].word, s) == 0)
            return reservedWords[i].no;
    }
    return 36;
}

/*
 * Return the running number of the identifier/constant s, registering it
 * first when it has not been seen before.  The program terminates when the
 * table is full or memory is exhausted.
 */
int definedLookup(char *s)
{
    for (int i = 0; i < definedno; i++) {
        if (strcmp(s, defined[i].var) == 0)
            return defined[i].no;
    }

    if (definedno >= MAXDEFINED) {
        fprintf(stderr, "标识符/常数表已满（最多%d项），程序终止\n", MAXDEFINED);
        exit(1);
    }

    /* +1 for the terminating NUL, which strlen does not count. */
    size_t len = strlen(s) + 1;
    char *copy = malloc(len);
    if (copy == NULL) {
        fprintf(stderr, "内存不足，程序终止\n");
        exit(1);
    }
    memcpy(copy, s, len);

    defined[definedno].var = copy;
    defined[definedno].no = definedno + 1;
    return ++definedno;
}

/* Return the category code of a constant/special token, or 0 if unknown. */
int specialLookup(TokenType tk)
{
    for (int i = 0; i < MAXSPECIAL; i++) {
        if (tk == tokens[i].tok)
            return tokens[i].no;
    }
    return 0;
}

/* Map a token to its category code (linear search in the proper table). */
int tokenLookup(TokenType tk, char *s)
{
    if (tk == ID)
        return idLookup(tk, s);
    return specialLookup(tk);
}

/*
 * Print one token as "(category , value)".  Identifiers (36), integers (37)
 * and character constants (38) print their running number, everything else
 * prints "-".  A line break is emitted after every fifth pair.
 */
void printToken1(TokenType tk, char *s)
{
    int code = tokenLookup(tk, s);

    if (code == 36 || code == 37 || code == 38)
        printf("(%d , %d)", code, definedLookup(s));
    else
        printf("(%d , -)", code);

    if (expno == 4) {
        printf("\n");
        expno = 0;
    } else {
        expno++;
    }
}

/*
 * Scan and return the next token.  The token kind is also stored in
 * currentToken and its text in tokenString (truncated to MAXTOKENLEN
 * characters).  Comments and white space are skipped.  The program is
 * terminated for a character constant without closing quote on its line
 * and for a comment whose delimiters are on different lines.
 */
TokenType getToken(void)
{
    int stringline = 0;        /* line on which a character constant started */
    int commentline = 0;       /* line on which a comment started */
    int tokenStringIndex = 0;  /* next free position in tokenString */
    StateType state = START;   /* every token starts in the start state */
    int save;                  /* whether the current character belongs to the token text */

    while (state != DONE) {
        int c = getNextChar();
        save = TRUE;

        switch (state) {
        case START:
            if (isdigit(c)) {
                state = INNUM;
            } else if (c == '\'') {
                state = INSTRING;
                stringline = lineno;
            } else if (isalpha(c)) {
                state = INID;
            } else if (c == ':') {
                state = INASSIGN;
            } else if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                /* white space; '\r' is included so CRLF files are accepted */
                save = FALSE;
            } else if (c == '/') {
                int next = getNextChar();
                if (next == '*') {
                    save = FALSE;
                    state = INCOMMENT;
                    commentline = lineno;
                } else {
                    /* plain division sign: give the look-ahead back */
                    state = DONE;
                    currentToken = OVER;
                    ungetNextChar();
                }
            } else if (c == '.') {
                state = INOMIT;
            } else if (c == '<') {
                state = INLT;
            } else if (c == '>') {
                state = INGT;
            } else {
                /* single-character tokens finish immediately */
                state = DONE;
                switch (c) {
                case EOF:
                    save = FALSE;
                    currentToken = ENDFILE;
                    break;
                case '(':
                    currentToken = LPAREN;
                    break;
                case ')':
                    currentToken = RPAREN;
                    break;
                case '*':
                    currentToken = TIMES;
                    break;
                case '+':
                    currentToken = PLUS;
                    break;
                case ',':
                    currentToken = COMMA;
                    break;
                case '-':
                    currentToken = MINUS;
                    break;
                case ';':
                    currentToken = SEMI;
                    break;
                case '=':
                    currentToken = EQ;
                    break;
                case '[':
                    currentToken = LBRAC;
                    break;
                case ']':
                    currentToken = RBRAC;
                    break;
                default:
                    currentToken = ERROR;
                    break;
                }
            }
            break;

        case INNUM: /* inside an integer */
            if (!isdigit(c)) {
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = NUM;
            }
            break;

        case INSTRING: /* inside a character constant */
            if (lineno > stringline) {
                printf("第%d行字符常数缺少右界符，程序终止\n", stringline);
                exit(1);
            }
            if (c == '\'') {
                /* a quote directly after the closing quote is an error */
                int next = getNextChar();
                state = DONE;
                currentToken = (next == '\'') ? ERROR : STRING;
                ungetNextChar();
            }
            break;

        case INID: /* inside an identifier or reserved word */
            if (!isdigit(c) && !isalpha(c)) {
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = ID;
            }
            break;

        case INASSIGN: /* after ':' */
            state = DONE;
            if (c == '=') {
                currentToken = ASSIGN;
            } else {
                ungetNextChar();
                save = FALSE;
                currentToken = COLON;
            }
            break;

        case INCOMMENT: /* inside a comment */
            save = FALSE;
            if (c == EOF) {
                state = DONE;
                currentToken = ENDFILE;
            } else if (c == '*') {
                int next = getNextChar();
                if (next == '/') {
                    if (commentline != lineno) {
                        printf("注释左右边跨行，程序终止\n");
                        exit(1);
                    }
                    state = START;
                } else {
                    ungetNextChar();
                }
            }
            break;

        case INOMIT: /* after '.' */
            state = DONE;
            if (c == '.') {
                currentToken = OMIT;
            } else {
                ungetNextChar();
                save = FALSE; /* the look-ahead is not part of this token */
                currentToken = DOT;
            }
            break;

        case INLT: /* after '<' */
            state = DONE;
            if (c == '=') {
                currentToken = LEQT;
            } else if (c == '>') {
                currentToken = NEQ;
            } else {
                ungetNextChar();
                save = FALSE; /* the look-ahead is not part of this token */
                currentToken = LT;
            }
            break;

        case INGT: /* after '>' */
            state = DONE;
            if (c == '=') {
                currentToken = GEQT;
            } else {
                ungetNextChar();
                save = FALSE; /* the look-ahead is not part of this token */
                currentToken = GT;
            }
            break;

        case DONE:
        default: /* should never happen */
            state = DONE;
            currentToken = ERROR;
            break;
        }

        /* Strictly less than MAXTOKENLEN keeps one slot free for the NUL. */
        if (save && tokenStringIndex < MAXTOKENLEN)
            tokenString[tokenStringIndex++] = (char)c;

        if (state == DONE)
            tokenString[tokenStringIndex] = '\0';
    }

    return currentToken;
}

/*
 * Tokenise "./test4" and print the resulting pairs.  Stops with an error
 * message at the first illegal character.
 */
int main(void)
{
    const char *path = "./test4";

    source = fopen(path, "r");
    if (source == NULL) {
        fprintf(stderr, "无法打开源文件 %s\n", path);
        return 1;
    }

    while (getToken() != ENDFILE) {
        if (currentToken == ERROR) {
            printf("第%d行第%d列非法字符，程序终止\n", lineno, linepos);
            exit(1);
        }
        printToken1(currentToken, tokenString);
    }

    /* finish a partially filled last output row */
    if (expno != 0)
        printf("\n");

    fclose(source);
    return 0;
}