  1. 字符集定义

    1. <字符集> → <字母>│<数字>│<单界符>

    2. <字母> → A│B│…│Z│a│b│…│z

    3. <数字> → 0│1│2│…│9

    4. <单界符> → +│-│*│/│=│<│>│(│)│[│]│:│.│;│,│'

  2. 单词集定义

5. <单词集> → <保留字>│<双界符>│<标识符>│<常数>│<单界符>


7. <双界符> → <>│<=│>=│:=│/*│*/│..

8. <标识符> → <字母>│<标识符><数字>│<标识符><字母>

9. <常数> → <整数>│<布尔常数>│<字符常数>

10. <整数> → <数字>│<整数><数字>


12. <字符常数> → '除{'}外的任意字符串'



and array begin bool call

case char constant dim do

else end false for if

input integer not of or

output procedure program read real

repeat set stop then to

true until var while write

abc 123 'EFG' ( ) * + , - . .. /

: := ; < <= <> = > >= [ ]


( 1 , - ) ( 2 , - ) (3 , - ) ( 4 , - ) ( 5 , - )

( 6 , - ) ( 7 , - ) (8 , - ) ( 9 , - ) (10 , - )

(11 , - ) (12 , -) (13 , - ) (14 , - ) (15 , - )

(16 , - ) (17 , -) (18 , - ) (19 , - ) (20 , - )

(21 , - ) (22 , -) (23 , - ) (24 , - ) (25 , - )

(26 , - ) (27 , -) (28 , - ) (29 , - ) (30 , - )

(31 , - ) (32 , -) (33 , - ) (34 , - ) (35 , - )

(36 , 1 ) (37 , 2) (38 , 3 ) (39 , - ) (40 , - )

(41 , - ) (43 , -) (44 , - ) (45 , - ) (46 , - )

(47 , - ) (48 , -) (50 , - ) (51 , - ) (52 , - )

(53 , - ) (54 , -) (55 , - ) (56 , - ) (57 , - )

(58 , - ) (59 , -) (60 , - )


program example2;

var A,B,C:integer;


begin /* this is an example */





(23 , - ) (36 , 1) (52 , - ) (33 , - ) (36 , 2 )

(44 , - ) (36 , 3) (44 , - ) (36 , 4 ) (50 , - )

(17 , - ) (52 , -) (36 , 5 ) (44 , - ) (36 , 6 )

(50 , - ) ( 4 , -) (52 , - ) ( 3 , - ) (36 , 2 )

(51 , - ) (36 , 3) (41 , - ) (36 , 4 ) (43 , - )

(37 , 7 ) (52 , -) (36 , 5 ) (51 , - ) (38 , 8 )

(12 , - ) (46 , - )











#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define FALSE 0
#define TRUE 1
#define MAXTOKENLEN 40   /* maximum number of characters kept for one token */
#define MAXRESERVED 35   /* number of reserved words */
#define MAXSPECIAL 23    /* capacity of the special-symbol table */
#define MAXDEFINED 100   /* capacity of the identifier/constant table */
#define BUFLEN 256       /* size of the line buffer */

/* States of the scanner's finite automaton. */
typedef enum
{
    /* initial state */
    START,
    /* accepting state */
    DONE,
    /* intermediate states */
    INNUM, INSTRING, INID, INASSIGN, INCOMMENT, INOMIT, INLT, INGT
} StateType;

/* Token categories produced by getToken(). */
typedef enum
{
    /* end of input */
    ENDFILE,
    /* illegal character */
    ERROR,
    /* reserved word or identifier */
    ID,
    /* integer constant */
    NUM,
    /* character (string) constant */
    STRING,
    /* special symbols */
    LPAREN, RPAREN, TIMES, PLUS, COMMA, MINUS, DOT, OMIT, OVER,
    COLON, ASSIGN, SEMI, LT, LEQT, NEQ, EQ, GT, GEQT, LBRAC, RBRAC
} TokenType;

TokenType currentToken;              /* category of the most recent token */
char tokenString[MAXTOKENLEN + 1];   /* text of the most recent token */

/* Reserved words and their category codes (1..35). */
static struct
{
    char* word;
    int no;
} reservedWords[MAXRESERVED] =
{
    {"and",1},{"array",2},{"begin",3},{"bool",4},{"call",5},{"case",6},{"char",7},{"constant",8},{"dim",9},
    {"do",10},{"else",11},{"end",12},{"false",13},{"for",14},{"if",15},{"input",16},{"integer",17},{"not",18},{"of",19},
    {"or",20},{"output",21},{"procedure",22},{"program",23},{"read",24},{"real",25},{"repeat",26},{"set",27},
    {"stop",28},{"then",29},{"to",30},{"true",31},{"until",32},{"var",33},{"while",34},{"write",35}
};

/* Category codes of constants and special symbols. */
static struct
{
    TokenType tok;
    int no;
} tokens[MAXSPECIAL] =
{
    {NUM,37},{STRING,38},{LPAREN,39},{RPAREN,40},{TIMES,41},{PLUS,43},{COMMA,44},
    {MINUS,45},{DOT,46},{OMIT,47},{OVER,48},{COLON,50},{ASSIGN,51},{SEMI,52},{LT,53},
    {LEQT,54},{NEQ,55},{EQ,56},{GT,57},{GEQT,58},{LBRAC,59},{RBRAC,60}
};

/* Table of identifiers and constants seen so far, in order of first use. */
static struct
{
    char* var;
    int no;
} defined[MAXDEFINED];

static FILE* source;              /* source file being scanned */
static char lineBuf[BUFLEN];      /* current line */
static int linepos = 0;           /* read position inside lineBuf */
static int bufsize = 0;           /* number of characters in lineBuf */
static int EOF_flag = FALSE;      /* set once fgets reports end of file */
static int lineno;                /* number of lines read so far */
static int definedno;             /* number of entries used in defined[] */
static int expno;                 /* pairs already printed on the current output line */

/*
 * Return the next character of the source, refilling the line buffer when it
 * is exhausted. Returns EOF (and sets EOF_flag) when no more input exists.
 * The character is returned as an unsigned char value so that bytes >= 0x80
 * are never negative: that keeps the <ctype.h> calls well defined and stops
 * a 0xFF byte from being mistaken for EOF.
 */
static int getNextChar(void)
{
    if (linepos < bufsize)
        return (unsigned char) lineBuf[linepos++];

    lineno++;
    if (fgets(lineBuf, sizeof lineBuf, source))
    {
        bufsize = (int) strlen(lineBuf);
        linepos = 0;
        return (unsigned char) lineBuf[linepos++];
    }
    EOF_flag = TRUE;
    return EOF;
}

/* Push the most recently read character back (no-op once EOF was reached). */
void ungetNextChar(void)
{
    if (!EOF_flag)
        linepos--;
}

/*
 * Look s up in the reserved-word table.
 * Returns the reserved word's code, or 36 (identifier) when s is not reserved.
 * tk is unused; it is kept for interface compatibility.
 */
int idLookup(TokenType tk, char * s)
{
    int i;
    (void) tk;
    for (i = 0; i < MAXRESERVED; i++)
    {
        if (strcmp(reservedWords[i].word, s) == 0)
            return reservedWords[i].no;
    }
    return 36;
}

/*
 * Return the 1-based index of s in the identifier/constant table, inserting a
 * private copy of s when it is not present yet.
 * Terminates the program if the table is full or memory is exhausted.
 */
int definedLookup(char* s)
{
    int i;
    char *copy;

    for (i = 0; i < definedno; i++)
        if (strcmp(s, defined[i].var) == 0)   /* already recorded */
            return defined[i].no;

    if (definedno >= MAXDEFINED)
    {
        fprintf(stderr, "too many identifiers/constants (limit %d)\n", MAXDEFINED);
        exit(1);
    }

    /* +1 for the terminating NUL that strcpy writes */
    copy = malloc(strlen(s) + 1);
    if (copy == NULL)
    {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    strcpy(copy, s);
    defined[definedno].var = copy;
    defined[definedno].no = definedno + 1;
    return ++definedno;
}

/* Return the category code of a constant/special-symbol token, or 0 if unknown. */
int specialLookup(TokenType tk)
{
    int i;
    for (i = 0; i < MAXSPECIAL; i++)
        if (tk == tokens[i].tok)
            return tokens[i].no;
    return 0;
}

/* Map a token to its category code (linear search in the matching table). */
int tokenLookup (TokenType tk, char * s)
{
    if (tk == ID)
        return idLookup(tk, s);
    return specialLookup(tk);
}

/*
 * Print one (code , value) pair for the token. Identifiers (36), integers (37)
 * and character constants (38) carry their table index as value; every other
 * token prints '-'. A newline is emitted after every fifth pair.
 */
void printToken1(TokenType tk, char* s)
{
    int code = tokenLookup(tk, s);

    if (code == 36 || code == 37 || code == 38)
        printf("(%d , %d)", code, definedLookup(s));
    else
        printf("(%d , -)", code);

    if (expno == 4)   /* this was the fifth pair on the line */
    {
        printf("\n");
        expno = 0;
    }
    else
        expno++;
}

/*
 * Scan and return the next token. The token category is also stored in
 * currentToken and its text in tokenString (truncated to MAXTOKENLEN chars).
 * Terminates the program on an unterminated character constant or on a
 * comment whose delimiters are on different lines.
 */
TokenType getToken(void)
{
    int stringline = 0;        /* line on which a character constant started */
    int commentline = 0;       /* line on which a comment started */
    int tokenStringIndex = 0;  /* next free slot in tokenString */
    StateType state = START;
    int save;                  /* whether the current character belongs to the token text */

    while (state != DONE)
    {
        int c = getNextChar();
        save = TRUE;
        switch (state)
        {
        case START:
            if (isdigit(c))
                state = INNUM;
            else if (c == '\'')
            {
                state = INSTRING;
                stringline = lineno;
            }
            else if (isalpha(c))
                state = INID;
            else if (c == ':')
                state = INASSIGN;
            else if ((c == ' ') || (c == '\t') || (c == '\n'))
                save = FALSE;
            else if (c == '/')
            {
                /* one character of look-ahead distinguishes '/' from '/''*' */
                int a = getNextChar();
                if (a == '*')
                {
                    save = FALSE;
                    state = INCOMMENT;
                    commentline = lineno;
                }
                else
                {
                    state = DONE;
                    currentToken = OVER;
                    ungetNextChar();
                }
            }
            else if (c == '.')
                state = INOMIT;
            else if (c == '<')
                state = INLT;
            else if (c == '>')
                state = INGT;
            else
            {
                /* single-character token: accept immediately */
                state = DONE;
                switch (c)
                {
                case EOF:
                    save = FALSE;
                    currentToken = ENDFILE;
                    break;
                case '(':
                    currentToken = LPAREN;
                    break;
                case ')':
                    currentToken = RPAREN;
                    break;
                case '*':
                    currentToken = TIMES;
                    break;
                case '+':
                    currentToken = PLUS;
                    break;
                case ',':
                    currentToken = COMMA;
                    break;
                case '-':
                    currentToken = MINUS;
                    break;
                case ';':
                    currentToken = SEMI;
                    break;
                case '=':
                    currentToken = EQ;
                    break;
                case '[':
                    currentToken = LBRAC;
                    break;
                case ']':
                    currentToken = RBRAC;
                    break;
                default:
                    currentToken = ERROR;
                    break;
                }
            }
            break;

        case INNUM:   /* inside an integer */
            if (!isdigit(c))
            {
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = NUM;
            }
            break;

        case INSTRING:   /* inside a character constant */
            if (lineno > stringline)
            {
                printf("第%d行字符常数缺少右界符,程序终止\n",stringline);
                exit(1);
            }
            if (c == '\'')
            {
                /* a quote directly after the closing quote is rejected */
                int a = getNextChar();
                state = DONE;
                currentToken = (a == '\'') ? ERROR : STRING;
                ungetNextChar();
            }
            break;

        case INID:   /* inside an identifier or reserved word */
            if (!isdigit(c) && !isalpha(c))
            {
                ungetNextChar();
                save = FALSE;
                state = DONE;
                currentToken = ID;
            }
            break;

        case INASSIGN:   /* after ':' */
            state = DONE;
            if (c == '=')
                currentToken = ASSIGN;
            else
            {
                ungetNextChar();
                save = FALSE;
                currentToken = COLON;
            }
            break;

        case INCOMMENT:   /* inside a comment */
            save = FALSE;
            if (c == EOF)
            {
                state = DONE;
                currentToken = ENDFILE;
            }
            else if (c == '*')
            {
                int a = getNextChar();
                if (a == '/')   /* closing delimiter */
                {
                    if (commentline != lineno)   /* comments may not span lines */
                    {
                        printf("注释左右边跨行,程序终止\n");
                        exit(1);
                    }
                    state = START;
                }
                else
                    ungetNextChar();
            }
            break;

        case INOMIT:   /* after '.' */
            state = DONE;
            if (c == '.')
                currentToken = OMIT;
            else
            {
                ungetNextChar();
                save = FALSE;   /* look-ahead char is not part of this token */
                currentToken = DOT;
            }
            break;

        case INLT:   /* after '<' */
            state = DONE;
            if (c == '=')
                currentToken = LEQT;
            else if (c == '>')
                currentToken = NEQ;
            else
            {
                ungetNextChar();
                save = FALSE;   /* look-ahead char is not part of this token */
                currentToken = LT;
            }
            break;

        case INGT:   /* after '>' */
            state = DONE;
            if (c == '=')
                currentToken = GEQT;
            else
            {
                ungetNextChar();
                save = FALSE;   /* look-ahead char is not part of this token */
                currentToken = GT;
            }
            break;

        case DONE:
        default:   /* should never happen */
            state = DONE;
            currentToken = ERROR;
            break;
        }

        /* keep one slot free for the terminating NUL */
        if ((save) && (tokenStringIndex < MAXTOKENLEN))
            tokenString[tokenStringIndex++] = (char) c;

        if (state == DONE)
        {
            tokenString[tokenStringIndex] = '\0';
            return currentToken;
        }
    }
    return currentToken;   /* not reached: the loop returns once state is DONE */
}

/* Scan ./test4 and print the (code , value) pair of every token. */
int main()
{
    source = fopen("./test4","r");
    if (source == NULL)
    {
        perror("./test4");
        return 1;
    }

    while (getToken() != ENDFILE)
    {
        if (currentToken == ERROR)
        {
            printf("第%d行第%d列非法字符,程序终止\n",lineno,linepos);
            exit(1);
        }
        printToken1(currentToken, tokenString);
    }

    fclose(source);
    return 0;
}
