Tiny C语言编译程序之词法分析Scanner

来源：互联网发布：中国网络信息安全联盟编辑：程序博客网时间：2024/06/15 17:24

Tiny C语言编译程序之词法分析Scanner

约定：

仅允许整数类型，不允许实数类型
标识符仅由大小写英文字母（共52个）组成，按最长匹配原则识别
整数后紧跟非数字，或标识符后紧跟非字母认为是一个新Token开始
由{ }括起来符号串都认为是注释部分，该部分在词法分析时被过滤掉
识别出的Token由两个变量currentToken和tokenString共同表示：currentToken表示Token的类别，其类型是在文件globals.h中定义的枚举类型TokenType；tokenString保存Token在源程序中出现的原始字符串。例如，整数10的currentToken值为NUM，tokenString值为"10"；标识符i的currentToken值为ID，tokenString值为"i"。

画识别符合TINY C语言构词规则的DFA。然后用直接编码的方法构造词法分析器

词法分析器scan.c

/****************************************************/
/* File: scan.c                                     */
/* The scanner implementation for the TINY compiler */
/****************************************************/

#include "globals.h"
#include "util.h"
#include "scan.h"

/* Standard headers for isdigit/isalpha, fgets/fprintf and strlen/strcmp.
 * They are listed explicitly so this file does not rely on another header
 * pulling them in. */
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* states in scanner DFA */
typedef enum
  { START, INASSIGN, INCOMMENT, INNUM, INID, DONE }
  StateType;

/* lexeme of identifier or reserved word */
char tokenString[MAXTOKENLEN + 1];

/* BUFLEN = length of the input buffer for
   source code lines */
#define BUFLEN 256

static char lineBuf[BUFLEN]; /* holds the current line */
static int linepos = 0;      /* current position in lineBuf */
static int bufsize = 0;      /* current size of buffer string */
static int EOF_flag = FALSE; /* corrects ungetNextChar behavior on EOF */

/* getNextChar returns the next character of the source text (blanks
 * included).  When lineBuf is exhausted it first refills it from `source`
 * with fgets; lineno is incremented on every refill attempt, including the
 * one that fails.  When fgets yields nothing, EOF_flag is set and EOF is
 * returned.
 *
 * Characters are returned as unsigned char values: a plain (possibly
 * signed) char would turn bytes >= 0x80 into negative ints, which must not
 * be passed to isdigit/isalpha and could compare equal to EOF.
 */
static int getNextChar(void)
{
  if (linepos < bufsize)
    return (unsigned char) lineBuf[linepos++];

  lineno++;
  if (fgets(lineBuf, BUFLEN - 1, source) == NULL)
  {
    EOF_flag = TRUE;
    return EOF;
  }

  if (EchoSource)
    fprintf(listing, "%4d: %s", lineno, lineBuf);
  bufsize = (int) strlen(lineBuf);
  linepos = 0;
  return (unsigned char) lineBuf[linepos++];
}

/* ungetNextChar backtracks one character in lineBuf.
 * It does nothing once EOF has been seen, because the EOF return did not
 * advance linepos. */
static void ungetNextChar(void)
{
  if (!EOF_flag)
    linepos--;
}

/* lookup table of reserved words */
static const struct
{
  const char *str;
  TokenType tok;
} reservedWords[MAXRESERVED] = {
  {"if", IF}, {"then", THEN}, {"else", ELSE}, {"end", END},
  {"repeat", REPEAT}, {"until", UNTIL}, {"read", READ},
  {"write", WRITE}
};

/* reservedLookup checks whether the identifier s is a reserved word.
 * Uses linear search over reservedWords.
 * Returns the reserved word's token, or ID when s matches no entry. */
static TokenType reservedLookup(const char *s)
{
  int i;

  for (i = 0; i < MAXRESERVED; i++)
  {
    if (strcmp(s, reservedWords[i].str) == 0)
      return reservedWords[i].tok;
  }
  return ID;
}

/****************************************/
/* the primary function of the scanner  */
/****************************************/
/* function getToken returns the
 * next token in source file.
 *
 * The lexeme of the returned token is left in tokenString.  At most
 * MAXTOKENLEN characters are stored; the remainder of a longer lexeme is
 * still consumed from the input but not recorded.  When TraceScan is set,
 * the token is also printed to `listing` prefixed by the current lineno.
 */
TokenType getToken(void)
{
  /* index for storing into tokenString */
  int tokenStringIndex = 0;
  /* holds current token to be returned; every path that reaches DONE
     assigns it, the initial value only guarantees a defined result */
  TokenType currentToken = ERROR;
  /* current state - always begins at START */
  StateType state = START;
  /* flag to indicate save to tokenString */
  int save;

  while (state != DONE)
  {
    int c = getNextChar();

    save = TRUE;
    switch (state)
    {
      case START:
        /* dispatch on the first character: digit, letter, ':',
           space/tab/newline, '{', or an operator / delimiter */
        if (isdigit(c))
          state = INNUM;
        else if (isalpha(c))
          state = INID;
        else if (c == ':')
          state = INASSIGN;
        else if ((c == ' ') || (c == '\t') || (c == '\n'))
          save = FALSE;
        else if (c == '{')
        {
          save = FALSE;
          state = INCOMMENT;
        }
        else
        {
          /* single-character token, end of file, or an illegal character */
          state = DONE;
          switch (c)
          {
            case EOF:
              save = FALSE;
              currentToken = ENDFILE;
              break;
            case '=':
              currentToken = EQ;
              break;
            case '<':
              currentToken = LT;
              break;
            case '+':
              currentToken = PLUS;
              break;
            case '-':
              currentToken = MINUS;
              break;
            case '*':
              currentToken = TIMES;
              break;
            case '/':
              currentToken = OVER;
              break;
            case '(':
              currentToken = LPAREN;
              break;
            case ')':
              currentToken = RPAREN;
              break;
            case ';':
              currentToken = SEMI;
              break;
            default:
              currentToken = ERROR;
              break;
          }
        }
        break;

      case INCOMMENT:
        /* only '}' or EOF (unterminated comment at end of input)
           changes the state; comment text is never saved */
        save = FALSE;
        if (c == '}')
          state = START;
        else if (c == EOF)
        {
          state = DONE;
          currentToken = ENDFILE;
        }
        break;

      case INASSIGN:
        /* '=' completes ":="; anything else is an error */
        state = DONE;
        if (c == '=')
          currentToken = ASSIGN;
        else
        {
          /* backup in the input */
          ungetNextChar();
          save = FALSE;
          currentToken = ERROR;
        }
        break;

      case INNUM:
        if (!isdigit(c))
        {
          /* backup in the input */
          ungetNextChar();
          save = FALSE;
          state = DONE;
          currentToken = NUM;
        }
        break;

      case INID:
        /* a non-letter ends the identifier: push it back and finish */
        if (!isalpha(c))
        {
          /* backup in the input */
          ungetNextChar();
          save = FALSE;
          state = DONE;
          currentToken = ID;
        }
        break;

      case DONE:
      default: /* should never happen */
        fprintf(listing, "Scanner Bug: state= %d\n", state);
        state = DONE;
        currentToken = ERROR;
        break;
    }

    /* Strictly less than MAXTOKENLEN: tokenString has MAXTOKENLEN+1
       elements and the last one used must stay free for the '\0'
       written below. */
    if ((save) && (tokenStringIndex < MAXTOKENLEN))
      tokenString[tokenStringIndex++] = (char) c;

    if (state == DONE)
    {
      tokenString[tokenStringIndex] = '\0';
      if (currentToken == ID)
        currentToken = reservedLookup(tokenString);
    }
  }

  if (TraceScan)
  {
    fprintf(listing, "\t%d: ", lineno);
    printToken(currentToken, tokenString);
  }
  return currentToken;
} /* end getToken */

对于Tiny语言编写的Sample程序源代码如下：

{ Sample program
  in TINY language -
  computes factorial
}
read x; { input an integer }
if 0 < x then { don't compute if x <= 0 }
  fact := 1;
  repeat
    fact := fact * x;
    x := x - 1
  until x = 0;
  write fact  { output factorial of x }
end

词法分析主要为后面的各个阶段提供方法getToken;经过词法分析后每行的关键字、标识符以及数字如下：

TINY COMPILATION: SAMPLE.tny
   1: { Sample program
   2:   in TINY language -
   3:   computes factorial
   4: }
   5: read x; { input an integer }
        5: reserved word: read
        5: ID, name= x
        5: ;
   6: if 0 < x then { don't compute if x <= 0 }
        6: reserved word: if
        6: NUM, val= 0
        6: <
        6: ID, name= x
        6: reserved word: then
   7:   fact := 1;
        7: ID, name= fact
        7: :=
        7: NUM, val= 1
        7: ;
   8:   repeat
        8: reserved word: repeat
   9:     fact := fact * x;
        9: ID, name= fact
        9: :=
        9: ID, name= fact
        9: *
        9: ID, name= x
        9: ;
  10:     x := x - 1
        10: ID, name= x
        10: :=
        10: ID, name= x
        10: -
        10: NUM, val= 1
  11:   until x = 0;
        11: reserved word: until
        11: ID, name= x
        11: =
        11: NUM, val= 0
        11: ;
  12:   write fact  { output factorial of x }
        12: reserved word: write
        12: ID, name= fact
  13: end
        13: reserved word: end
        14: EOF

备注

内容为课堂所学及网上参考，仅供参考.

阅读全文

1 0