CSegGraph class 之自我解析(一)

来源:互联网 发布:sql server2008破解版 编辑:程序博客网 时间:2024/06/06 20:00

此类的主要成员函数如下:

bool GenerateWordNet(char *sSentence,CDictionary &dictCore,bool bOriginalFreq=false);

参数为,传入的句子,核心字典,以及是否使用原始频率。

在其执行过程中,首先调用了函数 bool AtomSegment(char *sSentence);

即对原始句子进行原子切割。

成员变量,

char m_sAtom[MAX_SENTENCE_LEN][WORD_MAXLENGTH],用来记录切割好的词;

int m_nAtomLength[MAX_SENTENCE_LEN],用来记录每个原子词的词长度;

int m_nAtomPOS[MAX_SENTENCE_LEN],记录每个原子的字符类型(即 charType 的返回值,如 CT_NUM、CT_CHINESE 等;在原子切分阶段它保存的并不是真正的词性);

unsigned int m_nAtomCount,保存原子词的总个数;

CDynamicArray m_segGraph,保存分割后二维词图。

 

先看 AtomSegment 函数,

 

bool CSegGraph::AtomSegment(char *sSentence){    unsigned int i=0,j=0,nCurType,nNextType;//i is the pointer of sentence string//j is the pointer of pAtomschar sChar[3];sChar[2]=0;//Set the char endingm_sAtom[j][0]=0;//Set the first word as nullm_nAtomLength[j]=0;//先判别句子是否有起始标志if(strncmp(sSentence,SENTENCE_BEGIN,strlen(SENTENCE_BEGIN))==0){//如果有起始标志strcpy(m_sAtom[j],SENTENCE_BEGIN);//Set the first word as sentence beginingm_nAtomLength[j] = strlen(SENTENCE_BEGIN);m_nAtomPOS[j] = CT_SENTENCE_BEGIN;i += m_nAtomLength[j];//i增一个原子词的长度j += 1;//j增1//对下一个原子进行初始化m_sAtom[j][0] = 0;m_nAtomLength[j] = 0;}while(i<strlen(sSentence)){if(strncmp(sSentence+i,SENTENCE_END,strlen(SENTENCE_END))==0){//如果是结束标志strcpy(m_sAtom[j],SENTENCE_END);//将原子词设为结束标志m_nAtomLength[j] = strlen(SENTENCE_END);m_nAtomPOS[j] = CT_SENTENCE_END;i += m_nAtomLength[j];j += 1;m_sAtom[j][0]=0;m_nAtomLength[j]=0;continue;//直接跳到while判别}sChar[0] = *(sSentence + i);//记录第i个位置的字符sChar[1] = 0;i += 1;/*ASCII值最大是 0111 1111 =127 汉字编码为了避免与ASCII冲突  编码要加0xA0 也即1010 所以汉字最高位就变成了 1 在char中代表负值 */if(sChar[0]<0)//Two byte char,即为汉字{sChar[1] = *(sSentence+i);//Get the char with second bytei += 1;//i increased by 1}strcat(m_sAtom[j],sChar);//记录第j个原子nCurType = charType((unsigned char *)sChar);//记录字符类型//如果第一个字符为'.'并且其后跟的字符为数字if(sChar[0]=='.' && (charType((unsigned char *)sSentence+i)==CT_NUM || (*(sSentence+i)>='0' && *(sSentence+i)<='9')))nCurType = CT_NUM;//Digit after . indicate . 
as a point in the numericm_nAtomPOS[j] = nCurType;//Record its property, just convience for continuous processingif(nCurType==CT_CHINESE || nCurType==CT_INDEX || nCurType==CT_DELIMITER || nCurType==CT_OTHER){//Chinese char, index number,delimiter and other is treated as atomm_nAtomLength[j] = strlen(m_sAtom[j]);//Save its lengthj += 1;//Skip to next atomm_sAtom[j][0] = 0;//init}else {//Number,single char,letternNextType = 255;if(i<strlen(sSentence))nNextType = charType((unsigned char *)(sSentence+i));if(nNextType!=nCurType || i==strlen(sSentence))//Reaching end or next char type is different from current char{m_nAtomLength[j] = strlen(m_sAtom[j]);//Save its lengthj += 1;m_sAtom[j][0] = 0;//init}}}m_nAtomCount = j;//The count of segmentation atomsreturn true;}


 此函数的主要用途是:对传入的句子进行原子切分,并得出每个原子的字符类型(保存在 m_nAtomPOS 中)及长度等基本信息,逻辑较为简单。

 

原创粉丝点击