ICTCLAS分词系统研究（五）--N最短路径

来源：互联网发布：网络歌曲你是否爱过我编辑：程序博客网时间：2024/06/06 00:22

原文地址：http://blog.csdn.net/sinboy/article/details/745498

ICTCLAS和别的分词系统不一样的地方就在于——N最短路径分词算法。所谓N最短路径其实就是最短路径和最大路径的折中，保留前N个最优路径。这样做的目的就是对这两种方法取长补短，既能达到一个比较理想的分词效果，又能保证分词速度。在此处，我们中国人的中庸思想被完美体现：）。

在N－最短路径求解之前，ICTCLAS首先通过二叉分词图表（邻接表，如下图一所示）表示出了每个词组之间的耦合关系，每一个节点都表示分词图表中的一条边，它的行值代表边的起点（前驱），它的列值代表边的终点（后继），这一点务必弄清楚。可以通过图一、图二相结合对照来理解。通过计算词组之间的耦合关系，来最终确定初次的分词路径。我们都知道Dijkstra算法是求源点到某一点的最短路径，也就是最优的那一条，在此处的N－最短路径指的是找出前N条最优的路径（实际上在FreeICTCLAS的源代码当中N是等于1的，即nValueKind==1）。按照Dijkstra的表示方法把二叉分词图表转化成图二的表示形式，就能比较清楚地看出来，求解的过程实际就是求源点0到终点12的最短路径，和纯粹的Dijkstra算法不同的地方是在此处需要记录每个节点的N个前驱，Dijkstra当中记录一个即可。

图一

图二

在求解过程中，源程序通过二维数组m_pParent[i][j]、m_pWeight[m][n]来记录每个节点的N个前驱和每个前驱的权重，而求解最短路径权重时借用了一个队列来实现排序，数据结构如下图三所示：

图三

在源程序中，N最短路径是在CNShortPath类里面实现的。

bool CSegment::BiSegment(char *sSentence, double dSmoothingPara, CDictionary &dictCore, CDictionary &dictBinary, unsigned int nResultCount)

{

......

//调用构造函数,生成一个二维链表,如下图一所示。每个链表节点是一个队列，数据结构如下图二所示

CNShortPath sp(&aBiwordsNet,nResultCount);

//最短路径算法实现

sp.ShortPath();

//输出最短路径

sp.Output(nSegRoute,false,&m_nSegmentCount);

.....

}

对源代码进行解析，以“他说的确实在理”为实例：

//进行N－最短路径的求解，找出每一个节点的前驱计算前驱的权值（从源点到该前驱节点）

// Forward pass of the N-shortest-path search.
//
// Visits every node nCurNode in [1, m_nVertex) in increasing order. For each
// node it gathers all incoming edges, combines each edge weight with the
// already-recorded path weights of the edge's start node, and stores up to
// m_nValueKind distinct best weights in m_pWeight[nCurNode-1][*] together
// with the matching predecessors in m_pParent[nCurNode-1][*].
//
// Row k-1 of m_pWeight / m_pParent belongs to node k; node 0 (the source)
// has no row of its own.
//
// Returns 1 unconditionally.

int CNShortPath::ShortPath()

{

unsigned int nCurNode=1,nPreNode,i,nIndex;

ELEMENT_TYPE eWeight;

PARRAY_CHAIN pEdgeList;

// Walk all nodes in column-first order, starting from node 1.

// m_apCost is an adjacency list (a sparse matrix, see Figure 1 of the article).

// Each element of it represents one edge of the segmentation graph:

// the element's row is the edge's start node, its column is the edge's end node.

for(;nCurNode<m_nVertex;nCurNode++)

{

CQueue queWork;

// Fetch the chain of elements starting at column nCurNode (column-first order).

// The value returned into eWeight is overwritten before it is read; only pEdgeList is used.

eWeight=m_apCost->GetElement(-1,nCurNode,0,&pEdgeList);//Get all the edges

// Visit every element whose column equals nCurNode, i.e. every edge that ends at nCurNode.

while(pEdgeList!=0 && pEdgeList->col==nCurNode)

{

// Start node of this edge.

nPreNode=pEdgeList->row;

// Weight of this edge.

eWeight=pEdgeList->value;//Get the value of edges

// m_nValueKind is the N of "N-shortest paths": how many best path weights are kept per node.

// m_pWeight[k-1][i] holds the i-th best total weight from the source to node k.

// The start node may itself have several recorded paths; each one that exists

// produces one candidate (edge weight + path weight) for the current node.

for(i=0;i<m_nValueKind;i++)

{

if(nPreNode>0)//Push the weight and the pre node information

{

// Slots are filled from index 0 upwards further below, so the first

// INFINITE_VALUE slot means the start node has no more recorded paths.

if(m_pWeight[nPreNode-1][i]==INFINITE_VALUE)

break;

queWork.Push(nPreNode,i,eWeight+m_pWeight[nPreNode-1][i]);

}

else// The edge starts at node 0, the source, which has no parent: one candidate only.

{

queWork.Push(nPreNode,i,eWeight);

break;

}

}//end for

pEdgeList=pEdgeList->next;

}//end while

//Now get the result queue which sort as weight.

//Set the current node information

// Reset every weight slot of the current node before filling them from the queue.

for(i=0;i<m_nValueKind;i++)

{

m_pWeight[nCurNode-1][i]=INFINITE_VALUE;

}

//memset((void *),(int),sizeof(ELEMENT_TYPE)*);

//init the weight

i=0;

// Record the best path weights of the current node, one per slot.

// Values reported by the article for the example sentence "他说的确实在理":

//m_pWeight[0][0]=3.846

//m_pWeight[1][0]=6.025

//m_pWeight[2][0]=10.208

//m_pWeight[3][0]=15.063

//m_pWeight[4][0]=16.190

//m_pWeight[5][0]=16.184

//m_pWeight[6][0]=28.331

//m_pWeight[7][0]=28.331

//m_pWeight[8][0]=28.923

//m_pWeight[9][0]=28.923

//m_pWeight[10][0]=36.416

//m_pWeight[11][0]= 39.889

// Pop candidates until the queue is exhausted (Pop returns -1) or all slots are used.

// NOTE(review): per the comment above, queWork is expected to return candidates in

// ascending weight order; CQueue itself is not shown here - confirm.

while(i<m_nValueKind&&queWork.Pop(&nPreNode,&nIndex,&eWeight)!=-1)

{//Set the current node weight and parent

// First candidate for this slot: it defines the slot's weight.

if(m_pWeight[nCurNode-1][i]==INFINITE_VALUE)

m_pWeight[nCurNode-1][i]=eWeight;

// A strictly larger weight opens the next slot. Because the queue is already

// sorted, a later candidate is not expected to be smaller than the current slot.

// (Article author's remark: the condition might read more naturally if the

// comparison were written the other way round.)

// A candidate whose weight equals the slot's weight takes neither branch and

// is recorded below as an additional parent of the same slot.

else if(m_pWeight[nCurNode-1][i]<eWeight)//Next queue

{

i++;//Go next queue and record next weight

if(i==m_nValueKind)//Get the last position

break;

m_pWeight[nCurNode-1][i]=eWeight;

}

// Parents reported by the article for the example sentence:

//m_pParent[0][0]=(0,0,0)

//m_pParent[1][0]=(1,0,0)

//m_pParent[2][0]=(2,0,0)

//m_pParent[3][0]=(2,0,0)

//m_pParent[4][0]=(3,0,0)

//m_pParent[5][0]=(3,0,0)

//m_pParent[6][0]=(4,0,0)

//m_pParent[7][0]=(4,0,0)

//m_pParent[8][0]=(6,0,0)

//m_pParent[9][0]=(6,0,0)

//m_pParent[10][0]=(9,0,0)

//m_pParent[11][0]=(11,0,0)

// Remember which predecessor node, and which of its slots, produced this weight.

m_pParent[nCurNode-1][i].Push(nPreNode,nIndex);

}

}//end for

return 1;

}

经过对每个节点的前驱求解后，得到前驱的最短路径权值和它的父节点，记录如下图四所示：

图四

然后通过队列（其实更象一个栈）来求出二叉分词路径：

//bBest=true: only get one best result and ignore others

//Added in 2002-1-24

void CNShortPath::GetPaths(unsigned int nNode,unsigned int nIndex,int **nResult,bool bBest)

{

CQueue queResult;

unsigned int nCurNode,nCurIndex,nParentNode,nParentIndex,nResultIndex=0;

if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result

return ;

nResult[m_nResultCount][nResultIndex]=-1;//Init the result

//先把末节点压栈

queResult.Push(nNode,nIndex);

nCurNode=nNode;

nCurIndex=nIndex;

bool bFirstGet;

while(!queResult.IsEmpty())

{

while(nCurNode>0)//

{ //Get its parent and store them in nParentNode,nParentIndex

//根据m_pParent数组中记录的每一个节点的前驱，把相应的前驱也压入栈中，

//当把0节点也压入栈中时，即表示找到一条完整的最短路径，

//详情可参考吕震宇的BLOG：SharpICTCLAS分词系统简介(4)NShortPath-1

if(m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,true)!=-1)

{

nCurNode=nParentNode;

nCurIndex=nParentIndex;

}

if(nCurNode>0)

queResult.Push(nCurNode,nCurIndex);

}

//当到0节点时，也就意味着形成了一条最短路径

if(nCurNode==0)

{ //Get a path and output

nResult[m_nResultCount][nResultIndex++]=nCurNode;//Get the first node

bFirstGet=true;

nParentNode=nCurNode;

//输出该条分词路径，在这里queResult并不实际弹出元素，只是下标位移遍历元素

//遍历元素通过第四个参数bModify来控制是否真正删除栈顶元素

while(queResult.Pop(&nCurNode,&nCurIndex,0,false,bFirstGet)!=-1)

{

nResult[m_nResultCount][nResultIndex++]=nCurNode;

bFirstGet=false;

nParentNode=nCurNode;

}

nResult[m_nResultCount][nResultIndex]=-1;//Set the end

m_nResultCount+=1;//The number of result add by 1

if(m_nResultCount>=MAX_SEGMENT_NUM)//Only need 10 result

return ;

nResultIndex=0;

nResult[m_nResultCount][nResultIndex]=-1;//Init the result

if(bBest)//Return the best result, ignore others

return ;

}

//首先判断栈顶元素是否有下一个前驱，如果没有则删除栈顶元素直到有下一个前驱的元素出现

queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node

while(queResult.IsEmpty()==false&&(m_pParent[nCurNode-1][nCurIndex].IsSingle()||m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)))

{

queResult.Pop(&nCurNode,&nCurIndex,0);//Get rid of it

queResult.Pop(&nCurNode,&nCurIndex,0,false,true);//Read the top node

}

//如果找到了有下一个前驱的节点，则它的前驱压入栈中，重新循环直到把源点也压入

if(queResult.IsEmpty()==false&&m_pParent[nCurNode-1][nCurIndex].IsEmpty(true)==false)

{

m_pParent[nCurNode-1][nCurIndex].Pop(&nParentNode,&nParentIndex,0,false,false);

nCurNode=nParentNode;

nCurIndex=nParentIndex;

if(nCurNode>0)

queResult.Push(nCurNode,nCurIndex);

}

最终得到最短路径（0，1，2，3，6，9，11，12），里面的数值分别对应研究（四）中图四的下标，到此分词的第一大步就结束了，并形成最终结果：始##始/他/说/的/确实/在/理/末##末

如果想详细了解GetPaths（）当中的实现原理，推荐大家看吕震宇的BLOG：

http://www.cnblogs.com/zhenyulu/articles/669795.html