将败者树运用到文档匹配打分程序中

来源:互联网 发布:哈尔滨雨人软件 编辑:程序博客网 时间:2024/05/16 14:10
 

length={4}, pfRtv={ 1.000000}  length={9}, pfRtv={ 2.000000}  length={16}, pfRtv={ 3.000000}  length={25}, pfRtv={ 4.000000} 
doc0  start
10  2.000000 11  3.000000 12  6.000000 13  11.000000
doc1  start
11  3.000000 12  4.000000 13  7.000000 14  12.000000 15  19.000000 16  8.000000 17  19.000000 18  12.000000 19  7.000000
doc2  start
12  4.000000 13  5.000000 14  8.000000 15  13.000000 16  0.000000 17  9.000000 18  0.000000 19  13.000000 20  8.000000 21  5.000000 22  4.000000 23  5.000000 24  8.000000 25  13.000000 26  0.000000 27  9.000000
doc3  start
13  5.000000 14  6.000000 15  9.000000 16  14.000000 17  1.000000 18  10.000000 19  1.000000 20  14.000000 21  9.000000 22  6.000000 23  5.000000 24  6.000000 25  9.000000 26  14.000000 27  1.000000 28  10.000000 29  1.000000 30  14.000000 31  9.000000 32  6.000000 33  5.000000 34  6.000000 35  9.000000 36  14.000000 37  1.000000

result 
13  0.000916

 

¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥¥

#include  <stdio.h>
#include <stdlib.h>
#include  <string.h>
/* Number of input posting lists ("ways") fed into the loser tree; it sizes
 * the per-list arrays and bounds the per-list loops in this program. */
#define  PATH  4


/*
 * One posting-list entry: a document id and the value used to score it.
 *
 * iDocID is spelled `unsigned short` rather than `ushort`: `ushort` is a
 * non-standard typedef that only exists when <sys/types.h> exposes its
 * BSD/misc names, so it is missing under strict -std= modes. The underlying
 * type is identical.
 */
typedef struct _SSDOC_ITEM_TMP
{
        unsigned short  iDocID;   /* document id; 65535 marks the end of a list */
        float           tf;       /* per-document value that feeds the score */
}   SSDOC_ITEM_TMP;

#define  RTV_TYPE  float
/* Largest value of an unsigned 16-bit document id; used as the list sentinel. */
#define  MAX_USHORT   65535
#define  PAGE_SEARCH_ARG  int
/* Parenthesized so the macro expands as one value inside a larger expression:
 * without the parentheses `x / BUCKET_VECTOR_BLOCK` would parse as
 * `(x / 64) * 1024`. */
#define  BUCKET_VECTOR_BLOCK   (64 * 1024)

/*
 * Per-term record handed to the scorer, one per input posting list.
 */
typedef struct _TERM_DIC_HASH_VALUE_INTER
{
        float *pfRtv;    /* points at the term's weight; the scorer multiplies by *pfRtv */
        int    length;   /* entry count of the term's posting list, as filled in by main() */
} TERM_DIC_HASH_VALUE_INTER;


typedef  struct  wrap_data
{
        int  offset;
        int  path;
        SSDOC_ITEM_TMP  *data;
}wrap_data;

/*
 * Return the number of leaf slots a loser tree needs for `path` input lists:
 * the smallest power of two that is >= path, never less than 4.
 *
 * A fixed upper bound of 32 would hand any caller with more than 32 lists a
 * tree too small to hold its leaves, so the size keeps doubling instead.
 * Inputs up to 32 yield 4, 8, 16 or 32. Doubling stops at 2^30 so the
 * multiplication cannot overflow int.
 */
int choosevec(int path)
{
        int size = 4;

        while (size < path && size < (1 << 30))
        {
                size *= 2;
        }
        return size;
}

/* Loser-tree slots shared by up() and GetSameDocWrap(). GetSameDocWrap()
 * allocates vecsize slots and stores the list cursors in the upper half
 * (indices vecsize/2 and above); up() walks from a leaf towards index 0 by
 * repeatedly halving the index, and index 0 receives the first overall winner. */
wrap_data **vec;
/* Number of slots in vec. */
int  vecsize;

wrap_data  *  up ( int num )
{
        int  i,j,k;
        wrap_data  *first,*second;
        i=num;
        second=vec[i];
        while(i)
        {
                j=i/2;
                first=vec[j];

                if(!first)
                {
                        vec[j]=second;
                        if (!j)
                        {
                                return second;
                        }
                        else
                        {
                                return NULL;
                        }
                }
                if ( first->path==second->path)
                {
                        i=j;
                }
                else if ( (*( second->data + second->offset )).iDocID>  (*( first->data + first->offset )).iDocID)
                {
                        vec[j]=second;
                        second=first;
                        i=j;

                }
                else
                {
                        i=j;
                }
        }
        return  second;
}


void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
                PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
        off_t   lloffset[nDocArray];
        memset(lloffset, 0, sizeof(off_t) * nDocArray);
        wrap_data  *result;
        int i=0,j=0,k=0;
        wrap_data a[PATH]={0};
        int  last_docid=MAX_USHORT;
        int  count_docid=0;
        wrap_data   sed[PATH]={0};


        vecsize=2*    choosevec(PATH);
        vec=(wrap_data **)calloc( vecsize ,sizeof (wrap_data*));

        for(i=0;i<PATH;i++)
        {
                a[i].data=pDocArray[i];
                a[i].offset=0;
                a[i].path=i;
        }
        k=vecsize/2;
        for(i=0;i<PATH;i++)
        {
                vec[k+i]=&a[i];
        }
        for(i=0;i<PATH;i++)
        {
                result=up(i+k);
        }
        while(result)
        {
                if(   (*(result->data+result->offset)).iDocID  ==  MAX_USHORT )
                {
                        break;
                }

                if (  (*(result->data+result->offset)).iDocID ==  last_docid  )
                {
                        count_docid++;
                        sed[count_docid-1]=*result;
                        if (count_docid ==  PATH)
                        {
                                aSameDoc[nSameCount].iDocID=last_docid;
                                aSameDoc[nSameCount].tf=0;
                                for(i=0;i<PATH;i++)
                                {
                                        aSameDoc[nSameCount].tf+=
                                                (*(sed[i].data +  sed[i].offset)).tf/ MAX_USHORT * (*(pDocList[sed[i].path  ].pfRtv));
                                }

                                last_docid=MAX_USHORT;
                                nSameCount++;

                        }
                }
                else
                {
                        last_docid=(*(result->data+result->offset)).iDocID ;  
                        count_docid=1;
                        sed[count_docid-1]=*result;
                }

                //add  a  sed  for  the result  by  chenbing   2011.11.14
                result->offset++; 
                result=up(result->path+k);
        }
        printf("\n");

        /*
           {
           ushort  doc_count=pDocList[j]->iTotalDocCnt ;
           if (doc_count >   0.1*MAX_DOC_COUNT)
           {


           uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
           ushort  buc_pos =iDocID/256;
           ushort  buc_mod =iDocID%256;

           lloffset[j] += pTmpDocArray[j]->nOccurs;
           }
           else
           {
           for(int  ii=0;ii<pos ;i++)
           {
           lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
           }

           }

           }
         */

        /*
           while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
           {
           lloffset[j] += pTmpDocArray[j]->nOccurs;
           pTmpDocArray[j]++;
           }

           if (!pTmpDocArray[j]->nOccurs)
           return;
         */
}


int  main()
{
        int  nSameCount=0;
        SSDOC_ITEM_TMP  aSameDoc[MAX_USHORT ] ={0};
        int  nDocArray =PATH;
        SSDOC_ITEM_TMP  * pDocArray[PATH];
        PAGE_SEARCH_ARG  pArg[PATH];

        TERM_DIC_HASH_VALUE_INTER  pDocList[PATH]={0};

        for(int  i=0;i<nDocArray ;i++)
        {
                pDocList[i].pfRtv=(float *)calloc (1,sizeof (float ));
                *(pDocList[i].pfRtv)=i+1;
                pDocList[i].length=(i+2)*(i+2);
                printf("length={%d}, pfRtv={ %f}  ",pDocList[i].length, *(pDocList[i].pfRtv)  );
        }
        printf("\n");
        for(int  i=0;i<nDocArray;i++)
        {
                printf("doc%d  start \n",i);
                pDocArray[i]=(SSDOC_ITEM_TMP  *)calloc ( pDocList[i].length +1,sizeof ( SSDOC_ITEM_TMP  ));
                for(int  j=0;j< pDocList[i].length;j++)
                {
                        pDocArray[i][j].iDocID=i+j+10;
                        pDocArray[i][j].tf=(j*j+2+i)%20;
                        printf("%d  %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
                }
                pDocArray[i][ pDocList[i].length ].iDocID=MAX_USHORT;
                printf("\n");
        }
        GetSameDocWrap ( 0 , nSameCount ,aSameDoc ,nDocArray   ,  pDocArray , pArg , pDocList ,0);

        printf("result  \n");
        for(int  i=0;i<nSameCount;i++)
        {
                printf("%d  %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
        }


}

 

原创粉丝点击