文档合并打分程序

来源:互联网 发布:1688网络批发大市场 编辑:程序博客网 时间:2024/04/29 23:07

length={8}, pfRtv={ 1.000000}  length={27}, pfRtv={ 2.000000}  length={64}, pfRtv={ 3.000000} 
doc0  start
10  2.000000 11  3.000000 12  6.000000 13  11.000000 14  18.000000 15  7.000000 16  18.000000 17  11.000000
doc1  start
11  3.000000 12  4.000000 13  7.000000 14  12.000000 15  19.000000 16  8.000000 17  19.000000 18  12.000000 19  7.000000 20  4.000000 21  3.000000 22  4.000000 23  7.000000 24  12.000000 25  19.000000 26  8.000000 27  19.000000 28  12.000000 29  7.000000 30  4.000000 31  3.000000 32  4.000000 33  7.000000 34  12.000000 35  19.000000 36  8.000000 37  19.000000
doc2  start
12  4.000000 13  5.000000 14  8.000000 15  13.000000 16  0.000000 17  9.000000 18  0.000000 19  13.000000 20  8.000000 21  5.000000 22  4.000000 23  5.000000 24  8.000000 25  13.000000 26  0.000000 27  9.000000 28  0.000000 29  13.000000 30  8.000000 31  5.000000 32  4.000000 33  5.000000 34  8.000000 35  13.000000 36  0.000000 37  9.000000 38  0.000000 39  13.000000 40  8.000000 41  5.000000 42  4.000000 43  5.000000 44  8.000000 45  13.000000 46  0.000000 47  9.000000 48  0.000000 49  13.000000 50  8.000000 51  5.000000 52  4.000000 53  5.000000 54  8.000000 55  13.000000 56  0.000000 57  9.000000 58  0.000000 59  13.000000 60  8.000000 61  5.000000 62  4.000000 63  5.000000 64  8.000000 65  13.000000 66  0.000000 67  9.000000 68  0.000000 69  13.000000 70  8.000000 71  5.000000 72  4.000000 73  5.000000 74  8.000000 75  13.000000
result 
12  0.000397
13  0.000610
14  0.001007
15  0.001282
16  0.000519
17  0.001160

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

#include  <stdio.h>
#include <stdlib.h>
#include  <string.h>
#include <math.h>

/*
 * One entry of a per-term document list: a document id paired with a
 * score value.
 *
 * iDocID - document id; this is the key compared by find() and by the
 *          GetSameDoc* intersection routines.
 * tf     - score value; the intersection routines divide it by MAX_USHORT,
 *          multiply by the list weight and accumulate the result.
 *          (presumably a term frequency, judging by the name - TODO confirm)
 *
 * NOTE(review): iDocID is a ushort, yet GetSameDocWrap adds
 * BUCKET_VECTOR_BLOCK * sBucketPos to it before storing; that offset cannot
 * be represented in 16 bits, so only the low 16 bits survive.  Confirm
 * whether the field was meant to be wider.
 */
typedef struct _SSDOC_ITEM_TMP
{
        ushort      iDocID;
        float      tf;
}   SSDOC_ITEM_TMP;
/* Type of the per-list weight passed to the GetSameDoc* routines. */
#define  RTV_TYPE  float
/* 65535: divisor used to normalise tf values, and the size of main()'s result array. */
#define  MAX_USHORT   65535
/* Element type of the pArg parameter of GetSameDocWrap. */
#define  PAGE_SEARCH_ARG  int
/*
 * 64 Ki; GetSameDocWrap multiplies this by the bucket position to form a
 * document-id offset (presumably the number of ids per bucket - TODO confirm).
 * The expansion is fully parenthesised so it behaves as a single value next
 * to any operator (e.g. `x / BUCKET_VECTOR_BLOCK`, `x % BUCKET_VECTOR_BLOCK`).
 */
#define  BUCKET_VECTOR_BLOCK   (64 * 1024)

/*
 * Per-list metadata that accompanies each document array handed to
 * GetSameDocWrap.
 *
 * pfRtv  - pointer to the weight of this list; GetSameDocWrap dereferences
 *          it and uses the value as the multiplier for normalised tf scores.
 * length - number of entries in the matching document array (main() allocates
 *          pDocArray[i] with exactly pDocList[i].length elements).
 */
typedef struct   _TERM_DIC_HASH_VALUE_INTER
{
        float * pfRtv;
        int  length;
}TERM_DIC_HASH_VALUE_INTER;
int find(int left,int right,ushort docid ,SSDOC_ITEM_TMP *data )
{
        int  mid =(left +right )/2;
        if (left==mid )
        {  
                if(  docid==data [right].iDocID)
                        return  right;
                else  if(  docid==data [left].iDocID)
                        return  left;
                else
                        return   -1;
        } 
        else if( docid > data [ mid  ].iDocID  )
                return  find  ( mid  ,right ,docid , data);
        else
                return  find (left ,  mid ,  docid , data);

}


void GetSameDocBSearch(  int &nCommon, SSDOC_ITEM_TMP *common_doc,  SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP  *right,
                int  left_length ,int right_length,RTV_TYPE  rtv)
{

        int  i,j;
        for(i=0;i<left_length;i++,left++)
        {
                short  pos=0;
                if ( -1== (pos=find ( 0 ,right_length , left->iDocID ,right )))
                {  
                        continue;

                }
                else
                {  
                        common_doc[  nCommon].iDocID=left->iDocID  ;
                        common_doc[ nCommon ].tf  = left->tf  +  (((float)(right+pos)->tf) / MAX_USHORT) * rtv;
                        nCommon++;

                }
        }
}

void GetSameDocMergeSearch(  int &nCommon, SSDOC_ITEM_TMP *common_doc,  SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP  *right,
                int  left_length ,int right_length,RTV_TYPE rtv)
{

        int  left_tmp=0,right_tmp=0;
        while(1)
        {
                if(left_tmp>=left_length)
                {
                        return ;
                }
                if(right_tmp>=right_length)
                {
                        return ;
                }

                if (left->iDocID ==right->iDocID)
                {
                        common_doc[  nCommon].iDocID=left->iDocID  ;
                        common_doc[ nCommon ].tf  = left->tf  +  (((float)right->tf) / MAX_USHORT) * rtv;
                        nCommon++;

                        left_tmp++;
                        left++;

                        right_tmp++;
                        right++;

                }
                else  if (left->iDocID < right->iDocID)
                {
                        left_tmp++;
                        left++;
                }
                else
                {
                        right_tmp++;
                        right++;
                }
        }

}


void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
                PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
        off_t   lloffset[nDocArray];
        SSDOC_ITEM_TMP *pTmpDocArray[nDocArray];

        memset(lloffset, 0, sizeof(off_t) * nDocArray);
        memcpy(pTmpDocArray, pDocArray, nDocArray * sizeof(SSDOC_ITEM_TMP *));

        SSDOC_ITEM_TMP   result_common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
        ushort left=pDocList[0].length;
        memcpy (  result_common_doc , pDocArray[0], left *  sizeof (  SSDOC_ITEM_TMP) );
        for(int  k=0;k<left ;k++)
        {
                result_common_doc[k].tf = (((float)(pTmpDocArray[0]+k)->tf) / MAX_USHORT) * (*(pDocList[0].pfRtv));
        }

        for( int j=1;j<nDocArray;j++)
        {
                ushort  right=pDocList[j].length;
                //              if(left +  right  >  left  * log ( right )/log (2)  )
                if(0)
                {
                        int  nCommon=0;
                        SSDOC_ITEM_TMP   common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
                        GetSameDocMergeSearch(nCommon , common_doc ,
                                        result_common_doc,pDocArray[j], left ,right ,(*(pDocList[j].pfRtv)) );
                        if(0==nCommon)
                        {
                                return  ;
                        }
                        else
                        {
                                left=nCommon;
                                memcpy ( result_common_doc  , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
                        }
                }
                else
                {
                        int  nCommon=0;
                        SSDOC_ITEM_TMP   common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
                        GetSameDocBSearch(nCommon , common_doc ,result_common_doc ,pDocArray[j], left,right ,(*(pDocList[j].pfRtv)));
                        if(0==nCommon)
                        {
                                return  ;
                        }
                        else
                        {
                                left=nCommon;
                                memcpy ( result_common_doc  , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
                        }
                }
        }
        nSameCount=left;
        for(int  kk=0;kk<left;kk++)
        {
                uint iDocID = aSameDoc[kk].iDocID = result_common_doc[kk].iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
                aSameDoc[kk].tf = result_common_doc[kk].tf;
        }

        /*
           {
           ushort  doc_count=pDocList[j]->iTotalDocCnt ;
           if (doc_count >   0.1*MAX_DOC_COUNT)
           {


           uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
           ushort  buc_pos =iDocID/256;
           ushort  buc_mod =iDocID%256;

           lloffset[j] += pTmpDocArray[j]->nOccurs;
           }
           else
           {
           for(int  ii=0;ii<pos ;i++)
           {
           lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
           }

           }

           }
         */

        /*
           while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
           {
           lloffset[j] += pTmpDocArray[j]->nOccurs;
           pTmpDocArray[j]++;
           }

           if (!pTmpDocArray[j]->nOccurs)
           return;
         */
}


int  main()
{
#define  LENGTH  3

        int  nSameCount;
        SSDOC_ITEM_TMP  aSameDoc[MAX_USHORT ] ={0};
        int  nDocArray =LENGTH;
        SSDOC_ITEM_TMP  * pDocArray[LENGTH];
        PAGE_SEARCH_ARG  pArg[LENGTH];

        TERM_DIC_HASH_VALUE_INTER  pDocList[LENGTH]={0};

        for(int  i=0;i<nDocArray ;i++)
        {
                pDocList[i].pfRtv=(float *)calloc (1,sizeof (float ));
                *(pDocList[i].pfRtv)=i+1;
                pDocList[i].length=(i+2)*(i+2)*(i+2);
                printf("length={%d}, pfRtv={ %f}  ",pDocList[i].length, *(pDocList[i].pfRtv)  );
        }
        printf("\n");
        for(int  i=0;i<nDocArray;i++)
        {
                printf("doc%d  start \n",i);
                pDocArray[i]=(SSDOC_ITEM_TMP  *)calloc ( pDocList[i].length ,sizeof ( SSDOC_ITEM_TMP  ));
                for(int  j=0;j< pDocList[i].length;j++)
                {
                        pDocArray[i][j].iDocID=i+j+10;
                        pDocArray[i][j].tf=(j*j+2+i)%20;
                        printf("%d  %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
                }
                printf("\n");
        }
        GetSameDocWrap ( 0 , nSameCount ,aSameDoc ,nDocArray   ,  pDocArray , pArg , pDocList ,0);
        printf("result  \n");
        for(int  i=0;i<nSameCount;i++)
        {
                printf("%d  %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
        }
}

原创粉丝点击