文档合并打分程序

来源：互联网  发布：1688网络批发大市场  编辑：程序博客网  时间：2024/04/29 23:07

length={8}, pfRtv={ 1.000000} length={27}, pfRtv={ 2.000000} length={64}, pfRtv={ 3.000000}
doc0 start
10 2.000000 11 3.000000 12 6.000000 13 11.000000 14 18.000000 15 7.000000 16 18.000000 17 11.000000
doc1 start
11 3.000000 12 4.000000 13 7.000000 14 12.000000 15 19.000000 16 8.000000 17 19.000000 18 12.000000 19 7.000000 20 4.000000 21 3.000000 22 4.000000 23 7.000000 24 12.000000 25 19.000000 26 8.000000 27 19.000000 28 12.000000 29 7.000000 30 4.000000 31 3.000000 32 4.000000 33 7.000000 34 12.000000 35 19.000000 36 8.000000 37 19.000000
doc2 start
12 4.000000 13 5.000000 14 8.000000 15 13.000000 16 0.000000 17 9.000000 18 0.000000 19 13.000000 20 8.000000 21 5.000000 22 4.000000 23 5.000000 24 8.000000 25 13.000000 26 0.000000 27 9.000000 28 0.000000 29 13.000000 30 8.000000 31 5.000000 32 4.000000 33 5.000000 34 8.000000 35 13.000000 36 0.000000 37 9.000000 38 0.000000 39 13.000000 40 8.000000 41 5.000000 42 4.000000 43 5.000000 44 8.000000 45 13.000000 46 0.000000 47 9.000000 48 0.000000 49 13.000000 50 8.000000 51 5.000000 52 4.000000 53 5.000000 54 8.000000 55 13.000000 56 0.000000 57 9.000000 58 0.000000 59 13.000000 60 8.000000 61 5.000000 62 4.000000 63 5.000000 64 8.000000 65 13.000000 66 0.000000 67 9.000000 68 0.000000 69 13.000000 70 8.000000 71 5.000000 72 4.000000 73 5.000000 74 8.000000 75 13.000000
result
12 0.000397
13 0.000610
14 0.001007
15 0.001282
16 0.000519
17 0.001160

$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

/*
 * One entry of a per-term document list.
 *
 * The GetSameDoc* routines in this file intersect arrays of these entries by
 * iDocID and combine their tf values into a single score per document.
 */
typedef struct _SSDOC_ITEM_TMP
{
        /* Document id; the search routines compare entries on this field. */
        ushort      iDocID;
        /* Per-document value; on input it is divided by MAX_USHORT and scaled
         * by the list's weight, on output it holds the accumulated score.
         * Presumably a term frequency, judging by the name -- TODO confirm. */
        float      tf;
}   SSDOC_ITEM_TMP;
/* Type of the per-list weight that scales each list's tf contribution. */
#define RTV_TYPE float
/* Divisor used to normalise a raw tf value before it is weighted. */
#define MAX_USHORT   65535
/* Type of the search-argument array handed to GetSameDocWrap. */
#define PAGE_SEARCH_ARG int
/*
 * Number of entries reserved per bucket (65536).
 * The expansion is fully parenthesised so that the macro behaves as a single
 * value next to any operator, e.g. `x % BUCKET_VECTOR_BLOCK`.
 */
#define BUCKET_VECTOR_BLOCK   (64 * 1024)

/*
 * Per-list metadata that accompanies each document list passed to
 * GetSameDocWrap (pDocList[i] describes pDocArray[i]).
 */
typedef struct   _TERM_DIC_HASH_VALUE_INTER
{
        /* Points to the weight of this list; GetSameDocWrap dereferences it
         * and multiplies the normalised tf of every entry by that value. */
        float * pfRtv;
        /* Number of entries in the corresponding document list. */
        int length;
}TERM_DIC_HASH_VALUE_INTER;
/*
 * Binary search for `docid` in data[left..right] (both bounds inclusive).
 *
 * The entries must be sorted by ascending iDocID for the search to be valid.
 *
 * Parameters:
 *   left, right - first and last index of the range to search
 *   docid       - document id to look for
 *   data        - array of entries; only indices in [left, right] are read
 *
 * Returns the index of an entry whose iDocID equals docid, or -1 when there
 * is none. A NULL array or an empty range (left > right) yields -1 instead of
 * reading outside the array.
 */
int find(int left,int right,ushort docid ,SSDOC_ITEM_TMP *data )
{
        if (data == NULL || left > right)
                return -1;

        /* Narrow the window until at most two candidates remain. */
        while (right - left > 1)
        {
                /* left + half-width cannot overflow, unlike (left + right) / 2 */
                const int mid = left + (right - left) / 2;

                if (docid > data[mid].iDocID)
                        left = mid;
                else
                        right = mid;
        }

        /* Upper candidate is tested first, then the lower one. */
        if (docid == data[right].iDocID)
                return right;
        if (docid == data[left].iDocID)
                return left;
        return -1;
}

/*
 * Intersect two document lists by binary-searching every entry of `left`
 * in `right`.
 *
 * For each id found in both lists one entry is appended at
 * common_doc[nCommon] and nCommon is incremented:
 *   iDocID = the shared id
 *   tf     = left tf + (right tf / MAX_USHORT) * rtv
 *
 * Parameters:
 *   nCommon      - in/out: write position in common_doc; NOT reset here
 *   common_doc   - output array; needs room for up to left_length more
 *                  entries starting at nCommon
 *   left         - list that is iterated, left_length entries
 *   right        - list that is searched, right_length entries, sorted by
 *                  ascending iDocID
 *   rtv          - weight applied to the right list's tf
 */
void GetSameDocBSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{
        /* An empty right list cannot match anything; returning here also
         * guarantees the last valid index computed below is >= 0. */
        if (right_length <= 0)
                return;

        /* find() takes INCLUSIVE bounds, so the upper bound is the last valid
         * index, not the element count. */
        const int last = right_length - 1;

        for (int i = 0; i < left_length; i++, left++)
        {
                /* int, not short: indices above 32767 must survive intact */
                const int pos = find(0, last, left->iDocID, right);
                if (pos < 0)
                        continue;

                common_doc[nCommon].iDocID = left->iDocID;
                common_doc[nCommon].tf = left->tf + (((float)right[pos].tf) / MAX_USHORT) * rtv;
                nCommon++;
        }
}

void GetSameDocMergeSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{

        int left_tmp=0,right_tmp=0;
        while(1)
        {
                if(left_tmp>=left_length)
                {
                        return ;
                }
                if(right_tmp>=right_length)
                {
                        return ;
                }

                if (left->iDocID ==right->iDocID)
                {
                        common_doc[ nCommon].iDocID=left->iDocID ;
                        common_doc[ nCommon ].tf = left->tf + (((float)right->tf) / MAX_USHORT) * rtv;
                        nCommon++;

left_tmp++;
left++;

right_tmp++;
right++;

                }
                else if (left->iDocID < right->iDocID)
                {
                        left_tmp++;
                        left++;
                }
                else
                {
                        right_tmp++;
                        right++;
                }
        }

}

void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
                PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
        off_t   lloffset[nDocArray];
        SSDOC_ITEM_TMP *pTmpDocArray[nDocArray];

memset(lloffset, 0, sizeof(off_t) * nDocArray);
memcpy(pTmpDocArray, pDocArray, nDocArray * sizeof(SSDOC_ITEM_TMP *));

        SSDOC_ITEM_TMP   result_common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
        ushort left=pDocList[0].length;
        memcpy ( result_common_doc , pDocArray[0], left * sizeof ( SSDOC_ITEM_TMP) );
        for(int k=0;k<left ;k++)
        {
                result_common_doc[k].tf = (((float)(pTmpDocArray[0]+k)->tf) / MAX_USHORT) * (*(pDocList[0].pfRtv));
        }

        for( int j=1;j<nDocArray;j++)
        {
                ushort right=pDocList[j].length;
                //              if(left + right > left * log ( right )/log (2) )
                if(0)
                {
                        int nCommon=0;
                        SSDOC_ITEM_TMP   common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
                        GetSameDocMergeSearch(nCommon , common_doc ,
                                        result_common_doc,pDocArray[j], left ,right ,(*(pDocList[j].pfRtv)) );
                        if(0==nCommon)
                        {
                                return ;
                        }
                        else
                        {
                                left=nCommon;
                                memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
                        }
                }
                else
                {
                        int nCommon=0;
                        SSDOC_ITEM_TMP   common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
                        GetSameDocBSearch(nCommon , common_doc ,result_common_doc ,pDocArray[j], left,right ,(*(pDocList[j].pfRtv)));
                        if(0==nCommon)
                        {
                                return ;
                        }
                        else
                        {
                                left=nCommon;
                                memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
                        }
                }
        }
        nSameCount=left;
        for(int kk=0;kk<left;kk++)
        {
                uint iDocID = aSameDoc[kk].iDocID = result_common_doc[kk].iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
                aSameDoc[kk].tf = result_common_doc[kk].tf;
        }

        /*
           {
           ushort doc_count=pDocList[j]->iTotalDocCnt ;
           if (doc_count >   0.1*MAX_DOC_COUNT)
           {

           uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
           ushort buc_pos =iDocID/256;
           ushort buc_mod =iDocID%256;

           lloffset[j] += pTmpDocArray[j]->nOccurs;
           }
           else
           {
           for(int ii=0;ii<pos ;i++)
           {
           lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
           }

}

}
*/

        /*
           while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
           {
           lloffset[j] += pTmpDocArray[j]->nOccurs;
           pTmpDocArray[j]++;
           }

           if (!pTmpDocArray[j]->nOccurs)
           return;
         */
}

int main()
{
#define LENGTH 3

        int nSameCount;
        SSDOC_ITEM_TMP aSameDoc[MAX_USHORT ] ={0};
        int nDocArray =LENGTH;
        SSDOC_ITEM_TMP * pDocArray[LENGTH];
        PAGE_SEARCH_ARG pArg[LENGTH];

TERM_DIC_HASH_VALUE_INTER pDocList[LENGTH]={0};

        for(int i=0;i<nDocArray ;i++)
        {
                pDocList[i].pfRtv=(float *)calloc (1,sizeof (float ));
                *(pDocList[i].pfRtv)=i+1;
                pDocList[i].length=(i+2)*(i+2)*(i+2);
                printf("length={%d}, pfRtv={ %f} ",pDocList[i].length, *(pDocList[i].pfRtv) );
        }
        printf("\n");
        for(int i=0;i<nDocArray;i++)
        {
                printf("doc%d start \n",i);
                pDocArray[i]=(SSDOC_ITEM_TMP *)calloc ( pDocList[i].length ,sizeof ( SSDOC_ITEM_TMP ));
                for(int j=0;j< pDocList[i].length;j++)
                {
                        pDocArray[i][j].iDocID=i+j+10;
                        pDocArray[i][j].tf=(j*j+2+i)%20;
                        printf("%d %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
                }
                printf("\n");
        }
        GetSameDocWrap ( 0 , nSameCount ,aSameDoc ,nDocArray   , pDocArray , pArg , pDocList ,0);
        printf("result \n");
        for(int i=0;i<nSameCount;i++)
        {
                printf("%d %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
        }
}