将败者树运用到文档匹配打分程序中

来源：互联网发布：哈尔滨雨人软件编辑：程序博客网时间：2024/05/16 14:10

length={4}, pfRtv={ 1.000000} length={9}, pfRtv={ 2.000000} length={16}, pfRtv={ 3.000000} length={25}, pfRtv={ 4.000000}
doc0 start
10 2.000000 11 3.000000 12 6.000000 13 11.000000
doc1 start
11 3.000000 12 4.000000 13 7.000000 14 12.000000 15 19.000000 16 8.000000 17 19.000000 18 12.000000 19 7.000000
doc2 start
12 4.000000 13 5.000000 14 8.000000 15 13.000000 16 0.000000 17 9.000000 18 0.000000 19 13.000000 20 8.000000 21 5.000000 22 4.000000 23 5.000000 24 8.000000 25 13.000000 26 0.000000 27 9.000000
doc3 start
13 5.000000 14 6.000000 15 9.000000 16 14.000000 17 1.000000 18 10.000000 19 1.000000 20 14.000000 21 9.000000 22 6.000000 23 5.000000 24 6.000000 25 9.000000 26 14.000000 27 1.000000 28 10.000000 29 1.000000 30 14.000000 31 9.000000 32 6.000000 33 5.000000 34 6.000000 35 9.000000 36 14.000000 37 1.000000

result
13 0.000916

￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥￥

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define PATH 4

/*
 * One entry of a per-term document list.
 * Lists are terminated by an entry whose iDocID equals MAX_USHORT.
 */
typedef struct _SSDOC_ITEM_TMP
{
        /* Document identifier; the merge code orders and matches entries by this value. */
        ushort      iDocID;
        /* Per-document value that is weighted and summed into the final score
           (presumably a term frequency, judging by the name -- confirm). */
        float      tf;
}   SSDOC_ITEM_TMP;

/* Scalar type alias (presumably for relevance values); not referenced by the active code shown here. */
#define RTV_TYPE float
/* Largest 16-bit unsigned value. Used as the end-of-list sentinel for iDocID,
   as the divisor when scaling tf, and as the capacity of the result array in main(). */
#define MAX_USHORT 65535
/* Element type of the pArg array handed to GetSameDocWrap(). */
#define PAGE_SEARCH_ARG int
/* 64 * 1024. The expansion is parenthesized so that it behaves as a single value
   inside larger expressions such as `x / BUCKET_VECTOR_BLOCK`. */
#define BUCKET_VECTOR_BLOCK (64 * 1024)

/*
 * Per-term metadata that accompanies one document list.
 */
typedef struct   _TERM_DIC_HASH_VALUE_INTER
{
        /* Points to at least one float; GetSameDocWrap() reads the first one as the
           multiplier applied to this list's tf contribution. */
        float * pfRtv;
        /* Number of real entries in the matching document list (main() allocates
           length + 1 entries so that a sentinel can follow them). */
        int length;
}TERM_DIC_HASH_VALUE_INTER;

/*
 * Cursor over one document list while the lists are being merged.
 */
typedef struct wrap_data
{
        /* Index of the current entry inside data. */
        int offset;
        /* Index of the list this cursor belongs to; its leaf slot in vec is path + vecsize/2. */
        int path;
        /* First entry of the document list being walked. */
        SSDOC_ITEM_TMP *data;
}wrap_data;

/*
 * Pick the number of leaf slots for a merge of `path` lists.
 *
 * Returns the smallest of 4, 8, 16, 32 that is >= path. Any path above 16
 * (including values above 32) yields 32.
 */
int choosevec(int path)
{
        int capacity = 4;

        /* Double until the capacity covers path, but never go past 32. */
        while (capacity < path && capacity < 32)
        {
                capacity *= 2;
        }
        return capacity;
}

/* Tree storage shared by up() and GetSameDocWrap(): an array of cursor pointers
   that GetSameDocWrap() allocates with calloc before merging. */
wrap_data **vec;
/* Number of slots in vec; GetSameDocWrap() sets it to 2 * choosevec(PATH). */
int vecsize;

/*
 * Replay the tournament from slot `num` of vec towards slot 0.
 *
 * The cursor found at vec[num] is carried upward. At each parent slot:
 *   - an empty slot receives the carried cursor and the walk stops; the cursor is
 *     returned only when that slot is slot 0, otherwise NULL is returned;
 *   - a slot holding a cursor of the same list (same path) is passed through;
 *   - otherwise the cursor whose current iDocID is larger stays in the slot and
 *     the other one continues upward (on a tie the carried cursor continues).
 *
 * Returns the cursor that reached the top, i.e. the one still being carried
 * when the walk ends, or NULL as described above.
 */
wrap_data * up ( int num )
{
        int node = num;
        wrap_data *carried = vec[node];

        while (node != 0)
        {
                int parent = node / 2;
                wrap_data *stored = vec[parent];

                if (stored == NULL)
                {
                        /* Empty slot: park the carried cursor here and stop. */
                        vec[parent] = carried;
                        return (parent == 0) ? carried : NULL;
                }

                if (stored->path != carried->path
                    && carried->data[carried->offset].iDocID > stored->data[stored->offset].iDocID)
                {
                        /* The carried cursor loses: it stays, the stored one moves on. */
                        vec[parent] = carried;
                        carried = stored;
                }

                node = parent;
        }
        return carried;
}

void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
                PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
        off_t   lloffset[nDocArray];
        memset(lloffset, 0, sizeof(off_t) * nDocArray);
        wrap_data *result;
        int i=0,j=0,k=0;
        wrap_data a[PATH]={0};
        int last_docid=MAX_USHORT;
        int count_docid=0;
        wrap_data   sed[PATH]={0};

vecsize=2* choosevec(PATH);
vec=(wrap_data **)calloc( vecsize ,sizeof (wrap_data*));

        for(i=0;i<PATH;i++)
        {
                a[i].data=pDocArray[i];
                a[i].offset=0;
                a[i].path=i;
        }
        k=vecsize/2;
        for(i=0;i<PATH;i++)
        {
                vec[k+i]=&a[i];
        }
        for(i=0;i<PATH;i++)
        {
                result=up(i+k);
        }
        while(result)
        {
                if(   (*(result->data+result->offset)).iDocID == MAX_USHORT )
                {
                        break;
                }

                if ( (*(result->data+result->offset)).iDocID == last_docid )
                {
                        count_docid++;
                        sed[count_docid-1]=*result;
                        if (count_docid == PATH)
                        {
                                aSameDoc[nSameCount].iDocID=last_docid;
                                aSameDoc[nSameCount].tf=0;
                                for(i=0;i<PATH;i++)
                                {
                                        aSameDoc[nSameCount].tf+=
                                                (*(sed[i].data + sed[i].offset)).tf/ MAX_USHORT * (*(pDocList[sed[i].path ].pfRtv));
                                }

last_docid=MAX_USHORT;
nSameCount++;

                        }
                }
                else
                {
                        last_docid=(*(result->data+result->offset)).iDocID ;
                        count_docid=1;
                        sed[count_docid-1]=*result;
                }

                //add a sed for the result by chenbing   2011.11.14
                result->offset++;
                result=up(result->path+k);
        }
        printf("\n");

        /*
           {
           ushort doc_count=pDocList[j]->iTotalDocCnt ;
           if (doc_count >   0.1*MAX_DOC_COUNT)
           {

           uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
           ushort buc_pos =iDocID/256;
           ushort buc_mod =iDocID%256;

           lloffset[j] += pTmpDocArray[j]->nOccurs;
           }
           else
           {
           for(int ii=0;ii<pos ;i++)
           {
           lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
           }

}

}
*/

        /*
           while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
           {
           lloffset[j] += pTmpDocArray[j]->nOccurs;
           pTmpDocArray[j]++;
           }

           if (!pTmpDocArray[j]->nOccurs)
           return;
         */
}

int main()
{
        int nSameCount=0;
        SSDOC_ITEM_TMP aSameDoc[MAX_USHORT ] ={0};
        int nDocArray =PATH;
        SSDOC_ITEM_TMP * pDocArray[PATH];
        PAGE_SEARCH_ARG pArg[PATH];

TERM_DIC_HASH_VALUE_INTER pDocList[PATH]={0};

        for(int i=0;i<nDocArray ;i++)
        {
                pDocList[i].pfRtv=(float *)calloc (1,sizeof (float ));
                *(pDocList[i].pfRtv)=i+1;
                pDocList[i].length=(i+2)*(i+2);
                printf("length={%d}, pfRtv={ %f} ",pDocList[i].length, *(pDocList[i].pfRtv) );
        }
        printf("\n");
        for(int i=0;i<nDocArray;i++)
        {
                printf("doc%d start \n",i);
                pDocArray[i]=(SSDOC_ITEM_TMP *)calloc ( pDocList[i].length +1,sizeof ( SSDOC_ITEM_TMP ));
                for(int j=0;j< pDocList[i].length;j++)
                {
                        pDocArray[i][j].iDocID=i+j+10;
                        pDocArray[i][j].tf=(j*j+2+i)%20;
                        printf("%d %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
                }
                pDocArray[i][ pDocList[i].length ].iDocID=MAX_USHORT;
                printf("\n");
        }
        GetSameDocWrap ( 0 , nSameCount ,aSameDoc ,nDocArray   , pDocArray , pArg , pDocList ,0);

        printf("result \n");
        for(int i=0;i<nSameCount;i++)
        {
                printf("%d %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
        }

}