文档合并打分程序
来源:互联网 发布:1688网络批发大市场 编辑:程序博客网 时间:2024/04/29 23:07
length={8}, pfRtv={ 1.000000} length={27}, pfRtv={ 2.000000} length={64}, pfRtv={ 3.000000}
doc0 start
10 2.000000 11 3.000000 12 6.000000 13 11.000000 14 18.000000 15 7.000000 16 18.000000 17 11.000000
doc1 start
11 3.000000 12 4.000000 13 7.000000 14 12.000000 15 19.000000 16 8.000000 17 19.000000 18 12.000000 19 7.000000 20 4.000000 21 3.000000 22 4.000000 23 7.000000 24 12.000000 25 19.000000 26 8.000000 27 19.000000 28 12.000000 29 7.000000 30 4.000000 31 3.000000 32 4.000000 33 7.000000 34 12.000000 35 19.000000 36 8.000000 37 19.000000
doc2 start
12 4.000000 13 5.000000 14 8.000000 15 13.000000 16 0.000000 17 9.000000 18 0.000000 19 13.000000 20 8.000000 21 5.000000 22 4.000000 23 5.000000 24 8.000000 25 13.000000 26 0.000000 27 9.000000 28 0.000000 29 13.000000 30 8.000000 31 5.000000 32 4.000000 33 5.000000 34 8.000000 35 13.000000 36 0.000000 37 9.000000 38 0.000000 39 13.000000 40 8.000000 41 5.000000 42 4.000000 43 5.000000 44 8.000000 45 13.000000 46 0.000000 47 9.000000 48 0.000000 49 13.000000 50 8.000000 51 5.000000 52 4.000000 53 5.000000 54 8.000000 55 13.000000 56 0.000000 57 9.000000 58 0.000000 59 13.000000 60 8.000000 61 5.000000 62 4.000000 63 5.000000 64 8.000000 65 13.000000 66 0.000000 67 9.000000 68 0.000000 69 13.000000 70 8.000000 71 5.000000 72 4.000000 73 5.000000 74 8.000000 75 13.000000
result
12 0.000397
13 0.000610
14 0.001007
15 0.001282
16 0.000519
17 0.001160
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/* One entry of a document list: a document ID paired with a float score. */
typedef struct _SSDOC_ITEM_TMP
{
/* Document ID; stored as an unsigned short, so it cannot exceed that type's range. */
ushort iDocID;
/* Per-document weight. The name suggests "term frequency" (NOTE(review): confirm);
   the scoring code below divides it by MAX_USHORT and multiplies by a list weight. */
float tf;
} SSDOC_ITEM_TMP;
/* Type of the per-list weight passed to the scoring helpers. */
#define RTV_TYPE float
/* Divisor used to normalise a raw tf value before it is weighted. */
#define MAX_USHORT 65535
/* Type of the (currently unused) search-argument array handed to GetSameDocWrap. */
#define PAGE_SEARCH_ARG int
/* Number of document IDs per bucket. The expansion is parenthesised so that the
   macro behaves as a single value inside larger expressions such as
   `x / BUCKET_VECTOR_BLOCK` or `x % BUCKET_VECTOR_BLOCK`. */
#define BUCKET_VECTOR_BLOCK (64 * 1024)
/* Per-list metadata consumed by GetSameDocWrap: one instance describes one
   document list in the parallel pDocArray[]. */
typedef struct _TERM_DIC_HASH_VALUE_INTER
{
/* Pointer to the weight applied to this list's tf values; it is dereferenced
   unconditionally by the scoring code, so it must not be NULL. */
float * pfRtv;
/* Number of entries in the corresponding document list. */
int length;
}TERM_DIC_HASH_VALUE_INTER;
int find(int left,int right,ushort docid ,SSDOC_ITEM_TMP *data )
{
int mid =(left +right )/2;
if (left==mid )
{
if( docid==data [right].iDocID)
return right;
else if( docid==data [left].iDocID)
return left;
else
return -1;
}
else if( docid > data [ mid ].iDocID )
return find ( mid ,right ,docid , data);
else
return find (left , mid , docid , data);
}
/*
 * Intersect two document lists by binary-searching every entry of `left`
 * in `right`.
 *
 * For each document ID present in both lists an entry is appended to
 * common_doc at index nCommon (which is then incremented):
 *     tf = left.tf + (right.tf / MAX_USHORT) * rtv
 *
 * nCommon      in/out: next free slot in common_doc; the caller initialises it.
 * common_doc   output buffer; must have room for left_length more entries.
 * left         list whose entries are looked up (left_length entries).
 * right        list that is searched (right_length entries); find() is a
 *              binary search, so it must be ordered by ascending iDocID.
 * rtv          weight applied to the normalised tf of the `right` entry.
 *
 * Nothing is appended when either list is empty.
 */
void GetSameDocBSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{
    if (left_length <= 0 || right_length <= 0)
    {
        return; /* an empty list has no common documents */
    }

    /* find() takes INCLUSIVE bounds, so the last valid index is right_length - 1.
       Passing right_length itself would make it read one element past the end. */
    const int last = right_length - 1;

    for (int i = 0; i < left_length; i++, left++)
    {
        /* Keep the full int result: a short would wrap for indexes above 32767. */
        const int pos = find(0, last, left->iDocID, right);
        if (pos < 0)
        {
            continue;
        }
        common_doc[nCommon].iDocID = left->iDocID;
        common_doc[nCommon].tf = left->tf + (((float)right[pos].tf) / MAX_USHORT) * rtv;
        nCommon++;
    }
}
void GetSameDocMergeSearch( int &nCommon, SSDOC_ITEM_TMP *common_doc, SSDOC_ITEM_TMP *left,SSDOC_ITEM_TMP *right,
int left_length ,int right_length,RTV_TYPE rtv)
{
int left_tmp=0,right_tmp=0;
while(1)
{
if(left_tmp>=left_length)
{
return ;
}
if(right_tmp>=right_length)
{
return ;
}
if (left->iDocID ==right->iDocID)
{
common_doc[ nCommon].iDocID=left->iDocID ;
common_doc[ nCommon ].tf = left->tf + (((float)right->tf) / MAX_USHORT) * rtv;
nCommon++;
left_tmp++;
left++;
right_tmp++;
right++;
}
else if (left->iDocID < right->iDocID)
{
left_tmp++;
left++;
}
else
{
right_tmp++;
right++;
}
}
}
void GetSameDocWrap(ushort sBucketPos, int &nSameCount,SSDOC_ITEM_TMP *aSameDoc, int nDocArray, SSDOC_ITEM_TMP *pDocArray[],
PAGE_SEARCH_ARG *pArg, const TERM_DIC_HASH_VALUE_INTER * const pDocList, off_t *loffset)
{
off_t lloffset[nDocArray];
SSDOC_ITEM_TMP *pTmpDocArray[nDocArray];
memset(lloffset, 0, sizeof(off_t) * nDocArray);
memcpy(pTmpDocArray, pDocArray, nDocArray * sizeof(SSDOC_ITEM_TMP *));
SSDOC_ITEM_TMP result_common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
ushort left=pDocList[0].length;
memcpy ( result_common_doc , pDocArray[0], left * sizeof ( SSDOC_ITEM_TMP) );
for(int k=0;k<left ;k++)
{
result_common_doc[k].tf = (((float)(pTmpDocArray[0]+k)->tf) / MAX_USHORT) * (*(pDocList[0].pfRtv));
}
for( int j=1;j<nDocArray;j++)
{
ushort right=pDocList[j].length;
// if(left + right > left * log ( right )/log (2) )
if(0)
{
int nCommon=0;
SSDOC_ITEM_TMP common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
GetSameDocMergeSearch(nCommon , common_doc ,
result_common_doc,pDocArray[j], left ,right ,(*(pDocList[j].pfRtv)) );
if(0==nCommon)
{
return ;
}
else
{
left=nCommon;
memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
}
}
else
{
int nCommon=0;
SSDOC_ITEM_TMP common_doc[ BUCKET_VECTOR_BLOCK ]= {0};
GetSameDocBSearch(nCommon , common_doc ,result_common_doc ,pDocArray[j], left,right ,(*(pDocList[j].pfRtv)));
if(0==nCommon)
{
return ;
}
else
{
left=nCommon;
memcpy ( result_common_doc , common_doc , nCommon * sizeof ( SSDOC_ITEM_TMP));
}
}
}
nSameCount=left;
for(int kk=0;kk<left;kk++)
{
uint iDocID = aSameDoc[kk].iDocID = result_common_doc[kk].iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
aSameDoc[kk].tf = result_common_doc[kk].tf;
}
/*
{
ushort doc_count=pDocList[j]->iTotalDocCnt ;
if (doc_count > 0.1*MAX_DOC_COUNT)
{
uint iDocID = aSameDoc[nSameCount].iDocID = pTmpDocArray[j]->iDocID + BUCKET_VECTOR_BLOCK * sBucketPos;
ushort buc_pos =iDocID/256;
ushort buc_mod =iDocID%256;
lloffset[j] += pTmpDocArray[j]->nOccurs;
}
else
{
for(int ii=0;ii<pos ;i++)
{
lloffset[ii]+=(pTmpDocArray[j]+ii)->nOccurs;
}
}
}
*/
/*
while (pTmpDocArray[j]->iDocID < pTmpDocArray[j - 1]->iDocID)
{
lloffset[j] += pTmpDocArray[j]->nOccurs;
pTmpDocArray[j]++;
}
if (!pTmpDocArray[j]->nOccurs)
return;
*/
}
int main()
{
#define LENGTH 3
int nSameCount;
SSDOC_ITEM_TMP aSameDoc[MAX_USHORT ] ={0};
int nDocArray =LENGTH;
SSDOC_ITEM_TMP * pDocArray[LENGTH];
PAGE_SEARCH_ARG pArg[LENGTH];
TERM_DIC_HASH_VALUE_INTER pDocList[LENGTH]={0};
for(int i=0;i<nDocArray ;i++)
{
pDocList[i].pfRtv=(float *)calloc (1,sizeof (float ));
*(pDocList[i].pfRtv)=i+1;
pDocList[i].length=(i+2)*(i+2)*(i+2);
printf("length={%d}, pfRtv={ %f} ",pDocList[i].length, *(pDocList[i].pfRtv) );
}
printf("\n");
for(int i=0;i<nDocArray;i++)
{
printf("doc%d start \n",i);
pDocArray[i]=(SSDOC_ITEM_TMP *)calloc ( pDocList[i].length ,sizeof ( SSDOC_ITEM_TMP ));
for(int j=0;j< pDocList[i].length;j++)
{
pDocArray[i][j].iDocID=i+j+10;
pDocArray[i][j].tf=(j*j+2+i)%20;
printf("%d %f ", pDocArray[i][j].iDocID,pDocArray[i][j].tf);
}
printf("\n");
}
GetSameDocWrap ( 0 , nSameCount ,aSameDoc ,nDocArray , pDocArray , pArg , pDocList ,0);
printf("result \n");
for(int i=0;i<nSameCount;i++)
{
printf("%d %f \n", aSameDoc[i].iDocID,aSameDoc[i].tf);
}
}
- 文档合并打分程序
- 将败者树运用到文档匹配打分程序中
- 类似打分程序
- SVN打分支,分支合并操作
- 使用SmartSVN打分支与合并代码
- SVN合并分支、分支打分支,分支合并分支
- 合并文档
- 打分
- 为译文信息打分的程序
- 影响 Lucene对文档打分的四种方式
- 影响Lucene对文档打分的四种方式
- 影响Lucene对文档打分的四种方式
- 全文搜索怎么给查询语句与文档相关性打分
- 如何合并pdf文档
- 批量合并word文档
- 实现word文档合并
- word 文档合并
- C# 合并word文档
- 基于VC6.0+WDK的环境搭建及简单实例
- 爱情的人生
- 学习外语几点
- MySQL 之 其他存储引擎
- linux虚拟机的一些觉问题的解决方法
- 文档合并打分程序
- 淘宝CDN系统架构
- IPC--消息队列 message queue(消息队列的创建,信息的发送和接收)
- 设计师必备的用户界面设计工具,工具包和资源
- 《Linux那些事儿之我是USB》我是U盘(27)彼岸花的传说(六)
- 某人收集的架构blog
- public protected private 成员函数和成员变量在public protected private 继承后访问权限问题
- Ts'o其人
- 学习ISA2004——《ISA Server 2004 常规部署方案》