sphinx源码分析之搜索(search)

来源：互联网发布：时间管理app 知乎编辑：程序博客网时间：2024/05/21 13:19

此处分析用的源码为最新版本1.0 beta版的。

//search.cpp int main ( int argc, char ** argv )

让我们从程序入口点开始进行旅程。search的入口点在文件search.cpp中，打开后定位到int main ( int argc, char ** argv )开始我们的分析：

在main中开始部分进行参数检查和配置信息的load，先进行命令行参数的检查和设置，如下：

///////////////////////////////////////////// get query and other commandline options///////////////////////////////////////////CSphQuery tQuery;char sQuery [ 1024 ];sQuery[0] = '/0';const char * sOptConfig = NULL;const char * sIndex = NULL;bool bNoInfo = false;bool bStdin = false;int iStart = 0;int iLimit = 20;#define OPT(_a1,_a2)else if ( !strcmp(argv[i],_a1) || !strcmp(argv[i],_a2) )#define OPT1(_a1)else if ( !strcmp(argv[i],_a1) )int i;for ( i=1; i<argc; i++ ){if ( argv[i][0]=='-' ){// this is an optionif ( i==0 );OPT ( "-a", "--any" )tQuery.m_eMode = SPH_MATCH_ANY;OPT ( "-b", "--boolean" )tQuery.m_eMode = SPH_MATCH_BOOLEAN;OPT ( "-p", "--phrase" )tQuery.m_eMode = SPH_MATCH_PHRASE;OPT ( "-e", "--ext" )tQuery.m_eMode = SPH_MATCH_EXTENDED;OPT ( "-e2", "--ext2" )tQuery.m_eMode = SPH_MATCH_EXTENDED2;OPT ( "-q", "--noinfo" )bNoInfo = true;OPT1 ( "--sort=date" )tQuery.m_eSort = SPH_SORT_ATTR_DESC;OPT1 ( "--rsort=date" )tQuery.m_eSort = SPH_SORT_ATTR_ASC;OPT1 ( "--sort=ts" )tQuery.m_eSort = SPH_SORT_TIME_SEGMENTS;OPT1 ( "--stdin" )bStdin = true;else if ( (i+1)>=argc )break;OPT ( "-o", "--offset" )iStart = atoi ( argv[++i] );OPT ( "-l", "--limit" )iLimit = atoi ( argv[++i] );OPT ( "-c", "--config" )sOptConfig = argv[++i];OPT ( "-i", "--index" )sIndex = argv[++i];OPT ( "-g", "--group" ){ tQuery.m_eGroupFunc = SPH_GROUPBY_ATTR; tQuery.m_sGroupBy = argv[++i]; }OPT ( "-gs","--groupsort" ){ tQuery.m_sGroupSortBy = argv[++i]; } // NOLINTOPT ( "-s", "--sortby" ){ tQuery.m_eSort = SPH_SORT_EXTENDED; tQuery.m_sSortBy = argv[++i]; }OPT ( "-S", "--sortexpr" ){ tQuery.m_eSort = SPH_SORT_EXPR; tQuery.m_sSortBy = argv[++i]; }else if ( (i+2)>=argc )break;OPT ( "-f", "--filter" ){DWORD uVal = strtoul ( argv[i+2], NULL, 10 );tQuery.m_dFilters.Reset ();tQuery.m_dFilters.Resize ( 1 );tQuery.m_dFilters[0].m_eType = SPH_FILTER_VALUES;tQuery.m_dFilters[0].m_dValues.Reset ();tQuery.m_dFilters[0].m_dValues.Add ( uVal );tQuery.m_dFilters[0].m_sAttrName = argv[i+1];i += 2;} elsebreak; // 
unknown option} else if ( strlen(sQuery) + strlen(argv[i]) + 1 < sizeof(sQuery) ){// this is a search termstrcat ( sQuery, argv[i] ); // NOLINTstrcat ( sQuery, " " ); // NOLINT}}iStart = Max ( iStart, 0 );iLimit = Max ( iLimit, 0 );if ( i!=argc ){fprintf ( stdout, "ERROR: malformed or unknown option near '%s'./n", argv[i] );return 1;}#undef OPTif ( bStdin ){int iPos = 0, iLeft = sizeof(sQuery)-1;char sThrowaway [ 256 ];while ( !feof(stdin) ){if ( iLeft>0 ){int iLen = fread ( sQuery, 1, iLeft, stdin );iPos += iLen;iLeft -= iLen;} else{int iDummy; // to avoid gcc unused result warningiDummy = fread ( sThrowaway, 1, sizeof(sThrowaway), stdin );}}assert ( iPos<(int)sizeof(sQuery) );sQuery[iPos] = '/0';}

所有的查询信息都封装在CSphQuery tQuery;中，结构CSphQuery的定义可以查看源文件，此处不了解也没关系，影响不大，只要清楚它只起容器的作用，用于盛放所有查询参数就ok了。

继续旅程，下面进入config的加载部分：

/////////////// configure/////////////tQuery.m_iMaxMatches = Max ( 1000, iStart + iLimit );CSphConfigParser cp;CSphConfig & hConf = cp.m_tConf;sphLoadConfig ( sOptConfig, false, cp );

可以看到通过类CSphConfigParser对配置文件进行解析，最终解析结果存放在结构CSphConfig中，其实CSphConfig就是一个hashtable一样的东东，config文件中的source、index、searchd等等所有节的东西都被加载进入了CSphConfig中。

现在所有的参数信息不管是命令行传入的还是从config文件加载进入的都已经就绪，下面就该开始真正的工作了。

首先说明一下查询的流程：查询就是打开索引文件，用输入的查询词在索引文件中逐个进行比较，找到满足匹配关系的文档并读出，给每个文档打分，打分完成后进行排序，最终得到排序后的文档列表的过程。

//////////// search//////////tQuery.m_sQuery = sQuery;CSphQueryResult * pResult = NULL; //查询结果存放指针CSphIndex * pIndex = sphCreateIndexPhrase ( hIndex["path"].cstr() ); //用索引文件存放路径构造CSphIndex对象pIndex->m_bEnableStar = ( hIndex.GetInt("enable_star")!=0 );pIndex->SetWordlistPreload ( hIndex.GetInt("ondisk_dict")==0 );

数据库中可能存在多个索引文件，而查询一次在一个索引中进行。

for ( ; pIndex; ){if ( !pIndex->Prealloc ( false, false, sWarning ) || !pIndex->Preread() ){sError = pIndex->GetLastError ();break;}const CSphSchema * pSchema = &pIndex->GetMatchSchema();

pIndex->Prealloc ( false, false, sWarning )预先分配足够的内存，用于存放cache数据

!pIndex->Preread() 预先读取所有的需要cache起来的数据，存放在Prealloc 分配的空间中
获取索引关联的数据模式const CSphSchema * pSchema = &pIndex->GetMatchSchema();

查询：

// do queryingISphMatchSorter * pTop = sphCreateQueue ( &tQuery, pIndex->GetMatchSchema(), sError );//创建一个优先级队列用于对文档集过滤，存放分值最高的文档docif ( !pTop ){sError.SetSprintf ( "failed to create sorting queue: %s", sError.cstr() );break;}pResult = new CSphQueryResult(); //实例化结果对象CSphQueryResultif ( !pIndex->MultiQuery ( &tQuery, pResult, 1, &pTop, NULL ) ) //执行最终的查询动作{// failure; pull that error messagesError = pIndex->GetLastError();SafeDelete ( pResult );} else{// success; fold them matchespResult->m_dMatches.Reset ();pResult->m_iTotalMatches += pTop->GetTotalCount();pResult->m_tSchema = pTop->GetSchema();sphFlattenQueue ( pTop, pResult, 0 ); //用查询结果填充pResult，以便返回使用}SafeDelete ( pTop );

pIndex->MultiQuery ( &tQuery, pResult, 1, &pTop, NULL )进行最终的查询过程。

函数MultiQuery的原型为virtual bool MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag=0 ) const = 0;

在后面还将对其进行分析。

MultiQuery的查询结果由pResult和pTop返回，在pTop中存放的是docs数据，里面的数据需要进行sphFlattenQueue ( pTop, pResult, 0 )操作后才能copy到pResult中，sphFlattenQueue()的运算过程就是一个优先队列的出队操作，最后的结果就是rank从高到低的排列。pResult返回的结果是一些对全体文档都有效的数据，如字符串属性的字符串值、多值属性的值等。

查询得到结果后下面的代码只是进行简单的终端输出了，没什么好分析的了：

/////////// print/////////if ( !pResult ){fprintf ( stdout, "index '%s': search error: %s./n", sIndexName, sError.cstr() );return 1;}fprintf ( stdout, "index '%s': query '%s': returned %d matches of %d total in %d.%03d sec/n",sIndexName, sQuery, pResult->m_dMatches.GetLength(), pResult->m_iTotalMatches,pResult->m_iQueryTime/1000, pResult->m_iQueryTime%1000 );if ( !pResult->m_sWarning.IsEmpty() )fprintf ( stdout, "WARNING: %s/n", pResult->m_sWarning.cstr() );if ( pResult->m_dMatches.GetLength() ){fprintf ( stdout, "/ndisplaying matches:/n" );int iMaxIndex = Min ( iStart+iLimit, pResult->m_dMatches.GetLength() );for ( int i=iStart; i<iMaxIndex; i++ ){CSphMatch & tMatch = pResult->m_dMatches[i];fprintf ( stdout, "%d. document=" DOCID_FMT ", weight=%d", 1+i, tMatch.m_iDocID, tMatch.m_iWeight );for ( int j=0; j<pResult->m_tSchema.GetAttrsCount(); j++ ){const CSphColumnInfo & tAttr = pResult->m_tSchema.GetAttr(j);fprintf ( stdout, ", %s=", tAttr.m_sName.cstr() );if ( tAttr.m_eAttrType & SPH_ATTR_MULTI ){fprintf ( stdout, "(" );SphAttr_t iIndex = tMatch.GetAttr ( tAttr.m_tLocator );if ( iIndex ){const DWORD * pValues = pResult->m_pMva + iIndex;int iValues = *pValues++;for ( int k=0; k<iValues; k++ )fprintf ( stdout, k ? 
",%u" : "%u", *pValues++ );}fprintf ( stdout, ")" );} else switch ( tAttr.m_eAttrType ){case SPH_ATTR_INTEGER:case SPH_ATTR_ORDINAL:case SPH_ATTR_BOOL:fprintf ( stdout, "%u", (DWORD)tMatch.GetAttr ( tAttr.m_tLocator ) ); break;case SPH_ATTR_TIMESTAMP:fprintf ( stdout, "%s", myctime ( (DWORD)tMatch.GetAttr ( tAttr.m_tLocator ) ) ); break;case SPH_ATTR_FLOAT:fprintf ( stdout, "%f", tMatch.GetAttrFloat ( tAttr.m_tLocator ) ); break;case SPH_ATTR_BIGINT:fprintf ( stdout, INT64_FMT, tMatch.GetAttr ( tAttr.m_tLocator ) ); break;case SPH_ATTR_STRING:{const BYTE * pStr;int iLen = sphUnpackStr ( pResult->m_pStrings + tMatch.GetAttr ( tAttr.m_tLocator ), &pStr );//从tMatch.GetAttr ( tAttr.m_tLocator )得到字符串属性的存放地址偏移加上pResult->m_pStrings就是对于的字符串值fwrite ( pStr, 1, iLen, stdout );break;}default:fprintf ( stdout, "(unknown-type-%d)", tAttr.m_eAttrType );}}fprintf ( stdout, "/n" );

下面对文档排序器(ISphMatchSorter )的创建过程简单分析：

//sphinxsort.cpp ISphMatchSorter * sphCreateQueue ( const CSphQuery * pQuery, const CSphSchema & tSchema, CSphString & sError, bool bComputeItems )

此函数中大部分的代码段都是根据pQuery中的配置对ISphMatchSorter进行定制的，源程序中的注释已经很详细，具体可以参考源代码。在完成所有需要定制的功能的设定后，最终如下面代码片段，进行实际的排序器构造：

switch ( eMatchFunc ){case FUNC_REL_DESC:pTop = new CSphMatchQueue<MatchRelevanceLt_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_ATTR_DESC:pTop = new CSphMatchQueue<MatchAttrLt_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_ATTR_ASC:pTop = new CSphMatchQueue<MatchAttrGt_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_TIMESEGS:pTop = new CSphMatchQueue<MatchTimeSegments_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_GENERIC2:pTop = new CSphMatchQueue<MatchGeneric2_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_GENERIC3:pTop = new CSphMatchQueue<MatchGeneric3_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_GENERIC4:pTop = new CSphMatchQueue<MatchGeneric4_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_GENERIC5:pTop = new CSphMatchQueue<MatchGeneric5_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_CUSTOM:pTop = new CSphMatchQueue<MatchCustom_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;case FUNC_EXPR:pTop = new CSphMatchQueue<MatchExpr_fn>( pQuery->m_iMaxMatches, bUsesAttrs ); break;default:pTop = NULL;}

还是按着查询的路线一直走下去，下面分析查询过程。

//sphinx.cpp bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const

bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const{assert ( pQuery );MEMORY ( SPH_MEM_IDX_DISK_MULTY_QUERY );// to avoid the checking of a ppSorters's element for NULL on every next step, just filter out all nulls right hereCSphVector<ISphMatchSorter*> dSorters;dSorters.Reserve ( iSorters );for ( int i=0; i<iSorters; i++ )if ( ppSorters[i] )dSorters.Add ( ppSorters[i] );iSorters = dSorters.GetLength();// if we have anything to work withif ( iSorters==0 )return false;// non-random at the start, random at the enddSorters.Sort ( CmpPSortersByRandom_fn() );// fast path for scansif ( pQuery->m_sQuery.IsEmpty() )return MultiScan ( pQuery, pResult, iSorters, &dSorters[0], pExtraFilters, iTag );ISphTokenizer * pTokenizer = m_pTokenizer->Clone ( false ); //复制一个分词器，也许是为了多线程环境了高效和安全吧CSphScopedPtr<CSphDict> tDict ( NULL );CSphDict * pDict = SetupStarDict ( tDict, *pTokenizer );//给字典加个修饰以处理星号(*)查询CSphScopedPtr<CSphDict> tDict2 ( NULL );pDict = SetupExactDict ( tDict2, pDict, *pTokenizer );// parse queryXQQuery_t tParsed;/* 用输入的查询词(由pQuery->m_sQuery.cstr()得到)解析成词条(term)后，最终构造一棵查询树,* 这树结构纯属本人幻想的，事实是否如此还得以后分析，只是觉得二叉树好玩而已* 如：a b c * and * //* and c* // * a b*/if ( !sphParseExtendedQuery ( tParsed, pQuery->m_sQuery.cstr(), pTokenizer, &m_tSchema, pDict ) ) {pResult->m_sError = tParsed.m_sParseError;SafeDelete ( pTokenizer );return false;}// fixup stat's ordersphDoStatsOrder ( tParsed.m_pRoot, *pResult );// transform query if needed (quorum transform, keyword expansion, etc.)TransformQuorum ( &tParsed.m_pRoot );if ( m_bExpandKeywords )tParsed.m_pRoot = ExpandKeywords ( tParsed.m_pRoot, m_tSettings );// flag common subtreesCSphVector<XQNode_t*> dTrees;dTrees.Add ( tParsed.m_pRoot );int iCommonSubtrees = sphMarkCommonSubtrees ( dTrees );CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );bool bResult = 
ParsedMultiQuery ( pQuery, pResult, iSorters, &dSorters[0], tParsed.m_pRoot, pDict, pExtraFilters, &tNodeCache, iTag );//用解析出的查询树进行最终的search工作SafeDelete ( pTokenizer );return bResult;}

//sphinx.cpp bool CSphIndex_VLN::ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const XQNode_t * pRoot, CSphDict * pDict, const CSphVector<CSphFilterSettings> * pExtraFilters, CSphQueryNodeCache * pNodeCache, int iTag ) const

开头部分只是一些索引查询前的铺垫工作，为下一层接口提供需要的参数的封装：

// start countingint64_t tmQueryStart = sphMicroTimer();///////////////////// setup searching///////////////////PROFILER_INIT ();PROFILE_BEGIN ( query_init );// non-ready index, empty response!if ( m_bPreread.IsEmpty() || !m_bPreread[0] ){pResult->m_sError = "index not preread";return false;}// setup calculations and result schemaCSphQueryContext tCtx;if ( !tCtx.SetupCalc ( pResult, ppSorters[0]->GetSchema(), m_tSchema, GetMVAPool() ) )return false;// open filesCSphAutofile tDoclist, tHitlist, tWordlist, tDummy;if ( !m_bKeepFilesOpen ){if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, pResult->m_sError ) < 0 )return false;if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, pResult->m_sError ) < 0 )return false;if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, pResult->m_sError ) < 0 )return false;}// setup search termsDiskIndexQwordSetup_c tTermSetup ( m_bKeepFilesOpen ? m_tDoclistFile : tDoclist,m_bKeepFilesOpen ? m_tHitlistFile : tHitlist,m_bPreloadWordlist ? tDummy : ( m_bKeepFilesOpen ? m_tWordlistFile : tWordlist ) );tTermSetup.m_pDict = pDict;tTermSetup.m_pIndex = this;tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;tTermSetup.m_tMin.m_iDocID = m_tMin.m_iDocID;if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ){tTermSetup.m_tMin.Clone ( m_tMin, m_tSchema.GetRowSize() );tTermSetup.m_iInlineRowitems = m_tSchema.GetRowSize();}tTermSetup.m_iDynamicRowitems = pResult->m_tSchema.GetDynamicSize();if ( pQuery->m_uMaxQueryMsec>0 )tTermSetup.m_iMaxTimer = sphMicroTimer() + pQuery->m_uMaxQueryMsec*1000; // max_query_timetTermSetup.m_pWarning = &pResult->m_sWarning;tTermSetup.m_bSetupReaders = true;tTermSetup.m_pCtx = &tCtx;tTermSetup.m_pNodeCache = pNodeCache;// bind weightstCtx.BindWeights ( pQuery, m_tSchema );

// setup query// must happen before index-level reject, in order to build proper keyword statsCSphScopedPtr<ISphRanker> pRanker ( sphCreateRanker ( pRoot, pQuery->m_eRanker, pResult, tTermSetup ) ); //创建一个打分器(ranker)，此打分器为下面find的每一个doc都进行打分处理if ( !pRanker.Ptr() )return false;

//////////////////////////////////////// find and weight matching documents 查找和给每一个doc打分//////////////////////////////////////PROFILE_BEGIN ( query_match );switch ( pQuery->m_eMode ){case SPH_MATCH_ALL:case SPH_MATCH_PHRASE:case SPH_MATCH_ANY:case SPH_MATCH_EXTENDED:case SPH_MATCH_EXTENDED2:case SPH_MATCH_BOOLEAN:if ( !MatchExtended ( &tCtx, pQuery, iSorters, ppSorters, pRanker.Ptr(), iTag ) ) //从索引中进行最终的查询return false;break;default:sphDie ( "INTERNAL ERROR: unknown matching mode (mode=%d)", pQuery->m_eMode );}PROFILE_END ( query_match );

最终的查找，并对找到的doc进行ranking

现在我们已经从index中获取到docs，此时的doc缺乏docinfo信息，下面给它们安装上docinfo：

// adjust result setsfor ( int iSorter=0; iSorter<iSorters; iSorter++ ){ISphMatchSorter * pTop = ppSorters[iSorter];// final lookup and/or calcbool bFinalLookup = !tCtx.m_bLookupFilter && !tCtx.m_bLookupSort;if ( pTop->GetLength() && ( bFinalLookup || tCtx.m_dCalcFinal.GetLength() ) ){const int iCount = pTop->GetLength ();CSphMatch * const pHead = pTop->First();CSphMatch * const pTail = pHead + iCount;for ( CSphMatch * pCur=pHead; pCur<pTail; pCur++ ){if ( bFinalLookup )CopyDocinfo ( &tCtx, *pCur, FindDocinfo ( pCur->m_iDocID ) );//给找到的doc安装docinfotCtx.CalcFinal ( *pCur );}}// mva and string pools ptrspResult->m_pMva = m_pMva.GetWritePtr();pResult->m_pStrings = m_pStrings.GetWritePtr(); //给查询结果对象设置全局字符串数据引用}

//sphinx.cpp bool CSphIndex_VLN::MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const

bool CSphIndex_VLN::MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const{int iCutoff = pQuery->m_iCutoff;if ( iCutoff<=0 )iCutoff = -1;// do searchingCSphMatch * pMatch = pRanker->GetMatchesBuffer();for ( ;; ){int iMatches = pRanker->GetMatches ( pCtx->m_iWeights, pCtx->m_dWeights ); //从index获取到匹配的docsif ( iMatches<=0 )break;for ( int i=0; i<iMatches; i++ ){if ( pCtx->m_bLookupSort )CopyDocinfo ( pCtx, pMatch[i], FindDocinfo ( pMatch[i].m_iDocID ) );pCtx->CalcSort ( pMatch[i] );if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )continue;pMatch[i].m_iTag = iTag;bool bRand = false;bool bNewMatch = false;for ( int iSorter=0; iSorter<iSorters; iSorter++ ){// all non-random sorters are in the beginning,// so we can avoid the simple 'first-element' assertionif ( !bRand && ppSorters[iSorter]->m_bRandomize ){bRand = true;pMatch[i].m_iWeight = ( sphRand() & 0xffff );if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )break;}bNewMatch |= ppSorters[iSorter]->Push ( pMatch[i] ); //把doc放入排序队列中}if ( bNewMatch )if ( --iCutoff==0 )break;}if ( iCutoff==0 )break;}return true;}

***********************************************************************************************************************************************

一直到现在，都停留在index抽象层中，没有涉及到任何具体的磁盘IO，从下面开始我们将进入外存空间，进行具体的读磁盘索引操作。

***********************************************************************************************************************************************

//sphinxsearch.cpp template < typename STATE > int ExtRanker_T<STATE>::GetMatches ( int iFields, const int * pWeights )

从磁盘中读取匹配到的docs块和hits块

/// Produce the next batch of ranked matches into m_dMatches.
///
/// Walks the current docs chunk and hits chunk in parallel, feeding every hit
/// of a document into the ranking state and finalizing the weight when the
/// document's hits are over. The current chunk pointers are kept in
/// m_pDoclist/m_pHitlist so the next call resumes where this one stopped.
///
/// @param iFields  forwarded to the STATE constructor
/// @param pWeights forwarded to the STATE constructor
/// @return number of matches stored into m_dMatches; 0 when there is no query
///         root or no (more) docs/hits
template < typename STATE >
int ExtRanker_T<STATE>::GetMatches ( int iFields, const int * pWeights )
{
    STATE tState ( iFields, pWeights, this );

    if ( !m_pRoot )
        return 0;

    int iMatches = 0;
    const ExtHit_t * pHlist = m_pHitlist;
    const ExtDoc_t * pDocs = m_pDoclist;

    // warmup if necessary
    if ( !pHlist )
    {
        if ( !pDocs ) pDocs = GetFilteredDocs (); // annotator: docs come from the .spd file
        if ( !pDocs ) return iMatches;
        pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID ); // annotator: hits come from the .spp file
        if ( !pHlist ) return iMatches;
    }

    // main matching loop
    const ExtDoc_t * pDoc = pDocs;
    for ( SphDocID_t uCurDocid=0; iMatches<ExtNode_i::MAX_DOCS; )
    {
        // keep ranking
        while ( pHlist->m_uDocid==uCurDocid )
            tState.Update ( pHlist++ );

        // if hits block is over, get next block, but do *not* flush current doc
        if ( pHlist->m_uDocid==DOCID_MAX )
        {
            assert ( pDocs );
            pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID ); // fetch the next hits chunk
            if ( pHlist )
                continue;
        }

        // otherwise (new match or no next hits block), flush current doc
        if ( uCurDocid )
        {
            assert ( uCurDocid==pDoc->m_uDocid );
            Swap ( m_dMatches[iMatches], m_dMyMatches[pDoc-m_dMyDocs] );
            m_dMatches[iMatches].m_iWeight = tState.Finalize ( m_dMatches[iMatches].m_iWeight );
            iMatches++;
        }

        // boundary checks
        if ( !pHlist )
        {
            // there are no more hits for current docs block; do we have a next one?
            assert ( pDocs );
            pDoc = pDocs = GetFilteredDocs ();

            // we don't, so bail out
            if ( !pDocs )
                break;

            // we do, get some hits
            pHlist = m_pRoot->GetHitsChunk ( pDocs, m_uMaxID );
            assert ( pHlist ); // fresh docs block, must have hits
        }

        // skip until next good doc/hit pair
        assert ( pDoc->m_uDocid<=pHlist->m_uDocid );
        while ( pDoc->m_uDocid<pHlist->m_uDocid ) pDoc++; // skip docs that have no hits
        assert ( pDoc->m_uDocid==pHlist->m_uDocid );

        uCurDocid = pHlist->m_uDocid;
    }

    m_pDoclist = pDocs;  // remember the current docs chunk for the next call
    m_pHitlist = pHlist; // remember the current hits chunk for the next call
    return iMatches;     // number of matches produced
}

//sphinxsearch.cpp const ExtDoc_t * ExtRanker_c::GetFilteredDocs ()

这函数没什么好说的，对查询树找到的结果进行过滤，因为有些match是不想要的

/// Fetch the next docs chunk from the query tree and drop early-rejected docs.
///
/// Keeps asking the root node for chunks until one of them has at least one
/// document that survives m_pIndex->EarlyReject(). Survivors are copied into
/// m_dMyDocs (terminated by a DOCID_MAX entry) and a matching CSphMatch, with
/// its weight seeded from the doc's TF-IDF value, is placed in m_dMyMatches.
///
/// @return m_dMyDocs when at least one document survived, NULL when the root
///         node has no more chunks
const ExtDoc_t * ExtRanker_c::GetFilteredDocs ()
{
    for ( ;; )
    {
        // get another chunk
        m_uMaxID = 0;
        const ExtDoc_t * pCand = m_pRoot->GetDocsChunk ( &m_uMaxID ); // lookup through the query tree
        if ( !pCand )
            return NULL;

        // create matches, and filter them
        int iDocs = 0;
        while ( pCand->m_uDocid!=DOCID_MAX )
        {
            m_tTestMatch.m_iDocID = pCand->m_uDocid;
            if ( pCand->m_pDocinfo )
                memcpy ( m_tTestMatch.m_pDynamic, pCand->m_pDocinfo, m_iInlineRowitems*sizeof(CSphRowitem) );

            // throw away matches that do not qualify
            if ( m_pIndex->EarlyReject ( m_pCtx, m_tTestMatch ) )
            {
                pCand++;
                continue;
            }

            m_dMyDocs[iDocs] = *pCand;
            m_tTestMatch.m_iWeight = (int)( (pCand->m_fTFIDF+0.5f)*SPH_BM25_SCALE ); // FIXME! bench bNeedBM25 (annotator: BM25 fine-tuning)
            Swap ( m_tTestMatch, m_dMyMatches[iDocs] );
            iDocs++;
            pCand++;
        }

        if ( iDocs )
        {
            m_dMyDocs[iDocs].m_uDocid = DOCID_MAX; // terminator entry
            return m_dMyDocs;
        }
    }
}

//sphinxsearch.cpp const ExtDoc_t * ExtAnd_c::GetDocsChunk ( SphDocID_t * pMaxID )

前面已经提到过查询是分解到一棵查询树上进行的，树的叶子节点为原子词条，而分支节点为and、or等对孩子节点的组合操作，从下面的函数可以看到这点，这里是一个and节点，它将两个孩子节点的查询结果进行集合运算中的and运算：

/// AND node: emit the documents that appear in both children's doc lists.
///
/// Merges the two children's docid-ordered chunks, keeping only docids present
/// in both. For each common doc the field masks are OR-ed and the TF-IDF
/// values are summed. The per-child cursors are saved in m_pCurDoc so the next
/// call continues from the same position.
///
/// @param pMaxID forwarded to ReturnDocsChunk()
/// @return the result of ReturnDocsChunk(), or NULL when either child has no
///         more docs and nothing has been emitted yet
const ExtDoc_t * ExtAnd_c::GetDocsChunk ( SphDocID_t * pMaxID )
{
    m_uMaxID = 0;
    const ExtDoc_t * pCur0 = m_pCurDoc[0];
    const ExtDoc_t * pCur1 = m_pCurDoc[1];

    int iDoc = 0;
    CSphRowitem * pDocinfo = m_pDocinfo;
    for ( ;; )
    {
        // if any of the pointers is empty, *and* there is no data yet, process next child chunk
        // if there is data, we can't advance, because child hitlist offsets would be lost
        if ( !pCur0 || !pCur1 )
        {
            if ( iDoc!=0 )
                break;

            if ( !pCur0 ) pCur0 = m_pChildren[0]->GetDocsChunk ( NULL ); // docs from the first child
            if ( !pCur1 ) pCur1 = m_pChildren[1]->GetDocsChunk ( NULL ); // docs from the second child
            if ( !pCur0 || !pCur1 )
            {
                m_pCurDoc[0] = NULL;
                m_pCurDoc[1] = NULL;
                return NULL;
            }
        }

        // find common matches
        assert ( pCur0 && pCur1 );
        while ( iDoc<MAX_DOCS-1 )
        {
            // intersect the two children: docs whose id is not in both lists are skipped

            // find next matching docid
            while ( pCur0->m_uDocid < pCur1->m_uDocid ) pCur0++;
            if ( pCur0->m_uDocid==DOCID_MAX ) { pCur0 = NULL; break; }

            while ( pCur1->m_uDocid < pCur0->m_uDocid ) pCur1++;
            if ( pCur1->m_uDocid==DOCID_MAX ) { pCur1 = NULL; break; }

            if ( pCur0->m_uDocid!=pCur1->m_uDocid ) continue;

            // emit it
            ExtDoc_t & tDoc = m_dDocs[iDoc++];
            tDoc.m_uDocid = pCur0->m_uDocid;
            tDoc.m_uFields = pCur0->m_uFields | pCur1->m_uFields;
            tDoc.m_uHitlistOffset = -1;
            tDoc.m_fTFIDF = pCur0->m_fTFIDF + pCur1->m_fTFIDF; // the two terms' scores are added into one value
            CopyExtDocinfo ( tDoc, *pCur0, &pDocinfo, m_iStride );

            // skip it
            pCur0++; if ( pCur0->m_uDocid==DOCID_MAX ) pCur0 = NULL;
            pCur1++; if ( pCur1->m_uDocid==DOCID_MAX ) pCur1 = NULL;
            if ( !pCur0 || !pCur1 ) break;
        }
    }

    m_pCurDoc[0] = pCur0;
    m_pCurDoc[1] = pCur1;

    return ReturnDocsChunk ( iDoc, pMaxID );
}

//sphinxsearch.cpp const ExtDoc_t * ExtTerm_c::GetDocsChunk ( SphDocID_t * pMaxID )

下面看看对于原子词条是怎么读取和它相对应的docs列表的：

/// Term (leaf) node: read the next chunk of documents for a single keyword.
///
/// Reads up to MAX_DOCS-1 documents via m_pQword->GetNextDoc(), skipping docs
/// whose field mask has no bit in common with m_uFields, and fills m_dDocs with
/// docid, docinfo pointer, hitlist position, field mask and a TF-IDF value.
///
/// @param pMaxID forwarded to ReturnDocsChunk()
/// @return the result of ReturnDocsChunk(); NULL when the keyword has no docs
///         left, the query time limit has passed, or an interrupt was requested
///         (the latter two also set *m_pWarning when m_pWarning is non-NULL)
const ExtDoc_t * ExtTerm_c::GetDocsChunk ( SphDocID_t * pMaxID )
{
    if ( !m_pQword->m_iDocs )
        return NULL;

    m_uMaxID = 0;

    // max_query_time
    if ( m_iMaxTimer>0 && sphMicroTimer()>=m_iMaxTimer )
    {
        if ( m_pWarning )
            *m_pWarning = "query time exceeded max_query_time";
        return NULL;
    }

    // interrupt by sigterm
    if ( m_bInterruptNow )
    {
        if ( m_pWarning )
            *m_pWarning = "Server shutdown in progress";
        return NULL;
    }

    int iDoc = 0;
    CSphRowitem * pDocinfo = m_pDocinfo;
    while ( iDoc<MAX_DOCS-1 )
    {
        const CSphMatch & tMatch = m_pQword->GetNextDoc ( pDocinfo );
        if ( !tMatch.m_iDocID )
        {
            // a zero docid marks the end of this keyword's doclist
            m_pQword->m_iDocs = 0;
            break;
        }
        if (!( m_pQword->m_uFields & m_uFields ))
            continue;

        ExtDoc_t & tDoc = m_dDocs[iDoc++];
        tDoc.m_uDocid = tMatch.m_iDocID;
        tDoc.m_pDocinfo = pDocinfo;
        tDoc.m_uHitlistOffset = m_pQword->m_iHitlistPos; // offset into the hit file
        tDoc.m_uFields = m_pQword->m_uFields & m_uFields; // OPTIMIZE: only needed for phrase node
        tDoc.m_fTFIDF = float(m_pQword->m_uMatchHits) / float(m_pQword->m_uMatchHits+SPH_BM25_K1) * m_fIDF; // BM25-style weight
        pDocinfo += m_iStride;
    }

    m_pHitDoc = NULL;

    return ReturnDocsChunk ( iDoc, pMaxID );
}

//sphinx.cpp virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )

/// Decode the next document entry of this keyword from the doclist reader.
///
/// Docids are stored as deltas: a zero delta ends the list, in which case
/// m_tDoc.m_iDocID is reset to 0. Otherwise the docid is advanced and the
/// members m_uMatchHits, m_uFields and m_iHitlistPos are refreshed from the
/// stream, using either the INLINE_HITS layout or the plain offset layout.
///
/// @param pDocinfo destination for m_iInlineAttrs inline attribute values;
///                 must be non-NULL when INLINE_DOCINFO is set
/// @return reference to m_tDoc (its m_iDocID is 0 at end of list)
virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )
{
    SphDocID_t iDelta = m_rdDoclist.UnzipDocid(); // docid delta
    if ( iDelta )
    {
        m_tDoc.m_iDocID += iDelta; // rebuild the absolute docid
        if ( INLINE_DOCINFO )
        {
            assert ( pDocinfo );
            for ( int i=0; i<m_iInlineAttrs; i++ )
                pDocinfo[i] = m_rdDoclist.UnzipInt() + m_pInlineFixup[i];
        }

        if ( INLINE_HITS )
        {
            m_uMatchHits = m_rdDoclist.UnzipInt();
            const DWORD uFirst = m_rdDoclist.UnzipInt();
            if ( m_uMatchHits==1 )
            {
                const DWORD uField = m_rdDoclist.UnzipInt(); // field and end marker
                // NOTE(review): a single hit appears to be packed into the position
                // itself, with the top bit as an "inlined" flag; confirm against the hit reader
                m_iHitlistPos = uFirst | ( uField << 23 ) | ( U64C(1)<<63 );
                m_uFields = 1 << ( uField >> 1 );
            } else
            {
                m_uFields = uFirst;
                m_uHitPosition += m_rdDoclist.UnzipOffset();
                m_iHitlistPos = m_uHitPosition;
            }
        } else
        {
            SphOffset_t iDeltaPos = m_rdDoclist.UnzipOffset();
            assert ( iDeltaPos>=0 );
            m_iHitlistPos += iDeltaPos; // hitlist offset, used later when the hitlist is read
            m_uFields = m_rdDoclist.UnzipInt(); // field bitmask
            m_uMatchHits = m_rdDoclist.UnzipInt(); // hit count
        }
    } else
    {
        m_tDoc.m_iDocID = 0;
    }
    return m_tDoc;
}

对于从文件中读取hitslist和读docs文件差不多，这里不再给出分析过程。

分析了这么大半天终于接近尾声，从输入一个查询短语，一直到从index文件中读出相关的文档列表，走过了查询的主要过程。其间还忽略了一些细节之处，但它们对弄懂search流程并不那么重要，要具体弄懂每一个细节还得花大量的时间和精力去分析，这里就不再深入下去了。