搜索引擎之分词设计

来源:互联网 发布:乐视 大数据部高级总监 编辑:程序博客网 时间:2024/05/06 09:34

搜索引擎之分词设计
 
  上面我们已经可以扫描目录,建立反向索引结构了,那么我们现在进入最关键的一步,就是文件内容解析。使用过MS索引服务的人都知道,可以通过增加Filter来增加可索引的文件类型,比如增加PDF,HTML等,所以我们在设计文件解析的过程中也要留有类似的接口。我们先设计出一个基类,抽象出两个虚函数.
//All rights reserved by yoki2009
//mailto:imj040144@tom.com
//Welcome to my blog:    http://blog.csdn.net/yoki2009
class CFileParserBase
{
public:
 CFileParserBase(void);
 ~CFileParserBase(void);
 virtual int parseWord(IN char * read_buf, CString FileName) = 0;//对相关文件进行解析
 virtual char * getContent(CString FileName) = 0;//获得相关文件内容
};
 设计一个用来解析文本文件的类,继承该基类:
//All rights reserved by yoki2009
//mailto:imj040144@tom.com
//Welcome to my blog:    http://blog.csdn.net/yoki2009

class FILEOperator : public CFileParserBase
{
protected:
 FILEOperator(void);
public:
 ~FILEOperator(void);
 static FILEOperator * getInstance();
 int parseWord(IN char * read_buf, CString FileName);
 char * getContent(CString FileName);

private:
 static FILEOperator * _instance;
 Log    FILELog;
};

文件实现类,该类实现了解析文本文件,建立反向索引表,忽略a,an,the等没有意义的关键字,去除重复出现的关键字等基本功能,解析其它的文件只要参考该类就能很简单的实现。这里特别要提到的是,解析文件时最好先把文件内容解析出来读到内存中去,我这里是存在了vector里面,然后再对内存中的内容做操作,不要边解析边操作,或者读到文本里之类的,那样效率会大幅度降低.

#include "FILEOperator.h"
// Storage for the shared instance; starts out null and is filled in by
// getInstance() the first time it is called.
FILEOperator * FILEOperator::_instance = 0;

// Sets up the member log by passing the name "FILE" to CommonLogInit.
FILEOperator::FILEOperator(void)
{
 CString logName("FILE");
 FILELog.CommonLogInit(logName);
}

// Closes the member log when the parser is destroyed.
FILEOperator::~FILEOperator(void)
{
 this->FILELog.Close();
}

// Returns the shared FILEOperator, creating it on the first call.
// The object is allocated with new and is not released in this function.
FILEOperator * FILEOperator::getInstance()
{
 if (!_instance)
 {
  _instance = new FILEOperator;
 }
 return _instance;
}

int FILEOperator::parseWord(char * read_buf, CString FileName)
{
 vector<CString>  v_phrase;
 CString    strread;
 CStdioFile   m_fileread;

 int nCount;
 BOOL isEnd = FALSE;
 m_fileread.Open(FileName,CFile::shareDenyNone);

 DBOperator * dboperator = DBOperator::getInstance();
 if (!dboperator->InitConnection())
 { 
  return FALSE;
 }
 while(!isEnd)
 { 
  if (NULL == StrStrI(read_buf,SPACE))
  {
   nCount = (int)strlen(read_buf);
   isEnd = TRUE;
  } 
  else
   nCount = (int)(StrStrI(read_buf,SPACE) - read_buf);


  char * tmp = new char[nCount+1];
  strncpy_s(tmp,nCount+1,read_buf,nCount);

  if (!StrCmpI(tmp,"is") || !StrCmpI(tmp,"a") || !StrCmpI(tmp,"an")
   || !StrCmpI(tmp,"the") || !StrCmpI(tmp,"of") || !StrCmpI(tmp,"with")
   || !StrCmpI(tmp,"for") || !StrCmpI(tmp,"not")|| !StrCmpI(tmp,"only")
   || !StrCmpI(tmp,"are") || !StrCmpI(tmp,"by") || !StrCmpI(tmp,"to")
   || !StrCmpI(tmp,"as"))
  {
   read_buf+=strlen(tmp);
   while (*read_buf == ' '|| *read_buf =='.' || *read_buf == ','
    || *read_buf == '/n' || *read_buf == '/r')
    read_buf++;
   continue;
  }
  //check exist element
  if (!v_phrase.empty())
  {
   vector<CString>::iterator iter = find(v_phrase.begin(),v_phrase.end(),tmp); 
 
   if (iter != v_phrase.end())
   {  
    read_buf += strlen(tmp);//move to next word
    while (*read_buf == ' '|| *read_buf == '.' || *read_buf == ','
     || *read_buf == '/n' || *read_buf == '/r')
     read_buf++;
    continue;
   }
  }
  int nNum = 0;
  int nLineNum = 0;
  char * findstr ;
  vector<int> v_line;
  vector<int>::iterator pos;
  while (m_fileread.ReadString(strread))
  {
   nLineNum++;
   findstr = StrStrI(strread,tmp);
   while (findstr)
   {
    if (findstr)
    {
     findstr += strlen(tmp);
     nNum++;
     v_line.push_back(nLineNum);
    }
    findstr = StrStrI(findstr,tmp);
   }
  }

  v_line.resize(unique(v_line.begin() ,v_line.end()) - v_line.begin());
  CString mlines; 
  for (pos = v_line.begin(); pos != v_line.end(); ++pos)
  {
   CString _tmp;
   _tmp.Format("%d ",*pos);
   mlines.Append(_tmp);
  }
  //dboperator->InsertTable(FileName,nNum,mlines);
  if (dboperator->existKeyword(tmp))
  {
   if(!dboperator->OpenTable(tmp))
   {
    FILELog<<"OpenTable "<<tmp<<"Failed.";
   }else
   {
    dboperator->InsertTable(FileName,nNum,mlines);
   }
  }else
  {  
   if (!dboperator->CreateTable(tmp))
   {
    FILELog<<"CreateTable "<<tmp<<"Failed.";
   }else
   {
    dboperator->InsertKeyword(tmp);
    if(!dboperator->OpenTable(tmp))
    {
     FILELog<<"OpenTable "<<tmp<<"Failed.";
    }else
    {
     dboperator->InsertTable(FileName,nNum,mlines);
    }
   }
  }
  if(!isEnd)
  {
   read_buf += strlen(tmp);//move to next word
   while (*read_buf == ' '|| *read_buf == '.' || *read_buf == ','
    || *read_buf == '/n' || *read_buf == '/r')
    read_buf++;
   v_phrase.push_back(tmp);
   m_fileread.SeekToBegin();
  }
 }
 m_fileread.Close();
 dboperator->Close();

 return TRUE;
}

// Reads the file FileName into a newly allocated, NUL-terminated buffer,
// passes it through Util::TrimText and returns it.
//
// FileName  path of the file to read; it is also written to the log.
// Returns   the buffer, or NULL when the file cannot be opened or its size
//           cannot be determined. The buffer is allocated with new[] and is
//           not released here, so the caller is responsible for delete[].
char * FILEOperator::getContent(CString FileName)
{
 FILELog<<FileName;

 ifstream _stream(FileName);
 if (!(_stream.rdbuf())->is_open())
 {
  return NULL;
 }

 struct _stat forlen;
 if ( -1 == _stat(FileName,&forlen))
 {
  // _stream is closed by its destructor.
  return NULL;
 }

 unsigned int blen = forlen.st_size;
 // One extra byte for the terminator: the buffer is used as a C string.
 char * read_buf = new char[blen + 1];
 _stream.read(read_buf,blen);
 // Terminate at the number of characters actually read, which may be less
 // than the size reported by _stat.
 read_buf[_stream.gcount()] = '\0';
 Util::TrimText(read_buf);

 _stream.close();
 return read_buf;
}

原创粉丝点击