搜索引擎之分词设计
来源:互联网 发布:乐视 大数据部高级总监 编辑:程序博客网 时间:2024/05/06 09:34
搜索引擎之分词设计
上面我们已经可以扫描目录,建立反向索引结构了,那么我们现在进入最关键的一步就是文件内容解析,使用过MS索引服务的人都知道,可以通过增加Filter来增加可索引的文件,比如增加PDF,HTML等,所以我们在设计文件解析的过程中也要留有类似的接口,我们先设计出一个基类,抽象出两个虚函数.
//All rights reserved by yoki2009
//mailto:imj040144@tom.com
//Welcome to my blog: http://blog.csdn.net/yoki2009
// Abstract interface for file parsers. Each supported file format provides
// its own subclass, so new formats can be added without touching the
// indexing code that drives the parser.
class CFileParserBase
{
public:
    CFileParserBase(void);
    // Virtual so that deleting a concrete parser through a CFileParserBase
    // pointer runs the derived destructor (otherwise this is undefined behaviour).
    virtual ~CFileParserBase(void);

    // Parses the content in read_buf that belongs to the file FileName.
    // The meaning of the returned int is defined by the implementing class.
    virtual int parseWord(IN char * read_buf, CString FileName) = 0;

    // Returns the content of the file FileName as a character buffer.
    // Buffer ownership is defined by the implementing class.
    virtual char * getContent(CString FileName) = 0;
};
设计一个用来解析文本文件的类,继承该基类:
//All rights reserved by yoki2009
//mailto:imj040144@tom.com
//Welcome to my blog: http://blog.csdn.net/yoki2009
// Parser for plain-text files, implementing the CFileParserBase interface.
// The constructor is protected; callers obtain the object via getInstance().
class FILEOperator : public CFileParserBase
{
protected:
// Protected so that outside code cannot construct instances directly.
FILEOperator(void);
public:
~FILEOperator(void);
// Returns the shared instance, creating it on the first call.
static FILEOperator * getInstance();
// Splits read_buf into words and records, per word, its occurrences in FileName.
int parseWord(IN char * read_buf, CString FileName);
// Reads the content of FileName into a newly allocated buffer.
char * getContent(CString FileName);
private:
// Pointer to the shared instance; 0 until getInstance() has created it.
static FILEOperator * _instance;
// Log object used by this class (initialised in the constructor, closed in the destructor).
Log FILELog;
};
文件实现类,该类实现了解析文本文件,建立反向索引表,忽略a,an,the等没有意义的关键字,去除重复出现的关键字等基本功能,解析其它的文件只要参考该类就能很简单的实现,这里特别要提到的是解析文件时最好先把文件内容解析出来读到内存中去,我这里是存在了vector里面,然后再对内存中内容做操作,不要边解析边操作,或者读到文本里之类的,那样的效率会大幅度降低的.
#include "FILEOperator.h"
// Shared instance pointer; starts as 0 and is assigned by getInstance() on first use.
FILEOperator * FILEOperator::_instance = 0;
// Constructs the parser and initialises its log under the name "FILE".
FILEOperator::FILEOperator(void)
{
    CString logName("FILE");
    FILELog.CommonLogInit(logName);
}
// Closes the log that the constructor initialised.
FILEOperator::~FILEOperator(void)
{
    this->FILELog.Close();
}
// Returns the shared FILEOperator, allocating it the first time it is requested.
FILEOperator * FILEOperator::getInstance()
{
    if (_instance != 0)
    {
        return _instance;
    }
    _instance = new FILEOperator;
    return _instance;
}
int FILEOperator::parseWord(char * read_buf, CString FileName)
{
vector<CString> v_phrase;
CString strread;
CStdioFile m_fileread;
int nCount;
BOOL isEnd = FALSE;
m_fileread.Open(FileName,CFile::shareDenyNone);
DBOperator * dboperator = DBOperator::getInstance();
if (!dboperator->InitConnection())
{
return FALSE;
}
while(!isEnd)
{
if (NULL == StrStrI(read_buf,SPACE))
{
nCount = (int)strlen(read_buf);
isEnd = TRUE;
}
else
nCount = (int)(StrStrI(read_buf,SPACE) - read_buf);
char * tmp = new char[nCount+1];
strncpy_s(tmp,nCount+1,read_buf,nCount);
if (!StrCmpI(tmp,"is") || !StrCmpI(tmp,"a") || !StrCmpI(tmp,"an")
|| !StrCmpI(tmp,"the") || !StrCmpI(tmp,"of") || !StrCmpI(tmp,"with")
|| !StrCmpI(tmp,"for") || !StrCmpI(tmp,"not")|| !StrCmpI(tmp,"only")
|| !StrCmpI(tmp,"are") || !StrCmpI(tmp,"by") || !StrCmpI(tmp,"to")
|| !StrCmpI(tmp,"as"))
{
read_buf+=strlen(tmp);
while (*read_buf == ' '|| *read_buf =='.' || *read_buf == ','
|| *read_buf == '/n' || *read_buf == '/r')
read_buf++;
continue;
}
//check exist element
if (!v_phrase.empty())
{
vector<CString>::iterator iter = find(v_phrase.begin(),v_phrase.end(),tmp);
if (iter != v_phrase.end())
{
read_buf += strlen(tmp);//move to next word
while (*read_buf == ' '|| *read_buf == '.' || *read_buf == ','
|| *read_buf == '/n' || *read_buf == '/r')
read_buf++;
continue;
}
}
int nNum = 0;
int nLineNum = 0;
char * findstr ;
vector<int> v_line;
vector<int>::iterator pos;
while (m_fileread.ReadString(strread))
{
nLineNum++;
findstr = StrStrI(strread,tmp);
while (findstr)
{
if (findstr)
{
findstr += strlen(tmp);
nNum++;
v_line.push_back(nLineNum);
}
findstr = StrStrI(findstr,tmp);
}
}
v_line.resize(unique(v_line.begin() ,v_line.end()) - v_line.begin());
CString mlines;
for (pos = v_line.begin(); pos != v_line.end(); ++pos)
{
CString _tmp;
_tmp.Format("%d ",*pos);
mlines.Append(_tmp);
}
//dboperator->InsertTable(FileName,nNum,mlines);
if (dboperator->existKeyword(tmp))
{
if(!dboperator->OpenTable(tmp))
{
FILELog<<"OpenTable "<<tmp<<"Failed.";
}else
{
dboperator->InsertTable(FileName,nNum,mlines);
}
}else
{
if (!dboperator->CreateTable(tmp))
{
FILELog<<"CreateTable "<<tmp<<"Failed.";
}else
{
dboperator->InsertKeyword(tmp);
if(!dboperator->OpenTable(tmp))
{
FILELog<<"OpenTable "<<tmp<<"Failed.";
}else
{
dboperator->InsertTable(FileName,nNum,mlines);
}
}
}
if(!isEnd)
{
read_buf += strlen(tmp);//move to next word
while (*read_buf == ' '|| *read_buf == '.' || *read_buf == ','
|| *read_buf == '/n' || *read_buf == '/r')
read_buf++;
v_phrase.push_back(tmp);
m_fileread.SeekToBegin();
}
}
m_fileread.Close();
dboperator->Close();
return TRUE;
}
// Reads the file FileName into a newly allocated, NUL-terminated buffer and
// passes it through Util::TrimText().
//
// Parameters:
//   FileName - path of the file to read (also written to the log).
// Returns:
//   A buffer allocated with new[] (the caller must release it with delete[]),
//   or NULL when the file cannot be opened or its size cannot be determined.
char * FILEOperator::getContent(CString FileName)
{
    ifstream _stream(FileName);
    FILELog<<FileName;

    if (!_stream.is_open())
    {
        // Previously an uninitialised pointer was returned on this path.
        return NULL;
    }

    struct _stat forlen;
    if ( -1 == _stat(FileName,&forlen))
    {
        return NULL;
    }

    const size_t blen = static_cast<size_t>(forlen.st_size);
    // One extra byte for the terminating NUL that the string functions
    // applied to this buffer rely on.
    char * read_buf = new char[blen + 1];
    _stream.read(read_buf,static_cast<std::streamsize>(blen));
    // A text-mode read may deliver fewer characters than the size on disk,
    // so terminate at the number of characters actually read.
    read_buf[static_cast<size_t>(_stream.gcount())] = '\0';
    Util::TrimText(read_buf);

    _stream.close();
    return read_buf;
}
- 搜索引擎之分词设计
- 搜索引擎之中文分词简介
- 搜索引擎技术:中文分词之二
- 搜索引擎之猎兔分词实例
- 搜索引擎之中文分词实现(java版)
- 搜索引擎之中文分词实现(java版)
- 搜索引擎之中文分词(Chinese Word Segmentation)简介
- 搜索引擎之中文分词(Chinese Word Segmentation)简介
- 搜索引擎之中文分词(Chinese Word Segmentation)简介
- 搜索引擎之中文分词(Chinese Word Segmentation)简介
- (搜索引擎之solr) 给solr添加中文分词器
- 搜索引擎分词算法介绍
- 中文分词和搜索引擎
- 中文分词和搜索引擎
- 中文分词和搜索引擎
- 中文分词和搜索引擎
- 中文分词和搜索引擎
- 中文分词和搜索引擎
- 下拉框模拟只读
- php中iconv函数使用方法
- asp.net 内嵌代码
- 获取系统系统所有TCP以及UDP端口使用情况
- javascript操作xml 2
- 搜索引擎之分词设计
- 提升JSP应用程序的七大绝招
- 谷歌在中央电视台做软广告了
- latex CJK 中文字体的显示问题
- 谁知道这是什么原因
- 爸爸的病情严重
- pushmail的YY
- myeclipse相关
- 网上一些串口程序的不足