Kmeans聚类之建立词袋子模型

来源：互联网发布：python buffer类型编辑：程序博客网时间：2024/05/17 07:14

作者：finallyliuyu（转载请注明出处）

最近打算将自己的工作平台由C#，python等迁移到C++。这是我的第一个C++工作程序吧。

IDE:VS2008

language: C++

library:boost（安装boost库之前需要先安装python，安装方法见《boost库安装方法》）

tools:weka

C++程序完成的功能：从数据库中读出文章-》分词（调用ICTCLAS）-》特征词选择（DF法）->VSM模型建立->把文章写成weka数据格式arff文件（此处写成的是稀疏数据的储存格式。weka教程见《教程》）

首先给出构造停用词集合的代码：

按 Ctrl+C 复制代码

然后我们给出调用ICTclas进行分词的代码，注意：工程中调用ICTCLAS时要把data 文件夹，config文件，ictclas30.h ICTCLAS30.dll,ICTCLAS30.LIB放在工程所在的文件夹。将ictclas30.h加入工程，在调用ICTCLAS30.DLL的cpp文件的头部加上#pragma comment(lib, "ICTCLAS30.lib")

按 Ctrl+C 复制代码

调用ICTCLAS分词 Code highlighting produced by Actipro CodeHighlighter (freeware)http://www.CodeHighlighter.com/-->************************************************************************//* c字符创形式的输入，string格式的输出，此函数用于调用ICTCLAS完成分词功能/*/************************************************************************/string ICTsplit(const char *sInput){    if(!ICTCLAS_Init())    {        printf("ICTCLAS INIT FAILED!\n");        string strerr(sInput);        return strerr;    }    ICTCLAS_SetPOSmap(ICT_POS_MAP_SECOND);    //导入用户词典后    /*printf("\n导入用户词典后：\n");    int nCount = ICTCLAS_ImportUserDict("userdic.txt");//覆盖以前的用户词典    //保存用户词典    ICTCLAS_SaveTheUsrDic();    printf("导入%d个用户词。\n", nCount);*/    const char* sResult = ICTCLAS_ParagraphProcess(sInput, 0);    string strresult(sResult);    //printf("%s\n", sResult);    //把字符串转化成宽字符串    wstring wsResult=myMultibyteToWideChar(strresult);    boost::wregex wreg(L"\\s+");    wsResult=boost::regex_replace(wsResult,wreg,wstring(L"|"));    strresult=myWideCharToMultibyte(wsResult);                //ofile<<str1;    //ofile.close();    //cout<<str1<<endl;    //ICTCLAS_FileProcess("text.txt","test_result.txt",1);    ICTCLAS_Exit();    return strresult;}

按 Ctrl+C 复制代码

ICTCLAS分词结果默认的分隔符是空格，在以上函数中，我们改成了“|”作为分隔符，字符串替换考虑用boost的正则表达式库。因为我们要处理的是汉字字符串，所以要进行宽字符串与窄字符串之间的转化，我采用的是利用win32函数的方法，更多方法请见《boost正则表达式处理汉字字符串》。

按 Ctrl+C 复制代码

宽窄字符串互转函数 Code highlighting produced by Actipro CodeHighlighter (freeware)http://www.CodeHighlighter.com/-->/************************************************************************//*  功能：将窄字符转化成宽字符，string->wstring                         *//************************************************************************/wstring myMultibyteToWideChar(string sResult){    int iWLen=MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), 0, 0 );// 计算转换后宽字符串的长度。（不包含字符串结束符）    wchar_t *lpwsz= new wchar_t [iWLen+1];    MultiByteToWideChar( CP_ACP, 0, sResult.c_str(), sResult.size(), lpwsz, iWLen ); // 正式转换。    lpwsz[iWLen] = L'\0';     wstring wsResult(lpwsz);    delete []lpwsz;    return wsResult;}/************************************************************************//* 将宽字符串转化成窄字符串用于输出                                     *//************************************************************************/string myWideCharToMultibyte(wstring wsResult){    string sResult;    int iLen= WideCharToMultiByte( CP_ACP, NULL, wsResult.c_str(), -1, NULL, 0, NULL, FALSE ); // 计算转换后字符串的长度。（包含字符串结束符）    char *lpsz= new char[iLen];    WideCharToMultiByte( CP_OEMCP, NULL, wsResult.c_str(), -1, lpsz, iLen, NULL, FALSE); // 正式转换。    sResult.assign( lpsz, iLen-1 ); // 对string对象进行赋值。    delete []lpsz;    return sResult;}

按 Ctrl+C 复制代码

有了以上的功能，我们现在编写一个函数，函数的输入是一篇文章，输出是一个词的集合。该词集合保存的是初步去掉噪声词后的“好词”

代码如下

按 Ctrl+C 复制代码

对每篇文章初步过滤形成词集合 Code highlighting produced by Actipro CodeHighlighter (freeware)http://www.CodeHighlighter.com/-->/************************************************************************//* 返回一篇文章中的好词                                                 *//************************************************************************/vector<string>goodWordsinPieceArticle(string rawtext,set<string> stopwords){      vector<wstring> goodWordstemp;    vector<string> goodWords;    const char* sInput=rawtext.c_str();    string sResult=ICTsplit(sInput);    wstring wsResult=myMultibyteToWideChar(sResult);    boost::wregex wreg(L"\\d+");//去掉中文空格    wsResult=boost::regex_replace(wsResult,wreg,wstring(L""));    //boost::regex_split(back_inserter(goodWordstemp),wsResult,wreg);    boost::split(goodWordstemp,wsResult,boost::is_any_of("|"));    for(vector<wstring>::iterator it=goodWordstemp.begin();it!=goodWordstemp.end();it++)    {        string temp=myWideCharToMultibyte(*it);        trim(temp," ");        if(!stopwords.count(temp)&&!temp.empty())        {            goodWords.push_back(temp);        }                }    return goodWords;    }

按 Ctrl+C 复制代码

上面的这个函数可以说是我们建立词袋子模型的基本单元，给上面的函数输入文章内容（rawtext）以及停用词表，那么它将返回一个词集合。下面我们开始构造词袋子模型。在构造词袋子模型之前，我们要说一下我们词袋子模型的格式map<string,vector<pair<int,int>>>：主键为该词，pair中的第一个int为文章标号，第二个int为该词在该文中出现的次数，vector<pair<int,int>>统计的是这个词在哪些文章中出现、出现过几次。因为数据量比较大，所以词袋子模型map采用引用传参；如果是值传参的话，会在内存中产生拷贝，浪费内存。

下面是从数据库中读文章建立词袋子模型的代码

按 Ctrl+C 复制代码

建立词袋子模型 Code highlighting produced by Actipro CodeHighlighter (freeware)http://www.CodeHighlighter.com/-->/************************************构建倒排表： key=word,val= a list of pairs which consists of articleid ,and count, count=tf*************************************************************/int ConstructMap(map<string,vector<pair<int,int>>>&mymap,int beginindex,int endindex){    //    vector<string> mySplit(string s);     set<string>MakeStopSet();    vector<string>goodWordsinPieceArticle(string rawtext,set<string>stopwords);    CoInitialize(NULL);    _ConnectionPtr pConn(__uuidof(Connection));    _RecordsetPtr pRst(__uuidof(Recordset));    char * select =new char[5000];    memset(select,0,5000);    char *firstpart="select CKeyWord,ArticleId,CAbstract from Article where ArticleId between ";    char *lastpart=" order by ArticleId";    char middlepart1[100];    char middlepart2[100];    sprintf_s(middlepart1,sizeof(middlepart1),"%d",beginindex);    sprintf_s(middlepart2,sizeof(middlepart2),"%d",endindex);    strcat(select,firstpart);    strcat(select,middlepart1);    strcat(select," and ");    strcat(select,middlepart2);    strcat(select,lastpart);    pConn->ConnectionString="Provider=SQLOLEDB.1;Password=xxxxxx;Persist Security Info=True; User ID=sa;Initial Catalog=ArticleCollection";    pConn->Open("","","",adConnectUnspecified);    pRst=pConn->Execute(select,NULL,adCmdText);    set<string>stopwords=MakeStopSet();    while(!pRst->rsEOF)    {    vector<string>wordcollection;        //string keywordstr=(_bstr_t)pRst->GetCollect("CKeyWord");        string rawtext=(_bstr_t)pRst->GetCollect("CAbstract");        if(rawtext!="")        {                wordcollection=goodWordsinPieceArticle(rawtext,stopwords);                string tempid=(_bstr_t)pRst->GetCollect("ArticleId");                int articleid=atoi(tempid.c_str());                for(vector<string>::iterator strit=wordcollection.begin();strit!=wordcollection.end();strit++)                {                    
vector<pair<int,int>>::iterator it;                    if(mymap[*strit].empty())                    {                        pair<int,int>mytemppair=make_pair(articleid,1);                        mymap[*strit].push_back(mytemppair);                    }                    else                    {                        for(it=mymap[*strit].begin();it!=mymap[*strit].end();it++)                        {                              if(it->first==articleid)                            {                                it->second=++(it->second);                                break;                            }                                            }                        if(it==mymap[*strit].end())                        {                            pair<int,int>mytemppair=make_pair(articleid,1);                            mymap[*strit].push_back(mytemppair);                        }                    }            }                    }                        pRst->MoveNext();        wordcollection.clear();    }    pRst->Close();    pConn->Close();    pRst.Release();    pConn.Release();    CoUninitialize();    delete[] select;    return 0;}

按 Ctrl+C 复制代码

未完，待续。。。。。

0 0