从文件中查找出现次数最多的10个单词

来源：互联网　发布：华为网络培训　编辑：程序博客网　时间：2024/05/22 06:11

本来想用字典树的，但字典树是以空间换时间的：如果只有26个字母还好，但文件中还有各种标点符号，空间耗费太大。所以这里改用了一种比较普通的方法（map 计数，再借助 vector 排序）。

十道海量数据处理：http://blog.csdn.net/v_july_v/article/details/6279498

字典树参考：http://www.cnblogs.com/DiaoCow/archive/2010/04/19/1715337.html && http://blog.csdn.net/bill_ming/article/details/7585009

附代码：见注释(VS2008)

// Finds the most frequent words in a text file.
// Assumes the whole data set fits in memory at once.
#include <iostream>
#include <algorithm>
#include <fstream>
#include <map>
#include <vector>
#include <iterator>
#include <functional>
#include <string>
#include <cstring>
#include <cstdio>
#include <cctype>
#include <cstddef>
#include <ctime>
#include <cstdlib>
#include <utility>
using namespace std;

#define SEARCH_COUNT 10  // How many of the most frequent words get printed.

// Counts word occurrences and reports the SEARCH_COUNT most frequent ones.
// Typical use: readFile() (or repeated insertWord()), then sortWord(),
// then outPut().
class WordTop10
{
private :
    map<string,int> mapWord;               // word -> number of occurrences
    vector<pair<string ,int > > pair_vec;  // a map cannot be ordered by value, so
                                           // sortWord() ranks a copy of it here
public :
    void insertWord(const string& word);      // count one occurrence of word
    void sortWord();                          // rebuild the ranking by count
    void readFile(const string& strFileName); // tokenize a file and count its words
    void outPut();                            // print the top SEARCH_COUNT entries
};

// Records one occurrence of `word`.
// operator[] value-initializes a missing entry to 0, so a single increment
// covers both the "new word" and the "seen before" case.
void WordTop10::insertWord(const string& word)
{
    ++mapWord[word];
}

// Ordering used for the ranking: higher count first; words with the same
// count are ordered alphabetically so the output is deterministic
// (std::sort is not stable, so ties would otherwise come out in an
// unspecified order).
// Returns non-zero when `a` must be placed before `b`.
int cmp(const pair<string ,int >& a,const pair<string ,int >& b)
{
    if (a.second != b.second)
    {
        return a.second > b.second;
    }
    return a.first < b.first;
}

// Rebuilds pair_vec from the current map contents and sorts it with cmp.
// assign() replaces any previous ranking, so calling this more than once
// does not duplicate entries.
void WordTop10::sortWord()
{
    pair_vec.assign(mapWord.begin(), mapWord.end());
    sort(pair_vec.begin(), pair_vec.end(), cmp);
}

// Prints "word<TAB>count" for the first SEARCH_COUNT entries of the ranking
// (fewer if the ranking is shorter). Call sortWord() first.
void WordTop10::outPut()
{
    typedef vector<pair<string ,int > >::size_type index_type;
    const index_type limit = min(pair_vec.size(), static_cast<index_type>(SEARCH_COUNT));
    for (index_type i = 0; i < limit; ++i)
    {
        cout << pair_vec[i].first << "\t" << pair_vec[i].second << endl;
    }
}

// True when `c` is a punctuation character.
// The cast is required: passing a negative char value to ispunct() is
// undefined behaviour, which is what bytes >= 0x80 produce where plain char
// is signed.
static bool isPunctChar(char c)
{
    return ispunct(static_cast<unsigned char>(c)) != 0;
}

// Returns `token` without its leading and trailing punctuation characters.
// Punctuation inside the token (e.g. "don't", "well-known") is kept.
static string trimPunct(const string& token)
{
    string::size_type first = 0;
    string::size_type last = token.size();
    while (first < last && isPunctChar(token[first]))
    {
        ++first;
    }
    while (last > first && isPunctChar(token[last - 1]))
    {
        --last;
    }
    return token.substr(first, last - first);
}

// Reads whitespace-separated tokens from `strFileName` and counts the words
// they contain. Each token is split at every "--" (many texts use it as a
// dash between two words), each piece has its surrounding punctuation
// removed, and pieces that end up empty are ignored.
// If the file cannot be opened a message is printed and nothing is counted.
void WordTop10::readFile(const string& strFileName)
{
    // c_str(): the pre-C++11 ifstream constructor only accepts const char*.
    ifstream in(strFileName.c_str());
    if (!in)
    {
        cout << "~文件打开失败~" << endl;
        return;
    }

    string text;
    while (in >> text)
    {
        string::size_type start = 0;
        while (true)
        {
            const string::size_type dash = text.find("--", start);
            const string::size_type length =
                (dash == string::npos) ? string::npos : dash - start;
            const string word = trimPunct(text.substr(start, length));
            if (!word.empty())
            {
                insertWord(word);
            }
            if (dash == string::npos)
            {
                break;
            }
            start = dash + 2;  // continue after the "--" just handled
        }
    }
    // The stream is closed by its destructor.
}

// Counts the words of "haha.txt", times the ranking step and prints the
// most frequent words.
// NOTE(review): the original author required an ANSI-encoded input file
// without Chinese characters and suggested building with /J (unsigned
// default char) to avoid a failure in ispunct(); the unsigned char cast in
// isPunctChar() should make that switch unnecessary - confirm on the
// target toolchain. Multi-byte characters are still handled byte by byte.
// Reference: http://blog.csdn.net/bill_ming/article/details/8191551
int main()
{
    string fileName = "haha.txt";
    WordTop10 wordTop;
    wordTop.readFile(fileName);

    clock_t start = clock();
    wordTop.sortWord();
    clock_t finish = clock();

    double duration = (double)(finish - start) / CLOCKS_PER_SEC;
    cout << "排序时间" << duration << endl;

    wordTop.outPut();
    return 0;
}