归并法外排序—海量数据排序

来源：互联网发布：python 能做界面吗编辑：程序博客网时间：2024/06/06 04:30

1.外归并排序

讲完了内排序，我们来了解一下外归并排序。外归并排序一般用于处理数据量非常大的数据：这些数据存放在硬盘上，无法一次性全部装入内存。因此，通常的思路是先对数据进行切分，再对切分出来的文件分别排序。排序时，小文件可以直接读入内存用快排来排序；对于大文件，则从两个文件中逐个读取数据进行归并，并写入合并后的新文件。最后得到的那个最大的文件就是排序后的结果。

外排序是指在排序期间全部对象个数太多，不能同时存放在内存，必须根据排序过程的要求，不断在内、外存之间移动的排序。比如常见的有外归并排序。

我在此采用的方式是：首先创建一个文件，在其中写入随机数，作为待排序的大文件。生成随机数文件以后，把它拆分成若干小文件，并且在拆分时先对每个小文件的数据进行排序，再写入该小文件。然后对这些文件进行归并：每次取出两个文件，将它们的内容归并排序后写入一个新文件。如此反复，最终只剩下一个文件，它就是我们想要的排好序的大文件。

这里写图片描述

2.个人收获

注：我在这里采取的拆分方案是把大文件的每一行拆成一个小文件。实际使用时，完全可以让每个小文件存储更多的内容，这样效率更高；否则会产生过多的文件读取和删除操作。

另外就是要熟悉使用各种关于io的函数，最开始我采用拆分的时候就是不熟悉这些函数，需要不断的数字转换字符，字符转换数字，最后，我使用了fscanf和fprintf这两个函数，更加高效的解决问题。

在最后排序的时候注意最后是一个归并排序的思想，如果不熟悉，可以去上面看我以前关于归并排序的博客。

另外就是需要对库非常熟悉，尤其是比如像一些算法的使用还有容器的一些函数。例如我在这里对小文件进行拆分排序的时候使用了sort，要对这些的底层有深刻的了解，理解程度深些才能写出更加高效漂亮的代码。

3.示例代码

#define _CRT_SECURE_NO_WARNINGS 1
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

using namespace std;

// Opens `path` with the given fopen mode. On failure, reports the reason and
// aborts. Unlike assert(), this check stays active when NDEBUG is defined.
static FILE* OpenOrAbort(const string& path, const char* mode)
{
    FILE* fp = fopen(path.c_str(), mode);
    if (fp == NULL)
    {
        perror(path.c_str());
        abort();
    }
    return fp;
}

// Builds "<prefix><index>.txt". Using a stream avoids the fixed-size
// character buffers that overflowed once the index needed more digits.
static string NumberedName(const string& prefix, size_t index)
{
    std::ostringstream name;
    name << prefix << index << ".txt";
    return name.str();
}

// External merge sort.
// The big input file is cut into small files, each small file is sorted in
// memory, and the sorted files are then merged pairwise until a single file
// remains. That file is the sorted result.
class ExternalMergeSort
{
public:
    // `s` is the name of the file to sort; it is only used to derive the
    // name of the final result file.
    ExternalMergeSort(const string& s)
        : _filename(s)
    {}

    // Splits file `a` into one chunk file per input line.
    //
    // Chunk i (counting from 0) is named "<a without extension><i>.txt" and
    // holds the integers of line i in ascending order, each followed by a
    // single space. Every chunk name is appended to the list of files that
    // Mergefile() later merges.
    //
    // `line` is not used; it is kept so existing callers still compile.
    // Aborts if `a` or a chunk file cannot be opened.
    void SplitFile(const string& a, size_t line)
    {
        (void)line;

        FILE* fin = OpenOrAbort(a, "r");
        // rfind returns npos when there is no '.', and substr(0, npos)
        // yields the whole name, so extension-less names work as well.
        const string stem = a.substr(0, a.rfind('.'));

        string text;
        size_t chunk = 0;
        while (ReadLine(fin, text))
        {
            std::vector<int> values = ParseInts(text);
            text.clear(); // ReadLine appends, so reset before the next line
            sort(values.begin(), values.end());

            const string chunkName = NumberedName(stem, chunk++);
            FILE* fout = OpenOrAbort(chunkName, "w");
            for (std::vector<int>::const_iterator it = values.begin();
                 it != values.end(); ++it)
            {
                fprintf(fout, "%d ", *it);
            }
            fclose(fout);
            _file.push_back(chunkName);
        }
        fclose(fin);
    }

    // Merges the chunk files pairwise until one file is left, then renames
    // that file to "<input name without extension>external_merge_sort .txt".
    //
    // Intermediate outputs are named "merge<k>.txt" with k counting up from
    // 1 across all passes. The two inputs of every merge are deleted once
    // merged. Does nothing when there are no chunk files.
    void Mergefile()
    {
        size_t mergeId = 1;
        while (_file.size() > 1)
        {
            std::vector<string> merged;
            size_t index = 0;
            for (; index + 1 < _file.size(); index += 2)
            {
                const string& left = _file[index];
                const string& right = _file[index + 1];
                const string output = NumberedName("merge", mergeId++);

                MergeSortedFiles(left, right, output);
                remove(left.c_str());
                remove(right.c_str());
                merged.push_back(output);
            }
            // With an odd number of files the last one has no partner in
            // this pass; carry it over unchanged to the next pass.
            if (index < _file.size())
            {
                merged.push_back(_file[index]);
            }
            _file.swap(merged);
        }

        if (_file.empty())
        {
            return; // nothing was split, so there is no result to publish
        }

        string SortFileName = _filename.substr(0, _filename.rfind('.'));
        SortFileName += "external_merge_sort .txt";

        // rename() fails on Windows when the target already exists, so drop
        // any result left behind by an earlier run first.
        remove(SortFileName.c_str());
        if (rename(_file[0].c_str(), SortFileName.c_str()) != 0)
        {
            perror(SortFileName.c_str());
            return;
        }
        // The chunk list no longer names an existing file. Clearing it makes
        // a repeated call a no-op instead of deleting the result above.
        _file.clear();
    }

protected:
    // Appends the next line of `fconfigout` to `str`, without the '\n'.
    // `str` is not cleared first; the caller resets it between lines.
    // Returns false only when the stream is already at end of file, so an
    // empty line still returns true.
    bool ReadLine(FILE*& fconfigout, string& str)
    {
        int ch = fgetc(fconfigout);
        if (ch == EOF)
        {
            return false;
        }
        for (; ch != EOF && ch != '\n'; ch = fgetc(fconfigout))
        {
            str += static_cast<char>(ch);
        }
        return true;
    }

private:
    // Extracts the whitespace-separated decimal integers of `text`, stopping
    // at the first token that is not a number. An empty line gives an empty
    // vector.
    static std::vector<int> ParseInts(const string& text)
    {
        std::vector<int> values;
        const char* cursor = text.c_str();
        for (;;)
        {
            char* next = NULL;
            const long value = strtol(cursor, &next, 10);
            if (next == cursor)
            {
                break; // strtol consumed nothing: no further number
            }
            values.push_back(static_cast<int>(value));
            cursor = next;
        }
        return values;
    }

    // Merges two ascending integer files into `output`, writing each value
    // followed by a space. Progress is driven by fscanf's return value, not
    // feof(): that keeps the last value of a file that lacks trailing
    // whitespace and cannot loop forever on a malformed token.
    static void MergeSortedFiles(const string& first, const string& second,
                                 const string& output)
    {
        FILE* fout = OpenOrAbort(output, "w");
        FILE* fin1 = OpenOrAbort(first, "r");
        FILE* fin2 = OpenOrAbort(second, "r");

        int value1 = 0;
        int value2 = 0;
        bool has1 = fscanf(fin1, "%d", &value1) == 1;
        bool has2 = fscanf(fin2, "%d", &value2) == 1;

        while (has1 && has2)
        {
            if (value1 <= value2)
            {
                fprintf(fout, "%d ", value1);
                has1 = fscanf(fin1, "%d", &value1) == 1;
            }
            else
            {
                fprintf(fout, "%d ", value2);
                has2 = fscanf(fin2, "%d", &value2) == 1;
            }
        }
        // At most one of the inputs still has values; copy them through.
        while (has1)
        {
            fprintf(fout, "%d ", value1);
            has1 = fscanf(fin1, "%d", &value1) == 1;
        }
        while (has2)
        {
            fprintf(fout, "%d ", value2);
            has2 = fscanf(fin2, "%d", &value2) == 1;
        }

        fclose(fout);
        fclose(fin1);
        fclose(fin2);
    }

protected:
    std::vector<string> _file; // files still waiting to be merged
    string _filename;          // name of the original input file
};

// Writes `line` lines of random test data to `fin`, which must be open for
// writing. Each line holds 99 values from rand(), each followed by a space.
// Reseeds the generator from the current time on every call.
void init_data(FILE* fin, size_t line)
{
    assert(fin);
    const size_t numbersPerLine = 99;

    srand(static_cast<unsigned int>(time(NULL)));
    for (size_t row = 0; row < line; row++)
    {
        for (size_t col = 0; col < numbersPerLine; col++)
        {
            fprintf(fin, "%d ", rand());
        }
        fputc('\n', fin);
    }
}

// Creates (or truncates) file `a` and fills it with `lines` lines of random
// numbers. Aborts if the file cannot be opened.
void CreateRandamNum(const string& a, size_t lines)
{
    FILE* fin = OpenOrAbort(a, "w");
    init_data(fin, lines);
    fclose(fin);
}

// Demo: generate a 4-line random file, split it, and merge it back sorted.
void test1()
{
    size_t lines = 4;
    string a = "sort.txt";
    CreateRandamNum(a, lines);
    ExternalMergeSort b(a);
    b.SplitFile(a, lines);
    b.Mergefile();
}

int main()
{
    test1();
    system("pause"); // Windows-only: keeps the console window open
    return 0;
}

1 1