1亿个整数求top 10000

来源：互联网发布：座机电话录音软件编辑：程序博客网时间：2024/06/06 08:43

参考资料：

http://bbs.csdn.net/topics/250038051

【1】遍历数组，求出最大值，与arr[0]元素交换，再在arr[1]到arr[100000000-1]之间求最大值，与arr[1]交换。。类似选择排序。

#include <iostream>
#include <fstream>
#include <iterator>
#include <algorithm>
#include <sys/time.h>
#include <stdlib.h>
using namespace std;

const int N = 100000000;      // number of input values (100 million)
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Method 1: partial selection sort.
// For each of the first M slots, scan the rest of the array for the largest
// remaining value and swap it into place. Afterwards arr[0..M) holds the M
// largest values in descending order. Cost is O(N*M) comparisons.
int main()
{
    // Fill the array with pseudo-random non-negative values.
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    for (int i = 0; i < M; i++) {
        int m = i; // index of the largest value seen in arr[i..N)
        for (int j = i + 1; j < N; j++) {
            if (arr[j] > arr[m])
                m = j;
        }
        if (i != m)
            swap(arr[i], arr[m]);
    }

    gettimeofday(&endtime, 0);

    // Do the arithmetic in double: multiplying the seconds difference by
    // 1000000 in integer arithmetic overflows a 32-bit long once the run
    // takes more than ~2147 s, which this method does.
    const double timeuse = 1000.0 * (endtime.tv_sec - starttime.tv_sec)
                         + (endtime.tv_usec - starttime.tv_usec) / 1000.0;
    cout << timeuse << " ms" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "9990-10000" << endl;
    copy(arr + 9990, arr + 10000, ostream_iterator<int>(cout, " "));

    return 0;
}

运行结果：

chen@chen-book1:~$ time ./count_0
-634229 ms
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 9990-10000
1073634978 1073634973 1073634956 1073634953 1073634935 1073634926 1073634918 1073634907 1073634906 1073634904
real    61m3.238s
user    60m57.797s
sys     0m0.580s
chen@chen-book1:~$

一个小时。。等不住了，吃完饭回来才跑完。（程序打印的耗时是负数，应该是微秒数在32位整数下溢出所致，实际耗时以 real 一行的约61分钟为准。）

【2】对这1万个数排序，遍历剩下的数字，如果比一万个数字中的最小值大，那么将其插入到合适位置。

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
using namespace std;

const int N = 100000000;      // number of input values
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Selects how the insertion position is located:
//   1 - linear scan, with a fast path when the new value exceeds every kept value
//   2 - plain linear scan
//   3 - binary search (std::lower_bound)
const int VARIANT = 1;

// Returns the index of the first element of the ascending run arr[1..M-1]
// that is not smaller than arr[0], or M when every element is smaller.
static inline int find_slot()
{
    const int v = arr[0];
    if (VARIANT == 3)
        return static_cast<int>(lower_bound(arr + 1, arr + M, v) - arr);
    if (VARIANT == 1 && v > arr[M - 1])
        return M; // larger than everything kept so far: goes to the very end
    int j = 1;
    while (j < M && v > arr[j])
        ++j;
    return j;
}

// Moves the value in arr[0] to index pos-1 by sliding arr[1..pos-1] one slot
// to the left. The source and destination ranges overlap, so memmove (not
// memcpy) is required. pos <= 1 means arr[0] is already in place.
static inline void shift_front_to(int pos)
{
    if (pos <= 1)
        return;
    const int t = arr[0];
    memmove(arr, arr + 1, sizeof(int) * (pos - 1));
    arr[pos - 1] = t;
}

// Method 2: keep arr[0..M) sorted ascending as a window of the M largest
// values seen so far. arr[0] is the smallest kept value; any later element
// that beats it replaces it and is then moved to its sorted position.
int main()
{
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    sort(arr, arr + M);
    for (int i = M; i < N; i++) {
        if (!(arr[0] < arr[i]))
            continue; // not larger than the smallest kept value
        swap(arr[0], arr[i]);
        // arr[0] is now the only element that may be out of order.
        shift_front_to(find_slot());
    }

    gettimeofday(&endtime, 0);
    // Borrow one second when the microsecond field would go negative.
    if (endtime.tv_usec < starttime.tv_usec) {
        endtime.tv_sec--;
        endtime.tv_usec += 1000000;
    }
    cout << (endtime.tv_sec - starttime.tv_sec) << " s,  "
         << (endtime.tv_usec - starttime.tv_usec) << " us" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "90-100" << endl;
    copy(arr + 90, arr + 100, ostream_iterator<int>(cout, " "));

    return 0;
}

运行

chen@chen-book1:~$ time ./count_2
2 s,  106089 us
0-10
1073634904 1073634906 1073634907 1073634918 1073634926 1073634935 1073634953 1073634956 1073634973 1073634978
90-100
1073635879 1073635888 1073635895 1073635899 1073635902 1073635906 1073635909 1073635918 1073635937 1073635945
real    0m4.605s
user    0m4.212s
sys    0m0.352s

运算时间居然已经降到了2.1s！总时间4.6s，主要花在了生成随机数上。这里有个优化，但是没有发挥作用，去掉优化：

chen@chen-book1:~$ time ./count_2
2 s,  40210 us

二分：

chen@chen-book1:~$ time ./count_2
0 s,  578971 us
0-10
1073634904 1073634906 1073634907 1073634918 1073634926 1073634935 1073634953 1073634956 1073634973 1073634978
90-100
1073635879 1073635888 1073635895 1073635899 1073635902 1073635906 1073635909 1073635918 1073635937 1073635945
real    0m3.285s
user    0m2.864s
sys    0m0.388s

优化：2.4s

去掉优化：2.1s

二分查找：579ms

记得当M=100时，二分的效果还比前两者要差，M=10000的时候耗时就已经只有前两者的约1/4了！只要半秒钟有木有！！

PS：N=4亿时：

优化: 3.5s

去掉优化：3.3s

二分查找：1.85s

【3】堆排序。

建立一个小根堆，然后后面的依次跟堆顶比，如果比堆顶大，那么就跟堆顶换，然后调整堆。

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <functional>
using namespace std;

const int N = 100000000;      // number of input values
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Method 3: min-heap of the M largest values seen so far.
// arr[0..M) is kept as a heap ordered by greater<int>, so arr[0] is the
// smallest kept value. Each later element that beats arr[0] replaces it.
// On exit arr[0..M) contains the top M values in heap order (not sorted).
int main()
{
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    // Strict comparison: the heap algorithms require a strict weak ordering,
    // which ">=" is not.
    const greater<int> smallest_on_top = greater<int>();
    make_heap(arr, arr + M, smallest_on_top);
    for (int i = M; i < N; i++) {
        // Only touch the heap when the root is actually replaced; the vast
        // majority of elements fail this test and cost one comparison.
        if (arr[i] > arr[0]) {
            pop_heap(arr, arr + M, smallest_on_top);  // smallest moves to arr[M-1]
            swap(arr[M - 1], arr[i]);                 // replace it with the new value
            push_heap(arr, arr + M, smallest_on_top); // restore the heap property
        }
    }

    gettimeofday(&endtime, 0);
    // Borrow one second when the microsecond field would go negative.
    if (endtime.tv_usec < starttime.tv_usec) {
        endtime.tv_sec--;
        endtime.tv_usec += 1000000;
    }
    cout << (endtime.tv_sec - starttime.tv_sec) << " s,  "
         << (endtime.tv_usec - starttime.tv_usec) << " us" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "90-100" << endl;
    copy(arr + 90, arr + 100, ostream_iterator<int>(cout, " "));

    return 0;
}

运行：

chen@chen-book1:~$ time ./count_2
34 s,  -60310 us

34s！我最爱的堆排序，怎么会这么慢。。。

PS：程序貌似有问题

【4】当然，还有直接对整个数组排序，然后取前1万个。。。排序用快排

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <functional>
using namespace std;

const int N = 100000000;      // number of input values
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Method 4: sort the whole array in descending order; the top M values are
// then simply arr[0..M).
int main()
{
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    // greater<int> is a strict ordering. Passing a ">=" comparator to
    // std::sort is undefined behaviour when equal elements are present.
    sort(arr, arr + N, greater<int>());

    gettimeofday(&endtime, 0);
    // Borrow one second when the microsecond field would go negative.
    if (endtime.tv_usec < starttime.tv_usec) {
        endtime.tv_sec--;
        endtime.tv_usec += 1000000;
    }
    cout << (endtime.tv_sec - starttime.tv_sec) << " s,  "
         << (endtime.tv_usec - starttime.tv_usec) << " us" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "90-100" << endl;
    copy(arr + 90, arr + 100, ostream_iterator<int>(cout, " "));

    return 0;
}

运行：

chen@chen-book1:~$ time ./count_2
44 s,  747807 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real    0m47.247s
user    0m46.807s
sys    0m0.340s

先快排再取前10000，用时44s。要是换成堆排序呢？把sort改成sort_heap，运行：

chen@chen-book1:~$ time ./count_2
67 s, -486231 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real   1m9.013s
user   1m8.540s
sys   0m0.340s

用了67s。快排的确是N logN排序算法里最快的。

【5】部分排序。STL里有，基于堆排序的，看看效果如何！

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <functional>
using namespace std;

const int N = 100000000;      // number of input values
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Method 5: std::partial_sort. After the call arr[0..M) holds the M largest
// values in descending order; the order of the remaining elements is
// unspecified.
int main()
{
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    // greater<int> is a strict ordering; a ">=" comparator does not satisfy
    // the requirements partial_sort places on its comparison object.
    partial_sort(arr, arr + M, arr + N, greater<int>());

    gettimeofday(&endtime, 0);
    // Borrow one second when the microsecond field would go negative.
    if (endtime.tv_usec < starttime.tv_usec) {
        endtime.tv_sec--;
        endtime.tv_usec += 1000000;
    }
    cout << (endtime.tv_sec - starttime.tv_sec) << " s,  "
         << (endtime.tv_usec - starttime.tv_usec) << " us" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "90-100" << endl;
    copy(arr + 90, arr + 100, ostream_iterator<int>(cout, " "));

    return 0;
}

运行：

chen@chen-book1:~$ time ./count_2
0 s, 729513 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real   0m3.282s
user   0m2.896s
sys   0m0.312s

用了730ms，很不错了！可是为什么我自己用堆排序来模拟，就要那么久呢！！

PS：当N为4亿时，时间为2.65s。

【6】类似计数排序。按最高比特位统计：先申请一个长度为32的数组，下标为0的元素表示最高比特位为第0比特的整数个数，……，下标为31的元素表示最高比特位为第31比特的整数个数。

遍历数组，得到了直方图。然后从后向前算，就可以知道，最高比特位为多少的整数，是肯定在top10000里的，例如，算得最高比特位为20的整数，肯定在top 10000里，

而最高比特位为19的整数，则有一部分是，有一部分不是。于是，再遍历一遍，将最高比特位为20的放在整个数组开头，再将最高比特位为19的紧随其后，再对最高比特位为19的这部分做部分排序即可！

#include <iostream>
#include <fstream>
#include <iterator>
#include <sys/time.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <functional>
using namespace std;

const int N = 100000000;      // number of input values
const int M = 10000;          // how many of the largest values are wanted
const int x = 0x40000000 - 1; // mask applied to rand(): keeps the low 30 bits
int arr[N];

// Predicate: true for values that are >= bound. The bound is held as
// long long so that 1 << 31 and 1 << 32 can be represented.
struct at_least
{
    long long bound;
    explicit at_least(long long b) : bound(b) {}
    bool operator()(int v) const { return v >= bound; }
};

// Returns the index of the highest set bit of x (0 for both x == 0 and
// x == 1). Intended for non-negative x.
int hbit(int x)
{
    int i = 0;
    while (x >>= 1)
        i++;
    return i;
}

// Method 6: histogram on the highest set bit.
// Count how many values fall into each highest-bit bucket, walk the buckets
// from the top until more than M values are covered, move the covered
// values to the front of the array, and sort only those.
// On exit arr[0..M) holds the M largest values in descending order.
int main()
{
    for (int i = 0; i < N; i++)
        arr[i] = rand() & x;

    int bits[32] = {0};
    struct timeval starttime, endtime;
    gettimeofday(&starttime, 0);

    for (int i = 0; i < N; i++)
        bits[hbit(arr[i])]++;

    // Find the bucket in which the M-th largest value lies. The counts add
    // up to N and N > M, so the loop always leaves through the break.
    int top = 31;
    int s = 0;
    for (; top >= 0; top--) {
        s += bits[top];
        if (s > M)
            break;
    }
    const int s0 = s; // values whose highest bit is >= top
    s -= bits[top];   // values whose highest bit is >  top (s <= M)

    // Values above the boundary bucket are certainly in the top M: move them
    // to the front. std::partition never reads outside [arr, arr + N).
    int *sure_end = partition(arr, arr + N, at_least(1LL << (top + 1)));
    if (s < M) {
        // Bring the boundary bucket right behind them, then order the
        // candidates arr[0..s0) far enough to fix the first M positions.
        partition(sure_end, arr + N, at_least(1LL << top));
        partial_sort(arr, arr + M, arr + s0, greater<int>());
    } else {
        // The higher buckets hold exactly M values; they only need ordering.
        sort(arr, arr + M, greater<int>());
    }

    gettimeofday(&endtime, 0);
    // Borrow one second when the microsecond field would go negative.
    if (endtime.tv_usec < starttime.tv_usec) {
        endtime.tv_sec--;
        endtime.tv_usec += 1000000;
    }
    cout << (endtime.tv_sec - starttime.tv_sec) << " s,  "
         << (endtime.tv_usec - starttime.tv_usec) << " us" << endl;

    cout << "0-10" << endl;
    copy(arr, arr + 10, ostream_iterator<int>(cout, " "));
    cout << endl;
    cout << "90-100" << endl;
    copy(arr + 90, arr + 100, ostream_iterator<int>(cout, " "));

    return 0;
}

运行：

chen@chen-book1:~$ time ./count_2
13 s, 863604 us
0-10
1073741822 1073741791 1073741788 1073741787 1073741783 1073741782 1073741777 1073741771 1073741756 1073741754
90-100
1073740859 1073740858 1073740844 1073740841 1073740837 1073740832 1073740826 1073740826 1073740811 1073740789
real   0m16.352s
user   0m15.865s
sys   0m0.420s

13秒。怎么会这么久。。可能是移动的地方太多了。。

PS：N=4亿时，时间为55s。

【7】nth_element(arr,arr+M,arr+N, greater<int>() );

耗时：2.9s。

综上，第一名：维护一个长度为1万的有序小数组（用二分查找确定插入位置）；

第二名：partial_sort based on heap