双核CPU上的快速排序效率

来源:互联网 发布:排水排污管网络cad图 编辑:程序博客网 时间:2024/04/28 13:36
 
双核CPU上的快速排序效率
 
为了试验一下多核CPU上排序算法的效率,得比较单任务情况下和多任务并行排序算法的差距,因此选用快速排序算法来进行比较。
测试环境:双核CPU 2.66GHZ
                    单核CPU 2.4GHZ
           
以下是一个快速排序算法的源代码:
UINTSplit(void **ppData, UINTuStart, UINTuEnd,
                     COMPAREFUNCCompareFunc)
{
    void *pSelData;
    UINTuLow;
    UINTuHigh;
 
    uLow = uStart;
    uHigh = uEnd;
 
    pSelData = ppData[uLow];
    while ( uLow < uHigh )
    {
        while ( (*CompareFunc)(ppData[uHigh], pSelData) > 0
            && uLow != uHigh )
        {
            --uHigh;
        }
        if ( uHigh != uLow )
        {
            ppData[uLow] = ppData[uHigh];
            ++uLow;
        }
 
        while ( (*CompareFunc)( ppData[uLow], pSelData ) < 0
            && uLow != uHigh )
        {
             ++uLow;
        }
        if ( uLow != uHigh )
        {
            ppData[uHigh] = ppData[uLow];
            --uHigh;
        }
    }
    ppData[uLow] = pSelData;
 
    returnuLow;
}
 
 
voidQuickSort(void **ppData, UINTuStart, UINTuEnd,
                        COMPAREFUNCCompareFunc)
{
    UINTuMid = Split(ppData, uStart, uEnd, CompareFunc );
    if ( uMid > uStart )
    {
        QuickSort(ppData, uStart, uMid - 1, CompareFunc);
    }
 
    if ( uEnd > uMid )
    {
        QuickSort(ppData, uMid + 1, uEnd, CompareFunc);
   }
}
 
先测试一下这个快速排序算法排一百万个随机整数所花的时间:
voidTest_QuickSort(void)
{
    UINTi;
    UINTuCount = 1000000; //1000000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    QuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("QuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU2.66GHZ机器上运行测试程序,打印出花费的时间约为406 ms
在单核CPU2.4GHZ机器上运行测试程序,打印出花费时间约为484ms
可见在双核CPU上运行单任务程序和单核CPU基本是一样的,效率没有任何提高。
 
下面再来把上面的快速排序程序变成并行的,一个简单的方法就是将要排序的区间分成相同的几个段,然后对每个段进行快速排序,排序完后再使用归并算法将排好的几个区间归并成一个排好序的表,我们先四个线程来进行排序,代码如下:
 
void ** Merge(void **ppData, UINTuStart, UINTuEnd,
       void **ppData2, UINTuStart2, UINTuEnd2, COMPAREFUNCcfunc)
{
    UINTi, j, k;
    UINTu1, u2, v1,v2;
    void **pp1;
    void **pp2;
 
    void **pp = (void **)malloc( (uEnd-uStart+1+uEnd2-uStart2+1) * sizeof(void *));
    if ( pp == NULL )
    {
        returnNULL;
    }
 
    if ( (*cfunc)(ppData2[uStart2], ppData[uStart]) > 0 )
    {
        u1 = uStart;
        u2 = uEnd;
        v1 = uStart2;
        v2 = uEnd2;
        pp1 = ppData;
        pp2 = ppData2;
    }
    else
    {       
        u1 = uStart2;
        u2 = uEnd2;
        v1 = uStart;
        v2 = uEnd;
        pp1 = ppData2;
        pp2 = ppData;
    }
 
    k = 0;
    pp[k] = pp1[u1];
    j = v1;
    for (i = u1+1; i <= u2; i++ )
    {
        while ( j <= v2 )
        {
            if ( (*cfunc)(pp2[j], pp1[i]) < 0 )
           {
                ++k;
                pp[k] = pp2[j];
                j++;
            }
            else
            {
                break;
            }
        }
        ++k;
        pp[k] = pp1[i];
    }
 
    if ( j < v2 )
    {
        for ( i = j; i <= v2; i++)
        {
            ++k;
            pp[k] = pp2[i];
        }
    }
    returnpp;
}
 
typedefstructSORTNODE_st {
       void **           ppData;
       UINT             uStart;
       UINT             uEnd;
       COMPAREFUNCfunc;
} SORTNODE;
 
 
DWORDWINAPIQuickSort_Thread(void *arg)
{
       SORTNODE   *pNode = (SORTNODE *)arg;
       QuickSort(pNode->ppData, pNode->uStart, pNode->uEnd, pNode->func);
       return 1;
}
 
#define THREAD_COUNT    4
 
INTMQuickSort(void **ppData, UINTuStart, UINTuEnd,
COMPAREFUNCCompareFunc)
{
    void **pp1;
    void **pp2;
    void **pp3;
       INT               i;
       SORTNODE   Node[THREAD_COUNT];
       HANDLE        hThread[THREAD_COUNT];
 
       INT        nRet = CAPI_FAILED;
 
       for ( i = 0; i < THREAD_COUNT; i++)
       {
              Node[i].ppData = ppData;
              if ( i == 0 )
              {
                     Node[i].uStart = uStart;
              }
              else
              {
                     Node[i].uStart = uEnd * i /THREAD_COUNT + 1; 
              }
              Node[i].uEnd = uEnd *(i+1) / THREAD_COUNT;
              Node[i].func = CompareFunc;
 
              hThread[i] = CreateThread(NULL, 0, QuickSort_Thread, &(Node[i]), 0, NULL);
       }
 
       for ( i = 0; i < THREAD_COUNT; i++ )
       {
              WaitForSingleObject(hThread[i], INFINITE);
       }
 
 
    pp1 = Merge(ppData, uStart, uEnd/4, ppData, uEnd/4+1, uEnd/2, CompareFunc);
 
    pp2 = Merge(ppData, uEnd/2+1, uEnd*3/4, ppData, uEnd*3/4+1, uEnd, CompareFunc);
 
    if ( pp1 != NULL && pp2 != NULL )
    {
        pp3 = Merge(pp1, 0, uEnd/2-uStart, pp2, 0, uEnd - uEnd/2 - 1, CompareFunc);
 
        if ( pp3 != NULL )
        {
            UINTi;
         
            for ( i = uStart; i <= uEnd; i++)
            {
                ppData[i] = pp3[i-uStart];
            }
            free(pp3);
            nRet = CAPI_SUCCESS;
        }
    }
    if( pp1 != NULL)
    {
        free( pp1 );
    }
    if ( pp2 != NULL )
    {
        free( pp2 );
    }
 
    returnnRet;
}
 
用下面程序来测试一下排1百万个随机整数的花费时间:
voidTest_MQuickSort (void)
{
    UINTi;
    UINTuCount = 1000000; //1000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    INTnRet = MQuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("MQuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU上运行后,打印出花费的时间为 234 ms , 单任务版的快速排序函数约需406ms左右,并行运行效率为:406/(2×234) = 86.7% 左右。运行速度快了172ms。
 
可见双核CPU中,多任务程序速度还是有很大提高的。
 
当然上面的多任务版的快速排序程序还有很大的改进余地,当对4个区间排好序后,后面的归并操作都是在一个任务里运行的,对整体效率会产生影响。估计将程序继续优化后,速度还能再快一些。
 


 
双核CPU上的快速排序效率
 
为了试验一下多核CPU上排序算法的效率,得比较单任务情况下和多任务并行排序算法的差距,因此选用快速排序算法来进行比较。
测试环境:双核CPU 2.66GHZ
                    单核CPU 2.4GHZ
           
以下是一个快速排序算法的源代码:
UINTSplit(void **ppData, UINTuStart, UINTuEnd,
                     COMPAREFUNCCompareFunc)
{
    void *pSelData;
    UINTuLow;
    UINTuHigh;
 
    uLow = uStart;
    uHigh = uEnd;
 
    pSelData = ppData[uLow];
    while ( uLow < uHigh )
    {
        while ( (*CompareFunc)(ppData[uHigh], pSelData) > 0
            && uLow != uHigh )
        {
            --uHigh;
        }
        if ( uHigh != uLow )
        {
            ppData[uLow] = ppData[uHigh];
            ++uLow;
        }
 
        while ( (*CompareFunc)( ppData[uLow], pSelData ) < 0
            && uLow != uHigh )
        {
             ++uLow;
        }
        if ( uLow != uHigh )
        {
            ppData[uHigh] = ppData[uLow];
            --uHigh;
        }
    }
    ppData[uLow] = pSelData;
 
    returnuLow;
}
 
 
voidQuickSort(void **ppData, UINTuStart, UINTuEnd,
                        COMPAREFUNCCompareFunc)
{
    UINTuMid = Split(ppData, uStart, uEnd, CompareFunc );
    if ( uMid > uStart )
    {
        QuickSort(ppData, uStart, uMid - 1, CompareFunc);
    }
 
    if ( uEnd > uMid )
    {
        QuickSort(ppData, uMid + 1, uEnd, CompareFunc);
   }
}
 
先测试一下这个快速排序算法排一百万个随机整数所花的时间:
voidTest_QuickSort(void)
{
    UINTi;
    UINTuCount = 1000000; //1000000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    QuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("QuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU2.66GHZ机器上运行测试程序,打印出花费的时间约为406 ms
在单核CPU2.4GHZ机器上运行测试程序,打印出花费时间约为484ms
可见在双核CPU上运行单任务程序和单核CPU基本是一样的,效率没有任何提高。
 
下面再来把上面的快速排序程序变成并行的,一个简单的方法就是将要排序的区间分成相同的几个段,然后对每个段进行快速排序,排序完后再使用归并算法将排好的几个区间归并成一个排好序的表,我们先四个线程来进行排序,代码如下:
 
void ** Merge(void **ppData, UINTuStart, UINTuEnd,
       void **ppData2, UINTuStart2, UINTuEnd2, COMPAREFUNCcfunc)
{
    UINTi, j, k;
    UINTu1, u2, v1,v2;
    void **pp1;
    void **pp2;
 
    void **pp = (void **)malloc( (uEnd-uStart+1+uEnd2-uStart2+1) * sizeof(void *));
    if ( pp == NULL )
    {
        returnNULL;
    }
 
    if ( (*cfunc)(ppData2[uStart2], ppData[uStart]) > 0 )
    {
        u1 = uStart;
        u2 = uEnd;
        v1 = uStart2;
        v2 = uEnd2;
        pp1 = ppData;
        pp2 = ppData2;
    }
    else
    {       
        u1 = uStart2;
        u2 = uEnd2;
        v1 = uStart;
        v2 = uEnd;
        pp1 = ppData2;
        pp2 = ppData;
    }
 
    k = 0;
    pp[k] = pp1[u1];
    j = v1;
    for (i = u1+1; i <= u2; i++ )
    {
        while ( j <= v2 )
        {
            if ( (*cfunc)(pp2[j], pp1[i]) < 0 )
           {
                ++k;
                pp[k] = pp2[j];
                j++;
            }
            else
            {
                break;
            }
        }
        ++k;
        pp[k] = pp1[i];
    }
 
    if ( j < v2 )
    {
        for ( i = j; i <= v2; i++)
        {
            ++k;
            pp[k] = pp2[i];
        }
    }
    returnpp;
}
 
typedefstructSORTNODE_st {
       void **           ppData;
       UINT             uStart;
       UINT             uEnd;
       COMPAREFUNCfunc;
} SORTNODE;
 
 
DWORDWINAPIQuickSort_Thread(void *arg)
{
       SORTNODE   *pNode = (SORTNODE *)arg;
       QuickSort(pNode->ppData, pNode->uStart, pNode->uEnd, pNode->func);
       return 1;
}
 
#define THREAD_COUNT    4
 
INTMQuickSort(void **ppData, UINTuStart, UINTuEnd,
COMPAREFUNCCompareFunc)
{
    void **pp1;
    void **pp2;
    void **pp3;
       INT               i;
       SORTNODE   Node[THREAD_COUNT];
       HANDLE        hThread[THREAD_COUNT];
 
       INT        nRet = CAPI_FAILED;
 
       for ( i = 0; i < THREAD_COUNT; i++)
       {
              Node[i].ppData = ppData;
              if ( i == 0 )
              {
                     Node[i].uStart = uStart;
              }
              else
              {
                     Node[i].uStart = uEnd * i /THREAD_COUNT + 1; 
              }
              Node[i].uEnd = uEnd *(i+1) / THREAD_COUNT;
              Node[i].func = CompareFunc;
 
              hThread[i] = CreateThread(NULL, 0, QuickSort_Thread, &(Node[i]), 0, NULL);
       }
 
       for ( i = 0; i < THREAD_COUNT; i++ )
       {
              WaitForSingleObject(hThread[i], INFINITE);
       }
 
 
    pp1 = Merge(ppData, uStart, uEnd/4, ppData, uEnd/4+1, uEnd/2, CompareFunc);
 
    pp2 = Merge(ppData, uEnd/2+1, uEnd*3/4, ppData, uEnd*3/4+1, uEnd, CompareFunc);
 
    if ( pp1 != NULL && pp2 != NULL )
    {
        pp3 = Merge(pp1, 0, uEnd/2-uStart, pp2, 0, uEnd - uEnd/2 - 1, CompareFunc);
 
        if ( pp3 != NULL )
        {
            UINTi;
         
            for ( i = uStart; i <= uEnd; i++)
            {
                ppData[i] = pp3[i-uStart];
            }
            free(pp3);
            nRet = CAPI_SUCCESS;
        }
    }
    if( pp1 != NULL)
    {
        free( pp1 );
    }
    if ( pp2 != NULL )
    {
        free( pp2 );
    }
 
    returnnRet;
}
 
用下面程序来测试一下排1百万个随机整数的花费时间:
voidTest_MQuickSort (void)
{
    UINTi;
    UINTuCount = 1000000; //1000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    INTnRet = MQuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("MQuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU上运行后,打印出花费的时间为 234 ms , 单任务版的快速排序函数约需406ms左右,并行运行效率为:406/(2×234) = 86.7% 左右。运行速度快了172ms。
 
可见双核CPU中,多任务程序速度还是有很大提高的。
 
当然上面的多任务版的快速排序程序还有很大的改进余地,当对4个区间排好序后,后面的归并操作都是在一个任务里运行的,对整体效率会产生影响。估计将程序继续优化后,速度还能再快一些。
 


 
双核CPU上的快速排序效率
 
为了试验一下多核CPU上排序算法的效率,得比较单任务情况下和多任务并行排序算法的差距,因此选用快速排序算法来进行比较。
测试环境:双核CPU 2.66GHZ
                    单核CPU 2.4GHZ
           
以下是一个快速排序算法的源代码:
UINTSplit(void **ppData, UINTuStart, UINTuEnd,
                     COMPAREFUNCCompareFunc)
{
    void *pSelData;
    UINTuLow;
    UINTuHigh;
 
    uLow = uStart;
    uHigh = uEnd;
 
    pSelData = ppData[uLow];
    while ( uLow < uHigh )
    {
        while ( (*CompareFunc)(ppData[uHigh], pSelData) > 0
            && uLow != uHigh )
        {
            --uHigh;
        }
        if ( uHigh != uLow )
        {
            ppData[uLow] = ppData[uHigh];
            ++uLow;
        }
 
        while ( (*CompareFunc)( ppData[uLow], pSelData ) < 0
            && uLow != uHigh )
        {
             ++uLow;
        }
        if ( uLow != uHigh )
        {
            ppData[uHigh] = ppData[uLow];
            --uHigh;
        }
    }
    ppData[uLow] = pSelData;
 
    returnuLow;
}
 
 
voidQuickSort(void **ppData, UINTuStart, UINTuEnd,
                        COMPAREFUNCCompareFunc)
{
    UINTuMid = Split(ppData, uStart, uEnd, CompareFunc );
    if ( uMid > uStart )
    {
        QuickSort(ppData, uStart, uMid - 1, CompareFunc);
    }
 
    if ( uEnd > uMid )
    {
        QuickSort(ppData, uMid + 1, uEnd, CompareFunc);
   }
}
 
先测试一下这个快速排序算法排一百万个随机整数所花的时间:
voidTest_QuickSort(void)
{
    UINTi;
    UINTuCount = 1000000; //1000000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    QuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("QuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU2.66GHZ机器上运行测试程序,打印出花费的时间约为406 ms
在单核CPU2.4GHZ机器上运行测试程序,打印出花费时间约为484ms
可见在双核CPU上运行单任务程序和单核CPU基本是一样的,效率没有任何提高。
 
下面再来把上面的快速排序程序变成并行的,一个简单的方法就是将要排序的区间分成相同的几个段,然后对每个段进行快速排序,排序完后再使用归并算法将排好的几个区间归并成一个排好序的表,我们先四个线程来进行排序,代码如下:
 
void ** Merge(void **ppData, UINTuStart, UINTuEnd,
       void **ppData2, UINTuStart2, UINTuEnd2, COMPAREFUNCcfunc)
{
    UINTi, j, k;
    UINTu1, u2, v1,v2;
    void **pp1;
    void **pp2;
 
    void **pp = (void **)malloc( (uEnd-uStart+1+uEnd2-uStart2+1) * sizeof(void *));
    if ( pp == NULL )
    {
        returnNULL;
    }
 
    if ( (*cfunc)(ppData2[uStart2], ppData[uStart]) > 0 )
    {
        u1 = uStart;
        u2 = uEnd;
        v1 = uStart2;
        v2 = uEnd2;
        pp1 = ppData;
        pp2 = ppData2;
    }
    else
    {       
        u1 = uStart2;
        u2 = uEnd2;
        v1 = uStart;
        v2 = uEnd;
        pp1 = ppData2;
        pp2 = ppData;
    }
 
    k = 0;
    pp[k] = pp1[u1];
    j = v1;
    for (i = u1+1; i <= u2; i++ )
    {
        while ( j <= v2 )
        {
            if ( (*cfunc)(pp2[j], pp1[i]) < 0 )
           {
                ++k;
                pp[k] = pp2[j];
                j++;
            }
            else
            {
                break;
            }
        }
        ++k;
        pp[k] = pp1[i];
    }
 
    if ( j < v2 )
    {
        for ( i = j; i <= v2; i++)
        {
            ++k;
            pp[k] = pp2[i];
        }
    }
    returnpp;
}
 
typedefstructSORTNODE_st {
       void **           ppData;
       UINT             uStart;
       UINT             uEnd;
       COMPAREFUNCfunc;
} SORTNODE;
 
 
DWORDWINAPIQuickSort_Thread(void *arg)
{
       SORTNODE   *pNode = (SORTNODE *)arg;
       QuickSort(pNode->ppData, pNode->uStart, pNode->uEnd, pNode->func);
       return 1;
}
 
#define THREAD_COUNT    4
 
INTMQuickSort(void **ppData, UINTuStart, UINTuEnd,
COMPAREFUNCCompareFunc)
{
    void **pp1;
    void **pp2;
    void **pp3;
       INT               i;
       SORTNODE   Node[THREAD_COUNT];
       HANDLE        hThread[THREAD_COUNT];
 
       INT        nRet = CAPI_FAILED;
 
       for ( i = 0; i < THREAD_COUNT; i++)
       {
              Node[i].ppData = ppData;
              if ( i == 0 )
              {
                     Node[i].uStart = uStart;
              }
              else
              {
                     Node[i].uStart = uEnd * i /THREAD_COUNT + 1; 
              }
              Node[i].uEnd = uEnd *(i+1) / THREAD_COUNT;
              Node[i].func = CompareFunc;
 
              hThread[i] = CreateThread(NULL, 0, QuickSort_Thread, &(Node[i]), 0, NULL);
       }
 
       for ( i = 0; i < THREAD_COUNT; i++ )
       {
              WaitForSingleObject(hThread[i], INFINITE);
       }
 
 
    pp1 = Merge(ppData, uStart, uEnd/4, ppData, uEnd/4+1, uEnd/2, CompareFunc);
 
    pp2 = Merge(ppData, uEnd/2+1, uEnd*3/4, ppData, uEnd*3/4+1, uEnd, CompareFunc);
 
    if ( pp1 != NULL && pp2 != NULL )
    {
        pp3 = Merge(pp1, 0, uEnd/2-uStart, pp2, 0, uEnd - uEnd/2 - 1, CompareFunc);
 
        if ( pp3 != NULL )
        {
            UINTi;
         
            for ( i = uStart; i <= uEnd; i++)
            {
                ppData[i] = pp3[i-uStart];
            }
            free(pp3);
            nRet = CAPI_SUCCESS;
        }
    }
    if( pp1 != NULL)
    {
        free( pp1 );
    }
    if ( pp2 != NULL )
    {
        free( pp2 );
    }
 
    returnnRet;
}
 
用下面程序来测试一下排1百万个随机整数的花费时间:
voidTest_MQuickSort (void)
{
    UINTi;
    UINTuCount = 1000000; //1000
 
    srand(time(NULL));
    void **pp = (void **)malloc(uCount * sizeof(void *));
    for ( i = 0; i < uCount; i++ )
    {
        pp[i] = (void *)(rand() % uCount);
    }
 
       clock_tt1 = clock();
    INTnRet = MQuickSort(pp, 0, uCount-1, UIntCompare);
       clock_tt2 = clock();
 
       printf("MQuickSort 1000000 Time %ld/n", t2-t1);
 
    free(pp);
}
 
在双核CPU上运行后,打印出花费的时间为 234 ms , 单任务版的快速排序函数约需406ms左右,并行运行效率为:406/(2×234) = 86.7% 左右。运行速度快了172ms。
 
可见双核CPU中,多任务程序速度还是有很大提高的。
 
当然上面的多任务版的快速排序程序还有很大的改进余地,当对4个区间排好序后,后面的归并操作都是在一个任务里运行的,对整体效率会产生影响。估计将程序继续优化后,速度还能再快一些。