CUDA并行排序(4)——双调排序(二维double型数据)

来源：互联网　发布：linux dns 添加mx　编辑：程序博客网　时间：2024/06/13 07:23

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <iomanip>
using namespace std;

// Fixed shape of the demo matrix. The kernel's shared-memory tile and its
// sorting network are both sized from these constants.
enum { kMaxRows = 8, kMaxCols = 9 };

// Abort with a readable diagnostic when a CUDA runtime call fails.
// Wrapped in do/while(0) so it behaves like a single statement.
#define CHECK(res)                                                        \
    do {                                                                  \
        cudaError_t check_err_ = (res);                                   \
        if (check_err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(check_err_));                      \
            exit(-1);                                                     \
        }                                                                 \
    } while (0)

/*
 * Fills an 8x9 matrix with demo values, then sorts its ROWS in ascending
 * order of their first column using a bitonic network held in shared memory.
 *
 * Launch layout: a single block with blockDim = (8, 9, 1). threadIdx.x selects
 * the row and threadIdx.y the column, so every thread owns one matrix element
 * and moves only that element when its row is exchanged with a partner row.
 * Threads outside the 8x9 tile do no memory accesses but still take part in
 * every barrier.
 *
 * dp_out_params : device array of row pointers; row r must point at at least
 *                 9 doubles. It is overwritten with the generated data and
 *                 then with the sorted result.
 * len           : row count requested by the caller. It is only validated
 *                 against the 8-row tile capacity; if it is larger, the data
 *                 is left unsorted and a message is printed.
 *
 * Static shared memory: 8 * 9 doubles.
 */
__global__ void helloCUDA(double **dp_out_params, unsigned int len)
{
    const unsigned int tx = threadIdx.x;  // row handled by this thread
    const unsigned int ty = threadIdx.y;  // column handled by this thread
    const bool in_tile = (tx < kMaxRows) && (ty < kMaxCols);

    // Shared by all threads of the block; one slot per matrix element.
    __shared__ double sortbuf[kMaxRows][kMaxCols];

    if (in_tile)
    {
        // Signed arithmetic is required here: (row - 5) is negative for the
        // first rows and must not wrap around.
        const int row = (int)tx;
        const int col = (int)ty;
        const double generated = ((row - 5) * (col + 1) + (row + 100) % 7) * 0.2634;
        dp_out_params[tx][ty] = generated;
        sortbuf[tx][ty] = generated;
    }

    // len is identical for every thread, so this early exit is uniform across
    // the block and cannot strand other threads at a barrier.
    if (len > kMaxRows)
    {
        if (tx == 0 && ty == 0)
        {
            printf("数组过长");
        }
        return;
    }

    __syncthreads();  // the tile is fully populated before anyone reads it

    // "k" is the sort level of the bitonic network and "j" the distance to the
    // partner row inside the current butterfly. Both rows of a butterfly are
    // handled at the same time: each thread decides independently (from the
    // same pair of keys) whether the rows must trade places and, if so, writes
    // ITS OWN element into the partner's row.
    for (unsigned int k = 2; k <= kMaxRows; k *= 2)
    {
        for (unsigned int j = k >> 1; j > 0; j >>= 1)
        {
            const unsigned int partner = tx ^ j;  // row we compare-and-swap with
            bool swap = false;
            double mine = 0.0;

            if (in_tile)
            {
                // Snapshot everything this thread needs into registers BEFORE
                // any thread is allowed to overwrite the tile.
                const double my_key = sortbuf[tx][0];
                const double partner_key = sortbuf[partner][0];
                mine = sortbuf[tx][ty];

                // Bit k of the row index selects the direction of this
                // sub-sequence; which side of the butterfly we are on decides
                // whether "ascending" means mine > partner or mine < partner.
                const unsigned int ascend = k * (partner < tx);
                const unsigned int descend = k * (partner > tx);

                if ((tx & k) == ascend && my_key > partner_key)
                {
                    swap = true;
                }
                if ((tx & k) == descend && my_key < partner_key)
                {
                    swap = true;
                }
            }

            // Barrier 1: every read of this stage has completed.
            __syncthreads();

            if (swap)
            {
                sortbuf[partner][ty] = mine;
            }

            // Barrier 2: every write is visible before the next stage reads.
            // Both barriers sit outside the data-dependent branch so that all
            // threads of the block always reach them.
            __syncthreads();
        }
    }

    if (in_tile)
    {
        dp_out_params[tx][ty] = sortbuf[tx][ty];
    }
}

///////////////////////////////////////////////////////////////////

/*
 * Host driver: allocates one contiguous 8x9 device matrix plus a device table
 * of row pointers into it, runs helloCUDA on a single 8x9 block, copies the
 * matrix back and prints it row by row.
 */
int main()
{
    printf("Hello main()\n");

    const unsigned int ROWS = kMaxRows;
    const unsigned int COLS = kMaxCols;
    const size_t matrix_bytes = (size_t)ROWS * COLS * sizeof(double);
    const size_t table_bytes = (size_t)ROWS * sizeof(double *);

    // Contiguous device storage for the matrix and the device-side row table.
    double *d_out_params = NULL;
    CHECK(cudaMalloc((void **)(&d_out_params), matrix_bytes));

    double **dp_out_params = NULL;
    CHECK(cudaMalloc((void **)(&dp_out_params), table_bytes));

    // Host staging: the row table (holding DEVICE addresses) and the result.
    double **hp_out_params = (double **)malloc(table_bytes);
    double *h_out_params = (double *)malloc(matrix_bytes);
    if (hp_out_params == NULL || h_out_params == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(hp_out_params);
        free(h_out_params);
        cudaFree(dp_out_params);
        cudaFree(d_out_params);
        return -1;
    }

    for (unsigned int r = 0; r < ROWS; r++)
    {
        hp_out_params[r] = d_out_params + r * COLS;
    }

    CHECK(cudaMemcpy((void *)(dp_out_params), (void *)(hp_out_params),
                     table_bytes, cudaMemcpyHostToDevice));

    dim3 dimBlock(ROWS, COLS, 1);
    dim3 dimGrid(1, 1, 1);
    helloCUDA<<<dimGrid, dimBlock>>>(dp_out_params, ROWS);
    CHECK(cudaGetLastError());       // launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CHECK(cudaMemcpy((void *)(h_out_params), (void *)(d_out_params),
                     matrix_bytes, cudaMemcpyDeviceToHost));

    // Print the matrix, one row per line. Index instead of advancing the
    // pointer so the allocation can still be freed afterwards.
    for (unsigned int i = 0; i < ROWS; i++)
    {
        for (unsigned int j = 0; j < COLS; j++)
        {
            printf("%f  ", h_out_params[i * COLS + j]);
        }
        printf("\n");
    }

    free(h_out_params);
    free(hp_out_params);
    CHECK(cudaFree(dp_out_params));
    CHECK(cudaFree(d_out_params));

    printf("Goodbye main()\n");
    return 0;
}

原始数据：

-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  -0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000

排序后（各行按首列元素升序排列）：

Hello main()-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  -0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000  1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  Goodbye main()

阅读全文

0 0