
来源:互联网 发布:linux dns 添加mx 编辑:程序博客网 时间:2024/06/13 07:23

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <math.h>
#include <iomanip>

using namespace std;

// Abort with a readable diagnostic when a CUDA runtime call fails.
// Wrapped in do/while(0) so it behaves as a single statement; use it as
// `CHECK(call);` (with the trailing semicolon).
#define CHECK(res)                                                        \
    do {                                                                  \
        cudaError_t check_err_ = (res);                                   \
        if (check_err_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                  \
                    __FILE__, __LINE__, cudaGetErrorString(check_err_));  \
            exit(-1);                                                     \
        }                                                                 \
    } while (0)

// Fixed tile geometry of the demo: SORT_ROWS rows (must be a power of two
// for the bitonic network) of SORT_COLS doubles each.
enum { SORT_ROWS = 8, SORT_COLS = 9 };

/*
 * helloCUDA: fills a small matrix with synthetic values and sorts its rows
 * in ascending order of their first column using a bitonic sorting network
 * held in shared memory.
 *
 * Launch requirements:
 *   - exactly one block of SORT_ROWS x SORT_COLS x 1 threads; thread (x, y)
 *     owns column y of the row currently stored in slot x;
 *   - static shared memory: SORT_ROWS * SORT_COLS doubles.
 *
 * Parameters:
 *   dp_out_params  device array of `len` device pointers, each pointing at
 *                  SORT_COLS doubles; written with the generated values and
 *                  then overwritten with the sorted rows.
 *   len            number of valid rows. Rows >= len are padded with +inf
 *                  keys so they sink to the end and are never read from or
 *                  written to global memory. If len > SORT_ROWS the kernel
 *                  reports "array too long" and leaves the generated values
 *                  unsorted.
 *
 * Keys are assumed to be finite (they are generated by the formula below).
 */
__global__ void helloCUDA(double **dp_out_params, unsigned int len)
{
    // Variables declared inside a kernel without __shared__ are private to
    // each thread; __shared__ variables are shared by all threads of a block
    // and private to that block.
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // The network below needs one thread per shared-memory cell. The test is
    // uniform across the block, so returning here cannot split a barrier.
    if (blockDim.x != SORT_ROWS || blockDim.y != SORT_COLS || blockDim.z != 1)
    {
        if (tx == 0 && ty == 0 && threadIdx.z == 0)
        {
            printf("helloCUDA: block must be %dx%dx1 threads\n",
                   (int)SORT_ROWS, (int)SORT_COLS);
        }
        return;
    }

    __shared__ double sortbuf[SORT_ROWS][SORT_COLS];

    const bool has_row = tx < len;

    if (has_row)
    {
        // Signed arithmetic on purpose: (x - 5) is negative for x < 5.
        const int x = (int)tx;
        const int y = (int)ty;
        const double value = ((x - 5) * (y + 1) + (x + 100) % 7) * 0.2634;
        dp_out_params[tx][ty] = value;
        sortbuf[tx][ty] = value;
    }
    else
    {
        // Padding row: +infinity always compares as the largest key.
        sortbuf[tx][ty] = __longlong_as_double(0x7ff0000000000000LL);
    }
    __syncthreads();

    if (len > SORT_ROWS)
    {
        // Uniform branch. The message means "array too long"; it is printed
        // once instead of once per thread.
        if (tx == 0 && ty == 0)
        {
            printf("数组过长");
        }
        return;
    }

    // Bitonic sort. "k" is the size of the sub-sequences being merged and
    // "j" is the distance to the partner row within the current butterfly.
    for (unsigned int k = 2; k <= SORT_ROWS; k *= 2)
    {
        for (unsigned int j = k >> 1; j > 0; j >>= 1)
        {
            const unsigned int partner = tx ^ j;

            // Read phase: snapshot everything this thread needs into
            // registers before any thread is allowed to write.
            const double my_key        = sortbuf[tx][0];
            const double partner_key   = sortbuf[partner][0];
            const double partner_value = sortbuf[partner][ty];

            // Bit k of the row index selects the direction of the k-sized
            // block; the lower row of a pair keeps the smaller key in an
            // ascending block and the larger key in a descending one.
            const bool block_ascending = (tx & k) == 0;
            const bool i_am_lower      = tx < partner;
            const bool keep_smaller    = (block_ascending == i_am_lower);
            const bool swap = keep_smaller ? (my_key > partner_key)
                                           : (my_key < partner_key);

            // Every thread reaches both barriers unconditionally; a barrier
            // inside the data-dependent `if (swap)` would be undefined.
            __syncthreads();

            // Write phase: both partners evaluate the same condition, so each
            // pulling the other's value performs the exchange.
            if (swap)
            {
                sortbuf[tx][ty] = partner_value;
            }
            __syncthreads();
        }
    }

    if (has_row)
    {
        dp_out_params[tx][ty] = sortbuf[tx][ty];
    }
}

///////////////////////////////////////////////////////////////////
/*
 * Host driver: allocates a ROWS x COLS matrix on the device together with a
 * device-side table of row pointers, runs helloCUDA on it, copies the result
 * back and prints it row by row.
 */
int main()
{
    printf("Hello main()\n");

    const unsigned int ROWS = SORT_ROWS;
    const unsigned int COLS = SORT_COLS;

    // Flat device storage for the matrix and the device table of row pointers.
    double *d_out_params = NULL;
    CHECK(cudaMalloc((void**)(&d_out_params), ROWS * COLS * sizeof(double)));

    double **dp_out_params = NULL;
    CHECK(cudaMalloc((void**)(&dp_out_params), ROWS * sizeof(double*)));

    // Host staging buffers: row-pointer table and result matrix.
    double **hp_out_params = (double**)malloc(ROWS * sizeof(double*));
    double *h_out_params = (double*)malloc(ROWS * COLS * sizeof(double));
    if (hp_out_params == NULL || h_out_params == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        exit(-1);
    }

    // Each entry is a DEVICE address; it is only computed here, never
    // dereferenced on the host.
    for (unsigned int r = 0; r < ROWS; r++)
    {
        hp_out_params[r] = d_out_params + r * COLS;
    }

    CHECK(cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params),
                     ROWS * sizeof(double*), cudaMemcpyHostToDevice));

    dim3 dimBlock(ROWS, COLS, 1);
    dim3 dimGrid(1, 1, 1);
    helloCUDA<<<dimGrid, dimBlock>>>(dp_out_params, ROWS);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    CHECK(cudaMemcpy((void*)(h_out_params), (void*)(d_out_params),
                     ROWS * COLS * sizeof(double), cudaMemcpyDeviceToHost));

    // Print the matrix; index instead of advancing the pointer so the
    // buffer can still be freed afterwards.
    for (unsigned int i = 0; i < ROWS; i++)
    {
        for (unsigned int j = 0; j < COLS; j++)
        {
            printf("%f  ", h_out_params[i * COLS + j]);
        }
        cout << endl;
    }

    free(h_out_params);
    free(hp_out_params);
    CHECK(cudaFree(dp_out_params));
    CHECK(cudaFree(d_out_params));

    printf("Goodbye main()\n");
    return 0;
}


-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  -0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000 


Hello main()-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  -0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000  1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  Goodbye main()

0 0