好用的 CUDA 矩阵转置程序

来源：互联网　发布：postgresql vs mysql　编辑：程序博客网　时间：2024/05/29 16:52

下面是根据 CUDA Sample 中的 transpose 示例自己改编的一个例子：

#include <stdio.h>
#include <stdlib.h>

// Edge length of the square tile handled by one thread block.
// The kernel must be launched with BLOCK_DIM x BLOCK_DIM threads per block;
// BLOCK_DIM * BLOCK_DIM must not exceed the device's threads-per-block limit.
// Matrix sizes need not be multiples of BLOCK_DIM: the kernel bounds-checks.
#define BLOCK_DIM 16

// Evaluate a CUDA runtime call and terminate with a readable message on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Transpose kernel (see transpose CUDA Sample for details).
//
// Reads a `height` x `width` row-major matrix from `idata` and writes its
// `width` x `height` row-major transpose to `odata`.
//
// Launch requirements:
//   - blockDim must be (BLOCK_DIM, BLOCK_DIM, 1)
//   - the grid must cover ceil(width / BLOCK_DIM) x ceil(height / BLOCK_DIM) tiles
//   - width and height must be positive
//   - uses (BLOCK_DIM * (BLOCK_DIM + 1)) floats of static shared memory
__global__ void d_transpose(float *odata, const float *idata, int width, int height)
{
    // The extra column of padding keeps column-wise reads of the tile from
    // all landing in the same shared-memory bank.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];

    // Read the matrix tile into shared memory.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < (unsigned int)width) && (yIndex < (unsigned int)height))
    {
        // size_t arithmetic so large matrices do not overflow a 32-bit index.
        size_t index_in = (size_t)yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Every thread of the block reaches this barrier (it is outside the
    // bounds check), so the whole tile is visible before it is read back.
    __syncthreads();

    // Write the transposed matrix tile to global memory. The block indices are
    // swapped so that this block writes the mirrored tile of the output.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < (unsigned int)height) && (yIndex < (unsigned int)width))
    {
        size_t index_out = (size_t)yIndex * height + xIndex;
        // The element read here was stored by the thread with swapped
        // threadIdx, whose load passed the same bounds condition.
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}

// Print a row-major `row` x `col` host matrix, preceded by the line `info`.
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

// Integer division of a by b, rounded up (intended for positive operands).
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

/*
    Transpose a 2D array (see SDK transpose example).

    d_src  : device pointer to the input, `height` rows of `width` floats
    d_dest : device pointer to the output, `width` rows of `height` floats
    The two buffers must be distinct. The launch is asynchronous; a launch
    configuration error terminates the program with a diagnostic.
*/
extern "C" void transpose(float *d_src, float *d_dest, int width, int height)
{
    // Nothing to do for an empty matrix; a zero-sized grid would be an
    // invalid launch configuration.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    d_transpose<<<grid, threads>>>(d_dest, d_src, width, height);

    // Kernel launches do not return a status; fetch launch errors explicitly.
    CUDA_CHECK(cudaGetLastError());
}

// Demo: fill a matrix on the host, transpose it on the GPU, print both.
int main()
{
    const int nx = 32;  // matrix width  (number of columns)
    const int ny = 32;  // matrix height (number of rows)
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    // Every element of row j holds j + 0.1, so after the transpose every
    // column j of the output should hold that value.
    for (int j = 0; j < ny; j++)
    {
        for (int i = 0; i < nx; i++)
        {
            h_idata[j * nx + i] = j + 0.1f;
        }
    }

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
    transpose(d_idata, d_tdata, nx, ny);
    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The input has ny rows of nx values; the transposed result has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);
    return 0;
}

还有一个程序，挺奇怪的，现在还不知道怎么用，有知道的可以交流一下。

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

// Each thread block transposes one TILE_DIM x TILE_DIM tile using
// TILE_DIM x BLOCK_ROWS threads; every thread therefore handles
// TILE_DIM / BLOCK_ROWS elements. TILE_DIM must be a multiple of BLOCK_ROWS.
const int TILE_DIM = 16;
const int BLOCK_ROWS = 8;

// Evaluate a CUDA runtime call and terminate with a readable message on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Print a row-major `row` x `col` host matrix, preceded by the line `info`.
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++)
    {
        for (int j = 0; j < col; j++)
        {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

// Straightforward transpose without shared memory.
//
// How to launch it:
//   dim3 grid(N / TILE_DIM, N / TILE_DIM, 1);
//   dim3 block(TILE_DIM, BLOCK_ROWS, 1);
//   transposeNaive<<<grid, block>>>(out, in);
//
// Preconditions (the kernel performs no bounds checks):
//   - the matrix is square, N x N, row-major, with N a multiple of TILE_DIM
//   - the row length is derived from the grid as gridDim.x * TILE_DIM, so the
//     grid must be exactly N / TILE_DIM blocks in each direction
//   - blockDim must be (TILE_DIM, BLOCK_ROWS, 1)
//   - odata and idata must be distinct buffers
__global__ void transposeNaive(float *odata, const float *idata)
{
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    int width = gridDim.x * TILE_DIM;

    // The block has only BLOCK_ROWS rows of threads, so each thread steps
    // down the tile in strides of BLOCK_ROWS to cover all TILE_DIM rows.
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        odata[x * width + (y + j)] = idata[(y + j) * width + x];
}

// Demo: fill a square matrix on the host, transpose it on the GPU, print both.
int main()
{
    const int nx = 32;  // matrix width  (number of columns)
    const int ny = 32;  // matrix height (number of rows)
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    // transposeNaive has no bounds checks; refuse configurations it cannot handle.
    if (nx != ny || nx % TILE_DIM != 0 || TILE_DIM % BLOCK_ROWS != 0)
    {
        fprintf(stderr,
                "transposeNaive needs a square matrix whose side is a multiple of "
                "TILE_DIM, and TILE_DIM must be a multiple of BLOCK_ROWS\n");
        return EXIT_FAILURE;
    }

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc(&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc(&d_tdata, mem_size));

    // Every element of row j holds j % 32, so after the transpose every
    // column j of the output should hold that value.
    for (int j = 0; j < ny; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j * nx + i] = (float)(j % 32);

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    // Clear the output first so elements the kernel failed to write show up as zeros.
    CUDA_CHECK(cudaMemset(d_tdata, 0, mem_size));
    transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
    // Kernel launches do not return a status; fetch launch errors explicitly.
    CUDA_CHECK(cudaGetLastError());
    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The input has ny rows of nx values; the transposed result has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);
    return 0;
}

阅读全文

0 0