一个好用的 CUDA 矩阵转置程序
来源:互联网 发布:postgresql vs mysql 编辑:程序博客网 时间:2024/05/29 16:52
下面是根据 CUDA Sample 中的 transpose 示例自己改编的一个例子:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Edge length of the square tile handled by one thread block.
// NOTE: 5x5 = 25 threads per block is smaller than a warp; that is fine for a
// demo (and, since 32 is not a multiple of 5, it exercises the boundary guards
// in the kernel), but use 16 or 32 for real workloads.
#define BLOCK_DIM 5

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Transpose kernel (see the transpose CUDA Sample for details).
 *
 * Reads a `height` x `width` row-major matrix from `idata` and writes the
 * `width` x `height` row-major transpose to `odata`.
 *
 * Launch layout: block = (BLOCK_DIM, BLOCK_DIM), grid =
 * (ceil(width / BLOCK_DIM), ceil(height / BLOCK_DIM)). Uses
 * BLOCK_DIM * (BLOCK_DIM + 1) floats of static shared memory.
 * Preconditions: width > 0 and height > 0; idata and odata must not alias.
 */
__global__ void d_transpose(float *odata, const float *idata, int width, int height)
{
    // The extra column of padding avoids shared-memory bank conflicts when
    // the tile is read back column-wise below.
    __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];

    // Read the matrix tile into shared memory.
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if ((xIndex < (unsigned int)width) && (yIndex < (unsigned int)height)) {
        // size_t keeps the flat index from overflowing on large matrices.
        size_t index_in = (size_t)yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Every thread of the block must reach the barrier, so it sits outside
    // the bounds checks.
    __syncthreads();

    // Write the transposed matrix tile to global memory. The block indices
    // are swapped so that threadIdx.x still walks along an output row.
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;

    if ((xIndex < (unsigned int)height) && (yIndex < (unsigned int)width)) {
        size_t index_out = (size_t)yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}

/*
 * Print a row-major `row` x `col` matrix to stdout, preceded by the
 * caption `info` on its own line.
 */
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

/* Integer division of a by b, rounded up (intended for a >= 0, b > 0). */
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

/*
 * Transpose a 2D array (see SDK transpose example).
 *
 * d_src  : device pointer to the input, `height` rows by `width` columns.
 * d_dest : device pointer to the output, `width` rows by `height` columns.
 *
 * The kernel is launched on the default stream and is not synchronized here.
 * A launch-configuration error terminates the process with a message.
 */
extern "C"
void transpose(float *d_src, float *d_dest, int width, int height)
{
    // An empty matrix needs no work, and a zero-sized grid is an invalid launch.
    if (width <= 0 || height <= 0) {
        return;
    }

    dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    d_transpose<<< grid, threads >>>(d_dest, d_src, width, height);

    // Kernel launches return nothing; a bad configuration only shows up here.
    CUDA_CHECK(cudaGetLastError());
}

int main()
{
    const int nx = 32;  // matrix width  (columns of the input)
    const int ny = 32;  // matrix height (rows of the input)
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc((void **)&d_tdata, mem_size));

    // Every element of row j holds j + 0.1, so after the transpose every
    // element of column j holds j + 0.1.
    for (int j = 0; j < ny; j++) {
        for (int i = 0; i < nx; i++) {
            h_idata[j * nx + i] = (float)(j + 0.1);
        }
    }

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    transpose(d_idata, d_tdata, nx, ny);

    // The blocking copy waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    // The input has ny rows of nx columns; the transpose has nx rows of ny.
    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);
    return 0;
}

// Author's note: there is also a second program (below). It looks a bit odd
// and I do not yet know how to use it; if you do, feel free to get in touch.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Each thread block covers a TILE_DIM x TILE_DIM tile of the matrix using
// TILE_DIM x BLOCK_ROWS threads, so every thread handles
// TILE_DIM / BLOCK_ROWS elements.
const int TILE_DIM = 16;
const int BLOCK_ROWS = 8;

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Print a row-major `row` x `col` matrix to stdout, preceded by the
 * caption `info` on its own line.
 */
void print_arr(const float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

/*
 * Naive transpose of a square row-major matrix: odata = transpose(idata).
 *
 * How to launch it (for an n x n matrix):
 *   grid  = (n / TILE_DIM, n / TILE_DIM)
 *   block = (TILE_DIM, BLOCK_ROWS)
 * Each thread copies TILE_DIM / BLOCK_ROWS elements, stepping BLOCK_ROWS
 * rows down its tile on every iteration.
 *
 * Preconditions (the kernel performs NO bounds checks):
 *   - the matrix is square, because the row stride is derived from
 *     gridDim.x and is used for both the input and the output;
 *   - n is a multiple of TILE_DIM;
 *   - TILE_DIM is a multiple of BLOCK_ROWS;
 *   - idata and odata do not alias.
 */
__global__ void transposeNaive(float *odata, const float *idata)
{
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    int width = gridDim.x * TILE_DIM;

    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        odata[x * width + (y + j)] = idata[(y + j) * width + x];
    }
}

int main()
{
    const int nx = 32;  // matrix width  (columns)
    const int ny = 32;  // matrix height (rows)
    const size_t mem_size = (size_t)nx * ny * sizeof(float);

    // transposeNaive has no bounds checks, so refuse shapes it cannot handle
    // instead of letting it read or write out of bounds.
    if (nx != ny || nx % TILE_DIM != 0 || TILE_DIM % BLOCK_ROWS != 0) {
        fprintf(stderr,
                "transposeNaive needs a square matrix whose side is a multiple "
                "of TILE_DIM (%d), with BLOCK_ROWS (%d) dividing TILE_DIM\n",
                TILE_DIM, BLOCK_ROWS);
        return EXIT_FAILURE;
    }

    float *h_idata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);
    if (h_idata == NULL || h_tdata == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)mem_size);
        free(h_idata);
        free(h_tdata);
        return EXIT_FAILURE;
    }

    float *d_idata = NULL;
    float *d_tdata = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_idata, mem_size));
    CUDA_CHECK(cudaMalloc((void **)&d_tdata, mem_size));

    // Every element of row j holds j % 32, so after the transpose every
    // element of column j holds j % 32.
    for (int j = 0; j < ny; j++) {
        for (int i = 0; i < nx; i++) {
            h_idata[j * nx + i] = (float)(j % 32);
        }
    }

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    // Zero the output first so elements the kernel failed to write stand out.
    CUDA_CHECK(cudaMemset(d_tdata, 0, mem_size));

    transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
    // Kernel launches return nothing; a bad configuration only shows up here.
    CUDA_CHECK(cudaGetLastError());

    // The blocking copy waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost));

    print_arr(h_idata, ny, nx, "origin data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    CUDA_CHECK(cudaFree(d_tdata));
    CUDA_CHECK(cudaFree(d_idata));
    free(h_idata);
    free(h_tdata);
    return 0;
}
阅读全文
0 0
- 转置的好用的cuda程序
- cuda的简单程序
- CUDA学习笔记一:CUDA+OpenCV的图像转置,采用Shared Memory进行CUDA程序优化
- 为什么用了CUDA的程序还没有不用CUDA的程序快?
- 介绍cuda的一个好文章
- 我写的cuda程序
- CUDA判断素数的程序
- CUDA程序优化的记录
- 一个简单的CUDA程序
- cuda程序的编译运行
- cuda程序的编译运行
- 最简单的CUDA程序
- NVIDIA CUDA简单的CUDA程序:图像二值化处理
- CUDA计算向量内积的程序(源自CUDA范例编程)
- CUDA下的GPU编程入门--第一个CUDA程序
- CUDA编程(三)评估CUDA程序的表现
- CUDA编程(三)评估CUDA程序的表现
- Cuda 学习教程(四):Cuda程序的优化
- win10下RabbitMQ环境搭建
- pyenv
- Android三种播放视频的方式
- Mybatis中,当插入数据后,返回最新主键id的几种方法,及具体用法
- 主页被强制绑定为360导航
- 转置的好用的cuda程序
- stdin和STDIN_FILENO的区别
- RAD Studio 10 自带Demo代码汇总说明
- [学习笔记]用户界面优化之Android ViewPager
- POJ 1405 Heritage 笔记
- oracle锁表查询及解锁kill进程
- JVM结构、GC工作机制详解
- 第二讲 使用Spring IoC创建对象的3种方式
- Javaweb中快速生成验证码Captcha