【CUDA笔记1】shared memory优化
来源:互联网 发布:马自达 mx5 rf 知乎 编辑:程序博客网 时间:2024/06/04 20:02
// Shared-memory (tiled) matrix multiplication demo.
// Computes C[MH,NW] = A[MH,MW] * B[MW,NW].
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define N 8            // matrix width when A, B, C are square (unused by this demo)
// Non-square configuration:
#define MH 16          // rows of A (= rows of C)
#define MW 32          // cols of A = rows of B
#define NW 16          // cols of B (= cols of C)
#define THREAD_WIDTH 16 // block is THREAD_WIDTH x THREAD_WIDTH threads (product of dims <= 1024)
#define TILE_WIDTH 16  // tile edge; must equal THREAD_WIDTH (kernel assumes blockDim == TILE_WIDTH)

// Check every CUDA runtime call; kernel-launch errors are caught separately
// via cudaGetLastError() right after the <<<>>> launch.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Tiled matrix multiply: Pd[mh,nw] = Md[mh,mw] * Nd[mw,nw].
// Grid layout expected: ceil(nw/TILE_WIDTH) x ceil(mh/TILE_WIDTH) blocks of
// TILE_WIDTH x TILE_WIDTH threads. Static shared memory: 2 * TILE_WIDTH^2 floats.
// Handles dimensions that are NOT multiples of TILE_WIDTH by zero-padding
// out-of-range tile elements.
__global__ void MatrixMulOptimazationKernel(const float* Md, const float* Nd,
                                            float* Pd, int mh, int mw, int nw)
{
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];  // tile of Md shared by the block
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];  // tile of Nd shared by the block

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.y * TILE_WIDTH + ty;  // row of Pd this thread computes
    int col = blockIdx.x * TILE_WIDTH + tx;  // col of Pd this thread computes
    int loop = (mw + TILE_WIDTH - 1) / TILE_WIDTH;  // tiles along the shared dim

    float sum = 0.0f;
    for (int m = 0; m < loop; ++m) {
        int aCol = m * TILE_WIDTH + tx;  // column of Md this thread loads
        int bRow = m * TILE_WIDTH + ty;  // row of Nd this thread loads

        // Every thread unconditionally writes its tile slot (zero-filling
        // out-of-range elements) so that (a) no thread reaches
        // __syncthreads() from a divergent path — the original nested both
        // barriers inside "if" blocks, which is undefined behavior — and
        // (b) no thread reads uninitialized shared memory for edge tiles.
        Mds[ty][tx] = (row < mh && aCol < mw) ? Md[row * mw + aCol] : 0.0f;
        Nds[ty][tx] = (bRow < mw && col < nw) ? Nd[bRow * nw + col] : 0.0f;
        __syncthreads();  // tiles fully loaded before anyone reads them

        for (int k = 0; k < TILE_WIDTH; ++k)
            sum += Mds[ty][k] * Nds[k][tx];
        __syncthreads();  // keep tiles alive until every thread is done with them
    }

    if (row < mh && col < nw)
        Pd[row * nw + col] = sum;
}

int main(void)
{
    const int mh = MH;
    const int mw = MW;
    const int nw = NW;
    const size_t asize = (size_t)mh * mw * sizeof(float);
    const size_t bsize = (size_t)mw * nw * sizeof(float);
    const size_t csize = (size_t)mh * nw * sizeof(float);

    float* a = (float*)malloc(asize);
    float* b = (float*)malloc(bsize);
    float* c = (float*)malloc(csize);
    if (!a || !b || !c) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }

    float *da, *db, *dc;
    CUDA_CHECK(cudaMalloc((void**)&da, asize));
    CUDA_CHECK(cudaMalloc((void**)&db, bsize));
    CUDA_CHECK(cudaMalloc((void**)&dc, csize));

    // Initialize A and B to all ones (float literals — the original used
    // double literals, forcing a silent double->float conversion).
    for (int i = 0; i < mh; i++)
        for (int j = 0; j < mw; j++)
            a[i * mw + j] = 1.0f;
    for (int i = 0; i < mw; i++)
        for (int j = 0; j < nw; j++)
            b[i * nw + j] = 1.0f;

    CUDA_CHECK(cudaMemcpy(da, a, asize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(db, b, bsize, cudaMemcpyHostToDevice));

    // Time the kernel with CUDA events.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    dim3 threadperblock(THREAD_WIDTH, THREAD_WIDTH, 1);
    // Ceil-div in BOTH dimensions. The original divided the y extent by
    // threadperblock.x — a latent bug masked only by the square block shape.
    dim3 blockpergrid((nw + threadperblock.x - 1) / threadperblock.x,
                      (mh + threadperblock.y - 1) / threadperblock.y, 1);

    CUDA_CHECK(cudaEventRecord(start, 0));
    MatrixMulOptimazationKernel<<<blockpergrid, threadperblock>>>(da, db, dc,
                                                                 mh, mw, nw);
    CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));

    float time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    CUDA_CHECK(cudaMemcpy(c, dc, csize, cudaMemcpyDeviceToHost));

    printf("A=\n");
    for (int i = 0; i < mh; i++) {
        for (int j = 0; j < mw; j++)
            printf("%.2f ", a[i * mw + j]);
        printf("\n");
    }
    printf("B=\n");
    for (int i = 0; i < mw; i++) {
        for (int j = 0; j < nw; j++)
            printf("%.2f ", b[i * nw + j]);
        printf("\n");
    }
    printf("C=\n");
    for (int i = 0; i < mh; i++) {
        for (int j = 0; j < nw; j++)
            printf("%.2f ", c[i * nw + j]);
        printf("\n");
    }
    printf("the GPU performing time is %f ms \n", time);

    // Release events (the original leaked them) and all memory.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    CUDA_CHECK(cudaFree(dc));
    return 0;
}
0 0
- 【CUDA笔记1】share memory优化
- cuda share memory
- CUDA核函数share memory
- CUDA学习笔记一:CUDA+OpenCV的图像转置,采用Shared Memory进行CUDA程序优化
- Linux程序设计笔记--IPC操作--share memory
- share memory
- 一个使用share memory进行性能优化的实例
- cuda memory
- Share Memory协议
- Share Memory协议是怎么回事
- [转载] 关于share memory
- Qualcomm Share Memory
- Qualcomm Share Memory
- share memory linux
- Linux Share Memory
- CUDA编程—通过shared memory优化矩阵相乘
- cuda编程优化1
- gpu/cuda-03-cuda memory
- Matlab中Bode图--HZ显示横坐标
- 【设计模式 - 11】之享元模式(FlyWeight)
- hdu 2421 数论--积性函数
- ActiveMQ高可用+负载均衡集群
- Visual Studio 2015 编译gflags
- 【CUDA笔记1】share memory优化
- linux(centos)下nginx虚拟主机的配置
- nginx配置解析流程
- hdu 2053 简单模拟
- 一次摄像头的任务 - 3
- go语言中type的几种使用
- 解决打开mysql.exe后闪退
- 【设计模式 - 12】之代理模式(Proxy)
- Spring中Aspect的切入点语法定义细节