CUDA/GPU下矩阵乘法的几种实现的C++源码

来源：互联网发布：adobe 软件 ico图标编辑：程序博客网时间：2024/05/16 05:41

#include "cutil_inline.h"#include "cublas.h"void MatrixMul(const float *A, const float *B, float *C, int Width) { int i, j, k; for(i=0; i<Width; i++)for(j=0; j<Width; j++){float s=0;for(k=0; k<Width; k++) s+=A[i*Width+k]*B[k*Width+j];C[i*Width+j]=s;}}#define TILE_WIDTH 16//简单方法__global__ void MatrixMulKernel1(float* Md, float* Nd, float* Pd, int Width){ int x = threadIdx.x+blockIdx.x*blockDim.x; int y = threadIdx.y+blockIdx.y*blockDim.y;float Pvalue = 0;for (int k = 0; k < Width; ++k)Pvalue+=Md[y * Width + k]*Nd[k * Width + x];Pd[y*Width + x] = Pvalue;}//块方法__global__ void MatrixMulKernel2(float* Md, float* Nd, float* Pd, int Width){int bx = blockIdx.x;int by = blockIdx.y;int tx = threadIdx.x;int ty = threadIdx.y;float Pvalue = 0;for (int m = 0; m < gridDim.x; ++m) {__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx);Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx);__syncthreads();for (int k = 0; k < blockDim.x; ++k)Pvalue += Mds[ty][k] * Nds[k][tx];__syncthreads();}Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue;}//块方法, 循环展开__global__ void MatrixMulKernel3(float* Md, float* Nd, float* Pd, int Width){int bx = blockIdx.x;int by = blockIdx.y;int tx = threadIdx.x;int ty = threadIdx.y;float Pvalue = 0;for (int m = 0; m < gridDim.x; ++m) {__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx);Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx);__syncthreads();Pvalue += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] +Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] +Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] +Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] +Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] +Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] +Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] +Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx];__syncthreads();}Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue;}#define ThreadColumn 2//块方法, 线程粒度__global__ void MatrixMulKernel4(float* Md, float* Nd, float* Pd, int Width){int bx = blockIdx.x;int by = blockIdx.y;int tx = threadIdx.x;int ty = threadIdx.y;float Pvalue1 = 0, Pvalue2=0;for (int m = 0; m < gridDim.x; ++m) {__shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH];__shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH];for(int k=0;k<ThreadColumn; ++k)Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH );for(int h=0;h<ThreadColumn; ++h)for(int k=0; k<ThreadColumn; ++k)Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width + bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH);__syncthreads();for(int h=0;h<ThreadColumn*TILE_WIDTH;++h)Pvalue1 +=Mds[ty][h] * Nds[h][tx];for(int k=0;k<ThreadColumn*TILE_WIDTH;++k)Pvalue2 +=Mds[ty][k] * Nds[k][tx+TILE_WIDTH];__syncthreads();}Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1;Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2;}//块方法, 循环展开, 线程粒度__global__ void MatrixMulKernel5(float* Md, float* Nd, float* Pd, int Width){int bx = blockIdx.x;int by = blockIdx.y;int tx = threadIdx.x;int ty = threadIdx.y;float Pvalue1 = 0, Pvalue2=0;for (int m = 0; m < gridDim.x; ++m) {__shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH];__shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH];for(int k=0;k<ThreadColumn; ++k)Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH );for(int h=0;h<ThreadColumn; ++h)for(int k=0; k<ThreadColumn; ++k)Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width + bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH);__syncthreads();Pvalue1 += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] +Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] +Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] +Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] +Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] +Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] +Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] +Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx] +Mds[ty][16] * Nds[16][tx] + Mds[ty][17] * Nds[17][tx] +Mds[ty][18] * Nds[18][tx] + Mds[ty][19] * Nds[19][tx] +Mds[ty][20] * Nds[20][tx] + Mds[ty][21] * Nds[21][tx] +Mds[ty][22] * Nds[22][tx] + Mds[ty][23] * Nds[23][tx] +Mds[ty][24] * Nds[24][tx] + Mds[ty][25] * Nds[25][tx] +Mds[ty][26] * Nds[26][tx] + Mds[ty][27] * Nds[27][tx] +Mds[ty][28] * Nds[28][tx] + Mds[ty][29] * Nds[29][tx] +Mds[ty][30] * Nds[30][tx] + Mds[ty][31] * Nds[31][tx];Pvalue2 += Mds[ty][0] * Nds[0][tx+TILE_WIDTH] + Mds[ty][1] * Nds[1][tx+TILE_WIDTH] +Mds[ty][2] * Nds[2][tx+TILE_WIDTH] + Mds[ty][3] * Nds[3][tx+TILE_WIDTH] +Mds[ty][4] * Nds[4][tx+TILE_WIDTH] + Mds[ty][5] * Nds[5][tx+TILE_WIDTH] +Mds[ty][6] * Nds[6][tx+TILE_WIDTH] + Mds[ty][7] * Nds[7][tx+TILE_WIDTH] +Mds[ty][8] * Nds[8][tx+TILE_WIDTH] + Mds[ty][9] * Nds[9][tx+TILE_WIDTH] +Mds[ty][10] * Nds[10][tx+TILE_WIDTH] + Mds[ty][11] * Nds[11][tx+TILE_WIDTH] +Mds[ty][12] * Nds[12][tx+TILE_WIDTH] + Mds[ty][13] * Nds[13][tx+TILE_WIDTH] +Mds[ty][14] * Nds[14][tx+TILE_WIDTH] + Mds[ty][15] * Nds[15][tx+TILE_WIDTH] +Mds[ty][16] * Nds[16][tx+TILE_WIDTH] + Mds[ty][17] * Nds[17][tx+TILE_WIDTH] +Mds[ty][18] * Nds[18][tx+TILE_WIDTH] + Mds[ty][19] * Nds[19][tx+TILE_WIDTH] +Mds[ty][20] * Nds[20][tx+TILE_WIDTH] + Mds[ty][21] * Nds[21][tx+TILE_WIDTH] +Mds[ty][22] * Nds[22][tx+TILE_WIDTH] + Mds[ty][23] * Nds[23][tx+TILE_WIDTH] +Mds[ty][24] * Nds[24][tx+TILE_WIDTH] + Mds[ty][25] * Nds[25][tx+TILE_WIDTH] +Mds[ty][26] * Nds[26][tx+TILE_WIDTH] + Mds[ty][27] * Nds[27][tx+TILE_WIDTH] +Mds[ty][28] * Nds[28][tx+TILE_WIDTH] + Mds[ty][29] * Nds[29][tx+TILE_WIDTH] +Mds[ty][30] * Nds[30][tx+TILE_WIDTH] + Mds[ty][31] * Nds[31][tx+TILE_WIDTH];__syncthreads();}Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1;Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2;}int main(int argc, char **argv) {int Width(0);bool is_test=false;if(argc==1){Width=512;is_test=false;}else if(argc==2){Width=atoi(argv[1]);is_test=false;}else{Width=atoi(argv[1]);is_test=(bool)atoi(argv[2]);} float *h_A=(float*)malloc(Width*Width*sizeof(float)); float *h_B=(float*)malloc(Width*Width*sizeof(float)); float *h_C=(float*)malloc(Width*Width*sizeof(float)); float *h_C_ref=(float*)malloc(Width*Width*sizeof(float)); float *d_A, *d_B, *d_C; float t0, error_norm=0, ref_norm=0; unsigned int timer1=0; cutCreateTimer(&timer1); cutStartTimer(timer1); for(int i=0; i<Width*Width; i++) { h_A[i]=rand()/(float)RAND_MAX; h_B[i]=rand()/(float)RAND_MAX; } cublasInit(); cublasAlloc(Width*Width, sizeof(float), (void**)&d_A); cublasAlloc(Width*Width, sizeof(float), (void**)&d_B); cublasAlloc(Width*Width, sizeof(float), (void**)&d_C); cublasSetVector(Width*Width, sizeof(float), h_A, 1, d_A, 1); cublasSetVector(Width*Width, sizeof(float), h_B, 1, d_B, 1);if(is_test)MatrixMul(h_A, h_B, h_C_ref,Width);dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH); t0=cutGetTimerValue(timer1); cudaThreadSynchronize();MatrixMulKernel1<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t1=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);if(is_test){error_norm=0;ref_norm=0;for(int i=0; i<Width*Width; i++) {float diff=h_C_ref[i]-h_C[i];error_norm+=diff*diff;ref_norm+=h_C_ref[i]*h_C_ref[i];}printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");} t0=cutGetTimerValue(timer1); cudaThreadSynchronize();MatrixMulKernel2<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t2=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);if(is_test){error_norm=0;ref_norm=0;for(int i=0; i<Width*Width; i++) {float diff=h_C_ref[i]-h_C[i];error_norm+=diff*diff;ref_norm+=h_C_ref[i]*h_C_ref[i];}printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");} t0=cutGetTimerValue(timer1); cudaThreadSynchronize();MatrixMulKernel3<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t3=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);if(is_test){error_norm=0;ref_norm=0;for(int i=0; i<Width*Width; i++) {float diff=h_C_ref[i]-h_C[i];error_norm+=diff*diff;ref_norm+=h_C_ref[i]*h_C_ref[i];}printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");}dimGrid.x/=ThreadColumn; t0=cutGetTimerValue(timer1); cudaThreadSynchronize();MatrixMulKernel4<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t4=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);if(is_test){error_norm=0;ref_norm=0;for(int i=0; i<Width*Width; i++) {float diff=h_C_ref[i]-h_C[i];error_norm+=diff*diff;ref_norm+=h_C_ref[i]*h_C_ref[i];}printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");} t0=cutGetTimerValue(timer1); cudaThreadSynchronize();MatrixMulKernel5<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t5=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);if(is_test){error_norm=0;ref_norm=0;for(int i=0; i<Width*Width; i++) {float diff=h_C_ref[i]-h_C[i];error_norm+=diff*diff;ref_norm+=h_C_ref[i]*h_C_ref[i];}printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");}printf("矩阵阶数为%4d,", Width);printf("简单方法: %.6fs(%.3fGflops),", gpu_t1, 1e-9*Width*Width*Width*2/gpu_t1);printf("块方法: %.6fs(%.3fGflops),", gpu_t2, 1e-9*Width*Width*Width*2/gpu_t2);printf("块+循环展开方法: %.6fs(%.3fGflops),", gpu_t3, 1e-9*Width*Width*Width*2/gpu_t3);printf("块+线程粒度2: %.6fs(%.3fGflops),", gpu_t4, 1e-9*Width*Width*Width*2/gpu_t4);printf("块+循环展开方法+线程粒度2: %.6fs(%.3fGflops)/n", gpu_t5, 1e-9*Width*Width*Width*2/gpu_t5);}

环境：CUDA toolkit3.2+Windows XP+CUDA SDK中的vs2008模板release编译通过，显卡是GeForce GT240。大家可根据自己的情况进行测试，报告一下结果吧.

不同规模的运行结果:
矩阵阶数为 512,简单方法: 0.033887s(7.922Gflops),块方法: 0.008424s(31.865Gflops),块+循环展开方法: 0.003995s(67.191Gflops),块+线程粒度2: 0.008091s(33.179Gflops),块+循环展开方法+线程粒度2: 0.003399s(78.984Gflops)

矩阵阶数为1024,简单方法: 0.264424s(8.121Gflops),块方法: 0.066201s(32.439Gflops),块+循环展开方法: 0.030827s(69.662Gflops),块+线程粒度2: 0.063264s(33.945Gflops),块+循环展开方法+线程粒度2: 0.026282s(81.710Gflops)

矩阵阶数为2048,简单方法: 2.112513s(8.132Gflops),块方法: 0.527002s(32.599Gflops),块+循环展开方法: 0.244058s(70.392Gflops),块+线程粒度2: 0.505495s(33.986Gflops),块+循环展开方法+线程粒度2: 0.208845s(82.261Gflops)

矩阵阶数为4096,简单方法: 17.705070s(7.763Gflops),块方法: 4.205308s(32.682Gflops),块+循环展开方法: 1.966043s(69.906Gflops),块+线程粒度2: 4.059064s(33.860Gflops),块+循环展开方法+线程粒度2: 1.677771s(81.918Gflops)

测试结果表明在块大小16x16时，块方法和循环展开对速度有很显著的影响，而线程粒度的使用对速度只有很小的提高。而块大小是8x8时的情形没有测试，线程粒度对速度应该有较大影响，家可以做一做。

参考文献: David B. Kirk, Wen-mei W. Hwu. 大规模并行处理器编程实战[M]. 北京: 清华大学出版社, 2010.