CUDA笔记3:代码实践
来源:互联网 发布:python .fillna 编辑:程序博客网 时间:2024/06/08 13:42
本文用CUDA实现两矩阵相乘。下图为C=A*B的直观解释,C的每个元素为A对应行向量与B对应列向量的点积。可见,为了计算元素C(i,j),需要访问全局内存A.width + B.height次。为了提升计算效率,矩阵乘法还可以用共享内存实现。首先按照Block大小将A,B,C拆分成子矩阵形式,那么C中一个Block内的所有Thread可以同步访问A,B对应的子区域(从全局内存拷贝到共享内存,共享内存设置成Block区域大小),然后进行子区域的乘积计算。实际上利用共享内存可以将全局访问次数降低为A.width/BLOCK_SIZE + B.height/BLOCK_SIZE.
矩阵相乘原理图
利用共享内存可以提高计算效率
以下Code参照SDK VectorAdd代码示例改下,用来计算两个1024*1024方阵的乘机,并且添加了CPU和GPU计时功能,可以方便对比运算时间。
/** * 两矩阵(方阵)相乘示例, 使用共享内存 比 不使用共享内存 效率高出一倍 */#include <stdio.h>#include <time.h>// For the CUDA runtime routines (prefixed with "cuda_")#include <cuda_runtime.h>/** * CUDA Kernel Device code * * Computes the matrix multiplication of A and B into C. * without using shared memory */__global__ void matrixMul_noSharedM(const float *A, const float *B, float *C, int N){int col = blockDim.x * blockIdx.x + threadIdx.x;int row = blockDim.y * blockIdx.y + threadIdx.y;if (col < N && row < N){C[row*N + col] = 0;for (int i=0; i<N; i++){C[row*N + col] += A[row*N + i] * B[i*N + col];}}}//using shared memory__global__ void matrixMul_SharedM(const float *A, const float *B, float *C, int N){__shared__ float Mds[32][32];__shared__ float Nds[32][32];int col = blockDim.x * blockIdx.x + threadIdx.x;int row = blockDim.y * blockIdx.y + threadIdx.y;if (col < N && row < N){float pValue = 0.0;for (int m=0; m<32; m++){Mds[threadIdx.y][threadIdx.x] = A[row*N + m*32 + threadIdx.x];Nds[threadIdx.y][threadIdx.x] = B[(threadIdx.y+m*32)*N + col];__syncthreads(); //同步同一个block的所有threadfor(int k = 0; k < 32; k++){pValue += Mds[threadIdx.y][k] * Nds[k][threadIdx.x];}__syncthreads();}C[row*N + col] = pValue;}}/** * Host main routine */int main(void){ // Error code to check return values for CUDA calls cudaError_t err = cudaSuccess; // Define the squared matrix, and compute its size int numElements = 32*32; int size = numElements * numElements * sizeof(float);int NN = numElements * numElements; // Allocate the host input vector A float *h_A = new float[size];//(float *)malloc(size); // Allocate the host input vector B float *h_B = new float[size];//(float *)malloc(size); // Allocate the host output vector C float *h_C = new float[size];//(float *)malloc(size); // Verify that allocations succeeded if (h_A == NULL || h_B == NULL || h_C == NULL) { fprintf(stderr, "Failed to allocate host matrix!\n"); exit(EXIT_FAILURE); } // Initialize the host input matrix for (int i = 0; i < NN; i++) { h_A[i] = rand()/(float)RAND_MAX * 10.0; h_B[i] = rand()/(float)RAND_MAX * 10.0; } // Allocate the device input matrix A float *d_A = NULL; err = cudaMalloc((void **)&d_A, size); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device matrix A (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Allocate the device input matrix B float *d_B = NULL; err = cudaMalloc((void **)&d_B, size); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device matrix B (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Allocate the device output matrix C float *d_C = NULL; err = cudaMalloc((void **)&d_C, size); if (err != cudaSuccess) { fprintf(stderr, "Failed to allocate device matrix C (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Copy the host input matrix A and B in host memory to the device input matrix in // device memory // printf("Copy input data from the host memory to the CUDA device\n"); err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy matrix A from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy matrix B from host to device (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Launch the Matrix multiplication CUDA Kernel dim3 dimGrid;dim3 dimBlock(32, 32);dimGrid.x = numElements / dimBlock.x ;dimGrid.y = numElements / dimBlock.y ;cudaEvent_t start, stop; // measure GPU timefloat time;cudaEventCreate(&start);cudaEventCreate(&stop);cudaEventRecord(start, 0); //matrixMul_noSharedM<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, numElements);matrixMul_SharedM<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, numElements);cudaEventRecord(stop, 0);cudaEventSynchronize(stop);cudaEventElapsedTime(&time, start, stop); printf("GPU time: %f \n", time);cudaEventDestroy(start);cudaEventDestroy(stop); err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Copy the device result matrix in device memory to the host result matrix // in host memory. // printf("Copy output data from the CUDA device to the host memory\n"); err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); if (err != cudaSuccess) { fprintf(stderr, "Failed to copy matrix C from device to host (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); }// Compute the groundtruethclock_t start2, finish; // measure CPU time double duration = 0.0; start2 = clock(); float* GT_C = new float[NN];for (int r=0; r<numElements; r++){for (int c=0; c<numElements; c++){GT_C[r*numElements + c] = 0;for (int i=0; i<numElements; i++){GT_C[r*numElements + c] += h_A[r*numElements + i] * h_B[i*numElements + c];}}}finish = clock(); duration = (double)(finish - start2); //输出单位ms printf("CPU time: %f\n", duration);// Verify that the result vector is correctfor (int i = 0; i < NN; ++i){//printf("%d %f %f\n", i, GT_C[i], h_C[i]);if (fabs(GT_C[i] - h_C[i]) > 1e-5){fprintf(stderr, "Result verification failed at element %d!\n", i);exit(EXIT_FAILURE);}}printf("Test PASSED\n"); // Free device global memory err = cudaFree(d_A); if (err != cudaSuccess) { fprintf(stderr, "Failed to free device matrix A (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaFree(d_B); if (err != cudaSuccess) { fprintf(stderr, "Failed to free device matrix B (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } err = cudaFree(d_C); if (err != cudaSuccess) { fprintf(stderr, "Failed to free device matrix C (error code %s)!\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } // Free host memory delete []h_A; delete []h_B; delete []h_C; // Reset the device and exit err = cudaDeviceReset(); if (err != cudaSuccess) { fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err)); exit(EXIT_FAILURE); } printf("Done\n"); return 0;}
参考:http://blog.csdn.net/csgxy123/article/details/10018531
0 0
- CUDA笔记3:代码实践
- CUDA代码笔记 - SimpleMultiGPU
- CUDA代码笔记 - cudaOpenMP
- CUDA编程实践-3
- cuda-convnet代码阅读笔记
- CUDA笔记之一:实践编程篇
- CUDA笔记之一:实践编程篇
- CUDA 学习笔记 3
- CUDA代码笔记(一)SimpleMultiGPU
- CUDA代码笔记(二) cudaOpenMP
- CUDA从入门到精通到精通_笔记3:第一个CUDA程序及其代码详解
- 代码阅读和实践笔记
- CUDA笔记
- cuda 笔记
- CUDA笔记
- CUDA笔记
- cuda笔记
- CUDA 4.0 编程实践
- Java的位运算符详解实例——与(&)、非(~)、或(|)、异或(^)
- SpringMVC 基础教程 框架分析
- UIWebView 中JavaScript 与 Objective-C 通信
- FFMPEG解码流程
- css实现照片堆叠效果
- CUDA笔记3:代码实践
- 对easyuir的datagrid分页属性处理
- GROUP BY 函数和HAVING()函数.
- 自定义标签 tld
- 出现 'use -v to see invocation' 问题的解决办法
- 测试
- Android本地视频播放器开发--SDL编译
- serialVersionUID作用
- MapReduce Design Patterns(2. 最大值、最小值、总数、个数、均值)(二)