Heterogeneous Parallel Programming: Notes (2)
On the device side, the thread hierarchy is Grid -> Block -> Thread.
Execution: each Streaming Multiprocessor (SM) can keep at most 8 blocks or 1536 threads resident at once (these are Fermi-era limits; newer architectures raise them). For example, with 256-thread blocks an SM can hold 1536 / 256 = 6 blocks, so the thread limit, not the 8-block limit, is the binding constraint.
A grid contains multiple blocks; across the grid, execution follows the SPMD (Single Program, Multiple Data) model, so blocks run the same program independently and may be scheduled in any order.
A block contains multiple threads and is partitioned into warps for execution; each warp holds 32 threads that advance in SIMD lock-step, as the sketch below illustrates.
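A minimal sketch (not part of the original assignment) of how the three levels show up inside a kernel; the kernel name whereAmI and the launch shape are illustrative only:

#include <cstdio>
#include <cuda_runtime.h>

// Each thread derives its position in the Grid -> Block -> Thread hierarchy
// and its warp/lane within the block.
__global__ void whereAmI(int *out, int n) {
  int globalIdx = blockIdx.x * blockDim.x + threadIdx.x; // unique across the whole grid
  int warpId = threadIdx.x / 32;                         // warp index within the block
  int laneId = threadIdx.x % 32;                         // lane within the 32-thread warp
  if (globalIdx < n)
    out[globalIdx] = warpId * 32 + laneId;               // equals threadIdx.x by construction
}

int main() {
  const int n = 128;
  int *d_out;
  cudaMalloc(&d_out, n * sizeof(int));
  whereAmI<<<(n + 63) / 64, 64>>>(d_out, n); // 2 blocks of 64 threads = 2 warps each
  cudaDeviceSynchronize();
  int h_out[n];
  cudaMemcpy(h_out, d_out, n * sizeof(int), cudaMemcpyDeviceToHost);
  printf("thread 70: %d\n", h_out[70]); // block 1, warp 0, lane 6 -> prints 6
  cudaFree(d_out);
  return 0;
}

The 32 threads of one warp execute the same instruction at the same time; divergent branches within a warp are serialized, which is why the SIMD label applies at the warp level rather than the block level.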
Code example: matrix-matrix multiplication (naive version, global memory only):
#include <wb.h>

#define wbCheck(stmt)                                                  \
  do {                                                                 \
    cudaError_t err = stmt;                                            \
    if (err != cudaSuccess) {                                          \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                      \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));    \
      return -1;                                                       \
    }                                                                  \
  } while (0)

// Compute C = A * B
__global__ void matrixMultiply(float *A, float *B, float *C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  int Row = blockIdx.y * blockDim.y + threadIdx.y;
  int Col = blockIdx.x * blockDim.x + threadIdx.x;
  if ((Row < numCRows) && (Col < numCColumns)) {
    float Cvalue = 0.0;
    for (int i = 0; i < numAColumns; ++i)
      Cvalue += A[Row * numAColumns + i] * B[Col + i * numCColumns];
    C[Row * numCColumns + Col] = Cvalue;
  }
}

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostA; // The A matrix
  float *hostB; // The B matrix
  float *hostC; // The output C matrix
  float *deviceA;
  float *deviceB;
  float *deviceC;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;    // number of rows in the matrix C (you have to set this)
  int numCColumns; // number of columns in the matrix C (you have to set this)

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
  hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
  //@@ Set numCRows and numCColumns
  numCRows = numARows;
  numCColumns = numBColumns;
  //@@ Allocate the hostC matrix
  hostC = (float *)malloc(numCRows * numCColumns * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
  wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

  wbTime_start(GPU, "Allocating GPU memory.");
  //@@ Allocate GPU memory here
  wbCheck(cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float)));
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  //@@ Copy memory to the GPU here
  wbCheck(cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  int TILE_WIDTH = 16;
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
  dim3 dimGrid((numBColumns - 1) / TILE_WIDTH + 1, (numARows - 1) / TILE_WIDTH + 1, 1);

  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Launch the GPU Kernel here
  matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                        numARows, numAColumns,
                                        numBRows, numBColumns,
                                        numCRows, numCColumns);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  //@@ Copy the GPU memory back to the CPU here
  wbCheck(cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float),
                     cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  //@@ Free the GPU memory here
  wbCheck(cudaFree(deviceA));
  wbCheck(cudaFree(deviceB));
  wbCheck(cudaFree(deviceC));
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostC, numCRows, numCColumns);

  free(hostA);
  free(hostB);
  free(hostC);

  return 0;
}
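Why optimize? In the naive kernel, every inner-loop iteration loads two floats (8 bytes) from global memory and performs two floating-point operations (one multiply, one add), a compute-to-global-memory-access ratio of only 0.25 FLOP/byte, so the kernel is memory-bound. The shared-memory version below raises that ratio by staging TILE_WIDTH x TILE_WIDTH tiles of A and B in on-chip shared memory and reusing each loaded element TILE_WIDTH times.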
Optimized version (shared-memory tiling):
#include <wb.h>

#define wbCheck(stmt)                                                  \
  do {                                                                 \
    cudaError_t err = stmt;                                            \
    if (err != cudaSuccess) {                                          \
      wbLog(ERROR, "Failed to run stmt ", #stmt);                      \
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));    \
      return -1;                                                       \
    }                                                                  \
  } while (0)

#define TILE_WIDTH 16

// Compute C = A * B
__global__ void matrixMultiplyShared(float *A, float *B, float *C,
                                     int numARows, int numAColumns,
                                     int numBRows, int numBColumns,
                                     int numCRows, int numCColumns) {
  //@@ Insert code to implement matrix multiplication here
  //@@ You have to use shared memory for this MP
  __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
  __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];
  int bx = blockIdx.x;
  int by = blockIdx.y;
  int tx = threadIdx.x;
  int ty = threadIdx.y;
  int Row = by * blockDim.y + ty;
  int Col = bx * blockDim.x + tx;
  float Cvalue = 0;
  for (int t = 0; t < (numAColumns - 1) / TILE_WIDTH + 1; ++t) {
    if ((Row < numCRows) && ((t * TILE_WIDTH + tx) < numAColumns)) {
      ds_A[ty][tx] = A[Row * numAColumns + t * TILE_WIDTH + tx];
    } else {
      ds_A[ty][tx] = 0.0;
    }
    if (((t * TILE_WIDTH + ty) < numAColumns) && (Col < numCColumns)) {
      ds_B[ty][tx] = B[(t * TILE_WIDTH + ty) * numCColumns + Col];
    } else {
      ds_B[ty][tx] = 0.0;
    }
    __syncthreads();
    for (int i = 0; i < TILE_WIDTH; ++i) {
      Cvalue += ds_A[ty][i] * ds_B[i][tx];
    }
    __syncthreads();
  }
  if (Row < numCRows && Col < numCColumns) {
    C[Row * numCColumns + Col] = Cvalue;
  }
}

int main(int argc, char **argv) {
  wbArg_t args;
  float *hostA; // The A matrix
  float *hostB; // The B matrix
  float *hostC; // The output C matrix
  float *deviceA;
  float *deviceB;
  float *deviceC;
  int numARows;    // number of rows in the matrix A
  int numAColumns; // number of columns in the matrix A
  int numBRows;    // number of rows in the matrix B
  int numBColumns; // number of columns in the matrix B
  int numCRows;    // number of rows in the matrix C (you have to set this)
  int numCColumns; // number of columns in the matrix C (you have to set this)

  args = wbArg_read(argc, argv);

  wbTime_start(Generic, "Importing data and creating memory on host");
  hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
  hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
  //@@ Set numCRows and numCColumns
  numCRows = numARows;
  numCColumns = numBColumns;
  //@@ Allocate the hostC matrix
  hostC = (float *)malloc(numCRows * numCColumns * sizeof(float));
  wbTime_stop(Generic, "Importing data and creating memory on host");

  wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
  wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

  wbTime_start(GPU, "Allocating GPU memory.");
  //@@ Allocate GPU memory here
  wbCheck(cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float)));
  wbCheck(cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float)));
  wbTime_stop(GPU, "Allocating GPU memory.");

  wbTime_start(GPU, "Copying input memory to the GPU.");
  //@@ Copy memory to the GPU here
  wbCheck(cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbCheck(cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float),
                     cudaMemcpyHostToDevice));
  wbTime_stop(GPU, "Copying input memory to the GPU.");

  //@@ Initialize the grid and block dimensions here
  dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
  dim3 dimGrid((numBColumns - 1) / TILE_WIDTH + 1, (numARows - 1) / TILE_WIDTH + 1, 1);

  wbTime_start(Compute, "Performing CUDA computation");
  //@@ Launch the GPU Kernel here
  matrixMultiplyShared<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                              numARows, numAColumns,
                                              numBRows, numBColumns,
                                              numCRows, numCColumns);
  cudaDeviceSynchronize();
  wbTime_stop(Compute, "Performing CUDA computation");

  wbTime_start(Copy, "Copying output memory to the CPU");
  //@@ Copy the GPU memory back to the CPU here
  wbCheck(cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float),
                     cudaMemcpyDeviceToHost));
  wbTime_stop(Copy, "Copying output memory to the CPU");

  wbTime_start(GPU, "Freeing GPU Memory");
  //@@ Free the GPU memory here
  wbCheck(cudaFree(deviceA));
  wbCheck(cudaFree(deviceB));
  wbCheck(cudaFree(deviceC));
  wbTime_stop(GPU, "Freeing GPU Memory");

  wbSolution(args, hostC, numCRows, numCColumns);

  free(hostA);
  free(hostB);
  free(hostC);

  return 0;
}
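With TILE_WIDTH = 16, each block uses 2 x 16 x 16 x 4 B = 2 KB of shared memory for ds_A and ds_B, well within typical per-block limits (48 KB on Fermi-class hardware), and each global-memory element is reused 16 times, cutting global traffic by roughly a factor of TILE_WIDTH. The two __syncthreads() barriers matter: the first guarantees a tile is fully loaded before any thread reads it; the second keeps fast threads from overwriting a tile that slower threads are still using. As a quick sanity check (a standalone sketch, not part of the assignment), the tile footprint can be compared against the limit the runtime reports:

#include <cstdio>
#include <cuda_runtime.h>

#define TILE_WIDTH 16

int main() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  // Two TILE_WIDTH x TILE_WIDTH float tiles (ds_A and ds_B) per block:
  size_t tileBytes = 2 * TILE_WIDTH * TILE_WIDTH * sizeof(float); // 2 KB for TILE_WIDTH = 16
  printf("Tile footprint: %zu B, shared memory per block: %zu B\n",
         tileBytes, prop.sharedMemPerBlock);
  return 0;
}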