炼数成金 CUDA Video Tutorial, Lesson 3 (Part 1): Study Notes
/***
 * gputimer.h: source taken from the 炼数成金 CUDA tutorial
 ***/
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__

#include <cuda_runtime.h>   // cudaEvent_t and the cudaEvent* API

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);   // record the start event on the default stream
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);    // record the stop event on the default stream
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);                  // wait for the stop event to complete
        cudaEventElapsedTime(&elapsed, start, stop); // elapsed time in milliseconds
        return elapsed;
    }
};

#endif  /* __GPU_TIMER_H__ */
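The reduce.cu program below times its kernels with raw cudaEvent_t calls rather than with this struct, so for reference here is a minimal usage sketch for GpuTimer. The kernel my_kernel, its launch configuration, and the pointer d_data are hypothetical placeholders, not part of the tutorial code.

#include <stdio.h>
#include "gputimer.h"

// Hypothetical kernel, for illustration only: doubles each element.
__global__ void my_kernel(float * d_data)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    d_data[i] *= 2.0f;
}

// Times one launch of the hypothetical kernel; d_data is assumed to hold
// at least 64 * 256 floats in device memory.
void timed_launch(float * d_data)
{
    GpuTimer timer;
    timer.Start();                      // record the start event on stream 0
    my_kernel<<<64, 256>>>(d_data);     // kernel launches are asynchronous
    timer.Stop();                       // record the stop event on stream 0
    // Elapsed() synchronizes on the stop event and returns milliseconds.
    printf("kernel took %f ms\n", timer.Elapsed());
}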
////////////////////////////////////////////////////////////////////////////////////
/****
 * reduce.cu: source taken from the 炼数成金 CUDA tutorial
 ***/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__global__ void global_reduce_kernel(float * d_out, float * d_in)
{
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // do reduction in global mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            d_in[myId] += d_in[myId + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = d_in[myId];
    }
}

__global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
{
    // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
    extern __shared__ float sdata[];

    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    // load shared mem from global mem
    sdata[tid] = d_in[myId];
    __syncthreads();            // make sure entire block is loaded!

    // do reduction in shared mem
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();        // make sure all adds at one stage are done!
    }

    // only thread 0 writes result for this block back to global mem
    if (tid == 0)
    {
        d_out[blockIdx.x] = sdata[0];
    }
}

void reduce(float * d_out, float * d_intermediate, float * d_in,
            int size, bool usesSharedMemory)
{
    // assumes that size is not greater than maxThreadsPerBlock^2
    // and that size is a multiple of maxThreadsPerBlock
    const int maxThreadsPerBlock = 1024;
    int threads = maxThreadsPerBlock;
    int blocks = size / maxThreadsPerBlock;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_intermediate, d_in);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_intermediate, d_in);
    }

    // now we're down to one block left, so reduce it
    threads = blocks;           // launch one thread for each block in prev step
    blocks = 1;
    if (usesSharedMemory)
    {
        shmem_reduce_kernel<<<blocks, threads, threads * sizeof(float)>>>
            (d_out, d_intermediate);
    }
    else
    {
        global_reduce_kernel<<<blocks, threads>>>
            (d_out, d_intermediate);
    }
}

int main(int argc, char **argv)
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        fprintf(stderr, "error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }
    int dev = 0;
    cudaSetDevice(dev);

    cudaDeviceProp devProps;
    if (cudaGetDeviceProperties(&devProps, dev) == 0)
    {
        printf("Using device %d:\n", dev);
        printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
               devProps.name, (int)devProps.totalGlobalMem,
               (int)devProps.major, (int)devProps.minor,
               (int)devProps.clockRate);
    }

    const int ARRAY_SIZE  = 1 << 20;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    float sum = 0.0f;
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        // generate random float in [-1.0f, 1.0f]
        h_in[i] = -1.0f + (float)random() / ((float)RAND_MAX / 2.0f);
        sum += h_in[i];
    }

    // declare GPU memory pointers
    float * d_in, * d_intermediate, * d_out;

    // allocate GPU memory
    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_intermediate, ARRAY_BYTES);     // overallocated
    cudaMalloc((void **) &d_out, sizeof(float));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    int whichKernel = 0;
    if (argc == 2)
    {
        whichKernel = atoi(argv[1]);
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // launch the kernel
    switch (whichKernel)
    {
    case 0:
        printf("Running global reduce\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, false);
        }
        cudaEventRecord(stop, 0);
        break;
    case 1:
        printf("Running reduce with shared mem\n");
        cudaEventRecord(start, 0);
        for (int i = 0; i < 100; i++)
        {
            reduce(d_out, d_intermediate, d_in, ARRAY_SIZE, true);
        }
        cudaEventRecord(stop, 0);
        break;
    default:
        fprintf(stderr, "error: ran no kernel\n");
        exit(EXIT_FAILURE);
    }
    cudaEventSynchronize(stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    elapsedTime /= 100.0f;      // 100 trials

    // copy back the sum from GPU
    float h_out;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    printf("average time elapsed: %f\n", elapsedTime);

    // free GPU memory allocation
    cudaFree(d_in);
    cudaFree(d_intermediate);
    cudaFree(d_out);

    return 0;
}
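The program can typically be built with something like nvcc -o reduce reduce.cu and run as ./reduce 0 (global-memory kernel) or ./reduce 1 (shared-memory kernel). Note that main() accumulates a host-side reference value sum but never compares it with the result copied back into h_out; a hedged sketch of such a check follows. The helper name check_result is hypothetical, and exact agreement should not be expected because the GPU adds the values in a different order than the sequential host loop. Also, since global_reduce_kernel reduces d_in in place, only the shared-memory path (whose input is read-only) still reproduces the original sum after the 100 timing repetitions.

#include <stdio.h>
#include <math.h>

// Hypothetical helper: compare the host-computed sum with the device result.
// Could be called as check_result(sum, h_out) right after the final cudaMemcpy in main().
void check_result(float host_sum, float device_sum)
{
    printf("host sum: %f, device sum: %f, abs diff: %f\n",
           host_sum, device_sum, fabsf(host_sum - device_sum));
}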