CUDA sample源码分析,dct8*8
来源:互联网 发布:sql语句创建视图 编辑:程序博客网 时间:2024/05/23 14:06
本文通过 dct8x8 示例学习共享内存与纹理内存的配合使用方法。
dct8x8.cu
/**
 * Host wrapper for the first CUDA implementation of the 8x8 DCT pipeline.
 *
 * Steps performed, in order:
 *   1. convert the byte plane to float and subtract 128 from every sample,
 *   2. upload it into a cudaArray and bind that array to the texture TexSrc,
 *   3. launch CUDAkernel1DCT BENCHMARK_SIZE times under a timer,
 *   4. launch CUDAkernelQuantizationFloat on the DCT output,
 *   5. copy the quantized coefficients back into the cudaArray and launch
 *      CUDAkernel1IDCT,
 *   6. download the result, add 128 back and convert to bytes in ImgDst.
 *
 * @param ImgSrc  source byte plane (read through CopyByte2Float)
 * @param ImgDst  destination byte plane (written through CopyFloat2Byte)
 * @param Stride  stride handed to the byte<->float helpers for both ImgSrc and ImgDst
 * @param Size    region of interest; Size.width and Size.height give the plane dimensions
 * @return        value of sdkGetAverageTimerValue over the timed forward-DCT launches
 *
 * NOTE(review): the grid is Size.width / BLOCK_SIZE by Size.height / BLOCK_SIZE
 * (integer division), so any right/bottom remainder that is not a multiple of
 * BLOCK_SIZE is not processed by the kernels — confirm callers only pass
 * dimensions that are multiples of BLOCK_SIZE.
 */
float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size)
{
    // Channel format descriptor used when allocating the texture-backed array.
    cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();

    // Device storage: the source is a cudaArray because it is bound to a texture;
    // the destination is ordinary pitched linear memory.
    cudaArray *Src;
    float *Dst;
    size_t DstStride;
    checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height));
    checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride,
                                    Size.width * sizeof(float), Size.height));
    // cudaMallocPitch reports the pitch in bytes; the kernels index Dst in floats.
    DstStride /= sizeof(float);

    // Convert the source image to a float plane and shift samples by -128.
    int ImgSrcFStride;
    float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);
    CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size);
    AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);

    // Upload the float plane from host memory into the cudaArray.
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        ImgSrcF, ImgSrcFStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyHostToDevice));

    // One thread per pixel of a BLOCK_SIZE x BLOCK_SIZE tile, one block per tile.
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);

    // Create and reset the timer used for the forward DCT benchmark.
    StopWatchInterface *timerCUDA = 0;
    sdkCreateTimer(&timerCUDA);
    sdkResetTimer(&timerCUDA);

    // Forward DCT. The kernel receives no source pointer: it samples the image
    // through TexSrc, which is declared outside this function.
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));

    for (int i = 0; i < BENCHMARK_SIZE; i++)
    {
        sdkStartTimer(&timerCUDA);
        CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
        // Synchronize inside the timed region so the timer covers kernel execution.
        checkCudaErrors(cudaDeviceSynchronize());
        sdkStopTimer(&timerCUDA);
    }

    // Report any pending kernel error with the kernel message before issuing
    // further runtime calls.
    getLastCudaError("Kernel execution failed");
    checkCudaErrors(cudaUnbindTexture(TexSrc));

    // Finalize the timer.
    float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA);
    sdkDeleteTimer(&timerCUDA);

    // Quantize the DCT coefficients in place in Dst.
    CUDAkernelQuantizationFloat<<< grid, threads >>>(Dst, (int) DstStride);
    getLastCudaError("Kernel execution failed");

    // Copy the quantized coefficients from device linear memory (Dst) back into
    // the cudaArray so the inverse transform can read them through the texture.
    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,
                                        Dst, DstStride * sizeof(float),
                                        Size.width * sizeof(float), Size.height,
                                        cudaMemcpyDeviceToDevice));

    // Inverse DCT, again sampling its input through TexSrc.
    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));
    CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);
    getLastCudaError("Kernel execution failed");
    checkCudaErrors(cudaUnbindTexture(TexSrc));

    // Download the reconstructed float plane to the host.
    checkCudaErrors(cudaMemcpy2D(ImgSrcF, ImgSrcFStride * sizeof(float),
                                 Dst, DstStride * sizeof(float),
                                 Size.width * sizeof(float), Size.height,
                                 cudaMemcpyDeviceToHost));

    // Undo the -128 shift and convert back to bytes.
    AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);
    CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);

    // Release device and host buffers.
    checkCudaErrors(cudaFreeArray(Src));
    checkCudaErrors(cudaFree(Dst));
    FreePlane(ImgSrcF);

    // Time taken by the forward DCT launches.
    return TimerCUDASpan;
}
NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel1.cuh
/**
 * Forward 8x8 DCT kernel: each thread block transforms one tile of the image
 * sampled through the texture TexSrc and writes the coefficients to Dst.
 *
 * Launch layout: 2D blocks in which threadIdx.x / threadIdx.y select the
 * column / row inside the tile, and blockIdx.x / blockIdx.y select the tile.
 * The index arithmetic mixes `<< BLOCK_SIZE_LOG2` with strides of BLOCK_SIZE,
 * so it assumes blockDim == (BLOCK_SIZE, BLOCK_SIZE) and
 * BLOCK_SIZE == 1 << BLOCK_SIZE_LOG2 — TODO confirm against the definitions.
 *
 * Depends on names declared outside this block: TexSrc (texture),
 * DCTv8matrix (transform coefficients), CurBlockLocal1 / CurBlockLocal2
 * (per-block scratch tiles; the original annotation describes them as
 * shared-memory buffers holding one 8x8 tile each) and the FMUL macro.
 *
 * @param Dst            output coefficient plane
 * @param ImgWidth       row stride of Dst in float elements (the visible host
 *                       wrapper passes the allocation pitch divided by sizeof(float))
 * @param OffsetXBlocks  tile offset added to blockIdx.x (0 in the visible caller)
 * @param OffsetYBlocks  tile offset added to blockIdx.y (0 in the visible caller)
 *
 * NOTE(review): there is no bounds check on the Dst write; the launch grid
 * must not cover tiles that lie outside the allocated plane.
 */
__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks)
{
    // Tile index handled by this thread block.
    const int bx = blockIdx.x + OffsetXBlocks;
    const int by = blockIdx.y + OffsetYBlocks;

    // Position of this thread (and of the coefficient it produces) inside the tile.
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Texture coordinates of this thread's pixel; the shift replaces a multiply
    // by the tile size, and +0.5f addresses the texel centre.
    const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f;
    const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f;

    // Row-major index of this thread's element inside a scratch tile.
    const int localIdx = (ty << BLOCK_SIZE_LOG2) + tx;

    // Stage the tile: every thread fetches one pixel into the first scratch tile.
    CurBlockLocal1[localIdx] = tex2D(TexSrc, tex_x, tex_y);

    // The whole tile must be staged before any thread reads other threads' pixels.
    __syncthreads();

    // First product: (DCTv8matrix^T * A)[ty][tx], stored in the second scratch tile.
    // Both operands are walked down a column, hence the BLOCK_SIZE stride.
    float curelem = 0.0f;
    int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty;
    int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx;
#pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index];
        DCTv8matrixIndex += BLOCK_SIZE;
        CurBlockLocal1Index += BLOCK_SIZE;
    }

    CurBlockLocal2[localIdx] = curelem;

    // The intermediate product must be complete before the second multiplication.
    __syncthreads();

    // Second product: ((DCTv8matrix^T * A) * DCTv8matrix)[ty][tx].
    // The intermediate tile is walked along row ty, the matrix down column tx.
    curelem = 0.0f;
    int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0;
    DCTv8matrixIndex = 0 * BLOCK_SIZE + tx;
#pragma unroll
    for (int i = 0; i < BLOCK_SIZE; i++)
    {
        curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex];
        CurBlockLocal2Index += 1;
        DCTv8matrixIndex += BLOCK_SIZE;
    }

    // Reuse the first scratch tile for the final result; its original contents
    // were last read before the previous barrier, so overwriting is safe here.
    CurBlockLocal1[localIdx] = curelem;

    // Synchronize threads to make sure the matrices are multiplied and the
    // result is stored back in the first block.
    __syncthreads();

    // Write this thread's coefficient to its place in the output plane.
    Dst[FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx)] =
        CurBlockLocal1[localIdx];
}
0 0
- CUDA sample源码分析,dct8*8
- Linux定时器分析-源码sample
- libevent sample--分析及其源码阅读
- 微信 Mars Android Sample 源码分析
- CUDA Sample 求和
- sparkstreaming中通过kafka sample api实现directstream源码分析
- CUDA samples 第三章 sample reference 概况
- Linux mint 18编译CUDA 7.5 Sample
- MultiAnimation Sample分析
- Sample Application分析(1)
- sample.exe 病毒分析
- Ogre sample启动流程分析
- ogre sample分析(一)
- ogre sample分析(二)
- cuda的sdk sample中的一个低级错误
- Corporation of cuda and openGL Texture( Source Code sample)
- CUDA SDK VolumeRender 分析
- CUDA SDK VolumeRender 分析
- 基于LDAP和Sentry的大数据认证和鉴权解决方案--Part One:LDAP集成
- Endnote 基本使用教程
- Wakelock API详解(屏幕常亮操作)
- elasticsearch2.3.3搭建及插件安装
- 重新编译spark源码,使CDH支持spark sql
- CUDA sample源码分析,dct8*8
- 404页面(Bootstrap)
- ui-router,目前见过最全的。特别是route参数
- C++ 面向对象基础
- C# 二叉树的综合操作(二):删除节点
- [Java数据结构]从源码分析HashMap
- phpRedis函数使用总结【分类详细】
- pycharm2016激活
- Linux定时任务—Crontab每秒访问一次网页