CUDA sample源码分析，dct8*8

来源：互联网发布：sql语句创建视图编辑：程序博客网时间：2024/05/23 14:06

学习共享内存+纹理内存的使用方法

dct8*8.cu

float WrapperCUDA1(byte *ImgSrc, byte *ImgDst, int Stride, ROI Size){    //prepare channel format descriptor for passing texture into kernels    cudaChannelFormatDesc floattex = cudaCreateChannelDesc<float>();    //allocate device memory    cudaArray *Src;   //因为需要绑定纹理，所以定义为cudaAarry    float *Dst;        size_t DstStride;    checkCudaErrors(cudaMallocArray(&Src, &floattex, Size.width, Size.height));    checkCudaErrors(cudaMallocPitch((void **)(&Dst), &DstStride, Size.width * sizeof(float), Size.height));   //对齐操作    DstStride /= sizeof(float);    //convert source image to float representation    int ImgSrcFStride;    float *ImgSrcF = MallocPlaneFloat(Size.width, Size.height, &ImgSrcFStride);   //对齐操作    CopyByte2Float(ImgSrc, Stride, ImgSrcF, ImgSrcFStride, Size); //整型转浮点    AddFloatPlane(-128.0f, ImgSrcF, ImgSrcFStride, Size);    //copy from host memory to device    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,                                        ImgSrcF, ImgSrcFStride * sizeof(float),                                        Size.width * sizeof(float), Size.height,                                        cudaMemcpyHostToDevice));//浮点数据拷贝至cudaArray    //setup execution parameters    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);  //8*8的邻域    dim3 grid(Size.width / BLOCK_SIZE, Size.height / BLOCK_SIZE);    //create and start CUDA timer    StopWatchInterface *timerCUDA = 0;    sdkCreateTimer(&timerCUDA);    sdkResetTimer(&timerCUDA);    //execute DCT kernel and benchmark    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));      //！绑定纹理，实现对邻域内像素的快速访问。因为邻域内不同行像素是非连续内存，属于分散式的访问模式，如果通过全局内存访问将无法触发合并访问，需要耗费较多的访存指令    for (int i=0; i<BENCHMARK_SIZE; i++)  //执行10次，测试平均耗时    {        sdkStartTimer(&timerCUDA);        CUDAkernel1DCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0); //kernel函数，没有输入，因为通过纹理，纹理定义为全局变量        checkCudaErrors(cudaDeviceSynchronize());        sdkStopTimer(&timerCUDA);    }    checkCudaErrors(cudaUnbindTexture(TexSrc));    getLastCudaError("Kernel execution failed");    // finalize CUDA timer    float TimerCUDASpan = sdkGetAverageTimerValue(&timerCUDA);    sdkDeleteTimer(&timerCUDA);    // execute Quantization kernel    CUDAkernelQuantizationFloat<<< grid, threads >>>(Dst, (int) DstStride);    getLastCudaError("Kernel execution failed");    //copy quantized coefficients from host memory to device array    checkCudaErrors(cudaMemcpy2DToArray(Src, 0, 0,                                        Dst, DstStride *sizeof(float),                                        Size.width *sizeof(float), Size.height,                                        cudaMemcpyDeviceToDevice));    // execute IDCT kernel    checkCudaErrors(cudaBindTextureToArray(TexSrc, Src));    CUDAkernel1IDCT<<< grid, threads >>>(Dst, (int) DstStride, 0, 0);    checkCudaErrors(cudaUnbindTexture(TexSrc));    getLastCudaError("Kernel execution failed");    //copy quantized image block to host    checkCudaErrors(cudaMemcpy2D(ImgSrcF, ImgSrcFStride *sizeof(float),                                 Dst, DstStride *sizeof(float),                                 Size.width *sizeof(float), Size.height,                                 cudaMemcpyDeviceToHost));    //convert image back to byte representation    AddFloatPlane(128.0f, ImgSrcF, ImgSrcFStride, Size);    CopyFloat2Byte(ImgSrcF, ImgSrcFStride, ImgDst, Stride, Size);    //clean up memory    checkCudaErrors(cudaFreeArray(Src));    checkCudaErrors(cudaFree(Dst));    FreePlane(ImgSrcF);    //return time taken by the operation    return TimerCUDASpan;}

NVIDIA Corporation\CUDA Samples\v8.0\3_Imaging\dct8x8\dct8x8_kernel1.cuh

__global__ void CUDAkernel1DCT(float *Dst, int ImgWidth, int OffsetXBlocks, int OffsetYBlocks){    // Block index    const int bx = blockIdx.x + OffsetXBlocks;  //OffsetXBlocks为零    const int by = blockIdx.y + OffsetYBlocks;    // Thread index (current coefficient)    const int tx = threadIdx.x;    const int ty = threadIdx.y;    // Texture coordinates    const float tex_x = (float)((bx << BLOCK_SIZE_LOG2) + tx) + 0.5f;    const float tex_y = (float)((by << BLOCK_SIZE_LOG2) + ty) + 0.5f;//！计算x、y方向下标时，通过左移3位代替乘8//！纹理可以访问浮点值下标    //copy current image pixel to the first block    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx ] = tex2D(TexSrc, tex_x, tex_y);       //！CurBlockLocal1为共享内存，大小为8*8，刚好存放一个邻域内的像素    //！2维纹理对随机访问有优化    //synchronize threads to make sure the block is copied    __syncthreads();   //等待 获取完邻域内所有像素之后，才继续往下执行    //calculate the multiplication of DCTv8matrixT * A and place it in the second block    float curelem = 0;    int DCTv8matrixIndex = 0 * BLOCK_SIZE + ty;    int CurBlockLocal1Index = 0 * BLOCK_SIZE + tx;#pragma unroll   //循环展开，cuda编译器优化项    for (int i=0; i<BLOCK_SIZE; i++)    {        curelem += DCTv8matrix[DCTv8matrixIndex] * CurBlockLocal1[CurBlockLocal1Index];  //DCT模板运算        DCTv8matrixIndex += BLOCK_SIZE;          CurBlockLocal1Index += BLOCK_SIZE; //邻域内下一个像素的索引    }    CurBlockLocal2[(ty << BLOCK_SIZE_LOG2) + tx ] = curelem;  //运算结果    //synchronize threads to make sure the first 2 matrices are multiplied and the result is stored in the second block    __syncthreads();  //等待邻域内所有像素都运算完毕后，继续往下执行    //calculate the multiplication of (DCTv8matrixT * A) * DCTv8matrix and place it in the first block    curelem = 0;    int CurBlockLocal2Index = (ty << BLOCK_SIZE_LOG2) + 0;    DCTv8matrixIndex = 0 * BLOCK_SIZE + tx;#pragma unroll    for (int i=0; i<BLOCK_SIZE; i++)    {        curelem += CurBlockLocal2[CurBlockLocal2Index] * DCTv8matrix[DCTv8matrixIndex];        CurBlockLocal2Index += 1;        DCTv8matrixIndex += BLOCK_SIZE;    }    CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx ] = curelem;      //！充分利用共享内存CurBlockLocal1，将第一次运算的输入作为第二次运算的输出    //synchronize threads to make sure the matrices are multiplied and the result is stored back in the first block    __syncthreads();    //copy current coefficient to its place in the result array    Dst[ FMUL(((by << BLOCK_SIZE_LOG2) + ty), ImgWidth) + ((bx << BLOCK_SIZE_LOG2) + tx) ] = CurBlockLocal1[(ty << BLOCK_SIZE_LOG2) + tx ];}

0 0