OpenCV GPU模块+NPP

来源：互联网　发布：one note mac　编辑：程序博客网　时间：2024/05/16 15:46

　　NVIDIA Performance Primitives（NPP）是一系列GPU加速的图像、视频以及信号处理函数，与同级别的纯CPU函数相比，这些函数最高可实现5～10倍的性能提升。借助NPP提供的2000多个图像处理与信号处理基元，开发者在数小时之内即可实现应用程序的大幅性能提升。
　　无论用GPU加速的版本代替CPU版本还是将NPP与现有的GPU加速流水线相结合，NPP都能够实现极高的性能，同时可缩短开发时间。

　　由于NPP例程中与图像相关的库使用的是FreeImage，这里改用OpenCV 2.4中的函数来读取和显示图像。
　　就/7_CUDALibraries/boxFilterNPP的例程来说，有如下几种方法。

使用GpuMat

#include "cuda_runtime.h"
#include <iostream>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/opencv.hpp"
#include <npp.h>

using namespace std;
using namespace cv;

// Box-filter demo using cv::gpu::GpuMat for device storage and NPP for the
// filter itself. Loads "../zero.jpg", runs a 5x5 box filter on the GPU and
// shows the input and the filtered output in two windows.
// Returns 0 on success, 1 if the image cannot be loaded, is smaller than the
// filter mask, or the NPP call reports an error.
int main()
{
    // Read the image into host memory (flag 1 = load as a 3-channel colour image).
    Mat hSrc = imread("../zero.jpg", 1);
    if (hSrc.empty())
    {
        // imread signals failure by returning an empty matrix; uploading and
        // filtering it would hand null pointers to NPP.
        cerr << "Failed to load image ../zero.jpg" << endl;
        return 1;
    }
    imshow("Input", hSrc);

    // Upload the input image to the device.
    gpu::GpuMat dSrc;
    dSrc.upload(hSrc);

    // In most cases GpuMat::isContinuous() == false, i.e. dSrc.step is usually
    // not equal to cols * channels.
    cout << "Image on host memory, step = " << hSrc.step << "  isContinuous? " << hSrc.isContinuous() << endl;
    cout << "Image on device memory, step = " << dSrc.step << "  isContinuous? " << dSrc.isContinuous() << endl;

    // Filter parameters.
    // Box-filter mask size.
    NppiSize oMaskSize = {5, 5};
    // ROI size for the current mask: only positions where the whole mask fits
    // inside the source image are computed.
    NppiSize oSizeROI = {hSrc.cols - oMaskSize.width + 1, hSrc.rows - oMaskSize.height + 1};
    if (oSizeROI.width <= 0 || oSizeROI.height <= 0)
    {
        cerr << "Image is smaller than the filter mask" << endl;
        return 1;
    }
    // Anchor point inside the mask set to (0, 0).
    NppiPoint oAnchor = {0, 0};

    // Allocate the output image on the host and on the device.
    Size oSize(oSizeROI.width, oSizeROI.height);
    Mat hDst = Mat(oSize, hSrc.type());
    gpu::GpuMat dDst(oSize, hSrc.type());

    // Run the box filter. NPP takes the row pitch in bytes as an int, while
    // GpuMat::step is a size_t, hence the explicit casts.
    NppStatus status = nppiFilterBox_8u_C3R(dSrc.data, static_cast<int>(dSrc.step),
                                            dDst.data, static_cast<int>(dDst.step),
                                            oSizeROI, oMaskSize, oAnchor);
    if (status < NPP_SUCCESS)
    {
        // Negative NPP status codes are errors (positive ones are warnings).
        cerr << "nppiFilterBox_8u_C3R failed with status " << static_cast<int>(status) << endl;
        return 1;
    }

    // Copy the result back from device memory.
    dDst.download(hDst);

    // Show the result and wait for a key press.
    imshow("Output", hDst);
    cvWaitKey();
    return 0;
}

使用Mat + Cuda

#include "cuda_runtime.h"
#include <cstdlib>
#include <iostream>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/opencv.hpp"
#include <npp.h>

using namespace std;
using namespace cv;

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the call for the error message.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        cerr << what << " failed: " << cudaGetErrorString(err) << endl;
        exit(EXIT_FAILURE);
    }
}

// Box-filter demo using cv::Mat on the host and raw CUDA allocations on the
// device. Loads "../zero.jpg", runs a 5x5 NPP box filter on the GPU and shows
// the input and the filtered output in two windows.
// Returns 0 on success, 1 if the image cannot be loaded, is smaller than the
// filter mask, or the NPP call reports an error; exits with EXIT_FAILURE if a
// CUDA runtime call fails.
int main()
{
    // Read the image into host memory (flag 1 = load as a 3-channel colour image).
    Mat hSrc = imread("../zero.jpg", 1);
    if (hSrc.empty())
    {
        // imread signals failure by returning an empty matrix.
        cerr << "Failed to load image ../zero.jpg" << endl;
        return 1;
    }
    imshow("Input", hSrc);

    // Filter parameters.
    // Box-filter mask size.
    NppiSize oMaskSize = {5, 5};
    // ROI size for the current mask: only positions where the whole mask fits
    // inside the source image are computed.
    NppiSize oSizeROI = {hSrc.cols - oMaskSize.width + 1, hSrc.rows - oMaskSize.height + 1};
    if (oSizeROI.width <= 0 || oSizeROI.height <= 0)
    {
        cerr << "Image is smaller than the filter mask" << endl;
        return 1;
    }
    // Anchor point inside the mask set to (0, 0).
    NppiPoint oAnchor = {0, 0};

    // Output image has the size of the ROI.
    Size oSize(oSizeROI.width, oSizeROI.height);
    Mat hDst = Mat(oSize, hSrc.type());

    // Allocate device memory. Sizes are step * rows so the device buffers
    // mirror the host row layout; computed as size_t to avoid int overflow.
    unsigned char *dSrc = NULL, *dDst = NULL;
    const size_t bytes = hSrc.step * static_cast<size_t>(hSrc.rows);
    const size_t obytes = hDst.step * static_cast<size_t>(hDst.rows);
    checkCuda(cudaMalloc<unsigned char>(&dSrc, bytes), "cudaMalloc(dSrc)");
    checkCuda(cudaMalloc<unsigned char>(&dDst, obytes), "cudaMalloc(dDst)");

    // Copy the image data from the Mat to the device pointer.
    checkCuda(cudaMemcpy(dSrc, hSrc.data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // Run the box filter. NPP takes the row pitch in bytes as an int, while
    // Mat::step is a size_t, hence the explicit casts.
    NppStatus status = nppiFilterBox_8u_C3R(dSrc, static_cast<int>(hSrc.step),
                                            dDst, static_cast<int>(hDst.step),
                                            oSizeROI, oMaskSize, oAnchor);
    if (status < NPP_SUCCESS)
    {
        // Negative NPP status codes are errors (positive ones are warnings).
        cerr << "nppiFilterBox_8u_C3R failed with status " << static_cast<int>(status) << endl;
        cudaFree(dSrc);
        cudaFree(dDst);
        return 1;
    }

    // Copy the result back from the device to the Mat.
    checkCuda(cudaMemcpy(hDst.data, dDst, obytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    imshow("Output", hDst);
    cvWaitKey();

    checkCuda(cudaFree(dSrc), "cudaFree(dSrc)");
    checkCuda(cudaFree(dDst), "cudaFree(dDst)");
    return 0;
}

使用IplImage + Cuda

#include "cuda_runtime.h"
#include <cstdlib>
#include <iostream>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/opencv.hpp"
#include <npp.h>

using namespace std;

// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the call for the error message.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        cerr << what << " failed: " << cudaGetErrorString(err) << endl;
        exit(EXIT_FAILURE);
    }
}

// Box-filter demo using the C-API IplImage on the host and raw CUDA
// allocations on the device. Loads "../zero.jpg", runs a 5x5 NPP box filter on
// the GPU and shows the input and the filtered output in two windows.
// Returns 0 on success, 1 if an image cannot be loaded/created, the image is
// smaller than the filter mask, or the NPP call reports an error; exits with
// EXIT_FAILURE if a CUDA runtime call fails.
int main()
{
    // Read the image into host memory (flag 1 = load as a 3-channel colour image).
    IplImage* hSrc = cvLoadImage("../zero.jpg", 1);
    if (hSrc == NULL)
    {
        // hSrc is dereferenced below, so a failed load must stop here.
        cerr << "Failed to load image ../zero.jpg" << endl;
        return 1;
    }
    cvShowImage("Input", hSrc);

    // Filter parameters.
    // Box-filter mask size.
    NppiSize oMaskSize = {5, 5};
    // ROI size for the current mask: only positions where the whole mask fits
    // inside the source image are computed.
    NppiSize oSizeROI = {hSrc->width - oMaskSize.width + 1, hSrc->height - oMaskSize.height + 1};
    if (oSizeROI.width <= 0 || oSizeROI.height <= 0)
    {
        cerr << "Image is smaller than the filter mask" << endl;
        cvReleaseImage(&hSrc);
        return 1;
    }
    // Anchor point inside the mask set to (0, 0).
    NppiPoint oAnchor = {0, 0};

    // Output image has the size of the ROI and the depth/channels of the input.
    CvSize oSize = cvSize(oSizeROI.width, oSizeROI.height);
    IplImage* hDst = cvCreateImage(oSize, hSrc->depth, hSrc->nChannels);
    if (hDst == NULL)
    {
        cerr << "Failed to allocate the output image" << endl;
        cvReleaseImage(&hSrc);
        return 1;
    }

    // Allocate device memory; imageSize is the byte size of the whole image buffer.
    unsigned char *dSrc = NULL, *dDst = NULL;
    const int bytes = hSrc->imageSize;
    const int obytes = hDst->imageSize;
    checkCuda(cudaMalloc<unsigned char>(&dSrc, bytes), "cudaMalloc(dSrc)");
    checkCuda(cudaMalloc<unsigned char>(&dDst, obytes), "cudaMalloc(dDst)");

    cout << "step = cols * channels + padding: " << endl << hSrc->widthStep << "=" << hSrc->width << "*" << hSrc->nChannels << endl;
    cout << "bytes = step * rows: " << endl << bytes << "=" << hSrc->widthStep << "*" << hSrc->height << endl;

    // Copy the image data from the IplImage to the device pointer.
    checkCuda(cudaMemcpy(dSrc, hSrc->imageData, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // Run the box filter; widthStep is passed as the row pitch of each image.
    NppStatus status = nppiFilterBox_8u_C3R(dSrc, hSrc->widthStep, dDst, hDst->widthStep, oSizeROI, oMaskSize, oAnchor);
    if (status < NPP_SUCCESS)
    {
        // Negative NPP status codes are errors (positive ones are warnings).
        cerr << "nppiFilterBox_8u_C3R failed with status " << static_cast<int>(status) << endl;
        cudaFree(dSrc);
        cudaFree(dDst);
        cvReleaseImage(&hSrc);
        cvReleaseImage(&hDst);
        return 1;
    }

    // Copy the result back from the device to the IplImage.
    checkCuda(cudaMemcpy(hDst->imageData, dDst, obytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    cvShowImage("Output", hDst);
    cvWaitKey();

    checkCuda(cudaFree(dSrc), "cudaFree(dSrc)");
    checkCuda(cudaFree(dDst), "cudaFree(dDst)");
    cvReleaseImage(&hSrc);
    cvReleaseImage(&hDst);
    return 0;
}

备注:
1.GpuMat大部分情况下是不连续的，即GpuMat::isContinuous() == false：每一行末尾带有用于对齐的填充字节，这样可以提高访存效率。而nppiFilterBox_8u_C3R()函数对于输入图像只要求提供pSrc和nSrcStep，即图像数据指针和图像一行所占用的内存大小（通常是cols * channels + padding），因此可以处理连续或者非连续的数据。
2.如果需要在设备端创建连续的内存空间，可以使用createContinuous()。

    // Create a device matrix backed by continuous memory with the same shape
    // and type as the host image, then upload the host image into it.
    gpu::GpuMat dSrc;
    gpu::createContinuous(hSrc.rows, hSrc.cols, hSrc.type(), dSrc);
    dSrc.upload(hSrc);

3.这是使用GpuMat方法的输出。

0 0