pcl+cuda基本用法

来源：互联网发布：sql sever外键约束编辑：程序博客网时间：2024/06/05 17:56

转载自http://www.cnblogs.com/zipeilu/p/6117304.html

PCL+CUDA编程（一）

点云的操作对运算资源的消耗是十分高的。但利用GPU并行运算的优点可以解决这个问题。下面我将跟大家分享关于利用CUDA处理PCL点云数据的一些经验。

首先举一个简单的例子说明CUDA程序是如何运作的。

我们先写一个简单的C++程序helloworld.cpp

 1 /* 2  * helloworld.cpp 3  * 4  *  Created on: Nov 25, 2016 5  *      Author: lzp 6  */ 7  8 #include <iostream> 9 10 #include <addition.h>11 12 13 int main(int argc, char** argv)14 {15     int a=1,b=2,c;16 17     if(addition(a,b,&c))18         std::cout<<"c="<<c<<std::endl;19     else20         std::cout<<"Addition failed!"<<std::endl;21 22     return 0;23 }

我们将利用addition()函数将a和b相加，然后由c储存它们的和。
addition()函数在头文件声明：

 /*
  * addition.h
  *
  *  Created on: Nov 25, 2016
  *      Author: lzp
  */

 #ifndef INCLUDES_ADDITION_H_
 #define INCLUDES_ADDITION_H_

 /* bool is built into C++; a C compiler needs <stdbool.h> for it. */
 #ifndef __cplusplus
 #include <stdbool.h>
 #endif

 /*
  * Only the linkage specification is C++-specific. Wrapping it (rather than
  * the whole declaration) keeps the prototype visible to C translation units.
  */
 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Adds a and b and stores the sum through c.
  * The caller treats a false return value as failure.
  */
 bool addition(int a, int b, int *c);

 #ifdef __cplusplus
 }
 #endif

 #endif /* INCLUDES_ADDITION_H_ */

修饰符extern "C"是CUDA和C++混合编程时必须的。然后我们来看addition()在CUDA上的实现：

 #include <cstddef>

 #include <addition.h>

 /*
  * Kernel: reads one int from each of a and b and writes their sum to c.
  * All three pointers must refer to device memory.
  */
 __global__ void add(int *a, int *b, int *c)
 {
     *c = *a + *b;
 }

 /*
  * Host-side wrapper: copies a and b to the device, runs the add kernel with
  * a single thread, and copies the result back.
  *
  * Returns true and stores the sum in *c when every CUDA call succeeded.
  * Returns false (leaving *c untouched) when c is null or any CUDA call
  * reported an error.
  */
 extern "C" bool addition(int a, int b, int *c)
 {
     if (c == NULL)
         return false;

     int *d_a = NULL;
     int *d_b = NULL;
     int *d_c = NULL;
     const size_t size = sizeof(int);

     /* && short-circuits, so nothing runs after the first failing call. */
     bool ok = cudaMalloc((void **)&d_a, size) == cudaSuccess
            && cudaMalloc((void **)&d_b, size) == cudaSuccess
            && cudaMalloc((void **)&d_c, size) == cudaSuccess
            && cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice) == cudaSuccess
            && cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice) == cudaSuccess;

     if (ok) {
         add<<<1, 1>>>(d_a, d_b, d_c);
         /* A kernel launch returns nothing; launch errors must be queried. */
         ok = cudaGetLastError() == cudaSuccess;
     }

     /* Stage the result so *c is only written on success. */
     int sum = 0;
     ok = ok && cudaMemcpy(&sum, d_c, size, cudaMemcpyDeviceToHost) == cudaSuccess;

     /* cudaFree accepts a null pointer, so this is safe on every path. */
     cudaFree(d_a);
     cudaFree(d_b);
     cudaFree(d_c);

     if (ok)
         *c = sum;
     return ok;
 }

其中，带有__global__修饰符的函数称为“核函数”，它负责处理GPU内存里的数据，是并行计算发生的地方。而bool addition(int a, int b, int *c)充当了CPU和GPU之间数据传输的角色。也就是Host和Device之间的数据传输。

最后，编写CMake文件编译。

# Build script for the helloworld example (C++ host code plus one CUDA source).
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
project(helloworld)

# Provides cuda_add_executable().
find_package(CUDA REQUIRED)

# Directory that holds the .h files (addition.h).
include_directories(../../includes)

cuda_add_executable(helloworld
    helloworld.cpp
    addition.cu)

其中include_directories的参数为.h文件所在的目录。

下面我们用相同的程序结构，写一个最简单的例子，用CUDA对PCL点云中的一个点进行操作。

 1 /* 2  * pcl_points_gpu.cpp 3  * 4  *  Created on: Nov 24, 2016 5  *      Author: lzp 6  */ 7  8  9 10 #include <gpu_draw_cloud.h>11 #include <pcl/io/pcd_io.h>12 13 int main(int argc, char** argv)14 {15     pcl::PointCloud<pcl::PointXYZRGB> cloud;16     pcl::gpu::DeviceArray<pcl::PointXYZRGB> cloud_device;17 18 19     cloud.width = 1;20     cloud.height =1;21     cloud.is_dense=false;22     cloud.points.resize(cloud.width*cloud.height);23 24     std::vector<float> point_val;25 26     for(size_t i=0; i<3*cloud.points.size(); ++i)27     {28         point_val.push_back(1024*rand()/(RAND_MAX+1.0f));29     }30 31     for (size_t i = 0; i < cloud.points.size(); ++i) {32         cloud.points[i].x = point_val[3 * i];33         cloud.points[i].y = point_val[3 * i + 1];34         cloud.points[i].z = point_val[3 * i + 2];35     }36 37     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;38 39     cloud_device.upload(cloud.points);40 41     cloud2GPU(cloud_device);42 43     cloud_device.download(cloud.points);44 45     std::cout<<"cloud.points="<<cloud.points[0]<<std::endl;46     return (0);47 }

这段代码模仿了PCL中写点云的一个例子，生成了一个点，坐标是随机生成的。关键点是pcl::gpu::DeviceArray<pcl::PointXYZRGB>，这是一个可以将点云传输到GPU上的桥梁。它的upload() 和download()方法相当于前面例子中的cudaMemcpy()。详情可参考PCL的源码仓库中/gpu/examples/和/gpu/octree/这两个目录的源码。

接下来是头文件：

 1 /* 2  * gpu_draw_cloud.h 3  * 4  *  Created on: Nov 25, 2016 5  *      Author: lzp 6  */ 7  8 #ifndef INCLUDES_GPU_DRAW_CLOUD_H_ 9 #define INCLUDES_GPU_DRAW_CLOUD_H_10 11 12 #include <iostream>13 #include <pcl/point_types.h>14 #include <pcl/gpu/containers/device_array.h>15 16 /*check if the compiler is of C++*/17 #ifdef __cplusplus18 19 20 /*21  * Try accessing GPU with pointcloud22  * */23 extern "C" bool cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device);24 25 26 #endif27 28 29 #endif /* INCLUDES_GPU_DRAW_CLOUD_H_ */

然后是函数实现体：

 #include <cstdio>

 #include <gpu_draw_cloud.h>

 /*
  * Kernel: adds 1 to the x coordinate of the first point and prints that
  * point's coordinates and colour. Does nothing when the array is empty.
  */
 __global__ void change_points(pcl::gpu::PtrSz<pcl::PointXYZRGB> cloud_device)
 {
     if (cloud_device.size == 0)
         return;

     /* Work through a reference so the print shows the updated point. */
     pcl::PointXYZRGB& point = cloud_device.data[0];
     point.x += 1;
     printf("x=%f, y=%f, z=%f, r=%d, g=%d, b=%d \n",
            point.x, point.y, point.z, point.r, point.g, point.b);
 }

 /*
  * Launches change_points on the given device array with a single thread.
  * The DeviceArray is converted implicitly to the PtrSz the kernel takes.
  *
  * Returns true when the kernel was launched and ran to completion without
  * a CUDA error; false when the array is empty or CUDA reported an error.
  */
 extern "C" bool
 cloud2GPU(pcl::gpu::DeviceArray<pcl::PointXYZRGB>& cloud_device)
 {
     if (cloud_device.empty())
         return false;

     change_points<<<1, 1>>>(cloud_device);

     /* A kernel launch returns nothing; launch errors must be queried. */
     if (cudaGetLastError() != cudaSuccess)
         return false;

     /* Wait for the kernel so execution errors surface here. */
     return cudaDeviceSynchronize() == cudaSuccess;
 }

在这个例子中，我将CPU和GPU的数据交互放到主函数中了，因此cloud2GPU函数只充当了一个调用核函数的接口。值得注意的是，在核函数的参数中，传入的pcl::gpu::DeviceArray<pcl::PointXYZRGB>隐式转换成pcl::gpu::PtrSz<pcl::PointXYZRGB>了。这两个数据类型是实现C++和CUDA混合编程的关键。

最后附上CMakeLists。

 cmake_minimum_required(VERSION 2.8 FATAL_ERROR)
 project(pcl_points_gpu)

 find_package(PCL 1.8 REQUIRED)
 find_package(CUDA REQUIRED)  # already loads FindCUDA; no separate include(FindCUDA) needed

 include_directories(../../includes)
 include_directories(${PCL_INCLUDE_DIRS})
 link_directories(${PCL_LIBRARY_DIRS})
 add_definitions(${PCL_DEFINITIONS})

 # Workaround: strip the VTK compile definitions, which otherwise break the nvcc command line.
 get_directory_property(dir_defs DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
 set(vtk_flags)

 foreach(it ${dir_defs})
     if(it MATCHES "vtk")
         list(APPEND vtk_flags ${it})
     endif()
 endforeach()

 foreach(d ${vtk_flags})
     remove_definitions(-D${d})
 endforeach()

 cuda_add_executable(pcl_points_gpu pcl_points_gpu.cpp gpu_draw_cloud.cu)
 target_link_libraries(pcl_points_gpu ${PCL_LIBRARIES})

留意13-24行，如果没有这几行，nvcc编译时会报出类似这样的错误：

nvcc fatal : A single input file is required for a non-link phase when an outputfile is specified
CMake Error at pcl_points_gpu_generated_gpu_draw_cloud.cu.o.cmake:209 (message):
Error generating
XXXXXXXXXXXXXXXX./pcl_points_gpu_generated_gpu_draw_cloud.cu.o

根据https://github.com/PointCloudLibrary/pcl/issues/776的描述，这是VTK的一个bug所致，因此在CMake中添加了这几行脚本。

希望这些例子对刚接触PCL和CUDA的人有帮助。本人也是新手，对很多概念仍然模糊不清，望体谅。