cuda数组的拷贝

来源：互联网发布：男模项海知乎编辑：程序博客网时间：2024/04/18 21:24

原来看了GPU编程觉得挺简单的，后面自己动手才发现问题不少。简单描述一下自己要做的事情：（1）CPU上的三维vector--->（2）CPU上的三维数组--->（3）拷贝到GPU中的三维数组--->（4）拷贝回CPU中的三维数组，其中问题主要出在第3、4步。主要是没有理解一个问题，那就是“cuda的各种拷贝要求源和目标内存都必须是连续的”。而自己在申请三维数组的时候用的是new或者malloc：一次new/malloc申请到的一维数组是连续的，但如果多维数组是逐行、逐层分别申请出来的（例如用指针的指针），各行、各层之间的内存就不连续，因此在这里犯了致命错误。

http://hpcbbs.it168.com/thread-7366-1-1.html 这个帖子给了很好的建议：“vector<vector<float> > 并不是二维数组吧，它只是实现了二维数组的操作（比如[][]）。内存是不连续的。要用cudaMemcpy还是得定义 float array2d[N][M] 或者直接 float *array2d = new float[M*N];”（注：原帖写的是 new float(M*N)，那样只会申请一个初值为M*N的float；申请数组应写作 new float[M*N]。另外C/C++的变量名不能以数字开头，这里把原帖中的2darray改成了array2d。）总之，纸上得来终觉浅，还是要自己多亲力亲为。

下面给出一个使用连续内存、拷贝成功的例子：

#include "example1.cuh"
#include "Struct.h"

/************************************************************************/
/* Convert the host data into a form the device copy can consume        */
/************************************************************************/
// Copies element [i][j][k] of each nested vector (i < depth, j < height,
// k < width) into datamatrix.Mat3D1 / datamatrix.Mat3D2.
//
// The vectors must hold at least depth x height x width elements; no bounds
// checking is done here.
// NOTE(review): both vectors are taken by value, so each call deep-copies
// them. The signature is kept as-is because the prototype lives in
// example1.cuh; consider const references there.
void InitCPUData(DataMatrix &datamatrix,std::vector<std::vector<std::vector<float > > > vec3D1, std::vector<std::vector<std::vector<float > > > vec3D2,int width,int height,int depth)
{
    for (int z = 0; z < depth; z++)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                datamatrix.Mat3D1[z][y][x] = vec3D1[z][y][x];
                datamatrix.Mat3D2[z][y][x] = vec3D2[z][y][x];
            }
        }
    }
}

// Describes a tightly packed host volume of floats: the row pitch is exactly
// width * sizeof(float) bytes, i.e. the rows carry no padding.
static cudaPitchedPtr HostVolume(void *data, int width, int height)
{
    return make_cudaPitchedPtr(data, width * sizeof(float), width, height);
}

// Runs one cudaMemcpy3D between two pitched pointers and reports a failure
// through cutilSafeCall, the same way the allocations are checked.
static void CopyVolume(cudaPitchedPtr dst, cudaPitchedPtr src, cudaExtent extent, cudaMemcpyKind kind)
{
    cudaMemcpy3DParms parms = {0};
    parms.srcPtr = src;
    parms.dstPtr = dst;
    parms.extent = extent;
    parms.kind = kind;
    cutilSafeCall(cudaMemcpy3D(&parms));
}

/************************************************************************/
/* Allocate device memory and upload the host data                      */
/************************************************************************/
// Allocates datastruct.Vec3D1 / Vec3D2 with cudaMalloc3D and fills them from
// datamatrix.Mat3D1 / Mat3D2.
//
// width, height and depth are counted in elements; the host arrays are
// treated as tightly packed (row pitch = width * sizeof(float)).
__host__ void AllocDataAndVal(DataStruct &datastruct,DataMatrix datamatrix,int width,int height,int depth)
{
    // Allocate. The extent width is given in bytes; it is derived from the
    // arguments instead of the previous hard-coded 6 x 7 x 8.
    cudaExtent extent = make_cudaExtent(sizeof(float) * width, height, depth);
    cutilSafeCall(cudaMalloc3D(&(datastruct.Vec3D1), extent));
    cutilSafeCall(cudaMalloc3D(&(datastruct.Vec3D2), extent));

    // Upload.
    CopyVolume(datastruct.Vec3D1, HostVolume((void*)datamatrix.Mat3D1, width, height), extent, cudaMemcpyHostToDevice);
    CopyVolume(datastruct.Vec3D2, HostVolume((void*)datamatrix.Mat3D2, width, height), extent, cudaMemcpyHostToDevice);
}

/************************************************************************/
/* Kernel                                                               */
/************************************************************************/
// Adds the two volumes element-wise and stores the sum in the first one:
//     Vec3D1[z][y][x] += Vec3D2[z][y][x]
//
// Launch layout: thread/block x maps to width, y to height, z to depth.
// Every dimension is walked with a stride of gridDim * blockDim, so any
// grid/block shape covers the whole volume. No shared memory is used.
__global__ void kernel(DataStruct datastruct,int width,int height,int depth)
{
    char* devPtr1 = (char*)datastruct.Vec3D1.ptr; // base addresses
    char* devPtr2 = (char*)datastruct.Vec3D2.ptr;

    // Row pitch in bytes (may exceed width * sizeof(float) because of
    // padding). Each allocation is addressed with its own pitch.
    size_t pitch1 = datastruct.Vec3D1.pitch;
    size_t pitch2 = datastruct.Vec3D2.pitch;
    size_t slicePitch1 = pitch1 * height; // bytes per z-slice
    size_t slicePitch2 = pitch2 * height;

    int xStart = (int)(blockIdx.x * blockDim.x + threadIdx.x);
    int yStart = (int)(blockIdx.y * blockDim.y + threadIdx.y);
    int zStart = (int)(blockIdx.z * blockDim.z + threadIdx.z);
    int xStep = (int)(gridDim.x * blockDim.x);
    int yStep = (int)(gridDim.y * blockDim.y);
    int zStep = (int)(gridDim.z * blockDim.z);

    for (int z = zStart; z < depth; z += zStep)
    {
        for (int y = yStart; y < height; y += yStep)
        {
            float* row1 = (float*)(devPtr1 + z * slicePitch1 + y * pitch1);
            const float* row2 = (const float*)(devPtr2 + z * slicePitch2 + y * pitch2);
            for (int x = xStart; x < width; x += xStep)
            {
                // Index the row by x. The previous code used the z index
                // here, so only one element per row was ever updated.
                row1[x] = row1[x] + row2[x];
            }
        }
    }
}

/************************************************************************/
/* Copy the results back to the host                                    */
/************************************************************************/
// Copies datastruct.Vec3D1 / Vec3D2 from the device into
// datamatrix.Mat3D1 / Mat3D2.
//
// NOTE(review): datamatrix is taken by value. If DataMatrix stores its arrays
// inline (Struct.h is not visible here), the data lands in this local copy
// and the caller never sees it. Confirm, and if so switch to a reference in
// both this definition and the prototype in example1.cuh.
__host__ void GPU2CPU(DataStruct &datastruct,DataMatrix datamatrix, int width,int height,int depth)
{
    // Must describe the same volume that AllocDataAndVal allocated.
    cudaExtent extent = make_cudaExtent(sizeof(float) * width, height, depth);

    CopyVolume(HostVolume((void*)datamatrix.Mat3D1, width, height), datastruct.Vec3D1, extent, cudaMemcpyDeviceToHost);
    CopyVolume(HostVolume((void*)datamatrix.Mat3D2, width, height), datastruct.Vec3D2, extent, cudaMemcpyDeviceToHost);
}

主函数：

// Note: host and device code sit side by side in a .cu file, but they are
// compiled separately; being in one file is only a matter of form.
// A function marked __global__ is called from the host and runs on the
// device, __device__ is called from the device and runs on the device, and
// __host__ is called from the host and runs on the host. These correspond to
// kernels, functions used inside kernels, and ordinary functions.
#include <iostream>
#include <vector>
#include <algorithm>
#include <cstdio>
#include "example1.cuh"
#include "Struct.h"

// Builds two 8 x 7 x 6 (depth x height x width) test volumes, adds them on
// the GPU and prints the first one after the addition.
// Returns 0 on success and 1 if the launch or the cleanup reports an error.
int main()
{
    int i,j,k;
    int width=6;
    int height=7;
    int depth=8;

    // Index order is [depth][height][width]. The vectors are built at their
    // final size instead of being sized by width and then resized to depth.
    std::vector<std::vector<std::vector<float > > > vec3D1(depth, std::vector<std::vector<float > >(height, std::vector<float >(width)));
    std::vector<std::vector<std::vector<float > > > vec3D2(depth, std::vector<std::vector<float > >(height, std::vector<float >(width)));
    for (i=0;i<depth;i++)
    {
        for (j=0;j<height;j++)
        {
            for (k=0;k<width;k++)
            {
                vec3D1[i][j][k]=static_cast<float>(i+j+k);
                vec3D2[i][j][k]=static_cast<float>(i*j*k);
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    // Convert the data into a form the device can accept, in preparation for
    // the upload; this runs on the host.
    DataMatrix datamatrix;
    InitCPUData(datamatrix,vec3D1,vec3D2,width,height,depth);

    //////////////////////////////////////////////////////////////////////////
    // Allocate device memory and upload the data.
    DataStruct datastruct;
    AllocDataAndVal(datastruct,datamatrix,width,height,depth);

    //////////////////////////////////////////////////////////////////////////
    // Launch the kernel. It maps thread x to width, y to height and z to
    // depth, so the block must be (width, height, depth); the previous
    // (8,7,6) left the last two depth slices without threads.
    dim3 dimBlock(width,height,depth);
    kernel<<<1,dimBlock>>>(datastruct,width,height,depth);
    cudaError_t status=cudaGetLastError(); // a launch reports errors only this way
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"kernel launch failed: %s\n",cudaGetErrorString(status));
        return 1;
    }

    //////////////////////////////////////////////////////////////////////////
    // Copy back to the host and print.
    GPU2CPU(datastruct,datamatrix,width,height,depth);
    for (i=0;i<depth;i++)
    {
        for (j=0;j<height;j++)
        {
            for (k=0;k<width;k++)
            {
                printf("%f  ",datamatrix.Mat3D1[i][j][k]);
            }
            printf("\n");
        }
        printf("\n");
        printf("\n");
    }

    // Release device memory. cudaFree needs the device pointer stored in
    // .ptr, not the host address of the cudaPitchedPtr struct.
    int exitCode=0;
    status=cudaFree(datastruct.Vec3D1.ptr);
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"cudaFree(Vec3D1) failed: %s\n",cudaGetErrorString(status));
        exitCode=1;
    }
    status=cudaFree(datastruct.Vec3D2.ptr);
    if (status!=cudaSuccess)
    {
        fprintf(stderr,"cudaFree(Vec3D2) failed: %s\n",cudaGetErrorString(status));
        exitCode=1;
    }
    return exitCode;
}