人生第一次真正的CUDA程序

来源：互联网发布：智锐科技成都知乎编辑：程序博客网时间：2024/05/20 22:02

奋斗了2周，终于把CUDA的内存与显存数据拷贝、pitch、以及如何对显存的数组进行引用弄明白了。很开心。

我是初次接触CUDA，学习CUDA是因为我了解到它的并行性使得数据量很大的程序的运行效率很高。我现在做的项目恰好需要这种高性能的并行运算。

我开始编写了几个内存和显存之间普通的int、char、float和double型数据的拷贝进行测试，然后进行了一维数组在内存和显存之间数据的拷贝，随后就是二维数组了。

在一维数组的定义和引用中，我开始是定义的时候指定数组的大小，后来改写为动态申请内存和显存。如下：

定义一个一维数组：

// Fragment: allocate a 1-D float array on the host and on the device, fill the
// host copy, then upload it to the device.
float *cpu_data; // pointer to the 1-D array in host memory

// NOTE(review): width is not assigned anywhere in this fragment; it has to be
// given a value before the allocations below use it.
int width; // number of elements in the array

cpu_data = (float*)malloc(sizeof(float) * width);

float *gpu_data; // pointer to the 1-D array in device memory

cutilSafeCall( cudaMalloc((void**) &gpu_data, sizeof(float) * width)); // allocate device memory for the array

for(int c = 0; c < width; ++c) // initialize the host array: element c holds the value c

cpu_data[c] = c;

// copy the host data into the corresponding device array

cutilSafeCall( cudaMemcpy( gpu_data, cpu_data, sizeof(float) * width, cudaMemcpyHostToDevice));

对于二维数组，在显存上申请空间时最好使用cudaMallocPitch()函数。此函数有一个参数pitch，表示补齐（对齐）之后显存数组每一行所占的字节数。注意它是一个传出参数，类型是size_t*；size_t是无符号整数类型，其宽度随平台而定（在64位平台上通常为64位，并不等同于unsigned int），因此接收pitch的变量应声明为size_t。对于二维数组在内存和显存之间的拷贝，最好使用函数

cudaMemcpy2D()，这样效率高些.

// Fragment: allocate a 2-D float array on the host (rows packed back to back)
// and on the device (rows padded to `pitch` bytes), then upload host -> device.
float *cpu_data; // array on the CPU; consecutive rows are sizeof(float) * width bytes apart

float *gpu_data; // array on the GPU; consecutive rows are `pitch` bytes apart

int width, height; // number of columns (width) and number of rows (height)

// cudaMallocPitch writes the row stride through a size_t*, so this must be a
// size_t; passing the address of an int is a type error.
size_t pitch; // row stride of the GPU array, in bytes

printf("Input the width and height\n"); // prompt for the column and row counts

scanf("%d%d", &width, &height);

cpu_data = (float*)malloc(sizeof(float)*width*height); // allocate host memory

cutilSafeCall( cudaMallocPitch( (void**) &gpu_data, &pitch, sizeof(float) * width, height)); // allocate pitched device memory

// Copy `height` rows of sizeof(float) * width bytes each; the destination rows
// are `pitch` bytes apart, the source rows sizeof(float) * width bytes apart.
cutilSafeCall( cudaMemcpy2D( gpu_data, pitch, cpu_data, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));

下面是我写的最终的程序：

   /**************************************************************************
   *矩阵相加的例子
   ***************************************************************************/
   #include <stdio.h>
   #include <stdlib.h>
   #include <cutil_inline.h>

   /************************************************************************
    * Init CUDA
   ************************************************************************/
   #if __DEVICE_EMULATION__

/* When __DEVICE_EMULATION__ is non-zero no device is probed: always succeeds. */
bool InitCUDA(void) { return true; }

#else

/*
 * Selects the first device whose properties can be queried and whose major
 * compute capability is at least 1, and makes it the current device.
 *
 * Returns true on success. Returns false, after printing a diagnostic to
 * stderr, when the device count cannot be obtained or is zero, when no device
 * passes the capability check, or when the chosen device cannot be selected.
 */
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;

    /* A failed query is reported explicitly instead of being silently treated
       like "zero devices". */
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return false;
    }
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    /* Stop at the first usable device; i == count afterwards means none was found. */
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess && prop.major >= 1) {
            break;
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }

    err = cudaSetDevice(i);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(err));
        return false;
    }

    printf("CUDA initialized.\n");
    return true;
}

#endif

/*****************************************************************************************************
 * myKernel: element-wise matrix addition, c = a + b.
 *
 * a, b   : input matrices in device memory
 * c      : output matrix in device memory
 * pitch  : distance in BYTES between the starts of consecutive rows; the same
 *          value is applied to a, b and c, so all three must share one pitch
 * height : number of rows
 * width  : number of columns
 *
 * Launch layout: a 2-D launch in which x covers columns and y covers rows.
 * Each thread handles at most one element; threads that fall outside the
 * height x width matrix do nothing, so the grid may over-cover the matrix.
 * No shared memory is used.
 ******************************************************************************************************/
__global__ void myKernel(const float *a, const float *b, float *c, size_t pitch, int height, int width)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= height || col >= width)
        return;

    /* Pitch is a byte count, so step to the row start through a char pointer
       and only then index by column. */
    const size_t rowOffset = (size_t)row * pitch;
    const float *rowA = (const float *)((const char *)a + rowOffset);
    const float *rowB = (const float *)((const char *)b + rowOffset);
    float *rowC = (float *)((char *)c + rowOffset);

    rowC[col] = rowA[col] + rowB[col];
}

/* Prints a height x width host matrix (rows packed back to back) under a title line. */
static void printMatrix(const char *title, const float *data, int height, int width)
{
    printf("\n%s\n", title);
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            printf("%f\t", data[r * width + c]);
        }
        printf("\n");
    }
}

/*
 * Matrix-addition demo: builds two small matrices on the host, uploads them to
 * pitched device memory, adds them with myKernel and prints the result.
 *
 * Returns 0 on success and EXIT_FAILURE when CUDA initialization, a host
 * allocation, or the pitch consistency check fails.
 */
int main(int argc, char* argv[])
{
    if (!InitCUDA())
        return EXIT_FAILURE;

    const int width = 3;  /* number of columns */
    const int height = 2; /* number of rows */
    const size_t rowBytes = sizeof(float) * width;

    /* The three matrices on the CPU. */
    float *cpu_A = (float*)malloc(rowBytes * height);
    float *cpu_B = (float*)malloc(rowBytes * height);
    float *cpu_C = (float*)malloc(rowBytes * height);
    if (cpu_A == NULL || cpu_B == NULL || cpu_C == NULL) {
        fprintf(stderr, "Host allocation failed.\n");
        free(cpu_A);
        free(cpu_B);
        free(cpu_C);
        return EXIT_FAILURE;
    }

    /* The three matrices on the GPU. Each allocation reports its own pitch. */
    float *gpu_A = NULL;
    float *gpu_B = NULL;
    float *gpu_C = NULL;
    size_t pitchA = 0, pitchB = 0, pitchC = 0;
    cutilSafeCall( cudaMallocPitch((void**) &gpu_A, &pitchA, rowBytes, height));
    cutilSafeCall( cudaMallocPitch((void**) &gpu_B, &pitchB, rowBytes, height));
    cutilSafeCall( cudaMallocPitch((void**) &gpu_C, &pitchC, rowBytes, height));

    /* myKernel applies one pitch to all three matrices, so they must agree. */
    if (pitchA != pitchB || pitchA != pitchC) {
        fprintf(stderr, "Device allocations returned different pitches.\n");
        cudaFree(gpu_A);
        cudaFree(gpu_B);
        cudaFree(gpu_C);
        free(cpu_A);
        free(cpu_B);
        free(cpu_C);
        return EXIT_FAILURE;
    }
    const size_t pitch = pitchA;

    /* pitch is a size_t, so it needs %zu rather than %d. */
    printf("The pitch is: %zu\n", pitch);

    /* Initialize the host matrices. */
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            cpu_A[r * width + c] = (float)(r * c);
            cpu_B[r * width + c] = (float)(r + c);
            cpu_C[r * width + c] = 0.0f;
        }
    }

    printMatrix("CPU_A DATA", cpu_A, height, width);
    printMatrix("CPU_B DATA", cpu_B, height, width);
    printMatrix("CPU_C DATA", cpu_C, height, width);

    /* Upload cpu_A and cpu_B into gpu_A and gpu_B. */
    cutilSafeCall( cudaMemcpy2D( gpu_A, pitch, cpu_A, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));
    cutilSafeCall( cudaMemcpy2D( gpu_B, pitch, cpu_B, rowBytes, rowBytes, height, cudaMemcpyHostToDevice));

    /* Fixed-size blocks with a grid rounded up to cover the whole matrix, so
       the launch stays valid for any width/height; the kernel's bounds check
       discards the surplus threads. */
    const dim3 Db(16, 16, 1);
    const dim3 Dg((width + Db.x - 1) / Db.x, (height + Db.y - 1) / Db.y, 1);
    myKernel<<<Dg, Db, 0>>>(gpu_A, gpu_B, gpu_C, pitch, height, width);
    /* A launch does not return a status; fetch launch errors explicitly. */
    cutilSafeCall( cudaGetLastError());

    /* Download gpu_C back into cpu_C. */
    cutilSafeCall( cudaMemcpy2D( cpu_C, rowBytes, gpu_C, pitch, rowBytes, height, cudaMemcpyDeviceToHost));

    printMatrix("After change CPU_C DATA", cpu_C, height, width);

    /* Release host memory. */
    free(cpu_A);
    free(cpu_B);
    free(cpu_C);
    /* Release device memory. */
    cutilSafeCall( cudaFree(gpu_A));
    cutilSafeCall( cudaFree(gpu_B));
    cutilSafeCall( cudaFree(gpu_C));
    /* Exit through the cutil helper. */
    CUT_EXIT(argc, argv);

    return 0;
}

2周的编程（当然是闲暇时间，呵呵），我终于开始了我的CUDA编程之路。

路漫漫其修远兮，吾将上下而求索。

本文来自CSDN博客，转载请标明出处：http://blog.csdn.net/xhz1234/archive/2011/03/03/6222010.aspx