CUDA Programming Notes, Part 2


Today's study links:
  • Debugging methods
  • Selected cuda-gdb commands

// includes, system
#include <stdio.h>
#include <assert.h>

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // pointer and dimension for host memory
    int n, dimA;
    float *h_a;

    // pointers for device memory
    float *d_a, *d_b;

    // allocate and initialize host memory
    // Bonus: try using cudaMallocHost in place of malloc
    dimA = 8;
    h_a = (float *) malloc(dimA * sizeof(float));
    for (n = 0; n < dimA; n++)
    {
        h_a[n] = (float) n;
    }

    // Part 1 of 5: allocate device memory
    size_t memSize = dimA * sizeof(float);
    cudaMalloc((void **)&d_a, memSize);
    cudaMalloc((void **)&d_b, memSize);

    // Part 2 of 5: host to device memory copy
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    // Part 3 of 5: device to device memory copy
    cudaMemcpy(d_b, d_a, memSize, cudaMemcpyDeviceToDevice);

    // clear host memory
    for (n = 0; n < dimA; n++)
    {
        h_a[n] = 0.f;
    }

    // Part 4 of 5: device to host copy
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy calls");

    // verify the data on the host is correct
    for (n = 0; n < dimA; n++)
    {
        assert(h_a[n] == (float) n);
        printf("h_a: %f, n: %d\n", h_a[n], n);  // %d for the int index (the original used %f, which is undefined behavior)
    }

    // Part 5 of 5: free device memory pointers d_a and d_b
    cudaFree(d_b);
    cudaFree(d_a);

    // Check for any CUDA errors
    checkCUDAError("cudaFree");

    // free host memory pointer h_a
    // Bonus: be sure to use cudaFreeHost for memory allocated with cudaMallocHost
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(-1);
    }
}

This example mainly covers allocating device and host memory and copying data between the two.
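The sample's "Bonus" comments suggest replacing malloc/free with pinned (page-locked) host memory. A minimal sketch of that variant, reusing dimA from the program above (only the allocation and deallocation change):

float *h_a = NULL;
cudaMallocHost((void **)&h_a, dimA * sizeof(float)); // page-locked allocation: typically speeds up cudaMemcpy
/* ... the same cudaMemcpy traffic as above ... */
cudaFreeHost(h_a);                                   // must be used instead of free()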

Let's look at another piece of code to get a feel for the standard pattern; all code starts with reading, and getting started means copying:

/* Copyright 1993-2008 NVIDIA Corporation. All rights reserved. */
// includes, system
#include <stdio.h>
#include <assert.h>

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
__global__ void myFirstKernel(int *d_a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_a[idx] = 1000 * blockIdx.x + threadIdx.x;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // pointer for host memory
    int *h_a;

    // pointer for device memory
    int *d_a;

    // define grid and block size
    int numBlocks = 8;
    int numThreadsPerBlock = 8;

    // Part 1 of 5: allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void **)&d_a, memSize);

    // Part 2 of 5: configure and launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    myFirstKernel<<< dimGrid, dimBlock >>>(d_a);

    // block until the device has completed
    cudaThreadSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy
    cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: verify the data returned to the host is correct
    for (int i = 0; i < numBlocks; i++)
    {
        for (int j = 0; j < numThreadsPerBlock; j++)
        {
            assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j);
        }
    }

    // free device memory
    cudaFree(d_a);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(-1);
    }
}

dim3 is an integer vector type that can be used in CUDA code. Its most common application is to pass the grid and block dimensions in a kernel invocation. It can also be used in any user code for holding values of 3 dimensions.
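For illustration (the kernel name and data pointer here are hypothetical), components left unspecified in a dim3 default to 1:

__global__ void myKernel2D(float *d_data)   // hypothetical kernel
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;  // recover the 2D global index
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // ... use d_data[y * width + x] ...
}

// launch configuration:
dim3 dimBlock(16, 16);   // 16 x 16 x 1 = 256 threads per block
dim3 dimGrid(64, 64);    // 64 x 64 x 1 blocks
myKernel2D<<< dimGrid, dimBlock >>>(d_data);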

Debugging tool: cuda-gdb

nvcc -g -G *.cu -o *.out   # build a debuggable executable (-g: host debug info, -G: device debug info)
cuda-gdb *.out             # start the debugger

Commonly used debugging commands (a sample session follows the table):

Command                 Abbrev.  Description                                                  Example
file <exe_name>         -        select the executable to debug                               file program
set args <arg1> ...     -        set command-line arguments                                   set args 1 2
breakpoint              b        set a breakpoint                                             b main / b <line number>
run                     r        run the program under the debugger
start                   -        start the program and stop at the first line of main
next                    n        step to the next line, stepping over function calls
step                    s        single-step, entering called functions
continue                c        resume a paused program until the next breakpoint or the end
print                   p        print variables and expressions                              p var1
thread                  -        list the current host threads
cuda                    -        show the active kernel/grid/block/thread and move focus      cuda thread (1,1,1) / cuda kernel 1 block (1,2,1)
info                    -        show details on a given topic                                info devices / info kernels / info threads
backtrace               bt       show the current call stack
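A hypothetical session against the myFirstKernel program above (file name and exact output are illustrative; note that recent cuda-gdb releases spell the CUDA-specific queries as info cuda devices, info cuda kernels, and so on):

$ nvcc -g -G first_kernel.cu -o first_kernel.out
$ cuda-gdb ./first_kernel.out
(cuda-gdb) b myFirstKernel            # break when the kernel starts running on the GPU
(cuda-gdb) r                          # run to the breakpoint
(cuda-gdb) cuda kernel block thread   # show the current focus
(cuda-gdb) cuda thread (2,0,0)        # move focus to thread 2 of the current block
(cuda-gdb) p threadIdx.x              # inspect built-ins and locals at the focus
(cuda-gdb) info cuda devices          # list the CUDA devices
(cuda-gdb) c                          # continue to completion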
Debugging example:

[Figure: cuda-gdb fails to start a debug session under the graphical desktop]

When you hit the problem shown above, you have to switch to a tty console to debug. According to one blog post, GPUs with compute capability 3.5 or higher can be debugged under the Ubuntu graphical desktop. To check your own card's compute capability (tip: in Ubuntu, paste the system clipboard into vim with Shift+Insert), the following program prints the device information:

#include "cuda_runtime.h"  #include "device_launch_parameters.h"  #include<iostream>  #include <stdio.h>  using namespace std;  cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  __global__ void addKernel(int *c, const int *a, const int *b)  {      int i = threadIdx.x;      c[i] = a[i] + b[i];  }  int main()  {      const int arraySize = 5;      const int a[arraySize] = { 1, 2, 3, 4, 5 };      const int b[arraySize] = { 10, 20, 30, 40, 50 };      int c[arraySize] = { 0 };      //get device prop    cudaError_t cudaStatus;    int num = 0;    cudaDeviceProp deviceProp;    cudaStatus = cudaGetDeviceCount(&num);    for(int i=0;i<num;i++)    {        cudaGetDeviceProperties(&deviceProp, i);        cout << "设备 " << i + 1 << " 的主要属性: " << endl;          cout << "设备显卡型号: " << deviceProp.name << endl;          cout << "设备全局内存总量(以MB为单位): " << deviceProp.totalGlobalMem / 1024 / 1024 << endl;          cout << "设备上一个线程块(Block)中可用的最大共享内存(以KB为单位): " << deviceProp.sharedMemPerBlock / 1024 << endl;          cout << "设备上一个线程块(Block)种可用的32位寄存器数量: " << deviceProp.regsPerBlock << endl;          cout << "设备上一个线程块(Block)可包含的最大线程数量: " << deviceProp.maxThreadsPerBlock << endl;          cout << "设备的计算功能集(Compute Capability)的版本号: " << deviceProp.major << "." << deviceProp.minor << endl;          cout << "设备上多处理器的数量: " << deviceProp.multiProcessorCount << endl;     }    return 0;  }  // Helper function for using CUDA to add vectors in parallel.  cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)  {      int *dev_a = 0;      int *dev_b = 0;      int *dev_c = 0;      cudaError_t cudaStatus;      // Choose which GPU to run on, change this on a multi-GPU system.      cudaStatus = cudaSetDevice(0);      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");          goto Error;      }      // Allocate GPU buffers for three vectors (two input, one output)    .      cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMalloc failed!");          goto Error;      }      cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMalloc failed!");          goto Error;      }      cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMalloc failed!");          goto Error;      }      // Copy input vectors from host memory to GPU buffers.      cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMemcpy failed!");          goto Error;      }      cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMemcpy failed!");          goto Error;      }      // Launch a kernel on the GPU with one thread for each element.      addKernel<<<1, size>>>(dev_c, dev_a, dev_b);      // cudaThreadSynchronize waits for the kernel to finish, and returns      // any errors encountered during the launch.      cudaStatus = cudaThreadSynchronize();      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);          goto Error;      }      // Copy output vector from GPU buffer to host memory.      
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);      if (cudaStatus != cudaSuccess) {          fprintf(stderr, "cudaMemcpy failed!");          goto Error;      }  Error:      cudaFree(dev_c);      cudaFree(dev_a);      cudaFree(dev_b);      return cudaStatus;  

[Figure: device information output]

Online sources (the official documentation's compute-capability tables) show my card's compute capability is 5.2, so it does support debugging under the graphical desktop. After setting the environment variable as described in the linked guide, debugging works normally.

[Figure: settings that enable debugging under the GUI]

Add the environment variable as shown in the sketch below. After setting it, every new terminal on my machine needs the profile re-sourced; I will check whether the problem persists after a reboot.
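A minimal sketch of the setting, assuming the software-preemption switch documented for cuda-gdb (putting it in ~/.bashrc rather than ~/.profile lets new terminals pick it up without an explicit source):

# in ~/.profile or ~/.bashrc:
export CUDA_DEBUGGER_SOFTWARE_PREEMPTION=1  # allow debugging a GPU that also drives the display (CC 3.5+)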
Next, debug the program using the gdb-style commands above and inspect the GPU properties inside the debugger.

[Figure: debug session example]

In addition, other blog posts collect lists of the official documents; see the linked note on where to find the PDF files.

Profiling tool: Visual Profiler

The nvvp command launches the Visual Profiler; a sketch of the workflow follows.
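A typical workflow might look like this (the file name is illustrative; -lineinfo is optional but improves source correlation in the profiler):

$ nvcc -O2 -lineinfo reverseArray.cu -o reverseArray
$ nvvp    # then File > New Session and point it at ./reverseArray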

Use the profiler to compare two implementations of array reversal:

  • Program 1
// includes, system
#include <stdio.h>
#include <assert.h>

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

// Part 3: implement the kernel
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
    int inOffset  = blockDim.x * blockIdx.x;
    int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int in  = inOffset + threadIdx.x;
    int out = outOffset + (blockDim.x - 1 - threadIdx.x);
    d_out[out] = d_in[in];
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointers for device memory
    int *d_b, *d_a;

    // define grid and block size
    int numThreadsPerBlock = 256;

    // Part 1: compute number of blocks needed based on array size and desired block size
    int numBlocks = dimA / numThreadsPerBlock;

    // allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void **) &d_a, memSize);
    cudaMalloc((void **) &d_b, memSize);

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy host array to device array
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock >>>(d_b, d_a);

    // block until the device has completed
    cudaThreadSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel invocation");

    // device to host copy
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}
  • Program 2
// includes, system
#include <stdio.h>
#include <assert.h>

// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);

// Part 2 of 2: implement the fast kernel using shared memory
__global__ void reverseArrayBlock(int *d_out, int *d_in)
{
    extern __shared__ int s_data[];

    int inOffset = blockDim.x * blockIdx.x;
    int in = inOffset + threadIdx.x;

    // Load one element per thread from device memory and store it
    // *in reversed order* into temporary shared memory
    s_data[blockDim.x - 1 - threadIdx.x] = d_in[in];

    // Block until all threads in the block have written their data to shared mem
    __syncthreads();

    // write the data from shared memory in forward order,
    // but to the reversed block offset as before
    int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
    int out = outOffset + threadIdx.x;
    d_out[out] = s_data[threadIdx.x];
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv)
{
    // pointer for host memory and size
    int *h_a;
    int dimA = 256 * 1024; // 256K elements (1MB total)

    // pointers for device memory
    int *d_b, *d_a;

    // define grid and block size
    int numThreadsPerBlock = 256;

    // Compute number of blocks needed based on array size and desired block size
    int numBlocks = dimA / numThreadsPerBlock;

    // Part 1 of 2: Compute the number of bytes of shared memory needed
    // This is used in the kernel invocation below
    int sharedMemSize = numThreadsPerBlock * sizeof(int);

    // allocate host and device memory
    size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    cudaMalloc((void **) &d_a, memSize);
    cudaMalloc((void **) &d_b, memSize);

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // Copy host array to device array
    cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice);

    // launch kernel: the third launch parameter is the dynamic shared memory size
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<< dimGrid, dimBlock, sharedMemSize >>>(d_b, d_a);

    // block until the device has completed
    cudaThreadSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel invocation");

    // device to host copy
    cudaMemcpy(h_a, d_b, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // verify the data returned to the host is correct
    for (int i = 0; i < dimA; i++)
    {
        assert(h_a[i] == dimA - 1 - i);
    }

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!
    printf("Correct!\n");
    return 0;
}

void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

Profiler results (the original post links an overview of the profiler interface):

[Figure: profiling results for Program 1]

[Figure: profiling results for Program 2]

Program 2 uses shared memory: its kernel time is shorter, while the memory copies account for most of the total time. Because the reversal happens inside shared memory, both the global-memory reads and writes proceed in sequential order and coalesce well; the host-device copies are identical in the two versions, so they dominate once the kernel gets faster.

  • Debugging problem: fatal: All CUDA devices are used for display and cannot be used while debugging. (error code = 24). The GPU is occupied by the display, so debug from a pure console. As one forum answer put it: "Somehow it seems that my X-Server is blocking my NVIDIA GPU, because if I switch to another virtual console (Ctrl+Alt+F1; Ctrl+Alt+F7 switches back to X) I am able to run my code using cuda-gdb. No monitor cable is plugged into the NVIDIA card…"
  • How do you print the contents of device memory, and how can you inspect device memory while debugging? The simplest method is printf; see the sketch after this list.
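A minimal sketch of device-side printf (my illustration, not from the original post; it requires compute capability 2.0 or higher, and the buffered output is flushed when the host synchronizes):

#include <stdio.h>

// Dump a device array from inside a kernel. d_a is assumed to be a valid
// device pointer with at least gridDim.x * blockDim.x elements.
__global__ void dumpKernel(const int *d_a)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    printf("d_a[%d] = %d\n", idx, d_a[idx]);
}

// Host side:
//   dumpKernel<<<1, 8>>>(d_a);
//   cudaDeviceSynchronize();   // flushes the device-side printf buffer

Alternatively, inside cuda-gdb you can read device memory directly while focused on a kernel; gdb's array syntax works on device pointers, e.g. p *d_a@8.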