CUDAExample-0-cdpSimplePrint

来源：互联网发布：苹果下载铃声软件编辑：程序博客网时间：2024/05/22 01:50

标签： CUDAExample

作用

>
* Key Concepts: CUDA Dynamic Parallelism *
It generates a unique identifier for each block. Prints the information about that block. Finally, if the ‘max_depth’ has not been reached, the block launches new blocks directly from the GPU.

例程中使用了递归思想：在核函数中，由每个线程块内 threadIdx 为 0 的线程打印出该线程块的信息以及它对应的 parent 线程块，重点说明了 CUDA 架构支持动态并行计算（即可以在 GPU 端直接启动新的核函数）。

所使用的技巧

递归，__device__ 全局变量，命令行获取参数，字符串匹配，设备属性（计算能力），共享内存，设备函数

代码分析

main函数传参机制

int main(int argc, char **argv)

main 前面的 int 则说明 main 函数返回值是整型，一般是正常退出返回 0，异常退出则返回非零值（例如 -1）。
参数 argc 则是表示 argv 的个数.
argv 则是命令行参数。这些参数是通过命令提示符窗口（Linux 称为终端）运行程序时，以空格分隔的形式传入的。
char **argv 就好理解了，它是一个指向字符串指针数组的指针，数组中的每个元素（argv[i]）都指向一个以 '\0' 结尾的字符串。
argc 是字符串的个数，如命令行为 device is ready，则 argc = 3；argv[0] 为第一个字符串，argv[0][n] 表示该字符串中的第 n 个字符。

设备端全局变量

// Device-side global counter, initialised to 0. cdp_kernel advances it with
// atomicAdd so that every block obtains a distinct ID.
__device__ int g_uids = 0;

checkCmdLineFlag()函数解析

// Reports whether the command line contains the flag named by string_ref.
//
// Every argument after argv[0] is examined. The leading '-' delimiters are
// skipped (offset supplied by stringRemoveDelimiter), anything from the first
// '=' onward is ignored, and the remaining key is compared with string_ref
// through STRNCASECMP. The key must be exactly as long as string_ref to match.
//
// argc       - number of entries in argv
// argv       - command-line arguments; argv[0] is never examined
// string_ref - flag name to look for, without leading dashes
//
// Returns true when at least one argument matches, false otherwise.
inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
{
    bool matched = false;

    // Starting at 1 skips argv[0]; the loop body never runs when argc <= 1.
    for (int arg = 1; arg < argc; ++arg)
    {
        // Position of the first character that is not a '-' delimiter.
        const int key_offset = stringRemoveDelimiter('-', argv[arg]);
        const char *key = argv[arg] + key_offset;

        // The key stops at the first '=' if there is one, else at the terminator.
        const char *equals = strchr(key, '=');
        int key_len;
        if (equals != 0)
        {
            key_len = (int)(equals - key);
        }
        else
        {
            key_len = (int)strlen(key);
        }

        const int ref_len = (int)strlen(string_ref);

        if (key_len == ref_len && STRNCASECMP(key, string_ref, ref_len) == 0)
        {
            matched = true;
        }
    }

    return matched;
}

函数的作用是在输入的命令行参数中，寻找与传入字符串 *string_ref 相同的参数名，若找到则返回 true，否则返回 false。
其中涉及字符串操作函数 strchr()、strlen()，以及通过宏 STRNCASECMP 调用的不区分大小写的比较函数（如 strnicmp()）。

getCmdLineArgumentInt()函数解析

// Reads the integer value attached to the command-line option string_ref.
//
// Every argument after argv[0] is examined with its leading '-' delimiters
// skipped (offset supplied by stringRemoveDelimiter). An argument matches when
// STRNCASECMP reports its first strlen(string_ref) characters equal to
// string_ref; the key is NOT required to end there, so a longer key that
// merely starts with string_ref also matches. For a match, one '=' directly
// after the name is skipped and the rest is converted with atoi(); a match
// with nothing after the name yields 0.
//
// argc       - number of entries in argv
// argv       - command-line arguments; argv[0] is never examined
// string_ref - option name to look for, without leading dashes
//
// Returns the value from the last matching argument, or 0 when no argument
// matches.
inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
{
    int result = 0;   // stays 0 unless some argument matches

    for (int arg = 1; arg < argc; ++arg)
    {
        const int key_offset = stringRemoveDelimiter('-', argv[arg]);
        const char *key = argv[arg] + key_offset;
        const int ref_len = (int)strlen(string_ref);

        if (STRNCASECMP(key, string_ref, ref_len) != 0)
        {
            continue;   // different option, try the next argument
        }

        if ((int)strlen(key) > ref_len)
        {
            // Something follows the name: step over an optional '=' and parse.
            const char *digits = key + ref_len;
            if (*digits == '=')
            {
                ++digits;
            }
            result = atoi(digits);
        }
        else
        {
            // Bare option with no value attached.
            result = 0;
        }
    }

    return result;
}

字符串函数atoi()将字符串转换为数字，获取初始值.

gpu设备属性

/**
 * CUDA device properties, as listed in this article.
 * NOTE(review): the struct shipped with a given CUDA toolkit may declare
 * additional fields - confirm against that toolkit's driver_types.h.
 */
struct cudaDeviceProp {
   char   name[256];                  /**< ASCII string identifying the device */
   size_t totalGlobalMem;             /**< Global memory available on the device, in bytes */
   size_t sharedMemPerBlock;          /**< Shared memory available per block, in bytes */
   int    regsPerBlock;               /**< 32-bit registers available per block */
   int    warpSize;                   /**< Warp size, in threads */
   size_t memPitch;                   /**< Maximum pitch allowed by memory copies, in bytes */
   int    maxThreadsPerBlock;         /**< Maximum number of threads per block */
   int    maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
   int    maxGridSize[3];             /**< Maximum size of each dimension of a grid */
   int    clockRate;                  /**< Clock frequency, in kilohertz */
   size_t totalConstMem;              /**< Constant memory available on the device, in bytes */
   int    major;                      /**< Major compute capability */
   int    minor;                      /**< Minor compute capability */
   size_t textureAlignment;           /**< Alignment requirement for textures */
   int    deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated; use asyncEngineCount instead */
   int    multiProcessorCount;        /**< Number of multiprocessors on the device */
   int    kernelExecTimeoutEnabled;   /**< Whether there is a run-time limit on kernels */
   int    integrated;                 /**< Device is integrated, as opposed to discrete */
   int    canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
   int    computeMode;                /**< Compute mode: default, exclusive, prohibited, exclusive-process (See ::cudaComputeMode) */
   int    maxTexture1D;               /**< Maximum 1D texture size */
   int    maxTexture2D[2];            /**< Maximum 2D texture dimensions */
   int    maxTexture3D[3];            /**< Maximum 3D texture dimensions */
   int    maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
   int    maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
   size_t surfaceAlignment;           /**< Alignment requirement for surfaces */
   int    concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
   int    ECCEnabled;                 /**< Device has ECC support enabled */
   int    pciBusID;                   /**< PCI bus ID of the device */
   int    pciDeviceID;                /**< PCI device ID of the device */
   int    pciDomainID;                /**< PCI domain ID of the device */
   int    tccDriver;                  /**< 1 if the device is a Tesla device using the TCC driver, 0 otherwise */
   int    asyncEngineCount;           /**< Number of asynchronous engines */
   int    unifiedAddressing;          /**< Device shares a unified address space with the host */
   int    memoryClockRate;            /**< Peak memory clock frequency, in kilohertz */
   int    memoryBusWidth;             /**< Global memory bus width, in bits */
   int    l2CacheSize;                /**< Size of the L2 cache, in bytes */
   int    maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
};

设置主设备端

cudaSetDevice(device);// select `device` as the device used by the calling host thread

这个函数必须要在调用 __global__ 函数或者 Runtime 的其他 API 之前调用才能生效。如果没有调用 cudaSetDevice()，device 0 就会被设置为默认的设备，接下来如果再调用 cudaSetDevice() 函数也不会有效果。

递归核函数

这里写图片描述

// Recursive kernel demonstrating CUDA Dynamic Parallelism.
//
// Each block takes a unique ID from the global counter g_uids, prints one line
// about itself via print_info, and then - unless the next nesting level would
// reach max_depth - every thread of the block launches a child grid of this
// same kernel with the same grid and block dimensions.
//
// max_depth  - recursion stops once depth + 1 >= max_depth
// depth      - nesting level of this grid (0 is reported as launched by the host)
// thread     - threadIdx.x of the parent thread that launched this grid
// parent_uid - unique ID of the block that launched this grid
__global__ void cdp_kernel(int max_depth, int depth, int thread, int parent_uid)
{
    // One ID per block, visible to every thread of the block.
    __shared__ int s_uid;

    const bool is_first_thread = (threadIdx.x == 0);
    if (is_first_thread)
    {
        // Atomic because blocks all update the same counter; the value
        // returned by atomicAdd (the count before the add) becomes the ID.
        s_uid = atomicAdd(&g_uids, 1);
    }

    // Make the ID written by thread 0 visible before any thread reads it.
    __syncthreads();

    // Report this block and its parent (device function).
    print_info(depth, thread, s_uid, parent_uid);

    // Recurse only while the child level is still below max_depth.
    const int child_depth = depth + 1;
    if (child_depth < max_depth)
    {
        cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, child_depth, threadIdx.x, s_uid);
    }
}

函数功能：不断开辟新线程递归调用同一个核函数，直到满足结束条件。每一个线程都会调用打印函数，但只有线程号为 0 的线程才会把信息打印到命令行中显示。

设备端函数

// Prints one line describing the calling block; only thread 0 of the block
// prints, and all threads of the block meet at a barrier before returning.
//
// depth      - nesting level of the block; 0 prints the "launched by the host"
//              form, any other value prints the indented "launched by thread"
//              form with one "|  " marker per level
// thread     - threadIdx.x of the parent thread that launched this block
// uid        - unique ID of this block
// parent_uid - unique ID of the parent block
//
// The indentation is clamped to [0, 24] levels so the prefix always fits in
// the local buffer. The previous fixed 32-byte buffer was overrun as soon as
// depth reached 11, and a negative depth wrote before its start.
__device__ void print_info(int depth, int thread, int uid, int parent_uid)
{
    // Each nesting level contributes the three characters "|  ".
    const int kMarkerWidth = 3;
    // 24 is the maximum nesting depth documented for CUDA Dynamic Parallelism.
    const int kMaxIndentLevels = 24;

    if (threadIdx.x == 0) // only thread 0 prints
    {
        if (depth == 0)
        {
            printf("BLOCK %d launched by the host\n", uid);
        }
        else
        {
            char buffer[kMarkerWidth * kMaxIndentLevels + 1];

            // Clamp so that neither the loop nor the terminator can leave the buffer.
            int levels = depth;
            if (levels < 0)
            {
                levels = 0;
            }
            if (levels > kMaxIndentLevels)
            {
                levels = kMaxIndentLevels;
            }

            // Build the indentation prefix marking a device-side launch.
            for (int i = 0; i < levels; ++i)
            {
                buffer[kMarkerWidth * i + 0] = '|';
                buffer[kMarkerWidth * i + 1] = ' ';
                buffer[kMarkerWidth * i + 2] = ' ';
            }
            buffer[kMarkerWidth * levels] = '\0';

            printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, uid, thread, parent_uid);
        }
    }

    // Reached by every thread of the block, outside the divergent branch.
    __syncthreads();
}

运行结果

starting Simple Print (CUDA Dynamic Parallelism)
Running on GPU 0 (GeForce GTX 980)
***************************************************************************
The CPU launches 2 blocks of 2 threads each. On the device each thread will
launch 2 blocks of 2 threads each. The GPU we will do that recursively
until it reaches max_depth=2
In total 2+8=10 blocks are launched!!! (8 from the GPU)
***************************************************************************
Launching cdp_kernel() with CUDA Dynamic Parallelism:
BLOCK 1 launched by the host
BLOCK 0 launched by the host
| BLOCK 4 launched by thread 0 of block 1
| BLOCK 5 launched by thread 0 of block 1
| BLOCK 2 launched by thread 0 of block 0
| BLOCK 3 launched by thread 0 of block 0
| BLOCK 6 launched by thread 1 of block 1
| BLOCK 7 launched by thread 1 of block 1
| BLOCK 8 launched by thread 1 of block 0
| BLOCK 9 launched by thread 1 of block 0
请按任意键继续…

由于不同线程运行结束时间不同，会导致其中打印顺序可能会不一致。

结论

gpu程序支持递归，支持核函数打印，原子操作可避免冲突。

End

0 0