CUDA编程指南(第五章)

来源:互联网 发布:算法的概念简单例题 编辑:程序博客网 时间:2024/04/30 04:13

CUDA

第五章 线程网络、线程块以及线程

GPU上需要将问题分解得比CPU上更加细致。
GPU通过创建一个内核函数来对循环并行化,内核函数只能在GPU上执行。例如:

/*
 * Element-wise multiply: a[i] = b[i] * c[i], one array element per thread.
 * The output pointer must not be const-qualified (the original snippet
 * wrote through a const pointer), and the element index must come from
 * the thread's built-in coordinates rather than an undefined variable.
 * Valid for a launch of a single 1-D block.
 */
__global__ void some_kernel_func(int *const a, const int *const b,
                                 const int *const c)
{
    const unsigned int i = threadIdx.x;  /* one element per thread */
    a[i] = b[i] * c[i];
}

上面的代码中用__global__前缀来表征这个函数编译生成的是GPU代码而不是CPU代码,并且这个代码是全局可见的。
我们常使用的一个结构体是threadIdx

/*
 * Element-wise multiply using the built-in threadIdx structure to select
 * each thread's array element.  Kernels must be declared `void` (the
 * original omitted the return type) and the output array `a` must be
 * writable, so it cannot point to const int.
 * Valid for a launch of a single 1-D block.
 */
__global__ void some_kernel_func(int *a, const int *b, const int *c)
{
    /* threadIdx.x is this thread's offset within its block. */
    const unsigned int thread_idx = threadIdx.x;
    a[thread_idx] = b[thread_idx] * c[thread_idx];
}

硬件

GPU每个线程组被送到SM中,然后N个SP开始执行代码,在得到了线程标号之后就立即从数组b和c中各取一个数然后进行乘法计算。但是在取数的这个过程中就已经过去了400到600个GPU时钟周期,这期间N个线程都要被挂起。另外,每32个线程被称为一个线程束。

CUDA内核

调用内核函数的语法是:

kernel_function<<<num_blocks, num_threads>>>(param1,param2,...)

num_threads表示每个线程块中执行内核函数的线程数量;num_blocks表示线程块的数量。
如果num_blocks不为1,例如核函数调用为some_kernel_func<<<2,64>>>(a,b,c),那么上面代码中计算线程索引thread_idx的方式就需要改变:

/*
 * Element-wise multiply for a multi-block launch (e.g. <<<2, 64>>>):
 * the global thread index combines the block's offset into the data
 * (blockIdx.x * blockDim.x) with the thread's offset inside the block
 * (threadIdx.x).  The output array `a` must be writable, so it cannot
 * be declared const int* as in the original.
 */
__global__ void some_kernel_func(int *a, const int *b, const int *c)
{
    const unsigned int thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
    a[thread_idx] = b[thread_idx] * c[thread_idx];
}

下面是一个简单的例程

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdio.h>

/* Abort with file/line and error string if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

/*
 * Record, for every thread in the launch, its block index, its thread
 * index within the block, its warp index within the block, and its
 * flattened global thread index.
 * Expects a 1-D grid of 1-D blocks; each output array must hold at least
 * gridDim.x * blockDim.x elements.
 */
__global__ void what_is_my_id(unsigned int *const block,
                              unsigned int *const thread,
                              unsigned int *const warp,
                              unsigned int *const calc_thread)
{
    /* Thread id is block index * block size + thread offset into the block. */
    const unsigned int thread_idx = (blockIdx.x * blockDim.x) + threadIdx.x;

    block[thread_idx] = blockIdx.x;
    thread[thread_idx] = threadIdx.x;
    /* Calculate the warp using the built-in variable warpSize. */
    warp[thread_idx] = threadIdx.x / warpSize;
    calc_thread[thread_idx] = thread_idx;
}

#define ARRAY_SIZE 128
#define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int) * (ARRAY_SIZE))

/* Host-side copies of the per-thread id data produced by the kernel. */
unsigned int cpu_block[ARRAY_SIZE];
unsigned int cpu_thread[ARRAY_SIZE];
unsigned int cpu_warp[ARRAY_SIZE];
unsigned int cpu_calc_thread[ARRAY_SIZE];

int main(void)
{
    /* 2 blocks * 64 threads == ARRAY_SIZE, so every array slot is written
       exactly once and the kernel needs no bounds guard here. */
    const unsigned int num_blocks = 2;
    const unsigned int num_threads = 64;

    unsigned int *gpu_block;
    unsigned int *gpu_thread;
    unsigned int *gpu_warp;
    unsigned int *gpu_calc_thread;
    unsigned int i;

    CUDA_CHECK(cudaMalloc((void **)&gpu_block, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_thread, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_warp, ARRAY_SIZE_IN_BYTES));
    CUDA_CHECK(cudaMalloc((void **)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES));

    what_is_my_id<<<num_blocks, num_threads>>>(gpu_block, gpu_thread,
                                               gpu_warp, gpu_calc_thread);
    /* Launch-configuration errors are only visible via cudaGetLastError(). */
    CUDA_CHECK(cudaGetLastError());

    /* cudaMemcpy is blocking, so it also surfaces any kernel execution error. */
    CUDA_CHECK(cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(gpu_block));
    CUDA_CHECK(cudaFree(gpu_thread));
    CUDA_CHECK(cudaFree(gpu_warp));
    CUDA_CHECK(cudaFree(gpu_calc_thread));

    for (i = 0; i < ARRAY_SIZE; i++)
    {
        printf("calculated thread: %3u - block: %2u - warp: %2u - thread: %2u\n",
               cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);
    }

    /* Portable replacement for the original conio.h getch() pause. */
    (void)getchar();
    return 0;
}

线程网格

一般而言,每个线程块上192是可以考虑的最小线程数目。
线程块上的二维数组需要使用两个线程索引
const unsigned int idx = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int idy = (blockIdx.y * blockDim.y) + threadIdx.y;
同一个线程块中的线程可以通过共享内存进行通信,这样速度比较快。

线程0~15,线程块0 线程16~31,线程块0 … … 线程0~15,线程块1 线程16~31,线程块1 … … 线程0~15,线程块4 线程16~31,线程块4
0 0