CUDA Notes 2 - Loop Unrolling


CUDA Loop Unrolling



Serial Loop Unrolling



Loop unrolling is an optimization that trades code size for execution speed. Take an array as an example: its elements are stored contiguously in memory, so by handling several adjacent elements in each iteration the loop can fetch neighboring data together and spend less time reading from memory, while also executing fewer loop-control branches. For example:
 
Assume n is divisible by 4.
 
 
Not unrolled:

```c
for (int i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
}
```

Unrolled by a factor of 4:

```c
for (int i = 0; i < n; i += 4) {
    c[i]     = a[i]     + b[i];
    c[i + 1] = a[i + 1] + b[i + 1];
    c[i + 2] = a[i + 2] + b[i + 2];
    c[i + 3] = a[i + 3] + b[i + 3];
}
```
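As a quick sanity check, the minimal harness below (the array size and fill values are placeholder assumptions, not from the original post) runs both loops and verifies that the unrolled version produces the same result as the plain one:

```c
#include <stdio.h>

int main(void)
{
    enum { n = 1024 };                      /* assumed size, divisible by 4 */
    static float a[n], b[n], c[n], c4[n];

    for (int i = 0; i < n; i++) { a[i] = (float)i; b[i] = 2.0f * i; }

    /* plain loop */
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];

    /* unrolled by a factor of 4 */
    for (int i = 0; i < n; i += 4) {
        c4[i]     = a[i]     + b[i];
        c4[i + 1] = a[i + 1] + b[i + 1];
        c4[i + 2] = a[i + 2] + b[i + 2];
        c4[i + 3] = a[i + 3] + b[i + 3];
    }

    /* compare the two results element by element */
    for (int i = 0; i < n; i++)
        if (c[i] != c4[i]) { printf("mismatch at %d\n", i); return 1; }
    printf("results match\n");
    return 0;
}
```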


 

Parallel Loop Unrolling on the GPU



On the GPU, the same loop-unrolling optimization can be applied; more importantly, unrolling reduces the total number of branches the warps execute. Below is a simple kernel that writes value into the N elements of out.


Global_write kernel, not unrolled:

```cuda
template <class T>
__global__ void Global_write(T *out, T value, size_t N)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x) {
        out[i] = value;
    }
}
```

Global_write kernel unrolled by a factor of n:

```cuda
template <class T, const int n>
__global__ void Global_write(T *out, T value, size_t N)
{
    size_t i;
    for (i = n * blockDim.x * blockIdx.x + threadIdx.x;
         i < N - n * blockDim.x * gridDim.x;
         i += n * blockDim.x * gridDim.x) {
        for (int j = 0; j < n; j++) {
            size_t index = i + j * blockDim.x;
            out[index] = value;
        }
    }
    // To keep conditionals out of the main loop, the last (possibly partial)
    // pass is written separately with a bounds check.
    for (int j = 0; j < n; j++) {
        size_t index = i + j * blockDim.x;
        if (index < N) out[index] = value;
    }
}
```
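For reference, launching the 4-way unrolled kernel might look like the sketch below. The problem size, the 256-thread blocks, and the 128-block grid are assumptions chosen so that the grid stride (4 * blockDim.x * gridDim.x) does not exceed N, which the unrolled kernel relies on:

```cuda
#include <cuda_runtime.h>

// Assumes the Global_write kernels above are defined in the same file.
int main(void)
{
    size_t N = 1 << 20;                 // assumed problem size
    float *d_out;
    cudaMalloc(&d_out, N * sizeof(float));

    // Grid-stride launch: 128 blocks of 256 threads, 4 elements per thread
    // per iteration, so the stride is 4 * 256 * 128 = 131072 <= N.
    Global_write<float, 4><<<128, 256>>>(d_out, 3.14f, N);
    cudaDeviceSynchronize();

    cudaFree(d_out);
    return 0;
}
```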






Vector Operation Example
Compute out = alpha * x + beta * y.





General form:

```cuda
__global__ void saxpyGPU(float *out, const float *x, const float *y,
                         size_t N, float alpha, float beta)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += blockDim.x * gridDim.x) {
        out[i] = alpha * x[i] + beta * y[i];
    }
}
```

Unrolled version (an alternative implementation of the same kernel):

```cuda
template <const int n>
__device__ void saxpy_unrolled(float *out, const float *px, const float *py,
                               size_t N, float alpha, float beta)
{
    float x[n], y[n];
    size_t i;
    for (i = n * blockDim.x * blockIdx.x + threadIdx.x;
         i < N - n * blockDim.x * gridDim.x;
         i += n * blockDim.x * gridDim.x) {
        // Load n elements per thread into registers...
        for (int j = 0; j < n; j++) {
            size_t index = i + j * blockDim.x;
            x[j] = px[index];
            y[j] = py[index];
        }
        // ...then compute and store them.
        for (int j = 0; j < n; j++) {
            size_t index = i + j * blockDim.x;
            out[index] = alpha * x[j] + beta * y[j];
        }
    }
    // The final pass is bounds-checked, so the main loop needs no conditional.
    for (int j = 0; j < n; j++) {
        size_t index = i + j * blockDim.x;
        if (index < N) {
            x[j] = px[index];
            y[j] = py[index];
        }
    }
    for (int j = 0; j < n; j++) {
        size_t index = i + j * blockDim.x;
        if (index < N) out[index] = alpha * x[j] + beta * y[j];
    }
}

__global__ void saxpyGPU(float *out, const float *px, const float *py,
                         size_t N, float alpha, float beta)
{
    saxpy_unrolled<4>(out, px, py, N, alpha, beta);
}
```
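A host-side driver along the following lines could exercise the unrolled saxpyGPU wrapper (assuming that variant, not the general form, is the one compiled); the problem size, launch configuration, and the test values of alpha and beta are assumptions for illustration, again keeping 4 * blockDim.x * gridDim.x <= N:

```cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Assumes the saxpy_unrolled / saxpyGPU definitions above are in the same file.
int main(void)
{
    size_t N = 1 << 20;                         // assumed problem size
    size_t bytes = N * sizeof(float);
    float alpha = 2.0f, beta = 3.0f;            // assumed test coefficients

    float *h_x = (float *)malloc(bytes);
    float *h_y = (float *)malloc(bytes);
    float *h_out = (float *)malloc(bytes);
    for (size_t i = 0; i < N; i++) { h_x[i] = 1.0f; h_y[i] = 2.0f; }

    float *d_x, *d_y, *d_out;
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_y, bytes);
    cudaMalloc(&d_out, bytes);
    cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice);

    // Grid-stride launch; 128 blocks of 256 threads is an arbitrary choice.
    saxpyGPU<<<128, 256>>>(d_out, d_x, d_y, N, alpha, beta);
    cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);

    printf("out[0] = %f (expected %f)\n", h_out[0], alpha * 1.0f + beta * 2.0f);

    cudaFree(d_x); cudaFree(d_y); cudaFree(d_out);
    free(h_x); free(h_y); free(h_out);
    return 0;
}
```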






The CUDA Handbook reports that this kind of unrolling can improve performance by roughly 10%.


[Berkeley lecture slides on parallel loop unrolling](http://www.cs.berkeley.edu/~volkov/volkov11-unrolling.pdf)