Cuda笔记1 --- 内积

来源:互联网 发布:洛氏霍克指标源码 编辑:程序博客网 时间:2024/05/01 08:25
/* Simple CUDA dot-product (inner product) example. */

#include <cstdio>
#include <cstdlib>
#include <vector>

// Number of threads per block. The kernel's shared-memory array has exactly
// this many elements, so the kernel must be launched with
// blockDim.x == THREAD_DIM. The tree reduction below halves the stride each
// step, so THREAD_DIM must also be a power of two.
const int THREAD_DIM = 256;

// Abort with a readable message when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,       \
                         __LINE__, cudaGetErrorString(err_));              \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

/*
 * Computes per-block partial sums of the dot product of d_a and d_b.
 *
 * Parameters:
 *   d_a, d_b : input vectors of n floats.
 *   d_c      : output array with one float per block (gridDim.x entries);
 *              d_c[blockIdx.x] receives that block's partial sum.
 *   n        : number of elements in d_a and d_b.
 *
 * Launch requirements: 1D grid and 1D block, blockDim.x == THREAD_DIM
 * (power of two). Uses THREAD_DIM * sizeof(float) bytes of static shared
 * memory. The caller adds up the gridDim.x partial sums to obtain the final
 * result.
 */
__global__ void dot(const float* d_a, const float* d_b, float* d_c, const int n)
{
    __shared__ float cache[THREAD_DIM];

    const int localIdx = threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Each thread accumulates the products of the elements it owns, stepping
    // by the total number of launched threads so any grid size covers all n.
    float partial = 0.0f;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < n; idx += stride) {
        partial += d_a[idx] * d_b[idx];
    }

    // Publish this thread's partial sum to the block's shared memory.
    cache[localIdx] = partial;
    __syncthreads();

    // Tree reduction within the block: halve the active range each step.
    // The barrier is outside the `if` so every thread in the block reaches it.
    for (int half = blockDim.x / 2; half != 0; half >>= 1) {
        if (localIdx < half) {
            cache[localIdx] += cache[localIdx + half];
        }
        __syncthreads();
    }

    // One thread per block writes out the block's sum.
    if (localIdx == 0) {
        d_c[blockIdx.x] = cache[0];
    }
}

/*
 * Host driver: fills two vectors with known values, runs the kernel, then
 * adds the per-block partial sums on the CPU and checks the result.
 * Returns 0 when the result matches the expected value, 1 otherwise.
 */
int main()
{
    const int n = 1 << 20;  // example problem size
    const size_t vecBytes = static_cast<size_t>(n) * sizeof(float);

    dim3 blockPergrid((n + THREAD_DIM - 1) / THREAD_DIM, 1);  // ceil-div
    dim3 threadPerblock(THREAD_DIM, 1);
    const size_t partialBytes = static_cast<size_t>(blockPergrid.x) * sizeof(float);

    // a[i] = 1, b[i] = 2, so the exact dot product is 2 * n.
    std::vector<float> h_a(n, 1.0f);
    std::vector<float> h_b(n, 2.0f);
    std::vector<float> h_c(blockPergrid.x, 0.0f);

    float* d_a = NULL;
    float* d_b = NULL;
    float* d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, vecBytes));
    CUDA_CHECK(cudaMalloc(&d_b, vecBytes));
    CUDA_CHECK(cudaMalloc(&d_c, partialBytes));

    CUDA_CHECK(cudaMemcpy(d_a, h_a.data(), vecBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b.data(), vecBytes, cudaMemcpyHostToDevice));

    dot<<<blockPergrid, threadPerblock>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());  // catches invalid launch configurations

    // Blocking copy: also waits for the kernel to finish and surfaces any
    // error raised while it was running.
    CUDA_CHECK(cudaMemcpy(h_c.data(), d_c, partialBytes, cudaMemcpyDeviceToHost));

    // Collect each block's sum and finish the reduction on the CPU.
    double result = 0.0;
    for (unsigned int b = 0; b < blockPergrid.x; ++b) {
        result += h_c[b];
    }

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    const double expected = 2.0 * n;
    double diff = result - expected;
    if (diff < 0.0) {
        diff = -diff;
    }
    std::printf("dot = %f (expected %f)\n", result, expected);

    return (diff <= 1e-6 * expected) ? 0 : 1;
}


0 0
原创粉丝点击