【CUDA并行编程之七】数组元素之和

来源：互联网发布：佳能手机打印软件编辑：程序博客网时间：2024/05/01 14:00

现在需要求得一个数组的所有元素之和，之前感觉似乎不太可能，因为每个线程只处理一个元素，无法将所有元素联系起来，但是最近学习了一段代码可以实现，同时也对shared memory有了进一步的理解。

一、C++串行实现

串行实现的方法非常之简单，只要将所有元素依次相加就能够得到相应的结果，实际上我们注重的不是结果，而是运行的效率。那么代码如下：

array_sum.cc:

[cpp] view plaincopy
#include<iostream>  
#include<stdio.h>  
#include "kmeans.h"  
  
using namespace std;  
  
const int cnt = 100000;  
  
int main()  
{  
    int *a = new int[cnt];  
  
    for(int i=0;i<cnt;i++)  
    {  
        a[i] = i+1;   
    }  
  
    double t = wtime();  
      
    for(int i=0;i<cnt;i++)  
        sum += a[i];  
      
    printf("computation elapsed %.8f \n",wtime()-t);  
  
    return 0;  
}  

wtime.cu:

[cpp] view plaincopy
#include <sys/time.h>  
#include <stdio.h>  
#include <stdlib.h>  
  
double wtime(void)   
{  
    double          now_time;  
    struct timeval  etstart;  
    struct timezone tzp;  
  
    if (gettimeofday(&etstart, &tzp) == -1)  
        perror("Error: calling gettimeofday() not successful.\n");  
  
    now_time = ((double)etstart.tv_sec) +              /* in seconds */  
               ((double)etstart.tv_usec) / 1000000.0;  /* in microseconds */  
    return now_time;  
}  

运行结果：

二、CUDA并行实现

先上代码然后再进行解释：

[cpp] view plaincopy
#include <iostream>  
#include <stdio.h>  
#include "kmeans.h"  
  
using namespace std;  
  
const int count = 1000;  
  
void generate_data(int *arr)  
{  
    for(int i=0;i<count;i++)   
    {  
        arr[i] = i+1;  
    }  
}  
  
int nextPowerOfTwo(int n)  
{  
    n--;  
    n = n >> 1 | n;  
    n = n >> 2 | n;  
    n = n >> 4 | n;  
    n = n >> 8 | n;  
    n = n >> 16 | n;  
    //n = n >> 32 | n; //For 64-bits int   
    return ++n;  
}  
  
/* 
cnt : count  
cnt2 : next power of two of count  
*/  
__global__ static void compute_sum(int *array,int cnt , int cnt2)  
{  
    extern __shared__ unsigned int sharedMem[];  
    sharedMem[threadIdx.x] = (threadIdx.x < cnt) ? array[threadIdx.x] : 0 ;  
    __syncthreads();  
  
    //cnt2 "must" be a power of two!  
    for( unsigned int s = cnt2/2 ; s > 0 ; s>>=1 )  
    {  
        if( threadIdx.x < s )      
        {  
            sharedMem[threadIdx.x] += sharedMem[threadIdx.x + s];  
        }  
        __syncthreads();  
    }  
    if(threadIdx.x == 0)  
    {  
        array[0] = sharedMem[0];      
    }  
}  
  
  
int main()  
{  
    int *a = new int[count];  
    generate_data(a);  
  
    int *deviceArray;  
    cudaMalloc( &deviceArray,count*sizeof(int) );  
    cudaMemcpy( deviceArray,a,count*sizeof(int),cudaMemcpyHostToDevice );  
    int npt_count = nextPowerOfTwo(count);//next power of two of count  
    //cout<<"npt_count = "<<npt_count<<endl;  
    int blockSharedDataSize = npt_count * sizeof(int);  
  
    double t = wtime();  
    for(int i=0;i<count;i++)  
    {  
        compute_sum<<<1,count,blockSharedDataSize>>>(deviceArray,count,npt_count);  
    }  
    printf("computation elapsed %.8f \n",wtime()-t);  
  
    int sum ;  
    cudaMemcpy( &sum,deviceArray,sizeof(int),cudaMemcpyDeviceToHost );  
    cout<<"sum = "<<sum<<endl;  
      
    return 0;  
}  

主函数：
line58：为数组a赋初值，维度为count。

line60~62：定义device变量并分配内存，将数组a的值拷贝到显存上去。

line63：nextPowerOfTwo是非常精妙的一段代码，它计算大于等于输入参数n的第一个2的幂次数。至于为什么这么做要到kernel函数里面才能明白。

line68：compute_sum中的"1"为block的数量，"count"为每个block里面的线程数，"blockSharedDataSize"为共享内存的大小。

核函数compute_sum：

line35：定义shared memory变量。

line36：将threadIdx.x小于cnt的对应的sharedMem的内存区赋值为数组array中的值。

line39~47：这段代码的功能就是将所有值求和之后放到了shareMem[0]这个位置上。这段代码要好好体会一下，它把原本计算复杂度为O(n)的串行实现的时间效率通过并行达到了O(logn)。最后将结果保存到array[0]并拷贝回主存。

makefile：

[cpp] view plaincopy
cu:  
    nvcc cuda_array_sum.cu wtime.cu  
    ./a.out  

结果：

三、效率对比

我们通过修改count的值并且加大循环次数来观察变量的效率的差别。

代码修改：

运行时间对比：

count

串行（s）

并行（s）

1000

0.00627995

0.00345612

10000

0.29315591

0.06507015

100000

25.18921304

0.65188980

1000000

2507.66827798

5.61648989

哈哈，可见在数据量大的情况下效率还是相当不错的。

Author：忆之独秀

Email：leaguenew@qq.com

注明出处：http://blog.csdn.net/lavorange/article/details/43031419

0 0