矢量相加:将并行线程块改为使用并行线程

来源:互联网 发布:网络信息安全 书 编辑:程序博客网 时间:2024/05/02 22:31

与上一篇的代码略有差异:本篇只启动一个线程块,用块内的 N 个并行线程(以 threadIdx.x 作为索引)来完成矢量相加。

// Vector addition on the GPU using N parallel threads inside ONE thread block.
// Each thread adds a single pair of elements, selected by its thread index.

#include <cuda_runtime_api.h>

#include <cstdio>
#include <cstdlib>
#include <iostream>

using namespace std;

// Number of elements in each vector; also the number of threads launched.
const int N = 10;

// Abort with a readable message if a CUDA runtime call fails. Without this,
// a failed allocation/copy/launch goes unnoticed and the program prints
// whatever happens to be in the host buffer.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,       \
                         __LINE__, cudaGetErrorString(err_));              \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Element-wise sum: c[i] = a[i] + b[i] for i in [0, N).
//
// Launch layout: a single 1-D block with at least N threads
// (add<<<1, N>>>). The element index is threadIdx.x only, so launching
// more than one block would make every block redo the same N elements.
// a, b, c must be device pointers to at least N ints each.
__global__ void add(const int *a, const int *b, int *c)
{
    int tid = threadIdx.x;  // the thread index selects the element to process
    if (tid < N)            // guard in case the block has more than N threads
    {
        c[tid] = a[tid] + b[tid];
    }
}

// Fills two host vectors, adds them on the GPU and prints the result.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = sizeof(int) * N;

    // Allocate the three vectors on the GPU.
    CUDA_CHECK(cudaMalloc((void **)(&dev_a), bytes));
    CUDA_CHECK(cudaMalloc((void **)(&dev_b), bytes));
    CUDA_CHECK(cudaMalloc((void **)(&dev_c), bytes));

    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i * i;
    }

    // Copy the inputs a and b to the GPU. dev_c is output-only: the kernel
    // writes every one of its N elements, so the uninitialised host array c
    // is deliberately NOT uploaded.
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    add<<<1, N>>>(dev_a, dev_b, dev_c);  // one block of N threads
    CUDA_CHECK(cudaGetLastError());      // catches invalid launch configurations

    // Copy the result back to the host. This blocking copy also waits for
    // the kernel to finish and reports any error raised while it ran.
    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++)
    {
        cout<<"a["<<i<<"]"<<" + b["<<i<<"] = "<<c[i]<<"\n";
    }

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));

    return 0;
}
0 0