多线程CUDA实例 167页 平方求和

来源:互联网 发布:软件维护不能用 编辑:程序博客网 时间:2024/05/01 13:41
// GPGPU Programming Techniques: From GLSL and CUDA to OpenCL
// Sum of squares, second version (book listing, page 167): adds in-kernel
// timing and splits the work across the threads of a single block.
//
// Flow: the host fills anData with random digits, copies it to the device,
// launches one block of THREAD_NUM threads that each sum the squares of one
// contiguous chunk, copies the THREAD_NUM partial sums back and adds them up
// on the CPU.

#include <stdio.h>
#include <stdlib.h>                   // rand, system, exit
#include <iostream>
#include <cuda_runtime.h>             // For the CUDA runtime routines (prefixed with "cuda_")
#include <device_launch_parameters.h> // Author's note: header found by searching; other spellings may exist
#include <time.h>                     // clock_t

#define DATA_SIZE 1048576   // number of int elements (4 MB of data)
#define THREAD_NUM 256      // number of threads in the single block

using namespace std;

// Host-side input buffer.
int anData[DATA_SIZE];

// Aborts with a readable message when a CUDA runtime call fails, instead of
// letting a later call fail for an unrelated-looking reason.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cudaCheckErr_));                    \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Fills pnNumber[0 .. nSize-1] with pseudo-random values in the range 0..9.
void GenerateNumber(int *pnNumber, int nSize)
{
    for (int i = 0; i < nSize; i++)
    {
        pnNumber[i] = rand() % 10;
    }
}

// Kernel: per-thread partial sums of squares.
//
// Expected launch: <<<1, THREAD_NUM>>> (indexing uses threadIdx.x only).
//   pnNum        - device array of DATA_SIZE ints (input)
//   pnResult     - device array of THREAD_NUM ints; slot tid receives the
//                  partial sum computed by thread tid
//   pclock_ttime - device clock_t; thread 0 stores the clock() ticks elapsed
//                  between its own start and the end of its own loop
__global__ static void sumofSquares(int *pnNum, int *pnResult, clock_t *pclock_ttime)
{
    const int tid = threadIdx.x;

    // Elements handled by one thread, rounded up so every element is covered
    // even when DATA_SIZE is not a multiple of THREAD_NUM.
    const int nSize = (DATA_SIZE + THREAD_NUM - 1) / THREAD_NUM;

    // Timing starts: only thread 0 samples the clock.
    clock_t clock_tstart = 0;
    if (tid == 0)
    {
        clock_tstart = clock();
    }

    // Contiguous chunk [begin, end) owned by this thread. The end is clamped
    // to DATA_SIZE so the rounded-up chunk size never reads past the array.
    const int begin = nSize * tid;
    int end = begin + nSize;
    if (end > DATA_SIZE)
    {
        end = DATA_SIZE;
    }

    int nSum = 0;
    for (int i = begin; i < end; i++)
    {
        nSum += pnNum[i] * pnNum[i];
    }
    pnResult[tid] = nSum;

    // Timing ends: thread 0 publishes its elapsed tick count.
    if (tid == 0)
    {
        *pclock_ttime = clock() - clock_tstart;
    }
}

// Generates the data, runs the kernel, reduces the partial sums on the CPU and
// prints the total together with the tick count reported by the kernel.
int main()
{
    GenerateNumber(anData, DATA_SIZE);

    int *pnGpuData = NULL;        // device copy of anData
    int *pnResult = NULL;         // device array of per-thread partial sums
    clock_t *pclock_ttime = NULL; // device slot for the elapsed ticks
    int *nSummat = NULL;          // pinned host buffer for the partial sums

    CUDA_CHECK(cudaMallocHost((void**)&nSummat, sizeof(int) * THREAD_NUM));
    CUDA_CHECK(cudaMalloc((void**)&pnGpuData, sizeof(int) * DATA_SIZE));
    CUDA_CHECK(cudaMalloc((void**)&pnResult, sizeof(int) * THREAD_NUM));
    CUDA_CHECK(cudaMalloc((void**)&pclock_ttime, sizeof(clock_t)));

    CUDA_CHECK(cudaMemcpy(pnGpuData, anData, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice));

    sumofSquares<<<1, THREAD_NUM, 0>>>(pnGpuData, pnResult, pclock_ttime);
    CUDA_CHECK(cudaGetLastError()); // a launch reports configuration errors only through this call

    // Blocking copies on the default stream: they complete after the kernel.
    CUDA_CHECK(cudaMemcpy(nSummat, pnResult, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost));

    clock_t pclocksum = 0;
    CUDA_CHECK(cudaMemcpy(&pclocksum, pclock_ttime, sizeof(clock_t), cudaMemcpyDeviceToHost));

    // Final reduction of the per-thread partial sums on the CPU.
    int finishsum = 0;
    for (int i = 0; i < THREAD_NUM; i++)
    {
        finishsum += nSummat[i];
    }

    // clock_t's width differs between platforms; print it through long long.
    printf("SuM = %d    Time = %lld\n", finishsum, (long long)pclocksum);

    CUDA_CHECK(cudaFree(pnGpuData));
    CUDA_CHECK(cudaFree(pnResult));
    CUDA_CHECK(cudaFree(pclock_ttime));
    CUDA_CHECK(cudaFreeHost(nSummat)); // pinned host memory needs cudaFreeHost, not cudaFree

    system("pause");
    return 0;
}

运行结果:
SuM = 29887816    Time = 15721410
请按任意键继续…

0 0
原创粉丝点击