测试 CUDA 的 Unified Memory 与 cudaMemcpy：加减乘除运算及内存申请的耗时对比

来源:互联网 发布:二手书知乎 编辑:程序博客网 时间:2024/05/01 03:10
#include<cstdlib>
#include<iostream>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
using namespace std;


// Number of input elements; the host and device input arrays are sized with it.
const int N=1234567;
// Width of one histogram bin: bin i counts values below sample*(i+1) that did
// not fall into an earlier bin.
const int sample=100;
// Threads per block used for every kernel launch in this file.
const int threads=256;




/*
 * judge: histogram kernel. Each thread classifies one element of `da` into a
 * bin of width `sample` and atomically increments that bin's counter.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per input element. The
 *   grid may be rounded up past N; surplus threads exit without touching
 *   memory. No shared memory is used.
 * Parameters:
 *   da   - input values; must hold at least N ints readable from the device.
 *   data - bin counters; must hold at least `sam` ints and be zeroed by the
 *          caller before launch. Counters are only ever incremented.
 *   sam  - number of bins. Values >= sample*sam are not counted anywhere.
 * Relies on the file-level constants N (element count) and sample (bin width).
 */
__global__ void judge(int *da,int *data,int sam)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // The launch uses a ceil-divided grid, so the last block contains threads
    // with no element; without this guard they would read past the end of da.
    if (tid >= N)
        return;

    const int value = da[tid];

    // Same result as scanning for the first i with value < sample*(i+1):
    // a negative value matches at i == 0, a non-negative one at value/sample.
    const int bin = (value < 0) ? 0 : value / sample;

    // Values beyond the last bin are dropped, exactly as the scan would
    // have run off the end without incrementing anything.
    if (bin < sam)
        atomicAdd(&data[bin], 1);
}




// Evaluates a CUDA runtime call and terminates the program with a readable
// message if it did not return cudaSuccess. CUDA errors are sticky, so
// continuing after a failure only produces misleading follow-up errors.
#define CUDA_CHECK(call)                                                      \
do {                                                                          \
    cudaError_t err_ = (call);                                                \
    if (err_ != cudaSuccess) {                                                \
        std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": "     \
                  << cudaGetErrorString(err_) << std::endl;                   \
        std::exit(EXIT_FAILURE);                                              \
    }                                                                         \
} while (0)

// Compares `actual` against `expected` bin by bin, logs the first few
// differing bins to stderr tagged with `label`, and returns how many bins
// differ in total.
static int reportMismatches(const int *expected, const int *actual, int count, const char *label)
{
    const int maxLogged = 10;   // keep the log readable if everything is wrong
    int bad = 0;
    for (int i = 0; i < count; i++)
    {
        if (expected[i] == actual[i])
            continue;
        if (bad < maxLogged)
            std::cerr << label << " mismatch at bin " << i << ": host=" << expected[i]
                      << " gpu=" << actual[i] << std::endl;
        bad++;
    }
    return bad;
}

/*
 * Benchmark driver: builds a histogram of N integers with the `judge` kernel,
 * once through unified (managed) memory and once through explicitly
 * allocated and copied device memory, printing the allocation time
 * (clock ticks) and kernel time (milliseconds) for each path. Both GPU
 * results are checked against a host-side reference.
 *
 * Returns 0 when both GPU histograms match the host reference, EXIT_FAILURE
 * otherwise. Any CUDA runtime error aborts the program via CUDA_CHECK.
 */
int main(void)
{
    // Create the CUDA context before any timing starts; otherwise the one-time
    // initialisation cost is charged to whichever allocation happens first.
    CUDA_CHECK(cudaFree(0));

    int *ha = NULL, *da = NULL;   // host input / managed input

    // Time the unified-memory allocation.
    clock_t a, b, c;
    a = clock();
    CUDA_CHECK(cudaMallocManaged(&da, N * sizeof(int)));
    b = clock() - a;
    cout << "unified-" << b << endl;
    ha = new int[N];

    // Initial values: element i holds i, in both the host and managed copies.
    for (int i = 0; i < N; i++)
    {
        ha[i] = i;
        da[i] = ha[i];
    }

    // Number of bins needed to cover [0, N) with bins of width `sample`.
    int it_sam = (N + sample - 1) / sample;
    int *h_data = new int[it_sam];   // host reference histogram
    int *d_data = NULL;              // managed histogram for the unified run
    CUDA_CHECK(cudaMallocManaged(&d_data, it_sam * sizeof(int)));
    for (int i = 0; i < it_sam; i++)
    {
        h_data[i] = 0;
        d_data[i] = 0;
    }

    // Host reference. Computing the bin directly gives the same result as
    // scanning for the first bin whose upper bound exceeds the value, without
    // the O(N * bins) cost.
    for (int i = 0; i < N; i++)
    {
        const int bin = (ha[i] < 0) ? 0 : ha[i] / sample;
        if (bin < it_sam)
            h_data[bin]++;
    }
    cout << "host____" << endl;

    // One thread per element, rounded up to whole blocks.
    int blocks = (N + threads - 1) / threads;

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // ---- Run 1: unified memory -------------------------------------------
    float time_unified;
    CUDA_CHECK(cudaEventRecord(start, 0));
    judge<<<blocks, threads>>>(da, d_data, it_sam);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_unified, start, stop));
    cout << "unified__" << time_unified << endl;

    // The device has been synchronised, so the managed buffer is safe to read
    // from the host.
    int mismatches = reportMismatches(h_data, d_data, it_sam, "unified");

    // ---- Run 2: explicit device memory -----------------------------------
    // The histogram gets its own zeroed device buffer so this run neither
    // touches managed memory nor accumulates on top of run 1's counts.
    int *data = NULL;
    CUDA_CHECK(cudaMalloc(&data, it_sam * sizeof(int)));
    CUDA_CHECK(cudaMemset(data, 0, it_sam * sizeof(int)));
    int *dda = NULL;

    // Time the plain device allocation for the input array.
    a = clock();
    CUDA_CHECK(cudaMalloc(&dda, N * sizeof(int)));
    c = clock() - a;
    cout << "cudaMemcpy-" << c << endl;
    // Copy from the ordinary host array so this path is independent of the
    // managed allocation.
    CUDA_CHECK(cudaMemcpy(dda, ha, N * sizeof(int), cudaMemcpyHostToDevice));

    float time_gpu;
    CUDA_CHECK(cudaEventRecord(start, 0));
    judge<<<blocks, threads>>>(dda, data, it_sam);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&time_gpu, start, stop));
    cout << "device__" << time_gpu << endl;

    // Bring the device histogram back to the host and verify it.
    int *h_result = new int[it_sam];
    CUDA_CHECK(cudaMemcpy(h_result, data, it_sam * sizeof(int), cudaMemcpyDeviceToHost));
    mismatches += reportMismatches(h_data, h_result, it_sam, "device");

    cout << "end__" << endl;

    // Release everything that was acquired above.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(dda));
    CUDA_CHECK(cudaFree(data));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(da));
    delete[] h_result;
    delete[] h_data;
    delete[] ha;

    return mismatches == 0 ? 0 : EXIT_FAILURE;
}
0 0
原创粉丝点击