Comparing Asynchronous, Direct, and Streamed Memory Copies in CUDA


#include<iostream>

#include<cuda.h>
#include<cuda_runtime.h>


using namespace std;


// Grid-stride saxpy, unrolled by a factor of n: each thread loads n x/y pairs into
// registers and then writes n results. The index<N guards handle the tail of the array,
// so the slightly generous loop bound only costs one wasted (fully guarded) iteration.
template<const int n>
__device__ void saxpy_unrolled(
    float *out,
    const float *px,
    const float *py,
    size_t N,
    float alpha)
{
    float x[n], y[n];
    size_t i;
    for (i = n*blockIdx.x*blockDim.x + threadIdx.x;
         i < N + n*blockDim.x*gridDim.x;
         i += n*blockDim.x*gridDim.x)
    {
        for (int j = 0; j < n; j++)
        {
            size_t index = i + j*blockDim.x;
            if (index < N)
            {
                x[j] = px[index];
                y[j] = py[index];
            }
        }
        for (int j = 0; j < n; j++)
        {
            size_t index = i + j*blockDim.x;
            if (index < N) out[index] = alpha*x[j] + y[j];
        }
    }
}


// Host-callable wrapper, instantiated with an unroll factor of 4.
__global__ void saxpyGPU(float *out, const float *px, const float *py,
                         size_t N, float alpha)
{
    saxpy_unrolled<4>(out, px, py, N, alpha);
}
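

// A minimal CPU reference for verification (illustrative helper; the name saxpyCPU is
// ours and is not used by the benchmark below). After any of the GPU runs, the results
// could be spot-checked by comparing fabsf(hptrOut[i] - ref[i]) element by element.
static void saxpyCPU(float *out, const float *px, const float *py, size_t N, float alpha)
{
    for (size_t i = 0; i < N; i++)
        out[i] = alpha*px[i] + py[i];
}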




int main(void)
{
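// Added as a safeguard for the zero-copy section below: on devices where host-memory
// mapping is not enabled by default, cudaSetDeviceFlags(cudaDeviceMapHost) must be
// called before any other CUDA call creates the context. On 64-bit platforms with
// unified virtual addressing, pinned allocations are mapped automatically and this
// call is harmless.
cudaSetDeviceFlags(cudaDeviceMapHost);
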
const size_t N=52428800;   // 52,428,800 floats = 200 MB per buffer
float *dptrX,*hptrX,*dptrY,*hptrY,*dptrOut,*hptrOut;
float alpha=0.5f;


//Allocate pageable host buffers (operator new) and device buffers (cudaMalloc)
hptrX=new float[N];
hptrY=new float[N];
hptrOut=new float[N];
cudaMalloc((void**)&dptrX,N*sizeof(float));
cudaMalloc((void**)&dptrY,N*sizeof(float));
cudaMalloc((void**)&dptrOut,N*sizeof(float));
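// Note: hptrX/hptrY/hptrOut come from operator new, i.e. pageable host memory. Copies
// from pageable memory are staged through an internal pinned buffer, and cudaMemcpyAsync
// on pageable buffers does not overlap with kernel execution; this is what the
// cudaHostAlloc (pinned) section further down is meant to contrast with.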


//Fill the inputs with simple test values
for(size_t i=0;i<N;i++)
{
hptrX[i]=5.6;
hptrY[i]=6.5;
}


//Launch configuration
int nBlocks=2048,nThreads=256;
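// With an unroll factor of 4, one grid-stride iteration of saxpyGPU covers
// 4 * 256 * 2048 = 2,097,152 elements, so N = 52,428,800 elements takes exactly
// 25 grid-stride iterations per thread.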


//Timing events for the synchronous and asynchronous runs
cudaEvent_t start_sync,HtoD_sync,kernel_sync,DtoH_sync,stop_sync,
start_Async,HtoD_Async,kernel_Async,DtoH_Async,stop_Async;


//Elapsed times in milliseconds
float mcpy_HtoD_sync,kernelTime_sync,mcpy_DtoH_sync,total_sync,
mcpy_HtoD_Async,kernelTime_Async,mcpy_DtoH_Async,total_Async;


//Create the events
cudaEventCreate(&start_sync);
cudaEventCreate(&HtoD_sync);
cudaEventCreate(&kernel_sync);
cudaEventCreate(&DtoH_sync);
cudaEventCreate(&stop_sync);
cudaEventCreate(&start_Async);
cudaEventCreate(&HtoD_Async);
cudaEventCreate(&kernel_Async);
cudaEventCreate(&DtoH_Async);
cudaEventCreate(&stop_Async);


//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Synchronous path: blocking cudaMemcpy plus kernel, bracketed by events
cudaEventRecord(start_sync,0);
cudaMemcpy(dptrX,hptrX,N*sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(dptrY,hptrY,N*sizeof(float),cudaMemcpyHostToDevice);
cudaEventRecord(HtoD_sync,0);
saxpyGPU<<<nBlocks,nThreads>>>(dptrOut,dptrX,dptrY,N,alpha);
cudaEventRecord(kernel_sync,0);
cudaMemcpy(hptrOut,dptrOut,N*sizeof(float),cudaMemcpyDeviceToHost);
cudaEventRecord(DtoH_sync,0);
cudaEventRecord(stop_sync,0);
cudaDeviceSynchronize();
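// cudaEventRecord only enqueues the events; the cudaDeviceSynchronize above ensures they
// have all completed before cudaEventElapsedTime is called (it returns cudaErrorNotReady
// for events that have not finished yet).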


//Elapsed times for the synchronous path
cudaEventElapsedTime(&mcpy_HtoD_sync,start_sync,HtoD_sync);
cudaEventElapsedTime(&kernelTime_sync,HtoD_sync,kernel_sync);
cudaEventElapsedTime(&mcpy_DtoH_sync,kernel_sync,DtoH_sync);
cudaEventElapsedTime(&total_sync,start_sync,stop_sync);


//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//"Asynchronous" path: cudaMemcpyAsync on the same pageable buffers, default stream
cudaEventRecord(start_Async,0);
cudaMemcpyAsync(dptrX,hptrX,N*sizeof(float),cudaMemcpyHostToDevice,NULL);
cudaMemcpyAsync(dptrY,hptrY,N*sizeof(float),cudaMemcpyHostToDevice,NULL);
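// Because hptrX/hptrY are still pageable, these cudaMemcpyAsync calls fall back to staged
// copies and their timings typically end up close to the plain cudaMemcpy case above; the
// call only becomes truly asynchronous with pinned host memory (see the cudaHostAlloc
// section below).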
cudaEventRecord(HtoD_Async,0);
saxpyGPU<<<nBlocks,nThreads>>>(dptrOut,dptrX,dptrY,N,alpha);
cudaEventRecord(kernel_Async,0);
cudaMemcpyAsync(hptrOut,dptrOut,N*sizeof(float),cudaMemcpyDeviceToHost,NULL);
cudaEventRecord(DtoH_Async,0);
cudaEventRecord(stop_Async,0);
cudaDeviceSynchronize();


//Elapsed times for the cudaMemcpyAsync path
cudaEventElapsedTime(&mcpy_HtoD_Async,start_Async,HtoD_Async);
cudaEventElapsedTime(&kernelTime_Async,HtoD_Async,kernel_Async);
cudaEventElapsedTime(&mcpy_DtoH_Async,kernel_Async,DtoH_Async);
cudaEventElapsedTime(&total_Async,start_Async,stop_Async);






//Events and elapsed time for the streamed version
cudaEvent_t stream_start,stream_stop;
float total_stream;


//Create nStream streams; each stream handles a contiguous chunk of N/nStream elements
const int nStream=10;
cudaStream_t streams[nStream];
size_t streamStep=N/nStream;   // assumes N is divisible by nStream
for(int i=0;i<nStream;i++)
    cudaStreamCreate(&streams[i]);




//Create the stream-timing events
cudaEventCreate(&stream_start);
cudaEventCreate(&stream_stop);


//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Record the start, then issue all H2D copies, all kernels, and all D2H copies, one chunk per stream
cudaEventRecord(stream_start,0);
for(int iStream=0;iStream<nStream;iStream++)
{
    cudaMemcpyAsync(dptrX+iStream*streamStep,
                    hptrX+iStream*streamStep,
                    streamStep*sizeof(float),
                    cudaMemcpyHostToDevice,
                    streams[iStream]);
    cudaMemcpyAsync(dptrY+iStream*streamStep,
                    hptrY+iStream*streamStep,
                    streamStep*sizeof(float),
                    cudaMemcpyHostToDevice,
                    streams[iStream]);
}


for(int iStream=0;iStream<nStream;iStream++)
{
    saxpyGPU<<<nBlocks,nThreads,0,streams[iStream]>>>(
        dptrOut+iStream*streamStep,
        dptrX+iStream*streamStep,
        dptrY+iStream*streamStep,
        streamStep,
        alpha);
}


for(int iStream=0;iStream<nStream;iStream++)
{
    // Copy each result chunk back to the host (device -> host).
    cudaMemcpyAsync(hptrOut+iStream*streamStep,
                    dptrOut+iStream*streamStep,
                    streamStep*sizeof(float),
                    cudaMemcpyDeviceToHost,
                    streams[iStream]);
}
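// Issue order here is breadth-first: all H2D copies, then all kernels, then all D2H
// copies. Real copy/kernel overlap across the streams additionally requires pinned host
// memory (these are still pageable buffers) and a GPU with at least one copy engine, so
// on this code path the chunks still execute largely back to back.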


cudaEventRecord(stream_stop,0);
cudaDeviceSynchronize();

//Total time for the streamed version
cudaEventElapsedTime(&total_stream,stream_start,stream_stop);




//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Same asynchronous pattern, but with pinned (page-locked) host memory
float *host_alloc_X,*host_alloc_Y,*host_alloc_Out;
cudaEvent_t host_alloc_start,host_alloc_HtoD,host_alloc_DtoH,
host_alloc_kernel,host_alloc_stop;
float host_alloc_time_HtoD,host_alloc_time_Kernel,
host_alloc_time_DtoH,host_alloc_time_Total;
cudaHostAlloc((void**)&host_alloc_X,N*sizeof(float),cudaHostAllocDefault);
cudaHostAlloc((void**)&host_alloc_Y,N*sizeof(float),cudaHostAllocDefault);
cudaHostAlloc((void**)&host_alloc_Out,N*sizeof(float),cudaHostAllocDefault);
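// cudaHostAlloc returns page-locked (pinned) host memory that the GPU can DMA directly,
// so cudaMemcpyAsync on these buffers returns control to the host immediately and the
// transfer can genuinely overlap with kernel execution in other streams.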
for(size_t i=0;i<N;i++)
{
host_alloc_X[i]=5.6;
host_alloc_Y[i]=6.5;
}
cudaEventCreate(&host_alloc_start);
cudaEventCreate(&host_alloc_HtoD);
cudaEventCreate(&host_alloc_DtoH);
cudaEventCreate(&host_alloc_kernel);
cudaEventCreate(&host_alloc_stop);
cudaEventRecord(host_alloc_start,0);
cudaMemcpyAsync(dptrX,host_alloc_X,N*sizeof(float),cudaMemcpyHostToDevice,NULL);
cudaMemcpyAsync(dptrY,host_alloc_Y,N*sizeof(float),cudaMemcpyHostToDevice,NULL);
cudaEventRecord(host_alloc_HtoD,0);
saxpyGPU<<<nBlocks,nThreads>>>(dptrOut,dptrX,dptrY,N,alpha);
cudaEventRecord(host_alloc_kernel,0);
cudaMemcpyAsync(host_alloc_Out,dptrOut,N*sizeof(float),cudaMemcpyDeviceToHost,NULL);
cudaEventRecord(host_alloc_DtoH,0);
cudaEventRecord(host_alloc_stop,0);


cudaDeviceSynchronize();


cudaEventElapsedTime(&host_alloc_time_HtoD,host_alloc_start,host_alloc_HtoD);
cudaEventElapsedTime(&host_alloc_time_Kernel,host_alloc_HtoD,host_alloc_kernel);
cudaEventElapsedTime(&host_alloc_time_DtoH,host_alloc_kernel,host_alloc_DtoH);
cudaEventElapsedTime(&host_alloc_time_Total,host_alloc_start,host_alloc_stop);
cudaFreeHost(host_alloc_X);
cudaFreeHost(host_alloc_Y);
cudaFreeHost(host_alloc_Out);



//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Zero-copy: the kernel reads and writes mapped host memory directly, no explicit copies
cudaEvent_t zero_start,zero_stop;
float zero_time;
float *hostX,*hostY,*hostOut,*zeroX,*zeroY,*zeroOut;
cudaHostAlloc((void**)&hostX,N*sizeof(float),
              cudaHostAllocWriteCombined|cudaHostAllocMapped);
cudaHostAlloc((void**)&hostY,N*sizeof(float),
              cudaHostAllocWriteCombined|cudaHostAllocMapped);
cudaHostAlloc((void**)&hostOut,N*sizeof(float),
              cudaHostAllocWriteCombined|cudaHostAllocMapped);
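// cudaHostAllocMapped makes each allocation visible to the GPU through a device pointer
// (zero-copy); cudaHostAllocWriteCombined gives faster transfers across the bus at the
// cost of very slow CPU reads, which suits hostX/hostY (CPU writes, GPU reads) but means
// reading results back through hostOut on the CPU would be slow.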

// NOTE: the pinned buffers from the previous section must already have been freed
// (cudaFreeHost above); otherwise pinning another three 200 MB buffers here can fail,
// and these assignments would then crash.
for(size_t i=0;i<N;i++)
{
hostX[i]=5.6;
hostY[i]=6.5;
//hostOut[i]=0.5; // no need to initialize the output; the kernel writes it through the mapped pointer
}
cudaEventCreate(&zero_start);
cudaEventCreate(&zero_stop);
cudaEventRecord(zero_start,0);
cudaHostGetDevicePointer(&zeroX,hostX,0);
cudaHostGetDevicePointer(&zeroY,hostY,0);
cudaHostGetDevicePointer(&zeroOut,hostOut,0);
saxpyGPU<<<nBlocks,nThreads>>>(zeroOut,zeroX,zeroY,N,alpha);
cudaEventRecord(zero_stop,0);

cudaDeviceSynchronize();
cudaEventElapsedTime(&zero_time,zero_start,zero_stop);
cudaFreeHost(hostX);
cudaFreeHost(hostY);
cudaFreeHost(hostOut);

//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Unified (managed) memory
cudaEvent_t unified_start,unified_copy,unified_kernel,unified_stop;
float unified_time_HtoD,unified_time_kernel,unified_time_DtoH,unified_time_total;
float *unifiedX,*unifiedY,*unifiedOut;
cudaMallocManaged(&unifiedX,N*sizeof(float));
cudaMallocManaged(&unifiedY,N*sizeof(float));
cudaMallocManaged(&unifiedOut,N*sizeof(float));
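// cudaMallocManaged returns memory that migrates between host and device on demand, so
// there are no explicit cudaMemcpy calls in this section: the "copy" times measured
// below are the CPU loops that populate and read back the managed buffers.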
cudaEventCreate(&unified_start);
cudaEventCreate(&unified_copy);
cudaEventCreate(&unified_kernel);
cudaEventCreate(&unified_stop);
cudaEventRecord(unified_start,0);

for(size_t i=0;i<N;i++)
{
unifiedX[i]=hptrX[i];
unifiedY[i]=hptrY[i];
unifiedOut[i]=hptrOut[i];
}

cudaEventRecord(unified_copy,0);

saxpyGPU<<<nBlocks,nThreads>>>(unifiedOut,unifiedX,unifiedY,N,alpha);
cudaDeviceSynchronize();// required: the host must not touch managed memory until the kernel has finished (pre-Pascal GPUs fault on concurrent access)
cudaEventRecord(unified_kernel,0);
for(size_t i=0;i<N;i++)
{
hptrOut[i]=unifiedOut[i];
}
cudaEventRecord(unified_stop,0);
cudaDeviceSynchronize();
cudaEventElapsedTime(&unified_time_HtoD,unified_start,unified_copy);
cudaEventElapsedTime(&unified_time_kernel,unified_copy,unified_kernel);
cudaEventElapsedTime(&unified_time_DtoH,unified_kernel,unified_stop);
cudaEventElapsedTime(&unified_time_total,unified_start,unified_stop);

//++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//Report: synchronous cudaMemcpy path
cout<<"cudaMemcpy processing..."<<endl;
cout<<"Memcpy(host->device):"<<mcpy_HtoD_sync<<"ms"<<endl;
cout<<"Kernel processing:"<<kernelTime_sync<<"ms"<<endl;
cout<<"Memcpy(device->host):"<<mcpy_DtoH_sync<<"ms"<<endl;
cout<<"Total time:"<<total_sync<<"ms"<<endl;


//Report: cudaMemcpyAsync (pageable) path
cout<<endl<<"cudaMemcpyAsync processing..."<<endl;
cout<<"Memcpy(host->device):"<<mcpy_HtoD_Async<<"ms"<<endl;
cout<<"Kernel processing:"<<kernelTime_Async<<"ms"<<endl;
cout<<"Memcpy(device->host):"<<mcpy_DtoH_Async<<"ms"<<endl;
cout<<"Total time:"<<total_Async<<"ms"<<endl;


//Report: streamed path
cout<<endl<<"Stream processing..."<<endl;
cout<<"Total time:"<<total_stream<<"ms"<<endl;


//Report: pinned (page-locked) memory path
cout<<endl<<"cudaHostAlloc processing..."<<endl;
cout<<"Memcpy(host->device):"<<host_alloc_time_HtoD<<"ms"<<endl;
cout<<"Kernel processing:"<<host_alloc_time_Kernel<<"ms"<<endl;
cout<<"Memcpy(device->host):"<<host_alloc_time_DtoH<<"ms"<<endl;
cout<<"Total time:"<<host_alloc_time_Total<<"ms"<<endl;


//Report: zero-copy path (a single kernel time; there are no explicit copies)
cout<<endl<<"zeroCopy processing..."<<endl;
cout<<"Total time:"<<zero_time<<"ms"<<endl;


//Report: unified memory path (the "copy" phases are the host loops over the managed buffers)
cout<<endl<<"unified memory processing..."<<endl;
cout<<"Host writes to managed memory:"<<unified_time_HtoD<<"ms"<<endl;
cout<<"Kernel processing:"<<unified_time_kernel<<"ms"<<endl;
cout<<"Host reads from managed memory:"<<unified_time_DtoH<<"ms"<<endl;
cout<<"Total time:"<<unified_time_total<<"ms"<<endl;

//Release the remaining resources (event handles are left for the runtime to reclaim at exit)
for(int i=0;i<nStream;i++) cudaStreamDestroy(streams[i]);
cudaFree(dptrX); cudaFree(dptrY); cudaFree(dptrOut);
cudaFree(unifiedX); cudaFree(unifiedY); cudaFree(unifiedOut);
delete[] hptrX; delete[] hptrY; delete[] hptrOut;

return 0;
}
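

The listing above never checks return codes, and its streamed section still uses pageable host buffers, so the per-chunk copies cannot actually overlap with the kernels. The sketch below is not part of the original program (the CUDA_CHECK macro, saxpyStreamsPinned, and all buffer names are illustrative); it shows how the same per-chunk pipeline might look with pinned host memory, checked return codes, and a depth-first issue order (all operations for chunk i in stream i), which works well on GPUs with separate H2D and D2H copy engines. It reuses the saxpyGPU kernel defined above and additionally needs #include <cstdlib>.

#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA error: " << cudaGetErrorString(err_) \
                  << " (line " << __LINE__ << ")" << std::endl; \
        std::exit(EXIT_FAILURE); \
    } \
} while (0)

// One chunk per stream: the H2D copies, the kernel, and the D2H copy of chunk i are all
// issued into streams[i], so with pinned buffers the copies of one chunk can overlap with
// the kernel of another.
void saxpyStreamsPinned(size_t N, int nStream, int nBlocks, int nThreads, float alpha)
{
    size_t step = N / nStream;                       // assumes N % nStream == 0
    float *hX, *hY, *hOut, *dX, *dY, *dOut;
    CUDA_CHECK(cudaHostAlloc((void**)&hX,   N*sizeof(float), cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&hY,   N*sizeof(float), cudaHostAllocDefault));
    CUDA_CHECK(cudaHostAlloc((void**)&hOut, N*sizeof(float), cudaHostAllocDefault));
    CUDA_CHECK(cudaMalloc((void**)&dX,   N*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dY,   N*sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dOut, N*sizeof(float)));
    for (size_t i = 0; i < N; i++) { hX[i] = 5.6f; hY[i] = 6.5f; }

    cudaStream_t *streams = new cudaStream_t[nStream];
    for (int i = 0; i < nStream; i++) CUDA_CHECK(cudaStreamCreate(&streams[i]));

    for (int i = 0; i < nStream; i++) {
        size_t off = (size_t)i * step;
        CUDA_CHECK(cudaMemcpyAsync(dX+off, hX+off, step*sizeof(float),
                                   cudaMemcpyHostToDevice, streams[i]));
        CUDA_CHECK(cudaMemcpyAsync(dY+off, hY+off, step*sizeof(float),
                                   cudaMemcpyHostToDevice, streams[i]));
        saxpyGPU<<<nBlocks, nThreads, 0, streams[i]>>>(dOut+off, dX+off, dY+off, step, alpha);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpyAsync(hOut+off, dOut+off, step*sizeof(float),
                                   cudaMemcpyDeviceToHost, streams[i]));
    }
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < nStream; i++) CUDA_CHECK(cudaStreamDestroy(streams[i]));
    delete[] streams;
    cudaFree(dX); cudaFree(dY); cudaFree(dOut);
    cudaFreeHost(hX); cudaFreeHost(hY); cudaFreeHost(hOut);
}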




