cuda的优化技术

来源:互联网 发布:绝对领域 知乎 编辑:程序博客网 时间:2024/04/20 01:43
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];




// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// Baseline kernel: a single GPU thread serially accumulates the sum of
// squares of all data_size elements of num[] and stores the scalar in
// *result. No parallelism — exists only as a timing reference.
__global__ static void sumOfSquares(int *num, int *result)
{
    int acc = 0;
    for (int idx = 0; idx < data_size; ++idx)
        acc += num[idx] * num[idx];
    *result = acc;
}








// Version 1: a single GPU thread computes the whole sum (baseline).
// Times the kernel with CUDA events, then validates against a CPU sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int)) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sumOfSquares<<<1, 1, 0>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum;
    if (cudaMemcpy(&sum, result, sizeof(int),
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);
    cout << "sum(gpu):" << sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    sum = 0;
    for (int i = 0; i < data_size; i++)
        sum += data[i] * data[i];
    cout << "sum(cpu):" << sum << endl;

    return 0;
}


仅用了一个线程的情况。

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Threads per block (single-block launch in this version).
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];








// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// One block of thread_num threads: thread tid serially sums the squares
// of its own contiguous chunk of data_size/thread_num elements and writes
// the partial sum to result[tid]. Host folds the partials afterwards.
// (Contiguous-chunk-per-thread access is NOT coalesced — see next version.)
__global__ static void sumOfSquares(int *num, int *result)
{
    const int tid = threadIdx.x;
    const int chunk = data_size / thread_num;
    const int begin = tid * chunk;
    const int end = begin + chunk;

    int acc = 0;
    for (int idx = begin; idx < end; ++idx)
        acc += num[idx] * num[idx];
    result[tid] = acc;
}








// Version 2: a single block of thread_num threads; each thread sums a
// contiguous chunk and the host folds the thread_num partial sums.
// Prints the GPU result + kernel time, then a CPU reference sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int) * thread_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sumOfSquares<<<1, thread_num, 0>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[thread_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * thread_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < thread_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}


这是用了1*256个线程

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Threads per block (single-block launch in this version).
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];








// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// One block of thread_num threads using an interleaved (stride thread_num)
// access pattern: consecutive threads read consecutive elements, so the
// global-memory loads of a warp coalesce — unlike the contiguous-chunk
// version. Each thread writes its partial sum to result[tid].
__global__ static void sumOfSquares(int *num, int *result)
{
    const int tid = threadIdx.x;
    // (Removed unused local `size` from the original.)
    int sum = 0;
    // Single-block launch, so the stride over the data is thread_num.
    for (int i = tid; i < data_size; i += thread_num)
    {
        sum += num[i] * num[i];
    }
    result[tid] = sum;
}








// Version 3: same launch shape as version 2 (1 block x thread_num threads)
// but the kernel uses interleaved, coalesced accesses. Host folds the
// thread_num partial sums and prints GPU result + time, then a CPU sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int) * thread_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sumOfSquares<<<1, thread_num, 0>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[thread_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * thread_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < thread_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}


使用了1*256个线程,注意kernel函数的区别,主要是线程的使用方法。


下面的版本将使用 32*256 个线程(32 个块,每块 256 个线程)。

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++



#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;


// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Number of thread blocks in the launch grid.
const int block_num=32;
// Threads per block.
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];




// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}


// block_num blocks x thread_num threads. Each thread strides over the
// data with step block_num*thread_num (consecutive threads touch
// consecutive elements — coalesced) and writes its partial sum to
// result[bid*thread_num + tid]; the host folds all partials.
__global__ static void sumOfSquares(int *num, int *result)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    // (Removed unused local `size` from the original.)
    int sum = 0;
    for (int i = bid * thread_num + tid; i < data_size;
         i += block_num * thread_num)
    {
        sum += num[i] * num[i];
    }
    result[bid * thread_num + tid] = sum;
}




// Version 4: block_num x thread_num threads; kernel writes one partial
// sum per thread, host folds block_num*thread_num partials.
// Prints GPU result + kernel time, then a CPU reference sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result,
                   sizeof(int) * thread_num * block_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    sumOfSquares<<<block_num, thread_num, 0>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[thread_num * block_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * thread_num * block_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < thread_num * block_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}


使用了32*256个线程。这里相当于93.08GB/s的带宽了


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Number of thread blocks in the launch grid.
const int block_num=32;
// Threads per block.
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];








// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// Shared-memory version: per-thread partials live in dynamic shared
// memory (thread_num ints, size passed as the 3rd launch parameter) and
// thread 0 folds them into one sum per block, written to result[bid].
// The in-block fold is serial here; later versions parallelize it.
__global__ static void sumOfSquares(int *num, int *result)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // Phase 1: interleaved, coalesced accumulation of this thread's partial.
    int acc = 0;
    for (int i = bid * thread_num + tid; i < data_size;
         i += block_num * thread_num)
        acc += num[i] * num[i];
    shared[tid] = acc;
    __syncthreads();  // every partial must be visible before tid 0 reads them

    // Phase 2: thread 0 serially folds the block's partial sums.
    if (tid == 0)
    {
        int total = shared[0];
        for (int j = 1; j < thread_num; ++j)
            total += shared[j];
        result[bid] = total;
    }
}








// Version 5: shared-memory kernel; one partial sum per BLOCK, so the
// result buffer and launch carry block_num entries and thread_num*sizeof(int)
// bytes of dynamic shared memory. Prints GPU result + time, then CPU sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int) * block_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // 3rd launch argument = dynamic shared memory bytes per block.
    sumOfSquares<<<block_num, thread_num,
                   thread_num * sizeof(int)>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[block_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * block_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < block_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}



这里用到了共享内存,但是,在gpu里面的操作多了些,所以效率会打折扣,并且每个块都是用一个线程来累加,没有用到并行的方法,下面的方法将会用到并行的累加一个块里面的数据之和。


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++




#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Number of thread blocks in the launch grid.
const int block_num=32;
// Threads per block.
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];








// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// Tree (pairwise) reduction version: after each thread stores its partial
// sum in shared memory, the partials are folded in log2(thread_num) rounds.
// In round k only threads whose low k+1 index bits are zero stay active and
// add in the partner element `offset` away. This interleaved pairing causes
// shared-memory bank conflicts; the next version uses stride-halving instead.
// Assumes thread_num is a power of two.
__global__ static void sumOfSquares(int *num,int* result)
{
extern __shared__ int shared[];
const int tid=threadIdx.x;
const int bid=blockIdx.x;
int i;
int offset=1,mask=1;
shared[tid]=0;
for(i=bid*thread_num+tid;i<data_size;
i+=block_num*thread_num)
{
shared[tid]+=num[i]*num[i];
}
__syncthreads();


while(offset<thread_num)
{
if((tid&mask)==0)//active only when the low bits are all zero, i.e. tid is a multiple of 2*offset this round
{
shared[tid]+=shared[tid+offset];
}
offset+=offset;
mask=offset+mask;//mask grows 0b0001 -> 0b0011 -> 0b0111 -> ...
__syncthreads();
}


if(tid==0)
{
result[bid]=shared[0];
}
}








// Version 6: tree-reduction kernel; still one partial sum per BLOCK, so
// the host folds block_num entries. Launch passes thread_num*sizeof(int)
// bytes of dynamic shared memory. Prints GPU result + time, then CPU sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int) * block_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // 3rd launch argument = dynamic shared memory bytes per block.
    sumOfSquares<<<block_num, thread_num,
                   thread_num * sizeof(int)>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[block_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * block_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < block_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}



在这里while循环采用树状加法,但是会有share memory的bank conflict的问题,所以可以通过移位来实现。



++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;




// Problem size: 2^20 (1M) integers.
const int data_size=1048576;
// Number of thread blocks in the launch grid.
const int block_num=32;
// Threads per block.
const int thread_num=256;
// Host-side input array, filled with random digits by GenerateNumbers().
int data[data_size];








// Fill number[0..size-1] with pseudo-random decimal digits (0..9)
// drawn from the C library rand() stream.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; ++idx)
        number[idx] = rand() % 10;
}




// Stride-halving (sequential addressing) tree reduction: in each round the
// active lower half of the threads adds in the upper half, which avoids the
// shared-memory bank conflicts of the interleaved mask version.
// One sum per block is written to result[bid]. Assumes thread_num is a
// power of two and thread_num*sizeof(int) bytes of dynamic shared memory.
__global__ static void sumOfSquares(int *num, int *result)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // BUG FIX: the original never initialized shared[tid] before the `+=`
    // loop below — shared memory is uninitialized, so the sums were garbage.
    shared[tid] = 0;
    for (int i = bid * thread_num + tid; i < data_size;
         i += block_num * thread_num)
    {
        shared[tid] += num[i] * num[i];
    }
    __syncthreads();

    for (int offset = thread_num / 2; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            shared[tid] += shared[tid + offset];
        }
        // Barrier outside the divergent if — every thread reaches it.
        __syncthreads();
    }

    if (tid == 0)
    {
        result[bid] = shared[0];
    }
}








// Version 7: stride-halving tree-reduction kernel; one partial sum per
// BLOCK, host folds block_num entries. Launch passes thread_num*sizeof(int)
// bytes of dynamic shared memory. Prints GPU result + time, then CPU sum.
int main(void)
{
    GenerateNumbers(data, data_size);

    int *gpudata, *result;
    // Check CUDA calls: failures otherwise go unnoticed and the program
    // would print garbage.
    if (cudaMalloc((void **)&gpudata, sizeof(int) * data_size) != cudaSuccess ||
        cudaMalloc((void **)&result, sizeof(int) * block_num) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(gpudata, data, sizeof(int) * data_size,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy (host->device) failed" << endl;
        return 1;
    }

    cudaEvent_t start, stop;
    float time_used;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // 3rd launch argument = dynamic shared memory bytes per block.
    sumOfSquares<<<block_num, thread_num,
                   thread_num * sizeof(int)>>>(gpudata, result);
    // Kernel launches fail silently; pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time_used, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    int sum[block_num];
    // Pass the array itself (decays to int*), not &sum (type int(*)[N]).
    if (cudaMemcpy(sum, result, sizeof(int) * block_num,
                   cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy (device->host) failed" << endl;
        return 1;
    }
    cudaFree(gpudata);
    cudaFree(result);

    int final_sum = 0;
    for (int i = 0; i < block_num; i++)
        final_sum += sum[i];
    cout << "sum(gpu):" << final_sum << "  time:" << time_used << endl;

    // CPU reference for validating the GPU result.
    final_sum = 0;
    for (int i = 0; i < data_size; i++)
        final_sum += data[i] * data[i];
    cout << "sum(cpu):" << final_sum << endl;

    return 0;
}



这里通过移位来实现了,但是可能因为移位操作也耗时,所以时间并没有减少,不过没有产生bank conflict。同时,如果将整个循环完全展开,即去掉while循环,将会发现效率有所提高,说明循环本身会降低效率;GPU的特点是并行计算,因此更适合把这类规约循环展开。







0 0