cuda的优化技术
来源:互联网 发布:绝对领域 知乎 编辑:程序博客网 时间:2024/04/20 01:43
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
int data[data_size];
// Fill number[0..size-1] with pseudo-random digits in [0, 10).
// Uses rand() without seeding, so the sequence repeats across runs.
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx != size; ++idx)
        number[idx] = rand() % 10;
}
// Serial GPU baseline: a single thread walks the whole array, accumulates
// the sum of squares in a register, and stores the scalar into *result.
// Launch as <<<1,1>>>; no parallelism is exploited (this is the slow reference).
__global__ static void sumOfSquares(int *num, int *result)
{
    int acc = 0;
    for (int idx = 0; idx < data_size; ++idx) {
        const int v = num[idx];
        acc += v * v;
    }
    *result = acc;
}
// Benchmark harness for the single-thread kernel: generate data, copy it to
// the device, time the kernel with CUDA events, and validate against a CPU sum.
// NOTE(review): no CUDA error checking anywhere in this demo -- failures are silent.
int main(void)
{
// Fill the host array with random digits 0-9.
GenerateNumbers(data,data_size);
// Device buffers: the input array and a single-int result slot.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,sizeof(int));
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// Time the kernel with CUDA events (elapsed time is in milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// One block, one thread, no dynamic shared memory: fully serial baseline.
sumOfSquares<<<1,1,0>>>(gpudata,result);
cudaEventRecord(stop,0);
// Block the host until the stop event (and hence the kernel) has completed.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum;
// Blocking copy of the scalar result back to the host.
cudaMemcpy(&sum,result,sizeof(int),
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
cout<<"sum(gpu):"<<sum<<" time:"<<time_used<<endl;
// CPU reference computation to validate the GPU result.
sum=0;
for(int i=0;i<data_size;i++)
{
sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<sum<<endl;
return 0;
}
仅用了一个线程的情况。
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int thread_num=256;
int data[data_size];
// Populate the first `size` entries of `number` with random digits (0..9).
// Not seeded (no srand call), so output is deterministic run to run.
void GenerateNumbers(int *number, int size)
{
    for (int k = 0; k < size; ++k) {
        number[k] = rand() % 10;
    }
}
// Each of the thread_num (256) threads sums a contiguous chunk of
// data_size/thread_num elements and writes its partial sum to result[tid];
// the host adds the partials. Launch as <<<1,thread_num>>>.
// NOTE(review): adjacent threads read addresses data_size/thread_num apart,
// so a warp's loads are scattered (uncoalesced) -- this is the slow layout
// the article contrasts with the interleaved version that follows.
__global__ static void sumOfSquares(int *num,int* result)
{
const int tid=threadIdx.x;
// Elements per thread; assumes thread_num divides data_size exactly.
const int size=data_size/thread_num;
int sum=0;
int i;
// Thread tid covers the half-open range [tid*size, (tid+1)*size).
for(i=tid*size;i<(tid+1)*size;i++)
{
sum+=num[i]*num[i];
}
result[tid]=sum;
}
// Harness for the 1-block / 256-thread version: the kernel leaves one partial
// sum per thread in result[], which the host reduces. No CUDA error checking.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one result slot per thread.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,sizeof(int)*thread_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// Single block of thread_num threads, no dynamic shared memory.
sumOfSquares<<<1,thread_num,0>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for kernel completion before reading the elapsed time.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[thread_num];
cudaMemcpy(&sum,result,sizeof(int)*thread_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction of the per-thread partial sums.
int final_sum=0;
for(int i=0;i<thread_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation to validate the GPU result.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
这是用了1*256个线程
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int thread_num=256;
int data[data_size];
// Write `size` random digits (0-9, via rand()) into the given buffer.
void GenerateNumbers(int *number, int size)
{
    int i = 0;
    while (i < size) {
        number[i] = rand() % 10;
        ++i;
    }
}
// Interleaved (stride = thread_num) version: thread tid handles elements
// tid, tid+256, tid+512, ... so threads in a warp touch consecutive
// addresses on every iteration and global loads coalesce.
// Writes the per-thread partial sum to result[tid]; host reduces them.
// Launch as <<<1,thread_num>>>.
// Fix: removed the unused local `size` left over from the chunked variant.
__global__ static void sumOfSquares(int *num, int *result)
{
    const int tid = threadIdx.x;
    int sum = 0;
    for (int i = tid; i < data_size; i += thread_num)
    {
        sum += num[i] * num[i];
    }
    result[tid] = sum;
}
// Harness for the interleaved 1x256 version; identical to the previous
// harness -- only the kernel's access pattern differs. No CUDA error checks.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one partial-sum slot per thread.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,sizeof(int)*thread_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// One block of thread_num threads, no dynamic shared memory.
sumOfSquares<<<1,thread_num,0>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for the kernel to finish before reading the timer.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[thread_num];
cudaMemcpy(&sum,result,sizeof(int)*thread_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction of per-thread partials.
int final_sum=0;
for(int i=0;i<thread_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation for validation.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
使用了1*256个线程,注意kernel函数的区别,主要是线程的使用方法。
(将thread_num改为1024即可测试1*1024个线程的情况,kernel代码不变。)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int block_num=32;
const int thread_num=256;
int data[data_size];
// Fill the destination buffer with `size` pseudo-random digits, each in 0..9.
void GenerateNumbers(int *number, int size)
{
    for (int *p = number, *end = number + size; p != end; ++p)
        *p = rand() % 10;
}
// Multi-block grid-stride version: block_num x thread_num threads each sum
// elements starting at their flat global index, striding by the total thread
// count, and write one partial sum to result[bid*thread_num + tid].
// Launch as <<<block_num,thread_num>>>; host reduces the partials.
// Fix: removed the unused local `size` left over from the chunked variant.
__global__ static void sumOfSquares(int *num, int *result)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int sum = 0;
    // Flat global index, strided by the total number of launched threads.
    for (int i = bid * thread_num + tid; i < data_size;
         i += block_num * thread_num)
    {
        sum += num[i] * num[i];
    }
    result[bid * thread_num + tid] = sum;
}
// Harness for the 32x256-thread version: one partial sum per launched thread
// comes back to the host, which reduces all block_num*thread_num of them.
// No CUDA error checking.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one result slot per launched thread.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,
sizeof(int)*thread_num*block_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// block_num blocks of thread_num threads, no dynamic shared memory.
sumOfSquares<<<block_num,thread_num,0>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for the kernel to finish before reading the timer.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[thread_num*block_num];
cudaMemcpy(&sum,result,sizeof(int)*thread_num*block_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction over all per-thread partial sums.
int final_sum=0;
for(int i=0;i<thread_num*block_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation for validation.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
使用了32*256个线程。这里相当于93.08GB/s的带宽了
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int block_num=32;
const int thread_num=256;
int data[data_size];
// Load `size` random single-digit values (0-9) into `number`.
void GenerateNumbers(int *number, int size)
{
    for (int pos = 0; pos < size; pos++) {
        const int digit = rand() % 10;
        number[pos] = digit;
    }
}
// Shared-memory version: each thread accumulates its grid-strided partial sum
// into shared[tid]; after a barrier, thread 0 serially adds all thread_num
// partials and writes one sum per block to result[bid].
// Launch as <<<block_num,thread_num,thread_num*sizeof(int)>>>.
// The serial tid==0 loop wastes the block's parallelism -- the tree-reduction
// versions later in the article address this.
__global__ static void sumOfSquares(int *num,int* result)
{
// Dynamic shared memory, sized to thread_num ints by the launch config.
extern __shared__ int shared[];
const int tid=threadIdx.x;
const int bid=blockIdx.x;
int i;
// Shared memory is uninitialized on entry; zero this thread's slot first.
shared[tid]=0;
for(i=bid*thread_num+tid;i<data_size;
i+=block_num*thread_num)
{
shared[tid]+=num[i]*num[i];
}
// All partial sums must be written before thread 0 reads them.
__syncthreads();
if(tid==0)
{
// Serial reduction by a single thread (intentionally naive here).
for(int i=1;i<thread_num;i++)
{
shared[0]+=shared[i];
}
result[bid]=shared[0];
}
}
// Harness for the shared-memory version: only one partial sum per BLOCK comes
// back, so the host reduces block_num values. The third launch parameter
// allocates thread_num ints of dynamic shared memory. No CUDA error checking.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one result slot per block.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,
sizeof(int)*block_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// Third launch argument = dynamic shared memory bytes per block.
sumOfSquares<<<block_num,thread_num,
thread_num*sizeof(int)>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for the kernel to finish before reading the timer.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[block_num];
cudaMemcpy(&sum,result,sizeof(int)*block_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction of the per-block sums.
int final_sum=0;
for(int i=0;i<block_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation for validation.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
这里用到了共享内存,但是,在gpu里面的操作多了些,所以效率会打折扣,并且每个块都是用一个线程来累加,没有用到并行的方法,下面的方法将会用到并行的累加一个块里面的数据之和。
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int block_num=32;
const int thread_num=256;
int data[data_size];
// Generate `size` random digits in [0, 10) into the output array.
void GenerateNumbers(int *number, int size)
{
    for (int n = 0; n < size; ++n)
    {
        number[n] = rand() % 10;
    }
}
// Shared-memory version with a parallel tree reduction. Threads whose index
// has its low bits clear add in their neighbor at distance `offset`, with the
// offset doubling each round (1, 2, 4, ...). This interleaved addressing
// pattern causes shared-memory bank conflicts; the next version fixes that
// with a halving offset.
// Launch as <<<block_num,thread_num,thread_num*sizeof(int)>>>.
__global__ static void sumOfSquares(int *num,int* result)
{
// Dynamic shared memory, thread_num ints, sized by the launch config.
extern __shared__ int shared[];
const int tid=threadIdx.x;
const int bid=blockIdx.x;
int i;
int offset=1,mask=1;
// Zero this thread's slot before accumulating (shared mem starts undefined).
shared[tid]=0;
for(i=bid*thread_num+tid;i<data_size;
i+=block_num*thread_num)
{
shared[tid]+=num[i]*num[i];
}
__syncthreads();
// Tree reduction: log2(thread_num) rounds, all threads take the same trip
// count so the in-loop __syncthreads() is safe.
while(offset<thread_num)
{
if((tid&mask)==0)// only threads whose low `round` bits are all zero add
{
shared[tid]+=shared[tid+offset];
}
offset+=offset;
mask=offset+mask;// mask grows 0b001 -> 0b011 -> 0b111 ...
__syncthreads();
}
if(tid==0)
{
result[bid]=shared[0];
}
}
// Harness for the tree-reduction version: identical to the previous harness
// (one result per block, dynamic shared memory in the launch config).
// No CUDA error checking.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one result slot per block.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,
sizeof(int)*block_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// Third launch argument = dynamic shared memory bytes per block.
sumOfSquares<<<block_num,thread_num,
thread_num*sizeof(int)>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for the kernel to finish before reading the timer.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[block_num];
cudaMemcpy(&sum,result,sizeof(int)*block_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction of the per-block sums.
int final_sum=0;
for(int i=0;i<block_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation for validation.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
在这里while循环采用树状加法,但是会有share memory的bank conflict的问题,所以可以通过移位来实现。
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<time.h>
using namespace std;
const int data_size=1048576;
const int block_num=32;
const int thread_num=256;
int data[data_size];
// Fill number[0..size-1] with random values drawn from {0,...,9}.
void GenerateNumbers(int *number, int size)
{
    for (int j = 0; j < size; j += 1)
        number[j] = rand() % 10;
}
// Tree reduction with a halving offset: thread tid adds shared[tid+offset]
// for offset = thread_num/2, /4, ..., 1, keeping active threads contiguous
// and avoiding shared-memory bank conflicts (cf. the mask-based variant).
// Launch as <<<block_num,thread_num,thread_num*sizeof(int)>>>; writes one
// partial sum per block to result[bid].
__global__ static void sumOfSquares(int *num, int *result)
{
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int i;
    // BUG FIX: shared memory is uninitialized on kernel entry; every other
    // shared-memory variant zeroes its slot first. Without this line the
    // += accumulation below folds garbage into the result.
    shared[tid] = 0;
    for (i = bid * thread_num + tid; i < data_size;
         i += block_num * thread_num)
    {
        shared[tid] += num[i] * num[i];
    }
    __syncthreads();
    // All threads run the same number of rounds, so the in-loop
    // __syncthreads() is reached uniformly (no divergent barrier).
    int offset = thread_num / 2;
    while (offset > 0)
    {
        if (tid < offset)
        {
            shared[tid] += shared[tid + offset];
        }
        offset >>= 1;
        __syncthreads();
    }
    if (tid == 0)
    {
        result[bid] = shared[0];
    }
}
// Harness for the bank-conflict-free reduction: same shape as the previous
// two harnesses -- one sum per block plus a host-side final reduction.
// No CUDA error checking.
int main(void)
{
GenerateNumbers(data,data_size);
// Device buffers: input array plus one result slot per block.
int* gpudata,*result;
cudaMalloc((void**)&gpudata,sizeof(int)*data_size);
cudaMalloc((void**)&result,
sizeof(int)*block_num);
cudaMemcpy(gpudata,data,sizeof(int)*data_size,
cudaMemcpyHostToDevice);
// CUDA event timing (milliseconds).
cudaEvent_t start,stop;
float time_used;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
// Third launch argument = dynamic shared memory bytes per block.
sumOfSquares<<<block_num,thread_num,
thread_num*sizeof(int)>>>(gpudata,result);
cudaEventRecord(stop,0);
// Wait for the kernel to finish before reading the timer.
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time_used,start,stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
int sum[block_num];
cudaMemcpy(&sum,result,sizeof(int)*block_num,
cudaMemcpyDeviceToHost);
cudaFree(gpudata);
cudaFree(result);
// Host-side reduction of the per-block sums.
int final_sum=0;
for(int i=0;i<block_num;i++)
final_sum+=sum[i];
cout<<"sum(gpu):"<<final_sum<<" time:"<<time_used<<endl;
// CPU reference computation for validation.
final_sum=0;
for(int i=0;i<data_size;i++)
{
final_sum+=data[i]*data[i];
}
cout<<"sum(cpu):"<<final_sum<<endl;
return 0;
}
这里通过移位来实现了,但是可能因为移位操作也耗时,所以时间并没有减少,但是没有产生bank conflict。同时,如果将整个循环完全展开,即去掉while循环,将会发现效率有所提高,说明循环控制本身有开销;GPU的特点是并行计算,多余的循环控制将会降低效率。
- cuda的优化技术
- cuda的计时技术
- CUDA程序优化的记录
- CUDA 优化的一些措施
- CUDA优化
- cuda优化
- CUDA优化
- cuda优化
- Cuda 学习教程(四):Cuda程序的优化
- Tesla架构下的CUDA程序优化
- 基于CUDA的GPU优化建议
- CUDA: 程序优化的15个策略
- 基于CUDA的GPU优化建议
- CUDA: 程序优化的15个策略
- CUDA: 程序优化的15个策略
- 基于CUDA的GPU优化建议
- 基于CUDA的GPU优化建议
- 基于CUDA的GPU优化建议
- A*算法源码
- 使用hadoop存储图片服务器
- Kernel中GPIO的Sysfs接口的使用
- 数据结构---图的连通性
- Android反调试之 AntiEmulator 检测安卓模拟器
- cuda的优化技术
- POJ 2823 Sliding Window(单调队列)
- NYOJ--21(搜索)-题目-----------------------------三个水杯
- 数据结构--最小生成树(Prim算法)
- 我的maven项目常用配置
- 【学习心得】linux下多客户端批量操作脚本(含expect交互)
- 数据结构---图(邻接表)
- 显示缓存内容
- enum和int的相互转换