CUDA之PTX优化原理
来源:互联网 发布:自学黑帽seo 编辑:程序博客网 时间:2024/05/21 09:59
摘要
本文主要讲述CUDA中PTX的实现原理,并结合示例代码进行分析。
1. 不作优化的代码实现
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>

/**
 * Baseline kernel (no inline PTX).
 *
 * Each thread computes its flat 1-D global index and, if that index is in
 * range, stores (index % 32) into d_ptr[index]. The same store is repeated
 * 100000 times purely to give the kernel a measurable run time.
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
 *
 * @param d_ptr   device buffer with room for at least `length` ints
 * @param length  number of valid elements in d_ptr
 */
__global__ void gpu(int *d_ptr, int length)
{
    int elemID = blockIdx.x * blockDim.x + threadIdx.x;

    for (int innerloops = 0; innerloops < 100000; innerloops++)
    {
        // Bounds guard: the grid is rounded up, so the last block may
        // contain threads past the end of the buffer.
        if (elemID < length)
        {
            d_ptr[elemID] = elemID % 32;
        }
    }
}

/**
 * CPU reference: fills h_ptr[i] with i % 32 for every i in [0, length).
 *
 * @param h_ptr   host buffer with room for at least `length` ints
 * @param length  number of elements to fill
 */
void valid(int *h_ptr, int length)
{
    for (int elemID = 0; elemID < length; elemID++)
    {
        h_ptr[elemID] = elemID % 32;
    }
}

/**
 * Runs the baseline kernel once, prints the elapsed GPU time measured with
 * CUDA events, and checks the device result against the CPU reference.
 *
 * @return EXIT_SUCCESS when device and host results match, else EXIT_FAILURE
 */
int main(int argc, char **argv)
{
    const int N = 1000;

    int *d_ptr;
    checkCudaErrors(cudaMalloc(&d_ptr, N * sizeof(int)));

    int *h_ptr;
    checkCudaErrors(cudaMallocHost(&h_ptr, N * sizeof(int)));

    // Start timing.
    float time_elapsed = 0;
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start, 0));

    // GPU kernel without PTX.
    dim3 cudaBlockSize(256, 1, 1);
    dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
    gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Finish timing. Only `stop` has to be waited on: it is recorded after
    // `start` in the same stream, so its completion implies `start` completed.
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop));

    // The events are no longer needed once the elapsed time has been read.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    // Print.
    printf("Time Used on GPU:%f(ms)\n", time_elapsed);

    // CPU (reference results for validation).
    valid(h_ptr, N);

    int *h_d_ptr;
    checkCudaErrors(cudaMallocHost(&h_d_ptr, N * sizeof(int)));
    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Compare element by element, stopping at the first mismatch.
    bool bValid = true;
    for (int i = 0; i < N && bValid; i++)
    {
        if (h_ptr[i] != h_d_ptr[i])
        {
            bValid = false;
        }
    }

    printf("Test %s.\n", bValid ? "Successful" : "Failed");

    checkCudaErrors(cudaFree(d_ptr));
    checkCudaErrors(cudaFreeHost(h_ptr));
    checkCudaErrors(cudaFreeHost(h_d_ptr));

    return bValid ? EXIT_SUCCESS : EXIT_FAILURE;
}
2. PTX
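本节通过内联PTX指令读取特殊寄存器 %laneid(线程在所属warp内的编号,取值0~31),用它代替第1节中的 elemID % 32 计算。需要说明的是,%laneid 是PTX预定义的只读特殊寄存器,读取它并不经过L1缓存,也不属于内存预取;作者认为该写法在本案例中能带来性能提升,实际差异请以程序打印的计时结果为准。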
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>

/**
 * Kernel variant that obtains the per-thread value through inline PTX.
 *
 * Each thread computes its flat 1-D global index and, if that index is in
 * range, stores its lane id (read from the PTX special register %laneid)
 * into d_ptr[index]. The same store is repeated 100000 times purely to
 * give the kernel a measurable run time.
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used.
 * The host-side check compares the result with (index % 32), which matches
 * the lane id only when blockDim.x is a multiple of 32 and the block is 1-D
 * (main() launches 256x1x1 blocks).
 *
 * @param d_ptr   device buffer with room for at least `length` ints
 * @param length  number of valid elements in d_ptr
 */
__global__ void gpu_ptx(int *d_ptr, int length)
{
    int elemID = blockIdx.x * blockDim.x + threadIdx.x;

    for (int innerloops = 0; innerloops < 100000; innerloops++)
    {
        // Bounds guard: the grid is rounded up, so the last block may
        // contain threads past the end of the buffer.
        if (elemID < length)
        {
            unsigned int laneid;
            // Read the %laneid special register (this thread's position
            // inside its warp). "%%" escapes the '%' in the asm template.
            asm("mov.u32 %0, %%laneid;" : "=r"(laneid));
            d_ptr[elemID] = static_cast<int>(laneid);
        }
    }
}

/**
 * CPU reference: fills h_ptr[i] with i % 32 for every i in [0, length).
 *
 * @param h_ptr   host buffer with room for at least `length` ints
 * @param length  number of elements to fill
 */
void valid(int *h_ptr, int length)
{
    for (int elemID = 0; elemID < length; elemID++)
    {
        h_ptr[elemID] = elemID % 32;
    }
}

/**
 * Runs the inline-PTX kernel once, prints the elapsed GPU time measured
 * with CUDA events, and checks the device result against the CPU reference.
 *
 * @return EXIT_SUCCESS when device and host results match, else EXIT_FAILURE
 */
int main(int argc, char **argv)
{
    const int N = 1000;

    int *d_ptr;
    checkCudaErrors(cudaMalloc(&d_ptr, N * sizeof(int)));

    int *h_ptr;
    checkCudaErrors(cudaMallocHost(&h_ptr, N * sizeof(int)));

    // Start timing.
    float time_elapsed = 0;
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaEventRecord(start, 0));

    // GPU kernel using PTX.
    dim3 cudaBlockSize(256, 1, 1);
    dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);
    gpu_ptx<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Finish timing. Only `stop` has to be waited on: it is recorded after
    // `start` in the same stream, so its completion implies `start` completed.
    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&time_elapsed, start, stop));

    // The events are no longer needed once the elapsed time has been read.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    // Print.
    printf("Time Used on GPU:%f(ms)\n", time_elapsed);

    // CPU (reference results for validation).
    valid(h_ptr, N);

    int *h_d_ptr;
    checkCudaErrors(cudaMallocHost(&h_d_ptr, N * sizeof(int)));
    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Compare element by element, stopping at the first mismatch.
    bool bValid = true;
    for (int i = 0; i < N && bValid; i++)
    {
        if (h_ptr[i] != h_d_ptr[i])
        {
            bValid = false;
        }
    }

    printf("Test %s.\n", bValid ? "Successful" : "Failed");

    checkCudaErrors(cudaFree(d_ptr));
    checkCudaErrors(cudaFreeHost(h_ptr));
    checkCudaErrors(cudaFreeHost(h_d_ptr));

    return bValid ? EXIT_SUCCESS : EXIT_FAILURE;
}
3. 小结
带点态度地分享一下经验:能用普通寄存器变量和简单运算解决的问题,就没有必要改用复杂的内联PTX去操作!!!寄存器用完了,再去考虑用缓存预取。
0 0
- CUDA之PTX优化原理
- Run CUDA or PTX Code on GPU
- CUDA进阶第二篇:巧用PTX
- CUDA CUBIN/PTX文件动态加载
- CUDA PTX ISA阅读笔记(一)
- CUDA PTX ISA阅读笔记(二)
- CUDA进阶第二篇:巧用PTX
- CUDA优化之Reducing
- PTX ISA 之 comparisions 小结
- PTX ISA 之 cache operator
- CUDA学习之CUDA程序优化
- CUDA优化之去冗余
- CUDA之程序优化总结
- PTX ISA 之 volatile 的用法
- PTX ISA 之 同步指令 bar & membar
- PTX ISA 之 BFS 代码分析
- PTX ISA 之 Control Flow Instructions
- PTX ISA 之 Control Flow Instructions
- 27SharedSdk(第三方分享)
- SIP的REG,UNREG和INVITE
- matlab中的数组与矩阵
- samba设置用户自己修改密码
- Java基础学习笔记
- CUDA之PTX优化原理
- 输入一个5乘5的矩阵,输出从左上到右下的路径
- 开源地图 SharpMap
- cocos2d-x-3.0学习笔记之std::function、std::bind
- error: ‘rand’ was not declared in this scope
- 优秀程序员45种习惯
- UI常用工具类(不定时更新)
- Samba磁盘配额
- 潜意识、读懂行为、说服的艺术