CUDA之PTX优化原理

来源：互联网发布：自学黑帽seo 编辑：程序博客网时间：2024/05/21 09:59

摘要

本文主要讲述CUDA中的PTX的原理实现和分析。

1. 不作优化的代码实现

#include <stdio.h>#include <assert.h>#include <cuda_runtime.h>#include <helper_functions.h>#include <helper_cuda.h>__global__ void gpu(int *d_ptr, int length){    int elemID = blockIdx.x * blockDim.x + threadIdx.x;for(int innerloops = 0; innerloops < 100000; innerloops++){if (elemID < length){//unsigned int laneid;d_ptr[elemID] = elemID % 32;}}}void valid(int *h_ptr, int length){    for (int elemID=0; elemID<length; elemID++){        h_ptr[elemID] = elemID % 32;    }}int main(int argc, char **argv){        const int N = 1000;    int *d_ptr;    checkCudaErrors(cudaMalloc(&d_ptr, N * sizeof(int)));    int *h_ptr;    checkCudaErrors(cudaMallocHost(&h_ptr, N * sizeof(int)));//start timingfloat time_elapsed=0;cudaEvent_t start,stop;cudaEventCreate(&start);    cudaEventCreate(&stop);cudaEventRecord( start,0);// GPU kernel without PTXdim3 cudaBlockSize(256,1,1);    dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);gpu<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);    checkCudaErrors(cudaGetLastError());    checkCudaErrors(cudaDeviceSynchronize());// Finish timingcudaEventRecord(stop,0);    cudaEventSynchronize(start);    cudaEventSynchronize(stop);   cudaEventElapsedTime(&time_elapsed,start,stop);// Printprintf("Time Used on GPU:%f(ms)\n",time_elapsed);// CPU (results for validate)    valid(h_ptr, N);    int *h_d_ptr;    checkCudaErrors(cudaMallocHost(&h_d_ptr, N *sizeof(int)));    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost));    bool bValid = true;    for (int i=0; i<N && bValid; i++){        if (h_ptr[i] != h_d_ptr[i]){            bValid = false;        }    }    printf("Test %s.\n", bValid ? "Successful" : "Failed");    checkCudaErrors(cudaFree(d_ptr));    checkCudaErrors(cudaFreeHost(h_ptr));    checkCudaErrors(cudaFreeHost(h_d_ptr));    return bValid ? EXIT_SUCCESS: EXIT_FAILURE;}

2. PTX

PTX主要特点是预计算的索引值放到L1缓存，实现内存操作的预取优化，达到本案例中性能的大幅提升。

#include <stdio.h>#include <assert.h>#include <cuda_runtime.h>#include <helper_functions.h>#include <helper_cuda.h>__global__ void gpu_ptx(int *d_ptr, int length){    int elemID = blockIdx.x * blockDim.x + threadIdx.x;for(int innerloops = 0; innerloops < 100000; innerloops++){if (elemID < length){unsigned int laneid;asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); // 索引缓存d_ptr[elemID] = laneid;}}}void valid(int *h_ptr, int length){    for (int elemID=0; elemID<length; elemID++){        h_ptr[elemID] = elemID % 32;    }}int main(int argc, char **argv){        const int N = 1000;    int *d_ptr;    checkCudaErrors(cudaMalloc(&d_ptr, N * sizeof(int)));    int *h_ptr;    checkCudaErrors(cudaMallocHost(&h_ptr, N * sizeof(int)));//start timingfloat time_elapsed=0;cudaEvent_t start,stop;cudaEventCreate(&start);    cudaEventCreate(&stop);cudaEventRecord( start,0);// GPU kernel using PTX    dim3 cudaBlockSize(256,1,1);    dim3 cudaGridSize((N + cudaBlockSize.x - 1) / cudaBlockSize.x, 1, 1);    gpu_ptx<<<cudaGridSize, cudaBlockSize>>>(d_ptr, N);    checkCudaErrors(cudaGetLastError());    checkCudaErrors(cudaDeviceSynchronize());// Finish timingcudaEventRecord(stop,0);    cudaEventSynchronize(start);    cudaEventSynchronize(stop);   cudaEventElapsedTime(&time_elapsed,start,stop);// Printprintf("Time Used on GPU:%f(ms)\n",time_elapsed);// CPU (results for validate)    valid(h_ptr, N);    int *h_d_ptr;    checkCudaErrors(cudaMallocHost(&h_d_ptr, N *sizeof(int)));    checkCudaErrors(cudaMemcpy(h_d_ptr, d_ptr, N *sizeof(int), cudaMemcpyDeviceToHost));    bool bValid = true;    for (int i=0; i<N && bValid; i++){        if (h_ptr[i] != h_d_ptr[i]){            bValid = false;        }    }    printf("Test %s.\n", bValid ? "Successful" : "Failed");    checkCudaErrors(cudaFree(d_ptr));    checkCudaErrors(cudaFreeHost(h_ptr));    checkCudaErrors(cudaFreeHost(h_d_ptr));    return bValid ? EXIT_SUCCESS: EXIT_FAILURE;}

3. 小结

带点态度地分享一下经验：能用简单的寄存器解决的问题就不必要用复杂的PTX缓存去操作！！！register用完了再去考虑用缓存预取。

0 0