CUDA中模板类和模板函数

来源：互联网 | 发布：js 页面刷新 | 编辑：程序博客网 | 时间：2024/05/16 00:56

以向量加法为例，示例包含三个文件：kernel.h、kernel.cu 和 test.cpp。

kernel.h:

#ifndef KERNEL_H_
#define KERNEL_H_

/*
 * Public interface of the vector-addition demo.
 *
 * The include guard deliberately avoids a leading double underscore:
 * such identifiers are reserved for the implementation.
 *
 * The linkage specification is wrapped in __cplusplus checks so that the
 * header can be included from both C and C++ translation units.
 */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * Runs the demo defined in kernel.cu: adds two fixed 5-element double
 * vectors on the GPU and prints the result. Takes no arguments and returns
 * nothing; failures are reported on stderr by the implementation.
 */
void runtest(void);

#ifdef __cplusplus
}
#endif

#endif /* KERNEL_H_ */

kernel.cu:

#include "kernel.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Host-side wrapper that performs an element-wise vector addition on the GPU
// for an arbitrary element type T (T must support operator+ in device code).
template <class T>
class operate {
public:
    // Computes c[i] = a[i] + b[i] for every i in [0, size).
    //   c    - host output buffer with room for `size` elements
    //   a, b - host input buffers holding `size` elements each
    //   size - number of elements; 0 is accepted and is a no-op
    // Returns cudaSuccess, or the first CUDA error encountered (a message is
    // also written to stderr).
    cudaError_t addWithCuda(T *c, const T *a, const T *b, unsigned int size);
};

// Single-block addition kernel, kept with its original signature for
// compatibility. It indexes by threadIdx.x only and has no bounds check, so
// it is only valid when launched as <<<1, n>>> with exactly one thread per
// element. addWithCuda no longer uses it; see addKernelChecked below.
template <class T>
void __global__ addKernel1(T *c, const T *a, const T *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Bounds-checked addition kernel for a 1D grid of 1D blocks.
// Each thread handles at most one element; threads whose global index is
// >= n do nothing, so the grid may be larger than the data.
// Uses no shared memory and has no alignment requirements beyond those of T.
template <class T>
void __global__ addKernelChecked(T *c, const T *a, const T *b, unsigned int n)
{
    // Widen before multiplying so the global index cannot overflow.
    size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

template <class T>
cudaError_t operate<T>::addWithCuda(T *c, const T *a, const T *b, unsigned int size)
{
    // All locals are declared before the first `goto Error` so that no jump
    // bypasses an initialization.
    T *dev_a = 0;
    T *dev_b = 0;
    T *dev_c = 0;
    cudaError_t cudaStatus;

    const size_t bytes = (size_t)size * sizeof(T);

    // Launch configuration: fixed block size, enough blocks to cover `size`.
    // The ceil-division is written as div + remainder test so it cannot
    // overflow for sizes close to UINT_MAX.
    const unsigned int threadsPerBlock = 256;
    const unsigned int blocks =
        size / threadsPerBlock + ((size % threadsPerBlock != 0) ? 1u : 0u);

    // Nothing to add: a launch with zero threads would be rejected as an
    // invalid configuration, so treat empty input as a successful no-op.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch the bounds-checked kernel over as many blocks as needed. The
    // previous <<<1, size>>> launch failed as soon as `size` exceeded the
    // device's per-block thread limit.
    addKernelChecked<T><<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, size);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for both the success and the failure path; pointers
    // that were never allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

// Demo entry point exported with C linkage (declared in kernel.h).
// Adds two fixed 5-element double vectors on the GPU, prints the result to
// stdout, then resets the device. On failure it prints a message to stderr
// and returns early.
extern "C" void runtest()
{
    const int arraySize = 5;
    const double a_d[arraySize] = { 1.1, 2.2, 3.3, 4.4, 5.5 };
    const double b_d[arraySize] = { 10.1, 20.1, 30.1, 40.1, 50.1 };
    double c_d[arraySize] = { 0 };

    // Add vectors in parallel.
    operate<double> op;
    cudaError_t cudaStatus = op.addWithCuda(c_d, a_d, b_d, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return;
    }

    printf("{1.1,2.2,3.3,4.4,5.5} + {10.1,20.1,30.1,40.1,50.1} = {%f,%f,%f,%f,%f}\n",
        c_d[0], c_d[1], c_d[2], c_d[3], c_d[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return;
    }
}

test.cpp:

#include "kernel.h"
#include <stdio.h>

// Program entry point: runs the GPU vector-addition demo, then blocks until
// a character is available on stdin before exiting with status 0.
int main()
{
    runtest();

    // Presumably here to keep the console window open until a key is
    // pressed. The character read is not needed, so the result is discarded
    // instead of being stored in an unused (and truncating) `char` local.
    (void)getchar();

    return 0;
}

0 0