CUDA内核函数的连续执行
来源:互联网 发布:电子网络 编辑:程序博客网 时间:2024/06/05 01:51
以下是两个连续的CUDA核函数衔接的一种思路:
要完成的功能:1. 向量的计算compute(暂时以两向量求和为例);2. 对结果向量求和SUM。
思路1:写1个计算的内核函数,中间结果保留,求和函数调用cublas。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include <stdio.h>
#include <stdlib.h>
#pragma comment(lib, "cublas.lib")

cudaError_t addWithCuda(const int *a, const int *b, unsigned int size, float& sum);

// Element-wise vector add: c[i] = a[i] + b[i].
// Expected launch: 1-D grid with at least n threads total; the bounds
// guard keeps the kernel correct for any launch configuration.
__global__ void addKernel(const int *a, const int *b, float *c, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    float sum = 0.0f;

    // Add vectors in parallel, then reduce the result with cuBLAS.
    cudaError_t cudaStatus = addWithCuda(a, b, arraySize, sum);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("sum: %f\n", sum);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Adds two int vectors on the GPU into a float result vector kept on the
// device, then chains a second step — cublasSasum — on that intermediate
// result without copying it back to the host.
//
// a, b : host input vectors of `size` ints
// size : element count
// sum  : out-param; on success receives sum(|c[i]|) (all c[i] >= 0 here)
// Returns the last cudaError_t; all device buffers are freed on every
// exit path (the original called exit(-1) on cuBLAS failure, leaking them).
cudaError_t addWithCuda(const int *a, const int *b, unsigned int size, float& sum)
{
    int *dev_a = 0;
    int *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;
    cublasHandle_t handle;
    cublasStatus_t stat;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_a, dev_b, dev_c, size);

    // Kernel launches return no status: launch-configuration errors surface
    // via cudaGetLastError() ...
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // ... and execution errors at the next synchronizing call. The sync also
    // guarantees dev_c is complete before cuBLAS reads it.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Second step of the chain: reduce dev_c with cuBLAS (library call
    // instead of a hand-written reduction kernel).
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf("CUBLAS initialization failed\n");
        cudaStatus = cudaErrorUnknown;  // report failure instead of leaking via exit()
        goto Error;
    }
    // Bug fix: the original discarded this call's status and re-tested the
    // stale value left over from cublasCreate, so a failed reduction was
    // silently accepted.
    stat = cublasSasum(handle, size, dev_c, 1, &sum);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        printf("data execution failed");
        cublasDestroy(handle);
        cudaStatus = cudaErrorUnknown;
        goto Error;
    }
    cublasDestroy(handle);

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
思路2:写两个内核函数,1个调用另外一个(待求证)。
// Device-side reduction helper: intended to accumulate the elements of c
// into *total. Left empty in the original sketch (idea marked "to be
// verified"); a real implementation would be e.g. a shared-memory tree
// reduction over blockDim.x elements.
__device__ void sumVector(const float *c, float *total)
{
}

// Idea 2: a single kernel computes the element-wise add and then calls a
// __device__ helper for the reduction.
// Fixes vs. the original sketch, which did not compile: the parameter
// `float *sum` shadowed the `__device__` function also named `sum`, so
// `sum(c, sum)` tried to call a pointer, and the statement lacked its ';'.
// The helper is renamed (internal name, not part of the launch interface).
__global__ void calculate(const int *a, const int *b, float *c, float *sum)
{
    //calculate....
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
    // Barrier: every thread's c[i] must be written before the reduction
    // reads the whole vector. (Only valid across a single block.)
    __syncthreads();
    sumVector(c, sum);
}
思路3:写两个内核函数,顺序执行,即自己动手实现sum函数。
// Idea 3: two kernels launched back-to-back from the host; the second-step
// reduction is hand-written instead of delegated to a library.

// Element-wise add: each thread writes the element at its own thread index.
// Expected launch: a single block with one thread per element.
__global__ void calculate(const int *a, const int *b, float *c)
{
    //calculate....
    const int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}

// Hand-rolled reduction kernel; body intentionally left empty by the
// original author as a placeholder for a custom sum over c into *sum.
__global__ void sum(const float *c, float *sum)
{
}
推荐思路1,因为尽量调用库函数(不自己手写),开发效率会提高。
0 0
- CUDA内核函数的连续执行
- CUDA内核运行时间的测量函数
- cuda的内核函数中可以使用的copy代码
- CUDA内核函数参数不定时出现的错误
- 对CUDA内核函数运行时间测量的方法
- 对CUDA内核函数运行时间测量的方法
- CUDA的执行模式
- cuda 的执行配置
- 内核中__init函数的执行顺序
- cuda内核函数单步调试
- CUDA:核函数未执行,错误检查
- GPU(CUDA)学习日记(十)------ Kernal 内核函数 参数的传递
- GPU(CUDA)学习日记(十)------ Kernal 内核函数 参数的传递
- Linux命令的连续执行
- linux内核设计的艺术-开始执行main函数
- cuda执行配置对性能的影响
- CUDA:异步并发执行的概念
- cuda sm执行block的数量
- Ubuntu14.04 emacs配置cscope
- [HDU1269]迷宫城堡(Tarjan求强连通分量)
- LintCode_1_A + B 问题
- leetcode 46,47. Permutations I/II 全排列问题 java
- C++强制转换之static_cast
- CUDA内核函数的连续执行
- Ternary Calculation(也不用栈,注意输入方式即可,直接解决)
- 【SSH进阶之路】Struts + Spring + Hibernate 进阶开端(一)
- 实习过程中linux相关开发学习总结(一)
- 内存池——第一章 几种常用的内存池技术
- git clone远程库错误
- Android使用ListView时item失效解决方案
- 新编日语第四册(修订版)第03课 应用文
- 线段树lazy标记1血题