Cuda 学习教程:Cuda 程序初始化
来源:互联网 发布:京东联盟和淘宝联盟 编辑:程序博客网 时间:2024/05/17 15:58
Cuda程序初始化
目前,CUDA 里没有显式的设备初始化函数(例如 InitDevice()),设备上下文只能在第一次调用 API 函数时加载并自动完成初始化,这会带来以下问题:
- 第一次调用 API 函数时需要自动初始化设备,因此该次调用耗时过长
- 无法分析第一个api函数的耗时
处理办法
- 在程序开头先调用一次 cudaFree(0) 作为初始化函数,之后的调用不会再次初始化,cudaMalloc() 也会很快;
- 初始化一次之后,程序中由 cudaMalloc() 分配的内存先不释放、继续复用,等所有程序运行结束后再统一释放。
TestCode
简单的测试demo,例如:
// @ Bytry Zhang
//
// Demo: force CUDA context creation up front with cudaFree(0) so that the
// one-time initialization cost is not attributed to the first "real" runtime
// call (here: cudaMalloc), then time the allocations separately.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>
#include <iostream>
#include <stdio.h>

using namespace std;

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Element-wise vector add: c[i] = a[i] + b[i].
//
// Launch layout: a single block, one thread per element. The index is taken
// from threadIdx.x only and there is no bounds check, so the caller must
// launch exactly as many threads as there are elements in a, b and c.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
//
// Parameters:
//   c    - host output buffer, receives `size` ints.
//   a, b - host input buffers, `size` ints each.
//   size - number of elements; also used as the thread count of the single
//          block that is launched, so it must be a valid block size for the
//          device (an invalid value is reported through cudaGetLastError()).
//
// Returns cudaSuccess on success, otherwise the status of the first CUDA call
// that failed. Device buffers are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;
    // Declared before the first `goto Error`: jumping over a declaration that
    // has an initializer is ill-formed in C++.
    clock_t t = 0;

    // cudaFree(0) frees nothing; it is used here as the first runtime call so
    // that context initialization happens before the timed section below.
    cudaStatus = cudaFree(0);
    //cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaFree(0) failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Start timing only after the context has been initialized.
    t = clock();

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // The timed region covers the three cudaMalloc calls only; the value is in
    // clock() ticks (divide by CLOCKS_PER_SEC for seconds).
    cout << "cudaMalloc time = " << clock() - t << endl;

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Cleanup runs on success and failure alike. The return values of these
    // cudaFree calls are intentionally not stored so that the status of the
    // first failing call (or cudaSuccess) is what the caller receives.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
阅读全文
0 0
- Cuda 学习教程:Cuda 程序初始化
- Cuda 学习教程(四):Cuda程序的优化
- CUDA学习教程
- Cuda 学习教程三:CUDA硬件架构
- CUDA学习之CUDA程序优化
- CUDA教程
- cuda教程
- CUDA学习--CUDA流
- [CUDA学习]CUDA安装
- cuda 程序
- Cuda 学习教程四:GPU和Cuda逻辑关系
- CUDA学习笔记(2) 第一个CUDA程序
- CUDA: CUDA程序优化步骤
- cuda学习
- CUDA学习
- cuda学习
- CUDA学习
- CUDA学习
- (7)Mysql自定义函数
- TreeSet集合排序两种实现方式Comparable和Comparator比较
- iOS应用中检测第三方app是否安装及跳转解决方案
- 第一次MVC开发5天后的知识点记录
- python操作mysql数据库
- Cuda 学习教程:Cuda 程序初始化
- pom.xml文件报错
- 预计华为Mate10比小米Mix2强大好几倍,一起来看看它“强大的”短信误删恢复功能吧
- 手机端页面自适应解决方案—rem布局
- array 方法
- ZeroMQ 原理及其优点
- k-means 算法
- 笨办法学 Python · 续 练习 9:`sed`
- ubuntu 14.04+opencv+opencv_contrib-3.1.0编译安装