windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤
来源:互联网 发布:仓库数据采集器 软件 编辑:程序博客网 时间:2024/06/13 01:16
一般有两种方法可以在vs2013上添加运行cuda8.0程序:
一、直接新建一个基于CUDA8.0的项目:如下图所示,
点击确定后即可生成test_cuda项目;默认会自动生成一个kernel.cu文件;默认已经配置好Debug/Release, Win32/x64环境,直接编译运行,结果如下图所示:函数执行的是两个数组的加操作。移除kernel.cu文件,加入自己需要的cuda文件即可进行实际操作了,非常方便。
二、实际情况下,多是在已有的项目中添加一些cuda文件,用于加速,下面说下具体的操作步骤:
1、新建一个CUDA_Test x64控制台空工程;
2、新建CUDA_Test.cpp文件;
3、选中CUDA_Test项目,右键单击-->生成依赖项-->生成自定义,勾选CUDA8.0,点击确定,如下图所示:
4、完成第3步后,再次打开工程的属性配置,会多出两项,CUDA C/C++和CUDA Linker,如下图所示:
5、新建或添加几个已有的文件,包括common.hpp、simple.hpp、simple.cpp、simple.cu,各个文件内容如下:
common.hpp:
#ifndef FBC_CUDA_TEST_COMMON_HPP_#define FBC_CUDA_TEST_COMMON_HPP_#define PRINT_ERROR_INFO(info) { \fprintf(stderr, "Error: %s, file: %s, func: %s, line: %d\n", #info, __FILE__, __FUNCTION__, __LINE__); \return -1; }#endif // FBC_CUDA_TEST_COMMON_HPP_simple.hpp:
#ifndef FBC_CUDA_TEST_SIMPLE_HPP_#define FBC_CUDA_TEST_SIMPLE_HPP_// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simpleint test_vectorAdd();int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements);int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements);#endif // FBC_CUDA_TEST_SIMPLE_HPP_simple.cpp:
#include "simple.hpp"
#include <stdlib.h>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>
#include "common.hpp"

// =========================== vector add =============================
// Compares the CPU and GPU vector-add implementations on 50000 random floats.
// Returns 0 when both implementations succeed and every element pair differs
// by at most 1e-5; returns -1 when either implementation fails or an element
// mismatches (the index of the first mismatch is printed to stderr).
int test_vectorAdd()
{
	// Vector addition: C = A + B, implements element by element vector addition
	const int numElements{ 50000 };

	// std::vector owns the host buffers, so they are released on every exit
	// path, including the early `return -1` hidden inside PRINT_ERROR_INFO.
	std::vector<float> A(numElements);
	std::vector<float> B(numElements);
	std::vector<float> C1(numElements); // CPU result
	std::vector<float> C2(numElements); // GPU result

	// Initialize vector
	for (int i = 0; i < numElements; ++i) {
		A[i] = rand() / (float)RAND_MAX;
		B[i] = rand() / (float)RAND_MAX;
	}

	int ret = vectorAdd_cpu(A.data(), B.data(), C1.data(), numElements);
	if (ret != 0) PRINT_ERROR_INFO(vectorAdd_cpu);

	ret = vectorAdd_gpu(A.data(), B.data(), C2.data(), numElements);
	if (ret != 0) PRINT_ERROR_INFO(vectorAdd_gpu);

	// Host and device results are compared with a tolerance rather than for
	// bit-exact equality.
	for (int i = 0; i < numElements; ++i) {
		if (std::fabs(C1[i] - C2[i]) > 1e-5) {
			fprintf(stderr, "Result verification failed at element %d!\n", i);
			return -1;
		}
	}

	return 0;
}

// Host reference implementation: writes A[i] + B[i] into C[i] for every
// i in [0, numElements). Always returns 0.
int vectorAdd_cpu(const float *A, const float *B, float *C, int numElements)
{
	for (int i = 0; i < numElements; ++i) {
		C[i] = A[i] + B[i];
	}

	return 0;
}
simple.cu:
#include "simple.hpp"
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h> // For the CUDA runtime routines (prefixed with "cuda_")
#include <device_launch_parameters.h>

// reference: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v8.0\0_Simple

// =========================== vector add =============================
// Element-wise C[i] = A[i] + B[i] on device arrays.
// Launch layout: 1D grid of 1D blocks, one thread per element; the grid must
// cover at least numElements threads. Threads past the end do nothing.
// Uses no shared memory.
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
	// The index is kept unsigned: with numElements close to INT_MAX the last
	// block's indices exceed INT_MAX, and a signed index would wrap negative
	// and slip past the bounds check.
	const unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (numElements > 0 && i < static_cast<unsigned int>(numElements)) {
		C[i] = A[i] + B[i];
	}
}

// Prints "Failed to <action> (error code <reason>)!" to stderr when err is not
// cudaSuccess. Returns true when err is cudaSuccess.
static bool cudaStepSucceeded(cudaError_t err, const char *action)
{
	if (err == cudaSuccess) return true;

	fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
	return false;
}

// Adds two host vectors on the GPU: C[i] = A[i] + B[i] for i in [0, numElements).
//   A, B: host input arrays of numElements floats.
//   C:    host output array of numElements floats.
// Returns 0 on success (an empty vector is a successful no-op) and -1 when
// numElements is negative or any CUDA runtime call fails. Device buffers are
// released on every path.
int vectorAdd_gpu(const float *A, const float *B, float *C, int numElements)
{
	if (numElements < 0) {
		fprintf(stderr, "Invalid element count: %d\n", numElements);
		return -1;
	}
	// Nothing to add; also avoids launching a kernel with a 0-block grid,
	// which the runtime rejects as an invalid configuration.
	if (numElements == 0) return 0;

	const size_t length = static_cast<size_t>(numElements) * sizeof(float);
	// size_t is 64-bit on x64, so it must not be printed with %d. %llu plus a
	// cast is used instead of %zu because the VS2013 CRT does not support %zu.
	fprintf(stderr, "Length: %llu\n", static_cast<unsigned long long>(length));

	float* d_A{ nullptr };
	float* d_B{ nullptr };
	float* d_C{ nullptr };

	// Each step runs only if all previous ones succeeded (&& short-circuits).
	bool ok = cudaStepSucceeded(cudaMalloc(&d_A, length), "allocate device vector A")
		&& cudaStepSucceeded(cudaMalloc(&d_B, length), "allocate device vector B")
		&& cudaStepSucceeded(cudaMalloc(&d_C, length), "allocate device vector C")
		&& cudaStepSucceeded(cudaMemcpy(d_A, A, length, cudaMemcpyHostToDevice), "copy vector A from host to device")
		&& cudaStepSucceeded(cudaMemcpy(d_B, B, length, cudaMemcpyHostToDevice), "copy vector B from host to device");

	if (ok) {
		// Launch the Vector Add CUDA kernel
		const int threadsPerBlock = 256;
		// Ceil-division written so it cannot overflow int (numElements > 0 here).
		const int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
		fprintf(stderr, "CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
		vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

		// cudaGetLastError reports launch-configuration errors.
		// Copy the device result vector in device memory to the host result vector in host memory.
		ok = cudaStepSucceeded(cudaGetLastError(), "launch vectorAdd kernel")
			&& cudaStepSucceeded(cudaMemcpy(C, d_C, length, cudaMemcpyDeviceToHost), "copy vector C from device to host");
	}

	// Always release the device buffers, even after a failure above.
	// cudaFree on a null pointer performs no operation, so buffers that were
	// never allocated are safe to pass.
	const bool freedA = cudaStepSucceeded(cudaFree(d_A), "free device vector A");
	const bool freedB = cudaStepSucceeded(cudaFree(d_B), "free device vector B");
	const bool freedC = cudaStepSucceeded(cudaFree(d_C), "free device vector C");

	return (ok && freedA && freedB && freedC) ? 0 : -1;
}
CUDA_Test.cpp:
#include <iostream>#include "simple.hpp"int main(){int ret = test_vectorAdd();if (ret == 0) fprintf(stderr, "***** test success *****\n");else fprintf(stderr, "===== test fail =====\n");return 0;}6、调整属性配置项:
(1)、CUDA C/C++-->Common中的Target Machine Platform默认是32-bit(--machine 32),因为是x64,所以将其调整为64-bit(--machine 64);
(2)、添加附加库:链接器-->输入-->附加依赖项:cudart.lib;
(3)、消除nvcc warning: The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release:CUDA C/C++-->Device: Code Generation:由compute_20,sm_20修改为compute_30,sm_30; compute_35,sm_35; compute_37,sm_37; compute_50,sm_50; compute_52,sm_52; compute_60,sm_60
以上code是参考NVIDIA Corporation\CUDA Samples\v8.0\0_Simple中vectorAdd例子进行的改写,输出结果如下:
GitHub:https://github.com/fengbingchun/CUDA_Test
- windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤
- windows10+cuda8.0+cudnn5.1+vs2013下编译caffe
- windows10+cuda8.0+cudnn5.1+vs2013下编译caffe
- Windows10+VS2013+caffe+Python2.7+CUDA8.0 部署配置
- 如何在VS2013工程中添加head和source文件
- windows10下编译SSD cuda8.0 失败,cpuonly成功
- 在vs2013+qt中添加控制台
- 64位win10+cuda8.0+vs2013+cuDNN V5下Caffe的编译安装教程并配置matlab2014a 接口
- vs2013 + win8.1 + cuda8.0 手工编译 opencv2.4.9
- wion10 cuda8.0+cudnn+vs2013+matlab2015b+matconvnet编译
- 用mingw编译C++工程;程序里面的cout<<"aaa";cmd运行c++程序,aaa内容在控制台不显示,如何将aaa输出到控制台屏幕并将控制台内容写到一个文件中。
- Theano+cuda8.0+vs2013配置
- 图文:配置CUDA8.0 + VS2013
- win10+vs2013+cuda8.0+caffe
- Windows10系统添加打印机步骤
- VS工程中添加c/c++工程中外部头文件及库的基本步骤
- openalpr编译步骤-vs2013
- Java 控制台编译Java文件并调用
- SQL中,无法连接到本机服务器,error:40-无法打开到SQL Server的连接(Microsoft SQL Server,错误:2)
- linux无法连接网络
- java面试——mybatis 面试题
- hibernate用like进行模糊查询时不能写单引号!!!
- C#实现序列化对象到XML文档与反序列化
- windows10 vs2013控制台工程中添加并编译cuda8.0文件操作步骤
- openssl 的使用
- ATL提供的所有转换宏
- 使用Resources.getIdentifier (name, defType,defPackage)获取资源Id
- Hibernate(6)对象的三种状态
- String与StringBuffer的区别
- IT资料地址分享
- spring定时器表达式
- 线段树小解及模板