CUDA从组成到编程(二) Hello World!

来源：互联网发布：编程使用的安卓模拟器编辑：程序博客网时间：2024/06/05 19:56

首先是官网给的向量加法的例子。
我的环境是ubuntu16, cuda7.5
先是常用命令：

这个的作用是使用自带的工具每0.5s刷新一下GPU的状态，可以作监控用

$ watch -n 0.5 nvidia-smi

检查cuda编译器是否安装正确

$ which nvcc

检查GPU设备节点是否存在（即驱动是否已正确加载；如需查看GPU型号可使用 nvidia-smi -L）

$ ls  -l /dev/nv*

编译与运行

$ nvcc hello.cu -o hello
$ ./hello

#include <stdio.h>
#include <stdlib.h>

/* experiment with N */
/* how large can it be? */
#define N (2048*2048)
#define THREADS_PER_BLOCK 512

/*
 * Evaluate a CUDA runtime call; if it does not return cudaSuccess, print the
 * source location and the runtime's error string to stderr and exit.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise vector addition on the device: c[i] = a[i] + b[i].
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
 * contain more threads than N (the host rounds the block count up), so
 * threads whose index falls at or beyond N do nothing.
 *
 * a, b: input arrays (device pointers), at least N ints each.
 * c:    output array (device pointer), at least N ints.
 */
__global__ void vector_add(int *a, int *b, int *c)
{
    /* flat global index of this thread */
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    /* guard the grid tail: required whenever N is not a multiple of the block size */
    if (index < N) {
        c[index] = a[index] + b[index];
    }
}

/*
 * Host driver: fills a[i] = b[i] = i, adds the two vectors on the GPU and
 * prints the first and last element of the result.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA error.
 */
int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    /* byte count kept in size_t so that a larger N cannot overflow an int */
    size_t size = (size_t)N * sizeof( int );

    /* allocate space for host copies of a, b, c and setup input values */
    a = (int *)malloc( size );
    b = (int *)malloc( size );
    c = (int *)malloc( size );
    if( a == NULL || b == NULL || c == NULL )
    {
        fprintf( stderr, "host allocation of %zu bytes failed\n", size );
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for( int i = 0; i < N; i++ )
    {
        a[i] = b[i] = i;
        c[i] = 0;
    }

    /* allocate space for device copies of a, b, c */
    CUDA_CHECK( cudaMalloc( (void **) &d_a, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_b, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_c, size ) );

    /* copy inputs to device */
    CUDA_CHECK( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) );

    /* launch the kernel on the GPU: ceil(N / THREADS_PER_BLOCK) blocks */
    vector_add<<< (N + (THREADS_PER_BLOCK-1)) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );
    /* a kernel launch returns nothing; an invalid launch configuration is reported here */
    CUDA_CHECK( cudaGetLastError() );

    /* copy result back to host; this call's status is checked as well */
    CUDA_CHECK( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) );

    printf( "c[0] = %d\n",c[0] );
    printf( "c[%d] = %d\n",N-1, c[N-1] );

    /* clean up */
    free(a);
    free(b);
    free(c);
    CUDA_CHECK( cudaFree( d_a ) );
    CUDA_CHECK( cudaFree( d_b ) );
    CUDA_CHECK( cudaFree( d_c ) );

    return 0;
} /* end main */

0 0