CUDA 编程——CUDA 硬件信息查询与错误处理

来源：互联网　发布：java外卖cms　编辑：程序博客网　时间：2024/06/06 10:06

一、硬件信息查询：

#include <stdio.h>

// Enumerates the CUDA devices reported by the runtime and prints, for each
// one, its name, memory clock rate, memory bus width and a derived peak
// memory bandwidth figure.
//
// This first version intentionally performs no error checking on the CUDA
// calls; the only safeguard is the zero-initialised device count below.
int main() {
    // Start from zero so that the loop is simply skipped, instead of running
    // with an indeterminate bound, if cudaGetDeviceCount fails without
    // writing the count.
    int nDevices = 0;
    cudaGetDeviceCount(&nDevices);

    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);

        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);

        // clock (kHz) * bus width (bits / 8 = bytes) / 1e6 yields GB/s.
        // NOTE(review): the leading 2.0 presumably accounts for
        // double-data-rate memory -- confirm for the target hardware.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }
    return 0;
}

二、错误处理：

1、代码段一：

#include <stdio.h>

// Same device-property listing as before, now with the return values of the
// CUDA runtime calls checked.
//
// Returns 0 when every CUDA call succeeded and 1 otherwise.
int main() {
    // Zero-initialised so that a failed cudaGetDeviceCount cannot leave the
    // loop below iterating over an indeterminate number of devices.
    int nDevices = 0;

    cudaError_t err = cudaGetDeviceCount(&nDevices);
    if (err != cudaSuccess) printf("%s\n", cudaGetErrorString(err));

    for (int i = 0; i < nDevices; i++) {
        cudaDeviceProp prop;

        // Do not print fields of `prop` if the query itself failed: the
        // struct would not have been filled in.
        cudaError_t propErr = cudaGetDeviceProperties(&prop, i);
        if (propErr != cudaSuccess) {
            printf("%s\n", cudaGetErrorString(propErr));
            err = propErr;  // remember the failure for the exit status
            continue;
        }

        printf("Device Number: %d\n", i);
        printf("  Device name: %s\n", prop.name);
        printf("  Memory Clock Rate (KHz): %d\n",
               prop.memoryClockRate);
        printf("  Memory Bus Width (bits): %d\n", prop.memoryBusWidth);

        // clock (kHz) * bus width (bits / 8 = bytes) / 1e6 yields GB/s.
        // NOTE(review): the leading 2.0 presumably accounts for
        // double-data-rate memory -- confirm for the target hardware.
        printf("  Peak Memory Bandwidth (GB/s): %f\n\n",
               2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
    }

    // Report failure to the caller instead of always exiting with 0.
    return err == cudaSuccess ? 0 : 1;
}

与第一节的代码相比，这段代码主要增加了对 cudaGetDeviceCount 返回值的检查：
cudaError_t err = cudaGetDeviceCount(&nDevices); if (err != cudaSuccess) printf("%s\n", cudaGetErrorString(err));

2、代码段二：

#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// SAXPY kernel: y[i] = a * x[i] + y[i] for every index i < n.
//
// Each thread handles the single element at its flat 1D global index, so the
// launch must provide at least n threads in total (1D grid of 1D blocks).
// Threads whose index falls at or beyond n do nothing.
//
//   n  number of elements in x and y
//   a  scalar multiplier
//   x  input vector (device pointer)
//   y  input/output vector (device pointer), updated in place
__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

// Runs SAXPY on 1M elements, reports launch-time and execution-time kernel
// errors separately, verifies the result on the host and prints the
// effective memory bandwidth of the kernel.
//
// Returns 0 on success, 1 if a host allocation failed or the kernel reported
// an error.
int main(void)
{
    int N = 1 << 20;  // 1M elements
    size_t bytes = N * sizeof(float);
    int status = 0;

    // Separate host buffers (x, y) and device buffers (d_x, d_y); data is
    // moved between them with explicit cudaMemcpy calls.
    float *x, *y, *d_x, *d_y;
    x = (float*)malloc(bytes);
    y = (float*)malloc(bytes);
    if (x == NULL || y == NULL) {
        printf("Host allocation failed\n");
        free(x);  // free(NULL) is a no-op
        free(y);
        return 1;
    }
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_y, bytes);

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice);

    cudaEventRecord(start);
    saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, d_x, d_y);
    // Launch-time errors (e.g. an invalid configuration) are reported here.
    cudaError_t errSync = cudaGetLastError();
    // Record the stop event right behind the kernel so that the measured
    // interval does not include the host-side synchronisation and printing.
    cudaEventRecord(stop);
    // Errors raised while the kernel executes surface at the next
    // synchronising call.
    cudaError_t errAsync = cudaDeviceSynchronize();

    if (errSync != cudaSuccess) {
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
        status = 1;
    }
    if (errAsync != cudaSuccess) {
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
        status = 1;
    }

    cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Check for errors: every element should be 2.0f * 1.0f + 2.0f = 4.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmaxf(maxError, fabsf(y[i] - 4.0f));
    printf("Max error: %f . \n", maxError);

    // The kernel reads x, reads y and writes y: 3 accesses of 4 bytes per
    // element. bytes / ms / 1e6 gives GB/s.
    printf("Effective Bandwidth (GB/s): %f .\n", N*4*3/milliseconds/1e6);

    // Host buffers came from malloc and must be released with free();
    // cudaFree is only valid for pointers obtained from the CUDA allocator.
    free(x);
    free(y);
    cudaFree(d_x);
    cudaFree(d_y);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return status;
}

阅读全文

0 0