CUDA 入门代码

来源：互联网　发布：sweet alert.js　编辑：程序博客网　时间：2024/06/05 06:09
CUDA 入门代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Evaluates a CUDA runtime call; on any status other than cudaSuccess, prints
// the file, line and error string to stderr and terminates the process.
// Without this, a failed allocation/copy/launch goes unnoticed and the program
// just prints stale host data.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < N.
//
// Launch layout: 1D grid of 1D blocks, one element per thread. The grid must
// supply at least N threads in total; threads whose global index is >= N skip
// the store.
// A, B, C are dereferenced on the device, so they must be device-accessible
// pointers to at least N floats each.
// Every launched thread (including those with index >= N) emits one debug
// line via device printf, because the printf precedes the bounds check.
__global__ void VecAdd(const float* A, const float* B, float* C, int N)
{
    printf("Debug: blockDim.x %d blockIdx.x %d  threadIdx.x %d\n", blockDim.x, blockIdx.x, threadIdx.x);

    // Flat global index of this thread within the 1D launch.
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// Host driver: fills two 5-element vectors, adds them on the GPU with VecAdd,
// and prints both inputs and the result, one vector per line.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA call fails.
int main()
{
    const int N = 5;
    size_t size = N * sizeof(float);

    // Allocate input/output vectors in host memory.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        h_A[i] = (float)(i + 1);
        h_B[i] = (float)(5 - i);
        h_C[i] = 0.0f;
    }

    // Allocate vectors in device memory.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    // Copy input vectors from host memory to device memory.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    // Invoke kernel. Ceil-division so the grid covers all N elements even
    // when N is not a multiple of the block size.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status itself: launch-configuration errors
    // are reported by cudaGetLastError(), execution errors by the next
    // synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Copy result from device memory to host memory.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    // Print A, B and C, space-separated, each terminated by a newline.
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_A[i], i == N - 1 ? '\n' : ' ');
    }
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_B[i], i == N - 1 ? '\n' : ' ');
    }
    for (int i = 0; i < N; i++) {
        printf("%f%c", h_C[i], i == N - 1 ? '\n' : ' ');
    }

    // Free device memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef _WIN32
    // "pause" is a Windows shell builtin; keep the console window open there
    // only, instead of triggering a "command not found" on other platforms.
    system("pause");
#endif
    return 0;
}
阅读全文
0 0