CUDA之旅:矩阵相加

来源:互联网 发布:淘宝网店名字女童店 编辑:程序博客网 时间:2024/05/19 00:48

矩阵相加CUDA实现

// CUDA implementation of element-wise matrix addition.
// Author: Eric Lv
// Email: Eric2014_Lv@sjtu.edu.cn
// Date: 6/7/2017
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
//#include <cuda.h>

// Matrix dimension: every matrix in this program is N x N ints.
#define N 32
// Edge length of one square thread block (BLOCK_DIM * BLOCK_DIM threads).
// 16 x 16 = 256 threads stays well below the per-block thread limit, so the
// launch no longer depends on N * N fitting into a single block.
#define BLOCK_DIM 16

/*
 * Kernel: c = a + b, element by element, for N x N int matrices that live
 * in device memory.
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension spans columns and
 * the y dimension spans rows. The grid is allowed to overshoot N in either
 * dimension; threads that fall outside the matrix do nothing.
 *
 * No shared memory is used and no inter-thread synchronization is required.
 */
__global__ void matrix_add(const int a[][N], const int b[][N], int c[][N])
{
    // threadIdx.x varies fastest inside a warp, so it is mapped to the
    // column (the contiguous dimension of a row-major array). Neighbouring
    // threads then touch neighbouring addresses.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard against the grid tail when N is not a multiple of the block size.
    if (row < N && col < N)
    {
        c[row][col] = a[row][col] + b[row][col];
    }
}

/*
 * Reports a failed CUDA runtime call.
 *
 * err  - status returned by the runtime call.
 * what - short description of the step, printed as "<what> is failed!".
 *
 * Returns 0 when err is cudaSuccess, 1 otherwise. On failure the step
 * message goes to stdout and the runtime's error text goes to stderr.
 */
static int cuda_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return 0;
    }
    printf("%s is failed!\n", what);
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
}

/*
 * Host driver: fills two N x N matrices, adds them on the GPU and checks
 * the result against a CPU-side sum.
 *
 * Returns 0 when every element matches, -1 on any allocation, copy, launch
 * or verification failure. All host and device buffers are released on
 * every exit path.
 */
int main(void)
{
    // Everything is declared up front so that the `goto cleanup` jumps
    // below never cross an initialization.
    const size_t bytes = sizeof(int) * N * N;
    int status = -1;
    int i;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int *host_a = NULL, *host_b = NULL, *host_c = NULL;

    // Threads per block, and enough blocks (ceil-div) to cover the matrix.
    dim3 threads_in_block(BLOCK_DIM, BLOCK_DIM);
    dim3 blocks_in_grid((N + BLOCK_DIM - 1) / BLOCK_DIM,
                        (N + BLOCK_DIM - 1) / BLOCK_DIM);

    host_a = (int *)malloc(bytes);
    host_b = (int *)malloc(bytes);
    host_c = (int *)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL)
    {
        printf("Host malloc is failed!\n");
        goto cleanup;
    }

    if (cuda_failed(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc (a)"))
    {
        goto cleanup;
    }
    if (cuda_failed(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc (b)"))
    {
        goto cleanup;
    }
    if (cuda_failed(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc (c)"))
    {
        goto cleanup;
    }

    // Deterministic input pattern, indexed over the flattened matrices.
    for (i = 0; i < N * N; i++)
    {
        host_a[i] = 2 * i + 1;
        host_b[i] = 3 * i - 1;
    }

    if (cuda_failed(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice),
                    "Host to device (a)"))
    {
        goto cleanup;
    }
    if (cuda_failed(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice),
                    "Host to device (b)"))
    {
        goto cleanup;
    }

    // Launch the kernel on the GPU. The flat device buffers are viewed as
    // arrays of N-int rows to match the kernel's parameter types.
    matrix_add<<<blocks_in_grid, threads_in_block>>>((int (*)[N])dev_a,
                                                     (int (*)[N])dev_b,
                                                     (int (*)[N])dev_c);
    // A launch does not return a status itself; configuration errors are
    // only visible through cudaGetLastError().
    if (cuda_failed(cudaGetLastError(), "Kernel launch"))
    {
        goto cleanup;
    }

    // This blocking copy also waits for the kernel to finish, so faults
    // raised during kernel execution are reported here.
    if (cuda_failed(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost),
                    "Device to host (c)"))
    {
        goto cleanup;
    }

    // Verify every element against the CPU-side sum.
    for (i = 0; i < N * N; i++)
    {
        if (host_a[i] + host_b[i] != host_c[i])
        {
            printf("a[%d]%d + b[%d]%d != c[%d]%d.\n", i, host_a[i], i, host_b[i], i, host_c[i]);
            goto cleanup;
        }
    }

    printf("Congratulations! All entris are correct! You have finished the CUDA code!\n");
    status = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are both no-ops, so this is safe no
    // matter how far the program got before failing.
    free(host_a);
    free(host_b);
    free(host_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return status;
}
原创粉丝点击