使用CUBLAS的一些小例子

来源:互联网 发布:淘宝家装设计 编辑:程序博客网 时间:2024/04/27 18:45

1. 矩阵相乘再加 C = a*A*B + b*C(示例中 a=1、b=0;A 为 M×N,B 为 N×M,结果 C 为 M×M)

#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <time.h>
#include <string.h>  // memset (original relied on a transitive include)
#include <iostream>
using namespace std;

int const M = 6;   // rows of A; rows and cols of C
int const N = 10;  // cols of A; rows of B

/*
 * Computes C = a*A*B + b*C (a = 1, b = 0) with cublasSgemm and checks the
 * result against a plain CPU triple loop.
 *
 * A (M x N) and B (N x M) are stored row-major on the host.  cuBLAS is
 * column-major, so a row-major matrix is seen by cuBLAS as its own
 * transpose; passing CUBLAS_OP_T for both operands (with lda = N, ldb = M)
 * recovers A*B.  The M x M result comes back column-major, which is why
 * the final print reads element (i, j) from h_C[j*M + i].
 */
int main()
{
    cublasStatus_t status;

    /* Host memory allocation. */
    float *h_A = (float*)malloc(N * M * sizeof(float));
    float *h_B = (float*)malloc(N * M * sizeof(float));
    float *h_C = (float*)malloc(M * M * sizeof(float));
    float *h_C_cpu = (float*)malloc(M * M * sizeof(float));
    memset(h_C_cpu, 0, M * M * sizeof(float));

    /* Fill A and B with small random integers in [1, 10] and print them. */
    for (int i = 0; i < M * N; i++)
    {
        h_A[i] = (float)(rand() % 10 + 1);
        h_B[i] = (float)(rand() % 10 + 1);
    }
    cout << "Matrix A is:" << endl;
    for (int i = 0; i < M * N; i++)
    {
        cout << h_A[i] << " ";
        if ((i + 1) % N == 0)  // A printed as M rows of N values
        {
            cout << endl;
        }
    }
    cout << endl;
    cout << "Matrix B is:" << endl;
    for (int i = 0; i < M * N; i++)
    {
        cout << h_B[i] << " ";
        if ((i + 1) % M == 0)  // B printed as N rows of M values
        {
            cout << endl;
        }
    }
    cout << endl;

    /* CPU reference: h_C_cpu = A * B, row-major. */
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < M; j++)
        {
            for (int k = 0; k < N; k++)
            {
                h_C_cpu[i * M + j] += h_A[i * N + k] * h_B[k * M + j];
            }
        }
    }
    cout << "The result from CPU is:" << endl;
    for (int i = 0; i < M * M; i++)
    {
        cout << h_C_cpu[i] << " ";
        if ((i + 1) % M == 0)
        {
            cout << endl;
        }
    }
    cout << endl;

    /* Create the cuBLAS handle. */
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        if (status == CUBLAS_STATUS_NOT_INITIALIZED)
        {
            cout << "Fail to get an instance of blas object! Check whether you have free the handle!"
                 << endl;
        }
        getchar();
        return EXIT_FAILURE;
    }

    /* Device memory allocation and upload. */
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, N * M * sizeof(float));
    cudaMalloc((void**)&d_B, N * M * sizeof(float));
    cudaMalloc((void**)&d_C, M * M * sizeof(float));
    cublasSetVector(N * M, sizeof(float), h_A, 1, d_A, 1);
    cublasSetVector(N * M, sizeof(float), h_B, 1, d_B, 1);
    cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated/removed

    /* GPU GEMM: both operands transposed because the host data is row-major. */
    float a = 1;
    float b = 0;
    const float *ca = &a;
    const float *cb = &b;
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, M, M, N,
                ca, d_A, N, d_B, M, cb, d_C, M);
    cudaDeviceSynchronize();
    cublasGetVector(M * M, sizeof(float), d_C, 1, h_C, 1);

    /* d_C is column-major: element (i, j) lives at h_C[j*M + i]. */
    cout << "The result from GPU is:" << endl;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < M; j++)
        {
            cout << h_C[j * M + i] << " ";
        }
        cout << endl;
    }

    /* Cleanup (the original leaked h_C_cpu). */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_cpu);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);
    return 0;
}


2.更规范的一个例子,由SDK中的例子改编,与1类似,比较了CPU与GPU运算效率:

/* * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved. * * NOTICE TO USER: * * This source code is subject to NVIDIA ownership rights under U.S. and * international Copyright laws.  Users and possessors of this source code * are hereby granted a nonexclusive, royalty-free license to use this code * in individual and commercial software. * * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE * OR PERFORMANCE OF THIS SOURCE CODE. * * U.S. Government End Users.   This source code is a "commercial item" as * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of * "commercial computer  software"  and "commercial computer software * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995) * and is provided to the U.S. Government only as a commercial end item. * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. * * Any use of this source code in individual and commercial software must * include, in the user documentation and internal comments to the code, * the above Disclaimer and U.S. Government End Users Notice. *//* This example demonstrates how to use the CUBLAS library * by scaling an array of floating-point values on the device * and comparing the result to the same operation performed * on the host. 
*/

/* Includes, system */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>  /* sqrt()/fabs() were used without this include */
#include <iostream>
#include "time.h"

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>

using namespace std;

/* Matrix size */
#define N  (600)

/* Host implementation of a simple version of sgemm.
 *
 * Computes C = alpha*A*B + beta*C for n x n matrices stored in
 * column-major order (matching cuBLAS): element (i, j) is at [j*n + i].
 */
static void simple_sgemm(int n, float alpha, const float *A, const float *B,
                         float beta, float *C)
{
    int i;
    int j;
    int k;

    for (i = 0; i < n; ++i)
    {
        for (j = 0; j < n; ++j)
        {
            float prod = 0;

            for (k = 0; k < n; ++k)
            {
                prod += A[k * n + i] * B[j * n + k];
            }

            C[j * n + i] = alpha * prod + beta * C[j * n + i];
        }
    }
}

/* Main: runs SGEMM on the GPU via cuBLAS, repeats it on the CPU, prints both
 * wall-clock times, and verifies the relative error between the two results.
 *
 * NOTE: "GPU TIME" below includes device allocation and host<->device
 * transfers, not just the cublasSgemm call, so it understates raw GPU
 * compute speed for small N. */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;
    clock_t cuStart, cuFinish;  /* GPU path: alloc + copies + gemm */
    clock_t start, finish;      /* CPU reference */

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    cuStart = clock();

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using cublas (column-major, no transposes) */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha,
                         d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C_ref = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C_ref == 0)
    {
        /* was "(C)": copy-paste error, this allocation is for C_ref */
        fprintf(stderr, "!!!! host memory allocation error (C_ref)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back (cublasGetVector blocks until the gemm is done) */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C_ref, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    cuFinish = clock();

    start = clock();
    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    finish = clock();

    cout<<"GPU TIME:"<<(double)(cuFinish - cuStart)/CLOCKS_PER_SEC<<endl;
    cout<<"CPU TIME:"<<(double)(finish - start)/CLOCKS_PER_SEC<<endl;

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    std::cout<<error_norm<<"\t"<<ref_norm<<std::endl;

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    /* Relative error test: passes when GPU and CPU results agree closely */
    if (error_norm / ref_norm < 1e-6f)
    {
        printf("simpleCUBLAS test passed.\n");
        exit(EXIT_SUCCESS);
    }
    else
    {
        printf("simpleCUBLAS test failed.\n");
        exit(EXIT_FAILURE);
    }
}



0 0
原创粉丝点击