CUDA之矩阵乘法——global memory

来源:互联网 发布:Ubuntu grub引导 编辑:程序博客网 时间:2024/06/05 11:22

CUDA 矩阵乘法

使用global memory

报错

  • 错误 17 error : no instance of overloaded function “cudaMalloc” matches the argument list E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 16
    修正:把 cudaMalloc(&Nd, size); 改成 cudaMalloc((void**)&Nd, size);

  • 错误 17 error : argument of type “float” is incompatible with parameter of type “void *” E:\Niki\MVDR_BTR\MVDR_BTR\MatrixMulOnDevice.cu 17
    修正:把 float Md, *Nd, *Pd; 改成 float *Md, *Nd, *Pd;(Md 原先被声明成了 float 而不是 float*,所以不能作为 void* 参数传入)

编译通过的代码

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Side length of the square thread block used to tile the output matrix.
// 16 x 16 = 256 threads per block, safely below the 1024-threads-per-block
// hardware limit that a single Width x Width block hits once Width > 32.
static const int kMatrixMulTile = 16;

__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width);

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "MatrixMulOnDevice: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

// Computes P = M * N on the GPU using global memory only.
//
// M, N  : host pointers to Width x Width row-major input matrices.
// P     : host pointer to a Width x Width row-major output buffer.
// Width : matrix dimension; must be positive.
//
// On any CUDA failure an error is printed to stderr, the device buffers are
// released, and P is left unmodified (the function keeps its void signature,
// so failures are reported rather than returned).
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    if (M == NULL || N == NULL || P == NULL || Width <= 0)
    {
        fprintf(stderr, "MatrixMulOnDevice: invalid arguments\n");
        return;
    }

    // Compute the byte count in size_t so large Width values do not
    // overflow a 32-bit int.
    size_t size = (size_t)Width * (size_t)Width * sizeof(float);
    float *Md = NULL, *Nd = NULL, *Pd = NULL;

    // Allocate M and N on the device and upload them; allocate P.
    bool ok = cudaCallOk(cudaMalloc((void**)&Md, size), "cudaMalloc(Md)");
    ok = ok && cudaCallOk(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(M -> Md)");
    ok = ok && cudaCallOk(cudaMalloc((void**)&Nd, size), "cudaMalloc(Nd)");
    ok = ok && cudaCallOk(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(N -> Nd)");
    ok = ok && cudaCallOk(cudaMalloc((void**)&Pd, size), "cudaMalloc(Pd)");

    if (ok)
    {
        // Execution configuration: tile the output with fixed-size blocks and
        // round the grid up so every element is covered (ceil-div).
        int blocksPerSide = (Width + kMatrixMulTile - 1) / kMatrixMulTile;
        dim3 dimBlock(kMatrixMulTile, kMatrixMulTile);
        dim3 dimGrid(blocksPerSide, blocksPerSide);

        MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

        // A kernel launch returns no status; configuration errors are only
        // visible through cudaGetLastError().
        ok = cudaCallOk(cudaGetLastError(), "MatrixMulKernel launch");
    }

    // Blocking copy back to the host: it waits for the kernel to finish, so
    // in-kernel faults are reported here as well.
    if (ok)
    {
        cudaCallOk(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(Pd -> P)");
    }

    // Always release device memory; cudaFree on a null pointer is a no-op.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}

// Matrix multiplication kernel: each thread computes one element of Pd.
//
// Expects a 2D launch whose grid * block extent covers at least
// Width x Width threads; threads that fall outside the matrix exit without
// touching memory. Md, Nd, Pd are device pointers to Width x Width row-major
// matrices. Uses no shared memory.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Global 2D coordinates of the output element owned by this thread.
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up, so edge blocks contain out-of-range threads.
    if (row >= Width || col >= Width)
    {
        return;
    }

    size_t rowBase = (size_t)row * (size_t)Width;

    // Pvalue accumulates the dot product of row `row` of Md and
    // column `col` of Nd.
    float Pvalue = 0.0f;
    for (int k = 0; k < Width; ++k)
    {
        float Melement = Md[rowBase + k];
        float Nelement = Nd[(size_t)k * (size_t)Width + col];
        Pvalue += Melement * Nelement;
    }

    // Each thread writes exactly one element of the result.
    Pd[rowBase + col] = Pvalue;
}

测试

添加主函数代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;

void MatrixMulOnDevice(float* M, float* N, float* P, int Width);

// Test driver: fills two 8 x 8 row-major matrices, multiplies them with
// MatrixMulOnDevice, and prints the product one row per line.
int main()
{
    int Width = 8;
    float M[64];
    float N[64];
    float P[64];

    // Walk the matrices as flat arrays and recover (row, col) from the index.
    // M holds row + col + 1; N holds row + 1 in every column.
    for (int idx = 0; idx < Width * Width; ++idx)
    {
        int row = idx / Width;
        int col = idx % Width;
        M[idx] = row + col + 1;
        N[idx] = row + 1;
    }

    // The arrays decay to pointers to their first elements.
    MatrixMulOnDevice(M, N, P, Width);

    printf("矩阵相乘结果为:\n");
    for (int row = 0; row < Width; ++row)
    {
        const float* rowValues = P + row * Width;
        for (int col = 0; col < Width; ++col)
        {
            printf(" %f \t", rowValues[col]);
        }
        printf("\n");
    }

    return 0;
}

0 0