Java通过JNI调用CUDA矩阵乘法

——NG

First, create a CUDA DLL project named Project4 in VS2008 and a new Java project. The Java code is as follows:

public class java1 {
 static
 {
  System.loadLibrary("public class java1 {
 static
 {
  System.loadLibrary("Project4");
 }
 public native static int cuda(int i);
 public static void main(String[] args)
 {
  java1.cuda(0);
 }
}

Then compile the Java class, generate the .h header file with javah, and place it in the CUDA DLL project Project4.
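For example, javac java1.java followed by javah java1 produces java1.h (on JDK 10 and later, where javah has been removed, javac -h . java1.java does the same). The generated header should contain a declaration roughly like the following sketch; the exact include guards and comments vary by JDK version:

/* Sketch of the javah-generated java1.h (boilerplate varies by JDK) */
#include <jni.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Class: java1,  Method: cuda,  Signature: (I)I */
JNIEXPORT jint JNICALL Java_java1_cuda(JNIEnv *, jclass, jint);

#ifdef __cplusplus
}
#endif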

The Project4 code is as follows:

/********************************************************************
*  sample.cu
*  This is an example of a CUDA program.
*********************************************************************/


#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>   // fabs() used in compare_mat()
#include <cuda_runtime.h>
#include "java1.h"
#define BLOCK_SIZE 16

/************************************************************************/
/* Init CUDA                                                            */
/************************************************************************/
#if __DEVICE_EMULATION__

bool InitCUDA(void){return true;}

#else
bool InitCUDA(void)
{
 int count = 0;
 int i = 0;

 cudaGetDeviceCount(&count);
 if(count == 0) {
  fprintf(stderr, "There is no device.\n");
  return false;
 }

 for(i = 0; i < count; i++) {
  cudaDeviceProp prop;
  if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
   if(prop.major >= 1) {
    break;
   }
  }
 }
 if(i == count) {
  fprintf(stderr, "There is no device supporting CUDA.\n");
  return false;
 }
 cudaSetDevice(i);

 printf("CUDA initialized.\n");
 return true;
}

#endif
/************************************************************************/
/* Example                                                              */
/************************************************************************/

__global__ static void matMultCUDA(const float* a, size_t lda, const float* b, size_t ldb, float* c, size_t ldc, int n)
{
 // Shared-memory tiles holding one BLOCK_SIZE x BLOCK_SIZE sub-block of A and B
 __shared__ float matA[BLOCK_SIZE][BLOCK_SIZE];
 __shared__ float matB[BLOCK_SIZE][BLOCK_SIZE];
 const int tidc = threadIdx.x;
 const int tidr = threadIdx.y;
 const int bidc = blockIdx.x * BLOCK_SIZE;
 const int bidr = blockIdx.y * BLOCK_SIZE;
 int i, j;

 float results = 0;
 float comp = 0;

 // Walk along the row/column of tiles; each iteration stages one tile of A and one of B
 for(j = 0; j < n; j += BLOCK_SIZE) {
  matA[tidr][tidc] = a[(tidr + bidr) * lda + tidc + j];
  matB[tidr][tidc] = b[(tidr + j) * ldb + tidc + bidc];

  __syncthreads();

  // Accumulate the partial dot product with Kahan (compensated) summation;
  // comp carries the rounding error so the single-precision sum stays accurate.
  for(i = 0; i < BLOCK_SIZE; i++) {
   float t;
   comp -= matA[tidr][i] * matB[i][tidc];
   t = results - comp;
   comp = (t - results) + comp;
   results = t;
  }

  __syncthreads();
 }

 c[(tidr + bidr) * ldc + tidc + bidc] = results;
}


/************************************************************************/
/* matmultCUDA: host-side wrapper for the kernel                        */
/************************************************************************/

clock_t matmultCUDA(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
 float *ac, *bc, *cc;
 clock_t start, end;
 size_t pitch_a, pitch_b, pitch_c;
 // Round n up to a multiple of BLOCK_SIZE so the padded device arrays cover whole tiles
 int newn = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE;

 start = clock();
 cudaMallocPitch((void**) &ac, &pitch_a, sizeof(float) * newn, newn);
 cudaMallocPitch((void**) &bc, &pitch_b, sizeof(float) * newn, newn);
 cudaMallocPitch((void**) &cc, &pitch_c, sizeof(float) * newn, newn);

 cudaMemset(ac, 0, pitch_a * newn);
 cudaMemset(bc, 0, pitch_b * newn);

 cudaMemcpy2D(ac, pitch_a, a, sizeof(float) * lda, sizeof(float) * n, n, cudaMemcpyHostToDevice);
 cudaMemcpy2D(bc, pitch_b, b, sizeof(float) * ldb, sizeof(float) * n, n, cudaMemcpyHostToDevice);

 // One BLOCK_SIZE x BLOCK_SIZE thread block per output tile; pitches are passed in elements, not bytes
 int bx = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
 dim3 blocks(bx, bx);
 dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
 matMultCUDA<<<blocks, threads>>>(ac, pitch_a / sizeof(float), bc, pitch_b / sizeof(float), cc, pitch_c / sizeof(float), n);

 cudaMemcpy2D(c, sizeof(float) * ldc, cc, pitch_c, sizeof(float) * n, n, cudaMemcpyDeviceToHost);

 cudaFree(ac);
 cudaFree(bc);
 cudaFree(cc);

 end = clock();

 return end - start;
}
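Note that the host code above does not check the return codes of the CUDA API calls or of the kernel launch. A minimal error-checking helper, shown here only as an illustrative sketch (not part of the original project), could be added to sample.cu:

// Illustrative sketch of a CUDA error-checking macro (not in the original code).
#define CUDA_CHECK(call)                                                    \
 do {                                                                       \
  cudaError_t err_ = (call);                                                \
  if (err_ != cudaSuccess) {                                                \
   fprintf(stderr, "CUDA error %s at %s:%d\n",                              \
           cudaGetErrorString(err_), __FILE__, __LINE__);                   \
  }                                                                         \
 } while (0)

// Usage examples:
//   CUDA_CHECK(cudaMemset(ac, 0, pitch_a * newn));
//   CUDA_CHECK(cudaGetLastError());   // right after the kernel launch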


// Plain CPU matrix multiplication, used as a reference to validate the GPU result
void matmult(const float* a, int lda, const float* b, int ldb, float* c, int ldc, int n)
{
 int i, j, k;

 for(i = 0; i < n; i++) {
  for(j = 0; j < n; j++) {
   double t = 0;
   for(k = 0; k < n; k++) {
    t += a[i * lda + k] * b[k * ldb + j];
   }
   c[i * ldc + j] = t;
  }
 }
}


void matgen(float* a, int lda, int n)
{
 int i, j;

 for(i = 0; i < n; i++) {
  for(j = 0; j < n; j++) {
   a[i * lda + j] = (float) rand() / RAND_MAX + (float) rand() / (RAND_MAX * RAND_MAX);
  }
 }
}


// Compare two matrices element by element and report the maximum and average relative error
void compare_mat(const float* a, int lda, const float* b, int ldb, int n)
{
 float max_err = 0;
 float average_err = 0;
 int i, j;

 for(i = 0; i < n; i++) {
  for(j = 0; j < n; j++) {
   if(b[i * ldb + j] != 0) {
    float err = fabs((a[i * lda + j] - b[i * ldb + j]) / b[i * ldb + j]);
    if(max_err < err) max_err = err;
    average_err += err;
   }
  }
 }

 printf("Max error: %g  Average error: %g\n", max_err, average_err / (n * n));
}

// JNI entry point called from java1.cuda(): multiply two random n x n matrices on the GPU,
// check the result against the CPU version, and return the argument unchanged.
JNIEXPORT jint JNICALL Java_java1_cuda (JNIEnv *, jclass, jint p)
{
 float *a, *b, *c, *d;
 int n = 1000;
 int q=p;
 if(!InitCUDA()) {
  return 0;
 }

 a = (float*) malloc(sizeof(float) * n * n);
 b = (float*) malloc(sizeof(float) * n * n);
 c = (float*) malloc(sizeof(float) * n * n);
 d = (float*) malloc(sizeof(float) * n * n);

 srand(0);

 matgen(a, n, n);
 matgen(b, n, n);

 clock_t time = matmultCUDA(a, n, b, n, c, n, n);

 matmult(a, n, b, n, d, n, n);
 compare_mat(c, n, d, n, n);

 double sec = (double) time / CLOCKS_PER_SEC;
 printf("Time used: %.4lf   (%.2lf GFLOPS)\n", sec, 2.0 * n * n * n / (sec * 1E9));

 free(a);
 free(b);
 free(c);
 free(d);
 return q;
}

 

Change sample.def to the following so that the JNI entry point is exported by name:

LIBRARY "Project4"
EXPORTS
 Java_java1_cuda

Copy jni.h and jni_md.h into the Project4 project directory (alternatively, add the JDK's include and include\win32 directories to the project's include path).

Then build the project to produce Project4.dll and copy it into the Java project directory.

Finally, run the Java program.
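If the DLL is not on the JVM's default library search path, the path can be given explicitly, for example (assuming the compiled class and Project4.dll are in the current directory): java -Djava.library.path=. java1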

The output is as follows:

CUDA initialized.
Max error: 1.19209e-007  Average error: 4.22751e-008
Time used: 0.2650   (7.55 GFLOPS)

 

(End)

/*

QQ (for discussion): 798618048

*/