多人多车求距离：CPU 与 GPU 实现（寄存器优化、共享内存 shared memory 优化）

来源:互联网 发布:步步高9688软件下载 编辑:程序博客网 时间:2024/04/28 22:11
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <ctime>
  4. #include <iostream>
  5. #include <cmath>
  6. using namespace std;
  7. #define M 3200 // number of persons (rows of the distance matrix)
  8. #define N 3200 // number of cars (columns of the distance matrix)
  9. #define B_S 32 // edge of the CUDA thread block; also the length of the kernel's shared arrays
  10. #define tile_x 2 // compute_cpu splits the N cars into this many tiles
  11. #define tile_y 2 // compute_cpu splits the M persons into this many tiles
  12. #define SHOW // when defined, main prints every input point and both distance matrices
  13. // D[M][N]: D[i][j] is the distance from person P[i] to car C[j], stored row-major
  /*
   * Kernel: Euclidean distance between every person and every car.
   *
   * Thread (index_x, index_y) writes distance[index_y * n + index_x], the
   * distance from person index_y at (px, py) to car index_x at (x, y).
   *
   * Launch layout: 2D grid of 2D blocks. blockDim.x and blockDim.y must not
   * exceed B_S, because each block stages the coordinates it needs in
   * shared arrays of length B_S.
   *
   * x, y      car coordinates, n elements each (device pointers)
   * px, py    person coordinates, m elements each (device pointers)
   * distance  output, m rows by n columns, row-major (device pointer)
   * m, n      number of persons / number of cars
   */
  __global__ void distance_gpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
  {
      __shared__ float px_s[B_S], py_s[B_S], x_s[B_S], y_s[B_S];

      const int tx = threadIdx.x;
      const int ty = threadIdx.y;
      const int index_x = blockIdx.x * blockDim.x + tx;   // car (column) index
      const int index_y = blockIdx.y * blockDim.y + ty;   // person (row) index
      const bool car_in_range = index_x < n;
      const bool person_in_range = index_y < m;

      // The first row of threads stages this block's car coordinates.
      if (ty == 0 && car_in_range)
      {
          x_s[tx] = x[index_x];
          y_s[tx] = y[index_x];
      }
      // The first column of threads stages this block's person coordinates.
      if (tx == 0 && person_in_range)
      {
          px_s[ty] = px[index_y];
          py_s[ty] = py[index_y];
      }

      // Every thread of the block must reach the barrier, so out-of-range
      // threads are masked with predicates instead of returning early.
      __syncthreads();

      if (car_in_range && person_in_range)
      {
          const float dx = px_s[ty] - x_s[tx];
          const float dy = py_s[ty] - y_s[tx];
          // size_t keeps the flat index from overflowing int for large m * n.
          distance[(size_t)index_y * n + index_x] = sqrtf(dx * dx + dy * dy);
      }
  }
  /*
   * CPU reference: distance from each of m persons to each of n cars.
   *
   * The result for (i, j) is stored at distance[i * N + j]. The row stride is
   * the file-level car count N rather than n, so `distance` may point at the
   * top-left corner of a tile inside a matrix that has N columns.
   *
   * x, y      car coordinates, n elements each
   * px, py    person coordinates, m elements each
   * distance  output, m rows of n results, consecutive rows N floats apart
   */
  void distance_cpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
  {
      for (int i = 0; i < m; ++i)
      {
          float *row = distance + (size_t)i * N;
          for (int j = 0; j < n; ++j)
          {
              // Keep the differences in float: assigning them to int would
              // truncate any fractional part before squaring.
              const float dx = px[i] - x[j];
              const float dy = py[i] - y[j];
              row[j] = std::sqrt(dx * dx + dy * dy);
          }
      }
  }
  /* Print a readable message and terminate when a CUDA runtime call fails. */
  static void check_cuda(cudaError_t status, const char *what)
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /*
   * Compute the m-by-n person-to-car distance matrix on the GPU and print the
   * elapsed time. The timed region covers the host-to-device copies, the
   * kernel and the device-to-host copy; allocation is not timed.
   *
   * x, y      car coordinates, n elements each (host memory)
   * px, py    person coordinates, m elements each (host memory)
   * distance  host buffer receiving m * n results, row-major
   * m, n      number of persons / number of cars
   */
  void compute_gpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
  {
      // Nothing to compute; a grid with a zero dimension is an invalid launch.
      if (m <= 0 || n <= 0)
          return;

      const size_t person_bytes = sizeof(float) * (size_t)m;
      const size_t car_bytes = sizeof(float) * (size_t)n;
      const size_t matrix_bytes = sizeof(float) * (size_t)m * (size_t)n;

      float *dx = NULL, *dy = NULL, *dpx = NULL, *dpy = NULL, *dd = NULL;
      check_cuda(cudaMalloc((void **)&dpx, person_bytes), "cudaMalloc(dpx)");
      check_cuda(cudaMalloc((void **)&dpy, person_bytes), "cudaMalloc(dpy)");
      check_cuda(cudaMalloc((void **)&dx, car_bytes), "cudaMalloc(dx)");
      check_cuda(cudaMalloc((void **)&dy, car_bytes), "cudaMalloc(dy)");
      check_cuda(cudaMalloc((void **)&dd, matrix_bytes), "cudaMalloc(dd)");

      // Start of the timed region.
      float elapsedTime = 0.0f;
      cudaEvent_t start, stop;
      check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
      check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
      check_cuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");

      check_cuda(cudaMemcpy(dx, x, car_bytes, cudaMemcpyHostToDevice), "copy x to device");
      check_cuda(cudaMemcpy(dy, y, car_bytes, cudaMemcpyHostToDevice), "copy y to device");
      check_cuda(cudaMemcpy(dpx, px, person_bytes, cudaMemcpyHostToDevice), "copy px to device");
      check_cuda(cudaMemcpy(dpy, py, person_bytes, cudaMemcpyHostToDevice), "copy py to device");

      // One thread per matrix element; ceil-div so partial blocks cover the edges.
      dim3 dimGrid((n + B_S - 1) / B_S, (m + B_S - 1) / B_S);
      dim3 dimBlock(B_S, B_S);
      distance_gpu<<<dimGrid, dimBlock>>>(dx, dy, dpx, dpy, dd, m, n);
      // A launch does not return a status; configuration errors surface here.
      check_cuda(cudaGetLastError(), "distance_gpu launch");

      // Blocking copy: also surfaces any fault raised while the kernel ran.
      check_cuda(cudaMemcpy(distance, dd, matrix_bytes, cudaMemcpyDeviceToHost), "copy distance to host");

      // End of the timed region.
      check_cuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
      check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
      check_cuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
      printf("the time on gpu is %f ms\n", elapsedTime);

      check_cuda(cudaFree(dx), "cudaFree(dx)");
      check_cuda(cudaFree(dy), "cudaFree(dy)");
      check_cuda(cudaFree(dpx), "cudaFree(dpx)");
      check_cuda(cudaFree(dpy), "cudaFree(dpy)");
      check_cuda(cudaFree(dd), "cudaFree(dd)");
      check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
      check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
  }
  /*
   * Compute the person-to-car distance matrix on the CPU in tile_y * tile_x
   * tiles and print the elapsed time.
   *
   * For each tile the person and car coordinates are copied into small local
   * arrays, and distance_cpu writes its results straight into the matching
   * sub-block of `distance` (rows N floats apart).
   *
   * The problem size comes from the file-level M and N; the m and n
   * parameters are accepted for signature symmetry with compute_gpu but are
   * not read. M must be divisible by tile_y and N by tile_x, otherwise the
   * remainder rows/columns are never computed.
   */
  void compute_cpu(float *x, float *y, float *px, float *py, float *distance, int m, int n)
  {
      (void)m;
      (void)n;

      const int stride_x = N / tile_x;   // cars per tile
      const int stride_y = M / tile_y;   // persons per tile
      float x_l[stride_x], y_l[stride_x], px_l[stride_y], py_l[stride_y];

      clock_t start, finish;
      start = clock();
      for (int p = 0; p < tile_y; p++)
      {
          // Stage the persons of tile row p.
          for (int j = 0; j < stride_y; j++)
          {
              px_l[j] = px[p * stride_y + j];
              py_l[j] = py[p * stride_y + j];
          }
          for (int q = 0; q < tile_x; q++)
          {
              // Stage the cars of tile column q.
              for (int i = 0; i < stride_x; i++)
              {
                  x_l[i] = x[q * stride_x + i];
                  y_l[i] = y[q * stride_x + i];
              }
              // Top-left corner of tile (p, q) inside the full matrix.
              float *distance_l = distance + (size_t)p * N * stride_y + (size_t)q * stride_x;
              // distance_cpu takes (person count, car count): rows first.
              distance_cpu(x_l, y_l, px_l, py_l, distance_l, stride_y, stride_x);
          }
      }
      finish = clock();
      // clock() counts ticks; convert to milliseconds via CLOCKS_PER_SEC.
      printf("the time on cpu is %f ms\n", 1000.0 * (double)(finish - start) / CLOCKS_PER_SEC);
  }
  /*
   * Compare two m-by-n row-major matrices element by element.
   *
   * Prints one error message and stops at the first element whose absolute
   * difference exceeds 1e-5 (a NaN difference also counts as a mismatch).
   *
   * Returns true when every element matches within the tolerance, false
   * otherwise. Callers that only want the printed message may ignore it.
   */
  bool verify(float *C1, float *C2, int m, int n)
  {
      for (int i = 0; i < m; i++)
      {
          for (int j = 0; j < n; j++)
          {
              // Both matrices have n columns, so they share one flat index.
              const size_t idx = (size_t)i * n + j;
              const float diff = std::fabs(C2[idx] - C1[idx]);
              // Written as !(diff <= tol) so that NaN is reported as well.
              if (!(diff <= 1e-5))
              {
                  printf("error! results are not equel!");
                  return false;
              }
          }
      }
      return true;
  }
  /* Allocate `count` floats with malloc, or report the failure and exit. */
  static float *alloc_floats(size_t count)
  {
      float *buffer = (float *)malloc(count * sizeof(float));
      if (buffer == NULL)
      {
          fprintf(stderr, "out of memory allocating %lu floats\n", (unsigned long)count);
          exit(EXIT_FAILURE);
      }
      return buffer;
  }

  #ifdef SHOW
  /* Print a rows-by-cols row-major matrix, one row per output line. */
  static void print_matrix(const float *matrix, int rows, int cols)
  {
      for (int i = 0; i < rows; i++)
      {
          for (int j = 0; j < cols; j++)
              cout << matrix[i * cols + j] << " ";
          cout << endl;
      }
  }
  #endif // SHOW

  /*
   * Driver: generate N cars and M persons on a 10x10 integer grid, compute
   * the M-by-N distance matrix on the CPU and on the GPU, and compare the
   * two results. With SHOW defined, the inputs and both matrices are printed.
   */
  int main()
  {
      float *px = alloc_floats(M);
      float *py = alloc_floats(M);
      float *x = alloc_floats(N);
      float *y = alloc_floats(N);
      float *distance1 = alloc_floats((size_t)N * M);   // CPU result
      float *distance2 = alloc_floats((size_t)N * M);   // GPU result

      // Car coordinates.
      for (int i = 0; i < N; i++)
      {
          x[i] = rand() % 10;
          y[i] = rand() % 10;
  #ifdef SHOW
          cout << " (" << x[i] << "," << y[i] << ")";
  #endif // SHOW
      }
      // Person coordinates.
      for (int i = 0; i < M; i++)
      {
          px[i] = rand() % 10;
          py[i] = rand() % 10;
  #ifdef SHOW
          cout << endl << "(" << px[i] << "," << py[i] << ")" << endl;
  #endif // SHOW
      }

      compute_cpu(x, y, px, py, distance1, M, N);
  #ifdef SHOW
      print_matrix(distance1, M, N);
  #endif // SHOW

      compute_gpu(x, y, px, py, distance2, M, N);
  #ifdef SHOW
      print_matrix(distance2, M, N);
  #endif // SHOW

      verify(distance1, distance2, M, N);

      free(x);
      free(y);
      free(px);
      free(py);
      free(distance1);
      free(distance2);
      return 0;
  }



来自为知笔记(Wiz)


0 0
原创粉丝点击