cuda 1000 32 block 256 threads 2 改进
来源:互联网 发布:初学者怎么制作软件 编辑:程序博客网 时间:2024/06/05 04:51
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- #include <iostream>
- using namespace std;
// Number of int elements in the input array (host buffer size and kernel loop bound).
- #define data_size 10000
// Threads per block used for the kernel launch; also sizes the dynamic shared-memory buffer.
- #define thread_num 256
// Number of blocks in the launch grid; one partial sum is produced per block.
- #define block_num 32
// Computes per-block partial sums of squares over d_idata[0 .. data_size).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by gridDim.x * blockDim.x, so any grid/block size
// covers the whole input.
// Dynamic shared memory required: blockDim.x * sizeof(int) bytes, passed as
// the third launch argument.
//
// d_idata: device pointer to data_size ints (only read).
// d_odata: device pointer to at least gridDim.x ints; d_odata[b] receives
//          the sum of squares of the elements handled by block b.
__global__ void kernel(int *d_idata,int * d_odata)
{
    extern __shared__ int shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int threads = blockDim.x;
    const int stride = threads * gridDim.x;

    // Accumulate in a register first: shared memory is NOT zero-initialised,
    // so "shared[tid] += ..." would start from an indeterminate value.
    int sum = 0;
    for (int i = bid * threads + tid; i < data_size; i += stride)
    {
        const int v = d_idata[i];
        sum += v * v;
    }
    shared[tid] = sum;
    __syncthreads();

    // Pairwise in-block reduction. "active" is the number of live slots;
    // each step folds the upper part onto the lower part. Rounding the half
    // up keeps this correct for block sizes that are not powers of two.
    // The barrier sits outside the "if" so every thread reaches it.
    for (int active = threads; active > 1; )
    {
        const int half = (active + 1) / 2;
        if (tid + half < active)
        {
            shared[tid] += shared[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // Slot 0 now holds the block total.
    if (tid == 0)
    {
        d_odata[bid] = shared[0];
    }
}
// Reports a failed CUDA runtime call on stdout.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    printf("CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills a host array with random digits, computes the sum of squares on the
// GPU (per-block partial sums added up on the host) and on the CPU, and
// prints both so they can be compared.
// Returns 0 on success, 1 if any CUDA call or the kernel launch failed.
int main()
{
    int h_idata[data_size];
    for (int i = 0; i < data_size; i++)
    {
        h_idata[i] = rand() % 10;
    }

    // Start as NULL so the unconditional cudaFree calls below are safe even
    // when an allocation failed.
    int *d_idata = NULL;
    int *d_odata = NULL;
    int gpu_sum[block_num];

    bool ok = cudaOk(cudaMalloc(&d_idata, sizeof(int) * data_size), "cudaMalloc(d_idata)")
           && cudaOk(cudaMalloc(&d_odata, sizeof(int) * block_num), "cudaMalloc(d_odata)")
           && cudaOk(cudaMemcpy(d_idata, h_idata, sizeof(int) * data_size,
                                cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    if (ok)
    {
        // Third launch argument: one int of dynamic shared memory per thread.
        kernel<<<block_num, thread_num, thread_num * sizeof(int)>>>(d_idata, d_odata);

        // A launch does not return an error itself; query it explicitly.
        // The blocking cudaMemcpy that follows surfaces in-kernel faults.
        ok = cudaOk(cudaGetLastError(), "kernel launch")
          && cudaOk(cudaMemcpy(gpu_sum, d_odata, sizeof(int) * block_num,
                               cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    }

    cudaFree(d_idata);
    cudaFree(d_odata);

    if (!ok)
    {
        return 1;
    }

    int final_gpu_sum = 0;
    for (int i = 0; i < block_num; i++)
    {
        final_gpu_sum += gpu_sum[i];
    }
    printf("final_gpu_sum=%d\n", final_gpu_sum);

    // CPU reference for comparison.
    int cpu_sum = 0;
    for (int i = 0; i < data_size; i++)
    {
        cpu_sum += h_idata[i] * h_idata[i];
    }
    printf("cpu_sum: %d\n", cpu_sum);

    // Keep the console window open until the user presses Enter.
    std::cin.get();
    return 0;
}
0 0
- cuda 1000 32 block 256 threads 2 改进
- cuda 1000 32 block 256 threads 2 改进
- cuda 1000 开 32 block 256 threads
- Understanding CUDA grid dimensions, block dimensions and threads organization
- CUDA Thread Block
- How do CUDA blocks/warps/threads map onto CUDA cores?
- 对cuda函数block中thread的理解(2)
- cuda-Block和Grid设定
- start 2 threads
- 2-PROCESSES AND THREADS
- Threads
- Threads
- Threads
- Threads
- CUDA的Threading:Block和Grid设定
- CUDA的Threading:Block和Grid设定
- CUDA 的 Threading:Block 和 Grid 设定
- CUDA的Threading:Block和Grid设定
- springmvc和struts2的差别
- CentOS fedora 安装并设置MariaDB
- To-read
- thrift安装时出现的问题
- 中介者模式(Mediator Pattern)
- cuda 1000 32 block 256 threads 2 改进
- Java两则常见错误详析及解决
- poj3667线段树
- 【Python基础教程】第4章 字典
- J2EE面试题集锦(附答案)
- jni.h头文件详解(一)
- 黑马程序员——线程的总结(二)
- insmod: error inserting './scull.ko': -1 Unknown symbol in module
- OpenCV 决策树 之 理论准备