CUDA编程札记
来源:互联网 发布:淘宝上买被子靠谱吗 编辑:程序博客网 时间:2024/05/22 00:33
- const int N = 33 * 1024;
- const int threadsPerBlock = 256;
- const int blocksPerGrid =
- imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
- __global__ void dot( float *a, float *b, float *c ) {
- __shared__ float cache[threadsPerBlock];
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- int cacheIndex = threadIdx.x;
- float temp = 0;
- while (tid < N) {
- temp += a[tid] * b[tid];
- tid += blockDim.x * gridDim.x;
- }
- // set the cache values
- cache[cacheIndex] = temp;
- // synchronize threads in this block
- __syncthreads();
- // for reductions, threadsPerBlock must be a power of 2
- // because of the following code
- int i = blockDim.x/2;
- while (i != 0) {
- if (cacheIndex < i)
- cache[cacheIndex] += cache[cacheIndex + i];
- __syncthreads();
- i /= 2;
- }
- if (cacheIndex == 0)
- c[blockIdx.x] = cache[0];
- }
- int main( void ) {
- float *a, *b, c, *partial_c;
- float *dev_a, *dev_b, *dev_partial_c;
- // allocate memory on the cpu side
- a = (float*)malloc( N*sizeof(float) );
- b = (float*)malloc( N*sizeof(float) );
- partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
- // allocate the memory on the GPU
- HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
- N*sizeof(float) ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
- N*sizeof(float) ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
- blocksPerGrid*sizeof(float) ) );
- // fill in the host memory with data
- for (int i=0; i<N; i++) {
- a[i] = i;
- b[i] = i*2;
- }
- // copy the arrays 'a' and 'b' to the GPU
- HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
- cudaMemcpyHostToDevice ) );
- HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
- cudaMemcpyHostToDevice ) );
- dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
- dev_partial_c );
- // copy the array 'c' back from the GPU to the CPU
- HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
- blocksPerGrid*sizeof(float),
- cudaMemcpyDeviceToHost ) );
- // finish up on the CPU side
- c = 0;
- for (int i=0; i<blocksPerGrid; i++) {
- c += partial_c[i];
- }
- #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
- printf( "Does GPU value %.6g = %.6g?\n", c,
- 2 * sum_squares( (float)(N - 1) ) );
- // free memory on the gpu side
- HANDLE_ERROR( cudaFree( dev_a ) );
- HANDLE_ERROR( cudaFree( dev_b ) );
- HANDLE_ERROR( cudaFree( dev_partial_c ) );
- // free memory on the cpu side
- free( a );
- free( b );
- free( partial_c );
- }
- struct Lock {
- int *mutex;
- Lock( void ) {
- HANDLE_ERROR( cudaMalloc( (void**)&mutex,sizeof(int) ) );
- HANDLE_ERROR( cudaMemset( mutex, 0, sizeof(int) ) );
- }
- ~Lock( void ) {
- cudaFree( mutex );
- }
- __device__ void lock( void ) {
- while( atomicCAS( mutex, 0, 1 ) != 0 );
- }
- __device__ void unlock( void ) {
- atomicExch( mutex, 0 );
- }
- };
- #define imin(a,b) (a<b?a:b)
- const int N = 33 * 1024 * 1024;
- const int threadsPerBlock = 256;
- const int blocksPerGrid =
- imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
- __global__ void dot( Lock lock, float *a,
- float *b, float *c ) {
- __shared__ float cache[threadsPerBlock];
- int tid = threadIdx.x + blockIdx.x * blockDim.x;
- int cacheIndex = threadIdx.x;
- float temp = 0;
- while (tid < N) {
- temp += a[tid] * b[tid];
- tid += blockDim.x * gridDim.x;
- }
- // set the cache values
- cache[cacheIndex] = temp;
- // synchronize threads in this block
- __syncthreads();
- // for reductions, threadsPerBlock must be a power of 2
- // because of the following code
- int i = blockDim.x/2;
- while (i != 0) {
- if (cacheIndex < i)
- cache[cacheIndex] += cache[cacheIndex + i];
- __syncthreads();
- i /= 2;
- }
- if (cacheIndex == 0) {
- // wait until we get the lock
- lock.lock();
- // we have the lock at this point, update and release
- *c += cache[0];
- lock.unlock();
- }
- }
- int main( void ) {
- float *a, *b, c = 0;
- float *dev_a, *dev_b, *dev_c;
- // allocate memory on the cpu side
- a = (float*)malloc( N*sizeof(float) );
- b = (float*)malloc( N*sizeof(float) );
- // allocate the memory on the GPU
- HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
- N*sizeof(float) ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
- N*sizeof(float) ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_c,
- sizeof(float) ) );
- // fill in the host memory with data
- for (int i=0; i<N; i++) {
- a[i] = i;
- b[i] = i*2;
- }
- // copy the arrays 'a' and 'b' to the GPU
- HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
- cudaMemcpyHostToDevice ) );
- HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
- cudaMemcpyHostToDevice ) );
- HANDLE_ERROR( cudaMemcpy( dev_c, &c, sizeof(float),
- cudaMemcpyHostToDevice ) );
- Lock lock;
- dot<<<blocksPerGrid,threadsPerBlock>>>( lock, dev_a,
- dev_b, dev_c );
- // copy c back from the GPU to the CPU
- HANDLE_ERROR( cudaMemcpy( &c, dev_c,
- sizeof(float),
- cudaMemcpyDeviceToHost ) );
- #define sum_squares(x) (x*(x+1)*(2*x+1)/6)
- printf( "Does GPU value %.6g = %.6g?\n", c,
- 2 * sum_squares( (float)(N - 1) ) );
- // free memory on the gpu side
- HANDLE_ERROR( cudaFree( dev_a ) );
- HANDLE_ERROR( cudaFree( dev_b ) );
- HANDLE_ERROR( cudaFree( dev_c ) );
- // free memory on the cpu side
- free( a );
- free( b );
- }
- __global__ void histo_kernel( unsigned char *buffer,
- long size,
- unsigned int *histo ) {
- // calculate the starting index and the offset to the next
- // block that each thread will be processing
- int i = threadIdx.x + blockIdx.x * blockDim.x;
- int stride = blockDim.x * gridDim.x;
- while (i < size) {
- atomicAdd( &histo[buffer[i]], 1 );
- i += stride;
- }
- }
- int main( void ) {
- unsigned char *buffer =
- (unsigned char*)big_random_block( SIZE );
- // capture the start time
- // starting the timer here so that we include the cost of
- // all of the operations on the GPU.
- cudaEvent_t start, stop;
- HANDLE_ERROR( cudaEventCreate( &start ) );
- HANDLE_ERROR( cudaEventCreate( &stop ) );
- HANDLE_ERROR( cudaEventRecord( start, 0 ) );
- // allocate memory on the GPU for the file's data
- unsigned char *dev_buffer;
- unsigned int *dev_histo;
- HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
- HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
- cudaMemcpyHostToDevice ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
- 256 * sizeof( int ) ) );
- HANDLE_ERROR( cudaMemset( dev_histo, 0,
- 256 * sizeof( int ) ) );
- // kernel launch - 2x the number of mps gave best timing
- cudaDeviceProp prop;
- HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
- int blocks = prop.multiProcessorCount;
- histo_kernel<<<blocks*2,256>>>( dev_buffer, SIZE, dev_histo );
- unsigned int histo[256];
- HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
- 256 * sizeof( int ),
- cudaMemcpyDeviceToHost ) );
- // get stop time, and display the timing results
- HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
- HANDLE_ERROR( cudaEventSynchronize( stop ) );
- float elapsedTime;
- HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
- start, stop ) );
- printf( "Time to generate: %3.1f ms\n", elapsedTime );
- long histoCount = 0;
- for (int i=0; i<256; i++) {
- histoCount += histo[i];
- }
- printf( "Histogram Sum: %ld\n", histoCount );
- // verify that we have the same counts via CPU
- for (int i=0; i<SIZE; i++)
- histo[buffer[i]]--;
- for (int i=0; i<256; i++) {
- if (histo[i] != 0)
- printf( "Failure at %d! Off by %d\n", i, histo[i] );
- }
- HANDLE_ERROR( cudaEventDestroy( start ) );
- HANDLE_ERROR( cudaEventDestroy( stop ) );
- cudaFree( dev_histo );
- cudaFree( dev_buffer );
- free( buffer );
- return 0;
- }
- __global__ void histo_kernel( unsigned char *buffer,
- long size,
- unsigned int *histo ) {
- // clear out the accumulation buffer called temp
- // since we are launched with 256 threads, it is easy
- // to clear that memory with one write per thread
- __shared__ unsigned int temp[256];
- temp[threadIdx.x] = 0;
- __syncthreads();
- // calculate the starting index and the offset to the next
- // block that each thread will be processing
- int i = threadIdx.x + blockIdx.x * blockDim.x;
- int stride = blockDim.x * gridDim.x;
- while (i < size) {
- atomicAdd( &temp[buffer[i]], 1 );
- i += stride;
- }
- // sync the data from the above writes to shared memory
- // then add the shared memory values to the values from
- // the other thread blocks using global memory
- // atomic adds
- // same as before, since we have 256 threads, updating the
- // global histogram is just one write per thread!
- __syncthreads();
- atomicAdd( &(histo[threadIdx.x]), temp[threadIdx.x] );
- }
- int main( void ) {
- unsigned char *buffer =
- (unsigned char*)big_random_block( SIZE );
- // capture the start time
- // starting the timer here so that we include the cost of
- // all of the operations on the GPU. if the data were
- // already on the GPU and we just timed the kernel
- // the timing would drop from 74 ms to 15 ms. Very fast.
- cudaEvent_t start, stop;
- HANDLE_ERROR( cudaEventCreate( &start ) );
- HANDLE_ERROR( cudaEventCreate( &stop ) );
- HANDLE_ERROR( cudaEventRecord( start, 0 ) );
- // allocate memory on the GPU for the file's data
- unsigned char *dev_buffer;
- unsigned int *dev_histo;
- HANDLE_ERROR( cudaMalloc( (void**)&dev_buffer, SIZE ) );
- HANDLE_ERROR( cudaMemcpy( dev_buffer, buffer, SIZE,
- cudaMemcpyHostToDevice ) );
- HANDLE_ERROR( cudaMalloc( (void**)&dev_histo,
- 256 * sizeof( int ) ) );
- HANDLE_ERROR( cudaMemset( dev_histo, 0,
- 256 * sizeof( int ) ) );
- // kernel launch - 2x the number of mps gave best timing
- cudaDeviceProp prop;
- HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );
- int blocks = prop.multiProcessorCount;
- histo_kernel<<<blocks*2,256>>>( dev_buffer,
- SIZE, dev_histo );
- unsigned int histo[256];
- HANDLE_ERROR( cudaMemcpy( histo, dev_histo,
- 256 * sizeof( int ),
- cudaMemcpyDeviceToHost ) );
- // get stop time, and display the timing results
- HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
- HANDLE_ERROR( cudaEventSynchronize( stop ) );
- float elapsedTime;
- HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
- start, stop ) );
- printf( "Time to generate: %3.1f ms\n", elapsedTime );
- long histoCount = 0;
- for (int i=0; i<256; i++) {
- histoCount += histo[i];
- }
- printf( "Histogram Sum: %ld\n", histoCount );
- // verify that we have the same counts via CPU
- for (int i=0; i<SIZE; i++)
- histo[buffer[i]]--;
- for (int i=0; i<256; i++) {
- if (histo[i] != 0)
- printf( "Failure at %d!\n", i );
- }
- HANDLE_ERROR( cudaEventDestroy( start ) );
- HANDLE_ERROR( cudaEventDestroy( stop ) );
- cudaFree( dev_histo );
- cudaFree( dev_buffer );
- free( buffer );
- return 0;
- }
注:本文是作者对GPU高性能编程CUDA实战的学习总结。此书的代码可以在下面的链接下载,无需积分哦!
http://download.csdn.net/detail/celerychen2009/6360573
http://blog.csdn.net/celerychen2009/article/details/11318087
- CUDA编程札记
- CUDA编程札记
- CUDA编程札记
- 编程札记
- 编程札记
- CUDA编程
- cuda编程
- CUDA编程
- CUDA编程
- CUDA编程
- cuda 编程
- CUDA编程
- cuda编程
- cuda编程
- 【CUDA编程】 CUDA编程初始
- Symbian 编程札记
- Linux编程札记
- MobileBASIC 编程札记 (一)
- C++调用DLL方法
- C/C++获取当前函数名及所在行行数
- Ext.ux.TabCloseMenu插件的使用(TabPanel右键关闭菜单) 示例
- 判断一个字符串是否为合法ip地址(ipv4)
- 虚拟机下的Linux不能上网
- CUDA编程札记
- python数据类型和3个重要函数
- 补一个Ext3.2综合应用的例子
- MFC程序唯一运行判断(只能运行一个实例)
- 【巧破网络限制 重享网络自由】
- Linux添加虚拟网卡
- PK FK 命令小记
- c# asp.net 修改webconfig文件 配置
- MySQL大战SQLite和PostgreSQL(比拼、比较)