Cuda编程总结--cuda c programming Guide

来源：互联网发布：linux创建新用户和组编辑：程序博客网时间：2024/05/16 11:54

Cuda编程总结2013-10-120:32:46

--------余家奎

参考书籍：NVIDIA CUDA C Programming Guide

OpenGL编程指南

学习cuda例子中的总结

1、__constant__和__device__,__shared__的使用说明...2

2、分配二维数组实现两个二维数组相加...2

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝...5

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法...7

5、不带共享存储器的矩阵的相乘...9

6、带shared memory的矩阵相乘...12

7、页锁定主机存储器Page-locked Host memory.16

8、纹理存储的使用texture memory.18

9、surface Memory的使用方法...20

10、opengl和cuda的交互...22

11、Formatted output---在device端函数中使用printf函数，要求设备的compute capability至少为2.0 27

12、Asserting在设备端的函数中，但是其要求其计算能力至少为2.0.28

13、Per Thread Allocation On heap每个线程在堆上分配...29

14、Per Thread Block Allocation每个线程块在堆上分配空间...29

15、Allocation Persisting Between Kernel Launches在堆上分配...30

如有错误的地方还请指正。。。谢谢

1、constant和device,shared的使用说明

其对应的程序：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

// Keep exactly one of the three declarations below active at a time.
// main() copies a value to and from this symbol with cudaMemcpyToSymbol /
// cudaMemcpyFromSymbol; those calls address symbols in constant or global
// memory, so the __constant__ and __device__ variants work, while a
// __shared__ variable is not addressable from the host and makes both
// copies fail.

// __constant__ int device_global_var=5;

__device__ int device_global_var=5;

// __shared__ int device_global_var ;

// Placeholder kernel: it only declares one block-scoped shared integer to
// show where a __shared__ variable may be declared. Nothing is read or written.
__global__ void kernel()
{
    __shared__ int blockScratch;   // declared for illustration only
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Round-trips one int through the symbol device_global_var:
 * host -> device with cudaMemcpyToSymbol, device -> host with
 * cudaMemcpyFromSymbol. Both calls are checked, so an unsupported symbol
 * kind is reported instead of silently printing the stale host value.
 */
int main()
{
    int host_var = 5;

    CUDA_CHECK(cudaMemcpyToSymbol(device_global_var, &host_var, sizeof(int)));
    printf("value=%d\n", host_var);

    // Clear the host copy so the value printed below can only have come
    // from the device-side symbol.
    host_var = 0;
    CUDA_CHECK(cudaMemcpyFromSymbol(&host_var, device_global_var, sizeof(int)));
    printf("device_value=%d\n", host_var);

    system("pause");
    return 0;
}

2、分配二维数组实现两个二维数组相加

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#define N 16

// Statically allocated N x N int matrices in device global memory
// (two operands and one result).
__device__ int device_a[N][N], device_b[N][N], device_c[N][N];

/*
 * Element-wise sum of two N x N int matrices: c[y][x] = a[y][x] + b[y][x].
 *
 * Each thread handles the element at its global (x, y) index, so the launch
 * must cover at least N x N threads in 2D. Threads whose index falls outside
 * the matrix are masked off by the bounds guard.
 */
__global__ void VecAdd(int a[N][N], int b[N][N], int c[N][N])
{
    int global_threadId_x = blockIdx.x * blockDim.x + threadIdx.x;
    int global_threadId_y = blockIdx.y * blockDim.y + threadIdx.y;

    if (global_threadId_x < N && global_threadId_y < N)
    {
        c[global_threadId_y][global_threadId_x] =
            a[global_threadId_y][global_threadId_x] +
            b[global_threadId_y][global_threadId_x];
    }
}

// Prints an N x N int matrix to stdout, one row per line, values separated
// by single spaces.
void printfArray(int data[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", data[i][j]);
        }
        printf("\n");
    }
}

// CPU reference: element-wise sum of two N x N int matrices, c = a + b.
void host_Add(int a[N][N], int b[N][N], int c[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Fills two N x N host matrices with 0..N*N-1, round-trips `a` through the
 * device symbol device_a and prints it, then adds the two matrices on the
 * GPU with VecAdd and prints the sum.
 *
 * The unused, mistyped and never-freed `int **device_aa` allocation from the
 * earlier version is gone; the statically allocated __device__ arrays are
 * used instead.
 */
int main()
{
    int a[N][N], b[N][N];
    int c[N][N];
    int tempA[N][N];

    int k = 0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i][j] = k;
            b[i][j] = k;
            k++;
        }
    }

    // Host -> device symbols.
    CUDA_CHECK(cudaMemcpyToSymbol(device_a, a, sizeof(int) * N * N));
    CUDA_CHECK(cudaMemcpyToSymbol(device_b, b, sizeof(int) * N * N));

    // Read device_a back to confirm the symbol copy worked.
    CUDA_CHECK(cudaMemcpyFromSymbol(tempA, device_a, sizeof(int) * N * N));
    printf("tempA====\n");
    printfArray(tempA);

    // A __device__ symbol cannot be passed to a kernel by name from host
    // code; fetch its device address first.
    int (*d_a)[N];
    int (*d_b)[N];
    int (*d_c)[N];
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_a, device_a));
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_b, device_b));
    CUDA_CHECK(cudaGetSymbolAddress((void **)&d_c, device_c));

    // Round the grid up so every element is covered; VecAdd guards the tail.
    dim3 dimBlock(8, 8);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x, (N + dimBlock.y - 1) / dimBlock.y);
    VecAdd<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back after the kernel and report any error it raised.
    CUDA_CHECK(cudaMemcpyFromSymbol(c, device_c, sizeof(int) * N * N));
    printf("c====\n");
    printfArray(c);

    system("pause");
    return 0;
}

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#include <iostream>

// kernel which copies data from d_arrayto destinationArray

/*
 * Fills a pitched 2D float array with 123.0 and mirrors every element into a
 * densely packed destination array.
 *
 *   d_array          pitched source allocation; rows are `pitch` bytes apart
 *   destinationArray dense rowCount x columnCount array, row-major
 *   pitch            row stride of d_array in bytes
 *   columnCount      elements per row
 *   rowCount         number of rows
 *
 * The work is split across the launched threads with a strided loop over the
 * flattened element index, so each element is written once, whatever the 1D
 * grid/block size. The earlier version had every thread write the whole array.
 */
__global__ void CopyData(float* d_array,
                         float* destinationArray,
                         size_t pitch,
                         int columnCount,
                         int rowCount)
{
    const int total  = rowCount * columnCount;
    const int stride = gridDim.x * blockDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += stride)
    {
        const int row    = idx / columnCount;
        const int column = idx % columnCount;

        // Rows of a pitched allocation start `pitch` bytes apart, so step in
        // bytes before reinterpreting the address as float*.
        float* rowData = (float*)(((char*)d_array) + (row * pitch));

        rowData[column] = 123.0f; // make every value in the array 123.0
        destinationArray[idx] = rowData[column];
    }
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Allocates a pitched 15 x 10 float array with cudaMallocPitch, lets CopyData
 * fill it and mirror it into a dense device array, then reads both back:
 *   - h_result via cudaMemcpy2D from the pitched array
 *   - h_array  via cudaMemcpy   from the dense array
 * Every value printed should be 123.
 */
int main(int argc, char** argv)
{
    const int columnCount = 15;
    const int rowCount = 10;
    const size_t rowBytes = columnCount * sizeof(float);
    const size_t count = (size_t)columnCount * rowCount;

    float* d_array = NULL;            // pitched device array
    float* d_destinationArray = NULL; // dense device array

    // Host buffers. calloc gives the zero fill that the earlier
    // malloc + memset pair provided, without needing <string.h>.
    float* h_array = new float[count];
    float* h_result = (float*)calloc(count, sizeof(float));
    if (h_result == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        delete[] h_array;
        return 1;
    }

    // Row stride in bytes, chosen by cudaMallocPitch.
    size_t pitch = 0;
    CUDA_CHECK(cudaMallocPitch(&d_array, &pitch, rowBytes, rowCount));
    CUDA_CHECK(cudaMalloc(&d_destinationArray, count * sizeof(float)));

    // One thread per element, rounded up to whole blocks.
    const int threads = 256;
    const int blocks = (int)((count + threads - 1) / threads);
    CopyData<<<blocks, threads>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);
    CUDA_CHECK(cudaGetLastError());

    // Pitched device rows -> dense host rows.
    CUDA_CHECK(cudaMemcpy2D(h_result, rowBytes, d_array, pitch, rowBytes, rowCount,
                            cudaMemcpyDeviceToHost));
    // Dense device array -> dense host array.
    CUDA_CHECK(cudaMemcpy(h_array, d_destinationArray, count * sizeof(float),
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_result[" << (i * columnCount) + j << "]="
                      << h_result[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    printf("h_array==\n");
    // print out the values (all the values are 123.0)
    for (int i = 0; i < rowCount; i++)
    {
        for (int j = 0; j < columnCount; j++)
        {
            std::cout << "h_array[" << (i * columnCount) + j << "]="
                      << h_array[(i * columnCount) + j] << std::endl;
        }
    }
    system("pause");

    CUDA_CHECK(cudaFree(d_array));
    CUDA_CHECK(cudaFree(d_destinationArray));
    free(h_result);
    delete[] h_array;
    return 0;
}

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <stdio.h>

// Device code

/*
 * Writes (2, 3, 4) into every float3 of a pitched 3D allocation.
 *
 *   devPitchedPtr  base pointer and row pitch (bytes) of the allocation
 *   extent         width in BYTES, height in rows, depth in slices
 *
 * Bug fix: the host sets extent.width to x * sizeof(float3), i.e. a byte
 * count. The earlier loop used it directly as an element count and wrote
 * sizeof(float3) times too many float3 values per row, running past the row
 * and past the allocation.
 *
 * The loops are serial inside one thread; the sample launches this <<<1,1>>>.
 */
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * extent.height;
    size_t elemsPerRow = extent.width / sizeof(float3);   // bytes -> elements

    for (size_t k = 0; k < extent.depth; k++)
    {
        char* slice = devPtr + k * slicePitch;
        for (size_t j = 0; j < extent.height; j++)
        {
            // Rows are `pitch` bytes apart, so step in bytes.
            float3* row = (float3*)(slice + j * pitch);
            for (size_t i = 0; i < elemsPerRow; i++)
            {
                row[i].x = 2;
                row[i].y = 3;
                row[i].z = 4;
            }
        }
    }
}

// Volume dimensions in float3 elements: x per row, y rows per slice, z slices.
const int x = 6, y = 60, z = 66;

/*
 * cudaMalloc3D / cudaMemcpy3D walkthrough on an x * y * z volume of float3:
 *   1. fill a host volume and upload it with cudaMemcpy3D (host -> device)
 *   2. overwrite it on the device with MyKernel
 *   3. download it with cudaMemcpy3D (device -> host) and print it
 *
 * Every CUDA call is checked. The result is printed only when the download
 * succeeded, and all host and device buffers are released on every path.
 */
int main()
{
    cudaError_t status = cudaSuccess;
    const size_t count = (size_t)x * y * z;
    int exitCode = 0;

    //======== Mem Host
    float3 *mem_host  = (float3*)malloc(sizeof(float3) * count);
    float3 *mem_host2 = (float3*)malloc(sizeof(float3) * count);
    if (mem_host == NULL || mem_host2 == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(mem_host);
        free(mem_host2);
        return 1;
    }
    for (size_t i = 0; i < count; i++)
    {
        mem_host[i].x = 10;
        mem_host[i].y = 100;
        mem_host[i].z = 1000;
    }

    //======== Mem Device
    // For linear (pitched) memory the extent width is given in bytes.
    cudaExtent extent;
    extent.width  = x * sizeof(float3);
    extent.height = y;
    extent.depth  = z;

    cudaPitchedPtr mem_device;
    status = cudaMalloc3D(&mem_device, extent);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));
        free(mem_host);
        free(mem_host2);
        return 1;
    }

    //======== Cpy HostToDevice
    cudaMemcpy3DParms p = { 0 };
    p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x * sizeof(float3), x, y);
    p.dstPtr = mem_device;
    p.extent = extent;
    p.kind   = cudaMemcpyHostToDevice;
    status = cudaMemcpy3D(&p);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "MemcpyHtD: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }

    MyKernel<<<1,1>>>(mem_device, extent);
    status = cudaGetLastError();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Kernel launch: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }

    //======== Cpy DeviceToHost
    cudaMemcpy3DParms q = { 0 };
    q.srcPtr = mem_device;
    q.dstPtr = make_cudaPitchedPtr((void*)mem_host2, x * sizeof(float3), x, y);
    q.extent = extent;
    q.kind   = cudaMemcpyDeviceToHost;
    status = cudaMemcpy3D(&q);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));
        exitCode = 1;
    }
    else
    {
        // mem_host2 only holds valid data when the download succeeded.
        for (size_t i = 0; i < count; i++)
            printf("%f %f %f\n", mem_host2[i].x, mem_host2[i].y, mem_host2[i].z);
    }

    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);

    system("pause");
    return exitCode;
}

5、不带共享存储器的矩阵的相乘

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

// Dense float matrix descriptor. Elements are stored row-major:
// element (row, col) lives at element[row * width + col].
typedef struct

{

int width;   // number of columns

int height;  // number of rows

float *element;  // width * height floats (host or device memory, per owner)

}Matrix;

#define BLOCK_SIZE 16

__global__ void MatMulKernel(const Matrix,const Matrix,Matrix);

// Prints the matrix to stdout, one row per line, each value as "%f ".
void printMatrix(const Matrix &A)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < A.width; j++)
        {
            printf("%f ", A.element[i * A.width + j]);
        }
        printf("\n");
    }
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Host wrapper for C = A * B on the GPU (global memory only).
 *
 * Prints A and B, uploads them, launches MatMulKernel with
 * BLOCK_SIZE x BLOCK_SIZE thread blocks, downloads C into C.element and
 * prints it.
 *
 * Preconditions, now checked instead of silently producing partial results:
 *   - A.width == B.height, C.height == A.height, C.width == B.width
 *   - B.width and A.height are multiples of BLOCK_SIZE, because the grid is
 *     computed with integer division.
 */
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width)
    {
        fprintf(stderr, "MatMul: incompatible matrix shapes\n");
        return;
    }
    if (B.width % BLOCK_SIZE != 0 || A.height % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: dimensions must be multiples of %d\n", BLOCK_SIZE);
        return;
    }

    printf("matrix A");
    printMatrix(A);
    printf("matrix B");
    printMatrix(B);
    system("pause");

    // Upload A.
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A.element, size));
    CUDA_CHECK(cudaMemcpy(d_A.element, A.element, size, cudaMemcpyHostToDevice));

    // Upload B.
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_B.element, size));
    CUDA_CHECK(cudaMemcpy(d_B.element, B.element, size, cudaMemcpyHostToDevice));

    // Allocate C on the device.
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_C.element, size));

    // One thread per element of C.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back after the kernel and report any error it raised.
    CUDA_CHECK(cudaMemcpy(C.element, d_C.element, size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A.element));
    CUDA_CHECK(cudaFree(d_B.element));
    CUDA_CHECK(cudaFree(d_C.element));

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }
    system("pause");
}

// Sets every element of A to `value` (converted to float on assignment).
void SetMatrixValue(Matrix &A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.element[i * A.width + j] = value;
        }
    }
}

/*
 * Builds two 128 x 128 matrices filled with 2, multiplies them on the GPU
 * through MatMul and prints the product.
 *
 * main now returns int as standard C++ requires, host allocations are
 * checked, and all three buffers are freed before exit.
 */
int main()
{
    Matrix A, B, C;

    A.width = 128;
    A.height = 128;
    A.element = (float*)malloc(A.width * A.height * sizeof(float));

    B.width = 128;
    B.height = 128;
    B.element = (float*)malloc(B.width * B.height * sizeof(float));

    C.width = 128;
    C.height = 128;
    C.element = (float*)malloc(C.width * C.height * sizeof(float));

    if (A.element == NULL || B.element == NULL || C.element == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(A.element);
        free(B.element);
        free(C.element);
        return 1;
    }

    SetMatrixValue(A, 2);
    SetMatrixValue(B, 2);

    MatMul(A, B, C);

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }

    free(A.element);
    free(B.element);
    free(C.element);

    system("pause");
    return 0;
}

/*
 * C = A * B, one thread per element of C, reading operands straight from
 * global memory.
 *
 * Expects a 2D launch: x indexes columns of C, y indexes rows. Threads that
 * fall outside C return immediately, so a grid rounded up past the matrix
 * edge does not write out of bounds.
 */
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= C.height || col >= C.width)
    {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    float CValue = 0;
    for (int e = 0; e < A.width; ++e)
    {
        CValue += A.element[row * A.width + e] * B.element[e * B.width + col];
    }
    C.element[row * C.width + col] = CValue;
}

6、带shared memory的矩阵相乘

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#define BLOCK_SIZE 16

// Float matrix view with an explicit row stride: element (row, col) lives at
// elements[row * stride + col].
typedef struct

{

int width;   // number of columns in this view

int height;  // number of rows in this view

int stride;  // distance, in elements, between the starts of consecutive rows

float *elements;  // first element of the view

}Matrix;

// Returns element (row, col) of A, honouring A's row stride.
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Stores `value` at element (row, col) of A, honouring A's row stride.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    float *slot = A.elements + (row * A.stride + col);
    *slot = value;
}

// Returns a BLOCK_SIZE x BLOCK_SIZE view into A whose top-left corner is
// `row` tiles down and `col` tiles right of A's origin. The view aliases A's
// storage and keeps A's stride.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    const int firstElement = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;

    Matrix tile;
    tile.elements = A.elements + firstElement;
    tile.stride = A.stride;
    tile.height = BLOCK_SIZE;
    tile.width = BLOCK_SIZE;
    return tile;
}

__global__ void MatMulKernel(const Matrix,const Matrix,Matrix);

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Host wrapper for the tiled (shared-memory) product C = A * B.
 *
 * Bug fix: the grid's y dimension is the number of tile rows of C, i.e.
 * A.height / BLOCK_SIZE. The earlier code used B.height, which launches the
 * wrong number of block rows whenever A.height != B.height.
 *
 * Preconditions, now checked: shapes are compatible, and A.width, A.height
 * and B.width are multiples of BLOCK_SIZE, because the kernel works on whole
 * BLOCK_SIZE x BLOCK_SIZE tiles.
 */
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width)
    {
        fprintf(stderr, "MatMul: incompatible matrix shapes\n");
        return;
    }
    if (A.width % BLOCK_SIZE != 0 || A.height % BLOCK_SIZE != 0 || B.width % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: dimensions must be multiples of %d\n", BLOCK_SIZE);
        return;
    }

    // Upload A (dense, so stride == width).
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.stride = A.width;
    size_t size = d_A.width * d_A.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_A.elements, size));
    CUDA_CHECK(cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice));

    // Upload B.
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.stride = B.width;
    size = B.width * B.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_B.elements, size));
    CUDA_CHECK(cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice));

    // Allocate C on the device.
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.stride = C.width;
    size = C.width * C.height * sizeof(float);
    CUDA_CHECK(cudaMalloc(&d_C.elements, size));

    // One block per BLOCK_SIZE x BLOCK_SIZE tile of C.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / BLOCK_SIZE, A.height / BLOCK_SIZE);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());

    // Copy the result back after the kernel and report any error it raised.
    CUDA_CHECK(cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(d_A.elements));
    CUDA_CHECK(cudaFree(d_B.elements));
    CUDA_CHECK(cudaFree(d_C.elements));
}

/*
 * Tiled matrix product: each thread block computes one
 * BLOCK_SIZE x BLOCK_SIZE tile of C, and each thread one element of it.
 *
 * Launch layout: blockDim must be (BLOCK_SIZE, BLOCK_SIZE); blockIdx.x picks
 * the tile column and blockIdx.y the tile row of C. Uses
 * 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory.
 * All dimensions must be multiples of BLOCK_SIZE: there is no tail handling.
 */
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging tiles for the current sub-matrices of A and B.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    int blockRow = blockIdx.y;
    int blockCol = blockIdx.x;

    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    float Cvalue = 0;

    int row = threadIdx.y;
    int col = threadIdx.x;

    // Walk the tiles of A (left to right) and of B (top to bottom).
    for (int m = 0; m < A.width / BLOCK_SIZE; ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        // Each thread loads one element of each tile.
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Both tiles must be fully loaded before any thread reads them.
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }

        // Nobody may overwrite the tiles until every thread is done reading.
        __syncthreads();
    }

    SetElement(Csub, row, col, Cvalue);
}

// Sets every element of A to `value`. A is passed by value, but the copy
// shares the elements pointer, so the caller's storage is what gets written.
void SetMatValue(Matrix A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.elements[i * A.width + j] = value;
        }
    }
}

// Prints the matrix to stdout, one row per line, each value as "%f ".
// Indexes with A.width, so it expects a densely stored matrix.
void PrintMat(const Matrix A)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            printf("%f ", A.elements[i * A.width + j]);
        }
        printf("\n");
    }
}

/*
 * Builds two 128 x 128 matrices filled with 2, multiplies them with the
 * shared-memory MatMul and prints the product.
 *
 * main now returns int as standard C++ requires, host allocations are
 * checked, and all three buffers are freed before exit.
 */
int main()
{
    Matrix A, B, C;

    A.width = 128;
    A.height = 128;
    A.elements = (float*)malloc(A.width * A.height * sizeof(float));

    B.width = 128;
    B.height = 128;
    B.elements = (float*)malloc(B.width * B.height * sizeof(float));

    C.width = 128;
    C.height = 128;
    C.elements = (float*)malloc(C.width * C.height * sizeof(float));

    if (A.elements == NULL || B.elements == NULL || C.elements == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return 1;
    }

    SetMatValue(A, 2);
    SetMatValue(B, 2);

    MatMul(A, B, C);
    PrintMat(C);

    free(A.elements);
    free(B.elements);
    free(C.elements);

    system("pause");
    return 0;
}

7、页锁定主机存储器Page-locked Host memory

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#define N 256

// Writes `value` into the first `size` elements of pData.
// A size of zero or less leaves the array untouched.
void SetArrayValue(float *pData, int size, float value)
{
    for (int i = 0; i < size; ++i)
    {
        pData[i] = value;
    }
}

// Element-wise C = A + B. Each thread handles the element at its threadIdx.x,
// so only the threads of a single block index distinct elements.
__global__ void VecAdd(float *A, float *B, float *C)
{
    const int lane = threadIdx.x;
    const float sum = A[lane] + B[lane];
    C[lane] = sum;
}

// Prints `size` floats as "%f ". A newline is emitted before every element
// whose 1-based position is a multiple of 10.
void PrintArray(float *data, int size)
{
    for (int i = 0; i < size; ++i)
    {
        if ((i + 1) % 10 == 0)
        {
            printf("\n");
        }
        printf("%f ", data[i]);
    }
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Mapped (zero-copy) pinned-memory demo: three page-locked host buffers are
 * allocated with cudaHostAllocMapped, their device aliases are obtained with
 * cudaHostGetDevicePointer, and VecAdd reads and writes host memory directly
 * through those aliases. No cudaMemcpy is involved.
 *
 * main now returns int, all CUDA calls are checked, and the pinned buffers
 * are released with cudaFreeHost.
 */
int main()
{
    //cudaSetDeviceFlags(cudaDeviceMapHost); --- optional here

    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, 0));

    // NOTE(review): the sample deliberately stops on integrated GPUs; kept
    // as-is — confirm whether that restriction is really wanted.
    if (deviceProp.integrated)
    {
        printf("GPU is integrated\n");
        return 0;
    }
    if (!deviceProp.canMapHostMemory)
    {
        printf("can't map host memory\n");
        return 1;
    }

    float *hostPtrA;
    CUDA_CHECK(cudaHostAlloc(&hostPtrA, sizeof(float) * N, cudaHostAllocDefault | cudaHostAllocMapped));
    SetArrayValue(hostPtrA, N, 29);
    PrintArray(hostPtrA, N);
    system("pause");

    float *hostPtrB;
    CUDA_CHECK(cudaHostAlloc(&hostPtrB, sizeof(float) * N, cudaHostAllocDefault | cudaHostAllocMapped));
    SetArrayValue(hostPtrB, N, 31);

    float *hostPtrC;
    CUDA_CHECK(cudaHostAlloc(&hostPtrC, sizeof(float) * N, cudaHostAllocDefault | cudaHostAllocMapped));

    // Device-side aliases of the three pinned host buffers.
    float *devPtrA, *devPtrB, *devPtrC;
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrA, hostPtrA, 0));
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrB, hostPtrB, 0));
    CUDA_CHECK(cudaHostGetDevicePointer(&devPtrC, hostPtrC, 0));

    VecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
    CUDA_CHECK(cudaGetLastError());
    // The kernel writes straight into hostPtrC; wait for it before reading.
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < N; i++)
    {
        if ((i + 1) % 10 == 0)
        {
            printf("\n");
        }
        printf("%f ", hostPtrC[i]);
    }

    CUDA_CHECK(cudaFreeHost(hostPtrA));
    CUDA_CHECK(cudaFreeHost(hostPtrB));
    CUDA_CHECK(cudaFreeHost(hostPtrC));

    system("pause");
    return 0;
}

8、纹理存储的使用texture memory

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#define size 256

texture<float,cudaTextureType2D,cudaReadModeElementType>texRef;

/*
 * Samples the 2D texture texRef at coordinates rotated by `theta` (radians)
 * around the texture centre and writes the result to output[y * width + x].
 *
 * Expects a 2D launch covering at least width x height threads.
 *
 * Bug fix: the host rounds the grid up to whole blocks, so threads with
 * x >= width or y >= height exist. Without the guard they wrote to wrong or
 * out-of-bounds elements of `output`.
 */
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= (unsigned int)width || y >= (unsigned int)height)
    {
        return;
    }

    // Normalised coordinates, shifted so the rotation centre is the origin.
    float u = x / (float)width;
    float v = y / (float)height;
    u -= 0.5f;
    v -= 0.5f;

    // Rotate, then shift back. sinf/cosf keep the math in single precision.
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    output[y * width + x] = tex2D(texRef, tu, tv);
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Texture-reference demo: uploads a 25 x 25 float ramp into a CUDA array,
 * binds it to texRef with wrap addressing, linear filtering and normalised
 * coordinates, runs transformKernel and prints the sampled image.
 *
 * main now returns int, CUDA calls are checked, the texture is unbound and
 * the host input buffer is freed.
 */
int main()
{
    const int width = 25, height = 25;
    const size_t bytes = width * height * sizeof(float);

    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    CUDA_CHECK(cudaMallocArray(&cuArray, &channelDesc, width, height));

    float *h_data = (float*)malloc(bytes);
    float *hostPtr = (float*)malloc(bytes);
    if (h_data == NULL || hostPtr == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_data);
        free(hostPtr);
        return 1;
    }
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }
    CUDA_CHECK(cudaMemcpyToArray(cuArray, 0, 0, h_data, bytes, cudaMemcpyHostToDevice));

    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;
    CUDA_CHECK(cudaBindTextureToArray(texRef, cuArray, channelDesc));

    float *output;
    CUDA_CHECK(cudaMalloc(&output, bytes));

    // Grid rounded up to whole blocks; the kernel must guard the tail.
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);

    // NOTE(review): the kernel feeds this straight to sinf/cosf, i.e. it is
    // taken as radians. If 30 degrees was meant, convert first — confirm.
    float angle = 30;
    transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(hostPtr, output, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            printf("%f ", hostPtr[i * width + j]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaUnbindTexture(texRef));
    free(hostPtr);
    free(h_data);
    CUDA_CHECK(cudaFreeArray(cuArray));
    CUDA_CHECK(cudaFree(output));

    system("pause");
    return 0;
}

9、surface Memory的使用方法

代码：

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

surface<void,2> inputSurfRef;

surface<void,2>outputSurfRef;

/*
 * Copies a width x height image of uchar4 texels from inputSurfRef to
 * outputSurfRef, one texel per thread.
 *
 * Expects a 2D launch covering at least width x height threads; extra
 * threads are masked by the bounds guard. Surface x coordinates are byte
 * offsets, hence x * 4 for 4-byte texels.
 */
__global__ void copyKernel(int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < (unsigned int)width && y < (unsigned int)height)
    {
        uchar4 data;
        surf2Dread(&data, inputSurfRef, x * 4, y);
        surf2Dwrite(data, outputSurfRef, x * 4, y);
    }
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Surface-reference demo: fills a 256 x 256 image of 4-byte texels with the
 * value 3, copies it from one CUDA array to another through two surfaces with
 * copyKernel, and prints the copy.
 *
 * main now returns int and all CUDA calls are checked.
 */
int main()
{
    int width = 256, height = 256;
    int size = width * height * sizeof(unsigned int);

    unsigned int *h_data = (unsigned int*)malloc(size);
    unsigned int *host_output = (unsigned int*)malloc(size);
    if (h_data == NULL || host_output == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        free(h_data);
        free(host_output);
        return 1;
    }
    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = 3;
        }
    }

    // Four 8-bit unsigned channels per texel (matches uchar4 in the kernel).
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);

    // Arrays bound to surfaces must be created with cudaArraySurfaceLoadStore.
    cudaArray *cuInputArray;
    CUDA_CHECK(cudaMallocArray(&cuInputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));
    cudaArray *cuOutputArray;
    CUDA_CHECK(cudaMallocArray(&cuOutputArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

    CUDA_CHECK(cudaMemcpyToArray(cuInputArray, 0, 0, h_data, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaBindSurfaceToArray(inputSurfRef, cuInputArray));
    CUDA_CHECK(cudaBindSurfaceToArray(outputSurfRef, cuOutputArray));

    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    copyKernel<<<dimGrid, dimBlock>>>(width, height);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpyFromArray(host_output, cuOutputArray, 0, 0, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            printf("%u ", host_output[i * width + j]);
        }
        printf("\n");
    }
    system("pause");

    free(host_output);
    free(h_data);
    CUDA_CHECK(cudaFreeArray(cuInputArray));
    CUDA_CHECK(cudaFreeArray(cuOutputArray));
    return 0;
}

10、opengl和cuda的交互

代码：https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/

http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7

#include <cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <gl/glew.h>//要放在下面这一句的前面

#include "cuda_gl_interop.h"

#include <stdio.h>

#include <stdlib.h>

//#include <gl/GL.h>

#include <gl/glut.h>

// OpenGL vertex buffer holding one float4 position per grid point.
GLuint postionsVBO=1;

// CUDA handle for the registered vertex buffer.
struct cudaGraphicsResource * postionsVBO_CUDA;

// Size of the vertex grid (points per row / number of rows).
int width=256;

int height=256;

// Animation time. dev_time lives in device memory and is not directly
// readable or writable from host code; host_time is the host-side counter.
__device__ float dev_time=1;

float host_time=1;

// Fills `positions` with an animated sine/cosine height field.
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height);

// One-time GL state setup: opaque black clear colour, then clear the colour
// and depth buffers.
void init()
{
    const GLbitfield buffersToClear = GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT;

    glClearColor(0.f, 0.f, 0.f, 1.f);
    glClear(buffersToClear);
}

/*
 * GLUT reshape callback: matches the viewport to the new window size and
 * rebuilds the perspective projection.
 *
 * Bug fix: glMatrixMode was called with GL_MATRIX_MODE, which is a glGet
 * query token and not a matrix mode. The call failed with GL_INVALID_ENUM,
 * the projection stack stayed current, and the following glLoadIdentity wiped
 * the projection that had just been set. The model-view stack is selected
 * with GL_MODELVIEW.
 *
 * Also sets the viewport, which a custom reshape callback must do itself,
 * and ignores degenerate sizes to avoid dividing by zero.
 */
void reshape(int width, int height)
{
    if (width <= 0 || height <= 0)
    {
        return;
    }

    glViewport(0, 0, width, height);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();

    // NOTE(review): for tall windows the aspect passed is height/width, as
    // in the earlier code; gluPerspective normally takes width/height —
    // confirm this is intended.
    if (width > height)
    {
        gluPerspective(45, (GLfloat)width / height, 0.001, 1000);
    }
    else
    {
        gluPerspective(45, (GLfloat)height / width, 0.001, 1000);
    }

    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * GLUT display callback: maps the vertex buffer into CUDA, lets
 * createVertices regenerate the positions for the current time, unmaps it,
 * then draws the buffer as points.
 *
 * Bug fixes:
 *   - The time argument and its increment used dev_time, a __device__
 *     variable that host code cannot read or write directly. The host-side
 *     counter host_time is used instead and passed to the kernel by value.
 *   - The cudaMemcpy into `&time` referred to a variable that is not declared
 *     in this scope. It is removed; passing the time by value makes a
 *     device-side copy unnecessary.
 */
void display()
{
    float4 *positions = NULL;
    size_t numb_bytes = 0;

    // While mapped, the buffer belongs to CUDA and GL must not touch it.
    CUDA_CHECK(cudaGraphicsMapResources(1, &postionsVBO_CUDA, 0));
    CUDA_CHECK(cudaGraphicsResourceGetMappedPointer((void**)&positions, &numb_bytes, postionsVBO_CUDA));

    // Integer division: width and height are expected to be multiples of 16.
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
    createVertices<<<dimGrid, dimBlock>>>(positions, host_time, width, height);
    CUDA_CHECK(cudaGetLastError());
    host_time++;

    CUDA_CHECK(cudaGraphicsUnmapResources(1, &postionsVBO_CUDA, 0));

    // Draw the freshly written positions as a point cloud.
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, width * height);
    glDisableClientState(GL_VERTEX_ARRAY);

    glutSwapBuffers();
    glutPostRedisplay();   // request the next frame
}

/*
 * Writes one float4 vertex per grid point: x and z span [-1, 1) and the
 * height y is an animated sine/cosine pattern.
 *
 * Expects a 2D launch covering at least width x height threads; threads
 * outside the grid return without writing.
 */
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
    {
        return;
    }

    // Map the integer grid position to [-1, 1).
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.f - 1.f;
    v = v * 2.f - 1.f;

    float freq = 4.f;
    float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;

    positions[y * width + x] = make_float4(u, w, v, 1.f);
}

#ifndef CUDA_CHECK
// Print the failing call's location and CUDA error string, then exit.
#define CUDA_CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
    fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err_)); \
    exit(EXIT_FAILURE); } } while (0)
#endif

/*
 * Creates the GLUT window, allocates a vertex buffer of width * height
 * float4 positions, registers it with CUDA for write-only access and enters
 * the GLUT main loop.
 */
int main(int argc, char *argv[])
{
    // Selecting the interop device is best-effort: report a failure but keep
    // going with the current device.
    cudaError_t devStatus = cudaGLSetGLDevice(0);
    if (devStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGLSetGLDevice: %s\n", cudaGetErrorString(devStatus));
    }

    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowPosition(0, 0);
    glutInitWindowSize(100, 100);
    glutCreateWindow("opengl-cuda");

    init();
    glutDisplayFunc(display);
    glutReshapeFunc(reshape);

    // GLEW must be initialised before the buffer-object entry points are
    // used: http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
    GLenum glewStatus = glewInit();
    if (glewStatus != GLEW_OK)
    {
        fprintf(stderr, "glewInit failed: %s\n", glewGetErrorString(glewStatus));
        return 1;
    }

    // Allocate, without initialising, one float4 per grid point.
    glGenBuffers(1, &postionsVBO);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    unsigned int size = width * height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    // CUDA only ever overwrites the buffer, hence WriteDiscard.
    CUDA_CHECK(cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA, postionsVBO,
                                            cudaGraphicsMapFlagsWriteDiscard));

    glutMainLoop();
    return 0;
}

11、Formatted output---在device端函数中使用printf函数，要求设备的compute capability至少为2.0

代码：

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Every thread prints its own threadIdx.x on a separate line using
// device-side printf.
__global__ void bcast()
{
    const unsigned int lane = threadIdx.x;
    printf("%d\n", lane);
}

/*
 * Launches bcast with one block of 32 threads and waits for it, so the
 * device-side printf output is flushed to the host.
 *
 * main now returns int, and both the launch and the synchronisation are
 * checked with checkCudaErrors from helper_cuda.h.
 */
int main()
{
    bcast<<<1,32>>>();
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());

    system("pause");
    return 0;
}

12、Asserting在设备端的函数中，但是其要求其计算能力至少为2.0

代码：

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <assert.h>

#include <stdlib.h>

// Device-side assert demo: the first assertion holds, the second one fails
// on purpose.
__global__ void testAssert(void)
{
    const int should_be_one = 0;   // deliberately wrong
    const int is_one = 1;

    assert(is_one);          // passes
    assert(should_be_one);   // fails
}

/*
 * Runs testAssert, whose second assertion fails on purpose, and reports the
 * error that the failed assertion raises at the next synchronisation point.
 *
 * main now returns int, and the status of cudaDeviceSynchronize is printed
 * instead of being discarded.
 */
int main()
{
    testAssert<<<1,1>>>();

    // A failed device assert surfaces here as a CUDA error.
    cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(status));
    }

    // Reset the device before the process exits.
    cudaDeviceReset();

    system("pause");
    return 0;
}

13、Per Thread Allocation On heap每个线程在堆上分配

代码：

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

// Each thread allocates 100 bytes with device-side malloc, prints the
// pointer it received and frees it again.
__global__ void mallocTestPerThread()
{
    const size_t bytes = 100;
    char *buffer = (char *)malloc(bytes);

    printf("Thread %d got pointer:%p\n", threadIdx.x, buffer);

    free(buffer);
}

// Sets the device malloc heap limit to 128 MB, launches the per-thread
// allocation test with one block of 5 threads and waits for it to finish.
int main()
{
    const size_t heapBytes = 128 * 1024 * 1024;

    // Set before the first kernel that uses device-side malloc is launched.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    mallocTestPerThread<<<1, 5>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

14、Per Thread Block Allocation每个线程块在堆上分配空间

代码：

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

/*
 * One heap allocation per thread block: thread 0 allocates a buffer of
 * 64 ints per thread, every thread fills its own strided slots, and thread 0
 * frees the buffer once all threads are done.
 *
 * Bug fix: the buffer was sized blockDim.x * 64 BYTES while the loop writes
 * blockDim.x * 64 INTS, overflowing the allocation by a factor of
 * sizeof(int). The size is now multiplied by sizeof(int).
 */
__global__ void mallocTestPerThreadBlock()
{
    // Shared so that every thread of the block sees thread 0's pointer.
    __shared__ int *data;

    if (threadIdx.x == 0)
    {
        data = (int*)malloc(blockDim.x * 64 * sizeof(int));
    }
    // Publish the pointer to the whole block.
    __syncthreads();

    // `data` is shared, so this test gives the same answer for every thread
    // of the block: either all threads return here or none does.
    if (data == NULL)
    {
        return;
    }

    // Consecutive threads write consecutive ints.
    int *ptr = data;
    for (int i = 0; i < 64; ++i)
    {
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    }

    // All writes must finish before the buffer is released.
    __syncthreads();

    if (threadIdx.x == 0)
    {
        free(data);
    }
}

// Sets the device malloc heap limit to 128 MB, launches the per-block
// allocation test with 10 blocks of 128 threads and waits for it to finish.
int main()
{
    const size_t heapBytes = 128 * 1024 * 1024;

    // Set before the first kernel that uses device-side malloc is launched.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    mallocTestPerThreadBlock<<<10, 128>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

15、Allocation Persisting Between Kernel Launches在堆上分配

代码：

#include <cuda.h>

#include <helper_cuda.h>

#include <stdio.h>

#include <stdlib.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#define NUM_BLOCKS 20

__device__ int *dataptr[NUM_BLOCKS];

// Thread 0 of each block allocates blockDim.x * 4 bytes on the device heap
// and stores the pointer in dataptr[blockIdx.x]; afterwards every thread
// zeroes its own int slot, unless the allocation failed.
__global__ void allocmem()
{
    const unsigned int block = blockIdx.x;

    if (threadIdx.x == 0)
    {
        dataptr[block] = (int *)malloc(blockDim.x * 4);
    }
    // Wait until thread 0 has stored the pointer.
    __syncthreads();

    int *slice = dataptr[block];
    if (slice != NULL)
    {
        slice[threadIdx.x] = 0;
    }
}

// Each thread adds its threadIdx.x to its own slot of the block's buffer
// stored in dataptr[blockIdx.x]. Blocks without a buffer do nothing.
__global__ void usemem()
{
    int *ptr = dataptr[blockIdx.x];

    if (ptr != NULL)
    {
        ptr[threadIdx.x] += threadIdx.x;
    }
}

/*
 * Each thread prints the final value of its slot, then thread 0 frees the
 * block's buffer.
 *
 * A block-wide barrier now separates the reads from the free, so thread 0
 * cannot release the buffer while other threads are still reading it. The
 * stored pointer is cleared after the free so it does not dangle.
 */
__global__ void freemem()
{
    int *ptr = dataptr[blockIdx.x];

    if (ptr != NULL)
        printf("Block %d,Thread=%d:final value=%d\n", blockIdx.x, threadIdx.x, ptr[threadIdx.x]);

    // Reached by every thread of the block: it sits outside the branch above.
    __syncthreads();

    // Only one thread per block may free the buffer.
    if (threadIdx.x == 0)
    {
        free(ptr);
        dataptr[blockIdx.x] = NULL;
    }
}

/*
 * Shows that device-heap allocations outlive a kernel: allocmem allocates one
 * buffer per block, usemem updates it, and freemem prints and releases it.
 * All three run with NUM_BLOCKS blocks of 10 threads.
 *
 * Each launch and the final synchronisation are checked with
 * checkCudaErrors from helper_cuda.h.
 */
int main()
{
    // Set before the first kernel that uses device-side malloc is launched.
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitMallocHeapSize, 128*1024*1024));

    // All three launches go to the same stream, so they run in this order.
    allocmem<<<NUM_BLOCKS,10>>>();
    checkCudaErrors(cudaGetLastError());

    usemem<<<NUM_BLOCKS,10>>>();
    checkCudaErrors(cudaGetLastError());

    freemem<<<NUM_BLOCKS,10>>>();
    checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaDeviceSynchronize());

    system("pause");
    return 0;
}

0 0