Cuda编程总结--cuda c programming Guide

来源:互联网 发布:linux创建新用户和组 编辑:程序博客网 时间:2024/05/16 11:54

Cuda编程总结2013-10-120:32:46

--------余家奎

 

参考书籍:NVIDIA CUDA C Programming Guide

OpenGL编程指南

 

学习cuda例子中的总结

1、__constant__和__device__,__shared__的使用说明...2

2、分配二维数组实现两个二维数组相加...2

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝...5

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法...7

5、不带共享存储器的矩阵的相乘...9

6、带shared memory的矩阵相乘...12

7、页锁定主机存储器Page-locked Host memory.16

8、纹理存储的使用texture memory.18

9、surface Memory的使用方法...20

10、opengl和cuda的交互...22

11、Formatted output---在device端函数中使用printf,要求compute capability至少为2.0...27

12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0.28

13、Per Thread Allocation On heap每个线程在堆上分配...29

14、Per Thread Block Allocation每个线程块在堆上分配空间...29

15、Allocation Persisting Between Kernel Launches在堆上分配...30 

如有错误的地方还请指正。。。谢谢

 

1、__constant__和__device__,__shared__的使用说明

其对应的程序:

//

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h> 

// __constant__ int device_global_var=5;

// __device__ int device_global_var=5;

// NOTE(review): with this __shared__ declaration active, the
// cudaMemcpyToSymbol / cudaMemcpyFromSymbol calls in main() are expected to
// fail: per the CUDA Runtime API those calls accept only symbols that live in
// global (__device__) or constant (__constant__) memory — confirm against the
// docs. Swap in one of the two commented-out declarations above to get a
// working host <-> device round trip.
__shared__ int device_global_var ;

// Placeholder kernel for example 1. Its only content is the declaration of a
// block-scoped __shared__ integer, showing where such a declaration may
// appear; the variable is never read or written.
__global__ void kernel() { __shared__ int xx; }

// Host entry point for example 1.
// Copies host_var into the device symbol `device_global_var`, reads the symbol
// back into host_var and prints the value before and after the round trip.
// Both runtime calls are checked so that an unsupported symbol kind is
// reported on stderr instead of failing silently.
// Returns 0 unconditionally (the sample is meant to show the failure too).
int main()
{
    int host_var = 5;

    cudaError_t err = cudaMemcpyToSymbol(device_global_var, &host_var, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(err));
    }
    printf("value=%d\n", host_var);

    err = cudaMemcpyFromSymbol(&host_var, device_global_var, sizeof(int));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
    }
    printf("device_value=%d\n", host_var);

    system("pause");
    return 0;
}

2、分配二维数组实现两个二维数组相加

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#include <stdio.h>

#include <stdlib.h> 

#define  N 16 

// Statically allocated N x N integer matrices in device global memory:
// device_a and device_b are the operands, device_c receives the result.
__device__ int device_a[N][N], device_b[N][N], device_c[N][N];

// Element-wise sum of two N x N integer matrices: c[row][col] = a[row][col] + b[row][col].
//
// Launch layout: 2D grid of 2D blocks; x indexes columns, y indexes rows.
// The grid may over-cover the matrix: threads whose global index falls
// outside N x N do nothing.
// a, b, c must be device-accessible pointers to N x N int arrays.
__global__ void VecAdd(int a[N][N], int b[N][N], int c[N][N])
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < N && row < N)
    {
        c[row][col] = a[row][col] + b[row][col];
    }
}

// Prints an N x N host matrix to stdout, one row per line, values separated
// by a single space (each row ends with a trailing space before the newline).
void printfArray(int data[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            printf("%d ", data[i][j]);
        }
        printf("\n");
    }
}

// CPU reference implementation of the matrix addition: c = a + b for
// N x N host matrices. c may alias a or b since each element is read
// before the same element is written.
void host_Add(int a[N][N], int b[N][N], int c[N][N])
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            c[i][j] = a[i][j] + b[i][j];
        }
    }
}

// Host driver for example 2.
// 1. Fills a and b with the sequence 0, 1, 2, ... (row-major).
// 2. Uploads them to the __device__ symbols device_a / device_b and reads
//    device_a back into tempA to show the symbol round trip (printed).
// 3. Launches VecAdd on the symbol addresses, downloads device_c and checks
//    it against the CPU reference host_Add.
// The original version allocated an unused device buffer that was never
// freed and never actually ran the addition; both are fixed here.
// Returns 0 on success, 1 on any CUDA error or result mismatch.
int main()
{
    int a[N][N], b[N][N], c[N][N];
    int expected[N][N];
    int tempA[N][N];
    const size_t bytes = sizeof(int) * N * N;

    int k = 0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i][j] = k;
            b[i][j] = k;
            k++;
        }
    }

    // Upload the operands and read one of them back.
    cudaError_t err = cudaMemcpyToSymbol(device_a, a, bytes);
    if (err == cudaSuccess) err = cudaMemcpyToSymbol(device_b, b, bytes);
    if (err == cudaSuccess) err = cudaMemcpyFromSymbol(tempA, device_a, bytes);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "symbol copy failed: %s\n", cudaGetErrorString(err));
        system("pause");
        return 1;
    }

    printf("tempA====\n");
    printfArray(tempA);

    // A __device__ symbol cannot be passed to a kernel by name from host
    // code; its device address has to be queried first.
    int (*d_a)[N] = NULL;
    int (*d_b)[N] = NULL;
    int (*d_c)[N] = NULL;
    err = cudaGetSymbolAddress((void**)&d_a, device_a);
    if (err == cudaSuccess) err = cudaGetSymbolAddress((void**)&d_b, device_b);
    if (err == cudaSuccess) err = cudaGetSymbolAddress((void**)&d_c, device_c);

    if (err == cudaSuccess)
    {
        dim3 block(16, 16);
        dim3 grid((N + block.x - 1) / block.x, (N + block.y - 1) / block.y);
        VecAdd<<<grid, block>>>(d_a, d_b, d_c);
        err = cudaGetLastError();
    }
    // The blocking copy below also waits for the kernel to finish.
    if (err == cudaSuccess) err = cudaMemcpyFromSymbol(c, device_c, bytes);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "VecAdd failed: %s\n", cudaGetErrorString(err));
        system("pause");
        return 1;
    }

    printf("c====\n");
    printfArray(c);

    // Compare against the CPU reference.
    host_Add(a, b, expected);
    int mismatches = 0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            if (c[i][j] != expected[i][j])
            {
                mismatches++;
            }
        }
    }
    printf("mismatches=%d\n", mismatches);

    system("pause");
    return mismatches == 0 ? 0 : 1;
}

3、用cudaMallocPitch和cudaMemcpy2D实现二维数组的分配和拷贝

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h>

#include <iostream> 

// kernel which copies data from d_arrayto destinationArray 

// Fills a pitched 2D device array with 123.0f and mirrors every element into
// a densely packed destination array.
//
//   d_array           pitched source array (rows are `pitch` bytes apart)
//   destinationArray  dense row-major array of rowCount * columnCount floats
//   pitch             row stride of d_array in bytes (from cudaMallocPitch)
//   columnCount       number of floats per row
//   rowCount          number of rows
//
// Launch layout: any 1D grid of 1D blocks. Elements are distributed with a
// grid-stride loop, so each element is written by exactly one thread
// (the original had every thread rewrite the whole array).
__global__ void CopyData(float* d_array,
                         float* destinationArray,
                         size_t pitch,
                         int columnCount,
                         int rowCount)
{
    if (columnCount <= 0 || rowCount <= 0)
    {
        return;
    }

    const size_t total  = (size_t)rowCount * (size_t)columnCount;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride)
    {
        const size_t row    = idx / columnCount;
        const size_t column = idx % columnCount;

        // Rows of a pitched allocation must be located via byte arithmetic.
        float* rowData = (float*)(((char*)d_array) + row * pitch);

        rowData[column] = 123.0f;
        destinationArray[idx] = rowData[column];
    }
}

// Host driver for example 3.
// Allocates a pitched 2D device array plus a dense device array, runs
// CopyData to fill both, then copies them back two ways:
//   - cudaMemcpy2D from the pitched array into h_result
//   - cudaMemcpy   from the dense array   into h_array
// and prints both host buffers (all values are expected to be 123).
// Returns 0 on success, 1 if a host allocation or CUDA call failed.
int main(int argc, char** argv)
{
    const int columnCount = 15;
    const int rowCount = 10;
    const size_t elemCount = (size_t)columnCount * rowCount;
    const size_t rowBytes = columnCount * sizeof(float);

    float* d_array = NULL;             // pitched device source array
    float* d_destinationArray = NULL;  // dense device destination array
    size_t pitch = 0;                  // row stride in bytes, set by cudaMallocPitch

    float* h_array = new float[elemCount];
    // calloc zero-fills the buffer, replacing the original malloc + memset pair.
    float* h_result = (float*)calloc(elemCount, sizeof(float));
    if (h_result == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        delete[] h_array;
        return 1;
    }

    cudaError_t err = cudaMallocPitch(&d_array, &pitch, rowBytes, rowCount);
    if (err == cudaSuccess)
    {
        err = cudaMalloc(&d_destinationArray, elemCount * sizeof(float));
    }
    if (err == cudaSuccess)
    {
        CopyData<<<100, 512>>>(d_array, d_destinationArray, pitch, columnCount, rowCount);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
    {
        // Pitched device rows -> tightly packed host rows.
        err = cudaMemcpy2D(h_result, rowBytes, d_array, pitch,
                           rowBytes, rowCount, cudaMemcpyDeviceToHost);
    }
    if (err == cudaSuccess)
    {
        err = cudaMemcpy(h_array, d_destinationArray,
                         elemCount * sizeof(float), cudaMemcpyDeviceToHost);
    }

    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }
    else
    {
        for (int i = 0; i < rowCount; i++)
        {
            for (int j = 0; j < columnCount; j++)
            {
                std::cout << "h_result[" << (i * columnCount) + j << "]="
                          << h_result[(i * columnCount) + j] << std::endl;
            }
        }
        system("pause");

        printf("h_array==\n");
        for (int i = 0; i < rowCount; i++)
        {
            for (int j = 0; j < columnCount; j++)
            {
                std::cout << "h_array[" << (i * columnCount) + j << "]="
                          << h_array[(i * columnCount) + j] << std::endl;
            }
        }
    }

    cudaFree(d_array);
    cudaFree(d_destinationArray);
    free(h_result);
    delete[] h_array;

    system("pause");
    return err == cudaSuccess ? 0 : 1;
}

4、cudaMalloc3D()和cudaMemcpy3D()函数的用法

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#include <stdlib.h>

#include <stdio.h> 

// Device code

// Writes (2, 3, 4) into every float3 of a pitched 3D allocation.
//
//   devPitchedPtr  pitched pointer describing the allocation (ptr + row pitch)
//   extent         allocation extent; extent.width is treated as a BYTE count,
//                  matching how main() in this example fills it
//                  (x * sizeof(float3)); height and depth are element counts.
//
// The original loop used extent.width directly as the number of float3
// elements per row, which writes sizeof(float3) times too far and runs past
// the end of each row and of the allocation.
//
// Launch layout: the triple loop is executed by every launched thread, so the
// kernel is intended to be launched as <<<1,1>>>.
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr, cudaExtent extent)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    const size_t pitch = devPitchedPtr.pitch;
    const size_t slicePitch = pitch * extent.height;
    const size_t elemsPerRow = extent.width / sizeof(float3);

    for (size_t k = 0; k < extent.depth; k++)
    {
        char* slice = devPtr + k * slicePitch;
        for (size_t j = 0; j < extent.height; j++)
        {
            float3* row = (float3*)(slice + j * pitch);
            for (size_t i = 0; i < elemsPerRow; i++)
            {
                row[i].x = 2;
                row[i].y = 3;
                row[i].z = 4;
            }
        }
    }
}

// Volume dimensions used by main() below: x float3 elements per row,
// y rows per slice, z slices.
const int x=6;

const int y=60;

const int z=66; 

// Host driver for example 4.
// Allocates an x * y * z volume of float3 with cudaMalloc3D, lets MyKernel
// fill it, copies it back with cudaMemcpy3D and prints every element.
// Returns 0 on success, 1 if a host allocation or CUDA call failed.
int main(){
    const size_t count = (size_t)x * y * z;
    cudaError_t status = cudaSuccess;

    //======== Host memory
    float3 *mem_host  = (float3*)malloc(sizeof(float3) * count);
    float3 *mem_host2 = (float3*)malloc(sizeof(float3) * count);
    if (mem_host == NULL || mem_host2 == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(mem_host);
        free(mem_host2);
        return 1;
    }

    for (size_t i = 0; i < count; i++){
        mem_host[i].x = 10;
        mem_host[i].y = 100;
        mem_host[i].z = 1000;
    }

    //======== Device memory
    // For cudaMalloc3D the extent width is given in bytes, height/depth in elements.
    cudaExtent extent;
    extent.width  = x * sizeof(float3);
    extent.height = y;
    extent.depth  = z;

    cudaPitchedPtr mem_device;
    mem_device.ptr = NULL;   // keeps the cudaFree below safe if allocation fails
    status = cudaMalloc3D(&mem_device, extent);
    if (status != cudaSuccess){fprintf(stderr, "Malloc: %s\n", cudaGetErrorString(status));}

    // Host -> device upload of mem_host, disabled in the original sample:
    //   cudaMemcpy3DParms p = { 0 };
    //   p.srcPtr = make_cudaPitchedPtr((void*)mem_host, x*sizeof(float3), x, y);
    //   p.dstPtr = mem_device;
    //   p.extent = extent;
    //   p.kind   = cudaMemcpyHostToDevice;
    //   status   = cudaMemcpy3D(&p);

    if (status == cudaSuccess)
    {
        MyKernel<<<1,1>>>(mem_device, extent);
        status = cudaGetLastError();
        if (status != cudaSuccess){fprintf(stderr, "MyKernel: %s\n", cudaGetErrorString(status));}
    }

    //======== Device -> host copy (marked UNTESTED by the original author)
    if (status == cudaSuccess)
    {
        cudaMemcpy3DParms q = {0};
        q.srcPtr = mem_device;
        q.dstPtr = make_cudaPitchedPtr((void*)mem_host2, x * sizeof(float3), x, y);
        q.extent = extent;
        q.kind   = cudaMemcpyDeviceToHost;
        status = cudaMemcpy3D(&q);
        if (status != cudaSuccess){fprintf(stderr, "MemcpyDtoH: %s\n", cudaGetErrorString(status));}
    }

    // Only print when the copy succeeded; otherwise mem_host2 is uninitialised.
    if (status == cudaSuccess)
    {
        for (size_t i = 0; i < count; i++)
            printf("%f %f %f\n", mem_host2[i].x, mem_host2[i].y, mem_host2[i].z);
    }

    cudaFree(mem_device.ptr);
    free(mem_host);
    free(mem_host2);

    system("pause");
    return status == cudaSuccess ? 0 : 1;
}

5、不带共享存储器的矩阵的相乘

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#include <stdio.h>

#include <stdlib.h>

#include <string.h> 

// Row-major matrix descriptor used by example 5: the value at (row, col) is
// stored at element[row * width + col]. The same struct describes both host
// matrices (element from malloc) and device matrices (element from
// cudaMalloc); it is passed by value to the kernel.
typedef struct 

{

    int width;      // number of columns

    int height;     // number of rows

    float *element; // width * height floats, row-major

}Matrix; 

#define BLOCK_SIZE 16 

__global__ void MatMulKernel(const Matrix,const Matrix,Matrix); 

// Prints a host-resident matrix to stdout, one row per line, using "%f "
// for every element. A.element must be readable from host code.
void printMatrix(const Matrix &A)
{
    for (int i = 0; i < A.height; i++)
    {
        for (int j = 0; j < A.width; j++)
        {
            printf("%f ", A.element[i * A.width + j]);
        }
        printf("\n");
    }
}

// Host wrapper computing C = A * B on the GPU without shared memory.
//
//   A, B  host input matrices (printed before the computation)
//   C     host output matrix; C.width / C.height / C.element must already be
//         set up by the caller, the contents are overwritten and printed
//
// Preconditions (checked, the function reports and returns without computing
// if they do not hold): A.width == B.height, C is A.height x B.width, and
// B.width and A.height are multiples of BLOCK_SIZE, because the grid size is
// obtained by integer division and would otherwise leave part of C unwritten.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    printf("matrix A");
    printMatrix(A);
    printf("matrix B");
    printMatrix(B);
    system("pause");

    if (A.width != B.height || C.height != A.height || C.width != B.width ||
        B.width % BLOCK_SIZE != 0 || A.height % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: incompatible matrix shapes or sizes not a multiple of %d\n",
                BLOCK_SIZE);
        return;
    }

    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.element = NULL;

    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.element = NULL;

    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.element = NULL;

    const size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(float);

    // Each step runs only if everything before it succeeded.
    cudaError_t err = cudaMalloc(&d_A.element, sizeA);
    if (err == cudaSuccess) err = cudaMemcpy(d_A.element, A.element, sizeA, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc(&d_B.element, sizeB);
    if (err == cudaSuccess) err = cudaMemcpy(d_B.element, B.element, sizeB, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc(&d_C.element, sizeC);

    if (err == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        err = cudaGetLastError();
    }
    // Blocking copy: also waits for the kernel and surfaces its errors.
    if (err == cudaSuccess) err = cudaMemcpy(C.element, d_C.element, sizeC, cudaMemcpyDeviceToHost);

    cudaFree(d_A.element);
    cudaFree(d_B.element);
    cudaFree(d_C.element);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));
        return;
    }

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }
    system("pause");
}

// Sets every element of the host matrix A to `value` (converted to float).
// A.element must point to at least A.width * A.height floats.
void SetMatrixValue(Matrix &A, int value)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            A.element[i * A.width + j] = value;
        }
    }
}

// Host driver for example 5: multiplies two 128 x 128 matrices filled with 2
// on the GPU and prints the product (every entry is expected to be 512).
// Returns 0 on success, 1 if a host allocation failed.
int main()
{
    Matrix A, B, C;

    A.width = 128;
    A.height = 128;
    A.element = (float*)malloc(A.width * A.height * sizeof(float));

    B.width = 128;
    B.height = 128;
    B.element = (float*)malloc(B.width * B.height * sizeof(float));

    C.width = 128;
    C.height = 128;
    C.element = (float*)malloc(C.width * C.height * sizeof(float));

    if (A.element == NULL || B.element == NULL || C.element == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A.element);
        free(B.element);
        free(C.element);
        return 1;
    }

    // memset cannot be used for a non-zero float fill (it works byte-wise),
    // hence the explicit helper.
    SetMatrixValue(A, 2);
    SetMatrixValue(B, 2);

    MatMul(A, B, C);

    for (int i = 0; i < C.height; ++i)
    {
        for (int j = 0; j < C.width; ++j)
        {
            printf("%f ", C.element[i * C.width + j]);
        }
        printf("\n");
    }

    free(A.element);
    free(B.element);
    free(C.element);

    system("pause");
    return 0;
}

// Matrix multiplication kernel without shared memory: each thread computes
// one element of C as the dot product of a row of A and a column of B,
// reading both operands straight from global memory.
//
// Launch layout: 2D grid of 2D blocks; x indexes columns of C, y indexes rows.
// Threads outside C return immediately, so the grid may over-cover the matrix.
// Assumes A.width == B.height and C is A.height x B.width.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= C.height || col >= C.width)
    {
        return;
    }

    float CValue = 0;
    for (int e = 0; e < A.width; ++e)
    {
        CValue += A.element[row * A.width + e] * B.element[e * B.width + col];
    }

    C.element[row * C.width + col] = CValue;
}

6、带shared memory的矩阵相乘

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#include <stdio.h>

#include <stdlib.h> 

#define BLOCK_SIZE 16 

// Row-major matrix descriptor used by example 6. The value at (row, col) is
// stored at elements[row * stride + col]; `stride` lets a sub-matrix view
// share the storage of its parent matrix.
typedef struct

{

    int width;       // number of columns

    int height;      // number of rows

    int stride;      // distance, in floats, between the starts of two rows

    float *elements; // first element of the (sub-)matrix

}Matrix; 

// Returns the element of A at (row, col), honouring A.stride.
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Stores `value` into the element of A at (row, col), honouring A.stride.
// A is passed by value, but the write goes through A.elements and is
// therefore visible in the underlying storage.
__device__ void SetElement(Matrix A, int row, int col, float value)
{
    A.elements[row * A.stride + col] = value;
}

// Returns a BLOCK_SIZE x BLOCK_SIZE view into A whose top-left corner is at
// block coordinates (row, col), i.e. element (row * BLOCK_SIZE, col * BLOCK_SIZE)
// of A. The view shares A's storage and stride; nothing is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
    return Asub;
}

__global__ void MatMulKernel(const Matrix,const Matrix,Matrix); 

// Host wrapper computing C = A * B on the GPU with the shared-memory tiled
// kernel.
//
//   A, B  host input matrices (dense: stride equals width)
//   C     host output matrix; dimensions and storage are set up by the caller
//
// Preconditions (checked; on violation the function reports and returns):
// A.width == B.height, C is A.height x B.width, and A.width, A.height and
// B.width are multiples of BLOCK_SIZE, since the kernel works on whole tiles.
//
// Fix over the original: the grid's y dimension is derived from A.height
// (rows of C), not B.height.
void MatMul(const Matrix &A, const Matrix &B, Matrix &C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width ||
        A.width % BLOCK_SIZE != 0 || A.height % BLOCK_SIZE != 0 ||
        B.width % BLOCK_SIZE != 0)
    {
        fprintf(stderr, "MatMul: incompatible matrix shapes or sizes not a multiple of %d\n",
                BLOCK_SIZE);
        return;
    }

    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    d_A.stride = A.width;
    d_A.elements = NULL;

    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    d_B.stride = B.width;
    d_B.elements = NULL;

    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    d_C.stride = C.width;
    d_C.elements = NULL;

    const size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(float);

    // Each step runs only if everything before it succeeded.
    cudaError_t err = cudaMalloc(&d_A.elements, sizeA);
    if (err == cudaSuccess) err = cudaMemcpy(d_A.elements, A.elements, sizeA, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc(&d_B.elements, sizeB);
    if (err == cudaSuccess) err = cudaMemcpy(d_B.elements, B.elements, sizeB, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc(&d_C.elements, sizeC);

    if (err == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(B.width / BLOCK_SIZE, A.height / BLOCK_SIZE);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        err = cudaGetLastError();
    }
    // Blocking copy: also waits for the kernel and surfaces its errors.
    if (err == cudaSuccess) err = cudaMemcpy(C.elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Tiled matrix multiplication kernel using shared memory.
//
// Launch layout: blockDim must be (BLOCK_SIZE, BLOCK_SIZE); block (bx, by)
// computes the BLOCK_SIZE x BLOCK_SIZE tile of C at block row by, block
// column bx, and each thread produces one element of that tile.
// Shared memory: two static BLOCK_SIZE x BLOCK_SIZE float tiles.
// Precondition: all matrix dimensions are multiples of BLOCK_SIZE (there is
// no tail handling).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    const int blockRow = blockIdx.y;
    const int blockCol = blockIdx.x;

    // Tile of C this block is responsible for.
    Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

    // Position of this thread's element inside the tile.
    const int row = threadIdx.y;
    const int col = threadIdx.x;

    float Cvalue = 0;

    // Walk over the tiles of A (left to right) and B (top to bottom) that
    // contribute to Csub.
    for (int m = 0; m < A.width / BLOCK_SIZE; ++m)
    {
        Matrix Asub = GetSubMatrix(A, blockRow, m);
        Matrix Bsub = GetSubMatrix(B, m, blockCol);

        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread loads one element of each tile.
        As[row][col] = GetElement(Asub, row, col);
        Bs[row][col] = GetElement(Bsub, row, col);

        // Both tiles must be fully loaded before anyone reads them.
        __syncthreads();

        for (int e = 0; e < BLOCK_SIZE; ++e)
        {
            Cvalue += As[row][e] * Bs[e][col];
        }

        // Everyone must be done reading before the tiles are overwritten
        // in the next iteration.
        __syncthreads();
    }

    SetElement(Csub, row, col, Cvalue);
}

 void SetMatValue(Matrix A,int value)

{

    for (inti=0;i<A.height;++i)

    {

        for (intj=0;j<A.width;++j)

        {

            A.elements[i*A.width+j]=value;

        }

    }

}

// Prints a dense host matrix to stdout, one row per line, using "%f " for
// every element. Indexing uses A.width, i.e. stride is not consulted.
void PrintMat(const Matrix A)
{
    for (int i = 0; i < A.height; ++i)
    {
        for (int j = 0; j < A.width; ++j)
        {
            printf("%f ", A.elements[i * A.width + j]);
        }
        printf("\n");
    }
}

// Host driver for example 6: multiplies two 128 x 128 matrices filled with 2
// using the shared-memory kernel and prints the product.
// Returns 0 on success, 1 if a host allocation failed.
int main()
{
    Matrix A, B, C;

    A.width = 128;
    A.height = 128;
    A.stride = A.width;
    A.elements = (float*)malloc(A.width * A.height * sizeof(float));

    B.width = 128;
    B.height = 128;
    B.stride = B.width;
    B.elements = (float*)malloc(B.width * B.height * sizeof(float));

    C.width = 128;
    C.height = 128;
    C.stride = C.width;
    C.elements = (float*)malloc(C.width * C.height * sizeof(float));

    if (A.elements == NULL || B.elements == NULL || C.elements == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return 1;
    }

    SetMatValue(A, 2);
    SetMatValue(B, 2);

    MatMul(A, B, C);
    PrintMat(C);

    free(A.elements);
    free(B.elements);
    free(C.elements);

    system("pause");
    return 0;
}

7、页锁定主机存储器Page-locked Host memory

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#include <stdio.h>

#include <stdlib.h> 

#define N 256 

// Fills the first `size` floats of pData with `value`.
// A size of zero (or less) leaves the buffer untouched.
void SetArrayValue(float *pData, int size, float value)
{
    for (int i = 0; i < size; ++i)
    {
        pData[i] = value;
    }
}

// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
//
// Launch layout: 1D grid of 1D blocks covering at least N threads
// (the sample launches <<<1, N>>>). Threads with a global index >= N do
// nothing, so an over-sized launch cannot write out of bounds.
// A, B, C must be device-accessible pointers to at least N floats.
__global__ void VecAdd(float *A, float *B, float *C)
{
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N)
    {
        C[index] = A[index] + B[index];
    }
}

// Prints `size` floats from data to stdout using "%f ", ten values per line.
// The original emitted the line break *before* every tenth value, which left
// nine values on the first line; the break now follows every tenth value.
void PrintArray(float *data, int size)
{
    for (int i = 0; i < size; ++i)
    {
        printf("%f ", data[i]);
        if ((i + 1) % 10 == 0)
        {
            printf("\n");
        }
    }
}

// Host driver for example 7 (mapped page-locked host memory).
// Allocates three mapped pinned buffers, obtains their device-side aliases
// with cudaHostGetDevicePointer, runs VecAdd directly on them and prints the
// result from the host pointer - no explicit cudaMemcpy is involved.
// Returns 0 on success or when the device is integrated (the sample skips
// that case, as in the original), 1 on any failure.
int main()
{
    // cudaSetDeviceFlags(cudaDeviceMapHost);  // original author's note: optional

    cudaDeviceProp deviceProp;
    if (cudaGetDeviceProperties(&deviceProp, 0) != cudaSuccess)
    {
        printf("can't query device 0\n");
        return 1;
    }
    if (deviceProp.integrated)
    {
        printf("GPU is integrated\n");
        return 0;
    }
    if (!deviceProp.canMapHostMemory)
    {
        printf("can't map host memory\n");
        return 1;
    }

    const size_t bytes = sizeof(float) * N;
    const unsigned int flags = cudaHostAllocDefault | cudaHostAllocMapped;

    float *hostPtrA = NULL, *hostPtrB = NULL, *hostPtrC = NULL;
    float *devPtrA = NULL, *devPtrB = NULL, *devPtrC = NULL;

    cudaError_t err = cudaHostAlloc(&hostPtrA, bytes, flags);
    if (err == cudaSuccess) err = cudaHostAlloc(&hostPtrB, bytes, flags);
    if (err == cudaSuccess) err = cudaHostAlloc(&hostPtrC, bytes, flags);

    if (err == cudaSuccess)
    {
        SetArrayValue(hostPtrA, N, 29);
        PrintArray(hostPtrA, N);
        system("pause");
        SetArrayValue(hostPtrB, N, 31);

        err = cudaHostGetDevicePointer(&devPtrA, hostPtrA, 0);
    }
    if (err == cudaSuccess) err = cudaHostGetDevicePointer(&devPtrB, hostPtrB, 0);
    if (err == cudaSuccess) err = cudaHostGetDevicePointer(&devPtrC, hostPtrC, 0);

    if (err == cudaSuccess)
    {
        VecAdd<<<1, N>>>(devPtrA, devPtrB, devPtrC);
        err = cudaGetLastError();
    }
    // The kernel writes straight into host memory; wait for it before reading.
    if (err == cudaSuccess) err = cudaDeviceSynchronize();

    if (err == cudaSuccess)
    {
        PrintArray(hostPtrC, N);
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    if (hostPtrA != NULL) cudaFreeHost(hostPtrA);
    if (hostPtrB != NULL) cudaFreeHost(hostPtrB);
    if (hostPtrC != NULL) cudaFreeHost(hostPtrC);

    system("pause");
    return err == cudaSuccess ? 0 : 1;
}

8、纹理存储的使用texture memory

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h> 

#define size 256 

texture<float,cudaTextureType2D,cudaReadModeElementType>texRef; 

// Rotates the image bound to texRef by `theta` (radians, as passed to
// cosf/sinf) around its centre and writes the resampled image to `output`.
//
//   output  device buffer of width * height floats, row-major
//   width   image width in pixels
//   height  image height in pixels
//   theta   rotation angle
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads. The grid may over-cover the image: threads outside it return
// without writing (the original had no such guard and wrote out of bounds
// for sizes that are not a multiple of the block size).
// Texture coordinates are computed in normalised [0, 1] space.
__global__ void transformKernel(float *output, int width, int height, float theta)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= (unsigned int)width || y >= (unsigned int)height)
    {
        return;
    }

    // Normalised coordinates, shifted so the rotation centre is the origin.
    const float u = x / (float)width - 0.5f;
    const float v = y / (float)height - 0.5f;

    // Rotate, then shift back. sinf/cosf keep the math in single precision.
    const float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    const float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    output[y * width + x] = tex2D(texRef, tu, tv);
}

// Host driver for example 8 (texture memory).
// Uploads a 25 x 25 float ramp into a CUDA array, binds it to texRef with
// wrap addressing, linear filtering and normalised coordinates, runs
// transformKernel and prints the resampled image.
// Returns 0 on success, 1 if a host allocation or CUDA call failed.
int main()
{
    const int width = 25, height = 25;
    const size_t bytes = (size_t)width * height * sizeof(float);

    float *h_data  = (float*)malloc(bytes);
    float *hostPtr = (float*)malloc(bytes);
    if (h_data == NULL || hostPtr == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_data);
        free(hostPtr);
        return 1;
    }

    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = i * width + j;
        }
    }

    // One 32-bit float channel.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray = NULL;
    float *output = NULL;

    cudaError_t err = cudaMallocArray(&cuArray, &channelDesc, width, height);
    if (err == cudaSuccess)
    {
        err = cudaMemcpyToArray(cuArray, 0, 0, h_data, bytes, cudaMemcpyHostToDevice);
    }

    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;

    bool bound = false;
    if (err == cudaSuccess)
    {
        err = cudaBindTextureToArray(texRef, cuArray, channelDesc);
        bound = (err == cudaSuccess);
    }
    if (err == cudaSuccess) err = cudaMalloc(&output, bytes);

    if (err == cudaSuccess)
    {
        dim3 dimBlock(16, 16);
        dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                     (height + dimBlock.y - 1) / dimBlock.y);
        float angle = 30;   // passed unchanged to the kernel's cosf/sinf
        transformKernel<<<dimGrid, dimBlock>>>(output, width, height, angle);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaMemcpy(hostPtr, output, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess)
    {
        for (int i = 0; i < height; ++i)
        {
            for (int j = 0; j < width; ++j)
            {
                printf("%f ", hostPtr[i * width + j]);
            }
            printf("\n");
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    if (bound) cudaUnbindTexture(texRef);
    if (cuArray != NULL) cudaFreeArray(cuArray);
    cudaFree(output);
    free(hostPtr);
    free(h_data);

    system("pause");
    return err == cudaSuccess ? 0 : 1;
}

9、surface Memory的使用方法

代码:

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdio.h>

#include <stdlib.h> 

surface<void,2> inputSurfRef;

surface<void,2>outputSurfRef; 

// Copies a width x height image of uchar4 texels from inputSurfRef to
// outputSurfRef, one texel per thread.
//
// Launch layout: 2D grid of 2D blocks covering at least width x height
// threads; threads outside the image do nothing.
// Surface x coordinates are byte offsets, hence the multiplication by 4
// (the size of a uchar4).
__global__ void copyKernel(int width, int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        uchar4 data;
        surf2Dread(&data, inputSurfRef, x * 4, y);
        surf2Dwrite(data, outputSurfRef, x * 4, y);
    }
}

// Host driver for example 9 (surface memory).
// Uploads a 256 x 256 image whose 32-bit texels all equal 3 into a CUDA
// array, binds input/output arrays to the two surface references, runs
// copyKernel and prints the copied image.
// Returns 0 on success, 1 if a host allocation or CUDA call failed.
int main()
{
    const int width = 256, height = 256;
    const size_t bytes = (size_t)width * height * sizeof(unsigned int);

    unsigned int *h_data      = (unsigned int*)malloc(bytes);
    unsigned int *host_output = (unsigned int*)malloc(bytes);
    if (h_data == NULL || host_output == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(h_data);
        free(host_output);
        return 1;
    }

    for (int i = 0; i < height; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            h_data[i * width + j] = 3;
        }
    }

    // Four 8-bit unsigned channels, i.e. one uchar4 per texel.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    cudaArray *cuInputArray = NULL;
    cudaArray *cuOutputArray = NULL;

    // Arrays used through surfaces need the cudaArraySurfaceLoadStore flag.
    cudaError_t err = cudaMallocArray(&cuInputArray, &channelDesc, width, height,
                                      cudaArraySurfaceLoadStore);
    if (err == cudaSuccess)
    {
        err = cudaMallocArray(&cuOutputArray, &channelDesc, width, height,
                              cudaArraySurfaceLoadStore);
    }
    if (err == cudaSuccess)
    {
        err = cudaMemcpyToArray(cuInputArray, 0, 0, h_data, bytes, cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) err = cudaBindSurfaceToArray(inputSurfRef, cuInputArray);
    if (err == cudaSuccess) err = cudaBindSurfaceToArray(outputSurfRef, cuOutputArray);

    if (err == cudaSuccess)
    {
        dim3 dimBlock(16, 16);
        dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                     (height + dimBlock.y - 1) / dimBlock.y);
        copyKernel<<<dimGrid, dimBlock>>>(width, height);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess)
    {
        err = cudaMemcpyFromArray(host_output, cuOutputArray, 0, 0, bytes,
                                  cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        for (int i = 0; i < height; ++i)
        {
            for (int j = 0; j < width; ++j)
            {
                printf("%u ", host_output[i * width + j]);
            }
            printf("\n");
        }
    }
    else
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    system("pause");

    free(host_output);
    free(h_data);
    if (cuInputArray != NULL) cudaFreeArray(cuInputArray);
    if (cuOutputArray != NULL) cudaFreeArray(cuOutputArray);
    return err == cudaSuccess ? 0 : 1;
}

 

10、opengl和cuda的交互

代码:https://devtalk.nvidia.com/default/topic/502692/how-to-use-open_gl/

http://stackoverflow.com/questions/12082357/errors-while-using-opengl-buffers-using-visual-studio-2010-in-windows7 

#include <cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <gl/glew.h>//要放在下面这一句的前面

#include "cuda_gl_interop.h" 

#include <stdio.h>

#include <stdlib.h> 

//#include <gl/GL.h>

#include <gl/glut.h> 

GLuint postionsVBO=1;

struct cudaGraphicsResource * postionsVBO_CUDA; 

int width=256;

int height=256;

__device__ float dev_time=1;

float host_time=1; 

// Forward declaration of the vertex-generation kernel defined further below.
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height);

// One-time OpenGL state setup: sets the clear colour to opaque black and
// clears the colour and depth buffers once.
void init()
{
    glClearColor(0.f, 0.f, 0.f, 1.f);
    glClear(GL_DEPTH_BUFFER_BIT | GL_COLOR_BUFFER_BIT);
}

// GLUT reshape callback: rebuilds the perspective projection for the new
// window size and resets the model-view matrix.
//
// Fix over the original: the second glMatrixMode call passed GL_MATRIX_MODE,
// which is a glGet query token and not a valid matrix mode; GL_MODELVIEW is
// what the following glLoadIdentity is meant to reset.
//
// NOTE(review): the aspect argument is the larger side divided by the
// smaller one, as in the original. gluPerspective defines aspect as
// width / height, so the portrait branch looks suspicious - confirm intent.
void reshape(int width, int height)
{
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();

    const GLfloat aspect = (width > height)
                               ? (GLfloat)width / height
                               : (GLfloat)height / width;
    gluPerspective(45, aspect, 0.001, 1000);

    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
}

// GLUT display callback: maps the vertex buffer into CUDA, lets
// createVertices regenerate the point positions for the current animation
// time, unmaps the buffer and draws it as GL_POINTS.
//
// Fixes over the original:
//  - the kernel argument was the __device__ variable dev_time, which cannot
//    be read from host code; the host-side counter host_time is passed instead
//  - `dev_time++` on the host and `cudaMemcpy(&time, ...)` (undeclared
//    `time`) are replaced by advancing host_time and mirroring it into
//    dev_time with cudaMemcpyToSymbol
void display()
{
    float4 *positions = NULL;
    size_t numb_bytes = 0;

    cudaError_t err = cudaGraphicsMapResources(1, &postionsVBO_CUDA, 0);
    if (err == cudaSuccess)
    {
        err = cudaGraphicsResourceGetMappedPointer((void**)&positions, &numb_bytes,
                                                   postionsVBO_CUDA);
        if (err == cudaSuccess)
        {
            dim3 dimBlock(16, 16, 1);
            dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
            createVertices<<<dimGrid, dimBlock>>>(positions, host_time, width, height);
            err = cudaGetLastError();
        }

        // Advance the animation clock and keep the device copy in sync.
        host_time++;
        cudaMemcpyToSymbol(dev_time, &host_time, sizeof(float));

        // The buffer must be unmapped before OpenGL may use it again.
        cudaGraphicsUnmapResources(1, &postionsVBO_CUDA, 0);
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "display: CUDA error: %s\n", cudaGetErrorString(err));
    }

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    glVertexPointer(4, GL_FLOAT, 0, 0);
    glEnableClientState(GL_VERTEX_ARRAY);
    glDrawArrays(GL_POINTS, 0, width * height);
    glDisableClientState(GL_VERTEX_ARRAY);

    glutSwapBuffers();
    glutPostRedisplay();
}

// Generates a width x height grid of points forming an animated sine/cosine
// wave and stores them as float4 (u, w, v, 1) in `positions`.
//
//   positions  device buffer of width * height float4, row-major
//   time       animation phase added to both wave arguments
//   width      number of points per row
//   height     number of rows
//
// Launch layout: 2D grid of 2D blocks; threads outside width x height return
// without writing, so the grid may over-cover the point grid.
__global__ void createVertices(float4 *positions, float time, unsigned int width, unsigned int height)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x >= width || y >= height)
    {
        return;
    }

    // Map the grid position to [-1, 1) in both directions.
    float u = x / (float)width;
    float v = y / (float)height;
    u = u * 2.f - 1.f;
    v = v * 2.f - 1.f;

    // Height of the wave at (u, v).
    const float freq = 4.f;
    const float w = sinf(u * freq + time) * cosf(v * freq + time) * 0.5f;

    positions[y * width + x] = make_float4(u, w, v, 1.f);
}

// Host driver for example 10 (OpenGL interop).
// Creates a GLUT window, allocates a vertex buffer large enough for
// width * height float4 positions, registers it with CUDA for write-discard
// access and enters the GLUT main loop (display() does the per-frame work).
// Returns 1 if GLEW initialisation or buffer registration fails.
int main(int argc, char *argv[])
{
    cudaGLSetGLDevice(0);

    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
    glutInitWindowPosition(0, 0);
    glutInitWindowSize(100, 100);
    glutCreateWindow("opengl-cuda");
    init();
    glutDisplayFunc(display);
    glutReshapeFunc(reshape);

    // GLEW must be initialised after the GL context exists and before any
    // buffer-object entry point (glGenBuffers etc.) is used.
    // See http://stackoverflow.com/questions/12344612/unusual-error-using-opengl-buffers-with-cuda-interop-on-ms-visual-studio-2010
    if (glewInit() != GLEW_OK)
    {
        fprintf(stderr, "glewInit failed\n");
        return 1;
    }

    glGenBuffers(1, &postionsVBO);
    glBindBuffer(GL_ARRAY_BUFFER, postionsVBO);
    const unsigned int bufferBytes = width * height * 4 * sizeof(float);
    glBufferData(GL_ARRAY_BUFFER, bufferBytes, 0, GL_DYNAMIC_DRAW);
    glBindBuffer(GL_ARRAY_BUFFER, 0);

    cudaError_t err = cudaGraphicsGLRegisterBuffer(&postionsVBO_CUDA, postionsVBO,
                                                   cudaGraphicsMapFlagsWriteDiscard);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaGraphicsGLRegisterBuffer failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    glutMainLoop();
    return 0;
}

11、Formatted output---在device端函数中使用printf,要求compute capability至少为2.0

代码:

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

// Demonstrates device-side printf: every thread prints its threadIdx.x on a
// line of its own. The output appears on the host once the kernel has been
// synchronised. Per the article, requires compute capability 2.0 or higher.
__global__ void bcast()
{
    printf("%d\n", threadIdx.x);
}

// Host driver for example 11: launches bcast with one block of 32 threads
// and synchronises so the device-side printf output is flushed before the
// program pauses.
int main()
{
    bcast<<<1, 32>>>();
    cudaDeviceSynchronize();
    system("pause");
    return 0;
}

12、Asserting在设备端的函数中,但是其要求其计算能力至少为2.0

代码:

#include <cuda.h>

#include <helper_cuda.h> 

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <assert.h>

#include <stdlib.h> 

// Demonstrates device-side assert. The first assertion holds; the second one
// is meant to fail (should_be_one is deliberately 0) so that the assertion
// failure is reported when the host synchronises with the kernel.
// Per the article, requires compute capability 2.0 or higher.
__global__ void testAssert(void)
{
    int is_one = 1;
    int should_be_one = 0;

    assert(is_one);         // passes
    assert(should_be_one);  // fails on purpose
}

// Host driver for example 12: launches testAssert with a single thread,
// synchronises so the device-side assertion failure is surfaced, then resets
// the device.
int main()
{
    testAssert<<<1, 1>>>();
    cudaDeviceSynchronize();
    cudaDeviceReset();
    system("pause");
    return 0;
}

13、Per Thread Allocation On heap每个线程在堆上分配

代码:

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

// Demonstrates per-thread allocation on the device heap: every thread
// allocates 100 bytes with in-kernel malloc, prints the pointer it received
// and frees it again. A failed allocation shows up as a null pointer in the
// output; free() on a null pointer is harmless.
__global__ void mallocTestPerThread()
{
    char *ptr = (char *)malloc(100);
    printf("Thread %d got pointer:%p\n", threadIdx.x, ptr);
    free(ptr);
}

// Host driver for example 13: raises the device malloc heap limit to
// 128 MiB, runs mallocTestPerThread with one block of five threads and waits
// for it so the device-side printf output is flushed.
int main()
{
    const size_t heapBytes = 128 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    const int blockCount = 1;
    const int threadCount = 5;
    mallocTestPerThread<<<blockCount, threadCount>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

14、Per Thread Block Allocation每个线程块在堆上分配空间

代码:

#include <stdio.h>

#include <stdlib.h>

#include <cuda.h>

#include <helper_cuda.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

// Demonstrates per-block allocation on the device heap: thread 0 of each
// block allocates one buffer, publishes the pointer through shared memory,
// all threads of the block write to it, and thread 0 frees it again.
//
// Fix over the original: each thread writes 64 ints (indices
// i * blockDim.x + threadIdx.x for i in [0, 64)), so the buffer needs
// blockDim.x * 64 * sizeof(int) bytes. The original allocated only
// blockDim.x * 64 bytes and overflowed the heap block.
//
// Launch layout: any 1D block size. Shared memory: one pointer.
__global__ void mallocTestPerThreadBlock()
{
    __shared__ int *data;

    if (threadIdx.x == 0)
    {
        data = (int*)malloc(blockDim.x * 64 * sizeof(int));
    }
    // Make the pointer written by thread 0 visible to the whole block.
    __syncthreads();

    // `data` is block-uniform, so either every thread returns here or none
    // does; the barrier below is therefore never reached by a partial block.
    if (data == NULL)
    {
        return;
    }

    int *ptr = data;
    for (int i = 0; i < 64; ++i)
    {
        // Consecutive threads touch consecutive ints.
        ptr[i * blockDim.x + threadIdx.x] = threadIdx.x;
    }
    // All writes must be finished before the buffer is released.
    __syncthreads();

    if (threadIdx.x == 0)
    {
        free(data);
    }
}

// Host driver for example 14: raises the device malloc heap limit to
// 128 MiB, runs mallocTestPerThreadBlock on 10 blocks of 128 threads and
// waits for completion.
int main()
{
    const size_t heapBytes = 128 * 1024 * 1024;
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    const int blockCount = 10;
    const int threadCount = 128;
    mallocTestPerThreadBlock<<<blockCount, threadCount>>>();
    cudaDeviceSynchronize();

    system("pause");
    return 0;
}

15、Allocation Persisting Between Kernel Launches在堆上分配

代码:

#include <cuda.h>

#include <helper_cuda.h>

#include <stdio.h>

#include <stdlib.h>

#include "cuda_runtime.h"

#include "device_launch_parameters.h" 

#define NUM_BLOCKS 20 

__device__ int *dataptr[NUM_BLOCKS]; 

// Allocates one int per thread for every block on the device heap and zeroes
// it. The pointer is stored in dataptr[blockIdx.x] so that later kernel
// launches can reuse the buffer.
//
// Launch layout: 1D grid of at most NUM_BLOCKS blocks; usemem and freemem
// must be launched with the same grid and block size.
__global__ void allocmem()
{
    // Only the first thread of the block allocates.
    if (threadIdx.x == 0)
    {
        dataptr[blockIdx.x] = (int*)malloc(blockDim.x * sizeof(int));
    }
    // Make the freshly stored pointer visible to the whole block.
    __syncthreads();

    // Allocation failed: nothing to initialise.
    if (dataptr[blockIdx.x] == NULL)
    {
        return;
    }

    dataptr[blockIdx.x][threadIdx.x] = 0;
}

// Adds threadIdx.x to this thread's slot in the block's persistent heap
// buffer (allocated by allocmem). Blocks whose allocation failed are skipped.
__global__ void usemem()
{
    int *ptr = dataptr[blockIdx.x];
    if (ptr != NULL)
    {
        ptr[threadIdx.x] += threadIdx.x;
    }
}

// Prints the final value of every slot of the block's persistent heap buffer
// and releases the buffer.
//
// Fix over the original: a block barrier now separates the reads from the
// free, so thread 0 cannot release the buffer while threads of another warp
// are still reading their slot.
__global__ void freemem()
{
    int *ptr = dataptr[blockIdx.x];

    if (ptr != NULL)
        printf("Block %d,Thread=%d:final value=%d\n", blockIdx.x, threadIdx.x, ptr[threadIdx.x]);

    // Reached by every thread of the block: there is no early return above.
    __syncthreads();

    // Only one thread may free the buffer; free(NULL) is harmless.
    if (threadIdx.x == 0)
    {
        free(ptr);
    }
}

// Host driver for example 15: raises the device malloc heap limit to
// 128 MiB, then runs allocmem once, usemem three times and freemem once, all
// on NUM_BLOCKS blocks of 10 threads, and waits for the device to finish.
int main()
{
    const size_t heapBytes = 128 * 1024 * 1024;
    const int threadCount = 10;

    cudaDeviceSetLimit(cudaLimitMallocHeapSize, heapBytes);

    allocmem<<<NUM_BLOCKS, threadCount>>>();

    // The buffer allocated above persists across these launches.
    for (int pass = 0; pass < 3; ++pass)
    {
        usemem<<<NUM_BLOCKS, threadCount>>>();
    }

    freemem<<<NUM_BLOCKS, threadCount>>>();

    cudaDeviceSynchronize();
    system("pause");
    return 0;
}

 

0 0
原创粉丝点击
热门问题 老师的惩罚 人脸识别 我在镇武司摸鱼那些年 重生之率土为王 我在大康的咸鱼生活 盘龙之生命进化 天生仙种 凡人之先天五行 春回大明朝 姑娘不必设防,我是瞎子 公司扣员工奖金怎么办 墙面大面积有霉怎么办 电脑总是断网怎么办 造价师资格证书丢了怎么办 行测做题速度太慢怎么办 普通话准考证丢了怎么办 信用卡还款逾期了怎么办 淮北建洗煤厂需要怎么办 建设工程不结算怎么办 考监理没有职称怎么办 公开招标两次招标失败怎么办 山东省植保员证怎么办 工伤没开病假单怎么办 邮储银行怎么办金卡 农机被交警查怎么办 广联达识别板筋负筋重叠怎么办 掌上品的钱怎么办 财务纠纷被起诉怎么办 牙齿补后疼痛怎么办 牙齿表面蛀了怎么办 虫牙全掉了怎么办 牙齿蛀光了怎么办 虫牙全部掉完了怎么办 不喜欢向人请教怎么办 单位不交公积金怎么办 电锯链条掉了怎么办 天津公积金怎么办外地转入 学籍档案涂抹了怎么办 气相点火失败怎么办 小米闹钟声音小怎么办 河北省监理员证怎么办 买的商铺烂尾了怎么办 钢表带被磨花了怎么办 资料员到期了怎么办 八大员挂靠社保怎么办 安许证三类人员不足了怎么办 考试准考证号写错了怎么办 科目一失约两次怎么办 钢筋送检两次不合格怎么办 公路原材料抽检不合格怎么办 混凝土回弹强度不合格怎么办