使用CUDA驱动API的简单例子

来源:互联网 发布:淘宝天猫优惠券招代理 编辑:程序博客网 时间:2024/06/01 09:19

花了两个晚上,搞了一个使用CUDA驱动API的简单例子.可以从这个例子出发,修改出你想要的程序.例子的功能很简单,就是Hello CUDA!(被我改成了New CUDA!呵呵).好了,废话少说,下面就是说明和代码.

1)原程序由两部分组成:
a)由kernel.cu编译成的kernel.cubin.我用的是sdk 3.0.因此,cubin被编译成为了elf文件格式.不过,没关系,在程序执行时照样可装入使用.
kernel.cu的代码如下:
__device__ void HelloCUDA(char *result, int num)
{
    int i = 0;
    char p_HelloCUDA[] = "New CUDA!";
    for(i = 0; i < num; i++) {
        result[i] = p_HelloCUDA[i];
    }
}
__global__ static void GPUMain(char* result, int num)
{
    HelloCUDA(result, num);
}
这里要注意两点:
i:编译器可能优化你的程序.在kernel.cubin中,你将找不到HelloCUDA函数.
ii:在kernel.cubin中,GPUMain的函数名成为了_Z7GPUMainPci.呵呵.
b)主机程序sample.cpp.
嗯?cpp?不是cu?
是的,是cpp!而且可以在vc6.0下编译通过!总算又看到了VC6的"亲切面孔"....
(和普通的vc6一样,不要特别的设置。当然,别忘了设置include路径,lib路径,还有连接库上别忘了加上cuda.lib)
下面是sample.cpp的代码:
/********************************************************************
*  sample.cu
*  This is a example of the CUDA program.
*********************************************************************/

#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>

#include <cuda.h>
#include <cutil.h>

/************************************************************************/
/* Define for CUDA                                                                                                   */
/************************************************************************/
#define ALIGN_UP(offset, alignment) /
(offset)=(((offset)+(alignment)-1) & ~((alignment)-1))

/************************************************************************/
/* Init CUDA                                                                                                            */
/************************************************************************/
bool InitCUDA(void)
{
 static int nGpuArchCoresPerSM[] = { -1, 8, 32 };
 int driverVersion;
 char deviceName[256];
 int major;
 int minor;
 unsigned int totalGlobalMem;
 int multiProcessorCount;
 int totalConstantMemory;
 int sharedMemPerBlock;
 int regsPerBlock;
 int warpSize;
 int maxThreadsPerBlock;
 int blockDim[3];
 int gridDim[3];
 int memPitch;
 int clockRate;
 int gpuOverlap;
 int textureAlign;
 int kernelExecTimeoutEnabled;
 int integrated;
 int canMapHostMemory;
 int computeMode;
 CUresult rc;
 int count = 0;
 int i = 0;

 /* Init */
 rc=cuInit(0);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "CUDA init error./n");
  return false;
 }

 /* Get device num */
 rc=cuDeviceGetCount(&count);
 if((rc!=CUDA_SUCCESS)|| (count==0))
 {
  fprintf(stderr, "There is no device supporting CUDA./n");
  return false;
 }

 /* Get API version */
 rc=cuDriverGetVersion(&driverVersion);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuDriverGetVersion./n");
  return false;
 }
 printf("CUDA Driver Version: %d.%d/n/n", driverVersion/1000, driverVersion%100);

 /* Get device info */
 for(i=0; i<count; i++)
 {
  rc=cuDeviceComputeCapability(&major, &minor,i);
  if(rc==CUDA_SUCCESS)
  {
   if ((major == 9999) && (minor == 9999))
   { /* emu device */
    fprintf(stderr, "Find the emu device./n");
    continue;
   }

   /* display the info */
   rc=cuDeviceGetName(deviceName, 256, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetName(%ld)./n", i);
    continue;
   }
   printf("Device %ld: /"%s/"/n", i, deviceName);
   printf("  CUDA Capability Major revision number: %d/n", major);
   printf("  CUDA Capability Minor revision number: %d/n", minor);

   rc=cuDeviceTotalMem(&totalGlobalMem, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceTotalMem(%ld)./n", i);
    continue;
   }
   printf("  Total amount of global memory: %u bytes/n", totalGlobalMem);

   rc=cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Number of multiprocessors: %d/n", multiProcessorCount);
   printf("  Number of cores: %d/n", nGpuArchCoresPerSM[major] * multiProcessorCount);
   
   rc=cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
         printf("  Total amount of constant memory: %u bytes/n", totalConstantMemory);

   rc=cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
         printf("  Total amount of shared memory per block: %u bytes/n", sharedMemPerBlock);

   rc=cuDeviceGetAttribute( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
         printf("  Total number of registers available per block: %d/n", regsPerBlock);

   rc=cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
            printf("  Warp size: %d/n", warpSize);

   rc=cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
    printf("  Maximum number of threads per block: %d/n", maxThreadsPerBlock);

   rc=cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   rc=cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   rc=cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
         printf("  Maximum sizes of each dimension of a block: %d x %d x %d/n", blockDim[0], blockDim[1], blockDim[2]);

   rc=cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   rc=cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   rc=cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
         printf("  Maximum sizes of each dimension of a grid: %d x %d x %d/n", gridDim[0], gridDim[1], gridDim[2]);

   rc=cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Maximum memory pitch: %u bytes/n", memPitch);

   rc=cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Texture alignment: %u bytes/n", textureAlign);

   rc=cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Clock rate: %.2f GHz/n", clockRate * 1e-6f);

   rc=cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Concurrent copy and execution: %s/n",gpuOverlap ? "Yes" : "No");

   rc=cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
      printf("  Run time limit on kernels: %s/n", kernelExecTimeoutEnabled ? "Yes" : "No");

   rc=cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Integrated: %s/n", integrated ? "Yes" : "No");

   rc=cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Support host page-locked memory mapping: %s/n", canMapHostMemory ? "Yes" : "No");

   rc=cuDeviceGetAttribute( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, i);
   if(rc!=CUDA_SUCCESS)
   {
    fprintf(stderr, "There is error on cuDeviceGetAttribute(%ld)./n", i);
    continue;
   }
   printf("  Compute mode: %s/n", computeMode == 0 ?
                           "Default (multiple host threads can use this device)" :
                              computeMode == 1 ?
         "Exclusive (only one host thread at a time can use this device)" :
                              computeMode == 2 ?
         "Prohibited (no host thread can use this device)" :
         "Unknown");
  }
 }


 return true;
}

/************************************************************************/
/* HelloCUDA                                                                                                           */
/************************************************************************/
int main(int argc, char* argv[])
{
 CUdevice dev;
 CUcontext ctx;
 CUdeviceptr device_result =0;
 CUmodule module;
 CUfunction func;
 int offset;
 void* paramptr;
 int paramint;
 CUresult rc;

 char *codemem=0;
 long codesize=0;
 int fp;
    struct _stat fstatus;
 int rt;

 CUevent startevent;
 CUevent stopevent;
 char host_result[12] ={0};
 float tm;
 float tottm;
 int i;

 if(!InitCUDA()) {
  return 0;
 }
 printf("CUDA initialized./n");

 /* Get the handle */
 rc=cuDeviceGet(&dev, 0);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuDeviceGet(%ld)./n", rc);
  return 0;
 }

 /* Get cubin file size */
 fp=_open("kernel.cubin", _O_BINARY|_O_RDONLY);
 if(fp<0)
 {
  fprintf(stderr, "There is error on fopen(kernel.cubin)./n");
  /* free all resource */
  return 0;
 }
 rt=_fstat(fp, &fstatus);
 if(rt!=0)
 {
  fprintf(stderr, "There is error on _fstat(kernel.cubin)./n");
  /* free all resource */
  _close(fp);
  return 0;
 }
 codesize=fstatus.st_size;

 /* Alloc code memory */
 codemem=(char *)malloc(codesize+1);
 if(codemem==NULL)
 {
  fprintf(stderr, "There is error on malloc(%ld)./n", codesize+1);
  /* free all resource */
  _close(fp);
  return 0;
 }
 codemem[codesize]=0;

 /* Read cubin file */
 rt=_read(fp, codemem, codesize);
 if(rt!=codesize)
 {
  fprintf(stderr, "There is error on _read(kernel.cubin)(%ld)./n", rt);
  /* free all resource */
  free(codemem);
  _close(fp);
  return 0;
 }

 /* Close the cubin file */
 rt=_close(fp);
 if(rt!=0)
 {
  fprintf(stderr, "There is error on _close(kernel.cubin)./n");
  /* free all resource */
  free(codemem);
  return 0;
 }

 /* Create the context */
 /* Normal flag is CU_CTX_SCHED_AUTO */
 rc=cuCtxCreate(&ctx, CU_CTX_BLOCKING_SYNC, dev);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuCtxCreate(%ld)./n", rc);
  free(codemem);
  return 0;
 }

 /* Load cuda module */
 rc=cuModuleLoadData(&module, codemem);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuModuleLoadData(%ld)./n", rc);
  /* free all resource */
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Malloc gpu memory */
 rc=cuMemAlloc(&device_result, sizeof(char) * 10);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuMemAlloc(%ld)./n", rc);
  /* free all resource */
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Create a event for start time point */
 rc=cuEventCreate(&startevent, CU_EVENT_DEFAULT);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuEventCreate(startevent)(%ld)./n", rc);
  /* free all resource */
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Create a event for stop time point */
  rc=cuEventCreate(&stopevent, CU_EVENT_DEFAULT);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuEventCreate(stopevent)(%ld)./n", rc);
  /* free all resource */
  cuEventDestroy(startevent);
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Get the function */
 rc=cuModuleGetFunction(&func, module, "_Z7GPUMainPci");
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuModuleGetFunction(GPUMain)(%ld)./n", rc);
  /* free all resource */
  cuEventDestroy(stopevent);
  cuEventDestroy(startevent);
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* run kernel for 10000 times */
 tottm=0.0;
 for(i=0; i<10000; i++)
 { /* Set param 1 */
  offset=0;
  /* void* should be used to determine CUdeviceptr’s alignment */
  paramptr=(void*)(size_t)device_result;
  ALIGN_UP(offset, __alignof(void *));
  rc=cuParamSetv(func, offset, &paramptr, sizeof(paramptr));
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuParamSetv(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }
  offset += sizeof(paramptr);
  /* Set param 2 */
  paramint=10;
  ALIGN_UP(offset, __alignof(int));
  rc=cuParamSeti(func, offset, paramint);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuParamSeti(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }
  offset += sizeof(paramint);
  /* Set param size */
  rc=cuParamSetSize(func, offset);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuParamSetSize(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }
  /* Set block size */
  rc=cuFuncSetBlockShape(func, 1, 1, 1);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuFuncSetBlockShape(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }

  /* Record the begin time */
  /* kernel is default at stream 0 */
  rc=cuEventRecord(startevent, 0);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuEventRecord(startevent)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }

  /* Lauch grid */
  rc=cuLaunchGrid(func, 1, 1);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuLaunchGrid(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }

  /* Waiting for the kernel finish */
  rc=cuCtxSynchronize();
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuCtxSynchronize(GPUMain)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }

  /* Record the end time */
  rc=cuEventRecord(stopevent, 0);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuEventRecord(stopevent)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }
  /* Wait for stop event is actually recorded */
  rc=cuEventSynchronize(stopevent);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuEventSynchronize(stopevent)(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }

  /* Calc the time of processing */
  rc=cuEventElapsedTime(&tm, startevent, stopevent);
  if(rc!=CUDA_SUCCESS)
  {
   fprintf(stderr, "There is error on cuEventElapsedTime(%ld)./n", rc);
   /* free all resource */
   cuEventDestroy(stopevent);
   cuEventDestroy(startevent);
   cuMemFree(device_result);
   cuModuleUnload(module);
   free(codemem);
   cuCtxDestroy(ctx);
   return 0;
  }
  tottm+=tm;
 }

 /* Copy the result */
 rc=cuMemcpyDtoH(host_result, device_result, sizeof(char)*10);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuMemcpyDtoH(%ld)./n", rc);
  /* free all resource */
  cuEventDestroy(stopevent);
  cuEventDestroy(startevent);
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 printf("Processing time: %f (ms)/n", tottm/i);

 /* Free the event */
 rc=cuEventDestroy(stopevent);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuEventDestroy(stopevent)(%ld)./n", rc);
  /* free all resource */
  cuEventDestroy(startevent);
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }
 rc=cuEventDestroy(startevent);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuEventDestroy(startevent)(%ld)./n", rc);
  /* free all resource */
  cuMemFree(device_result);
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Free gpu memory */
 rc=cuMemFree(device_result);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuMemFree(%ld)./n", rc);
  /* free all resource */
  cuModuleUnload(module);
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Unload cuda module */
 rc=cuModuleUnload(module);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuModuleUnload(%ld)./n", rc);
  /* free all resource */
  free(codemem);
  cuCtxDestroy(ctx);
  return 0;
 }

 /* Free code memory */
 free(codemem);

 /* Destory the context */
 rc=cuCtxDestroy(ctx);
 if(rc!=CUDA_SUCCESS)
 {
  fprintf(stderr, "There is error on cuCtxDestroy(%ld)./n", rc);
  return 0;
 }

 printf("%s/n", host_result);

 printf("CUDA Sample is finished./n");

 return 0;
}

感谢cuda2010所指出的错误!已做了修改(文中红字部分).