Nvidia-OpenCL-SDK-Code-Samples的学习[3]
来源:互联网 发布:淘宝如何开虚拟店 编辑:程序博客网 时间:2024/06/07 18:48
接下来就是 MultiGPU 和 OpenGLInterop 但这两个工程先跳过。因为我电脑里只有一个显卡,然后与OpenGL交互暂时用不到。
所以接下来就是OpenCL Scan这个工程了,乍一看感觉和之前的都很不一样啊,感觉很难的样子。
cl文件:
////////////////////////////////////////////////////////////////////////////////
// Scan codelets
////////////////////////////////////////////////////////////////////////////////
#if(1)
    // Naive inclusive scan: O(N * log2(N)) operations.
    // Allocates 2 * 'size' local memory; the first half is initialized with
    // 'size' zeros, which avoids an if(pos >= offset) test inside the loop
    // and saves instructions.
    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        // pos lands in the upper half; the lower half stays zero-padded.
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        // Hillis-Steele style doubling: after the pass for 'offset', each slot
        // holds the sum of the 2*offset trailing elements.
        for(uint offset = 1; offset < size; offset <<= 1){
            barrier(CLK_LOCAL_MEM_FENCE);
            uint t = l_Data[pos] + l_Data[pos - offset];
            barrier(CLK_LOCAL_MEM_FENCE);
            l_Data[pos] = t;
        }

        return l_Data[pos];
    }

    // Exclusive scan = inclusive scan minus the element itself.
    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }
#else
    #define LOG2_WARP_SIZE 5U
    #define      WARP_SIZE (1U << LOG2_WARP_SIZE)

    // Almost the same as the naive scan1Inclusive, but needs no barriers and
    // works only for size <= WARP_SIZE (relies on warp-synchronous execution;
    // hence the volatile qualifier).
    inline uint warpScanInclusive(uint idata, volatile __local uint *l_Data, uint size){
        uint pos = 2 * get_local_id(0) - (get_local_id(0) & (size - 1));
        l_Data[pos] = 0;
        pos += size;
        l_Data[pos] = idata;

        if(size >=  2) l_Data[pos] += l_Data[pos -  1];
        if(size >=  4) l_Data[pos] += l_Data[pos -  2];
        if(size >=  8) l_Data[pos] += l_Data[pos -  4];
        if(size >= 16) l_Data[pos] += l_Data[pos -  8];
        if(size >= 32) l_Data[pos] += l_Data[pos - 16];

        return l_Data[pos];
    }

    inline uint warpScanExclusive(uint idata, __local uint *l_Data, uint size){
        return warpScanInclusive(idata, l_Data, size) - idata;
    }

    inline uint scan1Inclusive(uint idata, __local uint *l_Data, uint size){
        if(size > WARP_SIZE){
            // Bottom-level inclusive warp scan
            uint warpResult = warpScanInclusive(idata, l_Data, WARP_SIZE);

            // Save top elements of each warp for exclusive warp scan;
            // sync to wait for warp scans to complete (l_Data is overwritten)
            barrier(CLK_LOCAL_MEM_FENCE);
            if( (get_local_id(0) & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
                l_Data[get_local_id(0) >> LOG2_WARP_SIZE] = warpResult;

            // Wait for warp scans to complete
            barrier(CLK_LOCAL_MEM_FENCE);
            if( get_local_id(0) < (WORKGROUP_SIZE / WARP_SIZE) ){
                // Grab top warp elements
                uint val = l_Data[get_local_id(0)];
                // Calculate exclusive scan and write back to shared memory
                l_Data[get_local_id(0)] = warpScanExclusive(val, l_Data, size >> LOG2_WARP_SIZE);
            }

            // Return updated warp scans with exclusive scan results
            barrier(CLK_LOCAL_MEM_FENCE);
            return warpResult + l_Data[get_local_id(0) >> LOG2_WARP_SIZE];
        }else{
            return warpScanInclusive(idata, l_Data, size);
        }
    }

    inline uint scan1Exclusive(uint idata, __local uint *l_Data, uint size){
        return scan1Inclusive(idata, l_Data, size) - idata;
    }
#endif

// Vector scan: the array to be scanned is stored
// in work-item private memory as uint4
inline uint4 scan4Inclusive(uint4 data4, __local uint *l_Data, uint size){
    // Level-0 inclusive scan (within the 4-element vector)
    data4.y += data4.x;
    data4.z += data4.y;
    data4.w += data4.z;

    // Level-1 exclusive scan: prefix of all preceding vectors
    uint val = scan1Inclusive(data4.w, l_Data, size / 4) - data4.w;

    return (data4 + (uint4)val);
}

inline uint4 scan4Exclusive(uint4 data4, __local uint *l_Data, uint size){
    return scan4Inclusive(data4, l_Data, size) - data4;
}

////////////////////////////////////////////////////////////////////////////////
// Scan kernels
////////////////////////////////////////////////////////////////////////////////
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal1(
    __global uint4 *d_Dst,
    __global uint4 *d_Src,
    __local uint *l_Data,
    uint size
){
    // Load data (each work-item handles one uint4, i.e. 4 input elements)
    uint4 idata4 = d_Src[get_global_id(0)];

    // Calculate exclusive scan
    uint4 odata4 = scan4Exclusive(idata4, l_Data, size);

    // Write back
    d_Dst[get_global_id(0)] = odata4;
}

// Exclusive scan of top elements of bottom-level scans (4 * THREADBLOCK_SIZE)
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void scanExclusiveLocal2(
    __global uint *d_Buf,
    __global uint *d_Dst,
    __global uint *d_Src,
    __local uint *l_Data,
    uint N,
    uint arrayLength
){
    // Load top elements.
    // Convert results of bottom-level scan back to inclusive.
    // Skip loads and stores for inactive work-items of the work-group
    // with highest index (pos >= N).
    uint data = 0;
    if(get_global_id(0) < N)
        data =
            d_Dst[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)] +
            d_Src[(4 * WORKGROUP_SIZE - 1) + (4 * WORKGROUP_SIZE) * get_global_id(0)];

    // Compute
    uint odata = scan1Exclusive(data, l_Data, arrayLength);

    // Avoid out-of-bound access
    if(get_global_id(0) < N)
        d_Buf[get_global_id(0)] = odata;
}

// Final step of large-array scan: combine basic inclusive scan with
// exclusive scan of top elements of input arrays
__kernel __attribute__((reqd_work_group_size(WORKGROUP_SIZE, 1, 1)))
void uniformUpdate(
    __global uint4 *d_Data,
    __global uint *d_Buf
){
    __local uint buf[1];

    uint4 data4 = d_Data[get_global_id(0)];
    if(get_local_id(0) == 0)
        buf[0] = d_Buf[get_group_id(0)];

    barrier(CLK_LOCAL_MEM_FENCE);
    data4 += (uint4)buf[0];
    d_Data[get_global_id(0)] = data4;
}
main函数部分:
int main(int argc, const char **argv){ shrQAStart(argc, (char **)argv); // Start logs shrSetLogFileName ("oclScan.txt"); shrLog("%s Starting...\n\n", argv[0]); cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; uint *h_Input, *h_OutputCPU, *h_OutputGPU; const uint N = 13 * 1048576 / 2; shrLog("Allocating and initializing host arrays...\n"); h_Input = (uint *)malloc(N * sizeof(uint)); h_OutputCPU = (uint *)malloc(N * sizeof(uint)); h_OutputGPU = (uint *)malloc(N * sizeof(uint)); srand(2009); for(uint i = 0; i < N; i++) h_Input[i] = rand(); shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL scan...\n"); initScan(cxGPUContext, cqCommandQueue, argv); shrLog("Creating OpenCL memory objects...\n\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N * sizeof(uint), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, N * sizeof(uint), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); int globalFlag = 1; // init pass/fail flag to pass size_t szWorkgroup; const int iCycles = 100; shrLog("*** Running GPU scan for short arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_SHORT_ARRAY_SIZE; arrayLength <= MAX_SHORT_ARRAY_SIZE; arrayLength 
*= 2) { shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); clFinish(cqCommandQueue); shrDeltaT(0); for (int i = 0; i<iCycles; i++) { szWorkgroup = scanExclusiveShort( cqCommandQueue, d_Output, d_Input, N / arrayLength, arrayLength ); } clFinish(cqCommandQueue); double timerValue = shrDeltaT(0)/(double)iCycles; shrLog("Validating the results...\n"); shrLog(" ...reading back OpenCL memory\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost( h_OutputCPU, h_Input, N / arrayLength, arrayLength ); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? 
"Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; #ifdef GPU_PROFILING if (arrayLength == MAX_SHORT_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Short, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } #endif } shrLog("*** Running GPU scan for large arrays (%d identical iterations)...\n\n", iCycles); for(uint arrayLength = MIN_LARGE_ARRAY_SIZE; arrayLength <= MAX_LARGE_ARRAY_SIZE; arrayLength *= 2) { shrLog("Running scan for %u elements (%u arrays)...\n", arrayLength, N / arrayLength); clFinish(cqCommandQueue); shrDeltaT(0); for (int i = 0; i<iCycles; i++) { szWorkgroup = scanExclusiveLarge( cqCommandQueue, d_Output, d_Input, N / arrayLength, arrayLength ); } clFinish(cqCommandQueue); double timerValue = shrDeltaT(0)/(double)iCycles; shrLog("Validating the results...\n"); shrLog(" ...reading back OpenCL memory\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, N * sizeof(uint), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog(" ...scanExclusiveHost()\n"); scanExclusiveHost( h_OutputCPU, h_Input, N / arrayLength, arrayLength ); // Compare GPU results with CPU results and accumulate error for this test shrLog(" ...comparing the results\n"); int localFlag = 1; for(uint i = 0; i < N; i++) { if(h_OutputCPU[i] != h_OutputGPU[i]) { localFlag = 0; break; } } // Log message on individual test result, then accumulate to global flag shrLog(" ...Results %s\n\n", (localFlag == 1) ? 
"Match" : "DON'T Match !!!"); globalFlag = globalFlag && localFlag; #ifdef GPU_PROFILING if (arrayLength == MAX_LARGE_ARRAY_SIZE) { shrLog("\n"); shrLogEx(LOGBOTH | MASTER, 0, "oclScan-Large, Throughput = %.4f MElements/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", (1.0e-6 * (double)arrayLength/timerValue), timerValue, arrayLength, 1, szWorkgroup); shrLog("\n"); } #endif } shrLog("Shutting down...\n"); //Release kernels and program closeScan(); //Release other OpenCL Objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); // finish // pass or fail (cumulative... all tests in the loop) shrQAFinishExit(argc, (const char **)argv, globalFlag ? QA_PASSED : QA_FAILED); //Finish shrEXIT(argc, argv);}其实就是这个意思,src是个N=6815744大小的数组,同样创建一个这么大的数组,数组中每个值是前面所有位置值的和:就是这个意思。
但有个问题,host端实参h_Input变量是N大小的uint型 创建buffer时也是N大小的uint型变量名是d_Input 将这个d_Input传给kernel函数scanExclusiveLocal1,而kernel那个位置形参是global uint4* d_Src 这个例子中竟然可以直接传过去的???uint直接传给了uint4,甚至没有强转换的???我以为这样直接传是 只传进来给uint4.x 另外的uint4.y、uint4.z和uint4.w都是随机的吗 随机的无意义的值。结果不是这样的!!!!有一个100个uint元素构成的缓冲区(因为int/uint这种,在A卡实现上,和普通的int/uint大小一样,因此就不区分cl_int之类的cl_前缀了)你可以使用uint *p1指向它,那么自然依然有100个有效元素, 即p1[0] - p1[99] 。如果你要使用uint4 *p2指向它,这也是可以的。但只有25个有效元素了,即p2[0] - p2[24]。无效的只是从p2[25] - p2[99]。而不是我想像的p2[0] - p2[99]中,分别只有x元素有效,y/z/w元素无效!!!大神指出这是我第N次在这种内存问题上栽倒了,上次及上上次有2个例子也是这样!
我现在是按照main函数一步步分析:arrayLength=4时,localsize=256,globalsize=N/4=1703936,执行第1个kernel,globalsize个线程去调用scan4Exclusive()函数,因为uint缓冲区按uint4重新解释,所以其实总共是有N个数的。又调用了scan4Inclusive()函数,其实就是每个globalID负责原来的4个uint数,函数scan4Inclusive()中将这4个数逐级相加,其实就是对于这4个数实现了每一个数是前面的数之和。然后函数scan4Inclusive()又调用了scan1Inclusive()函数,我分析到scan1Inclusive()函数里就卡住了。scan1Inclusive()函数用到了localID,因为localsize是256大小,而这里l_Data[pos]=0; pos+=size;(arrayLength=4时传给scan1Inclusive的size=arrayLength/4=1,此时size-1=0,所以pos=2*localID) l_Data[pos]=idata;所以其实相当于每个localID写了2个位置的数,256个localID*2=512,所以l_Data大小为512是没毛病的。只是这个l_Data里偶数位置(2*localID)放的都是0,奇数位置(2*localID+1)放的分别是原数据uint的前4位之和、5-8位之和、9-12位之和、、、、我晕倒了、、、
阅读全文
0 0
- Nvidia-OpenCL-SDK-Code-Samples的学习[3]
- Nvidia-OpenCL-SDK-Code-Samples的学习[1]
- Nvidia-OpenCL-SDK-Code-Samples的学习[2]
- Nvidia-OpenCL-SDK-Code-Samples的学习[4]
- Nvidia-SDK-Code-Sample的学习[5]OverLap、一次乌龙
- opencl之code builder学习教程3
- Android SDK Samples,学习Android的好方法
- OpenCL和CUDA的关系--NVIDIA
- opencl之code builder学习教程1
- opencl之code builder学习教程2
- opencl之code builder学习教程4
- DirectX9 SDK Samples(20) HDRDemo Sample(3)
- Ubuntu14.04下Nvidia opencl例子的编译运行
- nvidia显卡上OpenCL的hello world程序
- OpenCL的学习
- CentOS 7 配置OpenCL环境(安装NVIDIA cuda sdk、Cmake、Eclipse CDT)
- OPencl学习笔记3
- 学习Android SDK Samples之旅-connectivity篇
- 最长公共子序列
- ThinkPHP 3.23 框架执行流程分析
- JAVA数据结构:简单排序(冒泡,选择,插入)
- 在osChina上生成ssh公钥的步骤
- 一致性哈希的浅析
- Nvidia-OpenCL-SDK-Code-Samples的学习[3]
- Linux学习
- UICollectionView
- driver.manage().window().getSize().width 获取屏幕分辨率报错问题
- JavaScript的Global和Math对象
- Spring JdbcTemplate详解
- VS2017编译jsoncpp我出现的问题及怎么解决的
- 一个完整的标准的cassandra连接代码(java版)
- String字符串的方法大全