AMD-SDK的学习[6]---BlackScholes与BlackScholesDP

来源：互联网发布：酷听说软件下载编辑：程序博客网时间：2024/06/05 20:34

这个是关于什么期货公式的，因为我看了下没有什么依赖性，的确可以改成并行。AMD还真是兴趣广泛，无论是学术上的知识还是商业上的公式都搜罗来写成OpenCL形式，不错。

依旧改成我习惯看的形式：

#include <CL/cl.h>#include <CL/cl_ext.h>#include <stdlib.h>#include <string.h>#include <malloc.h>#include <stdio.h>#include "n_needed_headers/oclUtils.h"#include "a_needed_headers/SDKCommon.hpp"using namespace std;#define GROUP_SIZE 256//  Constants#define S_LOWER_LIMIT 10.0f#define S_UPPER_LIMIT 100.0f#define K_LOWER_LIMIT 10.0f#define K_UPPER_LIMIT 100.0f#define T_LOWER_LIMIT 1.0f#define T_UPPER_LIMIT 10.0f#define R_LOWER_LIMIT 0.01f#define R_UPPER_LIMIT 0.05f#define SIGMA_LOWER_LIMIT 0.01f#define SIGMA_UPPER_LIMIT 0.10ffloat phi(float X);int main(){//////////////////////////////////////////////////////////////set up OpenCL...cl_uint platformNum;cl_int status;status=clGetPlatformIDs(0,NULL,&platformNum);if(status!=CL_SUCCESS){printf("cannot get platforms number.\n");return -1;}cl_platform_id* platforms;platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);status=clGetPlatformIDs(platformNum,platforms,NULL);if(status!=CL_SUCCESS){printf("cannot get platforms addresses.\n");return -1;}cl_platform_id platformInUse=platforms[0];cl_device_id device;status=clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_DEFAULT,1,&device,NULL);cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&status);cl_command_queue_properties prop=0; //CL_QUEUE_PROFILING_ENABLE;cl_command_queue_properties *propers;propers=∝cl_command_queue commandQueue=clCreateCommandQueueWithProperties(context,device,propers, &status);std::ifstream srcFile("/home/jumper/OpenCL_projects/AMD-Sample-BlackScholes/BlackScholes_Kernels.cl");std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));const char * src = srcProg.c_str();size_t srclength = srcProg.length();cl_program program=clCreateProgramWithSource(context,1,&src,&srclength,&status);status=clBuildProgram(program,1,&device,NULL,NULL,&status);if (status != CL_SUCCESS) { cout<<"error:Build BasicDebug_Kernel()..."<<endl; shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR); oclLogBuildInfo(program, oclGetFirstDev(context)); oclLogPtx(program, oclGetFirstDev(context), "ocldiary.ptx"); return(EXIT_FAILURE); }//////////////////////////////////////////////////////prepare host datacl_int samples(256 * 256 * 4);size_t blockSizeX(1), blockSizeY(1);cl_int width = 64,height = 64;// Calculate width and height from samplessamples = samples / 4;samples = (samples / GROUP_SIZE)? (samples / GROUP_SIZE) * GROUP_SIZE:GROUP_SIZE;unsigned int tempVar1 = (unsigned int)sqrt((double)samples);tempVar1 = (tempVar1 / GROUP_SIZE)? (tempVar1 / GROUP_SIZE) * GROUP_SIZE:GROUP_SIZE;samples = tempVar1 * tempVar1;width = tempVar1;height = width;cl_float* randArray = (cl_float*)memalign(16, width * height * sizeof(cl_float4));//can exchange between cl_float4 and float freely....for(int i = 0; i < width * height * 4; i++){randArray[i] = (float)rand() / (float)RAND_MAX;}cl_float* deviceCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));memset(deviceCallPrice, 0, width * height * sizeof(cl_float4));cl_float* devicePutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));memset(devicePutPrice, 0, width * height * sizeof(cl_float4));//////////////////////////////////////////////////////prepare gpu datacl_mem randBuf = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,sizeof(cl_float4) * width  * height,NULL, &status);CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuf)");cl_mem callPriceBuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float4) * width * height,NULL,&status);CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (callPriceBuf)");cl_mem putPriceBuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float4) * width * height,NULL, &status);CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (putPriceBuf)");size_t globalThreads[2] = {width, height};size_t localThreads[2] = {blockSizeX, blockSizeY};cl_kernel kernel;bool useScalarKernel=false; //or true...if (useScalarKernel){kernel = clCreateKernel(program, "blackScholes_scalar", &status);globalThreads[0]*=4;}else{kernel = clCreateKernel(program, "blackScholes", &status);}CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel)");cl_event inMapEvt;void* mapPtr = clEnqueueMapBuffer(commandQueue,randBuf,CL_FALSE,CL_MAP_WRITE,0, sizeof(cl_float4) * width  * height,0,NULL,&inMapEvt,&status);CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (mapPtr)");status = clFlush(commandQueue);CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");status=clWaitForEvents(1,&inMapEvt);status = clReleaseEvent(inMapEvt);CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt) Failed");memcpy(mapPtr, randArray, sizeof(cl_float4) * width  * height);cl_event unmapEvent;status = clEnqueueUnmapMemObject(commandQueue,randBuf,mapPtr,0,NULL, &unmapEvent);CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject failed. (randBuf)");status = clFlush(commandQueue);CHECK_OPENCL_ERROR(status, "clFlush failed. (randBuf)");status=clWaitForEvents(1,&unmapEvent);status = clReleaseEvent(unmapEvent);CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(unmapEvent) Failed");// whether sort is to be in increasing order. CL_TRUE implies increasingstatus = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&randBuf);CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (randBuf)");int rowStride = useScalarKernel?width*4:width;status = clSetKernelArg(kernel, 1, sizeof(rowStride), (const void *)&rowStride);CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width)");status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&callPriceBuf);CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (callPriceBuf)");status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&putPriceBuf);CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (putPriceBuf)");// Enqueue a kernel run call.status = clEnqueueNDRangeKernel(commandQueue,kernel,2,NULL,globalThreads,localThreads,0,NULL,NULL);CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");// wait for the kernel call to finish executionstatus = clFinish(commandQueue);CHECK_OPENCL_ERROR(status, "clFinish failed.");cl_event callEvent;cl_event putEvent;// Enqueue the results to application pointerstatus = clEnqueueReadBuffer(commandQueue,callPriceBuf,CL_FALSE,0, width * height * sizeof(cl_float4), deviceCallPrice,0,NULL,&callEvent);CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");// Enqueue the results to application pointerstatus = clEnqueueReadBuffer(commandQueue,putPriceBuf,CL_FALSE,0,width * height * sizeof(cl_float4),devicePutPrice,0, NULL,&putEvent);CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");status = clFlush(commandQueue);CHECK_OPENCL_ERROR(status, "clFlush(commanQueue) failed.");status=clWaitForEvents(1,&callEvent);status = clReleaseEvent(callEvent);CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(callEvent) Failed");status=clWaitForEvents(1,&putEvent);status = clReleaseEvent(putEvent);CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(putEvent) Failed");/////////////////////////////////////////////////////////////////////////////CPU result....cl_float* hostCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));memset(hostCallPrice, 0, width * height * sizeof(cl_float4));cl_float* hostPutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));memset(hostPutPrice, 0, width * height * sizeof(cl_float4));int y;for (y = 0; y < width * height * 4; ++y){float d1, d2;float sigmaSqrtT;float KexpMinusRT;float s = S_LOWER_LIMIT * randArray[y] + S_UPPER_LIMIT * (1.0f - randArray[y]);float k = K_LOWER_LIMIT * randArray[y] + K_UPPER_LIMIT * (1.0f - randArray[y]);float t = T_LOWER_LIMIT * randArray[y] + T_UPPER_LIMIT * (1.0f - randArray[y]);float r = R_LOWER_LIMIT * randArray[y] + R_UPPER_LIMIT * (1.0f - randArray[y]);float sigma = SIGMA_LOWER_LIMIT * randArray[y] + SIGMA_UPPER_LIMIT *(1.0f - randArray[y]);sigmaSqrtT = sigma * sqrt(t);d1 = (log(s / k) + (r + sigma * sigma / 2.0f) * t) / sigmaSqrtT;d2 = d1 - sigmaSqrtT;KexpMinusRT = k * exp(-r * t);hostCallPrice[y] = s * phi(d1) - KexpMinusRT * phi(d2);hostPutPrice[y]  = KexpMinusRT * phi(-d2) - s * phi(-d1);}//print CPU & GPU result...for (y = 0; y < width * height * 4; ++y){printf("CallPrice:CPU-result---%f  GPU-result---%f  PutPrice:CPU-result---%f  GPU-result---%f\n",hostCallPrice[y],deviceCallPrice[y],hostPutPrice[y],devicePutPrice[y]);}status = clReleaseKernel(kernel);CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.");status = clReleaseProgram(program);CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");status = clReleaseMemObject(randBuf);CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");status = clReleaseMemObject(callPriceBuf);CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");status = clReleaseMemObject(putPriceBuf);CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");status = clReleaseCommandQueue(commandQueue);CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.");status = clReleaseContext(context);CHECK_OPENCL_ERROR(status, "clReleaseContext failed.");status=clReleaseDevice(device);FREE(randArray);FREE(deviceCallPrice);FREE(devicePutPrice);FREE(hostCallPrice);FREE(hostPutPrice);return 0;}float phi(float X){    float y, absX, t;    // the coeffs    const float c1 =  0.319381530f;    const float c2 = -0.356563782f;    const float c3 =  1.781477937f;    const float c4 = -1.821255978f;    const float c5 =  1.330274429f;    const float oneBySqrt2pi = 0.398942280f;    absX = fabs(X);    t = 1.0f / (1.0f + 0.2316419f * absX);    y = 1.0f - oneBySqrt2pi * exp(-X * X / 2.0f) *t * (c1 + t * (c2 +t * (c3 +t * (c4 + t * c5))));    return (X < 0) ? (1.0f - y) : y;}

cl部分我没有改动：

#define S_LOWER_LIMIT 10.0f#define S_UPPER_LIMIT 100.0f#define K_LOWER_LIMIT 10.0f#define K_UPPER_LIMIT 100.0f#define T_LOWER_LIMIT 1.0f#define T_UPPER_LIMIT 10.0f#define R_LOWER_LIMIT 0.01f#define R_UPPER_LIMIT 0.05f#define SIGMA_LOWER_LIMIT 0.01f#define SIGMA_UPPER_LIMIT 0.10f/** * @brief   Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function) * @param   X input value * @param   phi pointer to store calculated CND of X */void phi(float4 X, float4* phi){    float4 y;    float4 absX;    float4 t;    float4 result;    const float4 c1 = (float4)0.319381530f;    const float4 c2 = (float4)-0.356563782f;    const float4 c3 = (float4)1.781477937f;    const float4 c4 = (float4)-1.821255978f;    const float4 c5 = (float4)1.330274429f;    const float4 zero = (float4)0.0f;    const float4 one = (float4)1.0f;    const float4 two = (float4)2.0f;    const float4 temp4 = (float4)0.2316419f;    const float4 oneBySqrt2pi = (float4)0.398942280f;    absX = fabs(X);    t = one/(one + temp4 * absX);    y = one - oneBySqrt2pi * exp(-X*X/two) * t* (c1 + t* (c2 + t* (c3 + t* (c4 + t * c5))));    result = (X < zero)? (one - y) : y;    *phi = result;}/* * @brief   Calculates the call and put prices by using Black Scholes model * @param   s       Array of random values of current option price * @param   sigma   Array of random values sigma * @param   k       Array of random values strike price * @param   t       Array of random values of expiration time * @param   r       Array of random values of risk free interest rate * @param   width   Width of call price or put price array * @param   call    Array of calculated call price values * @param   put     Array of calculated put price values */__kernel voidblackScholes(const __global float4 *randArray,             int width,             __global float4 *call,             __global float4 *put){    float4 d1, d2;    float4 phiD1, phiD2;    float4 sigmaSqrtT;    float4 KexpMinusRT;        size_t xPos = get_global_id(0);    size_t yPos = get_global_id(1);    float4 two = (float4)2.0f;    float4 inRand = randArray[yPos * width + xPos];    float4 S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand);    float4 K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand);    float4 T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand);    float4 R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand);    float4 sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand);    sigmaSqrtT = sigmaVal * sqrt(T);    d1 = (log(S/K) + (R + sigmaVal * sigmaVal / two)* T)/ sigmaSqrtT;    d2 = d1 - sigmaSqrtT;    KexpMinusRT = K * exp(-R * T);    phi(d1, &phiD1);    phi(d2, &phiD2);    call[yPos * width + xPos] = S * phiD1 - KexpMinusRT * phiD2;    phi(-d1, &phiD1);    phi(-d2, &phiD2);    put[yPos * width + xPos]  = KexpMinusRT * phiD2 - S * phiD1;}void phi_scalar(float X, float* phi){    float y;    float absX;    float t;    float result;    const float c1 = (float)0.319381530f;    const float c2 = (float)-0.356563782f;    const float c3 = (float)1.781477937f;    const float c4 = (float)-1.821255978f;    const float c5 = (float)1.330274429f;    const float zero = (float)0.0f;    const float one = (float)1.0f;    const float two = (float)2.0f;    const float temp4 = (float)0.2316419f;    const float oneBySqrt2pi = (float)0.398942280f;    absX = fabs(X);    t = one/(one + temp4 * absX);    y = one - oneBySqrt2pi * exp(-X*X/two) * t * (c1 + t* (c2 + t* (c3 + t* (c4 + t * c5))));    result = (X < zero)? (one - y) : y;    *phi = result;}/* * @brief   Calculates the call and put prices by using Black Scholes model * @param   s       Array of random values of current option price * @param   sigma   Array of random values sigma * @param   k       Array of random values strike price * @param   t       Array of random values of expiration time * @param   r       Array of random values of risk free interest rate * @param   width   Width of call price or put price array * @param   call    Array of calculated call price values * @param   put     Array of calculated put price values */__kernel voidblackScholes_scalar(const __global float *randArray,             int width,             __global float *call,             __global float *put){    float d1, d2;    float phiD1, phiD2;    float sigmaSqrtT;    float KexpMinusRT;        size_t xPos = get_global_id(0);    size_t yPos = get_global_id(1);    float two = (float)2.0f;    float inRand = randArray[yPos * width + xPos];    float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand);    float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand);    float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand);    float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand);    float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand);    sigmaSqrtT = sigmaVal * sqrt(T);    d1 = (log(S/K) + (R + sigmaVal * sigmaVal / two)* T)/ sigmaSqrtT;    d2 = d1 - sigmaSqrtT;    KexpMinusRT = K * exp(-R * T);    phi_scalar(d1, &phiD1);    phi_scalar(d2, &phiD2);    call[yPos * width + xPos] = S * phiD1 - KexpMinusRT * phiD2;    phi_scalar(-d1, &phiD1);    phi_scalar(-d2, &phiD2);    put[yPos * width + xPos]  = KexpMinusRT * phiD2 - S * phiD1;}

两种情况都运行了一下，CPU与GPU结果都是一致。至于这种期货什么的公式我不想说什么，这两个kernel的差别就是一个用矢量float4；一个用标量float；我用CodeXL看了事实证明同样条件下，矢量更快！！！

看标量的kernel运行时间是12.3526ms；但矢量的只需要10.9099ms！！！我习惯用标量的习惯看来得改过来。。。
这个例子很简单，不多说什么。

我需要学习的部分：

1、依旧是使用memalign()分配内存，可以在矢量和对应标量之间随意切换，并且是对齐的！

2、CL_MEM_USE_PERSISTENT_MEM_AMD的巨大优点我依旧还不太明白，因为显然还是需要那么大的randArray（host端）和randBuf（gpu端）啊？

于是我按照例子写上CL_MEM_USE_PERSISTENT_MEM_AMD 和去掉CL_MEM_USE_PERSISTENT_MEM_AMD 进行了一次对比：

看这是例子中有CL_MEM_USE_PERSISTENT_MEM_AMD 标志的，DataTransfer这一栏的紫色很少，而且Device Time没有；而下面当我去掉CL_MEM_USE_PERSISTENT_MEM_AMD标志时，Data Transfer的紫色多了，而且出现了Device Time为0.6416ms！！！

这就是这个flag的优点吧：即GPU上没有真正发生dataTransfer ？？？！

大神说：依然会传输的, 但如同zero-copy, 内存后备的buffer, 总是能让GPU的指令执行和回传overlap一样,此种显存, 能让你的CPU在执行指令的时候同时传输.这样前者分别能进行的同时计算和回传并行 + 同时减少一次无辜的显存写入和一次无辜的显存读取; 后者能让CPU的运算和正向传输并行 + 同时减少内存的写入和读取,所以这两个会快.
需要说明的是, 前者的第二部分因素往往被人无视, 而只说前者的第一部分(GPU的指令运算和回传并行overlap), 这是因为前者的第二部分往往不会造成瓶颈(相比PCI-E和内存, 显存吞吐率太高了) 但在CPU, 你应当同时考虑这两个因素.

3、注意到例子将数据传回host显示是用的ReadBuffer（肯定有device Time）而不是MapBuffer，我以为MapBuffer不会有device Time，于是传回时我也改成MapBuffer，时间竟然比ReadBuffer多很多？！看来之前memory flag的那9个flags的区别联系我还没真正弄懂？！

****************************************************************************************************

至于工程BlackScholesDP与上面类似，只是将float换成了double而已，但差别大大的：

57ms！！！当然毕竟是2MB咯...

阅读全文

0 0