CUDA 做排列组合

来源:互联网 发布:淘宝无法拍照怎么设置 编辑:程序博客网 时间:2024/06/05 03:18



#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace std;

// Number of parameter combinations produced per kernel launch.
#define ROWS 10
// Number of parameters per combination (rough, n, alpha, musp, mua, q).
#define COLS 6

#if COLS != 6
#error "Kerneltest's per-column parameter tables are written for exactly 6 columns"
#endif

// Evaluate a CUDA runtime call (or an already-obtained cudaError_t) and abort
// with a readable diagnostic if it did not succeed.
#define CHECK(res)                                                            \
    do {                                                                      \
        cudaError_t check_err_ = (res);                                       \
        if (check_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(check_err_));                          \
            exit(-1);                                                         \
        }                                                                     \
    } while (0)

/*
 * Enumerates parameter combinations on the device.
 *
 * The flat combination index (row + ROWS * loop) is decoded as a mixed-radix
 * number: column 5 (q) is the fastest-varying digit, column 0 (rough) the
 * slowest. Each digit is scaled into its parameter range and stored in
 * dp_out_params[row][col].
 *
 * Launch layout: threadIdx.x (plus blockIdx.x * blockDim.x) selects the row,
 * threadIdx.y selects the column. The block may be larger than ROWS x COLS;
 * surplus threads return immediately. No shared memory is used.
 *
 * dp_out_params   : device array of ROWS row pointers, each to COLS doubles.
 * d_out_Iqsd1     : device array of at least ROWS doubles; receives threadIdx.x.
 * d_MTFac         : device array of at least ROWS doubles (read only here).
 * d_errIqsd_MTFac : device array of at least ROWS doubles; receives
 *                   d_out_Iqsd1[row] - d_MTFac[row].
 * loop            : batch number; shifts the combination index by ROWS * loop.
 */
__global__ void Kerneltest(double **dp_out_params, double *d_out_Iqsd1, double *d_MTFac, double *d_errIqsd_MTFac, unsigned int loop)
{
    unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int col = threadIdx.y;

    // Guard every access: the launch may use more threads than there is data.
    // No __syncthreads() is needed anywhere because each thread only touches
    // its own output elements, and a barrier behind this guard would not be
    // reached by all threads of the block.
    if (row >= ROWS || col >= COLS)
    {
        return;
    }

    // Per-column sweep description, indexed by column:
    //                               rough  n    alpha musp mua    q
    const double axisMin[COLS]   = { 0,     0,   0,    0,   0,     0     };
    const double axisMax[COLS]   = { 1,     1,   1,    1,   0.007, 0.027 };
    const int    axisSteps[COLS] = { 100,   100, 100,  100, 100,   10    };

    // Strip the digits belonging to the faster-varying columns (col+1 .. 5).
    unsigned int index = row + ROWS * loop;
    for (unsigned int c = COLS - 1; c > col; --c)
    {
        index /= axisSteps[c];
    }

    int weight = index % axisSteps[col];
    double stepLen = (axisMax[col] - axisMin[col]) / axisSteps[col];
    dp_out_params[row][col] = axisMin[col] + weight * stepLen;

    // Per-row results are written once, by the thread handling column 0.
    if (col == 0)
    {
        double iqsd = threadIdx.x;
        d_out_Iqsd1[row] = iqsd;
        d_errIqsd_MTFac[row] = iqsd - d_MTFac[row];
    }
}

// Allocate a zero-initialised host array of `count` doubles or terminate.
static double *allocHostDoubles(size_t count)
{
    double *p = (double*)calloc(count, sizeof(double));
    if (p == NULL)
    {
        fprintf(stderr, "host allocation of %lu doubles failed\n", (unsigned long)count);
        exit(-1);
    }
    return p;
}

/*
 * Host driver: allocates the buffers, runs Kerneltest once, copies the
 * results back, prints them and writes the parameter table to a text file.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    ////////////////////////////////////////
    const int MTFlen = 4000;

    // Zero-initialised so that the elements beyond the first 100 are defined.
    double *h_MTFac = allocHostDoubles(MTFlen);
    for (int i = 0; i < 100; i++)
    {
        h_MTFac[i] = 3.2;
    }

    double *d_MTFac = NULL;
    CHECK(cudaMalloc((void**)(&d_MTFac), MTFlen * sizeof(double)));
    CHECK(cudaMemcpy((void*)(d_MTFac), (void*)(h_MTFac), MTFlen * sizeof(double), cudaMemcpyHostToDevice));

    double *h_errIqsd_MTFac = allocHostDoubles(MTFlen);
    double *d_errIqsd_MTFac = NULL;
    CHECK(cudaMalloc((void**)(&d_errIqsd_MTFac), MTFlen * sizeof(double)));
    // The kernel only fills the first ROWS entries; clear the rest so the
    // copy-back does not return uninitialised device memory.
    CHECK(cudaMemset((void*)(d_errIqsd_MTFac), 0, MTFlen * sizeof(double)));

    ////////////////////////////////////////
    // Flat ROWS x COLS result matrix plus a device-side table of row pointers.
    double *d_out_params = NULL;
    CHECK(cudaMalloc((void**)(&d_out_params), ROWS * COLS * sizeof(double)));

    double **dp_out_params = NULL;
    CHECK(cudaMalloc((void**)(&dp_out_params), ROWS * sizeof(double*)));

    double **hp_out_params = (double**)malloc(ROWS * sizeof(double*));
    if (hp_out_params == NULL)
    {
        fprintf(stderr, "host allocation of row-pointer table failed\n");
        exit(-1);
    }
    double *h_out_params = allocHostDoubles(ROWS * COLS);

    // These are device addresses assembled on the host; they are only ever
    // dereferenced inside the kernel.
    for (int r = 0; r < ROWS; r++)
    {
        hp_out_params[r] = d_out_params + r * COLS;
    }

    ////////////////////////////////////////
    double *h_out_Iqsd1 = allocHostDoubles(ROWS);
    double *d_out_Iqsd1 = NULL;
    CHECK(cudaMalloc((void**)&d_out_Iqsd1, ROWS * sizeof(double)));

    ////////////////////////////////////////
    CHECK(cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), ROWS * sizeof(double*), cudaMemcpyHostToDevice));

    dim3 dimBlock(15, 6, 1);
    dim3 dimGrid(1, 1, 1);

    for (unsigned int loop = 0; loop < 1; loop++)
    {
        Kerneltest<<<dimGrid, dimBlock>>>(dp_out_params, d_out_Iqsd1, d_MTFac, d_errIqsd_MTFac, loop);
        CHECK(cudaGetLastError());   // launch-configuration errors
        cout<<"loop:  "<<loop<<endl;
    }
    CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    CHECK(cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), ROWS * COLS * sizeof(double), cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy((void*)(h_out_Iqsd1), (void*)(d_out_Iqsd1), ROWS * sizeof(double), cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy((void*)(h_MTFac), (void*)(d_MTFac), MTFlen * sizeof(double), cudaMemcpyDeviceToHost));
    CHECK(cudaMemcpy((void*)(h_errIqsd_MTFac), (void*)(d_errIqsd_MTFac), MTFlen * sizeof(double), cudaMemcpyDeviceToHost));

    ////////////////////////////////////////
    ofstream f1("/home/zlf/Documents/cuda.txt");
    if (!f1)
    {
        // Console output is still produced; only the file copy is lost.
        fprintf(stderr, "warning: could not open output file for writing\n");
    }

    int zz = 0;
    cout<<endl<<"h_out_params: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
        for (int c = 0; c < COLS; c++)
        {
            printf("%f ", h_out_params[r*COLS+c]);
            f1 << h_out_params[r*COLS+c]<<"    ";
        }
        zz = zz + 1;
        cout<<"    行数: "<<zz;
        printf("\n");
        f1<< "    行数: "<<zz<< "\n";
    }
    f1.close();

    cout<<endl<<"h_out_Iqsd1: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
        cout<<h_out_Iqsd1[r]<<"    ";
        if ((r%10)==9)
        {
            cout<<endl;
        }
    }

    /////////////////////////
    cout<<"h_errIqsd_MTFac[i]: "<<endl;
    for (int i = 0; i < MTFlen; i++)
    {
        cout<<h_errIqsd_MTFac[i]<<"  ";
    }
    /////////////////////////
    cout<<zz<<endl;

    cudaFree((void*)d_out_params);
    cudaFree((void*)dp_out_params);
    cudaFree((void*)d_out_Iqsd1);
    cudaFree((void*)d_MTFac);
    cudaFree((void*)d_errIqsd_MTFac);

    free(h_out_params);
    free(hp_out_params);
    free(h_out_Iqsd1);
    free(h_MTFac);
    free(h_errIqsd_MTFac);

    getchar();
    return 0;
}























原创粉丝点击