结构体数组(SoA)与数组结构体(AoS)
来源:互联网 发布:京东数据罗盘供应商 编辑:程序博客网 时间:2024/06/11 04:48
1.结构体数组(SoA)
/*
 * SoA layout: a single structure holding one array per field.
 * LEN (the per-field element count) is defined elsewhere.
 */
struct InnerArray
{
    float x[LEN];
    float y[LEN];
};

/*
 * CPU reference for the SoA layout.
 * For every index below n, writes A->x + 10 into C->x and A->y + 20 into C->y.
 */
void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
{
    int k = 0;

    while (k < n)
    {
        C->x[k] = A->x[k] + 10.f;
        C->y[k] = A->y[k] + 20.f;
        ++k;
    }
}

/*
 * GPU version for the SoA layout: one thread per element.
 * The global index is built from the x-dimension block/thread ids; threads
 * whose index is not below n do nothing.
 */
__global__ void testInnerArrayDevice(InnerArray *data, InnerArray * result, const int n)
{
    const unsigned int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail of the grid.
    if (!(gid < n))
    {
        return;
    }

    // Read both fields first, then store both results.
    const float shiftedX = data->x[gid] + 10.f;
    const float shiftedY = data->y[gid] + 20.f;

    result->x[gid] = shiftedX;
    result->y[gid] = shiftedY;
}
2.数组结构体(AoS)
/*
 * AoS layout: each element carries its own x and y side by side.
 */
struct innerStruct
{
    float x;
    float y;
};

/*
 * CPU reference for the AoS layout.
 * For every index below n, writes A[i].x + 10 into C[i].x and A[i].y + 20
 * into C[i].y.
 */
void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
{
    innerStruct *src = A;
    innerStruct *dst = C;

    for (int k = 0; k < n; ++k, ++src, ++dst)
    {
        dst->x = src->x + 10.f;
        dst->y = src->y + 20.f;
    }
}

/*
 * GPU version for the AoS layout: one thread per element.
 * Each thread copies its whole element into a local, updates both fields and
 * stores the element back; threads whose index is not below n do nothing.
 */
__global__ void testInnerStructDevice(innerStruct *data, innerStruct * result, const int n)
{
    const unsigned int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard the tail of the grid.
    if (!(gid < n))
    {
        return;
    }

    innerStruct elem = data[gid];
    elem.x = elem.x + 10.f;
    elem.y = elem.y + 20.f;
    result[gid] = elem;
}
3.结构体数组(SoA)与数组结构体(AoS)二者的区别
许多并行编程范式,尤其是SIMD(单指令多数据)型范式,更倾向于使用SoA。在CUDA C编程中也普遍倾向于使用SoA,因为其数据元素是为全局内存的有效合并访问而预先排列好的:被同一内存操作引用的同字段数据元素在存储时是彼此相邻的。
4.源代码示例(摘自《CUDA C编程中文翻译版本》,如有侵权,请联系删除)。源码网址:点击打开链接
SoA
#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * A simple example of using a structure of arrays to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
 *
 * SoA: contiguous reads for x and y
 *
 * NOTE(review): CHECK() and seconds() are not defined in this listing; they are
 * presumably provided by ../common/common.h.
 */

// Parenthesized so the macro stays a single value inside larger expressions.
#define LEN (1 << 22)

// One array per field; every field holds LEN floats.
struct InnerArray
{
    float x[LEN];
    float y[LEN];
};

// functions for inner array outer struct

/*
 * Fills the first `size` entries of both fields with (rand() & 0xFF) / 100.
 */
void initialInnerArray(InnerArray *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip->x[i] = (float)(rand() & 0xFF) / 100.0f;
        ip->y[i] = (float)(rand() & 0xFF) / 100.0f;
    }

    return;
}

/*
 * CPU reference: C->x[i] = A->x[i] + 10 and C->y[i] = A->y[i] + 20 for i < n.
 */
void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }

    return;
}

/*
 * Prints the first n (x, y) pairs of C, one per line.
 */
void printfHostResult(InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        printf("printout idx %d: x %f y %f\n", idx, C->x[idx], C->y[idx]);
    }

    return;
}

/*
 * Compares the first N entries of both fields and reports the first mismatch.
 * Prints nothing when everything matches.
 */
void checkInnerArray(InnerArray *hostRef, InnerArray *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabsf keeps the difference in floating point; a bare abs() can bind
        // to the integer overload and truncate the difference.
        if (fabsf(hostRef->x[i] - gpuRef->x[i]) > epsilon)
        {
            match = false;
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            break;
        }

        if (fabsf(hostRef->y[i] - gpuRef->y[i]) > epsilon)
        {
            match = false;
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            break;
        }
    }

    if (!match) printf("Arrays do not match.\n\n");
}

/*
 * Measured kernel: one thread per element, 1D launch.
 * Threads whose global index is not below n do nothing.
 */
__global__ void testInnerArray(InnerArray *data, InnerArray * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];

        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

/*
 * Warm-up kernel: same body as testInnerArray, launched first so the measured
 * launch is not the first kernel launch of the process.
 */
__global__ void warmup2(InnerArray *data, InnerArray * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];
        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

// test for struct of array
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = sizeof(InnerArray);
    InnerArray *h_A     = (InnerArray *)malloc(nBytes);
    InnerArray *hostRef = (InnerArray *)malloc(nBytes);
    InnerArray *gpuRef  = (InnerArray *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // allocate device memory
    InnerArray *d_A, *d_C;
    CHECK(cudaMalloc((InnerArray**)&d_A, nBytes));
    CHECK(cudaMalloc((InnerArray**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // set up block size (optionally taken from the first command-line argument)
    int blocksize = 128;

    if (argc > 1)
    {
        blocksize = atoi(argv[1]);

        // A non-positive block size would divide by zero in the grid
        // computation below.
        if (blocksize <= 0)
        {
            fprintf(stderr, "invalid block size '%s', using 128\n", argv[1]);
            blocksize = 128;
        }
    }

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup
    double iStart = seconds();
    warmup2<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    double iElaps = seconds() - iStart;
    printf("warmup2 <<< %3u, %3u >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // Clear the output so the next check cannot pass on the warm-up's results.
    CHECK(cudaMemset(d_C, 0, nBytes));
    CHECK(cudaDeviceSynchronize());

    // kernel 2: testInnerArray
    iStart = seconds();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerarray <<< %3u, %3u >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
AoS
#include "../common/common.h"
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * A simple example of using an array of structures to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
 *
 * AoS: one contiguous 64-bit read to get x and y (up to 300 cycles)
 *
 * NOTE(review): CHECK() and seconds() are not defined in this listing; they are
 * presumably provided by ../common/common.h.
 */

// Parenthesized so `nElem * sizeof(...)`-style expressions built on the macro
// cannot be re-associated by operator precedence.
#define LEN (1 << 22)

// One element of the AoS layout: x and y stored side by side.
struct innerStruct
{
    float x;
    float y;
};

// SoA counterpart; not referenced by the code in this listing.
struct innerArray
{
    float x[LEN];
    float y[LEN];
};

/*
 * Fills the first `size` elements with (rand() & 0xFF) / 100 in both fields.
 */
void initialInnerStruct(innerStruct *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i].x = (float)(rand() & 0xFF) / 100.0f;
        ip[i].y = (float)(rand() & 0xFF) / 100.0f;
    }

    return;
}

/*
 * CPU reference: C[i].x = A[i].x + 10 and C[i].y = A[i].y + 20 for i < n.
 */
void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C[idx].x = A[idx].x + 10.f;
        C[idx].y = A[idx].y + 20.f;
    }

    return;
}

/*
 * Compares the first N elements field by field and reports the first mismatch.
 * Prints nothing when everything matches.
 */
void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // fabsf keeps the difference in floating point; a bare abs() can bind
        // to the integer overload and truncate the difference.
        if (fabsf(hostRef[i].x - gpuRef[i].x) > epsilon)
        {
            match = false;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].x, gpuRef[i].x);
            break;
        }

        if (fabsf(hostRef[i].y - gpuRef[i].y) > epsilon)
        {
            match = false;
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].y, gpuRef[i].y);
            break;
        }
    }

    if (!match) printf("Arrays do not match.\n\n");
}

/*
 * Measured kernel: one thread per element, 1D launch.
 * Each thread loads its whole element, updates both fields and stores it back;
 * threads whose global index is not below n do nothing.
 */
__global__ void testInnerStruct(innerStruct *data, innerStruct * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

/*
 * Warm-up kernel: same body as testInnerStruct, launched first so the measured
 * launch is not the first kernel launch of the process.
 */
__global__ void warmup(innerStruct *data, innerStruct * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

// test for array of struct
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    // The banner previously said "struct of array", which names the other layout.
    printf("%s test array of struct at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct *h_A     = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef  = (innerStruct *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // allocate device memory
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct**)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // set up block size (optionally taken from the first command-line argument)
    int blocksize = 128;

    if (argc > 1)
    {
        blocksize = atoi(argv[1]);

        // A non-positive block size would divide by zero in the grid
        // computation below.
        if (blocksize <= 0)
        {
            fprintf(stderr, "invalid block size '%s', using 128\n", argv[1]);
            blocksize = 128;
        }
    }

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    double iElaps = seconds() - iStart;
    printf("warmup      <<< %3u, %3u >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // Clear the output so the next check cannot pass on the warm-up's results.
    CHECK(cudaMemset(d_C, 0, nBytes));
    CHECK(cudaDeviceSynchronize());

    // kernel 2: testInnerStruct
    iStart = seconds();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerstruct <<< %3u, %3u >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
阅读全文
0 0
- 结构体数组(SoA)与数组结构体(AoS)
- 数组与结构体
- 数组--结构体数组
- 结构体数组(C++)
- 可变数组与结构体
- 结构体数组与指针
- 结构体与byte数组转换/结构体内定长数组
- 结构体、结构体数组
- VB.NET入门(五):数组,枚举与结构体
- 结构数组,结构数组类型(typedef)
- 结构数组,结构数组类型(typedef)
- 指针与数组与结构体
- 数组,结构体初始化 {0} (转载)
- 结构体(有数组和指针)
- map 数组结构体计数(二)
- 区间合并 (结构体数组)c++
- 打印学生成绩数组(结构体)
- 结构体数组使用方法
- FastDFS浅析和架构图
- CentOS 7 最小安装不能发现eth0
- Html与Js连用实现动画
- 使用java解压GZip文件
- jQuery form插件的使用--ajaxForm()和ajaxSubmit()的可选参数项对象
- 结构体数组(SoA)与数组结构体(AoS)
- linux——bont,team网桥的搭建
- Elasticsearch in Action.pdf 英文原版 免费下载
- Android OTA系统升级---原理一
- 习题2.1
- Maven系列学习(1)——安装以及本地仓库和仓库镜像的配置
- php 报告错误提示
- CCF 201409-2 画图 C语言解法
- [spm操作] VBM分析中,modulation的作用