AVX指令集矩阵乘向量算法
来源:互联网 发布:php 会员登录获取id 编辑:程序博客网 时间:2024/04/29 18:42
/*
 * Matrix-vector product y = W * x, computed twice: once with a plain scalar
 * loop and once with 256-bit AVX intrinsics.  Both versions are timed and
 * their results printed so they can be compared.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <x86intrin.h>

/* Number of single-precision floats held by one 256-bit AVX register. */
enum { AVX_LANES = 8 };

/**
 * Compute y[i] = sum_j w[i][j] * x[j] for every row i, using AVX.
 *
 * @param x    input vector, `col` floats
 * @param w    array of `row` pointers, each to a row of `col` floats
 * @param y    output vector, `row` floats (every element is overwritten)
 * @param col  number of columns of w / length of x
 * @param row  number of rows of w / length of y
 *
 * Unaligned loads/stores are used, so no buffer needs special alignment.
 * The target attribute lets the function compile even when the translation
 * unit is not built with -mavx; the CPU must still support AVX at run time.
 */
#if defined(__GNUC__)
__attribute__((target("avx")))
#endif
void matmul_avx(const float *x, const float **w, float *y, const int col, const int row)
{
    /* Largest multiple of AVX_LANES that is <= col; the rest is the tail. */
    const int col_reduced_8 = col - col % AVX_LANES;
    float scratchpad[AVX_LANES];

    for (int i = 0; i < row; i++) {
        const float *wi = w[i];
        __m256 tgt = _mm256_setzero_ps();

        /* Accumulate eight partial dot products in parallel. */
        for (int j = 0; j < col_reduced_8; j += AVX_LANES) {
            const __m256 op0 = _mm256_loadu_ps(&x[j]);
            const __m256 op1 = _mm256_loadu_ps(&wi[j]);
            tgt = _mm256_add_ps(_mm256_mul_ps(op0, op1), tgt);
        }

        /* Horizontal sum of the eight lanes. */
        _mm256_storeu_ps(scratchpad, tgt);
        float res = 0;
        for (int k = 0; k < AVX_LANES; k++) {
            res += scratchpad[k];
        }

        /* Scalar tail for the columns that did not fill a whole register. */
        for (int l = col_reduced_8; l < col; l++) {
            res += wi[l] * x[l];
        }

        y[i] = res;
    }
}

/* Release the first `rows` row buffers of `w`, then `w` itself. */
static void free_matrix(float **w, int rows)
{
    for (int i = 0; i < rows; i++) {
        free(w[i]);
    }
    free(w);
}

/* Print `n` floats of `v` on one line, in the "%.4f, " format. */
static void print_vector(const float *v, int n)
{
    for (int i = 0; i < n; i++) {
        printf("%.4f, ", v[i]);
    }
    printf("\n");
}

/* Average CPU time in seconds per repetition between two clock() samples. */
static double seconds_per_run(clock_t start, clock_t end, int runs)
{
    /* Subtract in clock_t first: casting each sample to float loses precision. */
    return (double)(end - start) / ((double)runs * CLOCKS_PER_SEC);
}

/**
 * Benchmark driver: fills a ROW x COL matrix and a COL-vector with
 * pseudo-random values, runs the scalar and the AVX product NUM_MUL times
 * each, and prints the average time and the result vector for both.
 */
int main(void)
{
    enum { COL = 2048, ROW = 512, NUM_MUL = 10 };

    float x[COL];
    float y[ROW];

    float **w = malloc(ROW * sizeof *w);
    if (w == NULL) {
        fprintf(stderr, "out of memory\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < ROW; i++) {
        w[i] = malloc(COL * sizeof *w[i]);
        if (w[i] == NULL) {
            fprintf(stderr, "out of memory\n");
            free_matrix(w, i);
            return EXIT_FAILURE;
        }
    }

    /* rand() is left unseeded on purpose: every run uses the same data. */
    for (int i = 0; i < ROW; i++) {
        for (int j = 0; j < COL; j++) {
            w[i][j] = (float)(rand() % 1000) / 800.0f;
        }
    }
    for (int j = 0; j < COL; j++) {
        x[j] = (float)(rand() % 1000) / 800.0f;
    }

    /* The original (scalar) matrix multiplication version. */
    clock_t t1 = clock();
    for (int r = 0; r < NUM_MUL; r++) {
        for (int j = 0; j < ROW; j++) {
            const float *wj = w[j];
            float sum = 0;
            for (int i = 0; i < COL; i++) {
                sum += wj[i] * x[i];
            }
            y[j] = sum;
        }
    }
    clock_t t2 = clock();

    printf("\nTime taken: %f second.\n", seconds_per_run(t1, t2, NUM_MUL));
    print_vector(y, ROW);

    /* Clear y so the second printout can only come from the AVX run. */
    for (int i = 0; i < ROW; i++) {
        y[i] = 0;
    }

    /* The AVX matrix multiplication version. */
    t1 = clock();
    for (int r = 0; r < NUM_MUL; r++) {
        /* float ** does not convert implicitly to const float ** in C. */
        matmul_avx(x, (const float **)w, y, COL, ROW);
    }
    t2 = clock();

    printf("\nTime taken: %f second.\n", seconds_per_run(t1, t2, NUM_MUL));
    print_vector(y, ROW);

    free_matrix(w, ROW);
    return EXIT_SUCCESS;
}
运行方式:
gcc -o test test.c -mavx
./test
0 0
- AVX指令集矩阵乘向量算法
- X86 SSE/AVX指令集:向量内积
- 利用AVX指令集实现矩阵乘法
- AVX 指令集架构简介
- 向量叉乘与叉乘矩阵
- AVX是什么?AVX指令集技术与应用解析
- dot--向量或矩阵的点乘
- cross--向量或矩阵的叉乘
- [VC] 检测AVX系列指令集的支持级别(AVX、AVX2、F16C、FMA、FMA4、XOP)
- [VC] 检测AVX系列指令集的支持级别(AVX、AVX2、F16C、FMA、FMA4、XOP)
- iPP矩阵操作随笔4-矩阵乘向量
- 矩阵连乘算法思想
- 快速矩阵幂乘算法
- 矩阵链乘-算法导论
- 算法提高 矩阵链乘
- 矩阵连乘算法模板
- Intel AVX 指令集对视频处理的影响
- TensorFlow CPU环境 SSE/AVX/FMA 指令集编译
- ButterKnife的使用
- Maven常用命令
- 打鼹鼠
- Oracle 12.2中设置pdb随cdb启动而自动启动
- SQL语句DISTINCT
- AVX指令集矩阵乘向量算法
- 【上机笔试之八】二分法查找
- 用MySQL创建数据库和数据库表
- (pat-a)1020. Tree Traversals (25)
- MongoDB的再次相逢之聚合(二)
- React Native ref高级用法&&setNativeProps使用
- 对称加解密算法
- ssm java web项目中获取文件路径
- Java核心技术笔记-第14章(2)