CUDA的矩阵算法
来源:互联网 发布:linux文本处理命令 编辑:程序博客网 时间:2024/06/07 08:32
用CUDA最多的地方莫过于矩阵算法。得空把CUDA的常用矩阵算法整理一下,与大家分享。以下矩阵都是以行存储的一维数组。
1、矩阵乘法
// C = A * B via tiled shared-memory matrix multiply (row-major storage).
// A is (hA x wA), B is (wA x wB), C is (hA x wB).
//
// Expected launch configuration:
//   grid  = dim3(wB / BLOCK_SIZE, hA / BLOCK_SIZE)
//   block = dim3(BLOCK_SIZE, BLOCK_SIZE)
//
// Precondition: wA, wB and the height of A are exact multiples of
// BLOCK_SIZE — there is no tail handling, so other shapes read and
// write out of bounds.
__global__ void Muld(const float* __restrict__ A,
                     const float* __restrict__ B,
                     int wA, int wB,
                     float* __restrict__ C)
{
    // Tile coordinates of this block, and this thread's slot in the tile.
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // First/last A tile this block walks (one row of tiles across A),
    // and the strides that advance one tile at a time through A and B.
    int aBegin = wA * BLOCK_SIZE * by;
    int aEnd   = aBegin + wA - 1;
    int aStep  = BLOCK_SIZE;
    int bBegin = BLOCK_SIZE * bx;
    int bStep  = BLOCK_SIZE * wB;

    // Staging tiles in shared memory; declared once, reused per iteration.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Accumulator for the single C element this thread produces.
    float Csub = 0.0f;

    // Walk the tile pairs whose products contribute to this C tile.
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Each thread loads one element of each tile. tx is the
        // fastest-varying index in both global reads, so adjacent
        // threads hit adjacent addresses (coalesced).
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // All loads must land before any thread consumes the tiles.
        __syncthreads();

        // Partial dot product over this tile pair. As[ty][k] is a
        // warp-wide broadcast and Bs[k][tx] spreads across banks, so
        // neither access conflicts.
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += As[ty][k] * Bs[k][tx];

        // Keep fast threads from overwriting tiles still being read.
        __syncthreads();
    }

    // One coalesced store per thread into the output tile.
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
2、矩阵加法
// Element-wise matrix addition: C = A + B.
// A, B, C are (hA x wA) row-major matrices.
//
// Expected launch: a 2D grid/block covering wA columns in x and hA
// rows in y (any block shape; each dimension is bounds-checked).
//
// Fixes vs. the original:
//   - x maps to the column and y to the row, so consecutive threads of
//     a warp touch consecutive addresses (coalesced). The original
//     computed i * wA + j with i taken from the x dimension, which both
//     strided the accesses by wA and, for non-square shapes, let the
//     index wrap into the next row (duplicated / skipped elements).
//   - bounds are checked per dimension instead of against the total
//     element count, so out-of-range rows/columns are rejected exactly.
__global__ void Muld_Add(float* A, float* B, int wA, int hA, float* C)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // x -> column
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // y -> row

    if (col < wA && row < hA) {
        int index = row * wA + col;
        C[index] = A[index] + B[index];
    }
}
3、矩阵转置
// Tiled matrix transpose through shared memory: odata = transpose(idata).
//   idata: (height x width) row-major input
//   odata: (width x height) row-major output
//
// Expected launch:
//   grid  = dim3(ceil(width / BLOCK_SIZE), ceil(height / BLOCK_SIZE))
//   block = dim3(BLOCK_SIZE, BLOCK_SIZE)
//
// Both the global read and the global write are coalesced (threadIdx.x
// walks along a row in each); the transposition happens inside the
// shared-memory tile.
__global__ void transpose_naive(float *odata, float *idata, int width, int height)
{
    // +1 column of padding so the column-wise read
    // block[threadIdx.x][threadIdx.y] below lands in 32 different
    // banks instead of serializing on one (classic bank-conflict fix;
    // the unpadded original conflicted 32-way on that read).
    __shared__ float block[BLOCK_SIZE][BLOCK_SIZE + 1];

    // Coalesced read of one input tile into shared memory.
    unsigned int xIndex = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    // Barrier sits outside the guard so every thread reaches it.
    __syncthreads();

    // Coalesced write of the transposed tile: swap the block
    // coordinates and re-derive this thread's output position.
    xIndex = blockIdx.y * BLOCK_SIZE + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
- CUDA的矩阵算法
- 基于CUDA的矩阵相乘
- 矩阵加法的CUDA实现
- CUDA学习--一维矩阵的加
- 矩阵棋盘划分的CUDA程序
- cuda 任意维度的矩阵相乘
- CUDA任意维度的矩阵相乘
- CUDA学习--一维矩阵的加
- CUDA学习--矩阵乘法的并行运算
- CUDA向量与矩阵的乘法
- 矩阵按行(列)求和CUDA并行算法设计
- KNN算法的CUDA实现
- CUDA 矩阵转置
- 矩阵赋值CUDA实现
- CUDA矩阵乘法
- CUDA 矩阵乘法优化
- CUDA:矩阵乘法原理
- CUDA矩阵BLAS效率
- 解决dedecms5.7验证码不显示、错误等问题
- 什么是无需编程—软件是企业的灵魂
- C/S与B/S的区别
- 【转载】ABAP 对透明表的添加,修改,删除,查询等操作
- 循环队列
- CUDA的矩阵算法
- ios使用网络活动指示器
- 面向对象设计中的一些基本术语
- Designing for iPhone 4's Retina Display
- 软件开发人员猛如虎
- Android APK反编译详解(附图)
- meta和title的顺序问题引起IE显示空白页
- Maven 3 入门 -- 安装与配置(Maven以及其Eclipse插件m2eclipse的安装)
- android 新浪微博Oauth认证