CUDA的矩阵算法

来源：互联网发布：linux文本处理命令编辑：程序博客网时间：2024/06/07 08:32

用cuda最多的地方莫过于矩阵算法，得空把cuda的常用矩阵算法整理一下，与大家分享,一下矩阵都是以行存储的一维数组

1、矩阵乘法

// Compute C = A * B

// wA is the width of A

// wB is the width of B

__global__ void Muld(float* A, float* B, int wA, int wB, float* C)

{

// Block index

int bx = blockIdx.x;

int by = blockIdx.y;

// Thread index

int tx = threadIdx.x;

int ty = threadIdx.y;

// Index of the first sub-matrix of A processed by the block

int aBegin = wA * BLOCK_SIZE * by;

// Index of the last sub-matrix of A processed by the block

int aEnd = aBegin + wA - 1;

// Step size used to iterate through the sub-matrices of A

int aStep = BLOCK_SIZE;

// Index of the first sub-matrix of B processed by the block

int bBegin = BLOCK_SIZE * bx;

// Step size used to iterate through the sub-matrices of B

int bStep = BLOCK_SIZE * wB;

// The element of the block sub-matrix that is computed

// by the thread

float Csub = 0;

// Loop over all the sub-matrices of A and B required to

// compute the block sub-matrix

for (int a = aBegin, b = bBegin;

a <= aEnd;

a += aStep, b += bStep) {

// Shared memory for the sub-matrix of A

__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

// Shared memory for the sub-matrix of B

__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

// Load the matrices from global memory to shared memory;

// each thread loads one element of each matrix

As[ty][tx] = A[a + wA * ty + tx];

Bs[ty][tx] = B[b + wB * ty + tx];

// Synchronize to make sure the matrices are loaded

__syncthreads();

// Multiply the two matrices together;

// each thread computes one element

// of the block sub-matrix

for (int k = 0; k < BLOCK_SIZE; ++k)

Csub += As[ty][k] * Bs[k][tx];

// Synchronize to make sure that the preceding

// computation is done before loading two new

// sub-matrices of A and B in the next iteration

__syncthreads();

}

// Write the block sub-matrix to global memory;

// each thread writes one element

int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;

C[c + wB * ty + tx] = Csub;

}

2、矩阵加法

// C = A + B
// wA is the width of A
// hA is the height of A
__global__ void Muld_Add(float* A, float* B, int wA, int hA, float* C)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int sumIndex = wA*hA;
int index = i * wA + j;
if(index<sumIndex)
C[index] = A[index]+B[index];
}

3、矩阵转置

//矩阵转置

//odata输出

//idata输入

//width:矩阵宽,height:矩阵高
__global__ void transpose_naive(float *odata, float *idata, int width, int height)
{

__shared__ float block[BLOCK_SIZE][BLOCK_SIZE];
// read the matrix tile into shared memory
unsigned int xIndex = blockIdx.x * BLOCK_SIZE + threadIdx.x;
unsigned int yIndex = blockIdx.y * BLOCK_SIZE + threadIdx.y;
if((xIndex < width) && (yIndex < height))
{
unsigned int index_in = yIndex * width + xIndex;
block[threadIdx.y][threadIdx.x] = idata[index_in];
}
__syncthreads();
// write the transposed matrix tile to global memory
xIndex = blockIdx.y * BLOCK_SIZE + threadIdx.x;
yIndex = blockIdx.x * BLOCK_SIZE + threadIdx.y;
if((xIndex < height) && (yIndex < width))
{
unsigned int index_out = yIndex * height + xIndex;
odata[index_out] = block[threadIdx.x][threadIdx.y];
}
}