16位图像Alpha混合的实现

来源：互联网　发布：写脚本的软件　编辑：程序博客网　时间：2024/05/01 08:44

Alpha混合的算法很简单，基于下面的公式就可以实现：

D := A * (S - D) / 255 + D

D是目标图像的像素，

S是源图像的像素

A是Alpha值，0为全透明，255为不透明。

下面是16位565格式的混合算法的实现。注意：代码中用右移8位（即除以256）来近似公式中的除以255，因此Alpha为255时结果非常接近但并不严格等于源像素；另外函数名中的“656”实为“565”的笔误。首先用最简单的方式实现，即逐个像素的处理：

// Blends BmpSrc onto BmpDst one pixel at a time. Simple, but the slowest of
// the variants.
//
// Both bitmaps are read and written as 16-bit words through ScanLine and each
// word is decoded as RGB565 (5 bits red, 6 bits green, 5 bits blue).
// NOTE(review): assumes both bitmaps really are 16-bit 5-6-5 bitmaps; nothing
// here checks PixelFormat - confirm at the call sites.
//
// Only the overlapping top-left region (minimum width x minimum height of the
// two bitmaps) is processed.
//
// Alpha: 0 leaves the destination unchanged; larger values weight the source
// more. Each channel is computed as D := ((Alpha * (S - D)) shr 8) + D, i.e.
// the division is by 256 rather than 255, so Alpha = 255 is close to, but not
// exactly, fully opaque.
procedure AlphaBlend656(BmpDst, BmpSrc: TBitmap; Alpha: Byte);
var
  i, j, W, H: Integer;
  pSrc, pDst: PWord;
  wSR, wSG, wSB: Word;
  wDR, wDG, wDB: Word;
begin
  // Clip the working area to the smaller of the two bitmaps.
  if BmpDst.Width > BmpSrc.Width then
    W := BmpSrc.Width
  else
    W := BmpDst.Width;

  if BmpDst.Height > BmpSrc.Height then
    H := BmpSrc.Height
  else
    H := BmpDst.Height;

  for i := 0 to H - 1 do
  begin
    pSrc := BmpSrc.ScanLine[i];
    pDst := BmpDst.ScanLine[i];

    for j := 0 to W - 1 do
    begin
      // Unpack the 5-6-5 channels of the source and destination pixels.
      wSR := pSrc^ shr 11;
      wSG := (pSrc^ shr 5) and $3F;
      wSB := pSrc^ and $1F;

      wDR := pDst^ shr 11;
      wDG := (pDst^ shr 5) and $3F;
      wDB := pDst^ and $1F;

      // D := ((Alpha * (S - D)) shr 8) + D per channel.
      // The Word operands are promoted to Integer, so S - D may be negative.
      // Delphi's shr is a logical shift, which leaves set bits above the
      // channel for a negative product; masking each channel keeps only the
      // valid bits (the two's-complement "floor" result) so the packed value
      // always fits in a Word instead of relying on implicit truncation.
      pDst^ := Word(
        (((((Alpha * (wSR - wDR)) shr 8) + wDR) and $1F) shl 11) or
        (((((Alpha * (wSG - wDG)) shr 8) + wDG) and $3F) shl 5) or
        ((((Alpha * (wSB - wDB)) shr 8) + wDB) and $1F));

      Inc(pSrc);
      Inc(pDst);
    end;
  end;
end;

实现起来很简单，但速度比较慢，其实存在着一次处理两个像素的算法，下面是代码：

// Blends BmpSrc onto BmpDst two pixels at a time by treating each pair of
// adjacent 16-bit RGB565 pixels as one 32-bit word.
//
// Both bitmaps are accessed through ScanLine as 16-bit words.
// NOTE(review): assumes both bitmaps are 16-bit 5-6-5 bitmaps; nothing here
// checks PixelFormat - confirm at the call sites.
//
// Only the overlapping top-left region (minimum width x minimum height) is
// processed. When that width is odd, the last pixel of every row is blended
// on its own.
//
// Alpha: 0 leaves the destination unchanged; larger values weight the source
// more. Division is done with shr 8 (divide by 256), so Alpha = 255 is close
// to, but not exactly, fully opaque.
procedure AlphaBlend656Fast(BmpDst, BmpSrc: TBitmap; Alpha: Byte);
var
  i, j, W, H: Integer;
  PairCount: Integer;
  HasTailPixel: Boolean;
  pSrc, pDst: PWord;
  dwSR, dwSG, dwSB: LongWord;
  dwDR, dwDG, dwDB: LongWord;
  dwR, dwG, dwB: LongWord;
  dwAdd64: LongWord;
  dwAlphaOver4: LongWord;

  // Scalar blend of one channel: ((Alpha * (S - D)) shr 8) + D.
  // Done in signed Integer arithmetic because S - D may be negative; Delphi's
  // shr is a logical shift, so the result is masked to the channel width
  // (Mask is $1F or $3F) to drop the bits it leaves above the channel.
  function BlendChannel(S, D, Mask: Integer): Integer;
  begin
    Result := (((Alpha * (S - D)) shr 8) + D) and Mask;
  end;

begin
  // Clip the working area to the smaller of the two bitmaps.
  if BmpDst.Width > BmpSrc.Width then
    W := BmpSrc.Width
  else
    W := BmpDst.Width;

  if BmpDst.Height > BmpSrc.Height then
    H := BmpSrc.Height
  else
    H := BmpDst.Height;

  // A bias of 64 is added to each 16-bit lane so that (S + 64 - D) can never
  // go negative (channel values are at most 63). After the multiply and the
  // shr 8 that bias contributes Alpha * 64 / 256 = Alpha / 4 per lane, which
  // is removed again by subtracting dwAlphaOver4.
  dwAdd64 := 64 or (64 shl 16);
  dwAlphaOver4 := (Alpha shr 2) or ((Alpha shr 2) shl 16);

  // Pixels are handled in pairs; an odd width leaves one pixel per row.
  HasTailPixel := Odd(W);
  PairCount := W shr 1;

  for i := 0 to H - 1 do
  begin
    pSrc := BmpSrc.ScanLine[i];
    pDst := BmpDst.ScanLine[i];

    for j := 0 to PairCount - 1 do
    begin
      // Extract each channel of both pixels at once; every channel ends up in
      // the low bits of its own 16-bit lane.
      dwSR := (PLongWord(pSrc)^ shr 11) and $001F001F;
      dwSG := (PLongWord(pSrc)^ shr 5) and $003F003F;
      dwSB := PLongWord(pSrc)^ and $001F001F;

      dwDR := (PLongWord(pDst)^ shr 11) and $001F001F;
      dwDG := (PLongWord(pDst)^ shr 5) and $003F003F;
      dwDB := PLongWord(pDst)^ and $001F001F;

      // Biased blend per lane. The shr 8 lets the high lane's low bits spill
      // into the upper bits of the low lane; the final mask discards them.
      dwR := ((((Alpha * (dwSR + dwAdd64 - dwDR)) shr 8) + dwDR) - dwAlphaOver4) and $001F001F;
      dwG := ((((Alpha * (dwSG + dwAdd64 - dwDG)) shr 8) + dwDG) - dwAlphaOver4) and $003F003F;
      dwB := ((((Alpha * (dwSB + dwAdd64 - dwDB)) shr 8) + dwDB) - dwAlphaOver4) and $001F001F;

      // Repack both pixels into 5-6-5 layout.
      PLongWord(pDst)^ := (dwR shl 11) or (dwG shl 5) or dwB;

      // Advance by two 16-bit pixels.
      Inc(pSrc, 2);
      Inc(pDst, 2);
    end;

    if HasTailPixel then
    begin
      // Blend the single remaining pixel of this row with the scalar formula.
      pDst^ := Word(
        (BlendChannel(pSrc^ shr 11, pDst^ shr 11, $1F) shl 11) or
        (BlendChannel((pSrc^ shr 5) and $3F, (pDst^ shr 5) and $3F, $3F) shl 5) or
        BlendChannel(pSrc^ and $1F, pDst^ and $1F, $1F));

      Inc(pSrc);
      Inc(pDst);
    end;
  end;
end;

不过这还不够快，基于MMX指令的实现可以一次处理4个像素，下面是代码：

// Blends BmpSrc onto BmpDst four pixels at a time using MMX instructions in
// 32-bit x86 inline assembly.
//
// Both bitmaps are accessed through ScanLine as 16-bit RGB565 words.
// NOTE(review): assumes both bitmaps are 16-bit 5-6-5 bitmaps and that the CPU
// supports MMX; neither is checked here - confirm at the call sites.
//
// Only the overlapping top-left region (minimum width x minimum height) is
// processed. Each row is handled as W div 4 groups of four pixels in assembly,
// followed by a Pascal loop for the remaining 0..3 pixels.
//
// Alpha: 0 leaves the destination unchanged; larger values weight the source
// more. Division is a shift right by 8 bits (divide by 256), so Alpha = 255 is
// close to, but not exactly, fully opaque.
procedure AlphaBlend656MMX(BmpDst, BmpSrc: TBitmap; Alpha: Byte);
var
  i, j, W, H, Leave: Integer;
  pSrc, pDst: PWord;
  MaskR, MaskG, MaskB, Alpha64: Int64;
  wSR, wSG, wSB: Word;
  wDR, wDG, wDB: Word;
begin
  // Clip the working area to the smaller of the two bitmaps.
  if BmpDst.Width > BmpSrc.Width then
    W := BmpSrc.Width
  else
    W := BmpDst.Width;

  if BmpDst.Height > BmpSrc.Height then
    H := BmpSrc.Height
  else
    H := BmpDst.Height;

  Leave := W and 3; // pixels left over per row after the groups of four
  W := W shr 2;     // number of four-pixel groups per row

  // Channel masks for four packed RGB565 pixels.
  MaskR := $f800f800f800f800;
  MaskG := $07e007e007e007e0;
  MaskB := $001f001f001f001f;

  // Replicate Alpha into each of the four 16-bit lanes.
  Alpha64 := Alpha;
  Alpha64 := Alpha64 or (Alpha64 shl 16) or (Alpha64 shl 32) or (Alpha64 shl 48);

  for i := 0 to H - 1 do
  begin
    pSrc := BmpSrc.ScanLine[i];
    pDst := BmpDst.ScanLine[i];

    asm
      push ecx               // save register
      mov ecx, W             // number of four-pixel groups
      cmp ecx, 0             // nothing to do for zero groups
      jz @@exit565           // skip straight to the exit
      push esi
      push edi
      mov esi, pSrc          // source pointer
      mov edi, pDst          // destination pointer

    @@blend565_4:
      { MMX register usage:
        mm0: red target value
        mm1: red source value (becomes the blended red, then the packed result)
        mm2: green target value
        mm3: green source value (becomes the blended green)
        mm4: blue target value
        mm5: blue source value (becomes the blended blue)
        mm6: original target pixels
        mm7: original source pixels
        Per channel: D := ((Alpha * (S - D)) shr 8) + D
      }
      movq mm6, [edi]
      movq mm7, [esi]

      movq mm0, mm6
      pand mm0, MaskR        // isolate target red
      movq mm1, mm7
      pand mm1, MaskR        // isolate source red
      psrlw mm0, 11          // shift down to the low bits for arithmetic
      psrlw mm1, 11
      psubw mm1, mm0         // SrcRed := SrcRed - DestRed
      pmullw mm1, Alpha64    // SrcRed := SrcRed * Alpha
      psraw mm1, 8           // SrcRed := SrcRed shr 8 (arithmetic, i.e. div 256)
      paddw mm1, mm0         // SrcRed := SrcRed + DestRed
      psllw mm1, 11          // shift back into place; red is done

      movq mm2, mm6
      pand mm2, MaskG        // isolate target green
      movq mm3, mm7
      pand mm3, MaskG        // isolate source green
      psrlw mm2, 5           // shift down to the low bits for arithmetic
      psrlw mm3, 5
      psubw mm3, mm2         // SrcGreen := SrcGreen - DestGreen
      pmullw mm3, Alpha64    // SrcGreen := SrcGreen * Alpha
      psraw mm3, 8           // SrcGreen := SrcGreen shr 8 (arithmetic, i.e. div 256)
      paddw mm3, mm2         // SrcGreen := SrcGreen + DestGreen
      psllw mm3, 5           // shift back into place; green is done

      movq mm4, mm6
      pand mm4, MaskB        // isolate target blue
      movq mm5, mm7
      pand mm5, MaskB        // isolate source blue
      psubw mm5, mm4         // SrcBlue := SrcBlue - DestBlue
      pmullw mm5, Alpha64    // SrcBlue := SrcBlue * Alpha
      psraw mm5, 8           // SrcBlue := SrcBlue shr 8 (arithmetic, i.e. div 256)
      paddw mm5, mm4         // SrcBlue := SrcBlue + DestBlue; blue is done

      por mm1, mm3           // recombine the channels
      por mm1, mm5
      movq [edi], mm1        // store four blended pixels

      add esi, 8             // advance to the next four pixels
      add edi, 8
      dec ecx
      jnz @@blend565_4

      mov pSrc, esi          // hand the advanced pointers back to the tail loop
      mov pDst, edi
      pop edi
      pop esi
      emms                   // leave MMX state

    @@exit565:
      pop ecx
    end;

    // Blend the remaining 0..3 pixels of this row one at a time.
    for j := 0 to Leave - 1 do
    begin
      wSR := pSrc^ shr 11;
      wSG := (pSrc^ shr 5) and $3F;
      wSB := pSrc^ and $1F;

      wDR := pDst^ shr 11;
      wDG := (pDst^ shr 5) and $3F;
      wDB := pDst^ and $1F;

      // The Word operands are promoted to Integer, so S - D may be negative.
      // Delphi's shr is a logical shift and leaves set bits above the channel
      // for a negative product; masking keeps only the channel bits so the
      // packed value always fits in a Word.
      pDst^ := Word(
        (((((Alpha * (wSR - wDR)) shr 8) + wDR) and $1F) shl 11) or
        (((((Alpha * (wSG - wDG)) shr 8) + wDG) and $3F) shl 5) or
        ((((Alpha * (wSB - wDB)) shr 8) + wDB) and $1F));

      Inc(pSrc);
      Inc(pDst);
    end;
  end;
end;

下面是这三个函数的速度比较，目标图像是600*450的16位位图，源图像是399*532的16位位图，分别进行了1000次混合，结果如下：

AlphaBlend656： 4516

AlphaBlend656Fast： 2562

AlphaBlend656MMX： 1234

没有意外，MMX版本的速度约为普通版本的3.7倍（4516 / 1234，接近4倍），一次处理两个像素的版本约为普通版本的1.8倍（4516 / 2562）。

对于图像处理的优化有两个比较重要的点：

1、尽量用位移代替乘除。

2、尽量一次同时处理多个像素，利用MMX指令可以做到这一点。

最后是代码：

https://files.getdropbox.com/u/524963/AlphaBlend16_565.rar