用SSE加速CPU蒙皮计算
来源:互联网 发布:ubuntu vim 新建文件 编辑:程序博客网 时间:2024/05/20 03:40
http://blog.csdn.net/garuda/article/details/6539271
我们知道现在绝大多数情况下角色动画的蒙皮计算是放在GPU中计算的。
但是仍然有一些特殊的场合我们需要在CPU端使用蒙皮计算的结果,比如涉及到
布料的物理模拟的时候。这时我们需要在CPU端计算蒙皮。
为了节约宝贵的CPU计算的时间,我们需要用SSE对CPU计算蒙皮进行加速。
顶点结构如下:
// Per-vertex input for the CPU skinning loop.
struct Vertex
{
    float3 Pos;
    float3 Normal;
    int n;            // number of bones this vertex is skinned to
    int BoneId[4];    // one palette index per influence slot
    // One weight per BoneId slot. The 4-bone skinning path reads Weight[3],
    // so the array needs four elements (it was declared with three).
    float Weight[4];
    // ... (tangent, etc.)
};
对于每个顶点的计算过程是先由BoneId和Weight计算出变换矩阵,然后用矩阵变换
顶点的位置,法线等。其中求变换矩阵是对多个矩阵加权求和的过程,这一步很适合
使用SSE加速。
每个顶点可能蒙到1-4个骨骼上,也就是每个顶点可能需要对1-4个矩阵加权求和。
1个骨骼的时候不用计算。2-4个骨骼的时候,我们分别写3个函数对应这3种情况。
// Loads the four consecutive floats at a0[0..3] into res with an aligned SSE load.
// a0 must be 16-byte aligned, as required by _mm_load_ps.
// The source pointer is const-qualified because it is only read; callers that
// pass a plain float* keep working unchanged.
__forceinline void LoadFourFloats(const float* a0, __m128& res)
{
    res = _mm_load_ps(a0);
}
// Writes the four lanes of src to a0[0..3] with an aligned SSE store.
// a0 must be 16-byte aligned, as required by _mm_store_ps.
__forceinline void StoreFourFloats(float* a0, const __m128& src)
{
    float* const dst = a0;
    _mm_store_ps(dst, src);
}
// Scales three 4-float vectors by one scalar: mo0 = w * mi0, mo1 = w * mi1,
// mo2 = w * mi2.
// Each output depends only on its matching input, so an output may refer to
// the same object as its input (the Collapse*MatSSE callers rely on this).
__forceinline void MulMatrixFloat(__m128& mo0, __m128& mo1, __m128& mo2,
                                  const __m128& mi0, const __m128& mi1, const __m128& mi2,
                                  float w)
{
    // Replicate the weight into all four lanes.
    const __m128 weight = _mm_set1_ps(w);

    mo0 = _mm_mul_ps(weight, mi0);
    mo1 = _mm_mul_ps(weight, mi1);
    mo2 = _mm_mul_ps(weight, mi2);
}
// Weighted sum of two matrices: pR = W1 * pM1 + W2 * pM2.
// Only the first 12 floats (three groups of four) of each matrix are read and
// written; floats 12..15 of pR are left untouched.
// pM1, pM2 and pR are accessed through LoadFourFloats / StoreFourFloats, which
// use aligned SSE loads and stores, so all three must be 16-byte aligned.
__forceinline void Collapse2MatSSE(float* pM1, float* pM2,
                                   float W1, float W2, float* pR)
{
    const int kGroups = 3;   // 4-float groups blended per matrix
    const int kStride = 4;   // floats per group

    __m128 blended[kGroups]; // running weighted sum
    __m128 scaled[kGroups];  // current matrix after scaling

    // blended = W1 * M1
    for (int g = 0; g < kGroups; ++g)
        LoadFourFloats(pM1 + g * kStride, blended[g]);
    MulMatrixFloat(blended[0], blended[1], blended[2],
                   blended[0], blended[1], blended[2], W1);

    // scaled = W2 * M2
    for (int g = 0; g < kGroups; ++g)
        LoadFourFloats(pM2 + g * kStride, scaled[g]);
    MulMatrixFloat(scaled[0], scaled[1], scaled[2],
                   scaled[0], scaled[1], scaled[2], W2);

    // All inputs are already in registers, so each group can be summed and
    // written out in one pass.
    for (int g = 0; g < kGroups; ++g)
    {
        blended[g] = _mm_add_ps(blended[g], scaled[g]);
        StoreFourFloats(pR + g * kStride, blended[g]);
    }
}
// Weighted sum of three matrices: pR = (W1 * pM1 + W2 * pM2) + W3 * pM3.
// Only the first 12 floats (three groups of four) of each matrix are read and
// written; floats 12..15 of pR are left untouched.
// All matrix pointers go through LoadFourFloats / StoreFourFloats, which use
// aligned SSE loads and stores, so every pointer must be 16-byte aligned.
__forceinline void Collapse3MatSSE(float* pM1, float* pM2, float* pM3,
                                   float W1, float W2, float W3, float* pR)
{
    __m128 sum0, sum1, sum2;     // running weighted sum
    __m128 term0, term1, term2;  // current matrix after scaling

    // Seed the sum with W1 * M1.
    LoadFourFloats(pM1 + 0, sum0);
    LoadFourFloats(pM1 + 4, sum1);
    LoadFourFloats(pM1 + 8, sum2);
    MulMatrixFloat(sum0, sum1, sum2, sum0, sum1, sum2, W1);

    // Accumulate W2 * M2, then W3 * M3, in that order.
    float* const restMats[2] = { pM2, pM3 };
    const float restWeights[2] = { W2, W3 };
    for (int m = 0; m < 2; ++m)
    {
        float* const pMat = restMats[m];
        LoadFourFloats(pMat + 0, term0);
        LoadFourFloats(pMat + 4, term1);
        LoadFourFloats(pMat + 8, term2);
        MulMatrixFloat(term0, term1, term2, term0, term1, term2, restWeights[m]);

        sum0 = _mm_add_ps(sum0, term0);
        sum1 = _mm_add_ps(sum1, term1);
        sum2 = _mm_add_ps(sum2, term2);
    }

    // Write the blended result only after every input has been read.
    StoreFourFloats(pR + 0, sum0);
    StoreFourFloats(pR + 4, sum1);
    StoreFourFloats(pR + 8, sum2);
}
// Weighted sum of four matrices:
//   pR = ((W1 * pM1 + W2 * pM2) + W3 * pM3) + W4 * pM4.
// Only the first 12 floats (three groups of four) of each matrix are read and
// written; floats 12..15 of pR are left untouched.
// All matrix pointers go through LoadFourFloats / StoreFourFloats, which use
// aligned SSE loads and stores, so every pointer must be 16-byte aligned.
__forceinline void Collapse4MatSSE(float* pM1, float* pM2, float* pM3, float* pM4,
                                   float W1, float W2, float W3, float W4, float* pR)
{
    __m128 sum0, sum1, sum2;     // running weighted sum
    __m128 term0, term1, term2;  // current matrix after scaling

    // Seed the sum with W1 * M1.
    LoadFourFloats(pM1 + 0, sum0);
    LoadFourFloats(pM1 + 4, sum1);
    LoadFourFloats(pM1 + 8, sum2);
    MulMatrixFloat(sum0, sum1, sum2, sum0, sum1, sum2, W1);

    // Accumulate the remaining three weighted matrices in argument order.
    float* const restMats[3] = { pM2, pM3, pM4 };
    const float restWeights[3] = { W2, W3, W4 };
    for (int m = 0; m < 3; ++m)
    {
        float* const pMat = restMats[m];
        LoadFourFloats(pMat + 0, term0);
        LoadFourFloats(pMat + 4, term1);
        LoadFourFloats(pMat + 8, term2);
        MulMatrixFloat(term0, term1, term2, term0, term1, term2, restWeights[m]);

        sum0 = _mm_add_ps(sum0, term0);
        sum1 = _mm_add_ps(sum1, term1);
        sum2 = _mm_add_ps(sum2, term2);
    }

    // Write the blended result only after every input has been read.
    StoreFourFloats(pR + 0, sum0);
    StoreFourFloats(pR + 4, sum1);
    StoreFourFloats(pR + 8, sum2);
}
计算顶点蒙皮矩阵的过程如下
// Scratch matrix that receives the blended bone transform of the current vertex.
// The Collapse*MatSSE routines write it with aligned SSE stores, so it must be
// 16-byte aligned. The original note says release builds aligned it
// automatically, whereas debug builds only run correctly with the explicit
// alignment declaration below.
__declspec(align(16)) Matrix4 matObj;
// The palette matrices are read with aligned SSE loads as well, so they need
// the same 16-byte alignment.
// NOTE(review): the Collapse*MatSSE paths write only the first 12 floats of
// matObj. If Matrix4 holds 16 floats, the last 4 keep whatever an earlier
// n == 1 copy left there (or are never set) - confirm this is intended.
for (int i = 0; i < vertCount; i++, pVertex++)
{
    // Every branch indexes the palette through the current vertex (pVertex);
    // the bone indices live in the vertex's BoneId array.
    switch (pVertex->n)
    {
    case 1:
        // Single influence: nothing to blend, copy the bone matrix.
        matObj = *BoneMatrixPalette[pVertex->BoneId[0]];
        break;

    case 2:
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        Collapse2MatSSE(pMat0, pMat1,
                        pVertex->Weight[0], pVertex->Weight[1],
                        matObj.ToFloatPtr());
        break;
    }

    case 3:
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        float* pMat2 = BoneMatrixPalette[pVertex->BoneId[2]]->ToFloatPtr();
        Collapse3MatSSE(pMat0, pMat1, pMat2,
                        pVertex->Weight[0], pVertex->Weight[1], pVertex->Weight[2],
                        matObj.ToFloatPtr());
        break;
    }

    case 4:
    {
        float* pMat0 = BoneMatrixPalette[pVertex->BoneId[0]]->ToFloatPtr();
        float* pMat1 = BoneMatrixPalette[pVertex->BoneId[1]]->ToFloatPtr();
        float* pMat2 = BoneMatrixPalette[pVertex->BoneId[2]]->ToFloatPtr();
        float* pMat3 = BoneMatrixPalette[pVertex->BoneId[3]]->ToFloatPtr();
        // This branch reads Weight[3], so Weight must hold at least four entries.
        Collapse4MatSSE(pMat0, pMat1, pMat2, pMat3,
                        pVertex->Weight[0], pVertex->Weight[1],
                        pVertex->Weight[2], pVertex->Weight[3],
                        matObj.ToFloatPtr());
        break;
    }

    default:
        // A vertex must be skinned to between 1 and 4 bones.
        assert(0);
        break;
    }
}
得到matObj后分别对顶点位置、法线等做变换就可以了。这一过程仍然可以用SSE加速,但这需要顶点
的位置、法线等均是16字节对齐的。这需要较大改动,因此我们没有做。
经profile,加速后CPU蒙皮的速度提升了1倍。
主要参考
Optimized CPU-based Skinning for 3D Games
下面两篇来自id,更加变态的优化
Fast Skinning
The Skeleton Assembly Line
- 用SSE加速CPU蒙皮计算
- 用SSE加速CPU蒙皮计算
- SSE加速实战之 二阶范数计算
- Ogre渲染优化心得(五) -- 用硬件蒙皮代替软件蒙皮,用GPU代替CPU
- Ogre渲染优化心得(五) -- 用硬件蒙皮代替软件蒙皮,用GPU代替CPU
- Ogre 用硬件蒙皮代替软件蒙皮,用GPU代替CPU .
- memchr SSE 加速
- SSE 加速初学
- SSE加速指令集
- SSE指令集加速运算
- qt sse指令加速配置
- 多核CPU加速并行计算的快捷开发和应用
- 蒙皮
- sse指令加速例子-无对比
- CPU 指令集加速
- 检测CPU支持的指令SSE
- 检测CPU支持的指令SSE
- 自己实现用FFT加速多项式计算
- noip2016 Day2 T2:蚯蚓 (归并)
- 二 d3.数据绑定
- ubuntu 12.04 GDB调试常用指令
- 关于Drawable设置到控件中不显示的问题
- https 客户端与服务器端交互过程
- 用SSE加速CPU蒙皮计算
- hadoop实战(一) hadoop基本概念
- 数据库升级
- Git基础(1)
- cookie 和session 的区别详解
- 【codeforce508C】. Anya and Ghosts
- git 新建分支并提交本地代码到远程分支
- elasticsearch mapping
- JDK动态代理的实现及原理