Assembly x64 Intro - SSE2 IDCT

来源:互联网 发布:杨君优化人生全本 编辑:程序博客网 时间:2024/05/18 03:33


/*
 * Runs WelsIDctT4Rec_c four times, once per 4x4 quadrant of an 8x8 area.
 *
 * Quadrants are visited in the order top-left, top-right, bottom-left,
 * bottom-right. For every call the coefficient pointer moves forward by
 * 16 int16_t entries, the right-hand quadrants start 4 bytes into the row,
 * and the bottom quadrants start 4 rows (4 * stride bytes) further down.
 *
 *   pRec        - base of the output area
 *   iStride     - row pitch of pRec, in bytes
 *   pPred       - base of the prediction area
 *   iPredStride - row pitch of pPred, in bytes
 *   pDct        - coefficients, 16 per quadrant, 64 in total
 */
void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
  /* Byte distance from the top quadrant row to the bottom quadrant row. */
  const int32_t kiRecRowStep  = iStride << 2;
  const int32_t kiPredRowStep = iPredStride << 2;

  uint8_t* pRecBottom  = pRec + kiRecRowStep;
  uint8_t* pPredBottom = pPred + kiPredRowStep;

  /* Top pair of quadrants. */
  WelsIDctT4Rec_c (pRec,     iStride, pPred,     iPredStride, pDct);
  WelsIDctT4Rec_c (pRec + 4, iStride, pPred + 4, iPredStride, pDct + 16);

  /* Bottom pair of quadrants. */
  WelsIDctT4Rec_c (pRecBottom,     iStride, pPredBottom,     iPredStride, pDct + 32);
  WelsIDctT4Rec_c (pRecBottom + 4, iStride, pPredBottom + 4, iPredStride, pDct + 48);
}


;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;
; SSE2 routine organised as two identical passes. Each pass:
;   1. loads coefficients from [r4] with SSE2_Load4x8p,
;   2. runs SSE2_TransTwo4x4W + SSE2_IDCT twice,
;   3. issues four SSE2_StoreDiff8p stores, one per row, addressing the
;      rows through r0/r1 (destination) and r2/r3 (prediction).
; Between the passes r4 is advanced by 64 bytes and r0/r2 are moved on to
; the next group of four rows, so eight rows are written in total.
;
; Register use, as far as visible here (r0..r4 are presumably bound to the
; five arguments, in order, by LOAD_5_PARA -- macro defined elsewhere,
; TODO confirm):
;   r0 = rec, r1 = stride, r2 = pred, r3 = pred_stride, r4 = rs
;
; NOTE(review): every upper-case name below is a macro defined outside this
; listing; comments about what the macros do are assumptions based on their
; names and operands and should be checked against the macro definitions.
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    ; The two strides are 32-bit ints in the prototype; they are used in
    ; address expressions below, so they are presumably widened to full
    ; register width here (confirm SIGN_EXTENSION definition).
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ;Load 4x8
    ; ---- pass 1: first four rows -------------------------------------
    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5

    ; Transpose / transform / transpose / transform. The register order
    ; passed to each macro differs between the two rounds, so the order of
    ; these four lines must not be changed.
    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    ; xmm7 and xmm6 are handed to every SSE2_StoreDiff8p below; presumably
    ; xmm7 = all zero and xmm6 = words of 32 (rounding term) -- confirm.
    WELS_Zero           xmm7
    WELS_DW32           xmm6

    ; Rows 0 and 1 (results in xmm4, xmm0).
    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
    ; Step both pointers down two rows.
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    ; Rows 2 and 3 (results in xmm1, xmm2).
    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]

    ; ---- pass 2: next four rows --------------------------------------
    ; Advance the coefficient pointer by 64 bytes (32 int16_t values) and
    ; move rec/pred down two more rows so they point at row 4.
    add     r4, 64
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5

    ; Same macro sequence as pass 1.
    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    ; xmm6 is also an operand of SSE2_IDCT above, so both helper registers
    ; are set up again before the stores.
    WELS_Zero           xmm7
    WELS_DW32           xmm6

    ; Rows 4 and 5.
    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    ; Rows 6 and 7.
    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],   [r2 + r3]
    ; Epilogue: counterparts of PUSH_XMM / LOAD_5_PARA in the prologue.
    POP_XMM
    LOAD_5_PARA_POP
    ; NOTE(review): the two commented-out pops below look like leftovers
    ; from an older hand-written epilogue; they are not assembled.
    ; pop        esi
    ; pop        ebx
    ret



0 0
原创粉丝点击