Assembly x64 Intro - SSE2 IDCT I16X16 DC

来源:互联网 发布:杨君优化人生全本 编辑:程序博客网 时间:2024/05/29 10:41


/*
 * pfIDctI16x16Dc: do luma idct of an MB for I16x16 mode, when only dc value are non-zero
 */
void WelsIDctRecI16x16Dc_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDctDc)

{
  int32_t i, j;

  for (i = 0; i < 16; i ++) {
    for (j = 0; j < 16; j++) {
      pRec[j] = WelsClip1 (pPred[j] + ((pDctDc[ (i & 0x0C) + (j >> 2)] + 32) >> 6));
    }
    pRec += iStride;
    pPred += iPredStride;
  }
}



 ;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor        xmm7,       xmm7
    WELS_DW32   xmm6

    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    lea         r0,     [r0 + 2 * r1]
    lea         r2,     [r2 + 2 * r3]
    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
    POP_XMM
    LOAD_5_PARA_POP
    ret



0 0
原创粉丝点击