sse2

来源:互联网 发布:太上老君 知乎 编辑:程序博客网 时间:2024/05/12 18:23

在网上找到一个 SSE 的示例程序:http://www.codeproject.com/Articles/4522/Introduction-to-SSE-Programming

对应的sse程序如下:

fResult[i] = sqrt( fSource1[i]*fSource1[i] + fSource2[i]*fSource2[i] ) + 0.5,  i = 0, 1, 2 ... ARRAY_SIZE-1

C++ function:

// Scalar reference implementation.
// For every index in [0, nSize) stores
//     sqrt(pArray1[idx]^2 + pArray2[idx]^2) + 0.5
// into pResult[idx]. Nothing is written when nSize <= 0.
void CSSETestDlg::ComputeArrayCPlusPlus(
          float* pArray1,                   // [in] first source array
          float* pArray2,                   // [in] second source array
          float* pResult,                   // [out] result array
          int nSize)                        // [in] size of all arrays
{
    for ( int idx = 0; idx < nSize; ++idx )
    {
        // Read both inputs before writing so the result slot may alias an input slot.
        const float a = pArray1[idx];
        const float b = pArray2[idx];

        pResult[idx] = static_cast<float>(sqrt(a * a + b * b)) + 0.5f;
    }
}

C++ function with SSE Intrinsics:

// SSE implementation of
//     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5,  i in [0, nSize)
//
// The bulk of the data is processed four floats at a time. Unaligned
// loads/stores are used, so the arrays are not required to be 16-byte
// aligned. The trailing nSize % 4 elements are handled one at a time with
// the scalar (_ss) SSE instructions, so every element in [0, nSize) is
// computed, matching the plain C++ version. Nothing is written when
// nSize <= 0.
void CSSETestDlg::ComputeArrayCPlusPlusSSE(
          float* pArray1,                   // [in] first source array
          float* pArray2,                   // [in] second source array
          float* pResult,                   // [out] result array
          int nSize)                        // [in] size of all arrays
{
    const __m128 m0_5 = _mm_set_ps1(0.5f);  // m0_5[0, 1, 2, 3] = 0.5

    // Largest multiple of 4 that fits in nSize (0 for non-positive sizes).
    const int nVectorEnd = (nSize > 0) ? nSize - (nSize % 4) : 0;

    // Main loop: four elements per iteration.
    for ( int i = 0; i < nVectorEnd; i += 4 )
    {
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);
        const __m128 sumSq = _mm_add_ps(_mm_mul_ps(a, a),   // a*a
                                        _mm_mul_ps(b, b));  // + b*b

        _mm_storeu_ps(pResult + i, _mm_add_ps(_mm_sqrt_ps(sumSq), m0_5));
    }

    // Tail loop: remaining 0..3 elements, using only the lowest lane.
    for ( int i = nVectorEnd; i < nSize; i++ )
    {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);
        const __m128 sumSq = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));

        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumSq), m0_5));
    }
}

OpenCV 中有一段计算 AD(absolute difference,绝对差)代价的程序如下:

// Accumulates absolute-difference (AD) costs into cost[x*D + d].
// The outer loop makes cn*2 passes; after each pass prow1 and prow2 are both
// advanced by `width` elements (presumably to the next channel/row plane of
// each buffer - confirm against the code that fills prow1/prow2).
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
        for( x = minX1; x < maxX1; x++ )
        {
            // Value at position x in the first buffer; compared against one
            // value from the second buffer for every d.
            int u = prow1[x];
        #if CV_SSE2
            if( useSIMD )
            {
                // _u: u replicated into all 16 byte lanes; z: all zeros, used
                // below to widen bytes to 16-bit lanes.
                __m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
               
                // 16 values of d are handled per iteration.
                // NOTE(review): if (maxD - minD) is not a multiple of 16 the
                // last iteration touches entries past maxD - presumably the
                // caller pads/guarantees this; confirm.
                for( int d = minD; d < maxD; d += 16 )
                {
                    // Unaligned load of 16 bytes from the second buffer.
                    __m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
                    // |u - v| per unsigned byte: each saturating subtraction
                    // clamps at 0, so one of the two terms is always 0 and
                    // their sum is the absolute difference.
                    __m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
                    // Aligned loads of the current costs: the addresses must
                    // be 16-byte aligned. The "+ 8" element offset matches a
                    // 128-bit register only for 16-bit cost elements.
                    // NOTE(review): assumes CostType is 16 bits wide - confirm.
                    __m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
                    __m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
                   
                    // Widen the low/high 8 bytes of diff to 16-bit lanes by
                    // interleaving with zero, then add to the costs with
                    // signed 16-bit saturation and store back.
                    _mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
                    _mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
                }
            }
            else
        #endif
            // Scalar path: taken when useSIMD is false, and unconditionally
            // when CV_SSE2 is not enabled (the if/else above is compiled out).
            {
                for( int d = minD; d < maxD; d++ )
                {
                    int v = prow2[width-1-x + d];
                    // cost += |u - v|, converted back to CostType.
                    cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
                }
            }
        }
    }

可以看出,第一段程序每次循环处理 4 个元素(float 是 32 位,128 位寄存器可容纳 4 个),第二段程序每次循环处理 16 个元素(d += 16),因为图像数据是 8 位的(128 位寄存器可容纳 16 个)。

原创粉丝点击