sse2
来源:互联网 发布:太上老君 知乎 编辑:程序博客网 时间:2024/05/12 18:23
在网上找到一个SSE的示例程序:http://www.codeproject.com/Articles/4522/Introduction-to-SSE-Programming 。
对应的sse程序如下:
fResult[i] = sqrt( fSource1[i]*fSource1[i] + fSource2[i]*fSource2[i] ) + 0.5,  i = 0, 1, 2 ... ARRAY_SIZE-1
C++ function:
// Scalar reference implementation.
// For every index idx in [0, nSize):
//     pResult[idx] = sqrt(pArray1[idx]^2 + pArray2[idx]^2) + 0.5
// Nothing is written when nSize <= 0.
void CSSETestDlg::ComputeArrayCPlusPlus(
    float* pArray1,   // [in] first source array
    float* pArray2,   // [in] second source array
    float* pResult,   // [out] result array
    int nSize)        // [in] size of all arrays
{
    for (int idx = 0; idx < nSize; ++idx)
    {
        // Read both inputs before writing so an output that aliases an
        // input still sees the original values for this element.
        const float a = pArray1[idx];
        const float b = pArray2[idx];

        pResult[idx] = static_cast<float>(sqrt(a * a + b * b)) + 0.5f;
    }
}
C++ function with SSE Intrinsics:
// SSE implementation of
//     pResult[i] = sqrt(pArray1[i]^2 + pArray2[i]^2) + 0.5,  i in [0, nSize)
//
// The main loop handles four floats per iteration with packed SSE
// instructions. Two problems of the straightforward "cast to __m128* and
// loop nSize/4 times" version are addressed here:
//   * Unaligned load/store intrinsics are used, so the arrays no longer have
//     to be 16-byte aligned (dereferencing a misaligned __m128* faults).
//   * The last nSize % 4 elements are processed with scalar SSE instructions
//     instead of being silently left unwritten.
// Nothing is written when nSize <= 0.
void CSSETestDlg::ComputeArrayCPlusPlusSSE(
    float* pArray1,   // [in] first source array
    float* pArray2,   // [in] second source array
    float* pResult,   // [out] result array
    int nSize)        // [in] size of all arrays
{
    const __m128 m0_5 = _mm_set_ps1(0.5f);   // m0_5[0, 1, 2, 3] = 0.5

    // Largest multiple of 4 that does not exceed nSize (0 for nSize <= 0).
    const int nVectorEnd = (nSize > 0) ? nSize - (nSize % 4) : 0;

    int i = 0;

    // Packed part: four elements per iteration.
    for (; i < nVectorEnd; i += 4)
    {
        const __m128 a = _mm_loadu_ps(pArray1 + i);
        const __m128 b = _mm_loadu_ps(pArray2 + i);

        const __m128 sumSq = _mm_add_ps(_mm_mul_ps(a, a), _mm_mul_ps(b, b));

        _mm_storeu_ps(pResult + i, _mm_add_ps(_mm_sqrt_ps(sumSq), m0_5));
    }

    // Tail: the remaining 0..3 elements, one at a time. The *_ss intrinsics
    // operate on the lowest lane only, which is the one that is stored.
    for (; i < nSize; ++i)
    {
        const __m128 a = _mm_load_ss(pArray1 + i);
        const __m128 b = _mm_load_ss(pArray2 + i);

        const __m128 sumSq = _mm_add_ss(_mm_mul_ss(a, a), _mm_mul_ss(b, b));

        _mm_store_ss(pResult + i, _mm_add_ss(_mm_sqrt_ss(sumSq), m0_5));
    }
}
OpenCV中有一段计算AD(absolute difference,绝对差)的程序如下:
// Accumulates the absolute difference |prow1[x] - prow2[width-1-x + d]| into
// cost[x*D + d] for every x in [minX1, maxX1) and every d in [minD, maxD).
// The outer loop runs cn*2 times and advances prow1/prow2 by `width`
// elements each time (presumably one plane per iteration - confirm against
// the enclosing function, which is not shown here).
for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
{
for( x = minX1; x < maxX1; x++ )
{
int u = prow1[x];
#if CV_SSE2
if( useSIMD )
{
// _u: the low 8 bits of u replicated into all 16 byte lanes.
// z : all-zero register, used below to widen bytes to 16-bit lanes.
__m128i _u = _mm_set1_epi8(u), z = _mm_setzero_si128();
// 16 values of d are handled per iteration.
// NOTE(review): there is no scalar tail in this branch, so it appears to
// rely on (maxD - minD) being a multiple of 16 - confirm where useSIMD is set.
for( int d = minD; d < maxD; d += 16 )
{
// Unaligned load of the 16 bytes of prow2 matching d .. d+15.
__m128i _v = _mm_loadu_si128((const __m128i*)(prow2 + width-1-x + d));
// Unsigned saturating subtraction in both directions: one of the two
// results is 0 in every lane, so their sum is |u - v| per byte.
__m128i diff = _mm_adds_epu8(_mm_subs_epu8(_u,_v), _mm_subs_epu8(_v,_u));
// Aligned loads: cost + x*D + d has to be 16-byte aligned. The second
// register starts 8 elements later, which implies CostType is 16 bits wide.
__m128i c0 = _mm_load_si128((__m128i*)(cost + x*D + d));
__m128i c1 = _mm_load_si128((__m128i*)(cost + x*D + d + 8));
// Interleaving with z zero-extends the low / high 8 bytes of diff to
// 16-bit lanes; they are added to the cost with signed 16-bit saturation.
_mm_store_si128((__m128i*)(cost + x*D + d), _mm_adds_epi16(c0, _mm_unpacklo_epi8(diff,z)));
_mm_store_si128((__m128i*)(cost + x*D + d + 8), _mm_adds_epi16(c1, _mm_unpackhi_epi8(diff,z)));
}
}
else
#endif
{
// Scalar path: same accumulation, one value of d at a time. The sum is
// converted with a plain cast to CostType here (no saturation, unlike
// the _mm_adds_epi16 used in the SIMD branch).
for( int d = minD; d < maxD; d++ )
{
int v = prow2[width-1-x + d];
cost[x*D + d] = (CostType)(cost[x*D + d] + (CostType)std::abs(u - v));
}
}
}
}
可以看出,第一段程序每次循环处理4个数据(float是32位的,128/32=4),第二段程序每次循环处理16个数据,因为图像数据是8位的(128/8=16)。
- sse2
- SSE2
- SSE2 Optimization
- SSE2 memcpy
- SSE/SSE2
- SSE2 memcpy
- SSE2 memcpy
- SSE2加速
- SSE2 指令
- SSE2初学指南
- SSE2初学指南
- SSE/SSE2版ceilf
- MMX,SSE,SSE2扫盲
- SSE2初学指南
- sse2指令集
- XMM SSE2浮点指令
- SSE2指令集
- SSE2指令集
- Linux 条件变量使用示例
- Asp.net(c#)常用文件操作类封装 移动 复制 删除 上传 下载等
- VC 注册表编程(二)
- 64位Windows7、Vista的IIS中网站数据库连接失败的解决办法
- hbase配置lzo压缩
- sse2
- Dive into Zend_Config Deeply
- ODbgScript 入门系列(二) ODbgScript的命令
- Winform中通过一个字符串定位到和字符串相等ID的控件(将字符串转换成相应的控件名称)
- UITableView相关
- python-gtk-3-tutorial(2)--"Print hello world"
- spring 事务管理配置总结
- gdal C#编译
- Sharp将在CMOS图像传感器上发力