MMX Intrinsics各函数介绍

来源：互联网发布：小清新治愈电影知乎编辑：程序博客网时间：2024/06/06 03:11

原文地址：http://blog.csdn.net/fengbingchun/article/details/20228667

SIMD相关头文件包括：

//#include <ivec.h>//MMX//#include <fvec.h>//SSE(also include ivec.h)//#include <dvec.h>//SSE2(also include fvec.h)#include <mmintrin.h> //MMX#include <xmmintrin.h> //SSE(include mmintrin.h)#include <emmintrin.h> //SSE2(include xmmintrin.h)#include <pmmintrin.h> //SSE3(include emmintrin.h)#include <tmmintrin.h>//SSSE3(include pmmintrin.h)#include <smmintrin.h>//SSE4.1(include tmmintrin.h)#include <nmmintrin.h>//SSE4.2(include smmintrin.h)#include <wmmintrin.h>//AES(include nmmintrin.h)#include <immintrin.h>//AVX(include wmmintrin.h)#include <intrin.h>//(include immintrin.h)

mmintrin.h为MMX头文件，其中__m64的定义为：

typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64{    unsigned __int64    m64_u64;    float               m64_f32[2];    __int8              m64_i8[8];    __int16             m64_i16[4];    __int32             m64_i32[2];        __int64             m64_i64;    unsigned __int8     m64_u8[8];    unsigned __int16    m64_u16[4];    unsigned __int32    m64_u32[2];} __m64;

mmintrin.h文件中各函数的介绍

/* General support intrinsics *///Empties the multimedia state,清除MMX寄存器中的内容，即初始化(以避免和浮点数//操作发生冲突)，详细说明见参考文献1void  _m_empty(void);//_mm_empty//Converts the integer object _I to a 64-bit __m64 object, r0=_I, r1=0__m64 _m_from_int(int _I);//_mm_cvtsi32_si64//Converts the lower 32 bits of the __m64 object _M to an integer, r=_M0int   _m_to_int(__m64 _M);//_mm_cvtsi64_si32//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of//the result with signed saturation, and packs the four 16-bit values from _MM2//into the upper four 8-bit values of the result with signed saturation__m64 _m_packsswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pi16//Packs the two 32-bit values from _MM1 into the lower two 16-bit values of the// result with signed saturation, and packs the two 32-bit values from _MM2 into// the upper two 16-bit values of the result with signed saturation__m64 _m_packssdw(__m64 _MM1, __m64 _MM2);//_mm_packs_pi32//Packs the four 16-bit values from _MM1 into the lower four 8-bit values of the//result with unsigned saturation, and packs the four 16-bit values from _MM2 into//the upper four 8-bit values of the result with unsigned saturation__m64 _m_packuswb(__m64 _MM1, __m64 _MM2);//_mm_packs_pu16//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),//则r=(_MM14, _MM24, _MM15, _MM25, _MM16, _MM26, _MM17, _MM27)__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi8 //_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位，_MM2=(_MM20, _MM21, _MM22, _MM23),//则r=(_MM12, _MM22, _MM13, _MM23)__m64 _m_punpckhwd(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi16//MM1=(_MM10, _MM11),_MM10为低位，_MM2=(_MM20, _MM21),则r=(_MM11, _MM21)__m64 _m_punpckhdq(__m64 _MM1, __m64 _MM2);//_mm_unpackhi_pi32//_MM1=(_MM10, _MM11, _MM12, _MM13, _MM14, _MM15, _MM16, _MM17),//_MM2=(_MM20, _MM21, _MM22, _MM23, _MM24, _MM25, _MM26, _MM27),//则r=(_MM10, _MM20, _MM11, _MM21, _MM12, _MM22, _MM13, _MM23)__m64 _m_punpcklbw(__m64 _MM1, __m64 _MM2);//_mm_unpacklo_pi8//_MM1=(_MM10, _MM11, _MM12, _MM13),_MM10为低位，_MM2=(_MM20, _MM21, _MM22, _MM23),//则r=(_MM10, _MM20, _MM11, _MM21)__m64 _m_punpcklwd(__m64 _MM1, __m64 _MM2);//_mm_unpacklo_pi16//MM1=(_MM10, _MM11),_MM10为低位，_MM2=(_MM20, _MM21),则r=(_MM10, _MM20)__m64 _m_punpckldq(__m64 _MM1, __m64 _MM2);//mm_unpacklo_pi32/* Packed arithmetic intrinsics *///Adds the eight 8-bit values in _MM1 to the eight 8-bit values in _MM2__m64 _m_paddb(__m64 _MM1, __m64 _MM2);//_mm_add_pi8//Adds the four 16-bit values in _MM1 to the four 16-bit values in _MM2__m64 _m_paddw(__m64 _MM1, __m64 _MM2);//_mm_add_pi16//Adds the two 32-bit values in _MM1 to the two 32-bit values in _MM2__m64 _m_paddd(__m64 _MM1, __m64 _MM2);//_mm_add_pi32//Adds the eight signed 8-bit values in _MM1 to the eight signed 8-bit values in _MM2//and saturates__m64 _m_paddsb(__m64 _MM1, __m64 _MM2);//_mm_adds_pi8//Adds the four signed 16-bit values in _MM1 to the four signed 16-bit values in _MM2//and saturates__m64 _m_paddsw(__m64 _MM1, __m64 _MM2);//_mm_adds_pi16//Adds the eight unsigned 8-bit values in _MM1 to the eight unsigned 8-bit values //in _MM2 and saturates__m64 _m_paddusb(__m64 _MM1, __m64 _MM2);//_mm_adds_pu8//Add the four unsigned 16-bit values in _MM1 to the four unsigned 16-bit values //in _MM2 and saturates__m64 _m_paddusw(__m64 _MM1, __m64 _MM2);//_mm_adds_pu16//Subtracts the eight 8-bit values in _MM2 from the eight 8-bit values in _MM1__m64 _m_psubb(__m64 _MM1, __m64 _MM2);//_mm_sub_pi8 //Subtracts the four 16-bit values in _MM2 from the four 16-bit values in _MM1__m64 _m_psubw(__m64 _MM1, __m64 _MM2);//_mm_sub_pi16//Subtracts the two 32-bit values in _MM2 from the two 32-bit values in _MM1__m64 _m_psubd(__m64 _MM1, __m64 _MM2);//_mm_sub_pi32//Subtracts the eight signed 8-bit values in _MM2 from the eight signed 8-bit//values in _MM1 and saturates__m64 _m_psubsb(__m64 _MM1, __m64 _MM2);//_mm_subs_pi8//Subtracts the four signed 16-bit values in _MM2 from the four signed 16-bit//values in _MM1 and saturates__m64 _m_psubsw(__m64 _MM1, __m64 _MM2);//_mm_subs_pi16//Subtracts the eight unsigned 8-bit values in _MM2 from the eight unsigned 8-bit//values in _MM1 and saturates__m64 _m_psubusb(__m64 _MM1, __m64 _MM2);//_mm_subs_pu8//Subtracts the four unsigned 16-bit values in _MM2 from the four unsigned 16-bit//values in _MM1 and saturates__m64 _m_psubusw(__m64 _MM1, __m64 _MM2);//_mm_subs_pu16//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 to produce//four 32-bit intermediate results, which are then summed by pairs to produce two//32-bit results,r0=_MM10*_MM20+_MM11*_MM21, r1=_MM12*_MM22+_MM13*_MM23__m64 _m_pmaddwd(__m64 _MM1, __m64 _MM2);//_mm_madd_pi16//Multiplies four signed 16-bit values in _MM1 by four signed 16-bit values in _MM2//and produces the high 16 bits of the four results__m64 _m_pmulhw(__m64 _MM1, __m64 _MM2);//_mm_mulhi_pi16//Multiplies four 16-bit values in _MM1 by four 16-bit values in _MM2 and produces//the low 16 bits of the four results__m64 _m_pmullw(__m64 _MM1, __m64 _MM2);//_mm_mullo_pi16/* Shift intrinsics *///Shifts four 16-bit values in _M left the amount specified by _Count //while shifting in zeros,左移_Count位，移出位补0__m64 _m_psllw(__m64 _M, __m64 _Count);//_mm_sll_pi16//Shifts four 16-bit values in _M left the amount specified by _Ccount while //shifting in zeros,左移_Count位，移出位补0,_Count需是一个立即数//汇编语言中的立即数相当于高级语言中的常量(常数)，它是直接出现在指令中的数，//不用存储在寄存器或存储器中的数__m64 _m_psllwi(__m64 _M, int _Count);//_mm_slli_pi16 //Shifts two 32-bit values in _M left the amount specified by _Count//while shifting in zeros__m64 _m_pslld(__m64 _M, __m64 _Count);//_mm_sll_pi32//Shifts two 32-bit values in _M left the amount specified by _Count//while shifting in zeros__m64 _m_pslldi(__m64 _M, int _Count);//_mm_slli_pi32//Shifts the 64-bit value in _M left the amount specified by _Count//while shifting in zeros__m64 _m_psllq(__m64 _M, __m64 _Count);//_mm_sll_si64//Shifts the 64-bit value in _M left the amount specified by _Count//while shifting in zeros__m64 _m_psllqi(__m64 _M, int _Count);//_mm_slli_si64//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psraw(__m64 _M, __m64 _Count);//_mm_sra_pi16//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psrawi(__m64 _M, int _Count);//_mm_srai_pi16//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psrad(__m64 _M, __m64 _Count);//_mm_sra_pi32//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in the sign bit__m64 _m_psradi(__m64 _M, int _Count);//_mm_srai_pi32//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlw(__m64 _M, __m64 _Count);//_mm_srl_pi16//Shifts four 16-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlwi(__m64 _M, int _Count);//_mm_srli_pi16//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrld(__m64 _M, __m64 _Count);//_mm_srl_pi32//Shifts two 32-bit values in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrldi(__m64 _M, int _Count);//_mm_srli_pi32 //Shifts the 64-bit value in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlq(__m64 _M, __m64 _Count);//_mm_srl_si64//Shifts the 64-bit value in _M right the amount specified by _Count//while shifting in zeros__m64 _m_psrlqi(__m64 _M, int _Count);//_mm_srli_si64/* Logical intrinsics *///Performs a bitwise AND of the 64-bit value in _MM1 with the 64-bit value in _MM2__m64 _m_pand(__m64 _MM1, __m64 _MM2);//_mm_and_si64//Performs a logical NOT on the 64-bit value in _MM1 and use the result in a //bitwise AND with the 64-bit value in _MM2__m64 _m_pandn(__m64 _MM1, __m64 _MM2);//_mm_andnot_si64//Performs a bitwise OR of the 64-bit value in _MM1 with the 64-bit value in _MM2__m64 _m_por(__m64 _MM1, __m64 _MM2);//_mm_or_si64//Performs a bitwise XOR of the 64-bit value in _MM1 with the 64-bit value in _MM2__m64 _m_pxor(__m64 _MM1, __m64 _MM2);//_mm_xor_si64/* Comparison intrinsics *///If the respective 8-bit values in _MM1 are equal to the respective //8-bit values in _MM2, sets the respective 8-bit resulting values to //all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi8//If the respective 16-bit values in _MM1 are equal to the respective //16-bit values in _MM2, sets the respective 16-bit resulting values //to all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqw(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi16//If the respective 32-bit values in _MM1 are equal to the respective //32-bit values in _MM2, sets the respective 32-bit resulting values//to all ones; otherwise, sets them to all zeros__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2);//_mm_cmpeq_pi32 //If the respective 8-bit values in _MM1 are greater than the respective //8-bit values in _MM2, sets the respective 8-bit resulting values to all ones;//otherwise, sets them to all zeros__m64 _m_pcmpgtb(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi8//If the respective 16-bit values in _MM1 are greater than the respective 16-bit//values in _MM2, sets the respective 16-bit resulting values to all ones;//otherwise, sets them to all zeros__m64 _m_pcmpgtw(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi16//If the respective 32-bit values in _MM1 are greater than the respective 32-bit//values in _MM2, sets the respective 32-bit resulting values to all ones;//otherwise, sets them all to zeros__m64 _m_pcmpgtd(__m64 _MM1, __m64 _MM2);//_mm_cmpgt_pi32/* Utility intrinsics *///Sets the 64-bit value to zero__m64 _mm_setzero_si64(void);//Sets the two signed 32-bit integer values,r0=_I0, r1=_I1__m64 _mm_set_pi32(int _I1, int _I0);//r0=_S0, r1=_S1, r2=_S2, r3=_S3__m64 _mm_set_pi16(short _S3, short _S2, short _S1, short _S0);//r0=_B0, r1=_B1, r2=_B2, r3=_B3, r4=_B4, ..., r7=_B7__m64 _mm_set_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);//Sets the two signed 32-bit integer values to _I,r0=r1=_I__m64 _mm_set1_pi32(int _I);//Sets the four signed 16-bit integer values to _S, r0=r1=r2=r3=_S__m64 _mm_set1_pi16(short _S);//Sets the eight signed 8-bit integer values to _B, r0=r1...=r7=_B__m64 _mm_set1_pi8(char _B);//Sets the two signed 32-bit integer values in reverse order,r0=_I1, r1=_I0__m64 _mm_setr_pi32(int _I1, int _I0);//Sets the four signed 16-bit integer values in reverse order,//r0=_S3, r1=_S2, r2=_S1, r3=_S0__m64 _mm_setr_pi16(short _S3, short _S2, short _S1, short _S0);//Sets the eight signed 8-bit integer values in reverse order//r0=_B7, r1=_B6, r2=_B5, r3=_B4, r4=_B3, r5=_B2, r6=_B1, r7=_B0__m64 _mm_setr_pi8(char _B7, char _B6, char _B5, char _B4,char _B3, char _B2, char _B1, char _B0);/* Alternate intrinsic name definitions */#define _mm_empty         _m_empty#define _mm_cvtsi32_si64  _m_from_int#define _mm_cvtsi64_si32  _m_to_int#define _mm_packs_pi16    _m_packsswb#define _mm_packs_pi32    _m_packssdw#define _mm_packs_pu16    _m_packuswb#define _mm_unpackhi_pi8  _m_punpckhbw#define _mm_unpackhi_pi16 _m_punpckhwd#define _mm_unpackhi_pi32 _m_punpckhdq#define _mm_unpacklo_pi8  _m_punpcklbw#define _mm_unpacklo_pi16 _m_punpcklwd#define _mm_unpacklo_pi32 _m_punpckldq#define _mm_add_pi8       _m_paddb#define _mm_add_pi16      _m_paddw#define _mm_add_pi32      _m_paddd#define _mm_adds_pi8      _m_paddsb#define _mm_adds_pi16     _m_paddsw#define _mm_adds_pu8      _m_paddusb#define _mm_adds_pu16     _m_paddusw#define _mm_sub_pi8       _m_psubb#define _mm_sub_pi16      _m_psubw#define _mm_sub_pi32      _m_psubd#define _mm_subs_pi8      _m_psubsb#define _mm_subs_pi16     _m_psubsw#define _mm_subs_pu8      _m_psubusb#define _mm_subs_pu16     _m_psubusw#define _mm_madd_pi16     _m_pmaddwd#define _mm_mulhi_pi16    _m_pmulhw#define _mm_mullo_pi16    _m_pmullw#define _mm_sll_pi16      _m_psllw#define _mm_slli_pi16     _m_psllwi#define _mm_sll_pi32      _m_pslld#define _mm_slli_pi32     _m_pslldi#define _mm_sll_si64      _m_psllq#define _mm_slli_si64     _m_psllqi#define _mm_sra_pi16      _m_psraw#define _mm_srai_pi16     _m_psrawi#define _mm_sra_pi32      _m_psrad#define _mm_srai_pi32     _m_psradi#define _mm_srl_pi16      _m_psrlw#define _mm_srli_pi16     _m_psrlwi#define _mm_srl_pi32      _m_psrld#define _mm_srli_pi32     _m_psrldi#define _mm_srl_si64      _m_psrlq#define _mm_srli_si64     _m_psrlqi#define _mm_and_si64      _m_pand#define _mm_andnot_si64   _m_pandn#define _mm_or_si64       _m_por#define _mm_xor_si64      _m_pxor#define _mm_cmpeq_pi8     _m_pcmpeqb#define _mm_cmpeq_pi16    _m_pcmpeqw#define _mm_cmpeq_pi32    _m_pcmpeqd#define _mm_cmpgt_pi8     _m_pcmpgtb#define _mm_cmpgt_pi16    _m_pcmpgtw#define _mm_cmpgt_pi32    _m_pcmpgtd

参考文献：1、http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/intref_cls/common/intref_mmx_emms_usage.htm

0 0