x264中的汇编:x86inc.asm——001

来源:互联网 发布:重庆交通干部网络学校 编辑:程序博客网 时间:2024/05/18 03:44

http://blog.csdn.net/xiaoyi247/article/details/7904906


     龙哥以前说过,不懂汇编,就别说自己懂264,确实汇编在视频编解码中的作用太大了。在非opencl等显卡并行优化的平台上,SIMD就成了算法并行处理的唯一渠道。整个X264的代码的精华都在那些汇编文件中,当然,所有的算法都有C的实现,但是为什么X264的编码速度能够达到现在的水平,基本决定于它的汇编优化。

      x86inc.asm是x264汇编语言的头文件,和编码算法没有直接的关系,只涉及到跨平台的各种预编译宏。这个文件是为nasm平台写的,所以熟悉WIN32汇编的同学可能对这个文件中的一些代码仍然比较陌生,好在这个文件中包含了很详细的注释,基本上结合注释和nasm的手册的第四章 the nasm preprocessor就能看懂了。其中比较麻烦的就是宏参数的传递,默认参数个数等。

下面是这个文件的一些注释,可能有错误之处,以后改正。。。后面会陆续分析DCT,sad,QUANT,mc,deblock等算法的一些汇编实现。

;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************
;如果定义ARCH_X86_64,则判断操作系统平台,如果__OUTPUT_FORMAT_为win32则定义WIN64,否则定义UNIX64
%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif


; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.


; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
;定义代码段的对齐方式,0-1表示可以有0个或者1个参数,如果0个参数的时候%1默认是16,即16位对齐
;.text段可读写,.rodata是只读段 noexec nowrite
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %else
        SECTION .rodata align=%1
    %endif
%endmacro


; PIC support macros.
; x86_64 can't fit 64bit address literals in most instruction types,
; so shared objects (under the assumption that they might be anywhere
; in memory) must use an address mode that does fit.
; So all accesses to global variables must use this macro, e.g.
;     mov eax, [foo GLOBAL]
; instead of
;     mov eax, [foo]
;

; PIC参见nasm doc 7.9.3  Position−Independent Code: elf Special Symbols and WRT
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
; 地址无关代码支持宏
%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
    %undef PIC
%endif
%ifdef PIC
    %define GLOBAL wrt rip
%else
    %define GLOBAL
%endif


; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.


; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal


; e.g.
; cglobal foo, 2,3, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)


; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.


; RET:
; Pops anything that was pushed by PROLOGUE


; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.


; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size


;使用6个参数来定义一个寄存器的各种访问
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1  %2
%endmacro


;使用两个参数来定义寄存器的各种访问 比如raxq定义为rax, raxd定义为eax, raxw定义为ax
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1  e%1
%endif
%endmacro


DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl


; t# defines for when per-arch register allocation is more complex than just function arguments


;nasm每次会为%%简历一个不同的名字来替换现有的名字%%i可能会是@123i,123在每次宏被调用的时候都会被修改
;*表示可以多个参数
;定义t0 t1....
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro


;define t1q t1 q 有啥意义?
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro


DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7


%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif


;定义出栈入栈时候的stack_offset的位置
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro


%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro


;定义rsp作为被减数时候stack_offset的位置
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro


;定义rsp作为被加数时候stack_offset的位置
%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro


;如果两个参数不相等,则参数2赋值给参数1
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro


;如果两个参数不相等,则参数2赋值给参数1,并且做符号扩展
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro


%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro


%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif


    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro


%ifdef WIN64 ; Windows x64 ;=================================================
;在win64环境下定义r0--r6的寄存器,r7m,r8m定义为堆栈内存
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]




%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro
;如果需要,将参数load到寄存器中 r4 r5 r6,或者堆栈中
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if %0 > 2
        %assign xmm_regs_used %3
    %else
        %assign xmm_regs_used 0
    %endif
    ASSERT xmm_regs_used <= 16
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro


;将多余6个寄存器的堆栈中数据存储到xmm寄存器中
%macro RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro


%macro RESTORE_XMM 1
    RESTORE_XMM_INTERNAL %1
    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
    %assign xmm_regs_used 0
%endmacro


;返回指令
%macro RET 0
    RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro


%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro


%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
;Linux 64位寄存器的定义
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]


%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

;将堆栈参数放入参数寄存器中,定义要使用的寄存器
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro


%macro RET 0
    ret
%endmacro


%macro REP_RET 0
    rep ret
%endmacro


%else ; X86_32 ;==============================================================
;32为寄存器的定义


DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp


%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro


%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro


%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro


%macro PROLOGUE 2-4+ ; #args, #regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro


%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro


%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro


%endif ;======================================================================






;=============================================================================
; arch-independent part
;=============================================================================


%assign function_align 16
; 定义C语言能够连接的符号前缀
; Symbol prefix for C linkage
%macro cglobal 1-2+
    %ifdef PREFIX
        %xdefine %1.skip_prologue _%1.skip_prologue
        %xdefine %1 _%1
    %endif
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1

;如果函数带有参数,则定义PROLOGURE,指定参数和寄存器个数,PROLOGURE参考之前的定义
        PROLOGUE %2
    %endif
%endmacro
;定义外部函数
%macro cextern 1
    %ifdef PREFIX
        %xdefine %1 _%1
    %endif
    extern %1
%endmacro


; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif


%assign FENC_STRIDE 16
%assign FDEC_STRIDE 32


; merge mmx and sse*
;合并mmx和sse寄存器
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro


%macro CAT_UNDEF 2
    %undef %1%2
%endmacro


;将mmx的寄存器定义为m0...m7
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8

;如果是MMX,则将mova等指令定义为MMX中的指令,和SSE兼容
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnt movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
%endmacro


;将sse的寄存器定义为m0..m7
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnt movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro


INIT_MMX


; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.


%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro


%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro


%macro SAVE_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE %1_m, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro


%macro LOAD_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, %1_m %+ %%i
    CAT_XDEFINE n, m %+ %%i, %%i
    %assign %%i %%i+1
    %endrep
%endmacro


%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro


;Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro


%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro




原创粉丝点击