Lab3 ARM指令

来源：互联网发布：网络在线对外汉语教师编辑：程序博客网时间：2024/05/16 19:31

通过C代码和反汇编工具研究ARM指令。

教程目标：

生成了Thumb指令还是ARM指令，如何通过编译参数改变；
对于ARM指令，能否产生条件执行的指令；
设计C的代码场景，观察是否产生了寄存器移位寻址；
设计C的代码场景,观察一个复杂的32位数是如何装载到寄存器的;
写一个C的多重函数调用的程序，观察和分析:
调用时的返回地址在哪里？
传入的参数在哪里？
本地变量的堆栈分配是如何做的？
寄存器是caller保存还是callee保存？是全体保存还是部分保存？
MLA是带累加的乘法，尝试要如何写C的表达式能编译得到MLA指令。

教程器材及软件：

树莓派的板子。
SD卡（已经有镜像刷入）。
电源线及USB充电器。
U盘或USB硬盘
putty和psftp。
有DHCP的网线。

步骤：

首先写一段简单的C代码：

#include <stdio.h>

/*
 * 1.c - smallest sample program of the lab.
 *
 * Stores the 32-bit constant 0x12345678 in a local variable and prints it
 * as a signed decimal, so the generated object code can be inspected with
 * objdump.
 *
 * argc/argv are intentionally unused; they are kept so that main has the
 * conventional two-argument signature.
 *
 * Returns 0 unconditionally.
 */
int main(int argc, char **argv)
{
    int a = 0x12345678;

    printf("a:%d\n", a);
    return 0;
}

如果要将其编译成ARM指令的，那么，默认就好了。然后，再用objdump出来看看。
```
gcc -o 1.o -c 1.c
objdump -d 1.o
```

我们可以看到指令是32位的。

```
1.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <main>:
   0:   e92d4800        push    {fp, lr}
   4:   e28db004        add     fp, sp, #4
   8:   e24dd010        sub     sp, sp, #16
   c:   e50b0010        str     r0, [fp, #-16]
  10:   e50b1014        str     r1, [fp, #-20]
  14:   e59f3020        ldr     r3, [pc, #32]   ; 3c <main+0x3c>
  18:   e50b3008        str     r3, [fp, #-8]
  1c:   e59f301c        ldr     r3, [pc, #28]   ; 40 <main+0x40>
  20:   e1a00003        mov     r0, r3
  24:   e51b1008        ldr     r1, [fp, #-8]
  28:   ebfffffe        bl      0 <printf>
  2c:   e3a03000        mov     r3, #0
  30:   e1a00003        mov     r0, r3
  34:   e24bd004        sub     sp, fp, #4
  38:   e8bd8800        pop     {fp, pc}
  3c:   12345678        .word   0x12345678
  40:   00000000        .word   0x00000000
```

如果要将其编译成Thumb指令的话，就要像下面这样子。如果不加-mfloat-abi=softfp，会报错。这和浮点运算（VFP）的ABI有关系：工具链默认使用硬浮点（hard-float）ABI，而Thumb-1指令集不支持这种ABI，所以要改用softfp。
```
gcc -o 1.o -c 1.c -mthumb -mfloat-abi=softfp
objdump -d 1.o
```

我们可以看到指令是16位的。

```
1.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <main>:
   0:   b580            push    {r7, lr}
   2:   b084            sub     sp, #16
   4:   af00            add     r7, sp, #0
   6:   6078            str     r0, [r7, #4]
   8:   6039            str     r1, [r7, #0]
   a:   4b06            ldr     r3, [pc, #24]   ; (24 <main+0x24>)
   c:   60fb            str     r3, [r7, #12]
   e:   4a06            ldr     r2, [pc, #24]   ; (28 <main+0x28>)
  10:   68fb            ldr     r3, [r7, #12]
  12:   1c10            adds    r0, r2, #0
  14:   1c19            adds    r1, r3, #0
  16:   f7ff fffe       bl      0 <printf>
  1a:   2300            movs    r3, #0
  1c:   1c18            adds    r0, r3, #0
  1e:   46bd            mov     sp, r7
  20:   b004            add     sp, #16
  22:   bd80            pop     {r7, pc}
  24:   12345678        .word   0x12345678
  28:   00000000        .word   0x00000000
```

再写一个程序2.c：

/*
 * 2.c - return the larger of two integers.
 *
 * Returns a when a is strictly greater than b, otherwise b (so b is the
 * result when both are equal).
 */
int max(int a, int b)
{
    if (a > b) {
        return a;
    }
    return b;
}

```
2.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <max>:
   0:   e52db004        push    {fp}            ; (str fp, [sp, #-4]!)
   4:   e28db000        add     fp, sp, #0
   8:   e24dd00c        sub     sp, sp, #12
   c:   e50b0008        str     r0, [fp, #-8]
  10:   e50b100c        str     r1, [fp, #-12]
  14:   e51b2008        ldr     r2, [fp, #-8]
  18:   e51b300c        ldr     r3, [fp, #-12]
  1c:   e1520003        cmp     r2, r3
  20:   da000001        ble     2c <max+0x2c>
  24:   e51b3008        ldr     r3, [fp, #-8]
  28:   ea000000        b       30 <max+0x30>
  2c:   e51b300c        ldr     r3, [fp, #-12]
  30:   e1a00003        mov     r0, r3
  34:   e28bd000        add     sp, fp, #0
  38:   e8bd0800        pop     {fp}
  3c:   e12fff1e        bx      lr
```

上面有条件跳转指令（ble）和无条件跳转指令（b），但是没有条件执行的数据处理指令（例如movge、movlt）。

我们让gcc对代码进行优化：
```
gcc -o 2.o -c 2.c -O1
```

```
2.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <max>:
   0:   e1510000        cmp     r1, r0
   4:   a1a00001        movge   r0, r1
   8:   b1a00000        movlt   r0, r0
   c:   e12fff1e        bx      lr
```

代码就变得非常短了，而且也可以明显的看到，条件执行指令。

写一个简单的程序3.c：

/*
 * 3.c - fetch one element of an int array.
 *
 * p      base of the array (decays to a pointer to its first element)
 * index  element offset from p, in units of int
 *
 * Returns the value stored at p[index]. No bounds checking is performed;
 * the caller must pass an index that lies inside the array.
 */
int fun(int p[], int index)
{
    int *slot = p + index;

    return *slot;
}

```
gcc -o 3.o -c 3.c -O1
objdump -d 3.o

3.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <fun>:
   0:   e7900101        ldr     r0, [r0, r1, lsl #2]
   4:   e12fff1e        bx      lr
```

再写一个简单的程序4.c:
```
/*
 * 4.c - return a fixed 32-bit constant.
 *
 * Takes no arguments and always returns 0x12345678.
 */
int fun(void)
{
    const int value = 0x12345678;

    return value;
}
```
```
gcc -o 4.o -c 4.c -O1
objdump -d 4.o

4.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <fun>:
   0:   e59f0000        ldr     r0, [pc]        ; 8 <fun+0x8>
   4:   e12fff1e        bx      lr
   8:   12345678        .word   0x12345678
```
它的做法很简单，将32位数放在指令的附近，然后load一下就可以了。加上有cache的存在，这样的方案可能比将数字拆分成16位再load进来要快，而且它只执行了1条指令。实验了一下，发现load64位数，它也是将数放在指令附近然后load两次。
再写一个不简单的程序5.c（gcc的优化能力实在是太强了，要写一个程序就看出所有的这些，真是不容易啊。）：

#include <stdio.h>

/*
 * 5.c - nested calls with more arguments than fit in four registers.
 *
 * bb: prints "Hello world!" followed by a newline, then returns the
 * product of its six int arguments. The multiplication is plain int
 * arithmetic, so callers must keep the product within the range of int.
 */
int bb(int a, int b, int c, int d, int e, int f)
{
    printf("Hello world!\n");
    return a * b * c * d * e * f;
}

/*
 * cc: takes eleven int arguments, forms five pairwise sums, calls
 * bb(1, 2, 3, 4, 5, 6) (discarding its result), and then returns
 *
 *     ((a + b) * (c + d) - (e + f) * (g + h)) * (i + j) * k
 *
 * The sums are computed before the call and consumed after it, so their
 * values have to stay live across the call to bb.
 */
int cc(int a, int b, int c, int d, int e, int f,
       int g, int h, int i, int j, int k)
{
    int t1 = a + b;
    int t2 = c + d;
    int t3 = e + f;
    int t4 = g + h;
    int t5 = i + j;

    bb(1, 2, 3, 4, 5, 6);

    int t6 = t1 * t2;
    int t7 = t3 * t4;
    int t8 = t6 - t7;
    int t9 = t8 * t5 * k;

    return t9;
    /* Alternative body tried by the author: return a*b*c*d*e*f*g*h*i*j*k; */
}

```
gcc -o 5.o -c 5.c -O1
objdump -d 5.o

5_1.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <bb>:
   0:   e92d40f8        push    {r3, r4, r5, r6, r7, lr}
   4:   e1a04000        mov     r4, r0    // r0-r3会被用作传参数的寄存器，如果不够就会用堆栈里的。
   8:   e1a05001        mov     r5, r1
   c:   e1a06002        mov     r6, r2
  10:   e1a07003        mov     r7, r3
  14:   e59f0020        ldr     r0, [pc, #32]   ; 3c <bb+0x3c>
  18:   ebfffffe        bl      0 <puts>
  1c:   e0040495        mul     r4, r5, r4
  20:   e0060496        mul     r6, r6, r4
  24:   e0070697        mul     r7, r7, r6
  28:   e59d6018        ldr     r6, [sp, #24]
  2c:   e0070796        mul     r7, r6, r7
  30:   e59d001c        ldr     r0, [sp, #28]
  34:   e0000790        mul     r0, r0, r7
  38:   e8bd80f8        pop     {r3, r4, r5, r6, r7, pc}
  3c:   00000000        .word   0x00000000

00000040 <cc>:
  40:   e92d41f0        push    {r4, r5, r6, r7, r8, lr}
        // 这个说明caller save r0-r3,lr,callee save r4-r8,
        // 另外，返回地址就在lr上，如果该函数要调用别的函数的话，lr会被推入堆栈。
  44:   e24dd008        sub     sp, sp, #8
  48:   e0804001        add     r4, r0, r1
  4c:   e0825003        add     r5, r2, r3
  50:   e59d3024        ldr     r3, [sp, #36]   ; 0x24
  54:   e59d7020        ldr     r7, [sp, #32]
  58:   e0877003        add     r7, r7, r3
  5c:   e59d302c        ldr     r3, [sp, #44]   ; 0x2c
  60:   e59d6028        ldr     r6, [sp, #40]   ; 0x28
  64:   e0866003        add     r6, r6, r3
  68:   e59d3034        ldr     r3, [sp, #52]   ; 0x34
  6c:   e59d8030        ldr     r8, [sp, #48]   ; 0x30
  70:   e0888003        add     r8, r8, r3
  74:   e3a03005        mov     r3, #5
  78:   e58d3000        str     r3, [sp]
  7c:   e3a03006        mov     r3, #6
  80:   e58d3004        str     r3, [sp, #4]
  84:   e3a00001        mov     r0, #1
  88:   e3a01002        mov     r1, #2
  8c:   e3a02003        mov     r2, #3
  90:   e3a03004        mov     r3, #4
  94:   ebfffffe        bl      0 <bb>
  98:   e0040495        mul     r4, r5, r4
  9c:   e0060796        mul     r6, r6, r7
  a0:   e0664004        rsb     r4, r6, r4
  a4:   e0080498        mul     r8, r8, r4
  a8:   e59d0038        ldr     r0, [sp, #56]   ; 0x38
  ac:   e0000890        mul     r0, r0, r8
  b0:   e28dd008        add     sp, sp, #8
  b4:   e8bd81f0        pop     {r4, r5, r6, r7, r8, pc}
        // 至于本地变量的存放问题，因为开启了优化，本地变量都放在寄存器里面了。
        // 如果不要优化，就可以看到它是先用低地址，再用高地址。
```

再写一个简单的程序6.c：

/*
 * 6.c - multiply two integers and add a third.
 *
 * Returns a * b + c, evaluated in plain int arithmetic.
 */
int fun(int a, int b, int c)
{
    int product = a * b;

    return product + c;
}

```
gcc -o 6.o -c 6.c -O1
objdump -d 6.o

6.o:     file format elf32-littlearm

Disassembly of section .text:

00000000 <fun>:
   0:   e0202091        mla     r0, r1, r0, r2
   4:   e12fff1e        bx      lr
```

备注：

此为浙江大学计算机学院嵌入式系统实验报告。