嵌入式作业

来源：互联网发布：新版mac装win10不要u盘编辑：程序博客网时间：2024/05/17 03:58

寄信人: aaaaatiger (绝不作别人的累赘)
标题: 没主题
发信站: 郁金香BBS站 (Mon Apr 10 01:09:25 2006)
来源: 10.10.80.9

two ways to build ARM apps
Dear David,
Op vrijdag 3 oktober 2003 03:08, schreef David McCullough:
> There are two ways to build ARM apps, first , fully relocatable:
>
> arm-elf-gcc -Wl,-elf2flt=-z hello.c
>
> and, second, for XIP (where possible)
>
> arm-elf-gcc -D__PIC__ -fpic -msingle-pic-base -Wl,-elf2flt=-z hello.c

>
> Don't mix the two up :-)
Just tested it.
Wonderful! Both images work well when started from a ramdisk.

Thanks a lot!

寄信人: Fll (学海无涯，回头是岸！)
标题: home
发信站: 郁金香BBS站 (2006年04月17日16:51:08 星期一)
来源: 郁金香BBS站

2.手动交叉编译一个最简单的“hello，world！”程序，并运行，分别采用C和汇编语言，
并分析结果。
C语言版本的“Hello，World！”程序如下：
#include<stdio.h>
/*
 * Print the greeting on standard output and exit successfully.
 * The string ends with a newline so that it matches the "\012" stored in the
 * assembly listing of hello.c and so that the output is a complete line.
 */
int main(int argc, char * argv[])
{
    (void)argc;   /* command-line arguments are not used */
    (void)argv;

    printf("Hello,world!\n");
    return 0;
}
采用汇编语言编写的程序如下：
        .file   "hello.c"
.gcc2_compiled.:
        .section .rodata
        .align 2
.LC0:
        @ String handed to printf: the text, a newline (\012) and the
        @ terminating NUL (\000), written as octal escapes.  The escapes need
        @ backslashes; with "/" the assembler stores the characters literally.
        .ascii "Hello,world!\012\000"
.text
        .align 2
        .global main
        .type    main,function
main:
        @ args = 0, pretend = 0, frame = 8
        @ frame_needed = 1, current_function_anonymous_args = 0
        mov     ip, sp                  @ keep the caller's sp in ip
        stmfd   sp!, {fp, ip, lr, pc}   @ push fp, ip (old sp), lr and pc
        sub     fp, ip, #4              @ fp = old sp - 4
        sub     sp, sp, #8              @ reserve the 8-byte frame noted above
        str     r0, [fp, #-16]          @ spill r0 into the frame (presumably argc)
        str     r1, [fp, #-20]          @ spill r1 into the frame (presumably argv)
        bl      __gccmain
        ldr     r0, .L3                 @ r0 = address of .LC0, read from the word at .L3
        bl      printf
        mov     r0, #0                  @ r0 = 0, the "return 0" of hello.c
        b       .L2                     @ jump over the literal word below
.L4:
        .align 2
.L3:
        .word   .LC0
.L2:
        ldmea   fp, {fp, sp, pc}        @ reload fp, sp and pc from the saved frame (pc gets the saved lr)
.Lfe1:
        .size    main,.Lfe1-main

4.剖析和优化下面的计算阶乘的C语言程序。
我们先看要优化的程序（我们称其为程序1）：
#include <stdio.h>
#include <stdlib.h>

#define M 1000000000L
#define N 60000

int multiply(int n, unsigned int prod[N], int highest);
void print(unsigned int prod[N], int highest);
/*
 * Compute n! for the n given as the first command-line argument and print it.
 *
 * The running product is kept in prod[], least significant element first,
 * and is multiplied by 2, 3, ..., n in turn.  Without an argument (or with a
 * value below 2) the loop does not run and the initial value 1 is printed.
 */
int main(int argc, char * argv[]) {
    int i, n = 0, highest = 0;
    /*
     * unsigned int, to match the parameter type of multiply() and print();
     * passing an int array there is an incompatible-pointer-type error.
     * static keeps the N-element array out of the stack frame.
     */
    static unsigned int prod[N] = {1, 0};

    if (argc > 1) {
        n = atoi(argv[1]);   /* non-numeric text yields 0, so 1 is printed */
    }

    for (i = 2; i <= n; i++) {
        highest = multiply(i, prod, highest);
    }

    print(prod, highest);

    return 0;
}

int multiply(int n, unsigned int prod[N], int highest) {
    unsigned long long tmp;   // if your compiler supports C99
    //unsigned __int64 tmp;   // if you use Visual C or Borland C
    int carrier = 0;
    int i;

    for (i = 0; i <= highest; i++) {
        tmp = n;
        tmp *= prod[i];
        tmp += carrier;

        prod[i] = tmp % M;
        carrier = tmp / M;
    }

if (carrier)
prod[++highest] = carrier;

return highest;
}

/*
 * Write the number stored in prod[0..highest] to standard output in decimal.
 * The most significant element is printed as is; every lower element is
 * zero-padded to nine digits.  No newline is emitted.
 */
void print(unsigned int prod[N], int highest) {
    int idx;

    printf("%u", prod[highest]);
    for (idx = highest - 1; idx >= 0; idx--) {
        printf("%09u", prod[idx]);
    }
}
该程序利用了高精度×低精度的高精度乘法算法，可以说如果要达到高精度的话，应该只有
这种方法了。
开始的时候我想到了不考虑高精度方面，利用公式：
n!=1*2*3*....*n → lg(n!)=lg(1*2*3*....*n)=lg(1)+lg(2)+lg(3)+..+lg(n)
从而可以得出 n!=10^(lg(1)+lg(2)+lg(3)+..+lg(n))。这样速度确实可以比较快，可是精度不够高：
因为是用公式（浮点对数）去计算，就不能使用高精度的算法，从某种程度上说没有达到程序
原来的目的，因而没有多少意义。

在优化的过程中看到，为了防止9位数乘上9位数时溢出，程序中定义了一个
unsigned long long tmp 变量。由于该变量的范围比较大，因而计算比较慢；
这就好比两个double类型的数相乘，肯定比把它们转化为int类型后再相乘要慢得多
（如果double转化为int不丢失精度的话）。由此萌生了对tmp进行优化的想法，
改进后的程序如下：
程序2：
#include<stdio.h>

/*
 * Print n! in decimal, followed by a newline, on standard output.
 *
 * The number is kept least significant limb first in base 100000, so every
 * limb holds five decimal digits.  For each factor i the limbs a[0..d] are
 * multiplied by i with carry propagation; d is kept HEADROOM limbs above the
 * highest non-zero limb so the last carry always has room.
 *
 * n == 0 prints 1.  A negative n, an n too large for the unsigned long
 * intermediate product, or a result that does not fit the limb array is
 * reported on stderr and nothing is written to stdout.
 */
void fact(int n)
{
        enum { BASE = 100000, LIMBS = 1000000, HEADROOM = 5 };
        /* static: a 4 MB automatic array overflows the stack on many systems */
        static int a[LIMBS];
        int i, j, carry, d = 0;
        unsigned long x;

        if (n < 0) {
                fprintf(stderr, "fact: n must not be negative\n");
                return;
        }
        /* x below can reach BASE * n - 1; refuse n if that does not fit */
        if ((unsigned long)n > (unsigned long)-1 / BASE) {
                fprintf(stderr, "fact: n is too large\n");
                return;
        }

        /* the array is static, so clear what a previous call left behind */
        for (j = 0; j < LIMBS; j++)
                a[j] = 0;
        a[0] = 1;
        j = 0;          /* top limb if the loop below does not run (n == 0) */

        for (i = 1; i <= n; i++) {
                for (carry = 0, j = 0; j <= d; j++) {
                        /* widen first: with 32-bit int, a[j] * i overflows once i > 21474 */
                        x = (unsigned long)a[j] * (unsigned long)i + (unsigned long)carry;
                        carry = (int)(x / BASE);        /* carry into the next limb */
                        a[j] = (int)(x - (unsigned long)carry * BASE); /* remainder by subtraction */
                }
                /* j == d + 1 here: walk down to the highest non-zero limb */
                while (a[j] == 0)
                        j--;
                /* the next pass reads a[d + 1], which must still be inside the array */
                if (j + HEADROOM >= LIMBS - 1) {
                        fprintf(stderr, "fact: result does not fit in %d limbs\n", LIMBS);
                        return;
                }
                d = j + HEADROOM;
        }

        /* top limb without padding, every lower limb as exactly five digits */
        printf("%d", a[j--]);
        while (j >= 0)
                printf("%05d", a[j--]);
        printf("\n");
        return;
}
/*
 * Read an integer n from standard input and print n! via fact().
 * Returns 1 without calling fact() when no integer could be read.
 */
int main(int argc, char * argv[])
{
        int a = 0;

        (void)argc;     /* command-line arguments are not used */
        (void)argv;

        if (scanf("%d", &a) != 1) {
                fprintf(stderr, "expected an integer on standard input\n");
                return 1;
        }
        fact(a);
        return 0;
}

从表面上看，两个程序并没有多少差别，不过从实际的运行情况来看，程序运行的速度大大
的改善了，利用linux的time命令粗略的测量了一下运行时间，结果如下：
程序1计算20000！（结果为77338位数字）：
real    0m25.235s
user    0m21.409s
sys     0m0.040s

程序2计算20000！：
real    0m13.028s
user    0m7.588s
sys     0m0.088s

程序1计算30000！（结果为121288位数字）：
real    1m25.480s
user    0m51.871s
sys     0m0.184s

程序2计算30000！：
real    0m44.989s
user    0m19.165s
sys     0m0.112s

再利用linux的自带的程序性能分析工具gprof进行分析，下面是计算20000！的时候的结果
（结果为77338位数字）：
程序1：
Flat profile:
Each sample counts as 0.01 seconds.
%   cumulative   self              self     total
time   seconds   seconds    calls ms/call ms/call name
42.97     21.80    21.80                             __umoddi3
32.29     38.18    16.38                             __udivdi3
24.69     50.70    12.53    19999     0.63     0.63 multiply
0.04     50.73     0.02                             main
0.01     50.73     0.01        1     5.00     5.00 print

index % time    self children    called     name
                                                 <spontaneous>
[1]     43.0   21.80    0.00                 __umoddi3 [1]
-----------------------------------------------
                                                 <spontaneous>
[2]     32.3   16.38    0.00                 __udivdi3 [2]
-----------------------------------------------
                                                 <spontaneous>
[3]     24.7    0.02   12.53                 main [3]
               12.53    0.00   19999/19999       multiply [4]
                0.01    0.00       1/1           print [5]
-----------------------------------------------
               12.53    0.00   19999/19999       main [3]
[4]     24.7   12.53    0.00   19999         multiply [4]
-----------------------------------------------
                0.01    0.00       1/1           main [3]
[5]      0.0    0.01    0.00       1         print [5]
-----------------------------------------------

程序2：
Flat profile:

Each sample counts as 0.01 seconds.
%   cumulative   self              self     total
time   seconds   seconds    calls   s/call   s/call name
100.00     15.51    15.51        1    15.51    15.51 fact

index % time    self children    called     name
               15.51    0.00       1/1           main [2]
[1]    100.0   15.51    0.00       1         fact [1]
-----------------------------------------------
                                                 <spontaneous>
[2]    100.0    0.00   15.51                 main [2]
               15.51    0.00       1/1           fact [1]
-----------------------------------------------

从数据看程序2的速度和性能方面显然已经大大提升了。
其他方面的优化：
   程序1计算阶乘的时候是在main（）函数中使用了下面的语句：
          for (i = 2; i <= n; i++)
                highest = multiply(i, prod, highest);
显然这个multiply（）函数在计算10000！的时候被调用了10000-1次，而计算20000！阶乘
的时候调用了20000-1次，如此之多的函数调用会使得程序的运行速度减慢。另外程序1的结
果的输出也是用了函数调用，同样减慢了程序的执行速度。
   程序2把计算n！的主要过程放在了fact（）函数里面，在计算n！的时候，不管n多大，
fact（）函数仅被主函数调用一次，而且程序结果的输出也放在了fact（）函数里面，
这种类似函数内联的效果使得程序在某种程度上速度加快了。另外使用编译器优化
（编译时加上-O3选项），也使得函数得到了一定的优化。
   那是否把那个中间变量定义地更小一点比如int是否会更快一点呢？再次经过测试,事实

寄信人: Fll (学海无涯，回头是岸！)
标题: Re: 嘿嘿
发信站: 郁金香BBS站 (2006年04月09日22:30:49 星期天)
来源: 郁金香BBS站

不行了，我已经又优化了一次了，呵呵，比以前更快了。
你比较一下：

#include<stdio.h>
#include<string.h>

/*
 * Factorial: print n! in decimal, followed by a newline, on standard output.
 *
 * The number is stored least significant limb first in base 100000 (five
 * decimal digits per limb).  Each pass multiplies limbs a[0..d] by i, with f
 * carrying into the next limb; d stays HEADROOM limbs above the highest
 * non-zero limb so the final carry has room.
 *
 * n == 0 prints 1.  A negative n, an n too large for the unsigned long
 * intermediate product, or a result that does not fit the limb array is
 * reported on stderr and nothing is written to stdout.
 */
void fact(int n)
{
        enum { BASE = 100000, LIMBS = 1000000, HEADROOM = 5 };
        /* static: a 4 MB automatic array overflows the stack on many systems */
        static int a[LIMBS];
        int i, j, f;
        int d = 0;      /* was read uninitialised by the first inner loop */
        unsigned long x;

        if (n < 0) {
                fprintf(stderr, "fact: n must not be negative\n");
                return;
        }
        /* x below can reach BASE * n - 1; refuse n if that does not fit */
        if ((unsigned long)n > (unsigned long)-1 / BASE) {
                fprintf(stderr, "fact: n is too large\n");
                return;
        }

        /* the array is static, so clear what a previous call left behind */
        memset(a, 0, sizeof a);
        a[0] = 1;
        j = 0;          /* top limb if the loop below does not run (n == 0) */

        for (i = 1; i <= n; i++) {
                for (f = 0, j = 0; j <= d; j++) {
                        /* widen first: with 32-bit int, a[j] * i overflows once i > 21474 */
                        x = (unsigned long)a[j] * (unsigned long)i + (unsigned long)f;
                        f = (int)(x / BASE);
                        a[j] = (int)(x % BASE);
                }
                /* j == d + 1 here: walk down to the highest non-zero limb */
                while (a[j] == 0)
                        j--;
                /* the next pass reads a[d + 1], which must still be inside the array */
                if (j + HEADROOM >= LIMBS - 1) {
                        fprintf(stderr, "fact: result does not fit in %d limbs\n", LIMBS);
                        return;
                }
                d = j + HEADROOM;
        }

        /* top limb without padding, every lower limb as exactly five digits */
        printf("%d", a[j--]);
        while (j >= 0)
                printf("%05d", a[j--]);
        printf("\n");
        return;
}
/*
 * Read an integer n from standard input and print n! via fact().
 * Returns 1 without calling fact() when no integer could be read.
 */
int main(int argc, char * argv[])
{
        int a = 0;

        (void)argc;     /* command-line arguments are not used */
        (void)argv;

        if (scanf("%d", &a) != 1) {
                fprintf(stderr, "expected an integer on standard input\n");
                return 1;
        }
        fact(a);
        return 0;
}

【在 guomin (have a pair of wings,the freedom I hope) 的来信中提到: 】
: 那发给我……我想继续优化……