探究Linux的奇妙之旅

来源：互联网发布：亚马逊kindle软件编辑：程序博客网时间：2024/04/24 09:39

内容概要

本文主要探究linux 0.11的一些神奇的东西

理解schedule函数的调用

process 0 第一次调度和后面调度，schedule运行的机制是不同的。

    void main(void)     /* This really IS void, no error here. */    {               ......    /*     *   NOTE!!   For any other task 'pause()' would mean we have to get a     * signal to awaken, but task0 is the sole exception (see 'schedule()')     * as task 0 gets activated at every idle moment (when no other tasks     * can run). For task0 'pause()' just means we go check if some other     * task can run, and if not we return here.     */        for(;;) pause();    }

当进程0到达main函数的最后的循环之后，将通过系统调用，调用系统函数_sys_pause，其中调用schedule，

        int sys_pause(void)        {            current->state = TASK_INTERRUPTIBLE;            schedule();            return 0;        }

查找到刚刚创建的进程1处于就绪状态，在schedule最后switch_to(1). 在switch中发生了什么呢？

        /*     *  'schedule()' is the scheduler function. This is GOOD CODE! There     * probably won't be any reason to change this, as it should work well     * in all circumstances (ie gives IO-bound processes good response etc).     * The one thing you might take a look at is the signal-handler code here.     *     *   NOTE!!  Task 0 is the 'idle' task, which gets called when no other     * tasks can run. It can not be killed, and it cannot sleep. The 'state'     * information in task[0] is never used.     */    void schedule(void)    {        int i,next,c;        struct task_struct ** p;    /* check alarm, wake up any interruptible tasks that have got a signal */        for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)            if (*p) {                if ((*p)->alarm && (*p)->alarm < jiffies) {                        (*p)->signal |= (1<<(SIGALRM-1));                        (*p)->alarm = 0;                    }                if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&                (*p)->state==TASK_INTERRUPTIBLE)                    (*p)->state=TASK_RUNNING;            }    /* this is the scheduler proper: */        while (1) {            c = -1;            next = 0;            i = NR_TASKS;            p = &task[NR_TASKS];            while (--i) {                if (!*--p)                    continue;                if ((*p)->state == TASK_RUNNING && (*p)->counter > c)                    c = (*p)->counter, next = i;            }            if (c) break;            for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)                if (*p)                    (*p)->counter = ((*p)->counter >> 1) +                            (*p)->priority;        }        switch_to(next);    }

ljmp 0 \n\t这个得查看IA-32手册，就会发现，IA-32架构中允许同种特权级之间相互切换，但是不同特权级之间切换是不允许的。但是0特权级的进程0是如何切换到3特权级的进程1呢？就要引入’门’的概念了。具体参见手册.门是允许点到点的，不同特权级之间的切换。

那么当进程1运行过之后，再次由进程0切换到进程1时，由于进程切换都是0特权级，此时，此时就不需要经过什么门了，IA32是允许同特权级之间进行进程切换的。

switch_to

    #define switch_to(n) {\    struct {long a,b;} __tmp; \    __asm__("cmpl %%ecx,_current\n\t" \        "je 1f\n\t" \        "movw %%dx,%1\n\t" \        "xchgl %%ecx,_current\n\t" \        "ljmp %0\n\t" \        "cmpl %%ecx,_last_task_used_math\n\t" \        "jne 1f\n\t" \        "clts\n" \        "1:" \        ::"m" (*&__tmp.a),"m" (*&__tmp.b), \        "d" (_TSS(n)),"c" ((long) task[n])); \    }

这里涉及到TSS的定义和GDT LDT管理结构

    #define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))    #define FIRST_TSS_ENTRY 4

每个描述符占8个字节，第一个状态段是第四个，所以 <<3，得到第一个任务描述符在GDT中的位置，
而每个任务使用一个tss和ldt，占16字节，所以<<4，两者相加得到任务n的tss在GDT中的位置,写入EDX寄存器。
另外ECX指向要切换过去的新任务task[n]。

现在开始理解代码，首先声明了一个_tmp的结构，这个结构里面包括两个long型，32位机里面long占32位，声明这个结构主要与ljmp这个长跳指令有关，这个指令有两个参数，一个参数是段选择符，另一个是偏移地址，所以这个_tmp就是保存这两个参数。再比较任务n是不是当前任务，如果不是则跳转到标号1，否则交互ecx和current的内容，交换后的结果为ecx指向当前进程，current指向要切换过去的新进程，在执行长跳，%0代表输出输入寄存器列表中使用的第一个寄存器，即m"(*&__tmp.a)，这个寄存器保存了*&__tmp.a，而_tmp.a存放的是32位偏移(对应EIP)，_tmp.b存放的是新任务的tss段选择符(对应CS)，长跳到段选择符会造成任务切换，这个是x86的硬件原理。"d" (_TSS(n)),"c" ((long) task[n]));

缓冲区

Linux0.11 中没有实现预读和预写功能,而是将预读功能转化为普通读取。
其中缓冲区的理解要找到如下数据结构：
1. buffer head

    struct buffer_head {        char * b_data;          /* pointer to data block (1024 bytes) */        unsigned long b_blocknr;    /* block number */        unsigned short b_dev;       /* device (0 = free) */        unsigned char b_uptodate;        unsigned char b_dirt;       /* 0-clean,1-dirty */        unsigned char b_count;      /* users using this block */        unsigned char b_lock;       /* 0 - ok, 1 -locked */        struct task_struct * b_wait;        struct buffer_head * b_prev;        struct buffer_head * b_next;        struct buffer_head * b_prev_free;        struct buffer_head * b_next_free;    };

hash_table

        struct buffer_head * hash_table[NR_HASH];        #define _hashfn(dev,block) (((unsigned)(dev^block))%NR_HASH)        #define hash(dev,block) hash_table[_hashfn(dev,block)]

request

        /*         * Ok, this is an expanded form so that we can use the same         * request for paging requests when that is implemented. In         * paging, 'bh' is NULL, and 'waiting' is used to wait for         * read/write completion.         */        struct request {            int dev;        /* -1 if no request */            int cmd;        /* READ or WRITE */            int errors;            unsigned long sector;            unsigned long nr_sectors;            char * buffer;            struct task_struct * waiting;            struct buffer_head * bh;            struct request * next;        };

READA and WRITEA

    static void make_request(int major,int rw, struct buffer_head * bh)    {        ......        /* WRITEA/READA is special case - it is not really needed, so if the */        /* buffer is locked, we just forget about it, else it's a normal read */        if (rw_ahead = (rw == READA || rw == WRITEA)) {            if (bh->b_lock)                return;            if (rw == READA)                rw = READ;            else                rw = WRITE;        }        if (rw!=READ && rw!=WRITE)            panic("Bad block dev command, must be R/W/RA/WA");        lock_buffer(bh);        if ((rw == WRITE && !bh->b_dirt) || (rw == READ && bh->b_uptodate)) {            unlock_buffer(bh);            return;        }        ......    }

那么什么是预读预写呢？参考Linux内核的文件预读

系统调用

        fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read,        sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,        sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,        sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,        sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,        sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,        sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,        sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,        sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,        sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,        sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,        sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,        sys_setreuid,sys_setregid };

引用

Linus
Linux官网
Linux 内核设计的艺术

1 0