init/main.c源码分析
来源:互联网 发布:mac无苹果安装win7系统 编辑:程序博客网 时间:2024/05/17 09:25
2 * linux/init/main.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
我们来看一下main函数
105 { /* The startup routine assumes (well, ...) this */
106 /*
107 * Interrupts are still disabled. Do necessary setups, then
108 * enable them
109 */
110 ROOT_DEV = ORIG_ROOT_DEV;
111 drive_info = DRIVE_INFO;
112 memory_end = (1<<20) + (EXT_MEM_K<<10);
56 * This is set up by the setup-routine at boot-time
57 */
58 #define EXT_MEM_K (*(unsigned short *)0x90002)
59 #define DRIVE_INFO (*(struct drive_info *)0x90080)
60 #define ORIG_ROOT_DEV (*(unsigned short *)0x901FC)
这些值是我们在setup中通过询问BIOS获得,并保存的。现在拿来使用。
113 memory_end &= 0xfffff000;
114 if (memory_end > 16*1024*1024)
115 memory_end = 16*1024*1024;
116 if (memory_end > 12*1024*1024)
117 buffer_memory_end = 4*1024*1024;
118 else if (memory_end > 6*1024*1024)
119 buffer_memory_end = 2*1024*1024;
120 else
121 buffer_memory_end = 1*1024*1024;
根据当前内存大小设置缓冲区的末端地址buffer_memory_end:内存大于12M时为4M,大于6M时为2M,否则为1M。下文假设采用4M。
124 main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
125 #endif
127 trap_init();
128 blk_dev_init();
129 chr_dev_init();
130 tty_init();
131 time_init();
132 sched_init();
133 buffer_init(buffer_memory_end);
134 hd_init();
135 floppy_init();
136 sti();
我们依次来分析,首先126行的mem_init(main_memory_start,memory_end):
400 {
401 int i;
402
403 HIGH_MEMORY = end_mem;
404 for (i=0 ; i<PAGING_PAGES ; i++)
405 mem_map[i] = USED;
406 i = MAP_NR(start_mem);
407 end_mem -= start_mem;
408 end_mem >>= 12;
409 while (end_mem-->0)
410 mem_map[i++]=0;
411 }
42 /* these are not to be changed without changing head.s etc */
43 #define LOW_MEM 0x100000
44 #define PAGING_MEMORY (15*1024*1024)
45 #define PAGING_PAGES (PAGING_MEMORY>>12)
46 #define MAP_NR(addr) (((addr)-LOW_MEM)>>12)
47 #define USED 100
57 static unsigned char mem_map [ PAGING_PAGES ] = {0,};
23 __asm__ ("movw %%dx,%%ax\n\t" \
24 "movw %0,%%dx\n\t" \
25 "movl %%eax,%1\n\t" \
26 "movl %%edx,%2" \
27 : \
28 : "i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
29 "o" (*((char *) (gate_addr))), \
30 "o" (*(4+(char *) (gate_addr))), \
31 "d" ((char *) (addr)),"a" (0x00080000))
36 #define set_trap_gate(n,addr) \
37 _set_gate(&idt[n],15,0,addr)
40 _set_gate(&idt[n],15,3,addr)
182 {
183 int i;
184
185 set_trap_gate(0,&divide_error);
186 set_trap_gate(1,&debug);
187 set_trap_gate(2,&nmi);
188 set_system_gate(3,&int3); /* int3-5 can be called from all */
189 set_system_gate(4,&overflow);
190 set_system_gate(5,&bounds);
这里dpl=3,也就是用户级。
191 set_trap_gate(6,&invalid_op);
192 set_trap_gate(7,&device_not_available);
193 set_trap_gate(8,&double_fault);
194 set_trap_gate(9,&coprocessor_segment_overrun);
195 set_trap_gate(10,&invalid_TSS);
196 set_trap_gate(11,&segment_not_present);
197 set_trap_gate(12,&stack_segment);
198 set_trap_gate(13,&general_protection);
199 set_trap_gate(14,&page_fault);
200 set_trap_gate(15,&reserved);
201 set_trap_gate(16,&coprocessor_error);
202 for (i=17;i<48;i++)
203 set_trap_gate(i,&reserved);
204 set_trap_gate(45,&irq13);
205 outb_p(inb_p(0x21)&0xfb,0x21);
206 outb(inb_p(0xA1)&0xdf,0xA1);
允许8259A主芯片IRQ2中断,允许8259A从芯片IRQ13中断
207 set_trap_gate(39,&parallel_interrupt);
208 }
因为我们之前在setup中临时设置了idtr,又在head中具体分配了内存,但是那时还没有具体设置值,以上就是为其设置具体的值。
不是这样划分的,中断门、陷阱门都属于系统段。它们是通过type值来区分的,如下图
通过上图可以看到中断门type = 14(1110),陷阱门type =15(1111)。另外,中断门会关中断,陷阱门不会,这是二者的唯一区别。
158 {
159 int i;
160
161 for (i=0 ; i<NR_REQUEST ; i++) {
162 request[i].dev = -1;
163 request[i].next = NULL;
164 }
165 }
看来这里主要是把请求队列初始化。同一时间可能会有不同的进程等待读写磁盘,这些读写操作封装成请求,按照电梯调度算法排列在request数组中,同时使用next连接成链。当数组满了之后,下一步请求的进程会执行sleep_on进行等待。
348 {
349 }
106 {
107 rs_init();
108 con_init();
109 }
继续:131 time_init();
76 static void time_init(void)
77 {
78 struct tm time;
79
80 do {
81 time.tm_sec = CMOS_READ(0);
82 time.tm_min = CMOS_READ(2);
83 time.tm_hour = CMOS_READ(4);
84 time.tm_mday = CMOS_READ(7);
85 time.tm_mon = CMOS_READ(8);
86 time.tm_year = CMOS_READ(9);
87 } while (time.tm_sec != CMOS_READ(0));
88 BCD_TO_BIN(time.tm_sec);
89 BCD_TO_BIN(time.tm_min);
90 BCD_TO_BIN(time.tm_hour);
91 BCD_TO_BIN(time.tm_mday);
92 BCD_TO_BIN(time.tm_mon);
93 BCD_TO_BIN(time.tm_year);
94 time.tm_mon--;
95 startup_time = kernel_mktime(&time);
96 }
时钟初始化
kernel/sched.c:
386 {
387 int i;
388 struct desc_struct * p;
389
390 if (sizeof(struct sigaction) != 16)
391 panic("Struct sigaction MUST be 16 bytes");
392 set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
393 set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
150 * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds,3-syscall
151 * 4-TSS0, 5-LDT0, 6-TSS1 etc ...
152 */
153 #define FIRST_TSS_ENTRY 4
154 #define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
在gdt中的索引。
看一下init_task:
54 struct task_struct task;
55 char stack[PAGE_SIZE];
56 };
57
58 static union task_union init_task = {INIT_TASK,};
继续来看INIT_TASK:
110 * INIT_TASK is used to set up the first task table, touch at
111 * your own risk!. Base=0, limit=0x9ffff (=640kB)
112 */
113 #define INIT_TASK \
114 /* state etc */ { 0,15,15, \
115 /* signals */ 0,{{},},0, \
116 /* ec,brk... */ 0,0,0,0,0,0, \
117 /* pid etc.. */ 0,-1,0,0,0, \
118 /* uid etc */ 0,0,0,0,0,0, \
119 /* alarm */ 0,0,0,0,0,0, \
120 /* math */ 0, \
121 /* fs info */ -1,0022,NULL,NULL,NULL,0, \
122 /* filp */ {NULL,}, \
123 { \
124 {0,0}, \
125 /* ldt */ {0x9f,0xc0fa00}, \
126 {0x9f,0xc0f200}, \
127 }, \
128 /*tss*/ {0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\
129 0,0,0,0,0,0,0,0, \
130 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
131 _LDT(0),0x80000000, \
132 {} \
133 }, \
134 }
为了搞懂这里的定义,我们还需要先来看几个数据结构,首先task_struct(我们把上面的宏定义拿到下面对应比较)
79 /* these are hardcoded - don't touch */
80 long state; /* -1 unrunnable, 0 runnable, >0 stopped */ //0,runnable
81 long counter; //15
82 long priority; //15
83 long signal; //0
84 struct sigaction sigaction[32]; //{{},}
85 long blocked; /* bitmap of masked signals */ //0
86 /* various fields */
87 int exit_code; //0
88 unsigned long start_code,end_code,end_data,brk,start_stack; //all is 0
89 long pid,father,pgrp,session,leader; //0,-1,0,0,0,只有father是-1,表示没有father
90 unsigned short uid,euid,suid; //all is 0
91 unsigned short gid,egid,sgid; //all is 0
92 long alarm; // 0
93 long utime,stime,cutime,cstime,start_time; //all is 0
94 unsigned short used_math; //0
95 /* file system info */
96 int tty; /* -1 if no tty, so it must be signed */ // -1 no tty
97 unsigned short umask; //0022
98 struct m_inode * pwd; //NULL
99 struct m_inode * root; //NULL
100 struct m_inode * executable; //NULL,
101 unsigned long close_on_exec; //0
102 struct file * filp[NR_OPEN]; // {NULL,}
103 /* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
104 struct desc_struct ldt[3];
124 {0,0}, \
125 /* ldt */ {0x9f,0xc0fa00}, \
126 {0x9f,0xc0f200}, \
127 }, \
105 /* tss for this task */
106 struct tss_struct tss;
129 0,0,0,0,0,0,0,0, \
130 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \
131 _LDT(0),0x80000000, \
132 {} \
133 }, \
134 }
107 };
再来看一下ldt结构:
5 unsigned long a,b;
6 } desc_table[256];
123 { \
124 {0,0}, \
125 /* ldt */ {0x9f,0xc0fa00}, \ 代码长640K,基址0x0,G=1,D=1,DPL=3,P=1 TYPE=0x0a
126 {0x9f,0xc0f200}, \ 数据长640K,基址0x0,G=1,D=1,DPL=3,P=1 TYPE=0x02
127 }, \
52 long back_link; /* 16 high bits zero */ //0
53 long esp0; //PAGE_SIZE+(long)&init_task
54 long ss0; /* 16 high bit zero */ //0x10
55 long esp1; //0
56 long ss1; /* 16 high bits zero */ //0
57 long esp2; //0
58 long ss2; /* 16 high bits zero */ //0
59 long cr3; //(long)&pg_dir
60 long eip; //0
61 long eflags; //0
62 long eax,ecx,edx,ebx; //0 ,0 ,0, 0
63 long esp; //0
64 long ebp; //0
65 long esi; //0
66 long edi; //0
67 long es; /* 16 high bits zero */ //0x17
68 long cs; /* 16 high bits zero */ //0x17
69 long ss; /* 16 high bits zero */ //0x17
70 long ds; /* 16 high bits zero */ //0x17
71 long fs; /* 16 high bits zero */ //0x17
72 long gs; /* 16 high bits zero */ //0x17
73 long ldt; /* 16 high bits zero */ //_LDT(0),
74 long trace_bitmap; /* bits: trace 0, bitmap 16-31 */ //0x80000000,
75 struct i387_struct i387; //{}
76 };
p->tss.esp0 = PAGE_SIZE + (long)p;p->tss.ss0 = 0x10;
其中,p是新任务的任务数据结构指针,tss是任务状态段结构。内核为新任务申请内存用作保存其task_struct结构数据,而tss结构(段)是 task_struct中的一个字段。该任务的内核堆栈段值tss.ss0也被设置成为0x10(即 内核数据段选择符),而tss.esp0则指向保存task_struct结构页面的末端。实际上tss.esp0被设置成指向该页面 (外)上一字节处。这是因为 Intel CPU执行堆栈操作时是先递减堆栈指针esp值,然后在esp指针处保存入栈内容。
斜体内容参考:http://blog.sina.com.cn/s/blog_673ef8130100qaje.html
我们继续回到sched_init开始:
386 {
387 int i;
388 struct desc_struct * p;
389
390 if (sizeof(struct sigaction) != 16)
391 panic("Struct sigaction MUST be 16 bytes");
392 set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
393 set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
66 #define set_ldt_desc(n,addr) _set_tssldt_desc(((char *) (n)),addr,"0x82")
53 __asm__ ("movw $104,%1\n\t" \
54 "movw %%ax,%2\n\t" \
55 "rorl $16,%%eax\n\t" \
56 "movb %%al,%3\n\t" \
57 "movb $" type ",%4\n\t" \
58 "movb $0x00,%5\n\t" \
59 "movb %%ah,%6\n\t" \
60 "rorl $16,%%eax" \
61 ::"a" (addr), "m" (*(n)), "m" (*(n+2)), "m" (*(n+4)), \
62 "m" (*(n+5)), "m" (*(n+6)), "m" (*(n+7)) \
63 )
可以看到这里就是把addr中指定的值写入到地址n中,其中类型type分别为0x89或0x82.
395 for(i=1;i<NR_TASKS;i++) {
396 task[i] = NULL;
397 p->a=p->b=0;
398 p++;
399 p->a=p->b=0;
400 p++;
401 }
402 /* Clear NT, so that we won't have troubles with that later on */
403 __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
404 ltr(0);
405 lldt(0);
406 outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */
407 outb_p(LATCH & 0xff , 0x40); /* LSB */
408 outb(LATCH >> 8 , 0x40); /* MSB */
409 set_intr_gate(0x20,&timer_interrupt);
//type=14,dpl=0,设置时钟中断
410 outb(inb_p(0x21)&~0x01,0x21);
411 set_system_gate(0x80,&system_call);
//设置系统调用中断,权限为用户级
412 }
349 {
350 struct buffer_head * h = start_buffer;
351 void * b;
352 int i;
353
354 if (buffer_end == 1<<20)
355 b = (void *) (640*1024);
356 else
357 b = (void *) buffer_end;
358 while ( (b -= BLOCK_SIZE) >= ((void *) (h+1)) ) {
359 h->b_dev = 0;
360 h->b_dirt = 0;
361 h->b_count = 0;
362 h->b_lock = 0;
363 h->b_uptodate = 0;
364 h->b_wait = NULL;
365 h->b_next = NULL;
366 h->b_prev = NULL;
367 h->b_data = (char *) b;
368 h->b_prev_free = h-1;
369 h->b_next_free = h+1;
370 h++;
371 NR_BUFFERS++;
372 if (b == (void *) 0x100000)
373 b = (void *) 0xA0000;
374 }
375 h--;
376 free_list = start_buffer;
377 free_list->b_prev_free = h;
378 h->b_next_free = free_list;
379 for (i=0;i<NR_HASH;i++)
380 hash_table[i]=NULL;
381 }
kernel/blk_drv/hd.c
344 {
345 blk_dev[MAJOR_NR].request_fn = DEVICE_REQUEST;
346 set_intr_gate(0x2E,&hd_interrupt);
347 outb_p(inb_p(0x21)&0xfb,0x21);
348 outb(inb_p(0xA1)&0xbf,0xA1);
349 }
回到main函数继续:
136 sti();
137 move_to_user_mode();
135我们就不看了,136行可以开中断了,因为所有的准备工作都已经准备好了。
2 __asm__ ("movl %%esp,%%eax\n\t" \
3 "pushl $0x17\n\t" \
4 "pushl %%eax\n\t" \
5 "pushfl\n\t" \ //把标志寄存器的值压栈
6 "pushl $0x0f\n\t" \
7 "pushl $1f\n\t" \ //将下面标号1的偏移地址(eip)入栈
8 "iret\n" \ //执行中断返回指令,则会跳转到下面标号1处
9 "1:\tmovl $0x17,%%eax\n\t" \
10 "movw %%ax,%%ds\n\t" \
11 "movw %%ax,%%es\n\t" \
12 "movw %%ax,%%fs\n\t" \
13 "movw %%ax,%%gs" \
14 :::"ax")
执行iret前的堆栈如下:
以下这段话引用自http://faydoc.tripod.com/cpu/iret.htm:
In Protected Mode, the action of the IRET instruction depends on the settings of the NT (nested task) and VM flags in the EFLAGS register and the VM flag in the EFLAGS image stored on the current stack. Depending on the setting of these flags, the processor performs the following types of interrupt returns:
- Return from virtual-8086 mode.
- Return to virtual-8086 mode.
- Intra-privilege level return.
- Inter-privilege level return.
- Return from nested task (task switch).
If the NT flag (EFLAGS register) is cleared, the IRET instruction performs a far return from the interrupt procedure, without a task switch. The code segment being returned to must be equally or less privileged than the interrupt handler routine (as indicated by the RPL field of the code segment selector popped from the stack). As with a real-address mode interrupt return, the IRET instruction pops the return instruction pointer, return code segment selector, and EFLAGS image from the stack to the EIP, CS, and EFLAGS registers, respectively, and then resumes execution of the interrupted program or procedure. If the return is to another privilege level, the IRET instruction also pops the stack pointer and SS from the stack, before resuming program execution. If the return is to virtual-8086 mode, the processor also pops the data segment registers from the stack.
由于在sched_init()中已经清除了标志寄存器中的NT标志(见sched_init第403行的pushfl/andl/popfl),所以iret调用后不会发生任务切换,而是继续执行EIP指向的指令,故继续执行1标号的代码,开始执行任务0。任务0的堆栈段选择符为0x17,在sched_init()中已设置了任务0的任务状态段描述符和局部描述符表描述符,它们分别指向INIT_TASK中的tss和ldt。
也就是这里在原来的进程中开始执行init_task,因为我们在sched_init中已经通过ltr(0)和lldt(0)加载了tss和ldt,这里又指定了段寄存器选择符为0x17(二进制10111:索引值为2,TI=1表示在ldt中查找,RPL=3),也就是ldt中的数据段。基地址都是0。既然基地址是0,它会与内核进程共用相同的代码段和数据段。
139 init();
140 }
现在fork一个子进程,如果返回值为0说明是子进程,也就是这里的if条件为true。那么在子进程中执行init函数(因此这个子进程也被称为init进程)。
169 {
170 int pid,i;
171
172 setup((void *) &drive_info);
这个操作还比较重要,与文件系统挂载有关系,只是这里我们暂不关心。
173 (void) open("/dev/tty0",O_RDWR,0);
175 (void) dup(0);
看一下这个函数
43 {
44 return dupfd(fildes,0);
45 }
18 static int dupfd(unsigned int fd, unsigned int arg)
19 {
20 if (fd >= NR_OPEN || !current->filp[fd])
21 return -EBADF;
22 if (arg >= NR_OPEN)
23 return -EINVAL;
24 while (arg < NR_OPEN)
25 if (current->filp[arg])
26 arg++;
27 else
28 break;
29 if (arg >= NR_OPEN)
30 return -EMFILE;
31 current->close_on_exec &= ~(1<<arg);
32 (current->filp[arg] = current->filp[fd])->f_count++;
33 return arg;
34 }
可以看到这个函数的作用就是从当前task的filp数组中找到一个还未使用的,然后让它与fd指向同一个file,并且增加引用计数。
所以上面的两个dup(0)就是从filp数组中找到两个未使用的,然后让它指向filp[0]的file结构体。实际是用来重定向标准输出和标准错误的。
180 close(0);
181 if (open("/etc/rc",O_RDONLY,0))
182 _exit(1);
183 execve("/bin/sh",argv_rc,envp_rc);
184 _exit(2);
185 }
现在又fork一个进程,并在其中执行以下操作:
163 static char * envp_rc[] = { "HOME=/", NULL };
187 while (pid != wait(&i))
188 /* nothing */;
父进程(init进程,即任务1,而不是任务0)等待子进程结束;当子进程退出后(/bin/sh执行完毕,或execve失败后执行184行的_exit),父进程继续向下执行
190 if ((pid=fork())<0) {
191 printf("Fork failed in init\r\n");
192 continue;
193 }
194 if (!pid) {
195 close(0);close(1);close(2);
196 setsid();
197 (void) open("/dev/tty0",O_RDWR,0);
198 (void) dup(0);
199 (void) dup(0);
200 _exit(execve("/bin/sh",argv,envp));
201 }
166 static char * envp[] = { "HOME=/usr/root", NULL };
202 while (1)
203 if (pid == wait(&i))
204 break;
205 printf("\n\rchild %d died with code %04x\n\r",pid,i);
206 sync(); //同步,刷新缓冲区
207 }//while
父进程等待其退出,我们发现最外面的while循环没有退出指令,也就是它是一个死循环。一个/bin/sh结束后,会另外创建一个进程来执行它。
142 * NOTE!! For any other task 'pause()' would mean we have to get a
143 * signal to awaken, but task0 is the sole exception (see 'schedule()')
144 * as task 0 gets activated at every idle moment (when no other tasks
145 * can run). For task0 'pause()' just means we go check if some other
146 * task can run, and if not we return here.
147 */
148 for(;;) pause();
149 }
进程0创建进程1之后,由进程1执行init函数,来进行一些初始化工作,并在一个循环中为shell进行服务。但是进程0并没有退出,它有自己的task_struct(init_task),有自己的ldt,tss等,它也会被调度。我们看一下它被调度时的工作,由于148行是一个死循环,所以每次被调度后,它就执行这一句。根据注释中说的,这里pause仅仅意味着我们去看一下是否有其他进程可以运行,如果没有,继续回到idle。这个比较有意思,也就是我们机器启动运行后如果没有任何其他可运行进程,也没有中断发生,idle就会一直运行,也就是一直检查是否有其他进程可以运行。这与init进程中的死循环很相似,只不过init的死循环是为了保证永远有进程在执行/bin/sh,而idle的死循环是为了保证一旦有进程可以运行就调度它。所以idle具有一种“大局观”,地位非凡就跟BIOS一样。
12 * we need this inline - forking from kernel space will result
13 * in NO COPY ON WRITE (!!!), until an execve is executed. This
14 * is no problem, but for the stack. This is handled by not letting
15 * main() use the stack at all after fork(). Thus, no function
16 * calls - which means inline code for fork too, as otherwise we
17 * would use the stack upon exit from 'fork()'.
18 *
19 * Actually only pause and fork are needed inline, so that there
20 * won't be any messing with the stack from main(), but we define
21 * some others too.
22 */
23 static inline _syscall0(int,fork)
24 static inline _syscall0(int,pause)
25 static inline _syscall1(int,setup,void *,BIOS)
26 static inline _syscall0(int,sync)
133 #define _syscall0(type,name) \
134 type name(void) \
135 { \
136 long __res; \
137 __asm__ volatile ("int $0x80" \
138 : "=a" (__res) \
139 : "0" (__NR_##name)); \
140 if (__res >= 0) \
141 return (type) __res; \
142 errno = -__res; \
143 return -1; \
144 }
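问题1. 这里为什么采用内联函数而不是函数调用?
答:正如上面12-22行的注释所说,从内核空间fork出的进程在执行execve之前没有写时复制,唯一会出问题的是栈;因此main()在fork()之后完全不能再使用栈,也就不能有函数调用,fork和pause必须内联,否则从fork()返回时就会用到栈。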
- init/main.c源码分析
- linux源码分析-/init/main.c/start_kernel
- Android源码阅读笔记1 - init.c的main函数分析
- void init(void) 分析 ! \linux-1.0\init\main.c
- init\main.c start_kernel()
- linux/init/main.c
- void start_kernel(void)分析 ! \linux-1.0\init\main.c
- Android启动之init.c文件main函数分析
- FreeSWITCH - mod_xml_rpc源码分析五init.c
- linux/init/main.c 注释
- kubeadm init源码分析
- init进程源码分析
- init进程源码分析
- android 源码分析流程(一) init.c
- Android-4.0.3-init.c启动源码分析
- main函数及ISR init分析
- android init.c分析
- per_cpu_pageset 之一(init/main.c start_kernel初始化)
- Vijos-P1097-合并果子(简单贪心 && 优先队列 && c++)
- 逼真黑客范儿–Hacker Typer
- php创建接口出现乱码 encode urldecode转换程字符串
- ExtJs学习
- 一个菜鸟的Windows Phone开发日志
- init/main.c源码分析
- Android动画之Property Animation
- (大数据之zookeeper)ZooKeeper安装说明
- Android String与int类型互转
- LeetCode(Combinations)
- android 查看联系人
- iOS 设置UINavtionController navigationBar 为透明颜色
- twisted异步机制--Deferred
- C/C++ Volatile关键词深度剖析