kernel hacker修炼之道之内存管理-OOM Killer

来源：互联网发布：蘑菇街和淘宝的区别编辑：程序博客网时间：2024/05/21 17:13

在系统内存不足的时候会回收页框，但是在这个过程中可能会发现，系统即使是以最高优先级扫描都无法释放足够的页面来满足请求。如果系统不能够释放页面，就会调用out_of_memory函数，告知系统发生内存溢出，这时就会杀死某个进程。在__alloc_pages函数中，当调用try_to_free_pages回收页框无效的时候，会调用out_of_memory杀死一个进程，释放所占有的page后，再重新尝试分配。

这个是out_of_memory的流程图，主要分为两部分，左侧这一部分主要是选择要杀死的进程，右边这一部分执行杀死操作。

[html] view plaincopy
/*
 * out_of_memory - pick a victim task and kill it when an allocation cannot
 * be satisfied.
 * @gfp_mask: allocation flags of the failing request; only printed here.
 *
 * tasklist_lock is held for reading across selection and kill. When
 * select_bad_process() returns the ERR_PTR(-1UL) sentinel nothing is
 * killed and control jumps to the sleep at the end; when it returns NULL
 * the function panics. If oom_kill_process() returns NULL the selection is
 * retried. On success the returned mm reference is dropped with mmput()
 * after the lock has been released.
 *
 * Every non-panic path ends with schedule_timeout(1) in TASK_INTERRUPTIBLE
 * state. NOTE(review): the argument is presumably in jiffies (one timer
 * tick), not seconds -- confirm against schedule_timeout().
 */
256void out_of_memory(int gfp_mask)  
257{  
258        struct mm_struct *mm = NULL;  
259        task_t * p;  
260  
261        read_lock(&tasklist_lock);  
262retry:  
263        p = select_bad_process();  
264  
265        if (PTR_ERR(p) == -1UL)  
266                goto out;  
267  
268        /* Found nothing?!?! Either we hang forever, or we panic. */  
269        if (!p) {  
270                read_unlock(&tasklist_lock);  
271                show_free_areas();  
272                panic("Out of memory and no killable processes...\n");  
273        }  
274  
275        printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);  
276        show_free_areas();  
277        mm = oom_kill_process(p);  
278        if (!mm)  
279                goto retry;  
280  
281 out:  
282        read_unlock(&tasklist_lock);  
283        if (mm)  
284                mmput(mm);  
285  
286        /*  
287         * Give "p" a good chance of killing itself before we  
288         * retry to allocate memory.  
289         */  
290        __set_current_state(TASK_INTERRUPTIBLE);  
291        schedule_timeout(1);  
292}  

调用select_bad_process函数选择一个即将被杀死的进程
调用oom_kill_process函数杀死进程
如果在调用select_bad_process函数的时候返回-1，说明已经有进程被OOM Killer选中，等它死就行了
给被选中的进程一点儿时间：当前进程调用schedule_timeout(1)休眠一个时钟节拍（1个jiffy，而不是一秒），然后再重新分配内存

下面看这个选择"best" process的函数：

[html] view plaincopy
/*
 * select_bad_process - iterate over tasks with do_each_thread() /
 * while_each_thread() and return the OOM-kill candidate.
 *
 * Tasks with pid <= 1 are skipped. For every other task, in this order:
 *   - if TIF_MEMDIE or PF_EXITING is set and PF_DEAD is not, the scan
 *     stops and ERR_PTR(-1UL) is returned;
 *   - if PF_SWAPOFF is set, that task is returned immediately;
 *   - otherwise it is scored with badness(p, uptime.tv_sec) and remembered
 *     if its score is strictly greater than the best so far.
 *
 * The "|| !chosen" test makes the first scored task the candidate even when
 * its score is 0. Returns NULL only if no task with pid > 1 was scored.
 */
138static struct task_struct * select_bad_process(void)  
139{  
140        unsigned long maxpoints = 0;  
141        struct task_struct *g, *p;  
142        struct task_struct *chosen = NULL;  
143        struct timespec uptime;  
144  
145        do_posix_clock_monotonic_gettime(&uptime);  
146        do_each_thread(g, p)  
147                /* skip the init task with pid == 1 */  
148                if (p->pid > 1) {  
149                        unsigned long points;  
150  
151                        /*  
152                         * This is in the process of releasing memory so wait it  
153                         * to finish before killing some other task by mistake.  
154                         */  
155                        if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&  
156                            !(p->flags & PF_DEAD))  
157                                return ERR_PTR(-1UL);  
158                        if (p->flags & PF_SWAPOFF)  
159                                return p;  
160  
161                        points = badness(p, uptime.tv_sec);  
162                        if (points > maxpoints || !chosen) {  
163                                chosen = p;  
164                                maxpoints = points;  
165                        }  
166                }  
167        while_each_thread(g, p);  
168        return chosen;  
169}  

跳过init进程
如果进程的TIF_MEMDIE标志被设置，表示进程已经被OOM Killer机制选中；如果PF_EXITING标志被设置，表示进程正在退出；如果PF_DEAD标志被设置，表示进程已经dead。当TIF_MEMDIE或PF_EXITING被设置、并且PF_DEAD尚未被设置时返回-1，这样告诉调用out_of_memory函数的进程等一下再分配就好
如果进程的PF_SWAPOFF标志被设置，表示那个进程调用了sys_swapoff函数，这个函数迫使进程所有驻留在swap中的page进入RAM中，并设置相应的页表，所以这个进程直接被返回，被选中杀死
如果进程没有设置上述标志，则调用badness为它打分，最终选出分数最高、最该杀死的进程。什么是最该杀死的呢？它选择的是使用了最大量内存而又没有生存很久的进程

看badness函数实现：

[html] view plaincopy
 45unsigned long badness(struct task_struct *p, unsigned long uptime)  
 46{  
 47        unsigned long points, cpu_time, run_time, s;  
 48        struct list_head *tsk;  
 49  
 50        if (!p->mm)  
 51                return 0;  
 52  
 53        /*  
 54         * The memory size of the process is the basis for the badness.  
 55         */  
 56        ppoints = p->mm->total_vm;  
 57  
 58        /*  
 59         * Processes which fork a lot of child processes are likely  
 60         * a good choice. We add the vmsize of the childs if they  
 61         * have an own mm. This prevents forking servers to flood the  
 62         * machine with an endless amount of childs  
 63         */  
 64        list_for_each(tsk, &p->children) {  
 65                struct task_struct *chld;  
 66                chld = list_entry(tsk, struct task_struct, sibling);  
 67                if (chld->mm != p->mm && chld->mm)  
 68                        points += chld->mm->total_vm;  
 69        }  
 70  
 71        /*  
 72         * CPU time is in tens of seconds and run time is in thousands  
 73         * of seconds. There is no particular reason for this other than  
 74         * that it turned out to work very well in practice.  
 75         */  
 76        cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))  
 77                >> (SHIFT_HZ + 3);  
 78  
 79        if (uptime >= p->start_time.tv_sec)  
 80                run_time = (uptime - p->start_time.tv_sec) >> 10;  
 81        else  
 82                run_time = 0;  
 83  
 84        s = int_sqrt(cpu_time);  
 85        if (s)  
 86                points /= s;  
 87        s = int_sqrt(int_sqrt(run_time));  
 88        if (s)  
 89                points /= s;  
 90  
 91        /*  
 92         * Niced processes are most likely less important, so double  
 93         * their badness points.  
 94         */  
 95        if (task_nice(p) > 0)  
 96                points *= 2;  
 97  
 98        /*  
 99         * Superuser processes are usually more important, so we make it  
100         * less likely that we kill those.  
101         */  
102        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||  
103                                p->uid == 0 || p->euid == 0)  
104                points /= 4;  
105  
106        /*  
107         * We don't want to kill a process with direct hardware access.  
108         * Not only could that mess up the hardware, but usually users  
109         * tend to only have this flag set on applications they think  
110         * of as important.  
111         */  
112        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))  
113                points /= 4;  
114  
115        /*  
116         * Adjust the score by oomkilladj.  
117         */  
118        if (p->oomkilladj) {  
119                if (p->oomkilladj > 0)  
120                        points <<= p->oomkilladj;  
121                else  
122                        points >>= -(p->oomkilladj);  
123        }  
124  
125#ifdef DEBUG  
126        printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",  
127        p->pid, p->comm, points);  
128#endif  
129        return points;  
130}  

获得这个进程拥有的内存size，记为权重
遍历这个进程的子进程，如果它们有自己的内存空间，则增加权重vmsize，防止一个进程创建无限的子进程占用内存
获得占用cpu的时间cpu_time，占有CPU时间越多，即越忙越可能生存
获得运行的时间run_time，运行时间越久越可能生存
如果nice值大于0，说明静态优先级很低，你越nice，越会被杀掉，权重×2
CAP_SYS_ADMIN，管理员的程序要保留下来，权重/4
CAP_SYS_RAWIO，如果进程有直接访问硬件（原始I/O）的能力，保留，权重/4
使用p->oomkilladj调节一下权重

至此，已经找到bad points最大的进程了，主要是占用内存大，运行时间短，比较空闲的。下面分析另一半，杀死进程，看oom_kill_process函数：

[html] view plaincopy
/*
 * oom_kill_process - kill one child of @p if possible, otherwise @p itself.
 * @p: task chosen by the caller.
 *
 * Walks p->children and calls oom_kill_task() on each child whose mm
 * differs from p->mm, returning as soon as one call yields a non-NULL mm.
 * If no child qualifies, or every attempt returns NULL, oom_kill_task(p)
 * is called and its result (possibly NULL) is returned.
 */
230static struct mm_struct *oom_kill_process(struct task_struct *p)  
231{  
232        struct mm_struct *mm;  
233        struct task_struct *c;  
234        struct list_head *tsk;  
235  
236        /* Try to kill a child first */  
237        list_for_each(tsk, &p->children) {  
238                c = list_entry(tsk, struct task_struct, sibling);  
239                if (c->mm == p->mm)  
240                        continue;  
241                mm = oom_kill_task(c);  
242                if (mm)  
243                        return mm;  
244        }  
245        return oom_kill_task(p);  
246}  

尝试杀死找到的bad points最大的进程的子进程，如果那个子进程有与父进程不同的内存则杀了子进程，否则杀死父进程。

[html] view plaincopy
/*
 * oom_kill_task - kill @p and every task in another thread group that
 * shares its mm.
 * @p: task to kill.
 *
 * Returns the mm obtained from get_task_mm(p). Returns NULL without killing
 * anything when @p has no mm, or when its mm is init_mm (the reference is
 * then dropped with mmput() first). On the non-NULL path the mm is handed
 * back to the caller without a matching mmput() in this function.
 */
205static struct mm_struct *oom_kill_task(task_t *p)  
206{  
207        struct mm_struct *mm = get_task_mm(p);  
208        task_t * g, * q;  
209  
210        if (!mm)  
211                return NULL;  
212        if (mm == &init_mm) {  
213                mmput(mm);  
214                return NULL;  
215        }  
216  
217        __oom_kill_task(p);  
218        /*  
219         * kill all processes that share the ->mm (i.e. all threads),  
220         * but are in a different thread group  
221         */  
222        do_each_thread(g, q)  
223                if (q->mm == mm && q->tgid != p->tgid)  
224                        __oom_kill_task(q);  
225        while_each_thread(g, q);  
226  
227        return mm;  
228}  

不能杀init进程。杀死选中的进程和所有的共享那个进程mm的线程，并且这些线程在不同的线程组。主要调用__oom_kill_task函数，进行实际的杀死操作。

[html] view plaincopy
/*
 * __oom_kill_task - deliver SIGKILL to @p.
 * @p: task to kill.
 *
 * Refuses (WARN_ON plus a warning message, then returns) when @p has pid 1,
 * or when p->mm is NULL or init_mm; the mm check is made under
 * task_lock(p). Otherwise it logs the kill, sets p->time_slice to HZ, sets
 * TIF_MEMDIE on the task and sends SIGKILL through force_sig().
 */
176static void __oom_kill_task(task_t *p)  
177{  
178        if (p->pid == 1) {  
179                WARN_ON(1);  
180                printk(KERN_WARNING "tried to kill init!\n");  
181                return;  
182        }  
183  
184        task_lock(p);  
185        if (!p->mm || p->mm == &init_mm) {  
186                WARN_ON(1);  
187                printk(KERN_WARNING "tried to kill an mm-less task!\n");  
188                task_unlock(p);  
189                return;  
190        }  
191        task_unlock(p);  
192        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);  
193  
194        /*  
195         * We give our sacrificial lamb high priority and access to  
196         * all the memory it needs. That way it should be able to  
197         * exit() and clear out its resources quickly...  
198         */  
199        p->time_slice = HZ;  
200        set_tsk_thread_flag(p, TIF_MEMDIE);  
201  
202        force_sig(SIGKILL, p);  
203}  

杀那个进程之前先让他把该干的事干完，给它时间片和很高的优先级。然后发送SIGKILL信号，让它去死吧。

[html] view plaincopy
/*
 * force_sig - thin wrapper that calls force_sig_info() for @sig on @p,
 * passing (void *)1L as the info argument.
 * @sig: signal number.
 * @p:   target task.
 *
 * NOTE(review): (void *)1L is presumably a sentinel understood by
 * force_sig_info(); its meaning is not visible in this excerpt.
 */
1268void  
1269force_sig(int sig, struct task_struct *p)  
1270{  
1271        force_sig_info(sig, (void*)1L, p);  
1272}  

总结一下，当分配内存，内存不足的时候，会以高优先级调用回收函数，如果还无法回收足够的内存，只能找一个既占内存，又比较空闲的进程杀死，在它死前给它比较高的优先级和时间片，让他把该干的事干一下。杀死后，可以释放出大量内存。

下边一个实例程序：

[html] view plaincopy
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Size of each allocation: 100 MB. */
#define CHUNK_SIZE ((size_t)1024 * 1024 * 100)

/*
 * Demo program: allocate 100 MB per iteration and never free it, so that
 * memory use keeps growing. The leak is intentional.
 *
 * Returns 1 if malloc() itself reports failure; otherwise the loop only
 * ends when the process is terminated from outside.
 */
int main(void)
{
    void *p;

    while (1)
    {
        p = malloc(CHUNK_SIZE);
        if (p == NULL)
        {
            /* Do not hand a NULL pointer to memset(). */
            perror("malloc");
            return 1;
        }
        /* Write to the whole buffer so the pages are really used. */
        memset(p, 0, CHUNK_SIZE);
        printf("100MB memory has been allocated!\n");
    }
    return 0;
}

运行结果：

此时查看日志：

可以看到系统只剩下8MB左右内存，swap分区已经被全部耗尽了。此时实在无法分配内存了，即使回收也无法成功，因为free swap已经是0了，不可能将pages swap out了。日志里，计算了各个zone buddy system所剩下的pages。