kernel hacker修炼之道之内存管理-OOM Killer
来源:互联网 发布:蘑菇街和淘宝的区别 编辑:程序博客网 时间:2024/05/21 17:13
在系统内存不足的时候会回收页框,但是在这个过程中可能会发现,系统即使是以最高优先级扫描都无法释放足够的页面来满足请求。如果系统不能够释放页面,就会调用out_of_memory函数,告知系统发生内存溢出,这时就会杀死某个进程。在__alloc_pages函数中,当调用try_to_free_pages回收页框无效的时候,会调用out_of_memory杀死一个进程,释放所占有的page后,再重新尝试分配。
这个是out_of_memory的流程图,主要分为两部分,左侧这一部分主要是选择要杀死的进程,右边这一部分执行杀死操作。
- 256void out_of_memory(int gfp_mask)
- 257{
- 258 struct mm_struct *mm = NULL;
- 259 task_t * p;
- 260
- 261 read_lock(&tasklist_lock);
- 262retry:
- 263 p = select_bad_process();
- 264
- 265 if (PTR_ERR(p) == -1UL)
- 266 goto out;
- 267
- 268 /* Found nothing?!?! Either we hang forever, or we panic. */
- 269 if (!p) {
- 270 read_unlock(&tasklist_lock);
- 271 show_free_areas();
- 272 panic("Out of memory and no killable processes...\n");
- 273 }
- 274
- 275 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- 276 show_free_areas();
- 277 mm = oom_kill_process(p);
- 278 if (!mm)
- 279 goto retry;
- 280
- 281 out:
- 282 read_unlock(&tasklist_lock);
- 283 if (mm)
- 284 mmput(mm);
- 285
- 286 /*
- 287 * Give "p" a good chance of killing itself before we
- 288 * retry to allocate memory.
- 289 */
- 290 __set_current_state(TASK_INTERRUPTIBLE);
- 291 schedule_timeout(1);
- 292}
- 调用select_bad_process函数选择一个即将被杀死的进程
- 调用oom_kill_process函数杀死进程
- 如果在调用select_bad_process函数的时候返回-1,说明已经有进程被OOM Killer选中,等它死就行了
- 给被选中的进程一点儿时间:调用schedule_timeout(1)休眠一个时钟节拍(jiffy,即1/HZ秒,而不是一秒),然后再重新分配内存
下面看这个选择"best" process的函数:
- 138static struct task_struct * select_bad_process(void)
- 139{
- 140 unsigned long maxpoints = 0;
- 141 struct task_struct *g, *p;
- 142 struct task_struct *chosen = NULL;
- 143 struct timespec uptime;
- 144
- 145 do_posix_clock_monotonic_gettime(&uptime);
- 146 do_each_thread(g, p)
- 147 /* skip the init task with pid == 1 */
- 148 if (p->pid > 1) {
- 149 unsigned long points;
- 150
- 151 /*
- 152 * This is in the process of releasing memory so wait it
- 153 * to finish before killing some other task by mistake.
- 154 */
- 155 if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
- 156 !(p->flags & PF_DEAD))
- 157 return ERR_PTR(-1UL);
- 158 if (p->flags & PF_SWAPOFF)
- 159 return p;
- 160
- 161 points = badness(p, uptime.tv_sec);
- 162 if (points > maxpoints || !chosen) {
- 163 chosen = p;
- 164 maxpoints = points;
- 165 }
- 166 }
- 167 while_each_thread(g, p);
- 168 return chosen;
- 169}
- 跳过init进程
- 如果进程的TIF_MEMDIE标志被设置,表示进程已经被OOM Killer机制选中;如果PF_EXITING标志被设置,表示进程正在退出;如果PF_DEAD标志被设置,表示进程已经dead。当进程设置了TIF_MEMDIE或PF_EXITING、并且还没有设置PF_DEAD(即正在释放内存但尚未彻底死亡)时返回-1,这样告诉调用out_of_memory函数的进程等一下再分配就好
- 如果进程的PF_SWAPOFF标志被设置,表示那个进程调用了sys_swapoff函数,这个函数迫使进程所有驻留在swap中的page进入RAM中,并设置相应的页表,所以这个进程直接被返回,被选中杀死
- 如果进程没有设置上述标志,则调用badness为它打分,选出得分最高、最该杀死的进程。什么是最该杀死的呢?它选择的是使用了最大量内存而又没有生存很久的进程
看badness函数实现:
- 45unsigned long badness(struct task_struct *p, unsigned long uptime)
- 46{
- 47 unsigned long points, cpu_time, run_time, s;
- 48 struct list_head *tsk;
- 49
- 50 if (!p->mm)
- 51 return 0;
- 52
- 53 /*
- 54 * The memory size of the process is the basis for the badness.
- 55 */
- 56 ppoints = p->mm->total_vm;
- 57
- 58 /*
- 59 * Processes which fork a lot of child processes are likely
- 60 * a good choice. We add the vmsize of the childs if they
- 61 * have an own mm. This prevents forking servers to flood the
- 62 * machine with an endless amount of childs
- 63 */
- 64 list_for_each(tsk, &p->children) {
- 65 struct task_struct *chld;
- 66 chld = list_entry(tsk, struct task_struct, sibling);
- 67 if (chld->mm != p->mm && chld->mm)
- 68 points += chld->mm->total_vm;
- 69 }
- 70
- 71 /*
- 72 * CPU time is in tens of seconds and run time is in thousands
- 73 * of seconds. There is no particular reason for this other than
- 74 * that it turned out to work very well in practice.
- 75 */
- 76 cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
- 77 >> (SHIFT_HZ + 3);
- 78
- 79 if (uptime >= p->start_time.tv_sec)
- 80 run_time = (uptime - p->start_time.tv_sec) >> 10;
- 81 else
- 82 run_time = 0;
- 83
- 84 s = int_sqrt(cpu_time);
- 85 if (s)
- 86 points /= s;
- 87 s = int_sqrt(int_sqrt(run_time));
- 88 if (s)
- 89 points /= s;
- 90
- 91 /*
- 92 * Niced processes are most likely less important, so double
- 93 * their badness points.
- 94 */
- 95 if (task_nice(p) > 0)
- 96 points *= 2;
- 97
- 98 /*
- 99 * Superuser processes are usually more important, so we make it
- 100 * less likely that we kill those.
- 101 */
- 102 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
- 103 p->uid == 0 || p->euid == 0)
- 104 points /= 4;
- 105
- 106 /*
- 107 * We don't want to kill a process with direct hardware access.
- 108 * Not only could that mess up the hardware, but usually users
- 109 * tend to only have this flag set on applications they think
- 110 * of as important.
- 111 */
- 112 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
- 113 points /= 4;
- 114
- 115 /*
- 116 * Adjust the score by oomkilladj.
- 117 */
- 118 if (p->oomkilladj) {
- 119 if (p->oomkilladj > 0)
- 120 points <<= p->oomkilladj;
- 121 else
- 122 points >>= -(p->oomkilladj);
- 123 }
- 124
- 125#ifdef DEBUG
- 126 printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
- 127 p->pid, p->comm, points);
- 128#endif
- 129 return points;
- 130}
- 获得这个进程拥有的内存size,记为权重
- 遍历这个进程的子进程,如果它们有自己的内存空间,则增加权重vmsize,防止一个进程创建无限的子进程占用内存
- 获得占用cpu的时间cpu_time,占有CPU时间越多,即越忙越可能生存
- 获得运行的时间run_time,运行时间越久越可能生存
- 如果nice值大于0,说明静态优先级很低,你越nice,越会被杀掉,权重×2
- CAP_SYS_ADMIN,管理员的程序要保留下来,权重/4
- CAP_SYS_RAWIO,如果有直接访问硬件设备(raw I/O)的能力,保留,权重/4
- 使用p->oomkilladj调节一下权重
- 230static struct mm_struct *oom_kill_process(struct task_struct *p)
- 231{
- 232 struct mm_struct *mm;
- 233 struct task_struct *c;
- 234 struct list_head *tsk;
- 235
- 236 /* Try to kill a child first */
- 237 list_for_each(tsk, &p->children) {
- 238 c = list_entry(tsk, struct task_struct, sibling);
- 239 if (c->mm == p->mm)
- 240 continue;
- 241 mm = oom_kill_task(c);
- 242 if (mm)
- 243 return mm;
- 244 }
- 245 return oom_kill_task(p);
- 246}
- 205static struct mm_struct *oom_kill_task(task_t *p)
- 206{
- 207 struct mm_struct *mm = get_task_mm(p);
- 208 task_t * g, * q;
- 209
- 210 if (!mm)
- 211 return NULL;
- 212 if (mm == &init_mm) {
- 213 mmput(mm);
- 214 return NULL;
- 215 }
- 216
- 217 __oom_kill_task(p);
- 218 /*
- 219 * kill all processes that share the ->mm (i.e. all threads),
- 220 * but are in a different thread group
- 221 */
- 222 do_each_thread(g, q)
- 223 if (q->mm == mm && q->tgid != p->tgid)
- 224 __oom_kill_task(q);
- 225 while_each_thread(g, q);
- 226
- 227 return mm;
- 228}
- 176static void __oom_kill_task(task_t *p)
- 177{
- 178 if (p->pid == 1) {
- 179 WARN_ON(1);
- 180 printk(KERN_WARNING "tried to kill init!\n");
- 181 return;
- 182 }
- 183
- 184 task_lock(p);
- 185 if (!p->mm || p->mm == &init_mm) {
- 186 WARN_ON(1);
- 187 printk(KERN_WARNING "tried to kill an mm-less task!\n");
- 188 task_unlock(p);
- 189 return;
- 190 }
- 191 task_unlock(p);
- 192 printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
- 193
- 194 /*
- 195 * We give our sacrificial lamb high priority and access to
- 196 * all the memory it needs. That way it should be able to
- 197 * exit() and clear out its resources quickly...
- 198 */
- 199 p->time_slice = HZ;
- 200 set_tsk_thread_flag(p, TIF_MEMDIE);
- 201
- 202 force_sig(SIGKILL, p);
- 203}
杀那个进程之前先让他把该干的事干完,给它时间片和很高的优先级。然后发送SIGKILL信号,让它去死吧。
/*
 * force_sig - send signal @sig to task @p through force_sig_info().
 * @sig: signal number
 * @p:   target task
 */
void force_sig(int sig, struct task_struct *p)
{
	/*
	 * NOTE(review): (void *)1L is presumably the sentinel info value
	 * meaning "generated by the kernel" - confirm against
	 * force_sig_info().
	 */
	void *info = (void *)1L;

	force_sig_info(sig, info, p);
}
下边是一个实例程序:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Demo program that provokes the OOM killer: it repeatedly allocates
 * 100MB and writes to every byte so the pages are really backed by
 * memory. The blocks are intentionally never freed.
 *
 * Returns 1 if malloc() itself reports failure; otherwise it loops until
 * the process is killed.
 */
int main(void)
{
	enum { CHUNK_BYTES = 1024 * 1024 * 100 };
	void *p;

	while (1) {
		p = malloc(CHUNK_BYTES);
		if (p == NULL) {
			/* memset() on NULL would be undefined behavior. */
			perror("malloc");
			return 1;
		}
		/* Touch the whole block so it is actually populated. */
		memset(p, 0, CHUNK_BYTES);
		printf("100MB memory has been allocated!\n");
	}
	return 0;
}
此时查看日志:
可以看到系统只剩下8MB左右内存,swap分区已经被全部耗尽了。此时实在无法分配内存了,即使回收也无法成功,因为free swap已经是0了,不可能将pages swap out了。日志里,计算了各个zone buddy system所剩下的pages。
- kernel hacker修炼之道之内存管理-OOM Killer
- kernel hacker修炼之道之内存管理-高端内存(上)
- kernel hacker修炼之道之内存管理-高端内存(下)
- kernel hacker修炼之道
- kernel hacker修炼之道之内核虚拟化KVM——overview
- kernel hacker修炼之道之内核虚拟化 KVM/QEMU——Guest OS, Qemu
- kernel hacker修炼之道之内核虚拟化KVM——overview
- Linux之OOM-killer
- OC学习之路之内存管理
- 操作系统之内存管理
- Android之内存管理
- OC之内存管理
- cocos2dx 之内存管理
- c++之内存管理
- OC之内存管理
- jvm之内存管理
- 操作系统之内存管理
- 简单之内存管理
- GNU make man手册(中文版)
- 棋盘覆盖_分治策略_java实现
- 看技术方面书
- gray码_分治策略_java
- AS3 摄像机类,国人用starling基础上,写了一个挺强大的
- kernel hacker修炼之道之内存管理-OOM Killer
- 多边形游戏_动态规划
- ObjectiveC中的赋值,对象拷贝,浅拷贝与深拷贝
- Android Wifi:使用Android Instrument 自动测试 WIFI_SERVICE
- 14条原则 (2) 首先检查最简单的:例如,MFC播放avi的时候在上面画东西
- android SDK开发环境搭建(Android 4.0.3 emulator)
- POJ 2155 Matrix
- 单源最短路径_贪心算法
- 三维立体重建