The Road to Linux Kernel Virtualization (1): The cgroup Mechanism


[Abstract]

[Main] Enabling cgroup

[Main] cgroup initialization

[Main] Mounting the cgroup filesystem

[Main] cgroup file access and how settings take effect

[Main] A cgroup usage example

[Summary]


【Abstract】

This article describes how the cgroup mechanism is implemented in the Linux kernel. It is mainly a walk through the kernel source, followed by a usage example showing how cgroup limits a process's use of CPU and memory resources.

【Main】Enabling cgroup

1 Kernel configuration options

General setup  --->
    [*] Control Group support  --->
        [*]   Resource counters
        [*]     Memory Resource Controller for Control Groups
        [*]       Memory Resource Controller Swap Extension
        [*]       Memory Resource Controller Kernel Memory accounting
        [*]   Group CPU scheduler
2 CPU control

mount -t cgroup -o cpu cpu /mnt/mtd/cgroup/cpu

After the mount, the files under /mnt/mtd/cgroup/cpu are backed by cftype entries such as:

static struct cftype cgroup_legacy_base_files[] = {
	{
		.name = "cgroup.procs",
	},
	...
};
3 Memory control

Requires the config option CONFIG_MEMCG.

mount -t cgroup -o memory memory /mnt/mtd/cgroup/memory

After the mount, the memory control files are backed by entries such as:

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	...
};
4 CPU and memory control together

# mount -t cgroup cgroup /mnt/mtd/cpu_memory/
or:
# mount -t cgroup -o cpu,memory cpu_memory /mnt/mtd/cpu_memory/
# ls /mnt/mtd/cpu_memory/
cgroup.clone_children            memory.oom_control
cgroup.event_control             memory.pressure_level
cgroup.procs                     memory.soft_limit_in_bytes
cgroup.sane_behavior             memory.stat
cpu.shares                       memory.swappiness
memory.failcnt                   memory.usage_in_bytes
memory.force_empty               memory.use_hierarchy
memory.limit_in_bytes            notify_on_release
memory.max_usage_in_bytes        release_agent
memory.move_charge_at_immigrate  tasks

Memory control files:

cgroup.event_control            # interface for eventfd notifications
memory.usage_in_bytes           # show the memory currently in use
memory.limit_in_bytes           # set/show the memory limit
memory.failcnt                  # number of times usage hit the limit
memory.max_usage_in_bytes       # historical maximum memory usage
memory.soft_limit_in_bytes      # set/show the soft memory limit
memory.stat                     # memory usage statistics of this cgroup
memory.use_hierarchy            # set/show whether child cgroups' usage is accounted into this cgroup
memory.force_empty              # trigger immediate reclaim of as much reclaimable memory in this cgroup as possible
memory.pressure_level           # set memory-pressure notification events, used together with cgroup.event_control
memory.swappiness               # set/show the swappiness of this cgroup
memory.move_charge_at_immigrate # set whether a task's charged memory moves with it when it migrates to another cgroup
memory.oom_control              # set/show OOM control settings
memory.numa_stat                # NUMA-related memory statistics
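As a side note, cgroup.event_control together with memory.usage_in_bytes can deliver threshold notifications through an eventfd. Below is a minimal user-space sketch of that interface; the mount path follows the example used in this article, the 4 MB threshold is arbitrary, and error handling is trimmed:

/* Get notified when usage_in_bytes crosses a threshold (cgroup v1 memory thresholds). */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	uint64_t count;
	char cmd[64];
	int efd = eventfd(0, 0);
	int ufd = open("/mnt/mtd/cgroup/memory/memory.usage_in_bytes", O_RDONLY);
	int cfd = open("/mnt/mtd/cgroup/memory/cgroup.event_control", O_WRONLY);

	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* format: "<eventfd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 4ULL << 20);
	write(cfd, cmd, strlen(cmd));

	read(efd, &count, sizeof(count));	/* blocks until the threshold is crossed */
	printf("memory.usage_in_bytes crossed the 4 MB threshold\n");
	return 0;
}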

Let's look more closely at memory.usage_in_bytes; the code involved is analyzed later in this article.

usage_in_bytes reflects memory usage, but it is not the exact amount of memory actually in use.

For how usage_in_bytes is increased, see mem_cgroup_try_charge(), try_charge() and consume_stock().

The actual increase is performed in try_charge->res_counter_charge->__res_counter_charge->res_counter_charge_locked.

1> When a page fault allocates a page, the path is handle_pte_fault->do_anonymous_page()->mem_cgroup_try_charge()->try_charge;

2> try_charge->consume_stock checks whether there is remaining stock. Where does the stock come from?

On the first try_charge there is no stock, so 32 pages are charged at once, i.e. usage_in_bytes = 32 * PAGE_SIZE. Note that although the page fault really allocated only one page, usage_in_bytes accounts for 32 pages. On the next page fault that allocates one page, try_charge->consume_stock finds 32 pages of stock, decrements it by one and returns, so usage_in_bytes stays at 32 pages; only once the stock is used up is the next batch of 32 pages charged.

For how usage_in_bytes is decreased, see do_exit->exit_mm->mmput->exit_mmap->release_pages->mem_cgroup_uncharge_list->uncharge_list.

The actual decrease is performed in uncharge_list->uncharge_batch->res_counter_uncharge()->res_counter_uncharge_locked().

1> When memory is freed (either explicitly by the user or when the process exits), usage_in_bytes is decreased by the exact amount freed. Since increases happen in units of 32 pages = 128 KB while decreases use the real freed size, usage_in_bytes can still be greater than 0 after the process has exited.

A possible modification to make usage_in_bytes count the memory actually in use:

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	/* Note: this line determines what usage_in_bytes reports */
	unsigned int batch = max(CHARGE_BATCH, nr_pages);

	/* With "unsigned int batch = nr_pages;" instead, usage_in_bytes would
	 * reflect the memory actually in use. */
	...
}

To understand exactly what usage_in_bytes means, follow the charge and uncharge paths above in the code.
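The batching effect described above can be illustrated with a small user-space simulation. This is not kernel code: the two counters merely mimic the roles of res_counter->usage and the per-cpu stock, with CHARGE_BATCH taken from try_charge():

#include <stdio.h>

#define CHARGE_BATCH 32  /* pages, as in try_charge() */

static unsigned long usage;   /* plays the role of res_counter->usage (in pages) */
static unsigned long stock;   /* plays the role of the per-cpu stock */

static void charge(unsigned int nr_pages)
{
	if (stock >= nr_pages) {              /* consume_stock() succeeds */
		stock -= nr_pages;
		return;
	}
	usage += CHARGE_BATCH;                /* res_counter_charge() with the batch size */
	stock += CHARGE_BATCH - nr_pages;     /* refill_stock() keeps the remainder */
}

static void uncharge(unsigned int nr_pages)
{
	usage -= nr_pages;                    /* the uncharge path uses the exact size */
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		charge(1);                    /* five single-page faults */
	printf("after 5 faults:  usage = %lu pages\n", usage);  /* 32 */

	for (int i = 0; i < 5; i++)
		uncharge(1);                  /* free the five pages again */
	printf("after freeing:   usage = %lu pages\n", usage);  /* 27, not 0 */
	return 0;
}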

【Main】cgroup initialization

First, two important definitions:

1 The for_each_subsys(ss, ssid) definition

/* Iterate over cgroup_subsys[]: in this example cpu_cgrp_subsys and memory_cgrp_subsys */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
cgroup_subsys[] is defined as follows:
/* effectively, with only cpu and memory enabled: */
static struct cgroup_subsys *cgroup_subsys[] = {
	&cpu_cgrp_subsys,
	&memory_cgrp_subsys,
};

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc	= cpu_cgroup_css_alloc,
	.css_free	= cpu_cgroup_css_free,
	.css_online	= cpu_cgroup_css_online,
	.css_offline	= cpu_cgroup_css_offline,
	.fork		= cpu_cgroup_fork,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
	.allow_attach	= subsys_cgroup_allow_attach,
	.exit		= cpu_cgroup_exit,
	.legacy_cftypes	= cpu_files,
	.early_init	= 1,
};

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc	= mem_cgroup_css_alloc,
	.css_online	= mem_cgroup_css_online,
	.css_offline	= mem_cgroup_css_offline,
	.css_free	= mem_cgroup_css_free,
	.css_reset	= mem_cgroup_css_reset,
	.can_attach	= mem_cgroup_can_attach,
	.cancel_attach	= mem_cgroup_cancel_attach,
	.attach		= mem_cgroup_move_task,
	.allow_attach	= mem_cgroup_allow_attach,
	.bind		= mem_cgroup_bind,
	.legacy_cftypes	= mem_cgroup_files,
	.early_init	= 0,
};
2 The SUBSYS macro used to build cgroup_subsys[]

Defined in cgroup.c:
/* generate an array of cgroup subsystem pointers */
/* SUBSYS(cpu)    expands to [cpu_cgrp_id]    = &cpu_cgrp_subsys,
 * SUBSYS(memory) expands to [memory_cgrp_id] = &memory_cgrp_subsys, */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>	/* expands as above */
};
#undef SUBSYS

/* array of cgroup subsystem names */
/* yields cgroup_subsys_name[] = { [cpu_cgrp_id] = "cpu", [memory_cgrp_id] = "memory" }; */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

#include <linux/cgroup_subsys.h>:

#if IS_ENABLED(CONFIG_CGROUP_SCHED)
SUBSYS(cpu)
#endif

#if IS_ENABLED(CONFIG_MEMCG)
SUBSYS(memory)
#endif
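For readers unfamiliar with this pattern: SUBSYS is an "X-macro", a single list of names expanded twice to generate both the pointer array and the name array with matching indices. A standalone sketch of the same technique (the demo_* names are made up purely for illustration):

#include <stdio.h>

#define DEMO_SUBSYS_LIST \
	DEMO_SUBSYS(cpu)     \
	DEMO_SUBSYS(memory)

/* first expansion: ids */
#define DEMO_SUBSYS(_x) _x##_demo_id,
enum { DEMO_SUBSYS_LIST DEMO_SUBSYS_COUNT };
#undef DEMO_SUBSYS

/* second expansion: names, indexed by the same ids */
#define DEMO_SUBSYS(_x) [_x##_demo_id] = #_x,
static const char *demo_subsys_name[] = { DEMO_SUBSYS_LIST };
#undef DEMO_SUBSYS

int main(void)
{
	for (int i = 0; i < DEMO_SUBSYS_COUNT; i++)
		printf("subsys %d: %s\n", i, demo_subsys_name[i]);
	return 0;
}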

An example of the cgroup_subsys and the cftype definitions behind the control files:

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc	= mem_cgroup_css_alloc,
	.css_online	= mem_cgroup_css_online,
	.css_offline	= mem_cgroup_css_offline,
	.css_free	= mem_cgroup_css_free,
	.css_reset	= mem_cgroup_css_reset,
	.can_attach	= mem_cgroup_can_attach,
	.cancel_attach	= mem_cgroup_cancel_attach,
	.attach		= mem_cgroup_move_task,
	.allow_attach	= mem_cgroup_allow_attach,
	.bind		= mem_cgroup_bind,
	.legacy_cftypes	= mem_cgroup_files,
	.early_init	= 0,
};

/* cftype; cpu_cgrp_subsys->cpu_files is similar */
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
3 cgroup initialization

start_kernel()
{
	1> cgroup_init_early();
	2> page_cgroup_init();
	3> cgroup_init();
}

start_kernel->cgroup_init_early initializes cgroup_root and the cgroup_subsys entries:

int __init cgroup_init_early(void)
{
	static struct cgroup_sb_opts __initdata opts;
	struct cgroup_subsys *ss;
	int i;

	/* Initialize the cgroup_root cgrp_dfl_root; cgroup_init_subsys later
	 * sets cpu_cgrp_subsys->root = &cgrp_dfl_root */
	init_cgroup_root(&cgrp_dfl_root, &opts);
	cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;

	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	/* Walk cgroup_subsys[] and initialize each subsystem */
	for_each_subsys(ss, i) {
		ss->id = i;
		ss->name = cgroup_subsys_name[i];

		/* cpu_cgrp_subsys->early_init = 1, so it is initialized here
		 * (cgroup_init_subsys sets cpu_cgrp_subsys->root = &cgrp_dfl_root);
		 * memory_cgrp_subsys->early_init = 0, so it is initialized later
		 * in cgroup_init(). */
		if (ss->early_init)
			cgroup_init_subsys(ss, true);
	}
	return 0;
}

start_kernel->cgroup_init initializes the cftypes (struct cftype mem_cgroup_files is defined above):

/* This function initializes the cftypes of the control files: cpu_files,
 * mem_cgroup_files, cgroup_dfl_base_files and cgroup_legacy_base_files;
 * reads and writes of cgroup files are dispatched through each file's cftype. */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	unsigned long key;
	int ssid, err;

	/* Initialize the cftypes: every file in the cgroup filesystem has a cftype,
	 * and file operations go through it:
	 * - inode->i_fops = kernfs_file_fops (write/read), registered at cgroup_mount time;
	 * - cgroup_kf_ops->write / seq_read (cgroup_file_write/read), set up in
	 *   cgroup_init_cftypes for the base files, see cgroup_dfl_base_files and
	 *   cgroup_legacy_base_files;
	 * - mem_cgroup_files[].write/read (e.g. mem_cgroup_write), defined globally;
	 *   these back the memory control files such as memory.limit_in_bytes, whose
	 *   cftype is wired up when the controller is bound at mount time
	 *   (mount -t cgroup -o memory memory /mnt/mtd/cgroup/memory). */
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

	mutex_lock(&cgroup_mutex);

	/* Add init_css_set to the hash table */
	key = css_set_hash(init_css_set.subsys);
	hash_add(css_set_table, &init_css_set.hlist, key);

	/* Creates the cgroup_dfl_base_files here; see the mount path later */
	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

	mutex_unlock(&cgroup_mutex);

	for_each_subsys(ss, ssid) {
		/* cpu_cgrp_subsys->early_init = 1: already initialized in
		 * cgroup_init_early->cgroup_init_subsys */
		if (ss->early_init) {
			struct cgroup_subsys_state *css =
				init_css_set.subsys[ss->id];

			css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
						   GFP_KERNEL);
			BUG_ON(css->id < 0);
		} else {
			/* Initialize the subsystem here; analyzed below */
			cgroup_init_subsys(ss, false);
		}

		list_add_tail(&init_css_set.e_cset_node[ssid],
			      &cgrp_dfl_root.cgrp.e_csets[ssid]);

		/*
		 * Setting dfl_root subsys_mask needs to consider the
		 * disabled flag and cftype registration needs kmalloc,
		 * both of which aren't available during early_init.
		 */
		if (ss->disabled)
			continue;

		cgrp_dfl_root.subsys_mask |= 1 << ss->id;

		/* cgroup_legacy_files_on_dfl = 0 */
		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
			ss->dfl_cftypes = ss->legacy_cftypes;

		if (!ss->dfl_cftypes)
			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;

		/* 1 ss = cpu_cgrp_subsys:    ss->dfl_cftypes = NULL; ss->legacy_cftypes = cpu_files;
		 * 2 ss = memory_cgrp_subsys: ss->dfl_cftypes = NULL; ss->legacy_cftypes = mem_cgroup_files; */
		if (ss->dfl_cftypes == ss->legacy_cftypes) {
			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
		} else {
			/* ss->dfl_cftypes = NULL, nothing to register here */
			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
			/* cgroup_add_legacy_cftypes->cgroup_init_cftypes: this is where
			 * the cftypes of cpu_files and mem_cgroup_files are initialized;
			 * note the difference from the cgroup_legacy_base_files
			 * initialization above */
			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
		}
	}

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj)
		return -ENOMEM;

	/* Register the cgroup filesystem; mounting goes through cgroup_mount */
	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		return err;
	}

	/* Create /proc/cgroups */
	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
	return 0;
}
start_kernel->cgroup_init->cgroup_init_cftypes:
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
	struct cftype *cft;

	/* Each file corresponds to one cftype; e.g. mem_cgroup_files describes the
	 * memory control files such as memory.limit_in_bytes. */
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

		WARN_ON(cft->ss || cft->kf_ops);

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		/* Record the per-file operations, used when the cgroup file is read
		 * or written:
		 * - inode->i_fops = kernfs_file_fops (registered at cgroup_mount time);
		 * - cgroup_kf_ops->write / seq_read (cgroup_file_write/read), set up here;
		 * - mem_cgroup_files[].write/read (e.g. mem_cgroup_write), the
		 *   per-controller handlers for files such as memory.limit_in_bytes. */
		cft->kf_ops = kf_ops;
		cft->ss = ss;
	}
	return 0;
}

cgroup_init->cgroup_init_subsys() initializes memory_cgrp_subsys;

cgroup_init_early->cgroup_init_subsys() initializes cpu_cgrp_subsys;

cgroup_init_subsys creates the css (struct cgroup_subsys_state);

cgroup_init_subsys->css_alloc allocates the controller state structures such as mem_cgroup and task_group;

cgroup_init_subsys->css_online associates structures such as mem_cgroup and task_group with the processes whose PIDs are written into tasks;

cgroup_mkdir->create_css does the same job; see the later sections.

static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
{
	struct cgroup_subsys_state *css;

	mutex_lock(&cgroup_mutex);

	idr_init(&ss->css_idr);
	INIT_LIST_HEAD(&ss->cfts);

	ss->root = &cgrp_dfl_root;

	/* Allocate the css. Note:
	 *
	 * static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
	 *                                               struct cgroup_subsys *ss)
	 * {
	 *         // cgrp_dfl_root is initialized in cgroup_init_early;
	 *         // cgrp_dfl_root.cgrp.subsys[] is still empty here
	 *         return rcu_dereference_check(cgrp->subsys[ss->id],
	 *                                      lockdep_is_held(&cgroup_mutex));
	 * }
	 *
	 * so cgroup_css() returns NULL at this point. css_alloc() allocates the
	 * controller state (e.g. mem_cgroup or task_group), and css_online right
	 * after it links that state to the tasks;
	 * here css_alloc/css_online = cpu_cgroup_css_alloc/cpu_cgroup_css_online. */
	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
	BUG_ON(IS_ERR(css));
	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
	css->flags |= CSS_NO_REF;

	if (early) {
		/* allocation can't be done safely during early init */
		css->id = 1;
	} else {
		css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
		BUG_ON(css->id < 0);
	}

	init_css_set.subsys[ss->id] = css;

	need_forkexit_callback |= ss->fork || ss->exit;

	BUG_ON(!list_empty(&init_task.tasks));

	/* Calls e.g. css_online = cpu_cgroup_css_online */
	BUG_ON(online_css(css));

	mutex_unlock(&cgroup_mutex);
}
【Main】Mounting the cgroup filesystem

1 Mounting cgroup

#mount -t cgroup cgroup /mnt/mtd/cgroup

static struct dentry *cgroup_mount(struct file_system_type *fs_type,
				   int flags, const char *unused_dev_name,
				   void *data)
{
	struct super_block *pinned_sb = NULL;
	struct cgroup_subsys *ss;
	struct cgroup_root *root;
	struct cgroup_sb_opts opts;
	struct dentry *dentry;
	...
	init_cgroup_root(root, &opts);
	/* Creates the cgroup files during mount, e.g. cpu_files[] and mem_cgroup_files[] */
	ret = cgroup_setup_root(root, opts.subsys_mask);
	...
	/* Sets up the inodes etc. */
	dentry = kernfs_mount(fs_type, flags, root->kf_root,
			      CGROUP_SUPER_MAGIC, &new_sb);
	...
}
2 Creating the cgroup files during mount

cgroup_mount->cgroup_setup_root->rebind_subsystems->cgroup_populate_dir->cgroup_addrm_files

static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
	LIST_HEAD(tmp_links);
	struct cgroup *root_cgrp = &root->cgrp;
	struct cftype *base_files;
	struct css_set *cset;
	int i, ret;

	lockdep_assert_held(&cgroup_mutex);

	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
	if (ret < 0)
		goto out;
	root_cgrp->id = ret;

	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
			      GFP_KERNEL);
	if (ret)
		goto out;

	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
	if (ret)
		goto cancel_ref;

	ret = cgroup_init_root_id(root);
	if (ret)
		goto cancel_ref;

	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
					   KERNFS_ROOT_CREATE_DEACTIVATED,
					   root_cgrp);
	if (IS_ERR(root->kf_root)) {
		ret = PTR_ERR(root->kf_root);
		goto exit_root_id;
	}
	root_cgrp->kn = root->kf_root->kn;

	if (root == &cgrp_dfl_root)
		base_files = cgroup_dfl_base_files;
	else
		base_files = cgroup_legacy_base_files;

	/* cgroup file creation:
	 * 1 cgroup_mount creates cgroup_legacy_base_files at mount time;
	 * 2 cgroup_init creates cgroup_dfl_base_files at boot time. */
	ret = cgroup_addrm_files(root_cgrp, base_files, true);
	if (ret)
		goto destroy_root;

	/* Creates the per-controller files, e.g. cpu_files[] and mem_cgroup_files[] */
	ret = rebind_subsystems(root, ss_mask);
	if (ret)
		goto destroy_root;

	/*
	 * There must be no failure case after here, since rebinding takes
	 * care of subsystems' refcounts, which are explicitly dropped in
	 * the failure exit path.
	 */
	list_add(&root->root_list, &cgroup_roots);
	cgroup_root_count++;

	/*
	 * Link the root cgroup in this hierarchy into all the css_set
	 * objects.
	 */
	down_write(&css_set_rwsem);
	hash_for_each(css_set_table, i, cset, hlist)
		link_css_set(&tmp_links, cset, root_cgrp);
	up_write(&css_set_rwsem);

	BUG_ON(!list_empty(&root_cgrp->self.children));
	BUG_ON(atomic_read(&root->nr_cgrps) != 1);

	kernfs_activate(root_cgrp->kn);
	...
	return ret;
}

Adding and removing cgroup files:

cgroup_mount->cgroup_setup_root->cgroup_addrm_files()

cgroup_mount->cgroup_setup_root->rebind_subsystems->cgroup_populate_dir->cgroup_addrm_files()

/* Add and remove cgroup files */
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	struct cftype *cft;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	/* e.g. the mem_cgroup_files[] files are created here during mount */
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
			continue;

		if (is_add) {
			/* Create the cgroup file: ->__kernfs_create_file */
			ret = cgroup_add_file(cgrp, cft);
			if (ret) {
				pr_warn("%s: failed to add %s, err=%d\n",
					__func__, cft->name, ret);
				return ret;
			}
		} else {
			/* Remove the cgroup file */
			cgroup_rm_file(cgrp, cft);
		}
	}
	return 0;
}
3 Registering the cgroup files' inode->i_fops during mount:

cgroup_mount->kernfs_mount->kernfs_fill_super->kernfs_get_inode->

kernfs_init_inode():inode->i_fops=kernfs_file_fops;

4 The cgroup file write path

1) inode->i_fops = kernfs_file_fops, whose write/read are kernfs_fop_write/kernfs_fop_read;

set up during cgroup_mount->kernfs_mount->kernfs_fill_super->kernfs_get_inode->kernfs_init_inode():

inode->i_fops = kernfs_file_fops;

2) cgroup_kf_ops->write (cgroup_file_write) and seq_read, initialized in cgroup_init->cgroup_init_cftypes;

Initialization of the of argument (struct kernfs_open_file):

in kernfs_file_fops->open (kernfs_fop_open), ((struct seq_file *)file->private_data)->private = of;

The kernfs_node of->kn is initialized in the following steps:

An open cgroup file corresponds to an of->kn (a kernfs_node) and also to of->kn->parent, which is a kernfs_node too; in other words, each cgroup file has its own kernfs_node (of->kn) and a parent kernfs_node (of->kn->parent).

First, the creation of of->kn->parent:

1> kernfs_fill_super contains the assignment: struct dentry *root; root->d_fsdata = info->root->kn;

2> Where does info->root->kn come from? Trace back to the mount path: the root->kf_root argument passed to cgroup_mount->kernfs_mount() is the root behind info->root->kn, and that root is created in cgroup_mount;

info->root->kn itself is created in cgroup_mount->cgroup_setup_root->kernfs_create_root, which sets kn->priv = root_cgrp (the cgroup). Note that this kn, created during cgroup_mount, is of->kn->parent;

hence cgrp = of->kn->parent->priv is established during cgroup_mount->cgroup_setup_root.

With of->kn->parent covered, back to the creation of of->kn:

1> After cgroup_mount has created the parent kernfs_node (of->kn->parent), it continues with cgroup_mount->rebind_subsystems->cgroup_populate_dir->cgroup_addrm_files->cgroup_add_file->__kernfs_create_file, which creates a kernfs_node (of->kn) for every cgroup file, with the of->kn->parent created above as its parent. At this point of->kn->priv = cftype (each cgroup file's own cftype, see the mem_cgroup_files[] definition);

__kernfs_create_file->kernfs_add_one links of->kn to of->kn->parent.

When the file is opened, kernfs_iop_lookup is called: starting from of->kn->parent it looks up the kernfs_node by the cgroup file name. Because kernfs_add_one linked of->kn to of->kn->parent, kernfs_iop_lookup finds the kernfs_node created by cgroup_add_file->__kernfs_create_file, and it performs the assignment dentry->d_fsdata = kn = of->kn, i.e. the kernfs_node created in __kernfs_create_file.

(Note that kernfs_fill_super performs a similar d_fsdata assignment when the parent kernfs_node is created.)

When a cgroup file is opened, kernfs_fop_open allocates a struct kernfs_open_file *of and sets of->kn = file->f_path.dentry->d_fsdata; this d_fsdata is exactly the kernfs_node created by __kernfs_create_file above.

In short: when writing a cgroup file, from of->kn = file->f_path.dentry->d_fsdata (the kernfs_node recorded at open time) we can reach the parent kernfs_node (of->kn->parent) and through it the cgroup (of->kn->parent->priv).

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
{
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
	int ret;

	if (cft->write)
		return cft->write(of, buf, nbytes, off);

	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	/* ->cgroup_file_write was wired up in cgroup_init_cftypes */
	if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else {
		ret = -EINVAL;
	}

	return ret ?: nbytes;
}
3> mem_cgroup_write()->mem_cgroup_resize_limit()->memcg_oom_recover->memcg_wakeup_oom

mem_cgroup_write is registered in mem_cgroup_files[];

【Main】cgroup file access and how settings take effect

This chapter uses the memory.limit_in_bytes file, defined in mem_cgroup_files[], as an example of how a cgroup file is accessed and how it takes effect.

1 Configuring memory.limit_in_bytes

#echo 10485760 > /mnt/mtd/cpu_memory/A/memory.limit_in_bytes

-- the processes listed in /mnt/mtd/cpu_memory/A/tasks may not use more than 10485760 bytes = 10 MB of memory;

As the previous chapter showed, writing memory.limit_in_bytes with the command above roughly follows:

kernfs_fop_write->cgroup_file_write->mem_cgroup_write();

static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	enum res_type type;
	int name;
	unsigned long long val;
	int ret;

	buf = strstrip(buf);
	type = MEMFILE_TYPE(of_cft(of)->private);
	name = MEMFILE_ATTR(of_cft(of)->private);

	/* Reached via cgroup_kf_single_ops->write = cgroup_file_write -> mem_cgroup_write();
	 * the corresponding cftype is:
	 * static struct cftype mem_cgroup_files[] = {
	 *	{
	 *		.name = "limit_in_bytes",
	 *		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
	 *		.write = mem_cgroup_write,
	 *		.read_u64 = mem_cgroup_read_u64,
	 *	},
	 * }; */
	switch (name) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buf, &val);
		if (ret)
			break;
		if (type == _MEM)
			/* Apply memory.limit_in_bytes */
			ret = mem_cgroup_resize_limit(memcg, val);
		else if (type == _MEMSWAP)
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		else if (type == _KMEM)
			ret = memcg_update_kmem_limit(memcg, val);
		else
			return -EINVAL;
		break;
	case RES_SOFT_LIMIT:
		ret = res_counter_memparse_write_strategy(buf, &val);
		if (ret)
			break;
		/*
		 * For memsw, soft limits are hard to implement in terms
		 * of semantics, for now, we support soft limits for
		 * control without swap
		 */
		if (type == _MEM)
			ret = res_counter_set_soft_limit(&memcg->res, val);
		else
			ret = -EINVAL;
		break;
	default:
		ret = -EINVAL; /* should be BUG() ? */
		break;
	}
	return ret ?: nbytes;
}

2 How memory.limit_in_bytes takes effect

PS: memory.limit_in_bytes is one of the files created at mount time.

Allocation paths for anonymous pages and file pages:

1>handle_pte_fault->do_anonymous_page()->mem_cgroup_try_charge();

2> add_to_page_cache_lru->__add_to_page_cache_locked()->mem_cgroup_try_charge();

2.1 Checking whether memory usage exceeds limit_in_bytes

int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		struct page_cgroup *pc = lookup_page_cgroup(page);
		if (PageCgroupUsed(pc))
			goto out;
	}

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	/* do_swap_account = 0; first look up the mem_cgroup of the process whose
	 * pid was written into /cpu_memory/tasks */
	if (do_swap_account && PageSwapCache(page))
		memcg = try_get_mem_cgroup_from_page(page);
	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	/* Check whether the usage would exceed memory.limit_in_bytes */
	ret = try_charge(memcg, gfp_mask, nr_pages);

	css_put(&memcg->css);

	if (ret == -EINTR) {
		memcg = root_mem_cgroup;
		ret = 0;
	}
out:
	*memcgp = memcg;
	return ret;
}

2.2 Checking whether memory usage exceeds limit_in_bytes: mem_cgroup_try_charge()->try_charge()

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	/* Note: this affects usage_in_bytes */
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long nr_reclaimed;
	unsigned long long size;
	bool may_swap = true;
	bool drained = false;
	int ret = 0;

	if (mem_cgroup_is_root(memcg))
		goto done;
retry:
	if (consume_stock(memcg, nr_pages))
		goto done;

	size = batch * PAGE_SIZE;
	/* size = 32 pages; do_swap_account = 0 means mem+swap is not accounted,
	 * so res_counter_charge(&memcg->memsw, size, &fail_res) is skipped */
	if (!do_swap_account ||
	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
		/* memcg was created in css_alloc/css_online.
		 * Checks memcg->res->usage + val > memcg->res->limit;
		 * res_counter->limit is 10 MB here
		 *   (echo 10485760 > /mnt/mtd/cpu_memory/A/memory.limit_in_bytes);
		 * res_counter->max_usage and res_counter->usage start counting from 0
		 * after echo pid > /mnt/mtd/cpu_memory/A/tasks. */
		if (!res_counter_charge(&memcg->res, size, &fail_res))
			goto done_restock;
		if (do_swap_account)
			res_counter_uncharge(&memcg->memsw, size);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	} else {
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		may_swap = false;
	}

	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
		     fatal_signal_pending(current) ||
		     current->flags & PF_EXITING))
		goto bypass;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	if (!(gfp_mask & __GFP_WAIT))
		goto nomem;

	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	if (!drained) {
		drain_all_stock_async(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_NOFAIL)
		goto bypass;

	if (fatal_signal_pending(current))
		goto bypass;

	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
bypass:
	return -EINTR;

done_restock:
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
done:
	return ret;
}

2.3 Checking whether memory usage exceeds limit_in_bytes: finding the mem_cgroup of the current process

mem_cgroup_try_charge()->get_mem_cgroup_from_mm()

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			/* The current process is the one whose pid was written into
			 * /mnt/mtd/cpu_memory/tasks */
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

Finding the mem_cgroup of the current process:

mem_cgroup_try_charge()->get_mem_cgroup_from_mm()->mem_cgroup_from_task()

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	/* From the task find its css_set (task->cgroups), then the css:
	 * task_css() uses task_css_set_check() to get the task's css_set
	 * (task->cgroups), and from it
	 * task_css() = task->cgroups->subsys[memory_cgrp_id]. */
	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
Finding the mem_cgroup of the current process: from the task find its css_set (task->cgroups);

from that find the css (task_css() = task->cgroups->subsys[memory_cgrp_id]);

and from the css find the mem_cgroup (container_of(s, struct mem_cgroup, css)).

mem_cgroup_from_task()->mem_cgroup_from_css()
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	/* From the cgroup_subsys_state back to the containing mem_cgroup */
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}
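The container_of step is plain pointer arithmetic on the embedded css member. A minimal user-space sketch of the same pattern (the demo_* structures are made up purely for illustration, and the macro below is a simplified version of the kernel's container_of):

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_css { int id; };                /* plays the role of cgroup_subsys_state */

struct demo_memcg {                         /* plays the role of mem_cgroup */
	unsigned long limit;
	struct demo_css css;                /* embedded, like mem_cgroup->css */
};

int main(void)
{
	struct demo_memcg memcg = { .limit = 10485760, .css = { .id = 1 } };
	struct demo_css *css = &memcg.css;  /* what task_css() would hand back */

	/* recover the containing structure from the embedded member */
	struct demo_memcg *back = container_of(css, struct demo_memcg, css);
	printf("limit = %lu\n", back->limit);
	return 0;
}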

Finding the mem_cgroup of the current process: creation of the css and the mem_cgroup

mem_cgroup and cgroup_subsys_state are allocated in css_alloc():

cgroup_mkdir->create_css->css_alloc();

cgroup_init_subsys->css_alloc(); i.e. for cpu_cgrp_subsys or memory_cgrp_subsys

Allocating the css and wiring up its relationships:

static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
		      bool visible)
{
	struct cgroup *parent = cgroup_parent(cgrp);
	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
	struct cgroup_subsys_state *css;
	int err;

	lockdep_assert_held(&cgroup_mutex);

	/* Creates the css and the controller state such as mem_cgroup or task_group;
	 * here css_alloc = mem_cgroup_css_alloc */
	css = ss->css_alloc(parent_css);
	if (IS_ERR(css))
		return PTR_ERR(css);

	init_and_link_css(css, ss, cgrp);

	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
	if (err)
		goto err_free_css;

	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
	if (err < 0)
		goto err_free_percpu_ref;
	css->id = err;

	if (visible) {
		err = cgroup_populate_dir(cgrp, 1 << ss->id);
		if (err)
			goto err_free_id;
	}

	/* @css is ready to be brought online now, make it visible */
	list_add_tail_rcu(&css->sibling, &parent_css->children);
	cgroup_idr_replace(&ss->css_idr, css, css->id);

	/* Wire up the relationships for e.g. mem_cgroup; this calls
	 * cpu_cgroup_css_online / mem_cgroup_css_online */
	err = online_css(css);
	if (err)
		goto err_list_del;

	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
	    cgroup_parent(parent)) {
		if (!strcmp(ss->name, "memory"))
			ss->warned_broken_hierarchy = true;
	}

	return 0;
}
Finding the mem_cgroup of the current process: creation of the css and the mem_cgroup:

cgroup_mkdir->create_css->css_alloc();

cgroup_init_subsys->css_alloc() initializes the css in much the same way as create_css.

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc  = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	...
};

static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct mem_cgroup *memcg;
	long error = -ENOMEM;
	int node;

	/* Allocate the mem_cgroup; cgroup control of a process in a container always
	 * goes task_struct -> task->cgroups -> structures like this mem_cgroup */
	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	for_each_node(node)
		if (alloc_mem_cgroup_per_zone_info(memcg, node))
			goto free_out;

	/* root ? */
	if (parent_css == NULL) {
		root_mem_cgroup = memcg;
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
		res_counter_init(&memcg->kmem, NULL);
	}

	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);
	vmpressure_init(&memcg->vmpressure);
	INIT_LIST_HEAD(&memcg->event_list);
	spin_lock_init(&memcg->event_list_lock);

	/* memcg->css is the cgroup_subsys_state embedded in the mem_cgroup */
	return &memcg->css;
}

Finding the mem_cgroup of the current process: creation of the css and the mem_cgroup

cgroup_mkdir->create_css->online_css

/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys *ss = css->ss;
	int ret = 0;

	lockdep_assert_held(&cgroup_mutex);

	if (ss->css_online)
		ret = ss->css_online(css);
	if (!ret) {
		css->flags |= CSS_ONLINE;
		/* Publish the css into css->cgroup->subsys[] */
		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
	}
	return ret;
}

mem_cgroup initialization:

online_css->css_online=mem_cgroup_css_online

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc  = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	...
};

/* mem_cgroup initialization; note that the mem_cgroup itself was allocated in css_alloc */
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
	int ret;

	if (css->id > MEM_CGROUP_ID_MAX)
		return -ENOSPC;

	if (!parent)
		return 0;

	mutex_lock(&memcg_create_mutex);

	memcg->use_hierarchy = parent->use_hierarchy;
	memcg->oom_kill_disable = parent->oom_kill_disable;
	memcg->swappiness = mem_cgroup_swappiness(parent);

	/* Initialize mem_cgroup->res_counter, used later by try_charge */
	if (parent->use_hierarchy) {
		res_counter_init(&memcg->res, &parent->res);
		res_counter_init(&memcg->memsw, &parent->memsw);
		res_counter_init(&memcg->kmem, &parent->kmem);

		/*
		 * No need to take a reference to the parent because cgroup
		 * core guarantees its existence.
		 */
	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
		res_counter_init(&memcg->kmem, NULL);
		/*
		 * Deeper hierachy with use_hierarchy == false doesn't make
		 * much sense so let cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			memory_cgrp_subsys.broken_hierarchy = true;
	}
	mutex_unlock(&memcg_create_mutex);

	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
	if (ret)
		return ret;

	/*
	 * Make sure the memcg is initialized: mem_cgroup_iter()
	 * orders reading memcg->initialized against its callers
	 * reading the memcg members.
	 */
	smp_store_release(&memcg->initialized, 1);
	return 0;
}

3 Associating tasks (echo pid > /mnt/mtd/cpu_memory/tasks) with the mem_cgroup

cgroup_fork initializes task->cgroups (a struct css_set):

3.1 css_set initialization, i.e. initializing task->cgroups

fork->cgroup_fork;fork->cgroup_post_fork()

void cgroup_fork(struct task_struct *child)
{
	/* Initialize task_struct->cgroups, which is a struct css_set pointer */
	RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

3.2 After task->cgroups has been initialized, writing the tasks file (echo pid > /mnt/mtd/cpu_memory/tasks) installs a new task->cgroups.

Call path: kernfs_fop_write->cgroup_file_write->__cgroup_procs_write->cgroup_attach_task->cgroup_migrate->cgroup_task_migrate();

cgroup_attach_task():

static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;

	/* look up all src csets */
	down_read(&css_set_rwsem);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	up_read(&css_set_rwsem);

	/* Allocate the new css_sets: prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
		/* Install the new task->cgroups (css_set) */
		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);

	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}

cgroup_attach_task->cgroup_migrate->cgroup_task_migrate()

static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_rwsem);

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	old_cset = task_css_set(tsk);

	get_css_set(new_cset);
	/* Install the new css_set as task->cgroups; it was set up on the
	 * cgroup_attach_task->cgroup_migrate_prepare_dst path */
	rcu_assign_pointer(tsk->cgroups, new_cset);

	/*
	 * Use move_tail so that cgroup_taskset_first() still returns the
	 * leader after migration.  This works because cgroup_migrate()
	 * ensures that the dst_cset of the leader is the first on the
	 * tset's dst_csets list.
	 */
	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	put_css_set_locked(old_cset);
}

How cpu.shares takes effect:

First understand how the task_group is created; as with mem_cgroup, it comes from:

cgroup_mkdir->create_css->css_alloc()->cpu_cgroup_css_alloc;

cgroup_init_subsys->css_alloc()->cpu_cgroup_css_alloc;

The key function is cpu_cgroup_css_alloc->sched_create_group(). Taking CFS scheduling as the example, it allocates a new run queue, task_group->cfs_rq, which manages the group's runnable tasks; this plays the same role for tasks in the group as rq->cfs of rq = cpu_rq(cpu) does for processes outside a container.

It also allocates a new scheduling entity, task_group->se, which manages the group's scheduling time; this plays the same role as the per-task sched_entity queued on rq->cfs for processes outside a container.
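As a rough illustration of the effect (the paths follow the mount used in this article; groups A and B are simply created for the demo), the sketch below puts two busy loops into two cpu cgroups with cpu.shares 1024 and 512. When both hogs contend for the same CPU (e.g. on a single-core board), top(1) should show roughly a 2:1 split of CPU time between them:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/wait.h>

static void write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

static void hog(const char *dir, const char *shares)
{
	char path[128], pid[16];

	snprintf(path, sizeof(path), "%s/cpu.shares", dir);
	write_file(path, shares);             /* set the group's weight */

	snprintf(path, sizeof(path), "%s/tasks", dir);
	snprintf(pid, sizeof(pid), "%d", getpid());
	write_file(path, pid);                /* join the group */

	for (;;)                              /* burn CPU; compare the two with top(1) */
		;
}

int main(void)
{
	mkdir("/mnt/mtd/cpu_memory/A", 0755);
	mkdir("/mnt/mtd/cpu_memory/B", 0755);

	if (fork() == 0)
		hog("/mnt/mtd/cpu_memory/A", "1024");
	if (fork() == 0)
		hog("/mnt/mtd/cpu_memory/B", "512");

	wait(NULL);
	wait(NULL);
	return 0;
}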

【Main】A cgroup usage example

1 Commands

#mount -t cgroup -o cpu,memory mem_cpu /mnt/mtd/cpu_memory

# ls /mnt/mtd/cpu_memory/
cgroup.clone_children            memory.oom_control
cgroup.event_control             memory.pressure_level
cgroup.procs                     memory.soft_limit_in_bytes
cgroup.sane_behavior             memory.stat
cpu.shares                       memory.swappiness
memory.failcnt                   memory.usage_in_bytes
memory.force_empty               memory.use_hierarchy
memory.limit_in_bytes            notify_on_release
memory.max_usage_in_bytes        release_agent
memory.move_charge_at_immigrate  tasks
#mkdir /mnt/mtd/cpu_memory/A;cd /mnt/mtd/cpu_memory/A

#echo 10485760 > /mnt/mtd/cpu_memory/A/memory.limit_in_bytes

-- the processes listed in /mnt/mtd/cpu_memory/A/tasks may not use more than 10485760 bytes = 10 MB of memory;

#/home/test &   -- start the test process; assume its pid is 100;

#echo 100 > /mnt/mtd/cpu_memory/A/tasks

do_anonymous_page()->mem_cgroup_try_charge() then checks whether the process with pid 100 has exceeded its memory limit; a user-space sketch of this scenario follows.
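In the sketch below (the path /mnt/mtd/cpu_memory/A and the 10 MB limit follow the example above), the program adds itself to group A instead of a separately started test binary, then allocates and touches memory one megabyte at a time. Every touched page goes through do_anonymous_page()->mem_cgroup_try_charge()->try_charge(); once the charged usage reaches the limit, try_charge() starts reclaiming and, with no swap to fall back on, the task should eventually be OOM-killed inside the cgroup:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	char pid[16];
	int fd = open("/mnt/mtd/cpu_memory/A/tasks", O_WRONLY);
	if (fd < 0)
		return 1;
	snprintf(pid, sizeof(pid), "%d", getpid());
	write(fd, pid, strlen(pid));        /* same effect as: echo $$ > A/tasks */
	close(fd);

	for (int mb = 1; ; mb++) {
		char *p = malloc(1 << 20);
		if (!p)
			break;
		memset(p, 0xa5, 1 << 20);   /* fault the pages in, charging the memcg */
		printf("allocated %d MB\n", mb);
		/* expected: beyond ~10 MB the group hits its limit and this
		 * process is reclaimed against or OOM-killed */
	}
	return 0;
}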

2 /proc/cgroups

#mount -t cgroup -o cpu,memory cpu_memory /mnt/mtd/cpu_memory

#cat /proc/cgroups

subsys_name  hierarchy  num_cgroups  enabled
cpu          1          1            1
memory       1          1            1

#mkdir -p /mnt/mtd/cpu_memory/A   --- after this, num_cgroups = 2;

static int proc_cgroupstats_show(...)
{
	struct cgroup_subsys *ss;
	int i;

	/* walks cgroup_subsys[] (CGROUP_SUBSYS_COUNT entries) */
	for_each_subsys(ss, i)
		...
}

【Summary】

1 Configure the memory limit for processes in a container and add processes to the container:

#echo 10485760 > /mnt/mtd/cpu_memory/A/memory.limit_in_bytes

-- the processes listed in /mnt/mtd/cpu_memory/A/tasks may not use more than 10485760 bytes = 10 MB of memory;

#echo pid > /mnt/mtd/cpu_memory/A/tasks

2 The memory-usage check happens in the page fault path:

do_page_fault->handle_mm_fault->

1> anonymous pages: handle_pte_fault->do_anonymous_page()->mem_cgroup_try_charge()->try_charge;

2> file pages: add_to_page_cache_lru->__add_to_page_cache_locked()->mem_cgroup_try_charge()->try_charge;

3 How cgroup control takes effect for a process:

1> When tasks is written, cgroup_attach_task finds the task_struct from the pid and sets up task->cgroups (the css_set);

2> When a process inside the container allocates memory, it goes from its own task->cgroups (css_set) to the css (cgroup_subsys_state) and from the css to the mem_cgroup, which holds the memory-control information of the cgroup mechanism; see the analysis above;

3> All cgroup control over processes in a container is carried out through structures such as mem_cgroup and task_group. How does each process find its own mem_cgroup/task_group? Through the css embedded in mem_cgroup/task_group, which is linked to task->cgroups (the css_set); in other words, every cgroup operation starts from the process's task->cgroups.

4 The key to cgroup control is understanding how the mem_cgroup/task_group of each process in a container is managed and used.