SLUB allocator implementation

The SLUB allocator mainly exposes the following interfaces (a usage sketch follows the prototypes):


// Create a cache
struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
                        unsigned long,
                        void (*)(void *));
// Destroy a cache
void kmem_cache_destroy(struct kmem_cache *);
// Allocate an object from a cache
static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
                        gfp_t flags, int node);
// Free an object
void kmem_cache_free(struct kmem_cache *, void *);
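Before diving into the implementation, here is a minimal usage sketch of these interfaces in a kernel-module context. The struct foo object type and the module boilerplate are hypothetical and not part of the original code; kmem_cache_alloc() is used instead of the node-aware variant for brevity.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo {                            /* hypothetical object type */
        int id;
        char name[32];
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
        struct foo *obj;

        /* create a cache of struct foo objects, hardware-cacheline aligned */
        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                      0, SLAB_HWCACHE_ALIGN, NULL);
        if (!foo_cache)
                return -ENOMEM;

        obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);  /* take one object ... */
        if (obj)
                kmem_cache_free(foo_cache, obj);        /* ... and give it back */
        return 0;
}

static void __exit foo_exit(void)
{
        kmem_cache_destroy(foo_cache);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");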

Now let's go straight to the code:


struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
                        unsigned long flags, void (*ctor)(void *))
{
        return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
}

/*
 * Create a cache. First try to find an existing kmem_cache that can satisfy
 * the request; if none can be reused, allocate a new kmem_cache (which will
 * later get its pages through the slub allocator) and set it up.
 */
struct kmem_cache *
kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
                        size_t align, unsigned long flags, void (*ctor)(void *),
                        struct kmem_cache *parent_cache)
{
        struct kmem_cache *s = NULL;
        int err = 0;

        get_online_cpus();
        mutex_lock(&slab_mutex);

        if (!kmem_cache_sanity_check(memcg, name, size) == 0)
                goto out_locked;

        /*
         * Some allocators will constraint the set of valid flags to a subset
         * of all flags. We expect them to define CACHE_CREATE_MASK in this
         * case, and we'll just provide them with a sanitized version of the
         * passed flags.
         */
        flags &= CACHE_CREATE_MASK;

        /* look for an existing, compatible kmem_cache that can be merged */
        s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
        if (s)
                goto out_locked;

        /* kmem_cache_zalloc() eventually ends up in slab_alloc() */
        s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
        if (s) {
                s->object_size = s->size = size;
                s->align = calculate_alignment(flags, align, size);
                s->ctor = ctor;

                if (memcg_register_cache(memcg, s, parent_cache)) {
                        kmem_cache_free(kmem_cache, s);
                        err = -ENOMEM;
                        goto out_locked;
                }

                s->name = kstrdup(name, GFP_KERNEL);
                if (!s->name) {
                        kmem_cache_free(kmem_cache, s);
                        err = -ENOMEM;
                        goto out_locked;
                }

                err = __kmem_cache_create(s, flags);
                if (!err) {
                        s->refcount = 1;
                        /* add the new cache to the global slab_caches list */
                        list_add(&s->list, &slab_caches);
                        memcg_cache_list_add(memcg, s);
                } else {
                        kfree(s->name);
                        kmem_cache_free(kmem_cache, s);
                }
        } else
                err = -ENOMEM;

out_locked:
        mutex_unlock(&slab_mutex);
        put_online_cpus();

        if (err) {
                if (flags & SLAB_PANIC)
                        panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
                                name, err);
                else {
                        printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
                                name, err);
                        dump_stack();
                }
                return NULL;
        }
        return s;
}

void kmem_cache_destroy(struct kmem_cache *s)
{
        /* Destroy all the children caches if we aren't a memcg cache */
        kmem_cache_destroy_memcg_children(s);

        get_online_cpus();
        mutex_lock(&slab_mutex);
        s->refcount--;
        if (!s->refcount) {
                list_del(&s->list);             /* remove from the slab_caches list */

                if (!__kmem_cache_shutdown(s)) {        /* tear down the cache itself */
                        mutex_unlock(&slab_mutex);
                        if (s->flags & SLAB_DESTROY_BY_RCU)
                                rcu_barrier();

                        memcg_release_cache(s);
                        kfree(s->name);
                        /* return the kmem_cache structure to the kmem_cache cache */
                        kmem_cache_free(kmem_cache, s);
                } else {
                        list_add(&s->list, &slab_caches);
                        mutex_unlock(&slab_mutex);
                        printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
                                s->name);
                        dump_stack();
                }
        } else {
                mutex_unlock(&slab_mutex);
        }
        put_online_cpus();
}
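The merge step at the top of kmem_cache_create_memcg() (__kmem_cache_alias()) is what lets many similar caches share a single kmem_cache. The following user-space sketch only illustrates that idea; the compatibility rules here are simplified assumptions and do not reproduce the kernel's actual find_mergeable() checks (ctor, debug flags, merge blacklist, size rounding), and find_mergeable_cache() is a made-up name.

#include <stddef.h>
#include <stdio.h>

struct cache {
        const char   *name;
        size_t        object_size;      /* requested object size */
        size_t        size;             /* rounded size actually used */
        size_t        align;
        unsigned long flags;
        int           refcount;
        struct cache *next;             /* stand-in for the slab_caches list */
};

/*
 * Reuse an existing cache when size, alignment and flags are compatible;
 * otherwise the caller has to create a fresh one.
 */
static struct cache *find_mergeable_cache(struct cache *list, size_t size,
                                          size_t align, unsigned long flags)
{
        struct cache *s;

        for (s = list; s; s = s->next) {
                if (size > s->size)
                        continue;       /* too small to hold the object */
                if (align && (s->size & (align - 1)))
                        continue;       /* existing layout breaks the alignment */
                if (s->flags != flags)
                        continue;       /* incompatible debug/reclaim flags */
                s->refcount++;          /* share it instead of creating a new one */
                return s;
        }
        return NULL;
}

int main(void)
{
        struct cache a = { "kmalloc-64", 64, 64, 8, 0, 1, NULL };
        struct cache *hit = find_mergeable_cache(&a, 48, 8, 0);

        printf("%s\n", hit ? hit->name : "need a new cache");
        return 0;
}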
/*
 * Release all resources used by a slab cache: the per-cpu slabs and the slabs
 * on the node partial lists; the pages backing those slabs are finally
 * returned to the buddy allocator.
 */
static inline int kmem_cache_close(struct kmem_cache *s)
{
        int node;

        flush_all(s);
        /* Attempt to free all objects */
        for_each_node_state(node, N_NORMAL_MEMORY) {
                struct kmem_cache_node *n = get_node(s, node);

                free_partial(s, n);
                if (n->nr_partial || slabs_node(s, node))
                        return 1;
        }
        free_percpu(s->cpu_slab);       /* free the per-cpu cpu_slab structures */
        free_kmem_cache_nodes(s);
        return 0;
}

/*
 * Flush every CPU's local slab cache; afterwards no CPU has a local slab
 * left to allocate from.
 */
static void flush_all(struct kmem_cache *s)
{
        on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
}

static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
        struct page *page, *h;

        list_for_each_entry_safe(page, h, &n->partial, lru) {
                if (!page->inuse) {             /* no object of this slab is in use */
                        remove_partial(n, page);        /* take it off the node partial list */
                        discard_slab(s, page);          /* return its pages to the buddy allocator */
                } else {
                        list_slab_objects(s, page,
                                "Objects remaining in %s on kmem_cache_close()");
                }
        }
}

static void free_kmem_cache_nodes(struct kmem_cache *s)
{
        int node;

        for_each_node_state(node, N_NORMAL_MEMORY) {
                struct kmem_cache_node *n = s->node[node];

                if (n)
                        /* in the end this is just another slab free */
                        kmem_cache_free(kmem_cache_node, n);
                s->node[node] = NULL;
        }
}

/*
 * When an object is requested from a cache, the allocation order is: first the
 * slab the local CPU is currently using; if that has nothing free, take a slab
 * from the per-cpu partial list; if that fails, take one from the node partial
 * list; and if that fails as well, request new pages from the buddy allocator.
 * Freeing an object walks the same hierarchy.
 */
static __always_inline void *slab_alloc(struct kmem_cache *s,
                gfp_t gfpflags, unsigned long addr)
{
        return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
}

/*
 * Try the local per-cpu cache first; if that fails, fall back to the slow
 * path, which may end up requesting pages from the buddy allocator.
 */
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
                gfp_t gfpflags, int node, unsigned long addr)
{
        void **object;
        struct kmem_cache_cpu *c;
        struct page *page;
        unsigned long tid;

        /* sanity-check the gfp flags used for this allocation */
        if (slab_pre_alloc_hook(s, gfpflags))
                return NULL;

        /* pick the right (memcg-aware) kmem_cache */
        s = memcg_kmem_get_cache(s, gfpflags);
redo:
        /*
         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
         * enabled. We may switch back and forth between cpus while
         * reading from one cpu area. That does not matter as long
         * as we end up on the original cpu again when doing the cmpxchg.
         *
         * Preemption is disabled for the retrieval of the tid because that
         * must occur from the current processor. We cannot allow rescheduling
         * on a different processor between the determination of the pointer
         * and the retrieval of the tid.
         */
        /* disable preemption and fetch this CPU's kmem_cache_cpu */
        preempt_disable();
        c = __this_cpu_ptr(s->cpu_slab);

        /*
         * The transaction ids are globally unique per cpu and per operation on
         * a per cpu queue. Thus they guarantee that the cmpxchg_double
         * occurs on the right processor and that there was no operation on the
         * linked list in between.
         */
        tid = c->tid;
        preempt_enable();

        object = c->freelist;   /* first free object of the local slab */
        page = c->page;         /* page frame backing the local slab */
        if (unlikely(!object || !node_match(page, node)))
                /* nothing free locally, or wrong node: take the slow path */
                object = __slab_alloc(s, gfpflags, node, addr, c);
        else {
                /* fast path on the local CPU */
                void *next_object = get_freepointer_safe(s, object);

                /*
                 * The cmpxchg will only match if there was no additional
                 * operation and if we are on the right processor.
                 *
                 * The cmpxchg does the following atomically (without lock semantics!)
                 * 1. Relocate first pointer to the current per cpu area.
                 * 2. Verify that tid and freelist have not been changed
                 * 3. If they were not changed replace tid and freelist
                 *
                 * Since this is without lock semantics the protection is only against
                 * code executing on this cpu *not* from access by other cpus.
                 */
                if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                object, tid,
                                next_object, next_tid(tid)))) {
                        note_cmpxchg_failure("slab_alloc", s, tid);
                        goto redo;
                }
                prefetch_freepointer(s, next_object);
                stat(s, ALLOC_FASTPATH);
        }

        if (unlikely(gfpflags & __GFP_ZERO) && object)
                memset(object, 0, s->object_size);

        slab_post_alloc_hook(s, gfpflags, object);
        return object;
}

/*
 * Slow path: first try to get an object from the slab currently installed as
 * the cpu_slab's page; if it has no free objects, move a slab from the
 * cpu_slab partial list into c->page and allocate from that; if that fails,
 * get a slab from the node partial list; and if that fails too, request new
 * pages from the buddy allocator and install them as the cpu_slab's page.
 */
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                unsigned long addr, struct kmem_cache_cpu *c)
{
        void *freelist;
        struct page *page;
        unsigned long flags;

        local_irq_save(flags);
#ifdef CONFIG_PREEMPT
        /*
         * We may have been preempted and rescheduled on a different
         * cpu before disabling interrupts. Need to reload cpu area
         * pointer.
         */
        c = this_cpu_ptr(s->cpu_slab);
#endif

        page = c->page;
        if (!page)
                goto new_slab;
redo:
        /* the cpu slab does not match the requested node: drop it and retry */
        if (unlikely(!node_match(page, node))) {
                stat(s, ALLOC_NODE_MISMATCH);
                /* deactivate_slab() moves the cpu slab back onto a node partial list */
                deactivate_slab(s, page, c->freelist);
                c->page = NULL;
                c->freelist = NULL;
                goto new_slab;
        }

        /*
         * By rights, we should be searching for a slab page that was
         * PFMEMALLOC but right now, we are losing the pfmemalloc
         * information when the page leaves the per-cpu allocator
         */
        if (unlikely(!pfmemalloc_match(page, gfpflags))) {
                deactivate_slab(s, page, c->freelist);
                c->page = NULL;
                c->freelist = NULL;
                goto new_slab;
        }

        /* must check again c->freelist in case of cpu migration or IRQ */
        freelist = c->freelist;         /* the local free-object list */
        if (freelist)
                goto load_freelist;

        stat(s, ALLOC_SLOWPATH);

        /* take over the page's freelist: returns it and clears page->freelist */
        freelist = get_freelist(s, page);

        if (!freelist) {
                c->page = NULL;
                stat(s, DEACTIVATE_BYPASS);
                goto new_slab;
        }

        stat(s, ALLOC_REFILL);

load_freelist:
        /*
         * freelist is pointing to the list of objects to be used.
         * page is pointing to the page from which the objects are obtained.
         * That page must be frozen for per cpu allocations to work.
         */
        VM_BUG_ON(!c->page->frozen);
        c->freelist = get_freepointer(s, freelist);
        c->tid = next_tid(c->tid);
        local_irq_restore(flags);
        return freelist;

new_slab:
        if (c->partial) {
                page = c->page = c->partial;
                c->partial = page->next;
                stat(s, CPU_PARTIAL_ALLOC);
                c->freelist = NULL;
                goto redo;
        }

        freelist = new_slab_objects(s, gfpflags, node, &c);

        if (unlikely(!freelist)) {
                if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
                        slab_out_of_memory(s, gfpflags, node);

                local_irq_restore(flags);
                return NULL;
        }

        page = c->page;
        if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
                goto load_freelist;

        /* Only entered in the debug case */
        if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
                goto new_slab;  /* Slab failed checks. Next slab needed */

        deactivate_slab(s, page, get_freepointer(s, freelist));
        c->page = NULL;
        c->freelist = NULL;
        local_irq_restore(flags);
        return freelist;
}
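Both the fast path above and the slow path hand out objects by walking a freelist whose links are stored inside the free objects themselves (set_freepointer()/get_freepointer()). The following self-contained user-space sketch models that in-object freelist; the sizes and names are made up for illustration, and the link is kept at offset 0 instead of SLUB's s->offset.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define OBJ_SIZE   64
#define NR_OBJECTS 8

/* each free object stores the address of the next free object inside itself */
static void *build_freelist(void *slab_base)
{
        char *p = slab_base;
        int i;

        for (i = 0; i < NR_OBJECTS - 1; i++, p += OBJ_SIZE)
                *(void **)p = p + OBJ_SIZE;     /* link to the next object */
        *(void **)p = NULL;                     /* last object ends the list */
        return slab_base;                       /* head: the first free object */
}

int main(void)
{
        void *slab = malloc(OBJ_SIZE * NR_OBJECTS);     /* stands in for the slab page */
        void *freelist = build_freelist(slab);

        /* "allocate": pop the head and advance to the stored next pointer */
        void *obj = freelist;
        freelist = *(void **)obj;
        memset(obj, 0, OBJ_SIZE);

        /* "free": push the object back onto the head of the freelist */
        *(void **)obj = freelist;
        freelist = obj;

        printf("object at %p, freelist head is now %p\n", obj, freelist);
        free(slab);
        return 0;
}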
/*
 * Get objects for the cpu slab:
 * 1. try to take a partial slab from the current node's partial list, install
 *    it in the local cpu cache and return the address of its first free object;
 * 2. if that fails, request pages from the buddy allocator and install the
 *    new slab in the local cpu cache.
 */
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
                int node, struct kmem_cache_cpu **pc)
{
        void *freelist;
        struct kmem_cache_cpu *c = *pc;
        struct page *page;

        /* try the node partial lists first; otherwise a new slab is needed */
        freelist = get_partial(s, flags, node, c);

        if (freelist)
                return freelist;

        /* allocate pages, initialize the slab and install it as the cpu slab */
        page = new_slab(s, flags, node);
        if (page) {
                c = __this_cpu_ptr(s->cpu_slab);
                if (c->page)    /* the cpu is still holding a slab: flush it first */
                        flush_slab(s, c);

                /*
                 * No other reference to the page yet so we can
                 * muck around with it freely without cmpxchg
                 */
                freelist = page->freelist;
                page->freelist = NULL;

                stat(s, ALLOC_SLAB);
                c->page = page;
                *pc = c;
        } else
                freelist = NULL;

        return freelist;        /* address of the slab's first free object */
}

/*
 * Try to allocate a partial slab from a specific node: take a slab off the
 * node's partial list and attach it to the kmem_cache_cpu.
 */
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
                struct kmem_cache_cpu *c, gfp_t flags)
{
        struct page *page, *page2;
        void *object = NULL;
        int available = 0;
        int objects;

        /*
         * Racy check. If we mistakenly see no partial slabs then we
         * just allocate an empty slab. If we mistakenly try to get a
         * partial slab and there is none available then get_partials()
         * will return NULL.
         */
        if (!n || !n->nr_partial)
                return NULL;

        spin_lock(&n->list_lock);
        /* walk node n's partial list */
        list_for_each_entry_safe(page, page2, &n->partial, lru) {
                void *t;

                if (!pfmemalloc_match(page, flags))
                        continue;

                /* grab one slab off the node partial list */
                t = acquire_slab(s, n, page, object == NULL, &objects);
                if (!t)
                        break;

                available += objects;           /* number of objects now available */
                if (!object) {
                        /* the first slab acquired becomes the cpu's active slab */
                        c->page = page;
                        stat(s, ALLOC_FROM_PARTIAL);
                        object = t;
                } else {
                        /* further slabs are chained onto the cpu partial list */
                        put_cpu_partial(s, page, 0);
                        stat(s, CPU_PARTIAL_NODE);
                }
                if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
                        break;
        }
        spin_unlock(&n->list_lock);
        return object;
}

/*
 * Allocate pages for a new slab of this kmem_cache, fill in the per-object
 * debug information and initialize the page metadata.
 */
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
        struct page *page;
        void *start;
        void *last;
        void *p;
        int order;

        BUG_ON(flags & GFP_SLAB_BUG_MASK);

        page = allocate_slab(s,
                flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
        if (!page)
                goto out;

        order = compound_order(page);   /* order of the compound page backing the slab */
        inc_slabs_node(s, page_to_nid(page), page->objects);
        memcg_bind_pages(s, order);
        page->slab_cache = s;
        __SetPageSlab(page);
        if (page->pfmemalloc)
                SetPageSlabPfmemalloc(page);

        start = page_address(page);     /* virtual address of the page */

        /* with SLAB_POISON set, fill the whole slab with POISON_INUSE (0x5a) */
        if (unlikely(s->flags & SLAB_POISON))
                memset(start, POISON_INUSE, PAGE_SIZE << order);

        last = start;
        kasan_poison_slab(page);
        /* initialize every object in the slab; s->size is the per-object stride */
        for_each_object(p, s, start, page->objects) {
                setup_object(s, page, last);    /* build debug data for 'last' */
                set_freepointer(s, last, p);    /* point last's free pointer at p */
                last = p;
        }
        setup_object(s, page, last);
        set_freepointer(s, last, NULL);

        page->freelist = start;
        page->inuse = page->objects;    /* number of objects */
        page->frozen = 1;
out:
        return page;
}

/*
 * Object debug checks for alloc/free paths: build the object's debug
 * information, i.e. poison pattern, red zone and alloc/free call stacks.
 */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
                void *object)
{
        if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
                return;

        init_object(s, object, SLUB_RED_INACTIVE);
        init_tracking(s, object);
}

/*
 * The first object_size-1 bytes of an object are filled with 0x6b (POISON_FREE),
 * the object_size-th byte is 0xa5 (POISON_END), and the following
 * (s->inuse - s->object_size) bytes are filled with the red-zone value
 * (0xbb for an inactive object); inuse is the object's effective size
 * (inuse == offset here).
 */
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
        u8 *p = object;

        if (s->flags & __OBJECT_POISON) {
                memset(p, POISON_FREE, s->object_size - 1);
                p[s->object_size - 1] = POISON_END;
        }

        if (s->flags & SLAB_RED_ZONE)
                memset(p + s->object_size, val, s->inuse - s->object_size);
}

/* with SLAB_STORE_USER set, record the allocation/free call stacks for the object */
static void init_tracking(struct kmem_cache *s, void *object)
{
        if (!(s->flags & SLAB_STORE_USER))
                return;

        set_track(s, object, TRACK_FREE, 0UL);
        set_track(s, object, TRACK_ALLOC, 0UL);
}
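To make the fill pattern concrete, here is a small user-space sketch that reproduces what init_object() does for a free object, using the poison values quoted above (0x6b, 0xa5, 0xbb) and hypothetical sizes of object_size = 32 and inuse = 40:

#include <stdio.h>
#include <string.h>

#define POISON_FREE       0x6b  /* payload of a free, poisoned object */
#define POISON_END        0xa5  /* marks the last payload byte */
#define SLUB_RED_INACTIVE 0xbb  /* red-zone value for an inactive object */

int main(void)
{
        size_t object_size = 32, inuse = 40;    /* hypothetical layout */
        unsigned char obj[40];
        size_t i;

        /* same three steps as init_object() with __OBJECT_POISON | SLAB_RED_ZONE */
        memset(obj, POISON_FREE, object_size - 1);
        obj[object_size - 1] = POISON_END;
        memset(obj + object_size, SLUB_RED_INACTIVE, inuse - object_size);

        for (i = 0; i < inuse; i++)
                printf("%02x%c", obj[i], (i + 1) % 8 ? ' ' : '\n');
        return 0;
}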
/* Below is the slab free path. */
static __always_inline void slab_free(struct kmem_cache *s,
                struct page *page, void *x, unsigned long addr)
{
        void **object = (void *)x;
        struct kmem_cache_cpu *c;
        unsigned long tid;

        slab_free_hook(s, x);

redo:
        /*
         * Determine the currently cpus per cpu slab.
         * The cpu may change afterward. However that does not matter since
         * data is retrieved via this pointer. If we are on the same cpu
         * during the cmpxchg then the free will succeed.
         */
        preempt_disable();
        c = __this_cpu_ptr(s->cpu_slab);

        tid = c->tid;
        preempt_enable();

        /*
         * If the object's page is the one installed as the cpu slab, push the
         * object onto the local CPU freelist; otherwise take the slow free path.
         */
        if (likely(page == c->page)) {
                set_freepointer(s, object, c->freelist);

                if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                c->freelist, tid,
                                object, next_tid(tid)))) {
                        note_cmpxchg_failure("slab_free", s, tid);
                        goto redo;
                }
                stat(s, FREE_FASTPATH);
        } else
                __slab_free(s, page, x, addr);
}

static void __slab_free(struct kmem_cache *s, struct page *page,
                void *x, unsigned long addr)
{
        void *prior;
        void **object = (void *)x;
        int was_frozen;
        struct page new;
        unsigned long counters;
        struct kmem_cache_node *n = NULL;
        unsigned long uninitialized_var(flags);

        stat(s, FREE_SLOWPATH);

        /* with debugging on, stop here if the checks (e.g. red-zone overflow) fail */
        if (kmem_cache_debug(s) &&
                !(n = free_debug_processing(s, page, x, addr, &flags)))
                return;

        do {
                if (unlikely(n)) {
                        spin_unlock_irqrestore(&n->list_lock, flags);
                        n = NULL;
                }
                /*
                 * The page is not in any cpu cache, so it must be on a node list;
                 * if prior is NULL the slab was fully allocated (on the full list
                 * when debugging is enabled).
                 */
                prior = page->freelist;
                counters = page->counters;
                /* chain the object in front of the page's current freelist */
                set_freepointer(s, object, prior);
                new.counters = counters;
                was_frozen = new.frozen;
                new.inuse--;
                if ((!new.inuse || !prior) && !was_frozen) {

                        if (!kmem_cache_debug(s) && !prior)

                                /*
                                 * Slab was on no list before and will be partially empty
                                 * We can defer the list move and instead freeze it.
                                 */
                                new.frozen = 1;

                        else { /* Needs to be taken off a list */

                                n = get_node(s, page_to_nid(page));
                                /*
                                 * Speculatively acquire the list_lock.
                                 * If the cmpxchg does not succeed then we may
                                 * drop the list_lock without any processing.
                                 *
                                 * Otherwise the list_lock will synchronize with
                                 * other processors updating the list of slabs.
                                 */
                                spin_lock_irqsave(&n->list_lock, flags);

                        }
                }

        } while (!cmpxchg_double_slab(s, page,
                prior, counters,
                object, new.counters,
                "__slab_free"));

        if (likely(!n)) {

                /*
                 * If we just froze the page then put it onto the
                 * per cpu partial list.
                 */
                if (new.frozen && !was_frozen) {
                        put_cpu_partial(s, page, 1);
                        stat(s, CPU_PARTIAL_FREE);
                }
                /*
                 * The list lock was not taken therefore no list
                 * activity can be necessary.
                 */
                if (was_frozen)
                        stat(s, FREE_FROZEN);
                return;
        }

        if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
                goto slab_empty;

        /*
         * Objects left in the slab. If it was not on the partial list before
         * then add it.
         */
        /* if the slab was on the node full list, move it to the partial list */
        if (kmem_cache_debug(s) && unlikely(!prior)) {
                remove_full(s, page);
                add_partial(n, page, DEACTIVATE_TO_TAIL);
                stat(s, FREE_ADD_PARTIAL);
        }
        spin_unlock_irqrestore(&n->list_lock, flags);
        return;

slab_empty:
        /* the slab is completely free: take it off whichever node list it is on */
        if (prior) {
                /*
                 * Slab on the partial list.
                 */
                remove_partial(n, page);
                stat(s, FREE_REMOVE_PARTIAL);
        } else
                /* Slab must be on the full list */
                remove_full(s, page);

        spin_unlock_irqrestore(&n->list_lock, flags);
        stat(s, FREE_SLAB);
        discard_slab(s, page);  /* return the pages to the buddy allocator */
}

Finally, note that kmalloc and kfree are themselves implemented on top of the slub allocator:

void *__kmalloc(size_t size, gfp_t flags)
{
        struct kmem_cache *s;
        void *ret;

        if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
                return kmalloc_large(size, flags);

        s = kmalloc_slab(size, flags);

        if (unlikely(ZERO_OR_NULL_PTR(s)))
                return s;

        ret = slab_alloc(s, flags, _RET_IP_);

        trace_kmalloc(_RET_IP_, ret, size, s->size, flags);

        kasan_kmalloc(s, ret, size);

        return ret;
}
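__kmalloc() simply picks the kmalloc-N cache that fits the requested size (kmalloc_slab()) and then goes through the same slab_alloc() path shown above; very large requests bypass the slab layer via kmalloc_large(). A minimal, hypothetical usage sketch for kernel code:

#include <linux/slab.h>

static int kmalloc_demo(void)
{
        /* 100 bytes is typically served from the kmalloc-128 cache */
        char *buf = kmalloc(100, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;

        /* kzalloc() is kmalloc() plus __GFP_ZERO, matching the memset in slab_alloc() */
        kfree(buf);     /* ends up in slab_free()/__slab_free() */
        return 0;
}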

