伙伴系统分配器 - __alloc_pages

来源：互联网发布：会计理解知乎编辑：程序博客网时间：2024/05/20 04:11

kernel可以通过几个分配函数从伙伴系统分配页面：

alloc_pages

get_zeroed_page

__get_dma_pages

这几个函数都是通过alloc_pages来实现页面分配的，而alloc_pages的核心实现就是__alloc_pages。

alloc_pages

在gfp.h中定义

#define alloc_pages(gfp_mask, order) \                alloc_pages_node(numa_node_id(), gfp_mask, order)

两个参数：@gfp_mask 申请标志位；@order 申请页面的阶数

MAX_ORDER是一个宏，定义了buddy系统阶数的上限：阶数大于等于MAX_ORDER的申请是注定失败的（见下文alloc_pages_node中的order >= MAX_ORDER检查）。系统缺省定义是11，也就是说允许的最大阶数是10，一次最多可以分配2^10 = 1024个pages。这个值可以通过架构特定的配置FORCE_MAX_ZONEORDER来修改缺省值，一般来说我们不需要更改这个值。要注意的是GPU VPU等申请的连续内存区比较大，并且这些驱动不会频繁分配释放，系统不会通过buddy系统分配内存，而是使用预留内存的方式。

@gfp_mask可参看伙伴系统分配器分配掩码

numa_node_id获得当前执行CPU对应的node id

alloc_pages_node

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,                                                unsigned int order){        if (unlikely(order >= MAX_ORDER))                return NULL;        /* Unknown node is current node */        if (nid < 0)                nid = numa_node_id();        return __alloc_pages(gfp_mask, order,                NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));}

在alloc_pages_node中做了一件很重要的事，计算__alloc_pages的第三个参数NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)

第三个参数是用来通知__alloc_pages首先从该node的哪一个zone进行分配。node_zonelists是节点的zonelist数组，按内存域类型索引，一般顺序是DMA, Normal, Highmem；数组中的每个zonelist给出了从对应内存域开始分配时依次尝试的备用内存域。

gfp_zone(gfp_mask)计算给定的分配域（包含在gfp_mask）中，所对应的zone偏移。这个偏移加上node_zonelists就是选择的内存域。

__alloc_pages

文件mm/page_alloc.c中实现，是zone buddy的核心分配函数，分析该函数前，我们先了解一些可能的标记和重要的辅助函数

辅助标记

kernel定义了一些函数需要使用的标记，用来控制内存区空闲页面数达到不同watermark时的分配行为

#define ALLOC_NO_WATERMARKS     0x01 /* don't check watermarks at all */#define ALLOC_WMARK_MIN         0x02 /* use pages_min watermark */#define ALLOC_WMARK_LOW         0x04 /* use pages_low watermark */#define ALLOC_WMARK_HIGH        0x08 /* use pages_high watermark */#define ALLOC_HARDER            0x10 /* try to alloc harder */#define ALLOC_HIGH              0x20 /* __GFP_HIGH set */#define ALLOC_CPUSET            0x40 /* check for correct cpuset */

这些标志用来表示在页分配过程中，需要考虑当前内存区的哪些分配水印。内存区的三个水印：zone->pages_min, zone->pages_low, zone->pages_high

默认情况下，仅有内存域包含的空闲页数大于pages_high时，才会进行分配。

ALLOC_NO_WATERMARKS 完全不检查水印，也就是略过分配页的选择过程，直接调用buffered_rmqueue进行分配。

ALLOC_WMARK_MIN 在当前分配区使用zone->pages_min进行检查。

ALLOC_WMARK_LOW 在当前分配区使用zone->pages_low进行检查。

ALLOC_WMARK_HIGH 在当前分配区使用zone->pages_high进行检查。

ALLOC_HARDER 通知伙伴系统放宽检查限制，其实就是对水印给定的值乘以一个系数 3/4

ALLOC_HIGH 比HARDER更紧急的分配请求，进一步放宽限制（在zone_watermark_ok中会把水印值减半，即乘以系数 1/2）

ALLOC_CPUSET 只能在与当前进程所属cpuset相关联的内存节点上进行分配。

zone_watermark_ok

上述标志会在函数zone_watermark_ok中检查

1215 /*1216  * Return 1 if free pages are above 'mark'. This takes into account the order1217  * of the allocation.1218  */1219 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,1220                       int classzone_idx, int alloc_flags)1221 {1222         /* free_pages my go negative - that's OK */1223         long min = mark;1224         long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;1225         int o;1226 1227         if (alloc_flags & ALLOC_HIGH)1228                 min -= min / 2;1229         if (alloc_flags & ALLOC_HARDER)1230                 min -= min / 4;1231 1232         if (free_pages <= min + z->lowmem_reserve[classzone_idx])1233                 return 0;1234         for (o = 0; o < order; o++) {1235                 /* At the next order, this order's pages become unavailable */1236                 free_pages -= z->free_area[o].nr_free << o;1237 1238                 /* Require fewer higher order pages to be free */1239                 min >>= 1;1240 1241                 if (free_pages <= min)1242                         return 0;1243         }1244         return 1;1245 }

返回1 表示满足给定的水印，0表示不满足。

zone_page_state返回给定zone的free pages数目。

1232 首先要判断在扣除本次分配所需的页面之后，剩余的空闲页面数free_pages是否仍然大于min与z->lowmem_reserve[classzone_idx]之和，从这个可以看出lowmem_reserve是不包含min的。lowmem_reserve的作用有点小复杂，可以参考另外一篇文章lowmem_reserve的理解

1234 ~ 1243做循环，对于小于给定参数@order的buddy链表，要把它们的容量从free_pages中减去，因为这些页面对当前分配请求来说和非空闲页面没有区别。

1239 每一次循环，所需空闲页的最小值折半，这也是个经验算法。

1241 如果发现剩余的空闲页面数不大于min，那么说明剩余的页面已经无法满足分配了，直接返回0失败退出。

1244 表明当前内存zone满足分配的请求。

get_page_from_freelist

/* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */static struct page *get_page_from_freelist(gfp_t gfp_mask, unsigned int order,struct zonelist *zonelist, int alloc_flags){struct zone **z;struct page *page = NULL;int classzone_idx = zone_idx(zonelist->zones[0]);struct zone *zone;nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */int zlc_active = 0;/* set if using zonelist_cache */int did_zlc_setup = 0;/* just call zlc_setup() one time */enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */zonelist_scan:/* * Scan zonelist, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */z = zonelist->zones;do {/* * In NUMA, this could be a policy zonelist which contains * zones that may not be allowed by the current gfp_mask. * Check the zone is allowed by the current flags */if (unlikely(alloc_should_filter_zonelist(zonelist))) {if (highest_zoneidx == -1)highest_zoneidx = gfp_zone(gfp_mask);if (zone_idx(*z) > highest_zoneidx)continue;}if (NUMA_BUILD && zlc_active &&!zlc_zone_worth_trying(zonelist, z, allowednodes))continue;zone = *z;if ((alloc_flags & ALLOC_CPUSET) &&!cpuset_zone_allowed_softwall(zone, gfp_mask))goto try_next_zone;if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {unsigned long mark;if (alloc_flags & ALLOC_WMARK_MIN)mark = zone->pages_min;else if (alloc_flags & ALLOC_WMARK_LOW)mark = zone->pages_low;elsemark = zone->pages_high;if (!zone_watermark_ok(zone, order, mark,    classzone_idx, alloc_flags)) {if (!zone_reclaim_mode ||    !zone_reclaim(zone, gfp_mask, order))goto this_zone_full;}}page = buffered_rmqueue(zonelist, zone, order, gfp_mask);if (page)break;this_zone_full:if (NUMA_BUILD)zlc_mark_zone_full(zonelist, z);try_next_zone:if (NUMA_BUILD && !did_zlc_setup) {/* we do zlc_setup after the first zone is tried */allowednodes = zlc_setup(zonelist, alloc_flags);zlc_active = 1;did_zlc_setup = 1;}} while (*(++z) != NULL);if (unlikely(NUMA_BUILD && page 
== NULL && zlc_active)) {/* Disable zlc cache for second zonelist scan */zlc_active = 0;goto zonelist_scan;}return page;}

这个函数分为两个步骤：

1. 选择要分配的页

2. 移除步骤1中选择的页，主要由buffered_rmqueue实现

参数@zonelist是指向备用内存区链表的指针。在预期内存区（zonelist->zones[0]）中没有足够空闲空间的情况下，该链表确定了扫描系统其他内存区的顺序。

do循环遍历zonelist，找到一个满足分配条件的内存区；如果找到，则调用buffered_rmqueue试图分配需要的页面。

__alloc_pages

在了解可用标记和辅助函数后，我们可以开始分析__alloc_pages函数了。该函数实现比较复杂，尤其是在可用内存不充足的情况下；如果可用内存充足，该函数的流程还是非常简单的。

/* * This is the 'heart' of the zoned buddy allocator. */struct page * fastcall__alloc_pages(gfp_t gfp_mask, unsigned int order,                struct zonelist *zonelist){        const gfp_t wait = gfp_mask & __GFP_WAIT;        struct zone **z;        struct page *page;        struct reclaim_state reclaim_state;        struct task_struct *p = current;        int do_retry;        int alloc_flags;        int did_some_progress;        might_sleep_if(wait);        if (should_fail_alloc_page(gfp_mask, order))                return NULL;restart:        z = zonelist->zones;  /* the list of zones suitable for gfp_mask */        if (unlikely(*z == NULL)) {                /*                 * Happens if we have an empty zonelist as a result of                 * GFP_THISNODE being used on a memoryless node                 */                return NULL;        }        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,                                zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);        if (page)                goto got_pg;

在最简单的情况下，只调用一次get_page_from_freelist就成功的获得了分配的页面，直接通过跳转指令到got_pg处。

        for (z = zonelist->zones; *z; z++)                wakeup_kswapd(*z, order);        /*         * OK, we're below the kswapd watermark and have kicked background         * reclaim. Now things get more complex, so set up alloc_flags according         * to how we want to proceed.         *         * The caller may dip into page reserves a bit more if the caller         * cannot run direct reclaim, or if the caller has realtime scheduling         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will         * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).         */        alloc_flags = ALLOC_WMARK_MIN;        if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)                alloc_flags |= ALLOC_HARDER;        if (gfp_mask & __GFP_HIGH)                alloc_flags |= ALLOC_HIGH;        if (wait)                alloc_flags |= ALLOC_CPUSET;        /*         * Go through the zonelist again. Let __GFP_HIGH and allocations         * coming from realtime tasks go deeper into reserves.         *         * This is the last chance, in general, before the goto nopage.         * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.         * See also cpuset_zone_allowed() comment in kernel/cpuset.c.         */        page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);        if (page)                goto got_pg;

内核再一次遍历所有的内存zone，对每个zone都调用wakeup_kswapd，该函数唤醒负责换出页的内核守护进程。交换守护进程通过缩减内核缓存和页面回收来获得更多的空闲内存，缩减内核缓存和页面回收涉及到页面写回或者换出很少使用的页面。这两种措施都是由守护进程发起的。

唤醒守护进程后，内核开始重新尝试从内存zones中查找合适的内存块。这一次搜索更为积极，对分配标志做了调整，修改为在一些特定情况下需要的分配标记，因此也减小了水印。

如果再次失败，内核会使用更积极的分配措施：

rebalance:        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))                        && !in_interrupt()) {                if (!(gfp_mask & __GFP_NOMEMALLOC)) {nofail_alloc:                        /* go through the zonelist yet again, ignoring mins */                        page = get_page_from_freelist(gfp_mask, order,                                zonelist, ALLOC_NO_WATERMARKS);                        if (page)                                goto got_pg;                        if (gfp_mask & __GFP_NOFAIL) {                                congestion_wait(WRITE, HZ/50);                                goto nofail_alloc;                        }                }                goto nopage;        }

TIF_MEMDIE表示该进程已经被oom killer选中，而PF_MEMALLOC比较复杂，我们单独讨论，PF_MEMALLOC表示当前进程是内存管理程序，需要一点点额外的内存，以便能继续执行下去。

__GFP_NOMEMALLOC表示禁止使用紧急分配链表，因此无法再在忽略水印的情况下调用get_page_from_freelist，此时只能失败。

如果允许使用紧急分配链表，则使用标志ALLOC_NO_WATERMARKS尝试分配，如果失败，还要看分配标志是否有__GFP_NOFAIL，该标志表示不允许失败，首先调用congestion_wait等待，然后再尝试分配，直到成功。

1560         p->flags |= PF_MEMALLOC;1561         reclaim_state.reclaimed_slab = 0;1562         p->reclaim_state = &reclaim_state;1563 1564         did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);1565 1566         p->reclaim_state = NULL;1567         p->flags &= ~PF_MEMALLOC;

在调用try_to_free_pages之前，先设置PF_MEMALLOC保证try_to_free_pages可以分配预留内存，try_to_free_pages不仅会把不常用页面交换到交换空间，还会shrink各种cache

1574         if (likely(did_some_progress)) {1575                 page = get_page_from_freelist(gfp_mask, order,1576                                                 zonelist, alloc_flags);1577                 if (page)1578                         goto got_pg;1579         } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {1580                 if (!try_set_zone_oom(zonelist)) {1581                         schedule_timeout_uninterruptible(1);1582                         goto restart;1583                 }1584 1585                 /*1586                  * Go through the zonelist yet one more time, keep1587                  * very high watermark here, this is only to catch1588                  * a parallel oom killing, we must fail if we're still1589                  * under heavy pressure.1590                  */1591                 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,1592                                 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);1593                 if (page) {1594                         clear_zonelist_oom(zonelist);1595                         goto got_pg;1596                 }1597 1598                 /* The OOM killer will not help higher order allocs so fail */1599                 if (order > PAGE_ALLOC_COSTLY_ORDER) {1600                         clear_zonelist_oom(zonelist);1601                         goto nopage;1602                 }1603 1604                 out_of_memory(zonelist, gfp_mask, order);1605                 clear_zonelist_oom(zonelist);1606                 goto restart;1607         }

did_some_progress表示的确释放了一些页面，那么再尝试进行分配；否则，如果分配标志中设置了__GFP_FS（即允许本次分配调用文件系统层的操作），同时又没有设置__GFP_NORETRY，那么调用OOM killer

1599行表示如果分配的阶数很大，那么即便调用OOM很大的几率仍然无法满足2^order分配，所以不执行oom killer。这个解释听起来不那么合理的，因为有时候一个进程可能占据几十M的内存空间，杀掉它，必然会释放很大的内存空间，获得连续空间的几率应该也很大的。

1609         /*1610          * Don't let big-order allocations loop unless the caller explicitly1611          * requests that.  Wait for some write requests to complete then retry.1612          *1613          * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order1614          * <= 3, but that may not be true in other implementations.1615          */1616         do_retry = 0;1617         if (!(gfp_mask & __GFP_NORETRY)) {1618                 if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||1619                                                 (gfp_mask & __GFP_REPEAT))1620                         do_retry = 1;1621                 if (gfp_mask & __GFP_NOFAIL)1622                         do_retry = 1;1623         }1624         if (do_retry) {1625                 congestion_wait(WRITE, HZ/50);1626                 goto rebalance;1627         }

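看注释，意思是说在分配标志没有设置__GFP_NORETRY的情况下，要考虑几种情况：如果order不大于PAGE_ALLOC_COSTLY_ORDER，或者设置了__GFP_REPEAT，或者设置了__GFP_NOFAIL，则把do_retry置为1；此时先调用congestion_wait等待一段时间，再跳转到rebalance重新尝试分配。除此之外没什么可分析的。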