Linux memory management: the memory reclaim mechanism


[Abstract]

[Main] Key data structures

[Main] Initialization of key resources (pgdat, struct zone, etc.)

[Main] When memory reclaim is triggered

[Main] The memory reclaim process

[Summary]

[Misc]




【Abstract】

This article describes the Linux memory reclaim mechanism.

【Main】Key data structures

1 membank/meminfo

meminfo describes the memory of the whole system; a meminfo holds up to NR_BANKS membanks:

struct meminfo {
    int nr_banks;
    struct membank bank[NR_BANKS];  /* NR_BANKS = 8 */
};

Each membank is initialized through arm_add_memory(): setup_arch()->setup_machine_fdt()->arm_add_memory();

struct membank {
    phys_addr_t start;
    phys_addr_t size;
    unsigned int highmem;
};

2 memblock/memblock_type

struct memblock_region {
    phys_addr_t base;
    phys_addr_t size;
};

struct memblock_type {
    unsigned long cnt;        /* number of regions */
    unsigned long max;        /* size of the allocated array */
    phys_addr_t total_size;   /* size of all regions */
    struct memblock_region *regions;
};

struct memblock {
    phys_addr_t current_limit;
    struct memblock_type memory;
    struct memblock_type reserved;
};
Initialization: arm_memblock_init->memblock_add->memblock_add_region-> type->regions[0].base = mi->bank[i].start
                arm_memblock_init->memblock_reserve->memblock_add_region->
void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{
    int i;

    for (i = 0; i < mi->nr_banks; i++)
        memblock_add(mi->bank[i].start, mi->bank[i].size);

    /* Register the kernel text, kernel data and initrd with memblock. */
#ifdef CONFIG_XIP_KERNEL
    memblock_reserve(__pa(_sdata), _end - _sdata);
#else
    memblock_reserve(__pa(_stext), _end - _stext);
#endif
    arm_mm_memblock_reserve();
    arm_dt_memblock_reserve();

    /* reserve any platform specific memblock areas */
    if (mdesc->reserve)
        mdesc->reserve();

    /*
     * reserve memory for DMA contiguous allocations,
     * must come from DMA area inside low memory
     */
    dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));

    arm_memblock_steal_permitted = false;
    memblock_allow_resize();
    /*
    memblock_dump(&memblock.memory, "memory");
    memblock_dump(&memblock.reserved, "reserved");
    */
    memblock_dump_all();
}

pg_data_t *pgdat = NODE_DATA(0); NODE_DATA(0) locates the node containing the ZONE_NORMAL zone, from which the zone's information can be read,

e.g. the amount of usable memory, zone->managed_pages.

typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    struct page *node_mem_map;
#endif
    unsigned long node_start_pfn;
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page range, including holes */
    int node_id;
    nodemask_t reclaim_nodes;         /* Nodes allowed to reclaim from */
    wait_queue_head_t kswapd_wait;
    wait_queue_head_t pfmemalloc_wait;
    struct task_struct *kswapd;       /* Protected by lock_memory_hotplug() */
    int kswapd_max_order;
    enum zone_type classzone_idx;
} pg_data_t;
pg_data_t *pgdat = NODE_DATA(0);

When the kernel manages memory, e.g. allocating with alloc_pages or reclaiming with shrink_zone, it works through the global pg_data_t. The zonelist leads to the individual zones (ZONE_NORMAL, ZONE_DMA, and so on), and each zone manages its pages through the lruvec embedded in struct zone.

The lruvec contains the following list types:

enum lru_list {
    LRU_INACTIVE_ANON = LRU_BASE,
    LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
    LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
    LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
    LRU_UNEVICTABLE,
    NR_LRU_LISTS
};
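To make the index arithmetic concrete, here is a small user-space sketch that mirrors how the four evictable list indexes are composed from the file/active bits (toy code, not the kernel's page_lru()):

#include <stdio.h>

enum { LRU_ACTIVE = 1, LRU_FILE = 2 };  /* LRU_BASE = 0, as in the enum above */

/* compose a list index from the two page properties */
static int lru_index(int is_file, int is_active)
{
    return (is_file ? LRU_FILE : 0) + (is_active ? LRU_ACTIVE : 0);
}

int main(void)
{
    printf("inactive anon -> %d\n", lru_index(0, 0)); /* LRU_INACTIVE_ANON = 0 */
    printf("active file   -> %d\n", lru_index(1, 1)); /* LRU_ACTIVE_FILE   = 3 */
    return 0;
}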

【Main】Initialization of key resources (pgdat, struct zone, etc.)

1 meminfo and membank initialization

During boot, meminfo and its membanks are initialized from the bootargs:

bootargs = console=ttyS0,115200 mem=110M root=/dev/mtdblock1 rootfstype=squashfs init=/linuxrc

Example: the fdt specifies a system memory base address of 0x200000 and a memory size of 110M = 0x6e00000.

Note: both atags and fdt configure the base address and size of system memory, but the final memory size is taken from the mem= parameter in bootargs.

1.1 How a membank is initialized:

int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
{
    struct membank *bank = &meminfo.bank[meminfo.nr_banks];
    u64 aligned_start;

    if (meminfo.nr_banks >= NR_BANKS) {
        printk(KERN_CRIT "NR_BANKS too low, "
               "ignoring memory at 0x%08llx\n", (long long)start);
        return -EINVAL;
    }
    /*
     * Ensure that start/size are aligned to a page boundary.
     * Size is appropriately rounded down, start is rounded up.
     */
    size -= start & ~PAGE_MASK;
    aligned_start = PAGE_ALIGN(start);

#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
    if (aligned_start > ULONG_MAX) {
        printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
               "32-bit physical address space\n", (long long)start);
        return -EINVAL;
    }
    if (aligned_start + size > ULONG_MAX) {
        printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
               "32-bit physical address space\n", (long long)start);
        /*
         * To ensure bank->start + bank->size is representable in
         * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
         * This means we lose a page after masking.
         */
        size = ULONG_MAX - aligned_start;
    }
#endif

    if (aligned_start < PHYS_OFFSET) {
        if (aligned_start + size <= PHYS_OFFSET) {
            pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
                    aligned_start, aligned_start + size);
            return -EINVAL;
        }
        pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
                aligned_start, (u64)PHYS_OFFSET);
        size -= PHYS_OFFSET - aligned_start;
        aligned_start = PHYS_OFFSET;
    }

    /* base address of system memory as given in atags or fdt */
    bank->start = aligned_start;
    /* size of system memory from atags/fdt; the mem= bootarg may override it */
    bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);

    /*
     * Check whether this memory region has non-zero size or
     * invalid node number.
     */
    if (bank->size == 0)
        return -EINVAL;

    /*
     * atags/fdt fills meminfo.bank[0]; after a bank is initialized,
     * nr_banks is incremented so the next bank lands in meminfo.bank[1].
     */
    meminfo.nr_banks++;
    return 0;
}
1.2 When membanks are initialized:

<1> While parsing atags or fdt, meminfo.bank[0] is created from the system memory base address and size:

void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
    /*
     * Create meminfo.bank[0] from the base and size given in the fdt,
     * e.g. bank.start = 0x200000, bank.size = 0x6e00000 (110M).
     * The size may be overridden later by the mem= bootarg.
     */
    arm_add_memory(base, size);
}
<2> While parsing mem=118M in bootargs:
static int __init early_mem(char *p)
{
    static int usermem __initdata = 0;
    phys_addr_t size;
    phys_addr_t start;
    char *endp;

    /*
     * If the user specifies memory size, we
     * blow away any automatically generated
     * size.
     */
    if (usermem == 0) {
        usermem = 1;
        /*
         * fdt parsing already added a bank, so meminfo.nr_banks == 1 here.
         * Resetting nr_banks to 0 means the size given by mem= lands in
         * meminfo.bank[0] again, i.e. it re-configures the memory size
         * that the fdt specified.
         */
        meminfo.nr_banks = 0;
    }

    start = PHYS_OFFSET;
    size  = memparse(p, &endp);
    if (*endp == '@')
        start = memparse(endp + 1, NULL);

    arm_add_memory(start, size);
    return 0;
}
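Walking through the example numbers above: for mem=110M, memparse() returns size = 110M = 0x6e00000, and since no @ suffix is given, start = PHYS_OFFSET = 0x200000. early_mem() resets meminfo.nr_banks to 0, so the following arm_add_memory(start, size) rebuilds meminfo.bank[0] with start 0x200000 and size 0x6e00000, overriding the size the fdt specified.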

2 Initializing pgdat and struct zone: setup_arch()->paging_init()->bootmem_init()->

void __init bootmem_init(void)
{
    unsigned long min, max_low, max_high;

    max_low = max_high = 0;
    /*
     * From meminfo.bank[0] created above, get the physical page frame
     * numbers: min = 0x200; max_low = 0x6e00; max_high = 0x6e00.
     */
    find_limits(&min, &max_low, &max_high);

    arm_bootmem_init(min, max_low);

    /*
     * Sparsemem tries to allocate bootmem in memory_present(),
     * so must be done after the fixed reservations
     */
    arm_memory_present();

    /* sparse_init() needs the bootmem allocator up and running. */
    sparse_init();

    /*
     * Now free the memory - free_area_init_node needs
     * the sparse mem_map arrays initialized by sparse_init()
     * for memmap_init_zone(), otherwise all PFNs are invalid.
     */
    arm_bootmem_free(min, max_low, max_high);

    /*
     * This doesn't seem to be used by the Linux memory manager any
     * more, but is used by ll_rw_block.  If we can get rid of it, we
     * also get rid of some of the stuff above as well.
     *
     * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
     * the system, not the maximum PFN.
     */
    max_low_pfn = max_low - PHYS_PFN_OFFSET;  /* number of physical pages: 0x6c00 = 0x6e00 - 0x200 */
    max_pfn = max_high - PHYS_PFN_OFFSET;
}

setup_arch()->paging_init()->bootmem_init()->arm_bootmem_init()

static void __init arm_bootmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
    struct memblock_region *reg;
    unsigned int boot_pages;
    phys_addr_t bitmap;
    pg_data_t *pgdat;

    /*
     * Allocate the bootmem bitmap page.  This must be in a region
     * of memory which has already been mapped.
     */
    boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
    bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
                                 __pfn_to_phys(end_pfn));

    /*
     * Initialise the bootmem allocator, handing the
     * memory banks over to bootmem.
     */
    node_set_online(0);
    pgdat = NODE_DATA(0);
    init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);

    /* Free the lowmem regions from memblock into bootmem. */
    for_each_memblock(memory, reg) {
        /* physical page frame numbers: start = 0x200; end = 0x6e00 */
        unsigned long start = memblock_region_memory_base_pfn(reg);
        unsigned long end = memblock_region_memory_end_pfn(reg);

        if (end >= end_pfn)
            end = end_pfn;
        if (start >= end)
            break;

        free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
    }

    /* Reserve the lowmem memblock reserved regions in bootmem. */
    for_each_memblock(reserved, reg) {
        unsigned long start = memblock_region_reserved_base_pfn(reg);
        unsigned long end = memblock_region_reserved_end_pfn(reg);

        if (end >= end_pfn)
            end = end_pfn;
        if (start >= end)
            break;

        reserve_bootmem(__pfn_to_phys(start),
                        (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
    }
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
                                    unsigned long max_high)
{
    unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
    struct memblock_region *reg;

    /* initialise the zones. */
    memset(zone_size, 0, sizeof(zone_size));

    /*
     * The memory size has already been determined.  If we need
     * to do anything fancy with the allocation of this memory
     * to the zones, now is the time to do it.
     */
    /*
     * Index 0 is ZONE_NORMAL: the number of physical pages of system
     * memory, 0x6c00 = 0x6e00 - 0x200; zone_size[1] = 0 is ZONE_MOVABLE.
     */
    zone_size[0] = max_low - min;

    /*
     * Calculate the size of the holes.
     *  holes = node_size - sum(bank_sizes)
     */
    memcpy(zhole_size, zone_size, sizeof(zhole_size));
    for_each_memblock(memory, reg) {
        unsigned long start = memblock_region_memory_base_pfn(reg);
        unsigned long end = memblock_region_memory_end_pfn(reg);

        if (start < max_low) {
            unsigned long low_end = min(end, max_low);
            zhole_size[0] -= low_end - start;
        }
#ifdef CONFIG_HIGHMEM
        if (end > max_low) {
            unsigned long high_start = max(start, max_low);
            zhole_size[ZONE_HIGHMEM] -= end - high_start;
        }
#endif
    }

#ifdef CONFIG_ZONE_DMA
    /*
     * Adjust the sizes according to any special requirements for
     * this machine type.
     */
    if (arm_dma_zone_size)
        arm_adjust_dma_zone(zone_size, zhole_size,
                            arm_dma_zone_size >> PAGE_SHIFT);
#endif

    free_area_init_node(0, zone_size, min, zhole_size);
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()->free_area_init_node():
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                                      unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    /* pg_data_t should be reset to zero when it's allocated */
    WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);

    pgdat->node_id = nid;  /* nid = 0 */
    /* page frame number of the base of system memory: 0x200 */
    pgdat->node_start_pfn = node_start_pfn;
    init_zone_allows_reclaim(nid);
    /* system memory size in pages (0x6e00 - 0x200 pages) */
    calculate_node_totalpages(pgdat, zones_size, zholes_size);

    alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
           nid, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map);
#endif

    free_area_init_core(pgdat, zones_size, zholes_size);
}
setup_arch()->paging_init()->bootmem_init()->arm_bootmem_free()->free_area_init_node()->free_area_init_core():

free_area_init_core() initializes struct zone: pgdat->node_zones.

static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                                             unsigned long *zones_size, unsigned long *zholes_size)
{
    enum zone_type j;
    int nid = pgdat->node_id;
    unsigned long zone_start_pfn = pgdat->node_start_pfn;
    int ret;

    pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
    spin_lock_init(&pgdat->numabalancing_migrate_lock);
    pgdat->numabalancing_migrate_nr_pages = 0;
    pgdat->numabalancing_migrate_next_window = jiffies;
#endif
    init_waitqueue_head(&pgdat->kswapd_wait);
    init_waitqueue_head(&pgdat->pfmemalloc_wait);
    pgdat_page_cgroup_init(pgdat);

    /* MAX_NR_ZONES = 2; index 0 is ZONE_NORMAL, index 1 is ZONE_MOVABLE */
    for (j = 0; j < MAX_NR_ZONES; j++) {
        /* initialize struct zone: pgdat->node_zones */
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, realsize, freesize, memmap_pages;

        /*
         * System memory size in pages:
         * zone_names[0]="Normal";  zone_size[0]=0x6c00=0x6e00-0x200; zholes_size[0]=0;
         * zone_names[1]="Movable"; zone_size[1]=0; zholes_size[1]=0;
         * dma_reserve=0;
         */
        size = zone_spanned_pages_in_node(nid, j, zones_size);
        /* system memory size in pages: 0x6c00 */
        realsize = freesize = size - zone_absent_pages_in_node(nid, j, zholes_size);

        /*
         * Adjust freesize so that it accounts for how much memory
         * is used by this zone for memmap. This affects the watermark
         * and per-cpu initialisations
         */
        /* size of the page descriptors: 0x6c00 * sizeof(struct page) >> PAGE_SHIFT */
        memmap_pages = calc_memmap_size(size, realsize);
        if (freesize >= memmap_pages) {
            freesize -= memmap_pages;
            if (memmap_pages)
                printk(KERN_DEBUG "  %s zone: %lu pages used for memmap\n",
                       zone_names[j], memmap_pages);
        } else
            printk(KERN_WARNING "  %s zone: %lu pages exceeds freesize %lu\n",
                   zone_names[j], memmap_pages, freesize);

        /* Account for reserved pages */
        if (j == 0 && freesize > dma_reserve) {
            freesize -= dma_reserve;
            printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                   zone_names[0], dma_reserve);
        }

        if (!is_highmem_idx(j))
            nr_kernel_pages += freesize;
        /* Charge for highmem memmap if there are enough kernel pages */
        else if (nr_kernel_pages > memmap_pages * 2)
            nr_kernel_pages -= memmap_pages;
        nr_all_pages += freesize;

        /* system memory size in pages: 0x6c00 */
        zone->spanned_pages = size;
        zone->present_pages = realsize;
        /*
         * Note: re-initialized later in mm_init->mem_init->free_all_bootmem.
         * Set an approximate value for lowmem here, it will be adjusted
         * when the bootmem allocator frees pages into the buddy system.
         * And all highmem pages will be managed by the buddy system.
         */
        zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
        zone->node = nid;
        zone->min_unmapped_pages = (freesize * sysctl_min_unmapped_ratio) / 100;
        zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
        zone->name = zone_names[j];
        spin_lock_init(&zone->lock);
        spin_lock_init(&zone->lru_lock);
        zone_seqlock_init(zone);
        zone->zone_pgdat = pgdat;

        zone_pcp_init(zone);
        /* initialize zone->lruvec: LRU_INACTIVE_ANON/LRU_ACTIVE_ANON and so on */
        lruvec_init(&zone->lruvec);
        if (!size)
            continue;

        set_pageblock_order();
        setup_usemap(pgdat, zone, zone_start_pfn, size);
        ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY);
        BUG_ON(ret);
        /* initialize the page descriptors */
        memmap_init(size, nid, j, zone_start_pfn);
        zone_start_pfn += size;
    }
}

To inspect a struct zone, e.g. the ZONE_NORMAL zone:

Method 1: struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL]; where nid = 0 and ZONE_NORMAL = 0;

Method 2: struct zone *zone = page_zone(page); finds the zone a given page belongs to.

Note: zone.managed_pages is the number of pages the system is allowed to use; the zone watermarks are validated against it, see zone_watermark_ok below.

zone.managed_pages is re-initialized in mm_init->mem_init->free_all_bootmem, and the re-initialized value is the one used from then on.
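On a live system the resulting zone layout can also be checked without any code: /proc/zoneinfo prints the per-zone counters (the managed field only appears on kernels that already split it out from present):

# cat /proc/zoneinfo | grep -E 'Normal|spanned|present|managed'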

memmap_init resolves to memmap_init_zone:

void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                                unsigned long start_pfn, enum memmap_context context)
{
    struct page *page;
    unsigned long end_pfn = start_pfn + size;
    unsigned long pfn;
    struct zone *z;

    if (highest_memmap_pfn < end_pfn - 1)
        highest_memmap_pfn = end_pfn - 1;

    /* pgdat->node_zones[0]: ZONE_NORMAL */
    z = &NODE_DATA(nid)->node_zones[zone];
    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        /*
         * There can be holes in boot-time mem_map[]s
         * handed to this function.  They do not
         * exist on hotplugged memory.
         */
        if (context == MEMMAP_EARLY) {
            if (!early_pfn_valid(pfn))
                continue;
            if (!early_pfn_in_nid(pfn, nid))
                continue;
        }
        page = pfn_to_page(pfn);
        set_page_links(page, zone, nid, pfn);
        mminit_verify_page_links(page, zone, nid, pfn);
        /*
         * page->_count is initialized to 1, i.e. the page counts as
         * allocated. Later in boot free_all_bootmem->__free_one_page
         * releases all system memory and page->_count drops to 0.
         */
        init_page_count(page);
        /* page->_mapcount is set to -1 */
        page_mapcount_reset(page);
        page_nid_reset_last(page);
        SetPageReserved(page);
        /*
         * Mark the block movable so that blocks are reserved for
         * movable at startup. This will force kernel allocations
         * to reserve their blocks rather than leaking throughout
         * the address space during boot when many long-lived
         * kernel allocations are made. Later some blocks near
         * the start are marked MIGRATE_RESERVE by
         * setup_zone_migrate_reserve()
         *
         * bitmap is created for zone's valid pfn range. but memmap
         * can be created for invalid pages (for alignment)
         * check here not to call set_pageblock_migratetype() against
         * pfn out of zone.
         */
        if ((z->zone_start_pfn <= pfn)
            && (pfn < zone_end_pfn(z))
            && !(pfn & (pageblock_nr_pages - 1)))
            set_pageblock_migratetype(page, MIGRATE_MOVABLE);

        INIT_LIST_HEAD(&page->lru);
    }
}
【Main】When memory reclaim is triggered

1 First, during boot, all system memory is freed once into the buddy allocator:

mm_init->mem_init->free_all_bootmem->free_all_bootmem_core->free_hot_cold_page/__free_pages_ok

static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
    struct page *page;
    unsigned long start, end, pages, count = 0;

    if (!bdata->node_bootmem_map)
        return 0;

    start = bdata->node_min_pfn;
    end = bdata->node_low_pfn;
    /* start pfn of physical memory: 0x200; end pfn: 0x6e00 */
    while (start < end) {
        unsigned long *map, idx, vec;
        unsigned shift;

        map = bdata->node_bootmem_map;
        idx = start - bdata->node_min_pfn;
        shift = idx & (BITS_PER_LONG - 1);
        /*
         * vec holds at most BITS_PER_LONG map bits,
         * bit 0 corresponds to start.
         */
        vec = ~map[idx / BITS_PER_LONG];
        if (shift) {
            vec >>= shift;
            if (end - start >= BITS_PER_LONG)
                vec |= ~map[idx / BITS_PER_LONG + 1] << (BITS_PER_LONG - shift);
        }
        /*
         * If we have a properly aligned and fully unreserved
         * BITS_PER_LONG block of pages in front of us, free
         * it in one go.
         */
        if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
            int order = ilog2(BITS_PER_LONG);

            __free_pages_bootmem(pfn_to_page(start), order);
            count += BITS_PER_LONG;
            start += BITS_PER_LONG;
        } else {
            unsigned long cur = start;

            start = ALIGN(start + 1, BITS_PER_LONG);
            while (vec && cur != start) {
                if (vec & 1) {
                    page = pfn_to_page(cur);
                    __free_pages_bootmem(page, 0);
                    count++;
                }
                vec >>= 1;
                ++cur;
            }
        }
    }

    page = virt_to_page(bdata->node_bootmem_map);
    pages = bdata->node_low_pfn - bdata->node_min_pfn;
    pages = bootmem_bootmap_pages(pages);
    count += pages;
    while (pages--)
        __free_pages_bootmem(page++, 0);

    bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
    return count;
}

2 Reclaim triggered during alloc_pages when free memory runs low:

alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask

/*
 * zonelist = node_zonelist(nid, gfp_mask) = NODE_DATA(nid)->node_zonelists;
 * here nid = 0; pg_data_t *pgdat = NODE_DATA(0)
 */
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                                    struct zonelist *zonelist, nodemask_t *nodemask)
{
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);
    struct zone *preferred_zone;
    struct page *page = NULL;
    int migratetype = allocflags_to_migratetype(gfp_mask);
    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
    struct mem_cgroup *memcg = NULL;

    gfp_mask &= gfp_allowed_mask;

    lockdep_trace_alloc(gfp_mask);
    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))
        return NULL;

    /*
     * Will only have any effect when __GFP_KMEMCG is set.  This is
     * verified in the (always inline) callee
     */
    if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
        return NULL;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /* The preferred zone is used for statistics later */
    first_zones_zonelist(zonelist, high_zoneidx,
                         nodemask ? : &cpuset_current_mems_allowed,
                         &preferred_zone);
    if (!preferred_zone)
        goto out;

    /* First allocation attempt */
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                                  zonelist, high_zoneidx, alloc_flags,
                                  preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        /* free memory is tight: get_page_from_freelist failed */
        page = __alloc_pages_slowpath(gfp_mask, order,
                                      zonelist, high_zoneidx, nodemask,
                                      preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);
    return page;
}

alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->

get_page_from_freelist->zone_reclaim->shrink_zone

static struct page *get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask,
                                           unsigned int order, struct zonelist *zonelist, int high_zoneidx,
                                           int alloc_flags, struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;  /* zonelist_cache approximation */
    int zlc_active = 0;               /* set if using zonelist_cache */
    int did_zlc_setup = 0;            /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
    /*
     * Scan the zonelist, looking for a zone with enough free pages.
     * (The for_each_zone_zonelist_nodemask() loop head was lost in the
     * original formatting; restored here.)
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) {
        if ((alloc_flags & ALLOC_WMARK_LOW) &&
            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
            goto this_zone_full;

        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            unsigned long mark;
            int ret;

            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
            /* zone_watermark_ok returns 0 when free memory is short, 1 when it suffices */
            if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags))
                goto try_this_zone;

            if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) {
                /*
                 * we do zlc_setup if there are multiple nodes
                 * and before considering the first zone allowed
                 * by the cpuset.
                 */
                allowednodes = zlc_setup(zonelist, alloc_flags);
                zlc_active = 1;
                did_zlc_setup = 1;
            }

            if (zone_reclaim_mode == 0 ||
                !zone_allows_reclaim(preferred_zone, zone))
                goto this_zone_full;

            if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

            /*
             * zone_reclaim reclaims memory: when zone_watermark_ok returned 0,
             * free memory is short, so reclaim via zone_reclaim->shrink_zone.
             * If memory is still short afterwards, the daemon is woken through
             * __alloc_pages_nodemask->__alloc_pages_slowpath->wake_all_kswapd
             * and keeps freeing memory: kswapd->balance_pgdat->shrink_zone.
             */
            ret = zone_reclaim(zone, gfp_mask, order);
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:
                /* did we reclaim enough? is free memory now sufficient? */
                if (zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags))
                    goto try_this_zone;
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;
                continue;
            }
        }

try_this_zone:
        /*
         * Allocate: alloc_pages->__alloc_pages_nodemask->get_page_from_freelist->
         * buffered_rmqueue->rmqueue_bulk splits larger buddies, e.g. 4 pages
         * into two blocks of 2 pages, when the requested order is unavailable.
         */
        page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:
        if (IS_ENABLED(CONFIG_NUMA))
            zlc_mark_zone_full(zonelist, z);
    }

    if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;
    }

    if (page)
        page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

    return page;
}
During alloc_pages->get_page_from_freelist, zone_watermark_ok decides whether enough pages are free; if not, zone_reclaim->shrink_zone reclaims. If that still does not succeed:

__alloc_pages_slowpath->__alloc_pages_direct_reclaim->__perform_reclaim->try_to_free_pages->shrink_zones->shrink_zone

bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                       int classzone_idx, int alloc_flags)
{
    return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                               zone_page_state(z, NR_FREE_PAGES));
}

/*
 * Return true if free pages are above 'mark'. This takes into account the order
 * of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                                int classzone_idx, int alloc_flags, long free_pages)
{
    /* free_pages may go negative - that's OK */
    long min = mark;
    long lowmem_reserve = z->lowmem_reserve[classzone_idx];
    int o;
    long free_cma = 0;

    free_pages -= (1 << order) - 1;
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA))
        free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
    /* returns false (0) when free memory is insufficient */
    if (free_pages - free_cma <= min + lowmem_reserve)
        return false;
    for (o = 0; o < order; o++) {
        /* At the next order, this order's pages become unavailable */
        free_pages -= z->free_area[o].nr_free << o;

        /* Require fewer higher order pages to be free */
        min >>= 1;

        /* returns false (0) when free memory is insufficient */
        if (free_pages <= min)
            return false;
    }
    return true;
}

Initialization of zone->watermark and zone->lowmem_reserve: init_per_zone_wmark_min->setup_per_zone_wmarks

/*
 * Initialise min_free_kbytes.
 *
 * For small machines we want it small (128k min).  For large machines
 * we want it large (64MB max).  But it is not linear, because network
 * bandwidth does not increase linearly with machine size.  We use
 *
 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
 *
 * 16MB:    512k
 * 32MB:    724k
 * 64MB:    1024k
 * 128MB:   1448k
 * 256MB:   2048k
 * 512MB:   2896k
 * 1024MB:  4096k
 * 2048MB:  5792k
 * 4096MB:  8192k
 * 8192MB:  11584k
 * 16384MB: 16384k
 */
int __meminit init_per_zone_wmark_min(void)
{
    unsigned long lowmem_kbytes;

    /*
     * Example: with 118M of system memory (mem=118M in bootargs),
     * zone->managed_pages = 28764 pages, i.e. the number of pages the
     * system may allocate, so lowmem_kbytes = 28764 * 4 = 115056 kbytes.
     */
    lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);

    /* min_free_kbytes = 1356 kbytes */
    min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
    if (min_free_kbytes < 128)
        min_free_kbytes = 128;
    if (min_free_kbytes > 65536)
        min_free_kbytes = 65536;
    setup_per_zone_wmarks();
    refresh_zone_stat_thresholds();
    setup_per_zone_lowmem_reserve();
    setup_per_zone_inactive_ratio();
    return 0;
}
module_init(init_per_zone_wmark_min)
Setting zone->watermark and zone->lowmem_reserve through proc:

min_free_kbytes_sysctl_handler->setup_per_zone_wmarks

# cat /proc/sys/vm/min_free_kbytes 
1356
# cat /proc/sys/vm/lowmem_reserve_ratio 
32
static void __setup_per_zone_wmarks(void)
{
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 0;
    struct zone *zone;
    unsigned long flags;

    /* Calculate total number of !ZONE_HIGHMEM pages */
    for_each_zone(zone) {
        if (!is_highmem(zone))
            lowmem_pages += zone->managed_pages;
    }

    for_each_zone(zone) {
        u64 tmp;

        spin_lock_irqsave(&zone->lock, flags);
        tmp = (u64)pages_min * zone->managed_pages;
        do_div(tmp, lowmem_pages);
        if (is_highmem(zone)) {
            /*
             * __GFP_HIGH and PF_MEMALLOC allocations usually don't
             * need highmem pages, so cap pages_min to a small
             * value here.
             *
             * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
             * deltas controls asynch page reclaim, and so should
             * not be capped for highmem.
             */
            unsigned long min_pages;

            min_pages = zone->managed_pages / 1024;
            min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
            zone->watermark[WMARK_MIN] = min_pages;
        } else {
            /*
             * If it's a lowmem zone, reserve a number of pages
             * proportionate to the zone's size.
             */
            zone->watermark[WMARK_MIN] = tmp;
        }

        /*
         * Example: with 118M of system memory (mem=118M in bootargs),
         * zone->managed_pages = 28764 pages;
         * lowmem_kbytes = 28764 * 4 = 115056 kbytes;
         * min_free_kbytes = 1356 kbytes;
         * min_wmark_pages(zone) = zone->watermark[WMARK_MIN] = tmp
         *   = min_free_kbytes / 4 = 339 pages;
         * zone->watermark[WMARK_LOW]  = 339 + 339/4 = 423;
         * zone->watermark[WMARK_HIGH] = 339 + 339/2 = 508;
         */
        zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
        zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);

        setup_zone_migrate_reserve(zone);
        spin_unlock_irqrestore(&zone->lock, flags);
    }

    /* update totalreserve_pages */
    calculate_totalreserve_pages();
}
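The same computation can be re-run at runtime by writing min_free_kbytes (min_free_kbytes_sysctl_handler calls setup_per_zone_wmarks, as noted above). For example, raising it (the value is chosen only to illustrate the knob):

# echo 2048 > /proc/sys/vm/min_free_kbytes

With 4KB pages and a single lowmem zone this gives pages_min = 2048 >> 2 = 512 pages, so WMARK_LOW becomes 512 + 512/4 = 640 and WMARK_HIGH becomes 512 + 512/2 = 768.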

3 wake_all_kswapd wakes the kswapd daemon to free memory: kswapd->balance_pgdat->shrink_zone

When get_page_from_freelist fails to allocate, alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->

__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->

__alloc_pages_slowpath->wake_all_kswapd wakes the reclaim machinery.

【Main】The memory reclaim process

The page reclaim discussed here is dynamic reclaim: when free memory runs short, the system frees memory that is currently in use to replenish the free pool and satisfy allocations.
1. First, a quick look at the three channels through which memory is freed.
1-1>. When a user process exits. do_exit eventually reaches free_pgtables and the unmap path, which walk the process's VMAs, find the physical pages behind the virtual addresses, and release them. As discussed earlier, a process's virtual address ranges are organized in a red-black tree.
1-2>. Manual freeing. A driver allocates with alloc_pages() and frees with free_pages(), much like malloc and free in C. The calls must be paired, otherwise memory leaks; see the sketch after this list.
1-3>. On-demand reclaim. Unlike the first two channels, this one is dynamic: when memory is short, the system automatically releases some in-use pages back into the buddy system for reuse.
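A minimal sketch of channel 1-2>, manual allocation in a driver (standard module boilerplate, error handling trimmed):

#include <linux/module.h>
#include <linux/gfp.h>

static struct page *pages;

static int __init demo_init(void)
{
    /* allocate 2^2 = 4 contiguous pages from the buddy allocator */
    pages = alloc_pages(GFP_KERNEL, 2);
    return pages ? 0 : -ENOMEM;
}

static void __exit demo_exit(void)
{
    /* must pair with alloc_pages, same order, or the pages leak */
    if (pages)
        __free_pages(pages, 2);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");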
2. Which pages can be reclaimed
User-process pages are allocated through page faults, and pages allocated that way can all be reclaimed. They fall into two kinds: file pages (file cache) and anonymous pages (anonymous cache). A file page, as the name suggests, corresponds to some file on backing storage. An anonymous page has no backing file; a process's stack is an example. These two kinds are the target pages of reclaim.
Reclaim uses an algorithm that approximates LRU. Pages near the front of an LRU list are active; pages near the tail are inactive. Why only approximate? 1. Pages are not strictly reordered on every access following LRU; once on a list they stay put, except that during reclaim some pages may move from the tail back toward the front. 2. On top of LRU, Linux adds a Referenced flag; this referenced-bit variant of LRU is often called the Second-Chance Algorithm, sketched below.
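A user-space toy model of the second-chance idea (a referenced page is spared once before it can be evicted); this illustrates the algorithm only and is not kernel code:

#include <stdio.h>

#define NPAGES 4

struct toy_page { int id; int referenced; };

/* Evict with second chance: a referenced page has its bit cleared and
 * the scan moves on; an unreferenced page becomes the victim. */
static int evict(struct toy_page p[], int *hand)
{
    for (;;) {
        struct toy_page *c = &p[*hand % NPAGES];
        *hand += 1;
        if (c->referenced)
            c->referenced = 0;   /* second chance */
        else
            return c->id;        /* victim */
    }
}

int main(void)
{
    struct toy_page p[NPAGES] = { {0, 1}, {1, 0}, {2, 1}, {3, 0} };
    int hand = 0;

    printf("evict page %d\n", evict(p, &hand)); /* page 0 is spared, page 1 goes */
    return 0;
}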
void lruvec_init(struct lruvec *lruvec)
{
    enum lru_list lru;

    memset(lruvec, 0, sizeof(struct lruvec));

    for_each_lru(lru)
        INIT_LIST_HEAD(&lruvec->lists[lru]);
}

struct lruvec {
    struct list_head lists[NR_LRU_LISTS];
    struct zone_reclaim_stat reclaim_stat;
};

enum lru_list {
    LRU_INACTIVE_ANON = LRU_BASE,
    LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
    LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
    LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
    LRU_UNEVICTABLE,
    NR_LRU_LISTS
};
4 Page-fault handling

A user process's physical pages are all allocated through page faults, and pages allocated that way can all be reclaimed. They divide into file pages (file cache) and anonymous pages (anonymous cache). The main page-fault handler:

int handle_pte_fault(struct mm_struct *mm,
                     struct vm_area_struct *vma, unsigned long address,
                     pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;

    entry = *pte;
    /* page not yet allocated: the present bit is clear */
    if (!pte_present(entry)) {
        if (pte_none(entry)) {
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    /* file-backed fault: ends up in filemap_fault, analyzed below */
                    return do_linear_fault(mm, vma, address,
                                           pte, pmd, flags, entry);
            }
            /* anonymous page fault handling */
            return do_anonymous_page(mm, vma, address, pte, pmd, flags);
        }
        if (pte_file(entry)) {
            /* nonlinear file-page fault handling */
            return do_nonlinear_fault(mm, vma, address, pte, pmd, flags, entry);
        }
        return do_swap_page(mm, vma, address, pte, pmd, flags, entry);
    }

    if (pte_numa(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);

    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address, pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte);
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}
4.1 Anonymous page allocation

Anonymous-page fault handling: handle_pte_fault->do_anonymous_page()

static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                             unsigned long address, pte_t *page_table, pmd_t *pmd,
                             unsigned int flags)
{
    struct page *page;
    spinlock_t *ptl;
    pte_t entry;

    pte_unmap(page_table);

    /* Check if we need to add a guard page to the stack */
    if (check_stack_guard_page(vma, address) < 0)
        return VM_FAULT_SIGBUS;

    /* Use the zero-page for reads */
    if (!(flags & FAULT_FLAG_WRITE)) {
        entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot));
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_none(*page_table))
            goto unlock;
        goto setpte;
    }

    /* Allocate our own private page. */
    if (unlikely(anon_vma_prepare(vma)))
        goto oom;
    /* allocate the physical page */
    page = alloc_zeroed_user_highpage_movable(vma, address);
    if (!page)
        goto oom;
    /*
     * The memory barrier inside __SetPageUptodate makes sure that
     * preceding stores to the page contents become visible before
     * the set_pte_at() write.
     */
    /* set page->flags; the flag macros are defined in page-flags.h */
    __SetPageUptodate(page);

    if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
        goto oom_free_page;

    entry = mk_pte(page, vma->vm_page_prot);
    if (vma->vm_flags & VM_WRITE)
        entry = pte_mkwrite(pte_mkdirty(entry));

    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (!pte_none(*page_table))
        goto release;

    inc_mm_counter_fast(mm, MM_ANONPAGES);
    /*
     * Set page->flags, page->mapping, page->index etc. and add the new
     * page to the LRU_ACTIVE_ANON or LRU_UNEVICTABLE list.
     */
    page_add_new_anon_rmap(page, vma, address);
setpte:
    set_pte_at(mm, address, page_table, entry);

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, address, page_table);
unlock:
    pte_unmap_unlock(page_table, ptl);
    return 0;
release:
    mem_cgroup_uncharge_page(page);
    page_cache_release(page);
    goto unlock;
oom_free_page:
    page_cache_release(page);
oom:
    return VM_FAULT_OOM;
}
handle_pte_fault->do_anonymous_page->page_add_new_anon_rmap
void page_add_new_anon_rmap(struct page *page,
                            struct vm_area_struct *vma, unsigned long address)
{
    VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
    SetPageSwapBacked(page);
    atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
    if (!PageTransHuge(page))
        __inc_zone_page_state(page, NR_ANON_PAGES);
    else
        __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
    __page_set_anon_rmap(page, vma, address, 1);
    /*
     * Check whether this range was locked with mlock(start, len);
     * locked memory is not released by the kswapd reclaim machinery.
     */
    if (!mlocked_vma_newpage(vma, page))
        /* add the page to LRU_ACTIVE_ANON */
        lru_cache_add_lru(page, LRU_ACTIVE_ANON);
    else
        /* add the page to LRU_UNEVICTABLE; dynamic reclaim (shrink_zone) will not free locked memory */
        add_page_to_unevictable_list(page);
}
handle_pte_fault->do_wp_page->page_move_anon_rmap
void page_move_anon_rmap(struct page *page,
                         struct vm_area_struct *vma, unsigned long address)
{
    struct anon_vma *anon_vma = vma->anon_vma;

    anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    page->mapping = (struct address_space *) anon_vma;
}
Anonymous vma initialization: anon_vma_alloc()

4.2 File page allocation

1) A file can be read directly with the read system call, or the file descriptor can be mmap'ed; in the latter case the filesystem-level mmap operation registers the file-page fault handler. Note that this is not the anonymous-page fault path.

Taking the squashfs file system as an example:

const struct file_operations generic_ro_fops = {
    .llseek      = generic_file_llseek,
    .read        = do_sync_read,
    .aio_read    = generic_file_aio_read,       /* direct read path */
    .mmap        = generic_file_readonly_mmap,  /* triggers page faults, read via filemap_fault */
    .splice_read = generic_file_splice_read,
};

2) mmap'ing a file: generic_file_readonly_mmap->generic_file_mmap

const struct vm_operations_struct generic_file_vm_ops = {
    .fault        = filemap_fault,
    .page_mkwrite = filemap_page_mkwrite,
    .remap_pages  = generic_file_remap_pages,
};

int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
    if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
        return -EINVAL;
    return generic_file_mmap(file, vma);
}

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
    struct address_space *mapping = file->f_mapping;

    if (!mapping->a_ops->readpage)
        return -ENOEXEC;
    file_accessed(file);
    /*
     * const struct vm_operations_struct generic_file_vm_ops = {
     *     .fault = filemap_fault,  // called on a page fault
     * };
     */
    vma->vm_ops = &generic_file_vm_ops;
    return 0;
}
3) Reading a file with read(), system-call path: generic_file_aio_read->do_generic_file_read()

Reading a file through mmap, page-fault path: handle_pte_fault->do_linear_fault->__do_fault->filemap_fault

int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    int error;
    struct file *file = vma->vm_file;
    struct address_space *mapping = file->f_mapping;
    struct file_ra_state *ra = &file->f_ra;
    struct inode *inode = mapping->host;
    pgoff_t offset = vmf->pgoff;
    struct page *page;
    pgoff_t size;
    int ret = 0;

    size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if (offset >= size)
        return VM_FAULT_SIGBUS;

    /*
     * Do we have something in the page cache already?
     */
    page = find_get_page(mapping, offset);
    if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
        /*
         * We found the page, so try async readahead before
         * waiting for the lock.
         */
        /*
         * Call paths:
         * ->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
         *   read_pages->squashfs_readpage
         * ->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
         *   read_pages->squashfs_readpage
         */
        do_async_mmap_readahead(vma, ra, file, page, offset);
    } else if (!page) {
        /* No page in the page cache at all */
        /*
         * Call path:
         * ->ra_submit->__do_page_cache_readahead->read_pages->squashfs_readpage
         */
        do_sync_mmap_readahead(vma, ra, file, offset);
        print_readpage_flag = 0;   /* author's debug flag, see the Misc section */
        count_vm_event(PGMAJFAULT);
        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
        ret = VM_FAULT_MAJOR;
retry_find:
        page = find_get_page(mapping, offset);
        if (!page)
            goto no_cached_page;
    }

    if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
        page_cache_release(page);
        return ret | VM_FAULT_RETRY;
    }

    /* Did it get truncated? */
    if (unlikely(page->mapping != mapping)) {
        unlock_page(page);
        put_page(page);
        goto retry_find;
    }
    VM_BUG_ON(page->index != offset);

    /*
     * We have a locked page in the page cache, now we need to check
     * that it's up-to-date. If not, it is going to be due to an error.
     */
    if (unlikely(!PageUptodate(page)))
        goto page_not_uptodate;

    /*
     * Found the page and have a reference on it.
     * We must recheck i_size under page lock.
     */
    size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if (unlikely(offset >= size)) {
        unlock_page(page);
        page_cache_release(page);
        return VM_FAULT_SIGBUS;
    }

    vmf->page = page;
    return ret | VM_FAULT_LOCKED;

no_cached_page:
    /*
     * We're only likely to ever get here if MADV_RANDOM is in
     * effect.
     */
    error = page_cache_read(file, offset);

    /*
     * The page we want has now been added to the page cache.
     * In the unlikely event that someone removed it in the
     * meantime, we'll just come back here and read it again.
     */
    if (error >= 0)
        goto retry_find;

    /*
     * An error return from page_cache_read can result if the
     * system is low on memory, or a problem occurs while trying
     * to schedule I/O.
     */
    if (error == -ENOMEM)
        return VM_FAULT_OOM;
    return VM_FAULT_SIGBUS;

page_not_uptodate:
    /*
     * Umm, take care of errors if the page isn't up-to-date.
     * Try to re-read it _once_. We do this synchronously,
     * because there really aren't any performance issues here
     * and we need to check for errors.
     */
    ClearPageError(page);
    /* call path: ->squashfs_readpage */
    error = mapping->a_ops->readpage(file, page);
    if (!error) {
        wait_on_page_locked(page);
        if (!PageUptodate(page))
            error = -EIO;
    }
    page_cache_release(page);

    if (!error || error == AOP_TRUNCATED_PAGE)
        goto retry_find;

    /* Things didn't work out. Return zero to tell the mm layer so. */
    shrink_readahead_size_eio(file, ra);
    return VM_FAULT_SIGBUS;
}

From the filemap_fault code above, do_sync_mmap_readahead, do_async_mmap_readahead and page_cache_read all end up reading the file through squashfs_readpage.

For example, under memory pressure the system may reclaim a process's shared libraries; the next time a library is touched it must be brought back through filemap_fault, in most cases via do_sync_mmap_readahead loading the text pages back into DRAM.

All three of these functions may allocate file pages, and the pages they allocate are hooked onto the LRU lists via add_to_page_cache_lru. do_sync_mmap_readahead and do_async_mmap_readahead are similar, so they are covered together.

Whichever path is taken, file-page allocation follows the shape below (do_generic_file_read does not call page_cache_read directly, but has an equivalent add_to_page_cache_lru step):

filemap_fault->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
     read_pages->squashfs_readpage
filemap_fault->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
     read_pages->squashfs_readpage

4) File pages allocated in __do_page_cache_readahead:

static int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
                                     pgoff_t offset, unsigned long nr_to_read,
                                     unsigned long lookahead_size)
{
    struct inode *inode = mapping->host;
    struct page *page;
    unsigned long end_index;  /* The last page we want to read */
    LIST_HEAD(page_pool);
    int page_idx;
    int ret = 0;
    loff_t isize = i_size_read(inode);

    if (isize == 0)
        goto out;

    end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

    /*
     * Preallocate as many pages as we will need.
     */
    for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
        pgoff_t page_offset = offset + page_idx;

        if (page_offset > end_index)
            break;

        rcu_read_lock();
        page = radix_tree_lookup(&mapping->page_tree, page_offset);
        rcu_read_unlock();
        if (page)
            continue;

        /*
         * Allocate a file page; it is added to the LRU lists later in
         * read_pages->add_to_page_cache_lru.
         */
        page = page_cache_alloc_readahead(mapping);
        if (!page)
            break;
        page->index = page_offset;
        list_add(&page->lru, &page_pool);
        if (page_idx == nr_to_read - lookahead_size)
            SetPageReadahead(page);
        ret++;
    }

    /*
     * Now start the IO.  We ignore I/O errors - if the page is not
     * uptodate then the caller will launch readpage again, and
     * will then handle the error.
     */
    if (ret)
        read_pages(mapping, filp, &page_pool, ret);
    BUG_ON(!list_empty(&page_pool));
out:
    return ret;
}
__do_page_cache_readahead->read_pages
static int read_pages(struct address_space *mapping, struct file *filp,
                      struct list_head *pages, unsigned nr_pages)
{
    struct blk_plug plug;
    unsigned page_idx;
    int ret;

    blk_start_plug(&plug);

    if (mapping->a_ops->readpages) {
        ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
        /* Clean up the remaining pages */
        put_pages_list(pages);
        goto out;
    }

    for (page_idx = 0; page_idx < nr_pages; page_idx++) {
        struct page *page = list_to_page(pages);

        list_del(&page->lru);
        if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) {
            mapping->a_ops->readpage(filp, page);
        }
        page_cache_release(page);
    }
    ret = 0;

out:
    blk_finish_plug(&plug);
    return ret;
}
5) File pages allocated in squashfs_readpage (see the separate article on the Linux file read path):
static int squashfs_readpage(struct file *file, struct page *page)
{
    struct inode *inode = page->mapping->host;
    struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
    int bytes, i, offset = 0, sparse = 0;
    struct squashfs_cache_entry *buffer = NULL;
    void *pageaddr;
    int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
    int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
    int start_index = page->index & ~mask;
    int end_index = start_index | mask;
    int file_end = i_size_read(inode) >> msblk->block_log;

    if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT))
        goto out;

    if (index < file_end || squashfs_i(inode)->fragment_block == SQUASHFS_INVALID_BLK) {
        /*
         * Reading a datablock from disk.  Need to read block list
         * to get location and block size.
         */
        u64 block = 0;
        int bsize = read_blocklist(inode, index, &block);

        if (bsize < 0)
            goto error_out;

        if (bsize == 0) { /* hole */
            bytes = index == file_end ?
                (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size;
            sparse = 1;
        } else {
            /*
             * Read and decompress datablock.
             */
            buffer = squashfs_get_datablock(inode->i_sb, block, bsize);
            if (buffer->error) {
                squashfs_file_lookup(inode, file->f_dentry, 0);
                ERROR("Unable to read page0 ,block %llx, size %x"
                      "\n", block, bsize);
                squashfs_cache_put(buffer);
                goto error_out;
            }
            bytes = buffer->length;
        }
    } else {
        /*
         * Datablock is stored inside a fragment (tail-end packed
         * block).
         */
        if (print_readpage_flag)
            printk("read page, fragment block %llx, size %x\n",
                   squashfs_i(inode)->fragment_block,
                   squashfs_i(inode)->fragment_size);
        buffer = squashfs_get_fragment(inode->i_sb,
                                       squashfs_i(inode)->fragment_block,
                                       squashfs_i(inode)->fragment_size);
        if (buffer->error) {
            ERROR("Unable to read page, block %llx, size %x\n",
                  squashfs_i(inode)->fragment_block,
                  squashfs_i(inode)->fragment_size);
            squashfs_cache_put(buffer);
            goto error_out;
        }
        bytes = i_size_read(inode) & (msblk->block_size - 1);
        offset = squashfs_i(inode)->fragment_offset;
    }

    /*
     * Loop copying datablock into pages.  As the datablock likely covers
     * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
     * grab the pages from the page cache, except for the page that we've
     * been called to fill.
     */
    for (i = start_index; i <= end_index && bytes > 0; i++,
         bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
        struct page *push_page;
        int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);

        TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);

        /*
         * Allocate a file page; grab_cache_page_nowait->add_to_page_cache_lru
         * adds it to the LRU lists. squashfs_readpage reads one logical block
         * at a time but is handed only one page, so the extra file pages must
         * be allocated here.
         */
        push_page = (i == page->index) ? page :
            grab_cache_page_nowait(page->mapping, i);

        if (!push_page)
            continue;

        if (PageUptodate(push_page))
            goto skip_page;

        pageaddr = kmap_atomic(push_page);
        squashfs_copy_data(pageaddr, buffer, offset, avail);
        memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
        kunmap_atomic(pageaddr);
        flush_dcache_page(push_page);
        SetPageUptodate(push_page);
skip_page:
        unlock_page(push_page);
        if (i != page->index)
            page_cache_release(push_page);
    }

    if (!sparse)
        squashfs_cache_put(buffer);

    return 0;

error_out:
    SetPageError(page);
out:
    pageaddr = kmap_atomic(page);
    memset(pageaddr, 0, PAGE_CACHE_SIZE);
    kunmap_atomic(pageaddr);
    flush_dcache_page(page);
    if (!PageError(page))
        SetPageUptodate(page);
    unlock_page(page);
    return 0;
}
squashfs_readpage->grab_cache_page_nowait
struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
    struct page *page = find_get_page(mapping, index);

    if (page) {
        if (trylock_page(page))
            return page;
        page_cache_release(page);
        return NULL;
    }
    /*
     * Allocate a file page; add_to_page_cache_lru below adds it to the
     * LRU lists.
     */
    page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
    if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
        page_cache_release(page);
        page = NULL;
    }
    return page;
}

6) File pages allocated in filemap_fault->page_cache_read:

do_linear_fault->__do_fault->filemap_fault->page_cache_read

static int page_cache_read(struct file *file, pgoff_t offset)
{
    struct address_space *mapping = file->f_mapping;
    struct page *page;
    int ret;

    do {
        /* allocate memory via alloc_pages */
        page = page_cache_alloc_cold(mapping);
        if (!page)
            return -ENOMEM;

        /*
         * Set page->flags, page->mapping, page->index etc. and add the
         * page to the LRU_INACTIVE_FILE list.
         */
        ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
        if (ret == 0)
            ret = mapping->a_ops->readpage(file, page);
        else if (ret == -EEXIST)
            ret = 0; /* losing race to add is OK */

        page_cache_release(page);
    } while (ret == AOP_TRUNCATED_PAGE);

    return ret;
}

do_linear_fault->__do_fault->filemap_fault->page_cache_read->add_to_page_cache_lru()

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                          pgoff_t offset, gfp_t gfp_mask)
{
    int ret;

    /* set PG_locked, then call add_to_page_cache_locked */
    ret = add_to_page_cache(page, mapping, offset, gfp_mask);
    if (ret == 0)
        /* add the new page to the LRU_INACTIVE_FILE list */
        lru_cache_add_file(page);
    return ret;
}

add_to_page_cache_locked sets page->mapping = file->f_mapping and page->index = offset.

Note: do_dentry_open sets f_mapping = inode->i_mapping.

inode->i_mapping is initialized:

1> when a file is created: do_last->lookup_open->vfs_create->ubifs_create->ubifs_new_inode;

2> for an existing file: do_last->lookup_open->lookup_real->ubifs_lookup->ubifs_iget.

add_to_page_cache_lru->add_to_page_cache->add_to_page_cache_locked pairs with __delete_from_page_cache;

int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
                             pgoff_t offset, gfp_t gfp_mask)
{
    int error;

    VM_BUG_ON(!PageLocked(page));
    VM_BUG_ON(PageSwapBacked(page));

    error = mem_cgroup_cache_charge(page, current->mm,
                                    gfp_mask & GFP_RECLAIM_MASK);
    if (error)
        goto out;

    error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
    if (error == 0) {
        page_cache_get(page);
        /* set page->mapping = file->f_mapping */
        page->mapping = mapping;
        /* offset is the page index of the file offset being accessed */
        page->index = offset;

        spin_lock_irq(&mapping->tree_lock);
        error = radix_tree_insert(&mapping->page_tree, offset, page);
        if (likely(!error)) {
            mapping->nrpages++;
            __inc_zone_page_state(page, NR_FILE_PAGES);
            spin_unlock_irq(&mapping->tree_lock);
            trace_mm_filemap_add_to_page_cache(page);
        } else {
            page->mapping = NULL;
            /* Leave page->index set: truncation relies upon it */
            spin_unlock_irq(&mapping->tree_lock);
            mem_cgroup_uncharge_cache_page(page);
            page_cache_release(page);
        }
        radix_tree_preload_end();
    } else
        mem_cgroup_uncharge_cache_page(page);
out:
    return error;
}

7) File pages allocated in do_generic_file_read:

See the article on how the Linux file system works: http://write.blog.csdn.net/postedit/71249365 ;


5 The reclaim code:

During alloc_pages there are two main opportunities for reclaim:
Opportunity 1: get_page_from_freelist->zone_reclaim->shrink_zone, gated by zone_watermark_ok;

Opportunity 2: __alloc_pages_slowpath->wake_all_kswapd wakes kswapd->balance_pgdat->shrink_zone;

i.e. when get_page_from_freelist fails during alloc_pages, the daemon is woken to reclaim memory.

Either way, reclaim ultimately runs through a function shaped like shrink_zone.
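Both paths update the event counters visible in the code below (PGSCAN_KSWAPD/PGSCAN_DIRECT, PGSTEAL_*), so reclaim activity can be watched from user space, e.g.:

# grep -E 'pgscan|pgsteal' /proc/vmstat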

shrink_zone->shrink_lruvec
/*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
    unsigned long nr[NR_LRU_LISTS];
    unsigned long nr_to_scan;
    enum lru_list lru;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_to_reclaim = sc->nr_to_reclaim;
    struct blk_plug plug;

    /* count the pages of each type on the NR_LRU_LISTS lists */
    get_scan_count(lruvec, sc, nr);

    blk_start_plug(&plug);
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;
                nr_reclaimed += shrink_list(lru, nr_to_scan, lruvec, sc);
            }
        }
        /*
         * On large memory systems, scan >> priority can become
         * really large. This is fine for the starting priority;
         * we want to put equal scanning pressure on each zone.
         * However, if the VM has a harder time of freeing pages,
         * with multiple processes reclaiming pages, the total
         * freeing target can get unreasonably large.
         */
        if (nr_reclaimed >= nr_to_reclaim && sc->priority < DEF_PRIORITY)
            break;
    }
    blk_finish_plug(&plug);
    sc->nr_reclaimed += nr_reclaimed;

    /*
     * Even if we did not try to evict anon pages at all, we want to
     * rebalance the anon lru active/inactive ratio.
     * Here inactive_anon_is_low(lruvec) = 0.
     */
    if (inactive_anon_is_low(lruvec))
        shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON);

    throttle_vm_writeout(sc->gfp_mask);
}
shrink_lruvec->shrink_list()
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
{
    if (is_active_lru(lru)) {
        /*
         * inactive_list_is_low: for file pages it is true when the
         * LRU_ACTIVE_FILE list holds more pages than LRU_INACTIVE_FILE;
         * for anonymous pages it is always false here.
         */
        if (inactive_list_is_low(lruvec, lru))
            /* move pages from the active list to the inactive list */
            shrink_active_list(nr_to_scan, lruvec, sc, lru);
        return 0;
    }
    /* the real page freeing: shrink_inactive_list->shrink_page_list */
    return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
When the inactive list holds too few pages, shrink_active_list is called to move pages from the active list to the inactive list;

shrink_lruvec->shrink_list()->shrink_active_list()

static void shrink_active_list(unsigned long nr_to_scan,
                               struct lruvec *lruvec,
                               struct scan_control *sc,
                               enum lru_list lru)
{
    unsigned long nr_taken;
    unsigned long nr_scanned;
    unsigned long vm_flags;
    LIST_HEAD(l_hold);  /* The pages which were snipped off */
    LIST_HEAD(l_active);
    LIST_HEAD(l_inactive);
    struct page *page;
    struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
    unsigned long nr_rotated = 0;
    isolate_mode_t isolate_mode = 0;
    int file = is_file_lru(lru);
    struct zone *zone = lruvec_zone(lruvec);
    unsigned long vm_oflags = 0;  /* author's debug locals */
    static int age = 0;
    int puttoinactive = 0;

    lru_add_drain();

    if (!sc->may_unmap)
        isolate_mode |= ISOLATE_UNMAPPED;
    if (!sc->may_writepage)
        isolate_mode |= ISOLATE_CLEAN;

    spin_lock_irq(&zone->lru_lock);

    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
                                 &nr_scanned, sc, isolate_mode, lru);
    if (global_reclaim(sc))
        zone->pages_scanned += nr_scanned;

    reclaim_stat->recent_scanned[file] += nr_taken;

    __count_zone_vm_events(PGREFILL, zone, nr_scanned);
    __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
    spin_unlock_irq(&zone->lru_lock);

    while (!list_empty(&l_hold)) {
        cond_resched();
        page = lru_to_page(&l_hold);
        list_del(&page->lru);

        if (unlikely(!page_evictable(page))) {
            putback_lru_page(page);
            continue;
        }

        if (unlikely(buffer_heads_over_limit)) {
            if (page_has_private(page) && trylock_page(page)) {
                if (page_has_private(page))
                    try_to_release_page(page, 0);
                unlock_page(page);
            }
        }

        /*
         * page_referenced returns 0: the page goes to the inactive list
         * (a hook point if some special pages must not be released);
         * returns non-zero: a file page holding executable code is put
         * back on the active list, so text pages get more chances to
         * stay in memory.
         */
        if (page_referenced(page, 0, sc->target_mem_cgroup, &vm_flags)) {
            nr_rotated += hpage_nr_pages(page);
            /*
             * Identify referenced, file-backed active pages and
             * give them one more trip around the active list. So
             * that executable code get better chances to stay in
             * memory under moderate memory pressure.  Anon pages
             * are not likely to be evicted by use-once streaming
             * IO, plus JVM can create lots of anon VM_EXEC pages,
             * so we ignore them here.
             */
            /* this is a file page */
            if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
                list_add(&page->lru, &l_active);
                continue;
            }
        }

        ClearPageActive(page); /* we are de-activating */
        /* add the page to the inactive list */
        list_add(&page->lru, &l_inactive);
    }

    /*
     * Move pages back to the lru list.
     */
    spin_lock_irq(&zone->lru_lock);
    /*
     * Count referenced pages from currently used mappings as rotated,
     * even though only some of them are actually re-activated.  This
     * helps balance scan pressure between file and anonymous pages in
     * get_scan_ratio.
     */
    reclaim_stat->recent_rotated[file] += nr_rotated;

    move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
    move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
    spin_unlock_irq(&zone->lru_lock);

    free_hot_cold_page_list(&l_hold, 1);
}

shrink_active_list moves pages onto the inactive list;

the actual page freeing comes next:

shrink_inactive_list->shrink_page_list

static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                     struct scan_control *sc, enum lru_list lru)
{
    LIST_HEAD(page_list);
    unsigned long nr_scanned;
    unsigned long nr_reclaimed = 0;
    unsigned long nr_taken;
    unsigned long nr_dirty = 0;
    unsigned long nr_writeback = 0;
    isolate_mode_t isolate_mode = 0;
    int file = is_file_lru(lru);
    struct zone *zone = lruvec_zone(lruvec);
    struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;

    while (unlikely(too_many_isolated(zone, file, sc))) {
        congestion_wait(BLK_RW_ASYNC, HZ/10);

        /* We are about to die and free our memory. Return now. */
        if (fatal_signal_pending(current))
            return SWAP_CLUSTER_MAX;
    }

    lru_add_drain();

    if (!sc->may_unmap)
        isolate_mode |= ISOLATE_UNMAPPED;
    if (!sc->may_writepage)
        isolate_mode |= ISOLATE_CLEAN;

    spin_lock_irq(&zone->lru_lock);

    nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
                                 &nr_scanned, sc, isolate_mode, lru);

    __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

    if (global_reclaim(sc)) {
        zone->pages_scanned += nr_scanned;
        if (current_is_kswapd())
            __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
        else
            __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
    }
    spin_unlock_irq(&zone->lru_lock);

    if (nr_taken == 0)
        return 0;

    /* the real page freeing */
    nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
                                    &nr_dirty, &nr_writeback, false);

    spin_lock_irq(&zone->lru_lock);

    reclaim_stat->recent_scanned[file] += nr_taken;

    if (global_reclaim(sc)) {
        if (current_is_kswapd())
            __count_zone_vm_events(PGSTEAL_KSWAPD, zone, nr_reclaimed);
        else
            __count_zone_vm_events(PGSTEAL_DIRECT, zone, nr_reclaimed);
    }

    putback_inactive_pages(lruvec, &page_list);

    __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);

    spin_unlock_irq(&zone->lru_lock);

    free_hot_cold_page_list(&page_list, 1);

    /*
     * If reclaim is isolating dirty pages under writeback, it implies
     * that the long-lived page allocation rate is exceeding the page
     * laundering rate. Either the global limits are not being effective
     * at throttling processes due to the page distribution throughout
     * zones or there is heavy usage of a slow backing device. The
     * only option is to throttle from reclaim context which is not ideal
     * as there is no guarantee the dirtying process is throttled in the
     * same way balance_dirty_pages() manages.
     *
     * This scales the number of dirty pages that must be under writeback
     * before throttling depending on priority. It is a simple backoff
     * function that has the most effect in the range DEF_PRIORITY to
     * DEF_PRIORITY-2 which is the priority reclaim is considered to be
     * in trouble and reclaim is considered to be in trouble.
     *
     * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
     * DEF_PRIORITY-1  50% must be PageWriteback
     * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
     * ...
     * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
     *                isolated page is PageWriteback
     */
    if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY - sc->priority)))
        wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);

    trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
                                        zone_idx(zone),
                                        nr_scanned, nr_reclaimed,
                                        sc->priority,
                                        trace_shrink_flags(file));
    return nr_reclaimed;
}
shrink_inactive_list->shrink_page_list()->__remove_mapping()
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
    BUG_ON(!PageLocked(page));
    BUG_ON(mapping != page_mapping(page));

    spin_lock_irq(&mapping->tree_lock);
    /*
     * The non racy check for a busy page.
     *
     * Must be careful with the order of the tests. When someone has
     * a ref to the page, it may be possible that they dirty it then
     * drop the reference. So if PageDirty is tested before page_count
     * here, then the following race may occur:
     *
     * get_user_pages(&page);
     * [user mapping goes away]
     * write_to(page);
     * !PageDirty(page)    [good]
     * SetPageDirty(page);
     * put_page(page);
     * !page_count(page)   [good, discard it]
     *
     * [oops, our write_to data is lost]
     *
     * Reversing the order of the tests ensures such a situation cannot
     * escape unnoticed. The smp_rmb is needed to ensure the page->flags
     * load is not satisfied before that of page->_count.
     *
     * Note that if SetPageDirty is always performed via set_page_dirty,
     * and thus under tree_lock, then this ordering is not required.
     */
    if (!page_freeze_refs(page, 2))
        goto cannot_free;
    /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
    if (unlikely(PageDirty(page))) {
        page_unfreeze_refs(page, 2);
        goto cannot_free;
    }

    if (PageSwapCache(page)) {
        swp_entry_t swap = { .val = page_private(page) };

        __delete_from_swap_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        swapcache_free(swap, page);
    } else {
        void (*freepage)(struct page *);

        freepage = mapping->a_ops->freepage;

        __delete_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        mem_cgroup_uncharge_cache_page(page);

        if (freepage != NULL)
            freepage(page);
    }

    return 1;

cannot_free:
    spin_unlock_irq(&mapping->tree_lock);
    return 0;
}

__remove_mapping->delete_from_page_cache->__delete_from_page_cache()

void __delete_from_page_cache(struct page *page)
{
    struct address_space *mapping = page->mapping;

    trace_mm_filemap_delete_from_page_cache(page);
    /*
     * if we're uptodate, flush out into the cleancache, otherwise
     * invalidate any existing cleancache entries.  We can't leave
     * stale data around in the cleancache once our page is gone
     */
    if (PageUptodate(page) && PageMappedToDisk(page))
        cleancache_put_page(page);
    else
        cleancache_invalidate_page(mapping, page);

    /* remove from the page-cache radix tree; pairs with add_to_page_cache_locked */
    radix_tree_delete(&mapping->page_tree, page->index);
    page->mapping = NULL;
    /* Leave page->index set: truncation lookup relies upon it */
    mapping->nrpages--;
    __dec_zone_page_state(page, NR_FILE_PAGES);
    if (PageSwapBacked(page))
        __dec_zone_page_state(page, NR_SHMEM);
    BUG_ON(page_mapped(page));

    /*
     * Some filesystems seem to re-dirty the page even after
     * the VM has canceled the dirty bit (eg ext3 journaling).
     *
     * Fix it up by doing a final dirty accounting check after
     * having removed the page entirely.
     */
    if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
        dec_zone_page_state(page, NR_FILE_DIRTY);
        dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
    }
}
【Summary】
1 Initialization of system memory information (including pgdat and struct zone):
paging_init->bootmem_init->arm_bootmem_init

paging_init->bootmem_init->arm_bootmem_free->free_area_init_node
 Allocate memory for the page descriptors:
free_area_init_node->alloc_node_mem_map(pgdat)->alloc_bootmem_node_nopanic->alloc_bootmem_bdata->__reserve;
 Initialize the page descriptors and struct zone node_zones:
free_area_init_node->free_area_init_core->memmap_init_zone
 Initialize node_zonelists:
start_kernel->build_all_zonelists()
 Free all page frames, putting page->lru onto the buddy free lists: mem_init->free_all_bootmem_core
 Freeing every page during init: free_all_bootmem->__free_one_page:
 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
2 Allocation: alloc_pages->__alloc_pages_nodemask->get_page_from_freelist->buffered_rmqueue->rmqueue_bulk: splits larger buddies, e.g. 4 pages into two blocks of 2 pages, called when the requested order is not directly available.
          rmqueue_bulk takes pages off the zone->free_area lists and chains them into pcp->lists.
3 zone->watermark assignment: __setup_per_zone_wmarks.
alloc_pages->zone_watermark_ok decides from the watermark values whether the allocation may proceed.
During allocation, zone_watermark_ok checks whether enough pages are free; otherwise zone_reclaim->shrink_zone reclaims, and if that is still not enough:
__alloc_pages_slowpath->__alloc_pages_direct_reclaim->__perform_reclaim->try_to_free_pages->shrink_zones->shrink_zone

ps: the buddy system keeps 11 block lists holding 2^0, 2^1, ..., 2^10 pages, so the largest contiguous allocation is 1024 pages = 4MB (with 4KB pages).
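A small sketch of the order arithmetic behind that note (4KB pages assumed):

#include <stdio.h>

#define PAGE_SHIFT 12  /* 4KB pages, as in the example above */
#define MAX_ORDER  11  /* free-list orders 0..10 */

/* smallest order whose block covers 'bytes' (simplified get_order()) */
static int order_for(unsigned long bytes)
{
    int order = 0;
    unsigned long pages = (bytes + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;

    while ((1UL << order) < pages)
        order++;
    return order;
}

int main(void)
{
    printf("1MB -> order %d\n", order_for(1UL << 20)); /* 8: 256 pages */
    printf("4MB -> order %d\n", order_for(4UL << 20)); /* 10: 1024 pages, the maximum */
    return 0;
}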

4 Pages allocated through page fault
1) mapped (file) pages: page->mapping = file->f_mapping:
filemap_fault->add_to_page_cache_lru->add_to_page_cache->add_to_page_cache_locked: page->mapping = mapping; page->index = offset;
2) anonymous pages: handle_pte_fault->do_wp_page->page_move_anon_rmap
3) anonymous vma initialization: anon_vma_alloc()

5 Dynamic reclaim mainly frees pages that were allocated through page fault.

1) reclaim triggered by alloc_pages when free memory is low:
   try_to_free_pages->shrink_zone;
   zone_reclaim->shrink_zone;
shrink_zone -> shrink_lruvec -> shrink_list->shrink_inactive_list->shrink_page_list
shrink_zone -> shrink_lruvec -> shrink_list->shrink_active_list
2) the kswapd daemon: kswapd->balance_pgdat->shrink_zone
kswapd_init->kswapd_run->kthread_run(kswapd,pgdat,"kswapd%d",nid)
6 The real page-freeing path
shrink_page_list->try_to_unmap_anon:
The anonymous region is recorded in the page descriptor's mapping field: when the lowest bit of mapping is 0 it holds a file mapping's address_space; when non-zero it holds the anonymous region's anon_vma (see the sketch below):
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
A page can be mapped into several processes' linear regions; only after every process has released it (_mapcount back at -1) can it be freed, i.e. removed from the active/inactive lists and returned through free_hot_page. page->_mapcount is initialized on fork in copy_one_pte->page_dup_rmap(page); try_to_unmap_anon releases the page one linear region at a time.
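A user-space sketch of that low-bit encoding (PAGE_MAPPING_ANON is bit 0; the struct names here are toy stand-ins, not the kernel's):

#include <stdio.h>

#define PAGE_MAPPING_ANON 1UL

struct toy_page { void *mapping; };

/* non-zero low bit => mapping holds an anon_vma, not an address_space */
static int page_is_anon(struct toy_page *page)
{
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

int main(void)
{
    static long anon_vma;  /* stand-in object, naturally aligned */
    struct toy_page page;

    page.mapping = (void *)((unsigned long)&anon_vma + PAGE_MAPPING_ANON);
    printf("anon? %d\n", page_is_anon(&page));  /* prints 1 */
    return 0;
}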
1) How is one linear region's page released?
shrink_active_list runs when the inactive list is smaller than the active list, and shrinks the active list.
move_active_pages_to_lru: list_move(&page->lru, &lruvec->lists[lru]) links pages back onto the corresponding list.
1) mark_page_accessed: moves a page from the inactive list toward the active list, again using two chances:
inactive reference=0 -> inactive reference=1 -> active reference=0 -> active reference=1
2) page_referenced: moves a page from active toward inactive; each pass of the replacement scan ages the page once.
3) user-process pages are all allocated through the page-fault handler handle_pte_fault and are all reclaimable, split into file pages and anonymous pages.
4) handle_pte_fault (do_swap_page swaps in, pageout swaps out; it runs when the present bit is clear, i.e. the page is not in memory. Anonymous pages go onto the active LRU; file pages onto the inactive LRU.)

【Misc】

1 A way to trigger dynamic reclaim repeatedly; under memory pressure, text (code) pages may be reclaimed:

static int wake_all_kswapd_debug(void)
{
    int j;

    /* trigger reclaim every two seconds, 30 times */
    for (j = 0; j < 30; j++) {
        /* order 8 means 256 contiguous pages, i.e. 1MB; NODE_DATA(0) is the ZONE_NORMAL node */
        wake_all_kswapd(8, NODE_DATA(0)->node_zonelists, 0, 0);
        msleep(2000);
    }
    return 0;
}
2 To confirm that the test process's text segment and shared libraries get reloaded, add a print to filemap_fault:
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    ...
    if (strncmp(current->comm, "test", 4) == 0) {
        printk("[%s]filemap_fault:%s\n",
               current->comm, file->f_path.dentry->d_iname);
    }
    ...
}

3 To count how many file pages are allocated when a shared library is loaded into RAM, instrument each of the allocation sites identified above.

File pages generally enter the page cache through add_to_page_cache_lru, so the count can be taken there:

1> do_generic_file_read: count after do_generic_file_read->add_to_page_cache_lru;

2> filemap_fault->page_cache_read->add_to_page_cache_lru: count after add_to_page_cache_lru;

3> filemap_fault->__do_page_cache_readahead->read_pages->add_to_page_cache_lru: count after add_to_page_cache_lru;

4> do_generic_file_read/filemap_fault->(mapping->a_ops->readpage=squashfs_readpage)->

squashfs_readpage->grab_cache_page_nowait: count here as well, excluding the single file page passed in as the argument;

File pages are released through:

shrink_page_list->remove_mapping->__remove_mapping->__delete_from_page_cache: count after this.
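A minimal sketch of such instrumentation, assuming add_to_page_cache_lru itself is patched (the counter name and the current->comm filter are made up for illustration):

/* hypothetical global counter of file pages added to the page cache */
static atomic_t debug_file_pages = ATOMIC_INIT(0);

int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                          pgoff_t offset, gfp_t gfp_mask)
{
    int ret = add_to_page_cache(page, mapping, offset, gfp_mask);

    if (ret == 0) {
        lru_cache_add_file(page);
        /* count only pages touched while the process under test runs */
        if (strncmp(current->comm, "test", 4) == 0)
            printk("file pages: %d\n", atomic_inc_return(&debug_file_pages));
    }
    return ret;
}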
