node

来源:互联网 发布:python append和extend 编辑:程序博客网 时间:2024/06/05 07:34



日期:2017-07-04 | 内核版本:Linux-4.12 | 架构:X86 | 作者:lwhuq | GitHub:LinuxMemoryStudy | CSDN:Linux内存管理


  在NUMA多CPU架构下,每个CPU都挂载有自己的本地内存,CPU之间通过总线连接。每个CPU访问本地内存的速度都比访问远程内存快。Linux系统把每个CPU的本地内存资源用一个结点(node)表示。


1 pg_data_t结构

  pg_data_t的定义在include/linux/mmzone.h#L601

typedef struct pglist_data {//一个结构数组,包含了结点中各内存域的数据结构zonestruct zone node_zones[MAX_NR_ZONES];//指定了备用结点机器内存域的列表,以便在当前结点没有可用空间时,在备用结点分配内存struct zonelist node_zonelists[MAX_ZONELISTS];//内存域的个数        int nr_zones;#ifdef CONFIG_FLAT_NODE_MEM_MAP/* means !SPARSEMEM *///指向结点的第一个页框的页结构,该页结构位于全局mem_map中某个位置struct page *node_mem_map;#ifdef CONFIG_PAGE_EXTENSIONstruct page_ext *node_page_ext;#endif#endif#ifndef CONFIG_NO_BOOTMEM//启动内存分配器struct bootmem_data *bdata;#endif#ifdef CONFIG_MEMORY_HOTPLUG/* * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant.  Holding this will also * guarantee that any pfn_valid() stays that way. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. * * Nests above zone->lock and zone->span_seqlock */spinlock_t node_size_lock;#endif//结点起始页框unsigned long node_start_pfn;//结点总页框数(不包含洞)unsigned long node_present_pages; /* total number of physical pages *///结点总页框数(包含洞)unsigned long node_spanned_pages; /* total size of physical page     range, including holes *///结点idint node_id;//交换守护进程的等待列表wait_queue_head_t kswapd_wait;//本结点交换守护进程wait_queue_head_t pfmemalloc_wait;struct task_struct *kswapd;/* Protected by   mem_hotplug_begin/end() */int kswapd_order;enum zone_type kswapd_classzone_idx;int kswapd_failures;/* Number of 'reclaimed == 0' runs */#ifdef CONFIG_COMPACTIONint kcompactd_max_order;enum zone_type kcompactd_classzone_idx;wait_queue_head_t kcompactd_wait;struct task_struct *kcompactd;#endif#ifdef CONFIG_NUMA_BALANCING/* Lock serializing the migrate rate limiting window */spinlock_t numabalancing_migrate_lock;/* Rate limiting time interval */unsigned long numabalancing_migrate_next_window;/* Number of pages migrated during the rate limiting time interval */unsigned long numabalancing_migrate_nr_pages;#endif/* * This is a per-node reserve of pages that are not available * to userspace allocations. 
*/unsigned longtotalreserve_pages;#ifdef CONFIG_NUMA/* * zone reclaim becomes active if more unmapped pages exist. */unsigned longmin_unmapped_pages;unsigned longmin_slab_pages;#endif /* CONFIG_NUMA *//* Write-intensive fields used by page reclaim */ZONE_PADDING(_pad1_)spinlock_tlru_lock;#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT/* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */unsigned long first_deferred_pfn;unsigned long static_init_size;#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */#ifdef CONFIG_TRANSPARENT_HUGEPAGEspinlock_t split_queue_lock;struct list_head split_queue;unsigned long split_queue_len;#endif/* Fields commonly accessed by the page reclaim scanner */struct lruveclruvec;/* * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on * this node's LRU.  Maintained by the pageout code. */unsigned int inactive_ratio;unsigned longflags;ZONE_PADDING(_pad2_)/* Per-node vmstats */struct per_cpu_nodestat __percpu *per_cpu_nodestats;atomic_long_tvm_stat[NR_VM_NODE_STAT_ITEMS];} pg_data_t;

1.1 结点的内存域

  结点管理的内存再细分成内存域。
typedef struct pglist_data {        //一个结构数组,包含了结点中各内存域的数据结构zone struct zone node_zones[MAX_NR_ZONES];         //指定了备用结点机器内存域的列表,以便在当前结点没有可用空间时,在备用结点分配内存 struct zonelist node_zonelists[MAX_ZONELISTS];        //内存域的个数         int nr_zones;}

  • node_zones[MAX_NR_ZONES]管理着本地内存的最多MAX_NR_ZONES个内存域
  • node_zonelists[MAX_ZONELISTS]指定了备用结点及内存域的列表。可以想象这些备用结点及内存域都是远程内存
  • nr_zones结点内存域的个数

1.2 结点的内存页

/* Excerpt of pg_data_t: only the page-frame related members are shown. */
typedef struct pglist_data {
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
    /*
     * Points to the struct page of the node's first page frame; that
     * struct page lives somewhere inside the global mem_map.
     */
    struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
    struct page_ext *node_page_ext;
#endif
#endif
    /* First page frame of the node. */
    unsigned long node_start_pfn;
    /* Number of page frames in the node, holes excluded. */
    unsigned long node_present_pages; /* total number of physical pages */
    /* Number of page frames in the node, holes included. */
    unsigned long node_spanned_pages; /* total size of physical page
                                         range, including holes */
} pg_data_t;

  在每个结点的结构pg_data_t内有一个指向页结构page的指针node_mem_map。pg_data_t->node_mem_map指向本结点所管理的物理内存中第一个页框对应的page结构。

/* Excerpt of pg_data_t: only node_mem_map is shown. */
typedef struct pglist_data {
    /*
     * Points to the struct page of the node's first page frame; that
     * struct page lives somewhere inside the global mem_map.
     */
    struct page *node_mem_map;
} pg_data_t;
  pg_data_t->node_mem_map的初始化在alloc_node_mem_map中完成,定义在mm/page_alloc.c#L6096
/*
 * Set up pgdat->node_mem_map if it has not been set already.
 *
 * The span covered by the map starts at node_start_pfn masked down with
 * ~(MAX_ORDER_NR_PAGES - 1) and ends at pgdat_end_pfn() aligned up to
 * MAX_ORDER_NR_PAGES.  The map is requested from alloc_remap() first and
 * from memblock_virt_alloc_node_nopanic() if that returns NULL.
 * node_mem_map is then set to the entry that corresponds to
 * node_start_pfn, i.e. @offset entries into the allocated map.
 */
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
    unsigned long start, offset;

    start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    offset = pgdat->node_start_pfn - start;
    /* ia64 gets its own node_mem_map, before this, without bootmem */
    if (!pgdat->node_mem_map) {
        unsigned long size, end;
        struct page *map;

        /*
         * The zone's endpoints aren't required to be MAX_ORDER
         * aligned but the node_mem_map endpoints must be in order
         * for the buddy allocator to function correctly.
         */
        end = pgdat_end_pfn(pgdat);
        end = ALIGN(end, MAX_ORDER_NR_PAGES);
        size = (end - start) * sizeof(struct page);
        map = alloc_remap(pgdat->node_id, size);
        if (!map)
            map = memblock_virt_alloc_node_nopanic(size,
                                                   pgdat->node_id);
        pgdat->node_mem_map = map + offset;
    }
}

1.3 交换守护进程

typedef struct pglist_data {        //交换守护进程的等待列表 wait_queue_head_t kswapd_wait;wait_queue_head_t pfmemalloc_wait;        //本结点交换守护进程struct task_struct *kswapd;/* Protected by   mem_hotplug_begin/end() */int kswapd_order;enum zone_type kswapd_classzone_idx;int kswapd_failures; /* Number of 'reclaimed == 0' runs */} pg_data_t;

2 结点状态

  当系统中有超过一个结点时,内核会维护一个位图node_states用以提供各个结点的状态信息,其定义在include/linux/nodemask.h#L381
/*
 * Indices into the node_states[] bitmap array; each enumerator names one
 * state a node can be in.
 */
enum node_states {
    N_POSSIBLE,         /* The node could become online at some point */
    N_ONLINE,           /* The node is online */
    N_NORMAL_MEMORY,    /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
    N_HIGH_MEMORY,      /* The node has regular or high memory */
#else
    /* Without CONFIG_HIGHMEM this is an alias of N_NORMAL_MEMORY. */
    N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
#ifdef CONFIG_MOVABLE_NODE
    N_MEMORY,           /* The node has memory(regular, high, movable) */
#else
    /* Without CONFIG_MOVABLE_NODE this is an alias of N_HIGH_MEMORY. */
    N_MEMORY = N_HIGH_MEMORY,
#endif
    N_CPU,              /* The node has one or more cpus */
    NR_NODE_STATES
};
  结点位图的实例node_states定义在mm/page_alloc.c#L122, 当某个node处在某个状态时,对应状态位的node位就会被置起。
/*
 * One node mask per node state.  N_POSSIBLE starts with all nodes set and
 * N_ONLINE with only bit 0 set; when CONFIG_NUMA is not defined the memory
 * and CPU masks also start with bit 0 set.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
    [N_POSSIBLE] = NODE_MASK_ALL,
    [N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
    [N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
    [N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
    [N_MEMORY] = { { [0] = 1UL } },
#endif
    [N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);

  • N_POSSIBLE, N_ONLINE和N_CPU用于CPU和内存的热插拔
  • N_NORMAL_MEMORY, N_HIGH_MEMORY用于普通内存管理
  • N_MEMORY表示有物理内存的结点
  几个辅助函数用于设置或清除位域或特定结点中的一个bit。定义在include/linux/nodemask.h#L407

/* Return node_isset() for @node in the mask node_states[@state]. */
static inline int node_state(int node, enum node_states state)
{
    return node_isset(node, node_states[state]);
}

/* Set @node in the mask node_states[@state] via __node_set(). */
static inline void node_set_state(int node, enum node_states state)
{
    __node_set(node, &node_states[state]);
}

/* Clear @node in the mask node_states[@state] via __node_clear(). */
static inline void node_clear_state(int node, enum node_states state)
{
    __node_clear(node, &node_states[state]);
}

/* Return nodes_weight() of the mask node_states[@state]. */
static inline int num_node_state(enum node_states state)
{
    return nodes_weight(node_states[state]);
}

3 查找内存结点

  内存结点的实例为node_data[MAX_NUMNODES],定义在arch/x86/mm/numa.c#L26
/* Table of per-node descriptors, indexed by node id. */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

  内存结点最大数目由MAX_NUMNODES决定,定义在include/linux/numa.h#L11

/*
 * NODES_SHIFT comes from the kernel configuration when CONFIG_NODES_SHIFT
 * is set and defaults to 0 otherwise.
 */
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT     CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT     0
#endif

/*
 * Upper bound on the number of nodes.  This must be defined outside the
 * conditional above so that it exists in both configurations.
 */
#define MAX_NUMNODES    (1 << NODES_SHIFT)

  宏NODE_DATA(nid)可以根据node id找到node_data结构实例,定义在arch/x86/include/asm/mmzone_32.h#L13和arch/x86/include/asm/mmzone_64.h#L14
/* Yield the node_data[] entry for node id @nid. */
#define NODE_DATA(nid)      (node_data[nid])

3.1 查找node id

  宏first_online_node用于得到第一个online的node,定义在include/linux/nodemask.h#L430
/* Result of first_node() applied to the N_ONLINE node mask. */
#define first_online_node   first_node(node_states[N_ONLINE])
  宏 first_memory_node得到第一个有memory的node,定义在include/linux/nodemask.h#L431
/* Result of first_node() applied to the N_MEMORY node mask. */
#define first_memory_node   first_node(node_states[N_MEMORY])
  宏next_node(n, src)得到某个node state状态src的下一个被置起的node id,定义在include/linux/nodemask.h#L258
#define next_node(n, src) __next_node((n), &(src))static inline int __next_node(int n, const nodemask_t *srcp){return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));}
  函数next_online_node得到下一个online的node,定义在include/linux/nodemask.h#L432
/* Return next_node() for @nid within the N_ONLINE node mask. */
static inline int next_online_node(int nid)
{
    return next_node(nid, node_states[N_ONLINE]);
}
  函数next_memory_node得到下一个有memory的node,定义在include/linux/nodemask.h#L436
/* Return next_node() for @nid within the N_MEMORY node mask. */
static inline int next_memory_node(int nid)
{
    return next_node(nid, node_states[N_MEMORY]);
}

3.2 node id的遍历

  宏for_each_node_state(__node, __state)用来遍历处于特定状态的所有结点,定义在include/linux/nodemask.h#L427
/*
 * Loop header: iterate @__node over the mask node_states[@__state] using
 * for_each_node_mask().
 */
#define for_each_node_state(__node, __state) \
    for_each_node_mask((__node), node_states[__state])
  宏for_each_node(node)用来迭代处于N_POSSIBLE状态的所有结点,定义在include/linux/nodemask.h#L507
/* Loop header: for_each_node_state() fixed to the N_POSSIBLE state. */
#define for_each_node(node) \
    for_each_node_state(node, N_POSSIBLE)
  宏for_each_online_node(node)用来遍历处于N_ONLINE状态的所有结点,定义在include/linux/nodemask.h#L508
/* Loop header: for_each_node_state() fixed to the N_ONLINE state. */
#define for_each_online_node(node) \
    for_each_node_state(node, N_ONLINE)

3.3 查找pg_data_t结构

  函数first_online_pgdat得到第一个online的pg_data结构的指针,定义在mm/mmzone.c#L12
/* Return the NODE_DATA() entry for first_online_node. */
struct pglist_data *first_online_pgdat(void)
{
    return NODE_DATA(first_online_node);
}
  函数next_online_pgdat(pgdat)得到下一个online的pg_data结构的指针,定义在mm/mmzone.c#L17
/*
 * Return the NODE_DATA() entry for the node id that next_online_node()
 * reports after @pgdat->node_id, or NULL when that id equals MAX_NUMNODES.
 */
struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
{
    int nid = next_online_node(pgdat->node_id);

    if (nid == MAX_NUMNODES)
        return NULL;

    return NODE_DATA(nid);
}

3.4 pg_data_t结构的遍历

  宏for_each_online_pgdat(pgdat)用来遍历所有online的pg_data_t结构指针,定义在include/linux/mmzone.h#L908
/*
 * Loop header: start at first_online_pgdat(), advance with
 * next_online_pgdat(), and stop once @pgdat evaluates to false.
 */
#define for_each_online_pgdat(pgdat)            \
    for (pgdat = first_online_pgdat();          \
         pgdat;                                 \
         pgdat = next_online_pgdat(pgdat))