Linux内存管理源码分析
内核 pwn 学到 UAF 时发现又不太行了:虽说操作系统课上的知识没什么问题,但对目前 Linux 实际使用的内存管理还是不够了解,因此在这里再来浅浅分析一下。整体的数据结构部分,Linux 采用 node、zone、page 三级来表示,接下来我们分别叙述。涉及到源码的地方,大家可以点击下面链接查看相应版本的 Linux 内核源码:
Linux 内核源码
本篇主要是个人跟随 arttnba3 师傅:
arttnba3师傅个人博客
和 cft56200_ln 师傅:
cft56200_ln师傅博客
这两位大牛写的内存管理笔记整理而成。a 师傅写得比较详细,c 师傅比较简化,但看懂简化版的前提是先看明白 a 师傅的部分博客(反正跳不开 a 师傅,他真的太细了)
1. 数据结构部分
node节点
我们首先需要知道,从内存访问架构来讲,一般可以分为以下两种方式:
- UMA(一致性内存访问,Uniform Memory Access):全局只有一个 node,多个 CPU 通过一根总线访问内存,访问时间一致,类似 SMP;
- NUMA(非一致性内存访问,Non-Uniform Memory Access):每个 CPU 分配一块本地内存,系统中存在多个 node,访问不同 node 时的访问时间有所区别。
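在翻结构体之前,可以先在用户态直观确认一下自己机器上有哪些 node。下面是一个小草图,假设内核把 NUMA 信息导出在 /sys/devices/system/node/online(常见发行版内核都有这个节点,非 NUMA 机器一般只会看到 0):

```c
#include <stdio.h>

int main(void)
{
    char buf[64];
    FILE *f = fopen("/sys/devices/system/node/online", "r");

    if (!f) {
        perror("open /sys/devices/system/node/online");
        return 1;
    }
    /* 内容形如 "0" 或 "0-1",即当前在线的 node 编号范围 */
    if (fgets(buf, sizeof(buf), f))
        printf("online nodes: %s", buf);
    fclose(f);
    return 0;
}
```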
而 node 采用 pglist_data 结构体进行描述,定义在 /include/linux/mmzone.h 中,如下:
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
* describes the whole memory.(NUMA架构每个node都有个此结构来描述内存布局,而UMA就一个)
*
* Memory statistics and page replacement data structures are maintained on a
* per-zone basis.
*/
typedef struct pglist_data {
/*
* node_zones contains just the zones for THIS node. Not all of the
* zones may be populated, but it is the full list. It is referenced by
* this node's node_zonelists as well as other node's node_zonelists.
*/
struct zone node_zones[MAX_NR_ZONES];
/*
* node_zonelists contains references to all zones in all nodes.
* Generally the first zones will be references to this node's
* node_zones.
*/
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
* Also synchronizes pgdat->first_deferred_pfn during deferred page
* init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
* or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
* node reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct deferred_split deferred_split_queue;
#endif
/* Fields commonly accessed by the page reclaim scanner */
/*
* NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
*
* Use mem_cgroup_lruvec() to look up lruvecs.
*/
struct lruvec __lruvec;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
下面单独指出一些重要字段:
- node_zones:一个 struct zone 类型的数组,只包含属于本 node 的各个 zone。注意并非所有 zone 都一定会被填充(populated),但它是完整的列表;它会被本 node 以及其他 node 的 node_zonelists 所引用
- node_zonelists:不标英语了,看着烦人,这里直接写它的含义:它用于确定内存分配时对备用 zone 的搜索顺序,其中也可以包含非本 node 的 zone;通常它引用的第一个 zone 就是本 node 的 node_zones 数组里的第一个。其实 struct zonelist 就是一组指向 zone 的引用加上一些辅助信息,我们可以看看它的数据结构,这里直接引用 arttnba3 师傅的笔记,如下:
/*
* 单次分配请求在一个 zonelist 上操作. 一个 zonelist 便是一组 zone 的列表,
* 其中第一个 zone 为分配的“目标”,而其他的 zone 为后备的zone,优先级降低。
*
* 为了提高 zonelist 的读取速度, 在 zonerefs 中包含正在被读取的 entry 的 zone index。
* 用来访问所给的 zoneref 结构体信息的帮助函数有:
*
* zonelist_zone() - 返回一个 struct zone 的指针作为 _zonerefs 中的一个 entry
* zonelist_zone_idx() - 返回作为 entry 的 zone 的 index
* zonelist_node_idx() - 返回作为 entry 的 node 的 index
*/
struct zonelist {
struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};
其中是一个 struct zoneref 数组,接下来再看看这个结构体:
/*
* 该结构包含了 zonelist 中一个 zone 的信息。
* 其被储存在这里以预防对大结构体的解引用与对表的查询。
*/
struct zoneref {
struct zone *zone; /* 指向实际上的 zone 的指针 */
int zone_idx; /* zone_idx(zoneref->zone) */
};
可以看到它就是一个指向 zone 的指针加上对应的 zone 下标而已
- nr_zones:记录了该 node 中已被填充(populated)的 zone 数量
- node_start_pfn:该 node 起始页的页框号,这里的 pfn 我们在之后讲解,可以先理解为该 node 起始物理页框的编号
- node_present_pages:该 node 中实际存在的物理页总数
- node_spanned_pages:该 node 跨越的物理页范围大小,包含空洞在内
- node_id:记录该 node 在系统中的编号,从 0 开始
知道了其中的一些数据结构,接下来我们了解一下 node 的存储方式:在上面的网站中查找源码,可以在 /arch/x86/mm/numa.c 中看到定义了一个存放 pglist_data 指针的全局数组 node_data[]:
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
其中存放着指向我们所有 node 的指针,下面来一个好图,为啥大伙画图都这么专业捏
当我们知晓了 node 节点的存储方式,还需要另一个数组 node_states 来描述对应 node 节点的状态,它定义在 /mm/page_alloc.c 当中,也是一个全局数组(我是真佩服写 Linux 的这一群大佬,这文件的分布情况跟我自己写的那个操作系统相比简直天壤之别啊)
/*
* Array of node states.
*/
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);
而 node_states 的枚举类型定义在 /include/linux/nodemask.h 中,这里仍然直接引用 arttnba3 师傅的翻译:
/*
* 位掩码将为所有节点保存
*/
enum node_states {
N_POSSIBLE, /* 节点在某个时刻是联机的 */
N_ONLINE, /* 节点是联机的 */
N_NORMAL_MEMORY, /* 节点有着普通的内存 */
#ifdef CONFIG_HIGHMEM
N_HIGH_MEMORY, /* 节点有着普通或高端内存 */
#else
N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
N_MEMORY, /* 节点有着内存(普通,高端,可移动) */
N_CPU, /* 节点有着一个或多个 cpu */
N_GENERIC_INITIATOR, /* 节点有一个或多个 Generic Initiators */
NR_NODE_STATES
};
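为了直观理解 nodemask_t 这种位掩码,这里补一个用户态的极简模拟(纯示意,并非内核实现;node_set/node_isset 的名字仿照内核同名接口的语义,但实现是简化的):

```c
#include <stdio.h>

#define MAX_NUMNODES  64
#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* 极简版 nodemask_t:一个位图,每个 node 占 1 bit */
typedef struct {
    unsigned long bits[(MAX_NUMNODES + BITS_PER_LONG - 1) / BITS_PER_LONG];
} nodemask_sim_t;

static void node_set(int node, nodemask_sim_t *mask)
{
    mask->bits[node / BITS_PER_LONG] |= 1UL << (node % BITS_PER_LONG);
}

static int node_isset(int node, const nodemask_sim_t *mask)
{
    return !!(mask->bits[node / BITS_PER_LONG] & (1UL << (node % BITS_PER_LONG)));
}

int main(void)
{
    nodemask_sim_t online = { { 0 } };

    node_set(0, &online);   /* 效果类似上面的 [N_ONLINE] = { { [0] = 1UL } } */
    printf("node0 online? %d, node1 online? %d\n",
           node_isset(0, &online), node_isset(1, &online));
    return 0;
}
```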
说完node,我来绘个图吧,这里老抄作业好像体现不出自己真正学到了东西
我们将在之后一步一步慢慢完善这个图片
zone区域
同样地,先说其数据结构 struct zone,它位于 /include/linux/mmzone.h:
struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long _watermark[NR_WMARK];
unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
/*
* We don't know if the memory that we're going to allocate will be
* freeable or/and it will be released eventually, so to avoid totally
* wasting several GB of ram we must reserve some of the lower zone
* memory (otherwise we risk to run OOM on the lower zones despite
* there being tons of freeable ram on the higher zones). This array is
* recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
* changes.
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
int node;
#endif
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
/*
* the high and batch values are copied to individual pagesets for
* faster access
*/
int pageset_high;
int pageset_batch;
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
* In SPARSEMEM, this map is stored in struct mem_section
*/
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
*
* managed_pages is present pages managed by the buddy system, which
* is calculated as (reserved_pages includes pages allocated by the
* bootmem allocator):
* managed_pages = present_pages - reserved_pages;
*
* So present_pages may be used by memory hotplug or memory power
* management logic to figure out unmanaged pages by checking
* (present_pages - managed_pages). And managed_pages should be used
* by page allocator and vm scanner to calculate all kinds of watermarks
* and thresholds.
*
* Locking rules:
*
* zone_start_pfn and spanned_pages are protected by span_seqlock.
* It is a seqlock because it has to be read outside of zone->lock,
* and it is done in the main allocator path. But, it is written
* quite infrequently.
*
* The span_seq lock is declared along with zone->lock because it is
* frequently read in proximity to zone->lock. It's good to
* give them a chance of being in the same cacheline.
*
* Write access to present_pages at runtime should be protected by
* mem_hotplug_begin/end(). Any reader who can't tolerant drift of
* present_pages should get_online_mems() to get a stable value.
*/
atomic_long_t managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
const char *name;
#ifdef CONFIG_MEMORY_ISOLATION
/*
* Number of isolated pageblock. It is used to solve incorrect
* freepage counting problem due to racy retrieving migratetype
* of pageblock. Protected by zone->lock.
*/
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* see spanned/present_pages for more description */
seqlock_t span_seqlock;
#endif
int initialized;
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
/* Primarily protects free_area */
spinlock_t lock;
/* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
* drift allowing watermarks to be breached
*/
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
/* pfn where compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
/*
* On compaction failure, 1<<compact_defer_shift compactions
* are skipped before trying again. The number attempted since
* last failure is tracked with compact_considered.
* compact_order_failed is the minimum compaction failed order.
*/
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* Set to true when the PG_migrate_skip bits should be cleared */
bool compact_blockskip_flush;
#endif
bool contiguous;
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
同样地,我们来了解其中比较重要的字段:
- _watermark:水位线,用几个阈值描述该 zone 剩余空闲页框的紧张程度,它有三个挡位,分别是 WMARK_MIN、WMARK_LOW、WMARK_HIGH,存放在 _watermark 数组当中。进行内存分配的时候,分配器会根据当前水位来采取不同的措施(后文有一个读取 /proc/zoneinfo 观察水位的小例子),下面搞个图:
- lowmem_reserve:当本 zone 没有足够空闲块时,分配会回退到其他 zone。但我们不能保证回退过去分配的内存是可释放的或最终会被释放的,如果放任本可以落在高端 zone 的分配全部落到低端 zone 上,就会出现低端 zone 提前耗尽、而高端 zone 还留着大量空闲内存的情况。因此声明该字段,为每个 zone 保留一段其他 zone 的分配不能动用的内存
- node:标识该 zone 所属的 node。当然,该字段只在 NUMA 下启用,UMA 中只有一个 node,不需要这个字段
- zone_pgdat:指向该 zone 所属的 pglist_data 节点,与上面的 node 字段对应
- pageset:由于目前普遍是多处理器架构,对临界区的同步互斥访问是一个严重的问题,而单纯靠加锁解锁来避免出错又十分浪费资源,因此每个 zone 都为每个 CPU 准备了一个单独的页面仓库。buddy system 会先把一部分页面放进各个 CPU 独享的页面仓库,需要分配时优先从其中分配,其类型结构体位于 /include/linux/mmzone.h:
struct per_cpu_pages {
int count; /* number of pages in the list */
int high; /* high watermark, emptying needed */
int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES]; //双链表指针数组,指向空闲页们
};
struct per_cpu_pageset {
struct per_cpu_pages pcp;
#ifdef CONFIG_NUMA
s8 expire;
u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
#endif
#ifdef CONFIG_SMP
s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};
此结构除了页面链表外还包含一些统计状态,它会被存放在每个 CPU 独立的 .data..percpu 段当中,下面再再再次引用 arttnba3 师傅的图,真的太🐂辣
看图好吧,这里的 order 其实就是伙伴系统中对不同大小页面分配请求的阶数
- flags:标识 zone 的状态
- vm_stat:统计数据,这是一个数组,数组大小取决于下面定义的枚举类型,如下:
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
NR_ZONE_INACTIVE_FILE,
NR_ZONE_ACTIVE_FILE,
NR_ZONE_UNEVICTABLE,
NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
/* Second 128 byte cacheline */
NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
NR_VM_ZONE_STAT_ITEMS };
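前面提到的 _watermark 水位线可以直接在 /proc/zoneinfo 里观察到。下面是一个读取它的小例子,把每个 zone 的 min/low/high 等行过滤出来(字符串匹配很粗糙,仅作观察用):

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
    char line[256];
    FILE *f = fopen("/proc/zoneinfo", "r");

    if (!f) {
        perror("open /proc/zoneinfo");
        return 1;
    }
    /* 只打印 zone 名称行和各水位行,其余统计项略过 */
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "Node") || strstr(line, "min") ||
            strstr(line, "low") || strstr(line, "high"))
            fputs(line, stdout);
    }
    fclose(f);
    return 0;
}
```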
讲完一般结构,这里需要注意:虽说我们的 node 节点中直接就是一个 zone 数组,但各个 zone 之间是有类型区别的,这在 /include/linux/mmzone.h 中有定义:
enum zone_type {
/*
* ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
* to DMA to all of the addressable memory (ZONE_NORMAL).
* On architectures where this area covers the whole 32 bit address
* space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
* DMA addressing constraints. This distinction is important as a 32bit
* DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
* platforms may need both zones as they support peripherals with
* different DMA addressing limitations.
*/
#ifdef CONFIG_ZONE_DMA
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/*
* ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
* movable pages with few exceptional cases described below. Main use
* cases for ZONE_MOVABLE are to make memory offlining/unplug more
* likely to succeed, and to locally limit unmovable allocations - e.g.,
* to increase the number of THP/huge pages. Notable special cases are:
*
* 1. Pinned pages: (long-term) pinning of movable pages might
* essentially turn such pages unmovable. Memory offlining might
* retry a long time.
* 2. memblock allocations: kernelcore/movablecore setups might create
* situations where ZONE_MOVABLE contains unmovable allocations
* after boot. Memory offlining and allocations fail early.
* 3. Memory holes: kernelcore/movablecore setups might create very rare
* situations where ZONE_MOVABLE contains memory holes after boot,
* for example, if we have sections that are only partially
* populated. Memory offlining and allocations fail early.
* 4. PG_hwpoison pages: while poisoned pages can be skipped during
* memory offlining, such pages cannot be allocated.
* 5. Unmovable PG_offline pages: in paravirtualized environments,
* hotplugged memory blocks might only partially be managed by the
* buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
* parts not manged by the buddy are unmovable PG_offline pages. In
* some cases (virtio-mem), such pages can be skipped during
* memory offlining, however, cannot be moved/allocated. These
* techniques might use alloc_contig_range() to hide previously
* exposed pages from the buddy again (e.g., to implement some sort
* of memory unplug in virtio-mem).
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
* have to expect that migrating pages in ZONE_MOVABLE can fail (even
* if has_unmovable_pages() states that there are no unmovable pages,
* there can be false negatives).
*/
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
__MAX_NR_ZONES
};
这里 x86 的 32 位与 64 位会有所区别,如下:
在 32 位中,zone 可以分为 ZONE_DMA、ZONE_NORMAL、ZONE_HIGHMEM,它们分别对应的物理地址范围为
ZONE_DMA:0~16MB
ZONE_NORMAL:16MB~896MB
ZONE_HIGHMEM:896MB 以上
前两种类型是线性映射(直接映射)进内核地址空间的,虚拟地址与物理地址之间只差一个固定偏移;而高端内存没有固定的直接映射,内核需要临时建立映射才能访问
在 64 位中有所区别,zone 分为如下三种
ZONE_DMA:0~16MB
ZONE_DMA32:16MB~4GB
ZONE_NORMAL:4GB 以上
64 位内核中取消了高端内存的概念。接着上面咱们画的图,这里我们把 zone 补上
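根据上面列出的 64 位边界,可以写个小程序帮助理解某个物理地址大致会落在哪个 zone。注意这只是按本文列出的典型边界做的示意,实际每个 zone 的范围由内核启动时根据具体机器的物理内存布局确定:

```c
#include <stdio.h>
#include <stdint.h>

/* 仅按本文列出的 x86-64 典型边界做示意 */
static const char *paddr_to_zone(uint64_t paddr)
{
    if (paddr < (16ULL << 20))
        return "ZONE_DMA";
    if (paddr < (4ULL << 30))
        return "ZONE_DMA32";
    return "ZONE_NORMAL";
}

int main(void)
{
    uint64_t samples[] = { 1ULL << 20, 64ULL << 20, 8ULL << 30 };

    for (int i = 0; i < 3; i++)
        printf("paddr 0x%llx -> %s\n",
               (unsigned long long)samples[i], paddr_to_zone(samples[i]));
    return 0;
}
```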
page页框
终于来到了咱们的页框,这里的 page 对应的是物理页框而不是虚拟页,注意咯。
它对应的数据结构是 struct page,位于 /include/linux/mm_types.h,如下:
struct page {
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
/*
* Five words (20/40 bytes) are available in this union.
* WARNING: bit 0 of the first word is used for PageTail(). That
* means the other users of this union MUST NOT use the bit to
* avoid collision and false-positive PageTail().
*/
union {
struct { /* Page cache and anonymous pages */
/**
* @lru: Pageout list, eg. active_list protected by
* lruvec->lru_lock. Sometimes used as a generic list
* by the page owner.
*/
struct list_head lru;
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
pgoff_t index; /* Our offset within mapping. */
/**
* @private: Mapping-private opaque data.
* Usually used for buffer_heads if PagePrivate.
* Used for swp_entry_t if PageSwapCache.
* Indicates order in the buddy system if PageBuddy.
*/
unsigned long private;
};
struct { /* page_pool used by netstack */
/**
* @dma_addr: might require a 64-bit value on
* 32-bit architectures.
*/
unsigned long dma_addr[2];
};
struct { /* slab, slob and slub */
union {
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
unsigned long counters; /* SLUB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};
struct { /* Tail pages of compound page */
unsigned long compound_head; /* Bit zero is set */
/* First tail page only */
unsigned char compound_dtor;
unsigned char compound_order;
atomic_t compound_mapcount;
unsigned int compound_nr; /* 1 << compound_order */
};
struct { /* Second tail page of compound page */
unsigned long _compound_pad_1; /* compound_head */
atomic_t hpage_pinned_refcount;
/* For both global and memcg */
struct list_head deferred_list;
};
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
unsigned long _pt_pad_2; /* mapping */
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
};
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
void *zone_device_data;
/*
* ZONE_DEVICE private pages are counted as being
* mapped so the next 3 words hold the mapping, index,
* and private fields from the source anonymous or
* page cache page while the page is migrated to device
* private memory.
* ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
* use the mapping, index, and private fields when
* pmem backed DAX files are mapped.
*/
};
/** @rcu_head: You can use this to free a page by RCU. */
struct rcu_head rcu_head;
};
union { /* This union is 4 bytes in size. */
/*
* If the page can be mapped to userspace, encodes the number
* of times this page is referenced by a page table.
*/
atomic_t _mapcount;
/*
* If the page is neither PageSlab nor mappable to userspace,
* the value stored here may help determine what this page
* is used for. See page-flags.h for a list of page types
* which are currently stored here.
*/
unsigned int page_type;
unsigned int active; /* SLAB */
int units; /* SLOB */
};
/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
atomic_t _refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
} _struct_page_alignment;
老样子,先解释关键字段
- lru:LRU(最近最少使用)这个概念在计算机组成原理或操作系统课程里面都会讲解,这里不过多描述。在 Linux 内核当中,page 通过该字段被组织成链表
- slab 相关:下面这个匿名结构体用来存放 slab 分配器相关的成员
struct { /* slab, slob and slub */
union {
struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
int pages; /* Nr of pages left */
int pobjects; /* Approximate count */
#else
short int pages;
short int pobjects;
#endif
};
};
struct kmem_cache *slab_cache; /* not slob */
/* Double-word boundary */
void *freelist; /* first free object */
union {
void *s_mem; /* slab: first object */
unsigned long counters; /* SLUB */
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
};
下面给出又一张十分详细的图,是由简·李奥师傅所作
- flags:表示该页所处的状态,各标志位定义于 include/linux/page-flags.h 当中,是一个枚举类型,如下:
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
PG_error,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
PG_head, /* A head page */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
#endif
#ifdef CONFIG_64BIT
PG_arch_2,
#endif
__NR_PAGEFLAGS,
/* Filesystems */
PG_checked = PG_owner_priv_1,
/* SwapBacked */
PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
*/
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
/* Pinned in Xen as a read-only pagetable page. */
PG_pinned = PG_owner_priv_1,
/* Pinned as part of domain save (see xen_mm_pin_all()). */
PG_savepinned = PG_dirty,
/* Has a grant mapping of another (foreign) domain's page. */
PG_foreign = PG_owner_priv_1,
/* Remapped by swiotlb-xen. */
PG_xen_remapped = PG_owner_priv_1,
/* SLOB */
PG_slob_free = PG_private,
/* Compound pages. Stored in first tail page's flags */
PG_double_map = PG_workingset,
/* non-lru isolated movable page */
PG_isolated = PG_reclaim,
/* Only valid for buddy pages. Used to track pages that are reported */
PG_reported = PG_uptodate,
};
这里采用了复用的手法,也就是说 flags 字段还容纳了其他信息,其布局划分位于 /include/linux/page-flags-layout.h 当中,如下:
/*
* page->flags layout:
*
* There are five possibilities for how page->flags get laid out. The first
* pair is for the normal case without sparsemem. The second pair is for
* sparsemem when there is plenty of space for node and section information.
* The last is when there is insufficient space in page->flags and a separate
* lookup is necessary.
*
* No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
* " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS |
* classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
* " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS |
* classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
*/
可以看到在不同布局下,flags 的高位其实还编码了该页所归属的 zone 和 node
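为了直观理解这种"把 node/zone 编码进 flags 高位"的复用手法,下面用用户态 C 做个小模拟。字段宽度(这里假设 node 占 10 位、zone 占 3 位)纯属示意,真实宽度和偏移由 page-flags-layout.h 按内核配置计算:

```c
#include <stdio.h>

/* 假设的字段宽度,非内核实际值 */
#define NODE_BITS  10
#define ZONE_BITS  3
#define FLAG_BITS  (64 - NODE_BITS - ZONE_BITS)

static unsigned long pack(unsigned long node, unsigned long zone, unsigned long flags)
{
    /* node 在最高位,zone 紧随其后,低位才是真正的标志位 */
    return (node << (ZONE_BITS + FLAG_BITS)) | (zone << FLAG_BITS) | flags;
}

static unsigned long page_to_nid_sim(unsigned long f)
{
    return f >> (ZONE_BITS + FLAG_BITS);
}

static unsigned long page_zonenum_sim(unsigned long f)
{
    return (f >> FLAG_BITS) & ((1UL << ZONE_BITS) - 1);
}

int main(void)
{
    unsigned long f = pack(1, 2, 0x15);   /* node 1, zone 2, 低位是标志位 */

    printf("nid=%lu zone=%lu low flags=%#lx\n",
           page_to_nid_sim(f), page_zonenum_sim(f),
           f & ((1UL << FLAG_BITS) - 1));
    return 0;
}
```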
- _mapcount:记录该页被页表映射的次数,初始值为 -1。它是一个根据不同情况复用的联合体:如果该页被映射到用户空间,它记录被映射的次数;但若该页既没被映射到用户空间,也不是 PageSlab,那么它被用作 page_type 字段(本列表之后有个用 /proc/kpagecount 观察映射次数的小例子),相关取值定义于 /include/linux/page-flags.h 当中,如下:
/*
* For pages that are never mapped to userspace (and aren't PageSlab),
* page_type may be used. Because it is initialised to -1, we invert the
* sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
* __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and
* low bits so that an underflow or overflow of page_mapcount() won't be
* mistaken for a page type value.
*/
#define PAGE_TYPE_BASE 0xf0000000
/* Reserve 0x0000007f to catch underflows of page_mapcount */
#define PAGE_MAPCOUNT_RESERVE -128
#define PG_buddy 0x00000080
#define PG_offline 0x00000100
#define PG_table 0x00000200
#define PG_guard 0x00000400
- _refcount:该页在内核中的引用计数,空闲时为 0,分配出来后置 1;大于 0 表示正在被使用,等于 0 表示空闲或将要被释放。内核函数 get_page() 和 put_page() 会对引用计数进行增减,后者在引用计数减为 0(即减之前为 1)时会调用 __put_single_page() 释放该页面
- virtual:指向该物理页框在内核地址空间中对应的虚拟地址(之前的疑问是:它被多个页表映射了咋办?其实这个字段记录的只是内核侧的映射地址,主要用于高端内存 kmap 的场景,与用户态页表的映射无关)
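顺带补一个用户态观察的小实验:通过 /proc/self/pagemap 可以查到自己某个虚拟页对应的 PFN,再用 /proc/kpagecount 看该物理页被映射的次数(与 _mapcount 对应的观测值)。需要 root 权限才能读到真实 PFN,下面只是个示意性的小例子:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
    long psize = sysconf(_SC_PAGESIZE);
    char *buf;
    uint64_t entry = 0, pfn, cnt = 0;
    int pm, kc;

    /* 申请一页并写入,确保它确实分配到了物理页框 */
    if (posix_memalign((void **)&buf, psize, psize))
        return 1;
    memset(buf, 0xaa, psize);

    pm = open("/proc/self/pagemap", O_RDONLY);
    if (pm < 0) { perror("pagemap"); return 1; }
    /* pagemap 中每个虚拟页对应 8 字节:bit63 表示 present,bit0-54 是 PFN */
    if (pread(pm, &entry, sizeof(entry),
              ((uint64_t)(uintptr_t)buf / psize) * 8) != sizeof(entry))
        return 1;
    close(pm);

    if (!(entry & (1ULL << 63))) { puts("page not present?"); return 1; }
    pfn = entry & ((1ULL << 55) - 1);     /* 非 root 读到的 PFN 会是 0 */
    printf("virt %p -> pfn %#llx\n", (void *)buf, (unsigned long long)pfn);

    kc = open("/proc/kpagecount", O_RDONLY);
    if (kc >= 0) {
        if (pread(kc, &cnt, sizeof(cnt), pfn * 8) == sizeof(cnt))
            printf("该物理页被映射的次数: %llu\n", (unsigned long long)cnt);
        close(kc);
    }
    return 0;
}
```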
说完数据结构,还记得上面 flags 在不同布局下对应的结构吗?Linux 一般提供了三种内存模型,定义在 /include/asm-generic/memory_model.h 中,常用模型是 sparsemem(离散/稀疏内存模型),所以我们只了解它。在这个模型下,内存中会存在一个 mem_section 类型的指针数组,其中元素指向的 mem_section 结构体的 section_mem_map 成员会指向一个 struct page 类型的数组,对应一段连续的物理地址空间,如下图所示
其中mem_section
结构体的定义在/include/linux/mmzone.h
当中,如下:
struct mem_section {
/*
* This is, logically, a pointer to an array of struct
* pages. However, it is stored with some other magic.
* (see sparse.c::sparse_init_one_section())
*
* Additionally during early boot we encode node id of
* the location of the section here to guide allocation.
* (see sparse.c::memory_present())
*
* Making it a UL at least makes someone do a cast
* before using it wrong.
*/
unsigned long section_mem_map;
struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
/*
* If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
* section. (see page_ext.h about this.)
*/
struct page_ext *page_ext;
unsigned long pad;
#endif
/*
* WARNING: mem_section must be a power-of-2 in size for the
* calculation and use of SECTION_ROOT_MASK to make sense.
*/
};
而我们的全局mem_section
数组存放着指向所有struct mem_section
结构体的指针,定义于/mm/sparse.c
当中:
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
____cacheline_internodealigned_in_smp;
#endif
咱们之前说到的数据结构里经常用 PFN 来表示物理内存的位置,但它本身并不是物理地址,而是对应某一个 page 的编号:pfn 即 page frame number,内核为每个物理页框所在的位置都编了个号(物理地址 = pfn << PAGE_SHIFT)。而我们要通过 PFN 找到 page,或通过 page 找到 PFN,都需要借助 mem_section 结构体中的 section_mem_map 来实现。
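为了理解这个两级查找,下面给出一个用户态的简化模拟:按 x86-64 常见配置假设 SECTION_SIZE_BITS 为 27(每个 section 管 128MB)、PAGE_SHIFT 为 12。真实内核的 section_mem_map 里还带有额外编码(SPARSEMEM_VMEMMAP 下另有更快的路径),这里只演示"先找 section、再索引 page 数组"的思路:

```c
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT        12
#define SECTION_SIZE_BITS 27
#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
#define NR_SECTIONS       8

struct page_sim { unsigned long flags; };

struct mem_section_sim {
    struct page_sim *section_mem_map;   /* 指向该 section 的 struct page 数组 */
};

static struct mem_section_sim sections[NR_SECTIONS];

/* 简化版 pfn_to_page:高位选 section,低位在 page 数组里做下标 */
static struct page_sim *pfn_to_page_sim(unsigned long pfn)
{
    struct mem_section_sim *ms = &sections[pfn >> PFN_SECTION_SHIFT];
    return ms->section_mem_map + (pfn & (PAGES_PER_SECTION - 1));
}

int main(void)
{
    unsigned long pfn = (3UL << PFN_SECTION_SHIFT) + 7;  /* 第 3 个 section 里的第 7 个页框 */

    for (int i = 0; i < NR_SECTIONS; i++)
        sections[i].section_mem_map = calloc(PAGES_PER_SECTION, sizeof(struct page_sim));

    printf("pfn %lu -> section %lu, struct page at %p\n",
           pfn, pfn >> PFN_SECTION_SHIFT, (void *)pfn_to_page_sim(pfn));
    return 0;
}
```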
2. 伙伴系统
我们刚刚已经知道,每个 zone 中包含一个 free_area 数组,其中是一个个双向链表,按照 buddy system 的 order 进行管理。而一个 free_area 中其实并不只有一个双向链表,它是按照不同的 migrate type(迁移类型)分别存放的,主要是为了避免内存过于碎片化,如下图:
而这里的页面存在一个迁移类型,这决定了该页是否可以迁移,如下:
enum migratetype {
MIGRATE_UNMOVABLE, //不可移动
MIGRATE_MOVABLE, //可移动,例如用户空间页面
MIGRATE_RECLAIMABLE, //不能直接移动,但可以回收,例如文件映射页
MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ //pcp 链表上迁移类型的数量
MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, //保留给高阶原子分配
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
* ZONE_MOVABLE works. Only movable pages can be allocated
* from MIGRATE_CMA pageblocks and page allocator never
* implicitly change migration type of MIGRATE_CMA pageblock.
*
* The way to use it is to change migratetype of a range of
* pageblocks to MIGRATE_CMA which can be done by
* __free_pageblock_cma() function. What is important though
* is that a range of pageblocks must be aligned to
* MAX_ORDER_NR_PAGES should biggest page be bigger then
* a single pageblock.
*/
MIGRATE_CMA, //连续的物理内存
#endif
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
MIGRATE_TYPES
};
下面仍然是一个arttnba3
师傅所做的图
而 free_area 结构中的 nr_free 表示的是当前 free_area 中空闲页面块的数量
struct free_area {
struct list_head free_list[MIGRATE_TYPES];
unsigned long nr_free;
};
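free_area 里各个 order 上有多少空闲块,可以直接通过 /proc/buddyinfo 观察。下面是个简单的读取示例,每行对应一个 zone,后面各列依次是 order 0 到 MAX_ORDER-1 上空闲块的个数:

```c
#include <stdio.h>

int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/buddyinfo", "r");

    if (!f) {
        perror("open /proc/buddyinfo");
        return 1;
    }
    /* 原样打印:Node X, zone YYY  后面跟各 order 的空闲块计数 */
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);
    fclose(f);
    return 0;
}
```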
1. 分配页框
内核中实现了几个函数接口来请求页框,它们最终都会调用 __alloc_pages_nodemask(),如下图
其中核心的函数就是 __alloc_pages_nodemask(),在分析它之前我们需要先了解 gfp_mask 和 alloc_flags 这两类标志
gfp_flags
- __GFP_DMA:请求在ZONE_DMA区域中分配页面;
- __GFP_HIGHMEM:请求在ZONE_HIGHMEM区域中分配页面;
- __GFP_MOVABLE:ZONE_MOVALBE可用时在该区域分配页面,同时表示页面分配后可以在内存压缩时进行迁移,也能进行回收;
- __GFP_RECLAIMABLE:请求分配可回收的页面;
- __GFP_HIGH:高优先级处理请求;
- __GFP_IO:请求在分配期间进行 I/O 操作;
- __GFP_FS:请求在分配期间进行文件系统调用;
- __GFP_ZERO:请求将分配的区域初始化为 0;
- __GFP_NOFAIL:不允许请求失败,会无限重试;
- __GFP_NORETRY:请求不重试内存分配请求;
这里我是直接引用的cft56200_ln师傅的图
alloc_flags
- ALLOC_WMARK_MIN:按最小水位(min watermark)进行分配限制;
- ALLOC_WMARK_LOW:按低水位(low watermark)进行分配限制;
- ALLOC_WMARK_HIGH:按高水位(high watermark)进行分配限制;
- ALLOC_HARDER:努力分配,一般在gfp_mask设置了__GFP_ATOMIC时会使用;
- ALLOC_HIGH:高优先级分配,一般在gfp_mask设置了__GFP_HIGH时使用;
- ALLOC_CPUSET:检查是否为正确的 cpuset;
- ALLOC_CMA:允许从 CMA 区域进行分配
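从调用方的视角补一个小示例(不是本文要分析的源码,只是示意 gfp 标志的用法):一个最小的内核模块,用 alloc_pages(GFP_KERNEL | __GFP_ZERO, 2) 申请 2^2 = 4 个连续物理页再释放。编译需要对应内核头文件,这里只是个草图:

```c
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static struct page *pages;

static int __init gfp_demo_init(void)
{
    /* order = 2,即申请 4 个连续页,并要求清零 */
    pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, 2);
    if (!pages)
        return -ENOMEM;
    pr_info("gfp_demo: pfn=%lu virt=%px\n",
            page_to_pfn(pages), page_address(pages));
    return 0;
}

static void __exit gfp_demo_exit(void)
{
    __free_pages(pages, 2);   /* 释放时 order 必须与分配时一致 */
}

module_init(gfp_demo_init);
module_exit(gfp_demo_exit);
MODULE_LICENSE("GPL");
```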
下面就是核心函数 __alloc_pages_nodemask() 的函数体部分,它位于 /mm/page_alloc.c 当中,如下:
/*
* This is the 'heart' of the zoned buddy allocator.(看好了,伙伴系统的心脏就在这里)
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
/*
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
*/
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
ac.nodemask = nodemask;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
上面函数概括为下面的步骤:
- 检测环境,准备分配
- 快速分配,调用
get_page_from_freelist()
- 慢速分配,调用
__alloc_pages_slowpath()
- 快慢均失败,考虑页面回收,杀死进程后再次尝试
其中准备函数 prepare_alloc_pages() 负责设置分配上下文,并根据参数从指定 node 中获取一个 zonelist,这里就不多讲了,直接来讲解快速分配函数 get_page_from_freelist(),它位于 /mm/page_alloc.c:
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
bool no_fallback;
retry:
/*
* 扫描 zonelist, 寻找有着足够空闲块的zone
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
ac->nodemask) {
struct page *page;
unsigned long mark;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* XXX: For now, allow allocations to potentially
* exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* nodes are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
if (last_pgdat_dirty_limit == zone->zone_pgdat)
continue;
if (!node_dirty_ok(zone->zone_pgdat)) {
last_pgdat_dirty_limit = zone->zone_pgdat;
continue;
}
}
if (no_fallback && nr_online_nodes > 1 &&
zone != ac->preferred_zoneref->zone) {
int local_nid;
/*
* If moving to a remote node, retry but allow
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
local_nid = zone_to_nid(ac->preferred_zoneref->zone);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
}
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac->highest_zoneidx, alloc_flags,
gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
*/
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
}
}
try_this_zone: //本zone正常水位
page = rmqueue(ac->preferred_zoneref->zone, zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
/*
* If this is a high-order atomic allocation then check
* if the pageblock should be reserved for the future
*/
if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
reserve_highatomic_pageblock(page, zone, order);
return page;
} else {
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (static_branch_unlikely(&deferred_pages)) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
#endif
}
}
/*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
if (no_fallback) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
}
return NULL;
}
其功能就是遍历 zonelist 中的各个 zone,判断当前 zone 是否满足 low watermark 水位;若不满足则进行一次快速回收,再检测一次水位,若还是不能满足,则遍历下一个 zone,采取同样的步骤;满足水位后进入 rmqueue() 函数,这就是 buddy system 分配的核心,过程可以简化看下图:
相比于代码,下图更加直观。之后我们来查看关键函数 rmqueue(),它位于 /mm/page_alloc.c:
/*
* 从所给zone中获取页. 当order为0的时候,使用pcplists.
*/
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
unsigned long flags;
struct page *page;
if (likely(order == 0)) {
/*
* 若没有开启 CMA、或设置了 ALLOC_CMA、或迁移类型不为 MIGRATE_MOVABLE,则先从 pcplist 上分配
*/
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
migratetype, alloc_flags);
goto out;
}
}
/*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
/*
* order-0 request can reach here when the pcplist is skipped
* due to non-CMA allocation context. HIGHATOMIC area is
* reserved for high-order atomic allocation, so order-0
* request should skip it.
*/
if (order > 0 && alloc_flags & ALLOC_HARDER) { //order大于0且带有ALLOC_HARDER,使用__rmqueue_smallest分配
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
/*
* 若上面没有分配到页面(包括 order 为 0 但跳过了 pcplist 的情况),则调用 __rmqueue(),这是真正的伙伴系统核心分配函数
*/
if (!page)
page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
local_irq_restore(flags);
out:
/* Separate test+clear to avoid unnecessary atomics */
if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
failed:
local_irq_restore(flags);
return NULL;
}
有部分注释,我在上面中西合璧地标注了一下。接下来先提醒大家伙:之前咱们讲解 zone 时提到的 per-cpu pageset 字段,就是为了缓解多 CPU 下的条件竞争问题,为每个 CPU 单独设置一个页面仓库,供 buddy system 快速分配。这里给出的就是优先从它里面取页的代码,总结为以下流程:
- 若 order 为 0,且满足"没有开启 CMA、或设置了 ALLOC_CMA、或迁移类型不为 MIGRATE_MOVABLE"之一,则先从 per-cpu pageset 中分配并返回
- 若 order > 0 且带有 ALLOC_HARDER,调用 __rmqueue_smallest() 从 MIGRATE_HIGHATOMIC 链表分配
- 若仍未分配成功(这里不管 order 是否为 0),调用 __rmqueue() 分配
- 对结果调用 check_new_pages() 检查,未通过则循环跳回第二步
我们一个一个关键函数来查看,首先是从 per-cpu pageset 上分配的函数,也就是下面的 rmqueue_pcplist():
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, gfp_t gfp_flags,
int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
struct page *page;
unsigned long flags;
local_irq_save(flags); // 关中断
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype]; // 获取迁移类型链表
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); // 分配
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags); // 开中断
return page;
}
主要是进行了一些同步互斥操作(开关中断),然后调用函数__rmqueue_pcplist
/* 从 per-cpu 链表上取出 page, 调用者必须保护链表 */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
do {
if (list_empty(list)) { // list 是空的
//
pcp->count += rmqueue_bulk(zone, 0,
READ_ONCE(pcp->batch), list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
// 链表脱链
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
return page;
}
这里先判断 pcp 链表:若为空,则调用 rmqueue_bulk() 函数从 zone 上批量补充一些 page,之后再把链表头的 page 脱链(unlink)返回;而 rmqueue_bulk() 最终会调用 __rmqueue()(这段"批发"逻辑在下面的代码之后附了一个极简模拟)
/*
* 为了高效率,从 buddy 分配器获得指定数量的元素,
* 所有的单个元素都在持有锁的情况下进行. 将其添加到提供的链表中.
* 返回放置在 *list 链表上的 pages 数量.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
if (unlikely(page == NULL))
break;
if (unlikely(check_pcp_refill(page)))
continue;
/*
* 由 expand() 返回的分割 buddy 页面在此处以物理页框顺序接收。
* 页面被添加到 caller 的链表尾部。从 caller 的角度看,链表在
* 某些情况下是按照页码排序的。这对一些可以从头部前向的IO设备是有用的,
* 因为链表也是在物理页的顺序上的。这对于可以在物理页合理排序的情况下
* 合并IO请求的IO设备是有用的。
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}
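pcp 这种"本地仓库空了就去伙伴系统批发一批"的套路,可以用几行用户态 C 模拟一下(纯示意,数字和结构都是随手设的,与内核实现无关):

```c
#include <stdio.h>

#define BATCH 4

static int buddy_free_pages = 32;   /* 模拟 zone 的伙伴系统里剩余的 order-0 页 */
static int pcp_count;               /* 模拟当前 CPU pcp 链表上的页数 */

/* 模拟 rmqueue_bulk():一次性从"伙伴系统"批发 batch 个页放进 pcp */
static int refill_pcp(int batch)
{
    int got = batch < buddy_free_pages ? batch : buddy_free_pages;

    buddy_free_pages -= got;
    pcp_count += got;
    return got;
}

/* 模拟 __rmqueue_pcplist():pcp 为空先批发,再从 pcp 上取一页 */
static int alloc_one_page(void)
{
    if (pcp_count == 0 && refill_pcp(BATCH) == 0)
        return -1;                  /* 伙伴系统也没有页了 */
    pcp_count--;
    return 0;
}

int main(void)
{
    for (int i = 0; i < 6; i++) {
        alloc_one_page();
        printf("alloc #%d: pcp=%d buddy=%d\n", i, pcp_count, buddy_free_pages);
    }
    return 0;
}
```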
__rmqueue_smallest()
该函数就是从 order 对应的 free_area 中、指定 migratetype 的链表上进行分配,如果该 order 不够则向更高 order 请求。由于这里都是以 2^order 个页为单位进行分配,如果我请求 order 为 1 而该链表上没有空闲块,就转向 order 为 2(或更高)的链表,把取下的大块对半拆分,多余的部分挂回低 order 链表。其中向更高 order 的查找是通过循环和脱链完成的,而拆分高阶 page 块则是通过 expand() 函数进行的
/*
* 对给定的 migrationtype 遍历 free lists
* 并从 freelists 上移除最小可用的页面
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* 在 preferred list 上寻找一个合适 size 的 page */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
而拆分函数expand
也比较简单
/*
* 此处再分割的顺序对 IO subsystem 而言是十分重要的.
* 请不要在有好的理由及回归测试前改变这个顺序。
* 特别地,当大块的内存被分割,更小块(内存)被传递的顺序
* 则由他们在该函数中被分割的顺序决定。
* 根据实际测试,这是影响传递给IO子系统的 pages 顺序的主要因素,
* 考虑到包含一个内存大块(由一系列小的分配作用)的 buddy system 的行为,
* 这也是合理的。这种行为是 sglist 合并成功的关键因素。
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
/*
* 标记为 guard pages (或 page), 这将允许在 buddy 将被
* 释放时合并回分配器.对应的页表项不会被创建,
* pages 在 虚拟地址空间上仍将保持不存在。
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}
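expand() 的拆分方向可以用一个小模拟直观感受:请求 order 为 1,但手上只有一个 order 为 4 的空闲块,拆分后多余的半块分别挂回 order 3、2、1。下面是用户态示意代码,只统计各 order 的空闲块个数,不涉及真实链表与 guard page:

```c
#include <stdio.h>

#define MAX_ORDER 11

static int nr_free[MAX_ORDER];      /* 各 order 上空闲块的个数 */

/* 模拟 expand():把一个 2^high 页的大块拆出 2^low 页给请求者,
 * 多余的半块依次挂回 high-1, high-2, ..., low 各级 free_area */
static void expand_sim(int low, int high)
{
    int size = 1 << high;

    while (high > low) {
        high--;
        size >>= 1;
        nr_free[high]++;
        printf("  put back a block of %d pages on order %d\n", size, high);
    }
}

int main(void)
{
    printf("request order 1, split a free order-4 block:\n");
    expand_sim(1, 4);

    for (int i = 0; i < 5; i++)
        printf("order %d: %d free block(s)\n", i, nr_free[i]);
    return 0;
}
```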
__rmqueue()
最开始我以为这个才是最底层的分配函数,但其实不是,它反而还会调用 __rmqueue_smallest():
/*
* 从 buddy allocator 上移除一个元素.
* 在持有 zone->lock 时调用.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;
if (IS_ENABLED(CONFIG_CMA)) {
/*
* 通过当半数空闲内存在 CMA 区域时从 CMA 中分配
* 以平衡常规的与CMA区域的可迁移的分配。
*/
if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
goto out;
}
}
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
}
out:
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
整体的快速分配流程可以看下面这张图
了解完快速分配,接下来就是慢速分配了,它的功能包括内存碎片整理和内存回收。它的代码太长,我就只贴一部分,如下:
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
......
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
......
}
其中内存碎片整理正是利用了前面迁移类型的知识,这里有两个关键函数,其中之一就是 __alloc_pages_direct_compact():
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
unsigned int noreclaim_flag;
if (!order)
return NULL;
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
prio);
memalloc_noreclaim_restore(noreclaim_flag);
if (*compact_result <= COMPACT_INACTIVE)
return NULL;
count_vm_event(COMPACTSTALL);
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
zone->compact_blockskip_flush = false;
compaction_defer_reset(zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
count_vm_event(COMPACTFAIL);
cond_resched();
return NULL;
}
这里的函数就是内存规整(memory compaction)算法的实现,该算法可以简化为下面的流程:
维护两个扫描器,各自挂一个链表:一个从 zone 低地址端向后扫描正在使用且可移动的页,一个从 zone 高地址端向前扫描空闲页,最后把使用中的页迁移到扫描到的空闲页上,就完成了规整过程。记住,这个迁移需要 page 本身的迁移类型允许才行。
在完成上述迁移操作后会再次尝试快速分配。这里的碎片整理还有其他方式,但我暂不去深究,先记录个图,等哪天想起来了再探索
而关于慢速分配还有个函数是__alloc_pages_direct_reclaim()
,他的作用主要是回收,而不是碎片整理
最后来个整体分配页框的函数流程图
暂未完工
一天下来怎么说呢,感觉都是几位师傅的博客一口一口地喂饭,虽说自己理解了大致过程,但是对于源码的解读还是太粗了。这个系列还有释放页框和 slub 算法的源码实现,slub 算法我在上一篇博客中已经讲解过大致原理了哦,这里还差一部分。