Linux内存回收机制watermark
Posted bubbleben
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内存回收机制watermark相关的知识,希望对你有一定的参考价值。
本文基于Linux 5.0, 涉及源码如下
include/linux/gfp.h
include/linux/mmzone.h
mm/page_alloc.c
kernel/sysctl.c
1. init_per_zone_wmark_min
// min_free_kbytes默认为1MB: 代表所有zone警戒水位之和
// min_free_kbytes的范围: 128KB ~ 64MB
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
// 计算公式: min_free_kbytes = sqrt(lowmem_kbytes * 16)
int __meminit init_per_zone_wmark_min(void)
unsigned long lowmem_kbytes;
int new_min_free_kbytes;
// 计算lowmem(ZONE_DMA和ZONE_NORMAL)除去高水位以外的空闲页数, 并将其转换为KB
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
// 计算lowmem_kbytes乘以16后开方的结果
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
// user_min_free_kbytes默认为-1
if (new_min_free_kbytes > user_min_free_kbytes)
// 更新min_free_kbytes
min_free_kbytes = new_min_free_kbytes;
// min_free_kbytes范围: 128KB ~ 64MB (即128 ~ 65536, 单位KB)
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
else
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\\n",
new_min_free_kbytes, user_min_free_kbytes);
// 根据总的警戒水位计算每个zone的所有水位[见1.2节]
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
// 设置每个zone的lowmem_reserve[见1.3节]
setup_per_zone_lowmem_reserve();
#ifdef CONFIG_NUMA
setup_min_unmapped_ratio();
setup_min_slab_ratio();
#endif
return 0;
core_initcall(init_per_zone_wmark_min)
1.1 nr_free_buffer_pages
// 计算ZONE_DMA和ZONE_NORMAL除高水位以外的空闲页数
unsigned long nr_free_buffer_pages(void)
// gfp_zone返回分配掩码GFP_USER对应的zone_type[见1.1.1节]
// nr_free_zone_pages计算zone_type范围内所有zone的空闲页数[见1.1.2节]
return nr_free_zone_pages(gfp_zone(GFP_USER));
1.1.1 gfp_zone
// 0x600000 | 0x40 | 0x80 | 0x20000 = 0x6200c0
#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
// 0x01 = 0000 0001
#define __GFP_DMA ((__force gfp_t)___GFP_DMA)
// 0x02 = 0000 0010
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
// 0x04 = 0000 0100
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
// 0x08 = 0000 1000
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
// zone的掩码: 0x0f = 0000 1111
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
enum zone_type
#ifdef CONFIG_ZONE_DMA
// 0
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
ZONE_DMA32,
#endif
// 1
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
ZONE_HIGHMEM,
#endif
// 2
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
ZONE_DEVICE,
#endif
// 3
__MAX_NR_ZONES
;
#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
// ZONES_SHIFT等于2
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif
// 未定义ZONE_DEVICE
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
// GFP_ZONES_SHIFT = ZONES_SHIFT = 2
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif
#define GFP_ZONE_TABLE ( \\
// 1左移0位 = 1
(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \\
// 0左移2位 = 0
| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \\
| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \\
| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \\
| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \\
| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \\
| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\\
| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\\
)
// 根据分配掩码计算zone_type
static inline enum zone_type gfp_zone(gfp_t flags)
enum zone_type z;
// 取分配掩码的低4位: 0x6200c0 & 0x0f = 0
int bit = (__force int) (flags & GFP_ZONEMASK);
// 取GFP_ZONE_TABLE低2位: GFP_ZONE_TABLE & (1左移2位 - 1) = GFP_ZONE_TABLE & 0011
// 结果等于ZONE_NORMAL
z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
((1 << GFP_ZONES_SHIFT) - 1);
VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
return z;
1.1.2 nr_free_zone_pages
enum zone_watermarks
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
NR_WMARK
;
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
// zone高水位值(Linux5.0新增watermark_boost, 可以通过/proc/sys/vm/watermark_boost_factor设置)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
// offset代表最大的zone_type
// 空闲页的计算公式: managed_pages - high_pages
static unsigned long nr_free_zone_pages(int offset)
struct zoneref *z;
struct zone *zone;
/* Just pick one node, since fallback list is circular */
unsigned long sum = 0;
// UMA架构: 返回节点中ZONELIST_FALLBACK类型的zonelist
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
// 遍历zonelist中offset以内的所有zone
for_each_zone_zonelist(zone, z, zonelist, offset)
// 计算zone的managed_pages
unsigned long size = zone_managed_pages(zone);
// 计算zone的高水位值: 初始时为0
unsigned long high = high_wmark_pages(zone);
// 空闲内存 = zone->managed_pages - zone->_watermark[WMARK_HIGH]
if (size > high)
sum += size - high;
// 最终结果是ZONE_NORMAL和ZONE_DMA除高水位以外的空闲页之和
return sum;
1.2 setup_per_zone_wmarks
// min_free_kbytes发生改变或者内存热插拔时被调用
void setup_per_zone_wmarks(void)
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
__setup_per_zone_wmarks();
spin_unlock(&lock);
int watermark_boost_factor __read_mostly = 15000;
// watermark_scale_factor默认为10, 取值范围: 1 ~ 1000
// 可以通过/proc/sys/vm/watermark_scale_factor设置
int watermark_scale_factor = 10;
// 设置每个zone的min, low和high水位
static void __setup_per_zone_wmarks(void)
// 将总警戒水位值单位由kb转换为页数
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone)
// 跳过highmem(ZONE_HIGHMEM和ZONE_MOVABLE), 只统计lowmem
if (!is_highmem(zone))
// 计算lowmem(ZONE_DMA和ZONE_NORMAL)总的可管理内存
lowmem_pages += zone_managed_pages(zone);
for_each_zone(zone)
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
// 总警戒水位 * zone->managed_pages
tmp = (u64)pages_min * zone_managed_pages(zone);
// 1. 相当于先计算zone->managed_pages占总managed_pages的比例;
// 2. 然后将这个比例 * 总警戒水位, 得到此zone的警戒水位
do_div(tmp, lowmem_pages);
// 这里不再考虑highmem的情况
if (is_highmem(zone))
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->_watermark[WMARK_MIN] = min_pages;
else
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
// 更新zone的警戒水位
zone->_watermark[WMARK_MIN] = tmp;
/*
* Set the kswapd watermarks distance according to the
* scale factor in proportion to available memory, but
* ensure a minimum size on small systems.
*/
// 取以下两者之间的最大值
// 1. 警戒水位的四分之一(tmp >> 2)
// 2. zone->managed_pages的一定比例(watermark_scale_factor / 10000, 即0.01% ~ 10%, 默认0.1%), watermark_scale_factor越大tmp越大
tmp = max_t(u64, tmp >> 2,
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
// 低水位 = 警戒水位 + tmp
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
// 高水位 = 警戒水位 + tmp * 2
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
/* update totalreserve_pages */
// 计算总的预留内存[见1.2.2节]
calculate_totalreserve_pages();
1.2.1 is_highmem
// 返回zone在所属节点node_zones数组中的下标, 即该zone对应的zone_type
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
static inline int is_highmem_idx(enum zone_type idx)
#ifdef CONFIG_HIGHMEM
// ZONE_HIGHMEM属于highmem; ZONE_MOVABLE仅在zone_movable_is_highmem()为真时才属于highmem
return (idx == ZONE_HIGHMEM ||
(idx == ZONE_MOVABLE && zone_movable_is_highmem()));
#else
return 0;
#endif
// 判断zone是否属于highmem(ZONE_HIGHMEM和ZONE_MOVABLE)
// 与之对应的就是lowmem(ZONE_DMA和ZONE_NORMAL)
static inline int is_highmem(struct zone *zone)
#ifdef CONFIG_HIGHMEM
return is_highmem_idx(zone_idx(zone));
#else
// 未定义CONFIG_HIGHMEM则返回0
return 0;
#endif
1.2.2 calculate_totalreserve_pages
// 系统总的预留内存
unsigned long totalreserve_pages __read_mostly;
// 当sysctl_lowmem_reserve_ratio或者min_free_kbytes发生变化时计算总的预留内存
static void calculate_totalreserve_pages(void)
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
enum zone_type i, j;
// 遍历每个节点
for_each_online_pgdat(pgdat)
// pglist_data->totalreserve_pages统计节点总的预留内存
pgdat->totalreserve_pages = 0;
// 遍历每个zone
for (i = 0; i < MAX_NR_ZONES; i++)
struct zone *zone = pgdat->node_zones + i;
long max = 0;
// zone->managed_pages
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
// 计算zone->lowmem_reserve数组中的最大值
for (j = i; j < MAX_NR_ZONES; j++)
if (zone->lowmem_reserve[j] > max)
max = zone->lowmem_reserve[j];
/* we treat the high watermark as reserved pages. */
// zone->lowmem_reserve最大值 + zone->_watermark[WMARK_HIGH]作为总的预留值
max += high_wmark_pages(zone);
// 总的预留值不能大于zone->managed_pages
if (max > managed_pages)
max = managed_pages;
// 更新pglist_data->totalreserve_pages
pgdat->totalreserve_pages += max;
reserve_pages += max;
// 更新全局变量totalreserve_pages(系统总的预留内存), 函数本身无返回值
totalreserve_pages = reserve_pages;
1.3 setup_per_zone_lowmem_reserve
// sysctl_lowmem_reserve_ratio可以通过/proc/sys/vm/lowmem_reserve_ratio进行设置
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] =
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256,
#endif
[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0,
#endif
[ZONE_MOVABLE] = 0,
;
// 设置每个zone的lowmem_reserve数组
static void setup_per_zone_lowmem_reserve(void)
struct pglist_data *pgdat;
enum zone_type j, idx;
// 遍历每1个node
for_each_online_pgdat(pgdat)
// 遍历每1个zone
for (j = 0; j < MAX_NR_ZONES; j++)
struct zone *zone = pgdat->node_zones + j;
// 每个zone可管理的页数
unsigned long managed_pages = zone_managed_pages(zone);
// 每个zone都有1个lowmem_reserve数组, 用于预留低端zone的内存
zone->lowmem_reserve[j] = 0;
idx = j;
// idx = 0即ZONE_DMA时不会进入while循环
// idx = 1即ZONE_NORMAL时会进入while循环
// idx = 2即ZONE_MOVABLE时会进入while循环
while (idx)
struct zone *lower_zone;
idx--;
// idx等于0: 低端内存代表ZONE_DMA
// idx等于1: 低端内存代表ZONE_NORMAL
lower_zone = pgdat->node_zones + idx;
if (sysctl_lowmem_reserve_ratio[idx] < 1)
sysctl_lowmem_reserve_ratio[idx] = 0;
lower_zone->lowmem_reserve[j] = 0;
else
// j = 1: DMA zone->lowmem_reserve[1] = Normal zone->managed_pages / 256
// j = 2: Normal zone->lowmem_reserve[2] = Movable zone->managed_pages / 32 = 0 (Movable zone为空时)
// j = 2: DMA zone->lowmem_reserve[2] = (Movable zone->managed_pages + Normal zone->managed_pages) / 256
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
// 累加当前低端zone的managed_pages(例如Normal zone), 供更低端的zone计算预留值时使用
managed_pages += zone_managed_pages(lower_zone);
/* update totalreserve_pages */
// zone->lowmem_reserve发生改变时需要更新总的预留内存[同1.2.2节]
calculate_totalreserve_pages();
2. /proc/sys/vm/min_free_kbytes
static struct ctl_table vm_table[] =
...
// /proc/sys/vm提供min_free_kbytes节点以供调整zone的三个水位值
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
// 回调函数[见2.1节]
.proc_handler = min_free_kbytes_sysctl_handler,
.extra1 = &zero,
,
...
2.1 min_free_kbytes_sysctl_handler
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write)
// 更新user_min_free_kbytes
user_min_free_kbytes = min_free_kbytes;
// 重新计算每个zone的水位[同1.2节]
// min_free_kbytes调大之后, 各个zone的三个水位都会不同程度的提升, 反之亦然
setup_per_zone_wmarks();
return 0;
3. /proc/sys/vm/watermark_scale_factor
static Linux 内核 内存管理分区伙伴分配器 ⑦ ( z->watermark[WMARK_MIN] 最低水位计算 | min_free_kbytes 初始化 )
Linux 内核 内存管理分区伙伴分配器 ⑦ ( z->watermark[WMARK_MIN] 最低水位计算 | min_free_kbytes 初始化 )
Linux 内核 内存管理分区伙伴分配器 ⑥ ( zone 结构体中水线控制相关成员 | 在 Ubuntu 中查看内存区域水位线 )
Linux 内核 内存管理分区伙伴分配器 ⑥ ( zone 结构体中水线控制相关成员 | 在 Ubuntu 中查看内存区域水位线 )