Linux内存回收机制watermark

Posted bubbleben

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内存回收机制watermark相关的知识,希望对你有一定的参考价值。

本文基于Linux 5.0, 涉及源码如下

include/linux/gfp.h
include/linux/mmzone.h
mm/page_alloc.c
kernel/sysctl.c

1. init_per_zone_wmark_min

// min_free_kbytes defaults to 1024 KB (1 MB). It is the total min-watermark
// budget that __setup_per_zone_wmarks() splits across the zones.
// init_per_zone_wmark_min() clamps the value it computes to 128 KB ~ 65536 KB (64 MB).
int min_free_kbytes = 1024;
// Stays -1 until min_free_kbytes_sysctl_handler() records a value written by the user.
int user_min_free_kbytes = -1;

// Initialise min_free_kbytes and derive every zone's watermarks from it.
//
// Formula: min_free_kbytes = sqrt(lowmem_kbytes * 16), clamped to
// 128 KB ~ 65536 KB (64 MB).
//
// Always returns 0.
int __meminit init_per_zone_wmark_min(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;

	// Pages counted by nr_free_buffer_pages() (section 1.1), converted to KB.
	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	// Integer square root of lowmem_kbytes * 16.
	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	// user_min_free_kbytes defaults to -1, so the computed value wins unless
	// a larger (or equal) user value has been recorded.
	if (new_min_free_kbytes > user_min_free_kbytes) {
		// Adopt the computed value ...
		min_free_kbytes = new_min_free_kbytes;
		// ... and clamp it to 128 KB ~ 64 MB (65536 KB).
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 65536)
			min_free_kbytes = 65536;
	} else {
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
				new_min_free_kbytes, user_min_free_kbytes);
	}

	// Derive every zone's watermarks from the total min watermark (section 1.2).
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	// Set every zone's lowmem_reserve (section 1.3).
	setup_per_zone_lowmem_reserve();

#ifdef CONFIG_NUMA
	setup_min_unmapped_ratio();
	setup_min_slab_ratio();
#endif

	return 0;
}
core_initcall(init_per_zone_wmark_min)

1.1 nr_free_buffer_pages

// 计算ZONE_DMA和ZONE_NORMAL除高水位以外的空闲页数
unsigned long nr_free_buffer_pages(void)

    // gfp_zone返回分配掩码GFP_USER对应的zone_type[见1.1.1节]
    // nr_free_zone_pages计算zone_type范围内所有zone的空闲页数[见1.1.2节]
	return nr_free_zone_pages(gfp_zone(GFP_USER));

1.1.1 gfp_zone

// GFP_USER and the zone-selector bits of an allocation mask.
// GFP_USER value per the article: 0x600000 | 0x40 | 0x80 | 0x20000 = 0x6200c0
#define GFP_USER	(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)

// 0x01 = 0000 0001
#define __GFP_DMA	((__force gfp_t)___GFP_DMA)
// 0x02 = 0000 0010
#define __GFP_HIGHMEM	((__force gfp_t)___GFP_HIGHMEM)
// 0x04 = 0000 0100
#define __GFP_DMA32	((__force gfp_t)___GFP_DMA32)
// 0x08 = 0000 1000
#define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
// Mask covering the four zone-selector bits: 0x0f = 0000 1111
#define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
// Zone types. The numeric values noted below hold for the configuration
// analysed in the article (CONFIG_ZONE_DMA set; CONFIG_ZONE_DMA32,
// CONFIG_HIGHMEM and CONFIG_ZONE_DEVICE unset).
enum zone_type {
#ifdef CONFIG_ZONE_DMA
	// 0
	ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
	ZONE_DMA32,
#endif
	// 1
	ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
	ZONE_HIGHMEM,
#endif
	// 2
	ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
	ZONE_DEVICE,
#endif
	// 3
	__MAX_NR_ZONES

};

// ZONES_SHIFT: number of bits needed to encode a zone index, chosen from MAX_NR_ZONES.
#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
// This branch is taken in the analysed configuration, so ZONES_SHIFT is 2.
#define ZONES_SHIFT 2
#elif MAX_NR_ZONES <= 8
#define ZONES_SHIFT 3
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif

// CONFIG_ZONE_DEVICE is not defined in the analysed configuration, so the #else branch applies.
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
// GFP_ZONES_SHIFT = ZONES_SHIFT = 2
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

// Lookup table that packs one zone_type per combination of zone-selector
// bits; each entry is GFP_ZONES_SHIFT bits wide and sits at
// (bit combination * GFP_ZONES_SHIFT).
//
// In the analysed configuration:
//   first entry:  ZONE_NORMAL (1) shifted left by 0 bits = 1
//   second entry: OPT_ZONE_DMA (0) shifted left by 2 bits = 0
//
// Annotations are kept outside the macro body: a `//` comment line inside
// it would end the backslash continuation and truncate the definition.
#define GFP_ZONE_TABLE ( \
	(ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)				       \
	| (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)	       \
	| (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)		       \
	| (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)		       \
	| (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
	| (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
	| (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)
// Translate an allocation mask into the zone_type it selects, by looking up
// the mask's zone-selector bits in GFP_ZONE_TABLE.
static inline enum zone_type gfp_zone(gfp_t flags)
{
	enum zone_type z;
	// Keep the low 4 bits of the mask; for GFP_USER: 0x6200c0 & 0x0f = 0.
	int bit = (__force int) (flags & GFP_ZONEMASK);

	// Shift the table entry for `bit` down to bit 0 and keep GFP_ZONES_SHIFT bits.
	// With bit = 0 and GFP_ZONES_SHIFT = 2 this is GFP_ZONE_TABLE & 0b11,
	// which yields ZONE_NORMAL.
	z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
					 ((1 << GFP_ZONES_SHIFT) - 1);
	VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
	return z;
}

1.1.2 nr_free_zone_pages

// Indices into zone->_watermark[]; NR_WMARK is the number of watermarks.
enum zone_watermarks {
	WMARK_MIN,
	WMARK_LOW,
	WMARK_HIGH,
	NR_WMARK
};

// Each accessor returns the stored watermark plus the zone's current watermark_boost.
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
// High watermark of the zone. The article notes that watermark_boost is new in
// Linux 5.0 and is tuned through /proc/sys/vm/watermark_boost_factor.
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
// offset代表最大的zone_type
// 空闲页的计算公式: managed_pages - high_pages
static unsigned long nr_free_zone_pages(int offset)

	struct zoneref *z;
	struct zone *zone;

	/* Just pick one node, since fallback list is circular */
	unsigned long sum = 0;

    // UMA架构: 返回节点中ZONELIST_FALLBACK类型的zonelist
	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

    // 遍历zonelist中offset以内的所有zone
	for_each_zone_zonelist(zone, z, zonelist, offset) 
        // 计算zone的managed_pages
		unsigned long size = zone_managed_pages(zone);
        // 计算zone的高水位值: 初始时为0
		unsigned long high = high_wmark_pages(zone);
        // 空闲内存 = zone->managed_pages - zone->_watermark[WMARK_HIGH]
		if (size > high)
			sum += size - high;
	

    // 最终结果是ZONE_NORMAL和ZONE_DMA除高水位以外的空闲页之和
	return sum;

1.2 setup_per_zone_wmarks

// min_free_kbytes发生改变或者内存热插拔时被调用
void setup_per_zone_wmarks(void)

	static DEFINE_SPINLOCK(lock);

	spin_lock(&lock);
	__setup_per_zone_wmarks();
	spin_unlock(&lock);

// Default watermark boost factor. NOTE(review): its units are not shown in this excerpt.
int watermark_boost_factor __read_mostly = 15000;
// watermark_scale_factor defaults to 10; the article gives its valid range as 1 ~ 1000.
// It can be set through /proc/sys/vm/watermark_scale_factor.
int watermark_scale_factor = 10;

// 设置每个zone的min, low和high水位
static void __setup_per_zone_wmarks(void)

    // 将总警戒水位值单位由kb转换为页数
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long lowmem_pages = 0;
	struct zone *zone;
	unsigned long flags;

	/* Calculate total number of !ZONE_HIGHMEM pages */
	for_each_zone(zone) 
        // highmem(ZONE_HIGHMEM和ZONE_MOVEABLE)
		if (!is_highmem(zone))
            // 计算lowmem(ZONE_DMA和ZONE_NORMAL)总的可管理内存
			lowmem_pages += zone_managed_pages(zone);
	

	for_each_zone(zone) 
		u64 tmp;

		spin_lock_irqsave(&zone->lock, flags);
        // 总警戒水位 * zone->managed_pages
		tmp = (u64)pages_min * zone_managed_pages(zone);
        // 1. 相当于先计算zone->managed_pages占总managed_pages的比例;
        // 2. 然后将这个比例 * 总警戒水位, 得到此zone的警戒水位
		do_div(tmp, lowmem_pages);
        // 这里不再考虑highmem的情况
		if (is_highmem(zone)) 
			/*
			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
			 * need highmem pages, so cap pages_min to a small
			 * value here.
			 *
			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
			 * deltas control asynch page reclaim, and so should
			 * not be capped for highmem.
			 */
			unsigned long min_pages;

			min_pages = zone_managed_pages(zone) / 1024;
			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
			zone->_watermark[WMARK_MIN] = min_pages;
		 else 
			/*
			 * If it's a lowmem zone, reserve a number of pages
			 * proportionate to the zone's size.
			 */
            // 更新zone的警戒水位
			zone->_watermark[WMARK_MIN] = tmp;
		

		/*
		 * Set the kswapd watermarks distance according to the
		 * scale factor in proportion to available memory, but
		 * ensure a minimum size on small systems.
		 */
        // 取以下两者之间的最大值
        // 1. 计算警戒水位的一半
        // 2. 计算zone->managed_pages的比例(0.1% ~ 10%), watermark_scale_factor越大tmp越大
		tmp = max_t(u64, tmp >> 2,
			    mult_frac(zone_managed_pages(zone),
				      watermark_scale_factor, 10000));

        // 低水位 = 警戒水位 + tmp
		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
        // 高水位 = 警戒水位 + tmp * 2        
		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
		zone->watermark_boost = 0;

		spin_unlock_irqrestore(&zone->lock, flags);
	

	/* update totalreserve_pages */
    // 计算总的预留内存[见1.2.2节]
	calculate_totalreserve_pages();

1.2.1 is_highmem

// Index of the zone within its node's node_zones[] array, i.e. its zone_type.
#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)

// Return non-zero when zone type idx counts as highmem.
// Without CONFIG_HIGHMEM nothing is highmem and the result is always 0.
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
	// ZONE_HIGHMEM is always highmem; ZONE_MOVABLE only when
	// zone_movable_is_highmem() says so.
	return (idx == ZONE_HIGHMEM ||
		(idx == ZONE_MOVABLE && zone_movable_is_highmem()));
#else
	return 0;
#endif
}


// Return non-zero when the zone is highmem (ZONE_HIGHMEM, or ZONE_MOVABLE
// when is_highmem_idx() treats it as highmem). Everything else is lowmem
// (ZONE_DMA and ZONE_NORMAL in the analysed configuration).
static inline int is_highmem(struct zone *zone)
{
#ifdef CONFIG_HIGHMEM
	return is_highmem_idx(zone_idx(zone));
#else
	// Without CONFIG_HIGHMEM no zone is highmem.
	return 0;
#endif
}

1.2.2 calculate_totalreserve_pages

// 系统总的预留内存
unsigned long totalreserve_pages __read_mostly;

// 当sysctl_lowmem_reserve_ratio或者min_free_kbytes发生变化时计算总的预留内存
static void calculate_totalreserve_pages(void)

	struct pglist_data *pgdat;
	unsigned long reserve_pages = 0;
	enum zone_type i, j;

    // 遍历每个节点
	for_each_online_pgdat(pgdat) 
        // pglist_data->totalreserve_pages统计节点总的预留内存
		pgdat->totalreserve_pages = 0;
        // 遍历每个zone
		for (i = 0; i < MAX_NR_ZONES; i++) 
			struct zone *zone = pgdat->node_zones + i;
			long max = 0;
            // zone->managed_pages
			unsigned long managed_pages = zone_managed_pages(zone);

			/* Find valid and maximum lowmem_reserve in the zone */
            // 计算zone->lowmem_reserve数组中的最大值
			for (j = i; j < MAX_NR_ZONES; j++) 
				if (zone->lowmem_reserve[j] > max)
					max = zone->lowmem_reserve[j];
			

			/* we treat the high watermark as reserved pages. */
            // zone->lowmem_reserve最大值 + zone->_watermark[WMARK_HIGH]作为总的预留值
			max += high_wmark_pages(zone);

            // 总的预留值不能大于zone->managed_pages
			if (max > managed_pages)
				max = managed_pages;

            // 更新pglist_data->totalreserve_pages
			pgdat->totalreserve_pages += max;
            
			reserve_pages += max;
		
	
    // 返回总的预留内存
	totalreserve_pages = reserve_pages;

1.3 setup_per_zone_lowmem_reserve

// sysctl_lowmem_reserve_ratio可以通过/proc/sys/vm/lowmem_reserve_ratio进行设置
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = 
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
;

// 设置每个zone的lowmem_reserve数组
static void setup_per_zone_lowmem_reserve(void)

	struct pglist_data *pgdat;
	enum zone_type j, idx;

    // 遍历每1个node
	for_each_online_pgdat(pgdat) 
        // 遍历每1个zone
		for (j = 0; j < MAX_NR_ZONES; j++) 
			struct zone *zone = pgdat->node_zones + j;
            // 每个zone可管理的页数
			unsigned long managed_pages = zone_managed_pages(zone);

            // 每个zone都由1个lowmem_reserve数组, 用于预留低端zone的内存
			zone->lowmem_reserve[j] = 0;

			idx = j;
            // idx = 0即ZONE_DMA时不会进入while循环
            // idx = 1即ZONE_NORMAL时会进入while循环
            // idx = 2即ZONE_MOVEABLE时会进入while循环
			while (idx) 
				struct zone *lower_zone;

				idx--;
                // idx等于0: 低端内存代表ZONE_DMA
                // idx等于1: 低端内存代表ZONE_NORMAL
				lower_zone = pgdat->node_zones + idx;

				if (sysctl_lowmem_reserve_ratio[idx] < 1) 
					sysctl_lowmem_reserve_ratio[idx] = 0;
					lower_zone->lowmem_reserve[j] = 0;
				 else 
                    // DMA zone->lowmem_reserve[1] = Normal zone->managed_pages / 256
                    // Normal zone->lowmem_reserve[2] = Movable zone->managed_pages / 32 = 0
                    // DMA zone->lowmem_reserve[2] = Normal zone->managed_pages / 256
					lower_zone->lowmem_reserve[j] =
						managed_pages / sysctl_lowmem_reserve_ratio[idx];
				
                // managed_pages += Normal zone->managed_pages
				managed_pages += zone_managed_pages(lower_zone);
			
		
	

	/* update totalreserve_pages */
    // zone->lowmem_reserve发生改变时需要更新总的预留内存[同1.2.2节]
	calculate_totalreserve_pages();

2. /proc/sys/vm/min_free_kbytes

static struct ctl_table vm_table[] = 
    ...
    
        // /proc/sys/vm提供min_free_kbytes节点以供调整zone的三个水位值
		.procname	= "min_free_kbytes",
		.data		= &min_free_kbytes,
		.maxlen		= sizeof(min_free_kbytes),
		.mode		= 0644,
        // 回调函数[见2.1节]
		.proc_handler	= min_free_kbytes_sysctl_handler,
		.extra1		= &zero,
	,
    ...

2.1 min_free_kbytes_sysctl_handler

// sysctl handler for /proc/sys/vm/min_free_kbytes.
//
// Delegates parsing to proc_dointvec_minmax() and returns its error code
// unchanged on failure. After a successful write it records the new value
// as the user preference and recomputes every zone's watermarks.
// Returns 0 on success.
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int rc;

	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (rc)
		return rc;

	if (write) {
		// Remember the user's value so init_per_zone_wmark_min() can prefer it.
		user_min_free_kbytes = min_free_kbytes;
		// Recompute every zone's watermarks (section 1.2). Raising
		// min_free_kbytes raises all three watermarks of each zone to
		// varying degrees, and lowering it lowers them.
		setup_per_zone_wmarks();
	}
	return 0;
}

3. /proc/sys/vm/watermark_scale_factor

static ...(原文代码在此处被截断)

Linux 内核 内存管理分区伙伴分配器 ⑦ ( z->watermark[WMARK_MIN] 最低水位计算 | min_free_kbytes 初始化 )

Linux 内核 内存管理分区伙伴分配器 ⑦ ( z->watermark[WMARK_MIN] 最低水位计算 | min_free_kbytes 初始化 )

Flink 水位线机制WaterMark实践 处理乱序消息

Flink详解系列之五--水位线(watermark)

Linux 内核 内存管理分区伙伴分配器 ⑥ ( zone 结构体中水线控制相关成员 | 在 Ubuntu 中查看内存区域水位线 )

Linux 内核 内存管理分区伙伴分配器 ⑥ ( zone 结构体中水线控制相关成员 | 在 Ubuntu 中查看内存区域水位线 )