Linux Memory Controller

Posted by bubbleben


1. memory_cgrp_subsys

// cftype: defines and describes one control file of a cgroup
// cftype->private: encodes the resource type and resource attribute (see the MEMFILE_* macros in Section 5)
// dfl_cftypes and legacy_cftypes are both arrays of cftype
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	// files for the default hierarchy (cgroup v2)
	.dfl_cftypes = memory_files,
	// files for legacy hierarchies (cgroup v1)
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};
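
Every file handler below first converts the generic cgroup_subsys_state back into the embedding mem_cgroup (Section 4). A minimal sketch of the helper, assuming the usual container_of embedding in kernels of this era (not a verbatim copy):

static struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
	// css is the first member of struct mem_cgroup, so container_of
	// just subtracts the member offset
	return css ? container_of(css, struct mem_cgroup, css) : NULL;
}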

2. dfl_cftypes

static struct cftype memory_files[] = {
	{
		// current memory usage of the cgroup and all of its descendants
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		// low bound (best-effort memory protection)
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		// high bound (throttling threshold) for memory usage
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		// hard limit on memory usage
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		// memory events
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		// detailed memory usage statistics
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_stat_show,
	},
	{ }	/* terminate */
};

static struct cftype swap_files[] = {
	{
		// current swap usage of the cgroup and all of its descendants
		.name = "swap.current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = swap_current_read,
	},
	{
		// hard limit on swap usage
		.name = "swap.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = swap_max_show,
		.write = swap_max_write,
	},
	{ }	/* terminate */
};
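
For a feel of how the v2 read handlers work: memory.current and swap.current are thin wrappers that read the page counter and convert pages to bytes. A hedged sketch matching the shape of mm/memcontrol.c in this kernel series:

static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	// counters are maintained in pages; the file reports bytes
	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}

static u64 swap_current_read(struct cgroup_subsys_state *css,
			     struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}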

3. legacy_cftypes

// The root cgroup has no resource limits of its own, and limits cannot be
// configured on it. Processes belong to the root cgroup by default; a child
// process inherits the cgroup memberships of its parent.
static struct cftype mem_cgroup_legacy_files[] = {
	{
		// current memory usage
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		// see Section 5
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum memory usage (watermark)
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		// see the mem_cgroup_reset sketch at the end of this section
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// hard limit on memory usage
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// soft limit on memory usage
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// memory usage statistics
		.name = "stat",
		.seq_show = memcg_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		// hierarchical accounting: when enabled, memory usage of every
		// memcg in the subtree is also charged to this memcg
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		// register memory monitoring events
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		// whether the OOM killer may kill processes in this cgroup
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
	{
		// memory pressure level notifications
		.name = "pressure_level",
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
	{
		// hard limit on kernel memory usage
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// kernel memory usage
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum kernel memory usage
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
	{
		.name = "kmem.slabinfo",
		.seq_start = memcg_slab_start,
		.seq_next = memcg_slab_next,
		.seq_stop = memcg_slab_stop,
		.seq_show = memcg_slab_show,
	},
#endif
	{
		// hard limit on tcp buffer memory
		.name = "kmem.tcp.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// tcp buffer memory usage
		.name = "kmem.tcp.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.tcp.failcnt",
		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum tcp buffer memory usage
		.name = "kmem.tcp.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static struct cftype memsw_cgroup_files[] = {
	{
		// current memory+swap usage
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// recorded maximum memory+swap usage
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		// hard limit on memory+swap usage
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};
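
The max_usage_in_bytes and failcnt files are reset by writing to them; both routes go through mem_cgroup_reset, which decodes cftype->private the same way mem_cgroup_read_u64 does (Section 5). A sketch of its shape in this kernel series:

static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	// pick the page counter from the resource type
	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	// reset the attribute named by the file
	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}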

4. mem_cgroup

struct mem_cgroup {
	// base class shared by all resource controllers
	struct cgroup_subsys_state css;

	/* Private memcg ID. Used to ID objects that outlive the cgroup */
	struct mem_cgroup_id id;

	/* Accounted resources */
	// _MEM page counter: tracks the memory limit and current usage [see 4.1]
	struct page_counter memory;
	struct page_counter swap;

	/* Legacy consumer-oriented counters */
	// _MEMSWAP page counter: tracks the memory+swap limit and current usage
	struct page_counter memsw;
	// _KMEM page counter: tracks the kernel memory limit and current usage
	struct page_counter kmem;
	// _TCP page counter: tracks the tcp buffer limit and current usage
	struct page_counter tcpmem;

	/* Normal memory consumption range */
	// low bound for memory usage
	unsigned long low;
	// high bound for memory usage
	unsigned long high;

	/* Range enforcement for interrupt charges */
	struct work_struct high_work;

	// soft limit on memory usage
	unsigned long soft_limit;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	// whether hierarchical accounting is used
	bool use_hierarchy;
	/* protected by memcg_oom_lock */
	bool		oom_lock;
	int		under_oom;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* handle for "memory.events" */
	struct cgroup_file events_file;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t		moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t		move_lock;
	struct task_struct	*move_lock_task;
	unsigned long		move_lock_flags;
	/*
	 * percpu counter.
	 */
	// per-cpu variable: memcg statistics (usage counts and events) [see 4.3]
	struct mem_cgroup_stat_cpu __percpu *stat;

	unsigned long		socket_pressure;

	/* Legacy tcp memory accounting */
	bool			tcpmem_active;
	int			tcpmem_pressure;

#ifndef CONFIG_SLOB
	/* Index in the kmem_cache->memcg_params.memcg_caches array */
	int kmemcg_id;
	enum memcg_kmem_state kmem_state;
	struct list_head kmem_caches;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

#ifdef CONFIG_CGROUP_WRITEBACK
	struct list_head cgwb_list;
	struct wb_domain cgwb_domain;
#endif

	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

	// one mem_cgroup_per_node instance per NUMA node [see 4.2]
	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

4.1 page_counter

// page counter
struct page_counter {
	// current count, in pages
	atomic_long_t count;
	// hard limit, in pages
	unsigned long limit;
	// if the parent cgroup uses use_hierarchy, parent points to the parent
	// cgroup's page counter; otherwise it is a NULL pointer
	struct page_counter *parent;

	/* legacy */
	// historical maximum of count
	unsigned long watermark;
	// number of times the limit was hit
	unsigned long failcnt;
};
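
Charging walks the parent chain, so with use_hierarchy every ancestor's count (and watermark) is updated as well. A sketch of the unconditional charge path, matching mm/page_counter.c of this era:

void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	// charge this counter and every ancestor up to the root
	for (c = counter; c; c = c->parent) {
		long new;

		new = atomic_long_add_return(nr_pages, &c->count);
		// watermark records the historical maximum; the update is
		// racy, but a slightly stale watermark is acceptable
		if (new > c->watermark)
			c->watermark = new;
	}
}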

4.2 mem_cgroup_per_node

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_node {
	// the memcg's private LRU lists: once a process joins a memory cgroup,
	// pages allocated to it go onto the memcg's own lruvec rather than the
	// node's global LRU lists
	struct lruvec		lruvec;
	struct lruvec_stat __percpu *lruvec_stat;
	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	// amount by which usage exceeds the soft limit:
	// mem_cgroup.memory.count - mem_cgroup.soft_limit
	unsigned long		usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	// whether this memcg is in the soft-limit tree; when usage exceeds the
	// soft limit, tree_node links this mem_cgroup_per_node into the tree
	bool			on_tree;
	// back pointer to the owning memory cgroup
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
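
usage_in_excess is derived from the page counter and the soft limit. A hedged sketch of the computation in this kernel series:

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = memcg->soft_limit;
	unsigned long excess = 0;

	// excess stays zero while usage is below the soft limit
	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}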

4.3 mem_cgroup_stat_cpu

struct mem_cgroup_stat_cpu {
	// per-state page usage counts within the cgroup [see 4.4]
	long count[MEMCG_NR_STAT];
	// per-type event counts within the cgroup [see 4.5]
	unsigned long events[MEMCG_NR_EVENTS];
	unsigned long nr_page_events;
	// per-target event counts within the cgroup [see 4.6]
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
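
Updates go to the current CPU's slot without any lock; readers later fold all CPUs together (see the memcg_page_state sketch at the end of this document). A hedged sketch of the update path; the helper name and exact signature vary across nearby kernel versions:

static inline void __mod_memcg_state(struct mem_cgroup *memcg,
				     enum memcg_stat_item idx, int val)
{
	// lockless per-cpu update of this cgroup's statistics
	if (!mem_cgroup_disabled())
		__this_cpu_add(memcg->stat->count[idx], val);
}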

4.4 memcg_stat_item

// memcg-specific page states, appended after the generic node stat items
enum memcg_stat_item {
	// page cache
	MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
	// anonymous memory
	MEMCG_RSS,
	// anonymous huge pages
	MEMCG_RSS_HUGE,
	// swap cache
	MEMCG_SWAP,
	MEMCG_SOCK,
	/* XXX: why are these zone and not node counters? */
	MEMCG_KERNEL_STACK_KB,
	MEMCG_NR_STAT,
};

4.5 memcg_event_item

/* Cgroup-specific events, on top of universal VM events */
enum memcg_event_item {
	MEMCG_LOW = NR_VM_EVENT_ITEMS,
	MEMCG_HIGH,
	MEMCG_MAX,
	MEMCG_OOM,
	MEMCG_NR_EVENTS,
};

4.6 mem_cgroup_events_target

enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
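
nr_page_events counts charge/uncharge activity, and targets[] stores the next event count at which each consumer (thresholds, soft-limit tree, NUMA info) should be refreshed, using wrap-safe arithmetic. A sketch of mem_cgroup_event_ratelimit as it looks in this kernel series (THRESHOLDS_EVENTS_TARGET and friends are small powers of two defined in mm/memcontrol.c):

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}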

5. mem_cgroup_read_u64

// resource types
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

// resource attributes
enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};
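
cftype->private packs the resource type into the high 16 bits and the attribute into the low 16 bits; this is what the MEMFILE_PRIVATE(_MEM, RES_USAGE) initializers in Sections 2 and 3 rely on. The macros from mm/memcontrol.c:

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)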

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	// decode the resource type and pick the matching page counter
	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	case _TCP:
		counter = &memcg->tcpmem;
		break;
	default:
		BUG();
	}

	// decode the resource attribute
	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		// read usage_in_bytes [see 5.1]
		if (counter == &memcg->memory)
			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
		// read memsw.usage_in_bytes
		if (counter == &memcg->memsw)
			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		// read *.limit_in_bytes: counters are kept in pages, so the
		// limit is converted to bytes on the way out
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		// read *.max_usage_in_bytes, the historical maximum usage
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		// read *.failcnt
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		// read the soft limit
		return (u64)memcg->soft_limit * PAGE_SIZE;
	default:
		BUG();
	}
}

5.1 mem_cgroup_usage

static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	unsigned long val = 0;

	// the root cgroup is special: its usage is approximated by summing
	// the statistics of the whole subtree
	if (mem_cgroup_is_root(memcg)) {
		struct mem_cgroup *iter;

		// walk the root cgroup and all of its descendants
		for_each_mem_cgroup_tree(iter, memcg) {
			// per-state page counts of the cgroup [see the
			// memcg_page_state sketch below]
			val += memcg_page_state(iter, MEMCG_CACHE);
			val += memcg_page_state(iter, MEMCG_RSS);
			if (swap)
				val += memcg_page_state(iter, MEMCG_SWAP);
		}
	} else {
		// non-root cgroups read their page counter directly
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	return val;
}

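memcg_page_state, referenced above, folds the per-cpu counters of Section 4.3 into a single value. A hedged sketch matching this kernel series:

static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
					     enum memcg_stat_item idx)
{
	long val = 0;
	int cpu;

	// sum the per-cpu contributions; individual CPUs may hold
	// negative deltas
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);

	// the total can transiently go negative; clamp it for consumers
	if (val < 0)
		val = 0;

	return val;
}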