linux源码解析11–缺页异常之swap缺页异常

Posted 2023-02-16

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了linux源码解析11–缺页异常之swap缺页异常相关的知识，希望对你有一定的参考价值。

接上篇 https://www.daodaodao123.com/?p=776

本篇解析缺页异常分支之一，swap缺页异常；

1.swap分区的来由

当系统内存不足时，首先回收page cache页面，仍然不足时，继续回收匿名页面，但是匿名页面没有对应文件，因此建立一个swap文件，来临时存储匿名页面，这时匿名页面可以被回收掉，当再次读取匿名页内容时，触发缺页中断，从swap文件读取恢复。

2.swap缺页异常触发条件

pte表项不为空，且pte页表项的PRESENT位没有置位

3.应用场景

系统内存不足，匿名页/ipc共享内存页/tmpfs页被换出，再次访问时发生swap缺页异常。

4.swap相关概念和编码规则

概念：

换出页标识符：当将一个物理页面换出到交换区/交换文件时，需要通过反向映射找到共享这个页的所有页表项，并将它们改写为该页在交换区/交换文件中的位置，填写的内容即为换出页标识符。

swap cache: 类似page cache,为了解决多重换入和换出时的查找问题，每次换入必须先查找swap cache, 不存在再从交换区换入；换出时先加入swap cache，回写完成后释放；换入时先加入swap cache，所有共享页的vma都换入后释放（没有swap cache无法判断换入的页是否在内存）。

页槽：交换区分为连续的槽（slot）,每个槽位长度为页大小，用于存放换出的物理页。

交换区索引：表明页在哪个交换区。

页槽索引：表明页在哪个页槽。

槽位计数：换出页进程的数目，当计数为0时释放页槽。

换出页标识符不为0：原因是即使交换区索引为0，但是页槽索引不为0（0页槽存放交换区信息），从1开始。

含义：

present	是否在内存 <br>（设置换出页标识符时必须为0）
swap type	交换区索引
swap offset	页槽索引
PTE_PROT_NONE	属性是否为空 <br>（软件bit ，此位为1 表示表项属性为空，设置换出页标识符时必须为0）

物理页是否存在内存：

/*
 * PTE bits consulted when deciding whether the page is in memory.
 * PTE_VALID is bit 0 (the "present" bit); PTE_PROT_NONE is a software bit
 * at position 58 that is only meaningful while PTE_VALID is clear.
 * Both must be 0 in a PTE that holds a swap-out identifier.
 */
#define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */
#define PTE_VALID (_AT(pteval_t, 1) << 0)

5.swap缺页异常处理过程：

1.存在映射物理页的vma;

2.系统内存不足时，页面回收算法换出匿名页： (1)分配交换空间，并加入到swap_cache，保存换出页标识符到page->private； (2)反向映射查找，设置pte为换出页标识符； (3)pfn1换出到交换区； (4)释放pfn1给伙伴系统；

3.访问该匿名页

*p = 0x55;

4.触发缺页异常；

5.缺页异常处理： (1)根据pte中的换出页标识符, 从swap cache中查找页； (2)没找到就分配物理页并加入swap cache; (3)根据pte中的换出页标识符从交换分区n的页槽m换入数据到pfn2; (4)虚拟页和pfn2建立映射关系

6.异常返回，继续执行；

*p = 0x55;

源码解析

vm_fault_t do_swap_page(struct vm_fault *vmf)

	struct vm_area_struct *vma = vmf->vma;
	struct page *page = NULL, *swapcache;
	swp_entry_t entry;
	pte_t pte;
	int locked;
	int exclusive = 0;
	vm_fault_t ret = 0;
	void *shadow = NULL;

	if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
		goto out;

	entry = pte_to_swp_entry(vmf->orig_pte);  ///获取换出页标识符
	if (unlikely(non_swap_entry(entry)))    ///非换出页标识符，处理迁移页面，复用swap机制
		if (is_migration_entry(entry)) 
			migration_entry_wait(vma->vm_mm, vmf->pmd,
					     vmf->address);
		 else if (is_device_private_entry(entry)) 
			vmf->page = device_private_entry_to_page(entry);
			ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
		 else if (is_hwpoison_entry(entry)) 
			ret = VM_FAULT_HWPOISON;
		 else 
			print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
			ret = VM_FAULT_SIGBUS;
		
		goto out;
	


	delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
	page = lookup_swap_cache(entry, vma, vmf->address);  ///在swap_cache查找
	swapcache = page;

	if (!page)   ///swap_cache没找到，新分配page，并加入swap_page
		struct swap_info_struct *si = swp_swap_info(entry);

		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
		    __swap_count(entry) == 1)  ///需要启动慢速IO操作，此时根据局部性原理，还做预取动作来优化性能
			/* skip swapcache */
			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,    ///分配page
							vmf->address);
			if (page) 
				__SetPageLocked(page);
				__SetPageSwapBacked(page);

				if (mem_cgroup_swapin_charge_page(page,
					vma->vm_mm, GFP_KERNEL, entry)) 
					ret = VM_FAULT_OOM;
					goto out_page;
				
				mem_cgroup_swapin_uncharge_swap(entry);

				shadow = get_shadow_from_swap_cache(entry);
				if (shadow)
					workingset_refault(page, shadow);

				lru_cache_add(page);   						///page加入swap_cache

				/* To provide entry to swap_readpage() */
				set_page_private(page, entry.val);
				swap_readpage(page, true);                 ///从swap文件读取数据到page
				set_page_private(page, 0);
			
		 else 
			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,  ///从swap文件读取数据到page
						vmf);
			swapcache = page;
		

		if (!page) 
			/*
			 * Back out if somebody else faulted in this pte
			 * while we released the pte lock.
			 */
			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
					vmf->address, &vmf->ptl);
			if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
				ret = VM_FAULT_OOM;
			delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
			goto unlock;
		

		/* Had to read the page from swap area: Major fault */
		ret = VM_FAULT_MAJOR;  ///需要启动慢速IO操作，标记为主缺页
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
	 else if (PageHWPoison(page)) 
		/*
		 * hwpoisoned dirty swapcache pages are kept for killing
		 * owner processes (which may be unknown at hwpoison time)
		 */
		ret = VM_FAULT_HWPOISON;
		delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
		goto out_release;
	

	locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);

	delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
	if (!locked) 
		ret |= VM_FAULT_RETRY;
		goto out_release;
	

	/*
	 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
	 * release the swapcache from under us.  The page pin, and pte_same
	 * test below, are not enough to exclude that.  Even if it is still
	 * swapcache, we need to check that the pages swap has not changed.
	 */
	if (unlikely((!PageSwapCache(page) ||
			page_private(page) != entry.val)) && swapcache)
		goto out_page;

	page = ksm_might_need_to_copy(page, vma, vmf->address);
	if (unlikely(!page)) 
		ret = VM_FAULT_OOM;
		page = swapcache;
		goto out_page;
	

	cgroup_throttle_swaprate(page, GFP_KERNEL);

	/*
	 * Back out if somebody else already faulted in this pte.
	 */
///重新获取页表项
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
			&vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;

	if (unlikely(!PageUptodate(page))) 
		ret = VM_FAULT_SIGBUS;
		goto out_nomap;
	

	/*
	 * The page isnt present yet, go ahead with the fault.
	 *
	 * Be careful about the sequence of operations here.
	 * To get its accounting right, reuse_swap_page() must be called
	 * while the page is counted on swap but not yet in mapcount i.e.
	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
	 * must be called after the swap_free(), or it will never succeed.
	 */

	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);   ///匿页也计数增加
	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);    ///swap页面技术减少
	pte = mk_pte(page, vma->vm_page_prot);           ///拼接页表项
	if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL))  ///reuse_swap_page，只被当前vma使用，直接改为可写，不做写时复制
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
		vmf->flags &= ~FAULT_FLAG_WRITE;
		ret |= VM_FAULT_WRITE;
		exclusive = RMAP_EXCLUSIVE;
	
	flush_icache_page(vma, page);
	if (pte_swp_soft_dirty(vmf->orig_pte))
		pte = pte_mksoft_dirty(pte);
	if (pte_swp_uffd_wp(vmf->orig_pte)) 
		pte = pte_mkuffd_wp(pte);
		pte = pte_wrprotect(pte);
	
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);   ///填充页表
	arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
	vmf->orig_pte = pte;

	/* ksm created a completely new copy */
	if (unlikely(page != swapcache && swapcache)) 
		page_add_new_anon_rmap(page, vma, vmf->address, false);
		lru_cache_add_inactive_or_unevictable(page, vma);
	 else 
		do_page_add_anon_rmap(page, vma, vmf->address, exclusive);  ///加入rmap
	

	swap_free(entry); ///递减交换页槽的引用计数

///mem_cgroup_swap_full：交换页槽使用超过总数的1/2，或者vma被锁内存，尝试释放swap页面
	if (mem_cgroup_swap_full(page) ||
	    (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);   ///引用计数为0，尝试释放swap cache
	unlock_page(page);
	if (page != swapcache && swapcache) 
		/*
		 * Hold the lock to avoid the swap entry to be reused
		 * until we take the PT lock for the pte_same() check
		 * (to avoid false positives from pte_same). For
		 * further safety release the lock after the swap_free
		 * so that the swap count wont change under a
		 * parallel locked swapcache.
		 */
		unlock_page(swapcache);
		put_page(swapcache);
	

	if (vmf->flags & FAULT_FLAG_WRITE)    ///处理私有匿名页
		ret |= do_wp_page(vmf);            ///写时复制
		if (ret & VM_FAULT_ERROR)
			ret &= VM_FAULT_ERROR;
		goto out;
	

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	return ret;
out_nomap:
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
	unlock_page(page);
out_release:
	put_page(page);
	if (page != swapcache && swapcache) 
		unlock_page(swapcache);
		put_page(swapcache);
	
	return ret;

swap缺页异常补充

(1)再次发生写时复制的情况：当内存不足时，有可能换出的页是写时复制的页（多个vma通过页表以只读的方式共享私有可写页面），当再次写访问时发生swap缺页，这个时候换入之后通过do_wp_page处理写时复制，当然这里还会处理只有一个vma映射这个页面的reuse情形（通过reuse_swap_page）。

(2)复用swap机制的场景：最常见的场景是页面迁移机制。在迁移过程中，先将迁移页面对应的所有页表项修改为迁移描述符，然后进行迁移操作；迁移过程中，如果有进程访问该页面就会发生swap缺页，缺页处理中判断出是迁移描述符，则做睡眠等待处理。

(3)swap缺页预读：做换入操作时，如果swap cache中没有请求的页面，就需要从swap区中读取，需要做慢速的IO操作；根据程序局部性原理，缺页附近的一些页面很有可能马上被访问，为了提高性能会在换入时预读一些页面。

以上是关于linux源码解析11–缺页异常之swap缺页异常的主要内容，如果未能解决你的问题，请参考以下文章