Linux内存从0到1学习笔记(七,用户空间虚拟内存之三 - 内存映射)
Posted 高桐@BILL
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内存从0到1学习笔记(七,用户空间虚拟内存之三 - 内存映射)相关的知识,希望对你有一定的参考价值。
C库提供了mmap函数来建立映射。在内核一端,提供了两个系统调用mmap和mmap2。通常C标准库只提供一个函数,由应用程序用来创建内存映射,该函数在内部会转换为适合当前体系结构的系统调用mmap或mmap2。可使用munmap系统调用删除映射。
一、内存映射简介
mmap完成的是将物理内存映射到用户态虚拟内存,中间不需要任何的内存拷贝。文件映射实质上就是在内核中创建文件的时候,给文件的struct file_operations挂上一个mmap钩子。内存映射的调用链路如下:
mmap()-->ksys_mmap_pgoff()-->vm_mmap_pgoff()-->do_mmap()-->mmap_region()
首先是系统调用。以arm64架构的实现为例,系统调用函数入口在arch/arm64/kernel/sys.c中,如下:
linux_mainline-5.17.0/arch/arm64/kernel/sys.c
21 SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
22 unsigned long, prot, unsigned long, flags,
23 unsigned long, fd, unsigned long, off)
24
25 if (offset_in_page(off) != 0)
26 return -EINVAL;
27
28 return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
29
addr:表示用户空间传入的地址,用于将映射建立在指定的用户空间地址上;一般情况下传NULL,由内核自行选择合适的映射地址;
len:映射的内存大小,即需要映射的内存长度(以字节为单位)。进入内核之后,该大小会做页面对齐操作(见do_mmap中的PAGE_ALIGN(len)),因为mmap必须以整页为单位进行映射。
prot:表示映射的保护权限,有以下四种权限:
- PROT_EXEC:映射页面可以为可执行的;
- PROT_READ:映射页面是可读的;
- PROT_WRITE:映射页面是可写的;
- PROT_NONE:映射页面是不可访问的;
flags:表示映射的标志位,决定了对映射区域的修改对其他(映射了相同区域的)进程是否可见,以及这些修改是否会写回到底层文件,用得比较多的有MAP_SHARED、MAP_PRIVATE、MAP_HUGETLB,详细含义参考mmap函数帮助说明;
fd:open得到的文件描述符;
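off:表示映射在文件中的起始偏移(以字节为单位),一般情况下,该值为0。需要注意的是,off必须是内存页面大小的整数倍(否则系统调用入口处的offset_in_page(off)检查会返回-EINVAL),如系统采用4K页面,则off的值只能为0、4096、8192...;内核内部会将其右移PAGE_SHIFT位,转换为以页为单位的偏移pgoff。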
return:mmap返回一个指针,指向映射的内存区域,如果映射失败,则返回MAP_FAILED(-1),同时将错误码保存在errno中。
linux_mainline-5.17.0/mm/mmap.c
1590 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
1591 unsigned long prot, unsigned long flags,
1592 unsigned long fd, unsigned long pgoff)
1593
1594 struct file *file = NULL;
1595 unsigned long retval;
1596
1597 if (!(flags & MAP_ANONYMOUS))
1598 audit_mmap_fd(fd, flags);//非匿名映射,即文件映射
1599 file = fget(fd);
1600 if (!file)
1601 return -EBADF;
1602 if (is_file_hugepages(file))
1603 len = ALIGN(len, huge_page_size(hstate_file(file)));//巨页映射
1604 else if (unlikely(flags & MAP_HUGETLB))
1605 retval = -EINVAL;//普通文件(非hugetlbfs文件)却指定了MAP_HUGETLB,参数非法
1606 goto out_fput;
1607
1608 else if (flags & MAP_HUGETLB)
1609 struct hstate *hs;
1610
1611 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1612 if (!hs)
1613 return -EINVAL;
1614
1615 len = ALIGN(len, huge_page_size(hs));
1616 /*
1617 * VM_NORESERVE is used because the reservations will be
1618 * taken when vm_ops->mmap() is called
1619 * A dummy user value is used because we are not locking
1620 * memory so no accounting is necessary
1621 */
1622 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
1623 VM_NORESERVE,
1624 HUGETLB_ANONHUGE_INODE,
1625 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1626 if (IS_ERR(file))
1627 return PTR_ERR(file);
1628
1629
1630 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1631 out_fput:
1632 if (file)
1633 fput(file);
1634 return retval;
1635
linux_mainline-5.17.0/mm/util.c
506 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
507 unsigned long len, unsigned long prot,
508 unsigned long flag, unsigned long pgoff)
509
510 unsigned long ret;
511 struct mm_struct *mm = current->mm;
512 unsigned long populate;
513 LIST_HEAD(uf);
514
515 ret = security_mmap_file(file, prot, flag);//检查映射文件的安全性,需要安全钩子函数支持mmap_file的检查,并通过ima_file_mmap来完成进程文件测量的收集与存储。
516 if (!ret)
517 if (mmap_write_lock_killable(mm))
518 return -EINTR;
519 ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,//执行mmap过程
520 &uf);
521 mmap_write_unlock(mm);
522 userfaultfd_unmap_complete(mm, &uf);
523 if (populate)
524 mm_populate(ret, populate);
525
526 return ret;
527
linux_mainline-5.17.0/mm/mmap.c
1395 static inline bool file_mmap_ok(struct file *file, struct inode *inode,
1396 unsigned long pgoff, unsigned long len)
1397
1398 u64 maxsize = file_mmap_size_max(file, inode);
1399
1400 if (maxsize && len > maxsize)
1401 return false;
1402 maxsize -= len;
1403 if (pgoff > maxsize >> PAGE_SHIFT)
1404 return false;
1405 return true;
1406
1407
1408 /*
1409 * The caller must write-lock current->mm->mmap_lock.
1410 */
1411 unsigned long do_mmap(struct file *file, unsigned long addr,
1412 unsigned long len, unsigned long prot,
1413 unsigned long flags, unsigned long pgoff,
1414 unsigned long *populate, struct list_head *uf)
1415
1416 struct mm_struct *mm = current->mm;
1417 vm_flags_t vm_flags;
1418 int pkey = 0;
1419
1420 *populate = 0;
1421
1422 if (!len)
1423 return -EINVAL;
1424
1425 /*
1426 * Does the application expect PROT_READ to imply PROT_EXEC?
1427 *
1428 * (the exception is when the underlying filesystem is noexec
1429 * mounted, in which case we dont add PROT_EXEC.)
1430 */
1431 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
1432 if (!(file && path_noexec(&file->f_path)))
1433 prot |= PROT_EXEC;
1434
1435 /* force arch specific MAP_FIXED handling in get_unmapped_area */
1436 if (flags & MAP_FIXED_NOREPLACE)
1437 flags |= MAP_FIXED;
1438
1439 if (!(flags & MAP_FIXED))
1440 addr = round_hint_to_min(addr);
1441
1442 /* Careful about overflows.. */
1443 len = PAGE_ALIGN(len);//页面对齐,防止内存溢出
1444 if (!len)
1445 return -ENOMEM;
1446
1447 /* offset overflow? */
1448 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)//防止偏移后内存溢出
1449 return -EOVERFLOW;
1450
1451 /* Too many mappings? */
1452 if (mm->map_count > sysctl_max_map_count)//进程已有的映射区域(VMA)数量不能超过sysctl_max_map_count上限
1453 return -ENOMEM;
1454
1455 /* Obtain the address to map to. we verify (or select) it and ensure
1456 * that it represents a valid section of the address space.
1457 */
1458 addr = get_unmapped_area(file, addr, len, pgoff, flags);//获取没有映射的区域
1459 if (IS_ERR_VALUE(addr))
1460 return addr;
1461
1462 if (flags & MAP_FIXED_NOREPLACE)
1463 if (find_vma_intersection(mm, addr, addr + len))
1464 return -EEXIST;
1465
1466
1467 if (prot == PROT_EXEC)
1468 pkey = execute_only_pkey(mm);
1469 if (pkey < 0)
1470 pkey = 0;
1471
1472
1473 /* Do simple checking here so the lower-level routines won't have
1474 * to. we assume access permissions have been handled by the open
1475 * of the memory object, so we don't do any here.
1476 */
1477 vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
1478 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1479
1480 if (flags & MAP_LOCKED)
1481 if (!can_do_mlock())
1482 return -EPERM;
1483
1484 if (mlock_future_check(mm, vm_flags, len))
1485 return -EAGAIN;
1486
1487 if (file)
1488 struct inode *inode = file_inode(file);
1489 unsigned long flags_mask;
1490
1491 if (!file_mmap_ok(file, inode, pgoff, len))//根据文件节点和文件,判断是否可被映射
1492 return -EOVERFLOW;
1493
1494 flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
1495
1496 switch (flags & MAP_TYPE)
1497 case MAP_SHARED:
1498 /*
1499 * Force use of MAP_SHARED_VALIDATE with non-legacy
1500 * flags. E.g. MAP_SYNC is dangerous to use with
1501 * MAP_SHARED as you don't know which consistency model
1502 * you will get. We silently ignore unsupported flags
1503 * with MAP_SHARED to preserve backward compatibility.
1504 */
1505 flags &= LEGACY_MAP_MASK;
1506 fallthrough;
1507 case MAP_SHARED_VALIDATE:
1508 if (flags & ~flags_mask)
1509 return -EOPNOTSUPP;
1510 if (prot & PROT_WRITE)
1511 if (!(file->f_mode & FMODE_WRITE))
1512 return -EACCES;
1513 if (IS_SWAPFILE(file->f_mapping->host))
1514 return -ETXTBSY;
1515
1516
1517 /*
1518 * Make sure we don't allow writing to an append-only
1519 * file..
1520 */
1521 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
1522 return -EACCES;
1523
1524 vm_flags |= VM_SHARED | VM_MAYSHARE;
1525 if (!(file->f_mode & FMODE_WRITE))
1526 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
1527 fallthrough;
1528 case MAP_PRIVATE:
1529 if (!(file->f_mode & FMODE_READ))
1530 return -EACCES;
1531 if (path_noexec(&file->f_path))
1532 if (vm_flags & VM_EXEC)
1533 return -EPERM;
1534 vm_flags &= ~VM_MAYEXEC;
1535
1536
1537 if (!file->f_op->mmap)
1538 return -ENODEV;
1539 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1540 return -EINVAL;
1541 break;
1542
1543 default:
1544 return -EINVAL;
1545
1546 else
//如果文件对应的file数据结构不存在,则只支持MAP_SHARED和MAP_PRIVATE方式的映射
1547 switch (flags & MAP_TYPE)
1548 case MAP_SHARED:
1549 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1550 return -EINVAL;
1551 /*
1552 * Ignore pgoff.
1553 */
1554 pgoff = 0;
1555 vm_flags |= VM_SHARED | VM_MAYSHARE;
1556 break;
1557 case MAP_PRIVATE:
1558 /*
1559 * Set pgoff according to addr for anon_vma.
1560 */
1561 pgoff = addr >> PAGE_SHIFT;
1562 break;
1563 default:
1564 return -EINVAL;
1565
1566
1567
1568 /*
1569 * Set 'VM_NORESERVE' if we should not account for the
1570 * memory use of this mapping.
1571 */
1572 if (flags & MAP_NORESERVE)
1573 /* We honor MAP_NORESERVE if allowed to overcommit */
1574 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1575 vm_flags |= VM_NORESERVE;
1576
1577 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1578 if (file && is_file_hugepages(file))
1579 vm_flags |= VM_NORESERVE;
1580
1581
1582 addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);//这里实现了内存区域的映射
1583 if (!IS_ERR_VALUE(addr) &&
1584 ((vm_flags & VM_LOCKED) ||
1585 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
1586 *populate = len;
1587 return addr;
1588
linux_mainline-5.17.0/mm/mmap.c
//该函数主要是创建映射区域对应的用户态虚拟内存空间,即创建一个struct vm_area_struct变量,存放映射区域的首地址、映射长度、映射标志位等,同时在映射文件存在struct file数据的情况下,还会找到文件挂的mmap钩子函数,实现自定义的映射过程,这里就可以将内核中创建的内存对应的物理内存映射到用户空间。
1722 unsigned long mmap_region(struct file *file, unsigned long addr,
1723 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
1724 struct list_head *uf)
1725
1726 struct mm_struct *mm = current->mm;
1727 struct vm_area_struct *vma, *prev, *merge;
1728 int error;
1729 struct rb_node **rb_link, *rb_parent;
1730 unsigned long charged = 0;
1731
1732 /* Check against address space limit. */
1733 if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
1734 unsigned long nr_pages;
1735
1736 /*
1737 * MAP_FIXED may remove pages of mappings that intersects with
1738 * requested mapping. Account for the pages it would unmap.
1739 */
1740 nr_pages = count_vma_pages_range(mm, addr, addr + len);
1741
1742 if (!may_expand_vm(mm, vm_flags,
1743 (len >> PAGE_SHIFT) - nr_pages))
1744 return -ENOMEM;
1745
1746
1747 /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
//清除旧映射, 设置prev、rb_link、rb_parent以及uf
1748 if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
1749 return -ENOMEM;
1750 /*
1751 * Private writable mapping: check memory availability
1752 */
1753 if (accountable_mapping(file, vm_flags))
1754 charged = len >> PAGE_SHIFT;
1755 if (security_vm_enough_memory_mm(mm, charged))
1756 return -ENOMEM;
1757 vm_flags |= VM_ACCOUNT;
1758
1759
1760 /*
1761 * Can we just expand an old mapping?
1762 */
//判断新区域能否与相邻的已有映射(VMA)合并,即直接扩展旧的映射,而不必新建VMA
1763 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1764 NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
1765 if (vma)
1766 goto out;
1767
1768 /*
1769 * Determine the object being mapped and call the appropriate
1770 * specific mapper. the address has already been validated, but
1771 * not unmapped, but the maps are removed from the list.
1772 */
1773 vma = vm_area_alloc(mm);
1774 if (!vma)
1775 error = -ENOMEM;
1776 goto unacct_error;
1777
1778
1779 vma->vm_start = addr;
1780 vma->vm_end = addr + len;
1781 vma->vm_flags = vm_flags;
1782 vma->vm_page_prot = vm_get_page_prot(vm_flags);
1783 vma->vm_pgoff = pgoff;
1784
1785 if (file)
1786 if (vm_flags & VM_SHARED)
1787 error = mapping_map_writable(file->f_mapping);
1788 if (error)
1789 goto free_vma;
1790
1791
1792 vma->vm_file = get_file(file);//增加file的引用计数,并将其记录到vma->vm_file
//调用文件挂的mmap钩子,到这里就会进入到struct file_operations数据结构中的mmap钩子,完成自定义的映射过程,具体实现参考用例
1793 error = call_mmap(file, vma);
1794 if (error)
1795 goto unmap_and_free_vma;
1796
1797 /* Can addr have changed??
1798 *
1799 * Answer: Yes, several device drivers can do it in their
1800 * f_op->mmap method. -DaveM
1801 * Bug: If addr is changed, prev, rb_link, rb_parent should
1802 * be updated for vma_link()
1803 */
1804 WARN_ON_ONCE(addr != vma->vm_start);
1805
1806 addr = vma->vm_start;
1807
1808 /* If vm_flags changed after call_mmap(), we should try merge vma again
1809 * as we may succeed this time.
1810 */
1811 if (unlikely(vm_flags != vma->vm_flags && prev))
1812 merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
1813 NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
1814 if (merge)
1815 /* ->mmap() can change vma->vm_file and fput the original file. So
1816 * fput the vma->vm_file here or we would add an extra fput for file
1817 * and cause general protection fault ultimately.
1818 */
1819 fput(vma->vm_file);
1820 vm_area_free(vma);
1821 vma = merge;
1822 /* Update vm_flags to pick up the change. */
1823 vm_flags = vma->vm_flags;
1824 goto unmap_writable;
1825
1826
1827
1828 vm_flags = vma->vm_flags;
1829 else if (vm_flags & VM_SHARED)
1830 error = shmem_zero_setup(vma);
1831 if (error)
1832 goto free_vma;
1833 else
1834 vma_set_anonymous(vma);
1835
1836
1837 /* Allow architectures to sanity-check the vm_flags */
1838 if (!arch_validate_flags(vma->vm_flags))
1839 error = -EINVAL;
1840 if (file)
1841 goto unmap_and_free_vma;
1842 else
1843 goto free_vma;
1844
1845
1846 vma_link(mm, vma, prev, rb_link, rb_parent);
1847 /* Once vma denies write, undo our temporary denial count */
1848 unmap_writable:
1849 if (file && vm_flags & VM_SHARED)
1850 mapping_unmap_writable(file->f_mapping);
1851 file = vma->vm_file;
1852 out:
1853 perf_event_mmap(vma);
1854
1855 vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
1856 if (vm_flags & VM_LOCKED)
1857 if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
1858 is_vm_hugetlb_page(vma) ||
1859 vma == get_gate_vma(current->mm))
1860 vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
1861 else
1862 mm->locked_vm += (len >> PAGE_SHIFT);
1863
1864
1865 if (file)
1866 uprobe_mmap(vma);
1867
1868 /*
1869 * New (or expanded) vma always get soft dirty status.
1870 * Otherwise user-space soft-dirty page tracker won't
1871 * be able to distinguish situation when vma area unmapped,
1872 * then new mapped in-place (which must be aimed as
1873 * a completely new data area).
1874 */
1875 vma->vm_flags |= VM_SOFTDIRTY;
1876
1877 vma_set_page_prot(vma);
1878
1879 return addr;
1880
1881 unmap_and_free_vma:
1882 fput(vma->vm_file);
1883 vma->vm_file = NULL;
1884
1885 /* Undo any partial mapping done by a device driver. */
1886 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1887 charged = 0;
1888 if (vm_flags & VM_SHARED)
1889 mapping_unmap_writable(file->f_mapping);
1890 free_vma:
1891 vm_area_free(vma);
1892 unacct_error:
1893 if (charged)
1894 vm_unacct_memory(charged);
1895 return error;
1896
1897
mmap映射可以按是否和文件关联来分类:当file参数为空时就表示不关联文件,当file数据存在时,就表示关联上了文件。由此可以将Linux内核中的映射分为匿名映射(不关联文件的映射)和文件映射(需要文件的内核数据结构struct file_operations挂上mmap钩子)。
二、mmap映射类型
2.1 匿名映射
2.1.1 私有匿名映射
私有匿名映射通常用于内存分配。当调用mmap时传入的fd为-1,且flags为MAP_ANONYMOUS | MAP_PRIVATE时,创建的mmap映射就是私有匿名映射。私有匿名映射最常见的用途是glibc分配大块内存:通常情况下,malloc先在堆中已有的空闲内存里查找可用的部分(该部分不一定是连续的),当不够的情况下,会通过brk系统调用扩展堆来分配剩余的部分;当分配的内存大于128KB(即MMAP_THRESHOLD的默认值)时,glibc会使用mmap代替默认的brk来分配内存。需要注意的是,无论是通过brk还是通过mmap,分配刚完成时用户态得到的都只是虚拟内存,还不存在对应的物理内存;只有在第一次读写触发缺页异常之后,内核才会通过伙伴系统分配对应的物理页面(除非使用了MAP_POPULATE、MAP_LOCKED等标志要求预先填充,参见do_mmap末尾对*populate的设置)。
2.1.2 共享匿名映射
共享匿名映射通常用于进程间共享内存。当调用mmap时传入的fd为-1,且flags为MAP_ANONYMOUS | MAP_SHARED时,创建的mmap映射就是共享匿名映射。共享匿名映射能够让相关进程共享一块内存,通常用于父子进程间通信,创建共享匿名映射的方式有以下两种:
- fd=-1 且 flags= MAP_ANONYMOUS|MAP_SHARED。在这种情况下,do_mmap()->mmap_region() 函数最终调用 shmem_zero_setup() 来创建一个基于shmem的特殊文件,其效果等同于映射 "/dev/zero" 特殊的设备文件。
- 直接打开 "/dev/zero" 设备文件,然后使用这个文件句柄来创建 mmap。
2.2 文件映射
2.2.1 私有文件映射
创建文件映射时 flags 的标志位被设置为 MAP_PRIVATE,那么就会创建私有文件映射。私有文件映射的最常用的场景是加载动态共享库。
2.2.2 共享文件映射
创建文件映射时 flags 的标志位被设置为 MAP_SHARED,那么就会创建共享文件映射。如果 prot 参数指定了 PROT_WRITE,那么打开文件时需要指定 O_RDWR 标志位(否则do_mmap中对FMODE_WRITE的检查会返回-EACCES)。
共享文件映射通常有如下场景:
- 读写文件:把文件内容映射到进程地址空间,同时对映射的内容做了修改,内核的回写机制(writeback)最终会把修改的内容同步到磁盘中。
- 进程间通信:进程之间的进程地址空间相互隔离,一个进程不能访问到另外一个进程的地址空间。如果多个进程都同时映射到一个相同的文件,就实现了多进程间的共享内存的通信。如果一个进程对映射内容做了修改,那么另外的进程是可以看到的。
以上是关于Linux内存从0到1学习笔记(七,用户空间虚拟内存之三 - 内存映射)的主要内容,如果未能解决你的问题,请参考以下文章
Linux内存从0到1学习笔记(6.10 物理内存初始化之vmalloc分配器)