Linux内存从0到1学习笔记(七,用户空间虚拟内存之二 - 内存空间的建立)

Posted 高桐@BILL

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内存从0到1学习笔记(七,用户空间虚拟内存之二 - 内存空间的建立)相关的知识,希望对你有一定的参考价值。

在使用load_elf_binary装载一个ELF二进制文件时,将创建进程的地址空间。Linux下的exec系统调用通过该函数来加载ELF文件。

linux_mainline-5.17.0/fs/binfmt_elf.c
823  static int load_elf_binary(struct linux_binprm *bprm)
824  
825  	struct file *interpreter = NULL; /* to shut gcc up */
826   	unsigned long load_addr = 0, load_bias = 0;
827  	int load_addr_set = 0;
828  	unsigned long error;
829  	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
830  	struct elf_phdr *elf_property_phdata = NULL;
831  	unsigned long elf_bss, elf_brk;
832  	int bss_prot = 0;
833  	int retval, i;
834  	unsigned long elf_entry;
835  	unsigned long e_entry;
836  	unsigned long interp_load_addr = 0;
837  	unsigned long start_code, end_code, start_data, end_data;
838  	unsigned long reloc_func_desc __maybe_unused = 0;
839  	int executable_stack = EXSTACK_DEFAULT;
840  	struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
841  	struct elfhdr *interp_elf_ex = NULL;
842  	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
843  	struct mm_struct *mm;
844  	struct pt_regs *regs;
845  
846  	retval = -ENOEXEC;
847  	/* First of all, some simple consistency checks */
848  	if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
849  		goto out;
850  
851  	if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
852  		goto out;
853  	if (!elf_check_arch(elf_ex))
854  		goto out;
855  	if (elf_check_fdpic(elf_ex))
856  		goto out;
857  	if (!bprm->file->f_op->mmap)
858  		goto out;
859  
860  	elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
861  	if (!elf_phdata)
862  		goto out;
863  
864  	elf_ppnt = elf_phdata;
865  	for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) 
866  		char *elf_interpreter;
867  
868  		if (elf_ppnt->p_type == PT_GNU_PROPERTY) 
869  			elf_property_phdata = elf_ppnt;
870  			continue;
871  		
872  
873  		if (elf_ppnt->p_type != PT_INTERP)
874  			continue;
875  
876  		/*
877  		 * This is the program interpreter used for shared libraries -
878  		 * for now assume that this is an a.out format binary.
879  		 */
880  		retval = -ENOEXEC;
881  		if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
882  			goto out_free_ph;
883  
884  		retval = -ENOMEM;
885  		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
886  		if (!elf_interpreter)
887  			goto out_free_ph;
888  
889  		retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
890  				  elf_ppnt->p_offset);
891  		if (retval < 0)
892  			goto out_free_interp;
893  		/* make sure path is NULL terminated */
894  		retval = -ENOEXEC;
895  		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
896  			goto out_free_interp;
897  
898  		interpreter = open_exec(elf_interpreter);
899  		kfree(elf_interpreter);
900  		retval = PTR_ERR(interpreter);
901  		if (IS_ERR(interpreter))
902  			goto out_free_ph;
903  
904  		/*
905  		 * If the binary is not readable then enforce mm->dumpable = 0
906  		 * regardless of the interpreter's permissions.
907  		 */
908  		would_dump(bprm, interpreter);
909  
910  		interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
911  		if (!interp_elf_ex) 
912  			retval = -ENOMEM;
913  			goto out_free_ph;
914  		
915  
916  		/* Get the exec headers */
917  		retval = elf_read(interpreter, interp_elf_ex,
918  				  sizeof(*interp_elf_ex), 0);
919  		if (retval < 0)
920  			goto out_free_dentry;
921  
922  		break;
923  
924  out_free_interp:
925  		kfree(elf_interpreter);
926  		goto out_free_ph;
927  	
928  
929  	elf_ppnt = elf_phdata;
930  	for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
931  		switch (elf_ppnt->p_type) 
932  		case PT_GNU_STACK:
933  			if (elf_ppnt->p_flags & PF_X)
934  				executable_stack = EXSTACK_ENABLE_X;
935  			else
936  				executable_stack = EXSTACK_DISABLE_X;
937  			break;
938  
939  		case PT_LOPROC ... PT_HIPROC:
940  			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
941  						  bprm->file, false,
942  						  &arch_state);
943  			if (retval)
944  				goto out_free_dentry;
945  			break;
946  		
947  
948  	/* Some simple consistency checks for the interpreter */
949  	if (interpreter) 
950  		retval = -ELIBBAD;
951  		/* Not an ELF interpreter */
952  		if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
953  			goto out_free_dentry;
954  		/* Verify the interpreter has a valid arch */
955  		if (!elf_check_arch(interp_elf_ex) ||
956  		    elf_check_fdpic(interp_elf_ex))
957  			goto out_free_dentry;
958  
959  		/* Load the interpreter program headers */
960  		interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
961  						   interpreter);
962  		if (!interp_elf_phdata)
963  			goto out_free_dentry;
964  
965  		/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
966  		elf_property_phdata = NULL;
967  		elf_ppnt = interp_elf_phdata;
968  		for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
969  			switch (elf_ppnt->p_type) 
970  			case PT_GNU_PROPERTY:
971  				elf_property_phdata = elf_ppnt;
972  				break;
973  
974  			case PT_LOPROC ... PT_HIPROC:
975  				retval = arch_elf_pt_proc(interp_elf_ex,
976  							  elf_ppnt, interpreter,
977  							  true, &arch_state);
978  				if (retval)
979  					goto out_free_dentry;
980  				break;
981  			
982  	
983  
984  	retval = parse_elf_properties(interpreter ?: bprm->file,
985  				      elf_property_phdata, &arch_state);
986  	if (retval)
987  		goto out_free_dentry;
988  
989  	/*
990  	 * Allow arch code to reject the ELF at this point, whilst it's
991  	 * still possible to return an error to the code that invoked
992  	 * the exec syscall.
993  	 */
994  	retval = arch_check_elf(elf_ex,
995  				!!interpreter, interp_elf_ex,
996  				&arch_state);
997  	if (retval)
998  		goto out_free_dentry;
999  
1000  	/* Flush all traces of the currently running executable */
1001  	retval = begin_new_exec(bprm);
1002  	if (retval)
1003  		goto out_free_dentry;
1004  
1005  	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
1006  	   may depend on the personality.  */
1007  	SET_PERSONALITY2(*elf_ex, &arch_state);
1008  	if (elf_read_implies_exec(*elf_ex, executable_stack))
1009  		current->personality |= READ_IMPLIES_EXEC;
1010  
1011  	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
1012  		current->flags |= PF_RANDOMIZE;
1013  
1014  	setup_new_exec(bprm);
1015  
1016  	/* Do this so that we can load the interpreter, if need be.  We will
1017  	   change some of these later */
1018  	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
1019  				 executable_stack);
1020  	if (retval < 0)
1021  		goto out_free_dentry;
1022  
1023  	elf_bss = 0;
1024  	elf_brk = 0;
1025  
1026  	start_code = ~0UL;
1027  	end_code = 0;
1028  	start_data = 0;
1029  	end_data = 0;
1030  
1031  	/* Now we do a little grungy work by mmapping the ELF image into
1032  	   the correct location in memory. */
1033  	for(i = 0, elf_ppnt = elf_phdata;
1034  	    i < elf_ex->e_phnum; i++, elf_ppnt++) 
1035  		int elf_prot, elf_flags;
1036  		unsigned long k, vaddr;
1037  		unsigned long total_size = 0;
1038  		unsigned long alignment;
1039  
1040  		if (elf_ppnt->p_type != PT_LOAD)
1041  			continue;
1042  
1043  		if (unlikely (elf_brk > elf_bss)) 
1044  			unsigned long nbyte;
1045  
1046  			/* There was a PT_LOAD segment with p_memsz > p_filesz
1047  			   before this one. Map anonymous pages, if needed,
1048  			   and clear the area.  */
1049  			retval = set_brk(elf_bss + load_bias,
1050  					 elf_brk + load_bias,
1051  					 bss_prot);
1052  			if (retval)
1053  				goto out_free_dentry;
1054  			nbyte = ELF_PAGEOFFSET(elf_bss);
1055  			if (nbyte) 
1056  				nbyte = ELF_MIN_ALIGN - nbyte;
1057  				if (nbyte > elf_brk - elf_bss)
1058  					nbyte = elf_brk - elf_bss;
1059  				if (clear_user((void __user *)elf_bss +
1060  							load_bias, nbyte)) 
1061  					/*
1062  					 * This bss-zeroing can fail if the ELF
1063  					 * file specifies odd protections. So
1064  					 * we don't check the return value
1065  					 */
1066  				
1067  			
1068  		
1069  
1070  		elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
1071  				     !!interpreter, false);
1072  
1073  		elf_flags = MAP_PRIVATE;
1074  
1075  		vaddr = elf_ppnt->p_vaddr;
1076  		/*
1077  		 * The first time through the loop, load_addr_set is false:
1078  		 * layout will be calculated. Once set, use MAP_FIXED since
1079  		 * we know we've already safely mapped the entire region with
1080  		 * MAP_FIXED_NOREPLACE in the once-per-binary logic following.
1081  		 */
1082  		if (load_addr_set) 
1083  			elf_flags |= MAP_FIXED;
1084  		 else if (elf_ex->e_type == ET_EXEC) 
1085  			/*
1086  			 * This logic is run once for the first LOAD Program
1087  			 * Header for ET_EXEC binaries. No special handling
1088  			 * is needed.
1089  			 */
1090  			elf_flags |= MAP_FIXED_NOREPLACE;
1091  		 else if (elf_ex->e_type == ET_DYN) 
1092  			/*
1093  			 * This logic is run once for the first LOAD Program
1094  			 * Header for ET_DYN binaries to calculate the
1095  			 * randomization (load_bias) for all the LOAD
1096  			 * Program Headers.
1097  			 *
1098  			 * There are effectively two types of ET_DYN
1099  			 * binaries: programs (i.e. PIE: ET_DYN with INTERP)
1100  			 * and loaders (ET_DYN without INTERP, since they
1101  			 * _are_ the ELF interpreter). The loaders must
1102  			 * be loaded away from programs since the program
1103  			 * may otherwise collide with the loader (especially
1104  			 * for ET_EXEC which does not have a randomized
1105  			 * position). For example to handle invocations of
1106  			 * "./ld.so someprog" to test out a new version of
1107  			 * the loader, the subsequent program that the
1108  			 * loader loads must avoid the loader itself, so
1109  			 * they cannot share the same load range. Sufficient
1110  			 * room for the brk must be allocated with the
1111  			 * loader as well, since brk must be available with
1112  			 * the loader.
1113  			 *
1114  			 * Therefore, programs are loaded offset from
1115  			 * ELF_ET_DYN_BASE and loaders are loaded into the
1116  			 * independently randomized mmap region (0 load_bias
1117  			 * without MAP_FIXED nor MAP_FIXED_NOREPLACE).
1118  			 */
1119  			alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
1120  			if (interpreter || alignment > ELF_MIN_ALIGN) 
1121  				load_bias = ELF_ET_DYN_BASE;
1122  				if (current->flags & PF_RANDOMIZE)
1123  					load_bias += arch_mmap_rnd();
1124  				if (alignment)
1125  					load_bias &= ~(alignment - 1);
1126  				elf_flags |= MAP_FIXED_NOREPLACE;
1127  			 else
1128  				load_bias = 0;
1129  
1130  			/*
1131  			 * Since load_bias is used for all subsequent loading
1132  			 * calculations, we must lower it by the first vaddr
1133  			 * so that the remaining calculations based on the
1134  			 * ELF vaddrs will be correctly offset. The result
1135  			 * is then page aligned.
1136  			 */
1137  			load_bias = ELF_PAGESTART(load_bias - vaddr);
1138  		
1139  
1140  		/*
1141  		 * Calculate the entire size of the ELF mapping (total_size).
1142  		 * (Note that load_addr_set is set to true later once the
1143  		 * initial mapping is performed.)
1144  		 */
1145  		if (!load_addr_set) 
1146  			total_size = total_mapping_size(elf_phdata,
1147  							elf_ex->e_phnum);
1148  			if (!total_size) 
1149  				retval = -EINVAL;
1150  				goto out_free_dentry;
1151  			
1152  		
1153  
1154  		error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
1155  				elf_prot, elf_flags, total_size);
1156  		if (BAD_ADDR(error)) 
1157  			retval = IS_ERR((void *)error) ?
1158  				PTR_ERR((void*)error) : -EINVAL;
1159  			goto out_free_dentry;
1160  		
1161  
1162  		if (!load_addr_set) 
1163  			load_addr_set = 1;
1164  			load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
1165  			if (elf_ex->e_type == ET_DYN) 
1166  				load_bias += error -
1167  				             ELF_PAGESTART(load_bias + vaddr);
1168  				load_addr += load_bias;
1169  				reloc_func_desc = load_bias;
1170  			
1171  		
1172  		k = elf_ppnt->p_vaddr;
1173  		if ((elf_ppnt->p_flags & PF_X) && k < start_code)
1174  			start_code = k;
1175  		if (start_data < k)
1176  			start_data = k;
1177  
1178  		/*
1179  		 * Check to see if the section's size will overflow the
1180  		 * allowed task size. Note that p_filesz must always be
1181  		 * <= p_memsz so it is only necessary to check p_memsz.
1182  		 */
1183  		if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
1184  		    elf_ppnt->p_memsz > TASK_SIZE ||
1185  		    TASK_SIZE - elf_ppnt->p_memsz < k) 
1186  			/* set_brk can never work. Avoid overflows. */
1187  			retval = -EINVAL;
1188  			goto out_free_dentry;
1189  		
1190  
1191  		k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
1192  
1193  		if (k > elf_bss)
1194  			elf_bss = k;
1195  		if ((elf_ppnt->p_flags & PF_X) && end_code < k)
1196  			end_code = k;
1197  		if (end_data < k)
1198  			end_data = k;
1199  		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
1200  		if (k > elf_brk) 
1201  			bss_prot = elf_prot;
1202  			elf_brk = k;
1203  		
1204  	
1205  
1206  	e_entry = elf_ex->e_entry + load_bias;
1207  	elf_bss += load_bias;
1208  	elf_brk += load_bias;
1209  	start_code += load_bias;
1210  	end_code += load_bias;
1211  	start_data += load_bias;
1212  	end_data += load_bias;
1213  
1214  	/* Calling set_brk effectively mmaps the pages that we need
1215  	 * for the bss and break sections.  We must do this before
1216  	 * mapping in the interpreter, to make sure it doesn't wind
1217  	 * up getting placed where the bss needs to go.
1218  	 */
1219  	retval = set_brk(elf_bss, elf_brk, bss_prot);
1220  	if (retval)
1221  		goto out_free_dentry;
1222  	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) 
1223  		retval = -EFAULT; /* Nobody gets to see this, but.. */
1224  		goto out_free_dentry;
1225  	
1226  
1227  	if (interpreter) 
1228  		elf_entry = load_elf_interp(interp_elf_ex,
1229  					    interpreter,
1230  					    load_bias, interp_elf_phdata,
1231  					    &arch_state);
1232  		if (!IS_ERR((void *)elf_entry)) 
1233  			/*
1234  			 * load_elf_interp() returns relocation
1235  			 * adjustment
1236  			 */
1237  			interp_load_addr = elf_entry;
1238  			elf_entry += interp_elf_ex->e_entry;
1239  		
1240  		if (BAD_ADDR(elf_entry)) 
1241  			retval = IS_ERR((void *)elf_entry) ?
1242  					(int)elf_entry : -EINVAL;
1243  			goto out_free_dentry;
1244  		
1245  		reloc_func_desc = interp_load_addr;
1246  
1247  		allow_write_access(interpreter);
1248  		fput(interpreter);
1249  
1250  		kfree(interp_elf_ex);
1251  		kfree(interp_elf_phdata);
1252  	 else 
1253  		elf_entry = e_entry;
1254  		if (BAD_ADDR(elf_entry)) 
1255  			retval = -EINVAL;
1256  			goto out_free_dentry;
1257  		
1258  	
1259  
1260  	kfree(elf_phdata);
1261  
1262  	set_binfmt(&elf_format);
1263  
1264  #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
1265  	retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
1266  	if (retval < 0)
1267  		goto out;
1268  #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
1269  
1270  	retval = create_elf_tables(bprm, elf_ex,
1271  			  load_addr, interp_load_addr, e_entry);
1272  	if (retval < 0)
1273  		goto out;
1274  
1275  	mm = current->mm;
1276  	mm->end_code = end_code;
1277  	mm->start_code = start_code;
1278  	mm->start_data = start_data;
1279  	mm->end_data = end_data;
1280  	mm->start_stack = bprm->p;
1281  
1282  	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) 
1283  		/*
1284  		 * For architectures with ELF randomization, when executing
1285  		 * a loader directly (i.e. no interpreter listed in ELF
1286  		 * headers), move the brk area out of the mmap region
1287  		 * (since it grows up, and may collide early with the stack
1288  		 * growing down), and into the unused ELF_ET_DYN_BASE region.
1289  		 */
1290  		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
1291  		    elf_ex->e_type == ET_DYN && !interpreter) 
1292  			mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
1293  		
1294  
1295  		mm->brk = mm->start_brk = arch_randomize_brk(mm);
1296  #ifdef compat_brk_randomized
1297  		current->brk_randomized = 1;
1298  #endif
1299  	
1300  
1301  	if (current->personality & MMAP_PAGE_ZERO) 
1302  		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
1303  		   and some applications "depend" upon this behavior.
1304  		   Since we do not have the power to recompile these, we
1305  		   emulate the SVr4 behavior. Sigh. */
1306  		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
1307  				MAP_FIXED | MAP_PRIVATE, 0);
1308  	
1309  
1310  	regs = current_pt_regs();
1311  #ifdef ELF_PLAT_INIT
1312  	/*
1313  	 * The ABI may specify that certain registers be set up in special
1314  	 * ways (on i386 %edx is the address of a DT_FINI function, for
1315  	 * example.  In addition, it may also specify (eg, PowerPC64 ELF)
1316  	 * that the e_entry field is the address of the function descriptor
1317  	 * for the startup routine, rather than the address of the startup
1318  	 * routine itself.  This macro performs whatever initialization to
1319  	 * the regs structure is required as well as any relocations to the
1320  	 * function descriptor entries when executing dynamically links apps.
1321  	 */
1322  	ELF_PLAT_INIT(regs, reloc_func_desc);
1323  #endif
1324  
1325  	finalize_exec(bprm);
1326  	START_THREAD(elf_ex, regs, elf_entry, bprm->p);
1327  	retval = 0;
1328  out:
1329  	return retval;
1330  
1331  	/* error cleanup */
1332  out_free_dentry:
1333  	kfree(interp_elf_ex);
1334  	kfree(interp_elf_phdata);
1335  	allow_write_access(interpreter);
1336  	if (interpreter)
1337  		fput(interpreter);
1338  out_free_ph:
1339  	kfree(elf_phdata);
1340  	goto out;
1341  

setup_arg_pages函数:把栈顶设置为STACK_TOP减去随机值,然后把环境变量和参数从临时栈移到最终的用户栈;

set_brk函数:设置堆的起始地址;如果启用堆随机化(进程设置了PF_RANDOMIZE且randomize_va_space > 1),load_elf_binary随后调用arch_randomize_brk,把堆的起始地址加上随机值。

arch_pick_mmap_layout函数:在ARM64架构下,该函数负责选择内存映射区域的布局。

409  void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
410  
411  	unsigned long random_factor = 0UL;
412  
413  	if (current->flags & PF_RANDOMIZE)
414  		random_factor = arch_mmap_rnd();
415  
416  	if (mmap_is_legacy(rlim_stack)) //如果给进程描述符的成员personality设置了标志位ADDR_COMPAT_LAYOUT(表示使用传统的虚拟地址空间布局),或者用户栈可以无限增长,或者通过文件“/proc/sys/vm/legacy_va_layout”指定,那么使用传统的自底向上增长的布局:内存映射区域的起始地址是TASK_UNMAPPED_BASE加上随机值,分配未映射区域的函数是arch_get_unmapped_area。
417  		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
418  		mm->get_unmapped_area = arch_get_unmapped_area;
419  	 else //如果使用自顶向下增长的布局,那么分配未映射区域的函数是arch_get_unmapped_area_topdown,内存映射区域的起始地址的计算方法如下:先计算内存映射区域的起始地址和栈顶的间隙:初始值取用户栈的最大长度,限定不能小于“128MB + 栈的最大随机偏移值 + 1”,确保用户栈最大可以达到128MB;限定不能超过STACK_TOP的5/6。内存映射区域的起始地址等于“STACK_TOP−间隙−随机值”,然后向下对齐到页长度。
420  		mm->mmap_base = mmap_base(random_factor, rlim_stack);
421  		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
422  	
423  

以上是关于Linux内存从0到1学习笔记(七,用户空间虚拟内存之二 - 内存空间的建立)的主要内容,如果未能解决你的问题,请参考以下文章

Linux内存从0到1学习笔记(一,内存简介)

《Linux从0到99》七 进程地址空间

《Linux从0到99》七 进程地址空间

Linux内存从0到1学习笔记(6.10 物理内存初始化之vmalloc分配器)

Linux内存从0到1学习笔记(11.2 内存优化方案之内存压缩zram)

Linux学习笔记第七周一次课(3月19日)