Linux内核分析之六——进程的描述与进程的创建
Posted hoikin-yiu
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内核分析之六——进程的描述与进程的创建相关的知识,希望对你有一定的参考价值。
作者:姚开健
原创作品转载请注明出处
《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
进程的描述
Linux系统的进程由一个进程描述符PCB,即task_struct结构体来描述,其在内核中代码实现如下:
struct task_struct { 1236 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1237 void *stack; 1238 atomic_t usage; 1239 unsigned int flags; /* per process flags, defined below */ 1240 unsigned int ptrace; 1241 1242#ifdef CONFIG_SMP 1243 struct llist_node wake_entry; 1244 int on_cpu; 1245 struct task_struct *last_wakee; 1246 unsigned long wakee_flips; 1247 unsigned long wakee_flip_decay_ts; 1248 1249 int wake_cpu; 1250#endif 1251 int on_rq; 1252 1253 int prio, static_prio, normal_prio; 1254 unsigned int rt_priority; 1255 const struct sched_class *sched_class; 1256 struct sched_entity se; 1257 struct sched_rt_entity rt; 1258#ifdef CONFIG_CGROUP_SCHED 1259 struct task_group *sched_task_group; 1260#endif 1261 struct sched_dl_entity dl; 1262 1263#ifdef CONFIG_PREEMPT_NOTIFIERS 1264 /* list of struct preempt_notifier: */ 1265 struct hlist_head preempt_notifiers; 1266#endif 1267 1268#ifdef CONFIG_BLK_DEV_IO_TRACE 1269 unsigned int btrace_seq; 1270#endif 1271 1272 unsigned int policy; 1273 int nr_cpus_allowed; 1274 cpumask_t cpus_allowed; 1275 1276#ifdef CONFIG_PREEMPT_RCU 1277 int rcu_read_lock_nesting; 1278 union rcu_special rcu_read_unlock_special; 1279 struct list_head rcu_node_entry; 1280#endif /* #ifdef CONFIG_PREEMPT_RCU */ 1281#ifdef CONFIG_TREE_PREEMPT_RCU 1282 struct rcu_node *rcu_blocked_node; 1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1284#ifdef CONFIG_TASKS_RCU 1285 unsigned long rcu_tasks_nvcsw; 1286 bool rcu_tasks_holdout; 1287 struct list_head rcu_tasks_holdout_list; 1288 int rcu_tasks_idle_cpu; 1289#endif /* #ifdef CONFIG_TASKS_RCU */ 1290 1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1292 struct sched_info sched_info; 1293#endif 1294 1295 struct list_head tasks; 1296#ifdef CONFIG_SMP 1297 struct plist_node pushable_tasks; 1298 struct rb_node pushable_dl_tasks; 1299#endif 1300 1301 struct mm_struct *mm, *active_mm; 1302#ifdef CONFIG_COMPAT_BRK 1303 unsigned brk_randomized:1; 1304#endif 1305 /* per-thread vma 
caching */ 1306 u32 vmacache_seqnum; 1307 struct vm_area_struct *vmacache[VMACACHE_SIZE]; 1308#if defined(SPLIT_RSS_COUNTING) 1309 struct task_rss_stat rss_stat; 1310#endif 1311/* task state */ 1312 int exit_state; 1313 int exit_code, exit_signal; 1314 int pdeath_signal; /* The signal sent when the parent dies */ 1315 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1316 1317 /* Used for emulating ABI behavior of previous Linux versions */ 1318 unsigned int personality; 1319 1320 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1321 * execve */ 1322 unsigned in_iowait:1; 1323 1324 /* Revert to default priority/policy when forking */ 1325 unsigned sched_reset_on_fork:1; 1326 unsigned sched_contributes_to_load:1; 1327 1328 unsigned long atomic_flags; /* Flags needing atomic access. */ 1329 1330 pid_t pid; 1331 pid_t tgid; 1332 1333#ifdef CONFIG_CC_STACKPROTECTOR 1334 /* Canary value for the -fstack-protector gcc feature */ 1335 unsigned long stack_canary; 1336#endif 1337 /* 1338 * pointers to (original) parent process, youngest child, younger sibling, 1339 * older sibling, respectively. (p->father can be replaced with 1340 * p->real_parent->pid) 1341 */ 1342 struct task_struct __rcu *real_parent; /* real parent process */ 1343 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ 1344 /* 1345 * children/sibling forms the list of my natural children 1346 */ 1347 struct list_head children; /* list of my children */ 1348 struct list_head sibling; /* linkage in my parent's children list */ 1349 struct task_struct *group_leader; /* threadgroup leader */ 1350 1351 /* 1352 * ptraced is the list of tasks this task is using ptrace on. 1353 * This includes both natural children and PTRACE_ATTACH targets. 1354 * p->ptrace_entry is p's link on the p->parent->ptraced list. 1355 */ 1356 struct list_head ptraced; 1357 struct list_head ptrace_entry; 1358 1359 /* PID/PID hash table linkage. 
*/ 1360 struct pid_link pids[PIDTYPE_MAX]; 1361 struct list_head thread_group; 1362 struct list_head thread_node; 1363 1364 struct completion *vfork_done; /* for vfork() */ 1365 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1366 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1367 1368 cputime_t utime, stime, utimescaled, stimescaled; 1369 cputime_t gtime; 1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1371 struct cputime prev_cputime; 1372#endif 1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1374 seqlock_t vtime_seqlock; 1375 unsigned long long vtime_snap; 1376 enum { 1377 VTIME_SLEEPING = 0, 1378 VTIME_USER, 1379 VTIME_SYS, 1380 } vtime_snap_whence; 1381#endif 1382 unsigned long nvcsw, nivcsw; /* context switch counts */ 1383 u64 start_time; /* monotonic time in nsec */ 1384 u64 real_start_time; /* boot based time in nsec */ 1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1386 unsigned long min_flt, maj_flt; 1387 1388 struct task_cputime cputime_expires; 1389 struct list_head cpu_timers[3]; 1390 1391/* process credentials */ 1392 const struct cred __rcu *real_cred; /* objective and real subjective task 1393 * credentials (COW) */ 1394 const struct cred __rcu *cred; /* effective (overridable) subjective task 1395 * credentials (COW) */ 1396 char comm[TASK_COMM_LEN]; /* executable name excluding path 1397 - access with [gs]et_task_comm (which lock 1398 it with task_lock()) 1399 - initialized normally by setup_new_exec */ 1400/* file system info */ 1401 int link_count, total_link_count; 1402#ifdef CONFIG_SYSVIPC 1403/* ipc stuff */ 1404 struct sysv_sem sysvsem; 1405 struct sysv_shm sysvshm; 1406#endif 1407#ifdef CONFIG_DETECT_HUNG_TASK 1408/* hung task detection */ 1409 unsigned long last_switch_count; 1410#endif 1411/* CPU-specific state of this task */ 1412 struct thread_struct thread; 1413/* filesystem information */ 1414 struct fs_struct *fs; 1415/* open file information */ 1416 struct files_struct 
*files; 1417/* namespaces */ 1418 struct nsproxy *nsproxy; 1419/* signal handlers */ 1420 struct signal_struct *signal; 1421 struct sighand_struct *sighand; 1422 1423 sigset_t blocked, real_blocked; 1424 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */ 1425 struct sigpending pending; 1426 1427 unsigned long sas_ss_sp; 1428 size_t sas_ss_size; 1429 int (*notifier)(void *priv); 1430 void *notifier_data; 1431 sigset_t *notifier_mask; 1432 struct callback_head *task_works; 1433 1434 struct audit_context *audit_context; 1435#ifdef CONFIG_AUDITSYSCALL 1436 kuid_t loginuid; 1437 unsigned int sessionid; 1438#endif 1439 struct seccomp seccomp; 1440 1441/* Thread group tracking */ 1442 u32 parent_exec_id; 1443 u32 self_exec_id; 1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, 1445 * mempolicy */ 1446 spinlock_t alloc_lock; 1447 1448 /* Protection of the PI data structures: */ 1449 raw_spinlock_t pi_lock; 1450 1451#ifdef CONFIG_RT_MUTEXES 1452 /* PI waiters blocked on a rt_mutex held by this task */ 1453 struct rb_root pi_waiters; 1454 struct rb_node *pi_waiters_leftmost; 1455 /* Deadlock detection and priority inheritance handling */ 1456 struct rt_mutex_waiter *pi_blocked_on; 1457#endif 1458 1459#ifdef CONFIG_DEBUG_MUTEXES 1460 /* mutex deadlock detection */ 1461 struct mutex_waiter *blocked_on; 1462#endif 1463#ifdef CONFIG_TRACE_IRQFLAGS 1464 unsigned int irq_events; 1465 unsigned long hardirq_enable_ip; 1466 unsigned long hardirq_disable_ip; 1467 unsigned int hardirq_enable_event; 1468 unsigned int hardirq_disable_event; 1469 int hardirqs_enabled; 1470 int hardirq_context; 1471 unsigned long softirq_disable_ip; 1472 unsigned long softirq_enable_ip; 1473 unsigned int softirq_disable_event; 1474 unsigned int softirq_enable_event; 1475 int softirqs_enabled; 1476 int softirq_context; 1477#endif 1478#ifdef CONFIG_LOCKDEP 1479# define MAX_LOCK_DEPTH 48UL 1480 u64 curr_chain_key; 1481 int lockdep_depth; 1482 unsigned int 
lockdep_recursion; 1483 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1484 gfp_t lockdep_reclaim_gfp; 1485#endif 1486 1487/* journalling filesystem info */ 1488 void *journal_info; 1489 1490/* stacked block device info */ 1491 struct bio_list *bio_list; 1492 1493#ifdef CONFIG_BLOCK 1494/* stack plugging */ 1495 struct blk_plug *plug; 1496#endif 1497 1498/* VM state */ 1499 struct reclaim_state *reclaim_state; 1500 1501 struct backing_dev_info *backing_dev_info; 1502 1503 struct io_context *io_context; 1504 1505 unsigned long ptrace_message; 1506 siginfo_t *last_siginfo; /* For ptrace use. */ 1507 struct task_io_accounting ioac; 1508#if defined(CONFIG_TASK_XACCT) 1509 u64 acct_rss_mem1; /* accumulated rss usage */ 1510 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1511 cputime_t acct_timexpd; /* stime + utime since last update */ 1512#endif 1513#ifdef CONFIG_CPUSETS 1514 nodemask_t mems_allowed; /* Protected by alloc_lock */ 1515 seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ 1516 int cpuset_mem_spread_rotor; 1517 int cpuset_slab_spread_rotor; 1518#endif 1519#ifdef CONFIG_CGROUPS 1520 /* Control Group info protected by css_set_lock */ 1521 struct css_set __rcu *cgroups; 1522 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1523 struct list_head cg_list; 1524#endif 1525#ifdef CONFIG_FUTEX 1526 struct robust_list_head __user *robust_list; 1527#ifdef CONFIG_COMPAT 1528 struct compat_robust_list_head __user *compat_robust_list; 1529#endif 1530 struct list_head pi_state_list; 1531 struct futex_pi_state *pi_state_cache; 1532#endif 1533#ifdef CONFIG_PERF_EVENTS 1534 struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; 1535 struct mutex perf_event_mutex; 1536 struct list_head perf_event_list; 1537#endif 1538#ifdef CONFIG_DEBUG_PREEMPT 1539 unsigned long preempt_disable_ip; 1540#endif 1541#ifdef CONFIG_NUMA 1542 struct mempolicy *mempolicy; /* Protected by alloc_lock */ 1543 short il_next; 1544 short pref_node_fork; 
1545#endif 1546#ifdef CONFIG_NUMA_BALANCING 1547 int numa_scan_seq; 1548 unsigned int numa_scan_period; 1549 unsigned int numa_scan_period_max; 1550 int numa_preferred_nid; 1551 unsigned long numa_migrate_retry; 1552 u64 node_stamp; /* migration stamp */ 1553 u64 last_task_numa_placement; 1554 u64 last_sum_exec_runtime; 1555 struct callback_head numa_work; 1556 1557 struct list_head numa_entry; 1558 struct numa_group *numa_group; 1559 1560 /* 1561 * Exponential decaying average of faults on a per-node basis. 1562 * Scheduling placement decisions are made based on the these counts. 1563 * The values remain static for the duration of a PTE scan 1564 */ 1565 unsigned long *numa_faults_memory; 1566 unsigned long total_numa_faults; 1567 1568 /* 1569 * numa_faults_buffer records faults per node during the current 1570 * scan window. When the scan completes, the counts in 1571 * numa_faults_memory decay and these values are copied. 1572 */ 1573 unsigned long *numa_faults_buffer_memory; 1574 1575 /* 1576 * Track the nodes the process was running on when a NUMA hinting 1577 * fault was incurred. 1578 */ 1579 unsigned long *numa_faults_cpu; 1580 unsigned long *numa_faults_buffer_cpu; 1581 1582 /* 1583 * numa_faults_locality tracks if faults recorded during the last 1584 * scan window were remote/local. 
The task scan period is adapted 1585 * based on the locality of the faults with different weights 1586 * depending on whether they were shared or private faults 1587 */ 1588 unsigned long numa_faults_locality[2]; 1589 1590 unsigned long numa_pages_migrated; 1591#endif /* CONFIG_NUMA_BALANCING */ 1592 1593 struct rcu_head rcu; 1594 1595 /* 1596 * cache last used pipe for splice 1597 */ 1598 struct pipe_inode_info *splice_pipe; 1599 1600 struct page_frag task_frag; 1601 1602#ifdef CONFIG_TASK_DELAY_ACCT 1603 struct task_delay_info *delays; 1604#endif 1605#ifdef CONFIG_FAULT_INJECTION 1606 int make_it_fail; 1607#endif 1608 /* 1609 * when (nr_dirtied >= nr_dirtied_pause), it's time to call 1610 * balance_dirty_pages() for some dirty throttling pause 1611 */ 1612 int nr_dirtied; 1613 int nr_dirtied_pause; 1614 unsigned long dirty_paused_when; /* start of a write-and-pause period */ 1615 1616#ifdef CONFIG_LATENCYTOP 1617 int latency_record_count; 1618 struct latency_record latency_record[LT_SAVECOUNT]; 1619#endif 1620 /* 1621 * time slack values; these are used to round up poll() and 1622 * select() etc timeout values. These are in nanoseconds. 1623 */ 1624 unsigned long timer_slack_ns; 1625 unsigned long default_timer_slack_ns; 1626 1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1628 /* Index of current stored address in ret_stack */ 1629 int curr_ret_stack; 1630 /* Stack of return addresses for return function tracing */ 1631 struct ftrace_ret_stack *ret_stack; 1632 /* time stamp for last schedule */ 1633 unsigned long long ftrace_timestamp; 1634 /* 1635 * Number of functions that haven't been traced 1636 * because of depth overrun. 
1637 */ 1638 atomic_t trace_overrun; 1639 /* Pause for the tracing */ 1640 atomic_t tracing_graph_pause; 1641#endif 1642#ifdef CONFIG_TRACING 1643 /* state flags for use by tracers */ 1644 unsigned long trace; 1645 /* bitmask and counter of trace recursion */ 1646 unsigned long trace_recursion; 1647#endif /* CONFIG_TRACING */ 1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ 1649 unsigned int memcg_kmem_skip_account; 1650 struct memcg_oom_info { 1651 struct mem_cgroup *memcg; 1652 gfp_t gfp_mask; 1653 int order; 1654 unsigned int may_oom:1; 1655 } memcg_oom; 1656#endif 1657#ifdef CONFIG_UPROBES 1658 struct uprobe_task *utask; 1659#endif 1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) 1661 unsigned int sequential_io; 1662 unsigned int sequential_io_avg; 1663#endif 1664};代码很长,这里不一一分析其结构,可参考网上的其他相关文章。我们可以从代码知道一般的进程信息包括进程状态,进程调度信息,进程标识符,进程通信有关信息,进程链接信息,时间和定时器信息,文件系统信息,虚拟内存信息,页面管理信息,对称处理机信息,和处理器相关的上下文信息等。如下图简略图所示:
进程的创建
当说明了进程的描述之后,来分析一下进程的创建过程。
创建进程的系统调用有fork()、vfork()和clone()这三个。fork和vfork的区别在于:fork需要拷贝父进程的数据空间(用户地址空间),而vfork在子进程调用exec或exit之前与父进程共用数据空间;fork创建子进程后不限定父进程与子进程的执行顺序,而vfork会让父进程阻塞,直到子进程调用exec或exit,保证子进程先执行。clone主要用于创建轻量级进程(线程),可以通过clone_flags指定子进程与父进程共享哪些资源。三个系统调用都是调用do_fork()来完成进程的创建。
long do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { ... p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); ... }真正执行进程创建的是copy_process()函数,它完成子进程对父进程PCB的复制、修改与初始化。copy_process()首先调用dup_task_struct()为新进程分配task_struct和内核栈,再调用copy_thread()设置新进程的内核栈内容和起始执行地址:
p = dup_task_struct(current);
retval = copy_thread(clone_flags,
stack_start, stack_size, p);
复制父进程堆栈的内容到子进程的堆栈中去。其中,copy_thread()函数中的语句p->thread.ip = (unsigned long) ret_from_fork;
决定了新进程的第一条指令地址。
static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; int node = tsk_fork_get_node(orig); int err; tsk = alloc_task_struct_node(node); if (!tsk) return NULL; ti = alloc_thread_info_node(tsk, node); if (!ti) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) goto free_ti; tsk->stack = ti; # ifdef CONFIG_SECCOMP tsk->seccomp.filter = NULL; # endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); # ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); # endif atomic_set(&tsk->usage, 2); # ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; # endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; account_kernel_stack(ti, 1); return tsk; free_ti: free_thread_info(ti); free_tsk: free_task_struct(tsk); return NULL; }
新进程的PCB和内核堆栈在内存中的分布如下所示:
thread_info结构体与内核堆栈共用一块按页(page)分配的连续内存,thread_info位于这块内存的低地址端,内核堆栈从高地址端向thread_info方向延伸;task_struct由alloc_task_struct_node()单独分配,并通过tsk->stack指向这块内存。
新进程的执行
在之前的函数分析中已经说明,新进程的thread.ip(即新进程被调度后执行的第一条指令的地址)被初始化为ret_from_fork,这是一段汇编代码:
在之前的分析中,谈到copy_process中的copy_thread()函数,正是这个函数决定了子进程从系统调用中返回后的执行. int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p) { ... *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; p->thread.ip = (unsigned long) ret_from_fork; ... }
ENTRY(ret_from_fork) CFI_STARTPROC pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) popl_cfi %eax pushl_cfi $0x0202 # Reset kernel eflags popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork)
上述的ret_from_fork就是新进程的执行点。
新进程的内核堆栈初始内容为父进程通过SAVE_ALL保存的现场(pt_regs),所以新进程执行ret_from_fork后会跳转到syscall_exit,最终由RESTORE_ALL把内核堆栈中保存的数据恢复,之后就可以离开内核态进入到用户态执行。
总结
1、Linux内核创建一个新进程时有三个系统调用:fork()、vfork()和clone()。fork和vfork的区别上面已分析。它们都是通过do_fork()来创建进程。
2、创建进程时往往先把父进程的PCB拷贝给子进程,然后再拷贝内核堆栈,子进程需要对其修改并初始化,这样才能成为一个可以运行的进程。进程的地址空间通常采用写时复制(copy-on-write)来实现:父子进程先共享相同的物理页,只有当其中一方要写某个页时,内核才为它复制一个新的页。
3、新进程的执行点是ret_from_fork,恢复堆栈数据后就可以离开内核返回到用户态执行。
以上是关于Linux内核分析之六——进程的描述与进程的创建的主要内容,如果未能解决你的问题,请参考以下文章
linux内核分析 第六周 分析Linux内核创建一个新进程的过程