Linux内核分析之六——进程的描述与进程的创建

Posted hoikin-yiu

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Linux内核分析之六——进程的描述与进程的创建相关的知识,希望对你有一定的参考价值。

作者:姚开健

原创作品转载请注明出处

《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

进程的描述

Linux系统的进程由一个进程描述符PCB,即task_struct结构体来描述,其在内核中代码实现如下:

struct task_struct {
1236	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
1237	void *stack;
1238	atomic_t usage;
1239	unsigned int flags;	/* per process flags, defined below */
1240	unsigned int ptrace;
1241
1242#ifdef CONFIG_SMP
1243	struct llist_node wake_entry;
1244	int on_cpu;
1245	struct task_struct *last_wakee;
1246	unsigned long wakee_flips;
1247	unsigned long wakee_flip_decay_ts;
1248
1249	int wake_cpu;
1250#endif
1251	int on_rq;
1252
1253	int prio, static_prio, normal_prio;
1254	unsigned int rt_priority;
1255	const struct sched_class *sched_class;
1256	struct sched_entity se;
1257	struct sched_rt_entity rt;
1258#ifdef CONFIG_CGROUP_SCHED
1259	struct task_group *sched_task_group;
1260#endif
1261	struct sched_dl_entity dl;
1262
1263#ifdef CONFIG_PREEMPT_NOTIFIERS
1264	/* list of struct preempt_notifier: */
1265	struct hlist_head preempt_notifiers;
1266#endif
1267
1268#ifdef CONFIG_BLK_DEV_IO_TRACE
1269	unsigned int btrace_seq;
1270#endif
1271
1272	unsigned int policy;
1273	int nr_cpus_allowed;
1274	cpumask_t cpus_allowed;
1275
1276#ifdef CONFIG_PREEMPT_RCU
1277	int rcu_read_lock_nesting;
1278	union rcu_special rcu_read_unlock_special;
1279	struct list_head rcu_node_entry;
1280#endif /* #ifdef CONFIG_PREEMPT_RCU */
1281#ifdef CONFIG_TREE_PREEMPT_RCU
1282	struct rcu_node *rcu_blocked_node;
1283#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1284#ifdef CONFIG_TASKS_RCU
1285	unsigned long rcu_tasks_nvcsw;
1286	bool rcu_tasks_holdout;
1287	struct list_head rcu_tasks_holdout_list;
1288	int rcu_tasks_idle_cpu;
1289#endif /* #ifdef CONFIG_TASKS_RCU */
1290
1291#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1292	struct sched_info sched_info;
1293#endif
1294
1295	struct list_head tasks;
1296#ifdef CONFIG_SMP
1297	struct plist_node pushable_tasks;
1298	struct rb_node pushable_dl_tasks;
1299#endif
1300
1301	struct mm_struct *mm, *active_mm;
1302#ifdef CONFIG_COMPAT_BRK
1303	unsigned brk_randomized:1;
1304#endif
1305	/* per-thread vma caching */
1306	u32 vmacache_seqnum;
1307	struct vm_area_struct *vmacache[VMACACHE_SIZE];
1308#if defined(SPLIT_RSS_COUNTING)
1309	struct task_rss_stat	rss_stat;
1310#endif
1311/* task state */
1312	int exit_state;
1313	int exit_code, exit_signal;
1314	int pdeath_signal;  /*  The signal sent when the parent dies  */
1315	unsigned int jobctl;	/* JOBCTL_*, siglock protected */
1316
1317	/* Used for emulating ABI behavior of previous Linux versions */
1318	unsigned int personality;
1319
1320	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
1321				 * execve */
1322	unsigned in_iowait:1;
1323
1324	/* Revert to default priority/policy when forking */
1325	unsigned sched_reset_on_fork:1;
1326	unsigned sched_contributes_to_load:1;
1327
1328	unsigned long atomic_flags; /* Flags needing atomic access. */
1329
1330	pid_t pid;
1331	pid_t tgid;
1332
1333#ifdef CONFIG_CC_STACKPROTECTOR
1334	/* Canary value for the -fstack-protector gcc feature */
1335	unsigned long stack_canary;
1336#endif
1337	/*
1338	 * pointers to (original) parent process, youngest child, younger sibling,
1339	 * older sibling, respectively.  (p->father can be replaced with
1340	 * p->real_parent->pid)
1341	 */
1342	struct task_struct __rcu *real_parent; /* real parent process */
1343	struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1344	/*
1345	 * children/sibling forms the list of my natural children
1346	 */
1347	struct list_head children;	/* list of my children */
1348	struct list_head sibling;	/* linkage in my parent's children list */
1349	struct task_struct *group_leader;	/* threadgroup leader */
1350
1351	/*
1352	 * ptraced is the list of tasks this task is using ptrace on.
1353	 * This includes both natural children and PTRACE_ATTACH targets.
1354	 * p->ptrace_entry is p's link on the p->parent->ptraced list.
1355	 */
1356	struct list_head ptraced;
1357	struct list_head ptrace_entry;
1358
1359	/* PID/PID hash table linkage. */
1360	struct pid_link pids[PIDTYPE_MAX];
1361	struct list_head thread_group;
1362	struct list_head thread_node;
1363
1364	struct completion *vfork_done;		/* for vfork() */
1365	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
1366	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
1367
1368	cputime_t utime, stime, utimescaled, stimescaled;
1369	cputime_t gtime;
1370#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1371	struct cputime prev_cputime;
1372#endif
1373#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1374	seqlock_t vtime_seqlock;
1375	unsigned long long vtime_snap;
1376	enum {
1377		VTIME_SLEEPING = 0,
1378		VTIME_USER,
1379		VTIME_SYS,
1380	} vtime_snap_whence;
1381#endif
1382	unsigned long nvcsw, nivcsw; /* context switch counts */
1383	u64 start_time;		/* monotonic time in nsec */
1384	u64 real_start_time;	/* boot based time in nsec */
1385/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1386	unsigned long min_flt, maj_flt;
1387
1388	struct task_cputime cputime_expires;
1389	struct list_head cpu_timers[3];
1390
1391/* process credentials */
1392	const struct cred __rcu *real_cred; /* objective and real subjective task
1393					 * credentials (COW) */
1394	const struct cred __rcu *cred;	/* effective (overridable) subjective task
1395					 * credentials (COW) */
1396	char comm[TASK_COMM_LEN]; /* executable name excluding path
1397				     - access with [gs]et_task_comm (which lock
1398				       it with task_lock())
1399				     - initialized normally by setup_new_exec */
1400/* file system info */
1401	int link_count, total_link_count;
1402#ifdef CONFIG_SYSVIPC
1403/* ipc stuff */
1404	struct sysv_sem sysvsem;
1405	struct sysv_shm sysvshm;
1406#endif
1407#ifdef CONFIG_DETECT_HUNG_TASK
1408/* hung task detection */
1409	unsigned long last_switch_count;
1410#endif
1411/* CPU-specific state of this task */
1412	struct thread_struct thread;
1413/* filesystem information */
1414	struct fs_struct *fs;
1415/* open file information */
1416	struct files_struct *files;
1417/* namespaces */
1418	struct nsproxy *nsproxy;
1419/* signal handlers */
1420	struct signal_struct *signal;
1421	struct sighand_struct *sighand;
1422
1423	sigset_t blocked, real_blocked;
1424	sigset_t saved_sigmask;	/* restored if set_restore_sigmask() was used */
1425	struct sigpending pending;
1426
1427	unsigned long sas_ss_sp;
1428	size_t sas_ss_size;
1429	int (*notifier)(void *priv);
1430	void *notifier_data;
1431	sigset_t *notifier_mask;
1432	struct callback_head *task_works;
1433
1434	struct audit_context *audit_context;
1435#ifdef CONFIG_AUDITSYSCALL
1436	kuid_t loginuid;
1437	unsigned int sessionid;
1438#endif
1439	struct seccomp seccomp;
1440
1441/* Thread group tracking */
1442   	u32 parent_exec_id;
1443   	u32 self_exec_id;
1444/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
1445 * mempolicy */
1446	spinlock_t alloc_lock;
1447
1448	/* Protection of the PI data structures: */
1449	raw_spinlock_t pi_lock;
1450
1451#ifdef CONFIG_RT_MUTEXES
1452	/* PI waiters blocked on a rt_mutex held by this task */
1453	struct rb_root pi_waiters;
1454	struct rb_node *pi_waiters_leftmost;
1455	/* Deadlock detection and priority inheritance handling */
1456	struct rt_mutex_waiter *pi_blocked_on;
1457#endif
1458
1459#ifdef CONFIG_DEBUG_MUTEXES
1460	/* mutex deadlock detection */
1461	struct mutex_waiter *blocked_on;
1462#endif
1463#ifdef CONFIG_TRACE_IRQFLAGS
1464	unsigned int irq_events;
1465	unsigned long hardirq_enable_ip;
1466	unsigned long hardirq_disable_ip;
1467	unsigned int hardirq_enable_event;
1468	unsigned int hardirq_disable_event;
1469	int hardirqs_enabled;
1470	int hardirq_context;
1471	unsigned long softirq_disable_ip;
1472	unsigned long softirq_enable_ip;
1473	unsigned int softirq_disable_event;
1474	unsigned int softirq_enable_event;
1475	int softirqs_enabled;
1476	int softirq_context;
1477#endif
1478#ifdef CONFIG_LOCKDEP
1479# define MAX_LOCK_DEPTH 48UL
1480	u64 curr_chain_key;
1481	int lockdep_depth;
1482	unsigned int lockdep_recursion;
1483	struct held_lock held_locks[MAX_LOCK_DEPTH];
1484	gfp_t lockdep_reclaim_gfp;
1485#endif
1486
1487/* journalling filesystem info */
1488	void *journal_info;
1489
1490/* stacked block device info */
1491	struct bio_list *bio_list;
1492
1493#ifdef CONFIG_BLOCK
1494/* stack plugging */
1495	struct blk_plug *plug;
1496#endif
1497
1498/* VM state */
1499	struct reclaim_state *reclaim_state;
1500
1501	struct backing_dev_info *backing_dev_info;
1502
1503	struct io_context *io_context;
1504
1505	unsigned long ptrace_message;
1506	siginfo_t *last_siginfo; /* For ptrace use.  */
1507	struct task_io_accounting ioac;
1508#if defined(CONFIG_TASK_XACCT)
1509	u64 acct_rss_mem1;	/* accumulated rss usage */
1510	u64 acct_vm_mem1;	/* accumulated virtual memory usage */
1511	cputime_t acct_timexpd;	/* stime + utime since last update */
1512#endif
1513#ifdef CONFIG_CPUSETS
1514	nodemask_t mems_allowed;	/* Protected by alloc_lock */
1515	seqcount_t mems_allowed_seq;	/* Seqence no to catch updates */
1516	int cpuset_mem_spread_rotor;
1517	int cpuset_slab_spread_rotor;
1518#endif
1519#ifdef CONFIG_CGROUPS
1520	/* Control Group info protected by css_set_lock */
1521	struct css_set __rcu *cgroups;
1522	/* cg_list protected by css_set_lock and tsk->alloc_lock */
1523	struct list_head cg_list;
1524#endif
1525#ifdef CONFIG_FUTEX
1526	struct robust_list_head __user *robust_list;
1527#ifdef CONFIG_COMPAT
1528	struct compat_robust_list_head __user *compat_robust_list;
1529#endif
1530	struct list_head pi_state_list;
1531	struct futex_pi_state *pi_state_cache;
1532#endif
1533#ifdef CONFIG_PERF_EVENTS
1534	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
1535	struct mutex perf_event_mutex;
1536	struct list_head perf_event_list;
1537#endif
1538#ifdef CONFIG_DEBUG_PREEMPT
1539	unsigned long preempt_disable_ip;
1540#endif
1541#ifdef CONFIG_NUMA
1542	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
1543	short il_next;
1544	short pref_node_fork;
1545#endif
1546#ifdef CONFIG_NUMA_BALANCING
1547	int numa_scan_seq;
1548	unsigned int numa_scan_period;
1549	unsigned int numa_scan_period_max;
1550	int numa_preferred_nid;
1551	unsigned long numa_migrate_retry;
1552	u64 node_stamp;			/* migration stamp  */
1553	u64 last_task_numa_placement;
1554	u64 last_sum_exec_runtime;
1555	struct callback_head numa_work;
1556
1557	struct list_head numa_entry;
1558	struct numa_group *numa_group;
1559
1560	/*
1561	 * Exponential decaying average of faults on a per-node basis.
1562	 * Scheduling placement decisions are made based on the these counts.
1563	 * The values remain static for the duration of a PTE scan
1564	 */
1565	unsigned long *numa_faults_memory;
1566	unsigned long total_numa_faults;
1567
1568	/*
1569	 * numa_faults_buffer records faults per node during the current
1570	 * scan window. When the scan completes, the counts in
1571	 * numa_faults_memory decay and these values are copied.
1572	 */
1573	unsigned long *numa_faults_buffer_memory;
1574
1575	/*
1576	 * Track the nodes the process was running on when a NUMA hinting
1577	 * fault was incurred.
1578	 */
1579	unsigned long *numa_faults_cpu;
1580	unsigned long *numa_faults_buffer_cpu;
1581
1582	/*
1583	 * numa_faults_locality tracks if faults recorded during the last
1584	 * scan window were remote/local. The task scan period is adapted
1585	 * based on the locality of the faults with different weights
1586	 * depending on whether they were shared or private faults
1587	 */
1588	unsigned long numa_faults_locality[2];
1589
1590	unsigned long numa_pages_migrated;
1591#endif /* CONFIG_NUMA_BALANCING */
1592
1593	struct rcu_head rcu;
1594
1595	/*
1596	 * cache last used pipe for splice
1597	 */
1598	struct pipe_inode_info *splice_pipe;
1599
1600	struct page_frag task_frag;
1601
1602#ifdef	CONFIG_TASK_DELAY_ACCT
1603	struct task_delay_info *delays;
1604#endif
1605#ifdef CONFIG_FAULT_INJECTION
1606	int make_it_fail;
1607#endif
1608	/*
1609	 * when (nr_dirtied >= nr_dirtied_pause), it's time to call
1610	 * balance_dirty_pages() for some dirty throttling pause
1611	 */
1612	int nr_dirtied;
1613	int nr_dirtied_pause;
1614	unsigned long dirty_paused_when; /* start of a write-and-pause period */
1615
1616#ifdef CONFIG_LATENCYTOP
1617	int latency_record_count;
1618	struct latency_record latency_record[LT_SAVECOUNT];
1619#endif
1620	/*
1621	 * time slack values; these are used to round up poll() and
1622	 * select() etc timeout values. These are in nanoseconds.
1623	 */
1624	unsigned long timer_slack_ns;
1625	unsigned long default_timer_slack_ns;
1626
1627#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1628	/* Index of current stored address in ret_stack */
1629	int curr_ret_stack;
1630	/* Stack of return addresses for return function tracing */
1631	struct ftrace_ret_stack	*ret_stack;
1632	/* time stamp for last schedule */
1633	unsigned long long ftrace_timestamp;
1634	/*
1635	 * Number of functions that haven't been traced
1636	 * because of depth overrun.
1637	 */
1638	atomic_t trace_overrun;
1639	/* Pause for the tracing */
1640	atomic_t tracing_graph_pause;
1641#endif
1642#ifdef CONFIG_TRACING
1643	/* state flags for use by tracers */
1644	unsigned long trace;
1645	/* bitmask and counter of trace recursion */
1646	unsigned long trace_recursion;
1647#endif /* CONFIG_TRACING */
1648#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
1649	unsigned int memcg_kmem_skip_account;
1650	struct memcg_oom_info {
1651		struct mem_cgroup *memcg;
1652		gfp_t gfp_mask;
1653		int order;
1654		unsigned int may_oom:1;
1655	} memcg_oom;
1656#endif
1657#ifdef CONFIG_UPROBES
1658	struct uprobe_task *utask;
1659#endif
1660#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1661	unsigned int	sequential_io;
1662	unsigned int	sequential_io_avg;
1663#endif
1664};
代码很长,这里不一一分析其结构,可参考网上的其他相关文章。从代码可以知道,进程信息一般包括进程状态,进程调度信息,进程标识符,进程通信有关信息,进程链接信息,时间和定时器信息,文件系统信息,虚拟内存信息,页面管理信息,对称多处理机(SMP)信息,以及和处理器相关的上下文信息等,如下面的简略图所示:

技术分享

进程的创建

当说明了进程的描述之后,来分析一下进程的创建过程。

创建进程的系统调用有fork(),vfork()和clone()这三个。fork和vfork的区别在于:fork需要拷贝父进程的数据空间(地址空间),而vfork创建的子进程在调用exec或exit之前与父进程共用数据空间;fork创建子进程后不限定父进程与子进程的执行顺序,而vfork会让父进程阻塞,直到子进程调用exec或exit,即子进程先执行。clone主要用于创建线程(轻量级进程),可以通过标志参数指定子进程与父进程共享哪些资源。三个系统调用最终都是调用do_fork()来完成进程的创建。

/*
 * do_fork() - excerpt of the common entry point used to create a process.
 *
 * Only the call into copy_process() is shown.  The "..." lines mark code
 * that the article left out, which includes the declarations of p and
 * trace used below.
 */
long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{

    ...

    /* copy_process() does the actual creation and yields the new task in p. */
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);
  ...
}
真正执行进程创建的是copy_process()函数,它完成子进程对父进程PCB的复制、修改和初始化。copy_process()中先调用dup_task_struct()为新进程分配task_struct和内核栈,随后调用copy_thread()设置新进程内核栈的内容和起始执行地址:

p = dup_task_struct(current);
retval = copy_thread(clone_flags, stack_start, stack_size, p);
copy_thread()把父进程内核堆栈中保存的寄存器现场复制到子进程的内核堆栈中。其中的语句p->thread.ip = (unsigned long) ret_from_fork;决定了新进程的第一条指令地址。

/*
 * dup_task_struct() - allocate a task_struct and a thread_info for a new
 * task, starting from the existing task @orig.
 *
 * Returns the new task_struct on success.  Returns NULL if either
 * allocation fails or arch_dup_task_struct() reports an error; whatever
 * was allocated up to that point is freed before returning.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
    struct task_struct *tsk;
    struct thread_info *ti;
    /* Node handed to both allocators below. */
    int node = tsk_fork_get_node(orig);
    int err;

    tsk = alloc_task_struct_node(node);
    if (!tsk)
        return NULL;

    ti = alloc_thread_info_node(tsk, node);
    if (!ti)
        goto free_tsk;

    /* Presumably copies *orig into *tsk -- confirm against the arch code. */
    err = arch_dup_task_struct(tsk, orig);
    if (err)
        goto free_ti;

    /* Point the new task at its own thread_info block, not the parent's. */
    tsk->stack = ti;
# ifdef CONFIG_SECCOMP

    /* The new task starts with no seccomp filter pointer. */
    tsk->seccomp.filter = NULL;
# endif

    setup_thread_stack(tsk, orig);
    clear_user_return_notifier(tsk);
    clear_tsk_need_resched(tsk);
    set_task_stack_end_magic(tsk);

# ifdef CONFIG_CC_STACKPROTECTOR
    /* Give the new task its own random stack canary. */
    tsk->stack_canary = get_random_int();
# endif

    /*
     * The usage count starts at 2.
     * NOTE(review): upstream attributes one reference to the caller and one
     * to whoever later releases the task -- confirm against kernel/fork.c.
     */
    atomic_set(&tsk->usage, 2);
# ifdef CONFIG_BLK_DEV_IO_TRACE
    tsk->btrace_seq = 0;
# endif
    /* Reset per-task fields so they are not shared with @orig. */
    tsk->splice_pipe = NULL;
    tsk->task_frag.page = NULL;

    account_kernel_stack(ti, 1);

    return tsk;

/* Error unwinding: release in the reverse order of allocation. */
free_ti:
    free_thread_info(ti);
free_tsk:
    free_task_struct(tsk);
    return NULL;
}

新进程的PCB和内核堆栈在所分配的页面中的分布如下所示:

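        技术分享从上面的代码(tsk->stack = ti)可以看出,与内核堆栈共用同一块按page分配的内存的是thread_info结构体:thread_info位于这块内存的低地址端,其余部分作为该进程的内核堆栈,从高地址端向thread_info方向延伸;task_struct本身则由alloc_task_struct_node()单独分配。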

新进程的执行

前面的分析已经说明,新进程的指令指针thread.ip被初始化为ret_from_fork,这是一段汇编代码。
前面还谈到copy_process中调用的copy_thread()函数,正是这个函数决定了子进程从系统调用返回后的执行起点。

/*
 * copy_thread() - excerpt showing how the new task's saved register frame
 * and resume address are set up.
 *
 * The "..." lines mark code that the article left out, which includes the
 * declaration and initialisation of childregs.
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
    unsigned long arg, struct task_struct *p)
{
    ...

    /* The child starts from a copy of the current task's saved registers. */
    *childregs = *current_pt_regs();
    /* Presumably the value fork() returns in the child -- confirm. */
    childregs->ax = 0;
    /* A non-zero sp replaces the stack pointer copied from the parent. */
    if (sp)
        childregs->sp = sp;

    /* The new task begins executing at ret_from_fork. */
    p->thread.ip = (unsigned long) ret_from_fork;

    ...
}

/*
 * ret_from_fork: address stored in p->thread.ip by copy_thread(), i.e. the
 * first code a newly created task runs.
 */
ENTRY(ret_from_fork)
    CFI_STARTPROC
    pushl_cfi %eax          # keep %eax across the schedule_tail call
    call schedule_tail
    GET_THREAD_INFO(%ebp)
    popl_cfi %eax           # restore the %eax pushed above
    pushl_cfi $0x0202       # Reset kernel eflags
    popfl_cfi
    jmp syscall_exit        # continue at the syscall_exit label
    CFI_ENDPROC
END(ret_from_fork)

上述的ret_from_fork就是新进程的执行点。

新进程内核堆栈的初始内容是父进程进入内核时由SAVE_ALL保存的现场,因此新进程从ret_from_fork跳转到syscall_exit之后会执行RESTORE_ALL,把内核堆栈中保存的现场恢复出来,然后就可以离开内核态,进入用户态执行。

总结

1、Linux内核创建一个新进程时有三个系统调用:fork(),vfork()和clone()。fork和vfork的区别上面已分析。它们都是通过do_fork()来创建进程。
2、创建进程时先把父进程的PCB拷贝给子进程,然后再设置子进程的内核堆栈,子进程需要对这些内容进行修改和初始化,这样才能成为一个可以运行的进程。地址空间的复制通常采用写时复制(copy-on-write):父子进程先共享同一批物理页面,只有当其中一方要写某个页面时,才为它单独复制一份。
3、新进程的执行点是ret_from_fork,恢复堆栈数据后就可以离开内核返回到用户态执行。


以上是关于Linux内核分析之六——进程的描述与进程的创建的主要内容,如果未能解决你的问题,请参考以下文章

20169203《Linux内核原理与分析》第四周作业

linux内核分析 第六周 分析Linux内核创建一个新进程的过程

Linux内核分析——进程的描述和进程的创建

20135239益西拉姆 Linux内核分析 进程的描述和进程的创建

Linux内核分析——进程的描述和进程的创建

LINUX内核分析第六周学习总结——进程的描述和进程的创建