Linux 0.11 进程012

Posted 2021-12-18 扶磐

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Linux 0.11 进程012相关的知识，希望对你有一定的参考价值。

本博客研究重点主要放在进程0、1、2的创建和fork、execv等系统调用，因此会省去很多内部细节，只在代码上加上必要注释，这对我们理解重点没有影响。文章全部参考《Linux内核设计的艺术（第二版）》

进程0初始化

进程0是Linux操作系统中运行的第一个进程，也是Linux操作系统父子进程创建机制的第一个父进程。Linux初始化主要包含以下三方面：

系统初始化进程0。进程管理结构task_struct的母本（init_task = INIT_TASK）已经在代码设计阶段事先准备好了。之后要对进程0的task_struct中的LDT、TSS与GDT挂接，并对GDT、task[64]以及与进程调度有关的寄存器进行初始化
设置时钟中断，支持多进程轮转。
进程0要具备处理系统调用的能力，通过set_system_gate将system_call与IDT相挂接。

这三点的实现都是在sched_init()函数中实现的，具体代码如下：

//代码路径：include/linux/head.h
typedef struct desc_struct 
	unsigned long a,b;
 desc_table[256];


//代码路径：kernel/sched.c
#define LATCH (1193180/HZ)

union task_union 
	struct task_struct task;
	char stack[PAGE_SIZE];
;

static union task_union init_task = INIT_TASK,;

struct task_struct *current = &(init_task.task);
struct task_struct *last_task_used_math = NULL;

//初始化进程槽task[NR_TASKS]的第一项为进程0，即task[0]为进程0占用
struct task_struct * task[NR_TASKS] = &(init_task.task), ;

void sched_init(void)

	int i;
	struct desc_struct * p;

	if (sizeof(struct sigaction) != 16)
		panic("Struct sigaction MUST be 16 bytes");
	set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
	set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
	p = gdt+2+FIRST_TSS_ENTRY;
	for(i=1;i<NR_TASKS;i++) 
		task[i] = NULL;
		p->a=p->b=0;
		p++;
		p->a=p->b=0;
		p++;
	
/* Clear NT, so that we won\'t have troubles with that later on */
	__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
	ltr(0);
	lldt(0);
	outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
	outb_p(LATCH & 0xff , 0x40);	/* LSB */
	outb(LATCH >> 8 , 0x40);	/* MSB */
	set_intr_gate(0x20,&timer_interrupt);
	outb(inb_p(0x21)&~0x01,0x21);
	set_system_gate(0x80,&system_call);


//代码路径：include/linux/sched.h
struct tss_struct 
	long	back_link;	/* 16 high bits zero */
	long	esp0;
	long	ss0;		/* 16 high bits zero */
	long	esp1;
	long	ss1;		/* 16 high bits zero */
	long	esp2;
	long	ss2;		/* 16 high bits zero */
	long	cr3;
	long	eip;
	long	eflags;
	long	eax,ecx,edx,ebx;
	long	esp;
	long	ebp;
	long	esi;
	long	edi;
	long	es;		/* 16 high bits zero */
	long	cs;		/* 16 high bits zero */
	long	ss;		/* 16 high bits zero */
	long	ds;		/* 16 high bits zero */
	long	fs;		/* 16 high bits zero */
	long	gs;		/* 16 high bits zero */
	long	ldt;		/* 16 high bits zero */
	long	trace_bitmap;	/* bits: trace 0, bitmap 16-31 */
	struct i387_struct i387;
;

struct task_struct 
/* these are hardcoded - don\'t touch */
	long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
	long counter;
	long priority;
	long signal;
	struct sigaction sigaction[32];
	long blocked;	/* bitmap of masked signals */
/* various fields */
	int exit_code;
	unsigned long start_code,end_code,end_data,brk,start_stack;
	long pid,father,pgrp,session,leader;
	unsigned short uid,euid,suid;
	unsigned short gid,egid,sgid;
	long alarm;
	long utime,stime,cutime,cstime,start_time;
	unsigned short used_math;
/* file system info */
	int tty;		/* -1 if no tty, so it must be signed */
	unsigned short umask;
	struct m_inode * pwd;
	struct m_inode * root;
	struct m_inode * executable;
	unsigned long close_on_exec;
	struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
	struct desc_struct ldt[3];
/* tss for this task */
	struct tss_struct tss;
;

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x9ffff (=640kB)
 */
#define INIT_TASK \\
/* state etc */	 0,15,15, \\
/* signals */	0,,,0, \\
/* ec,brk... */	0,0,0,0,0,0, \\
/* pid etc.. */	0,-1,0,0,0, \\
/* uid etc */	0,0,0,0,0,0, \\
/* alarm */	0,0,0,0,0,0, \\
/* math */	0, \\
/* fs info */	-1,0022,NULL,NULL,NULL,0, \\
/* filp */	NULL,, \\
	 \\
		0,0, \\
/* ldt */	0x9f,0xc0fa00, \\
		0x9f,0xc0f200, \\
	, \\
/*tss*/	0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\\
	 0,0,0,0,0,0,0,0, \\
	 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \\
	 _LDT(0),0x80000000, \\
		 \\
	, \\


extern struct task_struct *task[NR_TASKS];
extern struct task_struct *last_task_used_math;
extern struct task_struct *current;

/*
 * Entry into gdt where to find first TSS. 0-nul, 1-cs, 2-ds, 3-syscall
 * 4-TSS0, 5-LDT0, 6-TSS1 etc ...
 */
#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))

设置GDT中对应进程0的TSS、LDT描述符

#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)

set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));

设置GDT中和进程0相关的TSS和LDT描述符，进程0的tss内容和ldt内容均放在自己的task_struct中，因此向set_xxx_desc传入描述符对应的偏移量和地址即可进行初始化。

p = gdt+2+FIRST_TSS_ENTRY;
for(i=1;i<NR_TASKS;i++) 
	task[i] = NULL;		//进程i的指针为null
	p->a=p->b=0;		//进程i的tss描述符
	p++;
	p->a=p->b=0;		//进程i的ldt描述符
	p++;

之后对gdt中的其他内容和task数组的其他内容都清零。

#define FIRST_TSS_ENTRY 4
#define FIRST_LDT_ENTRY (FIRST_TSS_ENTRY+1)
#define _TSS(n) ((((unsigned long) n)<<4)+(FIRST_TSS_ENTRY<<3))
#define _LDT(n) ((((unsigned long) n)<<4)+(FIRST_LDT_ENTRY<<3))
#define ltr(n) __asm__("ltr %%ax"::"a" (_TSS(n)))
#define lldt(n) __asm__("lldt %%ax"::"a" (_LDT(n)))

ltr(0);
lldt(0);

设置tr寄存器和ldt寄存器，指向进程0的tss描述符和ldt描述符。

设置时钟中断

outb_p(0x36,0x43);		/* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40);	/* LSB */
outb(LATCH >> 8 , 0x40);	/* MSB */
set_intr_gate(0x20,&timer_interrupt);
outb(inb_p(0x21)&~0x01,0x21);		//打开时钟中断

时钟中断处理程序timer_interrupt的代码进程调度时再进行介绍，先略过。

设置系统调用总入口

#define set_system_gate(n,addr) \\
	_set_gate(&idt[n],15,3,addr)		//陷阱门

set_system_gate(0x80,&system_call);

将系统调用处理函数system_call与int0x80中断描述符表挂接。system_call是整个操作系统中系统调用软中断的总入口。所有用户程序使用系统调用，产生int 0x80软中断后，操作系统都是通过这个总入口找到具体的系统调用函数。

进程0由0特权级反转到3特权级，成为真正的进程

Linux操作系统规定，除进程0之外，所有进程都要由一个已有进程在3特权级下创建。
在Linux0.11中，进程0的代码和数据都是由操作系统的设计者写在内核代码、数据区，并且，此前处在0特权级，严格说还不是真正意义上的进程。为了遵守规则，在进程0正式创建进程1之前，要将进程0由0特权级转变为3特权级。方法是调用move_ to_ user_ mode()函数，模仿中断返回动作，实现进程0的特权级从0转变为3。

//代码路径：init/main.c
void main()

	...
	move_to_user_mode();
	...


//代码路径：include/system.h
#define move_to_user_mode() \\
__asm__ ("movl %%esp,%%eax\\n\\t" \\
	"pushl $0x17\\n\\t" \\					//ss	0b10111: CPL=3，LDT中的第3个描述符
	"pushl %%eax\\n\\t" \\					//esp
	"pushfl\\n\\t" \\						//flags
	"pushl $0x0f\\n\\t" \\					//cs	0b1111: CPL=3， LDT中的第2个描述符
	"pushl $1f\\n\\t" \\					//把标号1对应的eip压到栈中
	"iret\\n" \\
	"1:\\tmovl $0x17,%%eax\\n\\t" \\
	"movw %%ax,%%ds\\n\\t" \\
	"movw %%ax,%%es\\n\\t" \\
	"movw %%ax,%%fs\\n\\t" \\
	"movw %%ax,%%gs" \\
	:::"ax")

手工模拟特权级从0到3的栈的内容，使用iret反转到3特权级，执行3特权级的代码

此时我们再来看一下进程0的LDT内容

/*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x9ffff (=640kB)
 */
#define INIT_TASK \\
/* state etc */	 0,15,15, \\
/* signals */	0,,,0, \\
/* ec,brk... */	0,0,0,0,0,0, \\
/* pid etc.. */	0,-1,0,0,0, \\
/* uid etc */	0,0,0,0,0,0, \\
/* alarm */	0,0,0,0,0,0, \\
/* math */	0, \\
/* fs info */	-1,0022,NULL,NULL,NULL,0, \\
/* filp */	NULL,, \\
	 \\
		0,0, \\
/* ldt */	0x9f,0xc0fa00, \\				//代码段，base=0，G=1，DPL=3，Execute/Read
		0x9f,0xc0f200, \\					//数据段，base=0，G=1，DPL=3，Read/Write
	, \\
/*tss*/	0,PAGE_SIZE+(long)&init_task,0x10,0,0,0,0,(long)&pg_dir,\\
	 0,0,0,0,0,0,0,0, \\
	 0,0,0x17,0x17,0x17,0x17,0x17,0x17, \\
	 _LDT(0),0x80000000, \\
		 \\
	, \\

可以看到此时进程0的代码段都是从0开始，界限为640KB。ss0为0x10，指向GDT中内核数据段，esp0为init_task页面最顶端。

进程1的创建

进程0现在处在3特权级状态，即进程状态。正式开始运行要做的第一件事就是作为父进程调用fork函数创建第一个子进程—进程 1,这是父子进程创建机制的第一次实际运用。以后，所有进程都是基于父子进程创建机制由父进程创建出来的。

//代码路i纪念馆：init/main.c
static inline _syscall0(int, fork)

void main(void)

	move_to_user_mode();
	if(!fork())
		init();
	
    /*
 *   NOTE!!   For any other task \'pause()\' would mean we have to get a
 * signal to awaken, but task0 is the sole exception (see \'schedule()\')
 * as task 0 gets activated at every idle moment (when no other tasks
 * can run). For task0 \'pause()\' just means we go check if some other
 * task can run, and if not we return here.
 */
	for(;;) pause();


//代码路径：include/unistd.h
#define __NR_setup	0	/* used only by init, to get system going */
#define __NR_exit	1
#define __NR_fork	2

#define _syscall0(type,name) \\
type name(void) \\
 \\
long __res; \\
__asm__ volatile ("int $0x80" \\
	: "=a" (__res) \\
	: "0" (__NR_##name)); \\			//将调用号传入eax寄存器中作为参数，fork中为2，2为sys_fork在sys_call_table中的偏移值
if (__res >= 0) \\
	return (type) __res; \\
errno = -__res; \\
return -1; \\


extern int errno;

//代码路径：include/linux/sys.h
extern int sys_setup();
extern int sys_exit();
extern int sys_fork();

fn_ptr sys_call_table[] =  sys_setup, sys_exit, sys_fork, sys_read,
sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,
sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,
sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,
sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,
sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,
sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,
sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,
sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,
sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,
sys_setreuid,sys_setregid ;

执行int 0x80程序后，产生一个软中断，CPU从特权级3的进程0代码跳转到0特权级的内核代码中执行，中断将SS、ESP、EFLAGS、CS和EIP这5个寄存器自动压栈，压入init_task中的进程0内核栈。

# 代码路径：kernel/system_call.s
nr_system_calls = 72

.align 2
bad_sys_call:
	movl $-1,%eax
	iret
.align 2
reschedule:
	pushl $ret_from_sys_call
	jmp _schedule
.align 2
_system_call:
	cmpl $nr_system_calls-1,%eax		//比较eax中的进程号和nr_system_calls，查看
	ja bad_sys_call						//是否越界
	push %ds							//继续压入参数
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	call _sys_call_table(,%eax,4)		//执行函数
	pushl %eax
	movl _current,%eax
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	movl signal(%eax),%ebx
	movl blocked(%eax),%ecx
	notl %ecx
	andl %ebx,%ecx
	bsfl %ecx,%ecx
	je 3f
	btrl %ecx,%ebx
	movl %ebx,signal(%eax)
	incl %ecx
	pushl %ecx
	call _do_signal
	popl %eax
3:	popl %eax
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

接下来到了我们的重点fork函数，代码如下

.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax
	js 1f
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax
	call _copy_process
	addl $20,%esp
1:	ret

首先在task[64]中为进程1申请一个空闲位置并获取进程号

调用find_empty_process()函数为进程1获得一个可用的进程号和task[64]中的一个位置，将该位置放入eax寄存器中返回。

在该函数中，内核用全局变量last_pid来存放系统自开机以来累计的进程数，也将此变量用作新建进程的进程号（task_struct中的pid变量）。因为Linux 0.11的task[64]只有64项，最多只能同时运行64个进程，如果find_empty_process()函数返回-EAGAIN，意味着当前已经有64个进程在运行。
继续压入寄存器
调用copy_process函数

copy_process是我们关注的重点，接下来好好研究一下。

copy_process

进程0已经成为一个可以创建子进程的父进程，在内核中有“进程0的task_ struct” 和“进程0的页表项”等专属进程0的管理信息。进程0将在copy_ process() 函数中做非常重要的、体现父子进程创建机制的工作:

为进程1创建task_struct，将进程0的task_struct的内容复制给进程1
为进程1的task_struct、tss以及LDT中的内容做个性化设置
为进程1创建第一个页表，将进程0的页表项内容赋给这个页表
进程1共享进程0的文件
设置进程1的GDT项
最后将进程1设置为就绪态，使其可以参与进程间的轮转。

//代码路径：kernel/fork.c

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in it\'s entirety.
 */
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
		long ebx,long ecx,long edx,
		long fs,long es,long ds,
		long eip,long cs,long eflags,long esp,long ss)

	struct task_struct *p;
	int i;
	struct file *f;

	p = (struct task_struct *) get_free_page();
	if (!p)
		return -EAGAIN;
	task[nr] = p;
	*p = *current;	/* NOTE! this doesn\'t copy the supervisor stack */
	p->state = TASK_UNINTERRUPTIBLE;
	p->pid = last_pid;
	p->father = current->pid;
	p->counter = p->priority;
	p->signal = 0;
	p->alarm = 0;
	p->leader = 0;		/* process leadership doesn\'t inherit */
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->start_time = jiffies;
	p->tss.back_link = 0;
	p->tss.esp0 = PAGE_SIZE + (long) p;		//内核栈发生变化，
	p->tss.ss0 = 0x10;
	p->tss.eip = eip;						//int 0x80后面的那条指令
	p->tss.eflags = eflags;
	p->tss.eax = 0;						//子进程fork返回值为0
	p->tss.ecx = ecx;
	p->tss.edx = edx;
	p->tss.ebx = ebx;
	p->tss.esp = esp;
	p->tss.ebp = ebp;
	p->tss.esi = esi;
	p->tss.edi = edi;
	p->tss.es = es & 0xffff;				//es是long类型的，因此要取低16位
	p->tss.cs = cs & 0xffff;
	p->tss.ss = ss & 0xffff;
	p->tss.ds = ds & 0xffff;
	p->tss.fs = fs & 0xffff;
	p->tss.gs = gs & 0xffff;
	p->tss.ldt = _LDT(nr);					//设置ldt
	p->tss.trace_bitmap = 0x80000000;
	if (last_task_used_math == current)
		__asm__("clts ; fnsave %0"::"m" (p->tss.i387));
	if (copy_mem(nr,p)) 				//设置页表、以及ldt中段描述符的新基址
		task[nr] = NULL;
		free_page((long) p);
		return -EAGAIN;
	
	for (i=0; i<NR_OPEN;i++)			//共享文件
		if (f=p->filp[i])
			f->f_count++;
	if (current->pwd)
		current->pwd->i_count++;
	if (current->root)
		current->root->i_count++;
	if (current->executable)
		current->executable->i_count++;
	set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));		//设置对应的GDT
	set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
	p->state = TASK_RUNNING;	/* do this last, just in case */
	return last_pid;			//父进程返回值为子进程的pid

其中比较重要的copy_mem函数，该函数改变了LDT中代码段和数据段的基址，将进程0的页表拷贝给进程1。

//代码路径：include/linux/sched.h

#define _set_base(addr,base) \\
__asm__("movw %%dx,%0\\n\\t" \\
	"rorl $16,%%edx\\n\\t" \\
	"movb %%dl,%1\\n\\t" \\
	"movb %%dh,%2" \\
	::"m" (*((addr)+2)), \\
	  "m" (*((addr)+4)), \\
	  "m" (*((addr)+7)), \\
	  "d" (base) \\
	:"dx")

#define _set_limit(addr,limit) \\
__asm__("movw %%dx,%0\\n\\t" \\
	"rorl $16,%%edx\\n\\t" \\
	"movb %1,%%dh\\n\\t" \\
	"andb $0xf0,%%dh\\n\\t" \\
	"orb %%dh,%%dl\\n\\t" \\
	"movb %%dl,%1" \\
	::"m" (*(addr)), \\
	  "m" (*((addr)+6)), \\
	  "d" (limit) \\
	:"dx")

#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , base )
#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , (limit-1)>>12 )

#define _get_base(addr) (\\
unsigned long __base; \\
__asm__("movb %3,%%dh\\n\\t" \\
	"movb %2,%%dl\\n\\t" \\
	"shll $16,%%edx\\n\\t" \\
	"movw %1,%%dx" \\
	:"=d" (__base) \\
	:"m" (*((addr)+2)), \\
	 "m" (*((addr)+4)), \\
	 "m" (*((addr)+7))); \\
__base;)

#define get_base(ldt) _get_base( ((char *)&(ldt)) )

#define get_limit(segment) ( \\
unsigned long __limit; \\
__asm__("lsll %1,%0\\n\\t incl %0":"=r" (__limit):"r" (segment)); \\
__limit;)

//代码路径：kernel/fork.c

int copy_mem(int nr,struct task_struct * p)

	unsigned long old_data_base,new_data_base,data_limit;
	unsigned long old_code_base,new_code_base,code_limit;

	code_limit=get_limit(0x0f);
	data_limit=get_limit(0x17);
	old_code_base = get_base(current->ldt[1]);
	old_data_base = get_base(current->ldt[2]);
	if (old_data_base != old_code_base)
		panic("We don\'t support separate I&D");
	if (data_limit < code_limit)
		panic("Bad data_limit");
	new_data_base = new_code_base = nr * 0x4000000;
	p->start_code = new_code_base;
	set_base(p->ldt[1],new_code_base);
	set_base(p->ldt[2],new_data_base);
	if (copy_page_tables(old_data_base,new_data_base,data_limit)) 	//old_data_base 为 0，new_data_base为nr * 0x4000000，data_limit为640KB
		free_page_tables(new_data_base,data_limit);
		return -ENOMEM;
	
	return 0;

该函数更改进程1的代码段和数据段基址后调用copy_page_table设置页目录项和复制页表。

进入copy_ page_tables() 函数后，先为新的页表申请- - 个空闲页面，并把进程0中第一个页表里面前160 个页表项复制到这个页面中(1 个页表项控制一个页面4 KB内存空间，160 个页表项可以控制640KB内存空间)。进程0和进程1的页表暂时都指向了相同的页面，意味着进程1也可以操作进程0的页面。之后对进程1的页目录表进行设置。最后，用重置CR3的方法刷新页变换高速缓存。进程1的页表和页目录表设置完毕。

/*
 *  Well, here is one of the most complicated functions in mm. It
 * copies a range of linerar addresses by copying only the pages.
 * Let\'s hope this is bug-free, \'cause this one I don\'t want to debug :-)
 *
 * Note! We don\'t copy just any chunks of memory - addresses have to
 * be divisible by 4Mb (one page-directory entry), as this makes the
 * function easier. It\'s used only by fork anyway.
 *
 * NOTE 2!! When from==0 we are copying kernel space for the first
 * fork(). Then we DONT want to copy a full page-directory entry, as
 * that would lead to some serious memory waste - we just copy the
 * first 160 pages - 640kB. Even that is more than we need, but it
 * doesn\'t take any more memory - we don\'t copy-on-write in the low
 * 1 Mb-range, so the pages can be shared with the kernel. Thus the
 * special case for nr=xxxx.
 */
int copy_page_tables(unsigned long from,unsigned long to,long size)

	unsigned long * from_page_table;
	unsigned long * to_page_table;
	unsigned long this_page;
	unsigned long * from_dir, * to_dir;
	unsigned long nr;

	if ((from&0x3fffff) || (to&0x3fffff))
		panic("copy_page_tables called with wrong alignment");
	from_dir = (unsigned long *) ((from>>20) & 0xffc); /* _pg_dir = 0 */
	to_dir = (unsigned long *) ((to>>20) & 0xffc);
	size = ((unsigned) (size+0x3fffff)) >> 22;		//得到需要复制的页目录项个数，data_limit除以4MB，此时size = 1
	for( ; size-->0 ; from_dir++,to_dir++) 
		if (1 & *to_dir)
			panic("copy_page_tables: already exist");
		if (!(1 & *from_dir))		//看目录项最低为p位是否为1
			continue;
		from_page_table = (unsigned long *) (0xfffff000 & *from_dir);
		if (!(to_page_table = (unsigned long *) get_free_page()))	//为页表分配内存
			return -1;	/* Out of memory, see freeing */
		*to_dir = ((unsigned long) to_page_table) | 7;	//用户，可写，存在
		nr = (from==0)?0xA0:1024;		//如果是拷贝进程0的页表，则复制160个页表项，否则复制1024个页表项
		for ( ; nr-- > 0 ; from_page_table++,to_page_table++) 
			this_page = *from_page_table;
			if (!(1 & this_page))
				continue;
			this_page &= ~2;					//用户，只读，存在
			*to_page_table = this_page;
			if (this_page > LOW_MEM) 			//1MB以内的内核区不参与用户分页管理
				*from_page_table = this_page;		
				this_page -= LOW_MEM;
				this_page >>= 12;
				mem_map[this_page]++;
			
		
	
	invalidate();		//重置CR3为0，刷新”页高速缓存“
	return 0;

执行结束后，进程1和进程0管理页面完全一致，因为页表项是完全拷贝过来的，所以他们共享页面。效果如下：

copy_process结束时，进程1的创建工作完成，进程1已经具备了进程0的全部能力，可以在主机中正常地运行。返回sys_fork()中call_copy_process()的下一行执行，执行代码如下：

.align 2
_sys_fork:
	call _find_empty_process
	testl %eax,%eax
	js 1f
	push %gs
	pushl %esi
	pushl %edi
	pushl %ebp
	pushl %eax
	call _copy_process		//返回值last_pid放在eax寄存器中，层层返回
	addl $20,%esp			//将前面的gs、esi、edi、ebp和eax出栈
1:	ret


.align 2
_system_call:
	cmpl $nr_system_calls-1,%eax
	ja bad_sys_call
	push %ds
	push %es
	push %fs
	pushl %edx
	pushl %ecx		# push %ebx,%ecx,%edx as parameters
	pushl %ebx		# to the system call
	movl $0x10,%edx		# set up ds,es to kernel space
	mov %dx,%ds
	mov %dx,%es
	movl $0x17,%edx		# fs points to local data space
	mov %dx,%fs
	call _sys_call_table(,%eax,4)
	pushl %eax					//将进程号压栈
	movl _current,%eax			//检查当前进程的状态和counter位
	cmpl $0,state(%eax)		# state
	jne reschedule
	cmpl $0,counter(%eax)		# counter
	je reschedule
ret_from_sys_call:
	movl _current,%eax		# task[0] cannot have signals
	cmpl _task,%eax				//如果当前进程是进程0，跳到下面的3执行
	je 3f
	cmpw $0x0f,CS(%esp)		# was old code segment supervisor ?
	jne 3f
	cmpw $0x17,OLDSS(%esp)		# was stack segment = 0x17 ?
	jne 3f
	movl signal(%eax),%ebx
	movl blocked(%eax),%ecx
	notl %ecx
	andl %ebx,%ecx
	bsfl %ecx,%ecx
	je 3f
	btrl %ecx,%ebx
	movl %ebx,signal(%eax)
	incl %ecx
	pushl %ecx
	call _do_signal
	popl %eax				//
3:	popl %eax				//将进程号放入eax寄存器中
	popl %ebx
	popl %ecx
	popl %edx
	pop %fs
	pop %es
	pop %ds
	iret

由于当前进程是进程0，所以就跳到标号3处，将压栈的各个寄存器数值还原。之后iret中断返回，CPU硬件自动将int 0x80的中断时压入的ss、esp、eflags、cs和eip值按压栈的反序出栈给CPU对应寄存器，从0特权级的内核代码切换到3特权级的进程0代码执行，CS:EIP指向fork()中int 0x80的下一行if(_res >= 0)。

#define _syscall0(type,name) \\
type name(void) \\
 \\
long __res; \\
__asm__ volatile ("int $0x80" \\
	: "=a" (__res) \\
	: "0" (__NR_##name)); \\
if (__res >= 0) \\					<----------iret返回后就这一行,eax寄存器此时存放在_res中
	return (type) __res; \\
errno = -__res; \\
return -1; \\

return (type) _res将进程号1返回，回到调用点if(!fork())处执行，!1为假，这样就不会执行到init()函数中，而是进程0继续执行，直到pause()函数

void main(void)

	move_to_user_mode();
	if(!fork()) 
		init();
	
	
	for(; ;) pause();		//执行

进入pause函数后，最终会映射到sys_pause函数，该函数将进程置为可中断等待状态，同时调用schedule函数。

int sys_pause(void)

	current->state = TASK_INTERRUPTIBLE;
	schedule();
	return 0;

schedule代码如下：

/*
 *  \'schedule()\' is the scheduler function. This is GOOD CODE! There
 * probably won\'t be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the \'idle\' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The \'state\'
 * information in task[0] is never used.
 */
void schedule(void)

	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) 
			if ((*p)->alarm && (*p)->alarm < jiffies) 
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;
		

/* this is the scheduler proper: */

	while (1) 
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) 
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	
	switch_to(next);


/*
 *	switch_to(n) should switch tasks to task nr n, first
 * checking that n isn\'t the current task, in which case it does nothing.
 * This also clears the TS-flag if the task we switched to has used
 * tha math co-processor latest.
 */
#define switch_to(n) \\
struct long a,b; __tmp; \\
__asm__("cmpl %%ecx,_current\\n\\t" \\
	"je 1f\\n\\t" \\					//如果进程n是当前进程，则没必要切换，直接退出
	"movw %%dx,%1\\n\\t" \\
	"xchgl %%ecx,_current\\n\\t" \\
	"ljmp %0\\n\\t" \\					//通过TSS选择符进行跳转，此时保存各个寄存器到进程0的TSS中
	"cmpl %%ecx,_last_task_used_math\\n\\t" \\
	"jne 1f\\n\\t" \\
	"clts\\n" \\
	"1:" \\
	::"m" (*&__tmp.a),"m" (*&__tmp.b), \\
	"d" (_TSS(n)),"c" ((long) task[n])); \\

ljmp跳转到进程1执行，将CPU的各个寄存器值保存在进程0的TSS中，将进程1的TSS数据以及LDT的代码段、数据段描述符数据恢复给CPU的各个寄存器，实现从0特权级的内核代码切换到3特权级的进程1代码执行。

进程1执行

当时为进程1的TSS中eax设置为0，eip设置为int 0x80下一指令的地址，即if(_res >= 0)。进程开始从这一行执行。

#define _syscall0(type,name) \\
type name(void) \\
 \\
long __res; \\
__asm__ volatile ("int $0x80" \\
	: "=a" (__res) \\
	: "0" (__NR_##name)); \\
if (__res >= 0) \\					<----------进程0从这一行开始执行，但此时eax存放值为0
	return (type) __res; \\
errno = -__res; \\
return -1; \\

返回后，执行到main()函数中if(!fork())这一行，!0为“真”，调用init()函数！

void init(void)

	int pid,i;

	setup((void *) &drive_info);
	(void) open("/dev/tty0",O_RDWR,0);
	(void) dup(0);
	(void) dup(0);
	printf("%d buffers = %d bytes buffer space\\n\\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\\n\\r",memory_end-main_memory_start);
	if (!(pid=fork())) 
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);
	
	if (pid>0)
		while (pid != wait(&i))
			/* nothing */;
	while (1) 
		if ((pid=fork())<0) 
			printf("Fork failed in init\\r\\n");
			continue;
		
		if (!pid) 
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		
		while (1)
			if (pid == wait(&i))
				break;
		printf("\\n\\rchild %d died with code %04x\\n\\r",pid,i);
		sync();
	
	_exit(0);	/* NOTE! _exit, not exit() */

执行setup函数对根文件系统初始化后，进程1又打开了/dev/tty0字符文件，之后调用fork函数创建进程2。

fork映射到sys_fork，执行过程和之前是一样的，会为进程2的task_struct以及内核栈申请页面，并复制task_struct，随后对进程2的task_struct进行各种个性化设置，包括各个寄存器的设置、内存页面的管理设置、共享文件的设置、GDT表项的设置等。不太一样的是进程2复制了进程1的1024个页表项。

进程2 shell

进程2创建完毕后，fork()函数返回，返回值为2，因此!(pid=fork())为假，于是调用wait()函数。此函数的功能是：如果进程1有等待退出的子进程，就为该进程的退出做善后工作；如果有子进程，但并不等待退出，则进行进程切换；如果没有子进程，函数返回。

//代码路径：/lib/wait.c

_syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)

pid_t wait(int * wait_stat)

	return waitpid(-1,wait_stat,0);


//代码路径：kernel/exit.c
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)

	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) 
		if (!*p || *p == current)
			continue;
		if ((*p)->father != current->pid)
			continue;
		if (pid>0) 						//wait时传入pid为-1
			if ((*p)->pid != pid)
				continue;
		 else if (!pid) 
			if ((*p)->pgrp != current->pgrp)
				continue;
		 else if (pid != -1) 
			if ((*p)->pgrp != -pid)
				continue;
		
		switch ((*p)->state) 
			case TASK_STOPPED:
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;
				code = (*p)->exit_code;
				release(*p);
				put_fs_long(code,stat_addr);
				return flag;
			default:					//此时进程2为就绪态，执行到这里
				flag=1;
				continue;
		
	
	if (flag) 
		if (options & WNOHANG)
			return 0;
		current->state=TASK_INTERRUPTIBLE;		//将进程1设置为可中断等待状态
		schedule();								//执行进程2
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))
			goto repeat;
		else
			return -EINTR;
	
	return -ECHILD;

轮转到进程2执行，进程2关闭文件后打开文件/etc/rc

	if (!(pid=fork())) 
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);

之后执行execve函数，进入内核对应_sys_execve函数

_sys_execve:
	lea EIP(%esp),%eax
	pushl %eax
	call _do_execve
	addl $4,%esp
	ret

#define MAX_ARG_PAGES 32
/*
 * \'do_execve()\' executes a new program.
 */
int do_execve(unsigned long * eip,long tmp,char * filename,
	char ** argv, char ** envp)

	struct m_inode * inode;
	struct buffer_head * bh;
	struct exec ex;
	unsigned long page[MAX_ARG_PAGES];
	int i,argc,envc;
	int e_uid, e_gid;
	int retval;
	int sh_bang = 0;
	unsigned long p=PAGE_SIZE*MAX_ARG_PAGES-4;

	if ((0xffff & eip[1]) != 0x000f)
		panic("execve called from supervisor mode");
	for (i=0 ; i<MAX_ARG_PAGES ; i++)	/* clear page-table */
		page[i]=0;
	if (!(inode=namei(filename)))		/* get executables inode */
		return -ENOENT;
	argc = count(argv);
	envc = count(envp);
	
restart_interp:
	if (!S_ISREG(inode->i_mode)) 	/* must be regular file */
		retval = -EACCES;
		goto exec_error2;
	
	i = inode->i_mode;					//文件uid、gid暂时还看不懂
	e_uid = (i & S_ISUID) ? inode->i_uid : current->euid;
	e_gid = (i & S_ISGID) ? inode->i_gid : current->egid;
	if (current->euid == inode->i_uid)
		i >>= 6;
	else if (current->egid == inode->i_gid)
		i >>= 3;
	if (!(i & 1) &&
	    !((inode->i_mode & 0111) && suser())) 
		retval = -ENOEXEC;
		goto exec_error2;
	
	if (!(bh = bread(inode->i_dev,inode->i_zone[0]))) 
		retval = -EACCES;
		goto exec_error2;
	
	ex = *((struct exec *) bh->b_data);	/* read exec-header */
	if ((bh->b_data[0] == \'#\') && (bh->b_data[1] == \'!\') && (!sh_bang)) 
		/*
		 * This section does the #! interpretation.
		 * Sorta complicated, but hopefully it will work.  -TYT
		 */

		char buf[1023], *cp, *interp, *i_name, *i_arg;
		unsigned long old_fs;

		strncpy(buf, bh->b_data+2, 1022);
		brelse(bh);
		iput(inode);
		buf[1022] = \'\\0\';
		if (cp = strchr(buf, \'\\n\')) 
			*cp = \'\\0\';
			for (cp = buf; (*cp == \' \') || (*cp == \'\\t\'); cp++);
		
		if (!cp || *cp == \'\\0\') 
			retval = -ENOEXEC; /* No interpreter name found */
			goto exec_error1;
		
		interp = i_name = cp;
		i_arg = 0;
		for ( ; *cp && (*cp != \' \') && (*cp != \'\\t\'); cp++) 
 			if (*cp == \'/\')
				i_name = cp+1;
		
		if (*cp) 
			*cp++ = \'\\0\';
			i_arg = cp;
		
		/*
		 * OK, we\'ve parsed out the interpreter name and
		 * (optional) argument.
		 */
		if (sh_bang++ == 0) 
			p = copy_strings(envc, envp, page, p, 0);
			p = copy_strings(--argc, argv+1, page, p, 0);
		
		/*
		 * Splice in (1) the interpreter\'s name for argv[0]
		 *           (2) (optional) argument to interpreter
		 *           (3) filename of shell script
		 *
		 * This is done in reverse order, because of how the
		 * user environment and arguments are stored.
		 */
		p = copy_strings(1, &filename, page, p, 1);
		argc++;
		if (i_arg) 
			p = copy_strings(1, &i_arg, page, p, 2);
			argc++;
		
		p = copy_strings(1, &i_name, page, p, 2);
		argc++;
		if (!p) 
			retval = -ENOMEM;
			goto exec_error1;
		
		/*
		 * OK, now restart the process with the interpreter\'s inode.
		 */
		old_fs = get_fs();
		set_fs(get_ds());
		if (!(inode=namei(interp)))  /* get executables inode */
			set_fs(old_fs);
			retval = -ENOENT;
			goto exec_error1;
		
		set_fs(old_fs);
		goto restart_interp;
	
	brelse(bh);		//检测可执行文件是否可执行
	if (N_MAGIC(ex) != ZMAGIC || ex.a_trsize || ex.a_drsize ||
		ex.a_text+ex.a_data+ex.a_bss>0x3000000 ||
		inode->i_size < ex.a_text+ex.a_data+ex.a_syms+N_TXTOFF(ex)) 
		retval = -ENOEXEC;
		goto exec_error2;
	
	if (N_TXTOFF(ex) != BLOCK_SIZE) 
		printk("%s: N_TXTOFF != BLOCK_SIZE. See a.out.h.", filename);
		retval = -ENOEXEC;
		goto exec_error2;
	
	if (!sh_bang) 
		p = copy_strings(envc,envp,page,p,0);		//分配页面，将环境变量和参数拷贝到新分配的页面中
		p = copy_strings(argc,argv,page,p,0);		//page数组存放的分配页面的物理地址
		if (!p) 
			retval = -ENOMEM;
			goto exec_error2;
		
	
/* OK, This is the point of no return */
	if (current->executable)
		iput(current->executable);			//减少inode的引用数，由于进程2复制进程1的执行文件是0，所以不会执行
	current->executable = inode;
	for (i=0 ; i<32 ; i++)
		current->sigaction[i].sa_handler = NULL;
	for (i=0 ; i<NR_OPEN ; i++)
		if ((current->close_on_exec>>i)&1)		//遍历close_on_exec，关闭需要关闭的文件
			sys_close(i);
	current->close_on_exec = 0;
	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));		//非常关键，并不直接将可执行文件读进内存然后设置页表来映射，需要触发缺页中断开始执行
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	current->used_math = 0;
	p += change_ldt(ex.a_text,page)-MAX_ARG_PAGES*PAGE_SIZE;		//修改数据段的data limit为data_limit = 0x4000000;最高地址准备存放环境变量和参数
	p = (unsigned long) create_tables((char *)p,argc,envc);			//设置环境变量parses the env- and arg-strings in new user memory and creates the pointer tables from them, and puts their addresses on the "stack", returning the new stack pointer value.
	current->brk = ex.a_bss +
		(current->end_data = ex.a_data +
		(current->end_code = ex.a_text));
	current->start_stack = p & 0xfffff000;						//
	current->euid = e_uid;
	current->egid = e_gid;
	i = ex.a_text+ex.a_data;
	while (i&0xfff)
		put_fs_byte(0,(char *) (i++));
	eip[0] = ex.a_entry;		/* eip, magic happens :-) */			//改变栈上的内容
	eip[3] = p;			/* stack pointer */			//设置eip和esp
	return 0;
exec_error2:
	iput(inode);
exec_error1:
	for (i=0 ; i<MAX_ARG_PAGES ; i++)
		free_page(page[i]);
	return(retval);

shell程序开始执行时，其线性地址空间对应的程序内容并未加载，也就不存在相应的页面，因此会产生一个“页异常”中断，此中断会进一步调用“缺页中断”处理程序来分配该页面，并加载一页shell程序。

//代码路径：mm/page.s
.globl _page_fault

_page_fault:
	xchgl %eax,(%esp)			//error code
	pushl %ecx
	pushl %edx
	push %ds
	push %es
	push %fs
	movl $0x10,%edx
	mov %dx,%ds
	mov %dx,%es
	mov %dx,%fs
	movl %cr2,%edx				//address
	pushl %edx					//addr
	pushl %eax					//error code
	testl $1,%eax				//p bits
	jne 1f						//p != 0
	call _do_no_page			//p = 0 means no page
	jmp 2f
1:	call _do_wp_page
2:	addl $8,%esp
	pop %fs
	pop %es
	pop %ds
	popl %edx
	popl %ecx
	popl %eax
	iret

通过检测error code后调用do_no_page()函数，先确定缺页的原因。加入是由于需要加载程序才却页。会尝试与其他进程共享shell（显然此前没有进程加载过shell，无法共享），于是申请一个页面，调用bread_page()函数，从虚拟盘上读取4块（4KB，一页）shell程序内容，载入内存页面。

void do_no_page(unsigned long error_code,unsigned long address)

	int nr[4];
	unsigned long tmp;
	unsigned long page;
	int block,i;

	address &= 0xfffff000;
	tmp = address - current->start_code;					
	if (!current->executable || tmp >= current->end_data) 
		get_empty_page(address);
		return;
	
	if (share_page(tmp))
		return;
	if (!(page = get_free_page()))
		oom();
/* remember that 1 block is used for header */
	block = 1 + tmp/BLOCK_SIZE;
	for (i=0 ; i<4 ; block++,i++)
		nr[i] = bmap(current->executable,block);
	bread_page(page,current->executable->i_dev,nr);		//读取四个逻辑块（1页）的shell程序内容进内存页面
	i = tmp + 4096 - current->end_data;
	tmp = page + 4096;
	while (i-- > 0) 
		tmp--;
		*(char *)tmp = 0;
	
	if (put_page(page,address))			//修改页表，建立映射
		return;
	free_page(page);
	oom();

载入一页的shell程序后，内核会将该页内容映射到shell进程的线性地址空间内，建立页目录表->页表->页面的三级映射管理关系。

进程3 update

之后进程2读取rc文件上的信息，fork出了update进程，这个新进程的进程号为3。update进程有一项很重要的任务：将缓冲区中的数据同步到外色号（软盘、硬盘）上。由于主机与外设的数据交换速度远低于主机内部的数据处理速度，因此，当内核需要往外设上写数据的时候，为了提高系统的整体执行效率，并不把数据直接写入到外设上，而是先写入缓冲区，之后根据实际情况，再将数据从缓冲区同步到外设。

每隔一段时间，update进程就会被唤醒，把数据往外设上同步一次，之后这个进程会被挂起，即被设置为可中断等待状态，等待着下一次被唤醒后继续执行，如此周而复始。

update进程执行后，没有同步任务，于是该进程被挂起，系统执行进程调度，最终切换到shell进程继续执行。

完成工作后调用exit()函数，对应的系统调用函数为sys_exit()，执行代码如下

int sys_exit(int error_code)

	return do_exit((error_code&0xff)<<8);


int do_exit(long code)

	int i;

	free_page_tables(get_base(current->ldt[1]),get_limit(0x0f));
	free_page_tables(get_base(current->ldt[2]),get_limit(0x17));
	for (i=0 ; i<NR_TASKS ; i++)
		if (task[i] && task[i]->father == current->pid) 			//寻找子进程
			task[i]->father = 1;									//将子进程父进程设置为1
			if (task[i]->state == TASK_ZOMBIE)
				/* assumption task[1] is always init */
				(void) send_sig(SIGCHLD, task[1], 1);				//如果子进程为zombie，即已经调用了exit函数，给进程1发送信号。
		
	for (i=0 ; i<NR_OPEN ; i++)
		if (current->filp[i])
			sys_close(i);
	iput(current->pwd);
	current->pwd=NULL;
	iput(current->root);
	current->root=NULL;
	iput(current->executable);
	current->executable=NULL;
	if (current->leader && current->tty >= 0)
		tty_table[current->tty].pgrp = 0;
	if (last_task_used_math == current)
		last_task_used_math = NULL;
	if (current->leader)
		kill_session();
	current->state = TASK_ZOMBIE;
	current->exit_code = code;
	tell_father(current->father);
	schedule();
	return (-1);	/* just to suppress warnings */

值得注意的是tell_father()和schedule()函数的执行，tell_father向父进程发送SIGCHLD信号

static void tell_father(int pid)

	int i;

	if (pid)
		for (i=0;i<NR_TASKS;i++) 
			if (!task[i])
				continue;
			if (task[i]->pid != pid)
				continue;
			task[i]->signal |= (1<<(SIGCHLD-1));
			return;
		
/* if we don\'t find any fathers, we just release ourselves */
/* This is not really OK. Must change it to make father 1 */
	printk("BAD BAD - no father found\\n\\r");
	release(current);

tell_father()函数执行完毕后，调用schedule()函数准备进程切换。此次schedule()函数中对信号的检测，影响了进程切换

/*
 *  \'schedule()\' is the scheduler function. This is GOOD CODE! There
 * probably won\'t be any reason to change this, as it should work well
 * in all circumstances (ie gives IO-bound processes good response etc).
 * The one thing you might take a look at is the signal-handler code here.
 *
 *   NOTE!!  Task 0 is the \'idle\' task, which gets called when no other
 * tasks can run. It can not be killed, and it cannot sleep. The \'state\'
 * information in task[0] is never used.
 */
void schedule(void)

	int i,next,c;
	struct task_struct ** p;

/* check alarm, wake up any interruptible tasks that have got a signal */

	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
		if (*p) 
			if ((*p)->alarm && (*p)->alarm < jiffies) 
					(*p)->signal |= (1<<(SIGALRM-1));
					(*p)->alarm = 0;
				
			if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
			(*p)->state==TASK_INTERRUPTIBLE)
				(*p)->state=TASK_RUNNING;			//将进程1设置为task_running
		

/* this is the scheduler proper: */

	while (1) 
		c = -1;
		next = 0;
		i = NR_TASKS;
		p = &task[NR_TASKS];
		while (--i) 
			if (!*--p)
				continue;
			if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
				c = (*p)->counter, next = i;
		
		if (c) break;
		for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
			if (*p)
				(*p)->counter = ((*p)->counter >> 1) +
						(*p)->priority;
	
	switch_to(next);

将进程1设置为task_running后，后面调度到进程1执行，此时task1还在schedule函数中，执行完毕后继续执行sys_waitpid()函数

int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)

	int flag, code;
	struct task_struct ** p;

	verify_area(stat_addr,4);
repeat:
	flag=0;
	for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) 
		if (!*p || *p == current)
			continue;
		if ((*p)->father != current->pid)
			continue;
		if (pid>0) 
			if ((*p)->pid != pid)
				continue;
		 else if (!pid) 
			if ((*p)->pgrp != current->pgrp)
				continue;
		 else if (pid != -1) 
			if ((*p)->pgrp != -pid)
				continue;
		
		switch ((*p)->state) 
			case TASK_STOPPED:
				if (!(options & WUNTRACED))
					continue;
				put_fs_long(0x7f,stat_addr);
				return (*p)->pid;
			case TASK_ZOMBIE:					//找到子进程的状态为task_zombie
				current->cutime += (*p)->utime;
				current->cstime += (*p)->stime;
				flag = (*p)->pid;				//记录进程2的pid
				code = (*p)->exit_code;
				release(*p);					//释放他的task_struct页面
				put_fs_long(code,stat_addr);
				return flag;
			default:
				flag=1;
				continue;
		
	
	if (flag) 
		if (options & WNOHANG)
			return 0;
		current->state=TASK_INTERRUPTIBLE;
		schedule();
		if (!(current->signal &= ~(1<<(SIGCHLD-1))))		//执行到这里，检测到SIGCHLD，确定有子进程要退出，if条件成立，repeat
			goto repeat;
		else
			return -EINTR;
	
	return -ECHILD;

sys_waitpid()函数执行完毕后，会回到wait()函数，最后返回到init()函数中，进程1继续执行。

void init(void)

	int pid,i;

	setup((void *) &drive_info);
	(void) open("/dev/tty0",O_RDWR,0);
	(void) dup(0);
	(void) dup(0);
	printf("%d buffers = %d bytes buffer space\\n\\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\\n\\r",memory_end-main_memory_start);
	if (!(pid=fork())) 
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);		//不会执行到这句话
	
	if (pid>0)
		while (pid != wait(&i))		//此时wait的返回值为2，while条件为假，退出
			/* nothing */;
	while (1) 
		if ((pid=fork())<0) 				//fork进程
			printf("Fork failed in init\\r\\n");
			continue;
		
		if (!pid) 					//此时进程号为4，虽然在task数组中索引为2（因为之前shell进程还fork出了一个update进程，该进程pid为3
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);	//
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		
		while (1)
			if (pid == wait(&i))					//因为进程1是所有孤儿进程的父进程，因此要一直wait，从而release子进程
				break;
		printf("\\n\\rchild %d died with code %04x\\n\\r",pid,i);
		sync();
	
	_exit(0);	/* NOTE! _exit, not exit() */

进程4 shell

这次shell打开的是标准输入设备文件tty0而不是rc，这使得shell开始执行后，不再退出。进入rw_char()函数后，shell进程将被设置为可中断等待状态，这样所有的进程都处于可中断等待状态，再次切换到进程0去执行，系统实现怠速。

怠速以后，操作系统用户将通过shell进程提供的平台与计算机进行交互，shell进程处理用户指令的工作原理如下：用户通过键盘输入信息，存储在指定的字符缓冲队列上。该缓冲队列上的内容，就是tty0文件的内容，shell进程会不断读取缓冲队列上的数据信息。如果用户没有下达指令，缓冲队列就不会有数据，shell进程将会被设置为可终端等待状态，即被挂起。如果用户通过键盘下达指令，将产生键盘中断，中断程序会将字符信息存储在缓冲队列上，并给shell进程发信号，信号将导致shell进程被设置为就绪状态，即被唤醒，唤醒后的shell继续从缓冲队列中读取数据信息并处理，完毕后，shell进程将再次被挂起，等待下一次键盘中断被唤醒。

以上是关于Linux 0.11 进程012的主要内容，如果未能解决你的问题，请参考以下文章