从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团

Posted 高桐@BILL

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团相关的知识,希望对你有一定的参考价值。

写在前面:

本文的代码分析基于Android的kernel源码(3.18版本)。通过该源码,我们来了解Kernel在进行进程调度时,RT进程为何优先于CFS进程被调度。

一、进程调度流程

代码路径:/kernel/sched/core.c,这部分kernel代码来自http://androidxref.com/kernel_3.18/.

相关进程调度流程如下:

二、代码梳理

2.1 调度器函数

/kernel/sched/core.c

    static void __sched __schedule(void)
2766
2767	struct task_struct *prev, *next;
2768	unsigned long *switch_count;
2769	struct rq *rq;
2770	int cpu;
2771
2772need_resched:
2773	preempt_disable();
2774	cpu = smp_processor_id();
2775	rq = cpu_rq(cpu);
2776	rcu_note_context_switch(cpu);
2777	prev = rq->curr;
2778
2779	schedule_debug(prev);
2780
2781	if (sched_feat(HRTICK))
2782		hrtick_clear(rq);
2783
2784	/*
2785	 * Make sure that signal_pending_state()->signal_pending() below
2786	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2787	 * done by the caller to avoid the race with signal_wake_up().
2788	 */
2789	smp_mb__before_spinlock();
2790	raw_spin_lock_irq(&rq->lock);
2791
2792	switch_count = &prev->nivcsw;
2793	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) 
2794		if (unlikely(signal_pending_state(prev->state, prev))) 
2795			prev->state = TASK_RUNNING;
2796		 else 
2797			deactivate_task(rq, prev, DEQUEUE_SLEEP);
2798			prev->on_rq = 0;
2799
2800			/*
2801			 * If a worker went to sleep, notify and ask workqueue
2802			 * whether it wants to wake up a task to maintain
2803			 * concurrency.
2804			 */
2805			if (prev->flags & PF_WQ_WORKER) 
2806				struct task_struct *to_wakeup;
2807
2808				to_wakeup = wq_worker_sleeping(prev, cpu);
2809				if (to_wakeup)
2810					try_to_wake_up_local(to_wakeup);
2811			
2812		
2813		switch_count = &prev->nvcsw;
2814	
2815
2816	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2817		update_rq_clock(rq);
2818
2819	next = pick_next_task(rq, prev);
2820	clear_tsk_need_resched(prev);
2821	clear_preempt_need_resched();
2822	rq->skip_clock_update = 0;
2823
2824	if (likely(prev != next)) 
2825		rq->nr_switches++;
2826		rq->curr = next;
2827		++*switch_count;
2828
2829		context_switch(rq, prev, next); /* unlocks the rq */
2830		/*
2831		 * The context switch have flipped the stack from under us
2832		 * and restored the local variables which were saved when
2833		 * this task called schedule() in the past. prev == current
2834		 * is still correct, but it can be moved to another cpu/rq.
2835		 */
2836		cpu = smp_processor_id();
2837		rq = cpu_rq(cpu);
2838	 else
2839		raw_spin_unlock_irq(&rq->lock);
2840
2841	post_schedule(rq);
2842
2843	sched_preempt_enable_no_resched();
2844	if (need_resched())
2845		goto need_resched;
2846

2.2 选择最高优先级的进程进行执行

/kernel/sched/core.c

2692static inline struct task_struct *
2693pick_next_task(struct rq *rq, struct task_struct *prev)
2694
2695	const struct sched_class *class = &fair_sched_class;
2696	struct task_struct *p;
2697
2698	/*
2699	 * Optimization: we know that if all tasks are in
2700	 * the fair class we can call that function directly:
2701	 */
2702	if (likely(prev->sched_class == class &&
2703		   rq->nr_running == rq->cfs.h_nr_running)) //如果当前任务队列没有RT任务,则选择普通进程
2704		p = fair_sched_class.pick_next_task(rq, prev);
2705		if (unlikely(p == RETRY_TASK))
2706			goto again;
2707
2708		/* assumes fair_sched_class->next == idle_sched_class */
2709		if (unlikely(!p))
2710			p = idle_sched_class.pick_next_task(rq, prev);
2711
2712		return p;
2713	
2714
2715again:
2716	for_each_class(class) 
2717		p = class->pick_next_task(rq, prev);//从这里来看,只要有rt进程就不会接着继续找了
2718		if (p) 
2719			if (unlikely(p == RETRY_TASK))
2720				goto again;
2721			return p;
2722		
2723	
2724
2725	BUG(); /* the idle class will always have a runnable task */
2726

2.3 调度类轮询

1 Stop调度器

优先级最高的调度器,可以抢占其他所有进程,不能被其他进程抢占;

//轮询调度类时,首先从stop_sched_class开始;stop调度器作为最高优先级的调度器,可以抢占其他所有进程。Stop调度器仅由内核使用,用户无法选择,这里不做讨论。
1150#define sched_class_highest (&stop_sched_class)
1151#define for_each_class(class) \\
1152   for (class = sched_class_highest; class; class = class->next)
1153
1154extern const struct sched_class stop_sched_class;
1155extern const struct sched_class dl_sched_class;
1156extern const struct sched_class rt_sched_class;
1157extern const struct sched_class fair_sched_class;
1158extern const struct sched_class idle_sched_class;
109/*
110 * Simple, special scheduling class for the per-CPU stop tasks:
111 */
112const struct sched_class stop_sched_class = 
113	.next			= &dl_sched_class,
114
115	.enqueue_task		= enqueue_task_stop,
116	.dequeue_task		= dequeue_task_stop,
117	.yield_task		= yield_task_stop,
118
119	.check_preempt_curr	= check_preempt_curr_stop,
120
121	.pick_next_task		= pick_next_task_stop,
122	.put_prev_task		= put_prev_task_stop,
123
124#ifdef CONFIG_SMP
125	.select_task_rq		= select_task_rq_stop,
126#endif
127
128	.set_curr_task          = set_curr_task_stop,
129	.task_tick		= task_tick_stop,
130
131	.get_rr_interval	= get_rr_interval_stop,
132
133	.prio_changed		= prio_changed_stop,
134	.switched_to		= switched_to_stop,
135	.update_curr		= update_curr_stop,
136;

2 dl调度器

使用红黑树,把进程按照绝对截止时间(deadline)进行排序,选择截止时间最早(值最小)的进程运行;

1676const struct sched_class dl_sched_class = 
1677	.next			= &rt_sched_class,
1678	.enqueue_task		= enqueue_task_dl,
1679	.dequeue_task		= dequeue_task_dl,
1680	.yield_task		= yield_task_dl,
1681
1682	.check_preempt_curr	= check_preempt_curr_dl,
1683
1684	.pick_next_task		= pick_next_task_dl,
1685	.put_prev_task		= put_prev_task_dl,
1686
1687#ifdef CONFIG_SMP
1688	.select_task_rq		= select_task_rq_dl,
1689	.set_cpus_allowed       = set_cpus_allowed_dl,
1690	.rq_online              = rq_online_dl,
1691	.rq_offline             = rq_offline_dl,
1692	.post_schedule		= post_schedule_dl,
1693	.task_woken		= task_woken_dl,
1694#endif
1695
1696	.set_curr_task		= set_curr_task_dl,
1697	.task_tick		= task_tick_dl,
1698	.task_fork              = task_fork_dl,
1699	.task_dead		= task_dead_dl,
1700
1701	.prio_changed           = prio_changed_dl,
1702	.switched_from		= switched_from_dl,
1703	.switched_to		= switched_to_dl,
1704
1705	.update_curr		= update_curr_dl,
1706;

3 rt调度器

2102const struct sched_class rt_sched_class = 
2103	.next			= &fair_sched_class,
2104	.enqueue_task		= enqueue_task_rt,
2105	.dequeue_task		= dequeue_task_rt,
2106	.yield_task		= yield_task_rt,
2107
2108	.check_preempt_curr	= check_preempt_curr_rt,
2109
2110	.pick_next_task		= pick_next_task_rt,
2111	.put_prev_task		= put_prev_task_rt,
2112
2113#ifdef CONFIG_SMP
2114	.select_task_rq		= select_task_rq_rt,
2115
2116	.set_cpus_allowed       = set_cpus_allowed_rt,
2117	.rq_online              = rq_online_rt,
2118	.rq_offline             = rq_offline_rt,
2119	.post_schedule		= post_schedule_rt,
2120	.task_woken		= task_woken_rt,
2121	.switched_from		= switched_from_rt,
2122#endif
2123
2124	.set_curr_task          = set_curr_task_rt,
2125	.task_tick		= task_tick_rt,
2126
2127	.get_rr_interval	= get_rr_interval_rt,
2128
2129	.prio_changed		= prio_changed_rt,
2130	.switched_to		= switched_to_rt,
2131
2132	.update_curr		= update_curr_rt,
2133;

4 cfs调度器

采用完全公平调度算法的调度器

7929/*
7930 * All the scheduling class methods:
7931 */
7932const struct sched_class fair_sched_class = 
7933	.next			= &idle_sched_class,
7934	.enqueue_task		= enqueue_task_fair,
7935	.dequeue_task		= dequeue_task_fair,
7936	.yield_task		= yield_task_fair,
7937	.yield_to_task		= yield_to_task_fair,
7938
7939	.check_preempt_curr	= check_preempt_wakeup,
7940
7941	.pick_next_task		= pick_next_task_fair,
7942	.put_prev_task		= put_prev_task_fair,
7943
7944#ifdef CONFIG_SMP
7945	.select_task_rq		= select_task_rq_fair,
7946	.migrate_task_rq	= migrate_task_rq_fair,
7947
7948	.rq_online		= rq_online_fair,
7949	.rq_offline		= rq_offline_fair,
7950
7951	.task_waking		= task_waking_fair,
7952#endif
7953
7954	.set_curr_task          = set_curr_task_fair,
7955	.task_tick		= task_tick_fair,
7956	.task_fork		= task_fork_fair,
7957
7958	.prio_changed		= prio_changed_fair,
7959	.switched_from		= switched_from_fair,
7960	.switched_to		= switched_to_fair,
7961
7962	.get_rr_interval	= get_rr_interval_fair,
7963
7964	.update_curr		= update_curr_fair,
7965
7966#ifdef CONFIG_FAIR_GROUP_SCHED
7967	.task_move_group	= task_move_group_fair,
7968#endif
7969;

如果没有更高优先级调度类(stop、dl、rt)的可运行进程,那么cfs调度器会从自己的就绪队列中选择虚拟运行时间(vruntime)最小的进程来执行。CFS使用红黑树组织就绪队列,因此可以快速找到vruntime值最小的那个进程:只需要取树中最左侧的节点即可。

此外,CFS调度器在时钟中断中通过check_preempt_tick()函数,根据当前进程本次已经运行的时间,判断是否需要调用resched_curr()为当前进程的thread_info标志位设置TIF_NEED_RESCHED。这样在本次时钟中断处理完成后(即从中断处理返回时),内核会根据该标志进行重新调度。

/kernel/sched/fair.c

2941/*
2942 * Preempt the current task with a newly woken task if needed:
2943 */
2944static void
2945check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2946
2947	unsigned long ideal_runtime, delta_exec;
2948	struct sched_entity *se;
2949	s64 delta;
2950
2951	ideal_runtime = sched_slice(cfs_rq, curr);//计算调度周期
2952	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2953	if (delta_exec > ideal_runtime) 
2954		resched_curr(rq_of(cfs_rq));
2955		/*
2956		 * The current task ran long enough, ensure it doesn't get
2957		 * re-elected due to buddy favours.
2958		 */
2959		clear_buddies(cfs_rq, curr);
2960		return;
2961	
2962
2963	/*
2964	 * Ensure that a task that missed wakeup preemption by a
2965	 * narrow margin doesn't have to wait for a full slice.
2966	 * This also mitigates buddy induced latencies under load.
2967	 */
2968	if (delta_exec < sysctl_sched_min_granularity)
2969		return;
2970
2971	se = __pick_first_entity(cfs_rq);
2972	delta = curr->vruntime - se->vruntime;
2973
2974	if (delta < 0)
2975		return;
2976
2977	if (delta > ideal_runtime)
2978		resched_curr(rq_of(cfs_rq));
2979

注意:通过“zcat /proc/config.gz”命令查看当前内核的配置,可以看到CONFIG_HZ_100=y,即HZ=100,因此每个cpu每10ms触发一次时钟中断。

5 idle调度器

空闲调度器,每个cpu都会有个idle线程,当没有其他进程可调度时,运行idle线程。

82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = 
86	/* .next is NULL */
87	/* no enqueue/yield_task for idle tasks */
88
89	/* dequeue is not valid, we print a debug message there: */
90	.dequeue_task		= dequeue_task_idle,
91
92	.check_preempt_curr	= check_preempt_curr_idle,
93
94	.pick_next_task		= pick_next_task_idle,
95	.put_prev_task		= put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98	.select_task_rq		= select_task_rq_idle,
99#endif
100
101	.set_curr_task          = set_curr_task_idle,
102	.task_tick		= task_tick_idle,
103
104	.get_rr_interval	= get_rr_interval_idle,
105
106	.prio_changed		= prio_changed_idle,
107	.switched_to		= switched_to_idle,
108	.update_curr		= update_curr_idle,
109;

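如上,pick_next_task()通过for_each_class按照各调度类.next指针串成的顺序(stop → dl → rt → fair → idle)依次轮询,一旦某个调度类返回了可运行的进程就立即返回。由于rt_sched_class排在fair_sched_class之前,所以rt进程总是比cfs进程优先被调度。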

以上是关于从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团的主要内容,如果未能解决你的问题,请参考以下文章

什么优先于SCHED_DEADLINE?

第一次作业:深入源码分析进程模型(Linux kernel 2.6.32)

Linux(内核剖析):11---进程调度之实时调度策略(SCHED_FIFO、SCHED_RR、MAX_RT_PRIO、实时优先级)

第一次作业: 基于Linux Kernel 2.6 的源码, 分析其进程模型

基于Linux Kernel Version 4.13.0-36-generic的源码分析进程模型

Linux 内核进程管理 ( 内核线程概念 | 内核线程、普通进程、用户线程 | 内核线程与普通进程区别 | 内核线程主要用途 | 内核线程创建函数 kernel_thread 源码 )