从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团
Posted 高桐@BILL
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团相关的知识,希望对你有一定的参考价值。
写在前面:
该代码分析基于Android的kernel源码。通过该源码,我们来了解一下Kernel在进行进程调度的时候,RT进程为何优先于CFS进程被调度。
一、进程调度流程
代码路径:/kernel/sched/core.c,这部分kernel代码来自http://androidxref.com/kernel_3.18/.
相关进程调度流程如下:
二、代码梳理
2.1 调度器函数
/kernel/sched/core.c
static void __sched __schedule(void)
2766
2767 struct task_struct *prev, *next;
2768 unsigned long *switch_count;
2769 struct rq *rq;
2770 int cpu;
2771
2772need_resched:
2773 preempt_disable();
2774 cpu = smp_processor_id();
2775 rq = cpu_rq(cpu);
2776 rcu_note_context_switch(cpu);
2777 prev = rq->curr;
2778
2779 schedule_debug(prev);
2780
2781 if (sched_feat(HRTICK))
2782 hrtick_clear(rq);
2783
2784 /*
2785 * Make sure that signal_pending_state()->signal_pending() below
2786 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2787 * done by the caller to avoid the race with signal_wake_up().
2788 */
2789 smp_mb__before_spinlock();
2790 raw_spin_lock_irq(&rq->lock);
2791
2792 switch_count = &prev->nivcsw;
2793 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE))
2794 if (unlikely(signal_pending_state(prev->state, prev)))
2795 prev->state = TASK_RUNNING;
2796 else
2797 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2798 prev->on_rq = 0;
2799
2800 /*
2801 * If a worker went to sleep, notify and ask workqueue
2802 * whether it wants to wake up a task to maintain
2803 * concurrency.
2804 */
2805 if (prev->flags & PF_WQ_WORKER)
2806 struct task_struct *to_wakeup;
2807
2808 to_wakeup = wq_worker_sleeping(prev, cpu);
2809 if (to_wakeup)
2810 try_to_wake_up_local(to_wakeup);
2811
2812
2813 switch_count = &prev->nvcsw;
2814
2815
2816 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2817 update_rq_clock(rq);
2818
2819 next = pick_next_task(rq, prev);
2820 clear_tsk_need_resched(prev);
2821 clear_preempt_need_resched();
2822 rq->skip_clock_update = 0;
2823
2824 if (likely(prev != next))
2825 rq->nr_switches++;
2826 rq->curr = next;
2827 ++*switch_count;
2828
2829 context_switch(rq, prev, next); /* unlocks the rq */
2830 /*
2831 * The context switch have flipped the stack from under us
2832 * and restored the local variables which were saved when
2833 * this task called schedule() in the past. prev == current
2834 * is still correct, but it can be moved to another cpu/rq.
2835 */
2836 cpu = smp_processor_id();
2837 rq = cpu_rq(cpu);
2838 else
2839 raw_spin_unlock_irq(&rq->lock);
2840
2841 post_schedule(rq);
2842
2843 sched_preempt_enable_no_resched();
2844 if (need_resched())
2845 goto need_resched;
2846
2.2 选择最高优先级的进程进行执行
/kernel/sched/core.c
2692static inline struct task_struct *
2693pick_next_task(struct rq *rq, struct task_struct *prev)
2694
2695 const struct sched_class *class = &fair_sched_class;
2696 struct task_struct *p;
2697
2698 /*
2699 * Optimization: we know that if all tasks are in
2700 * the fair class we can call that function directly:
2701 */
2702 if (likely(prev->sched_class == class &&
2703 rq->nr_running == rq->cfs.h_nr_running)) //如果prev属于CFS调度类,且运行队列上的进程全部是CFS进程(即没有stop/dl/rt进程),则直接从CFS中选择下一个进程
2704 p = fair_sched_class.pick_next_task(rq, prev);
2705 if (unlikely(p == RETRY_TASK))
2706 goto again;
2707
2708 /* assumes fair_sched_class->next == idle_sched_class */
2709 if (unlikely(!p))
2710 p = idle_sched_class.pick_next_task(rq, prev);
2711
2712 return p;
2713
2714
2715again:
2716 for_each_class(class)
2717 p = class->pick_next_task(rq, prev);//从这里来看,只要有rt进程就不会接着继续找了
2718 if (p)
2719 if (unlikely(p == RETRY_TASK))
2720 goto again;
2721 return p;
2722
2723
2724
2725 BUG(); /* the idle class will always have a runnable task */
2726
2.3 调度类轮询
1 Stop调度器
优先级最高的调度器,可以抢占其他所有进程,不能被其他进程抢占;
//轮询调度类,首先从stop_sched_class开始;stop调度器作为最高优先级调度器,可以抢占其他所有进程。Stop调度器仅由内核使用,用户无法选择,这里不做讨论。
1150#define sched_class_highest (&stop_sched_class)
1151#define for_each_class(class) \
1152 for (class = sched_class_highest; class; class = class->next)
1153
1154extern const struct sched_class stop_sched_class;
1155extern const struct sched_class dl_sched_class;
1156extern const struct sched_class rt_sched_class;
1157extern const struct sched_class fair_sched_class;
1158extern const struct sched_class idle_sched_class;
109/*
110 * Simple, special scheduling class for the per-CPU stop tasks:
111 */
112const struct sched_class stop_sched_class =
113 .next = &dl_sched_class,
114
115 .enqueue_task = enqueue_task_stop,
116 .dequeue_task = dequeue_task_stop,
117 .yield_task = yield_task_stop,
118
119 .check_preempt_curr = check_preempt_curr_stop,
120
121 .pick_next_task = pick_next_task_stop,
122 .put_prev_task = put_prev_task_stop,
123
124#ifdef CONFIG_SMP
125 .select_task_rq = select_task_rq_stop,
126#endif
127
128 .set_curr_task = set_curr_task_stop,
129 .task_tick = task_tick_stop,
130
131 .get_rr_interval = get_rr_interval_stop,
132
133 .prio_changed = prio_changed_stop,
134 .switched_to = switched_to_stop,
135 .update_curr = update_curr_stop,
136;
2 dl调度器
使用红黑树,把进程按照绝对截止时间(deadline)进行排序,选择截止时间最早(即deadline值最小)的进程运行;
1676const struct sched_class dl_sched_class =
1677 .next = &rt_sched_class,
1678 .enqueue_task = enqueue_task_dl,
1679 .dequeue_task = dequeue_task_dl,
1680 .yield_task = yield_task_dl,
1681
1682 .check_preempt_curr = check_preempt_curr_dl,
1683
1684 .pick_next_task = pick_next_task_dl,
1685 .put_prev_task = put_prev_task_dl,
1686
1687#ifdef CONFIG_SMP
1688 .select_task_rq = select_task_rq_dl,
1689 .set_cpus_allowed = set_cpus_allowed_dl,
1690 .rq_online = rq_online_dl,
1691 .rq_offline = rq_offline_dl,
1692 .post_schedule = post_schedule_dl,
1693 .task_woken = task_woken_dl,
1694#endif
1695
1696 .set_curr_task = set_curr_task_dl,
1697 .task_tick = task_tick_dl,
1698 .task_fork = task_fork_dl,
1699 .task_dead = task_dead_dl,
1700
1701 .prio_changed = prio_changed_dl,
1702 .switched_from = switched_from_dl,
1703 .switched_to = switched_to_dl,
1704
1705 .update_curr = update_curr_dl,
1706;
3 rt调度器
2102const struct sched_class rt_sched_class =
2103 .next = &fair_sched_class,
2104 .enqueue_task = enqueue_task_rt,
2105 .dequeue_task = dequeue_task_rt,
2106 .yield_task = yield_task_rt,
2107
2108 .check_preempt_curr = check_preempt_curr_rt,
2109
2110 .pick_next_task = pick_next_task_rt,
2111 .put_prev_task = put_prev_task_rt,
2112
2113#ifdef CONFIG_SMP
2114 .select_task_rq = select_task_rq_rt,
2115
2116 .set_cpus_allowed = set_cpus_allowed_rt,
2117 .rq_online = rq_online_rt,
2118 .rq_offline = rq_offline_rt,
2119 .post_schedule = post_schedule_rt,
2120 .task_woken = task_woken_rt,
2121 .switched_from = switched_from_rt,
2122#endif
2123
2124 .set_curr_task = set_curr_task_rt,
2125 .task_tick = task_tick_rt,
2126
2127 .get_rr_interval = get_rr_interval_rt,
2128
2129 .prio_changed = prio_changed_rt,
2130 .switched_to = switched_to_rt,
2131
2132 .update_curr = update_curr_rt,
2133;
4 cfs调度器
采用完全公平调度算法的调度器
7929/*
7930 * All the scheduling class methods:
7931 */
7932const struct sched_class fair_sched_class =
7933 .next = &idle_sched_class,
7934 .enqueue_task = enqueue_task_fair,
7935 .dequeue_task = dequeue_task_fair,
7936 .yield_task = yield_task_fair,
7937 .yield_to_task = yield_to_task_fair,
7938
7939 .check_preempt_curr = check_preempt_wakeup,
7940
7941 .pick_next_task = pick_next_task_fair,
7942 .put_prev_task = put_prev_task_fair,
7943
7944#ifdef CONFIG_SMP
7945 .select_task_rq = select_task_rq_fair,
7946 .migrate_task_rq = migrate_task_rq_fair,
7947
7948 .rq_online = rq_online_fair,
7949 .rq_offline = rq_offline_fair,
7950
7951 .task_waking = task_waking_fair,
7952#endif
7953
7954 .set_curr_task = set_curr_task_fair,
7955 .task_tick = task_tick_fair,
7956 .task_fork = task_fork_fair,
7957
7958 .prio_changed = prio_changed_fair,
7959 .switched_from = switched_from_fair,
7960 .switched_to = switched_to_fair,
7961
7962 .get_rr_interval = get_rr_interval_fair,
7963
7964 .update_curr = update_curr_fair,
7965
7966#ifdef CONFIG_FAIR_GROUP_SCHED
7967 .task_move_group = task_move_group_fair,
7968#endif
7969;
如果没有rt进程(以及优先级更高的stop、dl进程),那么cfs调度器会从自己的就绪队列中选择虚拟运行时间(vruntime)最小的进程来执行。CFS使用红黑树来组织就绪队列,因此可以快速找到vruntime值最小的那个进程,只需要取树中最左侧的节点即可。
此外,CFS调度器通过check_preempt_tick()函数,根据当前进程已经运行的时间,判断是否需要通过resched_curr()给当前进程thread_info结构体的flags设置TIF_NEED_RESCHED标志。这样在执行完本次时钟中断后(即从handle irq返回后),内核会根据该标志进行重新调度。
2941/*
2942 * Preempt the current task with a newly woken task if needed:
2943 */
2944static void
2945check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2946
2947 unsigned long ideal_runtime, delta_exec;
2948 struct sched_entity *se;
2949 s64 delta;
2950
2951 ideal_runtime = sched_slice(cfs_rq, curr);//计算当前进程在本调度周期内应得的理想运行时间(时间片)
2952 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2953 if (delta_exec > ideal_runtime)
2954 resched_curr(rq_of(cfs_rq));
2955 /*
2956 * The current task ran long enough, ensure it doesn't get
2957 * re-elected due to buddy favours.
2958 */
2959 clear_buddies(cfs_rq, curr);
2960 return;
2961
2962
2963 /*
2964 * Ensure that a task that missed wakeup preemption by a
2965 * narrow margin doesn't have to wait for a full slice.
2966 * This also mitigates buddy induced latencies under load.
2967 */
2968 if (delta_exec < sysctl_sched_min_granularity)
2969 return;
2970
2971 se = __pick_first_entity(cfs_rq);
2972 delta = curr->vruntime - se->vruntime;
2973
2974 if (delta < 0)
2975 return;
2976
2977 if (delta > ideal_runtime)
2978 resched_curr(rq_of(cfs_rq));
2979
注意:通过“zcat /proc/config.gz”命令查看当前版本内核的配置,可以看到CONFIG_HZ_100=y,即HZ=100,因此当前cpu每10ms触发一次时钟中断。
5 idle调度器
空闲调度器,每个cpu都会有个idle线程,当没有其他进程可调度时,运行idle线程。
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class =
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99#endif
100
101 .set_curr_task = set_curr_task_idle,
102 .task_tick = task_tick_idle,
103
104 .get_rr_interval = get_rr_interval_idle,
105
106 .prio_changed = prio_changed_idle,
107 .switched_to = switched_to_idle,
108 .update_curr = update_curr_idle,
109;
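如上,pick_next_task()通过for_each_class按照 stop → dl → rt → fair(cfs) → idle 的顺序轮询各调度类,一旦某个调度类返回了可运行的进程就立即返回。rt调度类排在cfs调度类之前,因此rt进程是比cfs进程优先调度的。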
以上是关于从Linux Kernel源码解开RT进程优先于CFS进程调度的谜团的主要内容,如果未能解决你的问题,请参考以下文章
第一次作业:深入源码分析进程模型(Linux kernel 2.6.32)
Linux(内核剖析):11---进程调度之实时调度策略(SCHED_FIFO、SCHED_RR、MAX_RT_PRIO、实时优先级)
第一次作业: 基于Linux Kernel 2.6 的源码, 分析其进程模型
基于Linux Kernel Version 4.13.0-36-generic的源码分析进程模型
Linux 内核进程管理 ( 内核线程概念 | 内核线程、普通进程、用户线程 | 内核线程与普通进程区别 | 内核线程主要用途 | 内核线程创建函数 kernel_thread 源码 )