/*
 * __schedule() - switch this CPU from the task it is running to the next one.
 * @preempt: forwarded to schedule_debug(), rcu_note_context_switch() and
 *           trace_sched_switch(); when false and prev->state is non-zero,
 *           prev is either set back to TASK_RUNNING (signal pending) or
 *           dequeued from the runqueue.
 *
 * Takes the runqueue lock of the current CPU, picks the next task with
 * pick_next_task() and, if it differs from the current one, hands over via
 * context_switch(); otherwise the lock is simply dropped.
 */
static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
struct rq_flags rf;
struct rq *rq;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
/* The task that is current now is about to become the previous one. */
prev = rq->curr;
/*
 * Reading note: checks whether calling schedule here is legitimate, e.g.
 * schedule must not be called from atomic context (preemption disabled);
 * also covers things like stack corruption.
 */
schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
local_irq_disable();
/* Tell RCU about the context switch. */
rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*
* The membarrier system call requires a full memory barrier
* after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
/* Promote REQ to ACT */
/* Open question (reading note): why is clock_update_flags modified here? */
rq->clock_update_flags <<= 1;
update_rq_clock(rq);
/* Start out pointing at the involuntary context-switch counter. */
switch_count = &prev->nivcsw;
/* Not a preemption, and prev is in a non-running state. */
if (!preempt && prev->state) {
/* A signal is pending: put prev back into the running state. */
if (signal_pending_state(prev->state, prev)) {
prev->state = TASK_RUNNING;
} else {
/* Otherwise take prev off the runqueue. */
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
/* prev is in iowait: bump this runqueue's iowait count. */
if (prev->in_iowait) {
atomic_inc(&rq->nr_iowait);
delayacct_blkio_start();
}
}
/* This path is a voluntary switch, so count it in the voluntary counter. */
switch_count = &prev->nvcsw;
}
/* Choose the task to run next. */
next = pick_next_task(rq, prev, &rf);
/* prev may have had its need-resched flag set; clear it now. */
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
if (likely(prev != next)) {
/* nr_switches is the total of voluntary and involuntary switches. */
rq->nr_switches++;
/*
* RCU users of rcu_dereference(rq->curr) may not see
* changes to task_struct made by pick_next_task().
*/
RCU_INIT_POINTER(rq->curr, next);
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
* rq->curr, before returning to user-space.
*
* Here are the schemes providing that barrier on the
* various architectures:
* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
* switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
*/
++*switch_count;
/* TODO (reading note): the output of this tracepoint is informative and worth studying. */
trace_sched_switch(preempt, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
/* No switch needed: drop the clock-skip flags and release the rq lock. */
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
/* Run pending callbacks for this rq. */
balance_callback(rq);
}
上面的flow读完之后,这个function涉及如下的几个问题:(逐个研究)
1. clock_update_flags的作用,这个可能涉及到rq->clock的更新机制
2.为什么要根据signal_pending_state的状态,将prev task的状态重新置为running
3.deactivate_task涉及到prev task的on_rq状态,on_rq状态是怎样切换的
4.pick_next_task如何进行的(2020年5月15日23:18:37研究)
1.4.1 code reading
/*
 * Pick up the highest-prio task:
 *
 * Returns the task that should run next on @rq. A fast path asks the fair
 * class directly when the condition below holds; otherwise every class is
 * tried in for_each_class() order after @prev has been put back with
 * put_prev_task(). Reaching the end of the class list without a task is
 * treated as a bug.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
if (likely((prev->sched_class == &idle_sched_class ||
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
/*
 * Reading note: every enqueue/dequeue of a task increments/decrements
 * rq->cfs.h_nr_running, so equality here means that only the fair class
 * has tasks to schedule on this rq and we can pick from it directly.
 * nr_running == 0 is fine too: fair.c then performs idle balancing.
 * See the analysis of the fair class for details.
 */
p = fair_sched_class.pick_next_task(rq, prev, rf);
/*
 * Reading note: RETRY_TASK has the value -1. The fair class returns it
 * when it found no task of its own but noticed that the number of tasks
 * on the rq changed, i.e. a higher-priority task has arrived, so we go
 * and pick from the higher-priority classes instead.
 */
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
restart:
#ifdef CONFIG_SMP
/*
* We must do the balancing pass before put_next_task(), such
* that when we release the rq->lock the task is in the same
* state as before we took rq->lock.
*
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
for_class_range(class, prev->sched_class, &idle_sched_class) {
/*
 * Reading note: before picking, give each class a chance to balance so
 * that a task can be found afterwards. For example, if this rq's cfs
 * queue is empty, idle balancing can pull a runnable task over from
 * another runqueue; the pick below then finds it here, and that task
 * spends less time waiting in the runnable state.
 */
if (class->balance(rq, prev, rf))
break;
}
#endif
put_prev_task(rq, prev);
/*
 * Walk the classes in for_each_class() order and return the first task
 * offered. (Reading note: which classes exist is examined further below.)
 */
for_each_class(class) {
p = class->pick_next_task(rq, NULL, NULL);
if (p)
return p;
}
/* The idle class should always have a runnable task: */
BUG();
}
那么一共有哪些class呢?
/*
 * Iterate @class over the scheduling classes, starting at
 * sched_class_highest and using NULL as the end bound handed to
 * for_class_range() (defined outside this excerpt).
 */
#define for_each_class(class) \
for_class_range(class, sched_class_highest, NULL)
/*
 * Starting class of the walk: the stop class when CONFIG_SMP is set,
 * the deadline class otherwise.
 */
#ifdef CONFIG_SMP
#define sched_class_highest (&stop_sched_class)
#else
#define sched_class_highest (&dl_sched_class)
#endif
/*
 * NOTE(review): the four definitions below are abbreviated excerpts. Only
 * the .next link of each class is reproduced; all other members and the
 * terminating ';' of each initializer are omitted, so this is not
 * compilable as written. The .next pointers chain the classes together:
 * stop -> dl -> rt -> fair -> idle (idle_sched_class itself is not shown).
 */
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
const struct sched_class stop_sched_class = {
.next = &dl_sched_class,
}
/* Deadline class: followed by the real-time class. */
const struct sched_class dl_sched_class = {
.next = &rt_sched_class,
}
/* Real-time class: followed by the fair class. */
const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
}
/* Fair class: followed by the idle class. */
const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
}
所以5个class的优先顺序为:stop_sched_class > dl_sched_class > rt_sched_class > fair_sched_class > idle_sched_class
1.4.2 summary
pick_next_task优先判断当前cpu的rq中总task的数量与cfs中的task数量是否一致,如果一致则优先直接从cfs queue中选task出来.
在选task之前先通知各class做一次balance,目的是让各class可以有机会做balance的动作,从其他cpu中拉task过来,然后被选出来执行
按照优先级高低,选task出来.
5.trace_sched_switch打印出的信息中线程状态是什么含义
6.balance_callback的作用
7.context_switch是怎样进行的