Scheduler 学习之二：主调度器__schedule函数

卢磊

2023-12-01

Summary
1.1 __schedule函数又被称作主调度器，负责选出新的task并让它在当前cpu上运行
1.2 __schedule将选task(pick_next_task)的功能交给各调度类完成
1.3 pick_next_task提供了一个优化功能：假定fair class中的task数量与rq中的task总数一致，则直接从fair class中选即可
1.4 主调度器主要在需要切换task的时候被调用，例如：抢锁抢不到而阻塞、重新开启抢占(preempt_enable)时，以及从kernel space回到user space时

Flow

/*
 * __schedule() - the main scheduler.
 *
 * Takes the runqueue of the CPU it is running on, decides what to do with
 * the currently running task (@prev), asks pick_next_task() for the task to
 * run next and, if that task differs from @prev, switches to it.
 *
 * @preempt: when false and @prev is no longer in the running state, @prev is
 *           either put back to TASK_RUNNING (signal pending) or dequeued, and
 *           the switch is accounted as voluntary; otherwise it is accounted
 *           as involuntary.
 */
static void __sched notrace __schedule(bool preempt)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq_flags rf;
	struct rq *rq;
	int cpu;

	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	/* The task that is current now is about to become the previous one. */
	prev = rq->curr;

	/*
	 * Author's note: checks whether calling schedule here is legitimate,
	 * e.g. it must not be called from atomic context (preemption
	 * disabled), the stack must not be corrupted, etc.
	 */
	schedule_debug(prev, preempt);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	local_irq_disable();
	/* Tell RCU about the context switch. */
	rcu_note_context_switch(preempt);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 *
	 * The membarrier system call requires a full memory barrier
	 * after coming from user-space, before storing to rq->curr.
	 */
	rq_lock(rq, &rf);
	smp_mb__after_spinlock();

	/* Promote REQ to ACT */
	/* Open question (author): why does clock_update_flags need to change here? */
	rq->clock_update_flags <<= 1;
	update_rq_clock(rq);

	/* Start out pointing at the involuntary context-switch counter. */
	switch_count = &prev->nivcsw;
	/*
	 * Not a preemption, and prev->state is non-zero, i.e. prev is not in
	 * the running state (assumes TASK_RUNNING == 0 -- confirm).
	 */
	if (!preempt && prev->state) {
		if (signal_pending_state(prev->state, prev)) {
			/* A signal is pending: put prev back into the running state. */
			prev->state = TASK_RUNNING;
		} else {
			/* Otherwise take prev off the runqueue. */
			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);

			/* prev is waiting on I/O: bump this runqueue's iowait count. */
			if (prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}
		}
		/* This switch is voluntary, so account it in the voluntary counter. */
		switch_count = &prev->nvcsw;
	}

	/* Select the task that will run next. */
	next = pick_next_task(rq, prev, &rf);
	/* prev may have had need_resched set; clear the flag now. */
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();

	if (likely(prev != next)) {
		/* nr_switches counts all switches, voluntary and involuntary. */
		rq->nr_switches++;
		/*
		 * RCU users of rcu_dereference(rq->curr) may not see
		 * changes to task_struct made by pick_next_task().
		 */
		RCU_INIT_POINTER(rq->curr, next);
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space.
		 *
		 * Here are the schemes providing that barrier on the
		 * various architectures:
		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
		 * - finish_lock_switch() for weakly-ordered
		 *   architectures where spin_unlock is a full barrier,
		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
		 *   is a RELEASE barrier),
		 */
		++*switch_count;

		/* The output of this tracepoint is informative; worth studying (author's TODO). */
		trace_sched_switch(preempt, prev, next);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		/* Same task keeps running: drop the clock-skip flags and unlock. */
		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
		rq_unlock_irq(rq, &rf);
	}

	/* Run the balance callbacks for this runqueue (details studied separately). */
	balance_callback(rq);
}

上面的flow读完之后,这个function涉及如下几个问题:(逐个研究)
1. clock_update_flags的作用,这个可能涉及到rq->clock的更新机制
2.为什么要根据signal_pending_state的状态,将prev task的状态重新置为running
3.deactivate_task涉及到prev task的on_rq状态,on_rq状态是怎样切换的
4.pick_next_task如何进行的(2020年5月15日23:18:37研究)
1.4.1 code reading

/*
 * Pick up the highest-prio task:
 *
 * Returns the task that should run next on @rq. A fast path asks the fair
 * class directly when @prev belongs to the idle or fair class and every
 * runnable task on @rq is accounted in rq->cfs; otherwise the classes are
 * walked in for_each_class() order and the first task found is returned.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in the fair class we can
	 * call that function directly, but only if the @prev task wasn't of a
	 * higher scheduling class, because otherwise those lose the
	 * opportunity to pull in more work from other CPUs.
	 *
	 * Author's note: rq->cfs.h_nr_running is incremented/decremented on
	 * every enqueue/dequeue of a fair task, so equality with
	 * rq->nr_running means only the fair class has tasks to schedule and
	 * the task can be taken from it directly. nr_running == 0 is also
	 * fine: fair.c then performs idle balancing (see the later analysis
	 * of the fair class).
	 */
	if (likely((prev->sched_class == &idle_sched_class ||
		    prev->sched_class == &fair_sched_class) &&
		   rq->nr_running == rq->cfs.h_nr_running)) {

		p = fair_sched_class.pick_next_task(rq, prev, rf);
		/*
		 * Author's note: RETRY_TASK has the value -1. It is returned
		 * when the fair class found no task but noticed that the
		 * number of tasks on the rq changed, meaning a higher-priority
		 * task has arrived; fall through to the full class walk.
		 */
		if (unlikely(p == RETRY_TASK))
			goto restart;

		/* Assumes fair_sched_class->next == idle_sched_class */
		if (unlikely(!p))
			p = idle_sched_class.pick_next_task(rq, prev, rf);

		return p;
	}

restart:
#ifdef CONFIG_SMP
	/*
	 * We must do the balancing pass before put_next_task(), such
	 * that when we release the rq->lock the task is in the same
	 * state as before we took rq->lock.
	 *
	 * We can terminate the balance pass as soon as we know there is
	 * a runnable task of @class priority or higher.
	 *
	 * Author's note: giving each class a chance to balance before picking
	 * makes it more likely that a task can be picked afterwards. E.g. if
	 * this rq's cfs queue is empty, idle balancing can pull a runnable
	 * task over from another runqueue so that it can be picked here,
	 * which also shortens the time that task spends runnable-but-waiting.
	 */
	for_class_range(class, prev->sched_class, &idle_sched_class) {
		if (class->balance(rq, prev, rf))
			break;
	}
#endif

	put_prev_task(rq, prev);

	/* Walk the classes in order and return the first task any of them yields. */
	for_each_class(class) {
		p = class->pick_next_task(rq, NULL, NULL);
		if (p)
			return p;
	}

	/* The idle class should always have a runnable task: */
	BUG();
}

那么一共有哪些class呢?

/*
 * First class visited by for_each_class(): the stop class on SMP builds,
 * the deadline class otherwise.
 */
#ifndef CONFIG_SMP
#define sched_class_highest (&dl_sched_class)
#else
#define sched_class_highest (&stop_sched_class)
#endif

/* Iterate @class from sched_class_highest until the NULL end marker. */
#define for_each_class(class) for_class_range(class, sched_class_highest, NULL)

/*
 * Simple, special scheduling class for the per-CPU stop tasks:
 */
const struct sched_class stop_sched_class = {
	.next			= &dl_sched_class,
}

const struct sched_class dl_sched_class = {
	.next			= &rt_sched_class,
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,

}
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
}

所以5个class的优先顺序为:stop_sched_class > dl_sched_class > rt_sched_class > fair_sched_class > idle_sched_class
1.4.2 summary
pick_next_task首先判断当前cpu的rq中task的总数与cfs中的task数量是否一致,如果一致则直接从cfs queue中选task出来.
在选task之前先通知各class做一次balance,目的是让各class可以有机会做balance的动作,从其他cpu中拉task过来,然后被选出来执行
按照优先级高低,选task出来.
5.trace_sched_switch打印出的信息中线程状态什么含义
6.balance_callback的作用
7.context_switch是怎样进行的

调用时机
如源代码注释有如下几种情形，都会调用到这个主调度器，将cpu让渡出去。
* The main means of driving the scheduler and thus entering this function are:
*
* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.

*
* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
* paths. For example, see arch/x86/entry_64.S.
*
* To drive preemption between tasks, the scheduler sets the flag in timer
* interrupt handler scheduler_tick().
*
* 3. Wakeups don't really cause entry into schedule(). They add a
* task to the run-queue and that's it.
*
* Now, if the new task added to the run-queue preempts the current
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
* - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
* spin_unlock()!)
*
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
* - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
* - explicit schedule() call
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space

Scheduler 学习之二：主调度器__schedule函数

相关阅读

相关文章

相关问答

相关文档