现在计算机为多核处理器,操作系统会把任务分配到处理器上运行工作,但是如何均匀的分配任务?以及怎样调整何时调整任务的分配策略?
此篇内容大部分不懂先看个大概,有时间在仔细分析
在linux系统中运用软中断来调整各个处理器的负载;
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 注册软中断底部
#ifdef CONFIG_NO_HZ
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
}
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
static void run_rebalance_domains(struct softirq_action *h)
{
int this_cpu = smp_processor_id();
struct rq *this_rq = cpu_rq(this_cpu);
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
rebalance_domains(this_cpu, idle);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped.
*/
nohz_idle_balance(this_cpu, idle);
}
其中rebblance_domain调用load_balance;
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
update_shares(cpu);
rcu_read_lock();
for_each_domain(cpu, sd) {//对于该cpu所在的调度域及其父调度域
if (!(sd->flags & SD_LOAD_BALANCE))<pre name="code" class="cpp">//如果这个调度域已经明确表示不参与负载平衡
continue;interval = sd->balance_interval;//得到该调度域的平衡周期if (idle != CPU_IDLE)interval *= sd->busy_factor;//根据当前cpu状态对此周期进行修正/* scale ms to jiffies */interval = msecs_to_jiffies(interval);//将毫秒转化成jiffie数interval = clamp(interval, 1UL, max_load_balance_interval);need_serialize = sd->flags & SD_SERIALIZE;if (need_serialize) {if (!spin_trylock(&balancing))goto out;}if (time_after_eq(jiffies, sd->last_balance + interval)) {//真的需要进行负载平衡了if (load_balance(cpu, rq, sd, idle, &balance)) {/* * We've pulled tasks over so either we're no * longer idle. */idle = CPU_NOT_IDLE;}sd->last_balance = jiffies;}if (need_serialize)spin_unlock(&balancing);out:if (time_after(next_balance, sd->last_balance + interval)) {next_balance = sd->last_balance + interval;update_next_balance = 1;}/* * Stop the load balance at this level. There is another * CPU in our sched group which is doing load balancing more * actively. */if (!balance)break;}rcu_read_unlock();/* * next_balance will be updated only when there is a need. * When the cpu is attached to null domain for ex, it will not be * updated. */if (likely(update_next_balance))rq->next_balance = next_balance;
还是不懂对于cpu的负载均衡怎样分析以及怎样处理。。。。。此处代码还不是很理解
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;run_rebalance_domains
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
};
cpumask_copy(cpus, cpu_active_mask);
max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
redo:
group = find_busiest_group(&env, balance);
if (*balance == 0)
goto out_balanced;
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = move_tasks(&env);
ld_moved += cur_ld_moved;
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* some other cpu did the load balance for us.
*/
if (cur_ld_moved && env.dst_cpu != smp_processor_id())
resched_cpu(env.dst_cpu);
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing exceess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
lb_iterations++ < max_lb_iterations) {
this_rq = cpu_rq(env.new_dst_cpu);
env.dst_rq = this_rq;
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_balanced;
}
}
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
}
goto out;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
ld_moved = 0;
out:
return ld_moved;
}
同时:当进程调用了sleep、usleep、poll、epoll、pause时,也就是调用了可能休眠的操作时都会在内核代码里对schedule()函数的调用;同时 try_to_wake_up() 在唤醒线程、进程时会选择在那个cpu上运行,也会涉及到load balance;
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
this_rq->idle_stamp = this_rq->clock;
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
if (sd->flags & SD_BALANCE_NEWIDLE) {
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance(this_cpu, this_rq,
sd, CPU_NEWLY_IDLE, &balance);
}
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
if (pulled_task) {
this_rq->idle_stamp = 0;
break;
}
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
* a busy processor. So reset next_balance.
*/
this_rq->next_balance = next_balance;
}
}
在try_to_wake_up上也会进行cpu的负载均衡:
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
cpu = task_cpu(p);
.................................
#ifdef CONFIG_SMP
/*
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
/*
* In case the architecture enables interrupts in
* context_switch(), we cannot busy wait, since that
* would lead to deadlocks when an interrupt hits and
* tries to wake up @prev. So bail and do a complete
* remote wakeup.
*/
if (ttwu_activate_remote(p, wake_flags))
goto stat;
#else
cpu_relax();
#endif
}
.............
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
如果上述this_cpu和cpu不等;则设置wake_flags以及set_task_cpu(p,cpu);
cpu 为进程休眠前运行时的cpu,this_cpu为目标cpu,实际运行这个函数的cpu?????是否这样理解