注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4
在《vm内核参数之内存脏页dirty_writeback_centisecs和dirty_expire_centisecs》中我们知道在处理回写work时,会判断是否需要提交background回写work,同时在实际处理work时对于background回写还会再次判断是否达到回写阈值,我们就来看看什么条件下会提交和执行background回写work。
// Excerpt of wb_do_writeback(); the beginning of the body is elided ("...").
// The visible tail runs the background-flush check, adds its result to the
// running total "wrote", and returns that total.
static long wb_do_writeback(struct bdi_writeback *wb)
{
...
// Check whether background writeback is needed and, if so, perform it.
wrote += wb_check_background_flush(wb);
// Clear the BDI_writeback_running bit in the BDI state before returning.
clear_bit(BDI_writeback_running, &wb->bdi->state);
return wrote;
}
// Run a background writeback pass on @wb if the background threshold is
// exceeded. Returns the result of wb_writeback(), or 0 when the threshold
// has not been reached.
static long wb_check_background_flush(struct bdi_writeback *wb)
{
// Check whether the background writeback threshold has been reached.
if (over_bground_thresh(wb->bdi)) {
struct wb_writeback_work work = {
.nr_pages = LONG_MAX,
.sync_mode = WB_SYNC_NONE,
.for_background = 1,
.range_cyclic = 1,
.reason = WB_REASON_BACKGROUND,
};
// Execute the background writeback work right away: the work item is a
// local variable handed straight to wb_writeback(), it is not queued.
return wb_writeback(wb, &work);
}
return 0;
}
// Return true when background writeback should run for @bdi: either the
// global dirty-page count or this BDI's reclaimable-page count is above the
// corresponding background threshold.
static bool over_bground_thresh(struct backing_dev_info *bdi)
{
unsigned long background_thresh, dirty_thresh;
// Fetch both the background threshold and the dirty threshold; only the
// former is used in this function.
global_dirty_limits(&background_thresh, &dirty_thresh);
// Is the system-wide number of dirty pages above the background threshold?
if (global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS) > background_thresh)
return true;
// Is this BDI's reclaimable-page count above this BDI's share of the threshold?
// A system may have several disks: total dirty pages can be low while they are
// all concentrated on one disk, and that disk still needs background writeback.
// NOTE(review): bdi_dirty_limit() is not shown here; per the article it derives
// the BDI's share of background_thresh from this BDI's proportion of recently
// completed writeback - confirm against the kernel source.
if (bdi_stat(bdi, BDI_RECLAIMABLE) > bdi_dirty_limit(bdi, background_thresh))
return true;
return false;
}
可见,最终阈值计算由global_dirty_limits确定,根据dirty_background_bytes或者dirty_background_ratio 计算。同时也会计算出触发显式脏页回写的阈值,根据dirty_bytes或dirty_ratio计算。
// Compute the global background-writeback threshold and the dirty (throttling)
// threshold, both in pages, and store them in *pbackground and *pdirty.
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
// available_memory is only needed when at least one of
// /proc/sys/vm/dirty_bytes and /proc/sys/vm/dirty_background_bytes is unset,
// because the ratio-based formulas below are the only users of it.
if (!vm_dirty_bytes || !dirty_background_bytes)
// NOTE(review): global_dirtyable_memory() is not shown here; per the article it
// counts free plus reclaimable pages minus the reserved memory - confirm.
available_memory = global_dirtyable_memory();
if (vm_dirty_bytes)
// dirty_bytes is set: convert it to pages, rounding up.
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
else
// Otherwise use vm_dirty_ratio as a percentage of available_memory
// (default 20 according to the article).
dirty = (vm_dirty_ratio * available_memory) / 100;
if (dirty_background_bytes)
// dirty_background_bytes is set: convert it to pages, rounding up.
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
else
// Otherwise use dirty_background_ratio as a percentage of available_memory
// (default 10 according to the article).
background = (dirty_background_ratio * available_memory) / 100;
// The background threshold must stay strictly below the dirty threshold so
// that background writeback starts before writers are throttled; if it is
// greater than or equal, it is reset to half of the dirty threshold.
if (background >= dirty)
background = dirty / 2;
tsk = current;
// Tasks flagged PF_LESS_THROTTLE and real-time tasks get both thresholds
// raised by one quarter.
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
background += background / 4;
dirty += dirty / 4;
}
// Hand the results back to the caller.
*pbackground = background;
*pdirty = dirty;
trace_global_dirty_state(background, dirty);
}
实际执行background回写时还需要再做一次判断:因为background回写优先级比较低,其他work可能已经回写了部分脏页,此时脏页数量有可能已经低于阈值。
// Excerpt of wb_writeback(); parts of the body are elided ("..."). The visible
// portion shows the background threshold being re-checked inside the
// writeback loop, which runs under wb->list_lock.
static long wb_writeback(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
...
spin_lock(&wb->list_lock);
for (;;) {
...
// For background work, leave the loop as soon as the dirty-page count is no
// longer above the background threshold.
if (work->for_background && !over_bground_thresh(wb->bdi))
break;
...
}
spin_unlock(&wb->list_lock);
// nr_pages is assigned in the elided part (presumably the initial
// work->nr_pages, making this the number of pages written - confirm).
return nr_pages - work->nr_pages;
}
background回写对于进程是无法感知的,但是如果脏页过多,来不及处理,内存消耗过快,进程运行时就会出现内存不足,这时就需要显式地触发脏页回写,并阻塞等待内存释放后,进程才能继续运行。这种情况主要出现在write操作过程中。
我们以ext4文件系统为例,
ext4_file_write
ext4_file_dio_write
__generic_file_aio_write
generic_file_buffered_write
generic_perform_write
balance_dirty_pages_ratelimited
balance_dirty_pages #vm_dirty_ratio/bytes
global_dirty_limits
主要流程:
// Rate-limited entry point called on the write path: decides, from per-task
// and per-CPU dirtied-page counters, whether the current task must call
// balance_dirty_pages() for @mapping.
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ratelimit;
int *p;
// Nothing to do for BDIs that do not account dirty pages.
if (!bdi_cap_account_dirty(bdi))
return;
// Per the article, nr_dirtied_pause is initialised to 32 when a task is created.
ratelimit = current->nr_dirtied_pause;
// When the BDI has exceeded its dirty limit, cap the ratelimit at
// 32 >> (PAGE_SHIFT - 10) pages, i.e. 32 KB worth of pages.
if (bdi->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
preempt_disable();
/*
* This prevents one CPU to accumulate too many dirtied pages without
* calling into balance_dirty_pages(), which can happen when there are
* 1000+ tasks, all of them start dirtying pages at exactly the same
* time, hence all honoured too large initial task->nr_dirtied_pause.
*/
// Get this CPU's bdp_ratelimits counter.
p = &__get_cpu_var(bdp_ratelimits);
// If the current task has dirtied at least ratelimit pages, reset the
// per-CPU bdp_ratelimits counter.
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
// Per the article, ratelimit_pages defaults to 32 and is set at boot to
// dirty_thresh spread over the CPUs divided by 32, with a minimum of 16.
// If this CPU's bdp_ratelimits counter has reached ratelimit_pages, reset
// the counter as well and force ratelimit to zero.
else if (unlikely(*p >= ratelimit_pages)) {
*p = 0;
ratelimit = 0;
}
/*
* Pick up the dirtied pages by the exited tasks. This avoids lots of
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelock other long-run dirtiers.
*/
// Take over dirtied pages left behind by tasks that exited on this CPU, so
// that many short-lived tasks cannot dirty pages without ever being counted;
// this only applies while the current task is still below ratelimit.
p = &__get_cpu_var(dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
// Charge the current task with the leaked pages, up to the remaining
// headroom (ratelimit - nr_dirtied). If the leak covers the whole headroom,
// nr_dirtied reaches ratelimit and the check below fires.
current->nr_dirtied += nr_pages_dirtied;
}
preempt_enable();
// The current task has dirtied at least ratelimit pages: enter
// balance_dirty_pages().
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(mapping, current->nr_dirtied);
}
主要流程:
// Excerpt of balance_dirty_pages(); several parts are elided ("..."). The
// visible portion shows how the dirty counts are compared with the thresholds
// and when background writeback is kicked off for the BDI.
static void balance_dirty_pages(struct address_space *mapping,
unsigned long pages_dirtied)
{
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
...
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
...
/*
* Unstable writes are a feature of certain networked
* filesystems (i.e. NFS) in which data may have been
* written to the server's write cache, but has not yet
* been flushed to permanent storage.
*/
// Count reclaimable pages.
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
// Count dirty pages: reclaimable pages plus pages under writeback.
nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
// Fetch the background threshold and the dirty (throttling) threshold.
global_dirty_limits(&background_thresh, &dirty_thresh);
// With BDI_CAP_STRICTLIMIT the per-BDI counts and limits are used;
// otherwise the global ones are.
if (unlikely(strictlimit)) {
bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
&bdi_dirty, &bdi_thresh, &bg_thresh);
dirty = bdi_dirty;
thresh = bdi_thresh;
} else {
dirty = nr_dirty;
thresh = dirty_thresh;
bg_thresh = background_thresh;
}
// Per the article, dirty_freerun_ceiling() is the average of the background
// and dirty thresholds; at or below it the task is not throttled this time.
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
current->dirty_paused_when = now;
current->nr_dirtied = 0;
// Set how many more pages the task may dirty before it is checked again.
// NOTE(review): upstream dirty_poll_interval() returns
// 1UL << (ilog2(thresh - dirty) >> 1), i.e. roughly sqrt(thresh - dirty),
// not half of the remaining distance - confirm against mm/page-writeback.c.
current->nr_dirtied_pause =
dirty_poll_interval(dirty, thresh);
break;
}
// Above the free-run ceiling: if no writeback is in progress for this BDI,
// start background writeback on it.
if (unlikely(!writeback_in_progress(bdi)))
bdi_start_background_writeback(bdi);
...
}
// Clear the BDI's dirty_exceeded flag once the limit is no longer exceeded.
if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
// Writeback is already running for this BDI; nothing more to start.
if (writeback_in_progress(bdi))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
// If reclaimable dirty pages exceed the background threshold, start
// background writeback for this BDI.
if (nr_reclaimable > background_thresh)
bdi_start_background_writeback(bdi);
}
dirty_background_bytes
控制脏页内存数量,超过dirty_background_bytes时,内核的flush线程开始回写脏页
dirty_background_ratio
控制脏页占可用内存(空闲+可回收)的百分比,达到dirty_background_ratio时,内核的flush线程开始回写脏页。默认值: 10
由此可见,增大这两个参数的值一定程度可以提高系统的读写性能,毕竟可用于缓存的内存变大了。
dirty_bytes
控制脏页内存数量,达到dirty_bytes时,执行磁盘写操作的进程开始回写脏页
dirty_ratio
控制脏页所占可用内存百分比,达到dirty_ratio时,执行磁盘写操作的进程自己开始回写脏数据。默认值:20
需要注意的是,每一对参数(dirty_background_bytes与dirty_background_ratio、dirty_bytes与dirty_ratio)同一时刻都只能有一个生效:以最后一次写入的为准,写入其中一个后,另一个会被清零(读出为0)。