vm内核参数之内存脏页dirty_background_bytes和dirty_bytes

杨曜瑞

2023-12-01

注：本文分析基于3.10.0-693.el7内核版本，即CentOS 7.4

1、背景

在《vm内核参数之内存脏页dirty_writeback_centisecs和dirty_expire_centisecs》中我们知道在处理回写work时，会判断是否需要提交background回写work，同时在实际处理work时对于background回写还会再次判断是否达到回写阈值，我们就来看看什么条件下会提交和执行background回写work。

2、background回写

2.1 提交background回写work

// Excerpt of wb_do_writeback(); the part elided by "..." is not shown.
// The visible tail adds the result of wb_check_background_flush() to the
// running total `wrote`, clears the BDI_writeback_running bit on the
// backing device, and returns the total.
static long wb_do_writeback(struct bdi_writeback *wb)
{
	...
	// Check whether background writeback is needed and, if so, run it.
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}

// Runs a background writeback pass when over_bground_thresh() reports that
// the threshold is exceeded. Returns the value of wb_writeback() in that
// case, and 0 when no background writeback is needed.
static long wb_check_background_flush(struct bdi_writeback *wb)
{
	// Has the background writeback threshold been reached?
	if (over_bground_thresh(wb->bdi)) {

		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};
		// The work item lives on the stack and is handed straight to
		// wb_writeback(); for_background = 1 marks it as background work.
		return wb_writeback(wb, &work);
	}

	return 0;
}

// Returns true when background writeback should run for this BDI: either
// the global dirty page count exceeds the background threshold, or this
// BDI's reclaimable count exceeds its own limit derived from that threshold.
static bool over_bground_thresh(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;
	// Fetch both the background and the dirty thresholds; only the
	// background threshold is used in this function.
	global_dirty_limits(&background_thresh, &dirty_thresh);
	// Global check: NR_FILE_DIRTY + NR_UNSTABLE_NFS against the threshold.
	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		return true;
	// Per-BDI check: the system may have several disks, and the global
	// dirty total can be small while most of it sits on one disk; that
	// disk still needs its background writeback started.
	// NOTE(review): bdi_dirty_limit() is not shown here; per the article it
	// scales background_thresh by this BDI's share of recently completed
	// writeback - confirm against mm/page-writeback.c.
	if (bdi_stat(bdi, BDI_RECLAIMABLE) > bdi_dirty_limit(bdi, background_thresh))
		return true;

	return false;
}

可见，最终阈值计算由global_dirty_limits确定，根据dirty_background_bytes或者dirty_background_ratio 计算。同时也会计算出触发显式脏页回写的阈值，根据dirty_bytes或dirty_ratio计算。

// Computes the global background-writeback threshold and the dirty
// (throttling) threshold, both in pages, and stores them through
// pbackground and pdirty. Byte-based settings take precedence over the
// ratio-based ones when they are non-zero.
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
	unsigned long background;
	unsigned long dirty;
	unsigned long uninitialized_var(available_memory);
	struct task_struct *tsk;
	// available_memory is only needed by the ratio-based branches below, so
	// it is computed when at least one of vm_dirty_bytes /
	// dirty_background_bytes is unset (zero).
	if (!vm_dirty_bytes || !dirty_background_bytes)
		// NOTE(review): global_dirtyable_memory() is not shown here; per the
		// article it is free + reclaimable pages minus reserved memory - confirm.
		available_memory = global_dirtyable_memory();

	if (vm_dirty_bytes)
		// /proc/sys/vm/dirty_bytes is set: convert bytes to pages, rounding up.
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else
		// Otherwise use vm_dirty_ratio as a percentage of available_memory
		// (the article states the default is 20 - not visible in this excerpt).
		dirty = (vm_dirty_ratio * available_memory) / 100;

	if (dirty_background_bytes)
		// /proc/sys/vm/dirty_background_bytes is set: convert bytes to pages.
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		// Otherwise use dirty_background_ratio as a percentage of
		// available_memory (the article states the default is 10 - not
		// visible in this excerpt).
		background = (dirty_background_ratio * available_memory) / 100;

	// The background threshold must stay strictly below the dirty
	// threshold, so background writeback starts before writers are
	// throttled; if it is >= dirty it is forced to half of dirty.
	if (background >= dirty)
		background = dirty / 2;
	tsk = current;
	// Tasks flagged PF_LESS_THROTTLE and real-time tasks get both
	// thresholds raised by one quarter.
	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
		background += background / 4;
		dirty += dirty / 4;
	}
	// Publish the results to the caller.
	*pbackground = background;
	*pdirty = dirty;
	trace_global_dirty_state(background, dirty);
}

2.2 执行background回写work

实际执行background回写时还需要再做一次判断：因为background回写优先级比较低，其他work可能已经回写了部分脏页，此时脏页数量有可能已经低于阈值。

// Excerpt of wb_writeback(); the parts elided by "..." are not shown.
// The visible part loops under wb->list_lock and returns the difference
// between nr_pages (set in the elided part) and the remaining
// work->nr_pages.
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	...
	spin_lock(&wb->list_lock);
	for (;;) {
		...
		// For background work, re-check the threshold on every iteration
		// and stop as soon as over_bground_thresh() is no longer true.
		if (work->for_background && !over_bground_thresh(wb->bdi))
			break;
		...
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}

3、显式回写

background回写对于进程是无法感知的，但是如果脏页产生过快，来不及回写，内存消耗过快，进程运行时就会出现内存不足。这时就需要显式地触发脏页回写，并阻塞等待内存释放后，进程才能继续运行。这种情况主要出现在write操作过程中。

我们以ext4文件系统为例，

ext4_file_write
    ext4_file_dio_write
        __generic_file_aio_write
            generic_file_buffered_write
                generic_perform_write
                    balance_dirty_pages_ratelimited
                        balance_dirty_pages #vm_dirty_ratio/bytes
                            global_dirty_limits

3.1 balance_dirty_pages_ratelimited

主要流程：

当前进程脏页数量（current->nr_dirtied）达到ratelimit，调用balance_dirty_pages
当前CPU的bdp_ratelimits计数达到ratelimit_pages，将ratelimit置零，从而同样调用balance_dirty_pages
以上两种都没出现，但是当前进程脏页数量加上当前CPU上已退出进程遗留的脏页计数（dirty_throttle_leaks）达到ratelimit，也调用balance_dirty_pages

// Rate-limited entry point called on the write path: decides, from the
// current task's nr_dirtied count and two per-CPU counters, whether to
// call balance_dirty_pages() for this mapping. Returns immediately when
// the backing device does not account dirty pages.
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	int ratelimit;
	int *p;

	if (!bdi_cap_account_dirty(bdi))
		return;
	// Start from the task's own pause interval.
	// NOTE(review): the article states nr_dirtied_pause is initialised to 32
	// at task creation - not visible in this excerpt.
	ratelimit = current->nr_dirtied_pause;
	// When the BDI has exceeded its dirty limit, cap the interval at
	// 32 >> (PAGE_SHIFT - 10) pages (8 if PAGE_SHIFT is 12 - assumption).
	if (bdi->dirty_exceeded)
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

	preempt_disable();
	/*
	 * This prevents one CPU to accumulate too many dirtied pages without
	 * calling into balance_dirty_pages(), which can happen when there are
	 * 1000+ tasks, all of them start dirtying pages at exactly the same
	 * time, hence all honoured too large initial task->nr_dirtied_pause.
	 */
	// Per-CPU counter bdp_ratelimits for the current CPU.
	p =  &__get_cpu_var(bdp_ratelimits);
	// The task itself has reached ratelimit: reset the per-CPU counter.
	if (unlikely(current->nr_dirtied >= ratelimit))
		*p = 0;
	// Otherwise, if the per-CPU counter has reached ratelimit_pages, reset
	// it and set ratelimit to 0 so the final check below always fires.
	// NOTE(review): the article states ratelimit_pages defaults to 32 and is
	// set at boot to dirty_thresh / (CPUs * 32) with a minimum of 16 - not
	// visible in this excerpt.
	else if (unlikely(*p >= ratelimit_pages)) {
		*p = 0;
		ratelimit = 0;
	}
	/*
	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
	 * the dirty throttling and livelock other long-run dirtiers.
	 */
	// Per-CPU counter dirty_throttle_leaks: pages dirtied by tasks that have
	// exited on this CPU. It is consulted only while the current task is
	// still below ratelimit.
	p = &__get_cpu_var(dirty_throttle_leaks);
	if (*p > 0 && current->nr_dirtied < ratelimit) {
		unsigned long nr_pages_dirtied;
		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
		*p -= nr_pages_dirtied;
		// Credit the smaller of (leaked pages, remaining headroom) to the
		// current task: either all leaked pages are absorbed, or the task is
		// topped up to exactly ratelimit and the check below fires.
		current->nr_dirtied += nr_pages_dirtied;
	}
	preempt_enable();

	// The task has reached ratelimit: enter balance_dirty_pages().
	if (unlikely(current->nr_dirtied >= ratelimit))
		balance_dirty_pages(mapping, current->nr_dirtied);
}

3.2 balance_dirty_pages

主要流程：

如果可回收+正在回写的脏页数量 <= background和显式回写阈值的均值，此次先不启动回写；否则，如果该BDI的回写尚未在进行，则启动background回写
循环结束后，如果该BDI的回写仍未在进行、未开启laptop_mode，且可回收脏页数量大于background回写阈值，则启动background回写进程

// Excerpt of balance_dirty_pages(); the parts elided by "..." (including
// the throttling logic and several local declarations) are not shown.
// The visible part compares the dirty page count with the thresholds from
// global_dirty_limits(), leaves the loop early when the count is at or
// below the freerun ceiling, and otherwise starts background writeback
// for the BDI if none is in progress.
static void balance_dirty_pages(struct address_space *mapping,
				unsigned long pages_dirtied)
{
	unsigned long nr_reclaimable;	/* = file_dirty + unstable_nfs */
	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
	...
	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
	unsigned long start_time = jiffies;

	for (;;) {
		...
		/*
		 * Unstable writes are a feature of certain networked
		 * filesystems (i.e. NFS) in which data may have been
		 * written to the server's write cache, but has not yet
		 * been flushed to permanent storage.
		 */
		// Reclaimable pages: dirty file pages plus unstable NFS pages.
		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
					global_page_state(NR_UNSTABLE_NFS);
		// Total dirty pages: reclaimable plus pages under writeback.
		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
		// Fetch the background and the dirty (throttling) thresholds.
		global_dirty_limits(&background_thresh, &dirty_thresh);

		// With BDI_CAP_STRICTLIMIT the per-BDI values are used for the
		// comparison below; otherwise the global ones are used.
		if (unlikely(strictlimit)) {
			bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
					 &bdi_dirty, &bdi_thresh, &bg_thresh);

			dirty = bdi_dirty;
			thresh = bdi_thresh;
		} else {
			dirty = nr_dirty;
			thresh = dirty_thresh;
			bg_thresh = background_thresh;
		}

		// At or below the freerun ceiling: no writeback this time.
		// NOTE(review): dirty_freerun_ceiling() is not shown here; per the
		// article it is the average of the two thresholds - confirm.
		if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
			current->dirty_paused_when = now;
			current->nr_dirtied = 0;
			// Set how many pages the task may dirty before the next check.
			// NOTE(review): dirty_poll_interval() is not shown here. The
			// article describes it as about half the gap to the threshold,
			// but upstream mm/page-writeback.c computes
			// 1UL << (ilog2(thresh - dirty) >> 1), i.e. roughly the square
			// root of the gap - verify for this kernel version.
			current->nr_dirtied_pause =
				dirty_poll_interval(dirty, thresh);
			break;
		}
		// Above the ceiling: if no writeback is in progress on this BDI,
		// start background writeback for it.
		if (unlikely(!writeback_in_progress(bdi)))
			bdi_start_background_writeback(bdi);
        ...
	}

	// Clear the BDI's dirty_exceeded flag when the local dirty_exceeded
	// (set in the elided part) is false.
	if (!dirty_exceeded && bdi->dirty_exceeded)
		bdi->dirty_exceeded = 0;

	if (writeback_in_progress(bdi))
		return;

	/*
	 * In laptop mode, we wait until hitting the higher threshold before
	 * starting background writeout, and then write out all the way down
	 * to the lower threshold.  So slow writers cause minimal disk activity.
	 *
	 * In normal mode, we start background writeout at the lower
	 * background_thresh, to keep the amount of dirty memory low.
	 */
	if (laptop_mode)
		return;
	// Reclaimable pages exceed the background threshold: start background
	// writeback for this BDI.
	if (nr_reclaimable > background_thresh)
		bdi_start_background_writeback(bdi);
}

4、小结

dirty_background_bytes
控制脏页内存数量，超过dirty_background_bytes时，内核的flush线程开始回写脏页
dirty_background_ratio
控制脏页占可用内存(空闲+可回收)的百分比，达到dirty_background_ratio时，内核的flush线程开始回写脏页。默认值： 10

由此可见，增大这两个参数的值一定程度可以提高系统的读写性能，毕竟可用于缓存的内存变大了。

dirty_bytes
控制脏页内存数量，达到dirty_bytes时，执行磁盘写操作的进程开始回写脏页
dirty_ratio
控制脏页所占可用内存百分比，达到dirty_ratio时，执行磁盘写操作的进程自己开始回写脏数据。默认值：20

需要注意的是，这两对参数（bytes与ratio）每对同一时刻都只能有一个生效：写入其中一个后立即生效，同时另一个会被清零（读出为0），也就是以最后一次设置的为准。

vm内核参数之内存脏页dirty_background_bytes和dirty_bytes

1、背景

2、background回写

2.1 提交background回写work

2.2 执行background回写work

3、显式回写

3.1 balance_dirty_pages_ratelimited

3.2 balance_dirty_pages

4、小结

相关阅读

相关文章

相关问答

相关文档