
Asynchronous IO Rework of the dm-crypt Module in the Linux Kernel --- blog17

韶硕
2023-12-01

2021SC@SUDSC

With the background from the previous posts in place, we can now analyze dm-crypt itself.

Analyzing dm-crypt

First, to be clear: dm-crypt is the module in the device-mapper (dm) framework that provides block device encryption. dm-crypt uses dm to expose a virtual block device and encrypts the data as bios are forwarded to the underlying storage, all of which is transparent to the application layer. Its target_type is defined as follows:

static struct target_type crypt_target = {
	.name   = "crypt",
	.version = {1, 23, 0},
	.module = THIS_MODULE,
	.ctr    = crypt_ctr,
	.dtr    = crypt_dtr,
	.features = DM_TARGET_ZONED_HM,
	.report_zones = crypt_report_zones,
	.map    = crypt_map,
	.status = crypt_status,
	.postsuspend = crypt_postsuspend,
	.preresume = crypt_preresume,
	.resume = crypt_resume,
	.message = crypt_message,
	.iterate_devices = crypt_iterate_devices,
	.io_hints = crypt_io_hints,
};
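
dm core learns about this target through dm_register_target(). The sketch below shows the registration at module load; it is a condensed sketch of what a dm target module's init/exit typically look like (dm-crypt's real versions do a little more), not the verbatim source:

#include <linux/module.h>
#include <linux/device-mapper.h>

static int __init dm_crypt_init(void)
{
	/* make the "crypt" target available to dm table constructors */
	int r = dm_register_target(&crypt_target);

	if (r < 0)
		DMERR("register failed %d", r);
	return r;
}

static void __exit dm_crypt_exit(void)
{
	dm_unregister_target(&crypt_target);
}

module_init(dm_crypt_init);
module_exit(dm_crypt_exit);
MODULE_LICENSE("GPL");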

In what follows we focus on two of these callbacks, ctr and map.

  • ctr governs how the device is constructed and how it is bound to a cipher algorithm
  • map governs how bios are forwarded and how the cipher is invoked along the way

The crypt_ctr function

The code of crypt_ctr is as follows:


/*
 * Construct an encryption mapping:
 * <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start>
 */
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct crypt_config *cc;
	const char *devname = dm_table_device_name(ti->table);
	int key_size;
	unsigned int align_mask;
	unsigned long long tmpll;
	int ret;
	size_t iv_size_padding, additional_req_size;
	char dummy;

	if (argc < 5) {
		ti->error = "Not enough arguments";
		return -EINVAL;
	}

	key_size = get_key_size(&argv[1]);
	if (key_size < 0) {
		ti->error = "Cannot parse key size";
		return -EINVAL;
	}

	cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL);
	if (!cc) {
		ti->error = "Cannot allocate encryption context";
		return -ENOMEM;
	}
	cc->key_size = key_size;
	cc->sector_size = (1 << SECTOR_SHIFT);
	cc->sector_shift = 0;

	ti->private = cc;

	spin_lock(&dm_crypt_clients_lock);
	dm_crypt_clients_n++;
	crypt_calculate_pages_per_client();
	spin_unlock(&dm_crypt_clients_lock);

	ret = percpu_counter_init(&cc->n_allocated_pages, 0, GFP_KERNEL);
	if (ret < 0)
		goto bad;

	/* Optional parameters need to be read before cipher constructor */
	if (argc > 5) {
		ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
		if (ret)
			goto bad;
	}

	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
	if (ret < 0)
		goto bad;

	if (crypt_integrity_aead(cc)) {
		cc->dmreq_start = sizeof(struct aead_request);
		cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
		align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
	} else {
		cc->dmreq_start = sizeof(struct skcipher_request);
		cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
		align_mask = crypto_skcipher_alignmask(any_tfm(cc));
	}
	cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));

	if (align_mask < CRYPTO_MINALIGN) {
		/* Allocate the padding exactly */
		iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
				& align_mask;
	} else {
		/*
		 * If the cipher requires greater alignment than kmalloc
		 * alignment, we don't know the exact position of the
		 * initialization vector. We must assume worst case.
		 */
		iv_size_padding = align_mask;
	}

	/*  ...| IV + padding | original IV | original sec. number | bio tag offset | */
	additional_req_size = sizeof(struct dm_crypt_request) +
		iv_size_padding + cc->iv_size +
		cc->iv_size +
		sizeof(uint64_t) +
		sizeof(unsigned int);

	ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size);
	if (ret) {
		ti->error = "Cannot allocate crypt request mempool";
		goto bad;
	}

	cc->per_bio_data_size = ti->per_io_data_size =
		ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
		      ARCH_KMALLOC_MINALIGN);

	ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
	if (ret) {
		ti->error = "Cannot allocate page mempool";
		goto bad;
	}

	ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS);
	if (ret) {
		ti->error = "Cannot allocate crypt bioset";
		goto bad;
	}

	mutex_init(&cc->bio_alloc_lock);

	ret = -EINVAL;
	if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
	    (tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
		ti->error = "Invalid iv_offset sector";
		goto bad;
	}
	cc->iv_offset = tmpll;

	ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	ret = -EINVAL;
	if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
		ti->error = "Invalid device sector";
		goto bad;
	}
	cc->start = tmpll;

	if (bdev_is_zoned(cc->dev->bdev)) {
		/*
		 * For zoned block devices, we need to preserve the issuer write
		 * ordering. To do so, disable write workqueues and force inline
		 * encryption completion.
		 */
		set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
		set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);

		/*
		 * All zone append writes to a zone of a zoned block device will
		 * have the same BIO sector, the start of the zone. When the
		 * cypher IV mode uses sector values, all data targeting a
		 * zone will be encrypted using the first sector numbers of the
		 * zone. This will not result in write errors but will
		 * cause most reads to fail as reads will use the sector values
		 * for the actual data locations, resulting in IV mismatch.
		 * To avoid this problem, ask DM core to emulate zone append
		 * operations with regular writes.
		 */
		DMDEBUG("Zone append operations will be emulated");
		ti->emulate_zone_append = true;
	}

	if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
		ret = crypt_integrity_ctr(cc, ti);
		if (ret)
			goto bad;

		cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
		if (!cc->tag_pool_max_sectors)
			cc->tag_pool_max_sectors = 1;

		ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
			cc->tag_pool_max_sectors * cc->on_disk_tag_size);
		if (ret) {
			ti->error = "Cannot allocate integrity tags mempool";
			goto bad;
		}

		cc->tag_pool_max_sectors <<= cc->sector_shift;
	}

	ret = -ENOMEM;
	cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
	if (!cc->io_queue) {
		ti->error = "Couldn't create kcryptd io queue";
		goto bad;
	}

	if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
		cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
						  1, devname);
	else
		cc->crypt_queue = alloc_workqueue("kcryptd/%s",
						  WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
						  num_online_cpus(), devname);
	if (!cc->crypt_queue) {
		ti->error = "Couldn't create kcryptd queue";
		goto bad;
	}

	spin_lock_init(&cc->write_thread_lock);
	cc->write_tree = RB_ROOT;

	cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname);
	if (IS_ERR(cc->write_thread)) {
		ret = PTR_ERR(cc->write_thread);
		cc->write_thread = NULL;
		ti->error = "Couldn't spawn write thread";
		goto bad;
	}
	wake_up_process(cc->write_thread);

	ti->num_flush_bios = 1;
	ti->limit_swap_bios = true;

	return 0;

bad:
	crypt_dtr(ti);
	return ret;
}
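
One detail in the constructor worth unpacking is the iv_size_padding computation. For a power-of-two alignment a with mask a-1, the expression -x & (a-1) yields exactly the number of bytes needed to round x up to the next multiple of a, which is how the "Allocate the padding exactly" branch works. A small userspace demo with made-up values (not kernel code):

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t x = 100;         /* e.g. dmreq_start + sizeof(struct dm_crypt_request) */
	size_t align_mask = 15; /* alignmask of a cipher wanting 16-byte alignment */
	size_t pad = -x & align_mask;

	/* prints: x=100 pad=12 aligned=112 */
	printf("x=%zu pad=%zu aligned=%zu\n", x, pad, x + pad);
	return 0;
}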

crypt_ctr takes its parameters in the format <cipher> <key> <iv_offset> <dev_path> <start> (for example, aes-cbc-essiv:sha256 <key> 0 /dev/sdb1 0); they are parsed in the constructor and stored in the crypt_config structure.

  • <cipher> has the format cipher-chainmode-ivmode:ivopts. cipher is the cra_name under which the algorithm was registered with the crypto API; chainmode is a chaining mode such as ecb or cbc, as discussed earlier, and defaults to cbc. If chainmode is anything other than ecb, an ivmode must be given. Five ivmodes are handled here: plain, plain64, essiv, benbi and null, each selecting a different IV generation algorithm (later kernels add more, e.g. lmk and tcw). ivopts is the argument passed to the chosen ivmode's constructor: null, benbi, plain and plain64 ignore it, while essiv treats ivopts as the name of a hash algorithm registered in the system and derives the IV with that hash.
  • <start> is the first sector of the underlying device used by the mapping; anything before start is not managed by dm-crypt.
  • <iv_offset> is a value added to the sector number when the IV is generated; it does not reserve space on disk. A bio at offset sector on the dm-crypt device is remapped to sector <start> + sector on the underlying device, while the IV for that block is derived from sector + <iv_offset>; see the sketch after this list.
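
To make the two sector computations concrete, here is a hypothetical userspace sketch (the helper names are illustrative, not kernel functions; the arithmetic matches crypt_map and the IV generators):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* where the data actually lands on the underlying device */
static sector_t data_sector(sector_t start, sector_t target_sector)
{
	return start + target_sector;
}

/* the sector number fed to the IV generation algorithm */
static sector_t iv_sector(sector_t iv_offset, sector_t target_sector)
{
	return iv_offset + target_sector;
}

int main(void)
{
	sector_t start = 2048, iv_offset = 16, s = 8;

	printf("data at sector %llu, IV derived from sector %llu\n",
	       (unsigned long long)data_sector(start, s),
	       (unsigned long long)iv_sector(iv_offset, s));
	return 0;
}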

The crypt_map function

The code of crypt_map is as follows:


static int crypt_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_crypt_io *io;
	struct crypt_config *cc = ti->private;

	/*
	 * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues.
	 * - for REQ_PREFLUSH device-mapper core ensures that no IO is in-flight
	 * - for REQ_OP_DISCARD caller must use flush if IO ordering matters
	 */
	if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
	    bio_op(bio) == REQ_OP_DISCARD)) {
		bio_set_dev(bio, cc->dev->bdev);
		if (bio_sectors(bio))
			bio->bi_iter.bi_sector = cc->start +
				dm_target_offset(ti, bio->bi_iter.bi_sector);
		return DM_MAPIO_REMAPPED;
	}

	/*
	 * Check if bio is too large, split as needed.
	 */
	if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT)) &&
	    (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size))
		dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT));

	/*
	 * Ensure that bio is a multiple of internal sector encryption size
	 * and is aligned to this size as defined in IO hints.
	 */
	if (unlikely((bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) != 0))
		return DM_MAPIO_KILL;

	if (unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
		return DM_MAPIO_KILL;

	io = dm_per_bio_data(bio, cc->per_bio_data_size);
	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));

	if (cc->on_disk_tag_size) {
		unsigned tag_len = cc->on_disk_tag_size * (bio_sectors(bio) >> cc->sector_shift);

		if (unlikely(tag_len > KMALLOC_MAX_SIZE) ||
		    unlikely(!(io->integrity_metadata = kmalloc(tag_len,
				GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
			if (bio_sectors(bio) > cc->tag_pool_max_sectors)
				dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
			io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
			io->integrity_metadata_from_pool = true;
		}
	}

	if (crypt_integrity_aead(cc))
		io->ctx.r.req_aead = (struct aead_request *)(io + 1);
	else
		io->ctx.r.req = (struct skcipher_request *)(io + 1);

	if (bio_data_dir(io->base_bio) == READ) {
		if (kcryptd_io_read(io, GFP_NOWAIT))
			kcryptd_queue_read(io);
	} else
		kcryptd_queue_crypt(io);

	return DM_MAPIO_SUBMITTED;
}

crypt_map transforms the bio's data and forwards it. The read path breaks down as follows (note: this call chain uses function names from an older kernel — generic_make_request() and the ablkcipher API; newer kernels use submit_bio_noacct() and skcipher, but the structure is the same):

  • crypt_map
  1. kcryptd_queue_io(io) // the io structure carries the bio, target, etc.
  2. queue_work(cc->io_queue, &io->work) // enqueue on the io queue
  • io queue
  1. kcryptd_io(struct work_struct *work) // worker entry point
  2. kcryptd_io_read(io) // io is the container of the work, recovered via container_of (see the sketch after this list)
  3. generic_make_request(clone); // clone is a clone of io->base_bio with an asynchronous completion callback set
  • async io
  1. crypt_endio(struct bio *clone, int error) // read-completion callback; the ciphertext has arrived in clone
  2. kcryptd_queue_crypt(io) // io recovered from clone
  3. queue_work(cc->crypt_queue, &io->work) // enqueue on the crypt queue
  • crypt queue
  1. kcryptd_crypt(struct work_struct *work)
  2. kcryptd_crypt_read_convert(io); // io is the container of the work, recovered via container_of
  3. crypt_convert(cc, &io->ctx) // cc obtained from io
  4. crypt_convert_block(cc, ctx, cc->req) // issue the request
  5. crypto_ablkcipher_decrypt(req) // invoke the asynchronous cipher
  • async crypt
  1. kcryptd_async_done(struct crypto_async_request *async_req, int error)
  2. kcryptd_crypt_read_done(io, error) // clean up the io and finish
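
Several steps above recover the io from a work_struct ("io is the container of the work"). This is the standard container_of pattern: the work_struct is embedded inside dm_crypt_io, so the worker can get back to the enclosing structure from the member pointer. A minimal sketch with a simplified stand-in struct (not the real dm_crypt_io definition):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct io_sketch {			/* simplified stand-in for dm_crypt_io */
	struct bio *base_bio;
	struct work_struct work;	/* embedded, so workers can find us */
};

static void kcryptd_io_sketch(struct work_struct *work)
{
	/* recover the enclosing io from the embedded work member */
	struct io_sketch *io = container_of(work, struct io_sketch, work);

	/* ... process io->base_bio ... */
	(void)io;
}

/* queueing side, as in kcryptd_queue_io():
 *	INIT_WORK(&io->work, kcryptd_io_sketch);
 *	queue_work(cc->io_queue, &io->work);
 */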

The above is the read path. The write path is essentially the same, except that a write must encrypt before doing the io, so on writes the two asynchronous io steps come after the two asynchronous crypt steps.
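
To make the cloning step on the read path concrete, here is a hedged sketch in the old-kernel style the flow list uses (simplified from kcryptd_io_read; field and helper names such as bio_clone() and bi_sector follow those older kernels):

/* clone the bio, point it at the underlying device, and register
 * crypt_endio so that completion re-enters dm-crypt with the ciphertext */
static void kcryptd_io_read_sketch(struct dm_crypt_io *io)
{
	struct crypt_config *cc = io->cc;
	struct bio *clone = bio_clone(io->base_bio, GFP_NOIO);

	clone->bi_private = io;			/* crypt_endio recovers io from this */
	clone->bi_end_io  = crypt_endio;
	clone->bi_bdev    = cc->dev->bdev;
	clone->bi_sector  = cc->start + io->sector;

	generic_make_request(clone);		/* asynchronous; crypt_endio fires later */
}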

Since dm-crypt drives an asynchronous block cipher, two questions naturally come to mind:

  1. dm-crypt can submit data faster than the cipher can process it, and any queue eventually fills up. When that happens, how does dm-crypt know to stop?
  2. Does dm-crypt also support synchronous block ciphers, i.e. ordinary (non-asynchronous) block cipher implementations?

The first question is answered in the crypt_convert function in drivers/md/dm-crypt.c (note that this snippet comes from an older kernel than the crypt_ctr/crypt_map listings above):

static int crypt_convert(struct crypt_config *cc,
			 struct convert_context *ctx)
{
	int r;

	atomic_set(&ctx->pending, 1);

	while (ctx->idx_in < ctx->bio_in->bi_vcnt &&
	       ctx->idx_out < ctx->bio_out->bi_vcnt) {

		crypt_alloc_req(cc, ctx);
		atomic_inc(&ctx->pending);

		r = crypt_convert_block(cc, ctx, cc->req);

		switch (r) {
		/* async */
		case -EBUSY:
			wait_for_completion(&ctx->restart);
			INIT_COMPLETION(ctx->restart);
			/* fall through */
		case -EINPROGRESS:
			cc->req = NULL;
			ctx->sector++;
			continue;
		/* sync */
		case 0:
			atomic_dec(&ctx->pending);
			ctx->sector++;
			cond_resched();
			continue;
		/* error */
		default:
			atomic_dec(&ctx->pending);
			return r;
		}
	}

	return 0;
}

As the code shows, if the asynchronous cipher's encrypt/decrypt returns -EBUSY, dm-crypt blocks and waits; if it returns -EINPROGRESS, the request has been queued and dm-crypt moves on to the next block; if it returns 0, the request completed on the spot, i.e. the asynchronous call degenerated into a synchronous one. This last case shows that dm-crypt supports synchronous block ciphers as well, which answers the second question.
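This contract only works because dm-crypt submits each request with CRYPTO_TFM_REQ_MAY_BACKLOG: with that flag set, a request refused with -EBUSY is still kept on the crypto layer's backlog rather than dropped, and the callback will later fire with -EINPROGRESS once it is accepted. A sketch of the request setup, modeled on the old crypt_alloc_req (details vary by kernel version):

static void crypt_alloc_req(struct crypt_config *cc,
			    struct convert_context *ctx)
{
	if (!cc->req)
		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);

	ablkcipher_request_set_tfm(cc->req, cc->tfm);
	ablkcipher_request_set_callback(cc->req,
			CRYPTO_TFM_REQ_MAY_BACKLOG |	/* -EBUSY means backlogged, not lost */
			CRYPTO_TFM_REQ_MAY_SLEEP,
			kcryptd_async_done,		/* fires on -EINPROGRESS and on completion */
			dmreq_of_req(cc, cc->req));
}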

Since dm-crypt can block waiting, there must also be a matching wake-up. It lives in the cipher's asynchronous callback, kcryptd_async_done; the function is somewhat long, so only the key lines are shown:

if (error == -EINPROGRESS) {
	complete(&ctx->restart);
	return;
}

In other words, when a request that was initially refused with -EBUSY is completed with error == -EINPROGRESS, the callback wakes dm-crypt up and does nothing else. The request that was waiting is still held by the asynchronous cipher (backlogged, not dropped), so dm-crypt does not need to resubmit it; the only extra cost is one additional complete() call. For context, a reconstruction of the surrounding function follows.
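
Here is a hedged reconstruction of the whole callback in that era of the code (simplified; exact details vary by kernel version): on a real completion it frees the request back to the pool, drops the pending count, and ends the io only when the last outstanding block finishes.

static void kcryptd_async_done(struct crypto_async_request *async_req,
			       int error)
{
	struct dm_crypt_request *dmreq = async_req->data;
	struct convert_context *ctx = dmreq->ctx;
	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
	struct crypt_config *cc = io->target->private;

	if (error == -EINPROGRESS) {
		/* a backlogged (-EBUSY) request was just accepted:
		 * only wake the waiting crypt_convert(), nothing else */
		complete(&ctx->restart);
		return;
	}

	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);

	/* only the last completed block ends the whole io */
	if (!atomic_dec_and_test(&ctx->pending))
		return;

	if (bio_data_dir(io->base_bio) == READ)
		kcryptd_crypt_read_done(io, error);
	else
		kcryptd_crypt_write_io_submit(io, error, 1);
}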

References:
Understanding the Linux Kernel
https://www.cnblogs.com/informatics/p/7903391.html
