2021SC@SUDSC
在学习了前面的知识后,就可以对dm-crypt进行分析了。
首先,需要明确的是,dm-crypt是dm构架中用于块设备加密的模块。dm-crypt通过dm虚拟一个块设备,并在bio转发的时候将数据加密后存储来实现块设备的加密,而这些对于应用层是透明的。其target_type的定义如下:
/*
 * Device-mapper target descriptor for the "crypt" target.  Each member
 * points device-mapper at the dm-crypt routine implementing that hook.
 */
static struct target_type crypt_target = {
	/* Identification. */
	.name            = "crypt",
	.version         = {1, 23, 0},
	.module          = THIS_MODULE,

	/* Table construction / destruction (see crypt_ctr). */
	.ctr             = crypt_ctr,
	.dtr             = crypt_dtr,

	/* Zoned block device support. */
	.features        = DM_TARGET_ZONED_HM,
	.report_zones    = crypt_report_zones,

	/* Per-bio entry point (see crypt_map). */
	.map             = crypt_map,

	/* Status reporting, suspend/resume handling and runtime messages. */
	.status          = crypt_status,
	.postsuspend     = crypt_postsuspend,
	.preresume       = crypt_preresume,
	.resume          = crypt_resume,
	.message         = crypt_message,

	/* Underlying device enumeration and queue limits. */
	.iterate_devices = crypt_iterate_devices,
	.io_hints        = crypt_io_hints,
};
接下来我们主要分析ctr和map这两个函数。
crypt_ctr函数的代码如下:
/*
 * Construct an encryption mapping:
 * <cipher> [<key>|:<key_size>:<user|logon>:<key_description>] <iv_offset> <dev_path> <start>
 *
 * @ti:   target being constructed; ti->private receives the new
 *        crypt_config and ti->error is set to a message on most failures.
 * @argc: number of table arguments; at least 5 are required.
 * @argv: argv[0] = cipher, argv[1] = key, argv[2] = iv_offset,
 *        argv[3] = device path, argv[4] = start sector; anything from
 *        argv[5] on is handed to crypt_ctr_optional().
 *
 * Returns 0 on success or a negative errno.  Every failure after the
 * crypt_config has been allocated goes through the "bad" label, which
 * calls crypt_dtr(ti) to undo whatever was set up so far.
 */
static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct crypt_config *cc;
const char *devname = dm_table_device_name(ti->table);
int key_size;
unsigned int align_mask;
unsigned long long tmpll;
int ret;
size_t iv_size_padding, additional_req_size;
/* Catches trailing characters in the sscanf("%llu%c") calls below. */
char dummy;
if (argc < 5) {
ti->error = "Not enough arguments";
return -EINVAL;
}
/* The key size is derived from the key argument before anything is allocated. */
key_size = get_key_size(&argv[1]);
if (key_size < 0) {
ti->error = "Cannot parse key size";
return -EINVAL;
}
/* Zeroed config with key_size bytes of trailing storage for cc->key. */
cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL);
if (!cc) {
ti->error = "Cannot allocate encryption context";
return -ENOMEM;
}
cc->key_size = key_size;
/* Defaults: one encryption sector equals one (1 << SECTOR_SHIFT)-byte sector. */
cc->sector_size = (1 << SECTOR_SHIFT);
cc->sector_shift = 0;
/*
 * crypt_dtr() is only given ti, so cc has to be published through
 * ti->private before the first "goto bad" below.
 */
ti->private = cc;
/* Count this client and recompute the per-client page figure under the lock. */
spin_lock(&dm_crypt_clients_lock);
dm_crypt_clients_n++;
crypt_calculate_pages_per_client();
spin_unlock(&dm_crypt_clients_lock);
ret = percpu_counter_init(&cc->n_allocated_pages, 0, GFP_KERNEL);
/* NOTE(review): this failure path leaves ti->error unset. */
if (ret < 0)
goto bad;
/* Optional parameters need to be read before cipher constructor */
if (argc > 5) {
ret = crypt_ctr_optional(ti, argc - 5, &argv[5]);
if (ret)
goto bad;
}
ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
if (ret < 0)
goto bad;
/*
 * dmreq_start is the size of the crypto API request structure plus the
 * transform's own request context size; the transform also supplies the
 * alignment mask.  AEAD and skcipher use different request types.
 */
if (crypt_integrity_aead(cc)) {
cc->dmreq_start = sizeof(struct aead_request);
cc->dmreq_start += crypto_aead_reqsize(any_tfm_aead(cc));
align_mask = crypto_aead_alignmask(any_tfm_aead(cc));
} else {
cc->dmreq_start = sizeof(struct skcipher_request);
cc->dmreq_start += crypto_skcipher_reqsize(any_tfm(cc));
align_mask = crypto_skcipher_alignmask(any_tfm(cc));
}
/* Round up to the alignment of struct dm_crypt_request. */
cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
if (align_mask < CRYPTO_MINALIGN) {
/* Allocate the padding exactly */
iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
& align_mask;
} else {
/*
 * If the cipher requires greater alignment than kmalloc
 * alignment, we don't know the exact position of the
 * initialization vector. We must assume worst case.
 */
iv_size_padding = align_mask;
}
/* ...| IV + padding | original IV | original sec. number | bio tag offset | */
additional_req_size = sizeof(struct dm_crypt_request) +
iv_size_padding + cc->iv_size +
cc->iv_size +
sizeof(uint64_t) +
sizeof(unsigned int);
/* Each req_pool element holds the crypto request plus the trailer above. */
ret = mempool_init_kmalloc_pool(&cc->req_pool, MIN_IOS, cc->dmreq_start + additional_req_size);
if (ret) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
}
/* Per-bio data: a struct dm_crypt_io followed by the same request layout. */
cc->per_bio_data_size = ti->per_io_data_size =
ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + additional_req_size,
ARCH_KMALLOC_MINALIGN);
ret = mempool_init(&cc->page_pool, BIO_MAX_VECS, crypt_page_alloc, crypt_page_free, cc);
if (ret) {
ti->error = "Cannot allocate page mempool";
goto bad;
}
ret = bioset_init(&cc->bs, MIN_IOS, 0, BIOSET_NEED_BVECS);
if (ret) {
ti->error = "Cannot allocate crypt bioset";
goto bad;
}
mutex_init(&cc->bio_alloc_lock);
/*
 * iv_offset: sscanf() must convert exactly one item (a trailing
 * character would make it two), and the low bits covered by the mask
 * (sector_size >> SECTOR_SHIFT) - 1 must be clear.
 */
ret = -EINVAL;
if ((sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) ||
(tmpll & ((cc->sector_size >> SECTOR_SHIFT) - 1))) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
cc->iv_offset = tmpll;
ret = dm_get_device(ti, argv[3], dm_table_get_mode(ti->table), &cc->dev);
if (ret) {
ti->error = "Device lookup failed";
goto bad;
}
/* start: same parsing rule; also reject values that do not survive a cast to sector_t. */
ret = -EINVAL;
if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) {
ti->error = "Invalid device sector";
goto bad;
}
cc->start = tmpll;
if (bdev_is_zoned(cc->dev->bdev)) {
/*
 * For zoned block devices, we need to preserve the issuer write
 * ordering. To do so, disable write workqueues and force inline
 * encryption completion.
 */
set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
/*
 * All zone append writes to a zone of a zoned block device will
 * have the same BIO sector, the start of the zone. When the
 * cipher IV mode uses sector values, all data targeting a
 * zone will be encrypted using the first sector numbers of the
 * zone. This will not result in write errors but will
 * cause most reads to fail as reads will use the sector values
 * for the actual data locations, resulting in IV mismatch.
 * To avoid this problem, ask DM core to emulate zone append
 * operations with regular writes.
 */
DMDEBUG("Zone append operations will be emulated");
ti->emulate_zone_append = true;
}
/* Integrity setup, only when AEAD is in use or an integrity IV size is set. */
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
ret = crypt_integrity_ctr(cc, ti);
if (ret)
goto bad;
/* Number of tags that fit in one pool entry, but never fewer than one. */
cc->tag_pool_max_sectors = POOL_ENTRY_SIZE / cc->on_disk_tag_size;
if (!cc->tag_pool_max_sectors)
cc->tag_pool_max_sectors = 1;
ret = mempool_init_kmalloc_pool(&cc->tag_pool, MIN_IOS,
cc->tag_pool_max_sectors * cc->on_disk_tag_size);
if (ret) {
ti->error = "Cannot allocate integrity tags mempool";
goto bad;
}
/* Scale the count by sector_shift after the pool has been sized. */
cc->tag_pool_max_sectors <<= cc->sector_shift;
}
/* Preset the error code reported if either workqueue allocation fails. */
ret = -ENOMEM;
cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname);
if (!cc->io_queue) {
ti->error = "Couldn't create kcryptd io queue";
goto bad;
}
/*
 * With DM_CRYPT_SAME_CPU the crypt queue is created without WQ_UNBOUND
 * and with a third argument of 1; otherwise it is WQ_UNBOUND and that
 * argument is num_online_cpus().
 */
if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags))
cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM,
1, devname);
else
cc->crypt_queue = alloc_workqueue("kcryptd/%s",
WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND,
num_online_cpus(), devname);
if (!cc->crypt_queue) {
ti->error = "Couldn't create kcryptd queue";
goto bad;
}
/* State used by the write thread: lock, empty rb-tree, then the thread itself. */
spin_lock_init(&cc->write_thread_lock);
cc->write_tree = RB_ROOT;
cc->write_thread = kthread_create(dmcrypt_write, cc, "dmcrypt_write/%s", devname);
if (IS_ERR(cc->write_thread)) {
ret = PTR_ERR(cc->write_thread);
/* Clear the ERR_PTR value before the cleanup path runs. */
cc->write_thread = NULL;
ti->error = "Couldn't spawn write thread";
goto bad;
}
/* kthread_create() does not start the thread; wake it explicitly. */
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
ti->limit_swap_bios = true;
return 0;
/* Common failure exit: tear down the partially built target via crypt_dtr(). */
bad:
crypt_dtr(ti);
return ret;
}
crypt_ctr的参数格式是<cipher> <key> <iv_offset> <dev_path> <start>,这些参数在ctr中被解析,并存放到crypt_config结构中。
crypt_map函数的代码如下:
/*
 * Map one bio submitted to the crypt target.
 *
 * Flush and discard bios are redirected to the underlying device without
 * touching the crypt queues (DM_MAPIO_REMAPPED).  Misaligned bios are
 * rejected (DM_MAPIO_KILL).  Everything else is set up in the per-bio
 * dm_crypt_io and handed to the read or crypt path (DM_MAPIO_SUBMITTED).
 */
static int crypt_map(struct dm_target *ti, struct bio *bio)
{
	struct crypt_config *cc = ti->private;
	struct dm_crypt_io *io;

	/*
	 * REQ_PREFLUSH and REQ_OP_DISCARD bypass the crypt queues:
	 * - for REQ_PREFLUSH device-mapper core ensures no IO is in flight
	 * - for REQ_OP_DISCARD the caller must flush if ordering matters
	 */
	if (unlikely(bio->bi_opf & REQ_PREFLUSH ||
		     bio_op(bio) == REQ_OP_DISCARD)) {
		bio_set_dev(bio, cc->dev->bdev);
		/* Only bios that carry sectors need their sector remapped. */
		if (bio_sectors(bio))
			bio->bi_iter.bi_sector = cc->start +
				dm_target_offset(ti, bio->bi_iter.bi_sector);
		return DM_MAPIO_REMAPPED;
	}

	/*
	 * Oversized bio: for writes, or whenever on-disk tags are in use,
	 * accept only the first BIO_MAX_VECS pages worth of sectors.
	 */
	if (unlikely(bio->bi_iter.bi_size > (BIO_MAX_VECS << PAGE_SHIFT))) {
		if (bio_data_dir(bio) == WRITE || cc->on_disk_tag_size)
			dm_accept_partial_bio(bio, ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT));
	}

	/*
	 * Both the start sector and the length must be aligned to the
	 * encryption sector size, as defined in IO hints.
	 */
	if (unlikely(bio->bi_iter.bi_sector & ((cc->sector_size >> SECTOR_SHIFT) - 1)) ||
	    unlikely(bio->bi_iter.bi_size & (cc->sector_size - 1)))
		return DM_MAPIO_KILL;

	io = dm_per_bio_data(bio, cc->per_bio_data_size);
	crypt_io_init(io, cc, bio, dm_target_offset(ti, bio->bi_iter.bi_sector));

	if (cc->on_disk_tag_size) {
		unsigned tag_bytes = (bio_sectors(bio) >> cc->sector_shift) * cc->on_disk_tag_size;
		int use_pool = 1;

		/* Try an opportunistic kmalloc first; never for oversized requests. */
		if (!unlikely(tag_bytes > KMALLOC_MAX_SIZE)) {
			io->integrity_metadata = kmalloc(tag_bytes,
				GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (io->integrity_metadata)
				use_pool = 0;
		}

		/* Fall back to the tag mempool, shrinking the bio to fit one entry. */
		if (unlikely(use_pool)) {
			if (bio_sectors(bio) > cc->tag_pool_max_sectors)
				dm_accept_partial_bio(bio, cc->tag_pool_max_sectors);
			io->integrity_metadata = mempool_alloc(&cc->tag_pool, GFP_NOIO);
			io->integrity_metadata_from_pool = true;
		}
	}

	/* The crypto request lives directly behind the dm_crypt_io. */
	if (!crypt_integrity_aead(cc))
		io->ctx.r.req = (struct skcipher_request *)(io + 1);
	else
		io->ctx.r.req_aead = (struct aead_request *)(io + 1);

	if (bio_data_dir(io->base_bio) != READ) {
		kcryptd_queue_crypt(io);
		return DM_MAPIO_SUBMITTED;
	}

	/* Reads: try to issue directly; queue the read if that reports non-zero. */
	if (kcryptd_io_read(io, GFP_NOWAIT))
		kcryptd_queue_read(io);

	return DM_MAPIO_SUBMITTED;
}
crypt_map用来修改bio的内容然后转发。
上面主要是读操作流程,而写操作流程与读操作流程基本类似,唯一不同的就是写操作要先encrypt再io,这就造成了写操作的两次io异步在两次crypt异步之后。
既然dm-crypt使用的是异步块加密算法,那么很自然就会想到这么两个问题:(1)如果异步加密请求暂时无法被立即处理(encrypt/decrypt返回-EBUSY),dm-crypt该如何处理?(2)dm-crypt是否也支持同步块加密算法?
对于第一个问题,在/drivers/md/dm-crypt.c中的crypt_convert函数中有解决(该函数对每个块调用crypt_convert_block并根据其返回值进行处理),代码如下:
/*
 * Encrypt or decrypt the data described by ctx, one block per iteration.
 *
 * ctx->pending starts at 1 and is raised before every block is submitted.
 * It is dropped again here only when the block finished synchronously or
 * failed; in-flight asynchronous requests keep their count.
 *
 * Returns 0 once all blocks have been submitted, or the first error code
 * returned by crypt_convert_block().
 */
static int crypt_convert(struct crypt_config *cc,
			 struct convert_context *ctx)
{
	int r;

	atomic_set(&ctx->pending, 1);

	while (ctx->idx_in < ctx->bio_in->bi_vcnt &&
	       ctx->idx_out < ctx->bio_out->bi_vcnt) {

		crypt_alloc_req(cc, ctx);
		atomic_inc(&ctx->pending);

		r = crypt_convert_block(cc, ctx, cc->req);

		/* -EBUSY: block until ctx->restart is completed, then re-arm it. */
		if (r == -EBUSY) {
			wait_for_completion(&ctx->restart);
			INIT_COMPLETION(ctx->restart);
		}

		/* Asynchronous: drop cc->req so the next block allocates a new one. */
		if (r == -EBUSY || r == -EINPROGRESS) {
			cc->req = NULL;
			ctx->sector++;
			continue;
		}

		/* Synchronous completion or error: drop this block's pending count. */
		atomic_dec(&ctx->pending);
		if (r != 0)
			return r;

		ctx->sector++;
		cond_resched();
	}

	return 0;
}
从代码中可以看出,如果异步密码算法的encrypt/decrypt返回-EBUSY,则dm-crypt陷入等待之中;如果返回-EINPROGRESS表示已将请求移入队列,dm-crypt会继续下一个请求;如果返回0表示已经完成,异步变成同步了,这里可以看出dm-crypt对同步块加密是支持的,这就回答了第二个问题。
既然dm-crypt能进入等待状态,那么也定义着被唤醒的操作,在密码算法的异步回调kcryptd_async_done函数中,由于代码有点长,只给了关键代码:
/*
 * Excerpt from the asynchronous completion callback kcryptd_async_done().
 * When the callback is invoked with -EINPROGRESS, it only completes
 * ctx->restart (the completion crypt_convert() waits on after -EBUSY)
 * and returns without doing any further processing.
 */
if (error == -EINPROGRESS) {
complete(&ctx->restart);
return;
}
可以看出,当一个标以EBUSY的request被error=-EINPROGRESS方式complete的时候,complete异步回调会唤醒dm-crypt而不干其他的事情,并且那个处于等待状态的request仍然会被异步密码算法给记录,这样dm-crypt就不用重发请求了,只是额外执行了一次complete操作。
参考:
《深入理解Linux内核》
https://www.cnblogs.com/informatics/p/7903391.html