KSM creates a kernel thread named "ksmd" during initialization.
[mm/ksm.c]
static int __init ksm_init(void)
{
struct task_struct *ksm_thread;
int err;
err = ksm_slab_init();
if (err)
goto out;
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
pr_err("ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
goto out_free;
}
#ifdef CONFIG_SYSFS
err = sysfs_create_group(mm_kobj, &ksm_attr_group);
if (err) {
pr_err("ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_MEMORY_HOTREMOVE
/* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
return 0;
out_free:
ksm_slab_free();
out:
return err;
}
KSM only processes user address-space memory that has been explicitly marked via the madvise system call. A user program that wants to use this feature must therefore explicitly call "madvise(addr, length, MADV_MERGEABLE)" on the memory it allocates; likewise, to withdraw a range of a process address space from KSM merging, it must explicitly call "madvise(addr, length, MADV_UNMERGEABLE)".
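A minimal user-space sketch of this usage (buffer name and sizes are made up for illustration; MADV_MERGEABLE is only accepted when the kernel was built with CONFIG_KSM):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64 * 4096;

	/* KSM only considers private anonymous mappings. */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0x5a, len);	/* identical pages: good merge candidates */

	/* Opt the range into KSM scanning... */
	if (madvise(buf, len, MADV_MERGEABLE) == -1)
		perror("madvise(MADV_MERGEABLE)");

	/* ...and later withdraw it again. */
	if (madvise(buf, len, MADV_UNMERGEABLE) == -1)
		perror("madvise(MADV_UNMERGEABLE)");

	munmap(buf, len);
	return 0;
}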
On Android, the mmap implementation in the C library (Android's libc is bionic) already enables this feature by default.
[libc/bionic/mmap.cpp]
void* mmap64(void* addr, size_t size, int prot, int flags, int fd, off64_t offset) {
if (offset < 0 || (offset & ((1UL << MMAP2_SHIFT)-1)) != 0) {
errno = EINVAL;
return MAP_FAILED;
}
// prevent allocations large enough for `end - start` to overflow
size_t rounded = BIONIC_ALIGN(size, PAGE_SIZE);
if (rounded < size || rounded > PTRDIFF_MAX) {
errno = ENOMEM;
return MAP_FAILED;
}
bool is_private_anonymous =
(flags & (MAP_PRIVATE | MAP_ANONYMOUS)) == (MAP_PRIVATE | MAP_ANONYMOUS);
bool is_stack_or_grows_down = (flags & (MAP_STACK | MAP_GROWSDOWN)) != 0;
void* result = __mmap2(addr, size, prot, flags, fd, offset >> MMAP2_SHIFT);
/* Check whether the memory returned by mmap, i.e. the user address range, is a
private (MAP_PRIVATE) anonymous (MAP_ANONYMOUS) mapping; if so, explicitly call
the madvise syscall to add the range to the kernel's KSM subsystem. */
if (result != MAP_FAILED && kernel_has_MADV_MERGEABLE &&
is_private_anonymous && !is_stack_or_grows_down) {
ErrnoRestorer errno_restorer;
int rc = madvise(result, size, MADV_MERGEABLE);
if (rc == -1 && errno == EINVAL) {
kernel_has_MADV_MERGEABLE = false;
}
}
return result;
}
[madvise()->ksm_madvise()->__ksm_enter()]
int __ksm_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int needs_wakeup;
/* Allocate a struct mm_slot */
mm_slot = alloc_mm_slot();
if (!mm_slot)
return -ENOMEM;
/* Check ksm_run too? Would need tighter locking */
needs_wakeup = list_empty(&ksm_mm_head.mm_list);
/* Take the spinlock protecting the ksm mm_list */
spin_lock(&ksm_mmlist_lock);
/* Add the current mm to the mm_slots_hash hash table (mm_slots_hash is global and must be accessed under the lock) */
insert_to_mm_slots_hash(mm, mm_slot);
/*
* When KSM_RUN_MERGE (or KSM_RUN_STOP),
* insert just behind the scanning cursor, to let the area settle
* down a little; when fork is followed by immediate exec, we don't
* want ksmd to waste time setting up and tearing down an rmap_list.
*
* But when KSM_RUN_UNMERGE, it's important to insert ahead of its
* scanning cursor, otherwise KSM pages in newly forked mms will be
* missed: then we might as well insert at the end of the list.
*/
if (ksm_run & KSM_RUN_UNMERGE)
list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
else
list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
/* Set the MMF_VM_MERGEABLE bit in mm->flags to mark that this process has been added to KSM */
set_bit(MMF_VM_MERGEABLE, &mm->flags);
atomic_inc(&mm->mm_count);
/* If ksm_mm_head.mm_list was empty before, wake up the ksmd kernel thread. */
if (needs_wakeup)
wake_up_interruptible(&ksm_thread_wait);
return 0;
}
Implementation of ksm_scan_thread(): this function is the backbone of the ksmd kernel thread. On each pass it calls ksm_do_scan() to scan and try to merge 100 pages (see the ksm_thread_pages_to_scan variable), then sleeps for 20 milliseconds (see the ksm_thread_sleep_millisecs variable). Both parameters can be inspected and modified through the corresponding files under "/sys/kernel/mm/ksm".
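Since both tunables are plain sysfs files, they can be adjusted from user space. A minimal sketch (writing normally requires root, and the paths exist only on CONFIG_KSM kernels):

#include <stdio.h>

/* Write a string to one tunable under /sys/kernel/mm/ksm/. */
static int ksm_write(const char *name, const char *value)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s", value);
	return fclose(f);
}

int main(void)
{
	ksm_write("pages_to_scan", "100");	/* ksm_thread_pages_to_scan */
	ksm_write("sleep_millisecs", "20");	/* ksm_thread_sleep_millisecs */
	ksm_write("run", "1");			/* KSM_RUN_MERGE: start ksmd */
	return 0;
}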
[ksmd kernel thread]
static int ksm_scan_thread(void *nothing)
{
set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
mutex_lock(&ksm_thread_mutex);
wait_while_offlining();
if (ksmd_should_run())
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
try_to_freeze();
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
return 0;
}
Implementation of ksm_do_scan():
/**
* ksm_do_scan - the ksm scanner main worker function.
* @scan_npages - number of pages we want to scan before we return.
*/
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
/* The while loop tries to scan and merge scan_npages pages. */
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
/* scan_get_next_rmap_item() fetches a suitable anonymous page; its implementation is examined below */
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
return;
/* cmp_and_merge_page() searches KSM's stable and unstable red-black trees
for a page that this page can be merged with, and tries to merge them;
its implementation is examined below */
cmp_and_merge_page(page, rmap_item);
put_page(page);
}
}
The core KSM data structures:
/* Describes one reverse-mapping item for a virtual address */
struct rmap_item {
/* All rmap_items are chained into a singly linked list, reached through ksm_scan.rmap_list */
struct rmap_item *rmap_list;
union {
/* When the rmap_item is on the stable tree, points to the VMA's anon_vma */
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
int nid; /* when node of unstable tree */
#endif
};
/* The process's struct mm_struct */
struct mm_struct *mm;
/* User-space address tracked by this rmap_item */
unsigned long address; /* + low bits used for flags below */
/* Previous checksum of the physical page behind the virtual address */
unsigned int oldchecksum; /* when unstable */
union {
/* Node linking the rmap_item into the unstable red-black tree */
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
/* The stable tree node this rmap_item hangs off */
struct stable_node *head;
/* Entry in the stable node's hlist */
struct hlist_node hlist;
};
};
};
/* Describes the mm_struct of a process that has been added to KSM for scanning */
struct mm_slot {
/* Link in the mm_slots hash table */
struct hlist_node link;
/* Link in the mm_slot list, headed by ksm_mm_head */
struct list_head mm_list;
/* Head of this slot's rmap_item list */
struct rmap_item *rmap_list;
/* The process's mm */
struct mm_struct *mm;
};
/* Describes the current scan state */
struct ksm_scan {
/* mm_slot currently being scanned */
struct mm_slot *mm_slot;
/* Address to scan next */
unsigned long address;
/* Pointer to the next rmap_item to scan */
struct rmap_item **rmap_list;
/* Incremented after each full scan; used to prune unstable tree nodes */
unsigned long seqnr;
};
[mm/ksm.c]
ksm_mm_head is the head of the mm_slot list. ksm_scan is a static global structure describing the mm_slot currently being scanned.
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
};
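To make the scanning-cursor arrangement concrete, here is a small userspace sketch (with simplified stand-in types, not the kernel's list_head machinery): the cursor advances around the circular mm_slot list, and arriving back at ksm_mm_head means one full scan has completed, which is when seqnr is incremented.

#include <stdio.h>

/* Simplified stand-ins for struct mm_slot and struct ksm_scan. */
struct mm_slot {
	const char *name;	/* stands in for the mm it tracks */
	struct mm_slot *next;	/* stands in for the mm_list link */
};

struct ksm_scan {
	struct mm_slot *mm_slot;	/* scan cursor */
	unsigned long seqnr;		/* completed full scans */
};

int main(void)
{
	struct mm_slot head = { "ksm_mm_head", NULL };
	struct mm_slot a = { "mm of process A", NULL };
	struct mm_slot b = { "mm of process B", NULL };
	struct ksm_scan scan = { &head, 0 };
	int i;

	/* Circular list: head -> A -> B -> head. */
	head.next = &a;
	a.next = &b;
	b.next = &head;

	for (i = 0; i < 7; i++) {
		scan.mm_slot = scan.mm_slot->next;
		if (scan.mm_slot == &head)	/* wrapped: full scan done */
			printf("full scan complete, seqnr=%lu\n", ++scan.seqnr);
		else
			printf("scanning %s\n", scan.mm_slot->name);
	}
	return 0;
}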
Implementation of scan_get_next_rmap_item(): this function, called from ksm_do_scan(), fetches one suitable anonymous page; once it finds one, it allocates an rmap_item to describe that page.
[ksm_do_scan()->scan_get_next_rmap_item()]
static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
struct mm_slot *slot;
struct vm_area_struct *vma;
struct rmap_item *rmap_item;
int nid;
if (list_empty(&ksm_mm_head.mm_list))
return NULL;
/* First run of ksmd: initialize the ksm_scan members ksm_scan.mm_slot,
ksm_scan.address and ksm_scan.rmap_list */
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
/*
* A number of pages can hang around indefinitely on per-cpu
* pagevecs, raised page count preventing write_protect_page
* from merging them. Though it doesn't really matter much,
* it is puzzling to see some stuck in pages_volatile until
* other activity jostles them out, and they also prevented
* LTP's KSM test from succeeding deterministically; so drain
* them here (here rather than on entry to ksm_do_scan(),
* so we don't IPI too often when pages_to_scan is set low).
*/
lru_add_drain_all();
/*
* Whereas stale stable_nodes on the stable_tree itself
* get pruned in the regular course of stable_tree_search(),
* those moved out to the migrate_nodes list can accumulate:
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
struct stable_node *stable_node;
struct list_head *this, *next;
struct page *page;
list_for_each_safe(this, next, &migrate_nodes) {
stable_node = list_entry(this,
struct stable_node, list);
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
cond_resched();
}
}
for (nid = 0; nid < ksm_nr_node_ids; nid++)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
* of the last mm on the list may have removed it since then.
*/
if (slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
/* Scan all the VMAs of the process behind the current slot, looking for a suitable anonymous page */
mm = slot->mm;
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
vma = NULL;
else
vma = find_vma(mm, ksm_scan.address);/* ksm_scan.address was initialized to 0,
so this finds the process's first VMA. */
/* Iterate over all the VMAs */
for (; vma; vma = vma->vm_next) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start; /* the VMA's start address becomes the next scan address */
if (!vma->anon_vma)
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
if (ksm_test_exit(mm))
break;
/* follow_page() walks from the virtual address to the struct page of the
normal mapping; KSM only handles anonymous pages */
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
/* PageAnon() checks whether this page is an anonymous page */
if (PageAnon(*page) ||
page_trans_compound_anon(*page)) {
/* The next two lines flush the caches for this page. */
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
/* get_next_rmap_item() looks on the ksm_scan.rmap_list list for an rmap_item
matching this virtual address, and creates a new one if none is found */
rmap_item = get_next_rmap_item(slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
/* Point ksm_scan.rmap_list at the rmap_item just found or created, so the
next scan resumes from it. Once a suitable anonymous page has been found,
release mm->mmap_sem (taken above for the VMA walk) and return the page's
struct page */
ksm_scan.rmap_list =
&rmap_item->rmap_list;
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
up_read(&mm->mmap_sem);
return rmap_item;
}
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
}
/* Reaching here means the for loop scanned every VMA of this process without
finding a suitable anonymous page (had it found one, an rmap_item would have
been returned). If the scanned process has already exited (mm->mm_users == 0),
reset ksm_scan.address to 0. */
if (ksm_test_exit(mm)) {
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
* because there were no VM_MERGEABLE vmas with such addresses.
*/
/* No suitable anonymous page was found in this process, so the remaining
rmap_items are useless; delete them all to avoid wasting memory. */
remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
/* Move on to the next mm_slot. The mm_slot list is modified here, so it is
protected by the ksm_mmlist_lock spinlock */
ksm_scan.mm_slot = list_entry(slot->mm_list.next,
struct mm_slot, mm_list);
/* Handle the case where the process has exited: remove the mm_slot from the
ksm_mm_head list, free the mm_slot, and clear the MMF_VM_MERGEABLE bit in
mm->flags */
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_sem
* throughout, and found no VM_MERGEABLE: so do the same as
* __ksm_exit does to remove this mm from all our lists now.
* This applies either when cleaning up after __ksm_exit
* (but beware: we can reach here even before __ksm_exit),
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_sem then protects against race with MADV_MERGEABLE).
*/
hash_del(&slot->link);
list_del(&slot->mm_list);
spin_unlock(&ksm_mmlist_lock);
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
}
/* Repeat until we've completed scanning the whole list */
/* If this round has not yet covered every mm_slot, continue with the next one. */
slot = ksm_scan.mm_slot;
if (slot != &ksm_mm_head)
goto next_mm;
/* A full round over all mm_slots is complete: increment ksm_scan.seqnr */
ksm_scan.seqnr++;
return NULL;
}
Back in ksm_do_scan().
Implementation of cmp_and_merge_page():
[ksm_do_scan()->cmp_and_merge_page()]
/*
* cmp_and_merge_page - first see if page can be merged into the stable tree;
* if not, compare checksum to previous and if it's the same, see if page can
* be inserted into the unstable tree, or merged with a page already there and
* both transferred to the stable tree.
*
* @page: the page that we are searching identical page to.
* @rmap_item: the reverse mapping into the virtual address of this page
*/
/*
Parameters:
@page: the suitable anonymous page found while scanning the mm_slot.
@rmap_item: the rmap_item describing this page.
*/
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
struct rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
struct stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
/* If this page is already a stable page, page_stable_node() returns its
stable_node; otherwise it returns NULL */
stable_node = page_stable_node(page);
if (stable_node) {
if (stable_node->head != &migrate_nodes &&
get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
rb_erase(&stable_node->node,
root_stable_tree + NUMA(stable_node->nid));
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
}
if (stable_node->head != &migrate_nodes &&
rmap_item->head == stable_node)
return;
}
/* We first start with searching the page inside the stable tree */
/* stable_tree_search() searches the stable red-black tree for a stable page whose content matches page; its implementation is examined below */
kpage = stable_tree_search(page);
/* If the stable page kpage found is the same page as page, the page is
already a KSM page and needs no further processing, so return. put_page()
drops the _count reference taken on the page in scan_get_next_rmap_item()->
follow_page(). */
if (kpage == page && rmap_item->head == stable_node) {
put_page(kpage);
return;
}
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
/* A node with identical content was found in the stable red-black tree, so
try_to_merge_with_ksm_page() tries to merge this page into that node. On
success, stable_tree_append() adds the rmap_item to the stable_node->hlist
list; both functions are examined below */
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
* The page was successfully merged:
* add its rmap_item to the stable tree.
*/
lock_page(kpage);
/* After try_to_merge_with_ksm_page() has merged page into kpage, some
accounting is needed; see the implementation of stable_tree_append() below */
stable_tree_append(rmap_item, page_stable_node(kpage));
unlock_page(kpage);
}
put_page(kpage);
return;
}
/*
* If the hash value of the page has changed from the last time
* we calculated it, this page is changing frequently: therefore we
* don't want to insert it in the unstable tree, and we don't want
* to waste our time searching for something identical to it there.
*/
/* No node with matching content was found in the stable red-black tree, so
recompute the page's checksum. If it has changed, the page is being modified
frequently and is not a good candidate for the unstable red-black tree */
checksum = calc_checksum(page);
if (rmap_item->oldchecksum != checksum) {
rmap_item->oldchecksum = checksum;
return;
}
/* The stable tree held no match, so search the unstable tree next.
unstable_tree_search_insert() looks in the unstable red-black tree for a node
with content identical to this page; if none is found, page is inserted into
the unstable tree. The unstable tree is torn down at the end of each full scan
and pages are re-added on the next one; the implementation is examined below */
tree_rmap_item =
unstable_tree_search_insert(rmap_item, page, &tree_page);
/* If a node tree_rmap_item with an identical page tree_page is found in the
unstable red-black tree, try_to_merge_two_pages() tries to merge page and
tree_page into a single KSM page kpage. stable_tree_insert() then inserts
kpage into the stable red-black tree as a new stable node, and
stable_tree_append() adds tree_rmap_item and rmap_item to the stable node's
hash list and updates the ksm_pages_sharing and ksm_pages_shared counters */
if (tree_rmap_item) {
/* A tree_page with the same content as the candidate page was found in the
unstable tree; try to merge the two into one KSM page. The implementation of
try_to_merge_two_pages() is examined below */
kpage = try_to_merge_two_pages(rmap_item, page,
tree_rmap_item, tree_page);
put_page(tree_page);
if (kpage) {
/*
* The pages were successfully merged: insert new
* node in the stable tree and add both rmap_items.
*/
lock_page(kpage);
/* Now that the candidate page has been promoted to the KSM page kpage,
stable_tree_insert() inserts it into the stable tree; see below */
stable_node = stable_tree_insert(kpage);
if (stable_node) {
stable_tree_append(tree_rmap_item, stable_node);
stable_tree_append(rmap_item, stable_node);
}
unlock_page(kpage);
/*
* If we fail to insert the page into the stable tree,
* we will have 2 virtual addresses that are pointing
* to a ksm page left outside the stable tree,
* in which case we need to break_cow on both.
*/
/* This completes the walk through how a page becomes a KSM page, including
the stable and unstable tree searches; what remains is the failure path.
If inserting the stable node into the stable red-black tree failed, break_cow()
deliberately raises a write fault to split this KSM page; see its
implementation below */
if (!stable_node) {
break_cow(tree_rmap_item);
break_cow(rmap_item);
}
}
}
}
Back in ksm_do_scan().
Implementation of stable_tree_search(): search the stable red-black tree for a node whose page content matches page.
[ksm_do_scan()->cmp_and_merge_page()->stable_tree_search()]
/*
* stable_tree_search - search for page inside the stable tree
*
* This function checks if there is a page inside the stable tree
* with identical content to the page that we are scanning right now.
*
* This function returns the stable tree node of identical content if found,
* NULL otherwise.
*/
static struct page *stable_tree_search(struct page *page)
{
int nid;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
struct stable_node *stable_node;
struct stable_node *page_node;
/* If page is already a stable page, no search is needed. */
page_node = page_stable_node(page);
if (page_node && page_node->head != &migrate_nodes) {
/* ksm page forked */
get_page(page);
return page;
}
nid = get_kpfn_nid(page_to_pfn(page));
/* Start searching the stable red-black tree here */
root = root_stable_tree + nid;
again:
new = &root->rb_node;
parent = NULL;
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
/* rb_entry() extracts a stable_node element from the tree node */
stable_node = rb_entry(*new, struct stable_node, node);
/* get_ksm_page() converts the stable node into a struct page. The stable
node's kpfn member holds the page frame number, from which the corresponding
struct page tree_page is derived; note that this function takes a _count
reference on tree_page. */
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
/* memcmp_pages() compares the contents of page and tree_page. */
ret = memcmp_pages(page, tree_page);
/* put_page() drops the _count reference on tree_page that get_ksm_page() took earlier */
put_page(tree_page);
parent = *new;
/* If the two pages differ, keep descending into the tree. */
if (ret < 0)
new = &parent->rb_left;
else if (ret > 0)
new = &parent->rb_right;
else {
/*
* Lock and unlock the stable_node's page (which
* might already have been migrated) so that page
* migration is sure to notice its raised count.
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
/* page and tree_page have identical content. get_ksm_page() is called again
to raise tree_page's reference count, which in effect tells the page migration
code that the page is in use here; finally tree_page is returned */
tree_page = get_ksm_page(stable_node, true);
if (tree_page) {
unlock_page(tree_page);
if (get_kpfn_nid(stable_node->kpfn) !=
NUMA(stable_node->nid)) {
put_page(tree_page);
goto replace;
}
return tree_page;
}
/*
* There is now a place for page_node, but the tree may
* have been rebalanced, so re-evaluate parent and new.
*/
if (page_node)
goto again;
return NULL;
}
}
if (!page_node)
return NULL;
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_link_node(&page_node->node, parent, new);
rb_insert_color(&page_node->node, root);
get_page(page);
return page;
replace:
if (page_node) {
list_del(&page_node->list);
DO_NUMA(page_node->nid = nid);
rb_replace_node(&stable_node->node, &page_node->node, root);
get_page(page);
} else {
rb_erase(&stable_node->node, root);
page = NULL;
}
stable_node->head = &migrate_nodes;
list_add(&stable_node->list, stable_node->head);
return page;
}
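A point worth pausing on: both the stable and the unstable tree are keyed purely by page content. memcmp_pages() imposes a byte-wise total order on pages, and that order is what steers the rb_left/rb_right descent above. A userspace sketch of the same idea, with 4KB heap buffers standing in for pages:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Byte-wise comparison, the same total order memcmp_pages() provides:
 * < 0 means "descend left", > 0 "descend right", == 0 "identical page". */
static int cmp_pages(const void *a, const void *b)
{
	return memcmp(*(char * const *)a, *(char * const *)b, PAGE_SIZE);
}

int main(void)
{
	char *pages[3];
	int i;

	for (i = 0; i < 3; i++) {
		pages[i] = malloc(PAGE_SIZE);
		memset(pages[i], "CAB"[i], PAGE_SIZE);
	}

	/* Sorting by content follows exactly the ordering a tree walk uses. */
	qsort(pages, 3, sizeof(pages[0]), cmp_pages);

	for (i = 0; i < 3; i++) {
		printf("page %d is filled with '%c'\n", i, pages[i][0]);
		free(pages[i]);
	}
	return 0;
}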
Back in cmp_and_merge_page().
Implementation of try_to_merge_with_ksm_page(): merge page into a KSM page.
[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()]
/*
* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
* but no new kernel page is allocated: kpage must already be a ksm page.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
/* Parameters:
@rmap_item: the rmap_item of the candidate page
@page: the page to be merged
@kpage: a KSM page in the stable tree; the candidate page is merged into kpage.
*/
static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
struct mm_struct *mm = rmap_item->mm;
struct vm_area_struct *vma;
int err = -EFAULT;
/* VMAs are about to be accessed, so take the mm->mmap_sem read lock */
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
goto out;
/* Find the VMA covering this virtual address */
vma = find_vma(mm, rmap_item->address);
if (!vma || vma->vm_start > rmap_item->address)
goto out;
/* Call try_to_merge_one_page() to try to merge page into kpage; its implementation is examined below */
err = try_to_merge_one_page(vma, page, kpage);
if (err)
goto out;
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
/* Must get reference to anon_vma while still holding mmap_sem */
/* rmap_item->anon_vma points to the VMA's anon_vma */
rmap_item->anon_vma = vma->anon_vma;
/* Take a reference on anon_vma->refcount so the anon_vma cannot be freed */
get_anon_vma(vma->anon_vma);
out:
/* Release the mm->mmap_sem read lock */
up_read(&mm->mmap_sem);
return err;
}
Back in cmp_and_merge_page().
Implementation of try_to_merge_one_page(): try to merge page into kpage.
[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()]
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
* @page: the PageAnon page that we want to replace with kpage
* @kpage: the PageKsm page that we want to map instead of page,
* or NULL the first time when we want to use page as kpage.
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
static int try_to_merge_one_page(struct vm_area_struct *vma,
struct page *page, struct page *kpage)
{
pte_t orig_pte = __pte(0);
int err = -EFAULT;
/* page and kpage are the same page */
if (page == kpage) /* ksm page forked */
return 0;
/* The VMA holding page is not mergeable, i.e. VM_MERGEABLE is not set. */
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
if (PageTransCompound(page) && page_trans_compound_anon_split(page))
goto out;
BUG_ON(PageTransCompound(page));
/* Filter out pages that are not anonymous */
if (!PageAnon(page))
goto out;
/*
* We need the page lock to read a stable PageSwapCache in
* write_protect_page(). We use trylock_page() instead of
* lock_page() because we don't want to wait here - we
* prefer to continue scanning and merging different pages,
* then come back to this page when it is unlocked.
*/
/* Why trylock_page(page) here rather than lock_page(page)? The page lock is
needed so that a stable PageSwapCache state can be read later in
write_protect_page(), but there is no point sleeping here waiting for it: if
someone else holds the page lock, skip this page and process others first. */
if (!trylock_page(page))
goto out;
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
/* write_protect_page() write-protects the pte mapping this page in the VMA; its implementation is examined below */
if (write_protect_page(vma, page, &orig_pte) == 0) {
/* When merging with an unstable tree node, the kpage argument may be NULL; in
that case the job is just to mark page as a stable node and record its
activity (mark_page_accessed()) */
if (!kpage) {
/*
* While we hold page lock, upgrade page from
* PageAnon+anon_vma to PageKsm+NULL stable_node:
* stable_tree_insert() will update stable_node.
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
err = 0;
/* pages_identical() compares the contents of page and kpage once more; if
they still match, replace_page() rewrites page's pte to point at kpage */
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);/* map the VMA's user address to kpage; the implementation is examined below */
}
if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
munlock_vma_page(page);
if (!PageMlocked(kpage)) {
unlock_page(page);
lock_page(kpage);
mlock_vma_page(kpage);
page = kpage; /* for final unlock */
}
}
unlock_page(page);
out:
return err;
}
Back in try_to_merge_with_ksm_page().
Implementation of write_protect_page():
[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()->write_protect_page()]
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr;
pte_t *ptep;
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
/* The virtual address of page can be computed from the VMA and the struct
page: page->index records the offset within the VMA, from which the virtual
address follows. */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
BUG_ON(PageTransCompound(page));
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/* With mm and the virtual address addr, walk the page tables to locate the pte entry for this address */
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out_mn;
/* This function's job is to make the pte write-protected, so a pte that is
writable or dirty must be write-protected here (on ARM by setting the
L_PTE_RDONLY bit in the page-table entry, on x86 by clearing the _PAGE_BIT_RW
bit). A dirty page is handled via set_page_dirty(), which calls the page's
mapping->a_ops->set_page_dirty() and notifies the writeback system. */
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
/* Flush the cache lines for this page. */
flush_cache_page(vma, addr, page_to_pfn(page));
/*
* Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
*/
/* Clear the pte and flush the corresponding TLB entry to ensure no direct I/O can start; the function returns the pte's previous contents */
entry = ptep_clear_flush_notify(vma, addr, ptep);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
*/
/* What is the reasoning behind this check?
Understanding it requires knowing how a page's _count and _mapcount reference
counts interact. The purpose of write_protect_page() is to make the page
read-only so it can later be compared and merged. Making a page read-only
requires two things:
(1) confirming that nobody else holds a reference to the page;
(2) making the pte pointing to the page read-only.
The second is easy; the first is the tricky one. In general, a page's _count
has four sources:
(1) the page cache, via the radix tree (KSM does not deal with page-cache pages);
(2) user mappings, which raise both _count and _mapcount;
(3) page->private, which raises _count; for anonymous pages this means the
swap cache, see add_to_swap();
(4) transient kernel references, e.g. follow_page() and get_user_pages_fast().
Assuming no other kernel path is operating on the page and the page is not in
the swap cache, the two counts relate as:
(page->_mapcount + 1) == page->_count
In the write_protect_page() scenario, swapped records whether the page is in
the swap cache (add_to_swap() raises _count), so the relation becomes:
(page->_mapcount + 1) + PageSwapCache() == page->_count
Where does the extra "+1" below come from? scan_get_next_rmap_item() obtained
this page via follow_page(), which raised page->_count by 1. So, in the current
scenario, when no direct I/O is in flight the relation is:
(page->_mapcount + 1) + 1 + PageSwapCache() == page->_count
For example, a page mapped by exactly one process (_mapcount == 0), sitting in
the swap cache and pinned once by follow_page() should have
_count == (0 + 1) + 1 + 1 = 3; anything larger means an extra reference.
If the equality does not hold, some kernel code path (e.g. direct I/O) is
operating on the page, and write_protect_page() can only return an error.
*/
if (page_mapcount(page) + 1 + swapped != page_count(page)) {
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
if (pte_dirty(entry))
set_page_dirty(page);
/* The next two lines build a read-only pte entry and write it into the page table */
entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
err = 0;
out_unlock:
pte_unmap_unlock(ptep, ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
Back in try_to_merge_one_page().
Implementation of replace_page():
[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_with_ksm_page()->try_to_merge_one_page()->replace_page()]
/**
* replace_page - replace page in vma by new ksm page
* @vma: vma that holds the pte pointing to page
* @page: the page we are replacing by kpage (the old page)
* @kpage: the ksm page we replace page by (the KSM page found in the stable tree)
* @orig_pte: the original value of the pte (used to detect whether page was modified in the meantime)
*
* Returns 0 on success, -EFAULT on failure.
*/
/* What this function does: construct a new pte entry from kpage's pfn plus the
original page's protection bits and write it into page's pte slot, so that the
VMA user address that used to map page now maps kpage */
static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
/* Take a _count reference on kpage */
get_page(kpage);
/* page_add_anon_rmap() looks as if it adds kpage to the reverse-mapping (RMAP)
system, but kpage is already there, so this only increments its _mapcount */
page_add_anon_rmap(kpage, vma, addr);
/* Flush the cache for addr and the pte, clear the pte contents and the corresponding TLB entry, then write the new pte */
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
/* Drop page's _mapcount and _count references, and release its swap space in the swap area */
page_remove_rmap(page);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
Back in try_to_merge_one_page().
Implementation of stable_tree_append():
[ksm_do_scan()->cmp_and_merge_page()->stable_tree_append()]
/*
* stable_tree_append - add another rmap_item to the linked list of
* rmap_items hanging off a given node of the stable tree, all sharing
* the same ksm page.
*/
/* Parameters:
@rmap_item: the rmap_item of the merged page
@stable_node: the struct stable_node that a KSM page's mapping points to (it
plays a role similar to the anon_vma of an ordinary anonymous page); here it
is the stable_node of kpage.
*/
static void stable_tree_append(struct rmap_item *rmap_item,
struct stable_node *stable_node)
{
rmap_item->head = stable_node;
rmap_item->address |= STABLE_FLAG;
/* Add the rmap_item to the hash list of kpage's stable_node */
hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
/* If other pages are already mapped to this stable_node, increment
ksm_pages_sharing; otherwise increment ksm_pages_shared, since this is a newly
established stable node. ksm_pages_shared counts the stable (KSM) nodes in the
system; ksm_pages_sharing counts the pages merged into them. */
if (rmap_item->hlist.next)
ksm_pages_sharing++;
else
ksm_pages_shared++;
}
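The two counters updated here are exported through the same sysfs group that ksm_init() registered, so the merging effect can be watched from user space. A minimal sketch that reads them back:

#include <stdio.h>

/* Read one numeric counter from /sys/kernel/mm/ksm/; -1 on failure. */
static long ksm_read(const char *name)
{
	char path[128];
	long val = -1;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	long shared = ksm_read("pages_shared");	  /* stable nodes */
	long sharing = ksm_read("pages_sharing"); /* pages merged into them */

	printf("pages_shared=%ld pages_sharing=%ld\n", shared, sharing);
	if (shared > 0)	/* a high ratio means each KSM page saves many copies */
		printf("sharing ratio: %.1f\n", (double)sharing / shared);
	return 0;
}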
Back in cmp_and_merge_page().
Implementation of unstable_tree_search_insert():
[ksm_do_scan()->cmp_and_merge_page()->unstable_tree_search_insert()]
/*
* unstable_tree_search_insert - search for identical page,
* else insert rmap_item into the unstable tree.
*
* This function searches for a page in the unstable tree identical to the
* page currently being scanned; and if no identical page is found in the
* tree, we insert rmap_item as a new object into the unstable tree.
*
* This function returns pointer to rmap_item found to be identical
* to the currently scanned page, NULL otherwise.
*
* This function does both searching and inserting, because they share
* the same walking algorithm in an rbtree.
*/
static
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
{
struct rb_node **new;
struct rb_root *root;
struct rb_node *parent = NULL;
int nid;
nid = get_kpfn_nid(page_to_pfn(page));
root = root_unstable_tree + nid;
new = &root->rb_node;
/* Search the unstable red-black tree, whose root is in root_unstable_tree. */
while (*new) {
struct rmap_item *tree_rmap_item;
struct page *tree_page;
int ret;
cond_resched();
tree_rmap_item = rb_entry(*new, struct rmap_item, node);
/* get_mergeable_page() checks that the page taken from the tree is still
eligible: only anonymous pages can be merged. If the tree holds no page with
content identical to the candidate, the candidate itself is inserted into the
tree. */
tree_page = get_mergeable_page(tree_rmap_item);
if (IS_ERR_OR_NULL(tree_page))
return NULL;
/*
* Don't substitute a ksm page for a forked page.
*/
if (page == tree_page) {
put_page(tree_page);
return NULL;
}
ret = memcmp_pages(page, tree_page);
parent = *new;
if (ret < 0) {
put_page(tree_page);
new = &parent->rb_left;
} else if (ret > 0) {
put_page(tree_page);
new = &parent->rb_right;
} else if (!ksm_merge_across_nodes &&
page_to_nid(tree_page) != nid) {
/*
* If tree_page has been migrated to another NUMA node,
* it will be flushed out and put in the right unstable
* tree next time: only merge with it when across_nodes.
*/
put_page(tree_page);
return NULL;
} else {
*tree_pagep = tree_page;
return tree_rmap_item;
}
}
/* The low 12 bits of rmap_item->address hold flags: UNSTABLE_FLAG (0x100)
marks an rmap_item that is in the unstable tree, and the low 8 bits hold the
full-scan count seqnr (see the sketch after this listing). Unstable tree nodes
are discarded after each full scan and re-inserted on the next one.
ksm_pages_unshared counts the nodes currently in the unstable tree. */
rmap_item->address |= UNSTABLE_FLAG;
rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
DO_NUMA(rmap_item->nid = nid);
rb_link_node(&rmap_item->node, parent, new);
rb_insert_color(&rmap_item->node, root);
ksm_pages_unshared++;
return NULL;
}
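As noted in the comment above, the flag values packed into the low bits of rmap_item->address come from mm/ksm.c. A small userspace sketch of the encoding (the sample address and seqnr are made up; the 0xfff mask assumes 4KB pages):

#include <stdio.h>

/* Flag values as defined in mm/ksm.c. */
#define SEQNR_MASK	0x0ff	/* low bits of the full-scan count */
#define UNSTABLE_FLAG	0x100	/* rmap_item is in the unstable tree */
#define STABLE_FLAG	0x200	/* rmap_item is listed from the stable tree */

int main(void)
{
	/* A page-aligned user address has its low 12 bits clear (4KB pages),
	 * so they are free to carry flags. */
	unsigned long address = 0x7f1234567000UL;	/* made-up example */
	unsigned long seqnr = 0x42;			/* made-up scan count */

	address |= UNSTABLE_FLAG | (seqnr & SEQNR_MASK);

	printf("page address : %#lx\n", address & ~0xfffUL);
	printf("unstable?    : %d\n", !!(address & UNSTABLE_FLAG));
	printf("stable?      : %d\n", !!(address & STABLE_FLAG));
	printf("seqnr bits   : %#lx\n", address & SEQNR_MASK);
	return 0;
}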
Back in cmp_and_merge_page().
Implementation of try_to_merge_two_pages():
[ksm_do_scan()->cmp_and_merge_page()->try_to_merge_two_pages()]
/*
* try_to_merge_two_pages - take two identical pages and prepare them
* to be merged into one page.
*
* This function returns the kpage if we successfully merged two identical
* pages into one ksm page, NULL otherwise.
*
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
struct page *page,
struct rmap_item *tree_rmap_item,
struct page *tree_page)
{
int err;
/* try_to_merge_with_ksm_page() is called twice here, with different arguments
and to different effect.
The first call passes the candidate page and its rmap_item with kpage == NULL,
so it mainly write-protects page's pte and marks the page as a stable (KSM)
node. */
err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
if (!err) {
/* The second call passes tree_page and its tree_rmap_item, with the candidate
page as kpage: it write-protects tree_page's pte and then compares tree_page
and page again. Their contents were already compared during the unstable tree
search, so why compare once more? Because in the meantime either page may have
been modified by another process. Once the two pages are confirmed identical,
a new pte is built from page's pfn and installed in tree_page's page table;
that is, the process virtual address behind tree_page is remapped onto the
physical page page. tree_page and page have thus been merged into one KSM
page, with page serving as the KSM page itself. */
err = try_to_merge_with_ksm_page(tree_rmap_item,
tree_page, page);
/*
* If that fails, we have a ksm page with only one pte
* pointing to it: so break it.
*/
if (err)
break_cow(rmap_item);
}
return err ? NULL : page;
}
Back in cmp_and_merge_page().
Implementation of stable_tree_insert():
[ksm_do_scan()->cmp_and_merge_page()->stable_tree_insert()]
/*
* stable_tree_insert - insert stable tree node pointing to new ksm page
* into the stable tree.
*
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
static struct stable_node *stable_tree_insert(struct page *kpage)
{
int nid;
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent = NULL;
struct stable_node *stable_node;
kpfn = page_to_pfn(kpage);
nid = get_kpfn_nid(kpfn);
root = root_stable_tree + nid;
new = &root->rb_node;
/* Walk the stable tree looking for the right leaf position to insert at */
while (*new) {
struct page *tree_page;
int ret;
cond_resched();
stable_node = rb_entry(*new, struct stable_node, node);
tree_page = get_ksm_page(stable_node, false);
if (!tree_page)
return NULL;
ret = memcmp_pages(kpage, tree_page);
put_page(tree_page);
parent = *new;
if (ret < 0)
new = &parent->rb_left;
else if (ret > 0)
new = &parent->rb_right;
else {
/*
* It is not a bug that stable_tree_search() didn't
* find this node: because at that time our page was
* not yet write-protected, so may have changed since.
*/
return NULL;
}
}
/* Allocate a new stable_node, point kpage->mapping at it, and insert the
stable_node into the stable tree */
stable_node = alloc_stable_node();
if (!stable_node)
return NULL;
/* Finally, rmap_item and tree_rmap_item are added to the new stable node's hash list and the KSM statistics are updated */
INIT_HLIST_HEAD(&stable_node->hlist);
stable_node->kpfn = kpfn;
set_page_stable_node(kpage, stable_node);
DO_NUMA(stable_node->nid = nid);
rb_link_node(&stable_node->node, parent, new);
rb_insert_color(&stable_node->node, root);
return stable_node;
}
Back in cmp_and_merge_page().
Implementation of break_cow():
This function handles pages that have already been write-protected, by deliberately raising a write-error page fault, i.e. a copy-on-write (COW). The rmap_item argument records the page's virtual address and owning process, from which the corresponding VMA can be found.
[ksm_do_scan()->cmp_and_merge_page()->break_cow()]
static void break_cow(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
/*
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
up_read(&mm->mmap_sem);
}
Implementation of break_ksm():
[ksm_do_scan()->cmp_and_merge_page()->break_cow()->break_ksm()]
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
int ret = 0;
do {
cond_resched();
/* follow_page() retrieves the struct page of the normal mapping behind the
VMA and virtual address. The flags are FOLL_GET|FOLL_MIGRATION: FOLL_GET
raises the page's _count, and FOLL_MIGRATION waits for the page if it is in
the middle of migration. */
page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
if (IS_ERR_OR_NULL(page))
break;
/* For a KSM page, handle_mm_fault() is called directly to fake a write-error
(FAULT_FLAG_WRITE) page fault; the fault handler performs copy-on-write,
ultimately calling do_wp_page() to allocate a fresh page and map it at the
virtual address. */
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
FAULT_FLAG_WRITE);
else
ret = VM_FAULT_WRITE;
put_page(page);
} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
* We must loop because handle_mm_fault() may back out if there's
* any difficulty e.g. if pte accessed bit gets updated concurrently.
*
* VM_FAULT_WRITE is what we have been hoping for: it indicates that
* COW has been broken, even if the vma does not permit VM_WRITE;
* but note that a concurrent fault might break PageKsm for us.
*
* VM_FAULT_SIGBUS could occur if we race with truncation of the
* backing file, which also invalidates anonymous pages: that's
* okay, that truncation will have unmapped the PageKsm for us.
*
* VM_FAULT_OOM: at the time of writing (late July 2009), setting
* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
* current task has TIF_MEMDIE set, and will be OOM killed on return
* to user; and ksmd, having no mm, would never be chosen for that.
*
* But if the mm is in a limited mem_cgroup, then the fault may fail
* with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
* even ksmd can fail in this way - though it's usually breaking ksm
* just to undo a merge it made a moment before, so unlikely to oom.
*
* That's a pity: we might therefore have more kernel pages allocated
* than we're counting as nodes in the stable tree; but ksm_do_scan
* will retry to break_cow on each pass, so should recover the page
* in due course. The important thing is to not let VM_MERGEABLE
* be cleared while any such pages might remain in the area.
*/
return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}