2.4.1 Kernel-Mode VM-Entry
This section analyzes the VM-Entry flow in six steps.
(1) Processing vcpu->requests
VM-Entry happens in two situations:
a. the first launch of a VCPU;
b. re-entry after a VM-Exit.
During the previous VM-Exit, kvm_make_request may have been called to set various requests; these must be processed before the next VM-Entry. The table below lists the requests in KVM, where they are set, and where they are handled (a sketch of the request primitives and the dispatch pattern follows the table).
Request | Category | Set by | Handled by
KVM_REQ_TLB_FLUSH | memory | kvm_arch_vcpu_setup ==> vcpu_load, kvm_mmu_flush_tlb, kvm_flush_remote_tlbs (kvm_main.c) | kvm_x86_ops->tlb_flush(vcpu)
KVM_REQ_REPORT_TPR_ACCESS | APIC | __report_tpr_access | vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS
KVM_REQ_MMU_RELOAD | memory | kvm_reload_remote_mmus | kvm_mmu_unload
KVM_REQ_TRIPLE_FAULT | exception | kvm_vcpu_ioctl_x86_set_mce, kvm_multiple_exception, handle_vmclear, vmx_inject_nmi, vmx_inject_irq, vmx_queue_exception, mmu_check_root | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN
KVM_REQ_PENDING_TIMER | timer | apic_timer_fn | __vcpu_run
KVM_REQ_UNHALT | APIC | kvm_vcpu_block | kvm_arch_vcpu_ioctl_run, __vcpu_run
KVM_REQ_MMU_SYNC | memory | kvm_mmu_get_page | kvm_mmu_sync_roots
KVM_REQ_CLOCK_UPDATE | clock | vcpu_enter_guest, kvmclock_cpufreq_notifier, kvm_set_guest_paused, kvm_guest_time_update | kvm_guest_time_update
KVM_REQ_DEACTIVATE_FPU | FPU | kvm_put_guest_fpu | kvm_x86_ops->fpu_deactivate
KVM_REQ_EVENT | interrupt/exception | kvm_set_rflags, apic_set_eoi, kvmclock_cpufreq_notifier, kvm_vcpu_reset, kvm_vcpu_ioctl_interrupt, kvm_arch_async_page_not_present, kvm_arch_vcpu_ioctl_set_sregs, kvm_multiple_exception, vmx_vcpu_run, kvm_vcpu_ioctl_x86_set_vcpu_events, __vmx_complete_interrupts, vmx_preemption_timer_fn, handle_nmi_window, __apic_accept_irq, handle_interrupt_window, handle_tpr_below_threshold, kvm_apic_set_eoi_accelerated, kvm_apic_post_state_restore | kvm_apic_accept_events, inject_pending_event, enable_nmi_window, enable_irq_window, hwapic_irr_update, kvm_lapic_find_highest_irr, update_cr8_intercept, kvm_lapic_sync_to_vapic
KVM_REQ_APF_HALT | MMU | kvm_arch_async_page_not_present, try_async_pf | vcpu->arch.apf.halted = true
KVM_REQ_STEAL_UPDATE | timer | kvm_arch_vcpu_load, kvm_set_msr_common | record_steal_time(vcpu)
KVM_REQ_NMI | NMI | kvm_inject_nmi | process_nmi
KVM_REQ_PMU | PMU | kvm_perf_overflow_intr, kvm_perf_overflow | process_pmu
KVM_REQ_PMI | PMI | kvm_perf_overflow_intr | process_pmi
KVM_REQ_MASTERCLOCK_UPDATE | timer | kvm_track_tsc_matching | kvm_gen_update_masterclock
KVM_REQ_MCLOCK_INPROGRESS | timer | kvm_make_mclock_inprogress_request | kvm_gen_update_masterclock
KVM_REQ_SCAN_IOAPIC | APIC | kvm_make_scan_ioapic_request | vcpu_scan_ioapic
KVM_REQ_GLOBAL_CLOCK_UPDATE | timer | kvm_arch_vcpu_load, kvm_set_msr_common | kvm_gen_kvmclock_update
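Each request is a bit in vcpu->requests. The primitives below are lightly paraphrased from include/linux/kvm_host.h of this kernel era, followed by an abbreviated excerpt of the check-and-dispatch pattern at the top of vcpu_enter_guest (arch/x86/kvm/x86.c); treat both as sketches, not exact source:
static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
        set_bit(req, &vcpu->requests);
}

static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
        if (test_bit(req, &vcpu->requests)) {
                clear_bit(req, &vcpu->requests);
                return true;
        }
        return false;
}

/* abbreviated excerpt from vcpu_enter_guest() */
if (vcpu->requests) {
        if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
                kvm_mmu_unload(vcpu);
        if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
                kvm_x86_ops->tlb_flush(vcpu);
        if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
                vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
                r = 0;
                goto out;
        }
        if (kvm_check_request(KVM_REQ_NMI, vcpu))
                process_nmi(vcpu);
        /* ... the remaining requests in the table are checked the same way ... */
}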
(2) kvm_x86_ops->prepare_guest_switch ==> vmx_save_host_state
a. Set the FS and GS fields of the host-state area.
b. Set the host MSRs ==> kvm_set_shared_msr
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
guest_msrs is updated in handle_wrmsr ==> vmx_set_msr.
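For reference, kvm_set_shared_msr (x86.c) writes the guest value into the physical MSR immediately and registers a user-return notifier, so the host values are restored only when the thread returns to user space. A condensed sketch, with field names as in kernels of this era:
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
        struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);

        /* skip the wrmsr when the relevant bits are unchanged */
        if (((value ^ smsr->values[slot].curr) & mask) == 0)
                return;
        smsr->values[slot].curr = value;
        wrmsrl(shared_msrs_global.msrs[slot], value);
        if (!smsr->registered) {
                /* restore host values on the next return to user space */
                smsr->urn.on_user_return = kvm_on_user_return;
                user_return_notifier_register(&smsr->urn);
                smsr->registered = true;
        }
}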
(3) Checking for conditions that block VM-Entry
if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
    || need_resched() || signal_pending(current)) {
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
        local_irq_enable();
        preempt_enable();
        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        r = 1;
        goto cancel_injection;
}
if (req_immediate_exit)  /* set when there is a nested event and inject_pending_event returns 1 */
        smp_send_reschedule(vcpu->cpu);
When the VCPU is in EXITING_GUEST_MODE, or there are pending requests, or the current thread needs rescheduling or has a pending signal, 1 is returned to the caller __vcpu_run, which reacts as follows:
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
cond_resched();
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
It then waits for the next loop iteration to call vcpu_enter_guest again; an abbreviated sketch of the __vcpu_run loop follows.
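For context, the enclosing loop in __vcpu_run (x86.c) looks roughly as follows; this is an abbreviated sketch with the mp_state handling and error paths trimmed:
static int __vcpu_run(struct kvm_vcpu *vcpu)
{
        int r = 1;

        while (r > 0) {
                if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                    !vcpu->arch.apf.halted)
                        r = vcpu_enter_guest(vcpu);   /* try a VM-Entry */
                else
                        kvm_vcpu_block(vcpu);         /* halted: wait for an event */

                if (signal_pending(current)) {
                        r = -EINTR;
                        vcpu->run->exit_reason = KVM_EXIT_INTR;
                        ++vcpu->stat.signal_exits;
                }
                if (need_resched()) {
                        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
                        cond_resched();
                        vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
                }
        }
        return r;
}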
(4) When the VCPU's debug registers need to be set, load them into the physical CPU's debug registers:
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
set_debugreg(vcpu->arch.dr6, 6);
#define set_debugreg(value, register) \
        native_set_debugreg(register, value)
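native_set_debugreg simply moves the value into the corresponding %db register; condensed from arch/x86/include/asm/debugreg.h:
static inline void native_set_debugreg(int regno, unsigned long value)
{
        switch (regno) {
        case 0:
                asm("mov %0, %%db0" ::"r" (value));
                break;
        case 1:
                asm("mov %0, %%db1" ::"r" (value));
                break;
        /* cases 2, 3 and 6 are analogous */
        case 7:
                asm("mov %0, %%db7" ::"r" (value));
                break;
        default:
                BUG();
        }
}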
(5) Updating guest RIP and RSP
Only RIP and RSP are set here; the other registers are handled elsewhere:
a. General-purpose registers such as RAX and RCX have no corresponding fields in the VMCS guest-state area; they are set in kvm_arch_vcpu_ioctl_set_regs (called by QEMU on CPU reset or resume).
b. Segment registers are set in vmx_vcpu_reset. Taking the code segment as an example:
seg_setup(VCPU_SREG_CS); // set up from the entries of kvm_vmx_segment_fields
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_write32(GUEST_CS_BASE, 0xffff0000);
static const struct kvm_vmx_segment_field {
        unsigned selector;
        unsigned base;
        unsigned limit;
        unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
        VMX_SEGMENT_FIELD(CS),
        ......,
};
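The VMX_SEGMENT_FIELD macro referenced above maps each segment to its four VMCS field encodings (vmx.c):
#define VMX_SEGMENT_FIELD(seg)                                  \
        [VCPU_SREG_##seg] = {                                   \
                .selector = GUEST_##seg##_SELECTOR,             \
                .base = GUEST_##seg##_BASE,                     \
                .limit = GUEST_##seg##_LIMIT,                   \
                .ar_bytes = GUEST_##seg##_AR_BYTES,             \
        }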
(6) Performing VM-Entry
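The entry itself is performed by the inline assembly in vmx_vcpu_run: dirty guest RSP/RIP values are flushed to the VMCS, then VMLAUNCH is used for the first entry on a VMCS and VMRESUME for subsequent entries. A heavily abbreviated sketch; the register save/restore and constraint/clobber lists are omitted:
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

        asm(
                /* ... save host state and load guest GPRs ... */
                "cmpl $0, %c[launched](%0) \n\t"    /* first entry on this VMCS? */
                "jne 1f \n\t"
                __ex(ASM_VMX_VMLAUNCH) "\n\t"       /* first time: VMLAUNCH */
                "jmp 2f \n\t"
                "1: " __ex(ASM_VMX_VMRESUME) "\n\t" /* re-entry: VMRESUME */
                "2: "
                /* ... execution resumes here after the next VM-Exit;
                 * guest GPRs are saved back to vcpu->arch.regs ...
                 * (input/output constraints and clobbers omitted) */
        );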
2.4.2 Kernel-Mode VM-Exit
(1) The VM-Exit path in vmx_vcpu_run
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
a) vmx_complete_atomic_exit collects information about VM-Exits caused by NMIs or machine checks
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
exit_intr_info = vmx->exit_intr_info;
/* Handle machine checks before interrupts are enabled */
if (is_machine_check(exit_intr_info))
        kvm_machine_check();    /* handle the machine check */
/* We need to handle NMIs before interrupts are enabled */
if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
    (exit_intr_info & INTR_INFO_VALID_MASK)) {
        kvm_before_handle_nmi(&vmx->vcpu);
        asm("int $2");          /* deliver the NMI to the host */
        kvm_after_handle_nmi(&vmx->vcpu);
}
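is_machine_check (vmx.c) tests the interruption information for a valid hard exception with the #MC vector:
static inline bool is_machine_check(u32 intr_info)
{
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                             INTR_INFO_VALID_MASK)) ==
               (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}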
b) vmx_recover_nmi_blocking handles virtual NMIs
A virtual NMI is an NMI produced through event injection.
exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
    vector != DF_VECTOR && !idtv_info_valid)   /* blocking by NMI was lifted */
        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                      GUEST_INTR_STATE_NMI);   /* re-establish blocking by NMI */
else
        vmx->nmi_known_unmasked =
                !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
                  & GUEST_INTR_STATE_NMI);
x86/x64 has four interrupt-blocking states:
1. Blocking by STI
If EFLAGS.IF = 0 before an sti instruction, an external interrupt arriving before the next instruction completes is blocked; the blocking is lifted once that instruction finishes. For example:
sti
ret // an interrupt can only be taken after ret completes
2. Blocking by MOV-SS
When SS is updated with mov or pop, external interrupts, NMIs, and #DB exceptions arriving before the next instruction completes are blocked; the blocking is lifted once that instruction finishes. For example:
mov ss, ax
mov esp, kernel_stack // blocking by MOV-SS is in effect
mov ax, 0 // no blocking state
3. Blocking by SMI
While the processor is executing in SMM, a new SMI is blocked.
4. Blocking by NMI
When the processor responds to an NMI request, further NMIs are blocked once the NMI handler has been delivered.
Note that when "NMI exiting" = 1, an NMI causes a VM-Exit without being delivered, so blocking-by-NMI remains 0. When the VMM injects an NMI, the NMI is delivered through the guest IDT and the CPU enters blocking-by-NMI.
As far as entering blocking-by-NMI is concerned, a virtual NMI is fully equivalent to a real NMI. When the NMI handler returns with iret, blocking-by-NMI is lifted. If that iret itself causes a VM-Exit, bit 12 of the VM-Exit interruption information is set to 1, indicating that the blocking state was lifted.
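These four blocking states correspond to the low bits of the VMCS guest interruptibility-state field (GUEST_INTERRUPTIBILITY_INFO), defined in arch/x86/include/asm/vmx.h:
#define GUEST_INTR_STATE_STI        0x00000001   /* blocking by STI */
#define GUEST_INTR_STATE_MOV_SS     0x00000002   /* blocking by MOV-SS/POP-SS */
#define GUEST_INTR_STATE_SMI        0x00000004   /* blocking by SMI */
#define GUEST_INTR_STATE_NMI        0x00000008   /* blocking by NMI */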
static void vmcs_set_bits(unsigned long field, u32 mask)
{
        vmcs_writel(field, vmcs_readl(field) | mask);
}
c) __vmx_complete_interrupts fetches and pre-processes the event-injection state associated with the VM-Exit; a condensed sketch follows.
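A condensed, lightly paraphrased sketch of __vmx_complete_interrupts (vmx.c): it decodes IDT_VECTORING_INFO_FIELD and requeues whatever event was being delivered when the VM-Exit interrupted it:
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                                      u32 idt_vectoring_info,
                                      int instr_len_field,
                                      int error_code_field)
{
        u8 vector;
        int type;
        bool idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;

        vcpu->arch.nmi_injected = false;
        kvm_clear_exception_queue(vcpu);
        kvm_clear_interrupt_queue(vcpu);

        if (!idtv_info_valid)
                return;

        kvm_make_request(KVM_REQ_EVENT, vcpu);

        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;

        switch (type) {
        case INTR_TYPE_NMI_INTR:
                vcpu->arch.nmi_injected = true;   /* re-inject the interrupted NMI */
                break;
        case INTR_TYPE_HARD_EXCEPTION:
                /* requeue the exception (error-code handling omitted) */
                kvm_requeue_exception(vcpu, vector);
                break;
        case INTR_TYPE_EXT_INTR:
                kvm_queue_interrupt(vcpu, vector, false);
                break;
        default:
                break;
        }
}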
(2) vcpu_enter_guest
// when guest_debug is not enabled and exit_reason is EXIT_REASON_DR_ACCESS:
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
        int i;
        kvm_x86_ops->sync_dirty_debug_regs(vcpu);
        for (i = 0; i < KVM_NR_DB_REGS; i++)
                vcpu->arch.eff_db[i] = vcpu->arch.db[i];
}
This backs up the VCPU's debug registers so they can be reloaded at the next VM-Entry.
vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
                                                     native_read_tsc());
kvm_x86_ops->handle_external_intr(vcpu) ==> vmx_handle_external_intr
if (vcpu->arch.apic_attention)
        kvm_lapic_sync_from_vapic(vcpu);
kvm_x86_ops->handle_exit(vcpu) ==> vmx_handle_exit
(3) vmx_handle_exit
a) guest_state_valid is called to check the guest state (segment base, limit, attributes, etc.); if it is invalid, handle_invalid_guest_state takes over and 1 is returned. If the VM-Entry itself failed, vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY and 0 is returned.
b) If the VM-Exit was caused during event delivery but exit_reason is not exception/NMI, EPT violation, or task switch:
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; and 0 is returned.
c) kvm_vmx_exit_handlers[exit_reason] is called for the VM-Exits that kvm_vmx_exit_handlers supports (an excerpt of the table is shown below); for the other exit reasons, vcpu->run->exit_reason = KVM_EXIT_UNKNOWN.
d) When a handler returns <= 0, kernel-mode processing ends after returning to the caller and QEMU (kvm_cpu_exec) continues; when it returns > 0, the kernel prepares the next VM-Entry.
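A few representative entries of that dispatch table (vmx.c excerpt):
static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_EXCEPTION_NMI]      = handle_exception,
        [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
        [EXIT_REASON_IO_INSTRUCTION]     = handle_io,
        [EXIT_REASON_CR_ACCESS]          = handle_cr,
        [EXIT_REASON_DR_ACCESS]          = handle_dr,
        [EXIT_REASON_MSR_READ]           = handle_rdmsr,
        [EXIT_REASON_MSR_WRITE]          = handle_wrmsr,
        [EXIT_REASON_HLT]                = handle_halt,
        [EXIT_REASON_TRIPLE_FAULT]       = handle_triple_fault,
        [EXIT_REASON_NMI_WINDOW]         = handle_nmi_window,
        /* ... many more ... */
};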
2.4.3 QEMU VM-Entry & VM-Exit Analysis
(1) VM-Exit handling
The kernel returns to user mode inside kvm_cpu_exec:
a. kvm_arch_post_run handles the APIC.
b. switch (run->exit_reason) dispatches on the exit reason.
KVM_EXIT_IO and KVM_EXIT_MMIO are analyzed in chapters 3 and 4.
For KVM_EXIT_UNKNOWN, kvm_cpu_exec returns and vm_stop is invoked:
void vm_stop(RunState state)
{
if (!qemu_thread_is_self(&io_thread)) {
qemu_system_vmstop_request(state);
cpu_stop_current();
return;
}
do_vm_stop(state);
}
After returning to qemu_kvm_cpu_thread_fn, the loop ends.
KVM_EXIT_IRQ_WINDOW_OPEN: return EXCP_INTERRUPT;
KVM_EXIT_SHUTDOWN: call qemu_system_reset_request
void qemu_system_reset_request(void)
{
if (no_reboot) {
shutdown_requested = 1;
} else {
reset_requested = 1;
}
cpu_stop_current();
qemu_notify_event();
}
main_loop ==> main_loop_should_exit
if (qemu_reset_requested()) {
pause_all_vcpus();
cpu_synchronize_all_states();
qemu_system_reset(VMRESET_REPORT);
resume_all_vcpus();
if (runstate_check(RUN_STATE_INTERNAL_ERROR) ||
runstate_check(RUN_STATE_SHUTDOWN)) {
runstate_set(RUN_STATE_PAUSED);
}
}
......
if (qemu_vmstop_requested(&r)) {
vm_stop(r);
}
This handles the reset and stop requests.
KVM_EXIT_INTERNAL_ERROR: ret = kvm_handle_internal_error(env, run);
which calls kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_INTERNAL_ERROR_DATA);
kvm_dev_ioctl_check_extension_generic (kernel, kvm_main.c) returns 1;
if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
    fprintf(stderr, "emulation failure\n");
    if (!kvm_arch_stop_on_emulation_error(env)) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
return EXCP_INTERRUPT;
}
}
The remaining cases fall through to kvm_arch_handle_exit:
int kvm_arch_handle_exit(CPUX86State *env, struct kvm_run *run)
{
switch (run->exit_reason) {
case KVM_EXIT_HLT:
DPRINTF("handle_hlt\n");
ret = kvm_handle_halt(env);
break;
case KVM_EXIT_SET_TPR:
ret = 0;
break;
case KVM_EXIT_TPR_ACCESS:
ret = kvm_handle_tpr_access(env);
break;
case KVM_EXIT_FAIL_ENTRY:
code = run->fail_entry.hardware_entry_failure_reason;
ret = -1;
break;
case KVM_EXIT_EXCEPTION:
ret = -1;
break;
case KVM_EXIT_DEBUG:
ret = kvm_handle_debug(&run->debug.arch);
break;
default:
ret = -1;
break;
}
return ret;
}
Returning -1 leads to vm_stop.
(2) VM-Entry handling
a) Handling asynchronous CPU events
if (kvm_arch_process_async_events(env)) {
env->exit_request = 0;
return EXCP_HLT;
}
b) kvm_arch_process_async_events acts on env->interrupt_request; the cases are listed below, followed by a condensed excerpt:
CPU_INTERRUPT_POLL   call apic_poll_irq
CPU_INTERRUPT_SMI
CPU_INTERRUPT_NMI
CPU_INTERRUPT_MCE    machine check
CPU_INTERRUPT_VIRQ
CPU_INTERRUPT_INIT   kvm_cpu_synchronize_state(env); do_cpu_init(cpu);  // CPU initialization
CPU_INTERRUPT_SIPI   kvm_cpu_synchronize_state(env); do_cpu_sipi(cpu);  // deliver the SIPI to the target CPU
CPU_INTERRUPT_TPR    handle TPR access
These requests are set by QEMU's VM-Exit handlers via cpu_interrupt.
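A condensed, paraphrased excerpt of kvm_arch_process_async_events (target-i386/kvm.c of this era); only a few of the cases above are shown, and the x86_env_get_cpu helper is assumed:
int kvm_arch_process_async_events(CPUX86State *env)
{
    X86CPU *cpu = x86_env_get_cpu(env);   /* assumed helper of this era */

    if (env->interrupt_request & CPU_INTERRUPT_POLL) {
        env->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(env->apic_state);
    }
    if (env->interrupt_request & CPU_INTERRUPT_INIT) {
        kvm_cpu_synchronize_state(env);
        do_cpu_init(cpu);                 /* reset the CPU */
    }
    if (env->interrupt_request & CPU_INTERRUPT_SIPI) {
        kvm_cpu_synchronize_state(env);
        do_cpu_sipi(cpu);                 /* start the AP */
    }
    /* CPU_INTERRUPT_SMI / NMI / MCE / VIRQ / TPR handled similarly */
    return env->halted;
}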
c) If the registers need to be updated:
if (env->kvm_vcpu_dirty) {
    kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
    env->kvm_vcpu_dirty = 0;
}
d) Interrupt and NMI injection in kvm_arch_pre_run:
if (env->interrupt_request & CPU_INTERRUPT_NMI) {
    env->interrupt_request &= ~CPU_INTERRUPT_NMI;
    DPRINTF("injected NMI\n");
    ret = kvm_vcpu_ioctl(env, KVM_NMI);
    if (ret < 0) {
        fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", strerror(-ret));
    }
}
e) Call kvm_vcpu_ioctl(env, KVM_RUN, 0);
2.4.4 A Few Simple Scenarios
(1) The hlt instruction
The guest OS executes a hlt instruction;
a VM-Exit occurs and the VMM calls handle_halt:
static int handle_halt(struct kvm_vcpu *vcpu)
{
        skip_emulated_instruction(vcpu);  /* advance rip */
        return kvm_emulate_halt(vcpu);    /* emulate the hlt instruction */
}
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        unsigned long rip;
        rip = kvm_rip_read(vcpu);                     /* read the current rip */
        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);  /* rip += length of the instruction that caused the VM-Exit */
        kvm_rip_write(vcpu, rip);
        vmx_set_interrupt_shadow(vcpu, 0);
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
return 0;
}
}
After returning to QEMU:
kvm_arch_handle_exit ==> kvm_handle_halt
static int kvm_handle_halt(CPUX86State *env)
{
    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
          (env->eflags & IF_MASK)) &&
        !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
env->halted = 1;
return EXCP_HLT;
}
return 0;
}
kvm_cpu_exec then does not perform another VM-Entry; instead it returns EXCP_HLT:
if (kvm_arch_process_async_events(env)) {
env->exit_request = 0;
return EXCP_HLT;
}
(2) Triple Fault
handle_triple_fault
static int handle_triple_fault(struct kvm_vcpu *vcpu)
{
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
(3) MSR
handle_rdmsr ==>
vmx_get_msr(vcpu, ecx, &data)
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
skip_emulated_instruction
vmx_get_msr ==> two cases (a condensed sketch follows below):
case a: MSRs stored in the VMCS guest-state area are returned directly with vmcs_read32, e.g. GUEST_SYSENTER_CS;
case b: software-maintained values are returned via find_msr_entry or
kvm_get_msr_common.
handle_wrmsr ==> completed by vmx_set_msr.
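A condensed sketch of vmx_get_msr illustrating the two cases; only a few MSRs are shown and the details are paraphrased:
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
        u64 data;
        struct shared_msr_entry *msr;

        switch (msr_index) {
        case MSR_IA32_SYSENTER_CS:
                data = vmcs_read32(GUEST_SYSENTER_CS);  /* case a: read from the VMCS */
                break;
        case MSR_IA32_SYSENTER_EIP:
                data = vmcs_readl(GUEST_SYSENTER_EIP);
                break;
        default:
                msr = find_msr_entry(to_vmx(vcpu), msr_index);
                if (msr) {
                        data = msr->data;               /* case b: software-maintained copy */
                        break;
                }
                return kvm_get_msr_common(vcpu, msr_index, pdata);
        }

        *pdata = data;
        return 0;
}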
(4) NMI
User mode:
apic_bus_deliver ==> cpu_interrupt(apic_iter->cpu_env, CPU_INTERRUPT_NMI);
kvm_arch_pre_run ==> kvm_vcpu_ioctl(env, KVM_NMI)
Kernel mode:
kvm_vcpu_ioctl_nmi ==> kvm_inject_nmi ==> kvm_make_request(KVM_REQ_NMI, vcpu);
vcpu_enter_guest ==> process_nmi
static void process_nmi(struct kvm_vcpu *vcpu)
{
        unsigned limit = 2;
        if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
                limit = 1;
        vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
vcpu_enter_guest ==> inject_pending_event
if (vcpu->arch.nmi_pending) {
if (kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (is_guest_mode(vcpu))
return;
if (!cpu_has_virtual_nmis()) {
vmx->soft_vnmi_blocked = 1;
vmx->vnmi_blocked_time = 0;
}
++vcpu->stat.nmi_injections;
vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
After the NMI is injected, the CPU is in the blocking-by-NMI state.
vcpu_enter_guest ==> kvm_x86_ops->enable_nmi_window ==> enable_nmi_window
enable_nmi_window sets bit 22 (NMI-window exiting) of CPU_BASED_VM_EXEC_CONTROL. While this bit is 1 and NMIs are not blocked, VM-Entry immediately triggers a VM-Exit; thus, once the guest OS lifts blocking-by-NMI, the next VM-Entry causes a VM-Exit that
enters handle_nmi_window, where bit 22 of CPU_BASED_VM_EXEC_CONTROL is cleared (see the excerpt below).
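handle_nmi_window (vmx.c) clears the NMI-window exiting bit and raises KVM_REQ_EVENT so the pending NMI can be injected on the next entry:
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
        u32 cpu_based_vm_exec_control;

        /* clear CPU_BASED_VIRTUAL_NMI_PENDING (bit 22, NMI-window exiting) */
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
        ++vcpu->stat.nmi_window_exits;
        kvm_make_request(KVM_REQ_EVENT, vcpu);

        return 1;
}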