2.4 VM-Entry & VM-Exit

钮勇
2023-12-01


2.4.1 内核态VM-Entry

本节分6个步骤来分析vm-entry的流程

 

(1) vcpu->requests 处理

VM-Entry分为两种情况

a. 首次启动VCPU

b. VM-Exit后重新进入

 

上次VM-Exit时可能调用kvm_make_request设置不同的request;下次准备VM-Entry时需要处理这些request.

下表列出了kvm中的各种request,以及它们的设置函数和处理函数。

Request

类别

设置函数

处理函数

KVM_REQ_TLB_FLUSH

内存

kvm_arch_vcpu_setup==>vcpu_load kvm_mmu_flush_tlb

kvm_flush_remote_tlbs(kvm_main.c)

kvm_x86_ops->tlb_flush(vcpu)

KVM_REQ_REPORT_TPR_ACCESS

APIC

__report_tpr_access

vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS

KVM_REQ_MMU_RELOAD

内存

kvm_reload_remote_mmus

kvm_mmu_unload

KVM_REQ_TRIPLE_FAULT

异常

kvm_vcpu_ioctl_x86_set_mce

kvm_multiple_exception,handle_vmclear

vmx_inject_nmi, vmx_inject_irq

vmx_queue_exception mmu_check_root

vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;

KVM_REQ_PENDING_TIMER

Timer

apic_timer_fn

__vcpu_run

KVM_REQ_UNHALT

APIC

kvm_vcpu_block

kvm_arch_vcpu_ioctl_run

__vcpu_run

KVM_REQ_MMU_SYNC

内存

kvm_mmu_get_page

kvm_mmu_sync_roots

KVM_REQ_CLOCK_UPDATE

时钟

vcpu_enter_guest

kvmclock_cpufreq_notifier

kvm_set_guest_paused

kvm_guest_time_update

kvm_guest_time_update

KVM_REQ_DEACTIVATE_FPU

FPU

kvm_put_guest_fpu

kvm_x86_ops->fpu_deactivate

KVM_REQ_EVENT

中断,异常

kvm_set_rflags apic_set_eoi

kvmclock_cpufreq_notifier

kvm_vcpu_reset

kvm_vcpu_ioctl_interrupt

kvm_arch_async_page_not_present

kvm_arch_vcpu_ioctl_set_sregs

kvm_multiple_exception vmx_vcpu_run

kvm_vcpu_ioctl_x86_set_vcpu_events

__vmx_complete_interrupts

vmx_preemption_timer_fn

handle_nmi_window __apic_accept_irq

handle_interrupt_window

handle_tpr_below_threshold

kvm_apic_set_eoi_accelerated

kvm_apic_post_state_restore

kvm_apic_set_eoi_accelerated

kvm_apic_accept_events

inject_pending_event

enable_nmi_window

enable_irq_window

hwapic_irr_update

kvm_lapic_find_highest_irr

update_cr8_intercept

kvm_lapic_sync_to_vapic

KVM_REQ_APF_HALT

mmu

kvm_arch_async_page_not_present

try_async_pf

vcpu->arch.apf.halted = true;

KVM_REQ_STEAL_UPDATE

timer

kvm_arch_vcpu_load

kvm_set_msr_common

record_steal_time(vcpu);

KVM_REQ_NMI

nmi

kvm_inject_nmi

process_nmi

KVM_REQ_PMU

pmu

kvm_perf_overflow_intr

kvm_perf_overflow

process_pmu

KVM_REQ_PMI

pmi

kvm_perf_overflow_intr

process_pmi

KVM_REQ_MASTERCLOCK_UPDATE

timer

kvm_track_tsc_matching

 

KVM_REQ_MCLOCK_INPROGRESS

timer

kvm_make_mclock_inprogress_request

kvm_gen_update_masterclock

KVM_REQ_SCAN_IOAPIC

apic

kvm_make_scan_ioapic_request

vcpu_scan_ioapic

KVM_REQ_GLOBAL_CLOCK_UPDATE

timer

kvm_arch_vcpu_load

kvm_set_msr_common

kvm_gen_kvmclock_update

 

(2) kvm_x86_ops->prepare_guest_switch ==> vmx_save_host_state

 a.设置host state 区 FS GS

 b, Host MSR设置 ==> kvm_set_shared_msr

       kvm_set_shared_msr(vmx->guest_msrs[i].index,

                vmx->guest_msrs[i].data,

                vmx->guest_msrs[i].mask);

 

guest_msr在handle_wrmsr  ==> vmx_set_msr中更新

 

(3) 检测不能进入vm-entry的条件

    if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests

        || need_resched() ||signal_pending(current)) {

       vcpu->mode = OUTSIDE_GUEST_MODE;

       smp_wmb();

       local_irq_enable();

       preempt_enable();

       vcpu->srcu_idx =srcu_read_lock(&vcpu->kvm->srcu);

       r = 1;

       goto cancel_injection;

    }

    if (req_immediate_exit) //有nested event时,在inject_pending_event返回1时

       smp_send_reschedule(vcpu->cpu);

 

当vcpu处于EXITING_GUEST_MODE,或有requests,或当前线程需要调度和有signal未处理时,返回1值到上层函数__vcpu_run,做如下处理

       if (signal_pending(current)) {

           r = -EINTR;

           vcpu->run->exit_reason = KVM_EXIT_INTR;

           ++vcpu->stat.signal_exits;

       }

       if (need_resched()) {

           srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

           cond_resched();

           vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

       }

等待下次loop再调用vcpu_enter_guest。

 

(4) 当需要设置vcpu debug regs 时, 设置当前 cpu debug寄存器

       set_debugreg(0, 7);

       set_debugreg(vcpu->arch.eff_db[0], 0);

       set_debugreg(vcpu->arch.eff_db[1], 1);

       set_debugreg(vcpu->arch.eff_db[2], 2);

       set_debugreg(vcpu->arch.eff_db[3], 3);

       set_debugreg(vcpu->arch.dr6, 6);

 

#define set_debugreg(value,register)            \

    native_set_debugreg(register, value)

 

(5) 更新Guest RIP, RSP

  这里只设置了rip, rsp,其它寄存器在不同地方设置

  a. 其它通用寄存器如rax, rcx等, 在vmcs guest state区并没有对应的字段,它们在

    kvm_arch_vcpu_ioctl_set_regs时(qemu在cpu reset或resume)设置

  b. 段寄存器在vmx_vcpu_reset时设置, 代码段的例子

    seg_setup(VCPU_SREG_CS); //用kvm_vmx_segment_fields中的元素设置

    vmcs_write16(GUEST_CS_SELECTOR, 0xf000);

    vmcs_write32(GUEST_CS_BASE, 0xffff0000);

 

static const structkvm_vmx_segment_field {

    unsigned selector;

    unsigned base;

    unsigned limit;

    unsigned ar_bytes;

} kvm_vmx_segment_fields[] ={

    VMX_SEGMENT_FIELD(CS),

    。。。。。。,

};

 

(6) 进入 VM-Entry

 

2.4.2  内核态VM-Exit

(1) vmx_vcpu_run 中vm-exit流程

    vmx->idt_vectoring_info =vmcs_read32(IDT_VECTORING_INFO_FIELD);

    vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);

 

a)  vmx_complete_atomic_exit获得nmi 或machine check引发的vm-exit信息

vmx->exit_intr_info =vmcs_read32(VM_EXIT_INTR_INFO);

exit_intr_info = vmx->exit_intr_info;

 

/* Handle machine checks before interruptsare enabled */

if (is_machine_check(exit_intr_info))

kvm_machine_check(); //处理machine check

 

/* We need to handle NMIs before interruptsare enabled */

if ((exit_intr_info &INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&

(exit_intr_info &INTR_INFO_VALID_MASK)) {

kvm_before_handle_nmi(&vmx->vcpu);

asm("int $2");

kvm_after_handle_nmi(&vmx->vcpu);

   }

 

b)  vmx_recover_nmi_blocking处理virtual nmi

virtual nmi是通过事件注入方式产生的nmi

exit_intr_info =vmcs_read32(VM_EXIT_INTR_INFO);

unblock_nmi = (exit_intr_info &INTR_INFO_UNBLOCK_NMI) != 0;

vector = exit_intr_info &INTR_INFO_VECTOR_MASK;

 

if ((exit_intr_info &INTR_INFO_VALID_MASK) && unblock_nmi &&

vector != DF_VECTOR&& !idtv_info_valid) //block by nmi被解除

vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,

GUEST_INTR_STATE_NMI); //重新置位blocking by nmi(iret未执行完就发生vm-exit,需恢复阻塞状态)

else

vmx->nmi_known_unmasked =

!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)

& GUEST_INTR_STATE_NMI);

 

x86/x64 有4种中断阻塞状态

1. Blocking by STI

若sti指令执行前eflags.IF=0,则下条指令执行完前如果发生外部中断将会阻塞;下条指令执行完后将解除阻塞。例子:

sti

ret //ret 执行完后才能产生中断

 

2. Blocking by mov-ss

使用mov或者pop更新ss寄存器时, 下条指令执行完前如果发生外部中断、NMI、#DB异常将会阻塞;下条指令执行完后将解除阻塞, 例子:

mov ss, ax

mov esp, kernel_stack //存在blocking by mov-ss

mov ax, 0 //无阻塞状态

 

3. Blocking by SMI

处理器切换到smm模式执行时,新的smi将会被阻塞

 

4. Blocking by NMI

处理器响应nmi请求时,nmi服务例程得到delivery后将阻塞另一个nmi.

注意,当nmi exiting =1时,一个nmi将产生vm-exit,nmi并未delivery,所以此时 blocking by nmi is 0. 当vmm注入nmi时, nmi通过guest-idt进行delivery,cpu处于blocking by nmi.

对于blocking by nmi的引起,virtual-nmi和nmi完全等价。 当nmi服务例程执行iret返回后,blocking by nmi解除。 如果这个iret指令的执行引起vm-exit,vm-exit interruption information bit12将为1,表明阻塞状态解除。

 

static void vmcs_set_bits(unsigned longfield, u32 mask)

{

  vmcs_writel(field,vmcs_readl(field) | mask);

}

 

c)  __vmx_complete_interrupts获取与预处理中断导致的vm-exit

 

(2) vcpu_enter_guest

//当guest_debug未开启, 且exit_reason 为EXIT_REASON_DR_ACCESS时,

    if(unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {

       int i;

       kvm_x86_ops->sync_dirty_debug_regs(vcpu);

       for (i = 0; i < KVM_NR_DB_REGS; i++)

           vcpu->arch.eff_db[i] = vcpu->arch.db[i];

    }

   备份vcpu的db寄存器,当vm-Entry时载入

    vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,

                           native_read_tsc());

 

    kvm_x86_ops->handle_external_intr(vcpu)= vmx_handle_external_intr

 

    if (vcpu->arch.apic_attention)

       kvm_lapic_sync_from_vapic(vcpu);

 

   kvm_x86_ops->handle_exit(vcpu)= vmx_handle_exit;

 

(3) vmx_handle_exit

a)  调用guest_state_valid判断guest state,如果不合法(段寄存器base,limit,attr等)则调用handle_invalid_guest_state处理,并返回1。当vm-entry直接失败(failed)时,vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY,返回0

b)  由向量引起vm-exit,但exit_reason不为异常,nmi,ept violation 和task_switch时:

vcpu->run->exit_reason =KVM_EXIT_INTERNAL_ERROR;

vcpu->run->internal.suberror =KVM_INTERNAL_ERROR_DELIVERY_EV; 并返回0

 

c)  调用kvm_vmx_exit_handlers[exit_reason]处理kvm_vmx_exit_handlers 支持的vm-exit,其它类型设置vcpu->run->exit_reason = KVM_EXIT_UNKNOWN

 

d)  当handler的返回值<= 0时, 返回到上层函数后内核态的处理将结束,进入qemu(kvm_cpu_exec)继续处理

当handler的返回值 > 0时, 内核态处理完成后,将准备下一次vm-entry.

 

2.4.3  Qemu Vm-Entry &VM-Exit分析

(1) VM-EXIT处理

内核态返回到用户态kvm_cpu_exec中

    a. kvm_arch_post_run  处理apic

    b. switch (run->exit_reason) 根据exit原因做不同处理

    KVM_EXIT_IO, KVM_EXIT_MMIO 将在3,4章中分析

   

    KVM_EXIT_UNKNOWN时,kvm_cpu_exec将返回vm_stop

    void vm_stop(RunState state)

    {

       if(!qemu_thread_is_self(&io_thread)) {

           qemu_system_vmstop_request(state);

              cpu_stop_current();

           return;

    }

    do_vm_stop(state);

}

当返回到qemu_kvm_cpu_thread_fn后循环结束。

 

KVM_EXIT_IRQ_WINDOW_OPEN  return EXCP_INTERRUPT;

KVM_EXIT_SHUTDOWN:call qemu_system_reset_request

voidqemu_system_reset_request(void)

{

    if (no_reboot) {

        shutdown_requested = 1;

    } else {

        reset_requested = 1;

    }

    cpu_stop_current();

    qemu_notify_event();

}

 

main_loop==>main_loop_should_exit

    if (qemu_reset_requested()) {

        pause_all_vcpus();

        cpu_synchronize_all_states();

        qemu_system_reset(VMRESET_REPORT);

        resume_all_vcpus();

        if(runstate_check(RUN_STATE_INTERNAL_ERROR) ||

            runstate_check(RUN_STATE_SHUTDOWN)) {

            runstate_set(RUN_STATE_PAUSED);

        }

    }

......

   if (qemu_vmstop_requested(&r)) {

        vm_stop(r);

    }

处理 reset与stop

 

KVM_EXIT_INTERNAL_ERROR: ret = kvm_handle_internal_error(env, run);

  call kvm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_INTERNAL_ERROR_DATA);

       kvm_dev_ioctl_check_extension_generic(kernel kvm_main.c)return 1;

          

    if (run->internal.suberror ==KVM_INTERNAL_ERROR_EMULATION) {

        fprintf(stderr, "emulation failure\n");

        if(!kvm_arch_stop_on_emulation_error(env)) {

            cpu_dump_state(env, stderr,fprintf, CPU_DUMP_CODE);

            return EXCP_INTERRUPT;

        }

    }

 

剩余case 则

intkvm_arch_handle_exit(CPUX86State *env, struct kvm_run *run)

{

 

    switch (run->exit_reason) {

    case KVM_EXIT_HLT:

        DPRINTF("handle_hlt\n");

        ret = kvm_handle_halt(env);

        break;

    case KVM_EXIT_SET_TPR:

        ret = 0;

        break;

    case KVM_EXIT_TPR_ACCESS:

        ret = kvm_handle_tpr_access(env);

        break;

    case KVM_EXIT_FAIL_ENTRY:

        code =run->fail_entry.hardware_entry_failure_reason;

        ret = -1;

        break;

    case KVM_EXIT_EXCEPTION:

        ret = -1;

        break;

    case KVM_EXIT_DEBUG:

        ret =kvm_handle_debug(&run->debug.arch);

        break;

    default:

        ret = -1;

        break;

    }

 

    return ret;

}

 

返回-1导致vm_stop

 

(2) VM-ENTRY处理

a)  处理cpu异步事件

if (kvm_arch_process_async_events(env)) {

env->exit_request = 0;

return EXCP_HLT;

}

 

b)  kvm_arch_process_async_events根据env->interrupt_request,做不同处理,case如下:

CPU_INTERRUPT_POLL      call apic_poll_irq

CPU_INTERRUPT_SMI     

CPU_INTERRUPT_NMI      

CPU_INTERRUPT_MCE       machine check

CPU_INTERRUPT_VIRQ     

CPU_INTERRUPT_INIT             kvm_cpu_synchronize_state(env);

do_cpu_init(cpu); //cpu初始化

CPU_INTERRUPT_SIPI              kvm_cpu_synchronize_state(env);

do_cpu_sipi(cpu); 向不同cpu发sipi

CPU_INTERRUPT_TPR            handle tpr access

这些request由qemu的vm_exit handler调用cpu_interrupt设置

 

c)  如果寄存器需要更新

if (env->kvm_vcpu_dirty) {

kvm_arch_put_registers(env,KVM_PUT_RUNTIME_STATE);

   env->kvm_vcpu_dirty= 0;

   }

 

d)  中断与NMI注入kvm_arch_pre_run

if (env->interrupt_request &CPU_INTERRUPT_NMI) {

env->interrupt_request &=~CPU_INTERRUPT_NMI;

DPRINTF("injected NMI\n");

ret = kvm_vcpu_ioctl(env, KVM_NMI);

if (ret < 0) {

fprintf(stderr, "KVM:injection failed, NMI lost (%s)\n", strerror(-ret));

    }

}

 

e)  call kvm_vcpu_ioctl(env,KVM_RUN, 0);

 

2.4.4  几个简单情景分析

(1) Halt指令

    guest os 执行 hlt指令

    vm-exit and vmm call  handle_halt

static int handle_halt(structkvm_vcpu *vcpu)

{

    skip_emulated_instruction(vcpu); //更新rip

    return kvm_emulate_halt(vcpu); //模拟halt指令的执行

}

 

static voidskip_emulated_instruction(struct kvm_vcpu *vcpu)

{

    unsigned long rip;

 

    rip = kvm_rip_read(vcpu); //取当前rip

    rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); //  rip += 导致vm-exit的指令长度,

    kvm_rip_write(vcpu, rip);

    vmx_set_interrupt_shadow(vcpu, 0);

}

 

int kvm_emulate_halt(structkvm_vcpu *vcpu)

{

    ++vcpu->stat.halt_exits;

    if (irqchip_in_kernel(vcpu->kvm)) {

       vcpu->arch.mp_state = KVM_MP_STATE_HALTED;

       return 1;

    } else {

       vcpu->run->exit_reason = KVM_EXIT_HLT;

       return 0;

    }

}

返回到qemu后

kvm_arch_handle_exit ==> kvm_handle_halt

static intkvm_handle_halt(CPUX86State *env)

{

    if (!((env->interrupt_request &CPU_INTERRUPT_HARD) &&

          (env->eflags & IF_MASK))&&

        !(env->interrupt_request &CPU_INTERRUPT_NMI)) {

        env->halted = 1;

        return EXCP_HLT;

    }

 

    return 0;

}

 

kvm_cpu_exec中将不再vm-entry, 而是返回EXCP_HLT

    if (kvm_arch_process_async_events(env)) {

        env->exit_request = 0;

        return EXCP_HLT; // 

    }

 

 (2) Triple Fault

handle_triple_fault

static inthandle_triple_fault(struct kvm_vcpu *vcpu)

{

    vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;

    return 0;

}

 

(3) MSR

handle_rdmsr ==>

       vmx_get_msr(vcpu, ecx, &data)

        vcpu->arch.regs[VCPU_REGS_RAX]= data & -1u;

        vcpu->arch.regs[VCPU_REGS_RDX]= (data >> 32) & -1u;

       skip_emulated_instruction

 

vmx_get_msr ==> 分两种

    case a. 在VMCS GuestState中有存储的直接用vmcs_read32 返回,如GUEST_SYSENTER_CS

   case b.返回软件维护的值find_msr_entry

                      kvm_get_msr_common

                 

                 

handle_wrmsr==》 vmx_set_msr来完成

 

(4) NMI

用户态

apic_bus_deliver ==》 cpu_interrupt(apic_iter->cpu_env,CPU_INTERRUPT_NMI) );

kvm_arch_pre_run  ==》 kvm_vcpu_ioctl(env, KVM_NMI)

内核态

kvm_vcpu_ioctl_nmi ==> kvm_inject_nmi==> kvm_make_request(KVM_REQ_NMI, vcpu);

 

vcpu_enter_guest==》process_nmi

static voidprocess_nmi(struct kvm_vcpu *vcpu)

{

    unsigned limit = 2;

 

    if (kvm_x86_ops->get_nmi_mask(vcpu) ||vcpu->arch.nmi_injected)

       limit = 1;

 

    vcpu->arch.nmi_pending +=atomic_xchg(&vcpu->arch.nmi_queued, 0);

    vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending,limit);

    kvm_make_request(KVM_REQ_EVENT, vcpu);

}

 

vcpu_enter_guest  ==> inject_pending_event

    if (vcpu->arch.nmi_pending) {

       if (kvm_x86_ops->nmi_allowed(vcpu)) {

           --vcpu->arch.nmi_pending;

           vcpu->arch.nmi_injected = true;

           kvm_x86_ops->set_nmi(vcpu);

       }

    }

 

static voidvmx_inject_nmi(struct kvm_vcpu *vcpu)

{

    struct vcpu_vmx *vmx = to_vmx(vcpu);

 

    if (is_guest_mode(vcpu))

       return;

 

    if (!cpu_has_virtual_nmis()) {

           vmx->soft_vnmi_blocked= 1;

       vmx->vnmi_blocked_time = 0;

    }

 

    ++vcpu->stat.nmi_injections;

    vmx->nmi_known_unmasked = false;

    if (vmx->rmode.vm86_active) {

       if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) !=EMULATE_DONE)

           kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

       return;

    }

    vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,

           INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

}

注入NMI后处于blocking by nmi

 

vcpu_enter_guest  ==> kvm_x86_ops->enable_nmi_window =enable_nmi_window

Enable CPU_BASED_VM_EXEC_CONTROL bit[22](NMI-window exiting)。当该位为1时,在没有nmi阻塞的情况下,vm-entry后会直接引发vm-exit;因此当guest os解除blocking by nmi后, 再次vm-entry时会触发vm-exit,此时

进入 handle_nmi_window, CPU_BASED_VM_EXEC_CONTROL bit[22]被清0。

 类似资料: