kvm vm运行

来源：互联网发布：读取股票数据编辑：程序博客网时间：2024/04/30 02:09

用户空间发送KVM_RUN指令，通过vm的fd描述符进行。

static long kvm_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;

if (vcpu->kvm->mm != current->mm)
return -EIO;

if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
return -EINVAL;

#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
/*
* Special cases: vcpu ioctls that are asynchronous to vcpu execution,
* so vcpu_load() would break it.
*/
if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT)
return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif

r = vcpu_load(vcpu);
if (r)
return r;
switch (ioctl) {
case KVM_RUN:
r = -EINVAL;
if (arg)
goto out;
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
struct pid *newpid = get_task_pid(current, PIDTYPE_PID);

rcu_assign_pointer(vcpu->pid, newpid);
if (oldpid)
synchronize_rcu();
put_pid(oldpid);
}
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;

r = -ENOMEM;
kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
if (!kvm_regs)
goto out;
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
if (r)
goto out_free1;
r = -EFAULT;
if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
goto out_free1;
r = 0;
out_free1:
kfree(kvm_regs);
break;
}
case KVM_SET_REGS: {
struct kvm_regs *kvm_regs;

r = -ENOMEM;
kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
if (IS_ERR(kvm_regs)) {
r = PTR_ERR(kvm_regs);
goto out;
}
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
kfree(kvm_regs);
break;
}
case KVM_GET_SREGS: {
kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
r = -ENOMEM;
if (!kvm_sregs)
goto out;
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
goto out;
r = 0;
break;
}
case KVM_SET_SREGS: {
kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
if (IS_ERR(kvm_sregs)) {
r = PTR_ERR(kvm_sregs);
kvm_sregs = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
break;
}
case KVM_GET_MP_STATE: {
struct kvm_mp_state mp_state;

r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
goto out;
r = 0;
break;
}
case KVM_SET_MP_STATE: {
struct kvm_mp_state mp_state;

r = -EFAULT;
if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
goto out;
r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
break;
}
case KVM_TRANSLATE: {
struct kvm_translation tr;

r = -EFAULT;
if (copy_from_user(&tr, argp, sizeof(tr)))
goto out;
r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &tr, sizeof(tr)))
goto out;
r = 0;
break;
}
case KVM_SET_GUEST_DEBUG: {
struct kvm_guest_debug dbg;

r = -EFAULT;
if (copy_from_user(&dbg, argp, sizeof(dbg)))
goto out;
r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
break;
}
case KVM_SET_SIGNAL_MASK: {
struct kvm_signal_mask __user *sigmask_arg = argp;
struct kvm_signal_mask kvm_sigmask;
sigset_t sigset, *p;

p = NULL;
if (argp) {
r = -EFAULT;
if (copy_from_user(&kvm_sigmask, argp,
sizeof(kvm_sigmask)))
goto out;
r = -EINVAL;
if (kvm_sigmask.len != sizeof(sigset))
goto out;
r = -EFAULT;
if (copy_from_user(&sigset, sigmask_arg->sigset,
sizeof(sigset)))
goto out;
p = &sigset;
}
r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
break;
}
case KVM_GET_FPU: {
fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
r = -ENOMEM;
if (!fpu)
goto out;
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
goto out;
r = 0;
break;
}
case KVM_SET_FPU: {
fpu = memdup_user(argp, sizeof(*fpu));
if (IS_ERR(fpu)) {
r = PTR_ERR(fpu);
fpu = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
break;
}
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
out:
vcpu_put(vcpu);
kfree(fpu);
kfree(kvm_sregs);
return r;

}

int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct fpu *fpu = &current->thread.fpu;
int r;
sigset_t sigsaved;

fpu__activate_curr(fpu);

if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);

if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
kvm_apic_accept_events(vcpu);
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN;
goto out;
}

/* re-sync apic's tpr */
if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
}
}

if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
r = cui(vcpu);
if (r <= 0)
goto out;
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);

r = vcpu_run(vcpu);

out:
post_kvm_run_save(vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);

return r;
}

x86.c

static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;

vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);

for (;;) {
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
}

if (r <= 0)
break;

clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);

if (dm_request_for_irq_injection(vcpu) &&
kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}

kvm_check_async_pf_completion(vcpu);

if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
break;
}
if (need_resched()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
cond_resched();
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}

srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

return r;
}

static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);

bool req_immediate_exit = false;

if (vcpu->requests) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
kvm_gen_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
r = kvm_guest_time_update(vcpu);
if (unlikely(r))
goto out;
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
vcpu->arch.apf.halted = true;
r = 1;
goto out;
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
vcpu->arch.ioapic_handled_vectors)) {
vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
vcpu->run->eoi.vector =
vcpu->arch.pending_ioapic_eoi;
r = 0;
goto out;
}
}
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
kvm_vcpu_reload_apic_access_page(vcpu);
if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
r = 0;
goto out;
}
if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_HYPERV;
vcpu->run->hyperv = vcpu->arch.hyperv.exit;
r = 0;
goto out;
}

/*
* KVM_REQ_HV_STIMER has to be processed after
* KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
* depend on the guest clock being up-to-date
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
}

/*
* KVM_REQ_EVENT is not set when posted interrupts are set by
* VT-d hardware, so we have to update RVI unconditionally.
*/
if (kvm_lapic_enabled(vcpu)) {
/*
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
if (vcpu->arch.apicv_active)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
}

if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
goto out;
}

if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
else {
/* Enable NMI/IRQ window open exits if needed.
*
* SMIs have two cases: 1) they can be nested, and
* then there is nothing to do here because RSM will
* cause a vmexit anyway; 2) or the SMI can be pending
* because inject_pending_event has completed the
* injection of an IRQ or NMI from the previous vmexit,
* and then we request an immediate exit to inject the SMI.
*/
if (vcpu->arch.smi_pending && !is_smm(vcpu))
req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
}

if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
}

r = kvm_mmu_reload(vcpu);
if (unlikely(r)) {
goto cancel_injection;
}

preempt_disable();

kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
vcpu->mode = IN_GUEST_MODE;

srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);

/*
* We should set ->mode before check ->requests,
* Please see the comment in kvm_make_all_cpus_request.
* This also orders the write to mode from any reads
* to the page tables done while the VCPU is running.
* Please see the comment in kvm_flush_remote_tlbs.
*/
smp_mb__after_srcu_read_unlock();

local_irq_disable();

if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = 1;
goto cancel_injection;
}

kvm_load_guest_xcr0(vcpu);

if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
smp_send_reschedule(vcpu->cpu);
}

trace_kvm_entry(vcpu->vcpu_id);
wait_lapic_expire(vcpu);
guest_enter_irqoff();

if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
set_debugreg(vcpu->arch.dr6, 6);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}

kvm_x86_ops->run(vcpu);

/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
* can (a) read the correct value of the debug registers, (b) set
* KVM_DEBUGREG_WONT_EXIT again.
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
kvm_x86_ops->sync_dirty_debug_regs(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr6(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}

/*
* If the guest has used debug registers, at least dr7
* will be disabled while returning to the host.
* If we don't have active breakpoints in the host, we don't
* care about the messed up debug address registers. But if
* we have some of them active, restore the old state.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();

vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());

vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();

kvm_put_guest_xcr0(vcpu);

kvm_x86_ops->handle_external_intr(vcpu);

++vcpu->stat.exits;

guest_exit_irqoff();

local_irq_enable();
preempt_enable();

vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

/*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
unsigned long rip = kvm_rip_read(vcpu);
profile_hit(KVM_PROFILING, (void *)rip);
}

if (unlikely(vcpu->arch.tsc_always_catchup))
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);

r = kvm_x86_ops->handle_exit(vcpu);
return r;

cancel_injection:
kvm_x86_ops->cancel_injection(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
return r;
}

static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr4;

/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
vmx->entry_time = ktime_get();

/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
return;

if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}

if (vmx->nested.sync_shadow_vmcs) {
copy_vmcs12_to_shadow(vmx);
vmx->nested.sync_shadow_vmcs = false;
}

if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->host_state.vmcs_host_cr4 = cr4;
}

/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
* exceptions being set, but that's not correct for the guest debugging
* case. */
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);

if (vmx->guest_pkru_valid)
__write_pkru(vmx->guest_pkru);

atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();

vmx_arm_hv_timer(vcpu);

vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
"push %%" _ASM_CX " \n\t"
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 2f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
"mov %c[rax](%0), %%" _ASM_AX " \n\t"
"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t"
"mov %c[r11](%0), %%r11 \n\t"
"mov %c[r12](%0), %%r12 \n\t"
"mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
#endif
"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

/* Enter guest mode */
"jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
"jmp 2f \n\t"
"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
"2: "
/* Save guest registers, load host registers, keep flags */
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
"mov %%r11, %c[r11](%0) \n\t"
"mov %%r12, %c[r12](%0) \n\t"
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
".pushsection .rodata \n\t"
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
, "rax", "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "eax", "ebx", "edi", "esi"
#endif
);

/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (debugctlmsr)
update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
* a reasonable value ourselves.
*
* We can't defer this to vmx_load_host_state() since that function
* may be executed in interrupt context, which saves and restore segments
* around it, nullifying its effect.
*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS);
#endif

vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;

vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

vmx->loaded_vmcs->launched = 1;

vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);

/*
* eager fpu is enabled if PKEY is supported and CR4 is switched
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
if (boot_cpu_has(X86_FEATURE_OSPKE)) {
vmx->guest_pkru = __read_pkru();
if (vmx->guest_pkru != vmx->host_pkru) {
vmx->guest_pkru_valid = true;
__write_pkru(vmx->host_pkru);
} else
vmx->guest_pkru_valid = false;
}

/*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
* we did not inject a still-pending event to L1 now because of
* nested_run_pending, we need to re-enable this bit.
*/
if (vmx->nested.nested_run_pending)
kvm_make_request(KVM_REQ_EVENT, vcpu);

vmx->nested.nested_run_pending = 0;

vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}

0 0