* derived from drivers/kvm/kvm_main.c
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/clocksource.h>
+#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/intel-iommu.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
{ "fpu_reload", VCPU_STAT(fpu_reload) },
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+ { "irq_injections", VCPU_STAT(irq_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+ { "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
};
-
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return;
}
hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
- __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+ __func__, tsc_khz, hv_clock->tsc_shift,
hv_clock->tsc_to_system_mul);
}
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
__func__, data);
break;
+ case MSR_IA32_DEBUGCTLMSR:
+ if (!data) {
+ /* We support the non-activated case already */
+ break;
+ } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+ /* Values other than LBR and BTF are vendor-specific,
+ thus reserved and should throw a #GP */
+ return 1;
+ }
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
+ break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
break;
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
- down_read(¤t->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
- up_read(¤t->mm->mmap_sem);
if (is_error_page(vcpu->arch.time_page)) {
kvm_release_page_clean(vcpu->arch.time_page);
case MSR_IA32_MC0_MISC+8:
case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16:
+ case MSR_IA32_MC0_MISC+20:
case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
+ case MSR_IA32_DEBUGCTLMSR:
+ case MSR_IA32_LASTBRANCHFROMIP:
+ case MSR_IA32_LASTBRANCHTOIP:
+ case MSR_IA32_LASTINTFROMIP:
+ case MSR_IA32_LASTINTTOIP:
data = 0;
break;
case MSR_MTRRcap:
case KVM_CAP_PV_MMU:
r = !tdp_enabled;
break;
+ case KVM_CAP_IOMMU:
+ r = intel_iommu_found();
+ break;
default:
r = 0;
break;
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
+ struct kvm_lapic_state *lapic = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
- struct kvm_lapic_state lapic;
+ lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- memset(&lapic, 0, sizeof lapic);
- r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &lapic, sizeof lapic))
+ if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
}
case KVM_SET_LAPIC: {
- struct kvm_lapic_state lapic;
-
+ lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!lapic)
+ goto out;
r = -EFAULT;
- if (copy_from_user(&lapic, argp, sizeof lapic))
+ if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
if (r)
goto out;
r = 0;
r = -EINVAL;
}
out:
+ if (lapic)
+ kfree(lapic);
return r;
}
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -EINVAL;
+ /*
+ * This union makes it completely explicit to gcc-3.x
+ * that these two variables' stack usage should be
+ * combined, not added together.
+ */
+ union {
+ struct kvm_pit_state ps;
+ struct kvm_memory_alias alias;
+ } u;
switch (ioctl) {
case KVM_SET_TSS_ADDR:
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS: {
- struct kvm_memory_alias alias;
-
+ case KVM_SET_MEMORY_ALIAS:
r = -EFAULT;
- if (copy_from_user(&alias, argp, sizeof alias))
+ if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+ r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
if (r)
goto out;
break;
- }
case KVM_CREATE_IRQCHIP:
r = -ENOMEM;
kvm->arch.vpic = kvm_create_pic(kvm);
goto out;
if (irqchip_in_kernel(kvm)) {
mutex_lock(&kvm->lock);
- if (irq_event.irq < 16)
- kvm_pic_set_irq(pic_irqchip(kvm),
- irq_event.irq,
- irq_event.level);
- kvm_ioapic_set_irq(kvm->arch.vioapic,
- irq_event.irq,
- irq_event.level);
+ kvm_set_irq(kvm, irq_event.irq, irq_event.level);
mutex_unlock(&kvm->lock);
r = 0;
}
}
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto get_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+ goto get_irqchip_out;
+ r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
- goto out;
+ goto get_irqchip_out;
r = -EFAULT;
- if (copy_to_user(argp, &chip, sizeof chip))
- goto out;
+ if (copy_to_user(argp, chip, sizeof *chip))
+ goto get_irqchip_out;
r = 0;
+ get_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip chip;
+ struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
- r = -EFAULT;
- if (copy_from_user(&chip, argp, sizeof chip))
+ r = -ENOMEM;
+ if (!chip)
goto out;
+ r = -EFAULT;
+ if (copy_from_user(chip, argp, sizeof *chip))
+ goto set_irqchip_out;
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
- goto out;
- r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+ goto set_irqchip_out;
+ r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
- goto out;
+ goto set_irqchip_out;
r = 0;
+ set_irqchip_out:
+ kfree(chip);
+ if (r)
+ goto out;
break;
}
case KVM_GET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_get_pit(kvm, &ps);
+ r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &ps, sizeof ps))
+ if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
goto out;
r = 0;
break;
}
case KVM_SET_PIT: {
- struct kvm_pit_state ps;
r = -EFAULT;
- if (copy_from_user(&ps, argp, sizeof ps))
+ if (copy_from_user(&u.ps, argp, sizeof u.ps))
goto out;
r = -ENXIO;
if (!kvm->arch.vpit)
goto out;
- r = kvm_vm_ioctl_set_pit(kvm, &ps);
+ r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
if (r)
goto out;
r = 0;
val = *(u64 *)new;
- down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(¤t->mm->mmap_sem);
kaddr = kmap_atomic(page, KM_USER0);
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
+ kvm_mmu_invlpg(vcpu, address);
return X86EMUL_CONTINUE;
}
int r;
struct decode_cache *c;
+ kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
/*
* TODO: fix x86_emulate.c to use guest_read/write_register
KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
- up_read(&vcpu->kvm->slots_lock);
- kvm_vcpu_block(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
- return -EINTR;
return 1;
} else {
vcpu->run->exit_reason = KVM_EXIT_HLT;
if (!apic || !apic->vapic_addr)
return;
- down_read(¤t->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
- up_read(¤t->mm->mmap_sem);
vcpu->arch.apic->vapic_page = page;
}
up_read(&vcpu->kvm->slots_lock);
}
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
- if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
- pr_debug("vcpu %d received sipi with vector # %x\n",
- vcpu->vcpu_id, vcpu->arch.sipi_vector);
- kvm_lapic_reset(vcpu);
- r = kvm_x86_ops->vcpu_reset(vcpu);
- if (r)
- return r;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- }
-
- down_read(&vcpu->kvm->slots_lock);
- vapic_enter(vcpu);
-
-preempted:
- if (vcpu->guest_debug.enabled)
- kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
kvm_mmu_unload(vcpu);
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
local_irq_disable();
- if (vcpu->requests || need_resched()) {
+ if (vcpu->requests || need_resched() || signal_pending(current)) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (signal_pending(current)) {
- local_irq_enable();
- preempt_enable();
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- goto out;
- }
+ if (vcpu->guest_debug.enabled)
+ kvm_x86_ops->guest_debug_pre(vcpu);
vcpu->guest_mode = 1;
/*
kvm_lapic_sync_from_vapic(vcpu);
r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+ return r;
+}
- if (r > 0) {
- if (dm_request_for_irq_injection(vcpu, kvm_run)) {
- r = -EINTR;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.request_irq_exits;
- goto out;
- }
- if (!need_resched())
- goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ int r;
+
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+ pr_debug("vcpu %d received sipi with vector # %x\n",
+ vcpu->vcpu_id, vcpu->arch.sipi_vector);
+ kvm_lapic_reset(vcpu);
+ r = kvm_x86_ops->vcpu_reset(vcpu);
+ if (r)
+ return r;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
-out:
- up_read(&vcpu->kvm->slots_lock);
- if (r > 0) {
- kvm_resched(vcpu);
- down_read(&vcpu->kvm->slots_lock);
- goto preempted;
+ down_read(&vcpu->kvm->slots_lock);
+ vapic_enter(vcpu);
+
+ r = 1;
+ while (r > 0) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ r = vcpu_enter_guest(vcpu, kvm_run);
+ else {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_vcpu_block(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+ vcpu->arch.mp_state =
+ KVM_MP_STATE_RUNNABLE;
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+ r = -EINTR;
+ }
+
+ if (r > 0) {
+ if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.request_irq_exits;
+ }
+ if (signal_pending(current)) {
+ r = -EINTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ ++vcpu->stat.signal_exits;
+ }
+ if (need_resched()) {
+ up_read(&vcpu->kvm->slots_lock);
+ kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
+ }
+ }
}
+ up_read(&vcpu->kvm->slots_lock);
post_kvm_run_save(vcpu, kvm_run);
vapic_exit(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
r = -EAGAIN;
goto out;
}
return 0;
}
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+ struct kvm_segment segvar = {
+ .base = selector << 4,
+ .limit = 0xffff,
+ .selector = selector,
+ .type = 3,
+ .present = 1,
+ .dpl = 3,
+ .db = 0,
+ .s = 1,
+ .l = 0,
+ .g = 0,
+ .avl = 0,
+ .unusable = 0,
+ };
+ kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+ return 0;
+}
+
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
{
struct kvm_segment kvm_seg;
+ if (!(vcpu->arch.cr0 & X86_CR0_PE))
+ return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
return 1;
kvm_seg.type |= type_bits;
pr_debug("Set back pending irq %d\n",
pending_vec);
}
+ kvm_pic_clear_isr_ack(vcpu->kvm);
}
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+ /* Older userspace won't unhalt the vcpu on reset. */
+ if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+ sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+ !(vcpu->arch.cr0 & X86_CR0_PE))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
vcpu_put(vcpu);
return 0;
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
return kvm;
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_iommu_unmap_guest(kvm);
+ kvm_free_all_assigned_devices(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS,
+ MAP_PRIVATE | MAP_ANONYMOUS,
0);
up_write(¤t->mm->mmap_sem);