Merge branch 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git...
author     Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 16 Oct 2008 22:36:00 +0000 (15:36 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Thu, 16 Oct 2008 22:36:00 +0000 (15:36 -0700)
* 'kvm-updates/2.6.28' of git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm: (134 commits)
  KVM: ia64: Add intel iommu support for guests.
  KVM: ia64: add directed mmio range support for kvm guests
  KVM: ia64: Make pmt table be able to hold physical mmio entries.
  KVM: Move irqchip_in_kernel() from ioapic.h to irq.h
  KVM: Separate irq ack notification out of arch/x86/kvm/irq.c
  KVM: Change is_mmio_pfn to kvm_is_mmio_pfn, and make it common for all archs
  KVM: Move device assignment logic to common code
  KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/
  KVM: VMX: enable invlpg exiting if EPT is disabled
  KVM: x86: Silence various LAPIC-related host kernel messages
  KVM: Device Assignment: Map mmio pages into VT-d page table
  KVM: PIC: enhance IPI avoidance
  KVM: MMU: add "oos_shadow" parameter to disable oos
  KVM: MMU: speed up mmu_unsync_walk
  KVM: MMU: out of sync shadow core
  KVM: MMU: mmu_convert_notrap helper
  KVM: MMU: awareness of new kvm_mmu_zap_page behaviour
  KVM: MMU: mmu_parent_walk
  KVM: x86: trap invlpg
  KVM: MMU: sync roots on mmu reload
  ...

63 files changed:
MAINTAINERS
arch/ia64/include/asm/kvm_host.h
arch/ia64/kvm/Kconfig
arch/ia64/kvm/Makefile
arch/ia64/kvm/irq.h [new file with mode: 0644]
arch/ia64/kvm/kvm-ia64.c
arch/ia64/kvm/kvm_minstate.h
arch/ia64/kvm/optvfault.S
arch/ia64/kvm/process.c
arch/ia64/kvm/vcpu.h
arch/ia64/kvm/vmm_ivt.S
arch/ia64/kvm/vtlb.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/booke_guest.c
arch/powerpc/kvm/booke_interrupts.S
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/powerpc.c
arch/s390/Kconfig
arch/s390/kvm/priv.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/pvclock.c
arch/x86/kvm/Makefile
arch/x86/kvm/i8254.c
arch/x86/kvm/i8254.h
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_cache_regs.h [new file with mode: 0644]
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/vmx.h
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h [new file with mode: 0644]
arch/x86/kvm/x86_emulate.c
arch/x86/xen/time.c
drivers/pci/dmar.c
drivers/pci/intel-iommu.c
drivers/pci/intr_remapping.c
drivers/pci/intr_remapping.h
drivers/pci/iova.c
include/asm-x86/kvm.h
include/asm-x86/kvm_host.h
include/asm-x86/msr-index.h
include/asm-x86/pvclock.h
include/linux/dma_remapping.h [moved from drivers/pci/dma_remapping.h with 100% similarity]
include/linux/intel-iommu.h [moved from drivers/pci/intel-iommu.h with 92% similarity]
include/linux/iova.h [moved from drivers/pci/iova.h with 100% similarity]
include/linux/kvm.h
include/linux/kvm_host.h
virt/kvm/ioapic.c
virt/kvm/ioapic.h
virt/kvm/irq_comm.c [new file with mode: 0644]
virt/kvm/kvm_main.c
virt/kvm/kvm_trace.c
virt/kvm/vtd.c [new file with mode: 0644]

index 57975bda92014b4d83ad5a0fc783f09bb3f6ad05..52702b057c022fa6a3d3b4ee7f2c5984482286f9 100644 (file)
@@ -2448,7 +2448,14 @@ S:       Supported
 
 KERNEL VIRTUAL MACHINE (KVM)
 P:     Avi Kivity
-M:     avi@qumranet.com
+M:     avi@redhat.com
+L:     kvm@vger.kernel.org
+W:     http://kvm.qumranet.com
+S:     Supported
+
+KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V
+P:     Joerg Roedel
+M:     joerg.roedel@amd.com
 L:     kvm@vger.kernel.org
 W:     http://kvm.qumranet.com
 S:     Supported
index 1efe513a99415c6b0557ae6e00486ac5bd50c327..85db124d37f6a91498a02821f320138106bbc4fc 100644 (file)
 #define GPFN_IOSAPIC        (4UL << 60) /* IOSAPIC base */
 #define GPFN_LEGACY_IO      (5UL << 60) /* Legacy I/O base */
 #define GPFN_GFW        (6UL << 60) /* Guest Firmware */
-#define GPFN_HIGH_MMIO      (7UL << 60) /* High MMIO range */
+#define GPFN_PHYS_MMIO      (7UL << 60) /* Directed MMIO Range */
 
 #define GPFN_IO_MASK        (7UL << 60) /* Guest pfn is I/O type */
 #define GPFN_INV_MASK       (1UL << 63) /* Guest pfn is invalid */
@@ -413,6 +413,10 @@ struct kvm_arch {
        struct kvm_ioapic *vioapic;
        struct kvm_vm_stat stat;
        struct kvm_sal_data rdv_sal_data;
+
+       struct list_head assigned_dev_head;
+       struct dmar_domain *intel_iommu_domain;
+       struct hlist_head irq_ack_notifier_list;
 };
 
 union cpuid3_t {
index 7914e4828504595c161aee52025ed9e5f0cdaba0..8e99fed6b3fd932a61b8d2de2537710748717b2d 100644 (file)
@@ -46,4 +46,6 @@ config KVM_INTEL
 config KVM_TRACE
        bool
 
+source drivers/virtio/Kconfig
+
 endif # VIRTUALIZATION
index bf22fb9e6dcf8f04f5a289dad646ec7b8229511e..cf37f8f490c030a02fb458df21c376af4a051bdb 100644 (file)
@@ -44,7 +44,11 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 EXTRA_AFLAGS += -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-               coalesced_mmio.o)
+               coalesced_mmio.o irq_comm.o)
+
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
 obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/ia64/kvm/irq.h b/arch/ia64/kvm/irq.h
new file mode 100644 (file)
index 0000000..c6786e8
--- /dev/null
@@ -0,0 +1,31 @@
+/*
+ * irq.h: In-kernel interrupt controller related definitions
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Authors:
+ *   Xiantao Zhang <xiantao.zhang@intel.com>
+ *
+ */
+
+#ifndef __IRQ_H
+#define __IRQ_H
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       return 1;
+}
+
+#endif
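
The stub above always returns 1 because the ia64 port only supports an in-kernel ioapic; the point of giving each architecture an irq.h is that common code (see "KVM: Move irqchip_in_kernel() from ioapic.h to irq.h" in the shortlog) can gate interrupt delivery uniformly. A minimal sketch of such a caller, assuming the common kvm_set_irq() added by this series (the wrapper name here is hypothetical):

    /* Hypothetical common-code caller: inject only when the
     * interrupt controller is emulated inside the kernel. */
    static int deliver_guest_irq(struct kvm *kvm, int irq, int level)
    {
            if (!irqchip_in_kernel(kvm))
                    return -ENXIO;  /* userspace owns the irqchip */
            kvm_set_irq(kvm, irq, level);
            return 0;
    }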
index cd0d1a7284b78afbe673424b4463d3f6463621de..c0699f0e35a926936113e42d925b4fd08318c1c3 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/hrtimer.h>
 #include <linux/uaccess.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/pgtable.h>
 #include <asm/gcc_intrin.h>
@@ -45,6 +46,7 @@
 #include "iodev.h"
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 static unsigned long kvm_vmm_base;
 static unsigned long kvm_vsa_base;
@@ -179,12 +181,16 @@ int kvm_dev_ioctl_check_extension(long ext)
        switch (ext) {
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_MP_STATE:
 
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
+       case KVM_CAP_IOMMU:
+               r = intel_iommu_found();
+               break;
        default:
                r = 0;
        }
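
KVM_CAP_IOMMU is reported through the standard capability mechanism, so userspace can probe for VT-d-backed device assignment before attempting it. A minimal sketch, assuming the usual /dev/kvm handle (error handling elided):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Returns nonzero when the host IOMMU was found, i.e. when the
     * hunk above answered intel_iommu_found() for KVM_CAP_IOMMU. */
    int host_has_kvm_iommu(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IOMMU);

            close(kvm_fd);
            return r > 0;
    }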
@@ -771,6 +777,7 @@ static void kvm_init_vm(struct kvm *kvm)
         */
        kvm_build_io_pmt(kvm);
 
+       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 }
 
 struct  kvm *kvm_arch_create_vm(void)
@@ -1334,6 +1341,10 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       kvm_iommu_unmap_guest(kvm);
+#ifdef  KVM_CAP_DEVICE_ASSIGNMENT
+       kvm_free_all_assigned_devices(kvm);
+#endif
        kfree(kvm->arch.vioapic);
        kvm_release_vm_pages(kvm);
        kvm_free_physmem(kvm);
@@ -1435,17 +1446,24 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                int user_alloc)
 {
        unsigned long i;
-       struct page *page;
+       unsigned long pfn;
        int npages = mem->memory_size >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
        unsigned long base_gfn = memslot->base_gfn;
 
        for (i = 0; i < npages; i++) {
-               page = gfn_to_page(kvm, base_gfn + i);
-               kvm_set_pmt_entry(kvm, base_gfn + i,
-                               page_to_pfn(page) << PAGE_SHIFT,
-                               _PAGE_AR_RWX|_PAGE_MA_WB);
-               memslot->rmap[i] = (unsigned long)page;
+               pfn = gfn_to_pfn(kvm, base_gfn + i);
+               if (!kvm_is_mmio_pfn(pfn)) {
+                       kvm_set_pmt_entry(kvm, base_gfn + i,
+                                       pfn << PAGE_SHIFT,
+                               _PAGE_AR_RWX | _PAGE_MA_WB);
+                       memslot->rmap[i] = (unsigned long)pfn_to_page(pfn);
+               } else {
+                       kvm_set_pmt_entry(kvm, base_gfn + i,
+                                       GPFN_PHYS_MMIO | (pfn << PAGE_SHIFT),
+                                       _PAGE_MA_UC);
+                       memslot->rmap[i] = 0;
+               }
        }
 
        return 0;
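
The loop now asks kvm_is_mmio_pfn() (renamed and made common by "KVM: Change is_mmio_pfn to kvm_is_mmio_pfn" in the shortlog) whether the pfn is RAM-backed; directed-MMIO pfns instead get a GPFN_PHYS_MMIO, uncacheable pmt entry. Roughly, the common helper treats any pfn not backed by an ordinary, non-reserved struct page as MMIO; an approximate sketch:

    /* Approximate logic of the common helper in virt/kvm/kvm_main.c:
     * a pfn counts as MMIO when it is not backed by a normal,
     * non-reserved struct page. */
    int kvm_is_mmio_pfn(pfn_t pfn)
    {
            if (pfn_valid(pfn))
                    return PageReserved(pfn_to_page(pfn));
            return 1;
    }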
@@ -1789,11 +1807,43 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       return -EINVAL;
+       vcpu_load(vcpu);
+       mp_state->mp_state = vcpu->arch.mp_state;
+       vcpu_put(vcpu);
+       return 0;
+}
+
+static int vcpu_reset(struct kvm_vcpu *vcpu)
+{
+       int r;
+       long psr;
+       local_irq_save(psr);
+       r = kvm_insert_vmm_mapping(vcpu);
+       if (r)
+               goto fail;
+
+       vcpu->arch.launched = 0;
+       kvm_arch_vcpu_uninit(vcpu);
+       r = kvm_arch_vcpu_init(vcpu);
+       if (r)
+               goto fail;
+
+       kvm_purge_vmm_mapping(vcpu);
+       r = 0;
+fail:
+       local_irq_restore(psr);
+       return r;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       return -EINVAL;
+       int r = 0;
+
+       vcpu_load(vcpu);
+       vcpu->arch.mp_state = mp_state->mp_state;
+       if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
+               r = vcpu_reset(vcpu);
+       vcpu_put(vcpu);
+       return r;
 }
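
With these two hunks ia64 gains working KVM_GET_MP_STATE/KVM_SET_MP_STATE ioctls (matching the KVM_CAP_MP_STATE advertised earlier); setting KVM_MP_STATE_UNINITIALIZED takes the vcpu_reset() path above. Illustrative userspace use, error handling elided:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Reset a vcpu by pushing it back to the uninitialized state. */
    int reset_vcpu(int vcpu_fd)
    {
            struct kvm_mp_state mp = {
                    .mp_state = KVM_MP_STATE_UNINITIALIZED,
            };

            return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
    }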
index 13980d9b8bcf1542ea29756990e2ce4a3b941d9e..2cc41d17cf991855d31460c1025137589e037461 100644 (file)
 
 #define PAL_VSA_SYNC_READ                                              \
        /* begin to call pal vps sync_read */                           \
+{.mii;                                                                 \
        add r25 = VMM_VPD_BASE_OFFSET, r21;                             \
-       adds r20 = VMM_VCPU_VSA_BASE_OFFSET, r21;  /* entry point */    \
+       nop 0x0;                                                        \
+       mov r24=ip;                                                     \
        ;;                                                              \
+}                                                                      \
+{.mmb                                                                  \
+       add r24=0x20, r24;                                              \
        ld8 r25 = [r25];      /* read vpd base */                       \
-       ld8 r20 = [r20];                                                \
-       ;;                                                              \
-       add r20 = PAL_VPS_SYNC_READ,r20;                                \
-       ;;                                                              \
-{ .mii;                                                                        \
-       nop 0x0;                                                        \
-       mov r24 = ip;                                                   \
-       mov b0 = r20;                                                   \
+       br.cond.sptk kvm_vps_sync_read;         /*call the service*/    \
        ;;                                                              \
 };                                                                     \
-{ .mmb;                                                                        \
-       add r24 = 0x20, r24;                                            \
-       nop 0x0;                                                        \
-       br.cond.sptk b0;        /*  call the service */                 \
-       ;;                                                              \
-};
-
 
 
 #define KVM_MINSTATE_GET_CURRENT(reg)   mov reg=r21
index e4f15d641b22dfbc5c21ea85f4c2e2db1d54a36b..634abad979b5143953e6eae5191eff3b4a673bc1 100644 (file)
@@ -1,9 +1,12 @@
 /*
- * arch/ia64/vmx/optvfault.S
+ * arch/ia64/kvm/optvfault.S
  * optimize virtualization fault handler
  *
  * Copyright (C) 2006 Intel Co
  *     Xuefei Xu (Anthony Xu) <anthony.xu@intel.com>
+ * Copyright (C) 2008 Intel Co
+ *      Add the support for Tukwila processors.
+ *     Xiantao Zhang <xiantao.zhang@intel.com>
  */
 
 #include <asm/asmmacro.h>
 #define ACCE_MOV_TO_PSR
 #define ACCE_THASH
 
+#define VMX_VPS_SYNC_READ                      \
+       add r16=VMM_VPD_BASE_OFFSET,r21;        \
+       mov r17 = b0;                           \
+       mov r18 = r24;                          \
+       mov r19 = r25;                          \
+       mov r20 = r31;                          \
+       ;;                                      \
+{.mii;                                         \
+       ld8 r16 = [r16];                        \
+       nop 0x0;                                \
+       mov r24 = ip;                           \
+       ;;                                      \
+};                                             \
+{.mmb;                                         \
+       add r24=0x20, r24;                      \
+       mov r25 =r16;                           \
+       br.sptk.many kvm_vps_sync_read;         \
+};                                             \
+       mov b0 = r17;                           \
+       mov r24 = r18;                          \
+       mov r25 = r19;                          \
+       mov r31 = r20
+
+ENTRY(kvm_vps_entry)
+       adds r29 = VMM_VCPU_VSA_BASE_OFFSET,r21
+       ;;
+       ld8 r29 = [r29]
+       ;;
+       add r29 = r29, r30
+       ;;
+       mov b0 = r29
+       br.sptk.many b0
+END(kvm_vps_entry)
+
+/*
+ *     Inputs:
+ *     r24 : return address
+ *     r25 : vpd
+ *     r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_read)
+       movl r30 = PAL_VPS_SYNC_READ
+       ;;
+       br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_read)
+
+/*
+ *     Inputs:
+ *     r24 : return address
+ *     r25 : vpd
+ *     r29 : scratch
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_sync_write)
+       movl r30 = PAL_VPS_SYNC_WRITE
+       ;;
+       br.sptk.many kvm_vps_entry
+END(kvm_vps_sync_write)
+
+/*
+ *     Inputs:
+ *     r23 : pr
+ *     r24 : guest b0
+ *     r25 : vpd
+ *
+ */
+GLOBAL_ENTRY(kvm_vps_resume_normal)
+       movl r30 = PAL_VPS_RESUME_NORMAL
+       ;;
+       mov pr=r23,-2
+       br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_normal)
+
+/*
+ *     Inputs:
+ *     r23 : pr
+ *     r24 : guest b0
+ *     r25 : vpd
+ *     r17 : isr
+ */
+GLOBAL_ENTRY(kvm_vps_resume_handler)
+       movl r30 = PAL_VPS_RESUME_HANDLER
+       ;;
+       ld8 r27=[r25]
+       shr r17=r17,IA64_ISR_IR_BIT
+       ;;
+       dep r27=r17,r27,63,1   // bit 63 of r27 indicate whether enable CFLE
+       mov pr=r23,-2
+       br.sptk.many kvm_vps_entry
+END(kvm_vps_resume_handler)
+
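
Each kvm_vps_* wrapper loads one PAL VPS service offset into r30 and tail-branches to kvm_vps_entry, which adds the per-vcpu vsa_base and jumps there; callers pass the return address in r24 and the vpd pointer in r25. In C terms the trampoline is roughly the following (a sketch only; the helper name is hypothetical and the assembly above is authoritative):

    /* C-level picture of kvm_vps_entry: call a PAL virtualization
     * service located at vsa_base plus a per-service offset. */
    static void kvm_vps_call(struct kvm_vcpu *vcpu, unsigned long service_ofs)
    {
            void (*svc)(void) = (void *)(vcpu->arch.vsa_base + service_ofs);

            svc();  /* e.g. service_ofs == PAL_VPS_SYNC_READ */
    }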
 //mov r1=ar3
 GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 #ifndef ACCE_MOV_FROM_AR
@@ -157,11 +252,11 @@ GLOBAL_ENTRY(kvm_asm_rsm)
 #ifndef ACCE_RSM
        br.many kvm_virtualization_fault_back
 #endif
-       add r16=VMM_VPD_BASE_OFFSET,r21
+       VMX_VPS_SYNC_READ
+       ;;
        extr.u r26=r25,6,21
        extr.u r27=r25,31,2
        ;;
-       ld8 r16=[r16]
        extr.u r28=r25,36,1
        dep r26=r27,r26,21,2
        ;;
@@ -196,7 +291,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
        tbit.nz p6,p0=r23,0
        ;;
        tbit.z.or p6,p0=r26,IA64_PSR_DT_BIT
-       (p6) br.dptk kvm_resume_to_guest
+       (p6) br.dptk kvm_resume_to_guest_with_sync
        ;;
        add r26=VMM_VCPU_META_RR0_OFFSET,r21
        add r27=VMM_VCPU_META_RR0_OFFSET+8,r21
@@ -212,7 +307,7 @@ GLOBAL_ENTRY(kvm_asm_rsm)
        mov rr[r28]=r27
        ;;
        srlz.d
-       br.many kvm_resume_to_guest
+       br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_rsm)
 
 
@@ -221,11 +316,11 @@ GLOBAL_ENTRY(kvm_asm_ssm)
 #ifndef ACCE_SSM
        br.many kvm_virtualization_fault_back
 #endif
-       add r16=VMM_VPD_BASE_OFFSET,r21
+       VMX_VPS_SYNC_READ
+       ;;
        extr.u r26=r25,6,21
        extr.u r27=r25,31,2
        ;;
-       ld8 r16=[r16]
        extr.u r28=r25,36,1
        dep r26=r27,r26,21,2
        ;;  //r26 is imm24
@@ -271,7 +366,7 @@ kvm_asm_ssm_1:
        tbit.nz p6,p0=r29,IA64_PSR_I_BIT
        ;;
        tbit.z.or p6,p0=r19,IA64_PSR_I_BIT
-       (p6) br.dptk kvm_resume_to_guest
+       (p6) br.dptk kvm_resume_to_guest_with_sync
        ;;
        add r29=VPD_VTPR_START_OFFSET,r16
        add r30=VPD_VHPI_START_OFFSET,r16
@@ -286,7 +381,7 @@ kvm_asm_ssm_1:
        ;;
        cmp.gt p6,p0=r30,r17
        (p6) br.dpnt.few kvm_asm_dispatch_vexirq
-       br.many kvm_resume_to_guest
+       br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_ssm)
 
 
@@ -295,10 +390,9 @@ GLOBAL_ENTRY(kvm_asm_mov_to_psr)
 #ifndef ACCE_MOV_TO_PSR
        br.many kvm_virtualization_fault_back
 #endif
-       add r16=VMM_VPD_BASE_OFFSET,r21
-       extr.u r26=r25,13,7 //r2
+       VMX_VPS_SYNC_READ
        ;;
-       ld8 r16=[r16]
+       extr.u r26=r25,13,7 //r2
        addl r20=@gprel(asm_mov_from_reg),gp
        ;;
        adds r30=kvm_asm_mov_to_psr_back-asm_mov_from_reg,r20
@@ -374,7 +468,7 @@ kvm_asm_mov_to_psr_1:
        ;;
        tbit.nz.or p6,p0=r17,IA64_PSR_I_BIT
        tbit.z.or p6,p0=r30,IA64_PSR_I_BIT
-       (p6) br.dpnt.few kvm_resume_to_guest
+       (p6) br.dpnt.few kvm_resume_to_guest_with_sync
        ;;
        add r29=VPD_VTPR_START_OFFSET,r16
        add r30=VPD_VHPI_START_OFFSET,r16
@@ -389,13 +483,29 @@ kvm_asm_mov_to_psr_1:
        ;;
        cmp.gt p6,p0=r30,r17
        (p6) br.dpnt.few kvm_asm_dispatch_vexirq
-       br.many kvm_resume_to_guest
+       br.many kvm_resume_to_guest_with_sync
 END(kvm_asm_mov_to_psr)
 
 
 ENTRY(kvm_asm_dispatch_vexirq)
 //increment iip
+       mov r17 = b0
+       mov r18 = r31
+{.mii
+       add r25=VMM_VPD_BASE_OFFSET,r21
+       nop 0x0
+       mov r24 = ip
+       ;;
+}
+{.mmb
+       add r24 = 0x20, r24
+       ld8 r25 = [r25]
+       br.sptk.many kvm_vps_sync_write
+}
+       mov b0 =r17
        mov r16=cr.ipsr
+       mov r31 = r18
+       mov r19 = 37
        ;;
        extr.u r17=r16,IA64_PSR_RI_BIT,2
        tbit.nz p6,p7=r16,IA64_PSR_RI_BIT+1
@@ -435,25 +545,31 @@ GLOBAL_ENTRY(kvm_asm_thash)
        ;;
 kvm_asm_thash_back1:
        shr.u r23=r19,61                // get RR number
-       adds r25=VMM_VCPU_VRR0_OFFSET,r21       // get vcpu->arch.vrr[0]'s addr
+       adds r28=VMM_VCPU_VRR0_OFFSET,r21       // get vcpu->arch.vrr[0]'s addr
        adds r16=VMM_VPD_VPTA_OFFSET,r16        // get vpta
        ;;
-       shladd r27=r23,3,r25    // get vcpu->arch.vrr[r23]'s addr
+       shladd r27=r23,3,r28    // get vcpu->arch.vrr[r23]'s addr
        ld8 r17=[r16]           // get PTA
        mov r26=1
        ;;
-       extr.u r29=r17,2,6              // get pta.size
-       ld8 r25=[r27]           // get vcpu->arch.vrr[r23]'s value
+       extr.u r29=r17,2,6      // get pta.size
+       ld8 r28=[r27]           // get vcpu->arch.vrr[r23]'s value
        ;;
-       extr.u r25=r25,2,6              // get rr.ps
+       mov b0=r24
+       //Fallback to C if pta.vf is set
+       tbit.nz p6,p0=r17, 8
+       ;;
+       (p6) mov r24=EVENT_THASH
+       (p6) br.cond.dpnt.many kvm_virtualization_fault_back
+       extr.u r28=r28,2,6      // get rr.ps
        shl r22=r26,r29         // 1UL << pta.size
        ;;
-       shr.u r23=r19,r25               // vaddr >> rr.ps
+       shr.u r23=r19,r28       // vaddr >> rr.ps
        adds r26=3,r29          // pta.size + 3
        shl r27=r17,3           // pta << 3
        ;;
        shl r23=r23,3           // (vaddr >> rr.ps) << 3
-       shr.u r27=r27,r26               // (pta << 3) >> (pta.size+3)
+       shr.u r27=r27,r26       // (pta << 3) >> (pta.size+3)
        movl r16=7<<61
        ;;
        adds r22=-1,r22         // (1UL << pta.size) - 1
@@ -724,6 +840,29 @@ END(asm_mov_from_reg)
  * r31: pr
  * r24: b0
  */
+ENTRY(kvm_resume_to_guest_with_sync)
+       adds r19=VMM_VPD_BASE_OFFSET,r21
+       mov r16 = r31
+       mov r17 = r24
+       ;;
+{.mii
+       ld8 r25 =[r19]
+       nop 0x0
+       mov r24 = ip
+       ;;
+}
+{.mmb
+       add r24 =0x20, r24
+       nop 0x0
+       br.sptk.many kvm_vps_sync_write
+}
+
+       mov r31 = r16
+       mov r24 =r17
+       ;;
+       br.sptk.many kvm_resume_to_guest
+END(kvm_resume_to_guest_with_sync)
+
 ENTRY(kvm_resume_to_guest)
        adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
        ;;
index 5a33f7ed29a0af4b3410f9fa1259b839f657eefa..3417783ae16446f8f7b4347afe6e801b0b473191 100644 (file)
@@ -962,9 +962,9 @@ static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 void vmm_transition(struct kvm_vcpu *vcpu)
 {
        ia64_call_vsa(PAL_VPS_SAVE, (unsigned long)vcpu->arch.vpd,
-                       0, 0, 0, 0, 0, 0);
+                       1, 0, 0, 0, 0, 0);
        vmm_trampoline(&vcpu->arch.guest, &vcpu->arch.host);
        ia64_call_vsa(PAL_VPS_RESTORE, (unsigned long)vcpu->arch.vpd,
-                                               0, 0, 0, 0, 0, 0);
+                                               1, 0, 0, 0, 0, 0);
        kvm_do_resume_op(vcpu);
 }
index b0fcfb62c49e8a660464be9fca38c2e555ab20cc..341e3fee280cd502cbb270aaea00218548fcdbff 100644 (file)
@@ -313,21 +313,21 @@ static inline void vcpu_set_tr(struct thash_data *trp, u64 pte, u64 itir,
        trp->rid = rid;
 }
 
-extern u64 kvm_lookup_mpa(u64 gpfn);
-extern u64 kvm_gpa_to_mpa(u64 gpa);
-
-/* Return I/O type if trye */
-#define __gpfn_is_io(gpfn)                     \
-       ({                                              \
-        u64 pte, ret = 0;                      \
-        pte = kvm_lookup_mpa(gpfn);            \
-        if (!(pte & GPFN_INV_MASK))            \
-        ret = pte & GPFN_IO_MASK;      \
-        ret;                                   \
-        })
+extern u64 kvm_get_mpt_entry(u64 gpfn);
 
+/* Return I/O type if true */
+static inline u64 __gpfn_is_io(u64 gpfn)
+{
+       u64  pte;
+       pte = kvm_get_mpt_entry(gpfn);
+       if (!(pte & GPFN_INV_MASK)) {
+               pte = pte & GPFN_IO_MASK;
+               if (pte != GPFN_PHYS_MMIO)
+                       return pte;
+       }
+       return 0;
+}
 #endif
-
 #define IA64_NO_FAULT  0
 #define IA64_FAULT     1
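
The rewrite turns the old __gpfn_is_io() macro into a typed inline and, crucially, stops reporting directed-MMIO pages as emulated I/O. Bits 60-62 of the pmt entry carry the page type (GPFN_IO_MASK is 7UL << 60, with GPFN_IOSAPIC = 4, GPFN_LEGACY_IO = 5, GPFN_GFW = 6 and the new GPFN_PHYS_MMIO = 7 from the header hunk above). A hypothetical restatement, minus the invalid-bit check, for exposition:

    /* Worked example of the type encoding: a legacy-I/O gpfn yields
     * GPFN_LEGACY_IO (5UL << 60) and must be emulated; a directed-MMIO
     * gpfn yields GPFN_PHYS_MMIO (7UL << 60) and is passed straight to
     * hardware, which is why __gpfn_is_io() returns 0 for it. */
    static int gpfn_needs_emulation(u64 gpfn)
    {
            u64 type = kvm_get_mpt_entry(gpfn) & GPFN_IO_MASK;

            return type != 0 && type != GPFN_PHYS_MMIO;
    }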
 
index 3ee5f481c06dccbd62f88940cacb8f2662567c71..c1d7251a148008bcbdcb57b9b0611ab9dc6a52ee 100644 (file)
@@ -1261,11 +1261,6 @@ kvm_rse_clear_invalid:
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
     ld8 r19=[r19]        //vpsr
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r21
-    ;;
-    ld8 r20=[r20]
-    ;;
-//vsa_sync_write_start
     mov r25=r18
     adds r16= VMM_VCPU_GP_OFFSET,r21
     ;;
@@ -1274,10 +1269,7 @@ kvm_rse_clear_invalid:
     ;;
     add  r24=r24,r16
     ;;
-    add r16=PAL_VPS_SYNC_WRITE,r20
-    ;;
-    mov b0=r16
-    br.cond.sptk b0         // call the service
+    br.sptk.many  kvm_vps_sync_write       // call the service
     ;;
 END(ia64_leave_hypervisor)
 // fall through
@@ -1288,28 +1280,15 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r17:cr.isr
  *  r18:vpd
  *  r19:vpsr
- *  r20:__vsa_base
  *  r22:b0
  *  r23:predicate
  */
     mov r24=r22
     mov r25=r18
     tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+    (p1) br.cond.sptk.few kvm_vps_resume_normal
+    (p2) br.cond.sptk.many kvm_vps_resume_handler
     ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p1) br.sptk.many ia64_vmm_entry_out
-    ;;
-    tbit.nz p1,p2 = r17,IA64_ISR_IR_BIT                //p1=cr.isr.ir
-    ;;
-    (p1) add r29=PAL_VPS_RESUME_NORMAL,r20
-    (p2) add r29=PAL_VPS_RESUME_HANDLER,r20
-    (p2) ld8 r26=[r25]
-    ;;
-ia64_vmm_entry_out:
-    mov pr=r23,-2
-    mov b0=r29
-    ;;
-    br.cond.sptk b0             // call pal service
 END(ia64_vmm_entry)
 
 
@@ -1376,6 +1355,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     //set up ipsr, iip, vpd.vpsr, dcr
     // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
     // For DCR: all bits 0
+    bsw.0
+    ;;
+    mov r21 =r13
     adds r14=-VMM_PT_REGS_SIZE, r12
     ;;
     movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
@@ -1387,12 +1369,6 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ;;
     srlz.i
     ;;
-    bsw.0
-    ;;
-    mov r21 =r13
-    ;;
-    bsw.1
-    ;;
     mov ar.rsc = 0
     ;;
     flushrs
@@ -1406,12 +1382,9 @@ GLOBAL_ENTRY(vmm_reset_entry)
     ld8 r1 = [r20]
     ;;
     mov cr.iip=r4
-    ;;
     adds r16=VMM_VPD_BASE_OFFSET,r13
-    adds r20=VMM_VCPU_VSA_BASE_OFFSET,r13
     ;;
     ld8 r18=[r16]
-    ld8 r20=[r20]
     ;;
     adds r19=VMM_VPD_VPSR_OFFSET,r18
     ;;
index def4576d22b1b9d3524319d5c7dffe0fa43c24ba..e22b93361e082528812b2d56c1e2bd628f21f584 100644 (file)
@@ -390,7 +390,7 @@ void thash_purge_entries_remote(struct kvm_vcpu *v, u64 va, u64 ps)
 
 u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
 {
-       u64 ps, ps_mask, paddr, maddr;
+       u64 ps, ps_mask, paddr, maddr, io_mask;
        union pte_flags phy_pte;
 
        ps = itir_ps(itir);
@@ -398,8 +398,9 @@ u64 translate_phy_pte(u64 *pte, u64 itir, u64 va)
        phy_pte.val = *pte;
        paddr = *pte;
        paddr = ((paddr & _PAGE_PPN_MASK) & ps_mask) | (va & ~ps_mask);
-       maddr = kvm_lookup_mpa(paddr >> PAGE_SHIFT);
-       if (maddr & GPFN_IO_MASK) {
+       maddr = kvm_get_mpt_entry(paddr >> PAGE_SHIFT);
+       io_mask = maddr & GPFN_IO_MASK;
+       if (io_mask && (io_mask != GPFN_PHYS_MMIO)) {
                *pte |= VTLB_PTE_IO;
                return -1;
        }
@@ -418,7 +419,7 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
                                                u64 ifa, int type)
 {
        u64 ps;
-       u64 phy_pte;
+       u64 phy_pte, io_mask, index;
        union ia64_rr vrr, mrr;
        int ret = 0;
 
@@ -426,13 +427,16 @@ int thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
        vrr.val = vcpu_get_rr(v, ifa);
        mrr.val = ia64_get_rr(ifa);
 
+       index = (pte & _PAGE_PPN_MASK) >> PAGE_SHIFT;
+       io_mask = kvm_get_mpt_entry(index) & GPFN_IO_MASK;
        phy_pte = translate_phy_pte(&pte, itir, ifa);
 
        /* Ensure WB attribute if pte is related to a normal mem page,
         * which is required by vga acceleration since qemu maps shared
         * vram buffer with WB.
         */
-       if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT)) {
+       if (!(pte & VTLB_PTE_IO) && ((pte & _PAGE_MA_MASK) != _PAGE_MA_NAT) &&
+                       io_mask != GPFN_PHYS_MMIO) {
                pte &= ~_PAGE_MA_MASK;
                phy_pte &= ~_PAGE_MA_MASK;
        }
@@ -566,12 +570,19 @@ void thash_init(struct thash_cb *hcb, u64 sz)
        }
 }
 
-u64 kvm_lookup_mpa(u64 gpfn)
+u64 kvm_get_mpt_entry(u64 gpfn)
 {
        u64 *base = (u64 *) KVM_P2M_BASE;
        return *(base + gpfn);
 }
 
+u64 kvm_lookup_mpa(u64 gpfn)
+{
+       u64 maddr;
+       maddr = kvm_get_mpt_entry(gpfn);
+       return maddr&_PAGE_PPN_MASK;
+}
+
 u64 kvm_gpa_to_mpa(u64 gpa)
 {
        u64 pte = kvm_lookup_mpa(gpa >> PAGE_SHIFT);
index 2655e2a4831ead92bfc16f2b764ac80a2c2dbd31..34b52b7180cd52da00f31532cb956dff68923a86 100644 (file)
@@ -81,11 +81,17 @@ struct kvm_vcpu_arch {
        struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
        /* Pages which are referenced in the shadow TLB. */
        struct page *shadow_pages[PPC44x_TLB_SIZE];
-       /* Copy of the host's TLB. */
-       struct tlbe host_tlb[PPC44x_TLB_SIZE];
+
+       /* Track which TLB entries we've modified in the current exit. */
+       u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
 
        u32 host_stack;
        u32 host_pid;
+       u32 host_dbcr0;
+       u32 host_dbcr1;
+       u32 host_dbcr2;
+       u32 host_iac[4];
+       u32 host_msr;
 
        u64 fpr[32];
        u32 gpr[32];
@@ -123,7 +129,11 @@ struct kvm_vcpu_arch {
        u32 ivor[16];
        u32 ivpr;
        u32 pir;
+
+       u32 shadow_pid;
        u32 pid;
+       u32 swap_pid;
+
        u32 pvr;
        u32 ccr0;
        u32 ccr1;
index a8b068792260a56a5371fcec5845a66a634d0453..8931ba729d2b519ec4d4efb1d0851301d96e1d3a 100644 (file)
@@ -64,6 +64,10 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
 extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
+extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
+
+/* XXX Book E specific */
+extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
 
 extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
 
@@ -92,4 +96,12 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
                kvm_vcpu_block(vcpu);
 }
 
+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+       if (vcpu->arch.pid != new_pid) {
+               vcpu->arch.pid = new_pid;
+               vcpu->arch.swap_pid = 1;
+       }
+}
+
 #endif /* __POWERPC_KVM_PPC_H__ */
index 09febc582584c1a662568610cf542bf3b4759d5e..75c5dd0138fd327a96d22d0fe46b9bac899cb610 100644 (file)
@@ -359,8 +359,8 @@ int main(void)
 
        DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-       DEFINE(VCPU_HOST_TLB, offsetof(struct kvm_vcpu, arch.host_tlb));
        DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
+       DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -372,7 +372,7 @@ int main(void)
        DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5));
        DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6));
        DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7));
-       DEFINE(VCPU_PID, offsetof(struct kvm_vcpu, arch.pid));
+       DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid));
 
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
index 5a5602da50910d441f806c6b0a111fcd3e19aab3..2e227a412bc240f2bc889b0c7c1fe2e17a7bdd8c 100644 (file)
@@ -19,6 +19,7 @@
 
 #include <linux/types.h>
 #include <linux/string.h>
+#include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
 #include <asm/mmu-44x.h>
@@ -109,7 +110,6 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
        return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
 }
 
-/* Must be called with mmap_sem locked for writing. */
 static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
                                       unsigned int index)
 {
@@ -124,6 +124,11 @@ static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
        }
 }
 
+void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
+{
+    vcpu->arch.shadow_tlb_mod[i] = 1;
+}
+
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
  * the shadow TLB. */
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
@@ -142,19 +147,16 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
        stlbe = &vcpu->arch.shadow_tlb[victim];
 
        /* Get reference to new page. */
-       down_read(&current->mm->mmap_sem);
        new_page = gfn_to_page(vcpu->kvm, gfn);
        if (is_error_page(new_page)) {
                printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
                kvm_release_page_clean(new_page);
-               up_read(&current->mm->mmap_sem);
                return;
        }
        hpaddr = page_to_phys(new_page);
 
        /* Drop reference to old page. */
        kvmppc_44x_shadow_release(vcpu, victim);
-       up_read(&current->mm->mmap_sem);
 
        vcpu->arch.shadow_pages[victim] = new_page;
 
@@ -164,27 +166,30 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 
        /* XXX what about AS? */
 
-       stlbe->tid = asid & 0xff;
+       stlbe->tid = !(asid & 0xff);
 
        /* Force TS=1 for all guest mappings. */
        /* For now we hardcode 4KB mappings, but it will be important to
         * use host large pages in the future. */
        stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
                       | PPC44x_TLB_4K;
-
        stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
        stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
                                                    vcpu->arch.msr & MSR_PR);
+       kvmppc_tlbe_set_modified(vcpu, victim);
+
+       KVMTRACE_5D(STLB_WRITE, vcpu, victim,
+                       stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
+                       handler);
 }
 
 void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                            gva_t eend, u32 asid)
 {
-       unsigned int pid = asid & 0xff;
+       unsigned int pid = !(asid & 0xff);
        int i;
 
        /* XXX Replace loop with fancy data structures. */
-       down_write(&current->mm->mmap_sem);
        for (i = 0; i <= tlb_44x_hwater; i++) {
                struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
                unsigned int tid;
@@ -204,21 +209,35 @@ void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
                kvmppc_44x_shadow_release(vcpu, i);
                stlbe->word0 = 0;
+               kvmppc_tlbe_set_modified(vcpu, i);
+               KVMTRACE_5D(STLB_INVAL, vcpu, i,
+                               stlbe->tid, stlbe->word0, stlbe->word1,
+                               stlbe->word2, handler);
        }
-       up_write(&current->mm->mmap_sem);
 }
 
-/* Invalidate all mappings, so that when they fault back in they will get the
- * proper permission bits. */
+/* Invalidate all mappings on the privilege switch after PID has been changed.
+ * The guest always runs with PID=1, so we must clear the entire TLB when
+ * switching address spaces. */
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 {
        int i;
 
-       /* XXX Replace loop with fancy data structures. */
-       down_write(&current->mm->mmap_sem);
-       for (i = 0; i <= tlb_44x_hwater; i++) {
-               kvmppc_44x_shadow_release(vcpu, i);
-               vcpu->arch.shadow_tlb[i].word0 = 0;
+       if (vcpu->arch.swap_pid) {
+               /* XXX Replace loop with fancy data structures. */
+               for (i = 0; i <= tlb_44x_hwater; i++) {
+                       struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+
+                       /* Future optimization: clear only userspace mappings. */
+                       kvmppc_44x_shadow_release(vcpu, i);
+                       stlbe->word0 = 0;
+                       kvmppc_tlbe_set_modified(vcpu, i);
+                       KVMTRACE_5D(STLB_INVAL, vcpu, i,
+                                   stlbe->tid, stlbe->word0, stlbe->word1,
+                                   stlbe->word2, handler);
+               }
+               vcpu->arch.swap_pid = 0;
        }
-       up_write(&current->mm->mmap_sem);
+
+       vcpu->arch.shadow_pid = !usermode;
 }
index 6b076010213ba28248a9c81813ca0920bef43e08..53aaa66b25e5ada55b425004e7a004295747857b 100644 (file)
@@ -37,6 +37,17 @@ config KVM_BOOKE_HOST
          Provides host support for KVM on Book E PowerPC processors. Currently
          this works on 440 processors only.
 
+config KVM_TRACE
+       bool "KVM trace support"
+       depends on KVM && MARKERS && SYSFS
+       select RELAY
+       select DEBUG_FS
+       default n
+       ---help---
+         This option allows reading a trace of kvm-related events through
+         relayfs.  Note the ABI is not considered stable and will be
+         modified in future updates.
+
 source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
index 04e3449e1f427f0ee644443a6a66c233afef58fc..2a5d4397ac4b06f7545a5fc764b9da98b0ba580a 100644 (file)
@@ -4,9 +4,11 @@
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/powerpc/kvm
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
-kvm-objs := $(common-objs) powerpc.o emulate.o booke_guest.o
+common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+
+kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
index 9c8ad850c6e32ebcc78b697c8e8246e69d91dffa..7b2591e26bae3dc93dc7f002e833fd98de8288bb 100644 (file)
@@ -410,6 +410,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
        }
 
+       case BOOKE_INTERRUPT_DEBUG: {
+               u32 dbsr;
+
+               vcpu->arch.pc = mfspr(SPRN_CSRR0);
+
+               /* clear IAC events in DBSR register */
+               dbsr = mfspr(SPRN_DBSR);
+               dbsr &= DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4;
+               mtspr(SPRN_DBSR, dbsr);
+
+               run->exit_reason = KVM_EXIT_DEBUG;
+               r = RESUME_HOST;
+               break;
+       }
+
        default:
                printk(KERN_EMERG "exit_nr %d\n", exit_nr);
                BUG();
@@ -471,6 +486,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        vcpu->arch.msr = 0;
        vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
 
+       vcpu->arch.shadow_pid = 1;
+
        /* Eye-catching number so we know if the guest takes an interrupt
         * before it's programmed its own IVPR. */
        vcpu->arch.ivpr = 0x55550000;
index 3b653b5309b8ecc2dad5b5137163a9465408ca22..95e165baf85faee89b7dfa30e628b75778358618 100644 (file)
@@ -42,7 +42,8 @@
 #define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
 
 #define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
-                        (1<<BOOKE_INTERRUPT_DTLB_MISS))
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))
 
 #define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
                         (1<<BOOKE_INTERRUPT_DTLB_MISS))
@@ -331,51 +332,57 @@ lightweight_exit:
 
        mfspr   r3, SPRN_PID
        stw     r3, VCPU_HOST_PID(r4)
-       lwz     r3, VCPU_PID(r4)
+       lwz     r3, VCPU_SHADOW_PID(r4)
        mtspr   SPRN_PID, r3
 
-       /* Prevent all TLB updates. */
+       /* Prevent all asynchronous TLB updates. */
        mfmsr   r5
        lis     r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
        ori     r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
        andc    r6, r5, r6
        mtmsr   r6
 
-       /* Save the host's non-pinned TLB mappings, and load the guest mappings
-        * over them. Leave the host's "pinned" kernel mappings in place. */
-       /* XXX optimization: use generation count to avoid swapping unmodified
-        * entries. */
+       /* Load the guest mappings, leaving the host's "pinned" kernel mappings
+        * in place. */
        mfspr   r10, SPRN_MMUCR                 /* Save host MMUCR. */
-       lis     r8, tlb_44x_hwater@ha
-       lwz     r8, tlb_44x_hwater@l(r8)
-       addi    r3, r4, VCPU_HOST_TLB - 4
-       addi    r9, r4, VCPU_SHADOW_TLB - 4
-       li      r6, 0
+       li      r5, PPC44x_TLB_SIZE
+       lis     r5, tlb_44x_hwater@ha
+       lwz     r5, tlb_44x_hwater@l(r5)
+       mtctr   r5
+       addi    r9, r4, VCPU_SHADOW_TLB
+       addi    r5, r4, VCPU_SHADOW_MOD
+       li      r3, 0
 1:
-       /* Save host entry. */
-       tlbre   r7, r6, PPC44x_TLB_PAGEID
-       mfspr   r5, SPRN_MMUCR
-       stwu    r5, 4(r3)
-       stwu    r7, 4(r3)
-       tlbre   r7, r6, PPC44x_TLB_XLAT
-       stwu    r7, 4(r3)
-       tlbre   r7, r6, PPC44x_TLB_ATTRIB
-       stwu    r7, 4(r3)
+       lbzx    r7, r3, r5
+       cmpwi   r7, 0
+       beq     3f
+
        /* Load guest entry. */
-       lwzu    r7, 4(r9)
+       mulli   r11, r3, TLBE_BYTES
+       add     r11, r11, r9
+       lwz     r7, 0(r11)
        mtspr   SPRN_MMUCR, r7
-       lwzu    r7, 4(r9)
-       tlbwe   r7, r6, PPC44x_TLB_PAGEID
-       lwzu    r7, 4(r9)
-       tlbwe   r7, r6, PPC44x_TLB_XLAT
-       lwzu    r7, 4(r9)
-       tlbwe   r7, r6, PPC44x_TLB_ATTRIB
-       /* Increment index. */
-       addi    r6, r6, 1
-       cmpw    r6, r8
-       blt     1b
+       lwz     r7, 4(r11)
+       tlbwe   r7, r3, PPC44x_TLB_PAGEID
+       lwz     r7, 8(r11)
+       tlbwe   r7, r3, PPC44x_TLB_XLAT
+       lwz     r7, 12(r11)
+       tlbwe   r7, r3, PPC44x_TLB_ATTRIB
+3:
+       addi    r3, r3, 1                       /* Increment index. */
+       bdnz    1b
+
        mtspr   SPRN_MMUCR, r10                 /* Restore host MMUCR. */
 
+       /* Clear bitmap of modified TLB entries */
+       li      r5, PPC44x_TLB_SIZE>>2
+       mtctr   r5
+       addi    r5, r4, VCPU_SHADOW_MOD - 4
+       li      r6, 0
+1:
+       stwu    r6, 4(r5)
+       bdnz    1b
+
        iccci   0, 0 /* XXX hack */
 
        /* Load some guest volatiles. */
@@ -431,6 +438,14 @@ lightweight_exit:
        oris    r3, r3, KVMPPC_MSR_MASK@h
        ori     r3, r3, KVMPPC_MSR_MASK@l
        mtsrr1  r3
+
+       /* Clear any debug events which occurred since we disabled MSR[DE].
+        * XXX This gives us a 3-instruction window in which a breakpoint
+        * intended for guest context could fire in the host instead. */
+       lis     r3, 0xffff
+       ori     r3, r3, 0xffff
+       mtspr   SPRN_DBSR, r3
+
        lwz     r3, VCPU_GPR(r3)(r4)
        lwz     r4, VCPU_GPR(r4)(r4)
        rfi
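
The rewritten lightweight-exit path replaces the unconditional save/restore of the whole host TLB with a dirty bitmap: only entries flagged in shadow_tlb_mod are written with tlbwe, and the bitmap is cleared afterwards. In pseudo-C the loop corresponds roughly to the following (tlbwe/mtspr stand in for the 440 instructions; the assembly above is authoritative):

    /* Sketch of the guest-TLB reload on lightweight exit. */
    for (i = 0; i < tlb_44x_hwater; i++) {
            struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];

            if (!vcpu->arch.shadow_tlb_mod[i])
                    continue;               /* unmodified: skip the tlbwe */
            mtspr(SPRN_MMUCR, stlbe->tid);
            tlbwe(stlbe->word0, i, PPC44x_TLB_PAGEID);
            tlbwe(stlbe->word1, i, PPC44x_TLB_XLAT);
            tlbwe(stlbe->word2, i, PPC44x_TLB_ATTRIB);
    }
    memset(vcpu->arch.shadow_tlb_mod, 0, PPC44x_TLB_SIZE);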
index 8c605d0a5488fdce0268cbbdf3055d9404028f7b..0fce4fbdc20d2015f44ce3c2c8bfcef291bc82b9 100644 (file)
@@ -170,6 +170,10 @@ static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
                kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
        }
 
+       KVMTRACE_5D(GTLB_WRITE, vcpu, index,
+                       tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
+                       handler);
+
        return EMULATE_DONE;
 }
 
@@ -504,7 +508,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        case SPRN_MMUCR:
                                vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
                        case SPRN_PID:
-                               vcpu->arch.pid = vcpu->arch.gpr[rs]; break;
+                               kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
                        case SPRN_CCR0:
                                vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
                        case SPRN_CCR1:
@@ -765,6 +769,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                break;
        }
 
+       KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
+
        if (advance)
                vcpu->arch.pc += 4; /* Advance past emulated instruction. */
 
index 53826a5f6c06d406bb7a95ebb8017f0b4d1a1185..90a6fc422b238ccbd38becc6d36546c0a7ca0cd0 100644 (file)
@@ -27,6 +27,7 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include <asm/tlbflush.h>
 
 
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
@@ -239,18 +240,114 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 }
 
+/* Note: clearing MSR[DE] just means that the debug interrupt will not be
+ * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
+ * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
+ * will be delivered as an "imprecise debug event" (which is indicated by
+ * DBSR[IDE]).
+ */
+static void kvmppc_disable_debug_interrupts(void)
+{
+       mtmsr(mfmsr() & ~MSR_DE);
+}
+
+static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
+{
+       kvmppc_disable_debug_interrupts();
+
+       mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
+       mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
+       mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
+       mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
+       mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
+       mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
+       mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
+       mtmsr(vcpu->arch.host_msr);
+}
+
+static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
+{
+       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+       u32 dbcr0 = 0;
+
+       vcpu->arch.host_msr = mfmsr();
+       kvmppc_disable_debug_interrupts();
+
+       /* Save host debug register state. */
+       vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
+       vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
+       vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
+       vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
+       vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
+       vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
+       vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
+
+       /* set registers up for guest */
+
+       if (dbg->bp[0]) {
+               mtspr(SPRN_IAC1, dbg->bp[0]);
+               dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
+       }
+       if (dbg->bp[1]) {
+               mtspr(SPRN_IAC2, dbg->bp[1]);
+               dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
+       }
+       if (dbg->bp[2]) {
+               mtspr(SPRN_IAC3, dbg->bp[2]);
+               dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
+       }
+       if (dbg->bp[3]) {
+               mtspr(SPRN_IAC4, dbg->bp[3]);
+               dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
+       }
+
+       mtspr(SPRN_DBCR0, dbcr0);
+       mtspr(SPRN_DBCR1, 0);
+       mtspr(SPRN_DBCR2, 0);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+       int i;
+
+       if (vcpu->guest_debug.enabled)
+               kvmppc_load_guest_debug_registers(vcpu);
+
+       /* Mark every guest entry in the shadow TLB entry modified, so that they
+        * will all be reloaded on the next vcpu run (instead of being
+        * demand-faulted). */
+       for (i = 0; i <= tlb_44x_hwater; i++)
+               kvmppc_tlbe_set_modified(vcpu, i);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       if (vcpu->guest_debug.enabled)
+               kvmppc_restore_host_debug_state(vcpu);
+
+       /* Don't leave guest TLB entries resident when being de-scheduled. */
+       /* XXX It would be nice to differentiate between heavyweight exit and
+        * sched_out here, since we could avoid the TLB flush for heavyweight
+        * exits. */
+       _tlbia();
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
                                     struct kvm_debug_guest *dbg)
 {
-       return -ENOTSUPP;
+       int i;
+
+       vcpu->guest_debug.enabled = dbg->enabled;
+       if (vcpu->guest_debug.enabled) {
+               for (i=0; i < ARRAY_SIZE(vcpu->guest_debug.bp); i++) {
+                       if (dbg->breakpoints[i].enabled)
+                               vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
+                       else
+                               vcpu->guest_debug.bp[i] = 0;
+               }
+       }
+
+       return 0;
 }
 
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
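
kvm_arch_vcpu_ioctl_debug_guest() now accepts up to four instruction breakpoints, which kvmppc_load_guest_debug_registers() above maps onto IAC1-IAC4 when the vcpu is loaded. Illustrative userspace call, assuming the 2.6.28-era struct kvm_debug_guest layout (error handling elided):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Arm one hardware instruction breakpoint in the guest. */
    int set_guest_breakpoint(int vcpu_fd, unsigned long addr)
    {
            struct kvm_debug_guest dbg;

            memset(&dbg, 0, sizeof(dbg));
            dbg.enabled = 1;
            dbg.breakpoints[0].enabled = 1;
            dbg.breakpoints[0].address = addr;
            return ioctl(vcpu_fd, KVM_DEBUG_GUEST, &dbg);
    }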
index 4c03049e7db9195586fe00b07a22c067dcc3f30a..bc581d8a7cd9c35995e6bb75ae1f17d101840de1 100644 (file)
@@ -565,13 +565,16 @@ config ZFCPDUMP
          Refer to <file:Documentation/s390/zfcpdump.txt> for more details on this.
 
 config S390_GUEST
-bool "s390 guest support (EXPERIMENTAL)"
+bool "s390 guest support for KVM (EXPERIMENTAL)"
        depends on 64BIT && EXPERIMENTAL
        select VIRTIO
        select VIRTIO_RING
        select VIRTIO_CONSOLE
        help
-         Select this option if you want to run the kernel under s390 linux
+         Select this option if you want to run the kernel as a guest under
+         the KVM hypervisor. This will add detection for KVM as well as a
+         virtio transport. If KVM is detected, the virtio console will be
+         the default console.
 endmenu
 
 source "net/Kconfig"
index d1faf5c544057f718c3737faf2340dfbaf3914c1..cce40ff2913bbf851059c8ea0f8d604d4eb94304 100644 (file)
@@ -157,8 +157,8 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
        int rc;
 
        vcpu->stat.instruction_stfl++;
-       facility_list &= ~(1UL<<24); /* no stfle */
-       facility_list &= ~(1UL<<23); /* no large pages */
+       /* only pass the facility bits, which we can handle */
+       facility_list &= 0xfe00fff3;
 
        rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
                           &facility_list, sizeof(facility_list));
index d02def06ca91540dd174f219eee8fe6eee6f095c..774ac4991568db3928b6741371735aa1f8b8d7ee 100644 (file)
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
        return ret;
 }
 
+/*
+ * If we don't preset lpj, there is the possibility that the guest
+ * will calibrate under heavy load - thus getting a lower lpj - and
+ * then execute its delay loops without load. This is wrong, because
+ * no delay loop could finish in the expected time.
+ * Any heuristic is subject to failure, because ultimately a large
+ * pool of guests can be running and disturb each other. So we preset
+ * lpj here.
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+       return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+       struct pvclock_vcpu_time_info *src;
+       unsigned long khz;
+       u64 lpj;
+
+       src = &per_cpu(hv_clock, 0);
+       khz = pvclock_tsc_khz(src);
+
+       lpj = ((u64)khz * 1000);
+       do_div(lpj, HZ);
+       preset_lpj = lpj;
+}
+
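
kvm_get_preset_lpj() converts the hypervisor-reported TSC rate into loops-per-jiffy. Worked numbers: with a 2 GHz TSC, khz = 2,000,000 and, at HZ = 1000, lpj = 2,000,000 * 1000 / 1000 = 2,000,000 loops per jiffy, so the delay loop is scaled from the host clock instead of being measured under whatever load happens to exist at boot.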
 static struct clocksource kvm_clock = {
        .name = "kvm-clock",
        .read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
                pv_time_ops.get_wallclock = kvm_get_wallclock;
                pv_time_ops.set_wallclock = kvm_set_wallclock;
                pv_time_ops.sched_clock = kvm_clock_read;
+               pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
 #ifdef CONFIG_X86_LOCAL_APIC
                pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
 #endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_KEXEC
                machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
+               kvm_get_preset_lpj();
                clocksource_register(&kvm_clock);
        }
 }
index 05fbe9a0325a513c81fed8359a3bc259bce54edc..4f9c55f3a7c0f7e2590a5e248428bd19aa0d5c37 100644 (file)
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
        return dst->version;
 }
 
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+       u64 pv_tsc_khz = 1000000ULL << 32;
+
+       do_div(pv_tsc_khz, src->tsc_to_system_mul);
+       if (src->tsc_shift < 0)
+               pv_tsc_khz <<= -src->tsc_shift;
+       else
+               pv_tsc_khz >>= src->tsc_shift;
+       return pv_tsc_khz;
+}
+
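
pvclock defines tsc_to_system_mul so that ns = (tsc_delta << tsc_shift) * mul / 2^32 (shifting right when tsc_shift is negative); inverting that gives khz = (10^6 * 2^32 / mul) corrected by the shift, which is exactly the computation above starting from 1000000ULL << 32. Worked example: mul = 2^31 (half a nanosecond per tick) with tsc_shift = 0 yields pv_tsc_khz = 10^6 * 2^32 / 2^31 = 2,000,000 kHz, i.e. a 2 GHz TSC.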
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
        struct pvclock_shadow_time shadow;
index d0e940bb6f401839681f79fee8367b1a27678b1a..c02343594b4de95bb79ce31bf78de03fb48ed636 100644 (file)
@@ -3,10 +3,13 @@
 #
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-                coalesced_mmio.o)
+                coalesced_mmio.o irq_comm.o)
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
+ifeq ($(CONFIG_DMAR),y)
+common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
index c0f7872a9124873ac9c01937bd4400ca2f3fe47b..634132a9a512391d324def8826390a709d257c80 100644 (file)
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 
        if (!atomic_inc_and_test(&pt->pending))
                set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-       if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
-               vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+       if (vcpu0 && waitqueue_active(&vcpu0->wq))
                wake_up_interruptible(&vcpu0->wq);
-       }
 
        pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
        pt->scheduled = ktime_to_ns(pt->timer.expires);
+       if (pt->period)
+               ps->channels[0].count_load_time = pt->timer.expires;
 
        return (pt->period == 0 ? 0 : 1);
 }
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
        struct kvm_pit *pit = vcpu->kvm->arch.vpit;
 
-       if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending)
+       if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
                return atomic_read(&pit->pit_state.pit_timer.pending);
-
        return 0;
 }
 
+static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+       struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
+                                                irq_ack_notifier);
+       spin_lock(&ps->inject_lock);
+       if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+               atomic_inc(&ps->pit_timer.pending);
+       ps->irq_ack = 1;
+       spin_unlock(&ps->inject_lock);
+}
+
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
        struct kvm_kpit_state *ps;
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
        hrtimer_cancel(&pt->timer);
 }
 
-static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
+       struct kvm_kpit_timer *pt = &ps->pit_timer;
        s64 interval;
 
        interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
        pt->period = (is_period == 0) ? 0 : interval;
        pt->timer.function = pit_timer_fn;
        atomic_set(&pt->pending, 0);
+       ps->irq_ack = 1;
 
        hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
                      HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
        case 1:
         /* FIXME: enhance mode 4 precision */
        case 4:
-               create_pit_timer(&ps->pit_timer, val, 0);
+               create_pit_timer(ps, val, 0);
                break;
        case 2:
        case 3:
-               create_pit_timer(&ps->pit_timer, val, 1);
+               create_pit_timer(ps, val, 1);
                break;
        default:
                destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
        mutex_unlock(&pit->pit_state.lock);
 
        atomic_set(&pit->pit_state.pit_timer.pending, 0);
-       pit->pit_state.inject_pending = 1;
+       pit->pit_state.irq_ack = 1;
 }
 
 struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
 
        mutex_init(&pit->pit_state.lock);
        mutex_lock(&pit->pit_state.lock);
+       spin_lock_init(&pit->pit_state.inject_lock);
 
        /* Initialize PIO device */
        pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
        pit_state->pit = pit;
        hrtimer_init(&pit_state->pit_timer.timer,
                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       pit_state->irq_ack_notifier.gsi = 0;
+       pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
+       kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
        mutex_unlock(&pit->pit_state.lock);
 
        kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
        mutex_lock(&kvm->lock);
-       kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
-       kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
-       kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
-       kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+       kvm_set_irq(kvm, 0, 1);
+       kvm_set_irq(kvm, 0, 0);
        mutex_unlock(&kvm->lock);
 }
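[Raising GSI 0 and immediately lowering it produces a full edge, which is what both the PIC and the IOAPIC expect for the timer line; kvm_set_irq() replaces the four chip-specific calls by fanning the level change out to every irqchip. A sketch of the pattern, with a hypothetical helper name:

/* Sketch (helper name hypothetical): pulse an edge-triggered GSI
 * through the common layer, as __inject_pit_timer_intr() does above. */
static void kvm_pulse_gsi(struct kvm *kvm, int gsi)
{
        kvm_set_irq(kvm, gsi, 1);       /* assert */
        kvm_set_irq(kvm, gsi, 0);       /* deassert: completes the edge */
}
]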
 
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
        struct kvm_kpit_state *ps;
 
        if (vcpu && pit) {
+               int inject = 0;
                ps = &pit->pit_state;
 
-               /* Try to inject pending interrupts when:
-                * 1. Pending exists
-                * 2. Last interrupt was accepted or waited for too long time*/
-               if (atomic_read(&ps->pit_timer.pending) &&
-                   (ps->inject_pending ||
-                   (jiffies - ps->last_injected_time
-                               >= KVM_MAX_PIT_INTR_INTERVAL))) {
-                       ps->inject_pending = 0;
-                       __inject_pit_timer_intr(kvm);
-                       ps->last_injected_time = jiffies;
-               }
-       }
-}
-
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
-{
-       struct kvm_arch *arch = &vcpu->kvm->arch;
-       struct kvm_kpit_state *ps;
-
-       if (vcpu && arch->vpit) {
-               ps = &arch->vpit->pit_state;
-               if (atomic_read(&ps->pit_timer.pending) &&
-               (((arch->vpic->pics[0].imr & 1) == 0 &&
-                 arch->vpic->pics[0].irq_base == vec) ||
-                 (arch->vioapic->redirtbl[0].fields.vector == vec &&
-                 arch->vioapic->redirtbl[0].fields.mask != 1))) {
-                       ps->inject_pending = 1;
-                       atomic_dec(&ps->pit_timer.pending);
-                       ps->channels[0].count_load_time = ktime_get();
+               /* Try to inject pending interrupts only when
+                * the last one has been acked.
+                */
+               spin_lock(&ps->inject_lock);
+               if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
+                       ps->irq_ack = 0;
+                       inject = 1;
                }
+               spin_unlock(&ps->inject_lock);
+               if (inject)
+                       __inject_pit_timer_intr(kvm);
        }
 }
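[Note the shape of the rewritten injector: the decision (pending && acked) is made under the spinlock, but the injection itself happens after the lock is dropped, since __inject_pit_timer_intr() takes the kvm->lock mutex and a mutex may not be acquired inside a spinlock. A condensed sketch of that ordering:

/* Sketch: decide under the spinlock, act outside it. */
spin_lock(&ps->inject_lock);
inject = atomic_read(&ps->pit_timer.pending) && ps->irq_ack;
if (inject)
        ps->irq_ack = 0;                /* claim this injection slot */
spin_unlock(&ps->inject_lock);
if (inject)
        __inject_pit_timer_intr(kvm);   /* takes kvm->lock, may sleep */
]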
index db25c2a6c8c4a612afedd77a9c93d3378230ab33..e436d4983aa15e349eec174ed1643435060305e2 100644 (file)
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
        int irq;
        s64 period; /* unit: ns */
        s64 scheduled;
-       ktime_t last_update;
        atomic_t pending;
 };
 
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
        u32    speaker_data_on;
        struct mutex lock;
        struct kvm_pit *pit;
-       bool inject_pending; /* if inject pending interrupts */
-       unsigned long last_injected_time;
+       spinlock_t inject_lock;
+       unsigned long irq_ack;
+       struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
 struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
 #define KVM_PIT_CHANNEL_MASK       0x3
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
-void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm);
 void kvm_free_pit(struct kvm *kvm);
index c31164e8aa46c498643b9fa953df3920d598a1d3..17e41e165f1a296cb538f48ec118db1109cf3b92 100644 (file)
 
 #include <linux/kvm_host.h>
 
+static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
+{
+       s->isr &= ~(1 << irq);
+       s->isr_ack |= (1 << irq);
+}
+
+void kvm_pic_clear_isr_ack(struct kvm *kvm)
+{
+       struct kvm_pic *s = pic_irqchip(kvm);
+       s->pics[0].isr_ack = 0xff;
+       s->pics[1].isr_ack = 0xff;
+}
+
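[isr_ack is the PIC half of the IPI-avoidance work: a set bit means the previous interrupt on that line has been serviced, so a new assertion is worth a vcpu kick; pic_irq_request() (further down) clears the bit when it kicks, and pic_clear_isr() sets it again on ack. A sketch of that test, with a hypothetical helper name:

/* Sketch (hypothetical helper; pic_irq_request() below open-codes it):
 * kick the vcpu only once per ack window on a given line. */
static bool pic_should_kick(struct kvm_kpic_state *s, int irq)
{
        if (!(s->isr_ack & (1 << irq)))
                return false;           /* last irq not yet acked: no kick */
        s->isr_ack &= ~(1 << irq);      /* claim the window */
        return true;
}
]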
 /*
  * set irq level. If an edge is detected, then the IRR is set to 1
  */
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
  */
 static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
+       s->isr |= 1 << irq;
        if (s->auto_eoi) {
                if (s->rotate_on_auto_eoi)
                        s->priority_add = (irq + 1) & 7;
-       } else
-               s->isr |= (1 << irq);
+               pic_clear_isr(s, irq);
+       }
        /*
         * We don't clear a level sensitive interrupt here
         */
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
                s->irr &= ~(1 << irq);
 }
 
-int kvm_pic_read_irq(struct kvm_pic *s)
+int kvm_pic_read_irq(struct kvm *kvm)
 {
        int irq, irq2, intno;
+       struct kvm_pic *s = pic_irqchip(kvm);
 
        irq = pic_get_irq(&s->pics[0]);
        if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
                intno = s->pics[0].irq_base + irq;
        }
        pic_update_irq(s);
+       kvm_notify_acked_irq(kvm, irq);
 
        return intno;
 }
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
+       int irq, irqbase;
+       struct kvm *kvm = s->pics_state->irq_request_opaque;
+       struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
+
+       if (s == &s->pics_state->pics[0])
+               irqbase = 0;
+       else
+               irqbase = 8;
+
+       for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+               if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+                       if (s->irr & (1 << irq) || s->isr & (1 << irq))
+                               kvm_notify_acked_irq(kvm, irq+irqbase);
+       }
        s->last_irr = 0;
        s->irr = 0;
        s->imr = 0;
        s->isr = 0;
+       s->isr_ack = 0xff;
        s->priority_add = 0;
        s->irq_base = 0;
        s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                                priority = get_priority(s, s->isr);
                                if (priority != 8) {
                                        irq = (priority + s->priority_add) & 7;
-                                       s->isr &= ~(1 << irq);
+                                       pic_clear_isr(s, irq);
                                        if (cmd == 5)
                                                s->priority_add = (irq + 1) & 7;
                                        pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                                break;
                        case 3:
                                irq = val & 7;
-                               s->isr &= ~(1 << irq);
+                               pic_clear_isr(s, irq);
                                pic_update_irq(s->pics_state);
                                break;
                        case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
                                break;
                        case 7:
                                irq = val & 7;
-                               s->isr &= ~(1 << irq);
                                s->priority_add = (irq + 1) & 7;
+                               pic_clear_isr(s, irq);
                                pic_update_irq(s->pics_state);
                                break;
                        default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
                        s->pics_state->pics[0].irr &= ~(1 << 2);
                }
                s->irr &= ~(1 << ret);
-               s->isr &= ~(1 << ret);
+               pic_clear_isr(s, ret);
                if (addr1 >> 7 || ret != 2)
                        pic_update_irq(s->pics_state);
        } else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
 {
        struct kvm *kvm = opaque;
        struct kvm_vcpu *vcpu = kvm->vcpus[0];
+       struct kvm_pic *s = pic_irqchip(kvm);
+       int irq = pic_get_irq(&s->pics[0]);
 
-       pic_irqchip(kvm)->output = level;
-       if (vcpu)
+       s->output = level;
+       if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
+               s->pics[0].isr_ack &= ~(1 << irq);
                kvm_vcpu_kick(vcpu);
+       }
 }
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm)
index 76d736b5f66464372a95979c0ada398f6f109a1d..c019b8edcdb76bab269ceb48f8bc96e02fae7817 100644 (file)
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
                if (kvm_apic_accept_pic_intr(v)) {
                        s = pic_irqchip(v->kvm);
                        s->output = 0;          /* PIC */
-                       vector = kvm_pic_read_irq(s);
+                       vector = kvm_pic_read_irq(v->kvm);
                }
        }
        return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
 {
        kvm_apic_timer_intr_post(vcpu, vec);
-       kvm_pit_timer_intr_post(vcpu, vec);
        /* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
index 7ca47cbb48bb3c2239a2ebb13975d43f5f9bac4b..f17c8f5bbf31b8bae7b1d2a0d770bd5a968a7a6a 100644 (file)
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
        u8 irr;         /* interrupt request register */
        u8 imr;         /* interrupt mask register */
        u8 isr;         /* interrupt service register */
+       u8 isr_ack;     /* interrupt ack detection */
        u8 priority_add;        /* highest irq priority */
        u8 irq_base;
        u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
        void *irq_request_opaque;
        int output;             /* intr from master PIC */
        struct kvm_io_device dev;
+       void (*ack_notifier)(void *opaque, int irq);
 };
 
 struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_pic_set_irq(void *opaque, int irq, int level);
-int kvm_pic_read_irq(struct kvm_pic *s);
+int kvm_pic_read_irq(struct kvm *kvm);
 void kvm_pic_update_irq(struct kvm_pic *s);
+void kvm_pic_clear_isr_ack(struct kvm *kvm);
 
 static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 {
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644 (file)
index 0000000..1ff819d
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef ASM_KVM_CACHE_REGS_H
+#define ASM_KVM_CACHE_REGS_H
+
+static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
+                                             enum kvm_reg reg)
+{
+       if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+               kvm_x86_ops->cache_reg(vcpu, reg);
+
+       return vcpu->arch.regs[reg];
+}
+
+static inline void kvm_register_write(struct kvm_vcpu *vcpu,
+                                     enum kvm_reg reg,
+                                     unsigned long val)
+{
+       vcpu->arch.regs[reg] = val;
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
+{
+       return kvm_register_read(vcpu, VCPU_REGS_RIP);
+}
+
+static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       kvm_register_write(vcpu, VCPU_REGS_RIP, val);
+}
+
+#endif
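[The new header gives lazy, cached access to guest registers: a read faults the value in through kvm_x86_ops->cache_reg() only if the regs_avail bit is clear, and a write marks the register both available and dirty so the vendor code can write it back later. A hedged usage sketch (function name hypothetical; this is how callers replace the old cache_regs()/decache_regs() round trip):

/* Sketch: advancing RIP with the cached accessors. */
static void skip_instruction_example(struct kvm_vcpu *vcpu, int insn_len)
{
        unsigned long rip = kvm_rip_read(vcpu);  /* faults RIP in if stale */

        kvm_rip_write(vcpu, rip + insn_len);     /* marks RIP dirty */
}
]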
index 73f43de69f676b9a784de04b4a7b5dbd43ecc3b1..6571926bfd339b498c2ca06835b71d8ead494787 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/current.h>
 #include <asm/apicdef.h>
 #include <asm/atomic.h>
+#include "kvm_cache_regs.h"
 #include "irq.h"
 
 #define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                } else
                        apic_clear_vector(vector, apic->regs + APIC_TMR);
 
-               if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
-                       kvm_vcpu_kick(vcpu);
-               else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
-                       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
-               }
+               kvm_vcpu_kick(vcpu);
 
                result = (orig_irr == 0);
                break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                        vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                        kvm_vcpu_kick(vcpu);
                } else {
-                       printk(KERN_DEBUG
-                              "Ignoring de-assert INIT to vcpu %d\n",
-                              vcpu->vcpu_id);
+                       apic_debug("Ignoring de-assert INIT to vcpu %d\n",
+                                  vcpu->vcpu_id);
                }
-
                break;
 
        case APIC_DM_STARTUP:
-               printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
-                      vcpu->vcpu_id, vector);
+               apic_debug("SIPI to vcpu %d vector 0x%02x\n",
+                          vcpu->vcpu_id, vector);
                if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
                        vcpu->arch.sipi_vector = vector;
                        vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-                       if (waitqueue_active(&vcpu->wq))
-                               wake_up_interruptible(&vcpu->wq);
+                       kvm_vcpu_kick(vcpu);
                }
                break;
 
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 static void apic_set_eoi(struct kvm_lapic *apic)
 {
        int vector = apic_find_highest_isr(apic);
-
+       int trigger_mode;
        /*
         * Not every EOI write has a corresponding ISR bit set;
         * one example is when the kernel checks the timer in setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
        apic_update_ppr(apic);
 
        if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
-               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector);
+               trigger_mode = IOAPIC_LEVEL_TRIG;
+       else
+               trigger_mode = IOAPIC_EDGE_TRIG;
+       kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
        struct kvm_run *run = vcpu->run;
 
        set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
-       kvm_x86_ops->cache_regs(vcpu);
-       run->tpr_access.rip = vcpu->arch.rip;
+       run->tpr_access.rip = kvm_rip_read(vcpu);
        run->tpr_access.is_write = write;
 }
 
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
         * Refer SDM 8.4.1
         */
        if (len != 4 || alignment) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "apic write: bad size=%d %lx\n",
-                              len, (long)address);
+               /* Don't shout too loudly; $infamous_os would only generate noise. */
+               apic_debug("apic write: bad size=%d %lx\n",
+                          len, (long)address);
                return;
        }
 
@@ -947,10 +941,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
 
        if (!atomic_inc_and_test(&apic->timer.pending))
                set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-       if (waitqueue_active(q)) {
-               apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       if (waitqueue_active(q))
                wake_up_interruptible(q);
-       }
+
        if (apic_lvtt_period(apic)) {
                result = 1;
                apic->timer.dev.expires = ktime_add_ns(
index 3da2508eb22a2d9577a0a24847065af3e21acbcc..99c239c5c0ac7becff901a5bbdd399af4ec48293 100644 (file)
@@ -70,6 +70,9 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
-struct kvm_pv_mmu_op_buffer {
-       void *ptr;
-       unsigned len;
-       unsigned processed;
-       char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
 };
 
+struct kvm_shadow_walk {
+       int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+                    u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+       int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 {
        struct vm_area_struct *vma;
        unsigned long addr;
+       int ret = 0;
 
        addr = gfn_to_hva(kvm, gfn);
        if (kvm_is_error_hva(addr))
-               return 0;
+               return ret;
 
+       down_read(&current->mm->mmap_sem);
        vma = find_vma(current->mm, addr);
        if (vma && is_vm_hugetlb_page(vma))
-               return 1;
+               ret = 1;
+       up_read(&current->mm->mmap_sem);
 
-       return 0;
+       return ret;
 }
 
 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
        if (write_protected)
                kvm_flush_remote_tlbs(kvm);
-
-       account_shadowed(kvm, gfn);
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
        BUG();
 }
 
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+                           mmu_parent_walk_fn fn)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       struct kvm_mmu_page *parent_sp;
+       int i;
+
+       if (!sp->multimapped && sp->parent_pte) {
+               parent_sp = page_header(__pa(sp->parent_pte));
+               fn(vcpu, parent_sp);
+               mmu_parent_walk(vcpu, parent_sp, fn);
+               return;
+       }
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                       if (!pte_chain->parent_ptes[i])
+                               break;
+                       parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+                       fn(vcpu, parent_sp);
+                       mmu_parent_walk(vcpu, parent_sp, fn);
+               }
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+       unsigned int index;
+       struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+       index = spte - sp->spt;
+       __set_bit(index, sp->unsync_child_bitmap);
+       sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+       struct kvm_pte_chain *pte_chain;
+       struct hlist_node *node;
+       int i;
+
+       if (!sp->parent_pte)
+               return;
+
+       if (!sp->multimapped) {
+               kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+               return;
+       }
+
+       hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+               for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+                       if (!pte_chain->parent_ptes[i])
+                               break;
+                       kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+               }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       sp->unsync_children = 1;
+       kvm_mmu_update_parents_unsync(sp);
+       return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+                                       struct kvm_mmu_page *sp)
+{
+       mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+       kvm_mmu_update_parents_unsync(sp);
+}
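[mmu_parent_walk() climbs from a shadow page toward the roots, via the single parent_pte fast path or the pte chain list, calling fn on every ancestor; kvm_mmu_mark_parents_unsync() uses it so unsync_child_bitmap bits get set all the way up, letting the top-down mmu_unsync_walk() (below) find out-of-sync subtrees. A trivial walker to show the contract (hypothetical; as written above, the walk ignores fn's return value):

/* Sketch: a do-nothing mmu_parent_walk_fn. */
static int touch_ancestor_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
        pgprintk("ancestor at gfn %lx\n", sp->gfn); /* just observe it */
        return 1;                       /* ignored by mmu_parent_walk() */
}
]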
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
                                    struct kvm_mmu_page *sp)
 {
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
                sp->spt[i] = shadow_trap_nonpresent_pte;
 }
 
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+                              struct kvm_mmu_page *sp)
+{
+       return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx)          \
+       for (idx = find_first_bit(bitmap, 512);         \
+            idx < 512;                                 \
+            idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+                          struct kvm_unsync_walk *walker)
+{
+       int i, ret;
+
+       if (!sp->unsync_children)
+               return 0;
+
+       for_each_unsync_children(sp->unsync_child_bitmap, i) {
+               u64 ent = sp->spt[i];
+
+               if (is_shadow_present_pte(ent)) {
+                       struct kvm_mmu_page *child;
+                       child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+                       if (child->unsync_children) {
+                               ret = mmu_unsync_walk(child, walker);
+                               if (ret)
+                                       return ret;
+                               __clear_bit(i, sp->unsync_child_bitmap);
+                       }
+
+                       if (child->unsync) {
+                               ret = walker->entry(child, walker);
+                               __clear_bit(i, sp->unsync_child_bitmap);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+       }
+
+       if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+               sp->unsync_children = 0;
+
+       return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
        unsigned index;
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
        return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       WARN_ON(!sp->unsync);
+       sp->unsync = 0;
+       --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+               return 1;
+       }
+
+       rmap_write_protect(vcpu->kvm, sp->gfn);
+       if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+               kvm_mmu_zap_page(vcpu->kvm, sp);
+               return 1;
+       }
+
+       kvm_mmu_flush_tlb(vcpu);
+       kvm_unlink_unsync_page(vcpu->kvm, sp);
+       return 0;
+}
+
+struct sync_walker {
+       struct kvm_vcpu *vcpu;
+       struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+       struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+                                                    walker);
+       struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+       kvm_sync_page(vcpu, sp);
+       return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       struct sync_walker walker = {
+               .walker = { .entry = mmu_sync_fn, },
+               .vcpu = vcpu,
+       };
+
+       while (mmu_unsync_walk(sp, &walker.walker))
+               cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
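[mmu_sync_fn() returns nonzero exactly when the caller should breathe (need_resched() or contention on mmu_lock), so mmu_sync_children() can drop and retake the lock between batches via cond_resched_lock(). The general idiom, sketched with a hypothetical work predicate:

/* Sketch: bounded batches under a spinlock, yielding between them. */
spin_lock(&kvm->mmu_lock);
while (more_work_pending())             /* hypothetical predicate */
        cond_resched_lock(&kvm->mmu_lock);
spin_unlock(&kvm->mmu_lock);
]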
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                             gfn_t gfn,
                                             gva_t gaddr,
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        unsigned quadrant;
        struct hlist_head *bucket;
        struct kvm_mmu_page *sp;
-       struct hlist_node *node;
+       struct hlist_node *node, *tmp;
 
        role.word = 0;
        role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                 gfn, role.word);
        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-       hlist_for_each_entry(sp, node, bucket, hash_link)
-               if (sp->gfn == gfn && sp->role.word == role.word) {
+       hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+               if (sp->gfn == gfn) {
+                       if (sp->unsync)
+                               if (kvm_sync_page(vcpu, sp))
+                                       continue;
+
+                       if (sp->role.word != role.word)
+                               continue;
+
                        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+                       if (sp->unsync_children) {
+                               set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+                               kvm_mmu_mark_parents_unsync(vcpu, sp);
+                       }
                        pgprintk("%s: found\n", __func__);
                        return sp;
                }
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        sp->gfn = gfn;
        sp->role = role;
        hlist_add_head(&sp->hash_link, bucket);
-       if (!metaphysical)
+       if (!metaphysical) {
                rmap_write_protect(vcpu->kvm, gfn);
+               account_shadowed(vcpu->kvm, gfn);
+       }
        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
                vcpu->arch.mmu.prefetch_page(vcpu, sp);
        else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        return sp;
 }
 
+static int walk_shadow(struct kvm_shadow_walk *walker,
+                      struct kvm_vcpu *vcpu, u64 addr)
+{
+       hpa_t shadow_addr;
+       int level;
+       int r;
+       u64 *sptep;
+       unsigned index;
+
+       shadow_addr = vcpu->arch.mmu.root_hpa;
+       level = vcpu->arch.mmu.shadow_root_level;
+       if (level == PT32E_ROOT_LEVEL) {
+               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+               shadow_addr &= PT64_BASE_ADDR_MASK;
+               --level;
+       }
+
+       while (level >= PT_PAGE_TABLE_LEVEL) {
+               index = SHADOW_PT_INDEX(addr, level);
+               sptep = ((u64 *)__va(shadow_addr)) + index;
+               r = walker->entry(walker, vcpu, addr, sptep, level);
+               if (r)
+                       return r;
+               shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+               --level;
+       }
+       return 0;
+}
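[walk_shadow() factors out the root-to-leaf descent, including the PAE case where the true root is one of four pae_root entries. The contract: entry() is called once per level, top-down, and the first nonzero return aborts the walk and becomes walk_shadow()'s result, which is how direct_map_entry() (below) reports both an installed leaf (1) and -ENOMEM. A minimal, hypothetical walker:

/* Sketch: the smallest possible kvm_shadow_walk user. */
static int noop_walk_entry(struct kvm_shadow_walk *walk,
                           struct kvm_vcpu *vcpu,
                           u64 addr, u64 *sptep, int level)
{
        return 0;       /* keep descending to PT_PAGE_TABLE_LEVEL */
}
]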
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                                         struct kvm_mmu_page *sp)
 {
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                                rmap_remove(kvm, &pt[i]);
                        pt[i] = shadow_trap_nonpresent_pte;
                }
-               kvm_flush_remote_tlbs(kvm);
                return;
        }
 
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
                }
                pt[i] = shadow_trap_nonpresent_pte;
        }
-       kvm_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
                        kvm->vcpus[i]->arch.last_pte_updated = NULL;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        u64 *parent_pte;
 
-       ++kvm->stat.mmu_shadow_zapped;
        while (sp->multimapped || sp->parent_pte) {
                if (!sp->multimapped)
                        parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
                kvm_mmu_put_page(sp, parent_pte);
                set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
        }
+}
+
+struct zap_walker {
+       struct kvm_unsync_walk walker;
+       struct kvm *kvm;
+       int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+       struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+                                                    walker);
+       kvm_mmu_zap_page(zap_walk->kvm, sp);
+       zap_walk->zapped = 1;
+       return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       struct zap_walker walker = {
+               .walker = { .entry = mmu_zap_fn, },
+               .kvm = kvm,
+               .zapped = 0,
+       };
+
+       if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+               return 0;
+       mmu_unsync_walk(sp, &walker.walker);
+       return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       int ret;
+       ++kvm->stat.mmu_shadow_zapped;
+       ret = mmu_zap_unsync_children(kvm, sp);
        kvm_mmu_page_unlink_children(kvm, sp);
+       kvm_mmu_unlink_parents(kvm, sp);
+       kvm_flush_remote_tlbs(kvm);
+       if (!sp->role.invalid && !sp->role.metaphysical)
+               unaccount_shadowed(kvm, sp->gfn);
+       if (sp->unsync)
+               kvm_unlink_unsync_page(kvm, sp);
        if (!sp->root_count) {
-               if (!sp->role.metaphysical && !sp->role.invalid)
-                       unaccount_shadowed(kvm, sp->gfn);
                hlist_del(&sp->hash_link);
                kvm_mmu_free_page(kvm, sp);
        } else {
-               int invalid = sp->role.invalid;
-               list_move(&sp->link, &kvm->arch.active_mmu_pages);
                sp->role.invalid = 1;
+               list_move(&sp->link, &kvm->arch.active_mmu_pages);
                kvm_reload_remote_mmus(kvm);
-               if (!sp->role.metaphysical && !invalid)
-                       unaccount_shadowed(kvm, sp->gfn);
        }
        kvm_mmu_reset_last_pte_updated(kvm);
+       return ret;
 }
 
 /*
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
                if (sp->gfn == gfn && !sp->role.metaphysical) {
                        pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
                                 sp->role.word);
-                       kvm_mmu_zap_page(kvm, sp);
                        r = 1;
+                       if (kvm_mmu_zap_page(kvm, sp))
+                               n = bucket->first;
                }
        return r;
 }
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
        __set_bit(slot, &sp->slot_bitmap);
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+       int i;
+       u64 *pt = sp->spt;
+
+       if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+               return;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+               if (pt[i] == shadow_notrap_nonpresent_pte)
+                       set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+       }
+}
+
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
        struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
        if (gpa == UNMAPPED_GVA)
                return NULL;
 
-       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-       up_read(&current->mm->mmap_sem);
 
        return page;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-                        unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int largepage, gfn_t gfn,
-                        pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-       u64 spte;
-       int was_rmapped = 0;
-       int was_writeble = is_writeble_pte(*shadow_pte);
+       unsigned index;
+       struct hlist_head *bucket;
+       struct kvm_mmu_page *s;
+       struct hlist_node *node, *n;
 
-       pgprintk("%s: spte %llx access %x write_fault %d"
-                " user_fault %d gfn %lx\n",
-                __func__, *shadow_pte, pt_access,
-                write_fault, user_fault, gfn);
+       index = kvm_page_table_hashfn(sp->gfn);
+       bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+       /* don't unsync if the pagetable is shadowed with multiple roles */
+       hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+               if (s->gfn != sp->gfn || s->role.metaphysical)
+                       continue;
+               if (s->role.word != sp->role.word)
+                       return 1;
+       }
+       kvm_mmu_mark_parents_unsync(vcpu, sp);
+       ++vcpu->kvm->stat.mmu_unsync;
+       sp->unsync = 1;
+       mmu_convert_notrap(sp);
+       return 0;
+}
 
-       if (is_rmap_pte(*shadow_pte)) {
-               /*
-                * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-                * the parent of the now unreachable PTE.
-                */
-               if (largepage && !is_large_pte(*shadow_pte)) {
-                       struct kvm_mmu_page *child;
-                       u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+                                 bool can_unsync)
+{
+       struct kvm_mmu_page *shadow;
 
-                       child = page_header(pte & PT64_BASE_ADDR_MASK);
-                       mmu_page_remove_parent_pte(child, shadow_pte);
-               } else if (pfn != spte_to_pfn(*shadow_pte)) {
-                       pgprintk("hfn old %lx new %lx\n",
-                                spte_to_pfn(*shadow_pte), pfn);
-                       rmap_remove(vcpu->kvm, shadow_pte);
-               } else {
-                       if (largepage)
-                               was_rmapped = is_large_pte(*shadow_pte);
-                       else
-                               was_rmapped = 1;
-               }
+       shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+       if (shadow) {
+               if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+                       return 1;
+               if (shadow->unsync)
+                       return 0;
+               if (can_unsync && oos_shadow)
+                       return kvm_unsync_page(vcpu, shadow);
+               return 1;
        }
+       return 0;
+}
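[mmu_need_write_protect() concentrates the new policy: a gfn shadowed as a page table above the leaf level must stay write-protected; a page already out of sync may stay writable; otherwise, if the caller permits and the oos_shadow parameter (added at the top of the file) is set, the page is taken out of sync instead of write-protected. Disabling the whole path at runtime should be possible through that module parameter (e.g. kvm.oos_shadow=0 on the kernel command line; exact knob location assumed). The caller's side, as used by set_spte() further down:

/* Sketch: demote the mapping when write protection is required. */
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
        pte_access &= ~ACC_WRITE_MASK;  /* mapping becomes read-only */
        if (is_writeble_pte(spte))
                spte &= ~PT_WRITABLE_MASK;
}
]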
 
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+                   unsigned pte_access, int user_fault,
+                   int write_fault, int dirty, int largepage,
+                   gfn_t gfn, pfn_t pfn, bool speculative,
+                   bool can_unsync)
+{
+       u64 spte;
+       int ret = 0;
        /*
         * We don't set the accessed bit, since we sometimes want to see
         * whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
         */
        spte = shadow_base_present_pte | shadow_dirty_mask;
        if (!speculative)
-               pte_access |= PT_ACCESSED_MASK;
+               spte |= shadow_accessed_mask;
        if (!dirty)
                pte_access &= ~ACC_WRITE_MASK;
        if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
        if ((pte_access & ACC_WRITE_MASK)
            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-               struct kvm_mmu_page *shadow;
+
+               if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+                       ret = 1;
+                       spte = shadow_trap_nonpresent_pte;
+                       goto set_pte;
+               }
 
                spte |= PT_WRITABLE_MASK;
 
-               shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-               if (shadow ||
-                  (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+               if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                        pgprintk("%s: found shadow page for %lx, marking ro\n",
                                 __func__, gfn);
+                       ret = 1;
                        pte_access &= ~ACC_WRITE_MASK;
-                       if (is_writeble_pte(spte)) {
+                       if (is_writeble_pte(spte))
                                spte &= ~PT_WRITABLE_MASK;
-                               kvm_x86_ops->tlb_flush(vcpu);
-                       }
-                       if (write_fault)
-                               *ptwrite = 1;
                }
        }
 
        if (pte_access & ACC_WRITE_MASK)
                mark_page_dirty(vcpu->kvm, gfn);
 
-       pgprintk("%s: setting spte %llx\n", __func__, spte);
-       pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-                (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
-                (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
        set_shadow_pte(shadow_pte, spte);
-       if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
-           && (spte & PT_PRESENT_MASK))
+       return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+                        unsigned pt_access, unsigned pte_access,
+                        int user_fault, int write_fault, int dirty,
+                        int *ptwrite, int largepage, gfn_t gfn,
+                        pfn_t pfn, bool speculative)
+{
+       int was_rmapped = 0;
+       int was_writeble = is_writeble_pte(*shadow_pte);
+
+       pgprintk("%s: spte %llx access %x write_fault %d"
+                " user_fault %d gfn %lx\n",
+                __func__, *shadow_pte, pt_access,
+                write_fault, user_fault, gfn);
+
+       if (is_rmap_pte(*shadow_pte)) {
+               /*
+                * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+                * the parent of the now unreachable PTE.
+                */
+               if (largepage && !is_large_pte(*shadow_pte)) {
+                       struct kvm_mmu_page *child;
+                       u64 pte = *shadow_pte;
+
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       mmu_page_remove_parent_pte(child, shadow_pte);
+               } else if (pfn != spte_to_pfn(*shadow_pte)) {
+                       pgprintk("hfn old %lx new %lx\n",
+                                spte_to_pfn(*shadow_pte), pfn);
+                       rmap_remove(vcpu->kvm, shadow_pte);
+               } else {
+                       if (largepage)
+                               was_rmapped = is_large_pte(*shadow_pte);
+                       else
+                               was_rmapped = 1;
+               }
+       }
+       if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+                     dirty, largepage, gfn, pfn, speculative, true)) {
+               if (write_fault)
+                       *ptwrite = 1;
+               kvm_x86_ops->tlb_flush(vcpu);
+       }
+
+       pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+       pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+                is_large_pte(*shadow_pte)? "2MB" : "4kB",
+                is_present_pte(*shadow_pte)?"RW":"R", gfn,
+                *shadow_pte, shadow_pte);
+       if (!was_rmapped && is_large_pte(*shadow_pte))
                ++vcpu->kvm->stat.lpages;
 
        page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-                          int largepage, gfn_t gfn, pfn_t pfn,
-                          int level)
-{
-       hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-       int pt_write = 0;
-
-       for (; ; level--) {
-               u32 index = PT64_INDEX(v, level);
-               u64 *table;
-
-               ASSERT(VALID_PAGE(table_addr));
-               table = __va(table_addr);
+struct direct_shadow_walk {
+       struct kvm_shadow_walk walker;
+       pfn_t pfn;
+       int write;
+       int largepage;
+       int pt_write;
+};
 
-               if (level == 1) {
-                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                    0, write, 1, &pt_write, 0, gfn, pfn, false);
-                       return pt_write;
-               }
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+                           struct kvm_vcpu *vcpu,
+                           u64 addr, u64 *sptep, int level)
+{
+       struct direct_shadow_walk *walk =
+               container_of(_walk, struct direct_shadow_walk, walker);
+       struct kvm_mmu_page *sp;
+       gfn_t pseudo_gfn;
+       gfn_t gfn = addr >> PAGE_SHIFT;
+
+       if (level == PT_PAGE_TABLE_LEVEL
+           || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+               mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+                            0, walk->write, 1, &walk->pt_write,
+                            walk->largepage, gfn, walk->pfn, false);
+               ++vcpu->stat.pf_fixed;
+               return 1;
+       }
 
-               if (largepage && level == 2) {
-                       mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-                                    0, write, 1, &pt_write, 1, gfn, pfn, false);
-                       return pt_write;
+       if (*sptep == shadow_trap_nonpresent_pte) {
+               pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+               sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+                                     1, ACC_ALL, sptep);
+               if (!sp) {
+                       pgprintk("nonpaging_map: ENOMEM\n");
+                       kvm_release_pfn_clean(walk->pfn);
+                       return -ENOMEM;
                }
 
-               if (table[index] == shadow_trap_nonpresent_pte) {
-                       struct kvm_mmu_page *new_table;
-                       gfn_t pseudo_gfn;
-
-                       pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-                               >> PAGE_SHIFT;
-                       new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-                                                    v, level - 1,
-                                                    1, ACC_ALL, &table[index]);
-                       if (!new_table) {
-                               pgprintk("nonpaging_map: ENOMEM\n");
-                               kvm_release_pfn_clean(pfn);
-                               return -ENOMEM;
-                       }
-
-                       set_shadow_pte(&table[index],
-                                      __pa(new_table->spt)
-                                      | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                      | shadow_user_mask | shadow_x_mask);
-               }
-               table_addr = table[index] & PT64_BASE_ADDR_MASK;
+               set_shadow_pte(sptep,
+                              __pa(sp->spt)
+                              | PT_PRESENT_MASK | PT_WRITABLE_MASK
+                              | shadow_user_mask | shadow_x_mask);
        }
+       return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+                       int largepage, gfn_t gfn, pfn_t pfn)
+{
+       int r;
+       struct direct_shadow_walk walker = {
+               .walker = { .entry = direct_map_entry, },
+               .pfn = pfn,
+               .largepage = largepage,
+               .write = write,
+               .pt_write = 0,
+       };
+
+       r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+       if (r < 0)
+               return r;
+       return walker.pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        pfn_t pfn;
        unsigned long mmu_seq;
 
-       down_read(&current->mm->mmap_sem);
        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                largepage = 1;
        }
 
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       /* implicit mb(), we'll read before PT lock is unlocked */
+       smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
-       up_read(&current->mm->mmap_sem);
 
        /* mmio */
        if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
-       r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-                        PT32E_ROOT_LEVEL);
+       r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
        vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+
+       if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+               return;
+       if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+               hpa_t root = vcpu->arch.mmu.root_hpa;
+               sp = page_header(root);
+               mmu_sync_children(vcpu, sp);
+               return;
+       }
+       for (i = 0; i < 4; ++i) {
+               hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+               if (root) {
+                       root &= PT64_BASE_ADDR_MASK;
+                       sp = page_header(root);
+                       mmu_sync_children(vcpu, sp);
+               }
+       }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+       spin_lock(&vcpu->kvm->mmu_lock);
+       mmu_sync_roots(vcpu);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+}
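[kvm_mmu_sync_roots() is the lock-taking wrapper; the unlocked mmu_sync_roots() is also called from kvm_mmu_load() (see the hunk further down), so every time a new root is activated, unsync pages reachable from it are resynced before the hardware sees the new cr3:

/* Condensed from the kvm_mmu_load() hunk below: sync happens between
 * root allocation and loading the root into hardware. */
mmu_alloc_roots(vcpu);
mmu_sync_roots(vcpu);           /* new in this series */
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
]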
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
        return vaddr;
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
        if (r)
                return r;
 
-       down_read(&current->mm->mmap_sem);
        if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                largepage = 1;
        }
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       /* implicit mb(), we'll read before PT lock is unlocked */
+       smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
-       up_read(&current->mm->mmap_sem);
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-                        largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+                        largepage, gfn, pfn);
        spin_unlock(&vcpu->kvm->mmu_lock);
 
        return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->prefetch_page = nonpaging_prefetch_page;
+       context->sync_page = nonpaging_sync_page;
+       context->invlpg = nonpaging_invlpg;
        context->root_level = 0;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
        context->page_fault = paging64_page_fault;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->prefetch_page = paging64_prefetch_page;
+       context->sync_page = paging64_sync_page;
+       context->invlpg = paging64_invlpg;
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->prefetch_page = paging32_prefetch_page;
+       context->sync_page = paging32_sync_page;
+       context->invlpg = paging32_invlpg;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
        context->page_fault = tdp_page_fault;
        context->free = nonpaging_free;
        context->prefetch_page = nonpaging_prefetch_page;
+       context->sync_page = nonpaging_sync_page;
+       context->invlpg = nonpaging_invlpg;
        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
        context->root_hpa = INVALID_PAGE;
 
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
        mmu_alloc_roots(vcpu);
+       mmu_sync_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
        kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                return;
        gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-       down_read(&current->mm->mmap_sem);
        if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
                gfn &= ~(KVM_PAGES_PER_HPAGE-1);
                vcpu->arch.update_pte.largepage = 1;
        }
        vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       /* implicit mb(), we'll read before PT lock is unlocked */
+       smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, gfn);
-       up_read(&current->mm->mmap_sem);
 
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
        hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-               if (sp->gfn != gfn || sp->role.metaphysical)
+               if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
                        continue;
                pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
                misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                         */
                        pgprintk("misaligned: gpa %llx bytes %d role %x\n",
                                 gpa, bytes, sp->role.word);
-                       kvm_mmu_zap_page(vcpu->kvm, sp);
+                       if (kvm_mmu_zap_page(vcpu->kvm, sp))
+                               n = bucket->first;
                        ++vcpu->kvm->stat.mmu_flooded;
                        continue;
                }
@@ -1969,6 +2350,16 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       spin_lock(&vcpu->kvm->mmu_lock);
+       vcpu->arch.mmu.invlpg(vcpu, gva);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_mmu_flush_tlb(vcpu);
+       ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
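[With invlpg now trapped (per the shortlog), the vendor exit handlers funnel into this helper, which runs the per-mode invlpg callback for the flushed gva under mmu_lock and then flushes the TLB. A hedged sketch of such a handler (name and exit plumbing hypothetical):

/* Sketch: what an INVLPG intercept would do. */
static int handle_invlpg_example(struct kvm_vcpu *vcpu, gva_t gva)
{
        kvm_mmu_invlpg(vcpu, gva);      /* per-mode invlpg + TLB flush */
        return 1;                       /* resume the guest */
}
]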
+
 void kvm_enable_tdp(void)
 {
        tdp_enabled = true;
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
        struct kvm_mmu_page *sp;
 
+       spin_lock(&kvm->mmu_lock);
        list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
                int i;
                u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                        if (pt[i] & PT_WRITABLE_MASK)
                                pt[i] &= ~PT_WRITABLE_MASK;
        }
+       kvm_flush_remote_tlbs(kvm);
+       spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
        spin_lock(&kvm->mmu_lock);
        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-               kvm_mmu_zap_page(kvm, sp);
+               if (kvm_mmu_zap_page(kvm, sp))
+                       node = container_of(kvm->arch.active_mmu_pages.next,
+                                           struct kvm_mmu_page, link);
        spin_unlock(&kvm->mmu_lock);
 
        kvm_flush_remote_tlbs(kvm);
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
                  gpa_t addr, unsigned long *ret)
 {
        int r;
-       struct kvm_pv_mmu_op_buffer buffer;
+       struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
 
-       buffer.ptr = buffer.buf;
-       buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
-       buffer.processed = 0;
+       buffer->ptr = buffer->buf;
+       buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+       buffer->processed = 0;
 
-       r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+       r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
        if (r)
                goto out;
 
-       while (buffer.len) {
-               r = kvm_pv_mmu_op_one(vcpu, &buffer);
+       while (buffer->len) {
+               r = kvm_pv_mmu_op_one(vcpu, buffer);
                if (r < 0)
                        goto out;
                if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 
        r = 1;
 out:
-       *ret = buffer.processed;
+       *ret = buffer->processed;
        return r;
 }
 
index 4a814bff21f27cb9b829df36071ffb40b43818ca..613ec9aa674afe06cd76c99b8ebc06eaa3f91f32 100644 (file)
 #if PTTYPE == 64
        #define pt_element_t u64
        #define guest_walker guest_walker64
+       #define shadow_walker shadow_walker64
        #define FNAME(name) paging##64_##name
        #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
        #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
        #ifdef CONFIG_X86_64
 #elif PTTYPE == 32
        #define pt_element_t u32
        #define guest_walker guest_walker32
+       #define shadow_walker shadow_walker32
        #define FNAME(name) paging##32_##name
        #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
        #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
-       #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
        #define PT_LEVEL_BITS PT32_LEVEL_BITS
        #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
        u32 error_code;
 };
 
+struct shadow_walker {
+       struct kvm_shadow_walk walker;
+       struct guest_walker *guest_walker;
+       int user_fault;
+       int write_fault;
+       int largepage;
+       int *ptwrite;
+       pfn_t pfn;
+       u64 *sptep;
+};
+
 static gfn_t gpte_to_gfn(pt_element_t gpte)
 {
        return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
        pt_element_t *table;
        struct page *page;
 
-       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(kvm, table_gfn);
-       up_read(&current->mm->mmap_sem);
 
        table = kmap_atomic(page, KM_USER0);
-
        ret = CMPXCHG(&table[index], orig_pte, new_pte);
-
        kunmap_atomic(table, KM_USER0);
 
        kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
-                        struct guest_walker *walker,
-                        int user_fault, int write_fault, int largepage,
-                        int *ptwrite, pfn_t pfn)
+static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
+                                   struct kvm_vcpu *vcpu, u64 addr,
+                                   u64 *sptep, int level)
 {
-       hpa_t shadow_addr;
-       int level;
-       u64 *shadow_ent;
-       unsigned access = walker->pt_access;
-
-       if (!is_present_pte(walker->ptes[walker->level - 1]))
-               return NULL;
-
-       shadow_addr = vcpu->arch.mmu.root_hpa;
-       level = vcpu->arch.mmu.shadow_root_level;
-       if (level == PT32E_ROOT_LEVEL) {
-               shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
-               shadow_addr &= PT64_BASE_ADDR_MASK;
-               --level;
+       struct shadow_walker *sw =
+               container_of(_sw, struct shadow_walker, walker);
+       struct guest_walker *gw = sw->guest_walker;
+       unsigned access = gw->pt_access;
+       struct kvm_mmu_page *shadow_page;
+       u64 spte;
+       int metaphysical;
+       gfn_t table_gfn;
+       int r;
+       pt_element_t curr_pte;
+
+       if (level == PT_PAGE_TABLE_LEVEL
+           || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
+               mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+                            sw->user_fault, sw->write_fault,
+                            gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+                            sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
+                            false);
+               sw->sptep = sptep;
+               return 1;
        }
 
-       for (; ; level--) {
-               u32 index = SHADOW_PT_INDEX(addr, level);
-               struct kvm_mmu_page *shadow_page;
-               u64 shadow_pte;
-               int metaphysical;
-               gfn_t table_gfn;
-
-               shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-               if (level == PT_PAGE_TABLE_LEVEL)
-                       break;
-
-               if (largepage && level == PT_DIRECTORY_LEVEL)
-                       break;
+       if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
+               return 0;
 
-               if (is_shadow_present_pte(*shadow_ent)
-                   && !is_large_pte(*shadow_ent)) {
-                       shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
-                       continue;
-               }
+       if (is_large_pte(*sptep)) {
+               set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+               kvm_flush_remote_tlbs(vcpu->kvm);
+               rmap_remove(vcpu->kvm, sptep);
+       }
 
-               if (is_large_pte(*shadow_ent))
-                       rmap_remove(vcpu->kvm, shadow_ent);
-
-               if (level - 1 == PT_PAGE_TABLE_LEVEL
-                   && walker->level == PT_DIRECTORY_LEVEL) {
-                       metaphysical = 1;
-                       if (!is_dirty_pte(walker->ptes[level - 1]))
-                               access &= ~ACC_WRITE_MASK;
-                       table_gfn = gpte_to_gfn(walker->ptes[level - 1]);
-               } else {
-                       metaphysical = 0;
-                       table_gfn = walker->table_gfn[level - 2];
-               }
-               shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-                                              metaphysical, access,
-                                              shadow_ent);
-               if (!metaphysical) {
-                       int r;
-                       pt_element_t curr_pte;
-                       r = kvm_read_guest_atomic(vcpu->kvm,
-                                                 walker->pte_gpa[level - 2],
-                                                 &curr_pte, sizeof(curr_pte));
-                       if (r || curr_pte != walker->ptes[level - 2]) {
-                               kvm_release_pfn_clean(pfn);
-                               return NULL;
-                       }
+       if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
+               metaphysical = 1;
+               if (!is_dirty_pte(gw->ptes[level - 1]))
+                       access &= ~ACC_WRITE_MASK;
+               table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
+       } else {
+               metaphysical = 0;
+               table_gfn = gw->table_gfn[level - 2];
+       }
+       shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
+                                      metaphysical, access, sptep);
+       if (!metaphysical) {
+               r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
+                                         &curr_pte, sizeof(curr_pte));
+               if (r || curr_pte != gw->ptes[level - 2]) {
+                       kvm_release_pfn_clean(sw->pfn);
+                       sw->sptep = NULL;
+                       return 1;
                }
-               shadow_addr = __pa(shadow_page->spt);
-               shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
-                       | PT_WRITABLE_MASK | PT_USER_MASK;
-               set_shadow_pte(shadow_ent, shadow_pte);
        }
 
-       mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
-                    user_fault, write_fault,
-                    walker->ptes[walker->level-1] & PT_DIRTY_MASK,
-                    ptwrite, largepage, walker->gfn, pfn, false);
+       spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
+               | PT_WRITABLE_MASK | PT_USER_MASK;
+       *sptep = spte;
+       return 0;
+}
+
+static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+                        struct guest_walker *guest_walker,
+                        int user_fault, int write_fault, int largepage,
+                        int *ptwrite, pfn_t pfn)
+{
+       struct shadow_walker walker = {
+               .walker = { .entry = FNAME(shadow_walk_entry), },
+               .guest_walker = guest_walker,
+               .user_fault = user_fault,
+               .write_fault = write_fault,
+               .largepage = largepage,
+               .ptwrite = ptwrite,
+               .pfn = pfn,
+       };
+
+       if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
+               return NULL;
+
+       walk_shadow(&walker.walker, vcpu, addr);
 
-       return shadow_ent;
+       return walker.sptep;
 }
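[Editor's note] FNAME(fetch) is now just a driver: it packages the fault parameters into a shadow_walker and lets a generic walker invoke FNAME(shadow_walk_entry) once per level. walk_shadow itself lives in mmu.c, outside this excerpt; a reconstructed sketch of the contract the callbacks rely on, so details may differ. Note that both deleted SHADOW_PT_INDEX definitions mapped to PT64_INDEX: shadow page tables always use the 64-bit format, whatever the guest paging mode.

    static int walk_shadow(struct kvm_shadow_walk *walker,
                           struct kvm_vcpu *vcpu, u64 addr)
    {
            hpa_t shadow_addr = vcpu->arch.mmu.root_hpa;
            int level = vcpu->arch.mmu.shadow_root_level;
            u64 *sptep;
            int r;

            if (level == PT32E_ROOT_LEVEL) {
                    shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
                    shadow_addr &= PT64_BASE_ADDR_MASK;
                    --level;
            }
            while (level >= PT_PAGE_TABLE_LEVEL) {
                    /* shadow tables are always 64-bit format */
                    sptep = ((u64 *)__va(shadow_addr))
                            + PT64_INDEX(addr, level);
                    r = walker->entry(walker, vcpu, addr, sptep, level);
                    if (r)          /* non-zero: callback ends the walk */
                            return r;
                    shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
                    --level;
            }
            return 0;
    }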
 
 /*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                return 0;
        }
 
-       down_read(&current->mm->mmap_sem);
        if (walker.level == PT_DIRECTORY_LEVEL) {
                gfn_t large_gfn;
                large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                }
        }
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       /* implicit mb(), we'll read before PT lock is unlocked */
+       smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
-       up_read(&current->mm->mmap_sem);
 
        /* mmio */
        if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
        return 0;
 }
 
+static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
+                                     struct kvm_vcpu *vcpu, u64 addr,
+                                     u64 *sptep, int level)
+{
+
+       if (level == PT_PAGE_TABLE_LEVEL) {
+               if (is_shadow_present_pte(*sptep))
+                       rmap_remove(vcpu->kvm, sptep);
+               set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
+               return 1;
+       }
+       if (!is_shadow_present_pte(*sptep))
+               return 1;
+       return 0;
+}
+
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       struct shadow_walker walker = {
+               .walker = { .entry = FNAME(shadow_invlpg_entry), },
+       };
+
+       walk_shadow(&walker.walker, vcpu, gva);
+}
+
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
        struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
        }
 }
 
+/*
+ * Using the cached information from sp->gfns is safe because:
+ * - The spte has a reference to the struct page, so the pfn for a given gfn
+ *   can't change unless all sptes pointing to it are nuked first.
+ * - Alias changes zap the entire shadow cache.
+ */
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+       int i, offset, nr_present;
+
+       offset = nr_present = 0;
+
+       if (PTTYPE == 32)
+               offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+       for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+               unsigned pte_access;
+               pt_element_t gpte;
+               gpa_t pte_gpa;
+               gfn_t gfn = sp->gfns[i];
+
+               if (!is_shadow_present_pte(sp->spt[i]))
+                       continue;
+
+               pte_gpa = gfn_to_gpa(sp->gfn);
+               pte_gpa += (i+offset) * sizeof(pt_element_t);
+
+               if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+                                         sizeof(pt_element_t)))
+                       return -EINVAL;
+
+               if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
+                   !(gpte & PT_ACCESSED_MASK)) {
+                       u64 nonpresent;
+
+                       rmap_remove(vcpu->kvm, &sp->spt[i]);
+                       if (is_present_pte(gpte))
+                               nonpresent = shadow_trap_nonpresent_pte;
+                       else
+                               nonpresent = shadow_notrap_nonpresent_pte;
+                       set_shadow_pte(&sp->spt[i], nonpresent);
+                       continue;
+               }
+
+               nr_present++;
+               pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+               set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
+                        is_dirty_pte(gpte), 0, gfn,
+                        spte_to_pfn(sp->spt[i]), true, false);
+       }
+
+       return !nr_present;
+}
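[Editor's note] For PTTYPE == 32 a guest page-table page holds 1024 4-byte entries but a shadow page holds only 512 sptes, so two shadow pages mirror each guest page and sp->role.quadrant selects the half; the offset term above picks the right slice. A worked example of the address arithmetic, using the usual x86 constants (illustration only):

    #include <stdio.h>
    #include <stdint.h>

    #define PT64_LEVEL_BITS 9                 /* 512 sptes per shadow page */

    int main(void)
    {
            unsigned quadrant = 1;            /* sp->role.quadrant: second half */
            unsigned i = 5;                   /* index into sp->spt */
            unsigned offset = quadrant << PT64_LEVEL_BITS;        /* 512 */
            uint64_t gpa_of_gfn = 0x1234ULL << 12;                /* gfn_to_gpa() */
            uint64_t pte_gpa = gpa_of_gfn + (i + offset) * 4;     /* 4-byte gptes */

            /* prints: guest pte at gpa 0x1234814 */
            printf("guest pte at gpa 0x%llx\n", (unsigned long long)pte_gpa);
            return 0;
    }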
+
 #undef pt_element_t
 #undef guest_walker
+#undef shadow_walker
 #undef FNAME
 #undef PT_BASE_ADDR_MASK
 #undef PT_INDEX
-#undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
 #undef PT_LEVEL_BITS
index 8233b86c778cfd0a01b2ca96dbe7d27776381037..9c4ce657d96389753ff9650bb06f12bf76e003c8 100644 (file)
@@ -18,6 +18,7 @@
 #include "kvm_svm.h"
 #include "irq.h"
 #include "mmu.h"
+#include "kvm_cache_regs.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
 #define IOPM_ALLOC_ORDER 2
 #define MSRPM_ALLOC_ORDER 1
 
-#define DB_VECTOR 1
-#define UD_VECTOR 6
-#define GP_VECTOR 13
-
 #define DR7_GD_MASK (1 << 13)
 #define DR6_BD_MASK (1 << 13)
 
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
 
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_DEATURE_SVML (1 << 2)
+#define SVM_FEATURE_SVML (1 << 2)
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
                printk(KERN_DEBUG "%s: NOP\n", __func__);
                return;
        }
-       if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
-               printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
-                      __func__,
-                      svm->vmcb->save.rip,
-                      svm->next_rip);
+       if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+               printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
+                      __func__, kvm_rip_read(vcpu), svm->next_rip);
 
-       vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip;
+       kvm_rip_write(vcpu, svm->next_rip);
        svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
 
        vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
                                (1ULL << INTERCEPT_CPUID) |
                                (1ULL << INTERCEPT_INVD) |
                                (1ULL << INTERCEPT_HLT) |
+                               (1ULL << INTERCEPT_INVLPG) |
                                (1ULL << INTERCEPT_INVLPGA) |
                                (1ULL << INTERCEPT_IOIO_PROT) |
                                (1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        save->dr7 = 0x400;
        save->rflags = 2;
        save->rip = 0x0000fff0;
+       svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
        /*
         * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
        if (npt_enabled) {
                /* Setup VMCB for Nested Paging */
                control->nested_ctl = 1;
-               control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+               control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
+                                       (1ULL << INTERCEPT_INVLPG));
                control->intercept_exceptions &= ~(1 << PF_VECTOR);
                control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
                                                INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
        init_vmcb(svm);
 
        if (vcpu->vcpu_id != 0) {
-               svm->vmcb->save.rip = 0;
+               kvm_rip_write(vcpu, 0);
                svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
                svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
        }
+       vcpu->arch.regs_avail = ~0;
+       vcpu->arch.regs_dirty = ~0;
 
        return 0;
 }
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        rdtscll(vcpu->arch.host_tsc);
 }
 
-static void svm_cache_regs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-
-       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
-       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
-       vcpu->arch.rip = svm->vmcb->save.rip;
-}
-
-static void svm_decache_regs(struct kvm_vcpu *vcpu)
-{
-       struct vcpu_svm *svm = to_svm(vcpu);
-       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-       svm->vmcb->save.rip = vcpu->arch.rip;
-}
-
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
 {
        return to_svm(vcpu)->vmcb->save.rflags;
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        if (npt_enabled)
                svm_flush_tlb(&svm->vcpu);
 
-       if (event_injection)
+       if (!npt_enabled && event_injection)
                kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
        return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
 static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       svm->next_rip = svm->vmcb->save.rip + 1;
+       svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
        skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       svm->next_rip = svm->vmcb->save.rip + 3;
+       svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
        skip_emulated_instruction(&svm->vcpu);
        kvm_emulate_hypercall(&svm->vcpu);
        return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-       svm->next_rip = svm->vmcb->save.rip + 2;
+       svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
        kvm_emulate_cpuid(&svm->vcpu);
        return 1;
 }
 
+static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+       if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+               pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
+       return 1;
+}
+
 static int emulate_on_interception(struct vcpu_svm *svm,
                                   struct kvm_run *kvm_run)
 {
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
                KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
                            (u32)(data >> 32), handler);
 
-               svm->vmcb->save.rax = data & 0xffffffff;
+               svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
                svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
-               svm->next_rip = svm->vmcb->save.rip + 2;
+               svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
                skip_emulated_instruction(&svm->vcpu);
        }
        return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-       u64 data = (svm->vmcb->save.rax & -1u)
+       u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
                | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
        KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
                    handler);
 
-       svm->next_rip = svm->vmcb->save.rip + 2;
+       svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
        if (svm_set_msr(&svm->vcpu, ecx, data))
                kvm_inject_gp(&svm->vcpu, 0);
        else
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_CPUID]                        = cpuid_interception,
        [SVM_EXIT_INVD]                         = emulate_on_interception,
        [SVM_EXIT_HLT]                          = halt_interception,
-       [SVM_EXIT_INVLPG]                       = emulate_on_interception,
+       [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invalid_op_interception,
        [SVM_EXIT_IOIO]                         = io_interception,
        [SVM_EXIT_MSR]                          = msr_interception,
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 
        KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
 
+       ++svm->vcpu.stat.irq_injections;
        control = &svm->vmcb->control;
        control->int_vector = irq;
        control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
        svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#else
+#define R "e"
+#endif
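[Editor's note] The R macro leans on C's adjacent-string-literal concatenation: "push %%"R"bp" becomes the single template "push %%rbp" on 64-bit builds and "push %%ebp" on 32-bit ones, letting one asm body replace the duplicated #ifdef branches below. A standalone illustration, hardcoding the 64-bit case:

    #include <stdio.h>

    #define R "r"    /* a 32-bit build would define R as "e" */

    int main(void)
    {
            /* Adjacent string literals are pasted at compile time, so
             * "push %%" R "bp" is the single string "push %%rbp".
             * (The doubled %% is gcc asm-template escaping.) */
            puts("push %%" R "bp");
            return 0;
    }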
+
 static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        u16 gs_selector;
        u16 ldt_selector;
 
+       svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+       svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+       svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
        pre_svm_run(svm);
 
        sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        local_irq_enable();
 
        asm volatile (
+               "push %%"R"bp; \n\t"
+               "mov %c[rbx](%[svm]), %%"R"bx \n\t"
+               "mov %c[rcx](%[svm]), %%"R"cx \n\t"
+               "mov %c[rdx](%[svm]), %%"R"dx \n\t"
+               "mov %c[rsi](%[svm]), %%"R"si \n\t"
+               "mov %c[rdi](%[svm]), %%"R"di \n\t"
+               "mov %c[rbp](%[svm]), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-               "push %%rbp; \n\t"
-#else
-               "push %%ebp; \n\t"
-#endif
-
-#ifdef CONFIG_X86_64
-               "mov %c[rbx](%[svm]), %%rbx \n\t"
-               "mov %c[rcx](%[svm]), %%rcx \n\t"
-               "mov %c[rdx](%[svm]), %%rdx \n\t"
-               "mov %c[rsi](%[svm]), %%rsi \n\t"
-               "mov %c[rdi](%[svm]), %%rdi \n\t"
-               "mov %c[rbp](%[svm]), %%rbp \n\t"
                "mov %c[r8](%[svm]),  %%r8  \n\t"
                "mov %c[r9](%[svm]),  %%r9  \n\t"
                "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %c[r13](%[svm]), %%r13 \n\t"
                "mov %c[r14](%[svm]), %%r14 \n\t"
                "mov %c[r15](%[svm]), %%r15 \n\t"
-#else
-               "mov %c[rbx](%[svm]), %%ebx \n\t"
-               "mov %c[rcx](%[svm]), %%ecx \n\t"
-               "mov %c[rdx](%[svm]), %%edx \n\t"
-               "mov %c[rsi](%[svm]), %%esi \n\t"
-               "mov %c[rdi](%[svm]), %%edi \n\t"
-               "mov %c[rbp](%[svm]), %%ebp \n\t"
 #endif
 
-#ifdef CONFIG_X86_64
-               /* Enter guest mode */
-               "push %%rax \n\t"
-               "mov %c[vmcb](%[svm]), %%rax \n\t"
-               __ex(SVM_VMLOAD) "\n\t"
-               __ex(SVM_VMRUN) "\n\t"
-               __ex(SVM_VMSAVE) "\n\t"
-               "pop %%rax \n\t"
-#else
                /* Enter guest mode */
-               "push %%eax \n\t"
-               "mov %c[vmcb](%[svm]), %%eax \n\t"
+               "push %%"R"ax \n\t"
+               "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
                __ex(SVM_VMLOAD) "\n\t"
                __ex(SVM_VMRUN) "\n\t"
                __ex(SVM_VMSAVE) "\n\t"
-               "pop %%eax \n\t"
-#endif
+               "pop %%"R"ax \n\t"
 
                /* Save guest registers, load host registers */
+               "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
+               "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
+               "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
+               "mov %%"R"si, %c[rsi](%[svm]) \n\t"
+               "mov %%"R"di, %c[rdi](%[svm]) \n\t"
+               "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
 #ifdef CONFIG_X86_64
-               "mov %%rbx, %c[rbx](%[svm]) \n\t"
-               "mov %%rcx, %c[rcx](%[svm]) \n\t"
-               "mov %%rdx, %c[rdx](%[svm]) \n\t"
-               "mov %%rsi, %c[rsi](%[svm]) \n\t"
-               "mov %%rdi, %c[rdi](%[svm]) \n\t"
-               "mov %%rbp, %c[rbp](%[svm]) \n\t"
                "mov %%r8,  %c[r8](%[svm]) \n\t"
                "mov %%r9,  %c[r9](%[svm]) \n\t"
                "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%r13, %c[r13](%[svm]) \n\t"
                "mov %%r14, %c[r14](%[svm]) \n\t"
                "mov %%r15, %c[r15](%[svm]) \n\t"
-
-               "pop  %%rbp; \n\t"
-#else
-               "mov %%ebx, %c[rbx](%[svm]) \n\t"
-               "mov %%ecx, %c[rcx](%[svm]) \n\t"
-               "mov %%edx, %c[rdx](%[svm]) \n\t"
-               "mov %%esi, %c[rsi](%[svm]) \n\t"
-               "mov %%edi, %c[rdi](%[svm]) \n\t"
-               "mov %%ebp, %c[rbp](%[svm]) \n\t"
-
-               "pop  %%ebp; \n\t"
 #endif
+               "pop %%"R"bp"
                :
                : [svm]"a"(svm),
                  [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                  [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
 #endif
                : "cc", "memory"
+               , R"bx", R"cx", R"dx", R"si", R"di"
 #ifdef CONFIG_X86_64
-               , "rbx", "rcx", "rdx", "rsi", "rdi"
                , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
-#else
-               , "ebx", "ecx", "edx" , "esi", "edi"
 #endif
                );
 
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                load_db_regs(svm->host_db_regs);
 
        vcpu->arch.cr2 = svm->vmcb->save.cr2;
+       vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+       vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+       vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
 
        write_dr6(svm->host_dr6);
        write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        svm->next_rip = 0;
 }
 
+#undef R
+
 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
        .set_gdt = svm_set_gdt,
        .get_dr = svm_get_dr,
        .set_dr = svm_set_dr,
-       .cache_regs = svm_cache_regs,
-       .decache_regs = svm_decache_regs,
        .get_rflags = svm_get_rflags,
        .set_rflags = svm_set_rflags,
 
index 7041cc52b562eccc98f4a5fd060390709bc915b4..2643b430d83a0cb2f6519dbdaccd8e9a4cd480d7 100644 (file)
@@ -26,6 +26,8 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/moduleparam.h>
+#include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <asm/io.h>
 #include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
 static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
+static int emulate_invalid_guest_state = 0;
+module_param(emulate_invalid_guest_state, bool, 0);
+
 struct vmcs {
        u32 revision_id;
        u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        struct list_head      local_vcpus_link;
+       unsigned long         host_rsp;
        int                   launched;
        u8                    fail;
        u32                   idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
                } irq;
        } rmode;
        int vpid;
+       bool emulation_required;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        if (!vcpu->fpu_active)
                eb |= 1u << NM_VECTOR;
        if (vcpu->guest_debug.enabled)
-               eb |= 1u << 1;
+               eb |= 1u << DB_VECTOR;
        if (vcpu->arch.rmode.active)
                eb = ~0;
        if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        unsigned long rip;
        u32 interruptibility;
 
-       rip = vmcs_readl(GUEST_RIP);
+       rip = kvm_rip_read(vcpu);
        rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       vmcs_writel(GUEST_RIP, rip);
+       kvm_rip_write(vcpu, rip);
 
        /*
         * We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
                                bool has_error_code, u32 error_code)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (has_error_code)
+               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+
+       if (vcpu->arch.rmode.active) {
+               vmx->rmode.irq.pending = true;
+               vmx->rmode.irq.vector = nr;
+               vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+               if (nr == BP_VECTOR)
+                       vmx->rmode.irq.rip++;
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            nr | INTR_TYPE_SOFT_INTR
+                            | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
+                            | INTR_INFO_VALID_MASK);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+               return;
+       }
+
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                     nr | INTR_TYPE_EXCEPTION
                     | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
                     | INTR_INFO_VALID_MASK);
-       if (has_error_code)
-               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 }
 
 static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-       return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+       return false;
 }
 
 /*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
        return ret;
 }
 
-/*
- * Sync the rsp and rip registers into the vcpu structure.  This allows
- * registers to be accessed by indexing vcpu->arch.regs.
- */
-static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
-       vcpu->arch.rip = vmcs_readl(GUEST_RIP);
-}
-
-/*
- * Syncs rsp and rip back into the vmcs.  Should be called after possible
- * modification.
- */
-static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
+static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
-       vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
-       vmcs_writel(GUEST_RIP, vcpu->arch.rip);
+       __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+       switch (reg) {
+       case VCPU_REGS_RSP:
+               vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
+               break;
+       case VCPU_REGS_RIP:
+               vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
+               break;
+       default:
+               break;
+       }
 }
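[Editor's note] vmx_cache_reg is the read-side backend of a lazy register cache: generic code consults vcpu->arch.regs_avail and only touches the VMCS when a register has not been fetched since the last exit, while regs_dirty marks values to write back before the next entry. A plausible sketch of the accessors the new kvm_cache_regs.h provides; the header is added elsewhere in this merge and is not shown in this excerpt:

    static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
                                                  enum kvm_reg reg)
    {
            if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
                    kvm_x86_ops->cache_reg(vcpu, reg);  /* e.g. vmx_cache_reg */
            return vcpu->arch.regs[reg];
    }

    static inline void kvm_register_write(struct kvm_vcpu *vcpu,
                                          enum kvm_reg reg,
                                          unsigned long val)
    {
            vcpu->arch.regs[reg] = val;
            __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
            __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
    }

    static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
    {
            return kvm_register_read(vcpu, VCPU_REGS_RIP);
    }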
 
 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
 
 static int vmx_get_irq(struct kvm_vcpu *vcpu)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 idtv_info_field;
-
-       idtv_info_field = vmx->idt_vectoring_info;
-       if (idtv_info_field & INTR_INFO_VALID_MASK) {
-               if (is_external_interrupt(idtv_info_field))
-                       return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-               else
-                       printk(KERN_DEBUG "pending exception: not handled yet\n");
-       }
-       return -1;
+       if (!vcpu->arch.interrupt.pending)
+               return -1;
+       return vcpu->arch.interrupt.nr;
 }
 
 static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
        u64 msr;
 
        rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-       return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-           == MSR_IA32_FEATURE_CONTROL_LOCKED;
+       return (msr & (FEATURE_CONTROL_LOCKED |
+                      FEATURE_CONTROL_VMXON_ENABLED))
+           == FEATURE_CONTROL_LOCKED;
        /* locked but not enabled */
 }
 
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
 
        INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-       if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED |
-                   MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
-           != (MSR_IA32_FEATURE_CONTROL_LOCKED |
-               MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED))
+       if ((old & (FEATURE_CONTROL_LOCKED |
+                   FEATURE_CONTROL_VMXON_ENABLED))
+           != (FEATURE_CONTROL_LOCKED |
+               FEATURE_CONTROL_VMXON_ENABLED))
                /* enable and lock */
                wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-                      MSR_IA32_FEATURE_CONTROL_LOCKED |
-                      MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED);
+                      FEATURE_CONTROL_LOCKED |
+                      FEATURE_CONTROL_VMXON_ENABLED);
        write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
        asm volatile (ASM_VMX_VMXON_RAX
                      : : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
              CPU_BASED_CR3_STORE_EXITING |
              CPU_BASED_USE_IO_BITMAPS |
              CPU_BASED_MOV_DR_EXITING |
-             CPU_BASED_USE_TSC_OFFSETING;
+             CPU_BASED_USE_TSC_OFFSETING |
+             CPU_BASED_INVLPG_EXITING;
        opt = CPU_BASED_TPR_SHADOW |
              CPU_BASED_USE_MSR_BITMAPS |
              CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
 #endif
        if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
-               /* CR3 accesses don't need to cause VM Exits when EPT enabled */
+               /* CR3 accesses and invlpg don't need to cause VM Exits
+                  when EPT is enabled */
                min &= ~(CPU_BASED_CR3_LOAD_EXITING |
-                        CPU_BASED_CR3_STORE_EXITING);
+                        CPU_BASED_CR3_STORE_EXITING |
+                        CPU_BASED_INVLPG_EXITING);
                if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
                                        &_cpu_based_exec_control) < 0)
                        return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
 static void enter_pmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       vmx->emulation_required = 1;
        vcpu->arch.rmode.active = 0;
 
        vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 
        update_exception_bitmap(vcpu);
 
+       if (emulate_invalid_guest_state)
+               return;
+
        fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
        fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
        fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
 static void enter_rmode(struct kvm_vcpu *vcpu)
 {
        unsigned long flags;
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+       vmx->emulation_required = 1;
        vcpu->arch.rmode.active = 1;
 
        vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
        update_exception_bitmap(vcpu);
 
+       if (emulate_invalid_guest_state)
+               goto continue_rmode;
+
        vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
        vmcs_write32(GUEST_SS_LIMIT, 0xffff);
        vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
        fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
        fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
 
+continue_rmode:
        kvm_mmu_reset_context(vcpu);
        init_rmode(vcpu->kvm);
 }
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
        vmcs_writel(GUEST_GDTR_BASE, dt->base);
 }
 
+static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+       struct kvm_segment var;
+       u32 ar;
+
+       vmx_get_segment(vcpu, &var, seg);
+       ar = vmx_segment_access_rights(&var);
+
+       if (var.base != (var.selector << 4))
+               return false;
+       if (var.limit != 0xffff)
+               return false;
+       if (ar != 0xf3)
+               return false;
+
+       return true;
+}
+
+static bool code_segment_valid(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs;
+       unsigned int cs_rpl;
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+
+       if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+               return false;
+       if (!cs.s)
+               return false;
+       if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
+               if (cs.dpl > cs_rpl)
+                       return false;
+       } else if (cs.type & AR_TYPE_CODE_MASK) {
+               if (cs.dpl != cs_rpl)
+                       return false;
+       }
+       if (!cs.present)
+               return false;
+
+       /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
+       return true;
+}
+
+static bool stack_segment_valid(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment ss;
+       unsigned int ss_rpl;
+
+       vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+       ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+
+       if (ss.type != 3 && ss.type != 7)
+               return false;
+       if (!ss.s)
+               return false;
+       if (ss.dpl != ss_rpl) /* DPL != RPL */
+               return false;
+       if (!ss.present)
+               return false;
+
+       return true;
+}
+
+static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
+{
+       struct kvm_segment var;
+       unsigned int rpl;
+
+       vmx_get_segment(vcpu, &var, seg);
+       rpl = var.selector & SELECTOR_RPL_MASK;
+
+       if (!var.s)
+               return false;
+       if (!var.present)
+               return false;
+       if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+               if (var.dpl < rpl) /* DPL < RPL */
+                       return false;
+       }
+
+       /* TODO: Add other members to kvm_segment_field to allow checking for other access
+        * rights flags
+        */
+       return true;
+}
+
+static bool tr_valid(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment tr;
+
+       vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
+
+       if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
+               return false;
+       if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
+               return false;
+       if (!tr.present)
+               return false;
+
+       return true;
+}
+
+static bool ldtr_valid(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment ldtr;
+
+       vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
+
+       if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
+               return false;
+       if (ldtr.type != 2)
+               return false;
+       if (!ldtr.present)
+               return false;
+
+       return true;
+}
+
+static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
+{
+       struct kvm_segment cs, ss;
+
+       vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
+
+       return ((cs.selector & SELECTOR_RPL_MASK) ==
+                (ss.selector & SELECTOR_RPL_MASK));
+}
+
+/*
+ * Check whether the guest state is valid. Returns true if valid,
+ * false if not.
+ * We assume that the registers are always usable.
+ */
+static bool guest_state_valid(struct kvm_vcpu *vcpu)
+{
+       /* real mode guest state checks */
+       if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
+                       return false;
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
+                       return false;
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
+                       return false;
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
+                       return false;
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
+                       return false;
+               if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
+                       return false;
+       } else {
+       /* protected mode guest state checks */
+               if (!cs_ss_rpl_check(vcpu))
+                       return false;
+               if (!code_segment_valid(vcpu))
+                       return false;
+               if (!stack_segment_valid(vcpu))
+                       return false;
+               if (!data_segment_valid(vcpu, VCPU_SREG_DS))
+                       return false;
+               if (!data_segment_valid(vcpu, VCPU_SREG_ES))
+                       return false;
+               if (!data_segment_valid(vcpu, VCPU_SREG_FS))
+                       return false;
+               if (!data_segment_valid(vcpu, VCPU_SREG_GS))
+                       return false;
+               if (!tr_valid(vcpu))
+                       return false;
+               if (!ldtr_valid(vcpu))
+                       return false;
+       }
+       /* TODO:
+        * - Add checks on RIP
+        * - Add checks on RFLAGS
+        */
+
+       return true;
+}
+
 static int init_rmode_tss(struct kvm *kvm)
 {
        gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
        if (r < 0)
                goto out;
        data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
-       r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16));
+       r = kvm_write_guest_page(kvm, fn++, &data,
+                       TSS_IOPB_BASE_OFFSET, sizeof(u16));
        if (r < 0)
                goto out;
        r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
        vmcs_write16(sf->selector, 0);
        vmcs_writel(sf->base, 0);
        vmcs_write32(sf->limit, 0xffff);
-       vmcs_write32(sf->ar_bytes, 0x93);
+       vmcs_write32(sf->ar_bytes, 0xf3);
 }
 
 static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
        if (r)
                goto out;
 
-       down_read(&current->mm->mmap_sem);
        kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
-       up_read(&current->mm->mmap_sem);
 out:
        up_write(&kvm->slots_lock);
        return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
        if (r)
                goto out;
 
-       down_read(&current->mm->mmap_sem);
        kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
                        VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
-       up_read(&current->mm->mmap_sem);
 out:
        up_write(&kvm->slots_lock);
        return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        }
        if (!vm_need_ept())
                exec_control |= CPU_BASED_CR3_STORE_EXITING |
-                               CPU_BASED_CR3_LOAD_EXITING;
+                               CPU_BASED_CR3_LOAD_EXITING  |
+                               CPU_BASED_INVLPG_EXITING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
 
        if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        u64 msr;
        int ret;
 
+       vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
        down_read(&vcpu->kvm->slots_lock);
        if (!init_rmode(vmx->vcpu.kvm)) {
                ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
        fx_init(&vmx->vcpu);
 
+       seg_setup(VCPU_SREG_CS);
        /*
         * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
         * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
                vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
        }
-       vmcs_write32(GUEST_CS_LIMIT, 0xffff);
-       vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
 
        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
        vmcs_writel(GUEST_RFLAGS, 0x02);
        if (vmx->vcpu.vcpu_id == 0)
-               vmcs_writel(GUEST_RIP, 0xfff0);
+               kvm_rip_write(vcpu, 0xfff0);
        else
-               vmcs_writel(GUEST_RIP, 0);
-       vmcs_writel(GUEST_RSP, 0);
+               kvm_rip_write(vcpu, 0);
+       kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
        /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
        vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
        ret = 0;
 
+       /* HACK: Don't enable emulation on guest boot/reset */
+       vmx->emulation_required = 0;
+
 out:
        up_read(&vcpu->kvm->slots_lock);
        return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
        KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
+       ++vcpu->stat.irq_injections;
        if (vcpu->arch.rmode.active) {
                vmx->rmode.irq.pending = true;
                vmx->rmode.irq.vector = irq;
-               vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP);
+               vmx->rmode.irq.rip = kvm_rip_read(vcpu);
                vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                             irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
                vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
-               vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1);
+               kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
                return;
        }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
-       vcpu->arch.nmi_pending = 0;
 }
 
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
        clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
        if (!vcpu->arch.irq_pending[word_index])
                clear_bit(word_index, &vcpu->arch.irq_summary);
-       vmx_inject_irq(vcpu, irq);
+       kvm_queue_interrupt(vcpu, irq);
 }
 
 
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
 
        if (vcpu->arch.interrupt_window_open &&
-           vcpu->arch.irq_summary &&
-           !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
-               /*
-                * If interrupts enabled, and not blocked by sti or mov ss. Good.
-                */
+           vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
                kvm_do_inject_irq(vcpu);
 
+       if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
+               vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        if (!vcpu->arch.interrupt_window_open &&
            (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
                                  int vec, u32 err_code)
 {
-       if (!vcpu->arch.rmode.active)
-               return 0;
-
        /*
         * Instruction with address size override prefix opcode 0x67
         * Cause the #SS fault with 0 error code in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
                if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
                        return 1;
+       /*
+        * Forward all other exceptions that are valid in real mode.
+        * FIXME: Breaks guest debugging in real mode, needs to be fixed with
+        *        the required debugging infrastructure rework.
+        */
+       switch (vec) {
+       case DE_VECTOR:
+       case DB_VECTOR:
+       case BP_VECTOR:
+       case OF_VECTOR:
+       case BR_VECTOR:
+       case UD_VECTOR:
+       case DF_VECTOR:
+       case SS_VECTOR:
+       case GP_VECTOR:
+       case MF_VECTOR:
+               kvm_queue_exception(vcpu, vec);
+               return 1;
+       }
        return 0;
 }
 
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        }
 
        error_code = 0;
-       rip = vmcs_readl(GUEST_RIP);
+       rip = kvm_rip_read(vcpu);
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
        if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                cr2 = vmcs_readl(EXIT_QUALIFICATION);
                KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
                            (u32)((u64)cr2 >> 32), handler);
-               if (vect_info & VECTORING_INFO_VALID_MASK)
+               if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
                        kvm_mmu_unprotect_page_virt(vcpu, cr2);
                return kvm_mmu_page_fault(vcpu, cr2, error_code);
        }
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        reg = (exit_qualification >> 8) & 15;
        switch ((exit_qualification >> 4) & 3) {
        case 0: /* mov to cr */
-               KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
-                           (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
+               KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
+                           (u32)kvm_register_read(vcpu, reg),
+                           (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
+                           handler);
                switch (cr) {
                case 0:
-                       vcpu_load_rsp_rip(vcpu);
-                       kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
+                       kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 3:
-                       vcpu_load_rsp_rip(vcpu);
-                       kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
+                       kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 4:
-                       vcpu_load_rsp_rip(vcpu);
-                       kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
+                       kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
-                       vcpu_load_rsp_rip(vcpu);
-                       kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
+                       kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
                        skip_emulated_instruction(vcpu);
                        if (irqchip_in_kernel(vcpu->kvm))
                                return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                };
                break;
        case 2: /* clts */
-               vcpu_load_rsp_rip(vcpu);
                vmx_fpu_deactivate(vcpu);
                vcpu->arch.cr0 &= ~X86_CR0_TS;
                vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        case 1: /*mov from cr*/
                switch (cr) {
                case 3:
-                       vcpu_load_rsp_rip(vcpu);
-                       vcpu->arch.regs[reg] = vcpu->arch.cr3;
-                       vcpu_put_rsp_rip(vcpu);
+                       kvm_register_write(vcpu, reg, vcpu->arch.cr3);
                        KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
-                                   (u32)vcpu->arch.regs[reg],
-                                   (u32)((u64)vcpu->arch.regs[reg] >> 32),
+                                   (u32)kvm_register_read(vcpu, reg),
+                                   (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
                                    handler);
                        skip_emulated_instruction(vcpu);
                        return 1;
                case 8:
-                       vcpu_load_rsp_rip(vcpu);
-                       vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
-                       vcpu_put_rsp_rip(vcpu);
+                       kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
                        KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
-                                   (u32)vcpu->arch.regs[reg], handler);
+                                   (u32)kvm_register_read(vcpu, reg), handler);
                        skip_emulated_instruction(vcpu);
                        return 1;
                }
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        dr = exit_qualification & 7;
        reg = (exit_qualification >> 8) & 15;
-       vcpu_load_rsp_rip(vcpu);
        if (exit_qualification & 16) {
                /* mov from dr */
                switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                default:
                        val = 0;
                }
-               vcpu->arch.regs[reg] = val;
+               kvm_register_write(vcpu, reg, val);
                KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
        } else {
                /* mov to dr */
        }
-       vcpu_put_rsp_rip(vcpu);
        skip_emulated_instruction(vcpu);
        return 1;
 }
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
+static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+
+       kvm_mmu_invlpg(vcpu, exit_qualification);
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
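[Editor's note] handle_invlpg relies on kvm_mmu_invlpg, which is added in mmu.c outside this excerpt. A sketch of what it is expected to do, assuming it dispatches to the per-mode hook installed from paging_tmpl.h (FNAME(invlpg) above); the flush and stat counter are assumptions:

    void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
    {
            /* dispatch to paging32/paging64 FNAME(invlpg) */
            vcpu->arch.mmu.invlpg(vcpu, gva);
            kvm_mmu_flush_tlb(vcpu);    /* assumed: flush after zapping the spte */
            ++vcpu->stat.invlpg;        /* assumed stat counter */
    }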
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        skip_emulated_instruction(vcpu);
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        return 1;
 }
 
+static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
+                               struct kvm_run *kvm_run)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int err;
+
+       preempt_enable();
+       local_irq_enable();
+
+       while (!guest_state_valid(vcpu)) {
+               err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+               switch (err) {
+                       case EMULATE_DONE:
+                               break;
+                       case EMULATE_DO_MMIO:
+                               kvm_report_emulation_failure(vcpu, "mmio");
+                               /* TODO: Handle MMIO */
+                               return;
+                       default:
+                               kvm_report_emulation_failure(vcpu, "emulation failure");
+                               return;
+               }
+
+               if (signal_pending(current))
+                       break;
+               if (need_resched())
+                       schedule();
+       }
+
+       local_irq_disable();
+       preempt_disable();
+
+       /* Guest state should be valid now; no further emulation should be needed. */
+       vmx->emulation_required = 0;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
        [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
        [EXIT_REASON_HLT]                     = handle_halt,
+       [EXIT_REASON_INVLPG]                  = handle_invlpg,
        [EXIT_REASON_VMCALL]                  = handle_vmcall,
        [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
        [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 vectoring_info = vmx->idt_vectoring_info;
 
-       KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
-                   (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+       KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
+                   (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
        /* CR3 accesses don't cause a VM exit in paging mode, so we
         * need to sync with the guest's real CR3. */
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
                enable_irq_window(vcpu);
 }
 
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 idtv_info_field, intr_info_field, exit_intr_info_field;
-       int vector;
+       u32 exit_intr_info;
+       u32 idt_vectoring_info;
+       bool unblock_nmi;
+       u8 vector;
+       int type;
+       bool idtv_info_valid;
+       u32 error;
 
-       update_tpr_threshold(vcpu);
-
-       intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
-       exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO);
-       idtv_info_field = vmx->idt_vectoring_info;
-       if (intr_info_field & INTR_INFO_VALID_MASK) {
-               if (idtv_info_field & INTR_INFO_VALID_MASK) {
-                       /* TODO: fault when IDT_Vectoring */
-                       if (printk_ratelimit())
-                               printk(KERN_ERR "Fault when IDT_Vectoring\n");
-               }
-               enable_intr_window(vcpu);
-               return;
+       exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       if (cpu_has_virtual_nmis()) {
+               unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+               vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+               /*
+                * SDM 3: 25.7.1.2
+                * Re-set bit "block by NMI" before VM entry if vmexit caused by
+                * a guest IRET fault.
+                */
+               if (unblock_nmi && vector != DF_VECTOR)
+                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                     GUEST_INTR_STATE_NMI);
        }
-       if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
-               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-                   == INTR_TYPE_EXT_INTR
-                   && vcpu->arch.rmode.active) {
-                       u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
-
-                       vmx_inject_irq(vcpu, vect);
-                       enable_intr_window(vcpu);
-                       return;
-               }
-
-               KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
 
+       idt_vectoring_info = vmx->idt_vectoring_info;
+       idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+       vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
+       type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
+       if (vmx->vcpu.arch.nmi_injected) {
                /*
                 * SDM 3: 25.7.1.2
                 * Clear bit "block by NMI" before VM entry if an NMI delivery
                 * faulted.
                 */
-               if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
-                   == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis())
-                       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-                               vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                               ~GUEST_INTR_STATE_NMI);
-
-               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field
-                               & ~INTR_INFO_RESVD_BITS_MASK);
-               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
-
-               if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
-                       vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                               vmcs_read32(IDT_VECTORING_ERROR_CODE));
-               enable_intr_window(vcpu);
-               return;
+               if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
+                       vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                       GUEST_INTR_STATE_NMI);
+               else
+                       vmx->vcpu.arch.nmi_injected = false;
+       }
+       kvm_clear_exception_queue(&vmx->vcpu);
+       if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
+               if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
+                       error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+                       kvm_queue_exception_e(&vmx->vcpu, vector, error);
+               } else
+                       kvm_queue_exception(&vmx->vcpu, vector);
+               vmx->idt_vectoring_info = 0;
        }
+       kvm_clear_interrupt_queue(&vmx->vcpu);
+       if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+               kvm_queue_interrupt(&vmx->vcpu, vector);
+               vmx->idt_vectoring_info = 0;
+       }
+}
+
+static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+{
+       update_tpr_threshold(vcpu);
+
        if (cpu_has_virtual_nmis()) {
-               /*
-                * SDM 3: 25.7.1.2
-                * Re-set bit "block by NMI" before VM entry if vmexit caused by
-                * a guest IRET fault.
-                */
-               if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) &&
-                   (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8)
-                       vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-                               vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) |
-                               GUEST_INTR_STATE_NMI);
-               else if (vcpu->arch.nmi_pending) {
-                       if (vmx_nmi_enabled(vcpu))
-                               vmx_inject_nmi(vcpu);
+               if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+                       if (vmx_nmi_enabled(vcpu)) {
+                               vcpu->arch.nmi_pending = false;
+                               vcpu->arch.nmi_injected = true;
+                       } else {
+                               enable_intr_window(vcpu);
+                               return;
+                       }
+               }
+               if (vcpu->arch.nmi_injected) {
+                       vmx_inject_nmi(vcpu);
                        enable_intr_window(vcpu);
                        return;
                }
-
        }
-       if (!kvm_cpu_has_interrupt(vcpu))
-               return;
-       if (vmx_irq_enabled(vcpu)) {
-               vector = kvm_cpu_get_interrupt(vcpu);
-               vmx_inject_irq(vcpu, vector);
-               kvm_timer_intr_post(vcpu, vector);
-       } else
-               enable_irq_window(vcpu);
+       if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
+               if (vmx_irq_enabled(vcpu))
+                       kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+               else
+                       enable_irq_window(vcpu);
+       }
+       if (vcpu->arch.interrupt.pending) {
+               vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+               kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+       }
 }
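
The rewrite above splits event handling into two halves. vmx_complete_interrupts() runs after every exit and converts whatever the CPU failed to deliver (per the IDT-vectoring info field) back into software queues; vmx_intr_assist() runs before every entry and drains those queues in priority order, NMI ahead of external interrupts. The key state is the pair nmi_pending ("one is wanted") and nmi_injected ("one is in flight"), which survives a faulted delivery so the event is retried. A compact model of that state machine, with stand-in names:

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_vcpu {
            bool nmi_pending;       /* an NMI is waiting to be delivered */
            bool nmi_injected;      /* an NMI is currently in flight */
            bool nmi_window_open;   /* guest can accept an NMI right now */
    };

    /* Before VM entry: promote pending -> injected when the window is open. */
    static void intr_assist(struct toy_vcpu *v)
    {
            if (v->nmi_pending && !v->nmi_injected && v->nmi_window_open) {
                    v->nmi_pending  = false;
                    v->nmi_injected = true;         /* vmx_inject_nmi() here */
            }
    }

    /* After VM exit: a faulted delivery keeps nmi_injected set so the next
     * entry re-injects it; a successful one clears the in-flight flag. */
    static void complete_interrupts(struct toy_vcpu *v, bool delivery_faulted)
    {
            if (v->nmi_injected && !delivery_faulted)
                    v->nmi_injected = false;
    }

    int main(void)
    {
            struct toy_vcpu v = { .nmi_pending = true, .nmi_window_open = true };

            intr_assist(&v);                /* injected on this entry */
            complete_interrupts(&v, true);  /* delivery faulted */
            printf("re-inject next entry: %d\n", v.nmi_injected);
            return 0;
    }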
 
 /*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 {
        vmx->rmode.irq.pending = 0;
-       if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip)
+       if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
                return;
-       vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip);
+       kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
        if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
                vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
                vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
                | vmx->rmode.irq.vector;
 }
 
+#ifdef CONFIG_X86_64
+#define R "r"
+#define Q "q"
+#else
+#define R "e"
+#define Q "l"
+#endif
+
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info;
 
+       /* Handle invalid guest state instead of entering VMX */
+       if (vmx->emulation_required && emulate_invalid_guest_state) {
+               handle_invalid_guest_state(vcpu, kvm_run);
+               return;
+       }
+
+       if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+               vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
+       if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
        /*
         * Loading guest fpu may have cleared host cr0.ts
         */
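
The R and Q macros defined above exist to merge the 32-bit and 64-bit variants of the big inline-asm block that follows: string pasting turns "%%"R"ax" into %rax or %eax and push"Q" into pushq or pushl, so one asm body serves both builds. The same trick in a standalone program (using the compiler's __x86_64__ where the kernel uses CONFIG_X86_64; x86-only, of course):

    #include <stdio.h>

    #ifdef __x86_64__
    #define R "r"   /* %rax, %rsp, ... */
    #else
    #define R "e"   /* %eax, %esp, ... */
    #endif

    int main(void)
    {
            unsigned long sp;

            /* "mov %%"R"sp, %0" pastes to "mov %rsp, %0" or "mov %esp, %0" */
            asm("mov %%" R "sp, %0" : "=r"(sp));
            printf("stack pointer ~ %#lx\n", sp);
            return 0;
    }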
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        asm(
                /* Store host registers */
-#ifdef CONFIG_X86_64
-               "push %%rdx; push %%rbp;"
-               "push %%rcx \n\t"
-#else
-               "push %%edx; push %%ebp;"
-               "push %%ecx \n\t"
-#endif
+               "push %%"R"dx; push %%"R"bp;"
+               "push %%"R"cx \n\t"
+               "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
+               "je 1f \n\t"
+               "mov %%"R"sp, %c[host_rsp](%0) \n\t"
                __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
+               "1: \n\t"
                /* Check if vmlaunch or vmresume is needed */
                "cmpl $0, %c[launched](%0) \n\t"
                /* Load guest registers.  Don't clobber flags. */
+               "mov %c[cr2](%0), %%"R"ax \n\t"
+               "mov %%"R"ax, %%cr2 \n\t"
+               "mov %c[rax](%0), %%"R"ax \n\t"
+               "mov %c[rbx](%0), %%"R"bx \n\t"
+               "mov %c[rdx](%0), %%"R"dx \n\t"
+               "mov %c[rsi](%0), %%"R"si \n\t"
+               "mov %c[rdi](%0), %%"R"di \n\t"
+               "mov %c[rbp](%0), %%"R"bp \n\t"
 #ifdef CONFIG_X86_64
-               "mov %c[cr2](%0), %%rax \n\t"
-               "mov %%rax, %%cr2 \n\t"
-               "mov %c[rax](%0), %%rax \n\t"
-               "mov %c[rbx](%0), %%rbx \n\t"
-               "mov %c[rdx](%0), %%rdx \n\t"
-               "mov %c[rsi](%0), %%rsi \n\t"
-               "mov %c[rdi](%0), %%rdi \n\t"
-               "mov %c[rbp](%0), %%rbp \n\t"
                "mov %c[r8](%0),  %%r8  \n\t"
                "mov %c[r9](%0),  %%r9  \n\t"
                "mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %c[r13](%0), %%r13 \n\t"
                "mov %c[r14](%0), %%r14 \n\t"
                "mov %c[r15](%0), %%r15 \n\t"
-               "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
-#else
-               "mov %c[cr2](%0), %%eax \n\t"
-               "mov %%eax,   %%cr2 \n\t"
-               "mov %c[rax](%0), %%eax \n\t"
-               "mov %c[rbx](%0), %%ebx \n\t"
-               "mov %c[rdx](%0), %%edx \n\t"
-               "mov %c[rsi](%0), %%esi \n\t"
-               "mov %c[rdi](%0), %%edi \n\t"
-               "mov %c[rbp](%0), %%ebp \n\t"
-               "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
 #endif
+               "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
+
                /* Enter guest mode */
                "jne .Llaunched \n\t"
                __ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
                ".Lkvm_vmx_return: "
                /* Save guest registers, load host registers, keep flags */
+               "xchg %0,     (%%"R"sp) \n\t"
+               "mov %%"R"ax, %c[rax](%0) \n\t"
+               "mov %%"R"bx, %c[rbx](%0) \n\t"
+               "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
+               "mov %%"R"dx, %c[rdx](%0) \n\t"
+               "mov %%"R"si, %c[rsi](%0) \n\t"
+               "mov %%"R"di, %c[rdi](%0) \n\t"
+               "mov %%"R"bp, %c[rbp](%0) \n\t"
 #ifdef CONFIG_X86_64
-               "xchg %0,     (%%rsp) \n\t"
-               "mov %%rax, %c[rax](%0) \n\t"
-               "mov %%rbx, %c[rbx](%0) \n\t"
-               "pushq (%%rsp); popq %c[rcx](%0) \n\t"
-               "mov %%rdx, %c[rdx](%0) \n\t"
-               "mov %%rsi, %c[rsi](%0) \n\t"
-               "mov %%rdi, %c[rdi](%0) \n\t"
-               "mov %%rbp, %c[rbp](%0) \n\t"
                "mov %%r8,  %c[r8](%0) \n\t"
                "mov %%r9,  %c[r9](%0) \n\t"
                "mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                "mov %%r13, %c[r13](%0) \n\t"
                "mov %%r14, %c[r14](%0) \n\t"
                "mov %%r15, %c[r15](%0) \n\t"
-               "mov %%cr2, %%rax   \n\t"
-               "mov %%rax, %c[cr2](%0) \n\t"
-
-               "pop  %%rbp; pop  %%rbp; pop  %%rdx \n\t"
-#else
-               "xchg %0, (%%esp) \n\t"
-               "mov %%eax, %c[rax](%0) \n\t"
-               "mov %%ebx, %c[rbx](%0) \n\t"
-               "pushl (%%esp); popl %c[rcx](%0) \n\t"
-               "mov %%edx, %c[rdx](%0) \n\t"
-               "mov %%esi, %c[rsi](%0) \n\t"
-               "mov %%edi, %c[rdi](%0) \n\t"
-               "mov %%ebp, %c[rbp](%0) \n\t"
-               "mov %%cr2, %%eax  \n\t"
-               "mov %%eax, %c[cr2](%0) \n\t"
-
-               "pop %%ebp; pop %%ebp; pop %%edx \n\t"
 #endif
+               "mov %%cr2, %%"R"ax   \n\t"
+               "mov %%"R"ax, %c[cr2](%0) \n\t"
+
+               "pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
                "setbe %c[fail](%0) \n\t"
              : : "c"(vmx), "d"((unsigned long)HOST_RSP),
                [launched]"i"(offsetof(struct vcpu_vmx, launched)),
                [fail]"i"(offsetof(struct vcpu_vmx, fail)),
+               [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
                [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
                [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
                [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
                [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
              : "cc", "memory"
+               , R"bx", R"di", R"si"
 #ifdef CONFIG_X86_64
-               , "rbx", "rdi", "rsi"
                , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
-               , "ebx", "edi", "rsi"
 #endif
              );
 
+       vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
+       vcpu->arch.regs_dirty = 0;
+
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
        if (vmx->rmode.irq.pending)
                fixup_rmode_irq(vmx);
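
This is the payoff of the register-caching rework that retires the bulk cache_regs()/decache_regs() hooks: each register is fetched lazily on first read (tracked in a regs_avail bitmap) and written back before entry only if modified (regs_dirty), and after an exit only RIP and RSP are marked stale, because the asm block above already saved every GPR into vcpu->arch.regs while RIP and RSP live in the VMCS. A userspace model of the two-bitmap cache — names are illustrative:

    #include <stdio.h>

    enum { REG_RAX, REG_RSP, REG_RIP, NR_REGS };

    /* stands in for the VMCS fields */
    static unsigned long vmcs[NR_REGS] = { 0x1111, 0x2222, 0x3333 };

    struct toy_vcpu {
            unsigned long regs[NR_REGS];
            unsigned int regs_avail;    /* bit set: regs[i] is up to date */
            unsigned int regs_dirty;    /* bit set: regs[i] needs write-back */
    };

    static unsigned long reg_read(struct toy_vcpu *v, int reg)
    {
            if (!(v->regs_avail & (1u << reg))) {   /* first touch: */
                    v->regs[reg] = vmcs[reg];       /* ... vmcs_readl() */
                    v->regs_avail |= 1u << reg;
            }
            return v->regs[reg];
    }

    static void reg_write(struct toy_vcpu *v, int reg, unsigned long val)
    {
            v->regs[reg] = val;
            v->regs_avail |= 1u << reg;
            v->regs_dirty |= 1u << reg;             /* flush before entry */
    }

    static void flush_before_entry(struct toy_vcpu *v)
    {
            int i;

            for (i = 0; i < NR_REGS; i++)
                    if (v->regs_dirty & (1u << i))
                            vmcs[i] = v->regs[i];   /* ... vmcs_writel() */
            v->regs_dirty = 0;
    }

    int main(void)
    {
            struct toy_vcpu v = { { 0 }, 0, 0 };

            reg_write(&v, REG_RIP, reg_read(&v, REG_RIP) + 2);  /* skip insn */
            flush_before_entry(&v);
            /* after the next exit, only RIP and RSP go stale again: */
            v.regs_avail = ~((1u << REG_RIP) | (1u << REG_RSP));
            printf("vmcs rip = %#lx\n", vmcs[REG_RIP]);
            return 0;
    }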
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                KVMTRACE_0D(NMI, vcpu, handler);
                asm("int $2");
        }
+
+       vmx_complete_interrupts(vmx);
 }
 
+#undef R
+#undef Q
+
 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .set_idt = vmx_set_idt,
        .get_gdt = vmx_get_gdt,
        .set_gdt = vmx_set_gdt,
-       .cache_regs = vcpu_load_rsp_rip,
-       .decache_regs = vcpu_put_rsp_rip,
+       .cache_reg = vmx_cache_reg,
        .get_rflags = vmx_get_rflags,
        .set_rflags = vmx_set_rflags,
 
index 17e25995b65b62986ec2d4bbff7f7e9ce6d162f6..3e010d21fdd71afdf65cdb086fb158632e391abf 100644 (file)
@@ -331,9 +331,6 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define MSR_IA32_FEATURE_CONTROL_LOCKED         0x1
-#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED  0x4
-
 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     10
 
index 19afbb644c7f7649420d1bbc5a7cba209e0b3f2b..4f0677d1eae8f495147b2184474f5806c81303bd 100644 (file)
@@ -4,10 +4,14 @@
  * derived from drivers/kvm/kvm_main.c
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Amit Shah    <amit.shah@qumranet.com>
+ *   Ben-Ami Yassour <benami@il.ibm.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
+#include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/clocksource.h>
+#include <linux/interrupt.h>
 #include <linux/kvm.h>
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
                                    struct kvm_cpuid_entry2 __user *entries);
 
 struct kvm_x86_ops *kvm_x86_ops;
+EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "fpu_reload", VCPU_STAT(fpu_reload) },
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "irq_injections", VCPU_STAT(irq_injections) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "mmu_flooded", VM_STAT(mmu_flooded) },
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
+       { "mmu_unsync", VM_STAT(mmu_unsync) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { "largepages", VM_STAT(lpages) },
        { NULL }
 };
 
-
 unsigned long segment_base(u16 selector)
 {
        struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
        if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+               kvm_mmu_sync_roots(vcpu);
                kvm_mmu_flush_tlb(vcpu);
                return;
        }
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
                        __func__, data);
                break;
+       case MSR_IA32_DEBUGCTLMSR:
+               if (!data) {
+                       /* We support the non-activated case already */
+                       break;
+               } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
+                       /* Values other than LBR and BTF are vendor-specific,
+                          thus reserved and should throw a #GP */
+                       return 1;
+               }
+               pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+                       __func__, data);
+               break;
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_UCODE_WRITE:
                break;
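
The DEBUGCTL case above is the standard reserved-bit MSR check: zero is accepted silently, any bit outside LBR|BTF is vendor-specific and therefore reserved, so the write fails and the caller injects #GP; a legal nonzero value is logged and dropped. The validation pattern on its own:

    #include <stdio.h>
    #include <stdint.h>

    #define DEBUGCTLMSR_LBR (1ULL << 0)
    #define DEBUGCTLMSR_BTF (1ULL << 1)

    /* Returns 0 on success, 1 to tell the caller to inject #GP. */
    static int set_debugctl(uint64_t data)
    {
            if (!data)
                    return 0;   /* the non-activated case, supported as-is */
            if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF))
                    return 1;   /* reserved bits set -> #GP */
            fprintf(stderr, "DEBUGCTL %#llx accepted but ignored (nop)\n",
                    (unsigned long long)data);
            return 0;
    }

    int main(void)
    {
            printf("%d %d %d\n", set_debugctl(0),
                   set_debugctl(DEBUGCTLMSR_BTF),
                   set_debugctl(1ULL << 5));    /* prints: 0 0 1 */
            return 0;
    }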
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
                /* ...but clean it before doing the actual write */
                vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 
-               down_read(&current->mm->mmap_sem);
                vcpu->arch.time_page =
                                gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-               up_read(&current->mm->mmap_sem);
 
                if (is_error_page(vcpu->arch.time_page)) {
                        kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
        case MSR_IA32_MC0_MISC+8:
        case MSR_IA32_MC0_MISC+12:
        case MSR_IA32_MC0_MISC+16:
+       case MSR_IA32_MC0_MISC+20:
        case MSR_IA32_UCODE_REV:
        case MSR_IA32_EBL_CR_POWERON:
+       case MSR_IA32_DEBUGCTLMSR:
+       case MSR_IA32_LASTBRANCHFROMIP:
+       case MSR_IA32_LASTBRANCHTOIP:
+       case MSR_IA32_LASTINTFROMIP:
+       case MSR_IA32_LASTINTTOIP:
                data = 0;
                break;
        case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PV_MMU:
                r = !tdp_enabled;
                break;
+       case KVM_CAP_IOMMU:
+               r = intel_iommu_found();
+               break;
        default:
                r = 0;
                break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
+       struct kvm_lapic_state *lapic = NULL;
 
        switch (ioctl) {
        case KVM_GET_LAPIC: {
-               struct kvm_lapic_state lapic;
+               lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
-               memset(&lapic, 0, sizeof lapic);
-               r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
+               r = -ENOMEM;
+               if (!lapic)
+                       goto out;
+               r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &lapic, sizeof lapic))
+               if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_LAPIC: {
-               struct kvm_lapic_state lapic;
-
+               lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!lapic)
+                       goto out;
                r = -EFAULT;
-               if (copy_from_user(&lapic, argp, sizeof lapic))
+               if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
                        goto out;
-               r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);;
+               r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
                if (r)
                        goto out;
                r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = -EINVAL;
        }
 out:
+       if (lapic)
+               kfree(lapic);
        return r;
 }
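
struct kvm_lapic_state holds the full APIC register page (roughly 1 KB), too much to keep on a kernel stack, so the ioctl now heap-allocates it and routes every exit path through one cleanup label. (The if (lapic) guard before kfree() is belt-and-braces; kfree(NULL) is already a no-op.) The same allocate/fill/copy/free shape in userspace terms:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct lapic_state { char regs[1024]; };    /* too big for a kernel stack */

    static int get_lapic(struct lapic_state *s)
    {
            memset(s, 0, sizeof(*s));   /* fill from the in-kernel APIC model */
            return 0;
    }

    static long ioctl_get_lapic(void *user_buf)
    {
            struct lapic_state *lapic = calloc(1, sizeof(*lapic));
            long r = -ENOMEM;

            if (!lapic)
                    goto out;
            r = get_lapic(lapic);
            if (r)
                    goto out;
            memcpy(user_buf, lapic, sizeof(*lapic));    /* copy_to_user() */
            r = 0;
    out:
            free(lapic);                /* free(NULL) is fine */
            return r;
    }

    int main(void)
    {
            static char buf[sizeof(struct lapic_state)];
            printf("r=%ld\n", ioctl_get_lapic(buf));
            return 0;
    }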
 
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r = -EINVAL;
+       /*
+        * This union makes it completely explicit to gcc-3.x
+        * that these two variables' stack usage should be
+        * combined, not added together.
+        */
+       union {
+               struct kvm_pit_state ps;
+               struct kvm_memory_alias alias;
+       } u;
 
        switch (ioctl) {
        case KVM_SET_TSS_ADDR:
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
        case KVM_GET_NR_MMU_PAGES:
                r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
                break;
-       case KVM_SET_MEMORY_ALIAS: {
-               struct kvm_memory_alias alias;
-
+       case KVM_SET_MEMORY_ALIAS:
                r = -EFAULT;
-               if (copy_from_user(&alias, argp, sizeof alias))
+               if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
                        goto out;
-               r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
+               r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
                if (r)
                        goto out;
                break;
-       }
        case KVM_CREATE_IRQCHIP:
                r = -ENOMEM;
                kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        goto out;
                if (irqchip_in_kernel(kvm)) {
                        mutex_lock(&kvm->lock);
-                       if (irq_event.irq < 16)
-                               kvm_pic_set_irq(pic_irqchip(kvm),
-                                       irq_event.irq,
-                                       irq_event.level);
-                       kvm_ioapic_set_irq(kvm->arch.vioapic,
-                                       irq_event.irq,
-                                       irq_event.level);
+                       kvm_set_irq(kvm, irq_event.irq, irq_event.level);
                        mutex_unlock(&kvm->lock);
                        r = 0;
                }
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
        }
        case KVM_GET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
+               struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
+               r = -ENOMEM;
+               if (!chip)
                        goto out;
+               r = -EFAULT;
+               if (copy_from_user(chip, argp, sizeof *chip))
+                       goto get_irqchip_out;
                r = -ENXIO;
                if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
+                       goto get_irqchip_out;
+               r = kvm_vm_ioctl_get_irqchip(kvm, chip);
                if (r)
-                       goto out;
+                       goto get_irqchip_out;
                r = -EFAULT;
-               if (copy_to_user(argp, &chip, sizeof chip))
-                       goto out;
+               if (copy_to_user(argp, chip, sizeof *chip))
+                       goto get_irqchip_out;
                r = 0;
+       get_irqchip_out:
+               kfree(chip);
+               if (r)
+                       goto out;
                break;
        }
        case KVM_SET_IRQCHIP: {
                /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-               struct kvm_irqchip chip;
+               struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
 
-               r = -EFAULT;
-               if (copy_from_user(&chip, argp, sizeof chip))
+               r = -ENOMEM;
+               if (!chip)
                        goto out;
+               r = -EFAULT;
+               if (copy_from_user(chip, argp, sizeof *chip))
+                       goto set_irqchip_out;
                r = -ENXIO;
                if (!irqchip_in_kernel(kvm))
-                       goto out;
-               r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
+                       goto set_irqchip_out;
+               r = kvm_vm_ioctl_set_irqchip(kvm, chip);
                if (r)
-                       goto out;
+                       goto set_irqchip_out;
                r = 0;
+       set_irqchip_out:
+               kfree(chip);
+               if (r)
+                       goto out;
                break;
        }
        case KVM_GET_PIT: {
-               struct kvm_pit_state ps;
                r = -EFAULT;
-               if (copy_from_user(&ps, argp, sizeof ps))
+               if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
                        goto out;
                r = -ENXIO;
                if (!kvm->arch.vpit)
                        goto out;
-               r = kvm_vm_ioctl_get_pit(kvm, &ps);
+               r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &ps, sizeof ps))
+               if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_PIT: {
-               struct kvm_pit_state ps;
                r = -EFAULT;
-               if (copy_from_user(&ps, argp, sizeof ps))
+               if (copy_from_user(&u.ps, argp, sizeof u.ps))
                        goto out;
                r = -ENXIO;
                if (!kvm->arch.vpit)
                        goto out;
-               r = kvm_vm_ioctl_set_pit(kvm, &ps);
+               r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
                if (r)
                        goto out;
                r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 
                val = *(u64 *)new;
 
-               down_read(&current->mm->mmap_sem);
                page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-               up_read(&current->mm->mmap_sem);
 
                kaddr = kmap_atomic(page, KM_USER0);
                set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 {
+       kvm_mmu_invlpg(vcpu, address);
        return X86EMUL_CONTINUE;
 }
 
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 {
        u8 opcodes[4];
-       unsigned long rip = vcpu->arch.rip;
+       unsigned long rip = kvm_rip_read(vcpu);
        unsigned long rip_linear;
 
        if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
 };
 
+static void cache_all_regs(struct kvm_vcpu *vcpu)
+{
+       kvm_register_read(vcpu, VCPU_REGS_RAX);
+       kvm_register_read(vcpu, VCPU_REGS_RSP);
+       kvm_register_read(vcpu, VCPU_REGS_RIP);
+       vcpu->arch.regs_dirty = ~0;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
                        struct kvm_run *run,
                        unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
        int r;
        struct decode_cache *c;
 
+       kvm_clear_exception_queue(vcpu);
        vcpu->arch.mmio_fault_cr2 = cr2;
-       kvm_x86_ops->cache_regs(vcpu);
+       /*
+        * TODO: fix x86_emulate.c to use guest_read/write_register
+        * instead of direct ->regs accesses; it can save a hundred
+        * cycles on Intel for instructions that don't read/change RSP,
+        * for example.
+        */
+       cache_all_regs(vcpu);
 
        vcpu->mmio_is_write = 0;
        vcpu->arch.pio.string = 0;
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                return EMULATE_DO_MMIO;
        }
 
-       kvm_x86_ops->decache_regs(vcpu);
        kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
        if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
        struct kvm_pio_request *io = &vcpu->arch.pio;
        long delta;
        int r;
-
-       kvm_x86_ops->cache_regs(vcpu);
+       unsigned long val;
 
        if (!io->string) {
-               if (io->in)
-                       memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data,
-                              io->size);
+               if (io->in) {
+                       val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+                       memcpy(&val, vcpu->arch.pio_data, io->size);
+                       kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+               }
        } else {
                if (io->in) {
                        r = pio_copy_data(vcpu);
-                       if (r) {
-                               kvm_x86_ops->cache_regs(vcpu);
+                       if (r)
                                return r;
-                       }
                }
 
                delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
                         * The size of the register should really depend on
                         * current address size.
                         */
-                       vcpu->arch.regs[VCPU_REGS_RCX] -= delta;
+                       val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+                       val -= delta;
+                       kvm_register_write(vcpu, VCPU_REGS_RCX, val);
                }
                if (io->down)
                        delta = -delta;
                delta *= io->size;
-               if (io->in)
-                       vcpu->arch.regs[VCPU_REGS_RDI] += delta;
-               else
-                       vcpu->arch.regs[VCPU_REGS_RSI] += delta;
+               if (io->in) {
+                       val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+                       val += delta;
+                       kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+               } else {
+                       val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+                       val += delta;
+                       kvm_register_write(vcpu, VCPU_REGS_RSI, val);
+               }
        }
 
-       kvm_x86_ops->decache_regs(vcpu);
-
        io->count -= io->cur_count;
        io->cur_count = 0;
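
For string I/O, complete_pio() now fixes up the registers through the accessors: a REP-prefixed instruction consumes iterations from RCX, and the cursor register — RDI for INS (writes to memory), RSI for OUTS (reads from memory) — moves by count times size, negated when EFLAGS.DF says walk backwards. A runnable model with stand-in fields:

    #include <stdio.h>

    struct pio { int in, rep, down, size, count; };

    /* Register deltas complete_pio() applies after a string I/O burst. */
    static void fixup_string_pio(const struct pio *io, unsigned long *rcx,
                                 unsigned long *rsi, unsigned long *rdi)
    {
            long delta = 1;

            if (io->rep) {
                    delta = io->count;
                    *rcx -= delta;      /* iterations consumed */
            }
            if (io->down)
                    delta = -delta;     /* direction flag set: go backwards */
            delta *= io->size;
            if (io->in)
                    *rdi += delta;      /* INS advances ES:RDI */
            else
                    *rsi += delta;      /* OUTS advances DS:RSI */
    }

    int main(void)
    {
            struct pio io = { .in = 0, .rep = 1, .down = 0,
                              .size = 2, .count = 4 };
            unsigned long rcx = 4, rsi = 0x100, rdi = 0x200;

            fixup_string_pio(&io, &rcx, &rsi, &rdi);
            /* prints: rcx=0 rsi=0x108 rdi=0x200 */
            printf("rcx=%lu rsi=%#lx rdi=%#lx\n", rcx, rsi, rdi);
            return 0;
    }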
 
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                  int size, unsigned port)
 {
        struct kvm_io_device *pio_dev;
+       unsigned long val;
 
        vcpu->run->exit_reason = KVM_EXIT_IO;
        vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
                KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
                            handler);
 
-       kvm_x86_ops->cache_regs(vcpu);
-       memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
+       val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       memcpy(vcpu->arch.pio_data, &val, 4);
 
        kvm_x86_ops->skip_emulated_instruction(vcpu);
 
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
        KVMTRACE_0D(HLT, vcpu, handler);
        if (irqchip_in_kernel(vcpu->kvm)) {
                vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
-               up_read(&vcpu->kvm->slots_lock);
-               kvm_vcpu_block(vcpu);
-               down_read(&vcpu->kvm->slots_lock);
-               if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-                       return -EINTR;
                return 1;
        } else {
                vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        unsigned long nr, a0, a1, a2, a3, ret;
        int r = 1;
 
-       kvm_x86_ops->cache_regs(vcpu);
-
-       nr = vcpu->arch.regs[VCPU_REGS_RAX];
-       a0 = vcpu->arch.regs[VCPU_REGS_RBX];
-       a1 = vcpu->arch.regs[VCPU_REGS_RCX];
-       a2 = vcpu->arch.regs[VCPU_REGS_RDX];
-       a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+       nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
+       a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
+       a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
+       a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
 
        KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
 
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
                ret = -KVM_ENOSYS;
                break;
        }
-       vcpu->arch.regs[VCPU_REGS_RAX] = ret;
-       kvm_x86_ops->decache_regs(vcpu);
+       kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
        ++vcpu->stat.hypercalls;
        return r;
 }
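
The hypercall ABI is visible right in the hunk: the number travels in RAX, up to four arguments in RBX/RCX/RDX/RSI, and the return value comes back in RAX. On the guest side that is one vmcall (Intel) or vmmcall (AMD) instruction — kvm_fix_hypercall() below patches in whichever opcode matches the host. A guest-side wrapper sketch for a single argument; it compiles anywhere but only makes sense executed inside a KVM guest:

    /* One-argument KVM hypercall from a guest (Intel vmcall shown). */
    static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
    {
            long ret;

            asm volatile("vmcall"
                         : "=a"(ret)            /* return value in RAX */
                         : "a"(nr), "b"(p1)     /* nr in RAX, arg0 in RBX */
                         : "memory");
            return ret;
    }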
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 {
        char instruction[3];
        int ret = 0;
+       unsigned long rip = kvm_rip_read(vcpu);
 
 
        /*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
         */
        kvm_mmu_zap_all(vcpu->kvm);
 
-       kvm_x86_ops->cache_regs(vcpu);
        kvm_x86_ops->patch_hypercall(vcpu, instruction);
-       if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu)
+       if (emulator_write_emulated(rip, instruction, 3, vcpu)
            != X86EMUL_CONTINUE)
                ret = -EFAULT;
 
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
        u32 function, index;
        struct kvm_cpuid_entry2 *e, *best;
 
-       kvm_x86_ops->cache_regs(vcpu);
-       function = vcpu->arch.regs[VCPU_REGS_RAX];
-       index = vcpu->arch.regs[VCPU_REGS_RCX];
-       vcpu->arch.regs[VCPU_REGS_RAX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RBX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RCX] = 0;
-       vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+       function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+       kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+       kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+       kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+       kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
        best = NULL;
        for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
                e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
                                best = e;
        }
        if (best) {
-               vcpu->arch.regs[VCPU_REGS_RAX] = best->eax;
-               vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx;
-               vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx;
-               vcpu->arch.regs[VCPU_REGS_RDX] = best->edx;
+               kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+               kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+               kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+               kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
        }
-       kvm_x86_ops->decache_regs(vcpu);
        kvm_x86_ops->skip_emulated_instruction(vcpu);
        KVMTRACE_5D(CPUID, vcpu, function,
-                   (u32)vcpu->arch.regs[VCPU_REGS_RAX],
-                   (u32)vcpu->arch.regs[VCPU_REGS_RBX],
-                   (u32)vcpu->arch.regs[VCPU_REGS_RCX],
-                   (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
+                   (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
+                   (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
+                   (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
+                   (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
        if (!apic || !apic->vapic_addr)
                return;
 
-       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
-       up_read(&current->mm->mmap_sem);
 
        vcpu->arch.apic->vapic_page = page;
 }
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
        up_read(&vcpu->kvm->slots_lock);
 }
 
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
        int r;
 
-       if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-               pr_debug("vcpu %d received sipi with vector # %x\n",
-                      vcpu->vcpu_id, vcpu->arch.sipi_vector);
-               kvm_lapic_reset(vcpu);
-               r = kvm_x86_ops->vcpu_reset(vcpu);
-               if (r)
-                       return r;
-               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-       }
-
-       down_read(&vcpu->kvm->slots_lock);
-       vapic_enter(vcpu);
-
-preempted:
-       if (vcpu->guest_debug.enabled)
-               kvm_x86_ops->guest_debug_pre(vcpu);
-
-again:
        if (vcpu->requests)
                if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
                        kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
        if (vcpu->requests) {
                if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
                        __kvm_migrate_timers(vcpu);
+               if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+                       kvm_mmu_sync_roots(vcpu);
                if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
                        kvm_x86_ops->tlb_flush(vcpu);
                if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
 
        local_irq_disable();
 
-       if (vcpu->requests || need_resched()) {
+       if (vcpu->requests || need_resched() || signal_pending(current)) {
                local_irq_enable();
                preempt_enable();
                r = 1;
                goto out;
        }
 
-       if (signal_pending(current)) {
-               local_irq_enable();
-               preempt_enable();
-               r = -EINTR;
-               kvm_run->exit_reason = KVM_EXIT_INTR;
-               ++vcpu->stat.signal_exits;
-               goto out;
-       }
+       if (vcpu->guest_debug.enabled)
+               kvm_x86_ops->guest_debug_pre(vcpu);
 
        vcpu->guest_mode = 1;
        /*
@@ -2917,8 +2950,8 @@ again:
         * Profile KVM exit RIPs:
         */
        if (unlikely(prof_on == KVM_PROFILING)) {
-               kvm_x86_ops->cache_regs(vcpu);
-               profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip);
+               unsigned long rip = kvm_rip_read(vcpu);
+               profile_hit(KVM_PROFILING, (void *)rip);
        }
 
        if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
        kvm_lapic_sync_from_vapic(vcpu);
 
        r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
+out:
+       return r;
+}
 
-       if (r > 0) {
-               if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-                       r = -EINTR;
-                       kvm_run->exit_reason = KVM_EXIT_INTR;
-                       ++vcpu->stat.request_irq_exits;
-                       goto out;
-               }
-               if (!need_resched())
-                       goto again;
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+       int r;
+
+       if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
+               pr_debug("vcpu %d received sipi with vector # %x\n",
+                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
+               kvm_lapic_reset(vcpu);
+               r = kvm_x86_ops->vcpu_reset(vcpu);
+               if (r)
+                       return r;
+               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
        }
 
-out:
-       up_read(&vcpu->kvm->slots_lock);
-       if (r > 0) {
-               kvm_resched(vcpu);
-               down_read(&vcpu->kvm->slots_lock);
-               goto preempted;
+       down_read(&vcpu->kvm->slots_lock);
+       vapic_enter(vcpu);
+
+       r = 1;
+       while (r > 0) {
+               if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+                       r = vcpu_enter_guest(vcpu, kvm_run);
+               else {
+                       up_read(&vcpu->kvm->slots_lock);
+                       kvm_vcpu_block(vcpu);
+                       down_read(&vcpu->kvm->slots_lock);
+                       if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+                               if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+                                       vcpu->arch.mp_state =
+                                                       KVM_MP_STATE_RUNNABLE;
+                       if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
+                               r = -EINTR;
+               }
+
+               if (r > 0) {
+                       if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+                               r = -EINTR;
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               ++vcpu->stat.request_irq_exits;
+                       }
+                       if (signal_pending(current)) {
+                               r = -EINTR;
+                               kvm_run->exit_reason = KVM_EXIT_INTR;
+                               ++vcpu->stat.signal_exits;
+                       }
+                       if (need_resched()) {
+                               up_read(&vcpu->kvm->slots_lock);
+                               kvm_resched(vcpu);
+                               down_read(&vcpu->kvm->slots_lock);
+                       }
+               }
        }
 
+       up_read(&vcpu->kvm->slots_lock);
        post_kvm_run_save(vcpu, kvm_run);
 
        vapic_exit(vcpu);
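
__vcpu_run() is restructured from the old preempted:/again: goto maze into one flat loop: enter the guest while the vcpu is RUNNABLE, otherwise block in kvm_vcpu_block() until a KVM_REQ_UNHALT request wakes it, and after each pass check for irq-window requests, pending signals, and need_resched(). The control flow, modeled as a terminating toy:

    #include <errno.h>
    #include <stdio.h>

    enum mp_state { MP_RUNNABLE, MP_HALTED };
    struct toy_vcpu { enum mp_state mp_state; };

    static int signal_pending_toy(void)
    {
            static int calls;
            return ++calls > 3;     /* pretend a signal arrives eventually */
    }

    static int enter_guest(struct toy_vcpu *v)
    {
            v->mp_state = MP_HALTED;    /* toy guest executes hlt at once */
            return 1;                   /* exit handled in-kernel, keep going */
    }

    /* Shape of the reworked __vcpu_run(): one loop, no gotos. */
    static int vcpu_run(struct toy_vcpu *v)
    {
            int r = 1;

            while (r > 0) {
                    if (v->mp_state == MP_RUNNABLE)
                            r = enter_guest(v);
                    else
                            /* kvm_vcpu_block() until KVM_REQ_UNHALT */
                            v->mp_state = MP_RUNNABLE;
                    if (r > 0 && signal_pending_toy())
                            r = -EINTR; /* KVM_EXIT_INTR back to userspace */
            }
            return r;
    }

    int main(void)
    {
            struct toy_vcpu v = { MP_RUNNABLE };
            printf("r=%d\n", vcpu_run(&v));
            return 0;
    }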
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
+               clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                r = -EAGAIN;
                goto out;
        }
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                }
        }
 #endif
-       if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
-               kvm_x86_ops->cache_regs(vcpu);
-               vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
-               kvm_x86_ops->decache_regs(vcpu);
-       }
+       if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
+               kvm_register_write(vcpu, VCPU_REGS_RAX,
+                                    kvm_run->hypercall.ret);
 
        r = __vcpu_run(vcpu, kvm_run);
 
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        vcpu_load(vcpu);
 
-       kvm_x86_ops->cache_regs(vcpu);
-
-       regs->rax = vcpu->arch.regs[VCPU_REGS_RAX];
-       regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX];
-       regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX];
-       regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX];
-       regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI];
-       regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI];
-       regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-       regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
+       regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+       regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+       regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+       regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+       regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
+       regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+       regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
 #ifdef CONFIG_X86_64
-       regs->r8 = vcpu->arch.regs[VCPU_REGS_R8];
-       regs->r9 = vcpu->arch.regs[VCPU_REGS_R9];
-       regs->r10 = vcpu->arch.regs[VCPU_REGS_R10];
-       regs->r11 = vcpu->arch.regs[VCPU_REGS_R11];
-       regs->r12 = vcpu->arch.regs[VCPU_REGS_R12];
-       regs->r13 = vcpu->arch.regs[VCPU_REGS_R13];
-       regs->r14 = vcpu->arch.regs[VCPU_REGS_R14];
-       regs->r15 = vcpu->arch.regs[VCPU_REGS_R15];
+       regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
+       regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
+       regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
+       regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
+       regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
+       regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
+       regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
+       regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
 #endif
 
-       regs->rip = vcpu->arch.rip;
+       regs->rip = kvm_rip_read(vcpu);
        regs->rflags = kvm_x86_ops->get_rflags(vcpu);
 
        /*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
        vcpu_load(vcpu);
 
-       vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax;
-       vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx;
-       vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx;
-       vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx;
-       vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi;
-       vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi;
-       vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp;
-       vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp;
+       kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
+       kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
+       kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
+       kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
+       kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
+       kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
+       kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
+       kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
 #ifdef CONFIG_X86_64
-       vcpu->arch.regs[VCPU_REGS_R8] = regs->r8;
-       vcpu->arch.regs[VCPU_REGS_R9] = regs->r9;
-       vcpu->arch.regs[VCPU_REGS_R10] = regs->r10;
-       vcpu->arch.regs[VCPU_REGS_R11] = regs->r11;
-       vcpu->arch.regs[VCPU_REGS_R12] = regs->r12;
-       vcpu->arch.regs[VCPU_REGS_R13] = regs->r13;
-       vcpu->arch.regs[VCPU_REGS_R14] = regs->r14;
-       vcpu->arch.regs[VCPU_REGS_R15] = regs->r15;
+       kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
+       kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
+       kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
+       kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
+       kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
+       kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
+       kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
+       kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+
 #endif
 
-       vcpu->arch.rip = regs->rip;
+       kvm_rip_write(vcpu, regs->rip);
        kvm_x86_ops->set_rflags(vcpu, regs->rflags);
 
-       kvm_x86_ops->decache_regs(vcpu);
 
        vcpu->arch.exception.pending = false;
 
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
+{
+       struct kvm_segment segvar = {
+               .base = selector << 4,
+               .limit = 0xffff,
+               .selector = selector,
+               .type = 3,
+               .present = 1,
+               .dpl = 3,
+               .db = 0,
+               .s = 1,
+               .l = 0,
+               .g = 0,
+               .avl = 0,
+               .unusable = 0,
+       };
+       kvm_x86_ops->set_segment(vcpu, &segvar, seg);
+       return 0;
+}
+
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
                                int type_bits, int seg)
 {
        struct kvm_segment kvm_seg;
 
+       if (!(vcpu->arch.cr0 & X86_CR0_PE))
+               return kvm_load_realmode_segment(vcpu, selector, seg);
        if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
                return 1;
        kvm_seg.type |= type_bits;
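
kvm_load_realmode_segment() encodes the 8086 rules: with CR0.PE clear a selector is just a paragraph number, so the cached descriptor gets base = selector << 4, a 64 KiB limit, and permissive attributes, and no descriptor table is consulted. The address arithmetic on its own — note how F000:FFF0 lands on the classic real-mode alias of the reset vector:

    #include <stdint.h>
    #include <stdio.h>

    /* Real mode: linear = (selector << 4) + offset; segments span 64 KiB. */
    static uint32_t real_mode_linear(uint16_t selector, uint16_t offset)
    {
            return ((uint32_t)selector << 4) + offset;
    }

    int main(void)
    {
            /* F000:FFF0 -> 0xffff0 */
            printf("%#x\n", (unsigned int)real_mode_linear(0xf000, 0xfff0));
            return 0;
    }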
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
                                struct tss_segment_32 *tss)
 {
        tss->cr3 = vcpu->arch.cr3;
-       tss->eip = vcpu->arch.rip;
+       tss->eip = kvm_rip_read(vcpu);
        tss->eflags = kvm_x86_ops->get_rflags(vcpu);
-       tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
-       tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-       tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
-       tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
-       tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
-       tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
-       tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
-       tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
-
+       tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+       tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+       tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+       tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+       tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+       tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
+       tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
        tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 {
        kvm_set_cr3(vcpu, tss->cr3);
 
-       vcpu->arch.rip = tss->eip;
+       kvm_rip_write(vcpu, tss->eip);
        kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
 
-       vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
-       vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
-       vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
-       vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
-       vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
-       vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
-       vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
-       vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+       kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
+       kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
+       kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
+       kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
+       kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
+       kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
+       kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
+       kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
 
        if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
                return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
                                struct tss_segment_16 *tss)
 {
-       tss->ip = vcpu->arch.rip;
+       tss->ip = kvm_rip_read(vcpu);
        tss->flag = kvm_x86_ops->get_rflags(vcpu);
-       tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
-       tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
-       tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
-       tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
-       tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
-       tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
-       tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
-       tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+       tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+       tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+       tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
+       tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
+       tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+       tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+       tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
+       tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
 
        tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
        tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
                                 struct tss_segment_16 *tss)
 {
-       vcpu->arch.rip = tss->ip;
+       kvm_rip_write(vcpu, tss->ip);
        kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
-       vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
-       vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
-       vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
-       vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
-       vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
-       vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
-       vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
-       vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+       kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
+       kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
+       kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
+       kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
+       kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
+       kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
+       kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
+       kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
 
        if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
                return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        }
 
        kvm_x86_ops->skip_emulated_instruction(vcpu);
-       kvm_x86_ops->cache_regs(vcpu);
 
        if (nseg_desc.type & 8)
                ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
        tr_seg.type = 11;
        kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
 out:
-       kvm_x86_ops->decache_regs(vcpu);
        return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                        pr_debug("Set back pending irq %d\n",
                                 pending_vec);
                }
+               kvm_pic_clear_isr_ack(vcpu->kvm);
        }
 
        kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
        kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
        kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
+       /* Older userspace won't unhalt the vcpu on reset. */
+       if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
+           sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
+           !(vcpu->arch.cr0 & X86_CR0_PE))
+               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
        vcpu_put(vcpu);
 
        return 0;
@@ -3918,6 +4011,7 @@ struct  kvm *kvm_arch_create_vm(void)
                return ERR_PTR(-ENOMEM);
 
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
        return kvm;
 }
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       kvm_iommu_unmap_guest(kvm);
+       kvm_free_all_assigned_devices(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
                        userspace_addr = do_mmap(NULL, 0,
                                                 npages * PAGE_SIZE,
                                                 PROT_READ | PROT_WRITE,
-                                                MAP_SHARED | MAP_ANONYMOUS,
+                                                MAP_PRIVATE | MAP_ANONYMOUS,
                                                 0);
                        up_write(&current->mm->mmap_sem);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644 (file)
index 0000000..6a4be78
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef ARCH_X86_KVM_X86_H
+#define ARCH_X86_KVM_X86_H
+
+#include <linux/kvm_host.h>
+
+static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.exception.pending = false;
+}
+
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+{
+       vcpu->arch.interrupt.pending = true;
+       vcpu->arch.interrupt.nr = vector;
+}
+
+static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.interrupt.pending = false;
+}
+
+#endif
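
Note: these helpers formalize one pending exception and one pending external interrupt per vcpu. A hedged sketch of how an injection path might consume them (inject_pending() is illustrative, not a function added by this hunk):

    static void inject_pending(struct kvm_vcpu *vcpu)
    {
            if (vcpu->arch.exception.pending) {
                    /* deliver vcpu->arch.exception.nr via kvm_x86_ops ... */
                    kvm_clear_exception_queue(vcpu);
            } else if (vcpu->arch.interrupt.pending) {
                    /* deliver vcpu->arch.interrupt.nr ... */
                    kvm_clear_interrupt_queue(vcpu);
            }
    }
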
index f2f90468f8b1c90042cc468f77eee4b5b19412ba..ea051173b0da9e950ca92d6841a42c507e957444 100644 (file)
@@ -26,6 +26,7 @@
 #define DPRINTF(_f, _a ...) printf(_f , ## _a)
 #else
 #include <linux/kvm_host.h>
+#include "kvm_cache_regs.h"
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include <linux/module.h>
 #define ImplicitOps (1<<1)     /* Implicit in opcode. No generic decode. */
 #define DstReg      (2<<1)     /* Register operand. */
 #define DstMem      (3<<1)     /* Memory operand. */
-#define DstMask     (3<<1)
+#define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstMask     (7<<1)
 /* Source operand type. */
-#define SrcNone     (0<<3)     /* No source operand. */
-#define SrcImplicit (0<<3)     /* Source operand is implicit in the opcode. */
-#define SrcReg      (1<<3)     /* Register operand. */
-#define SrcMem      (2<<3)     /* Memory operand. */
-#define SrcMem16    (3<<3)     /* Memory operand (16-bit). */
-#define SrcMem32    (4<<3)     /* Memory operand (32-bit). */
-#define SrcImm      (5<<3)     /* Immediate operand. */
-#define SrcImmByte  (6<<3)     /* 8-bit sign-extended immediate operand. */
-#define SrcMask     (7<<3)
+#define SrcNone     (0<<4)     /* No source operand. */
+#define SrcImplicit (0<<4)     /* Source operand is implicit in the opcode. */
+#define SrcReg      (1<<4)     /* Register operand. */
+#define SrcMem      (2<<4)     /* Memory operand. */
+#define SrcMem16    (3<<4)     /* Memory operand (16-bit). */
+#define SrcMem32    (4<<4)     /* Memory operand (32-bit). */
+#define SrcImm      (5<<4)     /* Immediate operand. */
+#define SrcImmByte  (6<<4)     /* 8-bit sign-extended immediate operand. */
+#define SrcMask     (7<<4)
 /* Generic ModRM decode. */
-#define ModRM       (1<<6)
+#define ModRM       (1<<7)
 /* Destination is only written; never read. */
-#define Mov         (1<<7)
-#define BitOp       (1<<8)
-#define MemAbs      (1<<9)      /* Memory operand is absolute displacement */
-#define String      (1<<10)     /* String instruction (rep capable) */
-#define Stack       (1<<11)     /* Stack instruction (push/pop) */
+#define Mov         (1<<8)
+#define BitOp       (1<<9)
+#define MemAbs      (1<<10)      /* Memory operand is absolute displacement */
+#define String      (1<<12)     /* String instruction (rep capable) */
+#define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
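
Note: with DstMask widened to three bits (bits 1-3) and the source field moved up to bits 4-6, destination, source and attribute flags no longer overlap. A hypothetical helper, just to make the layout concrete (decode_fields() is not part of the patch):

    static void decode_fields(u16 d)
    {
            unsigned int dst = d & DstMask;        /* bits 1-3, e.g. DstAcc     */
            unsigned int src = d & SrcMask;        /* bits 4-6, e.g. SrcImmByte */
            int has_modrm    = (d & ModRM) != 0;   /* bit 7                     */

            /* 0x24 ("and al, imm8") now decodes as DstAcc | SrcImmByte:
             * dst == DstAcc, src == SrcImmByte, has_modrm == 0. */
    }
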
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
        /* 0x20 - 0x27 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       SrcImmByte, SrcImm, 0, 0,
+       DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
        /* 0x28 - 0x2F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
        /* 0x38 - 0x3F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+       0, 0,
        /* 0x40 - 0x47 */
        DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
        /* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
        0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
        ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
        ByteOp | ImplicitOps | String, ImplicitOps | String,
-       /* 0xB0 - 0xBF */
-       0, 0, 0, 0, 0, 0, 0, 0,
-       DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+       /* 0xB0 - 0xB7 */
+       ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+       ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+       ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+       ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+       /* 0xB8 - 0xBF */
+       DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+       DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+       DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+       DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
        /* 0xC0 - 0xC7 */
        ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
        0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
        /* 0xD8 - 0xDF */
        0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xE0 - 0xE7 */
-       0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0,
+       SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+       SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        /* 0xE8 - 0xEF */
        ImplicitOps | Stack, SrcImm | ImplicitOps,
        ImplicitOps, SrcImmByte | ImplicitOps,
-       0, 0, 0, 0,
+       SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+       SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
        /* 0xF0 - 0xF7 */
        0, 0, 0, 0,
        ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
        /* 0xF8 - 0xFF */
        ImplicitOps, 0, ImplicitOps, ImplicitOps,
-       0, 0, Group | Group4, Group | Group5,
+       ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
 static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
        ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
        0, 0, 0, 0,
        [Group3*8] =
-       DstMem | SrcImm | ModRM | SrcImm, 0,
-       DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+       DstMem | SrcImm | ModRM, 0,
+       DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
        0, 0, 0, 0,
        [Group4*8] =
        ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
        0, 0, 0, 0, 0, 0,
        [Group5*8] =
-       DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
-       SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+       DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+       SrcMem | ModRM | Stack, 0,
+       SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
        [Group7*8] =
        0, 0, ModRM | SrcMem, ModRM | SrcMem,
        SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        /* Shadow copy of register state. Committed on successful emulation. */
 
        memset(c, 0, sizeof(struct decode_cache));
-       c->eip = ctxt->vcpu->arch.rip;
+       c->eip = kvm_rip_read(ctxt->vcpu);
        ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
        memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1048,6 +1062,23 @@ done_prefixes:
                }
                c->dst.type = OP_MEM;
                break;
+       case DstAcc:
+               c->dst.type = OP_REG;
+               c->dst.bytes = c->op_bytes;
+               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+               switch (c->op_bytes) {
+                       case 1:
+                               c->dst.val = *(u8 *)c->dst.ptr;
+                               break;
+                       case 2:
+                               c->dst.val = *(u16 *)c->dst.ptr;
+                               break;
+                       case 4:
+                               c->dst.val = *(u32 *)c->dst.ptr;
+                               break;
+               }
+               c->dst.orig_val = c->dst.val;
+               break;
        }
 
        if (c->rip_relative)
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
        case 1: /* dec */
                emulate_1op("dec", c->dst, ctxt->eflags);
                break;
+       case 2: /* call near abs */ {
+               long int old_eip;
+               old_eip = c->eip;
+               c->eip = c->src.val;
+               c->src.val = old_eip;
+               emulate_push(ctxt);
+               break;
+       }
        case 4: /* jmp abs */
                c->eip = c->src.val;
                break;
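
Note: the new call-near-absolute case reuses emulate_push() from elsewhere in this file: it saves the old eip as the source value and lets the generic writeback machinery store it at the decremented stack pointer. A sketch of that era's helper, details hedged (not part of this hunk):

    static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
    {
            struct decode_cache *c = &ctxt->decode;

            c->dst.type  = OP_MEM;
            c->dst.bytes = c->op_bytes;
            c->dst.val   = c->src.val;     /* for call: the return eip */
            register_address_increment(c, &c->regs[VCPU_REGS_RSP],
                                       -c->op_bytes);
            c->dst.ptr = (void *)register_address(c, ss_base(ctxt),
                                                  c->regs[VCPU_REGS_RSP]);
    }
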
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        u64 msr_data;
        unsigned long saved_eip = 0;
        struct decode_cache *c = &ctxt->decode;
+       unsigned int port;
+       int io_dir_in;
        int rc = 0;
 
        /* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
        if (c->rep_prefix && (c->d & String)) {
                /* All REP prefixes have the same first termination condition */
                if (c->regs[VCPU_REGS_RCX] == 0) {
-                       ctxt->vcpu->arch.rip = c->eip;
+                       kvm_rip_write(ctxt->vcpu, c->eip);
                        goto done;
                }
                /* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
                                (c->b == 0xae) || (c->b == 0xaf)) {
                        if ((c->rep_prefix == REPE_PREFIX) &&
                                ((ctxt->eflags & EFLG_ZF) == 0)) {
-                                       ctxt->vcpu->arch.rip = c->eip;
+                                       kvm_rip_write(ctxt->vcpu, c->eip);
                                        goto done;
                        }
                        if ((c->rep_prefix == REPNE_PREFIX) &&
                                ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-                               ctxt->vcpu->arch.rip = c->eip;
+                               kvm_rip_write(ctxt->vcpu, c->eip);
                                goto done;
                        }
                }
                c->regs[VCPU_REGS_RCX]--;
-               c->eip = ctxt->vcpu->arch.rip;
+               c->eip = kvm_rip_read(ctxt->vcpu);
        }
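
Note: each REP iteration now round-trips through the rip accessors: the emulator publishes c->eip on every termination check and reloads it when it continues. Conceptually the protocol is the loop below (illustrative model only; the ZF conditions apply just to cmps/scas, and the real code re-enters the emulator once per iteration instead of looping):

    while (regs[VCPU_REGS_RCX] != 0) {
            do_one_string_iteration();              /* hypothetical */
            regs[VCPU_REGS_RCX]--;
            if (rep_prefix == REPE_PREFIX && !(eflags & EFLG_ZF))
                    break;          /* repe/repz: stop when ZF clear */
            if (rep_prefix == REPNE_PREFIX && (eflags & EFLG_ZF))
                    break;          /* repne/repnz: stop when ZF set */
    }
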
 
        if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
              sbb:              /* sbb */
                emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
                break;
-       case 0x20 ... 0x23:
+       case 0x20 ... 0x25:
              and:              /* and */
                emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
                break;
-       case 0x24:              /* and al imm8 */
-               c->dst.type = OP_REG;
-               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-               c->dst.val = *(u8 *)c->dst.ptr;
-               c->dst.bytes = 1;
-               c->dst.orig_val = c->dst.val;
-               goto and;
-       case 0x25:              /* and ax imm16, or eax imm32 */
-               c->dst.type = OP_REG;
-               c->dst.bytes = c->op_bytes;
-               c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-               if (c->op_bytes == 2)
-                       c->dst.val = *(u16 *)c->dst.ptr;
-               else
-                       c->dst.val = *(u32 *)c->dst.ptr;
-               c->dst.orig_val = c->dst.val;
-               goto and;
        case 0x28 ... 0x2d:
              sub:              /* sub */
                emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
        case 0xae ... 0xaf:     /* scas */
                DPRINTF("Urk! I don't handle SCAS.\n");
                goto cannot_emulate;
-       case 0xb8: /* mov r, imm */
+       case 0xb0 ... 0xbf: /* mov r, imm */
                goto mov;
        case 0xc0 ... 0xc1:
                emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
                c->src.val = c->regs[VCPU_REGS_RCX];
                emulate_grp2(ctxt);
                break;
+       case 0xe4:      /* inb */
+       case 0xe5:      /* in */
+               port = insn_fetch(u8, 1, c->eip);
+               io_dir_in = 1;
+               goto do_io;
+       case 0xe6: /* outb */
+       case 0xe7: /* out */
+               port = insn_fetch(u8, 1, c->eip);
+               io_dir_in = 0;
+               goto do_io;
        case 0xe8: /* call (near) */ {
                long int rel;
                switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
                jmp_rel(c, c->src.val);
                c->dst.type = OP_NONE; /* Disable writeback. */
                break;
+       case 0xec: /* in al,dx */
+       case 0xed: /* in (e/r)ax,dx */
+               port = c->regs[VCPU_REGS_RDX];
+               io_dir_in = 1;
+               goto do_io;
+       case 0xee: /* out al,dx */
+       case 0xef: /* out (e/r)ax,dx */
+               port = c->regs[VCPU_REGS_RDX];
+               io_dir_in = 0;
+       do_io:  if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+                                  (c->d & ByteOp) ? 1 : c->op_bytes,
+                                  port) != 0) {
+                       c->eip = saved_eip;
+                       goto cannot_emulate;
+               }
+               return 0;
        case 0xf4:              /* hlt */
                ctxt->vcpu->arch.halt_request = 1;
                break;
@@ -1754,6 +1804,14 @@ special_insn:
                ctxt->eflags |= X86_EFLAGS_IF;
                c->dst.type = OP_NONE;  /* Disable writeback. */
                break;
+       case 0xfc: /* cld */
+               ctxt->eflags &= ~EFLG_DF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
+       case 0xfd: /* std */
+               ctxt->eflags |= EFLG_DF;
+               c->dst.type = OP_NONE;  /* Disable writeback. */
+               break;
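
Note: cld and std only toggle EFLG_DF; the string helpers elsewhere in the file consult that flag to decide whether the index registers advance or retreat. As a hedged illustration (step_index() is hypothetical):

    static void step_index(struct x86_emulate_ctxt *ctxt,
                           struct decode_cache *c, int reg)
    {
            int step = (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
                                                :  c->dst.bytes;
            register_address_increment(c, &c->regs[reg], step);
    }
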
        case 0xfe ... 0xff:     /* Grp4/Grp5 */
                rc = emulate_grp45(ctxt, ops);
                if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
 
        /* Commit shadow register state. */
        memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-       ctxt->vcpu->arch.rip = c->eip;
+       kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
        if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
                                goto done;
 
                        /* Let the processor re-execute the fixed hypercall */
-                       c->eip = ctxt->vcpu->arch.rip;
+                       c->eip = kvm_rip_read(ctxt->vcpu);
                        /* Disable writeback. */
                        c->dst.type = OP_NONE;
                        break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
                rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
                if (rc) {
                        kvm_inject_gp(ctxt->vcpu, 0);
-                       c->eip = ctxt->vcpu->arch.rip;
+                       c->eip = kvm_rip_read(ctxt->vcpu);
                }
                rc = X86EMUL_CONTINUE;
                c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
                rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
                if (rc) {
                        kvm_inject_gp(ctxt->vcpu, 0);
-                       c->eip = ctxt->vcpu->arch.rip;
+                       c->eip = kvm_rip_read(ctxt->vcpu);
                } else {
                        c->regs[VCPU_REGS_RAX] = (u32)msr_data;
                        c->regs[VCPU_REGS_RDX] = msr_data >> 32;
index 004ba86326ae022eb1b9d60a926a9adc6f207582..c9f7cda48ed78ecbe1b421c540bbe2b777681400 100644 (file)
@@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void)
 /* Get the TSC speed from Xen */
 unsigned long xen_tsc_khz(void)
 {
-       u64 xen_khz = 1000000ULL << 32;
-       const struct pvclock_vcpu_time_info *info =
+       struct pvclock_vcpu_time_info *info =
                &HYPERVISOR_shared_info->vcpu_info[0].time;
 
-       do_div(xen_khz, info->tsc_to_system_mul);
-       if (info->tsc_shift < 0)
-               xen_khz <<= -info->tsc_shift;
-       else
-               xen_khz >>= info->tsc_shift;
-
-       return xen_khz;
+       return pvclock_tsc_khz(info);
 }
 
 cycle_t xen_clocksource_read(void)
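
Note: the deleted arithmetic moves behind pvclock_tsc_khz() (declared in the pvclock.h hunk below), so the shared implementation in arch/x86/kernel/pvclock.c presumably performs the same computation:

    unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
    {
            u64 pv_tsc_khz = 1000000ULL << 32;

            do_div(pv_tsc_khz, src->tsc_to_system_mul);
            if (src->tsc_shift < 0)
                    pv_tsc_khz <<= -src->tsc_shift;
            else
                    pv_tsc_khz >>= src->tsc_shift;
            return pv_tsc_khz;
    }
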
index bd2c01674f5ec1cd07063ce499e38e5faf805995..e842e756308a71c470e658164ec6c2163d5e8955 100644 (file)
@@ -28,9 +28,9 @@
 
 #include <linux/pci.h>
 #include <linux/dmar.h>
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
 #include <linux/timer.h>
-#include "iova.h"
-#include "intel-iommu.h"
 
 #undef PREFIX
 #define PREFIX "DMAR:"
index 389fdd6f4a9f6afcdd0877150887dc1f1571a914..fc5f2dbf532383b637258a3aec52024654d6693e 100644 (file)
@@ -33,8 +33,8 @@
 #include <linux/dma-mapping.h>
 #include <linux/mempool.h>
 #include <linux/timer.h>
-#include "iova.h"
-#include "intel-iommu.h"
+#include <linux/iova.h>
+#include <linux/intel-iommu.h>
 #include <asm/proto.h> /* force_iommu in this header in x86-64 */
 #include <asm/cacheflush.h>
 #include <asm/iommu.h>
@@ -156,7 +156,7 @@ static inline void *alloc_domain_mem(void)
        return iommu_kmem_cache_alloc(iommu_domain_cache);
 }
 
-static inline void free_domain_mem(void *vaddr)
+static void free_domain_mem(void *vaddr)
 {
        kmem_cache_free(iommu_domain_cache, vaddr);
 }
@@ -1341,7 +1341,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain)
  * find_domain
  * Note: the domain info is stored in struct pci_dev->dev.archdata.iommu
  */
-struct dmar_domain *
+static struct dmar_domain *
 find_domain(struct pci_dev *pdev)
 {
        struct device_domain_info *info;
@@ -2318,3 +2318,111 @@ int __init intel_iommu_init(void)
        return 0;
 }
 
+void intel_iommu_domain_exit(struct dmar_domain *domain)
+{
+       u64 end;
+
+       /* Domain 0 is reserved, so don't process it */
+       if (!domain)
+               return;
+
+       end = DOMAIN_MAX_ADDR(domain->gaw);
+       end = end & (~PAGE_MASK_4K);
+
+       /* clear ptes */
+       dma_pte_clear_range(domain, 0, end);
+
+       /* free page tables */
+       dma_pte_free_pagetable(domain, 0, end);
+
+       iommu_free_domain(domain);
+       free_domain_mem(domain);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
+
+struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
+{
+       struct dmar_drhd_unit *drhd;
+       struct dmar_domain *domain;
+       struct intel_iommu *iommu;
+
+       drhd = dmar_find_matched_drhd_unit(pdev);
+       if (!drhd) {
+               printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
+               return NULL;
+       }
+
+       iommu = drhd->iommu;
+       if (!iommu) {
+               printk(KERN_ERR
+                       "intel_iommu_domain_alloc: iommu == NULL\n");
+               return NULL;
+       }
+       domain = iommu_alloc_domain(iommu);
+       if (!domain) {
+               printk(KERN_ERR
+                       "intel_iommu_domain_alloc: domain == NULL\n");
+               return NULL;
+       }
+       if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
+               printk(KERN_ERR
+                       "intel_iommu_domain_alloc: domain_init() failed\n");
+               intel_iommu_domain_exit(domain);
+               return NULL;
+       }
+       return domain;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
+
+int intel_iommu_context_mapping(
+       struct dmar_domain *domain, struct pci_dev *pdev)
+{
+       return domain_context_mapping(domain, pdev);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
+
+int intel_iommu_page_mapping(
+       struct dmar_domain *domain, dma_addr_t iova,
+       u64 hpa, size_t size, int prot)
+{
+       return domain_page_mapping(domain, iova, hpa, size, prot);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
+
+void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
+{
+       detach_domain_for_dev(domain, bus, devfn);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
+
+struct dmar_domain *
+intel_iommu_find_domain(struct pci_dev *pdev)
+{
+       return find_domain(pdev);
+}
+EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
+
+int intel_iommu_found(void)
+{
+       return g_num_of_iommus;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_found);
+
+u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
+{
+       struct dma_pte *pte;
+       u64 pfn;
+
+       pfn = 0;
+       pte = addr_to_dma_pte(domain, iova);
+
+       if (pte)
+               pfn = dma_pte_addr(*pte);
+
+       return pfn >> PAGE_SHIFT_4K;
+}
+EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
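
Note: together these exports let virt/kvm/vtd.c build device-passthrough mappings without reaching into VT-d internals. A hedged sketch of the expected call sequence for one guest page (error handling elided; DMA_PTE_* come from dma_remapping.h, gfn_to_gpa()/pfn_to_hpa() from the kvm_host.h hunks below):

    static int map_one_gfn(struct kvm *kvm, gfn_t gfn)
    {
            struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
            pfn_t pfn = gfn_to_pfn(kvm, gfn);

            return intel_iommu_page_mapping(domain,
                                            gfn_to_gpa(gfn),    /* iova */
                                            pfn_to_hpa(pfn),    /* hpa  */
                                            PAGE_SIZE,
                                            DMA_PTE_READ | DMA_PTE_WRITE);
    }
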
index bb642cc5e18c9a410c28b1bb60490550b049f105..738d4c89581cc7a41f5c2746add5dca767ad81be 100644 (file)
@@ -4,7 +4,7 @@
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <asm/io_apic.h>
-#include "intel-iommu.h"
+#include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 
 static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
index 05f2635bbe4e501caa8f59953003ba2f6b9e4610..ca48f0df8ac989bb41889e1efd87db7d5ee5fe10 100644 (file)
@@ -1,4 +1,4 @@
-#include "intel-iommu.h"
+#include <linux/intel-iommu.h>
 
 struct ioapic_scope {
        struct intel_iommu *iommu;
index 3ef4ac064315dd6bd0df4cc86559a34c66cbeb23..2287116e9822472a1ef879bd6facfbd30920b087 100644 (file)
@@ -7,7 +7,7 @@
  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  */
 
-#include "iova.h"
+#include <linux/iova.h>
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit)
index 78e954db1e7f60066ea2cd232bf4d38a99821e57..ba0dd791fadf2d20560b5f16b3c6c0ab72e39789 100644 (file)
@@ -208,26 +208,4 @@ struct kvm_pit_channel_state {
 struct kvm_pit_state {
        struct kvm_pit_channel_state channels[3];
 };
-
-#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
-#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
-#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
-#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
-#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
-#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
-#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
-#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
-#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
-#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
-#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
-#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
-#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
-#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
-#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
-#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
-#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
-#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
-#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
-#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
-
 #endif /* ASM_X86__KVM_H */
index 69794547f514f5f249b65acf55ba8d5fd5c12685..411fb8cfb24e6b4d856dc234b5fe37309fb3a3cb 100644 (file)
 #define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
 
 #define DE_VECTOR 0
+#define DB_VECTOR 1
+#define BP_VECTOR 3
+#define OF_VECTOR 4
+#define BR_VECTOR 5
 #define UD_VECTOR 6
 #define NM_VECTOR 7
 #define DF_VECTOR 8
@@ -65,6 +69,7 @@
 #define SS_VECTOR 12
 #define GP_VECTOR 13
 #define PF_VECTOR 14
+#define MF_VECTOR 16
 #define MC_VECTOR 18
 
 #define SELECTOR_TI_MASK (1 << 2)
@@ -89,7 +94,7 @@ extern struct list_head vm_list;
 struct kvm_vcpu;
 struct kvm;
 
-enum {
+enum kvm_reg {
        VCPU_REGS_RAX = 0,
        VCPU_REGS_RCX = 1,
        VCPU_REGS_RDX = 2,
@@ -108,6 +113,7 @@ enum {
        VCPU_REGS_R14 = 14,
        VCPU_REGS_R15 = 15,
 #endif
+       VCPU_REGS_RIP,
        NR_VCPU_REGS
 };
 
@@ -189,10 +195,20 @@ struct kvm_mmu_page {
                                    */
        int multimapped;         /* More than one parent_pte? */
        int root_count;          /* Currently serving as active root */
+       bool unsync;
+       bool unsync_children;
        union {
                u64 *parent_pte;               /* !multimapped */
                struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
        };
+       DECLARE_BITMAP(unsync_child_bitmap, 512);
+};
+
+struct kvm_pv_mmu_op_buffer {
+       void *ptr;
+       unsigned len;
+       unsigned processed;
+       char buf[512] __aligned(sizeof(long));
 };
 
 /*
@@ -207,6 +223,9 @@ struct kvm_mmu {
        gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
        void (*prefetch_page)(struct kvm_vcpu *vcpu,
                              struct kvm_mmu_page *page);
+       int (*sync_page)(struct kvm_vcpu *vcpu,
+                        struct kvm_mmu_page *sp);
+       void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
        hpa_t root_hpa;
        int root_level;
        int shadow_root_level;
@@ -219,8 +238,13 @@ struct kvm_vcpu_arch {
        int interrupt_window_open;
        unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
        DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
-       unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */
-       unsigned long rip;      /* needs vcpu_load_rsp_rip() */
+       /*
+        * rip and regs accesses must go through
+        * kvm_{register,rip}_{read,write} functions.
+        */
+       unsigned long regs[NR_VCPU_REGS];
+       u32 regs_avail;
+       u32 regs_dirty;
 
        unsigned long cr0;
        unsigned long cr2;
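
Note: regs_avail marks registers already read out of hardware state since the last exit; regs_dirty marks registers that must be written back before the next entry. A vendor backend then implements the cache_reg hook (see the kvm_x86_ops change below) to fault in one register at a time; a hedged sketch, with hw_read_guest_reg() standing in for the real VMCS/VMCB accessor:

    static void example_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
    {
            vcpu->arch.regs[reg] = hw_read_guest_reg(vcpu, reg); /* hypothetical */
            __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
    }
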
@@ -237,6 +261,9 @@ struct kvm_vcpu_arch {
        bool tpr_access_reporting;
 
        struct kvm_mmu mmu;
+       /* only needed on the kvm_pv_mmu_op() path, but it's hot,
+        * so keep it here to avoid an allocation */
+       struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
        struct kvm_mmu_memory_cache mmu_pte_chain_cache;
        struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
@@ -269,6 +296,11 @@ struct kvm_vcpu_arch {
                u32 error_code;
        } exception;
 
+       struct kvm_queued_interrupt {
+               bool pending;
+               u8 nr;
+       } interrupt;
+
        struct {
                int active;
                u8 save_iopl;
@@ -294,6 +326,7 @@ struct kvm_vcpu_arch {
        struct page *time_page;
 
        bool nmi_pending;
+       bool nmi_injected;
 
        u64 mtrr[0x100];
 };
@@ -316,9 +349,12 @@ struct kvm_arch{
         * Hash table of struct kvm_mmu_page.
         */
        struct list_head active_mmu_pages;
+       struct list_head assigned_dev_head;
+       struct dmar_domain *intel_iommu_domain;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
+       struct hlist_head irq_ack_notifier_list;
 
        int round_robin_prev_vcpu;
        unsigned int tss_addr;
@@ -338,6 +374,7 @@ struct kvm_vm_stat {
        u32 mmu_flooded;
        u32 mmu_recycled;
        u32 mmu_cache_miss;
+       u32 mmu_unsync;
        u32 remote_tlb_flush;
        u32 lpages;
 };
@@ -364,6 +401,7 @@ struct kvm_vcpu_stat {
        u32 insn_emulation;
        u32 insn_emulation_fail;
        u32 hypercalls;
+       u32 irq_injections;
 };
 
 struct descriptor_table {
@@ -414,8 +452,7 @@ struct kvm_x86_ops {
        unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
        void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
                       int *exception);
-       void (*cache_regs)(struct kvm_vcpu *vcpu);
-       void (*decache_regs)(struct kvm_vcpu *vcpu);
+       void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
        unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
        void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
@@ -528,6 +565,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
                           u32 error_code);
 
+void kvm_pic_set_irq(void *opaque, int irq, int level);
+
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 void fx_init(struct kvm_vcpu *vcpu);
@@ -550,12 +589,14 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
@@ -686,33 +727,6 @@ enum {
        TASK_SWITCH_GATE = 3,
 };
 
-#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 5, d1, d2, d3, d4, d5)
-#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 4, d1, d2, d3, d4, 0)
-#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 3, d1, d2, d3, 0, 0)
-#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 2, d1, d2, 0, 0, 0)
-#define KVMTRACE_1D(evt, vcpu, d1, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 1, d1, 0, 0, 0, 0)
-#define KVMTRACE_0D(evt, vcpu, name) \
-       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
-                                               vcpu, 0, 0, 0, 0, 0, 0)
-
-#ifdef CONFIG_64BIT
-# define KVM_EX_ENTRY ".quad"
-# define KVM_EX_PUSH "pushq"
-#else
-# define KVM_EX_ENTRY ".long"
-# define KVM_EX_PUSH "pushl"
-#endif
-
 /*
  * Hardware virtualization extension instructions may fault if a
  * reboot turns off virtualization while processes are running.
@@ -724,11 +738,11 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
        "666: " insn "\n\t" \
        ".pushsection .fixup, \"ax\" \n" \
        "667: \n\t" \
-       KVM_EX_PUSH " $666b \n\t" \
+       __ASM_SIZE(push) " $666b \n\t"        \
        "jmp kvm_handle_fault_on_reboot \n\t" \
        ".popsection \n\t" \
        ".pushsection __ex_table, \"a\" \n\t" \
-       KVM_EX_ENTRY " 666b, 667b \n\t" \
+       _ASM_PTR " 666b, 667b \n\t" \
        ".popsection"
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
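
Note: switching to __ASM_SIZE and _ASM_PTR lets the standard asm.h helpers pick pushl/.long versus pushq/.quad, replacing the private KVM_EX_* macros removed above. Intended use is unchanged; illustratively (the "vmxoff" mnemonic is just an example instruction):

    #define __ex(x) __kvm_handle_fault_on_reboot(x)

    static void example_vmxoff(void)
    {
            asm volatile(__ex("vmxoff") : : : "cc");
    }
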
index 0bb43301a2022ca8d3a23f8ce6a3459476caf7b7..dabd10f0bbee25359a54d7104344813371b742ab 100644 (file)
 #define MSR_IA32_EBL_CR_POWERON                0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
+#define FEATURE_CONTROL_LOCKED         (1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED  (1<<2)
+
 #define MSR_IA32_APICBASE              0x0000001b
 #define MSR_IA32_APICBASE_BSP          (1<<8)
 #define MSR_IA32_APICBASE_ENABLE       (1<<11)
index 1a38f68348007a64dfabf488dd623bef2a463f42..ad29e277fd6d6dd9d114e33b6113c42d03bf5d2c 100644 (file)
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
                            struct pvclock_vcpu_time_info *vcpu,
                            struct timespec *ts);
similarity index 92%
rename from drivers/pci/intel-iommu.h
rename to include/linux/intel-iommu.h
index 2142c01e0143c947e353e8ce26b29eff090f3d7b..2e117f30a76ca8a18f328b96f884387534602f29 100644 (file)
 #include <linux/types.h>
 #include <linux/msi.h>
 #include <linux/sysdev.h>
-#include "iova.h"
+#include <linux/iova.h>
 #include <linux/io.h>
+#include <linux/dma_remapping.h>
 #include <asm/cacheflush.h>
-#include "dma_remapping.h"
 
 /*
  * Intel IOMMU register specification per version 1.0 public spec.
@@ -304,4 +304,24 @@ extern int dmar_enable_qi(struct intel_iommu *iommu);
 extern void qi_global_iec(struct intel_iommu *iommu);
 
 extern void qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
+
+void intel_iommu_domain_exit(struct dmar_domain *domain);
+struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev);
+int intel_iommu_context_mapping(struct dmar_domain *domain,
+                               struct pci_dev *pdev);
+int intel_iommu_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
+                            u64 hpa, size_t size, int prot);
+void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn);
+struct dmar_domain *intel_iommu_find_domain(struct pci_dev *pdev);
+u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova);
+
+#ifdef CONFIG_DMAR
+int intel_iommu_found(void);
+#else /* CONFIG_DMAR */
+static inline int intel_iommu_found(void)
+{
+       return 0;
+}
+#endif /* CONFIG_DMAR */
+
 #endif
similarity index 100%
rename from drivers/pci/iova.h
rename to include/linux/iova.h
index 70a30651cd128cd3dbaf91a366886764abdede33..797fcd7812420596f10f4a768b6641e6277455dc 100644 (file)
@@ -311,22 +311,33 @@ struct kvm_s390_interrupt {
 
 /* This structure represents a single trace buffer record. */
 struct kvm_trace_rec {
-       __u32 event:28;
-       __u32 extra_u32:3;
-       __u32 cycle_in:1;
+       /*
+        * rec_val is split into:
+        * bits 0 - 27  -> event id
+        * bits 28 - 30 -> number of extra data args of size u32
+        * bit  31      -> flag: the tsc is included in the record
+        */
+       __u32 rec_val;
        __u32 pid;
        __u32 vcpu_id;
        union {
                struct {
-                       __u64 cycle_u64;
+                       __u64 timestamp;
                        __u32 extra_u32[KVM_TRC_EXTRA_MAX];
-               } __attribute__((packed)) cycle;
+               } __attribute__((packed)) timestamp;
                struct {
                        __u32 extra_u32[KVM_TRC_EXTRA_MAX];
-               } nocycle;
+               } notimestamp;
        } u;
 };
 
+#define TRACE_REC_EVENT_ID(val) \
+               (0x0fffffff & (val))
+#define TRACE_REC_NUM_DATA_ARGS(val) \
+               (0x70000000 & ((val) << 28))
+#define TRACE_REC_TCS(val) \
+               (0x80000000 & ((val) << 31))
+
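
Note: producers OR the three fields together; TRACE_REC_EVENT_ID() masks its argument in place while the other two macros shift theirs up. A short illustrative packing (example_pack() is not part of the patch):

    static void example_pack(struct kvm_trace_rec *rec)
    {
            rec->rec_val = TRACE_REC_EVENT_ID(KVM_TRC_VMMCALL)
                         | TRACE_REC_NUM_DATA_ARGS(2)  /* two extra u32s */
                         | TRACE_REC_TCS(1);           /* tsc included   */
    }
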
 #define KVMIO 0xAE
 
 /*
@@ -372,6 +383,10 @@ struct kvm_trace_rec {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
+#if defined(CONFIG_X86) || defined(CONFIG_IA64)
+#define KVM_CAP_DEVICE_ASSIGNMENT 17
+#endif
+#define KVM_CAP_IOMMU 18
 
 /*
  * ioctls for VM fds
@@ -401,6 +416,10 @@ struct kvm_trace_rec {
                        _IOW(KVMIO,  0x67, struct kvm_coalesced_mmio_zone)
 #define KVM_UNREGISTER_COALESCED_MMIO \
                        _IOW(KVMIO,  0x68, struct kvm_coalesced_mmio_zone)
+#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
+                                  struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
+                           struct kvm_assigned_irq)
 
 /*
  * ioctls for vcpu fds
@@ -440,4 +459,45 @@ struct kvm_trace_rec {
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
 
+#define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
+#define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
+#define KVM_TRC_PEND_INTR        (KVM_TRC_HANDLER + 0x04)
+#define KVM_TRC_IO_READ          (KVM_TRC_HANDLER + 0x05)
+#define KVM_TRC_IO_WRITE         (KVM_TRC_HANDLER + 0x06)
+#define KVM_TRC_CR_READ          (KVM_TRC_HANDLER + 0x07)
+#define KVM_TRC_CR_WRITE         (KVM_TRC_HANDLER + 0x08)
+#define KVM_TRC_DR_READ          (KVM_TRC_HANDLER + 0x09)
+#define KVM_TRC_DR_WRITE         (KVM_TRC_HANDLER + 0x0A)
+#define KVM_TRC_MSR_READ         (KVM_TRC_HANDLER + 0x0B)
+#define KVM_TRC_MSR_WRITE        (KVM_TRC_HANDLER + 0x0C)
+#define KVM_TRC_CPUID            (KVM_TRC_HANDLER + 0x0D)
+#define KVM_TRC_INTR             (KVM_TRC_HANDLER + 0x0E)
+#define KVM_TRC_NMI              (KVM_TRC_HANDLER + 0x0F)
+#define KVM_TRC_VMMCALL          (KVM_TRC_HANDLER + 0x10)
+#define KVM_TRC_HLT              (KVM_TRC_HANDLER + 0x11)
+#define KVM_TRC_CLTS             (KVM_TRC_HANDLER + 0x12)
+#define KVM_TRC_LMSW             (KVM_TRC_HANDLER + 0x13)
+#define KVM_TRC_APIC_ACCESS      (KVM_TRC_HANDLER + 0x14)
+#define KVM_TRC_TDP_FAULT        (KVM_TRC_HANDLER + 0x15)
+#define KVM_TRC_GTLB_WRITE       (KVM_TRC_HANDLER + 0x16)
+#define KVM_TRC_STLB_WRITE       (KVM_TRC_HANDLER + 0x17)
+#define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
+#define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+
+struct kvm_assigned_pci_dev {
+       __u32 assigned_dev_id;
+       __u32 busnr;
+       __u32 devfn;
+       __u32 flags;
+};
+
+struct kvm_assigned_irq {
+       __u32 assigned_dev_id;
+       __u32 host_irq;
+       __u32 guest_irq;
+       __u32 flags;
+};
+
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
+
 #endif
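
Note: from userspace, assignment is a pair of VM ioctls: hand the PCI device to the VM, then route its interrupt. A hedged sketch (error handling omitted; id, bus, devfn and guest_gsi are placeholders, and assigned_dev_id only has to match between the two calls):

    struct kvm_assigned_pci_dev dev = {
            .assigned_dev_id = id,
            .busnr           = bus,
            .devfn           = devfn,
            .flags           = KVM_DEV_ASSIGN_ENABLE_IOMMU,
    };
    struct kvm_assigned_irq irq = {
            .assigned_dev_id = id,
            .host_irq        = 0,          /* 0: keep the device's irq */
            .guest_irq       = guest_gsi,
    };

    ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
    ioctl(vm_fd, KVM_ASSIGN_IRQ, &irq);
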
index 8525afc53107faaaa993fa6ebae6c2d703eb718c..3833c48fae3a372585475eca6cdfd28ef3124d4a 100644 (file)
@@ -34,6 +34,8 @@
 #define KVM_REQ_MMU_RELOAD         3
 #define KVM_REQ_TRIPLE_FAULT       4
 #define KVM_REQ_PENDING_TIMER      5
+#define KVM_REQ_UNHALT             6
+#define KVM_REQ_MMU_SYNC           7
 
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
@@ -279,12 +281,68 @@ void kvm_free_physmem(struct kvm *kvm);
 
 struct  kvm *kvm_arch_create_vm(void);
 void kvm_arch_destroy_vm(struct kvm *kvm);
+void kvm_free_all_assigned_devices(struct kvm *kvm);
 
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 
+int kvm_is_mmio_pfn(pfn_t pfn);
+
+struct kvm_irq_ack_notifier {
+       struct hlist_node link;
+       unsigned gsi;
+       void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
+};
+
+struct kvm_assigned_dev_kernel {
+       struct kvm_irq_ack_notifier ack_notifier;
+       struct work_struct interrupt_work;
+       struct list_head list;
+       int assigned_dev_id;
+       int host_busnr;
+       int host_devfn;
+       int host_irq;
+       int guest_irq;
+       int irq_requested;
+       struct pci_dev *dev;
+       struct kvm *kvm;
+};
+void kvm_set_irq(struct kvm *kvm, int irq, int level);
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+                                  struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+                                    struct kvm_irq_ack_notifier *kian);
+
+#ifdef CONFIG_DMAR
+int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
+                       unsigned long npages);
+int kvm_iommu_map_guest(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *assigned_dev);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+#else /* CONFIG_DMAR */
+static inline int kvm_iommu_map_pages(struct kvm *kvm,
+                                     gfn_t base_gfn,
+                                     unsigned long npages)
+{
+       return 0;
+}
+
+static inline int kvm_iommu_map_guest(struct kvm *kvm,
+                                     struct kvm_assigned_dev_kernel
+                                     *assigned_dev)
+{
+       return -ENODEV;
+}
+
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+       return 0;
+}
+#endif /* CONFIG_DMAR */
+
 static inline void kvm_guest_enter(void)
 {
        account_system_vtime(current);
@@ -307,6 +365,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn)
        return (gpa_t)gfn << PAGE_SHIFT;
 }
 
+static inline hpa_t pfn_to_hpa(pfn_t pfn)
+{
+       return (hpa_t)pfn << PAGE_SHIFT;
+}
+
 static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
 {
        set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
@@ -326,6 +389,25 @@ struct kvm_stats_debugfs_item {
 extern struct kvm_stats_debugfs_item debugfs_entries[];
 extern struct dentry *kvm_debugfs_dir;
 
+#define KVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 5, d1, d2, d3, d4, d5)
+#define KVMTRACE_4D(evt, vcpu, d1, d2, d3, d4, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 4, d1, d2, d3, d4, 0)
+#define KVMTRACE_3D(evt, vcpu, d1, d2, d3, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 3, d1, d2, d3, 0, 0)
+#define KVMTRACE_2D(evt, vcpu, d1, d2, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 2, d1, d2, 0, 0, 0)
+#define KVMTRACE_1D(evt, vcpu, d1, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 1, d1, 0, 0, 0, 0)
+#define KVMTRACE_0D(evt, vcpu, name) \
+       trace_mark(kvm_trace_##name, "%u %p %u %u %u %u %u %u", KVM_TRC_##evt, \
+                                               vcpu, 0, 0, 0, 0, 0, 0)
+
 #ifdef CONFIG_KVM_TRACE
 int kvm_trace_ioctl(unsigned int ioctl, unsigned long arg);
 void kvm_trace_cleanup(void);
index c0d22870ee9c56137583c026b06dee47c66d58d8..53772bb46320ae9e1fbf84297a6a4707e5b3e0ed 100644 (file)
@@ -39,6 +39,7 @@
 
 #include "ioapic.h"
 #include "lapic.h"
+#include "irq.h"
 
 #if 0
 #define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
@@ -285,26 +286,31 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
        }
 }
 
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi)
+static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi,
+                                   int trigger_mode)
 {
        union ioapic_redir_entry *ent;
 
        ent = &ioapic->redirtbl[gsi];
-       ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
 
-       ent->fields.remote_irr = 0;
-       if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
-               ioapic_service(ioapic, gsi);
+       kvm_notify_acked_irq(ioapic->kvm, gsi);
+
+       if (trigger_mode == IOAPIC_LEVEL_TRIG) {
+               ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+               ent->fields.remote_irr = 0;
+               if (!ent->fields.mask && (ioapic->irr & (1 << gsi)))
+                       ioapic_service(ioapic, gsi);
+       }
 }
 
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector)
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 {
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
        int i;
 
        for (i = 0; i < IOAPIC_NUM_PINS; i++)
                if (ioapic->redirtbl[i].fields.vector == vector)
-                       __kvm_ioapic_update_eoi(ioapic, i);
+                       __kvm_ioapic_update_eoi(ioapic, i, trigger_mode);
 }
 
 static int ioapic_in_range(struct kvm_io_device *this, gpa_t addr,
@@ -380,7 +386,7 @@ static void ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
                break;
 #ifdef CONFIG_IA64
        case IOAPIC_REG_EOI:
-               kvm_ioapic_update_eoi(ioapic->kvm, data);
+               kvm_ioapic_update_eoi(ioapic->kvm, data, IOAPIC_LEVEL_TRIG);
                break;
 #endif
 
index 7f16675fe783a8e4d1a8e672a8a4c1c25a1f61a0..cd7ae7691c9d68eb3c066b6c47bbc4749c85e18d 100644 (file)
@@ -58,6 +58,7 @@ struct kvm_ioapic {
        } redirtbl[IOAPIC_NUM_PINS];
        struct kvm_io_device dev;
        struct kvm *kvm;
+       void (*ack_notifier)(void *opaque, int irq);
 };
 
 #ifdef DEBUG
@@ -78,16 +79,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
        return kvm->arch.vioapic;
 }
 
-#ifdef CONFIG_IA64
-static inline int irqchip_in_kernel(struct kvm *kvm)
-{
-       return 1;
-}
-#endif
-
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
                                       unsigned long bitmap);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector);
+void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
new file mode 100644 (file)
index 0000000..d0169f5
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include "irq.h"
+
+#include "ioapic.h"
+
+/* This should be called with the kvm->lock mutex held */
+void kvm_set_irq(struct kvm *kvm, int irq, int level)
+{
+       /* It is not possible to detect whether the guest uses the PIC
+        * or the IOAPIC, so set the irq level on both; the guest will
+        * ignore writes to the one it does not use.
+        */
+       kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level);
+#ifdef CONFIG_X86
+       kvm_pic_set_irq(pic_irqchip(kvm), irq, level);
+#endif
+}
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
+{
+       struct kvm_irq_ack_notifier *kian;
+       struct hlist_node *n;
+
+       hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
+               if (kian->gsi == gsi)
+                       kian->irq_acked(kian);
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+                                  struct kvm_irq_ack_notifier *kian)
+{
+       hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+                                    struct kvm_irq_ack_notifier *kian)
+{
+       hlist_del(&kian->link);
+}
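
Note: a consumer registers a notifier keyed on a gsi; when the guest EOIs that gsi (see the ioapic change above), irq_acked() runs. The assigned-device code in kvm_main.c below is the first user; in miniature (placeholder gsi, and the notifier must outlive its registration):

    static void my_irq_acked(struct kvm_irq_ack_notifier *kian)
    {
            /* the guest acked our gsi; e.g. re-enable the host irq */
    }

    static struct kvm_irq_ack_notifier kian = {
            .gsi       = 10,               /* placeholder */
            .irq_acked = my_irq_acked,
    };

    /* ... kvm_register_irq_ack_notifier(kvm, &kian); */
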
index 7dd9b0b85e4eea1aafaff116e993e6392c9cc173..cf0ab8ed3845c898e927d62608221512148bee4a 100644 (file)
 #include "coalesced_mmio.h"
 #endif
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include "irq.h"
+#endif
+
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
@@ -71,11 +77,253 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 
 bool kvm_rebooting;
 
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+                                                     int assigned_dev_id)
+{
+       struct list_head *ptr;
+       struct kvm_assigned_dev_kernel *match;
+
+       list_for_each(ptr, head) {
+               match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+               if (match->assigned_dev_id == assigned_dev_id)
+                       return match;
+       }
+       return NULL;
+}
+
+static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev;
+
+       assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
+                                   interrupt_work);
+
+       /* kvm->lock is taken here so the irq is injected into the
+        * guest safely. When the interrupt injection (or the ioapic
+        * code) switches to a finer-grained lock, update this.
+        */
+       mutex_lock(&assigned_dev->kvm->lock);
+       kvm_set_irq(assigned_dev->kvm,
+                   assigned_dev->guest_irq, 1);
+       mutex_unlock(&assigned_dev->kvm->lock);
+       kvm_put_kvm(assigned_dev->kvm);
+}
+
+/* FIXME: Implement the OR logic needed to make shared interrupts on
+ * this line behave properly.
+ */
+static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
+{
+       struct kvm_assigned_dev_kernel *assigned_dev =
+               (struct kvm_assigned_dev_kernel *) dev_id;
+
+       kvm_get_kvm(assigned_dev->kvm);
+       schedule_work(&assigned_dev->interrupt_work);
+       disable_irq_nosync(irq);
+       return IRQ_HANDLED;
+}
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+       struct kvm_assigned_dev_kernel *dev;
+
+       if (kian->gsi == -1)
+               return;
+
+       dev = container_of(kian, struct kvm_assigned_dev_kernel,
+                          ack_notifier);
+       kvm_set_irq(dev->kvm, dev->guest_irq, 0);
+       enable_irq(dev->host_irq);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+                                    struct kvm_assigned_dev_kernel
+                                    *assigned_dev)
+{
+       if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+       kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+
+       if (cancel_work_sync(&assigned_dev->interrupt_work))
+               /* We had pending work. That means we will have to take
+                * care of kvm_put_kvm.
+                */
+               kvm_put_kvm(kvm);
+
+       pci_release_regions(assigned_dev->dev);
+       pci_disable_device(assigned_dev->dev);
+       pci_dev_put(assigned_dev->dev);
+
+       list_del(&assigned_dev->list);
+       kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+       struct list_head *ptr, *ptr2;
+       struct kvm_assigned_dev_kernel *assigned_dev;
+
+       list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+               assigned_dev = list_entry(ptr,
+                                         struct kvm_assigned_dev_kernel,
+                                         list);
+
+               kvm_free_assigned_device(kvm, assigned_dev);
+       }
+}
+
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+                                  struct kvm_assigned_irq
+                                  *assigned_irq)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_irq->assigned_dev_id);
+       if (!match) {
+               mutex_unlock(&kvm->lock);
+               return -EINVAL;
+       }
+
+       if (match->irq_requested) {
+               match->guest_irq = assigned_irq->guest_irq;
+               match->ack_notifier.gsi = assigned_irq->guest_irq;
+               mutex_unlock(&kvm->lock);
+               return 0;
+       }
+
+       INIT_WORK(&match->interrupt_work,
+                 kvm_assigned_dev_interrupt_work_handler);
+
+       if (irqchip_in_kernel(kvm)) {
+               if (!capable(CAP_SYS_RAWIO)) {
+                       r = -EPERM;
+                       goto out_release;
+               }
+
+               if (assigned_irq->host_irq)
+                       match->host_irq = assigned_irq->host_irq;
+               else
+                       match->host_irq = match->dev->irq;
+               match->guest_irq = assigned_irq->guest_irq;
+               match->ack_notifier.gsi = assigned_irq->guest_irq;
+               match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+               kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
+
+               /* Even though this is PCI, we don't want to use shared
+                * interrupts. Sharing host devices with guest-assigned devices
+                * on the same interrupt line is not a happy situation: there
+                * are going to be long delays in accepting, acking, etc.
+                */
+               if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
+                               "kvm_assigned_device", (void *)match)) {
+                       r = -EIO;
+                       goto out_release;
+               }
+       }
+
+       match->irq_requested = true;
+       mutex_unlock(&kvm->lock);
+       return r;
+out_release:
+       mutex_unlock(&kvm->lock);
+       kvm_free_assigned_device(kvm, match);
+       return r;
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+                                     struct kvm_assigned_pci_dev *assigned_dev)
+{
+       int r = 0;
+       struct kvm_assigned_dev_kernel *match;
+       struct pci_dev *dev;
+
+       mutex_lock(&kvm->lock);
+
+       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+                                     assigned_dev->assigned_dev_id);
+       if (match) {
+               /* device already assigned */
+               r = -EINVAL;
+               goto out;
+       }
+
+       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+       if (match == NULL) {
+               printk(KERN_INFO "%s: Couldn't allocate memory\n",
+                      __func__);
+               r = -ENOMEM;
+               goto out;
+       }
+       dev = pci_get_bus_and_slot(assigned_dev->busnr,
+                                  assigned_dev->devfn);
+       if (!dev) {
+               printk(KERN_INFO "%s: host device not found\n", __func__);
+               r = -EINVAL;
+               goto out_free;
+       }
+       if (pci_enable_device(dev)) {
+               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+               r = -EBUSY;
+               goto out_put;
+       }
+       r = pci_request_regions(dev, "kvm_assigned_device");
+       if (r) {
+               printk(KERN_INFO "%s: Could not get access to device regions\n",
+                      __func__);
+               goto out_disable;
+       }
+       match->assigned_dev_id = assigned_dev->assigned_dev_id;
+       match->host_busnr = assigned_dev->busnr;
+       match->host_devfn = assigned_dev->devfn;
+       match->dev = dev;
+
+       match->kvm = kvm;
+
+       list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+       if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
+               r = kvm_iommu_map_guest(kvm, match);
+               if (r)
+                       goto out_list_del;
+       }
+
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+out_list_del:
+       list_del(&match->list);
+       pci_release_regions(dev);
+out_disable:
+       pci_disable_device(dev);
+out_put:
+       pci_dev_put(dev);
+out_free:
+       kfree(match);
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+#endif
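
The device side looks similar from userspace. Again hedged: the field names follow struct kvm_assigned_pci_dev as introduced by this series, while assign_device() and DEVFN() are illustrative stand-ins (DEVFN() mirrors the kernel's PCI_DEVFN()).

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Mirrors the kernel's PCI_DEVFN(); defined here so the sketch is
 * self-contained in userspace. */
#define DEVFN(slot, fn)	((((slot) & 0x1f) << 3) | ((fn) & 0x07))

/* Hypothetical helper: hand host device bus:slot.fn to the VM and
 * request VT-d mapping via KVM_DEV_ASSIGN_ENABLE_IOMMU. */
static int assign_device(int vm_fd, __u32 id, __u8 bus, __u8 slot, __u8 fn)
{
	struct kvm_assigned_pci_dev dev;

	memset(&dev, 0, sizeof(dev));
	dev.assigned_dev_id = id;
	dev.busnr = bus;
	dev.devfn = DEVFN(slot, fn);
	dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU; /* ends up in kvm_iommu_map_guest() */
	return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
}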
+
 static inline int valid_vcpu(int n)
 {
        return likely(n >= 0 && n < KVM_MAX_VCPUS);
 }
 
+inline int kvm_is_mmio_pfn(pfn_t pfn)
+{
+       if (pfn_valid(pfn))
+               return PageReserved(pfn_to_page(pfn));
+
+       return true;
+}
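
The new predicate reads most clearly as a three-way case split; restated as a comment (this is a reading of the function above, not new behaviour):

/*
 * kvm_is_mmio_pfn(), restated:
 *   !pfn_valid(pfn)                   -> no struct page at all: treat as MMIO
 *   pfn_valid && PageReserved(page)   -> reserved page (e.g. behind a
 *                                        VM_PFNMAP mapping): treat as MMIO
 *   pfn_valid && !PageReserved(page)  -> ordinary RAM, refcounted as usual
 */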
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
@@ -570,6 +818,12 @@ int __kvm_set_memory_region(struct kvm *kvm,
        }
 
        kvm_free_physmem_slot(&old, &new);
+#ifdef CONFIG_DMAR
+       /* map the pages into the IOMMU page table */
+       r = kvm_iommu_map_pages(kvm, base_gfn, npages);
+       if (r)
+               goto out;
+#endif
        return 0;
 
 out_free:
@@ -708,9 +962,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-/*
- * Requires current->mm->mmap_sem to be held
- */
 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
 {
        struct page *page[1];
@@ -726,21 +977,24 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
                return page_to_pfn(bad_page);
        }
 
-       npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
-                               NULL);
+       npages = get_user_pages_fast(addr, 1, 1, page);
 
        if (unlikely(npages != 1)) {
                struct vm_area_struct *vma;
 
+               down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, addr);
+
                if (vma == NULL || addr < vma->vm_start ||
                    !(vma->vm_flags & VM_PFNMAP)) {
+                       up_read(&current->mm->mmap_sem);
                        get_page(bad_page);
                        return page_to_pfn(bad_page);
                }
 
                pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-               BUG_ON(pfn_valid(pfn));
+               up_read(&current->mm->mmap_sem);
+               BUG_ON(!kvm_is_mmio_pfn(pfn));
        } else
                pfn = page_to_pfn(page[0]);
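
The switch to get_user_pages_fast() takes mmap_sem out of the common path; only the fallback needs it. The new locking, restated (a reading of the hunk, not new code):

/*
 * gfn_to_pfn() after this change:
 *   - get_user_pages_fast() pins the page without holding mmap_sem;
 *   - only the slow path (npages != 1, e.g. a VM_PFNMAP region) takes
 *     mmap_sem around find_vma(), and the added up_read() calls make
 *     sure it is dropped on both the error and the success exit.
 */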
 
@@ -754,10 +1008,10 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
        pfn_t pfn;
 
        pfn = gfn_to_pfn(kvm, gfn);
-       if (pfn_valid(pfn))
+       if (!kvm_is_mmio_pfn(pfn))
                return pfn_to_page(pfn);
 
-       WARN_ON(!pfn_valid(pfn));
+       WARN_ON(kvm_is_mmio_pfn(pfn));
 
        get_page(bad_page);
        return bad_page;
@@ -773,7 +1027,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-       if (pfn_valid(pfn))
+       if (!kvm_is_mmio_pfn(pfn))
                put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@@ -799,7 +1053,7 @@ EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-       if (pfn_valid(pfn)) {
+       if (!kvm_is_mmio_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
                if (!PageReserved(page))
                        SetPageDirty(page);
@@ -809,14 +1063,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-       if (pfn_valid(pfn))
+       if (!kvm_is_mmio_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
 
 void kvm_get_pfn(pfn_t pfn)
 {
-       if (pfn_valid(pfn))
+       if (!kvm_is_mmio_pfn(pfn))
                get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);
@@ -972,12 +1226,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        for (;;) {
                prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-               if (kvm_cpu_has_interrupt(vcpu))
-                       break;
-               if (kvm_cpu_has_pending_timer(vcpu))
-                       break;
-               if (kvm_arch_vcpu_runnable(vcpu))
+               if (kvm_cpu_has_interrupt(vcpu) ||
+                   kvm_cpu_has_pending_timer(vcpu) ||
+                   kvm_arch_vcpu_runnable(vcpu)) {
+                       set_bit(KVM_REQ_UNHALT, &vcpu->requests);
                        break;
+               }
                if (signal_pending(current))
                        break;
 
@@ -1074,12 +1328,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 
        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
-               goto vcpu_destroy;
+               return r;
 
        mutex_lock(&kvm->lock);
        if (kvm->vcpus[n]) {
                r = -EEXIST;
-               mutex_unlock(&kvm->lock);
                goto vcpu_destroy;
        }
        kvm->vcpus[n] = vcpu;
@@ -1095,8 +1348,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 unlink:
        mutex_lock(&kvm->lock);
        kvm->vcpus[n] = NULL;
-       mutex_unlock(&kvm->lock);
 vcpu_destroy:
+       mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_destroy(vcpu);
        return r;
 }
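
The vcpu_destroy rework is easy to misread in diff form: the unlock moves onto the error label, so every failure path drops kvm->lock exactly once before the vcpu is torn down, and the early setup failure (before the lock is taken) simply returns. A distilled sketch of the resulting idiom (hypothetical helper name):

static int install_vcpu_slot(struct kvm *kvm, int n, struct kvm_vcpu *vcpu)
{
	int r = 0;

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		r = -EEXIST;
		goto out_unlock;	/* no unlock here ... */
	}
	kvm->vcpus[n] = vcpu;
out_unlock:
	mutex_unlock(&kvm->lock);	/* ... exactly one unlock for all paths */
	return r;
}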
@@ -1118,6 +1371,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
+       struct kvm_fpu *fpu = NULL;
+       struct kvm_sregs *kvm_sregs = NULL;
 
        if (vcpu->kvm->mm != current->mm)
                return -EIO;
@@ -1165,25 +1420,28 @@ out_free2:
                break;
        }
        case KVM_GET_SREGS: {
-               struct kvm_sregs kvm_sregs;
-
-               memset(&kvm_sregs, 0, sizeof kvm_sregs);
-               r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
+               kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!kvm_sregs)
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+               if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_SREGS: {
-               struct kvm_sregs kvm_sregs;
-
+               kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!kvm_sregs)
+                       goto out;
                r = -EFAULT;
-               if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+               if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+               r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
                r = 0;
@@ -1264,25 +1522,28 @@ out_free2:
                break;
        }
        case KVM_GET_FPU: {
-               struct kvm_fpu fpu;
-
-               memset(&fpu, 0, sizeof fpu);
-               r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
+               fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!fpu)
+                       goto out;
+               r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = -EFAULT;
-               if (copy_to_user(argp, &fpu, sizeof fpu))
+               if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_FPU: {
-               struct kvm_fpu fpu;
-
+               fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+               r = -ENOMEM;
+               if (!fpu)
+                       goto out;
                r = -EFAULT;
-               if (copy_from_user(&fpu, argp, sizeof fpu))
+               if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
                        goto out;
-               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
+               r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = 0;
@@ -1292,6 +1553,8 @@ out_free2:
                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
        }
 out:
+       kfree(fpu);
+       kfree(kvm_sregs);
        return r;
 }
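
The sregs/fpu cases switch from stack to heap because struct kvm_sregs and struct kvm_fpu are large and ioctl handlers run on a limited kernel stack; the common out: label then frees whichever was allocated. The shape of one case as a self-contained sketch (helper name hypothetical):

static long get_sregs_example(struct kvm_vcpu *vcpu, void __user *argp)
{
	struct kvm_sregs *s = kzalloc(sizeof(*s), GFP_KERNEL);
	long r = -ENOMEM;

	if (!s)
		return r;
	r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, s);
	if (!r && copy_to_user(argp, s, sizeof(*s)))
		r = -EFAULT;
	kfree(s);	/* the real handler frees at the shared out: label */
	return r;
}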
 
@@ -1359,6 +1622,30 @@ static long kvm_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+#endif
+#ifdef KVM_CAP_DEVICE_ASSIGNMENT
+       case KVM_ASSIGN_PCI_DEVICE: {
+               struct kvm_assigned_pci_dev assigned_dev;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+                       goto out;
+               r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_ASSIGN_IRQ: {
+               struct kvm_assigned_irq assigned_irq;
+
+               r = -EFAULT;
+               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+                       goto out;
+               r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+               if (r)
+                       goto out;
+               break;
+       }
 #endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
@@ -1369,17 +1656,22 @@ out:
 
 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+       struct page *page[1];
+       unsigned long addr;
+       int npages;
+       gfn_t gfn = vmf->pgoff;
        struct kvm *kvm = vma->vm_file->private_data;
-       struct page *page;
 
-       if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
+       addr = gfn_to_hva(kvm, gfn);
+       if (kvm_is_error_hva(addr))
                return VM_FAULT_SIGBUS;
-       page = gfn_to_page(kvm, vmf->pgoff);
-       if (is_error_page(page)) {
-               kvm_release_page_clean(page);
+
+       npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
+                               NULL);
+       if (unlikely(npages != 1))
                return VM_FAULT_SIGBUS;
-       }
-       vmf->page = page;
+
+       vmf->page = page[0];
        return 0;
 }
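
kvm_vm_fault() backs an mmap() of the vm fd, with the file offset interpreted as a gfn. A hedged userspace sketch (assuming the vm fd is mmap-able this way, as the handler implies):

#include <sys/mman.h>

/* Map one page of guest physical memory at frame `gfn` into this
 * process; touching it faults into kvm_vm_fault() above. */
static void *map_guest_page(int vm_fd, unsigned long gfn, long page_size)
{
	return mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, vm_fd, (off_t)gfn * page_size);
}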
 
diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 58141f31ea8fb218f8091b16f192857615ad66b6..41dcc845f78c34d65a9d13e72cc7023a8a14f142 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/relay.h>
 #include <linux/debugfs.h>
+#include <linux/ktime.h>
 
 #include <linux/kvm_host.h>
 
@@ -35,16 +36,16 @@ static struct kvm_trace *kvm_trace;
 struct kvm_trace_probe {
        const char *name;
        const char *format;
-       u32 cycle_in;
+       u32 timestamp_in;
        marker_probe_func *probe_func;
 };
 
-static inline int calc_rec_size(int cycle, int extra)
+static inline int calc_rec_size(int timestamp, int extra)
 {
        int rec_size = KVM_TRC_HEAD_SIZE;
 
        rec_size += extra;
-       return cycle ? rec_size += KVM_TRC_CYCLE_SIZE : rec_size;
+       return timestamp ? rec_size + KVM_TRC_CYCLE_SIZE : rec_size;
 }
 
 static void kvm_add_trace(void *probe_private, void *call_data,
@@ -54,12 +55,13 @@ static void kvm_add_trace(void *probe_private, void *call_data,
        struct kvm_trace *kt = kvm_trace;
        struct kvm_trace_rec rec;
        struct kvm_vcpu *vcpu;
-       int    i, extra, size;
+       int    i, size;
+       u32    extra;
 
        if (unlikely(kt->trace_state != KVM_TRACE_STATE_RUNNING))
                return;
 
-       rec.event       = va_arg(*args, u32);
+       rec.rec_val     = TRACE_REC_EVENT_ID(va_arg(*args, u32));
        vcpu            = va_arg(*args, struct kvm_vcpu *);
        rec.pid         = current->tgid;
        rec.vcpu_id     = vcpu->vcpu_id;
@@ -67,21 +69,21 @@ static void kvm_add_trace(void *probe_private, void *call_data,
        extra           = va_arg(*args, u32);
        WARN_ON(extra > KVM_TRC_EXTRA_MAX);
        extra           = min_t(u32, extra, KVM_TRC_EXTRA_MAX);
-       rec.extra_u32   = extra;
 
-       rec.cycle_in    = p->cycle_in;
+       rec.rec_val |= TRACE_REC_TCS(p->timestamp_in)
+                       | TRACE_REC_NUM_DATA_ARGS(extra);
 
-       if (rec.cycle_in) {
-               rec.u.cycle.cycle_u64 = get_cycles();
+       if (p->timestamp_in) {
+               rec.u.timestamp.timestamp = ktime_to_ns(ktime_get());
 
-               for (i = 0; i < rec.extra_u32; i++)
-                       rec.u.cycle.extra_u32[i] = va_arg(*args, u32);
+               for (i = 0; i < extra; i++)
+                       rec.u.timestamp.extra_u32[i] = va_arg(*args, u32);
        } else {
-               for (i = 0; i < rec.extra_u32; i++)
-                       rec.u.nocycle.extra_u32[i] = va_arg(*args, u32);
+               for (i = 0; i < extra; i++)
+                       rec.u.notimestamp.extra_u32[i] = va_arg(*args, u32);
        }
 
-       size = calc_rec_size(rec.cycle_in, rec.extra_u32 * sizeof(u32));
+       size = calc_rec_size(p->timestamp_in, extra * sizeof(u32));
        relay_write(kt->rchan, &rec, size);
 }
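
rec_val now multiplexes the event id, the timestamp flag and the extra-u32 count into a single word through the TRACE_REC_* macros defined elsewhere in this series. For illustration only, one plausible packing; the field positions below are assumptions, not the macros' real definitions:

/* Hypothetical layout of the 32-bit rec_val word: */
#define EX_REC_EVENT_ID(id)	((id) & 0x07ffffff)	/* bits 0..26: event id */
#define EX_REC_NUM_ARGS(n)	(((n) & 0x7) << 27)	/* bits 27..29: extra u32s */
#define EX_REC_TCS(ts)		(((ts) & 0x1) << 30)	/* bit 30: timestamped? */

/* e.g.: rec_val = EX_REC_EVENT_ID(ev) | EX_REC_TCS(1) | EX_REC_NUM_ARGS(2); */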
 
diff --git a/virt/kvm/vtd.c b/virt/kvm/vtd.c
new file mode 100644 (file)
index 0000000..a770874
--- /dev/null
+++ b/virt/kvm/vtd.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/intel-iommu.h>
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+                               gfn_t base_gfn, unsigned long npages);
+
+int kvm_iommu_map_pages(struct kvm *kvm,
+                       gfn_t base_gfn, unsigned long npages)
+{
+       gfn_t gfn = base_gfn;
+       pfn_t pfn;
+       int i, r = 0;
+       struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+       /* check if an iommu domain exists and is in use */
+       if (!domain)
+               return 0;
+
+       for (i = 0; i < npages; i++) {
+               /* check if already mapped */
+               pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+                                                    gfn_to_gpa(gfn));
+               if (pfn)
+                       continue;
+
+               pfn = gfn_to_pfn(kvm, gfn);
+               r = intel_iommu_page_mapping(domain,
+                                            gfn_to_gpa(gfn),
+                                            pfn_to_hpa(pfn),
+                                            PAGE_SIZE,
+                                            DMA_PTE_READ |
+                                            DMA_PTE_WRITE);
+               if (r) {
+                       printk(KERN_ERR "kvm_iommu_map_pages: "
+                              "iommu failed to map pfn=%lx\n", pfn);
+                       goto unmap_pages;
+               }
+               gfn++;
+       }
+       return 0;
+
+unmap_pages:
+       kvm_iommu_put_pages(kvm, base_gfn, i);
+       return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+       int i, r;
+
+       down_read(&kvm->slots_lock);
+       for (i = 0; i < kvm->nmemslots; i++) {
+               r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn,
+                                       kvm->memslots[i].npages);
+               if (r)
+                       break;
+       }
+       up_read(&kvm->slots_lock);
+       return r;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *assigned_dev)
+{
+       struct pci_dev *pdev = NULL;
+       int r;
+
+       if (!intel_iommu_found()) {
+               printk(KERN_ERR "%s: intel iommu not found\n", __func__);
+               return -ENODEV;
+       }
+
+       printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n",
+              assigned_dev->host_busnr,
+              PCI_SLOT(assigned_dev->host_devfn),
+              PCI_FUNC(assigned_dev->host_devfn));
+
+       pdev = assigned_dev->dev;
+
+       if (pdev == NULL) {
+               if (kvm->arch.intel_iommu_domain) {
+                       intel_iommu_domain_exit(kvm->arch.intel_iommu_domain);
+                       kvm->arch.intel_iommu_domain = NULL;
+               }
+               return -ENODEV;
+       }
+
+       kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev);
+       if (!kvm->arch.intel_iommu_domain)
+               return -ENODEV;
+
+       r = kvm_iommu_map_memslots(kvm);
+       if (r)
+               goto out_unmap;
+
+       intel_iommu_detach_dev(kvm->arch.intel_iommu_domain,
+                              pdev->bus->number, pdev->devfn);
+
+       r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain,
+                                       pdev);
+       if (r) {
+               printk(KERN_ERR "Domain context map for %s failed\n",
+                      pci_name(pdev));
+               goto out_unmap;
+       }
+       return 0;
+
+out_unmap:
+       kvm_iommu_unmap_memslots(kvm);
+       return r;
+}
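
The attach sequence in kvm_iommu_map_guest() is order-sensitive; restated as a comment (the steps are read off the code above; the rationale in the last sentence is an inference):

/*
 * 1. intel_iommu_domain_alloc(pdev)    -- allocate the per-VM DMAR domain
 * 2. kvm_iommu_map_memslots(kvm)       -- populate gpa->hpa translations
 * 3. intel_iommu_detach_dev(...)       -- pull the device out of its old domain
 * 4. intel_iommu_context_mapping(...)  -- point the device at the KVM domain
 *
 * Populating the page tables (step 2) before the context switch-over
 * (step 4) presumably guarantees the device never issues DMA through
 * an empty translation table.
 */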
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+                              gfn_t base_gfn, unsigned long npages)
+{
+       gfn_t gfn = base_gfn;
+       pfn_t pfn;
+       struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+       int i;
+
+       for (i = 0; i < npages; i++) {
+               pfn = (pfn_t)intel_iommu_iova_to_pfn(domain,
+                                                    gfn_to_gpa(gfn));
+               kvm_release_pfn_clean(pfn);
+               gfn++;
+       }
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+       int i;
+       down_read(&kvm->slots_lock);
+       for (i = 0; i < kvm->nmemslots; i++) {
+               kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn,
+                                   kvm->memslots[i].npages);
+       }
+       up_read(&kvm->slots_lock);
+
+       return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+       struct kvm_assigned_dev_kernel *entry;
+       struct dmar_domain *domain = kvm->arch.intel_iommu_domain;
+
+       /* check if an iommu domain exists and is in use */
+       if (!domain)
+               return 0;
+
+       list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) {
+               printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n",
+                      entry->host_busnr,
+                      PCI_SLOT(entry->host_devfn),
+                      PCI_FUNC(entry->host_devfn));
+
+               /* detach the device from the kvm dmar domain */
+               intel_iommu_detach_dev(domain, entry->host_busnr,
+                                      entry->host_devfn);
+       }
+       kvm_iommu_unmap_memslots(kvm);
+       intel_iommu_domain_exit(domain);
+       return 0;
+}