]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge git://git.kernel.org/pub/scm/linux/kernel/git/sam/kbuild-next
authorLinus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Jan 2009 23:56:54 +0000 (15:56 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Jan 2009 23:56:54 +0000 (15:56 -0800)
* git://git.kernel.org/pub/scm/linux/kernel/git/sam/kbuild-next:
  kbuild: ignore a few files in headers_check
  kbuild: add checks for include of linux/types in userspace headers
  kbuild: drop debugging leftover in tags.sh
  kbuild: document environment variables
  kbuild: make *config usage docs
  kbuild: disable sparse warning "returning void-valued expression"
  kbuild: in headers_install autoconvert asm/inline/volatile to __xxx__
  kbuild: check for leaked CONFIG_ symbols to userspace
  headers_check.pl: disallow extern's
  kconfig: improve error messages for bad source statements
  kconfig: struct property commented
  kconfig: add comments to symbol flags
  kconfig: explain symbol value defaults
  m68k: fix recursive dependency in Kconfig

309 files changed:
Documentation/cpu-hotplug.txt
MAINTAINERS
arch/alpha/include/asm/smp.h
arch/alpha/kernel/irq.c
arch/alpha/kernel/process.c
arch/alpha/kernel/smp.c
arch/alpha/kernel/sys_dp264.c
arch/alpha/kernel/sys_titan.c
arch/arm/common/gic.c
arch/arm/kernel/irq.c
arch/arm/kernel/smp.c
arch/arm/mach-at91/at91rm9200_time.c
arch/arm/mach-at91/at91sam926x_time.c
arch/arm/mach-davinci/time.c
arch/arm/mach-imx/time.c
arch/arm/mach-ixp4xx/common.c
arch/arm/mach-msm/timer.c
arch/arm/mach-ns9xxx/time-ns9360.c
arch/arm/mach-omap1/time.c
arch/arm/mach-omap1/timer32k.c
arch/arm/mach-omap2/timer-gp.c
arch/arm/mach-pxa/time.c
arch/arm/mach-realview/core.c
arch/arm/mach-realview/localtimer.c
arch/arm/mach-sa1100/time.c
arch/arm/mach-versatile/core.c
arch/arm/oprofile/op_model_mpcore.c
arch/arm/plat-mxc/time.c
arch/arm/plat-orion/time.c
arch/avr32/kernel/time.c
arch/blackfin/kernel/time-ts.c
arch/cris/arch-v32/kernel/irq.c
arch/cris/arch-v32/kernel/smp.c
arch/cris/include/asm/smp.h
arch/ia64/hp/sim/hpsim_irq.c
arch/ia64/include/asm/kvm.h
arch/ia64/include/asm/kvm_host.h
arch/ia64/include/asm/smp.h
arch/ia64/include/asm/topology.h
arch/ia64/kernel/iosapic.c
arch/ia64/kernel/irq.c
arch/ia64/kernel/msi_ia64.c
arch/ia64/kernel/smpboot.c
arch/ia64/kernel/topology.c
arch/ia64/kvm/Makefile
arch/ia64/kvm/asm-offsets.c
arch/ia64/kvm/kvm-ia64.c
arch/ia64/kvm/kvm_lib.c [new file with mode: 0644]
arch/ia64/kvm/kvm_minstate.h
arch/ia64/kvm/misc.h
arch/ia64/kvm/mmio.c
arch/ia64/kvm/process.c
arch/ia64/kvm/vcpu.c
arch/ia64/kvm/vcpu.h
arch/ia64/kvm/vmm.c
arch/ia64/kvm/vmm_ivt.S
arch/ia64/kvm/vtlb.c
arch/ia64/sn/kernel/irq.c
arch/ia64/sn/kernel/msi_sn.c
arch/m32r/Kconfig
arch/m32r/kernel/smpboot.c
arch/m68knommu/platform/coldfire/pit.c
arch/mips/include/asm/irq.h
arch/mips/include/asm/mach-ip27/topology.h
arch/mips/include/asm/smp.h
arch/mips/jazz/irq.c
arch/mips/kernel/cevt-bcm1480.c
arch/mips/kernel/cevt-ds1287.c
arch/mips/kernel/cevt-gt641xx.c
arch/mips/kernel/cevt-r4k.c
arch/mips/kernel/cevt-sb1250.c
arch/mips/kernel/cevt-smtc.c
arch/mips/kernel/cevt-txx9.c
arch/mips/kernel/i8253.c
arch/mips/kernel/irq-gic.c
arch/mips/kernel/smp-cmp.c
arch/mips/kernel/smp-mt.c
arch/mips/kernel/smp.c
arch/mips/kernel/smtc.c
arch/mips/mti-malta/malta-smtc.c
arch/mips/nxp/pnx8550/common/time.c
arch/mips/pmc-sierra/yosemite/smp.c
arch/mips/sgi-ip27/ip27-smp.c
arch/mips/sgi-ip27/ip27-timer.c
arch/mips/sibyte/bcm1480/irq.c
arch/mips/sibyte/bcm1480/smp.c
arch/mips/sibyte/sb1250/irq.c
arch/mips/sibyte/sb1250/smp.c
arch/mips/sni/time.c
arch/parisc/Kconfig
arch/parisc/kernel/irq.c
arch/parisc/kernel/smp.c
arch/powerpc/include/asm/disassemble.h [new file with mode: 0644]
arch/powerpc/include/asm/kvm_44x.h [new file with mode: 0644]
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu-44x.h
arch/powerpc/include/asm/topology.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/kvm/44x.c [new file with mode: 0644]
arch/powerpc/kvm/44x_emulate.c [new file with mode: 0644]
arch/powerpc/kvm/44x_tlb.c
arch/powerpc/kvm/44x_tlb.h
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/booke.c [moved from arch/powerpc/kvm/booke_guest.c with 56% similarity]
arch/powerpc/kvm/booke.h [new file with mode: 0644]
arch/powerpc/kvm/booke_host.c [deleted file]
arch/powerpc/kvm/booke_interrupts.S
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.c [new file with mode: 0644]
arch/powerpc/kvm/timing.h [new file with mode: 0644]
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mpic.h
arch/s390/Kconfig
arch/s390/kernel/smp.c
arch/s390/kernel/time.c
arch/s390/kvm/kvm-s390.c
arch/sh/include/asm/smp.h
arch/sh/include/asm/topology.h
arch/sh/kernel/smp.c
arch/sh/kernel/timers/timer-broadcast.c
arch/sh/kernel/timers/timer-tmu.c
arch/sparc/include/asm/smp_32.h
arch/sparc/kernel/irq_64.c
arch/sparc/kernel/of_device_64.c
arch/sparc/kernel/pci_msi.c
arch/sparc/kernel/smp_32.c
arch/sparc/kernel/smp_64.c
arch/sparc/kernel/sparc_ksyms_32.c
arch/sparc/kernel/time_64.c
arch/um/kernel/smp.c
arch/um/kernel/time.c
arch/x86/Kconfig
arch/x86/ia32/ia32_signal.c
arch/x86/ia32/ipc32.c
arch/x86/ia32/sys_ia32.c
arch/x86/include/asm/apic.h
arch/x86/include/asm/bigsmp/apic.h
arch/x86/include/asm/bigsmp/ipi.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/efi.h
arch/x86/include/asm/es7000/apic.h
arch/x86/include/asm/es7000/ipi.h
arch/x86/include/asm/genapic_32.h
arch/x86/include/asm/genapic_64.h
arch/x86/include/asm/ipi.h
arch/x86/include/asm/irq.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_x86_emulate.h
arch/x86/include/asm/mach-default/mach_apic.h
arch/x86/include/asm/mach-default/mach_ipi.h
arch/x86/include/asm/mach-generic/mach_apic.h
arch/x86/include/asm/mpspec.h
arch/x86/include/asm/mtrr.h
arch/x86/include/asm/numaq/apic.h
arch/x86/include/asm/numaq/ipi.h
arch/x86/include/asm/pci_x86.h [moved from arch/x86/pci/pci.h with 88% similarity]
arch/x86/include/asm/smp.h
arch/x86/include/asm/summit/apic.h
arch/x86/include/asm/summit/ipi.h
arch/x86/include/asm/svm.h [moved from arch/x86/kvm/svm.h with 100% similarity]
arch/x86/include/asm/sys_ia32.h [new file with mode: 0644]
arch/x86/include/asm/topology.h
arch/x86/include/asm/uv/uv_bau.h
arch/x86/include/asm/virtext.h [new file with mode: 0644]
arch/x86/include/asm/vmx.h [moved from arch/x86/kvm/vmx.h with 93% similarity]
arch/x86/kernel/amd_iommu.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/apic.c
arch/x86/kernel/bios_uv.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/cpuid.c
arch/x86/kernel/crash.c
arch/x86/kernel/early_printk.c
arch/x86/kernel/genapic_flat_64.c
arch/x86/kernel/genx2apic_cluster.c
arch/x86/kernel/genx2apic_phys.c
arch/x86/kernel/genx2apic_uv_x.c
arch/x86/kernel/head64.c
arch/x86/kernel/hpet.c
arch/x86/kernel/i8253.c
arch/x86/kernel/io_apic.c
arch/x86/kernel/ipi.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/ldt.c
arch/x86/kernel/mfgpt_32.c
arch/x86/kernel/mmconf-fam10h_64.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/nmi.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/smp.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/tlb_32.c
arch/x86/kernel/tlb_64.c
arch/x86/kernel/tlb_uv.c
arch/x86/kernel/traps.c
arch/x86/kernel/vmiclock_32.c
arch/x86/kernel/xsave.c
arch/x86/kvm/i8254.c
arch/x86/kvm/i8259.c
arch/x86/kvm/irq.h
arch/x86/kvm/kvm_svm.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86_emulate.c
arch/x86/lguest/boot.c
arch/x86/mach-default/setup.c
arch/x86/mach-generic/bigsmp.c
arch/x86/mach-generic/es7000.c
arch/x86/mach-generic/numaq.c
arch/x86/mach-generic/summit.c
arch/x86/mach-voyager/voyager_smp.c
arch/x86/mm/numa_64.c
arch/x86/mm/srat_64.c
arch/x86/pci/acpi.c
arch/x86/pci/amd_bus.c
arch/x86/pci/common.c
arch/x86/pci/direct.c
arch/x86/pci/early.c
arch/x86/pci/fixup.c
arch/x86/pci/i386.c
arch/x86/pci/init.c
arch/x86/pci/irq.c
arch/x86/pci/legacy.c
arch/x86/pci/mmconfig-shared.c
arch/x86/pci/mmconfig_32.c
arch/x86/pci/mmconfig_64.c
arch/x86/pci/numaq_32.c
arch/x86/pci/olpc.c
arch/x86/pci/pcbios.c
arch/x86/pci/visws.c
arch/x86/xen/mmu.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/time.c
arch/x86/xen/xen-ops.h
drivers/base/cpu.c
drivers/base/node.c
drivers/base/topology.c
drivers/clocksource/tcb_clksrc.c
drivers/lguest/interrupts_and_traps.c
drivers/parisc/iosapic.c
drivers/pci/hotplug/cpqphp_core.c
drivers/pci/hotplug/cpqphp_pci.c
drivers/pci/hotplug/ibmphp_core.c
drivers/pci/pci-sysfs.c
drivers/pci/probe.c
drivers/xen/events.c
fs/anon_inodes.c
include/asm-generic/topology.h
include/asm-m32r/smp.h
include/linux/clockchips.h
include/linux/compiler-gcc3.h
include/linux/cpumask.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/kvm.h
include/linux/kvm_host.h
include/linux/sched.h
include/linux/topology.h
init/Kconfig
kernel/cpu.c
kernel/cpuset.c
kernel/irq/chip.c
kernel/irq/manage.c
kernel/irq/migration.c
kernel/irq/proc.c
kernel/profile.c
kernel/rcuclassic.c
kernel/sched.c
kernel/sched_cpupri.c
kernel/sched_cpupri.h
kernel/sched_fair.c
kernel/sched_rt.c
kernel/sched_stats.h
kernel/taskstats.c
kernel/time/clockevents.c
kernel/time/tick-broadcast.c
kernel/time/tick-common.c
kernel/time/tick-sched.c
kernel/trace/trace.c
lib/Kconfig
mm/slub.c
virt/kvm/ioapic.c
virt/kvm/ioapic.h
virt/kvm/irq_comm.c
virt/kvm/kvm_main.c
virt/kvm/kvm_trace.c

index 94bbc27ddd4fbf50ff361177450c5f524210f1cb..9d620c153b04f2469bf664e6489e3af6817663e2 100644 (file)
@@ -50,16 +50,17 @@ additional_cpus=n (*)       Use this to limit hotpluggable cpus. This option sets
                        cpu_possible_map = cpu_present_map + additional_cpus
 
 (*) Option valid only for following architectures
-- x86_64, ia64
+- ia64
 
-ia64 and x86_64 use the number of disabled local apics in ACPI tables MADT
-to determine the number of potentially hot-pluggable cpus. The implementation
-should only rely on this to count the # of cpus, but *MUST* not rely on the
-apicid values in those tables for disabled apics. In the event BIOS doesn't
-mark such hot-pluggable cpus as disabled entries, one could use this
-parameter "additional_cpus=x" to represent those cpus in the cpu_possible_map.
+ia64 uses the number of disabled local apics in ACPI tables MADT to
+determine the number of potentially hot-pluggable cpus. The implementation
+should only rely on this to count the # of cpus, but *MUST* not rely
+on the apicid values in those tables for disabled apics. In the event
+BIOS doesn't mark such hot-pluggable cpus as disabled entries, one could
+use this parameter "additional_cpus=x" to represent those cpus in the
+cpu_possible_map.
 
-possible_cpus=n                [s390 only] use this to set hotpluggable cpus.
+possible_cpus=n                [s390,x86_64] use this to set hotpluggable cpus.
                        This option sets possible_cpus bits in
                        cpu_possible_map. Thus keeping the numbers of bits set
                        constant even if the machine gets rebooted.
index 88f43e09aeb33c4791d420be54efe2b731d6ffe0..befacf07729f7c88a46d2776ab3d662968f344b8 100644 (file)
@@ -2542,8 +2542,6 @@ W:        http://kvm.qumranet.com
 S:     Supported
 
 KERNEL VIRTUAL MACHINE For Itanium (KVM/IA64)
-P:     Anthony Xu
-M:     anthony.xu@intel.com
 P:     Xiantao Zhang
 M:     xiantao.zhang@intel.com
 L:     kvm-ia64@vger.kernel.org
index 544c69af8168b27baf40a32b34f334bad1d106a4..547e90951cec07a92bd31d6b9cf37f293612383e 100644 (file)
@@ -45,7 +45,6 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS];
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern int smp_num_cpus;
-#define cpu_possible_map       cpu_present_map
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi(cpumask_t mask);
index c626a821cdcb9eacc089121b52d7afa6f3f5fe4f..d0f1620007f7de83d72f76257fb778791518d587 100644 (file)
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
        last_cpu = cpu;
 
        irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-       irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
        return 0;
 }
 #endif /* CONFIG_SMP */
index 351407e07e71a715a654936b744e498f2cbfb2e0..f238370c907d9f4c86ecbed9da7332ccf08908c3 100644 (file)
@@ -94,6 +94,7 @@ common_shutdown_1(void *generic_ptr)
                flags |= 0x00040000UL; /* "remain halted" */
                *pflags = flags;
                cpu_clear(cpuid, cpu_present_map);
+               cpu_clear(cpuid, cpu_possible_map);
                halt();
        }
 #endif
@@ -120,6 +121,7 @@ common_shutdown_1(void *generic_ptr)
 #ifdef CONFIG_SMP
        /* Wait for the secondaries to halt. */
        cpu_clear(boot_cpuid, cpu_present_map);
+       cpu_clear(boot_cpuid, cpu_possible_map);
        while (cpus_weight(cpu_present_map))
                barrier();
 #endif
index cf7da10097bb29d5f8fb69aaefc9107a3419783a..d953e510f68d71ff417206e98c76dc9c13be45ad 100644 (file)
@@ -70,11 +70,6 @@ enum ipi_message_type {
 /* Set to a secondary's cpuid when it comes online.  */
 static int smp_secondary_alive __devinitdata = 0;
 
-/* Which cpus ids came online.  */
-cpumask_t cpu_online_map;
-
-EXPORT_SYMBOL(cpu_online_map);
-
 int smp_num_probed;            /* Internal processor count */
 int smp_num_cpus = 1;          /* Number that came online.  */
 EXPORT_SYMBOL(smp_num_cpus);
@@ -440,6 +435,7 @@ setup_smp(void)
                                ((char *)cpubase + i*hwrpb->processor_size);
                        if ((cpu->flags & 0x1cc) == 0x1cc) {
                                smp_num_probed++;
+                               cpu_set(i, cpu_possible_map);
                                cpu_set(i, cpu_present_map);
                                cpu->pal_revision = boot_cpu_palrev;
                        }
@@ -473,6 +469,7 @@ smp_prepare_cpus(unsigned int max_cpus)
 
        /* Nothing to do on a UP box, or when told not to.  */
        if (smp_num_probed == 1 || max_cpus == 0) {
+               cpu_possible_map = cpumask_of_cpu(boot_cpuid);
                cpu_present_map = cpumask_of_cpu(boot_cpuid);
                printk(KERN_INFO "SMP mode deactivated.\n");
                return;
index c71b0fd7a61f8748b857463c03854006ec5b91a1..ab44c164d9d47e95ae80d84bd96fd39b9cf927a9 100644 (file)
@@ -177,19 +177,19 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-dp264_set_affinity(unsigned int irq, cpumask_t affinity)
+dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
-       cpu_set_irq_affinity(irq, affinity);
+       cpu_set_irq_affinity(irq, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
 }
 
 static void
-clipper_set_affinity(unsigned int irq, cpumask_t affinity)
+clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
-       cpu_set_irq_affinity(irq - 16, affinity);
+       cpu_set_irq_affinity(irq - 16, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
 }
index 52c91ccc164866d3c08ed24ed6984d7b53e35c65..27f840a4ad3d7ec0949ac92a55ef5c7c1a3b9280 100644 (file)
@@ -158,10 +158,10 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-titan_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&titan_irq_lock);
-       titan_cpu_set_irq_affinity(irq - 16, affinity);
+       titan_cpu_set_irq_affinity(irq - 16, *affinity);
        titan_update_irq_hw(titan_cached_irq_mask);
        spin_unlock(&titan_irq_lock);
 }
index 7fc9860a97d79b5dcf73cb97c6d3de9ca9530af3..c6884ba1d5ed0b91122f45072f97c99cda52a23d 100644 (file)
@@ -109,11 +109,11 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, cpumask_t mask_val)
+static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
        void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
        unsigned int shift = (irq % 4) * 8;
-       unsigned int cpu = first_cpu(mask_val);
+       unsigned int cpu = cpumask_first(mask_val);
        u32 val;
 
        spin_lock(&irq_controller_lock);
index 2f3eb795fa6e99a4fd1cde9911ab99070f233aad..7141cee1fab71e8b131d03e0d8e6dbfc043dbb85 100644 (file)
@@ -174,7 +174,7 @@ static void route_irq(struct irq_desc *desc, unsigned int irq, unsigned int cpu)
        pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu);
 
        spin_lock_irq(&desc->lock);
-       desc->chip->set_affinity(irq, cpumask_of_cpu(cpu));
+       desc->chip->set_affinity(irq, cpumask_of(cpu));
        spin_unlock_irq(&desc->lock);
 }
 
index 019237d21622ba2cde3669d4915dca02a590ea26..55fa7ff96a3e7aaf654d30c64677b1a4d666ee4b 100644 (file)
 #include <asm/tlbflush.h>
 #include <asm/ptrace.h>
 
-/*
- * bitmask of present and online CPUs.
- * The present bitmask indicates that the CPU is physically present.
- * The online bitmask indicates that the CPU is up and running.
- */
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 /*
  * as from 2.5, kernels no longer have an init_tasks structure
  * so we need some other way of telling a new secondary core
index d140eae53ded281bcc9f33fc6b991b1086c5fd55..1ff1bda0a894a4ce313ecb0896eb1011bd4a2413 100644 (file)
@@ -178,7 +178,6 @@ static struct clock_event_device clkevt = {
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 150,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = clkevt32k_next_event,
        .set_mode       = clkevt32k_mode,
 };
@@ -206,7 +205,7 @@ void __init at91rm9200_timer_init(void)
        clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift);
        clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt);
        clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1;
-       clkevt.cpumask = cpumask_of_cpu(0);
+       clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&clkevt);
 
        /* register clocksource */
index 122fd77ed580399b632451b114b66a9458c97239..b63e1d5f1badc60ad3573febf6d9ff557f770c64 100644 (file)
@@ -91,7 +91,6 @@ static struct clock_event_device pit_clkevt = {
        .features       = CLOCK_EVT_FEAT_PERIODIC,
        .shift          = 32,
        .rating         = 100,
-       .cpumask        = CPU_MASK_CPU0,
        .set_mode       = pit_clkevt_mode,
 };
 
@@ -173,6 +172,7 @@ static void __init at91sam926x_pit_init(void)
 
        /* Set up and register clockevents */
        pit_clkevt.mult = div_sc(pit_rate, NSEC_PER_SEC, pit_clkevt.shift);
+       pit_clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&pit_clkevt);
 }
 
index 3b9a296b5c4ba43ee703b7ccafce24fdd6067d44..f8bcd29d17a624a6a1d762b0b6ec4c82ffcf59a2 100644 (file)
@@ -322,7 +322,7 @@ static void __init davinci_timer_init(void)
        clockevent_davinci.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_davinci);
 
-       clockevent_davinci.cpumask = cpumask_of_cpu(0);
+       clockevent_davinci.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_davinci);
 }
 
index a11765f5f23b111c1f21a1a4edb3f498462c1a16..aff0ebcfa847a5422051c866e4e48f34109b7f76 100644 (file)
@@ -184,7 +184,7 @@ static int __init imx_clockevent_init(unsigned long rate)
        clockevent_imx.min_delta_ns =
                clockevent_delta2ns(0xf, &clockevent_imx);
 
-       clockevent_imx.cpumask = cpumask_of_cpu(0);
+       clockevent_imx.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_imx);
 
index 7766f469456be0025ea8bc353dd45ad61c530e31..f4656d2ac8a85c3bbb899d43ec7f6b6b96b74c66 100644 (file)
@@ -487,7 +487,7 @@ static int __init ixp4xx_clockevent_init(void)
                clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx);
        clockevent_ixp4xx.min_delta_ns =
                clockevent_delta2ns(0xf, &clockevent_ixp4xx);
-       clockevent_ixp4xx.cpumask = cpumask_of_cpu(0);
+       clockevent_ixp4xx.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_ixp4xx);
        return 0;
index 345a14cb73c38debc9e6bf55caac2b56700dea1c..444d9c0f5ca68c30ce2a50af7bdc83ef8d0785b0 100644 (file)
@@ -182,7 +182,7 @@ static void __init msm_timer_init(void)
                        clockevent_delta2ns(0xf0000000 >> clock->shift, ce);
                /* 4 gets rounded down to 3 */
                ce->min_delta_ns = clockevent_delta2ns(4, ce);
-               ce->cpumask = cpumask_of_cpu(0);
+               ce->cpumask = cpumask_of(0);
 
                cs->mult = clocksource_hz2mult(clock->freq, cs->shift);
                res = clocksource_register(cs);
index a63424d083d95186f7b5aabbe5c55cbc4b1302ad..41df697217692e3562668af05cf2ccf9153670f7 100644 (file)
@@ -173,7 +173,7 @@ static void __init ns9360_timer_init(void)
        ns9360_clockevent_device.min_delta_ns =
                clockevent_delta2ns(1, &ns9360_clockevent_device);
 
-       ns9360_clockevent_device.cpumask = cpumask_of_cpu(0);
+       ns9360_clockevent_device.cpumask = cpumask_of(0);
        clockevents_register_device(&ns9360_clockevent_device);
 
        setup_irq(IRQ_NS9360_TIMER0 + TIMER_CLOCKEVENT,
index 2cf7e32bd293796e216a81794c2ecc785bba1297..495a32c287b49c01944f9217e94f137f7cf69d6f 100644 (file)
@@ -173,7 +173,7 @@ static __init void omap_init_mpu_timer(unsigned long rate)
        clockevent_mpu_timer1.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_mpu_timer1);
 
-       clockevent_mpu_timer1.cpumask = cpumask_of_cpu(0);
+       clockevent_mpu_timer1.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_mpu_timer1);
 }
 
index 705367ece1741b539f6838454d9cf2d14aa61bad..fd3f7396e16249f8a447876fe93bbc1f36bc049a 100644 (file)
@@ -187,7 +187,7 @@ static __init void omap_init_32k_timer(void)
        clockevent_32k_timer.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_32k_timer);
 
-       clockevent_32k_timer.cpumask = cpumask_of_cpu(0);
+       clockevent_32k_timer.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_32k_timer);
 }
 
index 589393bedade0477dbecb909d5fd82501a58203c..ae6036300f603521ce2b59267fee71d5f668e589 100644 (file)
@@ -120,7 +120,7 @@ static void __init omap2_gp_clockevent_init(void)
        clockevent_gpt.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_gpt);
 
-       clockevent_gpt.cpumask = cpumask_of_cpu(0);
+       clockevent_gpt.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_gpt);
 }
 
index 0016241585190e3770de17ac6e02b0db57547a2f..95656a72268dd9f1a1ee7177c4ca5286d9d0a570 100644 (file)
@@ -122,7 +122,6 @@ static struct clock_event_device ckevt_pxa_osmr0 = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = pxa_osmr0_set_next_event,
        .set_mode       = pxa_osmr0_set_mode,
 };
@@ -163,6 +162,7 @@ static void __init pxa_timer_init(void)
                clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0);
        ckevt_pxa_osmr0.min_delta_ns =
                clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1;
+       ckevt_pxa_osmr0.cpumask = cpumask_of(0);
 
        cksrc_pxa_oscr0.mult =
                clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift);
index 5f1d55963cedb8e7b4d3cbd0b6e71a24c84b10c9..bd2aa4f16141d72895e655f63efe75cf6ffcd298 100644 (file)
@@ -624,7 +624,7 @@ static struct clock_event_device timer0_clockevent =         {
        .set_mode       = timer_set_mode,
        .set_next_event = timer_set_next_event,
        .rating         = 300,
-       .cpumask        = CPU_MASK_ALL,
+       .cpumask        = cpu_all_mask,
 };
 
 static void __init realview_clockevents_init(unsigned int timer_irq)
index 9019ef2e56115ac72f5b359bd2da1d68999a5d7b..67d6d9cc68b2a693b5edc2a89aa8d78125e294d3 100644 (file)
@@ -154,7 +154,7 @@ void __cpuinit local_timer_setup(void)
        clk->set_mode           = local_timer_set_mode;
        clk->set_next_event     = local_timer_set_next_event;
        clk->irq                = IRQ_LOCALTIMER;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
        clk->shift              = 20;
        clk->mult               = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift);
        clk->max_delta_ns       = clockevent_delta2ns(0xffffffff, clk);
@@ -193,7 +193,7 @@ void __cpuinit local_timer_setup(void)
        clk->rating             = 200;
        clk->set_mode           = dummy_timer_set_mode;
        clk->broadcast          = smp_timer_broadcast;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
 
        clockevents_register_device(clk);
 }
index 8c5e727f3b751ffb0e87b176402ae75adb276cfc..711c0295c66f1710de34e20f3d9a88c2eeb99609 100644 (file)
@@ -73,7 +73,6 @@ static struct clock_event_device ckevt_sa1100_osmr0 = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = sa1100_osmr0_set_next_event,
        .set_mode       = sa1100_osmr0_set_mode,
 };
@@ -110,6 +109,7 @@ static void __init sa1100_timer_init(void)
                clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0);
        ckevt_sa1100_osmr0.min_delta_ns =
                clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1;
+       ckevt_sa1100_osmr0.cpumask = cpumask_of(0);
 
        cksrc_sa1100_oscr.mult =
                clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift);
index df25aa138509c95aec6dfcb391f8409de5b182a8..1c43494f5c422092d713a860e1623e83078b9d05 100644 (file)
@@ -1005,7 +1005,7 @@ static void __init versatile_timer_init(void)
        timer0_clockevent.min_delta_ns =
                clockevent_delta2ns(0xf, &timer0_clockevent);
 
-       timer0_clockevent.cpumask = cpumask_of_cpu(0);
+       timer0_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&timer0_clockevent);
 }
 
index 4de366e8b4c5f7fdaa900343e218c6e2cd49fcc1..6d6bd5899240862771935c47f32007fb452aec48 100644 (file)
@@ -260,10 +260,10 @@ static void em_stop(void)
 static void em_route_irq(int irq, unsigned int cpu)
 {
        struct irq_desc *desc = irq_desc + irq;
-       cpumask_t mask = cpumask_of_cpu(cpu);
+       const struct cpumask *mask = cpumask_of(cpu);
 
        spin_lock_irq(&desc->lock);
-       desc->affinity = mask;
+       desc->affinity = *mask;
        desc->chip->set_affinity(irq, mask);
        spin_unlock_irq(&desc->lock);
 }
index fd28f5194f71ad3bd1a09b05545c2d01e8ea17c6..758a1293bcfaaefa2be4191f030eb74b8dcc0ce8 100644 (file)
@@ -190,7 +190,7 @@ static int __init mxc_clockevent_init(void)
        clockevent_mxc.min_delta_ns =
                        clockevent_delta2ns(0xff, &clockevent_mxc);
 
-       clockevent_mxc.cpumask = cpumask_of_cpu(0);
+       clockevent_mxc.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_mxc);
 
index 544d6b327f3a1e3f383a343ba6571316b67b3886..6fa2923e6dca388c4e445f5ccda9b327bfd1e58a 100644 (file)
@@ -149,7 +149,6 @@ static struct clock_event_device orion_clkevt = {
        .features       = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
        .shift          = 32,
        .rating         = 300,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = orion_clkevt_next_event,
        .set_mode       = orion_clkevt_mode,
 };
@@ -199,5 +198,6 @@ void __init orion_time_init(unsigned int irq, unsigned int tclk)
        orion_clkevt.mult = div_sc(tclk, NSEC_PER_SEC, orion_clkevt.shift);
        orion_clkevt.max_delta_ns = clockevent_delta2ns(0xfffffffe, &orion_clkevt);
        orion_clkevt.min_delta_ns = clockevent_delta2ns(1, &orion_clkevt);
+       orion_clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&orion_clkevt);
 }
index 283481d74a5bf454e2b6966810df4133d70d5cfc..0ff46bf873b034192ec557511febfe6586930ece 100644 (file)
@@ -106,7 +106,6 @@ static struct clock_event_device comparator = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 16,
        .rating         = 50,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = comparator_next_event,
        .set_mode       = comparator_mode,
 };
@@ -134,6 +133,7 @@ void __init time_init(void)
        comparator.mult = div_sc(counter_hz, NSEC_PER_SEC, comparator.shift);
        comparator.max_delta_ns = clockevent_delta2ns((u32)~0, &comparator);
        comparator.min_delta_ns = clockevent_delta2ns(50, &comparator) + 1;
+       comparator.cpumask = cpumask_of(0);
 
        sysreg_write(COMPARE, 0);
        timer_irqaction.dev_id = &comparator;
index e887efc86c29e8215d544dcba262128b65343008..0ed2badfd74613b3753c901d44f7da6cf9ebe8df 100644 (file)
@@ -162,7 +162,6 @@ static struct clock_event_device clockevent_bfin = {
        .name           = "bfin_core_timer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = bfin_timer_set_next_event,
        .set_mode       = bfin_timer_set_mode,
 };
@@ -193,6 +192,7 @@ static int __init bfin_clockevent_init(void)
        clockevent_bfin.mult = div_sc(timer_clk, NSEC_PER_SEC, clockevent_bfin.shift);
        clockevent_bfin.max_delta_ns = clockevent_delta2ns(-1, &clockevent_bfin);
        clockevent_bfin.min_delta_ns = clockevent_delta2ns(100, &clockevent_bfin);
+       clockevent_bfin.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_bfin);
 
        return 0;
index 173c141ac9ba85fa04884f1ec6ab8cbf5aba14c8..295131fee710ebe97758fed49f71fc5c0a3b906c 100644 (file)
@@ -325,11 +325,11 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, cpumask_t dest)
+void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
        unsigned long flags;
        spin_lock_irqsave(&irq_lock, flags);
-       irq_allocations[irq - FIRST_IRQ].mask = dest;
+       irq_allocations[irq - FIRST_IRQ].mask = *dest;
        spin_unlock_irqrestore(&irq_lock, flags);
 }
 
index 52e16c6436f9bac0733022622fa252d217b40369..9dac17334640a9cf3eda77c0c0a22b53e817f0a1 100644 (file)
 spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED};
 
 /* CPU masks */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
 EXPORT_SYMBOL(phys_cpu_present_map);
 
 /* Variables used during SMP boot */
index dba33aba3e95583e815becee7cfecca797d64b9b..c615a06dd757aa46ca2e35b4d141b1834fb4bbc2 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/cpumask.h>
 
 extern cpumask_t phys_cpu_present_map;
-extern cpumask_t cpu_possible_map;
 
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
index c2f58ff364e7144c43de3e49958bfac7b3b3b0ad..cc0a3182db3c0158d246f658065a0f0f5d426a5f 100644 (file)
@@ -22,7 +22,7 @@ hpsim_irq_noop (unsigned int irq)
 }
 
 static void
-hpsim_set_affinity_noop (unsigned int a, cpumask_t b)
+hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
 }
 
index f38472ac22675b14322b648387b79d52958d59c7..68aa6da807c191fa74344c85b562b97f6d14694b 100644 (file)
@@ -166,8 +166,6 @@ struct saved_vpd {
 };
 
 struct kvm_regs {
-       char *saved_guest;
-       char *saved_stack;
        struct saved_vpd vpd;
        /*Arch-regs*/
        int mp_state;
@@ -200,6 +198,10 @@ struct kvm_regs {
        unsigned long fp_psr;       /*used for lazy float register */
        unsigned long saved_gp;
        /*for phycial  emulation */
+
+       union context saved_guest;
+
+       unsigned long reserved[64];     /* for future use */
 };
 
 struct kvm_sregs {
index c60d324da5407bde6aac09db462426ab7e7e4959..0560f3fae5380eae6b877822f82c047f49ada011 100644 (file)
 #ifndef __ASM_KVM_HOST_H
 #define __ASM_KVM_HOST_H
 
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-#include <linux/kvm_types.h>
-
-#include <asm/pal.h>
-#include <asm/sal.h>
-
-#define KVM_MAX_VCPUS 4
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define EXIT_REASON_EXTERNAL_INTERRUPT 6
 #define EXIT_REASON_IPI                        7
 #define EXIT_REASON_PTC_G              8
+#define EXIT_REASON_DEBUG              20
 
 /*Define vmm address space and vm data space.*/
-#define KVM_VMM_SIZE (16UL<<20)
+#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
 #define KVM_VMM_SHIFT 24
-#define KVM_VMM_BASE 0xD000000000000000UL
-#define VMM_SIZE (8UL<<20)
+#define KVM_VMM_BASE 0xD000000000000000
+#define VMM_SIZE (__IA64_UL_CONST(8)<<20)
 
 /*
  * Define vm_buffer, used by PAL Services, base address.
- * Note: vmbuffer is in the VMM-BLOCK, the size must be < 8M
+ * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
  */
 #define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
-#define KVM_VM_BUFFER_SIZE (8UL<<20)
-
-/*Define Virtual machine data layout.*/
-#define KVM_VM_DATA_SHIFT  24
-#define KVM_VM_DATA_SIZE (1UL << KVM_VM_DATA_SHIFT)
-#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VMM_SIZE)
-
-
-#define KVM_P2M_BASE    KVM_VM_DATA_BASE
-#define KVM_P2M_OFS     0
-#define KVM_P2M_SIZE    (8UL << 20)
-
-#define KVM_VHPT_BASE   (KVM_P2M_BASE + KVM_P2M_SIZE)
-#define KVM_VHPT_OFS    KVM_P2M_SIZE
-#define KVM_VHPT_BLOCK_SIZE   (2UL << 20)
-#define VHPT_SHIFT      18
-#define VHPT_SIZE       (1UL << VHPT_SHIFT)
-#define VHPT_NUM_ENTRIES (1<<(VHPT_SHIFT-5))
-
-#define KVM_VTLB_BASE   (KVM_VHPT_BASE+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_OFS    (KVM_VHPT_OFS+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_BLOCK_SIZE   (1UL<<20)
-#define VTLB_SHIFT      17
-#define VTLB_SIZE       (1UL<<VTLB_SHIFT)
-#define VTLB_NUM_ENTRIES (1<<(VTLB_SHIFT-5))
-
-#define KVM_VPD_BASE   (KVM_VTLB_BASE+KVM_VTLB_BLOCK_SIZE)
-#define KVM_VPD_OFS    (KVM_VTLB_OFS+KVM_VTLB_BLOCK_SIZE)
-#define KVM_VPD_BLOCK_SIZE   (2UL<<20)
-#define VPD_SHIFT       16
-#define VPD_SIZE        (1UL<<VPD_SHIFT)
-
-#define KVM_VCPU_BASE   (KVM_VPD_BASE+KVM_VPD_BLOCK_SIZE)
-#define KVM_VCPU_OFS    (KVM_VPD_OFS+KVM_VPD_BLOCK_SIZE)
-#define KVM_VCPU_BLOCK_SIZE   (2UL<<20)
-#define VCPU_SHIFT 18
-#define VCPU_SIZE (1UL<<VCPU_SHIFT)
-#define MAX_VCPU_NUM KVM_VCPU_BLOCK_SIZE/VCPU_SIZE
-
-#define KVM_VM_BASE     (KVM_VCPU_BASE+KVM_VCPU_BLOCK_SIZE)
-#define KVM_VM_OFS      (KVM_VCPU_OFS+KVM_VCPU_BLOCK_SIZE)
-#define KVM_VM_BLOCK_SIZE     (1UL<<19)
-
-#define KVM_MEM_DIRTY_LOG_BASE (KVM_VM_BASE+KVM_VM_BLOCK_SIZE)
-#define KVM_MEM_DIRTY_LOG_OFS  (KVM_VM_OFS+KVM_VM_BLOCK_SIZE)
-#define KVM_MEM_DIRTY_LOG_SIZE (1UL<<19)
-
-/* Get vpd, vhpt, tlb, vcpu, base*/
-#define VPD_ADDR(n) (KVM_VPD_BASE+n*VPD_SIZE)
-#define VHPT_ADDR(n) (KVM_VHPT_BASE+n*VHPT_SIZE)
-#define VTLB_ADDR(n) (KVM_VTLB_BASE+n*VTLB_SIZE)
-#define VCPU_ADDR(n) (KVM_VCPU_BASE+n*VCPU_SIZE)
+#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20)
+
+/*
+ * kvm guest's data area looks as follow:
+ *
+ *            +----------------------+ ------- KVM_VM_DATA_SIZE
+ *           |     vcpu[n]'s data   |   |     ___________________KVM_STK_OFFSET
+ *                   |                      |   |    /                   |
+ *                   |        ..........    |   |   /vcpu's struct&stack |
+ *                   |        ..........    |   |  /---------------------|---- 0
+ *           |     vcpu[5]'s data   |   | /       vpd            |
+ *           |     vcpu[4]'s data   |   |/-----------------------|
+ *           |     vcpu[3]'s data   |   /         vtlb           |
+ *           |     vcpu[2]'s data   |  /|------------------------|
+ *           |     vcpu[1]'s data   |/  |         vhpt           |
+ *           |     vcpu[0]'s data   |____________________________|
+ *            +----------------------+  |
+ *           |    memory dirty log  |   |
+ *            +----------------------+  |
+ *           |    vm's data struct  |   |
+ *            +----------------------+  |
+ *           |                      |   |
+ *           |                      |   |
+ *           |                      |   |
+ *           |                      |   |
+ *           |                      |   |
+ *           |                      |   |
+ *           |                      |   |
+ *           |   vm's p2m table  |      |
+ *           |                      |   |
+ *            |                             |   |
+ *           |                      |   |  |
+ * vm's data->|                             |   |  |
+ *           +----------------------+ ------- 0
+ * To support large memory, needs to increase the size of p2m.
+ * To support more vcpus, needs to ensure it has enough space to
+ * hold vcpus' data.
+ */
+
+#define KVM_VM_DATA_SHIFT      26
+#define KVM_VM_DATA_SIZE       (__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
+#define KVM_VM_DATA_BASE       (KVM_VMM_BASE + KVM_VM_DATA_SIZE)
+
+#define KVM_P2M_BASE           KVM_VM_DATA_BASE
+#define KVM_P2M_SIZE           (__IA64_UL_CONST(24) << 20)
+
+#define VHPT_SHIFT             16
+#define VHPT_SIZE              (__IA64_UL_CONST(1) << VHPT_SHIFT)
+#define VHPT_NUM_ENTRIES       (__IA64_UL_CONST(1) << (VHPT_SHIFT-5))
+
+#define VTLB_SHIFT             16
+#define VTLB_SIZE              (__IA64_UL_CONST(1) << VTLB_SHIFT)
+#define VTLB_NUM_ENTRIES       (1UL << (VHPT_SHIFT-5))
+
+#define VPD_SHIFT              16
+#define VPD_SIZE               (__IA64_UL_CONST(1) << VPD_SHIFT)
+
+#define VCPU_STRUCT_SHIFT      16
+#define VCPU_STRUCT_SIZE       (__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
+
+#define KVM_STK_OFFSET         VCPU_STRUCT_SIZE
+
+#define KVM_VM_STRUCT_SHIFT    19
+#define KVM_VM_STRUCT_SIZE     (__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
+
+#define KVM_MEM_DIRY_LOG_SHIFT 19
+#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT)
+
+#ifndef __ASSEMBLY__
+
+/*Define the max vcpus and memory for Guests.*/
+#define KVM_MAX_VCPUS  (KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\
+                       KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
+#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
+
+#define VMM_LOG_LEN 256
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pal.h>
+#include <asm/sal.h>
+#include <asm/page.h>
+
+struct kvm_vcpu_data {
+       char vcpu_vhpt[VHPT_SIZE];
+       char vcpu_vtlb[VTLB_SIZE];
+       char vcpu_vpd[VPD_SIZE];
+       char vcpu_struct[VCPU_STRUCT_SIZE];
+};
+
+struct kvm_vm_data {
+       char kvm_p2m[KVM_P2M_SIZE];
+       char kvm_vm_struct[KVM_VM_STRUCT_SIZE];
+       char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE];
+       struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
+};
+
+#define VCPU_BASE(n)   KVM_VM_DATA_BASE + \
+                               offsetof(struct kvm_vm_data, vcpu_data[n])
+#define VM_BASE                KVM_VM_DATA_BASE + \
+                               offsetof(struct kvm_vm_data, kvm_vm_struct)
+#define KVM_MEM_DIRTY_LOG_BASE KVM_VM_DATA_BASE + \
+                               offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
+
+#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt))
+#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb))
+#define VPD_BASE(n)  (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd))
+#define VCPU_STRUCT_BASE(n)    (VCPU_BASE(n) + \
+                               offsetof(struct kvm_vcpu_data, vcpu_struct))
 
 /*IO section definitions*/
 #define IOREQ_READ      1
@@ -389,6 +440,7 @@ struct kvm_vcpu_arch {
 
        unsigned long opcode;
        unsigned long cause;
+       char log_buf[VMM_LOG_LEN];
        union context host;
        union context guest;
 };
@@ -403,14 +455,13 @@ struct kvm_sal_data {
 };
 
 struct kvm_arch {
+       spinlock_t dirty_log_lock;
+
        unsigned long   vm_base;
        unsigned long   metaphysical_rr0;
        unsigned long   metaphysical_rr4;
        unsigned long   vmm_init_rr;
-       unsigned long   vhpt_base;
-       unsigned long   vtlb_base;
-       unsigned long   vpd_base;
-       spinlock_t dirty_log_lock;
+
        struct kvm_ioapic *vioapic;
        struct kvm_vm_stat stat;
        struct kvm_sal_data rdv_sal_data;
@@ -512,7 +563,7 @@ struct kvm_pt_regs {
 
 static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
 {
-       return (struct kvm_pt_regs *) ((unsigned long) v + IA64_STK_OFFSET) - 1;
+       return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1;
 }
 
 typedef int kvm_vmm_entry(void);
@@ -531,5 +582,6 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
 static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
+#endif /* __ASSEMBLY__*/
 
 #endif
index 12d96e0cd5134d9d95e01e6714b5b75179dcfcf7..21c402365d0ea7f86bf63cbf0009bb7c7e266642 100644 (file)
@@ -57,7 +57,6 @@ extern struct smp_boot_data {
 
 extern char no_int_routing __devinitdata;
 
-extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_core_map[NR_CPUS];
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 extern int smp_num_siblings;
index 35bcb641c9e5731abb025c879d0acbb65a957ad6..a3cc9f65f954e7231a6fe0cba79ee5a0a22f5dc1 100644 (file)
@@ -55,7 +55,6 @@
 void build_cpu_to_node_map(void);
 
 #define SD_CPU_INIT (struct sched_domain) {            \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
@@ -80,7 +79,6 @@ void build_cpu_to_node_map(void);
 
 /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 5c4674ae8aea0279f1abba8cff8bdade994e0131..c8adecd5b4164a5e48264fc252c5dc8b43e6d9fd 100644 (file)
@@ -330,25 +330,25 @@ unmask_irq (unsigned int irq)
 
 
 static void
-iosapic_set_affinity (unsigned int irq, cpumask_t mask)
+iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
        u32 high32, low32;
-       int dest, rte_index;
+       int cpu, dest, rte_index;
        int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
        struct iosapic_rte_info *rte;
        struct iosapic *iosapic;
 
        irq &= (~IA64_IRQ_REDIRECTED);
 
-       cpus_and(mask, mask, cpu_online_map);
-       if (cpus_empty(mask))
+       cpu = cpumask_first_and(cpu_online_mask, mask);
+       if (cpu >= nr_cpu_ids)
                return;
 
-       if (irq_prepare_move(irq, first_cpu(mask)))
+       if (irq_prepare_move(irq, cpu))
                return;
 
-       dest = cpu_physical_id(first_cpu(mask));
+       dest = cpu_physical_id(cpu);
 
        if (!iosapic_intr_info[irq].count)
                return;                 /* not an IOSAPIC interrupt */
index 7fd18f54c0568980238cc75a7810ea51de4a4308..0b6db53fedcf3b05a8417d71572143bb993752da 100644 (file)
@@ -133,7 +133,6 @@ unsigned int vectors_in_migration[NR_IRQS];
  */
 static void migrate_irqs(void)
 {
-       cpumask_t       mask;
        irq_desc_t *desc;
        int             irq, new_cpu;
 
@@ -152,15 +151,14 @@ static void migrate_irqs(void)
                if (desc->status == IRQ_PER_CPU)
                        continue;
 
-               cpus_and(mask, irq_desc[irq].affinity, cpu_online_map);
-               if (any_online_cpu(mask) == NR_CPUS) {
+               if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+                   >= nr_cpu_ids) {
                        /*
                         * Save it for phase 2 processing
                         */
                        vectors_in_migration[irq] = irq;
 
                        new_cpu = any_online_cpu(cpu_online_map);
-                       mask = cpumask_of_cpu(new_cpu);
 
                        /*
                         * Al three are essential, currently WARN_ON.. maybe panic?
@@ -168,7 +166,8 @@ static void migrate_irqs(void)
                        if (desc->chip && desc->chip->disable &&
                                desc->chip->enable && desc->chip->set_affinity) {
                                desc->chip->disable(irq);
-                               desc->chip->set_affinity(irq, mask);
+                               desc->chip->set_affinity(irq,
+                                                        cpumask_of(new_cpu));
                                desc->chip->enable(irq);
                        } else {
                                WARN_ON((!(desc->chip) || !(desc->chip->disable) ||
index 702a09c132387577fa7d3b954a499feaa59705bc..890339339035b74814721c799a024fa6d184438a 100644 (file)
 static struct irq_chip ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void ia64_set_msi_irq_affinity(unsigned int irq,
+                                     const cpumask_t *cpu_mask)
 {
        struct msi_msg msg;
        u32 addr, data;
-       int cpu = first_cpu(cpu_mask);
+       int cpu = first_cpu(*cpu_mask);
 
        if (!cpu_online(cpu))
                return;
@@ -166,12 +167,11 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_cfg *cfg = irq_cfg + irq;
        struct msi_msg msg;
-       int cpu = first_cpu(mask);
-
+       int cpu = cpumask_first(mask);
 
        if (!cpu_online(cpu))
                return;
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
 
        dmar_msi_write(irq, &msg);
-       irq_desc[irq].affinity = mask;
+       irq_desc[irq].affinity = *mask;
 }
 #endif /* CONFIG_SMP */
 
index 1dcbb85fc4ee9f05253d5d76b5e8033b39a07cf9..11463994a7d540b63de364b80d3077a5f51f6445 100644 (file)
@@ -131,12 +131,6 @@ struct task_struct *task_for_booting_cpu;
  */
 DEFINE_PER_CPU(int, cpu_state);
 
-/* Bitmasks of currently online, and possible CPUs */
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
-
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_core_map);
 DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
@@ -688,7 +682,7 @@ int migrate_platform_irqs(unsigned int cpu)
 {
        int new_cpei_cpu;
        irq_desc_t *desc = NULL;
-       cpumask_t       mask;
+       const struct cpumask *mask;
        int             retval = 0;
 
        /*
@@ -701,7 +695,7 @@ int migrate_platform_irqs(unsigned int cpu)
                         * Now re-target the CPEI to a different processor
                         */
                        new_cpei_cpu = any_online_cpu(cpu_online_map);
-                       mask = cpumask_of_cpu(new_cpei_cpu);
+                       mask = cpumask_of(new_cpei_cpu);
                        set_cpei_target_cpu(new_cpei_cpu);
                        desc = irq_desc + ia64_cpe_irq;
                        /*
index c75b914f2d6bb587c413990663e864f573c9368a..a8d61a3e9a948c2fbc3cd33b2621f75737ab89b0 100644 (file)
@@ -219,7 +219,7 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
        cpumask_t shared_cpu_map;
 
        cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map);
-       len = cpumask_scnprintf(buf, NR_CPUS+1, shared_cpu_map);
+       len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map);
        len += sprintf(buf+len, "\n");
        return len;
 }
index 92cef66ca268b8631db5b3b72b9887cb8f7dc446..76464dc312e63f781ded9f8abcbcfb8322c29a89 100644 (file)
@@ -60,7 +60,7 @@ obj-$(CONFIG_KVM) += kvm.o
 
 CFLAGS_vcpu.o += -mfixed-range=f2-f5,f12-f127
 kvm-intel-objs = vmm.o vmm_ivt.o trampoline.o vcpu.o optvfault.o mmio.o \
-       vtlb.o process.o
+       vtlb.o process.o kvm_lib.o
 #Add link memcpy and memset to avoid possible structure assignment error
 kvm-intel-objs += memcpy.o memset.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
index 4e3dc13a619c6782f94337a4a2873a4a208058c1..0c3564a7a033a25b2df4996c3b7f14169dcc8064 100644 (file)
 
 #include <linux/autoconf.h>
 #include <linux/kvm_host.h>
+#include <linux/kbuild.h>
 
 #include "vcpu.h"
 
-#define task_struct kvm_vcpu
-
-#define DEFINE(sym, val) \
-       asm volatile("\n->" #sym " (%0) " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : :)
-
-#define OFFSET(_sym, _str, _mem) \
-    DEFINE(_sym, offsetof(_str, _mem));
-
 void foo(void)
 {
        DEFINE(VMM_TASK_SIZE, sizeof(struct kvm_vcpu));
index af1464f7a6ad270ae6de3783f7d6096aba726294..0f5ebd94843725e11277603dcb87924fd0bd2df0 100644 (file)
@@ -180,7 +180,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 
        switch (ext) {
        case KVM_CAP_IRQCHIP:
-       case KVM_CAP_USER_MEMORY:
        case KVM_CAP_MP_STATE:
 
                r = 1;
@@ -439,7 +438,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
                expires = div64_u64(itc_diff, cyc_per_usec);
                kt = ktime_set(0, 1000 * expires);
 
-               down_read(&vcpu->kvm->slots_lock);
                vcpu->arch.ht_active = 1;
                hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
 
@@ -452,7 +450,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
                        if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
                                vcpu->arch.mp_state =
                                        KVM_MP_STATE_RUNNABLE;
-               up_read(&vcpu->kvm->slots_lock);
 
                if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
                        return -EINTR;
@@ -476,6 +473,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
        return 1;
 }
 
+static int handle_vcpu_debug(struct kvm_vcpu *vcpu,
+                               struct kvm_run *kvm_run)
+{
+       printk("VMM: %s", vcpu->arch.log_buf);
+       return 1;
+}
+
 static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
                struct kvm_run *kvm_run) = {
        [EXIT_REASON_VM_PANIC]              = handle_vm_error,
@@ -487,6 +491,7 @@ static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
        [EXIT_REASON_EXTERNAL_INTERRUPT]    = handle_external_interrupt,
        [EXIT_REASON_IPI]                   = handle_ipi,
        [EXIT_REASON_PTC_G]                 = handle_global_purge,
+       [EXIT_REASON_DEBUG]                 = handle_vcpu_debug,
 
 };
 
@@ -698,27 +703,24 @@ out:
        return r;
 }
 
-/*
- * Allocate 16M memory for every vm to hold its specific data.
- * Its memory map is defined in kvm_host.h.
- */
 static struct kvm *kvm_alloc_kvm(void)
 {
 
        struct kvm *kvm;
        uint64_t  vm_base;
 
+       BUG_ON(sizeof(struct kvm) > KVM_VM_STRUCT_SIZE);
+
        vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
 
        if (!vm_base)
                return ERR_PTR(-ENOMEM);
-       printk(KERN_DEBUG"kvm: VM data's base Address:0x%lx\n", vm_base);
 
-       /* Zero all pages before use! */
        memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
-
-       kvm = (struct kvm *)(vm_base + KVM_VM_OFS);
+       kvm = (struct kvm *)(vm_base +
+                       offsetof(struct kvm_vm_data, kvm_vm_struct));
        kvm->arch.vm_base = vm_base;
+       printk(KERN_DEBUG"kvm: vm's data area:0x%lx\n", vm_base);
 
        return kvm;
 }
@@ -760,21 +762,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 
 static void kvm_init_vm(struct kvm *kvm)
 {
-       long vm_base;
-
        BUG_ON(!kvm);
 
        kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
        kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
        kvm->arch.vmm_init_rr = VMM_INIT_RR;
 
-       vm_base = kvm->arch.vm_base;
-       if (vm_base) {
-               kvm->arch.vhpt_base = vm_base + KVM_VHPT_OFS;
-               kvm->arch.vtlb_base = vm_base + KVM_VTLB_OFS;
-               kvm->arch.vpd_base  = vm_base + KVM_VPD_OFS;
-       }
-
        /*
         *Fill P2M entries for MMIO/IO ranges
         */
@@ -838,9 +831,8 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       int i;
        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
-       int r;
+       int i;
 
        vcpu_load(vcpu);
 
@@ -857,18 +849,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
        vpd->vpr = regs->vpd.vpr;
 
-       r = -EFAULT;
-       r = copy_from_user(&vcpu->arch.guest, regs->saved_guest,
-                                               sizeof(union context));
-       if (r)
-               goto out;
-       r = copy_from_user(vcpu + 1, regs->saved_stack +
-                       sizeof(struct kvm_vcpu),
-                       IA64_STK_OFFSET - sizeof(struct kvm_vcpu));
-       if (r)
-               goto out;
-       vcpu->arch.exit_data =
-               ((struct kvm_vcpu *)(regs->saved_stack))->arch.exit_data;
+       memcpy(&vcpu->arch.guest, &regs->saved_guest, sizeof(union context));
 
        RESTORE_REGS(mp_state);
        RESTORE_REGS(vmm_rr);
@@ -902,9 +883,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
        vcpu_put(vcpu);
-       r = 0;
-out:
-       return r;
+
+       return 0;
 }
 
 long kvm_arch_vm_ioctl(struct file *filp,
@@ -1166,10 +1146,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
                /*Set entry address for first run.*/
                regs->cr_iip = PALE_RESET_ENTRY;
 
-               /*Initilize itc offset for vcpus*/
+               /*Initialize itc offset for vcpus*/
                itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
-               for (i = 0; i < MAX_VCPU_NUM; i++) {
-                       v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i);
+               for (i = 0; i < KVM_MAX_VCPUS; i++) {
+                       v = (struct kvm_vcpu *)((char *)vcpu +
+                                       sizeof(struct kvm_vcpu_data) * i);
                        v->arch.itc_offset = itc_offset;
                        v->arch.last_itc = 0;
                }
@@ -1183,7 +1164,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        vcpu->arch.apic->vcpu = vcpu;
 
        p_ctx->gr[1] = 0;
-       p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + IA64_STK_OFFSET);
+       p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + KVM_STK_OFFSET);
        p_ctx->gr[13] = (unsigned long)vmm_vcpu;
        p_ctx->psr = 0x1008522000UL;
        p_ctx->ar[40] = FPSR_DEFAULT; /*fpsr*/
@@ -1218,12 +1199,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        vcpu->arch.hlt_timer.function = hlt_timer_fn;
 
        vcpu->arch.last_run_cpu = -1;
-       vcpu->arch.vpd = (struct vpd *)VPD_ADDR(vcpu->vcpu_id);
+       vcpu->arch.vpd = (struct vpd *)VPD_BASE(vcpu->vcpu_id);
        vcpu->arch.vsa_base = kvm_vsa_base;
        vcpu->arch.__gp = kvm_vmm_gp;
        vcpu->arch.dirty_log_lock_pa = __pa(&kvm->arch.dirty_log_lock);
-       vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_ADDR(vcpu->vcpu_id);
-       vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_ADDR(vcpu->vcpu_id);
+       vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_BASE(vcpu->vcpu_id);
+       vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_BASE(vcpu->vcpu_id);
        init_ptce_info(vcpu);
 
        r = 0;
@@ -1273,12 +1254,22 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        int r;
        int cpu;
 
+       BUG_ON(sizeof(struct kvm_vcpu) > VCPU_STRUCT_SIZE/2);
+
+       r = -EINVAL;
+       if (id >= KVM_MAX_VCPUS) {
+               printk(KERN_ERR"kvm: Can't configure vcpus > %ld",
+                               KVM_MAX_VCPUS);
+               goto fail;
+       }
+
        r = -ENOMEM;
        if (!vm_base) {
                printk(KERN_ERR"kvm: Create vcpu[%d] error!\n", id);
                goto fail;
        }
-       vcpu = (struct kvm_vcpu *)(vm_base + KVM_VCPU_OFS + VCPU_SIZE * id);
+       vcpu = (struct kvm_vcpu *)(vm_base + offsetof(struct kvm_vm_data,
+                                       vcpu_data[id].vcpu_struct));
        vcpu->kvm = kvm;
 
        cpu = get_cpu();
@@ -1374,9 +1365,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-       int i;
-       int r;
        struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+       int i;
+
        vcpu_load(vcpu);
 
        for (i = 0; i < 16; i++) {
@@ -1391,14 +1382,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        regs->vpd.vpsr = vpd->vpsr;
        regs->vpd.vpr = vpd->vpr;
 
-       r = -EFAULT;
-       r = copy_to_user(regs->saved_guest, &vcpu->arch.guest,
-                                       sizeof(union context));
-       if (r)
-               goto out;
-       r = copy_to_user(regs->saved_stack, (void *)vcpu, IA64_STK_OFFSET);
-       if (r)
-               goto out;
+       memcpy(&regs->saved_guest, &vcpu->arch.guest, sizeof(union context));
+
        SAVE_REGS(mp_state);
        SAVE_REGS(vmm_rr);
        memcpy(regs->itrs, vcpu->arch.itrs, sizeof(struct thash_data) * NITRS);
@@ -1426,10 +1411,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        SAVE_REGS(metaphysical_saved_rr4);
        SAVE_REGS(fp_psr);
        SAVE_REGS(saved_gp);
+
        vcpu_put(vcpu);
-       r = 0;
-out:
-       return r;
+       return 0;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
@@ -1457,6 +1441,9 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
        struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
        unsigned long base_gfn = memslot->base_gfn;
 
+       if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT))
+               return -ENOMEM;
+
        for (i = 0; i < npages; i++) {
                pfn = gfn_to_pfn(kvm, base_gfn + i);
                if (!kvm_is_mmio_pfn(pfn)) {
@@ -1631,8 +1618,8 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
        struct kvm_memory_slot *memslot;
        int r, i;
        long n, base;
-       unsigned long *dirty_bitmap = (unsigned long *)((void *)kvm - KVM_VM_OFS
-                                       + KVM_MEM_DIRTY_LOG_OFS);
+       unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
+                       offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
        r = -EINVAL;
        if (log->slot >= KVM_MEMORY_SLOTS)
diff --git a/arch/ia64/kvm/kvm_lib.c b/arch/ia64/kvm/kvm_lib.c
new file mode 100644 (file)
index 0000000..a85cb61
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * kvm_lib.c: Compile some libraries for kvm-intel module.
+ *
+ *     Just include kernel's library, and disable symbols export.
+ *     Copyright (C) 2008, Intel Corporation.
+ *     Xiantao Zhang  (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#undef CONFIG_MODULES
+#include "../../../lib/vsprintf.c"
+#include "../../../lib/ctype.c"
index 2cc41d17cf991855d31460c1025137589e037461..b2bcaa2787aad228a9c24e99dca6bd35fda2422e 100644 (file)
@@ -24,6 +24,8 @@
 #include <asm/asmmacro.h>
 #include <asm/types.h>
 #include <asm/kregs.h>
+#include <asm/kvm_host.h>
+
 #include "asm-offsets.h"
 
 #define KVM_MINSTATE_START_SAVE_MIN                                            \
@@ -33,7 +35,7 @@
        addl r22 = VMM_RBS_OFFSET,r1;            /* compute base of RBS */      \
        ;;                                                                      \
        lfetch.fault.excl.nt1 [r22];                                            \
-       addl r1 = IA64_STK_OFFSET-VMM_PT_REGS_SIZE,r1;  /* compute base of memory stack */  \
+       addl r1 = KVM_STK_OFFSET-VMM_PT_REGS_SIZE, r1;  \
        mov r23 = ar.bspstore;                  /* save ar.bspstore */          \
        ;;                                                                      \
        mov ar.bspstore = r22;                          /* switch to kernel RBS */\
index e585c46073443de4cdd4687301876255e7fab87a..dd979e00b574a990374a4e715a3ab123586719c4 100644 (file)
@@ -27,7 +27,8 @@
  */
 static inline uint64_t *kvm_host_get_pmt(struct kvm *kvm)
 {
-       return (uint64_t *)(kvm->arch.vm_base + KVM_P2M_OFS);
+       return (uint64_t *)(kvm->arch.vm_base +
+                               offsetof(struct kvm_vm_data, kvm_p2m));
 }
 
 static inline void kvm_set_pmt_entry(struct kvm *kvm, gfn_t gfn,
index 7f1a858bc69f64c0156748e1d07c14f42066d208..21f63fffc379ad8a7faf227e1867789ec13c502d 100644 (file)
@@ -66,31 +66,25 @@ void lsapic_write(struct kvm_vcpu *v, unsigned long addr,
 
        switch (addr) {
        case PIB_OFST_INTA:
-               /*panic_domain(NULL, "Undefined write on PIB INTA\n");*/
-               panic_vm(v);
+               panic_vm(v, "Undefined write on PIB INTA\n");
                break;
        case PIB_OFST_XTP:
                if (length == 1) {
                        vlsapic_write_xtp(v, val);
                } else {
-                       /*panic_domain(NULL,
-                       "Undefined write on PIB XTP\n");*/
-                       panic_vm(v);
+                       panic_vm(v, "Undefined write on PIB XTP\n");
                }
                break;
        default:
                if (PIB_LOW_HALF(addr)) {
-                       /*lower half */
+                       /*Lower half */
                        if (length != 8)
-                               /*panic_domain(NULL,
-                               "Can't LHF write with size %ld!\n",
-                               length);*/
-                               panic_vm(v);
+                               panic_vm(v, "Can't LHF write with size %ld!\n",
+                                               length);
                        else
                                vlsapic_write_ipi(v, addr, val);
-               } else {   /*   upper half
-                               printk("IPI-UHF write %lx\n",addr);*/
-                       panic_vm(v);
+               } else {   /*Upper half */
+                       panic_vm(v, "IPI-UHF write %lx\n", addr);
                }
                break;
        }
@@ -108,22 +102,18 @@ unsigned long lsapic_read(struct kvm_vcpu *v, unsigned long addr,
                if (length == 1) /* 1 byte load */
                        ; /* There is no i8259, there is no INTA access*/
                else
-                       /*panic_domain(NULL,"Undefined read on PIB INTA\n"); */
-                       panic_vm(v);
+                       panic_vm(v, "Undefined read on PIB INTA\n");
 
                break;
        case PIB_OFST_XTP:
                if (length == 1) {
                        result = VLSAPIC_XTP(v);
-                       /* printk("read xtp %lx\n", result); */
                } else {
-                       /*panic_domain(NULL,
-                       "Undefined read on PIB XTP\n");*/
-                       panic_vm(v);
+                       panic_vm(v, "Undefined read on PIB XTP\n");
                }
                break;
        default:
-               panic_vm(v);
+               panic_vm(v, "Undefined addr access for lsapic!\n");
                break;
        }
        return result;
@@ -162,7 +152,7 @@ static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest,
                        /* it's necessary to ensure zero extending */
                        *dest = p->u.ioreq.data & (~0UL >> (64-(s*8)));
        } else
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Unhandled mmio access returned!\n");
 out:
        local_irq_restore(psr);
        return ;
@@ -324,7 +314,9 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
                return;
        } else {
                inst_type = -1;
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Unsupported MMIO access instruction! \
+                               Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
+                               bundle.i64[0], bundle.i64[1]);
        }
 
        size = 1 << size;
@@ -335,7 +327,7 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
                if (inst_type == SL_INTEGER)
                        vcpu_set_gr(vcpu, inst.M1.r1, data, 0);
                else
-                       panic_vm(vcpu);
+                       panic_vm(vcpu, "Unsupported instruction type!\n");
 
        }
        vcpu_increment_iip(vcpu);
index 800817307b7b9da774cb678373476537b2c1e28e..552d07724207e213b5949e2dd15c00bc31c74dd3 100644 (file)
@@ -527,7 +527,8 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim,
        vector = vec2off[vec];
 
        if (!(vpsr & IA64_PSR_IC) && (vector != IA64_DATA_NESTED_TLB_VECTOR)) {
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Interruption with vector :0x%lx occurs "
+                                               "with psr.ic = 0\n", vector);
                return;
        }
 
@@ -586,7 +587,7 @@ static void set_pal_call_result(struct kvm_vcpu *vcpu)
                vcpu_set_gr(vcpu, 10, p->u.pal_data.ret.v1, 0);
                vcpu_set_gr(vcpu, 11, p->u.pal_data.ret.v2, 0);
        } else
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Mis-set for exit reason!\n");
 }
 
 static void set_sal_call_data(struct kvm_vcpu *vcpu)
@@ -614,7 +615,7 @@ static void set_sal_call_result(struct kvm_vcpu *vcpu)
                vcpu_set_gr(vcpu, 10, p->u.sal_data.ret.r10, 0);
                vcpu_set_gr(vcpu, 11, p->u.sal_data.ret.r11, 0);
        } else
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Mis-set for exit reason!\n");
 }
 
 void  kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
@@ -680,7 +681,7 @@ static void generate_exirq(struct kvm_vcpu *vcpu)
        vpsr = VCPU(vcpu, vpsr);
        isr = vpsr & IA64_PSR_RI;
        if (!(vpsr & IA64_PSR_IC))
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Trying to inject one IRQ with psr.ic=0\n");
        reflect_interruption(0, isr, 0, 12, regs); /* EXT IRQ */
 }
 
@@ -941,8 +942,20 @@ static void vcpu_do_resume(struct kvm_vcpu *vcpu)
        ia64_set_pta(vcpu->arch.vhpt.pta.val);
 }
 
+static void vmm_sanity_check(struct kvm_vcpu *vcpu)
+{
+       struct exit_ctl_data *p = &vcpu->arch.exit_data;
+
+       if (!vmm_sanity && p->exit_reason != EXIT_REASON_DEBUG) {
+               panic_vm(vcpu, "Failed to do vmm sanity check,"
+                       "it maybe caused by crashed vmm!!\n\n");
+       }
+}
+
 static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 {
+       vmm_sanity_check(vcpu); /*Guarantee vcpu runing on healthy vmm!*/
+
        if (test_and_clear_bit(KVM_REQ_RESUME, &vcpu->requests)) {
                vcpu_do_resume(vcpu);
                return;
@@ -968,3 +981,11 @@ void vmm_transition(struct kvm_vcpu *vcpu)
                                                1, 0, 0, 0, 0, 0);
        kvm_do_resume_op(vcpu);
 }
+
+void vmm_panic_handler(u64 vec)
+{
+       struct kvm_vcpu *vcpu = current_vcpu;
+       vmm_sanity = 0;
+       panic_vm(vcpu, "Unexpected interruption occurs in VMM, vector:0x%lx\n",
+                       vec2off[vec]);
+}
index e44027ce5667e553b89abab2abb95012b98f8048..ecd526b5532305002b008b8936d3d7d7d6c9f54d 100644 (file)
@@ -816,8 +816,9 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
        unsigned long vitv = VCPU(vcpu, itv);
 
        if (vcpu->vcpu_id == 0) {
-               for (i = 0; i < MAX_VCPU_NUM; i++) {
-                       v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i);
+               for (i = 0; i < KVM_MAX_VCPUS; i++) {
+                       v = (struct kvm_vcpu *)((char *)vcpu +
+                                       sizeof(struct kvm_vcpu_data) * i);
                        VMX(v, itc_offset) = itc_offset;
                        VMX(v, last_itc) = 0;
                }
@@ -1650,7 +1651,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
         * Otherwise panic
         */
        if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
-               panic_vm(vcpu);
+               panic_vm(vcpu, "Only support guests with vpsr.pk =0 \
+                               & vpsr.is=0\n");
 
        /*
         * For those IA64_PSR bits: id/da/dd/ss/ed/ia
@@ -2103,7 +2105,7 @@ void kvm_init_all_rr(struct kvm_vcpu *vcpu)
 
        if (is_physical_mode(vcpu)) {
                if (vcpu->arch.mode_flags & GUEST_PHY_EMUL)
-                       panic_vm(vcpu);
+                       panic_vm(vcpu, "Machine Status conflicts!\n");
 
                ia64_set_rr((VRN0 << VRN_SHIFT), vcpu->arch.metaphysical_rr0);
                ia64_dv_serialize_data();
@@ -2152,10 +2154,70 @@ int vmm_entry(void)
        return 0;
 }
 
-void panic_vm(struct kvm_vcpu *v)
-{
+static void kvm_show_registers(struct kvm_pt_regs *regs)
+{
+       unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
+
+       struct kvm_vcpu *vcpu = current_vcpu;
+       if (vcpu != NULL)
+               printk("vcpu 0x%p vcpu %d\n",
+                      vcpu, vcpu->vcpu_id);
+
+       printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]\n",
+              regs->cr_ipsr, regs->cr_ifs, ip);
+
+       printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
+              regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
+       printk("rnat: %016lx bspstore: %016lx pr  : %016lx\n",
+              regs->ar_rnat, regs->ar_bspstore, regs->pr);
+       printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
+              regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
+       printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
+       printk("b0  : %016lx b6  : %016lx b7  : %016lx\n", regs->b0,
+                                                       regs->b6, regs->b7);
+       printk("f6  : %05lx%016lx f7  : %05lx%016lx\n",
+              regs->f6.u.bits[1], regs->f6.u.bits[0],
+              regs->f7.u.bits[1], regs->f7.u.bits[0]);
+       printk("f8  : %05lx%016lx f9  : %05lx%016lx\n",
+              regs->f8.u.bits[1], regs->f8.u.bits[0],
+              regs->f9.u.bits[1], regs->f9.u.bits[0]);
+       printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
+              regs->f10.u.bits[1], regs->f10.u.bits[0],
+              regs->f11.u.bits[1], regs->f11.u.bits[0]);
+
+       printk("r1  : %016lx r2  : %016lx r3  : %016lx\n", regs->r1,
+                                                       regs->r2, regs->r3);
+       printk("r8  : %016lx r9  : %016lx r10 : %016lx\n", regs->r8,
+                                                       regs->r9, regs->r10);
+       printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11,
+                                                       regs->r12, regs->r13);
+       printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14,
+                                                       regs->r15, regs->r16);
+       printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17,
+                                                       regs->r18, regs->r19);
+       printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20,
+                                                       regs->r21, regs->r22);
+       printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23,
+                                                       regs->r24, regs->r25);
+       printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26,
+                                                       regs->r27, regs->r28);
+       printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29,
+                                                       regs->r30, regs->r31);
+
+}
+
+void panic_vm(struct kvm_vcpu *v, const char *fmt, ...)
+{
+       va_list args;
+       char buf[256];
+
+       struct kvm_pt_regs *regs = vcpu_regs(v);
        struct exit_ctl_data *p = &v->arch.exit_data;
-
+       va_start(args, fmt);
+       vsnprintf(buf, sizeof(buf), fmt, args);
+       va_end(args);
+       printk(buf);
+       kvm_show_registers(regs);
        p->exit_reason = EXIT_REASON_VM_PANIC;
        vmm_transition(v);
        /*Never to return*/
index e9b2a4e121c052636745927e40964417c5955812..b2f12a562bdf60fd4462378c5aa75c44ea882841 100644 (file)
@@ -737,9 +737,12 @@ void kvm_init_vtlb(struct kvm_vcpu *v);
 void kvm_init_vhpt(struct kvm_vcpu *v);
 void thash_init(struct thash_cb *hcb, u64 sz);
 
-void panic_vm(struct kvm_vcpu *v);
+void panic_vm(struct kvm_vcpu *v, const char *fmt, ...);
 
 extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
                u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+
+extern long vmm_sanity;
+
 #endif
 #endif /* __VCPU_H__ */
index 2275bf4e681a891c47cbb2002b8b0e714b0508a0..9eee5c04baccceaaf646d72922d3403cf313e2d5 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 
+#include<linux/kernel.h>
 #include<linux/module.h>
 #include<asm/fpswa.h>
 
@@ -31,6 +32,8 @@ MODULE_LICENSE("GPL");
 extern char kvm_ia64_ivt;
 extern fpswa_interface_t *vmm_fpswa_interface;
 
+long vmm_sanity = 1;
+
 struct kvm_vmm_info vmm_info = {
        .module      = THIS_MODULE,
        .vmm_entry   = vmm_entry,
@@ -62,5 +65,31 @@ void vmm_spin_unlock(spinlock_t *lock)
 {
        _vmm_raw_spin_unlock(lock);
 }
+
+static void vcpu_debug_exit(struct kvm_vcpu *vcpu)
+{
+       struct exit_ctl_data *p = &vcpu->arch.exit_data;
+       long psr;
+
+       local_irq_save(psr);
+       p->exit_reason = EXIT_REASON_DEBUG;
+       vmm_transition(vcpu);
+       local_irq_restore(psr);
+}
+
+asmlinkage int printk(const char *fmt, ...)
+{
+       struct kvm_vcpu *vcpu = current_vcpu;
+       va_list args;
+       int r;
+
+       memset(vcpu->arch.log_buf, 0, VMM_LOG_LEN);
+       va_start(args, fmt);
+       r = vsnprintf(vcpu->arch.log_buf, VMM_LOG_LEN, fmt, args);
+       va_end(args);
+       vcpu_debug_exit(vcpu);
+       return r;
+}
+
 module_init(kvm_vmm_init)
 module_exit(kvm_vmm_exit)
index c1d7251a148008bcbdcb57b9b0611ab9dc6a52ee..3ef1a017a3189e76e713767f56d7c3a9e95fdb3c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * /ia64/kvm_ivt.S
+ * arch/ia64/kvm/vmm_ivt.S
  *
  * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
  *      Stephane Eranian <eranian@hpl.hp.com>
 # define PSR_DEFAULT_BITS   0
 #endif
 
-
 #define KVM_FAULT(n)    \
-    kvm_fault_##n:;          \
-    mov r19=n;;          \
-    br.sptk.many kvm_fault_##n;         \
-    ;;                  \
-
+       kvm_fault_##n:;          \
+       mov r19=n;;          \
+       br.sptk.many kvm_vmm_panic;         \
+       ;;                  \
 
 #define KVM_REFLECT(n)    \
-    mov r31=pr;           \
-    mov r19=n;       /* prepare to save predicates */ \
-    mov r29=cr.ipsr;      \
-    ;;      \
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
-(p7)br.sptk.many kvm_dispatch_reflection;        \
-    br.sptk.many kvm_panic;      \
-
-
-GLOBAL_ENTRY(kvm_panic)
-    br.sptk.many kvm_panic
-    ;;
-END(kvm_panic)
-
-
-
-
+       mov r31=pr;           \
+       mov r19=n;       /* prepare to save predicates */ \
+       mov r29=cr.ipsr;      \
+       ;;      \
+       tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
+(p7)   br.sptk.many kvm_dispatch_reflection;        \
+       br.sptk.many kvm_vmm_panic;      \
+
+GLOBAL_ENTRY(kvm_vmm_panic)
+       KVM_SAVE_MIN_WITH_COVER_R19
+       alloc r14=ar.pfs,0,0,1,0
+       mov out0=r15
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i    // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       KVM_SAVE_REST
+       mov rp=r14
+       ;;
+       br.call.sptk.many b6=vmm_panic_handler;
+END(kvm_vmm_panic)
 
     .section .text.ivt,"ax"
 
@@ -105,308 +112,307 @@ kvm_ia64_ivt:
 ///////////////////////////////////////////////////////////////
 // 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
 ENTRY(kvm_vhpt_miss)
-    KVM_FAULT(0)
+       KVM_FAULT(0)
 END(kvm_vhpt_miss)
 
-
     .org kvm_ia64_ivt+0x400
 ////////////////////////////////////////////////////////////////
 // 0x0400 Entry 1 (size 64 bundles) ITLB (21)
 ENTRY(kvm_itlb_miss)
-    mov r31 = pr
-    mov r29=cr.ipsr;
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
-    (p6) br.sptk kvm_alt_itlb_miss
-    mov r19 = 1
-    br.sptk kvm_itlb_miss_dispatch
-    KVM_FAULT(1);
+       mov r31 = pr
+       mov r29=cr.ipsr;
+       ;;
+       tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+(p6)   br.sptk kvm_alt_itlb_miss
+       mov r19 = 1
+       br.sptk kvm_itlb_miss_dispatch
+       KVM_FAULT(1);
 END(kvm_itlb_miss)
 
     .org kvm_ia64_ivt+0x0800
 //////////////////////////////////////////////////////////////////
 // 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
 ENTRY(kvm_dtlb_miss)
-    mov r31 = pr
-    mov r29=cr.ipsr;
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
-(p6)br.sptk kvm_alt_dtlb_miss
-    br.sptk kvm_dtlb_miss_dispatch
+       mov r31 = pr
+       mov r29=cr.ipsr;
+       ;;
+       tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+(p6)   br.sptk kvm_alt_dtlb_miss
+       br.sptk kvm_dtlb_miss_dispatch
 END(kvm_dtlb_miss)
 
      .org kvm_ia64_ivt+0x0c00
 ////////////////////////////////////////////////////////////////////
 // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
 ENTRY(kvm_alt_itlb_miss)
-    mov r16=cr.ifa    // get address that caused the TLB miss
-    ;;
-    movl r17=PAGE_KERNEL
-    mov r24=cr.ipsr
-    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-    ;;
-    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
-    ;;
-    or r19=r17,r19      // insert PTE control bits into r19
-    ;;
-    movl r20=IA64_GRANULE_SHIFT<<2
-    ;;
-    mov cr.itir=r20
-    ;;
-    itc.i r19          // insert the TLB entry
-    mov pr=r31,-1
-    rfi
+       mov r16=cr.ifa    // get address that caused the TLB miss
+       ;;
+       movl r17=PAGE_KERNEL
+       mov r24=cr.ipsr
+       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+       ;;
+       and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+       ;;
+       or r19=r17,r19      // insert PTE control bits into r19
+       ;;
+       movl r20=IA64_GRANULE_SHIFT<<2
+       ;;
+       mov cr.itir=r20
+       ;;
+       itc.i r19               // insert the TLB entry
+       mov pr=r31,-1
+       rfi
 END(kvm_alt_itlb_miss)
 
     .org kvm_ia64_ivt+0x1000
 /////////////////////////////////////////////////////////////////////
 // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
 ENTRY(kvm_alt_dtlb_miss)
-    mov r16=cr.ifa             // get address that caused the TLB miss
-    ;;
-    movl r17=PAGE_KERNEL
-    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-    mov r24=cr.ipsr
-    ;;
-    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
-    ;;
-    or r19=r19,r17     // insert PTE control bits into r19
-    ;;
-    movl r20=IA64_GRANULE_SHIFT<<2
-    ;;
-    mov cr.itir=r20
-    ;;
-    itc.d r19          // insert the TLB entry
-    mov pr=r31,-1
-    rfi
+       mov r16=cr.ifa          // get address that caused the TLB miss
+       ;;
+       movl r17=PAGE_KERNEL
+       movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+       mov r24=cr.ipsr
+       ;;
+       and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+       ;;
+       or r19=r19,r17  // insert PTE control bits into r19
+       ;;
+       movl r20=IA64_GRANULE_SHIFT<<2
+       ;;
+       mov cr.itir=r20
+       ;;
+       itc.d r19               // insert the TLB entry
+       mov pr=r31,-1
+       rfi
 END(kvm_alt_dtlb_miss)
 
     .org kvm_ia64_ivt+0x1400
 //////////////////////////////////////////////////////////////////////
 // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
 ENTRY(kvm_nested_dtlb_miss)
-    KVM_FAULT(5)
+       KVM_FAULT(5)
 END(kvm_nested_dtlb_miss)
 
     .org kvm_ia64_ivt+0x1800
 /////////////////////////////////////////////////////////////////////
 // 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
 ENTRY(kvm_ikey_miss)
-    KVM_REFLECT(6)
+       KVM_REFLECT(6)
 END(kvm_ikey_miss)
 
     .org kvm_ia64_ivt+0x1c00
 /////////////////////////////////////////////////////////////////////
 // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
 ENTRY(kvm_dkey_miss)
-    KVM_REFLECT(7)
+       KVM_REFLECT(7)
 END(kvm_dkey_miss)
 
     .org kvm_ia64_ivt+0x2000
 ////////////////////////////////////////////////////////////////////
 // 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
 ENTRY(kvm_dirty_bit)
-    KVM_REFLECT(8)
+       KVM_REFLECT(8)
 END(kvm_dirty_bit)
 
     .org kvm_ia64_ivt+0x2400
 ////////////////////////////////////////////////////////////////////
 // 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
 ENTRY(kvm_iaccess_bit)
-    KVM_REFLECT(9)
+       KVM_REFLECT(9)
 END(kvm_iaccess_bit)
 
     .org kvm_ia64_ivt+0x2800
 ///////////////////////////////////////////////////////////////////
 // 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
 ENTRY(kvm_daccess_bit)
-    KVM_REFLECT(10)
+       KVM_REFLECT(10)
 END(kvm_daccess_bit)
 
     .org kvm_ia64_ivt+0x2c00
 /////////////////////////////////////////////////////////////////
 // 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
 ENTRY(kvm_break_fault)
-    mov r31=pr
-    mov r19=11
-    mov r29=cr.ipsr
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    ;;
-    alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!)
-    mov out0=cr.ifa
-    mov out2=cr.isr     // FIXME: pity to make this slow access twice
-    mov out3=cr.iim     // FIXME: pity to make this slow access twice
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15)ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out1=16,sp
-    br.call.sptk.many b6=kvm_ia64_handle_break
-    ;;
+       mov r31=pr
+       mov r19=11
+       mov r29=cr.ipsr
+       ;;
+       KVM_SAVE_MIN_WITH_COVER_R19
+       ;;
+       alloc r14=ar.pfs,0,0,4,0 //(must be first in insn group!)
+       mov out0=cr.ifa
+       mov out2=cr.isr     // FIXME: pity to make this slow access twice
+       mov out3=cr.iim     // FIXME: pity to make this slow access twice
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i         // guarantee that interruption collection is on
+       ;;
+       //(p15)ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       KVM_SAVE_REST
+       mov rp=r14
+       ;;
+       adds out1=16,sp
+       br.call.sptk.many b6=kvm_ia64_handle_break
+       ;;
 END(kvm_break_fault)
 
     .org kvm_ia64_ivt+0x3000
 /////////////////////////////////////////////////////////////////
 // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
 ENTRY(kvm_interrupt)
-    mov r31=pr         // prepare to save predicates
-    mov r19=12
-    mov r29=cr.ipsr
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT
-    tbit.z p0,p15=r29,IA64_PSR_I_BIT
-    ;;
-(p7) br.sptk kvm_dispatch_interrupt
-    ;;
-    mov r27=ar.rsc             /* M */
-    mov r20=r1                 /* A */
-    mov r25=ar.unat            /* M */
-    mov r26=ar.pfs             /* I */
-    mov r28=cr.iip             /* M */
-    cover                      /* B (or nothing) */
-    ;;
-    mov r1=sp
-    ;;
-    invala                     /* M */
-    mov r30=cr.ifs
-    ;;
-    addl r1=-VMM_PT_REGS_SIZE,r1
-    ;;
-    adds r17=2*L1_CACHE_BYTES,r1       /* really: biggest cache-line size */
-    adds r16=PT(CR_IPSR),r1
-    ;;
-    lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
-    st8 [r16]=r29                      /* save cr.ipsr */
-    ;;
-    lfetch.fault.excl.nt1 [r17]
-    mov r29=b0
-    ;;
-    adds r16=PT(R8),r1         /* initialize first base pointer */
-    adds r17=PT(R9),r1         /* initialize second base pointer */
-    mov r18=r0                 /* make sure r18 isn't NaT */
-    ;;
+       mov r31=pr              // prepare to save predicates
+       mov r19=12
+       mov r29=cr.ipsr
+       ;;
+       tbit.z p6,p7=r29,IA64_PSR_VM_BIT
+       tbit.z p0,p15=r29,IA64_PSR_I_BIT
+       ;;
+(p7)   br.sptk kvm_dispatch_interrupt
+       ;;
+       mov r27=ar.rsc          /* M */
+       mov r20=r1                      /* A */
+       mov r25=ar.unat         /* M */
+       mov r26=ar.pfs          /* I */
+       mov r28=cr.iip          /* M */
+       cover                   /* B (or nothing) */
+       ;;
+       mov r1=sp
+       ;;
+       invala                  /* M */
+       mov r30=cr.ifs
+       ;;
+       addl r1=-VMM_PT_REGS_SIZE,r1
+       ;;
+       adds r17=2*L1_CACHE_BYTES,r1    /* really: biggest cache-line size */
+       adds r16=PT(CR_IPSR),r1
+       ;;
+       lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
+       st8 [r16]=r29                   /* save cr.ipsr */
+       ;;
+       lfetch.fault.excl.nt1 [r17]
+       mov r29=b0
+       ;;
+       adds r16=PT(R8),r1      /* initialize first base pointer */
+       adds r17=PT(R9),r1      /* initialize second base pointer */
+       mov r18=r0                      /* make sure r18 isn't NaT */
+       ;;
 .mem.offset 0,0; st8.spill [r16]=r8,16
 .mem.offset 8,0; st8.spill [r17]=r9,16
         ;;
 .mem.offset 0,0; st8.spill [r16]=r10,24
 .mem.offset 8,0; st8.spill [r17]=r11,24
         ;;
-    st8 [r16]=r28,16           /* save cr.iip */
-    st8 [r17]=r30,16           /* save cr.ifs */
-    mov r8=ar.fpsr             /* M */
-    mov r9=ar.csd
-    mov r10=ar.ssd
-    movl r11=FPSR_DEFAULT      /* L-unit */
-    ;;
-    st8 [r16]=r25,16           /* save ar.unat */
-    st8 [r17]=r26,16           /* save ar.pfs */
-    shl r18=r18,16             /* compute ar.rsc to be used for "loadrs" */
-    ;;
-    st8 [r16]=r27,16           /* save ar.rsc */
-    adds r17=16,r17            /* skip over ar_rnat field */
-    ;;
-    st8 [r17]=r31,16           /* save predicates */
-    adds r16=16,r16            /* skip over ar_bspstore field */
-    ;;
-    st8 [r16]=r29,16           /* save b0 */
-    st8 [r17]=r18,16           /* save ar.rsc value for "loadrs" */
-    ;;
+       st8 [r16]=r28,16                /* save cr.iip */
+       st8 [r17]=r30,16                /* save cr.ifs */
+       mov r8=ar.fpsr          /* M */
+       mov r9=ar.csd
+       mov r10=ar.ssd
+       movl r11=FPSR_DEFAULT   /* L-unit */
+       ;;
+       st8 [r16]=r25,16                /* save ar.unat */
+       st8 [r17]=r26,16                /* save ar.pfs */
+       shl r18=r18,16          /* compute ar.rsc to be used for "loadrs" */
+       ;;
+       st8 [r16]=r27,16                /* save ar.rsc */
+       adds r17=16,r17         /* skip over ar_rnat field */
+       ;;
+       st8 [r17]=r31,16                /* save predicates */
+       adds r16=16,r16         /* skip over ar_bspstore field */
+       ;;
+       st8 [r16]=r29,16                /* save b0 */
+       st8 [r17]=r18,16                /* save ar.rsc value for "loadrs" */
+       ;;
 .mem.offset 0,0; st8.spill [r16]=r20,16    /* save original r1 */
 .mem.offset 8,0; st8.spill [r17]=r12,16
-    adds r12=-16,r1
-    /* switch to kernel memory stack (with 16 bytes of scratch) */
-    ;;
+       adds r12=-16,r1
+       /* switch to kernel memory stack (with 16 bytes of scratch) */
+       ;;
 .mem.offset 0,0; st8.spill [r16]=r13,16
 .mem.offset 8,0; st8.spill [r17]=r8,16 /* save ar.fpsr */
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r16]=r15,16
 .mem.offset 8,0; st8.spill [r17]=r14,16
-    dep r14=-1,r0,60,4
-    ;;
+       dep r14=-1,r0,60,4
+       ;;
 .mem.offset 0,0; st8.spill [r16]=r2,16
 .mem.offset 8,0; st8.spill [r17]=r3,16
-    adds r2=VMM_PT_REGS_R16_OFFSET,r1
-    adds r14 = VMM_VCPU_GP_OFFSET,r13
-    ;;
-    mov r8=ar.ccv
-    ld8 r14 = [r14]
-    ;;
-    mov r1=r14       /* establish kernel global pointer */
-    ;;                                          \
-    bsw.1
-    ;;
-    alloc r14=ar.pfs,0,0,1,0   // must be first in an insn group
-    mov out0=r13
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i
-    ;;
-    //(p15) ssm psr.i
-    adds r3=8,r2               // set up second base pointer for SAVE_REST
-    srlz.i                     // ensure everybody knows psr.ic is back on
-    ;;
+       adds r2=VMM_PT_REGS_R16_OFFSET,r1
+       adds r14 = VMM_VCPU_GP_OFFSET,r13
+       ;;
+       mov r8=ar.ccv
+       ld8 r14 = [r14]
+       ;;
+       mov r1=r14       /* establish kernel global pointer */
+       ;;                                          \
+       bsw.1
+       ;;
+       alloc r14=ar.pfs,0,0,1,0        // must be first in an insn group
+       mov out0=r13
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i
+       ;;
+       //(p15) ssm psr.i
+       adds r3=8,r2            // set up second base pointer for SAVE_REST
+       srlz.i                  // ensure everybody knows psr.ic is back on
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r16,16
 .mem.offset 8,0; st8.spill [r3]=r17,16
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r18,16
 .mem.offset 8,0; st8.spill [r3]=r19,16
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r20,16
 .mem.offset 8,0; st8.spill [r3]=r21,16
-    mov r18=b6
-    ;;
+       mov r18=b6
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r22,16
 .mem.offset 8,0; st8.spill [r3]=r23,16
-    mov r19=b7
-    ;;
+       mov r19=b7
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r24,16
 .mem.offset 8,0; st8.spill [r3]=r25,16
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r26,16
 .mem.offset 8,0; st8.spill [r3]=r27,16
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r28,16
 .mem.offset 8,0; st8.spill [r3]=r29,16
-    ;;
+       ;;
 .mem.offset 0,0; st8.spill [r2]=r30,16
 .mem.offset 8,0; st8.spill [r3]=r31,32
-    ;;
-    mov ar.fpsr=r11       /* M-unit */
-    st8 [r2]=r8,8         /* ar.ccv */
-    adds r24=PT(B6)-PT(F7),r3
-    ;;
-    stf.spill [r2]=f6,32
-    stf.spill [r3]=f7,32
-    ;;
-    stf.spill [r2]=f8,32
-    stf.spill [r3]=f9,32
-    ;;
-    stf.spill [r2]=f10
-    stf.spill [r3]=f11
-    adds r25=PT(B7)-PT(F11),r3
-    ;;
-    st8 [r24]=r18,16       /* b6 */
-    st8 [r25]=r19,16       /* b7 */
-    ;;
-    st8 [r24]=r9           /* ar.csd */
-    st8 [r25]=r10          /* ar.ssd */
-    ;;
-    srlz.d             // make sure we see the effect of cr.ivr
-    addl r14=@gprel(ia64_leave_nested),gp
-    ;;
-    mov rp=r14
-    br.call.sptk.many b6=kvm_ia64_handle_irq
-    ;;
+       ;;
+       mov ar.fpsr=r11       /* M-unit */
+       st8 [r2]=r8,8         /* ar.ccv */
+       adds r24=PT(B6)-PT(F7),r3
+       ;;
+       stf.spill [r2]=f6,32
+       stf.spill [r3]=f7,32
+       ;;
+       stf.spill [r2]=f8,32
+       stf.spill [r3]=f9,32
+       ;;
+       stf.spill [r2]=f10
+       stf.spill [r3]=f11
+       adds r25=PT(B7)-PT(F11),r3
+       ;;
+       st8 [r24]=r18,16       /* b6 */
+       st8 [r25]=r19,16       /* b7 */
+       ;;
+       st8 [r24]=r9           /* ar.csd */
+       st8 [r25]=r10          /* ar.ssd */
+       ;;
+       srlz.d          // make sure we see the effect of cr.ivr
+       addl r14=@gprel(ia64_leave_nested),gp
+       ;;
+       mov rp=r14
+       br.call.sptk.many b6=kvm_ia64_handle_irq
+       ;;
 END(kvm_interrupt)
 
     .global kvm_dispatch_vexirq
@@ -414,387 +420,385 @@ END(kvm_interrupt)
 //////////////////////////////////////////////////////////////////////
 // 0x3400 Entry 13 (size 64 bundles) Reserved
 ENTRY(kvm_virtual_exirq)
-    mov r31=pr
-    mov r19=13
-    mov r30 =r0
-    ;;
+       mov r31=pr
+       mov r19=13
+       mov r30 =r0
+       ;;
 kvm_dispatch_vexirq:
-    cmp.eq p6,p0 = 1,r30
-    ;;
-(p6)add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-(p6)ld8 r1 = [r29]
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,1,0
-    mov out0=r13
-
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    KVM_SAVE_REST
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    mov rp=r14
-    br.call.sptk.many b6=kvm_vexirq
+       cmp.eq p6,p0 = 1,r30
+       ;;
+(p6)   add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
+       ;;
+(p6)   ld8 r1 = [r29]
+       ;;
+       KVM_SAVE_MIN_WITH_COVER_R19
+       alloc r14=ar.pfs,0,0,1,0
+       mov out0=r13
+
+       ssm psr.ic
+       ;;
+       srlz.i // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       KVM_SAVE_REST
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       mov rp=r14
+       br.call.sptk.many b6=kvm_vexirq
 END(kvm_virtual_exirq)
 
     .org kvm_ia64_ivt+0x3800
 /////////////////////////////////////////////////////////////////////
 // 0x3800 Entry 14 (size 64 bundles) Reserved
-    KVM_FAULT(14)
-    // this code segment is from 2.6.16.13
-
+       KVM_FAULT(14)
+       // this code segment is from 2.6.16.13
 
     .org kvm_ia64_ivt+0x3c00
 ///////////////////////////////////////////////////////////////////////
 // 0x3c00 Entry 15 (size 64 bundles) Reserved
-    KVM_FAULT(15)
-
+       KVM_FAULT(15)
 
     .org kvm_ia64_ivt+0x4000
 ///////////////////////////////////////////////////////////////////////
 // 0x4000 Entry 16 (size 64 bundles) Reserved
-    KVM_FAULT(16)
+       KVM_FAULT(16)
 
     .org kvm_ia64_ivt+0x4400
 //////////////////////////////////////////////////////////////////////
 // 0x4400 Entry 17 (size 64 bundles) Reserved
-    KVM_FAULT(17)
+       KVM_FAULT(17)
 
     .org kvm_ia64_ivt+0x4800
 //////////////////////////////////////////////////////////////////////
 // 0x4800 Entry 18 (size 64 bundles) Reserved
-    KVM_FAULT(18)
+       KVM_FAULT(18)
 
     .org kvm_ia64_ivt+0x4c00
 //////////////////////////////////////////////////////////////////////
 // 0x4c00 Entry 19 (size 64 bundles) Reserved
-    KVM_FAULT(19)
+       KVM_FAULT(19)
 
     .org kvm_ia64_ivt+0x5000
 //////////////////////////////////////////////////////////////////////
 // 0x5000 Entry 20 (size 16 bundles) Page Not Present
 ENTRY(kvm_page_not_present)
-    KVM_REFLECT(20)
+       KVM_REFLECT(20)
 END(kvm_page_not_present)
 
     .org kvm_ia64_ivt+0x5100
 ///////////////////////////////////////////////////////////////////////
 // 0x5100 Entry 21 (size 16 bundles) Key Permission vector
 ENTRY(kvm_key_permission)
-    KVM_REFLECT(21)
+       KVM_REFLECT(21)
 END(kvm_key_permission)
 
     .org kvm_ia64_ivt+0x5200
 //////////////////////////////////////////////////////////////////////
 // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
 ENTRY(kvm_iaccess_rights)
-    KVM_REFLECT(22)
+       KVM_REFLECT(22)
 END(kvm_iaccess_rights)
 
     .org kvm_ia64_ivt+0x5300
 //////////////////////////////////////////////////////////////////////
 // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
 ENTRY(kvm_daccess_rights)
-    KVM_REFLECT(23)
+       KVM_REFLECT(23)
 END(kvm_daccess_rights)
 
     .org kvm_ia64_ivt+0x5400
 /////////////////////////////////////////////////////////////////////
 // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
 ENTRY(kvm_general_exception)
-   KVM_REFLECT(24)
-   KVM_FAULT(24)
+       KVM_REFLECT(24)
+       KVM_FAULT(24)
 END(kvm_general_exception)
 
     .org kvm_ia64_ivt+0x5500
 //////////////////////////////////////////////////////////////////////
 // 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
 ENTRY(kvm_disabled_fp_reg)
-    KVM_REFLECT(25)
+       KVM_REFLECT(25)
 END(kvm_disabled_fp_reg)
 
     .org kvm_ia64_ivt+0x5600
 ////////////////////////////////////////////////////////////////////
 // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
 ENTRY(kvm_nat_consumption)
-    KVM_REFLECT(26)
+       KVM_REFLECT(26)
 END(kvm_nat_consumption)
 
     .org kvm_ia64_ivt+0x5700
 /////////////////////////////////////////////////////////////////////
 // 0x5700 Entry 27 (size 16 bundles) Speculation (40)
 ENTRY(kvm_speculation_vector)
-    KVM_REFLECT(27)
+       KVM_REFLECT(27)
 END(kvm_speculation_vector)
 
     .org kvm_ia64_ivt+0x5800
 /////////////////////////////////////////////////////////////////////
 // 0x5800 Entry 28 (size 16 bundles) Reserved
-    KVM_FAULT(28)
+       KVM_FAULT(28)
 
     .org kvm_ia64_ivt+0x5900
 ///////////////////////////////////////////////////////////////////
 // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
 ENTRY(kvm_debug_vector)
-    KVM_FAULT(29)
+       KVM_FAULT(29)
 END(kvm_debug_vector)
 
     .org kvm_ia64_ivt+0x5a00
 ///////////////////////////////////////////////////////////////
 // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
 ENTRY(kvm_unaligned_access)
-    KVM_REFLECT(30)
+       KVM_REFLECT(30)
 END(kvm_unaligned_access)
 
     .org kvm_ia64_ivt+0x5b00
 //////////////////////////////////////////////////////////////////////
 // 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
 ENTRY(kvm_unsupported_data_reference)
-    KVM_REFLECT(31)
+       KVM_REFLECT(31)
 END(kvm_unsupported_data_reference)
 
     .org kvm_ia64_ivt+0x5c00
 ////////////////////////////////////////////////////////////////////
 // 0x5c00 Entry 32 (size 16 bundles) Floating Point FAULT (65)
 ENTRY(kvm_floating_point_fault)
-    KVM_REFLECT(32)
+       KVM_REFLECT(32)
 END(kvm_floating_point_fault)
 
     .org kvm_ia64_ivt+0x5d00
 /////////////////////////////////////////////////////////////////////
 // 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
 ENTRY(kvm_floating_point_trap)
-    KVM_REFLECT(33)
+       KVM_REFLECT(33)
 END(kvm_floating_point_trap)
 
     .org kvm_ia64_ivt+0x5e00
 //////////////////////////////////////////////////////////////////////
 // 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
 ENTRY(kvm_lower_privilege_trap)
-    KVM_REFLECT(34)
+       KVM_REFLECT(34)
 END(kvm_lower_privilege_trap)
 
     .org kvm_ia64_ivt+0x5f00
 //////////////////////////////////////////////////////////////////////
 // 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
 ENTRY(kvm_taken_branch_trap)
-    KVM_REFLECT(35)
+       KVM_REFLECT(35)
 END(kvm_taken_branch_trap)
 
     .org kvm_ia64_ivt+0x6000
 ////////////////////////////////////////////////////////////////////
 // 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
 ENTRY(kvm_single_step_trap)
-    KVM_REFLECT(36)
+       KVM_REFLECT(36)
 END(kvm_single_step_trap)
     .global kvm_virtualization_fault_back
     .org kvm_ia64_ivt+0x6100
 /////////////////////////////////////////////////////////////////////
 // 0x6100 Entry 37 (size 16 bundles) Virtualization Fault
 ENTRY(kvm_virtualization_fault)
-    mov r31=pr
-    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-    st8 [r16] = r1
-    adds r17 = VMM_VCPU_GP_OFFSET, r21
-    ;;
-    ld8 r1 = [r17]
-    cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
-    cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
-    cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
-    cmp.eq p9,p0=EVENT_RSM,r24
-    cmp.eq p10,p0=EVENT_SSM,r24
-    cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
-    cmp.eq p12,p0=EVENT_THASH,r24
-    (p6) br.dptk.many kvm_asm_mov_from_ar
-    (p7) br.dptk.many kvm_asm_mov_from_rr
-    (p8) br.dptk.many kvm_asm_mov_to_rr
-    (p9) br.dptk.many kvm_asm_rsm
-    (p10) br.dptk.many kvm_asm_ssm
-    (p11) br.dptk.many kvm_asm_mov_to_psr
-    (p12) br.dptk.many kvm_asm_thash
-    ;;
+       mov r31=pr
+       adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+       ;;
+       st8 [r16] = r1
+       adds r17 = VMM_VCPU_GP_OFFSET, r21
+       ;;
+       ld8 r1 = [r17]
+       cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
+       cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
+       cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
+       cmp.eq p9,p0=EVENT_RSM,r24
+       cmp.eq p10,p0=EVENT_SSM,r24
+       cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
+       cmp.eq p12,p0=EVENT_THASH,r24
+(p6)   br.dptk.many kvm_asm_mov_from_ar
+(p7)   br.dptk.many kvm_asm_mov_from_rr
+(p8)   br.dptk.many kvm_asm_mov_to_rr
+(p9)   br.dptk.many kvm_asm_rsm
+(p10)  br.dptk.many kvm_asm_ssm
+(p11)  br.dptk.many kvm_asm_mov_to_psr
+(p12)  br.dptk.many kvm_asm_thash
+       ;;
 kvm_virtualization_fault_back:
-    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-    ld8 r1 = [r16]
-    ;;
-    mov r19=37
-    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
-    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
-    ;;
-    st8 [r16] = r24
-    st8 [r17] = r25
-    ;;
-    cmp.ne p6,p0=EVENT_RFI, r24
-    (p6) br.sptk kvm_dispatch_virtualization_fault
-    ;;
-    adds r18=VMM_VPD_BASE_OFFSET,r21
-    ;;
-    ld8 r18=[r18]
-    ;;
-    adds r18=VMM_VPD_VIFS_OFFSET,r18
-    ;;
-    ld8 r18=[r18]
-    ;;
-    tbit.z p6,p0=r18,63
-    (p6) br.sptk kvm_dispatch_virtualization_fault
-    ;;
-    //if vifs.v=1 desert current register frame
-    alloc r18=ar.pfs,0,0,0,0
-    br.sptk kvm_dispatch_virtualization_fault
+       adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+       ;;
+       ld8 r1 = [r16]
+       ;;
+       mov r19=37
+       adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+       adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+       ;;
+       st8 [r16] = r24
+       st8 [r17] = r25
+       ;;
+       cmp.ne p6,p0=EVENT_RFI, r24
+(p6)   br.sptk kvm_dispatch_virtualization_fault
+       ;;
+       adds r18=VMM_VPD_BASE_OFFSET,r21
+       ;;
+       ld8 r18=[r18]
+       ;;
+       adds r18=VMM_VPD_VIFS_OFFSET,r18
+       ;;
+       ld8 r18=[r18]
+       ;;
+       tbit.z p6,p0=r18,63
+(p6)   br.sptk kvm_dispatch_virtualization_fault
+       ;;
+//if vifs.v=1 desert current register frame
+       alloc r18=ar.pfs,0,0,0,0
+       br.sptk kvm_dispatch_virtualization_fault
 END(kvm_virtualization_fault)
 
     .org kvm_ia64_ivt+0x6200
 //////////////////////////////////////////////////////////////
 // 0x6200 Entry 38 (size 16 bundles) Reserved
-    KVM_FAULT(38)
+       KVM_FAULT(38)
 
     .org kvm_ia64_ivt+0x6300
 /////////////////////////////////////////////////////////////////
 // 0x6300 Entry 39 (size 16 bundles) Reserved
-    KVM_FAULT(39)
+       KVM_FAULT(39)
 
     .org kvm_ia64_ivt+0x6400
 /////////////////////////////////////////////////////////////////
 // 0x6400 Entry 40 (size 16 bundles) Reserved
-    KVM_FAULT(40)
+       KVM_FAULT(40)
 
     .org kvm_ia64_ivt+0x6500
 //////////////////////////////////////////////////////////////////
 // 0x6500 Entry 41 (size 16 bundles) Reserved
-    KVM_FAULT(41)
+       KVM_FAULT(41)
 
     .org kvm_ia64_ivt+0x6600
 //////////////////////////////////////////////////////////////////
 // 0x6600 Entry 42 (size 16 bundles) Reserved
-    KVM_FAULT(42)
+       KVM_FAULT(42)
 
     .org kvm_ia64_ivt+0x6700
 //////////////////////////////////////////////////////////////////
 // 0x6700 Entry 43 (size 16 bundles) Reserved
-    KVM_FAULT(43)
+       KVM_FAULT(43)
 
     .org kvm_ia64_ivt+0x6800
 //////////////////////////////////////////////////////////////////
 // 0x6800 Entry 44 (size 16 bundles) Reserved
-    KVM_FAULT(44)
+       KVM_FAULT(44)
 
     .org kvm_ia64_ivt+0x6900
 ///////////////////////////////////////////////////////////////////
 // 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception
 //(17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
 ENTRY(kvm_ia32_exception)
-    KVM_FAULT(45)
+       KVM_FAULT(45)
 END(kvm_ia32_exception)
 
     .org kvm_ia64_ivt+0x6a00
 ////////////////////////////////////////////////////////////////////
 // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
 ENTRY(kvm_ia32_intercept)
-    KVM_FAULT(47)
+       KVM_FAULT(47)
 END(kvm_ia32_intercept)
 
     .org kvm_ia64_ivt+0x6c00
 /////////////////////////////////////////////////////////////////////
 // 0x6c00 Entry 48 (size 16 bundles) Reserved
-    KVM_FAULT(48)
+       KVM_FAULT(48)
 
     .org kvm_ia64_ivt+0x6d00
 //////////////////////////////////////////////////////////////////////
 // 0x6d00 Entry 49 (size 16 bundles) Reserved
-    KVM_FAULT(49)
+       KVM_FAULT(49)
 
     .org kvm_ia64_ivt+0x6e00
 //////////////////////////////////////////////////////////////////////
 // 0x6e00 Entry 50 (size 16 bundles) Reserved
-    KVM_FAULT(50)
+       KVM_FAULT(50)
 
     .org kvm_ia64_ivt+0x6f00
 /////////////////////////////////////////////////////////////////////
 // 0x6f00 Entry 51 (size 16 bundles) Reserved
-    KVM_FAULT(52)
+       KVM_FAULT(52)
 
     .org kvm_ia64_ivt+0x7100
 ////////////////////////////////////////////////////////////////////
 // 0x7100 Entry 53 (size 16 bundles) Reserved
-    KVM_FAULT(53)
+       KVM_FAULT(53)
 
     .org kvm_ia64_ivt+0x7200
 /////////////////////////////////////////////////////////////////////
 // 0x7200 Entry 54 (size 16 bundles) Reserved
-    KVM_FAULT(54)
+       KVM_FAULT(54)
 
     .org kvm_ia64_ivt+0x7300
 ////////////////////////////////////////////////////////////////////
 // 0x7300 Entry 55 (size 16 bundles) Reserved
-    KVM_FAULT(55)
+       KVM_FAULT(55)
 
     .org kvm_ia64_ivt+0x7400
 ////////////////////////////////////////////////////////////////////
 // 0x7400 Entry 56 (size 16 bundles) Reserved
-    KVM_FAULT(56)
+       KVM_FAULT(56)
 
     .org kvm_ia64_ivt+0x7500
 /////////////////////////////////////////////////////////////////////
 // 0x7500 Entry 57 (size 16 bundles) Reserved
-    KVM_FAULT(57)
+       KVM_FAULT(57)
 
     .org kvm_ia64_ivt+0x7600
 /////////////////////////////////////////////////////////////////////
 // 0x7600 Entry 58 (size 16 bundles) Reserved
-    KVM_FAULT(58)
+       KVM_FAULT(58)
 
     .org kvm_ia64_ivt+0x7700
 ////////////////////////////////////////////////////////////////////
 // 0x7700 Entry 59 (size 16 bundles) Reserved
-    KVM_FAULT(59)
+       KVM_FAULT(59)
 
     .org kvm_ia64_ivt+0x7800
 ////////////////////////////////////////////////////////////////////
 // 0x7800 Entry 60 (size 16 bundles) Reserved
-    KVM_FAULT(60)
+       KVM_FAULT(60)
 
     .org kvm_ia64_ivt+0x7900
 /////////////////////////////////////////////////////////////////////
 // 0x7900 Entry 61 (size 16 bundles) Reserved
-    KVM_FAULT(61)
+       KVM_FAULT(61)
 
     .org kvm_ia64_ivt+0x7a00
 /////////////////////////////////////////////////////////////////////
 // 0x7a00 Entry 62 (size 16 bundles) Reserved
-    KVM_FAULT(62)
+       KVM_FAULT(62)
 
     .org kvm_ia64_ivt+0x7b00
 /////////////////////////////////////////////////////////////////////
 // 0x7b00 Entry 63 (size 16 bundles) Reserved
-    KVM_FAULT(63)
+       KVM_FAULT(63)
 
     .org kvm_ia64_ivt+0x7c00
 ////////////////////////////////////////////////////////////////////
 // 0x7c00 Entry 64 (size 16 bundles) Reserved
-    KVM_FAULT(64)
+       KVM_FAULT(64)
 
     .org kvm_ia64_ivt+0x7d00
 /////////////////////////////////////////////////////////////////////
 // 0x7d00 Entry 65 (size 16 bundles) Reserved
-    KVM_FAULT(65)
+       KVM_FAULT(65)
 
     .org kvm_ia64_ivt+0x7e00
 /////////////////////////////////////////////////////////////////////
 // 0x7e00 Entry 66 (size 16 bundles) Reserved
-    KVM_FAULT(66)
+       KVM_FAULT(66)
 
     .org kvm_ia64_ivt+0x7f00
 ////////////////////////////////////////////////////////////////////
 // 0x7f00 Entry 67 (size 16 bundles) Reserved
-    KVM_FAULT(67)
+       KVM_FAULT(67)
 
     .org kvm_ia64_ivt+0x8000
 // There is no particular reason for this code to be here, other than that
@@ -804,132 +808,128 @@ END(kvm_ia32_intercept)
 
 
 ENTRY(kvm_dtlb_miss_dispatch)
-    mov r19 = 2
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,3,0
-    mov out0=cr.ifa
-    mov out1=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
-    ;;
-    KVM_SAVE_REST
-    KVM_SAVE_EXTRA
-    mov rp=r14
-    ;;
-    adds out2=16,r12
-    br.call.sptk.many b6=kvm_page_fault
+       mov r19 = 2
+       KVM_SAVE_MIN_WITH_COVER_R19
+       alloc r14=ar.pfs,0,0,3,0
+       mov out0=cr.ifa
+       mov out1=r15
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i     // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+       ;;
+       KVM_SAVE_REST
+       KVM_SAVE_EXTRA
+       mov rp=r14
+       ;;
+       adds out2=16,r12
+       br.call.sptk.many b6=kvm_page_fault
 END(kvm_dtlb_miss_dispatch)
 
 ENTRY(kvm_itlb_miss_dispatch)
 
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,3,0
-    mov out0=cr.ifa
-    mov out1=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out2=16,r12
-    br.call.sptk.many b6=kvm_page_fault
+       KVM_SAVE_MIN_WITH_COVER_R19
+       alloc r14=ar.pfs,0,0,3,0
+       mov out0=cr.ifa
+       mov out1=r15
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i   // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       KVM_SAVE_REST
+       mov rp=r14
+       ;;
+       adds out2=16,r12
+       br.call.sptk.many b6=kvm_page_fault
 END(kvm_itlb_miss_dispatch)
 
 ENTRY(kvm_dispatch_reflection)
-    /*
    * Input:
    *  psr.ic: off
    *  r19:    intr type (offset into ivt, see ia64_int.h)
    *  r31:    contains saved predicates (pr)
    */
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,5,0
-    mov out0=cr.ifa
-    mov out1=cr.isr
-    mov out2=cr.iim
-    mov out3=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out4=16,r12
-    br.call.sptk.many b6=reflect_interruption
+/*
+ * Input:
+ *  psr.ic: off
+ *  r19:    intr type (offset into ivt, see ia64_int.h)
+ *  r31:    contains saved predicates (pr)
+ */
+       KVM_SAVE_MIN_WITH_COVER_R19
+       alloc r14=ar.pfs,0,0,5,0
+       mov out0=cr.ifa
+       mov out1=cr.isr
+       mov out2=cr.iim
+       mov out3=r15
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i   // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       KVM_SAVE_REST
+       mov rp=r14
+       ;;
+       adds out4=16,r12
+       br.call.sptk.many b6=reflect_interruption
 END(kvm_dispatch_reflection)
 
 ENTRY(kvm_dispatch_virtualization_fault)
-    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
-    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
-    ;;
-    st8 [r16] = r24
-    st8 [r17] = r25
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    ;;
-    alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
-    mov out0=r13        //vcpu
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
-    ;;
-    KVM_SAVE_REST
-    KVM_SAVE_EXTRA
-    mov rp=r14
-    ;;
-    adds out1=16,sp         //regs
-    br.call.sptk.many b6=kvm_emulate
+       adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+       adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+       ;;
+       st8 [r16] = r24
+       st8 [r17] = r25
+       ;;
+       KVM_SAVE_MIN_WITH_COVER_R19
+       ;;
+       alloc r14=ar.pfs,0,0,2,0 // (must be first in insn group!)
+       mov out0=r13        //vcpu
+       adds r3=8,r2                // set up second base pointer
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i    // guarantee that interruption collection is on
+       ;;
+       //(p15) ssm psr.i               // restore psr.i
+       addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+       ;;
+       KVM_SAVE_REST
+       KVM_SAVE_EXTRA
+       mov rp=r14
+       ;;
+       adds out1=16,sp         //regs
+       br.call.sptk.many b6=kvm_emulate
 END(kvm_dispatch_virtualization_fault)
 
 
 ENTRY(kvm_dispatch_interrupt)
-    KVM_SAVE_MIN_WITH_COVER_R19        // uses r31; defines r2 and r3
-    ;;
-    alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
-    //mov out0=cr.ivr          // pass cr.ivr as first arg
-    adds r3=8,r2               // set up second base pointer for SAVE_REST
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i
-    ;;
-    //(p15) ssm psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    mov out0=r13               // pass pointer to pt_regs as second arg
-    br.call.sptk.many b6=kvm_ia64_handle_irq
+       KVM_SAVE_MIN_WITH_COVER_R19     // uses r31; defines r2 and r3
+       ;;
+       alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
+       adds r3=8,r2            // set up second base pointer for SAVE_REST
+       ;;
+       ssm psr.ic
+       ;;
+       srlz.i
+       ;;
+       //(p15) ssm psr.i
+       addl r14=@gprel(ia64_leave_hypervisor),gp
+       ;;
+       KVM_SAVE_REST
+       mov rp=r14
+       ;;
+       mov out0=r13            // pass pointer to pt_regs as second arg
+       br.call.sptk.many b6=kvm_ia64_handle_irq
 END(kvm_dispatch_interrupt)
 
-
-
-
 GLOBAL_ENTRY(ia64_leave_nested)
        rsm psr.i
        ;;
@@ -1008,7 +1008,7 @@ GLOBAL_ENTRY(ia64_leave_nested)
        ;;
        ldf.fill f11=[r2]
 //     mov r18=r13
-//    mov r21=r13
+//     mov r21=r13
        adds r16=PT(CR_IPSR)+16,r12
        adds r17=PT(CR_IIP)+16,r12
        ;;
@@ -1058,138 +1058,135 @@ GLOBAL_ENTRY(ia64_leave_nested)
        rfi
 END(ia64_leave_nested)
 
-
-
 GLOBAL_ENTRY(ia64_leave_hypervisor_prepare)
-    /*
-     * work.need_resched etc. mustn't get changed
-     *by this CPU before it returns to
-    ;;
-     * user- or fsys-mode, hence we disable interrupts early on:
-     */
-    adds r2 = PT(R4)+16,r12
-    adds r3 = PT(R5)+16,r12
-    adds r8 = PT(EML_UNAT)+16,r12
-    ;;
-    ld8 r8 = [r8]
-    ;;
-    mov ar.unat=r8
-    ;;
-    ld8.fill r4=[r2],16    //load r4
-    ld8.fill r5=[r3],16    //load r5
-    ;;
-    ld8.fill r6=[r2]    //load r6
-    ld8.fill r7=[r3]    //load r7
-    ;;
+/*
+ * work.need_resched etc. mustn't get changed
+ *by this CPU before it returns to
+ * user- or fsys-mode, hence we disable interrupts early on:
+ */
+       adds r2 = PT(R4)+16,r12
+       adds r3 = PT(R5)+16,r12
+       adds r8 = PT(EML_UNAT)+16,r12
+       ;;
+       ld8 r8 = [r8]
+       ;;
+       mov ar.unat=r8
+       ;;
+       ld8.fill r4=[r2],16    //load r4
+       ld8.fill r5=[r3],16    //load r5
+       ;;
+       ld8.fill r6=[r2]    //load r6
+       ld8.fill r7=[r3]    //load r7
+       ;;
 END(ia64_leave_hypervisor_prepare)
 //fall through
 GLOBAL_ENTRY(ia64_leave_hypervisor)
-    rsm psr.i
-    ;;
-    br.call.sptk.many b0=leave_hypervisor_tail
-    ;;
-    adds r20=PT(PR)+16,r12
-    adds r8=PT(EML_UNAT)+16,r12
-    ;;
-    ld8 r8=[r8]
-    ;;
-    mov ar.unat=r8
-    ;;
-    lfetch [r20],PT(CR_IPSR)-PT(PR)
-    adds r2 = PT(B6)+16,r12
-    adds r3 = PT(B7)+16,r12
-    ;;
-    lfetch [r20]
-    ;;
-    ld8 r24=[r2],16        /* B6 */
-    ld8 r25=[r3],16        /* B7 */
-    ;;
-    ld8 r26=[r2],16        /* ar_csd */
-    ld8 r27=[r3],16        /* ar_ssd */
-    mov b6 = r24
-    ;;
-    ld8.fill r8=[r2],16
-    ld8.fill r9=[r3],16
-    mov b7 = r25
-    ;;
-    mov ar.csd = r26
-    mov ar.ssd = r27
-    ;;
-    ld8.fill r10=[r2],PT(R15)-PT(R10)
-    ld8.fill r11=[r3],PT(R14)-PT(R11)
-    ;;
-    ld8.fill r15=[r2],PT(R16)-PT(R15)
-    ld8.fill r14=[r3],PT(R17)-PT(R14)
-    ;;
-    ld8.fill r16=[r2],16
-    ld8.fill r17=[r3],16
-    ;;
-    ld8.fill r18=[r2],16
-    ld8.fill r19=[r3],16
-    ;;
-    ld8.fill r20=[r2],16
-    ld8.fill r21=[r3],16
-    ;;
-    ld8.fill r22=[r2],16
-    ld8.fill r23=[r3],16
-    ;;
-    ld8.fill r24=[r2],16
-    ld8.fill r25=[r3],16
-    ;;
-    ld8.fill r26=[r2],16
-    ld8.fill r27=[r3],16
-    ;;
-    ld8.fill r28=[r2],16
-    ld8.fill r29=[r3],16
-    ;;
-    ld8.fill r30=[r2],PT(F6)-PT(R30)
-    ld8.fill r31=[r3],PT(F7)-PT(R31)
-    ;;
-    rsm psr.i | psr.ic
-    // initiate turning off of interrupt and interruption collection
-    invala          // invalidate ALAT
-    ;;
-    srlz.i          // ensure interruption collection is off
-    ;;
-    bsw.0
-    ;;
-    adds r16 = PT(CR_IPSR)+16,r12
-    adds r17 = PT(CR_IIP)+16,r12
-    mov r21=r13                // get current
-    ;;
-    ld8 r31=[r16],16    // load cr.ipsr
-    ld8 r30=[r17],16    // load cr.iip
-    ;;
-    ld8 r29=[r16],16    // load cr.ifs
-    ld8 r28=[r17],16    // load ar.unat
-    ;;
-    ld8 r27=[r16],16    // load ar.pfs
-    ld8 r26=[r17],16    // load ar.rsc
-    ;;
-    ld8 r25=[r16],16    // load ar.rnat
-    ld8 r24=[r17],16    // load ar.bspstore
-    ;;
-    ld8 r23=[r16],16    // load predicates
-    ld8 r22=[r17],16    // load b0
-    ;;
-    ld8 r20=[r16],16    // load ar.rsc value for "loadrs"
-    ld8.fill r1=[r17],16    //load r1
-    ;;
-    ld8.fill r12=[r16],16    //load r12
-    ld8.fill r13=[r17],PT(R2)-PT(R13)    //load r13
-    ;;
-    ld8 r19=[r16],PT(R3)-PT(AR_FPSR)    //load ar_fpsr
-    ld8.fill r2=[r17],PT(AR_CCV)-PT(R2)    //load r2
-    ;;
-    ld8.fill r3=[r16]  //load r3
-    ld8 r18=[r17]      //load ar_ccv
-    ;;
-    mov ar.fpsr=r19
-    mov ar.ccv=r18
-    shr.u r18=r20,16
-    ;;
+       rsm psr.i
+       ;;
+       br.call.sptk.many b0=leave_hypervisor_tail
+       ;;
+       adds r20=PT(PR)+16,r12
+       adds r8=PT(EML_UNAT)+16,r12
+       ;;
+       ld8 r8=[r8]
+       ;;
+       mov ar.unat=r8
+       ;;
+       lfetch [r20],PT(CR_IPSR)-PT(PR)
+       adds r2 = PT(B6)+16,r12
+       adds r3 = PT(B7)+16,r12
+       ;;
+       lfetch [r20]
+       ;;
+       ld8 r24=[r2],16        /* B6 */
+       ld8 r25=[r3],16        /* B7 */
+       ;;
+       ld8 r26=[r2],16        /* ar_csd */
+       ld8 r27=[r3],16        /* ar_ssd */
+       mov b6 = r24
+       ;;
+       ld8.fill r8=[r2],16
+       ld8.fill r9=[r3],16
+       mov b7 = r25
+       ;;
+       mov ar.csd = r26
+       mov ar.ssd = r27
+       ;;
+       ld8.fill r10=[r2],PT(R15)-PT(R10)
+       ld8.fill r11=[r3],PT(R14)-PT(R11)
+       ;;
+       ld8.fill r15=[r2],PT(R16)-PT(R15)
+       ld8.fill r14=[r3],PT(R17)-PT(R14)
+       ;;
+       ld8.fill r16=[r2],16
+       ld8.fill r17=[r3],16
+       ;;
+       ld8.fill r18=[r2],16
+       ld8.fill r19=[r3],16
+       ;;
+       ld8.fill r20=[r2],16
+       ld8.fill r21=[r3],16
+       ;;
+       ld8.fill r22=[r2],16
+       ld8.fill r23=[r3],16
+       ;;
+       ld8.fill r24=[r2],16
+       ld8.fill r25=[r3],16
+       ;;
+       ld8.fill r26=[r2],16
+       ld8.fill r27=[r3],16
+       ;;
+       ld8.fill r28=[r2],16
+       ld8.fill r29=[r3],16
+       ;;
+       ld8.fill r30=[r2],PT(F6)-PT(R30)
+       ld8.fill r31=[r3],PT(F7)-PT(R31)
+       ;;
+       rsm psr.i | psr.ic
+       // initiate turning off of interrupt and interruption collection
+       invala          // invalidate ALAT
+       ;;
+       srlz.i          // ensure interruption collection is off
+       ;;
+       bsw.0
+       ;;
+       adds r16 = PT(CR_IPSR)+16,r12
+       adds r17 = PT(CR_IIP)+16,r12
+       mov r21=r13             // get current
+       ;;
+       ld8 r31=[r16],16    // load cr.ipsr
+       ld8 r30=[r17],16    // load cr.iip
+       ;;
+       ld8 r29=[r16],16    // load cr.ifs
+       ld8 r28=[r17],16    // load ar.unat
+       ;;
+       ld8 r27=[r16],16    // load ar.pfs
+       ld8 r26=[r17],16    // load ar.rsc
+       ;;
+       ld8 r25=[r16],16    // load ar.rnat
+       ld8 r24=[r17],16    // load ar.bspstore
+       ;;
+       ld8 r23=[r16],16    // load predicates
+       ld8 r22=[r17],16    // load b0
+       ;;
+       ld8 r20=[r16],16    // load ar.rsc value for "loadrs"
+       ld8.fill r1=[r17],16    //load r1
+       ;;
+       ld8.fill r12=[r16],16    //load r12
+       ld8.fill r13=[r17],PT(R2)-PT(R13)    //load r13
+       ;;
+       ld8 r19=[r16],PT(R3)-PT(AR_FPSR)    //load ar_fpsr
+       ld8.fill r2=[r17],PT(AR_CCV)-PT(R2)    //load r2
+       ;;
+       ld8.fill r3=[r16]       //load r3
+       ld8 r18=[r17]   //load ar_ccv
+       ;;
+       mov ar.fpsr=r19
+       mov ar.ccv=r18
+       shr.u r18=r20,16
+       ;;
 kvm_rbs_switch:
-    mov r19=96
+       mov r19=96
 
 kvm_dont_preserve_current_frame:
 /*
@@ -1201,76 +1198,76 @@ kvm_dont_preserve_current_frame:
 #   define pReturn     p7
 #   define Nregs       14
 
-    alloc loc0=ar.pfs,2,Nregs-2,2,0
-    shr.u loc1=r18,9           // RNaTslots <= floor(dirtySize / (64*8))
-    sub r19=r19,r18            // r19 = (physStackedSize + 8) - dirtySize
-    ;;
-    mov ar.rsc=r20             // load ar.rsc to be used for "loadrs"
-    shladd in0=loc1,3,r19
-    mov in1=0
-    ;;
-    TEXT_ALIGN(32)
+       alloc loc0=ar.pfs,2,Nregs-2,2,0
+       shr.u loc1=r18,9        // RNaTslots <= floor(dirtySize / (64*8))
+       sub r19=r19,r18         // r19 = (physStackedSize + 8) - dirtySize
+       ;;
+       mov ar.rsc=r20          // load ar.rsc to be used for "loadrs"
+       shladd in0=loc1,3,r19
+       mov in1=0
+       ;;
+       TEXT_ALIGN(32)
 kvm_rse_clear_invalid:
-    alloc loc0=ar.pfs,2,Nregs-2,2,0
-    cmp.lt pRecurse,p0=Nregs*8,in0
-    // if more than Nregs regs left to clear, (re)curse
-    add out0=-Nregs*8,in0
-    add out1=1,in1             // increment recursion count
-    mov loc1=0
-    mov loc2=0
-    ;;
-    mov loc3=0
-    mov loc4=0
-    mov loc5=0
-    mov loc6=0
-    mov loc7=0
+       alloc loc0=ar.pfs,2,Nregs-2,2,0
+       cmp.lt pRecurse,p0=Nregs*8,in0
+       // if more than Nregs regs left to clear, (re)curse
+       add out0=-Nregs*8,in0
+       add out1=1,in1          // increment recursion count
+       mov loc1=0
+       mov loc2=0
+       ;;
+       mov loc3=0
+       mov loc4=0
+       mov loc5=0
+       mov loc6=0
+       mov loc7=0
 (pRecurse) br.call.dptk.few b0=kvm_rse_clear_invalid
-    ;;
-    mov loc8=0
-    mov loc9=0
-    cmp.ne pReturn,p0=r0,in1
-    // if recursion count != 0, we need to do a br.ret
-    mov loc10=0
-    mov loc11=0
+       ;;
+       mov loc8=0
+       mov loc9=0
+       cmp.ne pReturn,p0=r0,in1
+       // if recursion count != 0, we need to do a br.ret
+       mov loc10=0
+       mov loc11=0
 (pReturn) br.ret.dptk.many b0
 
 #      undef pRecurse
 #      undef pReturn
 
 // loadrs has already been shifted
-    alloc r16=ar.pfs,0,0,0,0    // drop current register frame
-    ;;
-    loadrs
-    ;;
-    mov ar.bspstore=r24
-    ;;
-    mov ar.unat=r28
-    mov ar.rnat=r25
-    mov ar.rsc=r26
-    ;;
-    mov cr.ipsr=r31
-    mov cr.iip=r30
-    mov cr.ifs=r29
-    mov ar.pfs=r27
-    adds r18=VMM_VPD_BASE_OFFSET,r21
-    ;;
-    ld8 r18=[r18]   //vpd
-    adds r17=VMM_VCPU_ISR_OFFSET,r21
-    ;;
-    ld8 r17=[r17]
-    adds r19=VMM_VPD_VPSR_OFFSET,r18
-    ;;
-    ld8 r19=[r19]        //vpsr
-    mov r25=r18
-    adds r16= VMM_VCPU_GP_OFFSET,r21
-    ;;
-    ld8 r16= [r16] // Put gp in r24
-    movl r24=@gprel(ia64_vmm_entry)  // calculate return address
-    ;;
-    add  r24=r24,r16
-    ;;
-    br.sptk.many  kvm_vps_sync_write       // call the service
-    ;;
+       alloc r16=ar.pfs,0,0,0,0    // drop current register frame
+       ;;
+       loadrs
+       ;;
+       mov ar.bspstore=r24
+       ;;
+       mov ar.unat=r28
+       mov ar.rnat=r25
+       mov ar.rsc=r26
+       ;;
+       mov cr.ipsr=r31
+       mov cr.iip=r30
+       mov cr.ifs=r29
+       mov ar.pfs=r27
+       adds r18=VMM_VPD_BASE_OFFSET,r21
+       ;;
+       ld8 r18=[r18]   //vpd
+       adds r17=VMM_VCPU_ISR_OFFSET,r21
+       ;;
+       ld8 r17=[r17]
+       adds r19=VMM_VPD_VPSR_OFFSET,r18
+       ;;
+       ld8 r19=[r19]        //vpsr
+       mov r25=r18
+       adds r16= VMM_VCPU_GP_OFFSET,r21
+       ;;
+       ld8 r16= [r16] // Put gp in r24
+       movl r24=@gprel(ia64_vmm_entry)  // calculate return address
+       ;;
+       add  r24=r24,r16
+       ;;
+       br.sptk.many  kvm_vps_sync_write       // call the service
+       ;;
 END(ia64_leave_hypervisor)
 // fall through
 GLOBAL_ENTRY(ia64_vmm_entry)
@@ -1283,16 +1280,14 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r22:b0
  *  r23:predicate
  */
-    mov r24=r22
-    mov r25=r18
-    tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
-    (p1) br.cond.sptk.few kvm_vps_resume_normal
-    (p2) br.cond.sptk.many kvm_vps_resume_handler
-    ;;
+       mov r24=r22
+       mov r25=r18
+       tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+(p1)   br.cond.sptk.few kvm_vps_resume_normal
+(p2)   br.cond.sptk.many kvm_vps_resume_handler
+       ;;
 END(ia64_vmm_entry)
 
-
-
 /*
  * extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2,
  *                  u64 arg3, u64 arg4, u64 arg5,
@@ -1310,88 +1305,88 @@ psrsave =   loc2
 entry   =   loc3
 hostret =   r24
 
-    alloc   pfssave=ar.pfs,4,4,0,0
-    mov rpsave=rp
-    adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
-    ;;
-    ld8 entry=[entry]
-1:  mov hostret=ip
-    mov r25=in1         // copy arguments
-    mov r26=in2
-    mov r27=in3
-    mov psrsave=psr
-    ;;
-    tbit.nz p6,p0=psrsave,14    // IA64_PSR_I
-    tbit.nz p7,p0=psrsave,13    // IA64_PSR_IC
-    ;;
-    add hostret=2f-1b,hostret   // calculate return address
-    add entry=entry,in0
-    ;;
-    rsm psr.i | psr.ic
-    ;;
-    srlz.i
-    mov b6=entry
-    br.cond.sptk b6         // call the service
+       alloc   pfssave=ar.pfs,4,4,0,0
+       mov rpsave=rp
+       adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
+       ;;
+       ld8 entry=[entry]
+1:     mov hostret=ip
+       mov r25=in1         // copy arguments
+       mov r26=in2
+       mov r27=in3
+       mov psrsave=psr
+       ;;
+       tbit.nz p6,p0=psrsave,14    // IA64_PSR_I
+       tbit.nz p7,p0=psrsave,13    // IA64_PSR_IC
+       ;;
+       add hostret=2f-1b,hostret   // calculate return address
+       add entry=entry,in0
+       ;;
+       rsm psr.i | psr.ic
+       ;;
+       srlz.i
+       mov b6=entry
+       br.cond.sptk b6         // call the service
 2:
-    // Architectural sequence for enabling interrupts if necessary
+// Architectural sequence for enabling interrupts if necessary
 (p7)    ssm psr.ic
-    ;;
+       ;;
 (p7)    srlz.i
-    ;;
+       ;;
 //(p6)    ssm psr.i
-    ;;
-    mov rp=rpsave
-    mov ar.pfs=pfssave
-    mov r8=r31
-    ;;
-    srlz.d
-    br.ret.sptk rp
+       ;;
+       mov rp=rpsave
+       mov ar.pfs=pfssave
+       mov r8=r31
+       ;;
+       srlz.d
+       br.ret.sptk rp
 
 END(ia64_call_vsa)
 
 #define  INIT_BSPSTORE  ((4<<30)-(12<<20)-0x100)
 
 GLOBAL_ENTRY(vmm_reset_entry)
-    //set up ipsr, iip, vpd.vpsr, dcr
-    // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
-    // For DCR: all bits 0
-    bsw.0
-    ;;
-    mov r21 =r13
-    adds r14=-VMM_PT_REGS_SIZE, r12
-    ;;
-    movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
-    movl r10=0x8000000000000000
-    adds r16=PT(CR_IIP), r14
-    adds r20=PT(R1), r14
-    ;;
-    rsm psr.ic | psr.i
-    ;;
-    srlz.i
-    ;;
-    mov ar.rsc = 0
-    ;;
-    flushrs
-    ;;
-    mov ar.bspstore = 0
-    // clear BSPSTORE
-    ;;
-    mov cr.ipsr=r6
-    mov cr.ifs=r10
-    ld8 r4 = [r16] // Set init iip for first run.
-    ld8 r1 = [r20]
-    ;;
-    mov cr.iip=r4
-    adds r16=VMM_VPD_BASE_OFFSET,r13
-    ;;
-    ld8 r18=[r16]
-    ;;
-    adds r19=VMM_VPD_VPSR_OFFSET,r18
-    ;;
-    ld8 r19=[r19]
-    mov r17=r0
-    mov r22=r0
-    mov r23=r0
-    br.cond.sptk ia64_vmm_entry
-    br.ret.sptk  b0
+       //set up ipsr, iip, vpd.vpsr, dcr
+       // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
+       // For DCR: all bits 0
+       bsw.0
+       ;;
+       mov r21 =r13
+       adds r14=-VMM_PT_REGS_SIZE, r12
+       ;;
+       movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
+       movl r10=0x8000000000000000
+       adds r16=PT(CR_IIP), r14
+       adds r20=PT(R1), r14
+       ;;
+       rsm psr.ic | psr.i
+       ;;
+       srlz.i
+       ;;
+       mov ar.rsc = 0
+       ;;
+       flushrs
+       ;;
+       mov ar.bspstore = 0
+       // clear BSPSTORE
+       ;;
+       mov cr.ipsr=r6
+       mov cr.ifs=r10
+       ld8 r4 = [r16] // Set init iip for first run.
+       ld8 r1 = [r20]
+       ;;
+       mov cr.iip=r4
+       adds r16=VMM_VPD_BASE_OFFSET,r13
+       ;;
+       ld8 r18=[r16]
+       ;;
+       adds r19=VMM_VPD_VPSR_OFFSET,r18
+       ;;
+       ld8 r19=[r19]
+       mov r17=r0
+       mov r22=r0
+       mov r23=r0
+       br.cond.sptk ia64_vmm_entry
+       br.ret.sptk  b0
 END(vmm_reset_entry)
index e22b93361e082528812b2d56c1e2bd628f21f584..6b6307a3bd556a08f6eb937a9d8b083defb0a701 100644 (file)
@@ -183,8 +183,8 @@ void mark_pages_dirty(struct kvm_vcpu *v, u64 pte, u64 ps)
        u64 i, dirty_pages = 1;
        u64 base_gfn = (pte&_PAGE_PPN_MASK) >> PAGE_SHIFT;
        spinlock_t *lock = __kvm_va(v->arch.dirty_log_lock_pa);
-       void *dirty_bitmap = (void *)v - (KVM_VCPU_OFS + v->vcpu_id * VCPU_SIZE)
-                                               + KVM_MEM_DIRTY_LOG_OFS;
+       void *dirty_bitmap = (void *)KVM_MEM_DIRTY_LOG_BASE;
+
        dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
 
        vmm_spin_lock(lock);
index 0c66dbdd1d728d0cd826b865a0d0ffe0cdd8b2cc..66fd705e82c09ee1e1905356cf447526461c086a 100644 (file)
@@ -227,14 +227,14 @@ finish_up:
        return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
+static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
        struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
        nasid_t nasid;
        int slice;
 
-       nasid = cpuid_to_nasid(first_cpu(mask));
-       slice = cpuid_to_slice(first_cpu(mask));
+       nasid = cpuid_to_nasid(cpumask_first(mask));
+       slice = cpuid_to_slice(cpumask_first(mask));
 
        list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
                                 sn_irq_lh[irq], list)
index 83f190ffe35078dd062266727c20025d00b3a7fa..ca553b0429ce34cfb00398701d32ab73c942eeaf 100644 (file)
@@ -151,7 +151,8 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void sn_set_msi_irq_affinity(unsigned int irq,
+                                   const struct cpumask *cpu_mask)
 {
        struct msi_msg msg;
        int slice;
@@ -164,7 +165,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
        struct sn_pcibus_provider *provider;
        unsigned int cpu;
 
-       cpu = first_cpu(cpu_mask);
+       cpu = cpumask_first(cpu_mask);
        sn_irq_info = sn_msi_info[irq].sn_irq_info;
        if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
                return;
@@ -204,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
        msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
 
        write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = cpu_mask;
+       irq_desc[irq].affinity = *cpu_mask;
 }
 #endif /* CONFIG_SMP */
 
index 29047d5c259aa9a2f9622bf15f9baa1e3d21dfcc..cabba332cc48322c99ee23eb1604ddd929f29d91 100644 (file)
@@ -10,6 +10,7 @@ config M32R
        default y
        select HAVE_IDE
        select HAVE_OPROFILE
+       select INIT_ALL_POSSIBLE
 
 config SBUS
        bool
index 39cb6da72dcbef3358d04e2b419a150f88c299ba..0f06b3722e960518bc275b1167b6948e5eb35914 100644 (file)
@@ -73,17 +73,11 @@ static unsigned int bsp_phys_id = -1;
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
 
-/* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 cpumask_t cpu_bootout_map;
 cpumask_t cpu_bootin_map;
 static cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned;
index c5b916700b222194e68e148d3ad13115bd41625e..2a12e7fa97484edcd46370ca65754cced3455bea 100644 (file)
@@ -156,7 +156,7 @@ void hw_timer_init(void)
 {
        u32 imr;
 
-       cf_pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
        cf_pit_clockevent.max_delta_ns =
                clockevent_delta2ns(0xFFFF, &cf_pit_clockevent);
index a58f0eecc68f91b5d5a0a1c2ce6d4617dc5ba42d..abc62aa744ace852888367ffce294fea809095f0 100644 (file)
@@ -49,7 +49,8 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity);
+extern void plat_set_irq_affinity(unsigned int irq,
+                                 const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
 /*
index 7785bec732f267254bae3477b83a885e968bd928..1fb959f98982c70c192e1f1fe731cc3c92f849b9 100644 (file)
@@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 /* sched_domains SD_NODE_INIT for SGI IP27 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 0ff5b523ea77d8f6a5b7b2337969e414ce5ddaa8..86557b5d1b3f152932d9fe02744cad4eb4ec6316 100644 (file)
@@ -38,9 +38,6 @@ extern int __cpu_logical_map[NR_CPUS];
 #define SMP_RESCHEDULE_YOURSELF        0x1     /* XXX braindead */
 #define SMP_CALL_FUNCTION      0x2
 
-extern cpumask_t phys_cpu_present_map;
-#define cpu_possible_map       phys_cpu_present_map
-
 extern void asmlinkage smp_bootstrap(void);
 
 /*
index d7f8a782aae4d9a3200c307f383be28ed83b2b18..03965cb1b2527e23fd60a003284f51279b42a758 100644 (file)
@@ -146,7 +146,7 @@ void __init plat_time_init(void)
 
        BUG_ON(HZ != 100);
 
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        clockevents_register_device(cd);
        action->dev_id = cd;
        setup_irq(JAZZ_TIMER_IRQ, action);
index 0a57f86945f1e5be7b85553cc4839ba7348d2371..b820661678b0b948e7fb56e8332a28277df07a98 100644 (file)
@@ -126,7 +126,7 @@ void __cpuinit sb1480_clockevent_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = sibyte_next_event;
        cd->set_mode            = sibyte_set_mode;
        clockevents_register_device(cd);
@@ -148,6 +148,6 @@ void __cpuinit sb1480_clockevent_init(void)
        action->name    = name;
        action->dev_id  = cd;
 
-       irq_set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_set_affinity(irq, cpumask_of(cpu));
        setup_irq(irq, action);
 }
index df4acb68bfb57be98fb38298826a677e15c6920e..1ada45ea07003d426cf591b2d639d45ffda0c53b 100644 (file)
@@ -88,7 +88,6 @@ static void ds1287_event_handler(struct clock_event_device *dev)
 static struct clock_event_device ds1287_clockevent = {
        .name           = "ds1287",
        .features       = CLOCK_EVT_FEAT_PERIODIC,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = ds1287_set_next_event,
        .set_mode       = ds1287_set_mode,
        .event_handler  = ds1287_event_handler,
@@ -122,6 +121,7 @@ int __init ds1287_clockevent_init(int irq)
        clockevent_set_clock(cd, 32768);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&ds1287_clockevent);
 
index 6e2f58520afbdba949fb839ebfd66df78e4df364..e9b787feedcb7595fb763e057cfb18a06fb1ede1 100644 (file)
@@ -96,7 +96,6 @@ static void gt641xx_timer0_event_handler(struct clock_event_device *dev)
 static struct clock_event_device gt641xx_timer0_clockevent = {
        .name           = "gt641xx-timer0",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .cpumask        = CPU_MASK_CPU0,
        .irq            = GT641XX_TIMER0_IRQ,
        .set_next_event = gt641xx_timer0_set_next_event,
        .set_mode       = gt641xx_timer0_set_mode,
@@ -132,6 +131,7 @@ static int __init gt641xx_timer0_clockevent_init(void)
        clockevent_set_clock(cd, gt641xx_base_clock);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&gt641xx_timer0_clockevent);
 
index 4a4c59f2737a5d0bcdc8fc48f4f8e28ea217f528..e1ec83b68031b447ffc2e46f85f8519de6b56b7f 100644 (file)
@@ -195,7 +195,7 @@ int __cpuinit mips_clockevent_init(void)
 
        cd->rating              = 300;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = mips_next_event;
        cd->set_mode            = mips_set_clock_mode;
        cd->event_handler       = mips_event_handler;
index 63ac3ad462bc61cf2aeafe817a6ea1255c3991a0..a2eebaafda52c224ee385193bcb998984d9d790d 100644 (file)
@@ -125,7 +125,7 @@ void __cpuinit sb1250_clockevent_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = sibyte_next_event;
        cd->set_mode            = sibyte_set_mode;
        clockevents_register_device(cd);
@@ -147,6 +147,6 @@ void __cpuinit sb1250_clockevent_init(void)
        action->name    = name;
        action->dev_id  = cd;
 
-       irq_set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_set_affinity(irq, cpumask_of(cpu));
        setup_irq(irq, action);
 }
index 5162fe4b59528ac9ae24fe894c6076f8af89c530..6d45e24db5bfce9a8f620f22cfdfabb0e662a342 100644 (file)
@@ -292,7 +292,7 @@ int __cpuinit mips_clockevent_init(void)
 
        cd->rating              = 300;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = mips_next_event;
        cd->set_mode            = mips_set_clock_mode;
        cd->event_handler       = mips_event_handler;
index b5fc4eb412d2e53b3544f0d67f032cecf4933095..eccf7d6096bd7b8cbdd9d2c4d2dd7ed2d4742a7a 100644 (file)
@@ -112,7 +112,6 @@ static struct clock_event_device txx9tmr_clock_event_device = {
        .name           = "TXx9",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_mode       = txx9tmr_set_mode,
        .set_next_event = txx9tmr_set_next_event,
 };
@@ -150,6 +149,7 @@ void __init txx9_clockevent_init(unsigned long baseaddr, int irq,
                clockevent_delta2ns(0xffffffff >> (32 - TXX9_TIMER_BITS), cd);
        cd->min_delta_ns = clockevent_delta2ns(0xf, cd);
        cd->irq = irq;
+       cd->cpumask = cpumask_of(0),
        clockevents_register_device(cd);
        setup_irq(irq, &txx9tmr_irq);
        printk(KERN_INFO "TXx9: clockevent device at 0x%lx, irq %d\n",
index b6ac55162b9a8e533063af37b73677fd6137b442..f4d187825f96406612d16a6016ab8defcf997658 100644 (file)
@@ -115,7 +115,7 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-       cd->cpumask = cpumask_of_cpu(cpu);
+       cd->cpumask = cpumask_of(cpu);
        clockevent_set_clock(cd, CLOCK_TICK_RATE);
        cd->max_delta_ns = clockevent_delta2ns(0x7FFF, cd);
        cd->min_delta_ns = clockevent_delta2ns(0xF, cd);
index f0a4bb19e096ab7ddde894a44111fd5fca72b0b6..494a49a317e9b78fb137714423a64ef0661a411f 100644 (file)
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
+static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        cpumask_t       tmp = CPU_MASK_NONE;
        unsigned long   flags;
@@ -164,7 +164,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
        pr_debug(KERN_DEBUG "%s called\n", __func__);
        irq -= _irqbase;
 
-       cpus_and(tmp, cpumask, cpu_online_map);
+       cpumask_and(&tmp, cpumask, cpu_online_mask);
        if (cpus_empty(tmp))
                return;
 
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
                set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
 
        }
-       irq_desc[irq].affinity = cpumask;
+       irq_desc[irq].affinity = *cpumask;
        spin_unlock_irqrestore(&gic_lock, flags);
 
 }
index ca476c4f62a57d12d839975334662254824a3b2f..f27beca4b26d22c67366b1be5d1f263637ad21c3 100644 (file)
@@ -51,10 +51,10 @@ static int __init allowcpus(char *str)
        int len;
 
        cpus_clear(cpu_allow_map);
-       if (cpulist_parse(str, cpu_allow_map) == 0) {
+       if (cpulist_parse(str, &cpu_allow_map) == 0) {
                cpu_set(0, cpu_allow_map);
                cpus_and(cpu_possible_map, cpu_possible_map, cpu_allow_map);
-               len = cpulist_scnprintf(buf, sizeof(buf)-1, cpu_possible_map);
+               len = cpulist_scnprintf(buf, sizeof(buf)-1, &cpu_possible_map);
                buf[len] = '\0';
                pr_debug("Allowable CPUs: %s\n", buf);
                return 1;
@@ -226,7 +226,7 @@ void __init cmp_smp_setup(void)
 
        for (i = 1; i < NR_CPUS; i++) {
                if (amon_cpu_avail(i)) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i]     = ++ncpu;
                        __cpu_logical_map[ncpu] = i;
                }
index 87a1816c1f4589cdb90c88896830b16b77b84f58..6f7ee5ac46ee5d5f1544967d3e1f3c94ea1fc7df 100644 (file)
@@ -70,7 +70,7 @@ static unsigned int __init smvp_vpe_init(unsigned int tc, unsigned int mvpconf0,
                write_vpe_c0_vpeconf0(tmp);
 
                /* Record this as available CPU */
-               cpu_set(tc, phys_cpu_present_map);
+               cpu_set(tc, cpu_possible_map);
                __cpu_number_map[tc]    = ++ncpu;
                __cpu_logical_map[ncpu] = tc;
        }
index 8bf88faf5afdcb5e7eca6923fa5f4d41f988880e..3da94704f81645027d246cf6aea5be80119eddfa 100644 (file)
 #include <asm/mipsmtregs.h>
 #endif /* CONFIG_MIPS_MT_SMTC */
 
-cpumask_t phys_cpu_present_map;                /* Bitmask of available CPUs */
 volatile cpumask_t cpu_callin_map;     /* Bitmask of started secondaries */
-cpumask_t cpu_online_map;              /* Bitmask of currently online CPUs */
 int __cpu_number_map[NR_CPUS];         /* Map physical to logical */
 int __cpu_logical_map[NR_CPUS];                /* Map logical to physical */
 
-EXPORT_SYMBOL(phys_cpu_present_map);
-EXPORT_SYMBOL(cpu_online_map);
-
 extern void cpu_idle(void);
 
 /* Number of TCs (or siblings in Intel speak) per CPU core */
@@ -195,7 +190,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 /* preload SMP state for boot cpu */
 void __devinit smp_prepare_boot_cpu(void)
 {
-       cpu_set(0, phys_cpu_present_map);
+       cpu_set(0, cpu_possible_map);
        cpu_set(0, cpu_online_map);
        cpu_set(0, cpu_callin_map);
 }
index 897fb2b4751c95715af0b798d79db5cf899a5e5d..b6cca01ff82b309e803105c1230957e943c87df8 100644 (file)
@@ -290,7 +290,7 @@ static void smtc_configure_tlb(void)
  * possibly leave some TCs/VPEs as "slave" processors.
  *
  * Use c0_MVPConf0 to find out how many TCs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  */
 
 int __init smtc_build_cpu_map(int start_cpu_slot)
@@ -304,7 +304,7 @@ int __init smtc_build_cpu_map(int start_cpu_slot)
         */
        ntcs = ((read_c0_mvpconf0() & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1;
        for (i=start_cpu_slot; i<NR_CPUS && i<ntcs; i++) {
-               cpu_set(i, phys_cpu_present_map);
+               cpu_set(i, cpu_possible_map);
                __cpu_number_map[i] = i;
                __cpu_logical_map[i] = i;
        }
@@ -521,7 +521,7 @@ void smtc_prepare_cpus(int cpus)
         * Pull any physically present but unused TCs out of circulation.
         */
        while (tc < (((val & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1)) {
-               cpu_clear(tc, phys_cpu_present_map);
+               cpu_clear(tc, cpu_possible_map);
                cpu_clear(tc, cpu_present_map);
                tc++;
        }
index f84a46a8ae6e5261d0c8669b2726ed353cda7dc3..aabd7274507b875e2726c246240d9e465a27bc9b 100644 (file)
@@ -114,9 +114,9 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
-       cpumask_t tmask = affinity;
+       cpumask_t tmask = *affinity;
        int cpu = 0;
        void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
 
@@ -139,7 +139,7 @@ void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
         * be made to forward to an offline "CPU".
         */
 
-       for_each_cpu_mask(cpu, affinity) {
+       for_each_cpu(cpu, affinity) {
                if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
                        cpu_clear(cpu, tmask);
        }
index 62f495b57f937c5710fdea3eb0f94471003191ae..cf293b2790981566fd802a1f982ba52339560bde 100644 (file)
@@ -102,6 +102,7 @@ __init void plat_time_init(void)
        unsigned int p;
        unsigned int pow2p;
 
+       pnx8xxx_clockevent.cpumask = cpu_none_mask;
        clockevents_register_device(&pnx8xxx_clockevent);
        clocksource_register(&pnx_clocksource);
 
index 3a7df647ca7739b9336cc22ade77c34d883d5252..f78c29b68d77015e114195154c76671e2ce7d1a6 100644 (file)
@@ -141,7 +141,7 @@ static void __cpuinit yos_boot_secondary(int cpu, struct task_struct *idle)
 }
 
 /*
- * Detect available CPUs, populate phys_cpu_present_map before smp_init
+ * Detect available CPUs, populate cpu_possible_map before smp_init
  *
  * We don't want to start the secondary CPU yet nor do we have a nice probing
  * feature in PMON so we just assume presence of the secondary core.
@@ -150,10 +150,10 @@ static void __init yos_smp_setup(void)
 {
        int i;
 
-       cpus_clear(phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
 
        for (i = 0; i < 2; i++) {
-               cpu_set(i, phys_cpu_present_map);
+               cpu_set(i, cpu_possible_map);
                __cpu_number_map[i]     = i;
                __cpu_logical_map[i]    = i;
        }
index ba5cdebeaf0d23d703ce93a163cdd1b030a6bdf6..5b47d6b65275ba6835043e8a050958ba014d0ed2 100644 (file)
@@ -76,7 +76,7 @@ static int do_cpumask(cnodeid_t cnode, nasid_t nasid, int highest)
                        /* Only let it join in if it's marked enabled */
                        if ((acpu->cpu_info.flags & KLINFO_ENABLE) &&
                            (tot_cpus_found != NR_CPUS)) {
-                               cpu_set(cpuid, phys_cpu_present_map);
+                               cpu_set(cpuid, cpu_possible_map);
                                alloc_cpupda(cpuid, tot_cpus_found);
                                cpus_found++;
                                tot_cpus_found++;
index 1327c2746fb73be5cedc9484df1bfdb8ab16ce1c..f024057a35f8bab0e40997cd2bc0f30c2b39fb8a 100644 (file)
@@ -134,7 +134,7 @@ void __cpuinit hub_rt_clock_event_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(0x300, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = rt_next_event;
        cd->set_mode            = rt_set_mode;
        clockevents_register_device(cd);
index a35818ed42639c8548537546d82eae9f35f23674..12b465d404dfc393e0996a248ff5e9a510aefe5e 100644 (file)
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask);
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on, k;
        u64 cur_ints;
@@ -117,11 +117,11 @@ static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
        unsigned long flags;
        unsigned int irq_dirty;
 
-       if (cpus_weight(mask) != 1) {
+       if (cpumask_weight(mask) != 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
                return;
        }
-       i = first_cpu(mask);
+       i = cpumask_first(mask);
 
        /* Convert logical CPU to physical CPU */
        cpu = cpu_logical_map(i);
index bd9eeb43ed0e1bd86a365322374ae9b9cfdfe25c..dddfda8e829461e8297ab44cfad6501e34641ca3 100644 (file)
@@ -136,7 +136,7 @@ static void __cpuinit bcm1480_boot_secondary(int cpu, struct task_struct *idle)
 
 /*
  * Use CFE to find out how many CPUs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  * XXXKW will the boot CPU ever not be physical 0?
  *
  * Common setup before any secondaries are started
@@ -145,14 +145,14 @@ static void __init bcm1480_smp_setup(void)
 {
        int i, num;
 
-       cpus_clear(phys_cpu_present_map);
-       cpu_set(0, phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
+       cpu_set(0, cpu_possible_map);
        __cpu_number_map[0] = 0;
        __cpu_logical_map[0] = 0;
 
        for (i = 1, num = 0; i < NR_CPUS; i++) {
                if (cfe_cpu_stop(i) == 0) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i] = ++num;
                        __cpu_logical_map[num] = i;
                }
index a5158483986e878dd7d935a439d6c2b0c1f98d2e..808ac2959b8c497ff69f3bfe64090893ea57ca64 100644 (file)
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask);
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,16 +103,16 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask)
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on;
        u64 cur_ints;
        struct irq_desc *desc = irq_desc + irq;
        unsigned long flags;
 
-       i = first_cpu(mask);
+       i = cpumask_first(mask);
 
-       if (cpus_weight(mask) > 1) {
+       if (cpumask_weight(mask) > 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
                return;
        }
index 0734b933e969240054bd54e8f05f65608ba20f4c..5950a288a7daab17939664417fadacd68ca28d7b 100644 (file)
@@ -124,7 +124,7 @@ static void __cpuinit sb1250_boot_secondary(int cpu, struct task_struct *idle)
 
 /*
  * Use CFE to find out how many CPUs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  * XXXKW will the boot CPU ever not be physical 0?
  *
  * Common setup before any secondaries are started
@@ -133,14 +133,14 @@ static void __init sb1250_smp_setup(void)
 {
        int i, num;
 
-       cpus_clear(phys_cpu_present_map);
-       cpu_set(0, phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
+       cpu_set(0, cpu_possible_map);
        __cpu_number_map[0] = 0;
        __cpu_logical_map[0] = 0;
 
        for (i = 1, num = 0; i < NR_CPUS; i++) {
                if (cfe_cpu_stop(i) == 0) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i] = ++num;
                        __cpu_logical_map[num] = i;
                }
index 796e3ce28720ace81719f4114382ded6df96ba79..69f5f88711cca8721cbc97ad4fda96e6682ecffd 100644 (file)
@@ -80,7 +80,7 @@ static void __init sni_a20r_timer_setup(void)
        struct irqaction *action = &a20r_irqaction;
        unsigned int cpu = smp_processor_id();
 
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        clockevents_register_device(cd);
        action->dev_id = cd;
        setup_irq(SNI_A20R_IRQ_TIMER, &a20r_irqaction);
index 644a70b1b04e4fd9ecf16d2e73d6e22ea5227115..aacf11d33723edc80aa3638ba5256e83d0c74bf2 100644 (file)
@@ -11,6 +11,7 @@ config PARISC
        select HAVE_OPROFILE
        select RTC_CLASS
        select RTC_DRV_PARISC
+       select INIT_ALL_POSSIBLE
        help
          The PA-RISC microprocessor is designed by Hewlett-Packard and used
          in many of their workstations & servers (HP9000 700 and 800 series,
index 23ef950df0080a11d0e8c6390d52d948c778ba99..4cea935e2f99d1d51436fdb815e357dea54a4e31 100644 (file)
@@ -131,12 +131,12 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
        return 0;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
-       if (cpu_check_affinity(irq, &dest))
+       if (cpu_check_affinity(irq, dest))
                return;
 
-       irq_desc[irq].affinity = dest;
+       irq_desc[irq].affinity = *dest;
 }
 #endif
 
index d47f3975c9c6b683d4e0cfaf21dcf9156d68988d..80bc000523face7176458cf26c452f461ecd6413 100644 (file)
@@ -67,21 +67,6 @@ static volatile int cpu_now_booting __read_mostly = 0;       /* track which CPU is boo
 
 static int parisc_max_cpus __read_mostly = 1;
 
-/* online cpus are ones that we've managed to bring up completely
- * possible cpus are all valid cpu 
- * present cpus are all detected cpu
- *
- * On startup we bring up the "possible" cpus. Since we discover
- * CPUs later, we add them as hotplug, so the possible cpu mask is
- * empty in the beginning.
- */
-
-cpumask_t cpu_online_map   __read_mostly = CPU_MASK_NONE;      /* Bitmap of online CPUs */
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;       /* Bitmap of Present CPUs */
-
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
-
 DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED;
 
 enum ipi_message_type {
diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
new file mode 100644 (file)
index 0000000..9b198d1
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __ASM_PPC_DISASSEMBLE_H__
+#define __ASM_PPC_DISASSEMBLE_H__
+
+#include <linux/types.h>
+
+static inline unsigned int get_op(u32 inst)
+{
+       return inst >> 26;
+}
+
+static inline unsigned int get_xop(u32 inst)
+{
+       return (inst >> 1) & 0x3ff;
+}
+
+static inline unsigned int get_sprn(u32 inst)
+{
+       return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
+}
+
+static inline unsigned int get_dcrn(u32 inst)
+{
+       return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
+}
+
+static inline unsigned int get_rt(u32 inst)
+{
+       return (inst >> 21) & 0x1f;
+}
+
+static inline unsigned int get_rs(u32 inst)
+{
+       return (inst >> 21) & 0x1f;
+}
+
+static inline unsigned int get_ra(u32 inst)
+{
+       return (inst >> 16) & 0x1f;
+}
+
+static inline unsigned int get_rb(u32 inst)
+{
+       return (inst >> 11) & 0x1f;
+}
+
+static inline unsigned int get_rc(u32 inst)
+{
+       return inst & 0x1;
+}
+
+static inline unsigned int get_ws(u32 inst)
+{
+       return (inst >> 11) & 0x1f;
+}
+
+static inline unsigned int get_d(u32 inst)
+{
+       return inst & 0xffff;
+}
+
+#endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
new file mode 100644 (file)
index 0000000..f49031b
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __ASM_44X_H__
+#define __ASM_44X_H__
+
+#include <linux/kvm_host.h>
+
+#define PPC44x_TLB_SIZE 64
+
+/* If the guest is expecting it, this can be as large as we like; we'd just
+ * need to find some way of advertising it. */
+#define KVM44x_GUEST_TLB_SIZE 64
+
+struct kvmppc_44x_shadow_ref {
+       struct page *page;
+       u16 gtlb_index;
+       u8 writeable;
+       u8 tid;
+};
+
+struct kvmppc_vcpu_44x {
+       /* Unmodified copy of the guest's TLB. */
+       struct kvmppc_44x_tlbe guest_tlb[KVM44x_GUEST_TLB_SIZE];
+
+       /* References to guest pages in the hardware TLB. */
+       struct kvmppc_44x_shadow_ref shadow_refs[PPC44x_TLB_SIZE];
+
+       /* State of the shadow TLB at guest context switch time. */
+       struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
+       u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+
+       struct kvm_vcpu vcpu;
+};
+
+static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu)
+{
+       return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid);
+void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu);
+void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu);
+
+#endif /* __ASM_44X_H__ */
index 34b52b7180cd52da00f31532cb956dff68923a86..c1e436fe7738c8685d021453c699fed6a7729003 100644 (file)
@@ -64,27 +64,58 @@ struct kvm_vcpu_stat {
        u32 halt_wakeup;
 };
 
-struct tlbe {
+struct kvmppc_44x_tlbe {
        u32 tid; /* Only the low 8 bits are used. */
        u32 word0;
        u32 word1;
        u32 word2;
 };
 
-struct kvm_arch {
+enum kvm_exit_types {
+       MMIO_EXITS,
+       DCR_EXITS,
+       SIGNAL_EXITS,
+       ITLB_REAL_MISS_EXITS,
+       ITLB_VIRT_MISS_EXITS,
+       DTLB_REAL_MISS_EXITS,
+       DTLB_VIRT_MISS_EXITS,
+       SYSCALL_EXITS,
+       ISI_EXITS,
+       DSI_EXITS,
+       EMULATED_INST_EXITS,
+       EMULATED_MTMSRWE_EXITS,
+       EMULATED_WRTEE_EXITS,
+       EMULATED_MTSPR_EXITS,
+       EMULATED_MFSPR_EXITS,
+       EMULATED_MTMSR_EXITS,
+       EMULATED_MFMSR_EXITS,
+       EMULATED_TLBSX_EXITS,
+       EMULATED_TLBWE_EXITS,
+       EMULATED_RFI_EXITS,
+       DEC_EXITS,
+       EXT_INTR_EXITS,
+       HALT_WAKEUP,
+       USR_PR_INST,
+       FP_UNAVAIL,
+       DEBUG_EXITS,
+       TIMEINGUEST,
+       __NUMBER_OF_KVM_EXIT_TYPES
 };
 
-struct kvm_vcpu_arch {
-       /* Unmodified copy of the guest's TLB. */
-       struct tlbe guest_tlb[PPC44x_TLB_SIZE];
-       /* TLB that's actually used when the guest is running. */
-       struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
-       /* Pages which are referenced in the shadow TLB. */
-       struct page *shadow_pages[PPC44x_TLB_SIZE];
+/* allow access to big endian 32bit upper/lower parts and 64bit var */
+struct kvmppc_exit_timing {
+       union {
+               u64 tv64;
+               struct {
+                       u32 tbu, tbl;
+               } tv32;
+       };
+};
 
-       /* Track which TLB entries we've modified in the current exit. */
-       u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+struct kvm_arch {
+};
 
+struct kvm_vcpu_arch {
        u32 host_stack;
        u32 host_pid;
        u32 host_dbcr0;
@@ -94,32 +125,32 @@ struct kvm_vcpu_arch {
        u32 host_msr;
 
        u64 fpr[32];
-       u32 gpr[32];
+       ulong gpr[32];
 
-       u32 pc;
+       ulong pc;
        u32 cr;
-       u32 ctr;
-       u32 lr;
-       u32 xer;
+       ulong ctr;
+       ulong lr;
+       ulong xer;
 
-       u32 msr;
+       ulong msr;
        u32 mmucr;
-       u32 sprg0;
-       u32 sprg1;
-       u32 sprg2;
-       u32 sprg3;
-       u32 sprg4;
-       u32 sprg5;
-       u32 sprg6;
-       u32 sprg7;
-       u32 srr0;
-       u32 srr1;
-       u32 csrr0;
-       u32 csrr1;
-       u32 dsrr0;
-       u32 dsrr1;
-       u32 dear;
-       u32 esr;
+       ulong sprg0;
+       ulong sprg1;
+       ulong sprg2;
+       ulong sprg3;
+       ulong sprg4;
+       ulong sprg5;
+       ulong sprg6;
+       ulong sprg7;
+       ulong srr0;
+       ulong srr1;
+       ulong csrr0;
+       ulong csrr1;
+       ulong dsrr0;
+       ulong dsrr1;
+       ulong dear;
+       ulong esr;
        u32 dec;
        u32 decar;
        u32 tbl;
@@ -127,7 +158,7 @@ struct kvm_vcpu_arch {
        u32 tcr;
        u32 tsr;
        u32 ivor[16];
-       u32 ivpr;
+       ulong ivpr;
        u32 pir;
 
        u32 shadow_pid;
@@ -140,9 +171,22 @@ struct kvm_vcpu_arch {
        u32 dbcr0;
        u32 dbcr1;
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+       struct kvmppc_exit_timing timing_exit;
+       struct kvmppc_exit_timing timing_last_enter;
+       u32 last_exit_type;
+       u32 timing_count_type[__NUMBER_OF_KVM_EXIT_TYPES];
+       u64 timing_sum_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+       u64 timing_sum_quad_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+       u64 timing_min_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+       u64 timing_max_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+       u64 timing_last_exit;
+       struct dentry *debugfs_exit_timing;
+#endif
+
        u32 last_inst;
-       u32 fault_dear;
-       u32 fault_esr;
+       ulong fault_dear;
+       ulong fault_esr;
        gpa_t paddr_accessed;
 
        u8 io_gpr; /* GPR used as IO source/target */
index bb62ad876de32750c70cc7ecd91adcb9f0539d5a..36d2a50a84875f760522183eddd80d2ecd40e230 100644 (file)
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
 
-struct kvm_tlb {
-       struct tlbe guest_tlb[PPC44x_TLB_SIZE];
-       struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
-};
-
 enum emulation_result {
        EMULATE_DONE,         /* no further processing */
        EMULATE_DO_MMIO,      /* kvm_run filled with MMIO request */
@@ -41,9 +36,6 @@ enum emulation_result {
        EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
-extern const unsigned char exception_priority[];
-extern const unsigned char priority_exception[];
-
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
@@ -58,51 +50,44 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_emulate_instruction(struct kvm_run *run,
                                       struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 
-extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
-                           u64 asid, u32 flags);
-extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                  gva_t eend, u32 asid);
+extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
+                           u64 asid, u32 flags, u32 max_bytes,
+                           unsigned int gtlb_idx);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
-/* XXX Book E specific */
-extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
-
-extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
-
-static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception)
-{
-       unsigned int priority = exception_priority[exception];
-       set_bit(priority, &vcpu->arch.pending_exceptions);
-}
-
-static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
-{
-       unsigned int priority = exception_priority[exception];
-       clear_bit(priority, &vcpu->arch.pending_exceptions);
-}
-
-/* Helper function for "full" MSR writes. No need to call this if only EE is
- * changing. */
-static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
-{
-       if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
-               kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
-
-       vcpu->arch.msr = new_msr;
-
-       if (vcpu->arch.msr & MSR_WE)
-               kvm_vcpu_block(vcpu);
-}
-
-static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
-{
-       if (vcpu->arch.pid != new_pid) {
-               vcpu->arch.pid = new_pid;
-               vcpu->arch.swap_pid = 1;
-       }
-}
+/* Core-specific hooks */
+
+extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
+                                                unsigned int id);
+extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_check_processor_compat(void);
+extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                                      struct kvm_translation *tr);
+
+extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
+
+extern void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu);
+
+extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                       struct kvm_interrupt *irq);
+
+extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned int op, int *advance);
+extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
+extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
+
+extern int kvmppc_booke_init(void);
+extern void kvmppc_booke_exit(void);
 
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 
index 8a97cfb08b7e9eaad3a6149ce3285e6ad9507234..27cc6fdcd3b79e81cadca44218a8a17994336d1d 100644 (file)
@@ -56,6 +56,7 @@
 #ifndef __ASSEMBLY__
 
 extern unsigned int tlb_44x_hwater;
+extern unsigned int tlb_44x_index;
 
 typedef struct {
        unsigned int    id;
index c32da6f9799957a1ce3439b8f9e269bd887621f2..373fca394a54a1a1841d06d2ebec20440aa4ca7d 100644 (file)
@@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 /* sched_domains SD_NODE_INIT for PPC64 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 661d07d2146bcbead9cb44e2a43aff1b5151e562..9937fe44555f3d4e15cffa5a18d597231ca6040c 100644 (file)
@@ -23,9 +23,6 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/hrtimer.h>
-#ifdef CONFIG_KVM
-#include <linux/kvm_host.h>
-#endif
 #ifdef CONFIG_PPC64
 #include <linux/time.h>
 #include <linux/hardirq.h>
@@ -51,6 +48,9 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
 #endif
+#ifdef CONFIG_KVM
+#include <asm/kvm_44x.h>
+#endif
 
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 #include "head_booke.h"
@@ -357,12 +357,10 @@ int main(void)
        DEFINE(PTE_SIZE, sizeof(pte_t));
 
 #ifdef CONFIG_KVM
-       DEFINE(TLBE_BYTES, sizeof(struct tlbe));
+       DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
 
        DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
        DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-       DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
-       DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
        DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
        DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
        DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
@@ -385,5 +383,16 @@ int main(void)
        DEFINE(PTE_T_LOG2, PTE_T_LOG2);
 #endif
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+       DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
+                                               arch.timing_exit.tv32.tbu));
+       DEFINE(VCPU_TIMING_EXIT_TBL, offsetof(struct kvm_vcpu,
+                                               arch.timing_exit.tv32.tbl));
+       DEFINE(VCPU_TIMING_LAST_ENTER_TBU, offsetof(struct kvm_vcpu,
+                                       arch.timing_last_enter.tv32.tbu));
+       DEFINE(VCPU_TIMING_LAST_ENTER_TBL, offsetof(struct kvm_vcpu,
+                                       arch.timing_last_enter.tv32.tbl));
+#endif
+
        return 0;
 }
index ac222d0ab12e86406bcbc6247adb3b8150ed49ee..23b8b5e36f98b9afb5cf970e20b4a77ab5785627 100644 (file)
@@ -237,7 +237,7 @@ void fixup_irqs(cpumask_t map)
                        mask = map;
                }
                if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
+                       irq_desc[irq].chip->set_affinity(irq, &mask);
                else if (irq_desc[irq].action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
index 8ac3f721d2359e20b09476fbe85aafc7c161f97d..65484b2200b36add20b48dc38caf3f80971c4d29 100644 (file)
 
 struct thread_info *secondary_ti;
 
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-cpumask_t cpu_online_map = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE;
 
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
index e1f3a51404292ee3c422295e47292da9b7c22256..99f1ddd68582e9628456f8d8e7f8fe3fb0c5b25f 100644 (file)
@@ -844,7 +844,7 @@ static void register_decrementer_clockevent(int cpu)
        struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
 
        *dec = decrementer_clockevent;
-       dec->cpumask = cpumask_of_cpu(cpu);
+       dec->cpumask = cpumask_of(cpu);
 
        printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
               dec->name, dec->mult, dec->shift, cpu);
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
new file mode 100644 (file)
index 0000000..a66bec5
--- /dev/null
@@ -0,0 +1,228 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_44x.h>
+#include <asm/kvm_ppc.h>
+
+#include "44x_tlb.h"
+
+/* Note: clearing MSR[DE] just means that the debug interrupt will not be
+ * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
+ * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
+ * will be delivered as an "imprecise debug event" (which is indicated by
+ * DBSR[IDE].
+ */
+static void kvm44x_disable_debug_interrupts(void)
+{
+       mtmsr(mfmsr() & ~MSR_DE);
+}
+
+void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
+{
+       kvm44x_disable_debug_interrupts();
+
+       mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
+       mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
+       mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
+       mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
+       mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
+       mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
+       mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
+       mtmsr(vcpu->arch.host_msr);
+}
+
+void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
+{
+       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+       u32 dbcr0 = 0;
+
+       vcpu->arch.host_msr = mfmsr();
+       kvm44x_disable_debug_interrupts();
+
+       /* Save host debug register state. */
+       vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
+       vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
+       vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
+       vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
+       vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
+       vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
+       vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
+
+       /* set registers up for guest */
+
+       if (dbg->bp[0]) {
+               mtspr(SPRN_IAC1, dbg->bp[0]);
+               dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
+       }
+       if (dbg->bp[1]) {
+               mtspr(SPRN_IAC2, dbg->bp[1]);
+               dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
+       }
+       if (dbg->bp[2]) {
+               mtspr(SPRN_IAC3, dbg->bp[2]);
+               dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
+       }
+       if (dbg->bp[3]) {
+               mtspr(SPRN_IAC4, dbg->bp[3]);
+               dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
+       }
+
+       mtspr(SPRN_DBCR0, dbcr0);
+       mtspr(SPRN_DBCR1, 0);
+       mtspr(SPRN_DBCR2, 0);
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       kvmppc_44x_tlb_load(vcpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       kvmppc_44x_tlb_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+       int r;
+
+       if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
+               r = 0;
+       else
+               r = -ENOTSUPP;
+
+       return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[0];
+       int i;
+
+       tlbe->tid = 0;
+       tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
+       tlbe->word1 = 0;
+       tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
+
+       tlbe++;
+       tlbe->tid = 0;
+       tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
+       tlbe->word1 = 0xef600000;
+       tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
+                     | PPC44x_TLB_I | PPC44x_TLB_G;
+
+       /* Since the guest can directly access the timebase, it must know the
+        * real timebase frequency. Accordingly, it must see the state of
+        * CCR1[TCS]. */
+       vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
+
+       for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
+               vcpu_44x->shadow_refs[i].gtlb_index = -1;
+
+       return 0;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       struct kvmppc_44x_tlbe *gtlbe;
+       int index;
+       gva_t eaddr;
+       u8 pid;
+       u8 as;
+
+       eaddr = tr->linear_address;
+       pid = (tr->linear_address >> 32) & 0xff;
+       as = (tr->linear_address >> 40) & 0x1;
+
+       index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
+       if (index == -1) {
+               tr->valid = 0;
+               return 0;
+       }
+
+       gtlbe = &vcpu_44x->guest_tlb[index];
+
+       tr->physical_address = tlb_xlate(gtlbe, eaddr);
+       /* XXX what does "writeable" and "usermode" even mean? */
+       tr->valid = 1;
+
+       return 0;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x;
+       struct kvm_vcpu *vcpu;
+       int err;
+
+       vcpu_44x = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       if (!vcpu_44x) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       vcpu = &vcpu_44x->vcpu;
+       err = kvm_vcpu_init(vcpu, kvm, id);
+       if (err)
+               goto free_vcpu;
+
+       return vcpu;
+
+free_vcpu:
+       kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
+out:
+       return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+
+       kvm_vcpu_uninit(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
+}
+
+static int kvmppc_44x_init(void)
+{
+       int r;
+
+       r = kvmppc_booke_init();
+       if (r)
+               return r;
+
+       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
+}
+
+static void kvmppc_44x_exit(void)
+{
+       kvmppc_booke_exit();
+}
+
+module_init(kvmppc_44x_init);
+module_exit(kvmppc_44x_exit);
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
new file mode 100644 (file)
index 0000000..82489a7
--- /dev/null
@@ -0,0 +1,371 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/dcr.h>
+#include <asm/dcr-regs.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_44x.h>
+#include "timing.h"
+
+#include "booke.h"
+#include "44x_tlb.h"
+
+#define OP_RFI      19
+
+#define XOP_RFI     50
+#define XOP_MFMSR   83
+#define XOP_WRTEE   131
+#define XOP_MTMSR   146
+#define XOP_WRTEEI  163
+#define XOP_MFDCR   323
+#define XOP_MTDCR   451
+#define XOP_TLBSX   914
+#define XOP_ICCCI   966
+#define XOP_TLBWE   978
+
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.pc = vcpu->arch.srr0;
+       kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+}
+
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+       int emulated = EMULATE_DONE;
+       int dcrn;
+       int ra;
+       int rb;
+       int rc;
+       int rs;
+       int rt;
+       int ws;
+
+       switch (get_op(inst)) {
+       case OP_RFI:
+               switch (get_xop(inst)) {
+               case XOP_RFI:
+                       kvmppc_emul_rfi(vcpu);
+                       kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
+                       *advance = 0;
+                       break;
+
+               default:
+                       emulated = EMULATE_FAIL;
+                       break;
+               }
+               break;
+
+       case 31:
+               switch (get_xop(inst)) {
+
+               case XOP_MFMSR:
+                       rt = get_rt(inst);
+                       vcpu->arch.gpr[rt] = vcpu->arch.msr;
+                       kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
+                       break;
+
+               case XOP_MTMSR:
+                       rs = get_rs(inst);
+                       kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
+                       kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+                       break;
+
+               case XOP_WRTEE:
+                       rs = get_rs(inst);
+                       vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+                                                        | (vcpu->arch.gpr[rs] & MSR_EE);
+                       kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+                       break;
+
+               case XOP_WRTEEI:
+                       vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+                                                        | (inst & MSR_EE);
+                       kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+                       break;
+
+               case XOP_MFDCR:
+                       dcrn = get_dcrn(inst);
+                       rt = get_rt(inst);
+
+                       /* The guest may access CPR0 registers to determine the timebase
+                        * frequency, and it must know the real host frequency because it
+                        * can directly access the timebase registers.
+                        *
+                        * It would be possible to emulate those accesses in userspace,
+                        * but userspace can really only figure out the end frequency.
+                        * We could decompose that into the factors that compute it, but
+                        * that's tricky math, and it's easier to just report the real
+                        * CPR0 values.
+                        */
+                       switch (dcrn) {
+                       case DCRN_CPR0_CONFIG_ADDR:
+                               vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
+                               break;
+                       case DCRN_CPR0_CONFIG_DATA:
+                               local_irq_disable();
+                               mtdcr(DCRN_CPR0_CONFIG_ADDR,
+                                         vcpu->arch.cpr0_cfgaddr);
+                               vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
+                               local_irq_enable();
+                               break;
+                       default:
+                               run->dcr.dcrn = dcrn;
+                               run->dcr.data =  0;
+                               run->dcr.is_write = 0;
+                               vcpu->arch.io_gpr = rt;
+                               vcpu->arch.dcr_needed = 1;
+                               kvmppc_account_exit(vcpu, DCR_EXITS);
+                               emulated = EMULATE_DO_DCR;
+                       }
+
+                       break;
+
+               case XOP_MTDCR:
+                       dcrn = get_dcrn(inst);
+                       rs = get_rs(inst);
+
+                       /* emulate some access in kernel */
+                       switch (dcrn) {
+                       case DCRN_CPR0_CONFIG_ADDR:
+                               vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
+                               break;
+                       default:
+                               run->dcr.dcrn = dcrn;
+                               run->dcr.data = vcpu->arch.gpr[rs];
+                               run->dcr.is_write = 1;
+                               vcpu->arch.dcr_needed = 1;
+                               kvmppc_account_exit(vcpu, DCR_EXITS);
+                               emulated = EMULATE_DO_DCR;
+                       }
+
+                       break;
+
+               case XOP_TLBWE:
+                       ra = get_ra(inst);
+                       rs = get_rs(inst);
+                       ws = get_ws(inst);
+                       emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws);
+                       break;
+
+               case XOP_TLBSX:
+                       rt = get_rt(inst);
+                       ra = get_ra(inst);
+                       rb = get_rb(inst);
+                       rc = get_rc(inst);
+                       emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc);
+                       break;
+
+               case XOP_ICCCI:
+                       break;
+
+               default:
+                       emulated = EMULATE_FAIL;
+               }
+
+               break;
+
+       default:
+               emulated = EMULATE_FAIL;
+       }
+
+       return emulated;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+       switch (sprn) {
+       case SPRN_MMUCR:
+               vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
+       case SPRN_PID:
+               kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
+       case SPRN_CCR0:
+               vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
+       case SPRN_CCR1:
+               vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
+       case SPRN_DEAR:
+               vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+       case SPRN_ESR:
+               vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+       case SPRN_DBCR0:
+               vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+       case SPRN_DBCR1:
+               vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+       case SPRN_TSR:
+               vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+       case SPRN_TCR:
+               vcpu->arch.tcr = vcpu->arch.gpr[rs];
+               kvmppc_emulate_dec(vcpu);
+               break;
+
+       /* Note: SPRG4-7 are user-readable. These values are
+        * loaded into the real SPRGs when resuming the
+        * guest. */
+       case SPRN_SPRG4:
+               vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+       case SPRN_SPRG5:
+               vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+       case SPRN_SPRG6:
+               vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+       case SPRN_SPRG7:
+               vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+
+       case SPRN_IVPR:
+               vcpu->arch.ivpr = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR0:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR1:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR2:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR3:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR4:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR5:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR6:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR7:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR8:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR9:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR10:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR11:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR12:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR13:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR14:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IVOR15:
+               vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
+               break;
+
+       default:
+               return EMULATE_FAIL;
+       }
+
+       kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+       return EMULATE_DONE;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+       switch (sprn) {
+       /* 440 */
+       case SPRN_MMUCR:
+               vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
+       case SPRN_CCR0:
+               vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
+       case SPRN_CCR1:
+               vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
+
+       /* Book E */
+       case SPRN_PID:
+               vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
+       case SPRN_IVPR:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+       case SPRN_DEAR:
+               vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+       case SPRN_ESR:
+               vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+       case SPRN_DBCR0:
+               vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+       case SPRN_DBCR1:
+               vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+
+       case SPRN_IVOR0:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+               break;
+       case SPRN_IVOR1:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+               break;
+       case SPRN_IVOR2:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+               break;
+       case SPRN_IVOR3:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+               break;
+       case SPRN_IVOR4:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+               break;
+       case SPRN_IVOR5:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+               break;
+       case SPRN_IVOR6:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+               break;
+       case SPRN_IVOR7:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+               break;
+       case SPRN_IVOR8:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+               break;
+       case SPRN_IVOR9:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+               break;
+       case SPRN_IVOR10:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+               break;
+       case SPRN_IVOR11:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+               break;
+       case SPRN_IVOR12:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+               break;
+       case SPRN_IVOR13:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+               break;
+       case SPRN_IVOR14:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+               break;
+       case SPRN_IVOR15:
+               vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+               break;
+
+       default:
+               return EMULATE_FAIL;
+       }
+
+       kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+       return EMULATE_DONE;
+}
+
index ad72c6f9811f62ed05695ca23221bd2d9b87f31a..9a34b8edb9e283adcc8c53e3033b44d7f0845bc1 100644 (file)
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
 #include <asm/mmu-44x.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_44x.h>
+#include "timing.h"
 
 #include "44x_tlb.h"
 
+#ifndef PPC44x_TLBE_SIZE
+#define PPC44x_TLBE_SIZE       PPC44x_TLB_4K
+#endif
+
+#define PAGE_SIZE_4K (1<<12)
+#define PAGE_MASK_4K (~(PAGE_SIZE_4K - 1))
+
+#define PPC44x_TLB_UATTR_MASK \
+       (PPC44x_TLB_U0|PPC44x_TLB_U1|PPC44x_TLB_U2|PPC44x_TLB_U3)
 #define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
 #define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW)
 
-static unsigned int kvmppc_tlb_44x_pos;
+#ifdef DEBUG
+void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_44x_tlbe *tlbe;
+       int i;
+
+       printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
+       printk("| %2s | %3s | %8s | %8s | %8s |\n",
+                       "nr", "tid", "word0", "word1", "word2");
+
+       for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
+               tlbe = &vcpu_44x->guest_tlb[i];
+               if (tlbe->word0 & PPC44x_TLB_VALID)
+                       printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
+                              i, tlbe->tid, tlbe->word0, tlbe->word1,
+                              tlbe->word2);
+       }
+}
+#endif
+
+static inline void kvmppc_44x_tlbie(unsigned int index)
+{
+       /* 0 <= index < 64, so the V bit is clear and we can use the index as
+        * word0. */
+       asm volatile(
+               "tlbwe %[index], %[index], 0\n"
+       :
+       : [index] "r"(index)
+       );
+}
+
+static inline void kvmppc_44x_tlbre(unsigned int index,
+                                    struct kvmppc_44x_tlbe *tlbe)
+{
+       asm volatile(
+               "tlbre %[word0], %[index], 0\n"
+               "mfspr %[tid], %[sprn_mmucr]\n"
+               "andi. %[tid], %[tid], 0xff\n"
+               "tlbre %[word1], %[index], 1\n"
+               "tlbre %[word2], %[index], 2\n"
+               : [word0] "=r"(tlbe->word0),
+                 [word1] "=r"(tlbe->word1),
+                 [word2] "=r"(tlbe->word2),
+                 [tid]   "=r"(tlbe->tid)
+               : [index] "r"(index),
+                 [sprn_mmucr] "i"(SPRN_MMUCR)
+               : "cc"
+       );
+}
+
+static inline void kvmppc_44x_tlbwe(unsigned int index,
+                                    struct kvmppc_44x_tlbe *stlbe)
+{
+       unsigned long tmp;
+
+       asm volatile(
+               "mfspr %[tmp], %[sprn_mmucr]\n"
+               "rlwimi %[tmp], %[tid], 0, 0xff\n"
+               "mtspr %[sprn_mmucr], %[tmp]\n"
+               "tlbwe %[word0], %[index], 0\n"
+               "tlbwe %[word1], %[index], 1\n"
+               "tlbwe %[word2], %[index], 2\n"
+               : [tmp]   "=&r"(tmp)
+               : [word0] "r"(stlbe->word0),
+                 [word1] "r"(stlbe->word1),
+                 [word2] "r"(stlbe->word2),
+                 [tid]   "r"(stlbe->tid),
+                 [index] "r"(index),
+                 [sprn_mmucr] "i"(SPRN_MMUCR)
+       );
+}
 
 static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 {
-       /* Mask off reserved bits. */
-       attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_ATTR_MASK;
+       /* We only care about the guest's permission and user bits. */
+       attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_UATTR_MASK;
 
        if (!usermode) {
                /* Guest is in supervisor mode, so we need to translate guest
@@ -47,18 +130,60 @@ static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
        /* Make sure host can always access this memory. */
        attrib |= PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW;
 
+       /* WIMGE = 0b00100 */
+       attrib |= PPC44x_TLB_M;
+
        return attrib;
 }
 
+/* Load shadow TLB back into hardware. */
+void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       int i;
+
+       for (i = 0; i <= tlb_44x_hwater; i++) {
+               struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+
+               if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
+                       kvmppc_44x_tlbwe(i, stlbe);
+       }
+}
+
+static void kvmppc_44x_tlbe_set_modified(struct kvmppc_vcpu_44x *vcpu_44x,
+                                         unsigned int i)
+{
+       vcpu_44x->shadow_tlb_mod[i] = 1;
+}
+
+/* Save hardware TLB to the vcpu, and invalidate all guest mappings. */
+void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       int i;
+
+       for (i = 0; i <= tlb_44x_hwater; i++) {
+               struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+
+               if (vcpu_44x->shadow_tlb_mod[i])
+                       kvmppc_44x_tlbre(i, stlbe);
+
+               if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
+                       kvmppc_44x_tlbie(i);
+       }
+}
+
+
 /* Search the guest TLB for a matching entry. */
 int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
                          unsigned int as)
 {
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
        int i;
 
        /* XXX Replace loop with fancy data structures. */
-       for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-               struct tlbe *tlbe = &vcpu->arch.guest_tlb[i];
+       for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
+               struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[i];
                unsigned int tid;
 
                if (eaddr < get_tlb_eaddr(tlbe))
@@ -83,78 +208,89 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
        return -1;
 }
 
-struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
        unsigned int as = !!(vcpu->arch.msr & MSR_IS);
-       unsigned int index;
 
-       index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
-       if (index == -1)
-               return NULL;
-       return &vcpu->arch.guest_tlb[index];
+       return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
-struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
        unsigned int as = !!(vcpu->arch.msr & MSR_DS);
-       unsigned int index;
 
-       index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
-       if (index == -1)
-               return NULL;
-       return &vcpu->arch.guest_tlb[index];
+       return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
-static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
+static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
+                                      unsigned int stlb_index)
 {
-       return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
-}
+       struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[stlb_index];
 
-static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
-                                      unsigned int index)
-{
-       struct tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
-       struct page *page = vcpu->arch.shadow_pages[index];
+       if (!ref->page)
+               return;
 
-       if (get_tlb_v(stlbe)) {
-               if (kvmppc_44x_tlbe_is_writable(stlbe))
-                       kvm_release_page_dirty(page);
-               else
-                       kvm_release_page_clean(page);
-       }
+       /* Discard from the TLB. */
+       /* Note: we could actually invalidate a host mapping, if the host overwrote
+        * this TLB entry since we inserted a guest mapping. */
+       kvmppc_44x_tlbie(stlb_index);
+
+       /* Now release the page. */
+       if (ref->writeable)
+               kvm_release_page_dirty(ref->page);
+       else
+               kvm_release_page_clean(ref->page);
+
+       ref->page = NULL;
+
+       /* XXX set tlb_44x_index to stlb_index? */
+
+       KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler);
 }
 
 void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
 {
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
        int i;
 
        for (i = 0; i <= tlb_44x_hwater; i++)
-               kvmppc_44x_shadow_release(vcpu, i);
-}
-
-void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
-{
-    vcpu->arch.shadow_tlb_mod[i] = 1;
+               kvmppc_44x_shadow_release(vcpu_44x, i);
 }
 
-/* Caller must ensure that the specified guest TLB entry is safe to insert into
- * the shadow TLB. */
-void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
-                    u32 flags)
+/**
+ * kvmppc_mmu_map -- create a host mapping for guest memory
+ *
+ * If the guest wanted a larger page than the host supports, only the first
+ * host page is mapped here and the rest are demand faulted.
+ *
+ * If the guest wanted a smaller page than the host page size, we map only the
+ * guest-size page (i.e. not a full host page mapping).
+ *
+ * Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB.
+ */
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
+                    u32 flags, u32 max_bytes, unsigned int gtlb_index)
 {
+       struct kvmppc_44x_tlbe stlbe;
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       struct kvmppc_44x_shadow_ref *ref;
        struct page *new_page;
-       struct tlbe *stlbe;
        hpa_t hpaddr;
+       gfn_t gfn;
        unsigned int victim;
 
-       /* Future optimization: don't overwrite the TLB entry containing the
-        * current PC (or stack?). */
-       victim = kvmppc_tlb_44x_pos++;
-       if (kvmppc_tlb_44x_pos > tlb_44x_hwater)
-               kvmppc_tlb_44x_pos = 0;
-       stlbe = &vcpu->arch.shadow_tlb[victim];
+       /* Select TLB entry to clobber. Indirectly guard against races with the TLB
+        * miss handler by disabling interrupts. */
+       local_irq_disable();
+       victim = ++tlb_44x_index;
+       if (victim > tlb_44x_hwater)
+               victim = 0;
+       tlb_44x_index = victim;
+       local_irq_enable();
 
        /* Get reference to new page. */
+       gfn = gpaddr >> PAGE_SHIFT;
        new_page = gfn_to_page(vcpu->kvm, gfn);
        if (is_error_page(new_page)) {
                printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
@@ -163,10 +299,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
        }
        hpaddr = page_to_phys(new_page);
 
-       /* Drop reference to old page. */
-       kvmppc_44x_shadow_release(vcpu, victim);
-
-       vcpu->arch.shadow_pages[victim] = new_page;
+       /* Invalidate any previous shadow mappings. */
+       kvmppc_44x_shadow_release(vcpu_44x, victim);
 
        /* XXX Make sure (va, size) doesn't overlap any other
         * entries. 440x6 user manual says the result would be
@@ -174,78 +308,193 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 
        /* XXX what about AS? */
 
-       stlbe->tid = !(asid & 0xff);
-
        /* Force TS=1 for all guest mappings. */
-       /* For now we hardcode 4KB mappings, but it will be important to
-        * use host large pages in the future. */
-       stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
-                      | PPC44x_TLB_4K;
-       stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
-       stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
-                                                   vcpu->arch.msr & MSR_PR);
-       kvmppc_tlbe_set_modified(vcpu, victim);
+       stlbe.word0 = PPC44x_TLB_VALID | PPC44x_TLB_TS;
+
+       if (max_bytes >= PAGE_SIZE) {
+               /* Guest mapping is larger than or equal to host page size. We can use
+                * a "native" host mapping. */
+               stlbe.word0 |= (gvaddr & PAGE_MASK) | PPC44x_TLBE_SIZE;
+       } else {
+               /* Guest mapping is smaller than host page size. We must restrict the
+                * size of the mapping to be at most the smaller of the two, but for
+                * simplicity we fall back to a 4K mapping (this is probably what the
+                * guest is using anyways). */
+               stlbe.word0 |= (gvaddr & PAGE_MASK_4K) | PPC44x_TLB_4K;
+
+               /* 'hpaddr' is a host page, which is larger than the mapping we're
+                * inserting here. To compensate, we must add the in-page offset to the
+                * sub-page. */
+               hpaddr |= gpaddr & (PAGE_MASK ^ PAGE_MASK_4K);
+       }
 
-       KVMTRACE_5D(STLB_WRITE, vcpu, victim,
-                       stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
-                       handler);
+       stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
+       stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
+                                                   vcpu->arch.msr & MSR_PR);
+       stlbe.tid = !(asid & 0xff);
+
+       /* Keep track of the reference so we can properly release it later. */
+       ref = &vcpu_44x->shadow_refs[victim];
+       ref->page = new_page;
+       ref->gtlb_index = gtlb_index;
+       ref->writeable = !!(stlbe.word2 & PPC44x_TLB_UW);
+       ref->tid = stlbe.tid;
+
+       /* Insert shadow mapping into hardware TLB. */
+       kvmppc_44x_tlbe_set_modified(vcpu_44x, victim);
+       kvmppc_44x_tlbwe(victim, &stlbe);
+       KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1,
+                   stlbe.word2, handler);
 }
 
-void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                           gva_t eend, u32 asid)
+/* For a particular guest TLB entry, invalidate the corresponding host TLB
+ * mappings and release the host pages. */
+static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
+                                  unsigned int gtlb_index)
 {
-       unsigned int pid = !(asid & 0xff);
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
        int i;
 
-       /* XXX Replace loop with fancy data structures. */
-       for (i = 0; i <= tlb_44x_hwater; i++) {
-               struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
-               unsigned int tid;
+       for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
+               struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
+               if (ref->gtlb_index == gtlb_index)
+                       kvmppc_44x_shadow_release(vcpu_44x, i);
+       }
+}
 
-               if (!get_tlb_v(stlbe))
-                       continue;
+void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+{
+       vcpu->arch.shadow_pid = !usermode;
+}
 
-               if (eend < get_tlb_eaddr(stlbe))
-                       continue;
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       int i;
 
-               if (eaddr > get_tlb_end(stlbe))
-                       continue;
+       if (unlikely(vcpu->arch.pid == new_pid))
+               return;
 
-               tid = get_tlb_tid(stlbe);
-               if (tid && (tid != pid))
-                       continue;
+       vcpu->arch.pid = new_pid;
 
-               kvmppc_44x_shadow_release(vcpu, i);
-               stlbe->word0 = 0;
-               kvmppc_tlbe_set_modified(vcpu, i);
-               KVMTRACE_5D(STLB_INVAL, vcpu, i,
-                               stlbe->tid, stlbe->word0, stlbe->word1,
-                               stlbe->word2, handler);
+       /* Guest userspace runs with TID=0 mappings and PID=0, to make sure it
+        * can't access guest kernel mappings (TID=1). When we switch to a new
+        * guest PID, which will also use host PID=0, we must discard the old guest
+        * userspace mappings. */
+       for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
+               struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
+
+               if (ref->tid == 0)
+                       kvmppc_44x_shadow_release(vcpu_44x, i);
        }
 }
 
-/* Invalidate all mappings on the privilege switch after PID has been changed.
- * The guest always runs with PID=1, so we must clear the entire TLB when
- * switching address spaces. */
-void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+                             const struct kvmppc_44x_tlbe *tlbe)
 {
-       int i;
+       gpa_t gpa;
 
-       if (vcpu->arch.swap_pid) {
-               /* XXX Replace loop with fancy data structures. */
-               for (i = 0; i <= tlb_44x_hwater; i++) {
-                       struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
-
-                       /* Future optimization: clear only userspace mappings. */
-                       kvmppc_44x_shadow_release(vcpu, i);
-                       stlbe->word0 = 0;
-                       kvmppc_tlbe_set_modified(vcpu, i);
-                       KVMTRACE_5D(STLB_INVAL, vcpu, i,
-                                   stlbe->tid, stlbe->word0, stlbe->word1,
-                                   stlbe->word2, handler);
-               }
-               vcpu->arch.swap_pid = 0;
+       if (!get_tlb_v(tlbe))
+               return 0;
+
+       /* Does it match current guest AS? */
+       /* XXX what about IS != DS? */
+       if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+               return 0;
+
+       gpa = get_tlb_raddr(tlbe);
+       if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+               /* Mapping is not for RAM. */
+               return 0;
+
+       return 1;
+}
+
+int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
+{
+       struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+       struct kvmppc_44x_tlbe *tlbe;
+       unsigned int gtlb_index;
+
+       gtlb_index = vcpu->arch.gpr[ra];
+       if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
+               printk("%s: index %d\n", __func__, gtlb_index);
+               kvmppc_dump_vcpu(vcpu);
+               return EMULATE_FAIL;
        }
 
-       vcpu->arch.shadow_pid = !usermode;
+       tlbe = &vcpu_44x->guest_tlb[gtlb_index];
+
+       /* Invalidate shadow mappings for the about-to-be-clobbered TLB entry. */
+       if (tlbe->word0 & PPC44x_TLB_VALID)
+               kvmppc_44x_invalidate(vcpu, gtlb_index);
+
+       switch (ws) {
+       case PPC44x_TLB_PAGEID:
+               tlbe->tid = get_mmucr_stid(vcpu);
+               tlbe->word0 = vcpu->arch.gpr[rs];
+               break;
+
+       case PPC44x_TLB_XLAT:
+               tlbe->word1 = vcpu->arch.gpr[rs];
+               break;
+
+       case PPC44x_TLB_ATTRIB:
+               tlbe->word2 = vcpu->arch.gpr[rs];
+               break;
+
+       default:
+               return EMULATE_FAIL;
+       }
+
+       if (tlbe_is_host_safe(vcpu, tlbe)) {
+               u64 asid;
+               gva_t eaddr;
+               gpa_t gpaddr;
+               u32 flags;
+               u32 bytes;
+
+               eaddr = get_tlb_eaddr(tlbe);
+               gpaddr = get_tlb_raddr(tlbe);
+
+               /* Use the advertised page size to mask effective and real addrs. */
+               bytes = get_tlb_bytes(tlbe);
+               eaddr &= ~(bytes - 1);
+               gpaddr &= ~(bytes - 1);
+
+               asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
+               flags = tlbe->word2 & 0xffff;
+
+               kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes, gtlb_index);
+       }
+
+       KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
+                   tlbe->word1, tlbe->word2, handler);
+
+       kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
+       return EMULATE_DONE;
+}
+
+int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
+{
+       u32 ea;
+       int gtlb_index;
+       unsigned int as = get_mmucr_sts(vcpu);
+       unsigned int pid = get_mmucr_stid(vcpu);
+
+       ea = vcpu->arch.gpr[rb];
+       if (ra)
+               ea += vcpu->arch.gpr[ra];
+
+       gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
+       if (rc) {
+               if (gtlb_index < 0)
+                       vcpu->arch.cr &= ~0x20000000;
+               else
+                       vcpu->arch.cr |= 0x20000000;
+       }
+       vcpu->arch.gpr[rt] = gtlb_index;
+
+       kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
+       return EMULATE_DONE;
 }
index 2ccd46b6f6b762474a63a89fe2e4a8ec63e62b8e..772191f29e62f702c5df56ad5addacbc37b64922 100644 (file)
 
 extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
                                 unsigned int pid, unsigned int as);
-extern struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
-extern struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
+
+extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb,
+                                 u8 rc);
+extern int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws);
 
 /* TLB helper functions */
-static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_size(const struct kvmppc_44x_tlbe *tlbe)
 {
        return (tlbe->word0 >> 4) & 0xf;
 }
 
-static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+static inline gva_t get_tlb_eaddr(const struct kvmppc_44x_tlbe *tlbe)
 {
        return tlbe->word0 & 0xfffffc00;
 }
 
-static inline gva_t get_tlb_bytes(const struct tlbe *tlbe)
+static inline gva_t get_tlb_bytes(const struct kvmppc_44x_tlbe *tlbe)
 {
        unsigned int pgsize = get_tlb_size(tlbe);
        return 1 << 10 << (pgsize << 1);
 }
 
-static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+static inline gva_t get_tlb_end(const struct kvmppc_44x_tlbe *tlbe)
 {
        return get_tlb_eaddr(tlbe) + get_tlb_bytes(tlbe) - 1;
 }
 
-static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+static inline u64 get_tlb_raddr(const struct kvmppc_44x_tlbe *tlbe)
 {
        u64 word1 = tlbe->word1;
        return ((word1 & 0xf) << 32) | (word1 & 0xfffffc00);
 }
 
-static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_tid(const struct kvmppc_44x_tlbe *tlbe)
 {
        return tlbe->tid & 0xff;
 }
 
-static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_ts(const struct kvmppc_44x_tlbe *tlbe)
 {
        return (tlbe->word0 >> 8) & 0x1;
 }
 
-static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_v(const struct kvmppc_44x_tlbe *tlbe)
 {
        return (tlbe->word0 >> 9) & 0x1;
 }
@@ -81,7 +85,7 @@ static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu)
        return (vcpu->arch.mmucr >> 16) & 0x1;
 }
 
-static inline gpa_t tlb_xlate(struct tlbe *tlbe, gva_t eaddr)
+static inline gpa_t tlb_xlate(struct kvmppc_44x_tlbe *tlbe, gva_t eaddr)
 {
        unsigned int pgmask = get_tlb_bytes(tlbe) - 1;
 
index 53aaa66b25e5ada55b425004e7a004295747857b..6dbdc4817d80c5bb2424236208497b5b3bde1373 100644 (file)
@@ -15,27 +15,33 @@ menuconfig VIRTUALIZATION
 if VIRTUALIZATION
 
 config KVM
-       bool "Kernel-based Virtual Machine (KVM) support"
-       depends on 44x && EXPERIMENTAL
+       bool
        select PREEMPT_NOTIFIERS
        select ANON_INODES
-       # We can only run on Book E hosts so far
-       select KVM_BOOKE_HOST
+
+config KVM_440
+       bool "KVM support for PowerPC 440 processors"
+       depends on EXPERIMENTAL && 44x
+       select KVM
        ---help---
-         Support hosting virtualized guest machines. You will also
-         need to select one or more of the processor modules below.
+         Support running unmodified 440 guest kernels in virtual machines on
+         440 host processors.
 
          This module provides access to the hardware capabilities through
          a character device node named /dev/kvm.
 
          If unsure, say N.
 
-config KVM_BOOKE_HOST
-       bool "KVM host support for Book E PowerPC processors"
-       depends on KVM && 44x
+config KVM_EXIT_TIMING
+       bool "Detailed exit timing"
+       depends on KVM
        ---help---
-         Provides host support for KVM on Book E PowerPC processors. Currently
-         this works on 440 processors only.
+         Calculate elapsed time for every exit/enter cycle. A per-vcpu
+         report is available in debugfs kvm/vm#_vcpu#_timing.
+         The overhead is relatively small, however it is not recommended for
+         production environments.
+
+         If unsure, say N.
 
 config KVM_TRACE
        bool "KVM trace support"
index 2a5d4397ac4b06f7545a5fc764b9da98b0ba580a..df7ba59e6d53b495c1885497882d6d67a01e77cf 100644 (file)
@@ -8,10 +8,16 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
 common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 
-kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
+kvm-objs := $(common-objs-y) powerpc.o emulate.o
+obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
-kvm-booke-host-objs := booke_host.o booke_interrupts.o 44x_tlb.o
-obj-$(CONFIG_KVM_BOOKE_HOST) += kvm-booke-host.o
+kvm-440-objs := \
+       booke.o \
+       booke_interrupts.o \
+       44x.o \
+       44x_tlb.o \
+       44x_emulate.o
+obj-$(CONFIG_KVM_440) += kvm-440.o
similarity index 56%
rename from arch/powerpc/kvm/booke_guest.c
rename to arch/powerpc/kvm/booke.c
index 7b2591e26bae3dc93dc7f002e833fd98de8288bb..35485dd6927eef520632f727660a2d9499a3699f 100644 (file)
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
+
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include "timing.h"
+#include <asm/cacheflush.h>
+#include <asm/kvm_44x.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 
+unsigned long kvmppc_booke_handlers;
+
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "exits",      VCPU_STAT(sum_exits) },
        { "mmio",       VCPU_STAT(mmio_exits) },
        { "dcr",        VCPU_STAT(dcr_exits) },
        { "sig",        VCPU_STAT(signal_exits) },
-       { "light",      VCPU_STAT(light_exits) },
        { "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
        { "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
        { "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
@@ -53,103 +58,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { NULL }
 };
 
-static const u32 interrupt_msr_mask[16] = {
-       [BOOKE_INTERRUPT_CRITICAL]      = MSR_ME,
-       [BOOKE_INTERRUPT_MACHINE_CHECK] = 0,
-       [BOOKE_INTERRUPT_DATA_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_INST_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_EXTERNAL]      = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_ALIGNMENT]     = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_PROGRAM]       = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_FP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_SYSCALL]       = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_AP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_DECREMENTER]   = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_FIT]           = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_WATCHDOG]      = MSR_ME,
-       [BOOKE_INTERRUPT_DTLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_ITLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
-       [BOOKE_INTERRUPT_DEBUG]         = MSR_ME,
-};
-
-const unsigned char exception_priority[] = {
-       [BOOKE_INTERRUPT_DATA_STORAGE] = 0,
-       [BOOKE_INTERRUPT_INST_STORAGE] = 1,
-       [BOOKE_INTERRUPT_ALIGNMENT] = 2,
-       [BOOKE_INTERRUPT_PROGRAM] = 3,
-       [BOOKE_INTERRUPT_FP_UNAVAIL] = 4,
-       [BOOKE_INTERRUPT_SYSCALL] = 5,
-       [BOOKE_INTERRUPT_AP_UNAVAIL] = 6,
-       [BOOKE_INTERRUPT_DTLB_MISS] = 7,
-       [BOOKE_INTERRUPT_ITLB_MISS] = 8,
-       [BOOKE_INTERRUPT_MACHINE_CHECK] = 9,
-       [BOOKE_INTERRUPT_DEBUG] = 10,
-       [BOOKE_INTERRUPT_CRITICAL] = 11,
-       [BOOKE_INTERRUPT_WATCHDOG] = 12,
-       [BOOKE_INTERRUPT_EXTERNAL] = 13,
-       [BOOKE_INTERRUPT_FIT] = 14,
-       [BOOKE_INTERRUPT_DECREMENTER] = 15,
-};
-
-const unsigned char priority_exception[] = {
-       BOOKE_INTERRUPT_DATA_STORAGE,
-       BOOKE_INTERRUPT_INST_STORAGE,
-       BOOKE_INTERRUPT_ALIGNMENT,
-       BOOKE_INTERRUPT_PROGRAM,
-       BOOKE_INTERRUPT_FP_UNAVAIL,
-       BOOKE_INTERRUPT_SYSCALL,
-       BOOKE_INTERRUPT_AP_UNAVAIL,
-       BOOKE_INTERRUPT_DTLB_MISS,
-       BOOKE_INTERRUPT_ITLB_MISS,
-       BOOKE_INTERRUPT_MACHINE_CHECK,
-       BOOKE_INTERRUPT_DEBUG,
-       BOOKE_INTERRUPT_CRITICAL,
-       BOOKE_INTERRUPT_WATCHDOG,
-       BOOKE_INTERRUPT_EXTERNAL,
-       BOOKE_INTERRUPT_FIT,
-       BOOKE_INTERRUPT_DECREMENTER,
-};
-
-
-void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
-{
-       struct tlbe *tlbe;
-       int i;
-
-       printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
-       printk("| %2s | %3s | %8s | %8s | %8s |\n",
-                       "nr", "tid", "word0", "word1", "word2");
-
-       for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-               tlbe = &vcpu->arch.guest_tlb[i];
-               if (tlbe->word0 & PPC44x_TLB_VALID)
-                       printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
-                              i, tlbe->tid, tlbe->word0, tlbe->word1,
-                              tlbe->word2);
-       }
-
-       for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-               tlbe = &vcpu->arch.shadow_tlb[i];
-               if (tlbe->word0 & PPC44x_TLB_VALID)
-                       printk(" S%2d | %02X | %08X | %08X | %08X |\n",
-                              i, tlbe->tid, tlbe->word0, tlbe->word1,
-                              tlbe->word2);
-       }
-}
-
 /* TODO: use vcpu_printf() */
 void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
        int i;
 
-       printk("pc:   %08x msr:  %08x\n", vcpu->arch.pc, vcpu->arch.msr);
-       printk("lr:   %08x ctr:  %08x\n", vcpu->arch.lr, vcpu->arch.ctr);
-       printk("srr0: %08x srr1: %08x\n", vcpu->arch.srr0, vcpu->arch.srr1);
+       printk("pc:   %08lx msr:  %08lx\n", vcpu->arch.pc, vcpu->arch.msr);
+       printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
+       printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
 
        printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
 
        for (i = 0; i < 32; i += 4) {
-               printk("gpr%02d: %08x %08x %08x %08x\n", i,
+               printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
                       vcpu->arch.gpr[i],
                       vcpu->arch.gpr[i+1],
                       vcpu->arch.gpr[i+2],
@@ -157,69 +78,96 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
        }
 }
 
-/* Check if we are ready to deliver the interrupt */
-static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
+static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
+                                       unsigned int priority)
 {
-       int r;
+       set_bit(priority, &vcpu->arch.pending_exceptions);
+}
 
-       switch (interrupt) {
-       case BOOKE_INTERRUPT_CRITICAL:
-               r = vcpu->arch.msr & MSR_CE;
-               break;
-       case BOOKE_INTERRUPT_MACHINE_CHECK:
-               r = vcpu->arch.msr & MSR_ME;
-               break;
-       case BOOKE_INTERRUPT_EXTERNAL:
-               r = vcpu->arch.msr & MSR_EE;
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+{
+       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
+}
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
+}
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+       return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
+}
+
+/* Deliver the interrupt of the corresponding priority, if possible. */
+static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
+{
+       int allowed = 0;
+       ulong msr_mask;
+
+       switch (priority) {
+       case BOOKE_IRQPRIO_PROGRAM:
+       case BOOKE_IRQPRIO_DTLB_MISS:
+       case BOOKE_IRQPRIO_ITLB_MISS:
+       case BOOKE_IRQPRIO_SYSCALL:
+       case BOOKE_IRQPRIO_DATA_STORAGE:
+       case BOOKE_IRQPRIO_INST_STORAGE:
+       case BOOKE_IRQPRIO_FP_UNAVAIL:
+       case BOOKE_IRQPRIO_AP_UNAVAIL:
+       case BOOKE_IRQPRIO_ALIGNMENT:
+               allowed = 1;
+               msr_mask = MSR_CE|MSR_ME|MSR_DE;
                break;
-       case BOOKE_INTERRUPT_DECREMENTER:
-               r = vcpu->arch.msr & MSR_EE;
+       case BOOKE_IRQPRIO_CRITICAL:
+       case BOOKE_IRQPRIO_WATCHDOG:
+               allowed = vcpu->arch.msr & MSR_CE;
+               msr_mask = MSR_ME;
                break;
-       case BOOKE_INTERRUPT_FIT:
-               r = vcpu->arch.msr & MSR_EE;
+       case BOOKE_IRQPRIO_MACHINE_CHECK:
+               allowed = vcpu->arch.msr & MSR_ME;
+               msr_mask = 0;
                break;
-       case BOOKE_INTERRUPT_WATCHDOG:
-               r = vcpu->arch.msr & MSR_CE;
+       case BOOKE_IRQPRIO_EXTERNAL:
+       case BOOKE_IRQPRIO_DECREMENTER:
+       case BOOKE_IRQPRIO_FIT:
+               allowed = vcpu->arch.msr & MSR_EE;
+               msr_mask = MSR_CE|MSR_ME|MSR_DE;
                break;
-       case BOOKE_INTERRUPT_DEBUG:
-               r = vcpu->arch.msr & MSR_DE;
+       case BOOKE_IRQPRIO_DEBUG:
+               allowed = vcpu->arch.msr & MSR_DE;
+               msr_mask = MSR_ME;
                break;
-       default:
-               r = 1;
        }
 
-       return r;
-}
+       if (allowed) {
+               vcpu->arch.srr0 = vcpu->arch.pc;
+               vcpu->arch.srr1 = vcpu->arch.msr;
+               vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
+               kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
 
-static void kvmppc_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
-{
-       switch (interrupt) {
-       case BOOKE_INTERRUPT_DECREMENTER:
-               vcpu->arch.tsr |= TSR_DIS;
-               break;
+               clear_bit(priority, &vcpu->arch.pending_exceptions);
        }
 
-       vcpu->arch.srr0 = vcpu->arch.pc;
-       vcpu->arch.srr1 = vcpu->arch.msr;
-       vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[interrupt];
-       kvmppc_set_msr(vcpu, vcpu->arch.msr & interrupt_msr_mask[interrupt]);
+       return allowed;
 }
 
 /* Check pending exceptions and deliver one, if possible. */
-void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu)
+void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
        unsigned long *pending = &vcpu->arch.pending_exceptions;
-       unsigned int exception;
        unsigned int priority;
 
-       priority = find_first_bit(pending, BITS_PER_BYTE * sizeof(*pending));
+       priority = __ffs(*pending);
        while (priority <= BOOKE_MAX_INTERRUPT) {
-               exception = priority_exception[priority];
-               if (kvmppc_can_deliver_interrupt(vcpu, exception)) {
-                       kvmppc_clear_exception(vcpu, exception);
-                       kvmppc_deliver_interrupt(vcpu, exception);
+               if (kvmppc_booke_irqprio_deliver(vcpu, priority))
                        break;
-               }
 
                priority = find_next_bit(pending,
                                         BITS_PER_BYTE * sizeof(*pending),
@@ -238,6 +186,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        enum emulation_result er;
        int r = RESUME_HOST;
 
+       /* update before a new last_exit_type is rewritten */
+       kvmppc_update_timing_stats(vcpu);
+
        local_irq_enable();
 
        run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -251,21 +202,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
 
        case BOOKE_INTERRUPT_EXTERNAL:
+               kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
+               if (need_resched())
+                       cond_resched();
+               r = RESUME_GUEST;
+               break;
+
        case BOOKE_INTERRUPT_DECREMENTER:
                /* Since we switched IVPR back to the host's value, the host
                 * handled this interrupt the moment we enabled interrupts.
                 * Now we just offer it a chance to reschedule the guest. */
-
-               /* XXX At this point the TLB still holds our shadow TLB, so if
-                * we do reschedule the host will fault over it. Perhaps we
-                * should politely restore the host's entries to minimize
-                * misses before ceding control. */
+               kvmppc_account_exit(vcpu, DEC_EXITS);
                if (need_resched())
                        cond_resched();
-               if (exit_nr == BOOKE_INTERRUPT_DECREMENTER)
-                       vcpu->stat.dec_exits++;
-               else
-                       vcpu->stat.ext_intr_exits++;
                r = RESUME_GUEST;
                break;
 
@@ -274,17 +223,19 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        /* Program traps generated by user-level software must be handled
                         * by the guest kernel. */
                        vcpu->arch.esr = vcpu->arch.fault_esr;
-                       kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+                       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
                        r = RESUME_GUEST;
+                       kvmppc_account_exit(vcpu, USR_PR_INST);
                        break;
                }
 
                er = kvmppc_emulate_instruction(run, vcpu);
                switch (er) {
                case EMULATE_DONE:
+                       /* don't overwrite subtypes, just account kvm_stats */
+                       kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
                        /* Future optimization: only reload non-volatiles if
                         * they were actually modified by emulation. */
-                       vcpu->stat.emulated_inst_exits++;
                        r = RESUME_GUEST_NV;
                        break;
                case EMULATE_DO_DCR:
@@ -293,7 +244,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        break;
                case EMULATE_FAIL:
                        /* XXX Deliver Program interrupt to guest. */
-                       printk(KERN_CRIT "%s: emulation at %x failed (%08x)\n",
+                       printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
                               __func__, vcpu->arch.pc, vcpu->arch.last_inst);
                        /* For debugging, encode the failing instruction and
                         * report it to userspace. */
@@ -307,48 +258,53 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                break;
 
        case BOOKE_INTERRUPT_FP_UNAVAIL:
-               kvmppc_queue_exception(vcpu, exit_nr);
+               kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+               kvmppc_account_exit(vcpu, FP_UNAVAIL);
                r = RESUME_GUEST;
                break;
 
        case BOOKE_INTERRUPT_DATA_STORAGE:
                vcpu->arch.dear = vcpu->arch.fault_dear;
                vcpu->arch.esr = vcpu->arch.fault_esr;
-               kvmppc_queue_exception(vcpu, exit_nr);
-               vcpu->stat.dsi_exits++;
+               kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+               kvmppc_account_exit(vcpu, DSI_EXITS);
                r = RESUME_GUEST;
                break;
 
        case BOOKE_INTERRUPT_INST_STORAGE:
                vcpu->arch.esr = vcpu->arch.fault_esr;
-               kvmppc_queue_exception(vcpu, exit_nr);
-               vcpu->stat.isi_exits++;
+               kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+               kvmppc_account_exit(vcpu, ISI_EXITS);
                r = RESUME_GUEST;
                break;
 
        case BOOKE_INTERRUPT_SYSCALL:
-               kvmppc_queue_exception(vcpu, exit_nr);
-               vcpu->stat.syscall_exits++;
+               kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+               kvmppc_account_exit(vcpu, SYSCALL_EXITS);
                r = RESUME_GUEST;
                break;
 
+       /* XXX move to a 440-specific file. */
        case BOOKE_INTERRUPT_DTLB_MISS: {
-               struct tlbe *gtlbe;
+               struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+               struct kvmppc_44x_tlbe *gtlbe;
                unsigned long eaddr = vcpu->arch.fault_dear;
+               int gtlb_index;
                gfn_t gfn;
 
                /* Check the guest TLB. */
-               gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr);
-               if (!gtlbe) {
+               gtlb_index = kvmppc_44x_dtlb_index(vcpu, eaddr);
+               if (gtlb_index < 0) {
                        /* The guest didn't have a mapping for it. */
-                       kvmppc_queue_exception(vcpu, exit_nr);
+                       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
                        vcpu->arch.dear = vcpu->arch.fault_dear;
                        vcpu->arch.esr = vcpu->arch.fault_esr;
-                       vcpu->stat.dtlb_real_miss_exits++;
+                       kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
                        r = RESUME_GUEST;
                        break;
                }
 
+               gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
                vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
                gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
 
@@ -359,38 +315,45 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         * b) the guest used a large mapping which we're faking
                         * Either way, we need to satisfy the fault without
                         * invoking the guest. */
-                       kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
-                                      gtlbe->word2);
-                       vcpu->stat.dtlb_virt_miss_exits++;
+                       kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
+                                      gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
+                       kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
                        r = RESUME_GUEST;
                } else {
                        /* Guest has mapped and accessed a page which is not
                         * actually RAM. */
                        r = kvmppc_emulate_mmio(run, vcpu);
+                       kvmppc_account_exit(vcpu, MMIO_EXITS);
                }
 
                break;
        }
 
+       /* XXX move to a 440-specific file. */
        case BOOKE_INTERRUPT_ITLB_MISS: {
-               struct tlbe *gtlbe;
+               struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+               struct kvmppc_44x_tlbe *gtlbe;
                unsigned long eaddr = vcpu->arch.pc;
+               gpa_t gpaddr;
                gfn_t gfn;
+               int gtlb_index;
 
                r = RESUME_GUEST;
 
                /* Check the guest TLB. */
-               gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr);
-               if (!gtlbe) {
+               gtlb_index = kvmppc_44x_itlb_index(vcpu, eaddr);
+               if (gtlb_index < 0) {
                        /* The guest didn't have a mapping for it. */
-                       kvmppc_queue_exception(vcpu, exit_nr);
-                       vcpu->stat.itlb_real_miss_exits++;
+                       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+                       kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
                        break;
                }
 
-               vcpu->stat.itlb_virt_miss_exits++;
+               kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
 
-               gfn = tlb_xlate(gtlbe, eaddr) >> PAGE_SHIFT;
+               gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
+               gpaddr = tlb_xlate(gtlbe, eaddr);
+               gfn = gpaddr >> PAGE_SHIFT;
 
                if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
                        /* The guest TLB had a mapping, but the shadow TLB
@@ -399,12 +362,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         * b) the guest used a large mapping which we're faking
                         * Either way, we need to satisfy the fault without
                         * invoking the guest. */
-                       kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
-                                      gtlbe->word2);
+                       kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid,
+                                      gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
                } else {
                        /* Guest mapped and leaped at non-RAM! */
-                       kvmppc_queue_exception(vcpu,
-                                              BOOKE_INTERRUPT_MACHINE_CHECK);
+                       kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
                }
 
                break;
@@ -421,6 +383,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                mtspr(SPRN_DBSR, dbsr);
 
                run->exit_reason = KVM_EXIT_DEBUG;
+               kvmppc_account_exit(vcpu, DEBUG_EXITS);
                r = RESUME_HOST;
                break;
        }
@@ -432,10 +395,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
        local_irq_disable();
 
-       kvmppc_check_and_deliver_interrupts(vcpu);
+       kvmppc_core_deliver_interrupts(vcpu);
 
-       /* Do some exit accounting. */
-       vcpu->stat.sum_exits++;
        if (!(r & RESUME_HOST)) {
                /* To avoid clobbering exit_reason, only check for signals if
                 * we aren't already exiting to userspace for some other
@@ -443,22 +404,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                if (signal_pending(current)) {
                        run->exit_reason = KVM_EXIT_INTR;
                        r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-
-                       vcpu->stat.signal_exits++;
-               } else {
-                       vcpu->stat.light_exits++;
-               }
-       } else {
-               switch (run->exit_reason) {
-               case KVM_EXIT_MMIO:
-                       vcpu->stat.mmio_exits++;
-                       break;
-               case KVM_EXIT_DCR:
-                       vcpu->stat.dcr_exits++;
-                       break;
-               case KVM_EXIT_INTR:
-                       vcpu->stat.signal_exits++;
-                       break;
+                       kvmppc_account_exit(vcpu, SIGNAL_EXITS);
                }
        }
 
@@ -468,20 +414,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-       struct tlbe *tlbe = &vcpu->arch.guest_tlb[0];
-
-       tlbe->tid = 0;
-       tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
-       tlbe->word1 = 0;
-       tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
-
-       tlbe++;
-       tlbe->tid = 0;
-       tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
-       tlbe->word1 = 0xef600000;
-       tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
-                     | PPC44x_TLB_I | PPC44x_TLB_G;
-
        vcpu->arch.pc = 0;
        vcpu->arch.msr = 0;
        vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
@@ -492,12 +424,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
         * before it's programmed its own IVPR. */
        vcpu->arch.ivpr = 0x55550000;
 
-       /* Since the guest can directly access the timebase, it must know the
-        * real timebase frequency. Accordingly, it must see the state of
-        * CCR1[TCS]. */
-       vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
+       kvmppc_init_timing_stats(vcpu);
 
-       return 0;
+       return kvmppc_core_vcpu_setup(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
@@ -536,7 +465,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        vcpu->arch.ctr = regs->ctr;
        vcpu->arch.lr = regs->lr;
        vcpu->arch.xer = regs->xer;
-       vcpu->arch.msr = regs->msr;
+       kvmppc_set_msr(vcpu, regs->msr);
        vcpu->arch.srr0 = regs->srr0;
        vcpu->arch.srr1 = regs->srr1;
        vcpu->arch.sprg0 = regs->sprg0;
@@ -575,31 +504,62 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
        return -ENOTSUPP;
 }
 
-/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                   struct kvm_translation *tr)
 {
-       struct tlbe *gtlbe;
-       int index;
-       gva_t eaddr;
-       u8 pid;
-       u8 as;
-
-       eaddr = tr->linear_address;
-       pid = (tr->linear_address >> 32) & 0xff;
-       as = (tr->linear_address >> 40) & 0x1;
-
-       index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
-       if (index == -1) {
-               tr->valid = 0;
-               return 0;
-       }
+       return kvmppc_core_vcpu_translate(vcpu, tr);
+}
 
-       gtlbe = &vcpu->arch.guest_tlb[index];
+int kvmppc_booke_init(void)
+{
+       unsigned long ivor[16];
+       unsigned long max_ivor = 0;
+       int i;
 
-       tr->physical_address = tlb_xlate(gtlbe, eaddr);
-       /* XXX what does "writeable" and "usermode" even mean? */
-       tr->valid = 1;
+       /* We install our own exception handlers by hijacking IVPR. IVPR must
+        * be 16-bit aligned, so we need a 64KB allocation. */
+       kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                                                VCPU_SIZE_ORDER);
+       if (!kvmppc_booke_handlers)
+               return -ENOMEM;
+
+       /* XXX make sure our handlers are smaller than Linux's */
+
+       /* Copy our interrupt handlers to match host IVORs. That way we don't
+        * have to swap the IVORs on every guest/host transition. */
+       ivor[0] = mfspr(SPRN_IVOR0);
+       ivor[1] = mfspr(SPRN_IVOR1);
+       ivor[2] = mfspr(SPRN_IVOR2);
+       ivor[3] = mfspr(SPRN_IVOR3);
+       ivor[4] = mfspr(SPRN_IVOR4);
+       ivor[5] = mfspr(SPRN_IVOR5);
+       ivor[6] = mfspr(SPRN_IVOR6);
+       ivor[7] = mfspr(SPRN_IVOR7);
+       ivor[8] = mfspr(SPRN_IVOR8);
+       ivor[9] = mfspr(SPRN_IVOR9);
+       ivor[10] = mfspr(SPRN_IVOR10);
+       ivor[11] = mfspr(SPRN_IVOR11);
+       ivor[12] = mfspr(SPRN_IVOR12);
+       ivor[13] = mfspr(SPRN_IVOR13);
+       ivor[14] = mfspr(SPRN_IVOR14);
+       ivor[15] = mfspr(SPRN_IVOR15);
+
+       for (i = 0; i < 16; i++) {
+               if (ivor[i] > max_ivor)
+                       max_ivor = ivor[i];
+
+               memcpy((void *)kvmppc_booke_handlers + ivor[i],
+                      kvmppc_handlers_start + i * kvmppc_handler_len,
+                      kvmppc_handler_len);
+       }
+       flush_icache_range(kvmppc_booke_handlers,
+                          kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
 
        return 0;
 }
+
+void __exit kvmppc_booke_exit(void)
+{
+       free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+       kvm_exit();
+}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
new file mode 100644 (file)
index 0000000..cf7c94c
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __KVM_BOOKE_H__
+#define __KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include "timing.h"
+
+/* interrupt priortity ordering */
+#define BOOKE_IRQPRIO_DATA_STORAGE 0
+#define BOOKE_IRQPRIO_INST_STORAGE 1
+#define BOOKE_IRQPRIO_ALIGNMENT 2
+#define BOOKE_IRQPRIO_PROGRAM 3
+#define BOOKE_IRQPRIO_FP_UNAVAIL 4
+#define BOOKE_IRQPRIO_SYSCALL 5
+#define BOOKE_IRQPRIO_AP_UNAVAIL 6
+#define BOOKE_IRQPRIO_DTLB_MISS 7
+#define BOOKE_IRQPRIO_ITLB_MISS 8
+#define BOOKE_IRQPRIO_MACHINE_CHECK 9
+#define BOOKE_IRQPRIO_DEBUG 10
+#define BOOKE_IRQPRIO_CRITICAL 11
+#define BOOKE_IRQPRIO_WATCHDOG 12
+#define BOOKE_IRQPRIO_EXTERNAL 13
+#define BOOKE_IRQPRIO_FIT 14
+#define BOOKE_IRQPRIO_DECREMENTER 15
+
+/* Helper function for "full" MSR writes. No need to call this if only EE is
+ * changing. */
+static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+       if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
+               kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+
+       vcpu->arch.msr = new_msr;
+
+       if (vcpu->arch.msr & MSR_WE) {
+               kvm_vcpu_block(vcpu);
+               kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+       };
+}
+
+#endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_host.c b/arch/powerpc/kvm/booke_host.c
deleted file mode 100644 (file)
index b480341..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- *
- * Copyright IBM Corp. 2008
- *
- * Authors: Hollis Blanchard <hollisb@us.ibm.com>
- */
-
-#include <linux/errno.h>
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_ppc.h>
-
-unsigned long kvmppc_booke_handlers;
-
-static int kvmppc_booke_init(void)
-{
-       unsigned long ivor[16];
-       unsigned long max_ivor = 0;
-       int i;
-
-       /* We install our own exception handlers by hijacking IVPR. IVPR must
-        * be 16-bit aligned, so we need a 64KB allocation. */
-       kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
-                                                VCPU_SIZE_ORDER);
-       if (!kvmppc_booke_handlers)
-               return -ENOMEM;
-
-       /* XXX make sure our handlers are smaller than Linux's */
-
-       /* Copy our interrupt handlers to match host IVORs. That way we don't
-        * have to swap the IVORs on every guest/host transition. */
-       ivor[0] = mfspr(SPRN_IVOR0);
-       ivor[1] = mfspr(SPRN_IVOR1);
-       ivor[2] = mfspr(SPRN_IVOR2);
-       ivor[3] = mfspr(SPRN_IVOR3);
-       ivor[4] = mfspr(SPRN_IVOR4);
-       ivor[5] = mfspr(SPRN_IVOR5);
-       ivor[6] = mfspr(SPRN_IVOR6);
-       ivor[7] = mfspr(SPRN_IVOR7);
-       ivor[8] = mfspr(SPRN_IVOR8);
-       ivor[9] = mfspr(SPRN_IVOR9);
-       ivor[10] = mfspr(SPRN_IVOR10);
-       ivor[11] = mfspr(SPRN_IVOR11);
-       ivor[12] = mfspr(SPRN_IVOR12);
-       ivor[13] = mfspr(SPRN_IVOR13);
-       ivor[14] = mfspr(SPRN_IVOR14);
-       ivor[15] = mfspr(SPRN_IVOR15);
-
-       for (i = 0; i < 16; i++) {
-               if (ivor[i] > max_ivor)
-                       max_ivor = ivor[i];
-
-               memcpy((void *)kvmppc_booke_handlers + ivor[i],
-                      kvmppc_handlers_start + i * kvmppc_handler_len,
-                      kvmppc_handler_len);
-       }
-       flush_icache_range(kvmppc_booke_handlers,
-                          kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
-
-       return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
-}
-
-static void __exit kvmppc_booke_exit(void)
-{
-       free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
-       kvm_exit();
-}
-
-module_init(kvmppc_booke_init)
-module_exit(kvmppc_booke_exit)
index 95e165baf85faee89b7dfa30e628b75778358618..084ebcd7dd83f416136d323f998beb52cc303709 100644 (file)
@@ -107,6 +107,18 @@ _GLOBAL(kvmppc_resume_host)
        li      r6, 1
        slw     r6, r6, r5
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+       /* save exit time */
+1:
+       mfspr   r7, SPRN_TBRU
+       mfspr   r8, SPRN_TBRL
+       mfspr   r9, SPRN_TBRU
+       cmpw    r9, r7
+       bne     1b
+       stw     r8, VCPU_TIMING_EXIT_TBL(r4)
+       stw     r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
        /* Save the faulting instruction and all GPRs for emulation. */
        andi.   r7, r6, NEED_INST_MASK
        beq     ..skip_inst_copy
@@ -335,54 +347,6 @@ lightweight_exit:
        lwz     r3, VCPU_SHADOW_PID(r4)
        mtspr   SPRN_PID, r3
 
-       /* Prevent all asynchronous TLB updates. */
-       mfmsr   r5
-       lis     r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
-       ori     r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
-       andc    r6, r5, r6
-       mtmsr   r6
-
-       /* Load the guest mappings, leaving the host's "pinned" kernel mappings
-        * in place. */
-       mfspr   r10, SPRN_MMUCR                 /* Save host MMUCR. */
-       li      r5, PPC44x_TLB_SIZE
-       lis     r5, tlb_44x_hwater@ha
-       lwz     r5, tlb_44x_hwater@l(r5)
-       mtctr   r5
-       addi    r9, r4, VCPU_SHADOW_TLB
-       addi    r5, r4, VCPU_SHADOW_MOD
-       li      r3, 0
-1:
-       lbzx    r7, r3, r5
-       cmpwi   r7, 0
-       beq     3f
-
-       /* Load guest entry. */
-       mulli   r11, r3, TLBE_BYTES
-       add     r11, r11, r9
-       lwz     r7, 0(r11)
-       mtspr   SPRN_MMUCR, r7
-       lwz     r7, 4(r11)
-       tlbwe   r7, r3, PPC44x_TLB_PAGEID
-       lwz     r7, 8(r11)
-       tlbwe   r7, r3, PPC44x_TLB_XLAT
-       lwz     r7, 12(r11)
-       tlbwe   r7, r3, PPC44x_TLB_ATTRIB
-3:
-       addi    r3, r3, 1                       /* Increment index. */
-       bdnz    1b
-
-       mtspr   SPRN_MMUCR, r10                 /* Restore host MMUCR. */
-
-       /* Clear bitmap of modified TLB entries */
-       li      r5, PPC44x_TLB_SIZE>>2
-       mtctr   r5
-       addi    r5, r4, VCPU_SHADOW_MOD - 4
-       li      r6, 0
-1:
-       stwu    r6, 4(r5)
-       bdnz    1b
-
        iccci   0, 0 /* XXX hack */
 
        /* Load some guest volatiles. */
@@ -423,6 +387,18 @@ lightweight_exit:
        lwz     r3, VCPU_SPRG7(r4)
        mtspr   SPRN_SPRG7, r3
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+       /* save enter time */
+1:
+       mfspr   r6, SPRN_TBRU
+       mfspr   r7, SPRN_TBRL
+       mfspr   r8, SPRN_TBRU
+       cmpw    r8, r6
+       bne     1b
+       stw     r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+       stw     r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
        /* Finish loading guest volatiles and jump to guest. */
        lwz     r3, VCPU_CTR(r4)
        mtctr   r3
index 0fce4fbdc20d2015f44ce3c2c8bfcef291bc82b9..d1d38daa93fbf723490f09a38b2e5e6eca2dcfd1 100644 (file)
 #include <linux/string.h>
 #include <linux/kvm_host.h>
 
-#include <asm/dcr.h>
-#include <asm/dcr-regs.h>
+#include <asm/reg.h>
 #include <asm/time.h>
 #include <asm/byteorder.h>
 #include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include "timing.h"
 
-#include "44x_tlb.h"
-
-/* Instruction decoding */
-static inline unsigned int get_op(u32 inst)
-{
-       return inst >> 26;
-}
-
-static inline unsigned int get_xop(u32 inst)
-{
-       return (inst >> 1) & 0x3ff;
-}
-
-static inline unsigned int get_sprn(u32 inst)
-{
-       return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
-}
-
-static inline unsigned int get_dcrn(u32 inst)
-{
-       return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
-}
-
-static inline unsigned int get_rt(u32 inst)
-{
-       return (inst >> 21) & 0x1f;
-}
-
-static inline unsigned int get_rs(u32 inst)
-{
-       return (inst >> 21) & 0x1f;
-}
-
-static inline unsigned int get_ra(u32 inst)
-{
-       return (inst >> 16) & 0x1f;
-}
-
-static inline unsigned int get_rb(u32 inst)
-{
-       return (inst >> 11) & 0x1f;
-}
-
-static inline unsigned int get_rc(u32 inst)
-{
-       return inst & 0x1;
-}
-
-static inline unsigned int get_ws(u32 inst)
-{
-       return (inst >> 11) & 0x1f;
-}
-
-static inline unsigned int get_d(u32 inst)
-{
-       return inst & 0xffff;
-}
-
-static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
-                             const struct tlbe *tlbe)
-{
-       gpa_t gpa;
-
-       if (!get_tlb_v(tlbe))
-               return 0;
-
-       /* Does it match current guest AS? */
-       /* XXX what about IS != DS? */
-       if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
-               return 0;
-
-       gpa = get_tlb_raddr(tlbe);
-       if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
-               /* Mapping is not for RAM. */
-               return 0;
-
-       return 1;
-}
-
-static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
-{
-       u64 eaddr;
-       u64 raddr;
-       u64 asid;
-       u32 flags;
-       struct tlbe *tlbe;
-       unsigned int ra;
-       unsigned int rs;
-       unsigned int ws;
-       unsigned int index;
-
-       ra = get_ra(inst);
-       rs = get_rs(inst);
-       ws = get_ws(inst);
-
-       index = vcpu->arch.gpr[ra];
-       if (index > PPC44x_TLB_SIZE) {
-               printk("%s: index %d\n", __func__, index);
-               kvmppc_dump_vcpu(vcpu);
-               return EMULATE_FAIL;
-       }
-
-       tlbe = &vcpu->arch.guest_tlb[index];
-
-       /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
-       if (tlbe->word0 & PPC44x_TLB_VALID) {
-               eaddr = get_tlb_eaddr(tlbe);
-               asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
-               kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(tlbe), asid);
-       }
-
-       switch (ws) {
-       case PPC44x_TLB_PAGEID:
-               tlbe->tid = vcpu->arch.mmucr & 0xff;
-               tlbe->word0 = vcpu->arch.gpr[rs];
-               break;
-
-       case PPC44x_TLB_XLAT:
-               tlbe->word1 = vcpu->arch.gpr[rs];
-               break;
-
-       case PPC44x_TLB_ATTRIB:
-               tlbe->word2 = vcpu->arch.gpr[rs];
-               break;
-
-       default:
-               return EMULATE_FAIL;
-       }
-
-       if (tlbe_is_host_safe(vcpu, tlbe)) {
-               eaddr = get_tlb_eaddr(tlbe);
-               raddr = get_tlb_raddr(tlbe);
-               asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
-               flags = tlbe->word2 & 0xffff;
-
-               /* Create a 4KB mapping on the host. If the guest wanted a
-                * large page, only the first 4KB is mapped here and the rest
-                * are mapped on the fly. */
-               kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
-       }
-
-       KVMTRACE_5D(GTLB_WRITE, vcpu, index,
-                       tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
-                       handler);
-
-       return EMULATE_DONE;
-}
-
-static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
+void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
        if (vcpu->arch.tcr & TCR_DIE) {
                /* The decrementer ticks at the same rate as the timebase, so
@@ -193,12 +46,6 @@ static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
        }
 }
 
-static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
-{
-       vcpu->arch.pc = vcpu->arch.srr0;
-       kvmppc_set_msr(vcpu, vcpu->arch.srr1);
-}
-
 /* XXX to do:
  * lhax
  * lhaux
@@ -213,40 +60,30 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
  *
  * XXX is_bigendian should depend on MMU mapping or MSR[LE]
  */
+/* XXX Should probably auto-generate instruction decoding for a particular core
+ * from opcode tables in the future. */
 int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
        u32 inst = vcpu->arch.last_inst;
        u32 ea;
        int ra;
        int rb;
-       int rc;
        int rs;
        int rt;
        int sprn;
-       int dcrn;
        enum emulation_result emulated = EMULATE_DONE;
        int advance = 1;
 
+       /* this default type might be overwritten by subcategories */
+       kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
        switch (get_op(inst)) {
-       case 3:                                                 /* trap */
-               printk("trap!\n");
-               kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+       case 3:                                             /* trap */
+               vcpu->arch.esr |= ESR_PTR;
+               kvmppc_core_queue_program(vcpu);
                advance = 0;
                break;
 
-       case 19:
-               switch (get_xop(inst)) {
-               case 50:                                        /* rfi */
-                       kvmppc_emul_rfi(vcpu);
-                       advance = 0;
-                       break;
-
-               default:
-                       emulated = EMULATE_FAIL;
-                       break;
-               }
-               break;
-
        case 31:
                switch (get_xop(inst)) {
 
@@ -255,27 +92,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
                        break;
 
-               case 83:                                        /* mfmsr */
-                       rt = get_rt(inst);
-                       vcpu->arch.gpr[rt] = vcpu->arch.msr;
-                       break;
-
                case 87:                                        /* lbzx */
                        rt = get_rt(inst);
                        emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
                        break;
 
-               case 131:                                       /* wrtee */
-                       rs = get_rs(inst);
-                       vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
-                                        | (vcpu->arch.gpr[rs] & MSR_EE);
-                       break;
-
-               case 146:                                       /* mtmsr */
-                       rs = get_rs(inst);
-                       kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
-                       break;
-
                case 151:                                       /* stwx */
                        rs = get_rs(inst);
                        emulated = kvmppc_handle_store(run, vcpu,
@@ -283,11 +104,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                                       4, 1);
                        break;
 
-               case 163:                                       /* wrteei */
-                       vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
-                                        | (inst & MSR_EE);
-                       break;
-
                case 215:                                       /* stbx */
                        rs = get_rs(inst);
                        emulated = kvmppc_handle_store(run, vcpu,
@@ -328,42 +144,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        vcpu->arch.gpr[ra] = ea;
                        break;
 
-               case 323:                                       /* mfdcr */
-                       dcrn = get_dcrn(inst);
-                       rt = get_rt(inst);
-
-                       /* The guest may access CPR0 registers to determine the timebase
-                        * frequency, and it must know the real host frequency because it
-                        * can directly access the timebase registers.
-                        *
-                        * It would be possible to emulate those accesses in userspace,
-                        * but userspace can really only figure out the end frequency.
-                        * We could decompose that into the factors that compute it, but
-                        * that's tricky math, and it's easier to just report the real
-                        * CPR0 values.
-                        */
-                       switch (dcrn) {
-                       case DCRN_CPR0_CONFIG_ADDR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
-                               break;
-                       case DCRN_CPR0_CONFIG_DATA:
-                               local_irq_disable();
-                               mtdcr(DCRN_CPR0_CONFIG_ADDR,
-                                     vcpu->arch.cpr0_cfgaddr);
-                               vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
-                               local_irq_enable();
-                               break;
-                       default:
-                               run->dcr.dcrn = dcrn;
-                               run->dcr.data =  0;
-                               run->dcr.is_write = 0;
-                               vcpu->arch.io_gpr = rt;
-                               vcpu->arch.dcr_needed = 1;
-                               emulated = EMULATE_DO_DCR;
-                       }
-
-                       break;
-
                case 339:                                       /* mfspr */
                        sprn = get_sprn(inst);
                        rt = get_rt(inst);
@@ -373,26 +153,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
                        case SPRN_SRR1:
                                vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
-                       case SPRN_MMUCR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
-                       case SPRN_PID:
-                               vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
-                       case SPRN_IVPR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
-                       case SPRN_CCR0:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
-                       case SPRN_CCR1:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
                        case SPRN_PVR:
                                vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
-                       case SPRN_DEAR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
-                       case SPRN_ESR:
-                               vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
-                       case SPRN_DBCR0:
-                               vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
-                       case SPRN_DBCR1:
-                               vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
 
                        /* Note: mftb and TBRL/TBWL are user-accessible, so
                         * the guest can always access the real TB anyways.
@@ -413,42 +175,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        /* Note: SPRG4-7 are user-readable, so we don't get
                         * a trap. */
 
-                       case SPRN_IVOR0:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
-                       case SPRN_IVOR1:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
-                       case SPRN_IVOR2:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
-                       case SPRN_IVOR3:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
-                       case SPRN_IVOR4:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
-                       case SPRN_IVOR5:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
-                       case SPRN_IVOR6:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
-                       case SPRN_IVOR7:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
-                       case SPRN_IVOR8:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
-                       case SPRN_IVOR9:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
-                       case SPRN_IVOR10:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
-                       case SPRN_IVOR11:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
-                       case SPRN_IVOR12:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
-                       case SPRN_IVOR13:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
-                       case SPRN_IVOR14:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
-                       case SPRN_IVOR15:
-                               vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
-
                        default:
-                               printk("mfspr: unknown spr %x\n", sprn);
-                               vcpu->arch.gpr[rt] = 0;
+                               emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
+                               if (emulated == EMULATE_FAIL) {
+                                       printk("mfspr: unknown spr %x\n", sprn);
+                                       vcpu->arch.gpr[rt] = 0;
+                               }
                                break;
                        }
                        break;
@@ -478,25 +210,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        vcpu->arch.gpr[ra] = ea;
                        break;
 
-               case 451:                                       /* mtdcr */
-                       dcrn = get_dcrn(inst);
-                       rs = get_rs(inst);
-
-                       /* emulate some access in kernel */
-                       switch (dcrn) {
-                       case DCRN_CPR0_CONFIG_ADDR:
-                               vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
-                               break;
-                       default:
-                               run->dcr.dcrn = dcrn;
-                               run->dcr.data = vcpu->arch.gpr[rs];
-                               run->dcr.is_write = 1;
-                               vcpu->arch.dcr_needed = 1;
-                               emulated = EMULATE_DO_DCR;
-                       }
-
-                       break;
-
                case 467:                                       /* mtspr */
                        sprn = get_sprn(inst);
                        rs = get_rs(inst);
@@ -505,22 +218,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
                        case SPRN_SRR1:
                                vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_MMUCR:
-                               vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
-                       case SPRN_PID:
-                               kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
-                       case SPRN_CCR0:
-                               vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_CCR1:
-                               vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_DEAR:
-                               vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
-                       case SPRN_ESR:
-                               vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
-                       case SPRN_DBCR0:
-                               vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_DBCR1:
-                               vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
 
                        /* XXX We need to context-switch the timebase for
                         * watchdog and FIT. */
@@ -532,14 +229,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                kvmppc_emulate_dec(vcpu);
                                break;
 
-                       case SPRN_TSR:
-                               vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
-
-                       case SPRN_TCR:
-                               vcpu->arch.tcr = vcpu->arch.gpr[rs];
-                               kvmppc_emulate_dec(vcpu);
-                               break;
-
                        case SPRN_SPRG0:
                                vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
                        case SPRN_SPRG1:
@@ -549,56 +238,10 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        case SPRN_SPRG3:
                                vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
 
-                       /* Note: SPRG4-7 are user-readable. These values are
-                        * loaded into the real SPRGs when resuming the
-                        * guest. */
-                       case SPRN_SPRG4:
-                               vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_SPRG5:
-                               vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_SPRG6:
-                               vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
-                       case SPRN_SPRG7:
-                               vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
-
-                       case SPRN_IVPR:
-                               vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR0:
-                               vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR1:
-                               vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR2:
-                               vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR3:
-                               vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR4:
-                               vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR5:
-                               vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR6:
-                               vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR7:
-                               vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR8:
-                               vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR9:
-                               vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR10:
-                               vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR11:
-                               vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR12:
-                               vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR13:
-                               vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR14:
-                               vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
-                       case SPRN_IVOR15:
-                               vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
-
                        default:
-                               printk("mtspr: unknown spr %x\n", sprn);
-                               emulated = EMULATE_FAIL;
+                               emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
+                               if (emulated == EMULATE_FAIL)
+                                       printk("mtspr: unknown spr %x\n", sprn);
                                break;
                        }
                        break;
@@ -629,36 +272,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                                       4, 0);
                        break;
 
-               case 978:                                       /* tlbwe */
-                       emulated = kvmppc_emul_tlbwe(vcpu, inst);
-                       break;
-
-               case 914:       {                               /* tlbsx */
-                       int index;
-                       unsigned int as = get_mmucr_sts(vcpu);
-                       unsigned int pid = get_mmucr_stid(vcpu);
-
-                       rt = get_rt(inst);
-                       ra = get_ra(inst);
-                       rb = get_rb(inst);
-                       rc = get_rc(inst);
-
-                       ea = vcpu->arch.gpr[rb];
-                       if (ra)
-                               ea += vcpu->arch.gpr[ra];
-
-                       index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
-                       if (rc) {
-                               if (index < 0)
-                                       vcpu->arch.cr &= ~0x20000000;
-                               else
-                                       vcpu->arch.cr |= 0x20000000;
-                       }
-                       vcpu->arch.gpr[rt] = index;
-
-                       }
-                       break;
-
                case 790:                                       /* lhbrx */
                        rt = get_rt(inst);
                        emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
@@ -674,14 +287,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                                                       2, 0);
                        break;
 
-               case 966:                                       /* iccci */
-                       break;
-
                default:
-                       printk("unknown: op %d xop %d\n", get_op(inst),
-                               get_xop(inst));
+                       /* Attempt core-specific emulation below. */
                        emulated = EMULATE_FAIL;
-                       break;
                }
                break;
 
@@ -764,12 +372,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                break;
 
        default:
-               printk("unknown op %d\n", get_op(inst));
                emulated = EMULATE_FAIL;
-               break;
        }
 
-       KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
+       if (emulated == EMULATE_FAIL) {
+               emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance);
+               if (emulated == EMULATE_FAIL) {
+                       advance = 0;
+                       printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
+                              "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+               }
+       }
+
+       KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit);
 
        if (advance)
                vcpu->arch.pc += 4; /* Advance past emulated instruction. */
index 8bef0efcdfe19ba11b1f5d8d9f3561213f6a2a45..2822c8ccfaaf3ae036d6ce8da0fc16b4cf7c12a7 100644 (file)
@@ -28,9 +28,9 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include "timing.h"
 #include "../mm/mmu_decl.h"
 
-
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
        return gfn;
@@ -99,14 +99,7 @@ void kvm_arch_hardware_unsetup(void)
 
 void kvm_arch_check_processor_compat(void *rtn)
 {
-       int r;
-
-       if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
-               r = 0;
-       else
-               r = -ENOTSUPP;
-
-       *(int *)rtn = r;
+       *(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
 struct kvm *kvm_arch_create_vm(void)
@@ -144,9 +137,6 @@ int kvm_dev_ioctl_check_extension(long ext)
        int r;
 
        switch (ext) {
-       case KVM_CAP_USER_MEMORY:
-               r = 1;
-               break;
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
@@ -179,30 +169,15 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
        struct kvm_vcpu *vcpu;
-       int err;
-
-       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-       if (!vcpu) {
-               err = -ENOMEM;
-               goto out;
-       }
-
-       err = kvm_vcpu_init(vcpu, kvm, id);
-       if (err)
-               goto free_vcpu;
-
+       vcpu = kvmppc_core_vcpu_create(kvm, id);
+       kvmppc_create_vcpu_debugfs(vcpu, id);
        return vcpu;
-
-free_vcpu:
-       kmem_cache_free(kvm_vcpu_cache, vcpu);
-out:
-       return ERR_PTR(err);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
-       kvm_vcpu_uninit(vcpu);
-       kmem_cache_free(kvm_vcpu_cache, vcpu);
+       kvmppc_remove_vcpu_debugfs(vcpu);
+       kvmppc_core_vcpu_free(vcpu);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -212,16 +187,14 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       unsigned int priority = exception_priority[BOOKE_INTERRUPT_DECREMENTER];
-
-       return test_bit(priority, &vcpu->arch.pending_exceptions);
+       return kvmppc_core_pending_dec(vcpu);
 }
 
 static void kvmppc_decrementer_func(unsigned long data)
 {
        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
-       kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+       kvmppc_core_queue_dec(vcpu);
 
        if (waitqueue_active(&vcpu->wq)) {
                wake_up_interruptible(&vcpu->wq);
@@ -242,96 +215,25 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        kvmppc_core_destroy_mmu(vcpu);
 }
 
-/* Note: clearing MSR[DE] just means that the debug interrupt will not be
- * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
- * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
- * will be delivered as an "imprecise debug event" (which is indicated by
- * DBSR[IDE].
- */
-static void kvmppc_disable_debug_interrupts(void)
-{
-       mtmsr(mfmsr() & ~MSR_DE);
-}
-
-static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
-{
-       kvmppc_disable_debug_interrupts();
-
-       mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
-       mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
-       mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
-       mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
-       mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
-       mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
-       mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
-       mtmsr(vcpu->arch.host_msr);
-}
-
-static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
-{
-       struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-       u32 dbcr0 = 0;
-
-       vcpu->arch.host_msr = mfmsr();
-       kvmppc_disable_debug_interrupts();
-
-       /* Save host debug register state. */
-       vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
-       vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
-       vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
-       vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
-       vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
-       vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
-       vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
-
-       /* set registers up for guest */
-
-       if (dbg->bp[0]) {
-               mtspr(SPRN_IAC1, dbg->bp[0]);
-               dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
-       }
-       if (dbg->bp[1]) {
-               mtspr(SPRN_IAC2, dbg->bp[1]);
-               dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
-       }
-       if (dbg->bp[2]) {
-               mtspr(SPRN_IAC3, dbg->bp[2]);
-               dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
-       }
-       if (dbg->bp[3]) {
-               mtspr(SPRN_IAC4, dbg->bp[3]);
-               dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
-       }
-
-       mtspr(SPRN_DBCR0, dbcr0);
-       mtspr(SPRN_DBCR1, 0);
-       mtspr(SPRN_DBCR2, 0);
-}
-
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-       int i;
-
        if (vcpu->guest_debug.enabled)
-               kvmppc_load_guest_debug_registers(vcpu);
+               kvmppc_core_load_guest_debugstate(vcpu);
 
-       /* Mark every guest entry in the shadow TLB entry modified, so that they
-        * will all be reloaded on the next vcpu run (instead of being
-        * demand-faulted). */
-       for (i = 0; i <= tlb_44x_hwater; i++)
-               kvmppc_tlbe_set_modified(vcpu, i);
+       kvmppc_core_vcpu_load(vcpu, cpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        if (vcpu->guest_debug.enabled)
-               kvmppc_restore_host_debug_state(vcpu);
+               kvmppc_core_load_host_debugstate(vcpu);
 
        /* Don't leave guest TLB entries resident when being de-scheduled. */
        /* XXX It would be nice to differentiate between heavyweight exit and
         * sched_out here, since we could avoid the TLB flush for heavyweight
         * exits. */
        _tlbil_all();
+       kvmppc_core_vcpu_put(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -355,14 +257,14 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
                                      struct kvm_run *run)
 {
-       u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+       ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
        *gpr = run->dcr.data;
 }
 
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-       u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+       ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
 
        if (run->mmio.len > sizeof(*gpr)) {
                printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -460,7 +362,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
                vcpu->arch.dcr_needed = 0;
        }
 
-       kvmppc_check_and_deliver_interrupts(vcpu);
+       kvmppc_core_deliver_interrupts(vcpu);
 
        local_irq_disable();
        kvm_guest_enter();
@@ -478,7 +380,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
-       kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+       kvmppc_core_queue_external(vcpu, irq);
 
        if (waitqueue_active(&vcpu->wq)) {
                wake_up_interruptible(&vcpu->wq);
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
new file mode 100644 (file)
index 0000000..47ee603
--- /dev/null
@@ -0,0 +1,239 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include <asm/time.h>
+#include <asm-generic/div64.h>
+
+#include "timing.h"
+
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       /* pause guest execution to avoid concurrent updates */
+       local_irq_disable();
+       mutex_lock(&vcpu->mutex);
+
+       vcpu->arch.last_exit_type = 0xDEAD;
+       for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+               vcpu->arch.timing_count_type[i] = 0;
+               vcpu->arch.timing_max_duration[i] = 0;
+               vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF;
+               vcpu->arch.timing_sum_duration[i] = 0;
+               vcpu->arch.timing_sum_quad_duration[i] = 0;
+       }
+       vcpu->arch.timing_last_exit = 0;
+       vcpu->arch.timing_exit.tv64 = 0;
+       vcpu->arch.timing_last_enter.tv64 = 0;
+
+       mutex_unlock(&vcpu->mutex);
+       local_irq_enable();
+}
+
+static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
+{
+       u64 old;
+
+       do_div(duration, tb_ticks_per_usec);
+       if (unlikely(duration > 0xFFFFFFFF)) {
+               printk(KERN_ERR"%s - duration too big -> overflow"
+                       " duration %lld type %d exit #%d\n",
+                       __func__, duration, type,
+                       vcpu->arch.timing_count_type[type]);
+               return;
+       }
+
+       vcpu->arch.timing_count_type[type]++;
+
+       /* sum */
+       old = vcpu->arch.timing_sum_duration[type];
+       vcpu->arch.timing_sum_duration[type] += duration;
+       if (unlikely(old > vcpu->arch.timing_sum_duration[type])) {
+               printk(KERN_ERR"%s - wrap adding sum of durations"
+                       " old %lld new %lld type %d exit # of type %d\n",
+                       __func__, old, vcpu->arch.timing_sum_duration[type],
+                       type, vcpu->arch.timing_count_type[type]);
+       }
+
+       /* square sum */
+       old = vcpu->arch.timing_sum_quad_duration[type];
+       vcpu->arch.timing_sum_quad_duration[type] += (duration*duration);
+       if (unlikely(old > vcpu->arch.timing_sum_quad_duration[type])) {
+               printk(KERN_ERR"%s - wrap adding sum of squared durations"
+                       " old %lld new %lld type %d exit # of type %d\n",
+                       __func__, old,
+                       vcpu->arch.timing_sum_quad_duration[type],
+                       type, vcpu->arch.timing_count_type[type]);
+       }
+
+       /* set min/max */
+       if (unlikely(duration < vcpu->arch.timing_min_duration[type]))
+               vcpu->arch.timing_min_duration[type] = duration;
+       if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
+               vcpu->arch.timing_max_duration[type] = duration;
+}
+
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
+{
+       u64 exit = vcpu->arch.timing_last_exit;
+       u64 enter = vcpu->arch.timing_last_enter.tv64;
+
+       /* save exit time, used next exit when the reenter time is known */
+       vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64;
+
+       if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0))
+               return; /* skip incomplete cycle (e.g. after reset) */
+
+       /* update statistics for average and standard deviation */
+       add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type);
+       /* enter -> timing_last_exit is time spent in guest - log this too */
+       add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter),
+                       TIMEINGUEST);
+}
+
+static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
+       [MMIO_EXITS] =              "MMIO",
+       [DCR_EXITS] =               "DCR",
+       [SIGNAL_EXITS] =            "SIGNAL",
+       [ITLB_REAL_MISS_EXITS] =    "ITLBREAL",
+       [ITLB_VIRT_MISS_EXITS] =    "ITLBVIRT",
+       [DTLB_REAL_MISS_EXITS] =    "DTLBREAL",
+       [DTLB_VIRT_MISS_EXITS] =    "DTLBVIRT",
+       [SYSCALL_EXITS] =           "SYSCALL",
+       [ISI_EXITS] =               "ISI",
+       [DSI_EXITS] =               "DSI",
+       [EMULATED_INST_EXITS] =     "EMULINST",
+       [EMULATED_MTMSRWE_EXITS] =  "EMUL_WAIT",
+       [EMULATED_WRTEE_EXITS] =    "EMUL_WRTEE",
+       [EMULATED_MTSPR_EXITS] =    "EMUL_MTSPR",
+       [EMULATED_MFSPR_EXITS] =    "EMUL_MFSPR",
+       [EMULATED_MTMSR_EXITS] =    "EMUL_MTMSR",
+       [EMULATED_MFMSR_EXITS] =    "EMUL_MFMSR",
+       [EMULATED_TLBSX_EXITS] =    "EMUL_TLBSX",
+       [EMULATED_TLBWE_EXITS] =    "EMUL_TLBWE",
+       [EMULATED_RFI_EXITS] =      "EMUL_RFI",
+       [DEC_EXITS] =               "DEC",
+       [EXT_INTR_EXITS] =          "EXTINT",
+       [HALT_WAKEUP] =             "HALT",
+       [USR_PR_INST] =             "USR_PR_INST",
+       [FP_UNAVAIL] =              "FP_UNAVAIL",
+       [DEBUG_EXITS] =             "DEBUG",
+       [TIMEINGUEST] =             "TIMEINGUEST"
+};
+
+static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
+{
+       struct kvm_vcpu *vcpu = m->private;
+       int i;
+
+       seq_printf(m, "%s", "type       count   min     max     sum     sum_squared\n");
+
+       for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+               seq_printf(m, "%12s     %10d    %10lld  %10lld  %20lld  %20lld\n",
+                       kvm_exit_names[i],
+                       vcpu->arch.timing_count_type[i],
+                       vcpu->arch.timing_min_duration[i],
+                       vcpu->arch.timing_max_duration[i],
+                       vcpu->arch.timing_sum_duration[i],
+                       vcpu->arch.timing_sum_quad_duration[i]);
+       }
+       return 0;
+}
+
+/* Write 'c' to clear the timing statistics. */
+static ssize_t kvmppc_exit_timing_write(struct file *file,
+                                      const char __user *user_buf,
+                                      size_t count, loff_t *ppos)
+{
+       int err = -EINVAL;
+       char c;
+
+       if (count > 1) {
+               goto done;
+       }
+
+       if (get_user(c, user_buf)) {
+               err = -EFAULT;
+               goto done;
+       }
+
+       if (c == 'c') {
+               struct seq_file *seqf = (struct seq_file *)file->private_data;
+               struct kvm_vcpu *vcpu = seqf->private;
+               /* Write does not affect our buffers previously generated with
+                * show. seq_file is locked here to prevent races of init with
+                * a show call */
+               mutex_lock(&seqf->lock);
+               kvmppc_init_timing_stats(vcpu);
+               mutex_unlock(&seqf->lock);
+               err = count;
+       }
+
+done:
+       return err;
+}
+
+static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, kvmppc_exit_timing_show, inode->i_private);
+}
+
+static struct file_operations kvmppc_exit_timing_fops = {
+       .owner   = THIS_MODULE,
+       .open    = kvmppc_exit_timing_open,
+       .read    = seq_read,
+       .write   = kvmppc_exit_timing_write,
+       .llseek  = seq_lseek,
+       .release = single_release,
+};
+
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
+{
+       static char dbg_fname[50];
+       struct dentry *debugfs_file;
+
+       snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing",
+                current->pid, id);
+       debugfs_file = debugfs_create_file(dbg_fname, 0666,
+                                       kvm_debugfs_dir, vcpu,
+                                       &kvmppc_exit_timing_fops);
+
+       if (!debugfs_file) {
+               printk(KERN_ERR"%s: error creating debugfs file %s\n",
+                       __func__, dbg_fname);
+               return;
+       }
+
+       vcpu->arch.debugfs_exit_timing = debugfs_file;
+}
+
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.debugfs_exit_timing) {
+               debugfs_remove(vcpu->arch.debugfs_exit_timing);
+               vcpu->arch.debugfs_exit_timing = NULL;
+       }
+}
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
new file mode 100644 (file)
index 0000000..bb13b1f
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#ifndef __POWERPC_KVM_EXITTIMING_H__
+#define __POWERPC_KVM_EXITTIMING_H__
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id);
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type)
+{
+       vcpu->arch.last_exit_type = type;
+}
+
+#else
+/* if exit timing is not configured there is no need to build the c file */
+static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu,
+                                               unsigned int id) {}
+static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
+#endif /* CONFIG_KVM_EXIT_TIMING */
+
+/* account the exit in kvm_stats */
+static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
+{
+       /* type has to be known at build time for optimization */
+       BUILD_BUG_ON(__builtin_constant_p(type));
+       switch (type) {
+       case EXT_INTR_EXITS:
+               vcpu->stat.ext_intr_exits++;
+               break;
+       case DEC_EXITS:
+               vcpu->stat.dec_exits++;
+               break;
+       case EMULATED_INST_EXITS:
+               vcpu->stat.emulated_inst_exits++;
+               break;
+       case DCR_EXITS:
+               vcpu->stat.dcr_exits++;
+               break;
+       case DSI_EXITS:
+               vcpu->stat.dsi_exits++;
+               break;
+       case ISI_EXITS:
+               vcpu->stat.isi_exits++;
+               break;
+       case SYSCALL_EXITS:
+               vcpu->stat.syscall_exits++;
+               break;
+       case DTLB_REAL_MISS_EXITS:
+               vcpu->stat.dtlb_real_miss_exits++;
+               break;
+       case DTLB_VIRT_MISS_EXITS:
+               vcpu->stat.dtlb_virt_miss_exits++;
+               break;
+       case MMIO_EXITS:
+               vcpu->stat.mmio_exits++;
+               break;
+       case ITLB_REAL_MISS_EXITS:
+               vcpu->stat.itlb_real_miss_exits++;
+               break;
+       case ITLB_VIRT_MISS_EXITS:
+               vcpu->stat.itlb_virt_miss_exits++;
+               break;
+       case SIGNAL_EXITS:
+               vcpu->stat.signal_exits++;
+               break;
+       }
+}
+
+/* wrapper to set exit time and account for it in kvm_stats */
+static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type)
+{
+       kvmppc_set_exit_type(vcpu, type);
+       kvmppc_account_exit_stat(vcpu, type);
+}
+
+#endif /* __POWERPC_KVM_EXITTIMING_H__ */
index f7a69021b7bf8ed9832a94527a48b632dc2477ee..84e058f1e1cc805516157d9a2c503168149e2084 100644 (file)
@@ -332,7 +332,7 @@ static void xics_eoi_lpar(unsigned int virq)
        lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
        unsigned int irq;
        int status;
@@ -870,7 +870,7 @@ void xics_migrate_irqs_away(void)
 
                /* Reset affinity to all cpus */
                irq_desc[virq].affinity = CPU_MASK_ALL;
-               desc->chip->set_affinity(virq, CPU_MASK_ALL);
+               desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
                spin_unlock_irqrestore(&desc->lock, flags);
        }
index c82babb70074963bf4c37f3ead2d0feb4be9bf56..3e0d89dcdba2a5b4950a63a9cd79534f8e3702cf 100644 (file)
@@ -806,7 +806,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
+void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct mpic *mpic = mpic_from_irq(irq);
        unsigned int src = mpic_irq_to_hw(irq);
@@ -818,7 +818,7 @@ void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
        } else {
                cpumask_t tmp;
 
-               cpus_and(tmp, cpumask, cpu_online_map);
+               cpumask_and(&tmp, cpumask, cpu_online_mask);
 
                mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
                               mpic_physmask(cpus_addr(tmp)[0]));
index 6209c62a426d1177cb574a1bf92f0dc6f7ed7b43..3cef2af10f4254daea1ff69769117ffb1f8a23b3 100644 (file)
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
index 8152fefc97b919469f8da42ea2c9b967dd55f049..19577aeffd7b42799eb576d4f882fcc969a28ae2 100644 (file)
@@ -83,6 +83,7 @@ config S390
        select HAVE_KRETPROBES
        select HAVE_KVM if 64BIT
        select HAVE_ARCH_TRACEHOOK
+       select INIT_ALL_POSSIBLE
 
 source "init/Kconfig"
 
index 6fc78541dc57fee528a712f1cc3a259db97e23de..3ed5c7a83c6c477e242ab411de06f7af35121f7a 100644 (file)
 struct _lowcore *lowcore_ptr[NR_CPUS];
 EXPORT_SYMBOL(lowcore_ptr);
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-
 static struct task_struct *current_set[NR_CPUS];
 
 static u8 smp_cpu_type;
index 5be981a36c3ecadcb66db97b0bd3d88834d1081e..d649600df5b9a887c17f6770ad878e8820c51d92 100644 (file)
@@ -160,7 +160,7 @@ void init_cpu_timer(void)
        cd->min_delta_ns        = 1;
        cd->max_delta_ns        = LONG_MAX;
        cd->rating              = 400;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = s390_next_event;
        cd->set_mode            = s390_set_mode;
 
index 8b00eb2ddf57459c9469d8455d0762d365a71c1f..be8497186b96d86472f1ae692c9c74e7c88515f4 100644 (file)
@@ -113,8 +113,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
 int kvm_dev_ioctl_check_extension(long ext)
 {
        switch (ext) {
-       case KVM_CAP_USER_MEMORY:
-               return 1;
        default:
                return 0;
        }
@@ -185,8 +183,6 @@ struct kvm *kvm_arch_create_vm(void)
        debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
        VM_EVENT(kvm, 3, "%s", "vm created");
 
-       try_module_get(THIS_MODULE);
-
        return kvm;
 out_nodbf:
        free_page((unsigned long)(kvm->arch.sca));
@@ -196,13 +192,33 @@ out_nokvm:
        return ERR_PTR(rc);
 }
 
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+       free_page((unsigned long)(vcpu->arch.sie_block));
+       kvm_vcpu_uninit(vcpu);
+       kfree(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               if (kvm->vcpus[i]) {
+                       kvm_arch_vcpu_destroy(kvm->vcpus[i]);
+                       kvm->vcpus[i] = NULL;
+               }
+       }
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-       debug_unregister(kvm->arch.dbf);
+       kvm_free_vcpus(kvm);
        kvm_free_physmem(kvm);
        free_page((unsigned long)(kvm->arch.sca));
+       debug_unregister(kvm->arch.dbf);
        kfree(kvm);
-       module_put(THIS_MODULE);
 }
 
 /* Section: vcpu related */
@@ -213,8 +229,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
-       /* kvm common code refers to this, but does'nt call it */
-       BUG();
+       /* Nothing todo */
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -308,8 +323,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
                 vcpu->arch.sie_block);
 
-       try_module_get(THIS_MODULE);
-
        return vcpu;
 out_free_cpu:
        kfree(vcpu);
@@ -317,14 +330,6 @@ out_nomem:
        return ERR_PTR(rc);
 }
 
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-       VCPU_EVENT(vcpu, 3, "%s", "destroy cpu");
-       free_page((unsigned long)(vcpu->arch.sie_block));
-       kfree(vcpu);
-       module_put(THIS_MODULE);
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        /* kvm common code refers to this, but never calls it */
index 85b660c17eb070e230b7ce3e0dcac1c7f7ab27c8..c24e9c6a173661a307e2b2f90b9dc3ea961551ed 100644 (file)
@@ -31,7 +31,7 @@ enum {
 };
 
 void smp_message_recv(unsigned int msg);
-void smp_timer_broadcast(cpumask_t mask);
+void smp_timer_broadcast(const struct cpumask *mask);
 
 void local_timer_interrupt(void);
 void local_timer_setup(unsigned int cpu);
index 95f0085e098a6859d746581b3b4b616439062854..279d9cc4a007672a403dcfce9eb4d8fe5ab25114 100644 (file)
@@ -5,7 +5,6 @@
 
 /* sched_domains SD_NODE_INIT for sh machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 3c5ad1660bbcdd6b0366934167b3dc879cdb026d..8f40274126142847f99e7c51f71e13a22813b2cf 100644 (file)
 int __cpu_number_map[NR_CPUS];         /* Map physical to logical */
 int __cpu_logical_map[NR_CPUS];                /* Map logical to physical */
 
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 static inline void __init smp_store_cpu_info(unsigned int cpu)
 {
        struct sh_cpuinfo *c = cpu_data + cpu;
@@ -190,11 +184,11 @@ void arch_send_call_function_single_ipi(int cpu)
        plat_send_ipi(cpu, SMP_MSG_FUNCTION_SINGLE);
 }
 
-void smp_timer_broadcast(cpumask_t mask)
+void smp_timer_broadcast(const struct cpumask *mask)
 {
        int cpu;
 
-       for_each_cpu_mask(cpu, mask)
+       for_each_cpu(cpu, mask)
                plat_send_ipi(cpu, SMP_MSG_TIMER);
 }
 
index c2317635230fda90f2b97ed2c7903b154a2ab175..96e8eaea1e62003603fd2343a09277022fb518a5 100644 (file)
@@ -51,7 +51,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
        clk->mult               = 1;
        clk->set_mode           = dummy_timer_set_mode;
        clk->broadcast          = smp_timer_broadcast;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
 
        clockevents_register_device(clk);
 }
index 3c61ddd4d43e0459c3cd4355133a486f328bea50..0db3f95103368fa4c236e4666b38c828d25ac3a2 100644 (file)
@@ -263,7 +263,7 @@ static int tmu_timer_init(void)
        tmu0_clockevent.min_delta_ns =
                        clockevent_delta2ns(1, &tmu0_clockevent);
 
-       tmu0_clockevent.cpumask = cpumask_of_cpu(0);
+       tmu0_clockevent.cpumask = cpumask_of(0);
 
        clockevents_register_device(&tmu0_clockevent);
 
index a8180e546a48f87a6d5b9a7123d9ed91a3b51a56..8408d9d2a662264c0f71184c22fd62f4dc2adb92 100644 (file)
@@ -29,8 +29,6 @@
  */
 
 extern unsigned char boot_cpu_id;
-extern cpumask_t phys_cpu_present_map;
-#define cpu_possible_map phys_cpu_present_map
 
 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long,
                       unsigned long, unsigned long);
index a3ea2bcb95de6a39ebb7bf297ee2355f507f178c..cab8e02868716d691a38b9bad239ec754dd39134 100644 (file)
@@ -312,7 +312,8 @@ static void sun4u_irq_enable(unsigned int virt_irq)
        }
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4u_set_affinity(unsigned int virt_irq,
+                              const struct cpumask *mask)
 {
        sun4u_irq_enable(virt_irq);
 }
@@ -362,7 +363,8 @@ static void sun4v_irq_enable(unsigned int virt_irq)
                       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_set_affinity(unsigned int virt_irq,
+                              const struct cpumask *mask)
 {
        unsigned int ino = virt_irq_table[virt_irq].dev_ino;
        unsigned long cpuid = irq_choose_cpu(virt_irq);
@@ -429,7 +431,8 @@ static void sun4v_virq_enable(unsigned int virt_irq)
                       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_virt_set_affinity(unsigned int virt_irq,
+                                   const struct cpumask *mask)
 {
        unsigned long cpuid, dev_handle, dev_ino;
        int err;
@@ -851,7 +854,7 @@ void fixup_irqs(void)
                    !(irq_desc[irq].status & IRQ_PER_CPU)) {
                        if (irq_desc[irq].chip->set_affinity)
                                irq_desc[irq].chip->set_affinity(irq,
-                                       irq_desc[irq].affinity);
+                                       &irq_desc[irq].affinity);
                }
                spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
        }
index 46e231f7c5ce2c37cf51b6111a0175f9dcc4280e..322046cdf85f2298da755944d381537c585188be 100644 (file)
@@ -780,7 +780,7 @@ out:
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);
 
-               irq_set_affinity(irq, numa_mask);
+               irq_set_affinity(irq, &numa_mask);
        }
 
        return irq;
index 2e680f34f727fa61f5defef92e63b8144e365d70..0d0cd815e83e505b24e4b12a14f276fa37be892a 100644 (file)
@@ -288,7 +288,7 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);
 
-               irq_set_affinity(irq, numa_mask);
+               irq_set_affinity(irq, &numa_mask);
        }
        err = request_irq(irq, sparc64_msiq_interrupt, 0,
                          "MSIQ",
index e396c1f17a922deaef7b03a0751d06f1345770fd..1e5ac4e282e1285030aaa43380b91c01b7bcff48 100644 (file)
@@ -39,8 +39,6 @@ volatile unsigned long cpu_callin_map[NR_CPUS] __cpuinitdata = {0,};
 unsigned char boot_cpu_id = 0;
 unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
 cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 
 /* The only guaranteed locking primitive available on all Sparc
@@ -334,7 +332,7 @@ void __init smp_setup_cpu_possible_map(void)
        instance = 0;
        while (!cpu_find_by_instance(instance, NULL, &mid)) {
                if (mid < NR_CPUS) {
-                       cpu_set(mid, phys_cpu_present_map);
+                       cpu_set(mid, cpu_possible_map);
                        cpu_set(mid, cpu_present_map);
                }
                instance++;
@@ -354,7 +352,7 @@ void __init smp_prepare_boot_cpu(void)
 
        current_thread_info()->cpu = cpuid;
        cpu_set(cpuid, cpu_online_map);
-       cpu_set(cpuid, phys_cpu_present_map);
+       cpu_set(cpuid, cpu_possible_map);
 }
 
 int __cpuinit __cpu_up(unsigned int cpu)
index bfe99d82d458702d32bf52a863cf00345e04a8e2..46329799f3462bb4002024558be74f204e679f07 100644 (file)
 
 int sparc64_multi_core __read_mostly;
 
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
        { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 
-EXPORT_SYMBOL(cpu_possible_map);
-EXPORT_SYMBOL(cpu_online_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_SYMBOL(cpu_core_map);
 
index a4d45fc29b21e4ac9c9c069bdab7b8c277c2f6f5..e1e97639231b208e8b01ee99373771020b49ec2f 100644 (file)
@@ -112,10 +112,6 @@ EXPORT_PER_CPU_SYMBOL(__cpu_data);
 #ifdef CONFIG_SMP
 /* IRQ implementation. */
 EXPORT_SYMBOL(synchronize_irq);
-
-/* CPU online map and active count. */
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(phys_cpu_present_map);
 #endif
 
 EXPORT_SYMBOL(__udelay);
index 141da375909129dea0ab0fffc9d5359d85e9a039..9df8f095a8b11a59e448bec001ac6aec852c480c 100644 (file)
@@ -763,7 +763,7 @@ void __devinit setup_sparc64_timer(void)
        sevt = &__get_cpu_var(sparc64_events);
 
        memcpy(sevt, &sparc64_clockevent, sizeof(*sevt));
-       sevt->cpumask = cpumask_of_cpu(smp_processor_id());
+       sevt->cpumask = cpumask_of(smp_processor_id());
 
        clockevents_register_device(sevt);
 }
index 045772142844690f2471d9330b836da594c945f8..98351c78bc814de7062dc5e9c213810ee4811de5 100644 (file)
@@ -25,13 +25,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 #include "irq_user.h"
 #include "os.h"
 
-/* CPU online map, set by smp_boot_cpus */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
-
 /* Per CPU bogomips and other parameters
  * The only piece used here is the ipi pipe, which is set before SMP is
  * started and never changed.
index 47f04f4a3464a918850b67573e1a67a7cf5c3dec..b13a87a3ec95cf7754fe0216660a2695e40b4d1d 100644 (file)
@@ -50,7 +50,7 @@ static int itimer_next_event(unsigned long delta,
 static struct clock_event_device itimer_clockevent = {
        .name           = "itimer",
        .rating         = 250,
-       .cpumask        = CPU_MASK_ALL,
+       .cpumask        = cpu_all_mask,
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .set_mode       = itimer_set_mode,
        .set_next_event = itimer_next_event,
index 0f44add3e0b7850e676cf8cc45d8514ebc94a5ea..249d1e0824b523e0a1e51a613a86a497649cc089 100644 (file)
@@ -601,19 +601,20 @@ config IOMMU_HELPER
 
 config MAXSMP
        bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-       depends on X86_64 && SMP && BROKEN
+       depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+       select CPUMASK_OFFSTACK
        default n
        help
          Configure maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
 
 config NR_CPUS
-       int "Maximum number of CPUs (2-512)" if !MAXSMP
-       range 2 512
-       depends on SMP
+       int "Maximum number of CPUs" if SMP && !MAXSMP
+       range 2 512 if SMP && !MAXSMP
+       default "1" if !SMP
        default "4096" if MAXSMP
-       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
-       default "8"
+       default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "8" if SMP
        help
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  The maximum supported value is 512 and the
index b195f85526e35813e08e93b2553587a37896af2f..9dabd00e98055c8d21526e6f86a63deced16a98b 100644 (file)
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ia32.h>
 #include <asm/ptrace.h>
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
 #include <asm/sigcontext32.h>
 #include <asm/proto.h>
 #include <asm/vdso.h>
-
 #include <asm/sigframe.h>
+#include <asm/sys_ia32.h>
 
 #define DEBUG_SIG 0
 
index d21991ce606cc54dc4a3c71745049364bda26582..29cdcd02ead30524c4394a7b558cc3223ffc986c 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/shm.h>
 #include <linux/ipc.h>
 #include <linux/compat.h>
+#include <asm/sys_ia32.h>
 
 asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
                          compat_uptr_t ptr, u32 fifth)
index 2e09dcd3c0a6034d46e3345efa40736e49941d13..6c0d7f6231af26e3eb3fb26821d4dbf493233fc2 100644 (file)
@@ -44,8 +44,8 @@
 #include <asm/types.h>
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
-#include <asm/ia32.h>
 #include <asm/vgtod.h>
+#include <asm/sys_ia32.h>
 
 #define AA(__x)                ((unsigned long)(__x))
 
index 25caa0738af5f9d99399d3c2f36405bc62d7115b..ab1d51a8855e18e80ca4cb24b9ef330e749c13dd 100644 (file)
@@ -54,7 +54,6 @@ extern int disable_apic;
 extern int is_vsmp_box(void);
 extern void xapic_wait_icr_idle(void);
 extern u32 safe_xapic_wait_icr_idle(void);
-extern u64 xapic_icr_read(void);
 extern void xapic_icr_write(u32, u32);
 extern int setup_profiling_timer(unsigned int);
 
@@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg)
 }
 
 #ifndef CONFIG_X86_32
-extern int x2apic, x2apic_preenabled;
+extern int x2apic;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void enable_IR_x2apic(void);
index ce547f24a1cd30da5da6a9fc088d4a94cae2f7c8..d8dd9f537911fb2a6e98f5c7d68cd2121cfd6ecc 100644 (file)
@@ -9,12 +9,12 @@ static inline int apic_id_registered(void)
        return (1);
 }
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
 #ifdef CONFIG_SMP
-        return cpu_online_map;
+       return &cpu_online_map;
 #else
-        return cpumask_of_cpu(0);
+       return &cpumask_of_cpu(0);
 #endif
 }
 
@@ -79,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid)
 
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-       if (mps_cpu < NR_CPUS)
+       if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
 
        return BAD_APICID;
@@ -94,7 +94,7 @@ extern u8 cpu_2_logical_apicid[];
 /* Mapping from cpu number to logical apicid */
 static inline int cpu_to_logical_apicid(int cpu)
 {
-       if (cpu >= NR_CPUS)
+       if (cpu >= nr_cpu_ids)
                return BAD_APICID;
        return cpu_physical_id(cpu);
 }
@@ -119,16 +119,34 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
 }
 
 /* As we are using single CPU as destination, pick only one CPU here */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int cpu;
        int apicid;     
 
-       cpu = first_cpu(cpumask);
+       cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        return apicid;
 }
 
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask)
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       if (cpu < nr_cpu_ids)
+               return cpu_to_logical_apicid(cpu);
+
+       return BAD_APICID;
+}
+
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
 {
        return cpuid_apic >> index_msb;
index 9404c535b7ecbbf6f496d0cd80f4eb266a529208..27fcd01b3ae6054ddf5c1b5c4145b4f9d913576e 100644 (file)
@@ -1,25 +1,22 @@
 #ifndef __ASM_MACH_IPI_H
 #define __ASM_MACH_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_MACH_IPI_H */
index e6b82b17b0721095cb79fced0888d320eb64230a..dc27705f5443268cc69857590fde7e1d369f97a2 100644 (file)
@@ -320,16 +320,14 @@ static inline void set_intr_gate(unsigned int n, void *addr)
        _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
 }
 
-#define SYS_VECTOR_FREE                0
-#define SYS_VECTOR_ALLOCED     1
-
 extern int first_system_vector;
-extern char system_vectors[];
+/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
+extern unsigned long used_vectors[];
 
 static inline void alloc_system_vector(int vector)
 {
-       if (system_vectors[vector] == SYS_VECTOR_FREE) {
-               system_vectors[vector] = SYS_VECTOR_ALLOCED;
+       if (!test_bit(vector, used_vectors)) {
+               set_bit(vector, used_vectors);
                if (first_system_vector > vector)
                        first_system_vector = vector;
        } else
index a2e545c91c359682e4583f1e4bdabd76ed91e7ed..ca5ffb2856b6810ec399a48ec98cc36d020edd69 100644 (file)
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
 
 #endif /* CONFIG_X86_32 */
 
+extern int add_efi_memmap;
 extern void efi_reserve_early(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
index e24ef876915f8833cd996c1a879b1b3ecaa54826..51ac1230294e53561e176904a5d37d05bdfa712b 100644 (file)
@@ -9,14 +9,14 @@ static inline int apic_id_registered(void)
                return (1);
 }
 
-static inline cpumask_t target_cpus_cluster(void)
+static inline const cpumask_t *target_cpus_cluster(void)
 {
-       return CPU_MASK_ALL;
+       return &CPU_MASK_ALL;
 }
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
-       return cpumask_of_cpu(smp_processor_id());
+       return &cpumask_of_cpu(smp_processor_id());
 }
 
 #define APIC_DFR_VALUE_CLUSTER         (APIC_DFR_CLUSTER)
@@ -80,9 +80,10 @@ extern int apic_version [MAX_APICS];
 static inline void setup_apic_routing(void)
 {
        int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-       printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
+       printk("Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n",
                (apic_version[apic] == 0x14) ?
-               "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
+                       "Physical Cluster" : "Logical Cluster",
+                       nr_ioapics, cpus_addr(*target_cpus())[0]);
 }
 
 static inline int multi_timer_check(int apic, int irq)
@@ -100,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu)
 {
        if (!mps_cpu)
                return boot_cpu_physical_apicid;
-       else if (mps_cpu < NR_CPUS)
+       else if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
@@ -120,9 +121,9 @@ extern u8 cpu_2_logical_apicid[];
 static inline int cpu_to_logical_apicid(int cpu)
 {
 #ifdef CONFIG_SMP
-       if (cpu >= NR_CPUS)
-              return BAD_APICID;
-       return (int)cpu_2_logical_apicid[cpu];
+       if (cpu >= nr_cpu_ids)
+               return BAD_APICID;
+       return (int)cpu_2_logical_apicid[cpu];
 #else
        return logical_smp_processor_id();
 #endif
@@ -146,14 +147,15 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
        return (1);
 }
 
-static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
+static inline unsigned int
+cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
 
-       num_bits_set = cpus_weight(cpumask);
+       num_bits_set = cpumask_weight(cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return 0xFF;
@@ -161,10 +163,10 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first(cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-               if (cpu_isset(cpu, cpumask)) {
+               if (cpumask_test_cpu(cpu, cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -179,14 +181,14 @@ static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
        return apicid;
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
 
-       num_bits_set = cpus_weight(cpumask);
+       num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return cpu_to_logical_apicid(0);
@@ -194,10 +196,52 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-       cpu = first_cpu(cpumask);
+       cpu = first_cpu(*cpumask);
+       apicid = cpu_to_logical_apicid(cpu);
+       while (cpus_found < num_bits_set) {
+               if (cpu_isset(cpu, *cpumask)) {
+                       int new_apicid = cpu_to_logical_apicid(cpu);
+                       if (apicid_cluster(apicid) !=
+                                       apicid_cluster(new_apicid)){
+                               printk ("%s: Not a valid mask!\n", __func__);
+                               return cpu_to_logical_apicid(0);
+                       }
+                       apicid = new_apicid;
+                       cpus_found++;
+               }
+               cpu++;
+       }
+       return apicid;
+}
+
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
+                                                 const struct cpumask *andmask)
+{
+       int num_bits_set;
+       int cpus_found = 0;
+       int cpu;
+       int apicid = cpu_to_logical_apicid(0);
+       cpumask_var_t cpumask;
+
+       if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+               return apicid;
+
+       cpumask_and(cpumask, inmask, andmask);
+       cpumask_and(cpumask, cpumask, cpu_online_mask);
+
+       num_bits_set = cpumask_weight(cpumask);
+       /* Return id to all */
+       if (num_bits_set == NR_CPUS)
+               goto exit;
+       /*
+        * The cpus in the mask must all be on the apic cluster.  If are not
+        * on the same apicid cluster return default value of TARGET_CPUS.
+        */
+       cpu = cpumask_first(cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-               if (cpu_isset(cpu, cpumask)) {
+               if (cpumask_test_cpu(cpu, cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -209,6 +253,8 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
                }
                cpu++;
        }
+exit:
+       free_cpumask_var(cpumask);
        return apicid;
 }
 
index 632a955fcc0a937b870f4be05322943928ccc754..7e8ed24d4b8a9895e824c0de34982e8844521827 100644 (file)
@@ -1,24 +1,22 @@
 #ifndef __ASM_ES7000_IPI_H
 #define __ASM_ES7000_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_ES7000_IPI_H */
index 0ac17d33a8c7114a015ba2bb4971e3b5e381a6da..746f37a7963adf9b9a7d91e197c1e0c50ce7b68a 100644 (file)
@@ -24,7 +24,7 @@ struct genapic {
        int (*probe)(void);
 
        int (*apic_id_registered)(void);
-       cpumask_t (*target_cpus)(void);
+       const struct cpumask *(*target_cpus)(void);
        int int_delivery_mode;
        int int_dest_mode;
        int ESR_DISABLE;
@@ -57,12 +57,16 @@ struct genapic {
 
        unsigned (*get_apic_id)(unsigned long x);
        unsigned long apic_id_mask;
-       unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
-       cpumask_t (*vector_allocation_domain)(int cpu);
+       unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+       unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                              const struct cpumask *andmask);
+       void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
 
 #ifdef CONFIG_SMP
        /* ipi */
-       void (*send_IPI_mask)(cpumask_t mask, int vector);
+       void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+       void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                        int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
 #endif
@@ -114,6 +118,7 @@ struct genapic {
        APICFUNC(get_apic_id)                           \
        .apic_id_mask = APIC_ID_MASK,                   \
        APICFUNC(cpu_mask_to_apicid)                    \
+       APICFUNC(cpu_mask_to_apicid_and)                \
        APICFUNC(vector_allocation_domain)              \
        APICFUNC(acpi_madt_oem_check)                   \
        IPIFUNC(send_IPI_mask)                          \
index 2cae011668b7e09a1397e09b7fc9241f63b48d1d..adf32fb56aa638d2aa160d25986f7c93ac6d3ba0 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_GENAPIC_64_H
 #define _ASM_X86_GENAPIC_64_H
 
+#include <linux/cpumask.h>
+
 /*
  * Copyright 2004 James Cleverdon, IBM.
  * Subject to the GNU Public License, v.2
@@ -18,16 +20,20 @@ struct genapic {
        u32 int_delivery_mode;
        u32 int_dest_mode;
        int (*apic_id_registered)(void);
-       cpumask_t (*target_cpus)(void);
-       cpumask_t (*vector_allocation_domain)(int cpu);
+       const struct cpumask *(*target_cpus)(void);
+       void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
        void (*init_apic_ldr)(void);
        /* ipi */
-       void (*send_IPI_mask)(cpumask_t mask, int vector);
+       void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+       void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                        int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
        void (*send_IPI_self)(int vector);
        /* */
-       unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+       unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+       unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                              const struct cpumask *andmask);
        unsigned int (*phys_pkg_id)(int index_msb);
        unsigned int (*get_apic_id)(unsigned long x);
        unsigned long (*set_apic_id)(unsigned int id);
index f89dffb28aa93d3427c812ec059a538ac281aaef..c745a306f7d3572be1e5baa29a6cba7eca5b03c7 100644 (file)
@@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
        native_apic_mem_write(APIC_ICR, cfg);
 }
 
-static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+static inline void send_IPI_mask_sequence(const struct cpumask *mask,
+                                         int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
@@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
         * - mbligh
         */
        local_irq_save(flags);
-       for_each_cpu_mask_nr(query_cpu, mask) {
+       for_each_cpu(query_cpu, mask) {
                __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
                                      vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
 
+static inline void send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
+{
+       unsigned long flags;
+       unsigned int query_cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       /* See Hack comment above */
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
+                       __send_IPI_dest_field(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       local_irq_restore(flags);
+}
+
 #endif /* _ASM_X86_IPI_H */
index 28e409fc73f3df33e4c6b2cbde5e99e0433e0fa1..592688ed04d33462d172936d56dff55fc6e52eac 100644 (file)
@@ -33,7 +33,7 @@ static inline int irq_canonicalize(int irq)
 
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
-extern void fixup_irqs(cpumask_t map);
+extern void fixup_irqs(void);
 #endif
 
 extern unsigned int do_IRQ(struct pt_regs *regs);
@@ -42,5 +42,6 @@ extern void native_init_IRQ(void);
 
 /* Interrupt vector management */
 extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+extern int vector_used_by_percpu_irq(unsigned int vector);
 
 #endif /* _ASM_X86_IRQ_H */
index 8346be87cfa1a37778a74d7e80b6dccf3dd4204f..97215a458e5f4fe3ccf6ad5a0a33a041bada43a5 100644 (file)
@@ -21,6 +21,7 @@
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 
 #define KVM_MAX_VCPUS 16
 #define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
 extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;
 
+       struct list_head oos_link;
+
        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
        u64 *spt;
        /* hold the gfn of each spte inside spt */
        gfn_t *gfns;
-       unsigned long slot_bitmap; /* One bit set per slot which has memory
-                                   * in this shadow page.
-                                   */
+       /*
+        * One bit set per slot which has memory
+        * in this shadow page.
+        */
+       DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
        int multimapped;         /* More than one parent_pte? */
        int root_count;          /* Currently serving as active root */
        bool unsync;
-       bool unsync_children;
+       bool global;
+       unsigned int unsync_children;
        union {
                u64 *parent_pte;               /* !multimapped */
                struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
 
        bool nmi_pending;
        bool nmi_injected;
+       bool nmi_window_open;
 
-       u64 mtrr[0x100];
+       struct mtrr_state_type mtrr_state;
+       u32 pat;
 };
 
 struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
         */
        struct list_head active_mmu_pages;
        struct list_head assigned_dev_head;
+       struct list_head oos_global_pages;
        struct dmar_domain *intel_iommu_domain;
        struct kvm_pic *vpic;
        struct kvm_ioapic *vioapic;
        struct kvm_pit *vpit;
        struct hlist_head irq_ack_notifier_list;
+       int vapics_in_nmi_mode;
 
        int round_robin_prev_vcpu;
        unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
        u32 mmu_recycled;
        u32 mmu_cache_miss;
        u32 mmu_unsync;
+       u32 mmu_unsync_global;
        u32 remote_tlb_flush;
        u32 lpages;
 };
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
        u32 halt_exits;
        u32 halt_wakeup;
        u32 request_irq_exits;
+       u32 request_nmi_exits;
        u32 irq_exits;
        u32 host_state_reload;
        u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
        u32 insn_emulation_fail;
        u32 hypercalls;
        u32 irq_injections;
+       u32 nmi_injections;
 };
 
 struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
 
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
+       int (*get_mt_mask_shift)(void);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask);
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes);
+                      const u8 *new, int bytes,
+                      bool guest_initiated);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
        struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT           ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID                  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
 #define MSR_IA32_TIME_STAMP_COUNTER            0x010
 
 #define TSS_IOPB_BASE_OFFSET 0x66
index 25179a29f208cdac80946c406635a2b8518e4dcd..6a159732881a6b4067fd552fd4576cce797a5926 100644 (file)
@@ -123,6 +123,7 @@ struct decode_cache {
        u8 ad_bytes;
        u8 rex_prefix;
        struct operand src;
+       struct operand src2;
        struct operand dst;
        bool has_seg_override;
        u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
        /* Register state before/after emulation. */
        struct kvm_vcpu *vcpu;
 
-       /* Linear faulting address (if emulating a page-faulting instruction) */
        unsigned long eflags;
-
        /* Emulated execution mode, represented by an X86EMUL_MODE value. */
        int mode;
-
        u32 cs_base;
 
        /* decode cache */
-
        struct decode_cache decode;
 };
 
 /* Repeat String Operation Prefix */
-#define REPE_PREFIX  1
-#define REPNE_PREFIX    2
+#define REPE_PREFIX    1
+#define REPNE_PREFIX   2
 
 /* Execution mode, passed to the emulator. */
 #define X86EMUL_MODE_REAL     0        /* Real mode.             */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_PROT64   8        /* 64-bit (long) mode.    */
 
 /* Host execution mode. */
-#if defined(__i386__)
+#if defined(CONFIG_X86_32)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
 #elif defined(CONFIG_X86_64)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
index 6cb3a467e0673c031c6e39b675481466a0c7531a..cc09cbbee27e9fc7d3e7f1adb339d812819818e5 100644 (file)
@@ -8,12 +8,12 @@
 
 #define APIC_DFR_VALUE (APIC_DFR_FLAT)
 
-static inline cpumask_t target_cpus(void)
+static inline const struct cpumask *target_cpus(void)
 { 
 #ifdef CONFIG_SMP
-       return cpu_online_map;
+       return cpu_online_mask;
 #else
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 #endif
 } 
 
@@ -28,6 +28,7 @@ static inline cpumask_t target_cpus(void)
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define phys_pkg_id    (genapic->phys_pkg_id)
 #define vector_allocation_domain    (genapic->vector_allocation_domain)
 #define read_apic_id()  (GET_APIC_ID(apic_read(APIC_ID)))
@@ -61,9 +62,19 @@ static inline int apic_id_registered(void)
        return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-       return cpus_addr(cpumask)[0];
+       return cpumask_bits(cpumask)[0];
+}
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       unsigned long mask1 = cpumask_bits(cpumask)[0];
+       unsigned long mask2 = cpumask_bits(andmask)[0];
+       unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
+
+       return (unsigned int)(mask1 & mask2 & mask3);
 }
 
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -88,7 +99,7 @@ static inline int apicid_to_node(int logical_apicid)
 #endif
 }
 
-static inline cpumask_t vector_allocation_domain(int cpu)
+static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
         /* Careful. Some cpus do not strictly honor the set of cpus
          * specified in the interrupt destination when using lowest
@@ -98,8 +109,7 @@ static inline cpumask_t vector_allocation_domain(int cpu)
          * deliver interrupts to the wrong hyperthread when only one
          * hyperthread was specified in the interrupt desitination.
          */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-        return domain;
+       *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
 }
 #endif
 
@@ -131,7 +141,7 @@ static inline int cpu_to_logical_apicid(int cpu)
 
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-       if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
+       if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
                return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
index fabca01ebacf29147f0174d00f220e7d64f8d32a..191312d155da7778dcc9cd9bab1b14da59349ff6 100644 (file)
@@ -4,7 +4,8 @@
 /* Avoid include hell */
 #define NMI_VECTOR 0x02
 
-void send_IPI_mask_bitmask(cpumask_t mask, int vector);
+void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 void __send_IPI_shortcut(unsigned int shortcut, int vector);
 
 extern int no_broadcast;
@@ -12,28 +13,27 @@ extern int no_broadcast;
 #ifdef CONFIG_X86_64
 #include <asm/genapic.h>
 #define send_IPI_mask (genapic->send_IPI_mask)
+#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
 #else
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_bitmask(mask, vector);
 }
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 #endif
 
 static inline void __local_send_IPI_allbutself(int vector)
 {
-       if (no_broadcast || vector == NMI_VECTOR) {
-               cpumask_t mask = cpu_online_map;
-
-               cpu_clear(smp_processor_id(), mask);
-               send_IPI_mask(mask, vector);
-       } else
+       if (no_broadcast || vector == NMI_VECTOR)
+               send_IPI_mask_allbutself(cpu_online_mask, vector);
+       else
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
 }
 
 static inline void __local_send_IPI_all(int vector)
 {
        if (no_broadcast || vector == NMI_VECTOR)
-               send_IPI_mask(cpu_online_map, vector);
+               send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
 }
index e430f47df667255e26bab25ecb123702426b6e63..48553e958ad517fc72ca40eac6dc145c2fbc5f77 100644 (file)
@@ -24,6 +24,7 @@
 #define check_phys_apicid_present (genapic->check_phys_apicid_present)
 #define check_apicid_used (genapic->check_apicid_used)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define vector_allocation_domain (genapic->vector_allocation_domain)
 #define enable_apic_mode (genapic->enable_apic_mode)
 #define phys_pkg_id (genapic->phys_pkg_id)
index 91885c28f66b3ae909b402886682202b5280b6a1..62d14ce3cd0050f2fc3e36beb0ffbe98cc08dcb5 100644 (file)
@@ -6,13 +6,13 @@
 #include <asm/mpspec_def.h>
 
 extern int apic_version[MAX_APICS];
+extern int pic_mode;
 
 #ifdef CONFIG_X86_32
 #include <mach_mpspec.h>
 
 extern unsigned int def_to_bigsmp;
 extern u8 apicid_2_node[];
-extern int pic_mode;
 
 #ifdef CONFIG_X86_NUMAQ
 extern int mp_bus_id_to_node[MAX_MP_BUSSES];
index 7c1e4258b31e208ce6edd67f94ef7e202044f920..cb988aab716dfc7c30467ae32adce09dbb7489e6 100644 (file)
@@ -57,6 +57,31 @@ struct mtrr_gentry {
 };
 #endif /* !__i386__ */
 
+struct mtrr_var_range {
+       u32 base_lo;
+       u32 base_hi;
+       u32 mask_lo;
+       u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+   an 8 bit field: */
+typedef u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_state_type {
+       struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+       mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+       unsigned char enabled;
+       unsigned char have_fixed;
+       mtrr_type def_type;
+};
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
 /*  These are the various ioctls  */
 #define MTRRIOC_ADD_ENTRY        _IOW(MTRR_IOCTL_BASE,  0, struct mtrr_sentry)
 #define MTRRIOC_SET_ENTRY        _IOW(MTRR_IOCTL_BASE,  1, struct mtrr_sentry)
index 0bf2a06b7a4e66a33629cc930f6d59c9c9907c92..c80f00d299656278baedc68be829650ec1b5c53c 100644 (file)
@@ -7,9 +7,9 @@
 
 #define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
-       return CPU_MASK_ALL;
+       return &CPU_MASK_ALL;
 }
 
 #define NO_BALANCE_IRQ (1)
@@ -122,7 +122,13 @@ static inline void enable_apic_mode(void)
  * We use physical apicids here, not logical, so just return the default
  * physical broadcast to stop people from breaking us
  */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+       return (int) 0xF;
+}
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
 {
        return (int) 0xF;
 }
index 935588d286cf85c0ada9dbde94d83e74e081cf94..a8374c652778f5c23424b7e82a3f2aee93c1632d 100644 (file)
@@ -1,25 +1,22 @@
 #ifndef __ASM_NUMAQ_IPI_H
 #define __ASM_NUMAQ_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_NUMAQ_IPI_H */
similarity index 88%
rename from arch/x86/pci/pci.h
rename to arch/x86/include/asm/pci_x86.h
index 1959018aac025ea3a38048973e6cc7a107396684..e60fd3e14bdf2c720f3c890f945a80e1828d087b 100644 (file)
@@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops;
 struct irq_info {
        u8 bus, devfn;                  /* Bus, device and function */
        struct {
-               u8 link;                /* IRQ line ID, chipset dependent, 0=not routed */
+               u8 link;                /* IRQ line ID, chipset dependent,
+                                          0 = not routed */
                u16 bitmap;             /* Available IRQs */
        } __attribute__((packed)) irq[4];
        u8 slot;                        /* Slot number, 0=onboard */
@@ -69,11 +70,13 @@ struct irq_routing_table {
        u16 version;                    /* PIRQ_VERSION */
        u16 size;                       /* Table size in bytes */
        u8 rtr_bus, rtr_devfn;          /* Where the interrupt router lies */
-       u16 exclusive_irqs;             /* IRQs devoted exclusively to PCI usage */
-       u16 rtr_vendor, rtr_device;     /* Vendor and device ID of interrupt router */
+       u16 exclusive_irqs;             /* IRQs devoted exclusively to
+                                          PCI usage */
+       u16 rtr_vendor, rtr_device;     /* Vendor and device ID of
+                                          interrupt router */
        u32 miniport_data;              /* Crap */
        u8 rfu[11];
-       u8 checksum;                    /* Modulo 256 checksum must give zero */
+       u8 checksum;                    /* Modulo 256 checksum must give 0 */
        struct irq_info slots[0];
 } __attribute__((packed));
 
@@ -148,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos)
 
 static inline void mmio_config_writeb(void __iomem *pos, u8 val)
 {
-       asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
+       asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
 
 static inline void mmio_config_writew(void __iomem *pos, u16 val)
 {
-       asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
+       asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
 
 static inline void mmio_config_writel(void __iomem *pos, u32 val)
 {
-       asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
+       asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
 }
index d12811ce51d92fc8d4be51c24981329ba2591aec..830b9fcb6427103689d005417f20ac19816b0766 100644 (file)
@@ -60,7 +60,7 @@ struct smp_ops {
        void (*cpu_die)(unsigned int cpu);
        void (*play_dead)(void);
 
-       void (*send_call_func_ipi)(cpumask_t mask);
+       void (*send_call_func_ipi)(const struct cpumask *mask);
        void (*send_call_func_single_ipi)(int cpu);
 };
 
@@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
 
 static inline void arch_send_call_function_ipi(cpumask_t mask)
 {
-       smp_ops.send_call_func_ipi(mask);
+       smp_ops.send_call_func_ipi(&mask);
 }
 
 void cpu_disable_common(void);
@@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
 
-void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
 
 extern void prefill_possible_map(void);
index 9b3070f1c2ac6fb80f0b53edf9b33b080af0f081..99327d1be49f0bd0fed4ecd5ca189af6bde63188 100644 (file)
 
 #define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
        /* CPU_MASK_ALL (0xff) has undefined behaviour with
         * dest_LowestPrio mode logical clustered apic interrupt routing
         * Just start on cpu 0.  IRQ balancing will spread load
         */
-       return cpumask_of_cpu(0);
+       return &cpumask_of_cpu(0);
 }
 
 #define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -137,14 +137,14 @@ static inline void enable_apic_mode(void)
 {
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
 
-       num_bits_set = cpus_weight(cpumask);
+       num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return (int) 0xFF;
@@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-       cpu = first_cpu(cpumask);
+       cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-               if (cpu_isset(cpu, cpumask)) {
+               if (cpu_isset(cpu, *cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -170,6 +170,49 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
        return apicid;
 }
 
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *inmask,
+                                                 const struct cpumask *andmask)
+{
+       int num_bits_set;
+       int cpus_found = 0;
+       int cpu;
+       int apicid = 0xFF;
+       cpumask_var_t cpumask;
+
+       if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
+               return (int) 0xFF;
+
+       cpumask_and(cpumask, inmask, andmask);
+       cpumask_and(cpumask, cpumask, cpu_online_mask);
+
+       num_bits_set = cpumask_weight(cpumask);
+       /* Return id to all */
+       if (num_bits_set == nr_cpu_ids)
+               goto exit;
+       /*
+        * The cpus in the mask must all be on the apic cluster.  If are not
+        * on the same apicid cluster return default value of TARGET_CPUS.
+        */
+       cpu = cpumask_first(cpumask);
+       apicid = cpu_to_logical_apicid(cpu);
+       while (cpus_found < num_bits_set) {
+               if (cpumask_test_cpu(cpu, cpumask)) {
+                       int new_apicid = cpu_to_logical_apicid(cpu);
+                       if (apicid_cluster(apicid) !=
+                                       apicid_cluster(new_apicid)){
+                               printk ("%s: Not a valid mask!\n", __func__);
+                               return 0xFF;
+                       }
+                       apicid = apicid | new_apicid;
+                       cpus_found++;
+               }
+               cpu++;
+       }
+exit:
+       free_cpumask_var(cpumask);
+       return apicid;
+}
+
 /* cpuid returns the value latched in the HW at reset, not the APIC ID
  * register's value.  For any box whose BIOS changes APIC IDs, like
  * clustered APIC systems, we must use hard_smp_processor_id.
index 53bd1e7bd7b448ac2d1f6ff3a25a496dc9c16358..a8a2c24f50ccfd34d8dca98cb399dcafc51a1c29 100644 (file)
@@ -1,9 +1,10 @@
 #ifndef __ASM_SUMMIT_IPI_H
 #define __ASM_SUMMIT_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
+void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const cpumask_t *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
@@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector)
        cpu_clear(smp_processor_id(), mask);
 
        if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+               send_IPI_mask(&mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(&cpu_online_map, vector);
 }
 
 #endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
new file mode 100644 (file)
index 0000000..ffb08be
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * sys_ia32.h - Linux ia32 syscall interfaces
+ *
+ * Copyright (c) 2008 Jaswinder Singh Rajput
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYS_IA32_H
+#define _ASM_X86_SYS_IA32_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+#include <asm/compat.h>
+#include <asm/ia32.h>
+
+/* ia32/sys_ia32.c */
+asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
+
+asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
+asmlinkage long sys32_fstatat(unsigned int, char __user *,
+                             struct stat64 __user *, int);
+struct mmap_arg_struct;
+asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
+asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
+
+asmlinkage long sys32_pipe(int __user *);
+struct sigaction32;
+struct old_sigaction32;
+asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
+                                  struct sigaction32 __user *, unsigned int);
+asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
+                               struct old_sigaction32 __user *);
+asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
+                                    compat_sigset_t __user *, unsigned int);
+asmlinkage long sys32_alarm(unsigned int);
+
+struct sel_arg_struct;
+asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
+asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
+asmlinkage long sys32_sysfs(int, u32, u32);
+
+asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
+                                           struct compat_timespec __user *);
+asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
+asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+struct sysctl_ia32;
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
+#endif
+
+asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+
+asmlinkage long sys32_personality(unsigned long);
+asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
+
+asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
+                           unsigned long, unsigned long, unsigned long);
+
+struct oldold_utsname;
+struct old_utsname;
+asmlinkage long sys32_olduname(struct oldold_utsname __user *);
+long sys32_uname(struct old_utsname __user *);
+
+long sys32_ustat(unsigned, struct ustat32 __user *);
+
+asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+                            compat_uptr_t __user *, struct pt_regs *);
+asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
+
+long sys32_lseek(unsigned int, int, unsigned int);
+long sys32_kill(int, int);
+long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
+long sys32_vm86_warning(void);
+long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
+
+asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
+asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
+                                     unsigned, unsigned, int);
+asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
+asmlinkage long sys32_fallocate(int, int, unsigned,
+                               unsigned, unsigned, unsigned);
+
+/* ia32/ia32_signal.c */
+asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
+asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
+                                 stack_ia32_t __user *, struct pt_regs *);
+asmlinkage long sys32_sigreturn(struct pt_regs *);
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
+
+/* ia32/ipc32.c */
+asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+#endif /* _ASM_X86_SYS_IA32_H */
index ff386ff50ed766d5d8c4e661d9ef2c3acef3162f..79e31e9dcdda9f924d17489dc1e0b134995c764b 100644 (file)
@@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu);
 #define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
 #define topology_core_siblings(cpu)            (per_cpu(cpu_core_map, cpu))
 #define topology_thread_siblings(cpu)          (per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)             (&per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu)           (&per_cpu(cpu_sibling_map, cpu))
 
 /* indicates that pointers to the topology cpumask_t maps are valid */
 #define arch_provides_topology_pointers                yes
index e2363253bbbf21431467516d99404c64eaeaec34..50423c7b56b2eed9ddce519fac35c0a478dd5f81 100644 (file)
@@ -133,61 +133,61 @@ struct bau_msg_payload {
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
 struct bau_msg_header {
-       int dest_subnodeid:6;   /* must be zero */
+       unsigned int dest_subnodeid:6;  /* must be zero */
        /* bits 5:0 */
-       int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
-       /* bits 20:6 */
-       int command:8;          /* message type */
+       unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
+       /* bits 20:6 */                   /* first bit in node_map */
+       unsigned int command:8; /* message type */
        /* bits 28:21 */
                                /* 0x38: SN3net EndPoint Message */
-       int rsvd_1:3;           /* must be zero */
+       unsigned int rsvd_1:3;  /* must be zero */
        /* bits 31:29 */
                                /* int will align on 32 bits */
-       int rsvd_2:9;           /* must be zero */
+       unsigned int rsvd_2:9;  /* must be zero */
        /* bits 40:32 */
                                /* Suppl_A is 56-41 */
-       int payload_2a:8;       /* becomes byte 16 of msg */
+       unsigned int payload_2a:8;/* becomes byte 16 of msg */
        /* bits 48:41 */        /* not currently using */
-       int payload_2b:8;       /* becomes byte 17 of msg */
+       unsigned int payload_2b:8;/* becomes byte 17 of msg */
        /* bits 56:49 */        /* not currently using */
                                /* Address field (96:57) is never used as an
                                   address (these are address bits 42:3) */
-       int rsvd_3:1;           /* must be zero */
+       unsigned int rsvd_3:1;  /* must be zero */
        /* bit 57 */
                                /* address bits 27:4 are payload */
                                /* these 24 bits become bytes 12-14 of msg */
-       int replied_to:1;       /* sent as 0 by the source to byte 12 */
+       unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
        /* bit 58 */
 
-       int payload_1a:5;       /* not currently used */
+       unsigned int payload_1a:5;/* not currently used */
        /* bits 63:59 */
-       int payload_1b:8;       /* not currently used */
+       unsigned int payload_1b:8;/* not currently used */
        /* bits 71:64 */
-       int payload_1c:8;       /* not currently used */
+       unsigned int payload_1c:8;/* not currently used */
        /* bits 79:72 */
-       int payload_1d:2;       /* not currently used */
+       unsigned int payload_1d:2;/* not currently used */
        /* bits 81:80 */
 
-       int rsvd_4:7;           /* must be zero */
+       unsigned int rsvd_4:7;  /* must be zero */
        /* bits 88:82 */
-       int sw_ack_flag:1;      /* software acknowledge flag */
+       unsigned int sw_ack_flag:1;/* software acknowledge flag */
        /* bit 89 */
                                /* INTD trasactions at destination are to
                                   wait for software acknowledge */
-       int rsvd_5:6;           /* must be zero */
+       unsigned int rsvd_5:6;  /* must be zero */
        /* bits 95:90 */
-       int rsvd_6:5;           /* must be zero */
+       unsigned int rsvd_6:5;  /* must be zero */
        /* bits 100:96 */
-       int int_both:1;         /* if 1, interrupt both sockets on the blade */
+       unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
        /* bit 101*/
-       int fairness:3;         /* usually zero */
+       unsigned int fairness:3;/* usually zero */
        /* bits 104:102 */
-       int multilevel:1;       /* multi-level multicast format */
+       unsigned int multilevel:1;      /* multi-level multicast format */
        /* bit 105 */
                                /* 0 for TLB: endpoint multi-unicast messages */
-       int chaining:1;         /* next descriptor is part of this activation*/
+       unsigned int chaining:1;/* next descriptor is part of this activation*/
        /* bit 106 */
-       int rsvd_7:21;          /* must be zero */
+       unsigned int rsvd_7:21; /* must be zero */
        /* bits 127:107 */
 };
 
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644 (file)
index 0000000..5936362
--- /dev/null
@@ -0,0 +1,132 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#include <asm/vmx.h>
+#include <asm/svm.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+       unsigned long ecx = cpuid_ecx(1);
+       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes a undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+       asm volatile (ASM_VMX_VMXOFF : : : "cc");
+       write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
+static inline int cpu_vmx_enabled(void)
+{
+       return read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+       if (cpu_vmx_enabled())
+               cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+       if (cpu_has_vmx())
+               __cpu_emergency_vmxoff();
+}
+
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * on the messages; gcc should take care of not generating code for
+ * the messages on this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+       uint32_t eax, ebx, ecx, edx;
+
+       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+               if (msg)
+                       *msg = "not amd";
+               return 0;
+       }
+
+       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+       if (eax < SVM_CPUID_FUNC) {
+               if (msg)
+                       *msg = "can't execute cpuid_8000000a";
+               return 0;
+       }
+
+       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+               if (msg)
+                       *msg = "svm not available";
+               return 0;
+       }
+       return 1;
+}
+
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+       uint64_t efer;
+
+       wrmsrl(MSR_VM_HSAVE_PA, 0);
+       rdmsrl(MSR_EFER, efer);
+       wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+}
+
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+       if (cpu_has_svm(NULL))
+               cpu_svm_disable();
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
similarity index 93%
rename from arch/x86/kvm/vmx.h
rename to arch/x86/include/asm/vmx.h
index ec5edc339da605c8d028927f3a57568c3e2c4463..d0238e6151d86dbbdf37082161ff4c3f7338d21c 100644 (file)
 
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
 #define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+#define VM_EXIT_SAVE_IA32_PAT                  0x00040000
+#define VM_EXIT_LOAD_IA32_PAT                  0x00080000
 
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
 
 /* VMCS Encodings */
 enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
        VMCS_LINK_POINTER_HIGH          = 0x00002801,
        GUEST_IA32_DEBUGCTL             = 0x00002802,
        GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+       GUEST_IA32_PAT                  = 0x00002804,
+       GUEST_IA32_PAT_HIGH             = 0x00002805,
        GUEST_PDPTR0                    = 0x0000280a,
        GUEST_PDPTR0_HIGH               = 0x0000280b,
        GUEST_PDPTR1                    = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
        GUEST_PDPTR2_HIGH               = 0x0000280f,
        GUEST_PDPTR3                    = 0x00002810,
        GUEST_PDPTR3_HIGH               = 0x00002811,
+       HOST_IA32_PAT                   = 0x00002c00,
+       HOST_IA32_PAT_HIGH              = 0x00002c01,
        PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
        CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
        EXCEPTION_BITMAP                = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       9
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     10
+#define TSS_PRIVATE_MEMSLOT                    (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT       (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT     (KVM_MEMORY_SLOTS + 2)
 
 #define VMX_NR_VPIDS                           (1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT         1
@@ -356,4 +364,19 @@ enum vmcs_field {
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR                0xfffbc000ul
 
+
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT           ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID                  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+
+
 #endif
index 2e2da717b350bbd30cba12cfee11d06b12f6e1b8..658e29e0f49bb6ce807ad14842aaeeaf82005569 100644 (file)
@@ -1296,7 +1296,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
  */
-void prealloc_protection_domains(void)
+static void prealloc_protection_domains(void)
 {
        struct pci_dev *dev = NULL;
        struct dma_ops_domain *dma_dom;
index c625800c55caeb411211607983ab4e7f22e3c731..fb85e8d466cca7ae35304da0ef94a288fa667fe9 100644 (file)
@@ -243,7 +243,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 }
 
 /* Function to enable the hardware */
-void __init iommu_enable(struct amd_iommu *iommu)
+static void __init iommu_enable(struct amd_iommu *iommu)
 {
        printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
               "at %02x:%02x.%x cap 0x%hx\n",
@@ -256,7 +256,7 @@ void __init iommu_enable(struct amd_iommu *iommu)
 }
 
 /* Function to enable IOMMU event logging and event interrupts */
-void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
 {
        iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
        iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
index b5229affb95397cf8dbab2f49ab823497f8ba531..d652515e2855fd61cb5252ecd9d92d9f1653bf81 100644 (file)
@@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
 #ifdef HAVE_X2APIC
 int x2apic;
 /* x2apic enabled before OS handover */
-int x2apic_preenabled;
-int disable_x2apic;
+static int x2apic_preenabled;
+static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
        disable_x2apic = 1;
@@ -119,8 +119,6 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
 int first_system_vector = 0xfe;
 
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
 /*
  * Debug level, exported for io_apic.c
  */
@@ -142,7 +140,7 @@ static int lapic_next_event(unsigned long delta,
                            struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
                              struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const cpumask_t *mask);
 static void apic_pm_activate(void);
 
 /*
@@ -228,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id)
        apic_write(APIC_ICR, low);
 }
 
-u64 xapic_icr_read(void)
+static u64 xapic_icr_read(void)
 {
        u32 icr1, icr2;
 
@@ -268,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id)
        wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
 }
 
-u64 x2apic_icr_read(void)
+static u64 x2apic_icr_read(void)
 {
        unsigned long val;
 
@@ -455,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 /*
  * Local APIC timer broadcast function
  */
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const cpumask_t *mask)
 {
 #ifdef CONFIG_SMP
        send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -471,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
        memcpy(levt, &lapic_clockevent, sizeof(*levt));
-       levt->cpumask = cpumask_of_cpu(smp_processor_id());
+       levt->cpumask = cpumask_of(smp_processor_id());
 
        clockevents_register_device(levt);
 }
@@ -1807,28 +1805,32 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 void __cpuinit generic_processor_info(int apicid, int version)
 {
        int cpu;
-       cpumask_t tmp_map;
 
        /*
         * Validate version
         */
        if (version == 0x0) {
                pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
-                       "fixing up to 0x10. (tell your hw vendor)\n",
-                       version);
+                          "fixing up to 0x10. (tell your hw vendor)\n",
+                               version);
                version = 0x10;
        }
        apic_version[apicid] = version;
 
-       if (num_processors >= NR_CPUS) {
-               pr_warning("WARNING: NR_CPUS limit of %i reached."
-                       "  Processor ignored.\n", NR_CPUS);
+       if (num_processors >= nr_cpu_ids) {
+               int max = nr_cpu_ids;
+               int thiscpu = max + disabled_cpus;
+
+               pr_warning(
+                       "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+                       "  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+               disabled_cpus++;
                return;
        }
 
        num_processors++;
-       cpus_complement(tmp_map, cpu_present_map);
-       cpu = first_cpu(tmp_map);
+       cpu = cpumask_next_zero(-1, cpu_present_mask);
 
        physid_set(apicid, phys_cpu_present_map);
        if (apicid == boot_cpu_physical_apicid) {
@@ -1878,8 +1880,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
        }
 #endif
 
-       cpu_set(cpu, cpu_possible_map);
-       cpu_set(cpu, cpu_present_map);
+       set_cpu_possible(cpu, true);
+       set_cpu_present(cpu, true);
 }
 
 #ifdef CONFIG_X86_64
@@ -2081,7 +2083,7 @@ __cpuinit int apic_is_clustered_box(void)
        bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
        bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                /* are we being called early in kernel startup? */
                if (bios_cpu_apicid) {
                        id = bios_cpu_apicid[i];
index 2a0a2a3cac263a87edc38f12894185749a1b9dd5..f63882728d91868d3ce392aa87bc7965e8e9145c 100644 (file)
@@ -25,7 +25,7 @@
 #include <asm/uv/bios.h>
 #include <asm/uv/uv_hub.h>
 
-struct uv_systab uv_systab;
+static struct uv_systab uv_systab;
 
 s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
 {
index 68b5d8681cbb54597edf9021721745dbab19cb5c..c6ecda64f5f125b1ba358edac6d9e5693414b020 100644 (file)
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
        per_cpu(cpuid4_info, cpu) = NULL;
 }
 
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static void get_cpu_leaves(void *_retval)
 {
-       struct _cpuid4_info     *this_leaf;
-       unsigned long           j;
-       int                     retval;
-       cpumask_t               oldmask;
-
-       if (num_cache_leaves == 0)
-               return -ENOENT;
-
-       per_cpu(cpuid4_info, cpu) = kzalloc(
-           sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
-       if (per_cpu(cpuid4_info, cpu) == NULL)
-               return -ENOMEM;
-
-       oldmask = current->cpus_allowed;
-       retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-       if (retval)
-               goto out;
+       int j, *retval = _retval, cpu = smp_processor_id();
 
        /* Do cpuid and store the results */
        for (j = 0; j < num_cache_leaves; j++) {
+               struct _cpuid4_info *this_leaf;
                this_leaf = CPUID4_INFO_IDX(cpu, j);
-               retval = cpuid4_cache_lookup(j, this_leaf);
-               if (unlikely(retval < 0)) {
+               *retval = cpuid4_cache_lookup(j, this_leaf);
+               if (unlikely(*retval < 0)) {
                        int i;
 
                        for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
                }
                cache_shared_cpu_map_setup(cpu, j);
        }
-       set_cpus_allowed_ptr(current, &oldmask);
+}
+
+static int __cpuinit detect_cache_attributes(unsigned int cpu)
+{
+       int                     retval;
+
+       if (num_cache_leaves == 0)
+               return -ENOENT;
+
+       per_cpu(cpuid4_info, cpu) = kzalloc(
+           sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+       if (per_cpu(cpuid4_info, cpu) == NULL)
+               return -ENOMEM;
 
-out:
+       smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
        if (retval) {
                kfree(per_cpu(cpuid4_info, cpu));
                per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
                cpumask_t *mask = &this_leaf->shared_cpu_map;
 
                n = type?
-                       cpulist_scnprintf(buf, len-2, *mask):
-                       cpumask_scnprintf(buf, len-2, *mask);
+                       cpulist_scnprintf(buf, len-2, mask) :
+                       cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
index 748c8f9e7a0527d1e6f6298852113995f033350b..a5a5e0530370b6c007e484dc981f110f8b310dbd 100644 (file)
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map);     /* see which banks are on */
  * CPU Initialization
  */
 
+struct thresh_restart {
+       struct threshold_block *b;
+       int reset;
+       u16 old_limit;
+};
+
 /* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
-                                  int reset, u16 old_limit)
+static long threshold_restart_bank(void *_tr)
 {
+       struct thresh_restart *tr = _tr;
        u32 mci_misc_hi, mci_misc_lo;
 
-       rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+       rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
 
-       if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
-               reset = 1;      /* limit cannot be lower than err count */
+       if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+               tr->reset = 1;  /* limit cannot be lower than err count */
 
-       if (reset) {            /* reset err count and overflow bit */
+       if (tr->reset) {                /* reset err count and overflow bit */
                mci_misc_hi =
                    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
-                   (THRESHOLD_MAX - b->threshold_limit);
-       } else if (old_limit) { /* change limit w/o reset */
+                   (THRESHOLD_MAX - tr->b->threshold_limit);
+       } else if (tr->old_limit) {     /* change limit w/o reset */
                int new_count = (mci_misc_hi & THRESHOLD_MAX) +
-                   (old_limit - b->threshold_limit);
+                   (tr->old_limit - tr->b->threshold_limit);
                mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
                    (new_count & THRESHOLD_MAX);
        }
 
-       b->interrupt_enable ?
+       tr->b->interrupt_enable ?
            (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
            (mci_misc_hi &= ~MASK_INT_TYPE_HI);
 
        mci_misc_hi |= MASK_COUNT_EN_HI;
-       wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+       wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       return 0;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
        unsigned int cpu = smp_processor_id();
        u8 lvt_off;
        u32 low = 0, high = 0, address = 0;
+       struct thresh_restart tr;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
                        wrmsr(address, low, high);
 
                        threshold_defaults.address = address;
-                       threshold_restart_bank(&threshold_defaults, 0, 0);
+                       tr.b = &threshold_defaults;
+                       tr.reset = 0;
+                       tr.old_limit = 0;
+                       threshold_restart_bank(&tr);
                }
        }
 }
@@ -251,20 +262,6 @@ struct threshold_attr {
        ssize_t(*store) (struct threshold_block *, const char *, size_t count);
 };
 
-static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
-                                          cpumask_t *newmask)
-{
-       *oldmask = current->cpus_allowed;
-       cpus_clear(*newmask);
-       cpu_set(cpu, *newmask);
-       set_cpus_allowed_ptr(current, newmask);
-}
-
-static void affinity_restore(const cpumask_t *oldmask)
-{
-       set_cpus_allowed_ptr(current, oldmask);
-}
-
 #define SHOW_FIELDS(name)                                           \
 static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
 {                                                                   \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
                                      const char *buf, size_t count)
 {
        char *end;
-       cpumask_t oldmask, newmask;
+       struct thresh_restart tr;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
        b->interrupt_enable = !!new;
 
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 0, 0);
-       affinity_restore(&oldmask);
+       tr.b = b;
+       tr.reset = 0;
+       tr.old_limit = 0;
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
 
        return end - buf;
 }
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                                     const char *buf, size_t count)
 {
        char *end;
-       cpumask_t oldmask, newmask;
-       u16 old;
+       struct thresh_restart tr;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                new = THRESHOLD_MAX;
        if (new < 1)
                new = 1;
-       old = b->threshold_limit;
+       tr.old_limit = b->threshold_limit;
        b->threshold_limit = new;
+       tr.b = b;
+       tr.reset = 0;
 
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 0, old);
-       affinity_restore(&oldmask);
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
 
        return end - buf;
 }
 
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
+static long local_error_count(void *_b)
 {
-       u32 high, low;
-       cpumask_t oldmask, newmask;
-       affinity_set(b->cpu, &oldmask, &newmask);
+       struct threshold_block *b = _b;
+       u32 low, high;
+
        rdmsr(b->address, low, high);
-       affinity_restore(&oldmask);
-       return sprintf(buf, "%x\n",
-                      (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+       return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+       return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
 }
 
 static ssize_t store_error_count(struct threshold_block *b,
                                 const char *buf, size_t count)
 {
-       cpumask_t oldmask, newmask;
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 1, 0);
-       affinity_restore(&oldmask);
+       struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
        return 1;
 }
 
@@ -463,12 +462,19 @@ out_free:
        return err;
 }
 
+static long local_allocate_threshold_blocks(void *_bank)
+{
+       unsigned int *bank = _bank;
+
+       return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+                                        MSR_IA32_MC0_MISC + *bank * 4);
+}
+
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
 static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
        int i, err = 0;
        struct threshold_bank *b = NULL;
-       cpumask_t oldmask, newmask;
        char name[32];
 
        sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
        per_cpu(threshold_banks, cpu)[bank] = b;
 
-       affinity_set(cpu, &oldmask, &newmask);
-       err = allocate_threshold_blocks(cpu, bank, 0,
-                                       MSR_IA32_MC0_MISC + bank * 4);
-       affinity_restore(&oldmask);
-
+       err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
        if (err)
                goto out_free;
 
index 4e8d77f01eeb0913527373ed38a71795cc0b7be1..b59ddcc88cd84b2b2d19a8217977b3661136fb42 100644 (file)
 #include <asm/pat.h>
 #include "mtrr.h"
 
-struct mtrr_state {
-       struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
-       mtrr_type fixed_ranges[NUM_FIXED_RANGES];
-       unsigned char enabled;
-       unsigned char have_fixed;
-       mtrr_type def_type;
-};
-
 struct fixed_range_block {
        int base_msr; /* start address of an MTRR block */
        int ranges;   /* number of MTRRs in this block  */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
 };
 
 static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
 u64 mtrr_tom2;
 
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
+
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
 
index 1159e269e596bc05d9b82d3ed633a434c67d1649..d259e5d2e0546adbec36d55f63b7f2a81f88e002 100644 (file)
@@ -49,7 +49,7 @@
 
 u32 num_var_ranges = 0;
 
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 
 u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
        unsigned long   lsize;
 };
 
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
@@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
 
 static int __init disable_mtrr_cleanup_setup(char *str)
 {
-       if (enable_mtrr_cleanup != -1)
-               enable_mtrr_cleanup = 0;
+       enable_mtrr_cleanup = 0;
        return 0;
 }
 early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
 
 static int __init enable_mtrr_cleanup_setup(char *str)
 {
-       if (enable_mtrr_cleanup != -1)
-               enable_mtrr_cleanup = 1;
+       enable_mtrr_cleanup = 1;
        return 0;
 }
 early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
index 2dc4ec656b23c22c9d3806cd57746d4c834b5166..ffd60409cc6d410ee16eac82d76f4f83badab39e 100644 (file)
@@ -8,11 +8,6 @@
 #define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
 #define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
 
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
-   an 8 bit field: */
-typedef u8 mtrr_type;
-
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 
 struct mtrr_ops {
        u32     vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
        u32 ccr3;
 };
 
-struct mtrr_var_range {
-       u32 base_lo;
-       u32 base_hi;
-       u32 mask_lo;
-       u32 mask_hi;
-};
-
 void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
index 72cefd1e649ba4be254b5ec497f442160dab1752..85d28d53f5d357e59329bdcc1ada8328eb78f0b7 100644 (file)
 #include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/uaccess.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/system.h>
 
 static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
 }
 
 static ssize_t cpuid_read(struct file *file, char __user *buf,
-                         size_t count, loff_t * ppos)
+                         size_t count, loff_t *ppos)
 {
        char __user *tmp = buf;
        struct cpuid_regs cmd;
@@ -117,7 +117,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
        unsigned int cpu;
        struct cpuinfo_x86 *c;
        int ret = 0;
-       
+
        lock_kernel();
 
        cpu = iminor(file->f_path.dentry->d_inode);
index d84a852e4cd76f4983bc617d674c7e59c0b3632d..c689d19e35abb786657926265c03ab63e74a7538 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/kdebug.h>
 #include <asm/smp.h>
 #include <asm/reboot.h>
+#include <asm/virtext.h>
 
 #include <mach_ipi.h>
 
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
 #endif
        crash_save_cpu(regs, cpu);
 
+       /* Disable VMX or SVM if needed.
+        *
+        * We need to disable virtualization on all CPUs.
+        * Having VMX or SVM enabled on any CPU may break rebooting
+        * after the kdump kernel has finished its task.
+        */
+       cpu_emergency_vmxoff();
+       cpu_emergency_svm_disable();
+
        disable_local_APIC();
 }
 
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
        local_irq_disable();
 
        kdump_nmi_shootdown_cpus();
+
+       /* Booting kdump kernel with VMX or SVM enabled won't work,
+        * because (among other limitations) we can't disable paging
+        * with the virt flags.
+        */
+       cpu_emergency_vmxoff();
+       cpu_emergency_svm_disable();
+
        lapic_shutdown();
 #if defined(CONFIG_X86_IO_APIC)
        disable_IO_APIC();
index 23b138e31e9c2fe229c42050ab2c5e864cac4710..504ad198e4ad0f999b43dfbb16f1957305e71ffe 100644 (file)
@@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
        va_list ap;
 
        va_start(ap, fmt);
-       n = vscnprintf(buf, 512, fmt, ap);
+       n = vscnprintf(buf, sizeof(buf), fmt, ap);
        early_console->write(early_console, buf, n);
        va_end(ap);
 }
index c0262791bda40e5cd10960fb496360ca8298b879..34185488e4fb47e41aed797bcd8cd6b1a5c9d33d 100644 (file)
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 1;
 }
 
-static cpumask_t flat_target_cpus(void)
+static const struct cpumask *flat_target_cpus(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
-static cpumask_t flat_vector_allocation_domain(int cpu)
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 /*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
        apic_write(APIC_LDR, val);
 }
 
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
 {
-       unsigned long mask = cpus_addr(cpumask)[0];
        unsigned long flags;
 
        local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
        local_irq_restore(flags);
 }
 
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+       unsigned long mask = cpumask_bits(cpumask)[0];
+
+       _flat_send_IPI_mask(mask, vector);
+}
+
+static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                         int vector)
+{
+       unsigned long mask = cpumask_bits(cpumask)[0];
+       int cpu = smp_processor_id();
+
+       if (cpu < BITS_PER_LONG)
+               clear_bit(cpu, &mask);
+       _flat_send_IPI_mask(mask, vector);
+}
+
 static void flat_send_IPI_allbutself(int vector)
 {
+       int cpu = smp_processor_id();
 #ifdef CONFIG_HOTPLUG_CPU
        int hotplug = 1;
 #else
        int hotplug = 0;
 #endif
        if (hotplug || vector == NMI_VECTOR) {
-               cpumask_t allbutme = cpu_online_map;
+               if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+                       unsigned long mask = cpumask_bits(cpu_online_mask)[0];
 
-               cpu_clear(smp_processor_id(), allbutme);
+                       if (cpu < BITS_PER_LONG)
+                               clear_bit(cpu, &mask);
 
-               if (!cpus_empty(allbutme))
-                       flat_send_IPI_mask(allbutme, vector);
+                       _flat_send_IPI_mask(mask, vector);
+               }
        } else if (num_online_cpus() > 1) {
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
        }
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
 static void flat_send_IPI_all(int vector)
 {
        if (vector == NMI_VECTOR)
-               flat_send_IPI_mask(cpu_online_map, vector);
+               flat_send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
        return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+       return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                               const struct cpumask *andmask)
 {
-       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+       unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+       unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
+
+       return mask1 & mask2;
 }
 
 static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat =  {
        .send_IPI_all = flat_send_IPI_all,
        .send_IPI_allbutself = flat_send_IPI_allbutself,
        .send_IPI_mask = flat_send_IPI_mask,
+       .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
 
-static cpumask_t physflat_target_cpus(void)
+static const struct cpumask *physflat_target_cpus(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
-static cpumask_t physflat_vector_allocation_domain(int cpu)
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       return cpumask_of_cpu(cpu);
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
 {
        send_IPI_mask_sequence(cpumask, vector);
 }
 
-static void physflat_send_IPI_allbutself(int vector)
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                             int vector)
 {
-       cpumask_t allbutme = cpu_online_map;
+       send_IPI_mask_allbutself(cpumask, vector);
+}
 
-       cpu_clear(smp_processor_id(), allbutme);
-       physflat_send_IPI_mask(allbutme, vector);
+static void physflat_send_IPI_allbutself(int vector)
+{
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static void physflat_send_IPI_all(int vector)
 {
-       physflat_send_IPI_mask(cpu_online_map, vector);
+       physflat_send_IPI_mask(cpu_online_mask, vector);
 }
 
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -224,13 +259,31 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                               const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask)
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 struct genapic apic_physflat =  {
        .name = "physical flat",
        .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +296,10 @@ struct genapic apic_physflat =  {
        .send_IPI_all = physflat_send_IPI_all,
        .send_IPI_allbutself = physflat_send_IPI_allbutself,
        .send_IPI_mask = physflat_send_IPI_mask,
+       .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index f6a2c8eb48a6bb1706212636246539d5f1ce64dc..6ce497cc372d60c54942c839311f9767e477507f 100644 (file)
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
 /*
  * for now each logical cpu is in its own vector allocation domain.
  */
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
  * at once. We have 16 cpu's in a cluster. This will minimize IPI register
  * writes.
  */
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
 
        local_irq_save(flags);
-       for_each_cpu_mask(query_cpu, mask) {
-               __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-                                      vector, APIC_DEST_LOGICAL);
-       }
+       for_each_cpu(query_cpu, mask)
+               __x2apic_send_IPI_dest(
+                       per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                       vector, APIC_DEST_LOGICAL);
        local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
 {
-       cpumask_t mask = cpu_online_map;
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       cpu_clear(smp_processor_id(), mask);
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                               vector, APIC_DEST_LOGICAL);
+       local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               x2apic_send_IPI_mask(mask, vector);
+       local_irq_save(flags);
+       for_each_online_cpu(query_cpu)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                               vector, APIC_DEST_LOGICAL);
+       local_irq_restore(flags);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-       x2apic_send_IPI_mask(cpu_online_map, vector);
+       x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -89,21 +109,38 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
        /*
-        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * We're using fixed IRQ delivery, can only return one logical APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
-       if ((unsigned)cpu < NR_CPUS)
+       cpu = cpumask_first(cpumask);
+       if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_logical_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one logical APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask)
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_logical_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -150,8 +187,10 @@ struct genapic apic_x2apic_cluster = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+       .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index d042211768b74243f0c7b7dbfd321be707109f04..21bcc0e098ba48c3cd78b068f88dd858c97d98dd 100644 (file)
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
        x2apic_icr_write(cfg, apicid);
 }
 
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
 
        local_irq_save(flags);
-       for_each_cpu_mask(query_cpu, mask) {
+       for_each_cpu(query_cpu, mask) {
                __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
                                       vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
 {
-       cpumask_t mask = cpu_online_map;
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask) {
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       }
+       local_irq_restore(flags);
+}
 
-       cpu_clear(smp_processor_id(), mask);
+static void x2apic_send_IPI_allbutself(int vector)
+{
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               x2apic_send_IPI_mask(mask, vector);
+       local_irq_save(flags);
+       for_each_online_cpu(query_cpu)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       local_irq_restore(flags);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-       x2apic_send_IPI_mask(cpu_online_map, vector);
+       x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -95,13 +116,30 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
-       if ((unsigned)cpu < NR_CPUS)
+       cpu = cpumask_first(cpumask);
+       if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask)
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -123,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb)
        return current_cpu_data.initial_apicid >> index_msb;
 }
 
-void x2apic_send_IPI_self(int vector)
+static void x2apic_send_IPI_self(int vector)
 {
        apic_write(APIC_SELF_IPI, vector);
 }
 
-void init_x2apic_ldr(void)
+static void init_x2apic_ldr(void)
 {
        return;
 }
@@ -145,8 +183,10 @@ struct genapic apic_x2apic_phys = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+       .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index dece17289731b951c4189bd5f66db0a919f91197..b193e082f6ce41041a2a15503bb103c9e4871994 100644 (file)
@@ -79,16 +79,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t uv_target_cpus(void)
+static const struct cpumask *uv_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
-static cpumask_t uv_vector_allocation_domain(int cpu)
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -127,28 +126,37 @@ static void uv_send_IPI_one(int cpu, int vector)
        uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
 
-static void uv_send_IPI_mask(cpumask_t mask, int vector)
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned int cpu;
 
-       for_each_possible_cpu(cpu)
-               if (cpu_isset(cpu, mask))
+       for_each_cpu(cpu, mask)
+               uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+       unsigned int cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       for_each_cpu(cpu, mask)
+               if (cpu != this_cpu)
                        uv_send_IPI_one(cpu, vector);
 }
 
 static void uv_send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-
-       cpu_clear(smp_processor_id(), mask);
+       unsigned int cpu;
+       unsigned int this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               uv_send_IPI_mask(mask, vector);
+       for_each_online_cpu(cpu)
+               if (cpu != this_cpu)
+                       uv_send_IPI_one(cpu, vector);
 }
 
 static void uv_send_IPI_all(int vector)
 {
-       uv_send_IPI_mask(cpu_online_map, vector);
+       uv_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int uv_apic_id_registered(void)
@@ -160,7 +168,7 @@ static void uv_init_apic_ldr(void)
 {
 }
 
-static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -168,13 +176,30 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                             const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       for_each_cpu_and(cpu, cpumask, andmask)
+               if (cpumask_test_cpu(cpu, cpu_online_mask))
+                       break;
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -222,8 +247,10 @@ struct genapic apic_x2apic_uv_x = {
        .send_IPI_all = uv_send_IPI_all,
        .send_IPI_allbutself = uv_send_IPI_allbutself,
        .send_IPI_mask = uv_send_IPI_mask,
+       .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
        .send_IPI_self = uv_send_IPI_self,
        .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index 388e05a5fc17ec05dc68fc881c169608390538b5..b9a4d8c4b93529b3c568c054755d04d1f2534581 100644 (file)
@@ -27,7 +27,7 @@
 #include <asm/trampoline.h>
 
 /* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+static struct x8664_pda _boot_cpu_pda;
 
 #ifdef CONFIG_SMP
 /*
index 845ea097383ee4051a24b54bca8da4c29bd9f6d1..cd759ad90690e72d109aed4309adeb87755977e5 100644 (file)
@@ -248,7 +248,7 @@ static void hpet_legacy_clockevent_register(void)
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
-       hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(&hpet_clockevent);
        global_clock_event = &hpet_clockevent;
        printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -303,7 +303,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
                        struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
                        hpet_setup_msi_irq(hdev->irq);
                        disable_irq(hdev->irq);
-                       irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+                       irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
                        enable_irq(hdev->irq);
                }
                break;
@@ -451,7 +451,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
                return -1;
 
        disable_irq(dev->irq);
-       irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+       irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);
 
        printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -502,7 +502,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
        /* 5 usec minimum reprogramming delta. */
        evt->min_delta_ns = 5000;
 
-       evt->cpumask = cpumask_of_cpu(hdev->cpu);
+       evt->cpumask = cpumask_of(hdev->cpu);
        clockevents_register_device(evt);
 }
 
index c1b5e3ece1f241f9ef8716abd02d98ad6721ffa7..10f92fb532f34e1c0e0c6a93059fa845eed7049c 100644 (file)
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-       pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
                                     pit_clockevent.shift);
        pit_clockevent.max_delta_ns =
index 74917658b004aea2eb08dca5c5c9ba8a775eb81e..69911722b9d38acefc7f0f50e5d1c2df4b5b8b49 100644 (file)
@@ -136,8 +136,8 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
 
 struct irq_cfg {
        struct irq_pin_list *irq_2_pin;
-       cpumask_t domain;
-       cpumask_t old_domain;
+       cpumask_var_t domain;
+       cpumask_var_t old_domain;
        unsigned move_cleanup_count;
        u8 vector;
        u8 move_in_progress : 1;
@@ -152,22 +152,22 @@ static struct irq_cfg irq_cfgx[] = {
 #else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
 #endif
-       [0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+       [0]  = { .vector = IRQ0_VECTOR,  },
+       [1]  = { .vector = IRQ1_VECTOR,  },
+       [2]  = { .vector = IRQ2_VECTOR,  },
+       [3]  = { .vector = IRQ3_VECTOR,  },
+       [4]  = { .vector = IRQ4_VECTOR,  },
+       [5]  = { .vector = IRQ5_VECTOR,  },
+       [6]  = { .vector = IRQ6_VECTOR,  },
+       [7]  = { .vector = IRQ7_VECTOR,  },
+       [8]  = { .vector = IRQ8_VECTOR,  },
+       [9]  = { .vector = IRQ9_VECTOR,  },
+       [10] = { .vector = IRQ10_VECTOR, },
+       [11] = { .vector = IRQ11_VECTOR, },
+       [12] = { .vector = IRQ12_VECTOR, },
+       [13] = { .vector = IRQ13_VECTOR, },
+       [14] = { .vector = IRQ14_VECTOR, },
+       [15] = { .vector = IRQ15_VECTOR, },
 };
 
 int __init arch_early_irq_init(void)
@@ -183,6 +183,10 @@ int __init arch_early_irq_init(void)
        for (i = 0; i < count; i++) {
                desc = irq_to_desc(i);
                desc->chip_data = &cfg[i];
+               alloc_bootmem_cpumask_var(&cfg[i].domain);
+               alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+               if (i < NR_IRQS_LEGACY)
+                       cpumask_setall(cfg[i].domain);
        }
 
        return 0;
@@ -209,6 +213,20 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
        node = cpu_to_node(cpu);
 
        cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+       if (cfg) {
+               /* FIXME: needs alloc_cpumask_var_node() */
+               if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+                       kfree(cfg);
+                       cfg = NULL;
+               } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+                       free_cpumask_var(cfg->domain);
+                       kfree(cfg);
+                       cfg = NULL;
+               } else {
+                       cpumask_clear(cfg->domain);
+                       cpumask_clear(cfg->old_domain);
+               }
+       }
        printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
 
        return cfg;
@@ -333,13 +351,14 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
        }
 }
 
-static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg = desc->chip_data;
 
        if (!cfg->move_in_progress) {
                /* it means that domain is not changed */
-               if (!cpus_intersects(desc->affinity, mask))
+               if (!cpumask_intersects(&desc->affinity, mask))
                        cfg->move_desc_pending = 1;
        }
 }
@@ -354,7 +373,8 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 #endif
 
 #ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 }
 #endif
@@ -485,6 +505,26 @@ static void ioapic_mask_entry(int apic, int pin)
 }
 
 #ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+       cpumask_var_t cleanup_mask;
+
+       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+               unsigned int i;
+               cfg->move_cleanup_count = 0;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       cfg->move_cleanup_count++;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+       } else {
+               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               free_cpumask_var(cleanup_mask);
+       }
+       cfg->move_in_progress = 0;
+}
+
 static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
        int apic, pin;
@@ -520,41 +560,55 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
        }
 }
 
-static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
 
-static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+/*
+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int dest;
-       cpumask_t tmp;
        unsigned int irq;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
+       if (!cpumask_intersects(mask, cpu_online_mask))
+               return BAD_APICID;
 
        irq = desc->irq;
        cfg = desc->chip_data;
        if (assign_irq_vector(irq, cfg, mask))
-               return;
+               return BAD_APICID;
 
+       cpumask_and(&desc->affinity, cfg->domain, mask);
        set_extra_move_desc(desc, mask);
+       return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+}
 
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-       /*
-        * Only the high 8 bits are valid.
-        */
-       dest = SET_APIC_LOGICAL_ID(dest);
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int dest;
+       unsigned int irq;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
 
        spin_lock_irqsave(&ioapic_lock, flags);
-       __target_IO_APIC_irq(irq, dest, cfg);
-       desc->affinity = mask;
+       dest = set_desc_affinity(desc, mask);
+       if (dest != BAD_APICID) {
+               /* Only the high 8 bits are valid. */
+               dest = SET_APIC_LOGICAL_ID(dest);
+               __target_IO_APIC_irq(irq, dest, cfg);
+       }
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc;
 
@@ -652,7 +706,7 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 }
 
 #ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
 {
        /*
         * Synchronize the IO-APIC and the CPU by doing
@@ -1222,7 +1276,8 @@ void unlock_vector_lock(void)
        spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        /*
         * NOTE! The local APIC isn't very good at handling
@@ -1237,49 +1292,49 @@ static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
         */
        static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
        unsigned int old_vector;
-       int cpu;
+       int cpu, err;
+       cpumask_var_t tmp_mask;
 
        if ((cfg->move_in_progress) || cfg->move_cleanup_count)
                return -EBUSY;
 
-       /* Only try and allocate irqs on cpus that are present */
-       cpus_and(mask, mask, cpu_online_map);
+       if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+               return -ENOMEM;
 
        old_vector = cfg->vector;
        if (old_vector) {
-               cpumask_t tmp;
-               cpus_and(tmp, cfg->domain, mask);
-               if (!cpus_empty(tmp))
+               cpumask_and(tmp_mask, mask, cpu_online_mask);
+               cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+               if (!cpumask_empty(tmp_mask)) {
+                       free_cpumask_var(tmp_mask);
                        return 0;
+               }
        }
 
-       for_each_cpu_mask_nr(cpu, mask) {
-               cpumask_t domain, new_mask;
+       /* Only try and allocate irqs on cpus that are present */
+       err = -ENOSPC;
+       for_each_cpu_and(cpu, mask, cpu_online_mask) {
                int new_cpu;
                int vector, offset;
 
-               domain = vector_allocation_domain(cpu);
-               cpus_and(new_mask, domain, cpu_online_map);
+               vector_allocation_domain(cpu, tmp_mask);
 
                vector = current_vector;
                offset = current_offset;
 next:
                vector += 8;
                if (vector >= first_system_vector) {
-                       /* If we run out of vectors on large boxen, must share them. */
+                       /* If out of vectors on large boxen, must share them. */
                        offset = (offset + 1) % 8;
                        vector = FIRST_DEVICE_VECTOR + offset;
                }
                if (unlikely(current_vector == vector))
                        continue;
-#ifdef CONFIG_X86_64
-               if (vector == IA32_SYSCALL_VECTOR)
-                       goto next;
-#else
-               if (vector == SYSCALL_VECTOR)
+
+               if (test_bit(vector, used_vectors))
                        goto next;
-#endif
-               for_each_cpu_mask_nr(new_cpu, new_mask)
+
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                goto next;
                /* Found one! */
@@ -1287,18 +1342,21 @@ next:
                current_offset = offset;
                if (old_vector) {
                        cfg->move_in_progress = 1;
-                       cfg->old_domain = cfg->domain;
+                       cpumask_copy(cfg->old_domain, cfg->domain);
                }
-               for_each_cpu_mask_nr(new_cpu, new_mask)
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq;
                cfg->vector = vector;
-               cfg->domain = domain;
-               return 0;
+               cpumask_copy(cfg->domain, tmp_mask);
+               err = 0;
+               break;
        }
-       return -ENOSPC;
+       free_cpumask_var(tmp_mask);
+       return err;
 }
 
-static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        int err;
        unsigned long flags;
@@ -1311,23 +1369,20 @@ static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
 
 static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-       cpumask_t mask;
        int cpu, vector;
 
        BUG_ON(!cfg->vector);
 
        vector = cfg->vector;
-       cpus_and(mask, cfg->domain, cpu_online_map);
-       for_each_cpu_mask_nr(cpu, mask)
+       for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
 
        cfg->vector = 0;
-       cpus_clear(cfg->domain);
+       cpumask_clear(cfg->domain);
 
        if (likely(!cfg->move_in_progress))
                return;
-       cpus_and(mask, cfg->old_domain, cpu_online_map);
-       for_each_cpu_mask_nr(cpu, mask) {
+       for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
                for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
                                                                vector++) {
                        if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1350,7 +1405,7 @@ void __setup_vector_irq(int cpu)
        /* Mark the inuse vectors */
        for_each_irq_desc(irq, desc) {
                cfg = desc->chip_data;
-               if (!cpu_isset(cpu, cfg->domain))
+               if (!cpumask_test_cpu(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
                per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1362,7 +1417,7 @@ void __setup_vector_irq(int cpu)
                        continue;
 
                cfg = irq_cfg(irq);
-               if (!cpu_isset(cpu, cfg->domain))
+               if (!cpumask_test_cpu(cpu, cfg->domain))
                        per_cpu(vector_irq, cpu)[vector] = -1;
        }
 }
@@ -1498,18 +1553,17 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
 {
        struct irq_cfg *cfg;
        struct IO_APIC_route_entry entry;
-       cpumask_t mask;
+       unsigned int dest;
 
        if (!IO_APIC_IRQ(irq))
                return;
 
        cfg = desc->chip_data;
 
-       mask = TARGET_CPUS;
-       if (assign_irq_vector(irq, cfg, mask))
+       if (assign_irq_vector(irq, cfg, TARGET_CPUS))
                return;
 
-       cpus_and(mask, cfg->domain, mask);
+       dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1519,8 +1573,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_de
 
 
        if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
-                              cpu_mask_to_apicid(mask), trigger, polarity,
-                              cfg->vector)) {
+                              dest, trigger, polarity, cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic].mp_apicid, pin);
                __clear_irq_vector(irq, cfg);
@@ -2240,7 +2293,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
        unsigned long flags;
 
        spin_lock_irqsave(&vector_lock, flags);
-       send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+       send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
        spin_unlock_irqrestore(&vector_lock, flags);
 
        return 1;
@@ -2289,18 +2342,17 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
-       cpumask_t tmp, cleanup_mask;
        struct irte irte;
        int modify_ioapic_rte;
        unsigned int dest;
        unsigned long flags;
        unsigned int irq;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       if (!cpumask_intersects(mask, cpu_online_mask))
                return;
 
        irq = desc->irq;
@@ -2313,8 +2365,7 @@ static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
 
        set_extra_move_desc(desc, mask);
 
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
+       dest = cpu_mask_to_apicid_and(cfg->domain, mask);
 
        modify_ioapic_rte = desc->status & IRQ_LEVEL;
        if (modify_ioapic_rte) {
@@ -2331,14 +2382,10 @@ static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
         */
        modify_irte(irq, &irte);
 
-       if (cfg->move_in_progress) {
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
+       if (cfg->move_in_progress)
+               send_cleanup_vector(cfg);
 
-       desc->affinity = mask;
+       cpumask_copy(&desc->affinity, mask);
 }
 
 static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2360,11 +2407,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
        }
 
        /* everthing is clear. we have right of way */
-       migrate_ioapic_irq_desc(desc, desc->pending_mask);
+       migrate_ioapic_irq_desc(desc, &desc->pending_mask);
 
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
-       cpus_clear(desc->pending_mask);
+       cpumask_clear(&desc->pending_mask);
 
 unmask:
        unmask_IO_APIC_irq_desc(desc);
@@ -2389,7 +2436,7 @@ static void ir_irq_migration(struct work_struct *work)
                                continue;
                        }
 
-                       desc->chip->set_affinity(irq, desc->pending_mask);
+                       desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
@@ -2398,18 +2445,20 @@ static void ir_irq_migration(struct work_struct *work)
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+                                           const struct cpumask *mask)
 {
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
-               desc->pending_mask = mask;
+               cpumask_copy(&desc->pending_mask, mask);
                migrate_irq_remapped_level_desc(desc);
                return;
        }
 
        migrate_ioapic_irq_desc(desc, mask);
 }
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+                                      const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
 
@@ -2444,7 +2493,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                if (!cfg->move_cleanup_count)
                        goto unlock;
 
-               if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+               if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                        goto unlock;
 
                __get_cpu_var(vector_irq)[vector] = -1;
@@ -2481,20 +2530,14 @@ static void irq_complete_move(struct irq_desc **descp)
 
        vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
-       if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
-               cpumask_t cleanup_mask;
-
 #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
                *descp = desc = move_irq_desc(desc, me);
                /* get the new one */
                cfg = desc->chip_data;
 #endif
 
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
+       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+               send_cleanup_vector(cfg);
 }
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
@@ -3216,16 +3259,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
        struct irq_cfg *cfg;
        int err;
        unsigned dest;
-       cpumask_t tmp;
 
        cfg = irq_cfg(irq);
-       tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, cfg, tmp);
+       err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (err)
                return err;
 
-       cpus_and(tmp, cfg->domain, tmp);
-       dest = cpu_mask_to_apicid(tmp);
+       dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
 #ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
@@ -3279,26 +3319,18 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
        cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
 
        read_msi_msg_desc(desc, &msg);
 
@@ -3308,37 +3340,27 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        write_msi_msg_desc(desc, &msg);
-       desc->affinity = mask;
 }
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-       struct irq_cfg *cfg;
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned int dest;
-       cpumask_t tmp, cleanup_mask;
        struct irte irte;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
-
        if (get_irte(irq, &irte))
                return;
 
-       cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       set_extra_move_desc(desc, mask);
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-
        irte.vector = cfg->vector;
        irte.dest_id = IRTE_DEST(dest);
 
@@ -3352,14 +3374,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
         * at the new destination. So, time to cleanup the previous
         * vector allocation.
         */
-       if (cfg->move_in_progress) {
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
-
-       desc->affinity = mask;
+       if (cfg->move_in_progress)
+               send_cleanup_vector(cfg);
 }
 
 #endif
@@ -3550,26 +3566,18 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
        cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
 
        dmar_msi_read(irq, &msg);
 
@@ -3579,7 +3587,6 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        dmar_msi_write(irq, &msg);
-       desc->affinity = mask;
 }
 
 #endif /* CONFIG_SMP */
@@ -3613,26 +3620,18 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
        cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
 
        hpet_msi_read(irq, &msg);
 
@@ -3642,7 +3641,6 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        hpet_msi_write(irq, &msg);
-       desc->affinity = mask;
 }
 
 #endif /* CONFIG_SMP */
@@ -3697,28 +3695,19 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
        write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
-       cpumask_t tmp;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
        cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
-
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
 
        target_ht_irq(irq, dest, cfg->vector);
-       desc->affinity = mask;
 }
 
 #endif
@@ -3738,17 +3727,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
        struct irq_cfg *cfg;
        int err;
-       cpumask_t tmp;
 
        cfg = irq_cfg(irq);
-       tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, cfg, tmp);
+       err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (!err) {
                struct ht_irq_msg msg;
                unsigned dest;
 
-               cpus_and(tmp, cfg->domain, tmp);
-               dest = cpu_mask_to_apicid(tmp);
+               dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
                msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
 
@@ -3784,7 +3770,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                       unsigned long mmr_offset)
 {
-       const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+       const struct cpumask *eligible_cpu = cpumask_of(cpu);
        struct irq_cfg *cfg;
        int mmr_pnode;
        unsigned long mmr_value;
@@ -3794,7 +3780,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 
        cfg = irq_cfg(irq);
 
-       err = assign_irq_vector(irq, cfg, *eligible_cpu);
+       err = assign_irq_vector(irq, cfg, eligible_cpu);
        if (err != 0)
                return err;
 
@@ -3813,7 +3799,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        entry->polarity = 0;
        entry->trigger = 0;
        entry->mask = 0;
-       entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+       entry->dest = cpu_mask_to_apicid(eligible_cpu);
 
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
        uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -4024,7 +4010,7 @@ void __init setup_ioapic_dest(void)
        int pin, ioapic, irq, irq_entry;
        struct irq_desc *desc;
        struct irq_cfg *cfg;
-       cpumask_t mask;
+       const struct cpumask *mask;
 
        if (skip_ioapic_setup == 1)
                return;
@@ -4055,7 +4041,7 @@ void __init setup_ioapic_dest(void)
                         */
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                               mask = desc->affinity;
+                               mask = &desc->affinity;
                        else
                                mask = TARGET_CPUS;
 
index f1c688e46f35a8443e031b0e8dd572800a86c097..285bbf8831faf3cd826a4dd7cbf295ea0684ec21 100644 (file)
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 /*
  * This is only used on smaller machines.
  */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
 {
-       unsigned long mask = cpus_addr(cpumask)[0];
+       unsigned long mask = cpumask_bits(cpumask)[0];
        unsigned long flags;
 
        local_irq_save(flags);
-       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+       WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
        __send_IPI_dest_field(mask, vector);
        local_irq_restore(flags);
 }
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
         */
 
        local_irq_save(flags);
-       for_each_possible_cpu(query_cpu) {
-               if (cpu_isset(query_cpu, mask)) {
+       for_each_cpu(query_cpu, mask)
+               __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
+       local_irq_restore(flags);
+}
+
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+       unsigned long flags;
+       unsigned int query_cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       /* See Hack comment above */
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
                        __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
                                              vector);
-               }
-       }
        local_irq_restore(flags);
 }
 
index 3f1d9d18df679c858bda94daae4d578c0d8cbf93..bce53e1352a087b5841f9998b4fa63cbf40b8c2a 100644 (file)
@@ -9,6 +9,7 @@
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/smp.h>
+#include <asm/irq.h>
 
 atomic_t irq_err_count;
 
@@ -190,3 +191,5 @@ u64 arch_irq_stat(void)
 #endif
        return sum;
 }
+
+EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
index 119fc9c8ff7f29135b0e353cc31722cd6abe452f..9dc5588f336a21bfa5fb19eb83a32e022fb8db96 100644 (file)
@@ -233,27 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
 
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
 
        for_each_irq_desc(irq, desc) {
-               cpumask_t mask;
+               const struct cpumask *affinity;
 
                if (!desc)
                        continue;
                if (irq == 2)
                        continue;
 
-               cpus_and(mask, desc->affinity, map);
-               if (any_online_cpu(mask) == NR_CPUS) {
+               affinity = &desc->affinity;
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        printk("Breaking affinity for irq %i\n", irq);
-                       mask = map;
+                       affinity = cpu_all_mask;
                }
                if (desc->chip->set_affinity)
-                       desc->chip->set_affinity(irq, mask);
+                       desc->chip->set_affinity(irq, affinity);
                else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
index a174a217eb1aefc25c971c2d6155e483d2d50192..6383d50f82ea11e93220a9fd45b9f0fc466860e6 100644 (file)
@@ -80,16 +80,17 @@ asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
 
        for_each_irq_desc(irq, desc) {
-               cpumask_t mask;
                int break_affinity = 0;
                int set_affinity = 1;
+               const struct cpumask *affinity;
 
                if (!desc)
                        continue;
@@ -99,23 +100,23 @@ void fixup_irqs(cpumask_t map)
                /* interrupt's are disabled at this point */
                spin_lock(&desc->lock);
 
+               affinity = &desc->affinity;
                if (!irq_has_action(irq) ||
-                   cpus_equal(desc->affinity, map)) {
+                   cpumask_equal(affinity, cpu_online_mask)) {
                        spin_unlock(&desc->lock);
                        continue;
                }
 
-               cpus_and(mask, desc->affinity, map);
-               if (cpus_empty(mask)) {
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        break_affinity = 1;
-                       mask = map;
+                       affinity = cpu_all_mask;
                }
 
                if (desc->chip->mask)
                        desc->chip->mask(irq);
 
                if (desc->chip->set_affinity)
-                       desc->chip->set_affinity(irq, mask);
+                       desc->chip->set_affinity(irq, affinity);
                else if (!(warned++))
                        set_affinity = 0;
 
index 203384ed2b5d85342f0ad9ea4f85adf45385acb5..84723295f88a061a8078dbff08a66df484789dc9 100644 (file)
@@ -110,6 +110,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
        [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
 
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (per_cpu(vector_irq, cpu)[vector] != -1)
+                       return 1;
+       }
+
+       return 0;
+}
+
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
@@ -146,10 +158,12 @@ void __init native_init_IRQ(void)
        alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 
        /* IPI for single call function */
-       set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+                                call_function_single_interrupt);
 
        /* Low priority IPI to cleanup after moving an irq */
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
index 6190e6ef546cfc197e63d0a716861d94905b9a59..31ebfe38e96ce98327ed64075e773f6bde4b0d66 100644 (file)
@@ -69,6 +69,18 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
        [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
 
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (per_cpu(vector_irq, cpu)[vector] != -1)
+                       return 1;
+       }
+
+       return 0;
+}
+
 void __init init_ISA_irqs(void)
 {
        int i;
@@ -121,6 +133,7 @@ static void __init smp_intr_init(void)
 
        /* Low priority IPI to cleanup after moving an irq */
        set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
 }
 
index e169ae9b6a62e9f3d2b4d914de87ce4157c26569..652fce6d2cceb9166f2bcacfa447c75060ede8c3 100644 (file)
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
  */
 static unsigned long kvm_get_tsc_khz(void)
 {
-       return preset_lpj;
+       struct pvclock_vcpu_time_info *src;
+       src = &per_cpu(hv_clock, 0);
+       return pvclock_tsc_khz(src);
 }
 
 static void kvm_get_preset_lpj(void)
 {
-       struct pvclock_vcpu_time_info *src;
        unsigned long khz;
        u64 lpj;
 
-       src = &per_cpu(hv_clock, 0);
-       khz = pvclock_tsc_khz(src);
+       khz = kvm_get_tsc_khz();
 
        lpj = ((u64)khz * 1000);
        do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
 #endif
                kvm_get_preset_lpj();
                clocksource_register(&kvm_clock);
+               pv_info.paravirt_enabled = 1;
+               pv_info.name = "KVM";
        }
 }
index eee32b43fee3e59b5ab8e3a2243bb6f6a56fb21e..71f1d99a635d75dd7f30ee1f1876b91112d2a82c 100644 (file)
@@ -12,8 +12,8 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
        if (err < 0)
                return err;
 
-       for(i = 0; i < old->size; i++)
+       for (i = 0; i < old->size; i++)
                write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
        return 0;
 }
index 3b599518c322a143c9623759061a44ca92647ecf..c12314c9e86fd512367a588646770ea0c4149a5c 100644 (file)
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
        .set_mode = mfgpt_set_mode,
        .set_next_event = mfgpt_next_event,
        .rating = 250,
-       .cpumask = CPU_MASK_ALL,
+       .cpumask = cpu_all_mask,
        .shift = 32
 };
 
index efc2f361fe85f0c1280cfce1aeafefa572a5c83d..666e43df51f99ce58b92c844149e1f8efcfa258f 100644 (file)
@@ -13,8 +13,7 @@
 #include <asm/msr.h>
 #include <asm/acpi.h>
 #include <asm/mmconfig.h>
-
-#include "../pci/pci.h"
+#include <asm/pci_x86.h>
 
 struct pci_hostbridge_probe {
        u32 bus;
index 45e3b69808ba6722bcc9617cdd0c883cab1a8ee2..c5c5b8df1dbcc2ef40caa3f11451112102a65fb1 100644 (file)
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/acpi.h>
 
-#include <asm/smp.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
 #include <asm/io_apic.h>
 #include <asm/proto.h>
-#include <asm/acpi.h>
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
 #include <asm/trampoline.h>
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #endif
 
        if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
-                set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+               set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
                mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 #endif
        } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
                        x86_quirks->mpc_oem_pci_bus(m);
 
                clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
                mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
        } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
                mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
index 8bd1bf9622a7cf1062f33e039c37a2167e5f2918..45a09ccdc2145e4c1b10b4cf1d0ce0715a7a5fc8 100644 (file)
 #include <linux/kernel_stat.h>
 #include <linux/kdebug.h>
 #include <linux/smp.h>
+#include <linux/nmi.h>
 
 #include <asm/i8259.h>
 #include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/nmi.h>
 #include <asm/proto.h>
 #include <asm/timer.h>
 
index a35eaa379ff632fb142367df58a8fdeaaad81267..00c2bcd41463e325976a74cbee0fa06f4b6fe85c 100644 (file)
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base;          /* Remapping table */
  * to trigger bugs with some popular PCI cards, in particular 3ware (but
  * has been also also seen with Qlogic at least).
  */
-int iommu_fullflush = 1;
+static int iommu_fullflush = 1;
 
 /* Allocation bitmap for the remapping area: */
 static DEFINE_SPINLOCK(iommu_bitmap_lock);
index 61f718df6eec1fe74dffaa2cb3b110b1deaeb70b..bf088c61fa40c07fc8616505a8cee640b8f35e1d 100644 (file)
@@ -12,6 +12,8 @@
 #include <asm/proto.h>
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
 
 #ifdef CONFIG_X86_32
 # include <linux/dmi.h>
@@ -23,7 +25,6 @@
 
 #include <mach_ipi.h>
 
-
 /*
  * Power off function, if any
  */
@@ -39,6 +40,12 @@ int reboot_force;
 static int reboot_cpu = -1;
 #endif
 
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be on
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
+
 /* This is set by the PCI code if either type 1 or type 2 PCI is detected */
 bool port_cf9_safe = false;
 
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
        }
 }
 
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+       cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+       /* Just make sure we won't change CPUs while doing this */
+       local_irq_disable();
+
+       /* We need to disable VMX on all CPUs before rebooting, otherwise
+        * we risk hanging up the machine, because the CPU ignore INIT
+        * signals when VMX is enabled.
+        *
+        * We can't take any locks and we may be on an inconsistent
+        * state, so we use NMIs as IPIs to tell the other CPUs to disable
+        * VMX and halt.
+        *
+        * For safety, we will avoid running the nmi_shootdown_cpus()
+        * stuff unnecessarily, but we don't have a way to check
+        * if other CPUs have VMX enabled. So we will call it only if the
+        * CPU we are running on has VMX enabled.
+        *
+        * We will miss cases where VMX is not enabled on all CPUs. This
+        * shouldn't do much harm because KVM always enable VMX on all
+        * CPUs anyway. But we can miss it on the small window where KVM
+        * is still enabling VMX.
+        */
+       if (cpu_has_vmx() && cpu_vmx_enabled()) {
+               /* Disable VMX on this CPU.
+                */
+               cpu_vmxoff();
+
+               /* Halt and disable VMX on the other CPUs */
+               nmi_shootdown_cpus(vmxoff_nmi);
+
+       }
+}
+
+
 void __attribute__((weak)) mach_reboot_fixups(void)
 {
 }
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
 {
        int i;
 
+       if (reboot_emergency)
+               emergency_vmx_disable_all();
+
        /* Tell the BIOS if we want cold or warm reboot */
        *((unsigned short *)__va(0x472)) = reboot_mode;
 
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
 #endif
 }
 
+static void __machine_emergency_restart(int emergency)
+{
+       reboot_emergency = emergency;
+       machine_ops.emergency_restart();
+}
+
 static void native_machine_restart(char *__unused)
 {
        printk("machine restart\n");
 
        if (!reboot_force)
                machine_shutdown();
-       machine_emergency_restart();
+       __machine_emergency_restart(0);
 }
 
 static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
 
 void machine_emergency_restart(void)
 {
-       machine_ops.emergency_restart();
+       __machine_emergency_restart(1);
 }
 
 void machine_restart(char *cmd)
@@ -592,10 +650,7 @@ static int crash_nmi_callback(struct notifier_block *self,
 
 static void smp_send_nmi_allbutself(void)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(safe_smp_processor_id(), mask);
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, NMI_VECTOR);
+       send_IPI_allbutself(NMI_VECTOR);
 }
 
 static struct notifier_block crash_nmi_nb = {
index ae0c0d3bb7704605440467d26ec48eaf84faad83..0b63b08e75304a1da386a1686b7ed0d647541081 100644 (file)
@@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void)
        old_size = PERCPU_ENOUGH_ROOM;
        align = max_t(unsigned long, PAGE_SIZE, align);
        size = roundup(old_size, align);
+
+       printk(KERN_INFO
+               "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+               NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
        printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
                          size);
 
@@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void)
                               "cpu %d has no node %d or node-local memory\n",
                                cpu, node);
                        if (ptr)
-                               printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+                               printk(KERN_DEBUG
+                                       "per cpu data for cpu%d at %016lx\n",
                                         cpu, __pa(ptr));
                }
                else {
                        ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
                                                        __pa(MAX_DMA_ADDRESS));
                        if (ptr)
-                               printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
-                                        cpu, node, __pa(ptr));
+                               printk(KERN_DEBUG
+                                       "per cpu data for cpu%d on node%d "
+                                       "at %016lx\n",
+                                       cpu, node, __pa(ptr));
                }
 #endif
                per_cpu_offset(cpu) = ptr - __per_cpu_start;
                memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
        }
 
-       printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
-               NR_CPUS, nr_cpu_ids, nr_node_ids);
-
        /* Setup percpu data maps */
        setup_per_cpu_maps();
 
@@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
        else
                cpu_clear(cpu, *mask);
 
-       cpulist_scnprintf(buf, sizeof(buf), *mask);
+       cpulist_scnprintf(buf, sizeof(buf), mask);
        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
                enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
  }
index 7e558db362c1870f19013da9879f8cea1542e472..beea2649a2406240c3727da2a02e1f538feadba9 100644 (file)
@@ -118,22 +118,22 @@ static void native_smp_send_reschedule(int cpu)
                WARN_ON(1);
                return;
        }
-       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+       send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
 }
 
 void native_send_call_func_single_ipi(int cpu)
 {
-       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+       send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
 }
 
-void native_send_call_func_ipi(cpumask_t mask)
+void native_send_call_func_ipi(const struct cpumask *mask)
 {
        cpumask_t allbutself;
 
        allbutself = cpu_online_map;
        cpu_clear(smp_processor_id(), allbutself);
 
-       if (cpus_equal(mask, allbutself) &&
+       if (cpus_equal(*mask, allbutself) &&
            cpus_equal(cpu_online_map, cpu_callout_map))
                send_IPI_allbutself(CALL_FUNCTION_VECTOR);
        else
index f8500c969442875e4db94cf9ca3d68e1c3777fbf..31869bf5fabd68a187fe36799a3cff7fa9b5f0a9 100644 (file)
@@ -102,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
 
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -1260,6 +1254,15 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
        check_nmi_watchdog();
 }
 
+static int __initdata setup_possible_cpus = -1;
+static int __init _setup_possible_cpus(char *str)
+{
+       get_option(&str, &setup_possible_cpus);
+       return 0;
+}
+early_param("possible_cpus", _setup_possible_cpus);
+
+
 /*
  * cpu_possible_map should be static, it cannot change as cpu's
  * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1272,7 +1275,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
  *
  * Three ways to find out the number of additional hotplug CPUs:
  * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
+ * - The user can overwrite it with possible_cpus=NUM
  * - Otherwise don't reserve additional CPUs.
  * We do this because additional CPUs waste a lot of memory.
  * -AK
@@ -1285,9 +1288,17 @@ __init void prefill_possible_map(void)
        if (!num_processors)
                num_processors = 1;
 
-       possible = num_processors + disabled_cpus;
-       if (possible > NR_CPUS)
-               possible = NR_CPUS;
+       if (setup_possible_cpus == -1)
+               possible = num_processors + disabled_cpus;
+       else
+               possible = setup_possible_cpus;
+
+       if (possible > CONFIG_NR_CPUS) {
+               printk(KERN_WARNING
+                       "%d Processors exceeds NR_CPUS limit of %d\n",
+                       possible, CONFIG_NR_CPUS);
+               possible = CONFIG_NR_CPUS;
+       }
 
        printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
                possible, max_t(int, possible - num_processors, 0));
@@ -1352,7 +1363,7 @@ void cpu_disable_common(void)
        lock_vector_lock();
        remove_cpu_from_maps(cpu);
        unlock_vector_lock();
-       fixup_irqs(cpu_online_map);
+       fixup_irqs();
 }
 
 int native_cpu_disable(void)
index 8da059f949be9527e1ce7919bfff54550a65c714..ce505464224758c650a34f153a17fff8003e267c 100644 (file)
@@ -163,7 +163,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
 
        while (!cpus_empty(flush_cpumask))
                /* nothing. lockup detection does not belong here */
index 29887d7081a9572557e5ae30f630f8c2a3ab3321..f8be6f1d2e48645c35d883479d6072418d4d2d26 100644 (file)
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
        while (!cpus_empty(f->flush_cpumask))
                cpu_relax();
index 6a00e5faaa74375d5b709488a0b9c371c3fe5c79..f885023167e042073cc3f37721cb7a0d3e1b2941 100644 (file)
@@ -582,7 +582,6 @@ static int __init uv_ptc_init(void)
 static struct bau_control * __init uv_table_bases_init(int blade, int node)
 {
        int i;
-       int *ip;
        struct bau_msg_status *msp;
        struct bau_control *bau_tabp;
 
@@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
                bau_cpubits_clear(&msp->seen_by, (int)
                                  uv_blade_nr_possible_cpus(blade));
 
-       bau_tabp->watching =
-           kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
-       BUG_ON(!bau_tabp->watching);
-
-       for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
-               *ip = 0;
-
        uv_bau_table_bases[blade] = bau_tabp;
 
        return bau_tabp;
@@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
                bcp->bau_msg_head       = bau_tablesp->va_queue_first;
                bcp->va_queue_first     = bau_tablesp->va_queue_first;
                bcp->va_queue_last      = bau_tablesp->va_queue_last;
-               bcp->watching           = bau_tablesp->watching;
                bcp->msg_statuses       = bau_tablesp->msg_statuses;
                bcp->descriptor_base    = adp;
        }
index 141907ab6e2260adaa2f20cbcdc6747751184c64..ce6650eb64e976e120cbefa87adae9b4675e94ed 100644 (file)
@@ -72,9 +72,6 @@
 
 #include "cpu/mcheck/mce.h"
 
-DECLARE_BITMAP(used_vectors, NR_VECTORS);
-EXPORT_SYMBOL_GPL(used_vectors);
-
 asmlinkage int system_call(void);
 
 /* Do we ignore FPU interrupts ? */
@@ -89,6 +86,9 @@ gate_desc idt_table[256]
        __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
 #endif
 
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+
 static int ignore_nmis;
 
 static inline void conditional_sti(struct pt_regs *regs)
@@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 8;
 
-       /* This is always a kernel trap and never fixable (and thus must
-          never return). */
+       /*
+        * This is always a kernel trap and never fixable (and thus must
+        * never return).
+        */
        for (;;)
                die(str, regs, error_code);
 }
@@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 }
 
 #ifdef CONFIG_X86_64
-/* Help handler running on IST stack to switch back to user stack
-   for scheduling or signal handling. The actual stack switch is done in
-   entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
 asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
        struct pt_regs *regs = eregs;
@@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
        /* Exception from user space */
        else if (user_mode(eregs))
                regs = task_pt_regs(current);
-       /* Exception from kernel and interrupts are enabled. Move to
-          kernel process stack. */
+       /*
+        * Exception from kernel and interrupts are enabled. Move to
+        * kernel process stack.
+        */
        else if (eregs->flags & X86_EFLAGS_IF)
                regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
        if (eregs != regs)
@@ -685,12 +691,7 @@ void math_error(void __user *ip)
        cwd = get_fpu_cwd(task);
        swd = get_fpu_swd(task);
 
-       err = swd & ~cwd & 0x3f;
-
-#ifdef CONFIG_X86_32
-       if (!err)
-               return;
-#endif
+       err = swd & ~cwd;
 
        if (err & 0x001) {      /* Invalid op */
                /*
@@ -708,7 +709,11 @@ void math_error(void __user *ip)
        } else if (err & 0x020) { /* Precision */
                info.si_code = FPE_FLTRES;
        } else {
-               info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
+               /*
+                * If we're using IRQ 13, or supposedly even some trap 16
+                * implementations, it's possible we get a spurious trap...
+                */
+               return;         /* Spurious trap, no error */
        }
        force_sig_info(SIGFPE, &info, task);
 }
@@ -941,9 +946,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 
 void __init trap_init(void)
 {
-#ifdef CONFIG_X86_32
        int i;
-#endif
 
 #ifdef CONFIG_EISA
        void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1000,11 +1003,15 @@ void __init trap_init(void)
        }
 
        set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+#endif
 
        /* Reserve all the builtin and the syscall vector: */
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                set_bit(i, used_vectors);
 
+#ifdef CONFIG_X86_64
+       set_bit(IA32_SYSCALL_VECTOR, used_vectors);
+#else
        set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
        /*
index 254ee07f8635007262591bbdf595f819caa1358d..c4c1f9e094027089373189f7e57cb7ede8149be9 100644 (file)
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
        /* Upper bound is clockevent's use of ulong for cycle deltas. */
        evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
        evt->min_delta_ns = clockevent_delta2ns(1, evt);
-       evt->cpumask = cpumask_of_cpu(cpu);
+       evt->cpumask = cpumask_of(cpu);
 
        printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
               evt->name, evt->mult, evt->shift);
index 15c3e699918221487429508113173f692794a943..2b54fe002e94e490db9a0e8ec3d4b89ec76524ab 100644 (file)
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
  * Restore the extended state if present. Otherwise, restore the FP/SSE
  * state.
  */
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
 {
        struct _fpx_sw_bytes fx_sw_user;
        u64 mask;
index 59ebd37ad79e25b658217e02dd5b57ab38f18c21..e665d1c623ca04f45444daf5324e6c6d6477612d 100644 (file)
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
 
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
+       struct kvm_vcpu *vcpu;
+       int i;
+
        mutex_lock(&kvm->lock);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
        kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
        mutex_unlock(&kvm->lock);
+
+       /*
+        * Provides NMI watchdog support via Virtual Wire mode.
+        * The route is: PIT -> PIC -> LVT0 in NMI mode.
+        *
+        * Note: Our Virtual Wire implementation is simplified, only
+        * propagating PIT interrupts to all VCPUs when they have set
+        * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+        * VCPU0, and only if its LVT0 is in EXTINT mode.
+        */
+       if (kvm->arch.vapics_in_nmi_mode > 0)
+               for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       vcpu = kvm->vcpus[i];
+                       if (vcpu)
+                               kvm_apic_nmi_wd_deliver(vcpu);
+               }
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
index 17e41e165f1a296cb538f48ec118db1109cf3b92..179dcb0103fdcf7b2a59c27ee399fdd4302fe67a 100644 (file)
  *   Port from Qemu.
  */
 #include <linux/mm.h>
+#include <linux/bitops.h>
 #include "irq.h"
 
 #include <linux/kvm_host.h>
 
+static void pic_lock(struct kvm_pic *s)
+{
+       spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+{
+       struct kvm *kvm = s->kvm;
+       unsigned acks = s->pending_acks;
+       bool wakeup = s->wakeup_needed;
+       struct kvm_vcpu *vcpu;
+
+       s->pending_acks = 0;
+       s->wakeup_needed = false;
+
+       spin_unlock(&s->lock);
+
+       while (acks) {
+               kvm_notify_acked_irq(kvm, __ffs(acks));
+               acks &= acks - 1;
+       }
+
+       if (wakeup) {
+               vcpu = s->kvm->vcpus[0];
+               if (vcpu)
+                       kvm_vcpu_kick(vcpu);
+       }
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
        s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
+       pic_lock(s);
        pic_update_irq(s);
+       pic_unlock(s);
 }
 
 void kvm_pic_set_irq(void *opaque, int irq, int level)
 {
        struct kvm_pic *s = opaque;
 
+       pic_lock(s);
        if (irq >= 0 && irq < PIC_NUM_PINS) {
                pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
                pic_update_irq(s);
        }
+       pic_unlock(s);
 }
 
 /*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
        int irq, irq2, intno;
        struct kvm_pic *s = pic_irqchip(kvm);
 
+       pic_lock(s);
        irq = pic_get_irq(&s->pics[0]);
        if (irq >= 0) {
                pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
                intno = s->pics[0].irq_base + irq;
        }
        pic_update_irq(s);
+       pic_unlock(s);
        kvm_notify_acked_irq(kvm, irq);
 
        return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-       int irq, irqbase;
+       int irq, irqbase, n;
        struct kvm *kvm = s->pics_state->irq_request_opaque;
        struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
 
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 
        for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
                if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-                       if (s->irr & (1 << irq) || s->isr & (1 << irq))
-                               kvm_notify_acked_irq(kvm, irq+irqbase);
+                       if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
+                               n = irq + irqbase;
+                               s->pics_state->pending_acks |= 1 << n;
+                       }
        }
        s->last_irr = 0;
        s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
                        printk(KERN_ERR "PIC: non byte write\n");
                return;
        }
+       pic_lock(s);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
                elcr_ioport_write(&s->pics[addr & 1], addr, data);
                break;
        }
+       pic_unlock(s);
 }
 
 static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
                        printk(KERN_ERR "PIC: non byte read\n");
                return;
        }
+       pic_lock(s);
        switch (addr) {
        case 0x20:
        case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
                break;
        }
        *(unsigned char *)val = data;
+       pic_unlock(s);
 }
 
 /*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
        s->output = level;
        if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
                s->pics[0].isr_ack &= ~(1 << irq);
-               kvm_vcpu_kick(vcpu);
+               s->wakeup_needed = true;
        }
 }
 
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
        s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
        if (!s)
                return NULL;
+       spin_lock_init(&s->lock);
+       s->kvm = kvm;
        s->pics[0].elcr_mask = 0xf8;
        s->pics[1].elcr_mask = 0xde;
        s->irq_request = pic_irq_request;
index f17c8f5bbf31b8bae7b1d2a0d770bd5a968a7a6a..2bf32a03ceecf5837f6e30ef99b80067423632da 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/mm_types.h>
 #include <linux/hrtimer.h>
 #include <linux/kvm_host.h>
+#include <linux/spinlock.h>
 
 #include "iodev.h"
 #include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
 };
 
 struct kvm_pic {
+       spinlock_t lock;
+       bool wakeup_needed;
+       unsigned pending_acks;
+       struct kvm *kvm;
        struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
        irq_request_func *irq_request;
        void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
index 65ef0fc2c036ab15527e4abb04a9f2b3ca12eb4f..8e5ee99551f6e75bf5c2a2ea64627abe98b12607 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/kvm_host.h>
 #include <asm/msr.h>
 
-#include "svm.h"
+#include <asm/svm.h>
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
index 0fc3cab48943da8a3c513c7c753a4792c6743198..afac68c0815c78e712edc789821029d05b1d2d2f 100644 (file)
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
        return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
 }
 
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+       return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
        LVT_MASK | APIC_LVT_TIMER_PERIODIC,     /* LVTT */
        LVT_MASK | APIC_MODE_MASK,      /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
        case APIC_DM_NMI:
                kvm_inject_nmi(vcpu);
+               kvm_vcpu_kick(vcpu);
                break;
 
        case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                }
                break;
 
+       case APIC_DM_EXTINT:
+               /*
+                * Should only be called by kvm_apic_local_deliver() with LVT0,
+                * before NMI watchdog was enabled. Already handled by
+                * kvm_apic_accept_pic_intr().
+                */
+               break;
+
        default:
                printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
                       delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
                                        apic->timer.period)));
 }
 
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+       int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+       if (apic_lvt_nmi_mode(lvt0_val)) {
+               if (!nmi_wd_enabled) {
+                       apic_debug("Receive NMI setting on APIC_LVT0 "
+                                  "for cpu %d\n", apic->vcpu->vcpu_id);
+                       apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+               }
+       } else if (nmi_wd_enabled)
+               apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
 static void apic_mmio_write(struct kvm_io_device *this,
                            gpa_t address, int len, const void *data)
 {
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
                apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
                break;
 
+       case APIC_LVT0:
+               apic_manage_nmi_watchdog(apic, val);
        case APIC_LVTT:
        case APIC_LVTTHMR:
        case APIC_LVTPC:
-       case APIC_LVT0:
        case APIC_LVT1:
        case APIC_LVTERR:
                /* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
        return 0;
 }
 
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+       u32 reg = apic_get_reg(apic, lvt_type);
+       int vector, mode, trig_mode;
+
+       if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+               vector = reg & APIC_VECTOR_MASK;
+               mode = reg & APIC_MODE_MASK;
+               trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+               return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+       }
+       return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 {
-       int vector;
+       struct kvm_lapic *apic = vcpu->arch.apic;
 
-       vector = apic_lvt_vector(apic, APIC_LVTT);
-       return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+       if (apic)
+               kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
-       if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
-               atomic_read(&apic->timer.pending) > 0) {
-               if (__inject_apic_timer_irq(apic))
+       if (apic && atomic_read(&apic->timer.pending) > 0) {
+               if (kvm_apic_local_deliver(apic, APIC_LVTT))
                        atomic_dec(&apic->timer.pending);
        }
 }
index 410ddbc1aa2eb0fc2dfa45de560c829df364aa81..83f11c7474a1d74ef996ca7245078e1b6dda7051 100644 (file)
@@ -17,7 +17,6 @@
  *
  */
 
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/vmx.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask;     /* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-               u64 dirty_mask, u64 nx_mask, u64 x_mask)
+               u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
 {
        shadow_user_mask = user_mask;
        shadow_accessed_mask = accessed_mask;
        shadow_dirty_mask = dirty_mask;
        shadow_nx_mask = nx_mask;
        shadow_x_mask = x_mask;
+       shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        int *write_count;
 
-       write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+       gfn = unalias_gfn(kvm, gfn);
+       write_count = slot_largepage_idx(gfn,
+                                        gfn_to_memslot_unaliased(kvm, gfn));
        *write_count += 1;
 }
 
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
        int *write_count;
 
-       write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+       gfn = unalias_gfn(kvm, gfn);
+       write_count = slot_largepage_idx(gfn,
+                                        gfn_to_memslot_unaliased(kvm, gfn));
        *write_count -= 1;
        WARN_ON(*write_count < 0);
 }
 
 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 {
-       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+       struct kvm_memory_slot *slot;
        int *largepage_idx;
 
+       gfn = unalias_gfn(kvm, gfn);
+       slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (slot) {
                largepage_idx = slot_largepage_idx(gfn, slot);
                return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
        return NULL;
 }
 
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
        unsigned long *rmapp;
        u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
                spte = rmap_next(kvm, rmapp, spte);
        }
 
-       if (write_protected)
-               kvm_flush_remote_tlbs(kvm);
+       return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
        sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&sp->oos_link);
        ASSERT(is_empty_shadow_page(sp->spt));
-       sp->slot_bitmap = 0;
+       bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
        sp->multimapped = 0;
+       sp->global = 1;
        sp->parent_pte = parent_pte;
        --vcpu->kvm->arch.n_free_mmu_pages;
        return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
        struct kvm_mmu_page *sp = page_header(__pa(spte));
 
        index = spte - sp->spt;
-       __set_bit(index, sp->unsync_child_bitmap);
-       sp->unsync_children = 1;
+       if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+               sp->unsync_children++;
+       WARN_ON(!sp->unsync_children);
 }
 
 static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 
 static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-       sp->unsync_children = 1;
        kvm_mmu_update_parents_unsync(sp);
        return 1;
 }
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+       struct mmu_page_and_offset {
+               struct kvm_mmu_page *sp;
+               unsigned int idx;
+       } page[KVM_PAGE_ARRAY_NR];
+       unsigned int nr;
+};
+
 #define for_each_unsync_children(bitmap, idx)          \
        for (idx = find_first_bit(bitmap, 512);         \
             idx < 512;                                 \
             idx = find_next_bit(bitmap, 512, idx+1))
 
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
-                          struct kvm_unsync_walk *walker)
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+                  int idx)
 {
-       int i, ret;
+       int i;
 
-       if (!sp->unsync_children)
-               return 0;
+       if (sp->unsync)
+               for (i=0; i < pvec->nr; i++)
+                       if (pvec->page[i].sp == sp)
+                               return 0;
+
+       pvec->page[pvec->nr].sp = sp;
+       pvec->page[pvec->nr].idx = idx;
+       pvec->nr++;
+       return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+                          struct kvm_mmu_pages *pvec)
+{
+       int i, ret, nr_unsync_leaf = 0;
 
        for_each_unsync_children(sp->unsync_child_bitmap, i) {
                u64 ent = sp->spt[i];
 
-               if (is_shadow_present_pte(ent)) {
+               if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
                        struct kvm_mmu_page *child;
                        child = page_header(ent & PT64_BASE_ADDR_MASK);
 
                        if (child->unsync_children) {
-                               ret = mmu_unsync_walk(child, walker);
-                               if (ret)
+                               if (mmu_pages_add(pvec, child, i))
+                                       return -ENOSPC;
+
+                               ret = __mmu_unsync_walk(child, pvec);
+                               if (!ret)
+                                       __clear_bit(i, sp->unsync_child_bitmap);
+                               else if (ret > 0)
+                                       nr_unsync_leaf += ret;
+                               else
                                        return ret;
-                               __clear_bit(i, sp->unsync_child_bitmap);
                        }
 
                        if (child->unsync) {
-                               ret = walker->entry(child, walker);
-                               __clear_bit(i, sp->unsync_child_bitmap);
-                               if (ret)
-                                       return ret;
+                               nr_unsync_leaf++;
+                               if (mmu_pages_add(pvec, child, i))
+                                       return -ENOSPC;
                        }
                }
        }
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
        if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
                sp->unsync_children = 0;
 
-       return 0;
+       return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+                          struct kvm_mmu_pages *pvec)
+{
+       if (!sp->unsync_children)
+               return 0;
+
+       mmu_pages_add(pvec, sp, 0);
+       return __mmu_unsync_walk(sp, pvec);
 }
 
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
        return NULL;
 }
 
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+       list_del(&sp->oos_link);
+       --kvm->stat.mmu_unsync_global;
+}
+
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
        WARN_ON(!sp->unsync);
        sp->unsync = 0;
+       if (sp->global)
+               kvm_unlink_unsync_global(kvm, sp);
        --kvm->stat.mmu_unsync;
 }
 
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                return 1;
        }
 
-       rmap_write_protect(vcpu->kvm, sp->gfn);
+       if (rmap_write_protect(vcpu->kvm, sp->gfn))
+               kvm_flush_remote_tlbs(vcpu->kvm);
        kvm_unlink_unsync_page(vcpu->kvm, sp);
        if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
                kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
        return 0;
 }
 
-struct sync_walker {
-       struct kvm_vcpu *vcpu;
-       struct kvm_unsync_walk walker;
+struct mmu_page_path {
+       struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+       unsigned int idx[PT64_ROOT_LEVEL-1];
 };
 
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+#define for_each_sp(pvec, sp, parents, i)                      \
+               for (i = mmu_pages_next(&pvec, &parents, -1),   \
+                       sp = pvec.page[i].sp;                   \
+                       i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
+                       i = mmu_pages_next(&pvec, &parents, i))
+
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+                  int i)
 {
-       struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
-                                                    walker);
-       struct kvm_vcpu *vcpu = sync_walk->vcpu;
+       int n;
 
-       kvm_sync_page(vcpu, sp);
-       return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+       for (n = i+1; n < pvec->nr; n++) {
+               struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+               if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+                       parents->idx[0] = pvec->page[n].idx;
+                       return n;
+               }
+
+               parents->parent[sp->role.level-2] = sp;
+               parents->idx[sp->role.level-1] = pvec->page[n].idx;
+       }
+
+       return n;
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
-       struct sync_walker walker = {
-               .walker = { .entry = mmu_sync_fn, },
-               .vcpu = vcpu,
-       };
+       struct kvm_mmu_page *sp;
+       unsigned int level = 0;
+
+       do {
+               unsigned int idx = parents->idx[level];
+
+               sp = parents->parent[level];
+               if (!sp)
+                       return;
+
+               --sp->unsync_children;
+               WARN_ON((int)sp->unsync_children < 0);
+               __clear_bit(idx, sp->unsync_child_bitmap);
+               level++;
+       } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+                              struct mmu_page_path *parents,
+                              struct kvm_mmu_pages *pvec)
+{
+       parents->parent[parent->role.level-1] = NULL;
+       pvec->nr = 0;
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+                             struct kvm_mmu_page *parent)
+{
+       int i;
+       struct kvm_mmu_page *sp;
+       struct mmu_page_path parents;
+       struct kvm_mmu_pages pages;
+
+       kvm_mmu_pages_init(parent, &parents, &pages);
+       while (mmu_unsync_walk(parent, &pages)) {
+               int protected = 0;
 
-       while (mmu_unsync_walk(sp, &walker.walker))
+               for_each_sp(pages, sp, parents, i)
+                       protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+               if (protected)
+                       kvm_flush_remote_tlbs(vcpu->kvm);
+
+               for_each_sp(pages, sp, parents, i) {
+                       kvm_sync_page(vcpu, sp);
+                       mmu_pages_clear_parents(&parents);
+               }
                cond_resched_lock(&vcpu->kvm->mmu_lock);
+               kvm_mmu_pages_init(parent, &parents, &pages);
+       }
 }
 
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        sp->role = role;
        hlist_add_head(&sp->hash_link, bucket);
        if (!metaphysical) {
-               rmap_write_protect(vcpu->kvm, gfn);
+               if (rmap_write_protect(vcpu->kvm, gfn))
+                       kvm_flush_remote_tlbs(vcpu->kvm);
                account_shadowed(vcpu->kvm, gfn);
        }
        if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
        if (level == PT32E_ROOT_LEVEL) {
                shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
                shadow_addr &= PT64_BASE_ADDR_MASK;
+               if (!shadow_addr)
+                       return 1;
                --level;
        }
 
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
        }
 }
 
-struct zap_walker {
-       struct kvm_unsync_walk walker;
-       struct kvm *kvm;
-       int zapped;
-};
-
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+static int mmu_zap_unsync_children(struct kvm *kvm,
+                                  struct kvm_mmu_page *parent)
 {
-       struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
-                                                    walker);
-       kvm_mmu_zap_page(zap_walk->kvm, sp);
-       zap_walk->zapped = 1;
-       return 0;
-}
+       int i, zapped = 0;
+       struct mmu_page_path parents;
+       struct kvm_mmu_pages pages;
 
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-       struct zap_walker walker = {
-               .walker = { .entry = mmu_zap_fn, },
-               .kvm = kvm,
-               .zapped = 0,
-       };
-
-       if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+       if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                return 0;
-       mmu_unsync_walk(sp, &walker.walker);
-       return walker.zapped;
+
+       kvm_mmu_pages_init(parent, &parents, &pages);
+       while (mmu_unsync_walk(parent, &pages)) {
+               struct kvm_mmu_page *sp;
+
+               for_each_sp(pages, sp, parents, i) {
+                       kvm_mmu_zap_page(kvm, sp);
+                       mmu_pages_clear_parents(&parents);
+               }
+               zapped += pages.nr;
+               kvm_mmu_pages_init(parent, &parents, &pages);
+       }
+
+       return zapped;
 }
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
        struct kvm_mmu_page *sp = page_header(__pa(pte));
 
-       __set_bit(slot, &sp->slot_bitmap);
+       __set_bit(slot, sp->slot_bitmap);
 }
 
 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
        return page;
 }
 
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+                        u64 start, u64 end)
+{
+       int i;
+       u64 base, mask;
+       u8 prev_match, curr_match;
+       int num_var_ranges = KVM_NR_VAR_MTRR;
+
+       if (!mtrr_state->enabled)
+               return 0xFF;
+
+       /* Make end inclusive end, instead of exclusive */
+       end--;
+
+       /* Look in fixed ranges. Just return the type as per start */
+       if (mtrr_state->have_fixed && (start < 0x100000)) {
+               int idx;
+
+               if (start < 0x80000) {
+                       idx = 0;
+                       idx += (start >> 16);
+                       return mtrr_state->fixed_ranges[idx];
+               } else if (start < 0xC0000) {
+                       idx = 1 * 8;
+                       idx += ((start - 0x80000) >> 14);
+                       return mtrr_state->fixed_ranges[idx];
+               } else if (start < 0x1000000) {
+                       idx = 3 * 8;
+                       idx += ((start - 0xC0000) >> 12);
+                       return mtrr_state->fixed_ranges[idx];
+               }
+       }
+
+       /*
+        * Look in variable ranges
+        * Look of multiple ranges matching this address and pick type
+        * as per MTRR precedence
+        */
+       if (!(mtrr_state->enabled & 2))
+               return mtrr_state->def_type;
+
+       prev_match = 0xFF;
+       for (i = 0; i < num_var_ranges; ++i) {
+               unsigned short start_state, end_state;
+
+               if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+                       continue;
+
+               base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+                      (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+               mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+                      (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+               start_state = ((start & mask) == (base & mask));
+               end_state = ((end & mask) == (base & mask));
+               if (start_state != end_state)
+                       return 0xFE;
+
+               if ((start & mask) != (base & mask))
+                       continue;
+
+               curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+               if (prev_match == 0xFF) {
+                       prev_match = curr_match;
+                       continue;
+               }
+
+               if (prev_match == MTRR_TYPE_UNCACHABLE ||
+                   curr_match == MTRR_TYPE_UNCACHABLE)
+                       return MTRR_TYPE_UNCACHABLE;
+
+               if ((prev_match == MTRR_TYPE_WRBACK &&
+                    curr_match == MTRR_TYPE_WRTHROUGH) ||
+                   (prev_match == MTRR_TYPE_WRTHROUGH &&
+                    curr_match == MTRR_TYPE_WRBACK)) {
+                       prev_match = MTRR_TYPE_WRTHROUGH;
+                       curr_match = MTRR_TYPE_WRTHROUGH;
+               }
+
+               if (prev_match != curr_match)
+                       return MTRR_TYPE_UNCACHABLE;
+       }
+
+       if (prev_match != 0xFF)
+               return prev_match;
+
+       return mtrr_state->def_type;
+}
+
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       u8 mtrr;
+
+       mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+                            (gfn << PAGE_SHIFT) + PAGE_SIZE);
+       if (mtrr == 0xfe || mtrr == 0xff)
+               mtrr = MTRR_TYPE_WRBACK;
+       return mtrr;
+}
+
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
        unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                if (s->role.word != sp->role.word)
                        return 1;
        }
-       kvm_mmu_mark_parents_unsync(vcpu, sp);
        ++vcpu->kvm->stat.mmu_unsync;
        sp->unsync = 1;
+
+       if (sp->global) {
+               list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+               ++vcpu->kvm->stat.mmu_unsync_global;
+       } else
+               kvm_mmu_mark_parents_unsync(vcpu, sp);
+
        mmu_convert_notrap(sp);
        return 0;
 }
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                    unsigned pte_access, int user_fault,
                    int write_fault, int dirty, int largepage,
-                   gfn_t gfn, pfn_t pfn, bool speculative,
+                   int global, gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync)
 {
        u64 spte;
        int ret = 0;
+       u64 mt_mask = shadow_mt_mask;
+       struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+       if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+               global = 0;
+       if (!global && sp->global) {
+               sp->global = 0;
+               if (sp->unsync) {
+                       kvm_unlink_unsync_global(vcpu->kvm, sp);
+                       kvm_mmu_mark_parents_unsync(vcpu, sp);
+               }
+       }
+
        /*
         * We don't set the accessed bit, since we sometimes want to see
         * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                spte |= shadow_user_mask;
        if (largepage)
                spte |= PT_PAGE_SIZE_MASK;
+       if (mt_mask) {
+               mt_mask = get_memory_type(vcpu, gfn) <<
+                         kvm_x86_ops->get_mt_mask_shift();
+               spte |= mt_mask;
+       }
 
        spte |= (u64)pfn << PAGE_SHIFT;
 
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
                spte |= PT_WRITABLE_MASK;
 
+               /*
+                * Optimization: for pte sync, if spte was writable the hash
+                * lookup is unnecessary (and expensive). Write protection
+                * is responsibility of mmu_get_page / kvm_sync_page.
+                * Same reasoning can be applied to dirty page accounting.
+                */
+               if (!can_unsync && is_writeble_pte(*shadow_pte))
+                       goto set_pte;
+
                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                        pgprintk("%s: found shadow page for %lx, marking ro\n",
                                 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
-                        int *ptwrite, int largepage, gfn_t gfn,
-                        pfn_t pfn, bool speculative)
+                        int *ptwrite, int largepage, int global,
+                        gfn_t gfn, pfn_t pfn, bool speculative)
 {
        int was_rmapped = 0;
        int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
                }
        }
        if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-                     dirty, largepage, gfn, pfn, speculative, true)) {
+                     dirty, largepage, global, gfn, pfn, speculative, true)) {
                if (write_fault)
                        *ptwrite = 1;
                kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
            || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
                mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
                             0, walk->write, 1, &walk->pt_write,
-                            walk->largepage, gfn, walk->pfn, false);
+                            walk->largepage, 0, gfn, walk->pfn, false);
                ++vcpu->stat.pf_fixed;
                return 1;
        }
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
        }
 }
 
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_mmu_page *sp, *n;
+
+       list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+               kvm_sync_page(vcpu, sp);
+}
+
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
        spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
        spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+       spin_lock(&vcpu->kvm->mmu_lock);
+       mmu_sync_global(vcpu);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
        return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-                      const u8 *new, int bytes)
+                      const u8 *new, int bytes,
+                      bool guest_initiated)
 {
        gfn_t gfn = gpa >> PAGE_SHIFT;
        struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        kvm_mmu_free_some_pages(vcpu);
        ++vcpu->kvm->stat.mmu_pte_write;
        kvm_mmu_audit(vcpu, "pre pte write");
-       if (gfn == vcpu->arch.last_pt_write_gfn
-           && !last_updated_pte_accessed(vcpu)) {
-               ++vcpu->arch.last_pt_write_count;
-               if (vcpu->arch.last_pt_write_count >= 3)
-                       flooded = 1;
-       } else {
-               vcpu->arch.last_pt_write_gfn = gfn;
-               vcpu->arch.last_pt_write_count = 1;
-               vcpu->arch.last_pte_updated = NULL;
+       if (guest_initiated) {
+               if (gfn == vcpu->arch.last_pt_write_gfn
+                   && !last_updated_pte_accessed(vcpu)) {
+                       ++vcpu->arch.last_pt_write_count;
+                       if (vcpu->arch.last_pt_write_count >= 3)
+                               flooded = 1;
+               } else {
+                       vcpu->arch.last_pt_write_gfn = gfn;
+                       vcpu->arch.last_pt_write_count = 1;
+                       vcpu->arch.last_pte_updated = NULL;
+               }
        }
        index = kvm_page_table_hashfn(gfn);
        bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-       spin_lock(&vcpu->kvm->mmu_lock);
        vcpu->arch.mmu.invlpg(vcpu, gva);
-       spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_mmu_flush_tlb(vcpu);
        ++vcpu->stat.invlpg;
 }
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
                int i;
                u64 *pt;
 
-               if (!test_bit(slot, &sp->slot_bitmap))
+               if (!test_bit(slot, sp->slot_bitmap))
                        continue;
 
                pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
                if (sp->role.metaphysical)
                        continue;
 
-               slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
                gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+               slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
                rmapp = &slot->rmap[gfn - slot->base_gfn];
                if (*rmapp)
                        printk(KERN_ERR "%s: (%s) shadow page has writable"
index 84eee43bbe742b005bd1825b0cdac24ba5efffbb..9fd78b6e17ad20fab48e6901c131de0658b6df5d 100644 (file)
@@ -82,6 +82,7 @@ struct shadow_walker {
        int *ptwrite;
        pfn_t pfn;
        u64 *sptep;
+       gpa_t pte_gpa;
 };
 
 static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
                if (ret)
                        goto walk;
                pte |= PT_DIRTY_MASK;
-               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+               kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
                walker->ptes[walker->level - 1] = pte;
        }
 
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
                return;
        kvm_get_pfn(pfn);
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-                    gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+                    gpte & PT_DIRTY_MASK, NULL, largepage,
+                    gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
                     pfn, true);
 }
 
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
                mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
                             sw->user_fault, sw->write_fault,
                             gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-                            sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
-                            false);
+                            sw->ptwrite, sw->largepage,
+                            gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+                            gw->gfn, sw->pfn, false);
                sw->sptep = sptep;
                return 1;
        }
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
                                      struct kvm_vcpu *vcpu, u64 addr,
                                      u64 *sptep, int level)
 {
+       struct shadow_walker *sw =
+               container_of(_sw, struct shadow_walker, walker);
 
-       if (level == PT_PAGE_TABLE_LEVEL) {
-               if (is_shadow_present_pte(*sptep))
+       /* FIXME: properly handle invlpg on large guest pages */
+       if (level == PT_PAGE_TABLE_LEVEL ||
+           ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+               struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+               sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+               sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+               if (is_shadow_present_pte(*sptep)) {
                        rmap_remove(vcpu->kvm, sptep);
+                       if (is_large_pte(*sptep))
+                               --vcpu->kvm->stat.lpages;
+               }
                set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
                return 1;
        }
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
+       pt_element_t gpte;
        struct shadow_walker walker = {
                .walker = { .entry = FNAME(shadow_invlpg_entry), },
+               .pte_gpa = -1,
        };
 
+       spin_lock(&vcpu->kvm->mmu_lock);
        walk_shadow(&walker.walker, vcpu, gva);
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       if (walker.pte_gpa == -1)
+               return;
+       if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+                                 sizeof(pt_element_t)))
+               return;
+       if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+               if (mmu_topup_memory_caches(vcpu))
+                       return;
+               kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+                                 sizeof(pt_element_t), 0);
+       }
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-                        is_dirty_pte(gpte), 0, gfn,
+                        is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
                         spte_to_pfn(sp->spt[i]), true, false);
        }
 
index 9c4ce657d96389753ff9650bb06f12bf76e003c8..1452851ae2583b689f250a403ce6ee08075144c8 100644 (file)
@@ -28,6 +28,8 @@
 
 #include <asm/desc.h>
 
+#include <asm/virtext.h>
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
 static int has_svm(void)
 {
-       uint32_t eax, ebx, ecx, edx;
-
-       if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-               printk(KERN_INFO "has_svm: not amd\n");
-               return 0;
-       }
+       const char *msg;
 
-       cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-       if (eax < SVM_CPUID_FUNC) {
-               printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+       if (!cpu_has_svm(&msg)) {
+               printk(KERN_INFO "has_svn: %s\n", msg);
                return 0;
        }
 
-       cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-       if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-               printk(KERN_DEBUG "has_svm: svm not available\n");
-               return 0;
-       }
        return 1;
 }
 
 static void svm_hardware_disable(void *garbage)
 {
-       uint64_t efer;
-
-       wrmsrl(MSR_VM_HSAVE_PA, 0);
-       rdmsrl(MSR_EFER, efer);
-       wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+       cpu_svm_disable();
 }
 
 static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
        var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
        var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
        var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+       /*
+        * SVM always stores 0 for the 'G' bit in the CS selector in
+        * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+        * Intel's VMENTRY has a check on the 'G' bit.
+        */
+       if (seg == VCPU_SREG_CS)
+               var->g = s->limit > 0xfffff;
+
+       /*
+        * Work around a bug where the busy flag in the tr selector
+        * isn't exposed
+        */
+       if (seg == VCPU_SREG_TR)
+               var->type |= 0x2;
+
        var->unusable = !var->present;
 }
 
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        rep = (io_info & SVM_IOIO_REP_MASK) != 0;
        down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
 
+       skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
 }
 
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
 #endif
 }
 
+static int svm_get_mt_mask_shift(void)
+{
+       return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
        .set_tss_addr = svm_set_tss_addr,
        .get_tdp_level = get_npt_level,
+       .get_mt_mask_shift = svm_get_mt_mask_shift,
 };
 
 static int __init svm_init(void)
index a4018b01e1f973bad1619d6ca16e0e67e29aceab..6259d7467648e1209456abff3666ad9a4e4a3cb4 100644 (file)
@@ -16,7 +16,6 @@
  */
 
 #include "irq.h"
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
 
 #include <asm/io.h>
 #include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -90,6 +91,11 @@ struct vcpu_vmx {
        } rmode;
        int vpid;
        bool emulation_required;
+
+       /* Support for vnmi-less CPUs */
+       int soft_vnmi_blocked;
+       ktime_t entry_time;
+       s64 vnmi_blocked_time;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
        u32 vmentry_ctrl;
 } vmcs_config;
 
-struct vmx_capability {
+static struct vmx_capability {
        u32 ept;
        u32 vpid;
 } vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
 
                break;
+       case MSR_IA32_CR_PAT:
+               if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+                       vmcs_write64(GUEST_IA32_PAT, data);
+                       vcpu->arch.pat = data;
+                       break;
+               }
+               /* Otherwise falls through to kvm_set_msr_common */
        default:
                vmx_load_host_state(vmx);
                msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
 
 static __init int cpu_has_kvm_support(void)
 {
-       unsigned long ecx = cpuid_ecx(1);
-       return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+       return cpu_has_vmx();
 }
 
 static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
                __vcpu_clear(vmx);
 }
 
-static void hardware_disable(void *garbage)
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
 {
-       vmclear_local_vcpus();
        asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
+static void hardware_disable(void *garbage)
+{
+       vmclear_local_vcpus();
+       kvm_cpu_vmxoff();
+}
+
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
                                      u32 msr, u32 *result)
 {
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-       opt = 0;
+       opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
 
-       min = opt = 0;
+       min = 0;
+       opt = VM_ENTRY_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
                                &_vmentry_control) < 0)
                return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-       u32 host_sysenter_cs;
+       u32 host_sysenter_cs, msr_low, msr_high;
        u32 junk;
+       u64 host_pat;
        unsigned long a;
        struct descriptor_table dt;
        int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        rdmsrl(MSR_IA32_SYSENTER_EIP, a);
        vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
 
+       if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+               host_pat = msr_low | ((u64) msr_high << 32);
+               vmcs_write64(HOST_IA32_PAT, host_pat);
+       }
+       if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+               rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+               host_pat = msr_low | ((u64) msr_high << 32);
+               /* Write the default value follow host pat */
+               vmcs_write64(GUEST_IA32_PAT, host_pat);
+               /* Keep arch.pat sync with GUEST_IA32_PAT */
+               vmx->vcpu.arch.pat = host_pat;
+       }
+
        for (i = 0; i < NR_VMX_MSR; ++i) {
                u32 index = vmx_msr_index[i];
                u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
        vmx->vcpu.arch.rmode.active = 0;
 
+       vmx->soft_vnmi_blocked = 0;
+
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(&vmx->vcpu, 0);
        msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
        return ret;
 }
 
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+       u32 cpu_based_vm_exec_control;
+
+       if (!cpu_has_virtual_nmis()) {
+               enable_irq_window(vcpu);
+               return;
+       }
+
+       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!cpu_has_virtual_nmis()) {
+               /*
+                * Tracking the NMI-blocked state in software is built upon
+                * finding the next open IRQ window. This, in turn, depends on
+                * well-behaving guests: They have to keep IRQs disabled at
+                * least as long as the NMI handler runs. Otherwise we may
+                * cause NMI nesting, maybe breaking the guest. But as this is
+                * highly unlikely, we can live with the residual risk.
+                */
+               vmx->soft_vnmi_blocked = 1;
+               vmx->vnmi_blocked_time = 0;
+       }
+
+       ++vcpu->stat.nmi_injections;
+       if (vcpu->arch.rmode.active) {
+               vmx->rmode.irq.pending = true;
+               vmx->rmode.irq.vector = NMI_VECTOR;
+               vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+               vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+                            NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+                            INTR_INFO_VALID_MASK);
+               vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+               kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+               return;
+       }
        vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+       u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+       vcpu->arch.nmi_window_open =
+               !(guest_intr & (GUEST_INTR_STATE_STI |
+                               GUEST_INTR_STATE_MOV_SS |
+                               GUEST_INTR_STATE_NMI));
+       if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+               vcpu->arch.nmi_window_open = 0;
+
+       vcpu->arch.interrupt_window_open =
+               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+                !(guest_intr & (GUEST_INTR_STATE_STI |
+                                GUEST_INTR_STATE_MOV_SS)));
+}
+
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
        int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
        kvm_queue_interrupt(vcpu, irq);
 }
 
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
                                       struct kvm_run *kvm_run)
 {
-       u32 cpu_based_vm_exec_control;
-
-       vcpu->arch.interrupt_window_open =
-               ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-                (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+       vmx_update_window_states(vcpu);
 
-       if (vcpu->arch.interrupt_window_open &&
-           vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-               kvm_do_inject_irq(vcpu);
+       if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+               if (vcpu->arch.interrupt.pending) {
+                       enable_nmi_window(vcpu);
+               } else if (vcpu->arch.nmi_window_open) {
+                       vcpu->arch.nmi_pending = false;
+                       vcpu->arch.nmi_injected = true;
+               } else {
+                       enable_nmi_window(vcpu);
+                       return;
+               }
+       }
+       if (vcpu->arch.nmi_injected) {
+               vmx_inject_nmi(vcpu);
+               if (vcpu->arch.nmi_pending)
+                       enable_nmi_window(vcpu);
+               else if (vcpu->arch.irq_summary
+                        || kvm_run->request_interrupt_window)
+                       enable_irq_window(vcpu);
+               return;
+       }
 
-       if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
-               vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+       if (vcpu->arch.interrupt_window_open) {
+               if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+                       kvm_do_inject_irq(vcpu);
 
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+               if (vcpu->arch.interrupt.pending)
+                       vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+       }
        if (!vcpu->arch.interrupt_window_open &&
            (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-               /*
-                * Interrupts blocked.  Wait for unblock.
-                */
-               cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       else
-               cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+               enable_irq_window(vcpu);
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
        int ret;
        struct kvm_userspace_memory_region tss_mem = {
-               .slot = 8,
+               .slot = TSS_PRIVATE_MEMSLOT,
                .guest_phys_addr = addr,
                .memory_size = PAGE_SIZE * 3,
                .flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
        }
 
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
                return 1;  /* already handled by vmx_vcpu_run() */
 
        if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        rep = (exit_qualification & 32) != 0;
        port = exit_qualification >> 16;
 
+       skip_emulated_instruction(vcpu);
        return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
 
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 
        KVMTRACE_0D(PEND_INTR, vcpu, handler);
+       ++vcpu->stat.irq_window_exits;
 
        /*
         * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
        if (kvm_run->request_interrupt_window &&
            !vcpu->arch.irq_summary) {
                kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-               ++vcpu->stat.irq_window_exits;
                return 0;
        }
        return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long exit_qualification;
        u16 tss_selector;
        int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        reason = (u32)exit_qualification >> 30;
+       if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+           (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+           (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+           == INTR_TYPE_NMI_INTR) {
+               vcpu->arch.nmi_injected = false;
+               if (cpu_has_virtual_nmis())
+                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                                     GUEST_INTR_STATE_NMI);
+       }
        tss_selector = exit_qualification;
 
        return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        while (!guest_state_valid(vcpu)) {
                err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
-               switch (err) {
-                       case EMULATE_DONE:
-                               break;
-                       case EMULATE_DO_MMIO:
-                               kvm_report_emulation_failure(vcpu, "mmio");
-                               /* TODO: Handle MMIO */
-                               return;
-                       default:
-                               kvm_report_emulation_failure(vcpu, "emulation failure");
-                               return;
+               if (err == EMULATE_DO_MMIO)
+                       break;
+
+               if (err != EMULATE_DONE) {
+                       kvm_report_emulation_failure(vcpu, "emulation failure");
+                       return;
                }
 
                if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
        local_irq_disable();
        preempt_disable();
 
-       /* Guest state should be valid now, no more emulation should be needed */
-       vmx->emulation_required = 0;
+       /* Guest state should be valid now except if we need to
+        * emulate an MMIO */
+       if (guest_state_valid(vcpu))
+               vmx->emulation_required = 0;
 }
 
 /*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
        KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
                    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
+       /* If we need to emulate an MMIO from handle_invalid_guest_state
+        * we just return 0 */
+       if (vmx->emulation_required && emulate_invalid_guest_state)
+               return 0;
+
        /* Access CR3 don't cause VMExit in paging mode, so we need
         * to sync with guest real CR3. */
        if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-                       exit_reason != EXIT_REASON_EPT_VIOLATION))
-               printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-                      "exit reason is 0x%x\n", __func__, exit_reason);
+                       exit_reason != EXIT_REASON_EPT_VIOLATION &&
+                       exit_reason != EXIT_REASON_TASK_SWITCH))
+               printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+                      "(0x%x) and exit reason is 0x%x\n",
+                      __func__, vectoring_info, exit_reason);
+
+       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+               if (vcpu->arch.interrupt_window_open) {
+                       vmx->soft_vnmi_blocked = 0;
+                       vcpu->arch.nmi_window_open = 1;
+               } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+                          vcpu->arch.nmi_pending) {
+                       /*
+                        * This CPU don't support us in finding the end of an
+                        * NMI-blocked window if the guest runs with IRQs
+                        * disabled. So we pull the trigger after 1 s of
+                        * futile waiting, but inform the user about this.
+                        */
+                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+                              "state on VCPU %d after 1 s timeout\n",
+                              __func__, vcpu->vcpu_id);
+                       vmx->soft_vnmi_blocked = 0;
+                       vmx->vcpu.arch.nmi_window_open = 1;
+               }
+       }
+
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
        vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-       u32 cpu_based_vm_exec_control;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-       u32 cpu_based_vm_exec_control;
-
-       if (!cpu_has_virtual_nmis())
-               return;
-
-       cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-       cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-       vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
-       u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-       return !(guest_intr & (GUEST_INTR_STATE_NMI |
-                              GUEST_INTR_STATE_MOV_SS |
-                              GUEST_INTR_STATE_STI));
-}
-
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
-       u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-       return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
-                              GUEST_INTR_STATE_STI)) &&
-               (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
-       if (vcpu->arch.nmi_pending)
-               enable_nmi_window(vcpu);
-       else if (kvm_cpu_has_interrupt(vcpu))
-               enable_irq_window(vcpu);
-}
-
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
                if (unblock_nmi && vector != DF_VECTOR)
                        vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                      GUEST_INTR_STATE_NMI);
-       }
+       } else if (unlikely(vmx->soft_vnmi_blocked))
+               vmx->vnmi_blocked_time +=
+                       ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
        idt_vectoring_info = vmx->idt_vectoring_info;
        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
        update_tpr_threshold(vcpu);
 
-       if (cpu_has_virtual_nmis()) {
-               if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-                       if (vcpu->arch.interrupt.pending) {
-                               enable_nmi_window(vcpu);
-                       } else if (vmx_nmi_enabled(vcpu)) {
-                               vcpu->arch.nmi_pending = false;
-                               vcpu->arch.nmi_injected = true;
-                       } else {
-                               enable_intr_window(vcpu);
-                               return;
-                       }
-               }
-               if (vcpu->arch.nmi_injected) {
-                       vmx_inject_nmi(vcpu);
-                       enable_intr_window(vcpu);
+       vmx_update_window_states(vcpu);
+
+       if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+               if (vcpu->arch.interrupt.pending) {
+                       enable_nmi_window(vcpu);
+               } else if (vcpu->arch.nmi_window_open) {
+                       vcpu->arch.nmi_pending = false;
+                       vcpu->arch.nmi_injected = true;
+               } else {
+                       enable_nmi_window(vcpu);
                        return;
                }
        }
+       if (vcpu->arch.nmi_injected) {
+               vmx_inject_nmi(vcpu);
+               if (vcpu->arch.nmi_pending)
+                       enable_nmi_window(vcpu);
+               else if (kvm_cpu_has_interrupt(vcpu))
+                       enable_irq_window(vcpu);
+               return;
+       }
        if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-               if (vmx_irq_enabled(vcpu))
+               if (vcpu->arch.interrupt_window_open)
                        kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
                else
                        enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
        if (vcpu->arch.interrupt.pending) {
                vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
                kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+               if (kvm_cpu_has_interrupt(vcpu))
+                       enable_irq_window(vcpu);
        }
 }
 
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 intr_info;
 
+       /* Record the guest's net vcpu time for enforced NMI injections. */
+       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+               vmx->entry_time = ktime_get();
+
        /* Handle invalid guest state instead of entering VMX */
        if (vmx->emulation_required && emulate_invalid_guest_state) {
                handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        if (vmx->rmode.irq.pending)
                fixup_rmode_irq(vmx);
 
-       vcpu->arch.interrupt_window_open =
-               (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
+       vmx_update_window_states(vcpu);
 
        asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
        vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
        /* We need to handle NMIs before interrupts are enabled */
-       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+       if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
            (intr_info & INTR_INFO_VALID_MASK)) {
                KVMTRACE_0D(NMI, vcpu, handler);
                asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
        return VMX_EPT_DEFAULT_GAW + 1;
 }
 
+static int vmx_get_mt_mask_shift(void)
+{
+       return VMX_EPT_MT_EPTE_SHIFT;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .set_tss_addr = vmx_set_tss_addr,
        .get_tdp_level = get_ept_level,
+       .get_mt_mask_shift = vmx_get_mt_mask_shift,
 };
 
 static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
                bypass_guest_pf = 0;
                kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
                        VMX_EPT_WRITABLE_MASK |
-                       VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
                        VMX_EPT_IGMT_BIT);
                kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-                               VMX_EPT_EXECUTABLE_MASK);
+                               VMX_EPT_EXECUTABLE_MASK,
+                               VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
                kvm_enable_tdp();
        } else
                kvm_disable_tdp();
index f1f8ff2f1fa2d4650266e1ed82ed86a1d4fe029c..0e6aa8141dcd936e860b4576c9e0af28d380d49c 100644 (file)
@@ -39,6 +39,7 @@
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS                                              \
@@ -86,6 +87,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "halt_wakeup", VCPU_STAT(halt_wakeup) },
        { "hypercalls", VCPU_STAT(hypercalls) },
        { "request_irq", VCPU_STAT(request_irq_exits) },
+       { "request_nmi", VCPU_STAT(request_nmi_exits) },
        { "irq_exits", VCPU_STAT(irq_exits) },
        { "host_state_reload", VCPU_STAT(host_state_reload) },
        { "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +95,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "insn_emulation", VCPU_STAT(insn_emulation) },
        { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
        { "irq_injections", VCPU_STAT(irq_injections) },
+       { "nmi_injections", VCPU_STAT(nmi_injections) },
        { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
        { "mmu_pte_write", VM_STAT(mmu_pte_write) },
        { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "mmu_recycled", VM_STAT(mmu_recycled) },
        { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
        { "mmu_unsync", VM_STAT(mmu_unsync) },
+       { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
        { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { "largepages", VM_STAT(lpages) },
        { NULL }
@@ -312,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
        kvm_x86_ops->set_cr0(vcpu, cr0);
        vcpu->arch.cr0 = cr0;
 
+       kvm_mmu_sync_global(vcpu);
        kvm_mmu_reset_context(vcpu);
        return;
 }
@@ -355,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
        }
        kvm_x86_ops->set_cr4(vcpu, cr4);
        vcpu->arch.cr4 = cr4;
+       kvm_mmu_sync_global(vcpu);
        kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +455,7 @@ static u32 msrs_to_save[] = {
        MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
        MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-       MSR_IA32_PERF_STATUS,
+       MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
 };
 
 static unsigned num_msrs_to_save;
@@ -648,10 +654,38 @@ static bool msr_mtrr_valid(unsigned msr)
 
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+       u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
        if (!msr_mtrr_valid(msr))
                return 1;
 
-       vcpu->arch.mtrr[msr - 0x200] = data;
+       if (msr == MSR_MTRRdefType) {
+               vcpu->arch.mtrr_state.def_type = data;
+               vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+       } else if (msr == MSR_MTRRfix64K_00000)
+               p[0] = data;
+       else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+               p[1 + msr - MSR_MTRRfix16K_80000] = data;
+       else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+               p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+       else if (msr == MSR_IA32_CR_PAT)
+               vcpu->arch.pat = data;
+       else {  /* Variable MTRRs */
+               int idx, is_mtrr_mask;
+               u64 *pt;
+
+               idx = (msr - 0x200) / 2;
+               is_mtrr_mask = msr - 0x200 - 2 * idx;
+               if (!is_mtrr_mask)
+                       pt =
+                         (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+               else
+                       pt =
+                         (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+               *pt = data;
+       }
+
+       kvm_mmu_reset_context(vcpu);
        return 0;
 }
 
@@ -747,10 +781,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
+       u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
        if (!msr_mtrr_valid(msr))
                return 1;
 
-       *pdata = vcpu->arch.mtrr[msr - 0x200];
+       if (msr == MSR_MTRRdefType)
+               *pdata = vcpu->arch.mtrr_state.def_type +
+                        (vcpu->arch.mtrr_state.enabled << 10);
+       else if (msr == MSR_MTRRfix64K_00000)
+               *pdata = p[0];
+       else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+               *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+       else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+               *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+       else if (msr == MSR_IA32_CR_PAT)
+               *pdata = vcpu->arch.pat;
+       else {  /* Variable MTRRs */
+               int idx, is_mtrr_mask;
+               u64 *pt;
+
+               idx = (msr - 0x200) / 2;
+               is_mtrr_mask = msr - 0x200 - 2 * idx;
+               if (!is_mtrr_mask)
+                       pt =
+                         (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+               else
+                       pt =
+                         (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+               *pdata = *pt;
+       }
+
        return 0;
 }
 
@@ -903,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_HLT:
        case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
-       case KVM_CAP_USER_MEMORY:
        case KVM_CAP_SET_TSS_ADDR:
        case KVM_CAP_EXT_CPUID:
        case KVM_CAP_CLOCKSOURCE:
@@ -1188,6 +1248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                int t, times = entry->eax & 0xff;
 
                entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+               entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
                for (t = 1; t < times && *nent < maxnent; ++t) {
                        do_cpuid_1_ent(&entry[t], function, 0);
                        entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1279,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
                entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                /* read more entries until level_type is zero */
                for (i = 1; *nent < maxnent; ++i) {
-                       level_type = entry[i - 1].ecx & 0xff;
+                       level_type = entry[i - 1].ecx & 0xff00;
                        if (!level_type)
                                break;
                        do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1379,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+       vcpu_load(vcpu);
+       kvm_inject_nmi(vcpu);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
                                           struct kvm_tpr_access_ctl *tac)
 {
@@ -1377,6 +1447,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_NMI: {
+               r = kvm_vcpu_ioctl_nmi(vcpu);
+               if (r)
+                       goto out;
+               r = 0;
+               break;
+       }
        case KVM_SET_CPUID: {
                struct kvm_cpuid __user *cpuid_arg = argp;
                struct kvm_cpuid cpuid;
@@ -1968,7 +2045,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
        ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
        if (ret < 0)
                return 0;
-       kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+       kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
        return 1;
 }
 
@@ -2404,8 +2481,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
        val = kvm_register_read(vcpu, VCPU_REGS_RAX);
        memcpy(vcpu->arch.pio_data, &val, 4);
 
-       kvm_x86_ops->skip_emulated_instruction(vcpu);
-
        pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
        if (pio_dev) {
                kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2616,7 @@ int kvm_arch_init(void *opaque)
        kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
        kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
        kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-                       PT_DIRTY_MASK, PT64_NX_MASK, 0);
+                       PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
        return 0;
 
 out:
@@ -2729,7 +2804,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 
        e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
        /* when no next entry is found, the current entry[i] is reselected */
-       for (j = i + 1; j == i; j = (j + 1) % nent) {
+       for (j = i + 1; ; j = (j + 1) % nent) {
                struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
                if (ej->function == e->function) {
                        ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3048,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                pr_debug("vcpu %d received sipi with vector # %x\n",
                         vcpu->vcpu_id, vcpu->arch.sipi_vector);
                kvm_lapic_reset(vcpu);
-               r = kvm_x86_ops->vcpu_reset(vcpu);
+               r = kvm_arch_vcpu_reset(vcpu);
                if (r)
                        return r;
                vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3350,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
        kvm_desct->padding = 0;
 }
 
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
-                                          u16 selector,
-                                          struct descriptor_table *dtable)
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+                                         u16 selector,
+                                         struct descriptor_table *dtable)
 {
        if (selector & 1 << 2) {
                struct kvm_segment kvm_seg;
@@ -3302,7 +3377,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
        struct descriptor_table dtable;
        u16 index = selector >> 3;
 
-       get_segment_descritptor_dtable(vcpu, selector, &dtable);
+       get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
        if (dtable.limit < index * 8 + 7) {
                kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3396,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
        struct descriptor_table dtable;
        u16 index = selector >> 3;
 
-       get_segment_descritptor_dtable(vcpu, selector, &dtable);
+       get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
        if (dtable.limit < index * 8 + 7)
                return 1;
@@ -3900,6 +3975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        /* We do fxsave: this must be aligned. */
        BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
 
+       vcpu->arch.mtrr_state.have_fixed = 1;
        vcpu_load(vcpu);
        r = kvm_arch_vcpu_reset(vcpu);
        if (r == 0)
@@ -3925,6 +4001,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
+       vcpu->arch.nmi_pending = false;
+       vcpu->arch.nmi_injected = false;
+
        return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -4012,6 +4091,7 @@ struct  kvm *kvm_arch_create_vm(void)
                return ERR_PTR(-ENOMEM);
 
        INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+       INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
        INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
        /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4128,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-       kvm_iommu_unmap_guest(kvm);
        kvm_free_all_assigned_devices(kvm);
+       kvm_iommu_unmap_guest(kvm);
        kvm_free_pit(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
@@ -4127,7 +4207,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
        return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-              || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+              || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+              || vcpu->arch.nmi_pending;
 }
 
 static void vcpu_kick_intr(void *info)
index ea051173b0da9e950ca92d6841a42c507e957444..d174db7a3370d9e80f6c1b93f2960a40b5b1cc43 100644 (file)
@@ -58,6 +58,7 @@
 #define SrcMem32    (4<<4)     /* Memory operand (32-bit). */
 #define SrcImm      (5<<4)     /* Immediate operand. */
 #define SrcImmByte  (6<<4)     /* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<4)     /* Implied '1' */
 #define SrcMask     (7<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<7)
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/* Source 2 operand type */
+#define Src2None    (0<<29)
+#define Src2CL      (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One     (3<<29)
+#define Src2Mask    (7<<29)
 
 enum {
        Group1_80, Group1_81, Group1_82, Group1_83,
        Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
 
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
        /* 0x00 - 0x07 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-       0, 0, 0, 0,
+       ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
        /* 0x08 - 0x0F */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
        ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
        ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
        /* 0x00 - 0x0F */
        0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
        ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
        /* 0x90 - 0x9F */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        /* 0xA0 - 0xA7 */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+       DstMem | SrcReg | Src2ImmByte | ModRM,
+       DstMem | SrcReg | Src2CL | ModRM, 0, 0,
        /* 0xA8 - 0xAF */
-       0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+       0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+       DstMem | SrcReg | Src2ImmByte | ModRM,
+       DstMem | SrcReg | Src2CL | ModRM,
+       ModRM, 0,
        /* 0xB0 - 0xB7 */
        ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
            DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-static u16 group_table[] = {
+static u32 group_table[] = {
        [Group1_80*8] =
        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
        ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
        SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
 };
 
-static u16 group2_table[] = {
+static u32 group2_table[] = {
        [Group7*8] =
-       SrcNone | ModRM, 0, 0, 0,
+       SrcNone | ModRM, 0, 0, SrcNone | ModRM,
        SrcNone | ModRM | DstMem | Mov, 0,
        SrcMem16 | ModRM | Mov, 0,
 };
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
        "andl %"_msk",%"_LO32 _tmp"; "          \
        "orl  %"_LO32 _tmp",%"_sav"; "
 
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)     \
+       do {                                                            \
+               __asm__ __volatile__ (                                  \
+                       _PRE_EFLAGS("0", "4", "2")                      \
+                       _op _suffix " %"_x"3,%1; "                      \
+                       _POST_EFLAGS("0", "4", "2")                     \
+                       : "=m" (_eflags), "=m" ((_dst).val),            \
+                         "=&r" (_tmp)                                  \
+                       : _y ((_src).val), "i" (EFLAGS_MASK));          \
+       } while (0)
+
+
 /* Raw emulation: instruction has two explicit operands. */
 #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-       do {                                                                \
-               unsigned long _tmp;                                         \
-                                                                           \
-               switch ((_dst).bytes) {                                     \
-               case 2:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0", "4", "2")                  \
-                               _op"w %"_wx"3,%1; "                         \
-                               _POST_EFLAGS("0", "4", "2")                 \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _wy ((_src).val), "i" (EFLAGS_MASK));     \
-                       break;                                              \
-               case 4:                                                     \
-                       __asm__ __volatile__ (                              \
-                               _PRE_EFLAGS("0", "4", "2")                  \
-                               _op"l %"_lx"3,%1; "                         \
-                               _POST_EFLAGS("0", "4", "2")                 \
-                               : "=m" (_eflags), "=m" ((_dst).val),        \
-                                 "=&r" (_tmp)                              \
-                               : _ly ((_src).val), "i" (EFLAGS_MASK));     \
-                       break;                                              \
-               case 8:                                                     \
-                       __emulate_2op_8byte(_op, _src, _dst,                \
-                                           _eflags, _qx, _qy);             \
-                       break;                                              \
-               }                                                           \
+       do {                                                            \
+               unsigned long _tmp;                                     \
+                                                                       \
+               switch ((_dst).bytes) {                                 \
+               case 2:                                                 \
+                       ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+                       break;                                          \
+               case 4:                                                 \
+                       ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+                       break;                                          \
+               case 8:                                                 \
+                       ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+                       break;                                          \
+               }                                                       \
        } while (0)
 
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
        do {                                                                 \
-               unsigned long __tmp;                                         \
+               unsigned long _tmp;                                          \
                switch ((_dst).bytes) {                                      \
                case 1:                                                      \
-                       __asm__ __volatile__ (                               \
-                               _PRE_EFLAGS("0", "4", "2")                   \
-                               _op"b %"_bx"3,%1; "                          \
-                               _POST_EFLAGS("0", "4", "2")                  \
-                               : "=m" (_eflags), "=m" ((_dst).val),         \
-                                 "=&r" (__tmp)                              \
-                               : _by ((_src).val), "i" (EFLAGS_MASK));      \
+                       ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
                        break;                                               \
                default:                                                     \
                        __emulate_2op_nobyte(_op, _src, _dst, _eflags,       \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
        __emulate_2op_nobyte(_op, _src, _dst, _eflags,                  \
                             "w", "r", _LO32, "r", "", "r")
 
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
-       do {                                                            \
-               unsigned long _tmp;                                     \
-                                                                       \
-               switch ((_dst).bytes) {                                 \
-               case 1:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"b %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 2:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"w %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 4:                                                 \
-                       __asm__ __volatile__ (                          \
-                               _PRE_EFLAGS("0", "3", "2")              \
-                               _op"l %1; "                             \
-                               _POST_EFLAGS("0", "3", "2")             \
-                               : "=m" (_eflags), "=m" ((_dst).val),    \
-                                 "=&r" (_tmp)                          \
-                               : "i" (EFLAGS_MASK));                   \
-                       break;                                          \
-               case 8:                                                 \
-                       __emulate_1op_8byte(_op, _dst, _eflags);        \
-                       break;                                          \
-               }                                                       \
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type)        \
+       do {                                                                    \
+               unsigned long _tmp;                                             \
+               _type _clv  = (_cl).val;                                        \
+               _type _srcv = (_src).val;                                       \
+               _type _dstv = (_dst).val;                                       \
+                                                                               \
+               __asm__ __volatile__ (                                          \
+                       _PRE_EFLAGS("0", "5", "2")                              \
+                       _op _suffix " %4,%1 \n"                                 \
+                       _POST_EFLAGS("0", "5", "2")                             \
+                       : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp)            \
+                       : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)           \
+                       );                                                      \
+                                                                               \
+               (_cl).val  = (unsigned long) _clv;                              \
+               (_src).val = (unsigned long) _srcv;                             \
+               (_dst).val = (unsigned long) _dstv;                             \
        } while (0)
 
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0", "4", "2")                        \
-                       _op"q %"_qx"3,%1; "                               \
-                       _POST_EFLAGS("0", "4", "2")                       \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : _qy ((_src).val), "i" (EFLAGS_MASK));         \
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)                          \
+       do {                                                                    \
+               switch ((_dst).bytes) {                                         \
+               case 2:                                                         \
+                       __emulate_2op_cl(_op, _cl, _src, _dst, _eflags,         \
+                                               "w", unsigned short);           \
+                       break;                                                  \
+               case 4:                                                         \
+                       __emulate_2op_cl(_op, _cl, _src, _dst, _eflags,         \
+                                               "l", unsigned int);             \
+                       break;                                                  \
+               case 8:                                                         \
+                       ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,    \
+                                               "q", unsigned long));           \
+                       break;                                                  \
+               }                                                               \
        } while (0)
 
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-       do {                                                              \
-               __asm__ __volatile__ (                                    \
-                       _PRE_EFLAGS("0", "3", "2")                        \
-                       _op"q %1; "                                       \
-                       _POST_EFLAGS("0", "3", "2")                       \
-                       : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-                       : "i" (EFLAGS_MASK));                             \
+#define __emulate_1op(_op, _dst, _eflags, _suffix)                     \
+       do {                                                            \
+               unsigned long _tmp;                                     \
+                                                                       \
+               __asm__ __volatile__ (                                  \
+                       _PRE_EFLAGS("0", "3", "2")                      \
+                       _op _suffix " %1; "                             \
+                       _POST_EFLAGS("0", "3", "2")                     \
+                       : "=m" (_eflags), "+m" ((_dst).val),            \
+                         "=&r" (_tmp)                                  \
+                       : "i" (EFLAGS_MASK));                           \
        } while (0)
 
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif                         /* __i386__ */
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+       do {                                                            \
+               switch ((_dst).bytes) {                                 \
+               case 1: __emulate_1op(_op, _dst, _eflags, "b"); break;  \
+               case 2: __emulate_1op(_op, _dst, _eflags, "w"); break;  \
+               case 4: __emulate_1op(_op, _dst, _eflags, "l"); break;  \
+               case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+               }                                                       \
+       } while (0)
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
@@ -1041,6 +1049,33 @@ done_prefixes:
                c->src.bytes = 1;
                c->src.val = insn_fetch(s8, 1, c->eip);
                break;
+       case SrcOne:
+               c->src.bytes = 1;
+               c->src.val = 1;
+               break;
+       }
+
+       /*
+        * Decode and fetch the second source operand: register, memory
+        * or immediate.
+        */
+       switch (c->d & Src2Mask) {
+       case Src2None:
+               break;
+       case Src2CL:
+               c->src2.bytes = 1;
+               c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+               break;
+       case Src2ImmByte:
+               c->src2.type = OP_IMM;
+               c->src2.ptr = (unsigned long *)c->eip;
+               c->src2.bytes = 1;
+               c->src2.val = insn_fetch(u8, 1, c->eip);
+               break;
+       case Src2One:
+               c->src2.bytes = 1;
+               c->src2.val = 1;
+               break;
        }
 
        /* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
                                               c->regs[VCPU_REGS_RSP]);
 }
 
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
-                               struct x86_emulate_ops *ops)
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+                      struct x86_emulate_ops *ops)
 {
        struct decode_cache *c = &ctxt->decode;
        int rc;
 
-       rc = ops->read_std(register_address(c, ss_base(ctxt),
-                                           c->regs[VCPU_REGS_RSP]),
-                          &c->dst.val, c->dst.bytes, ctxt->vcpu);
+       rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+                                                c->regs[VCPU_REGS_RSP]),
+                               &c->src.val, c->src.bytes, ctxt->vcpu);
        if (rc != 0)
                return rc;
 
-       register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+       register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+       return rc;
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+                               struct x86_emulate_ops *ops)
+{
+       struct decode_cache *c = &ctxt->decode;
+       int rc;
 
+       c->src.bytes = c->dst.bytes;
+       rc = emulate_pop(ctxt, ops);
+       if (rc != 0)
+               return rc;
+       c->dst.val = c->src.val;
        return 0;
 }
 
@@ -1415,24 +1463,15 @@ special_insn:
                emulate_1op("dec", c->dst, ctxt->eflags);
                break;
        case 0x50 ... 0x57:  /* push reg */
-               c->dst.type  = OP_MEM;
-               c->dst.bytes = c->op_bytes;
-               c->dst.val = c->src.val;
-               register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-                                          -c->op_bytes);
-               c->dst.ptr = (void *) register_address(
-                       c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
+               emulate_push(ctxt);
                break;
        case 0x58 ... 0x5f: /* pop reg */
        pop_instruction:
-               if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
-                       c->regs[VCPU_REGS_RSP]), c->dst.ptr,
-                       c->op_bytes, ctxt->vcpu)) != 0)
+               c->src.bytes = c->op_bytes;
+               rc = emulate_pop(ctxt, ops);
+               if (rc != 0)
                        goto done;
-
-               register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-                                          c->op_bytes);
-               c->dst.type = OP_NONE;  /* Disable writeback. */
+               c->dst.val = c->src.val;
                break;
        case 0x63:              /* movsxd */
                if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
                emulate_push(ctxt);
                break;
        case 0x9d: /* popf */
+               c->dst.type = OP_REG;
                c->dst.ptr = (unsigned long *) &ctxt->eflags;
+               c->dst.bytes = c->op_bytes;
                goto pop_instruction;
        case 0xa0 ... 0xa1:     /* mov */
                c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
                emulate_grp2(ctxt);
                break;
        case 0xc3: /* ret */
+               c->dst.type = OP_REG;
                c->dst.ptr = &c->eip;
+               c->dst.bytes = c->op_bytes;
                goto pop_instruction;
        case 0xc6 ... 0xc7:     /* mov (sole member of Grp11) */
        mov:
@@ -1778,7 +1821,7 @@ special_insn:
                        c->eip = saved_eip;
                        goto cannot_emulate;
                }
-               return 0;
+               break;
        case 0xf4:              /* hlt */
                ctxt->vcpu->arch.halt_request = 1;
                break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
                c->src.val &= (c->dst.bytes << 3) - 1;
                emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
                break;
+       case 0xa4: /* shld imm8, r, r/m */
+       case 0xa5: /* shld cl, r, r/m */
+               emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+               break;
        case 0xab:
              bts:              /* bts */
                /* only subword offset */
                c->src.val &= (c->dst.bytes << 3) - 1;
                emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
                break;
+       case 0xac: /* shrd imm8, r, r/m */
+       case 0xad: /* shrd cl, r, r/m */
+               emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+               break;
        case 0xae:              /* clflush */
                break;
        case 0xb0 ... 0xb1:     /* cmpxchg */
index 50a779264bb18fabcf21e3434f58a7963b1c691d..a7ed208f81e3e4bbb5ff4e3447d844dc77cd3371 100644 (file)
@@ -738,7 +738,7 @@ static void lguest_time_init(void)
 
        /* We can't set cpumask in the initializer: damn C limitations!  Set it
         * here and register our timer device. */
-       lguest_clockevent.cpumask = cpumask_of_cpu(0);
+       lguest_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&lguest_clockevent);
 
        /* Finally, we unblock the timer interrupt. */
index 37b9ae4d44c5b755470cb3661328b073e951b068..df167f2656228c83eb58ce2ebd4600074d766e77 100644 (file)
@@ -133,29 +133,28 @@ void __init time_init_hook(void)
  **/
 void mca_nmi_hook(void)
 {
-       /* If I recall correctly, there's a whole bunch of other things that
+       /*
+        * If I recall correctly, there's a whole bunch of other things that
         * we can do to check for NMI problems, but that's all I know about
         * at the moment.
         */
-
-       printk("NMI generated from unknown source!\n");
+       pr_warning("NMI generated from unknown source!\n");
 }
 #endif
 
 static __init int no_ipi_broadcast(char *str)
 {
        get_option(&str, &no_broadcast);
-       printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
-                                                                                       "IPI Broadcast");
+       pr_info("Using %s mode\n",
+               no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
        return 1;
 }
-
 __setup("no_ipi_broadcast=", no_ipi_broadcast);
 
 static int __init print_ipi_mode(void)
 {
-       printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
-                                                                                       "Shortcut");
+       pr_info("Using IPI %s mode\n",
+               no_broadcast ? "No-Shortcut" : "Shortcut");
        return 0;
 }
 
index 3624a364b7f37ad8f7105956c15f4e1385dd66e8..bc4c7840b2a8daad937ccf34751f3e2718e9781c 100644 (file)
@@ -42,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
         { }
 };
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
-        return cpumask_of_cpu(cpu);
+       cpus_clear(*retmask);
+       cpu_set(cpu, *retmask);
 }
 
 static int probe_bigsmp(void)
index 7b4e6d0d169001480eebb37c8b4bc3a32d1c6333..4ba5ccaa15842f358c5598b63536c06b51e637c7 100644 (file)
@@ -87,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 }
 #endif
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -97,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
index 71a309b122e672ef948fbe125f3637588ecf1149..511d7941364f8e268e0e00505ef63201c2247749 100644 (file)
@@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
index 2c6d234e0009ccd5b2ca50db816f599a5cd133b3..2821ffc188b57d9c1cb5a54d69a63fade0c6c198 100644 (file)
@@ -24,7 +24,7 @@ static int probe_summit(void)
        return 0;
 }
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -34,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic apic_summit = APIC_INIT("summit", probe_summit);
index 52145007bd7efb8e99a9516ddeb09af6ea1e445b..a5bc05492b1ee17c3cdfc04894cfe706047ef4f7 100644 (file)
@@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1;
 /* Used for the invalidate map that's also checked in the spinlock */
 static volatile unsigned long smp_invalidate_needed;
 
-/* Bitmask of currently online CPUs - used by setup.c for
-   /proc/cpuinfo, visible externally but still physical */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
-
 /* Bitmask of CPUs present in the system - exported by i386_syms.c, used
  * by scheduler but indexed physically */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 /* This is for the new dynamic CPU boot code */
 cpumask_t cpu_callin_map = CPU_MASK_NONE;
 cpumask_t cpu_callout_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* The per processor IRQ masks (these are usually kept in sync) */
 static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -679,7 +672,7 @@ void __init smp_boot_cpus(void)
 
        /* loop over all the extended VIC CPUs and boot them.  The
         * Quad CPUs must be bootstrapped by their extended VIC cpu */
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
                        continue;
                do_boot_cpu(i);
index cebcbf152d46b06b725b525debc573f3d91da71e..71a14f89f89e7367c72f2de85da6708acc83c069 100644 (file)
@@ -278,7 +278,7 @@ void __init numa_init_array(void)
        int rr, i;
 
        rr = first_node(node_online_map);
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
@@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
        memnodemap[0] = 0;
        node_set_online(0);
        node_set(0, node_possible_map);
-       for (i = 0; i < NR_CPUS; i++)
+       for (i = 0; i < nr_cpu_ids; i++)
                numa_set_node(i, 0);
        e820_register_active_regions(0, start_pfn, last_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
index 51c0a2fc14fe05521def4af57c0e2d1d29c8359d..09737c8af07479020e7183fef611c376e4fc1dad 100644 (file)
@@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                int node = early_cpu_to_node(i);
 
                if (node == NUMA_NO_NODE)
index 1d88d2b39771e2a89e0cb4ac53e98886354f4956..9e5752fe4d153d3c3a472b392363846f0de3a51a 100644 (file)
@@ -4,7 +4,7 @@
 #include <linux/irq.h>
 #include <linux/dmi.h>
 #include <asm/numa.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 struct pci_root_info {
        char *name;
index 22e057665e5517971a67b84a0e60e2ffe87ad151..9bb09823b3622410542753309cd3f546e207f923 100644 (file)
@@ -2,7 +2,7 @@
 #include <linux/pci.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/pci-direct.h>
index bb1a01f089e29c817432907a820ca7a06f727347..62ddb73e09ed5767affbc3f1657a25ff67a3912e 100644 (file)
@@ -14,8 +14,7 @@
 #include <asm/segment.h>
 #include <asm/io.h>
 #include <asm/smp.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
                                PCI_PROBE_MMCONF;
index 9a5af6c8fbe9fc445fdca3358b90666c9ecae7bb..bd13c3e4c6db6687f8336d1471575fea006d6728 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * Functions for accessing PCI base (first 256 bytes) and extended
index 86631ccbc25a6f7e9975f0c3f4999e6d022d84b6..f6adf2c6d7514148eb7e6221dea010f55158f97d 100644 (file)
@@ -2,7 +2,7 @@
 #include <linux/pci.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Direct PCI access. This is used for PCI accesses in early boot before
    the PCI subsystem works. */
index 2051dc96b8e94aee2b63af14d3d2a8a9486cfdcd..7d388d5cf54852136da06d0ca08b04c646fc7199 100644 (file)
@@ -6,8 +6,7 @@
 #include <linux/dmi.h>
 #include <linux/pci.h>
 #include <linux/init.h>
-#include "pci.h"
-
+#include <asm/pci_x86.h>
 
 static void __devinit pci_fixup_i450nx(struct pci_dev *d)
 {
index 844df0cbbd3e011784531ddb6e6ff1a06e7ec83c..e51bf2cda4b0279fbf1502dfadbe9eb4ba9bbe2f 100644 (file)
@@ -34,8 +34,8 @@
 
 #include <asm/pat.h>
 #include <asm/e820.h>
+#include <asm/pci_x86.h>
 
-#include "pci.h"
 
 static int
 skip_isa_ioresource_align(struct pci_dev *dev) {
index d6c950f818585aea6df65e38e18c2f85ab217941..bec3b048e72b6b0148efad321b6c24279b73e8df 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/pci.h>
 #include <linux/init.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* arch_initcall has too random ordering, so call the initializers
    in the right sequence from here. */
index bf69dbe08bff66b19b8e9a7558d152fd4fce62f7..373b9afe6d4445b08efee66dfb4856519b434bd1 100644 (file)
@@ -16,8 +16,7 @@
 #include <asm/io_apic.h>
 #include <linux/irq.h>
 #include <linux/acpi.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
 #define PIRQ_VERSION 0x0100
index b722dd481b3975b2b9b171203a42f6406b9cafed..f1065b129e9cf981fa52c374dcf3bf2c4987fe36 100644 (file)
@@ -3,7 +3,7 @@
  */
 #include <linux/init.h>
 #include <linux/pci.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * Discover remaining PCI buses in case there are peer host bridges.
index 654a2234f8f3ba8adbcba6e644c7b7d4935488bc..89bf9242c80a3b068fa2c1fcf3f6cba4b447193d 100644 (file)
@@ -15,8 +15,7 @@
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* aperture is up to 256MB but BIOS may reserve less */
 #define MMCONFIG_APER_MIN      (2 * 1024*1024)
index f3c761dce6957118494ff8f2492edae9c7d9e651..8b2d561046a3bba83ef3953171fb1abdd5a195b5 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <asm/e820.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Assume systems with more busses have correct MCFG */
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
index a1994163c99dd328e3c542d1bc0f5dd246e583cf..30007ffc8e11d203922d2c8304c5d0bdb909ef33 100644 (file)
@@ -10,8 +10,7 @@
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
 #include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /* Static virtual mapping of the MMCONFIG aperture */
 struct mmcfg_virt {
index 1177845d31863602e9bf0dbf8bc4e031db3f6487..2089354968a21f67ed3fe59e1e1cf26e777a1095 100644 (file)
@@ -7,7 +7,7 @@
 #include <linux/nodemask.h>
 #include <mach_apic.h>
 #include <asm/mpspec.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 #define XQUAD_PORTIO_BASE 0xfe400000
 #define XQUAD_PORTIO_QUAD 0x40000  /* 256k per quad. */
index e11e9e803d5fd2acd8135d73e2f07c33c6684b99..b889d824f7c6aee4eba1a54ba0404609c3d8fbd5 100644 (file)
@@ -29,7 +29,7 @@
 #include <linux/init.h>
 #include <asm/olpc.h>
 #include <asm/geode.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
 
 /*
  * In the tables below, the first two line (8 longwords) are the
index 37472fc6f72949b2d54910702962f7361e12bed6..b82cae970dfd6d1d2dc9d5bbef3801841f57d1a8 100644 (file)
@@ -6,9 +6,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/uaccess.h>
-#include "pci.h"
-#include "pci-functions.h"
-
+#include <asm/pci_x86.h>
+#include <asm/mach-default/pci-functions.h>
 
 /* BIOS32 signature: "_32_" */
 #define BIOS32_SIGNATURE       (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
index 42f4cb19facab8a47b70c85fcfd819cb086ade3b..16d0c0eb0d19670692aedf958a05b381ee47b7c1 100644 (file)
@@ -9,11 +9,10 @@
 #include <linux/init.h>
 
 #include <asm/setup.h>
+#include <asm/pci_x86.h>
 #include <asm/visws/cobalt.h>
 #include <asm/visws/lithium.h>
 
-#include "pci.h"
-
 static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
 static void pci_visws_disable_irq(struct pci_dev *dev) { }
 
index 773d68d3e9128eba414a31ae3fa3dbc989033d72..503c240e26c73539c2d4061f0d186b92a7c20c83 100644 (file)
@@ -1082,7 +1082,7 @@ static void drop_other_mm_ref(void *info)
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
 {
-       cpumask_t mask;
+       cpumask_var_t mask;
        unsigned cpu;
 
        if (current->active_mm == mm) {
@@ -1094,7 +1094,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
        }
 
        /* Get the "official" set of cpus referring to our pagetable. */
-       mask = mm->cpu_vm_mask;
+       if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+               for_each_online_cpu(cpu) {
+                       if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+                           && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+                               continue;
+                       smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+               }
+               return;
+       }
+       cpumask_copy(mask, &mm->cpu_vm_mask);
 
        /* It's possible that a vcpu may have a stale reference to our
           cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1103,11 +1112,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
           if needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
-                       cpu_set(cpu, mask);
+                       cpumask_set_cpu(cpu, mask);
        }
 
-       if (!cpus_empty(mask))
-               smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+       if (!cpumask_empty(mask))
+               smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+       free_cpumask_var(mask);
 }
 #else
 static void xen_drop_mm_ref(struct mm_struct *mm)
index acd9b6705e024f0833614f209caf2ab0da62807a..c44e2069c7c78078ab7ee895acf817689de48443 100644 (file)
@@ -33,7 +33,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-cpumask_t xen_cpu_initialized_map;
+cpumask_var_t xen_cpu_initialized_map;
 
 static DEFINE_PER_CPU(int, resched_irq);
 static DEFINE_PER_CPU(int, callfunc_irq);
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
 {
        int i, rc;
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0) {
                        num_processors++;
@@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
        if (xen_smp_intr_init(0))
                BUG();
 
-       xen_cpu_initialized_map = cpumask_of_cpu(0);
+       if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+               panic("could not allocate xen_cpu_initialized_map\n");
+
+       cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
 
        /* Restrict the possible_map according to max_cpus. */
        while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-               for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--)
+               for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
                        continue;
                cpu_clear(cpu, cpu_possible_map);
        }
@@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
 
-       if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
+       if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;
 
        ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu)
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
 
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+static void xen_send_IPI_mask(const struct cpumask *mask,
+                             enum ipi_vector vector)
 {
        unsigned cpu;
 
-       cpus_and(mask, mask, cpu_online_map);
-
-       for_each_cpu_mask_nr(cpu, mask)
+       for_each_cpu_and(cpu, mask, cpu_online_mask)
                xen_send_IPI_one(cpu, vector);
 }
 
-static void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
        int cpu;
 
        xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
        /* Make sure other vcpus get a chance to run if they need to. */
-       for_each_cpu_mask_nr(cpu, mask) {
+       for_each_cpu(cpu, mask) {
                if (xen_vcpu_stolen(cpu)) {
                        HYPERVISOR_sched_op(SCHEDOP_yield, 0);
                        break;
@@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
 
 static void xen_smp_send_call_function_single_ipi(int cpu)
 {
-       xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+       xen_send_IPI_mask(cpumask_of(cpu),
+                         XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
index 2a234db5949beb39e21e5fa09089c3740d802940..212ffe012b76656231834f2a9026db28d289e080 100644 (file)
@@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled)
                        pfn_to_mfn(xen_start_info->console.domU.mfn);
        } else {
 #ifdef CONFIG_SMP
-               xen_cpu_initialized_map = cpu_online_map;
+               BUG_ON(xen_cpu_initialized_map == NULL);
+               cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
 #endif
                xen_vcpu_restore();
        }
index c9f7cda48ed78ecbe1b421c540bbe2b777681400..65d75a6be0ba586eeaf22faffd466b1ec109eb5d 100644 (file)
@@ -437,7 +437,7 @@ void xen_setup_timer(int cpu)
        evt = &per_cpu(xen_clock_events, cpu);
        memcpy(evt, xen_clockevent, sizeof(*evt));
 
-       evt->cpumask = cpumask_of_cpu(cpu);
+       evt->cpumask = cpumask_of(cpu);
        evt->irq = irq;
 
        setup_runstate_info(cpu);
index 9e1afae8461f1eabc04e625c06d9ef329a73b8f2..c1f8faf0a2c5a49d0676e24aaad161323bec6135 100644 (file)
@@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void);
 __cpuinit void xen_init_lock_cpu(int cpu);
 void xen_uninit_lock_cpu(int cpu);
 
-extern cpumask_t xen_cpu_initialized_map;
+extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
 #endif
index 64f5d54f7edcee095ae8ead0a2f29d032ce546cd..4259072f5bd0960e113bb97d0f8aaa1818a8ab58 100644 (file)
@@ -109,7 +109,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  */
 static ssize_t print_cpus_map(char *buf, cpumask_t *map)
 {
-       int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map);
+       int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
 
        buf[n++] = '\n';
        buf[n] = '\0';
index f5207090885a8fc80f28b670b95a6fa13ca87418..91636cd8b6c9ad2e12c2bb0a4925397b6b0c454b 100644 (file)
@@ -30,8 +30,8 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
        BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
 
        len = type?
-               cpulist_scnprintf(buf, PAGE_SIZE-2, *mask):
-               cpumask_scnprintf(buf, PAGE_SIZE-2, *mask);
+               cpulist_scnprintf(buf, PAGE_SIZE-2, mask) :
+               cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
index 199cd97e32e64b2c56338b2d74eb68ab5ce01de3..a8bc1cbcfa7cb2bb9c5e00fe68b17f0644749026 100644 (file)
@@ -49,8 +49,8 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 
        if (len > 1) {
                n = type?
-                       cpulist_scnprintf(buf, len-2, *mask):
-                       cpumask_scnprintf(buf, len-2, *mask);
+                       cpulist_scnprintf(buf, len-2, mask) :
+                       cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
index f450588e5858884ea16964f407b825de5c673cc7..254f1064d97354993c44b6e1185d23d4bd286aab 100644 (file)
@@ -154,7 +154,6 @@ static struct tc_clkevt_device clkevt = {
                .shift          = 32,
                /* Should be lower than at91rm9200's system timer */
                .rating         = 125,
-               .cpumask        = CPU_MASK_CPU0,
                .set_next_event = tc_next_event,
                .set_mode       = tc_mode,
        },
@@ -195,6 +194,7 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
        clkevt.clkevt.max_delta_ns
                = clockevent_delta2ns(0xffff, &clkevt.clkevt);
        clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1;
+       clkevt.clkevt.cpumask = cpumask_of(0);
 
        setup_irq(irq, &tc_irqaction);
 
index a1039068f95c378cd85b2d6ff2f0f6729c003db2..415fab0125acd8e465e1fe2e6fe8241d75894868 100644 (file)
@@ -222,11 +222,16 @@ bool check_syscall_vector(struct lguest *lg)
 int init_interrupts(void)
 {
        /* If they want some strange system call vector, reserve it now */
-       if (syscall_vector != SYSCALL_VECTOR
-           && test_and_set_bit(syscall_vector, used_vectors)) {
-               printk("lg: couldn't reserve syscall %u\n", syscall_vector);
-               return -EBUSY;
+       if (syscall_vector != SYSCALL_VECTOR) {
+               if (test_bit(syscall_vector, used_vectors) ||
+                   vector_used_by_percpu_irq(syscall_vector)) {
+                       printk(KERN_ERR "lg: couldn't reserve syscall %u\n",
+                                syscall_vector);
+                       return -EBUSY;
+               }
+               set_bit(syscall_vector, used_vectors);
        }
+
        return 0;
 }
 
index 7beffcab274548545c15673025cbd639ce563094..9dedbbd218c3967932a83f866d5338034f664457 100644 (file)
@@ -704,16 +704,17 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void iosapic_set_affinity_irq(unsigned int irq,
+                                    const struct cpumask *dest)
 {
        struct vector_info *vi = iosapic_get_vector(irq);
        u32 d0, d1, dummy_d0;
        unsigned long flags;
 
-       if (cpu_check_affinity(irq, &dest))
+       if (cpu_check_affinity(irq, dest))
                return;
 
-       vi->txn_addr = txn_affinity_addr(irq, first_cpu(dest));
+       vi->txn_addr = txn_affinity_addr(irq, cpumask_first(dest));
 
        spin_lock_irqsave(&iosapic_lock, flags);
        /* d1 contains the destination CPU, so only want to set that
index 8514c3a1746a6c15e4ff3db1dd43b7c1dcefdee2..c2e1bcbb28a79edf262926174d805e23b811015e 100644 (file)
@@ -45,7 +45,7 @@
 
 #include "cpqphp.h"
 #include "cpqphp_nvram.h"
-#include "../../../arch/x86/pci/pci.h" /* horrible hack showing how processor dependent we are... */
+#include <asm/pci_x86.h>
 
 
 /* Global variables */
index 09021930589fb477fc885b80a0dd7623c817e676..df146be9d2e9f33cb6c6c8eff1d72b8de5e86bc3 100644 (file)
@@ -37,7 +37,7 @@
 #include "../pci.h"
 #include "cpqphp.h"
 #include "cpqphp_nvram.h"
-#include "../../../arch/x86/pci/pci.h" /* horrible hack showing how processor dependent we are... */
+#include <asm/pci_x86.h>
 
 
 u8 cpqhp_nic_irq;
index 633e743442ac711d0b02d6b85a8252a1ee74dbb5..dd18f857dfb042d8d5802bfbbab57c617054b3f4 100644 (file)
@@ -35,7 +35,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include "../pci.h"
-#include "../../../arch/x86/pci/pci.h" /* for struct irq_routing_table */
+#include <asm/pci_x86.h>               /* for struct irq_routing_table */
 #include "ibmphp.h"
 
 #define attn_on(sl)  ibmphp_hpc_writeslot (sl, HPC_SLOT_ATTNON)
index 5d72866897a8afc3397fe7b599b74de64eecbe22..c88485860a0ad3dfff19485a417aaf112b051f08 100644 (file)
@@ -74,7 +74,7 @@ static ssize_t local_cpus_show(struct device *dev,
        int len;
 
        mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-       len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
+       len = cpumask_scnprintf(buf, PAGE_SIZE-2, &mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
@@ -88,7 +88,7 @@ static ssize_t local_cpulist_show(struct device *dev,
        int len;
 
        mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-       len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask);
+       len = cpulist_scnprintf(buf, PAGE_SIZE-2, &mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
index 003a9b3c293fd6937682d061f2b4e242ab673adc..5b3f5937ecf556429217afa10d58b0053f273269 100644 (file)
@@ -55,8 +55,8 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev,
 
        cpumask = pcibus_to_cpumask(to_pci_bus(dev));
        ret = type?
-               cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask):
-               cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask);
+               cpulist_scnprintf(buf, PAGE_SIZE-2, &cpumask) :
+               cpumask_scnprintf(buf, PAGE_SIZE-2, &cpumask);
        buf[ret++] = '\n';
        buf[ret] = '\0';
        return ret;
index e26733a9df21d5ca583ad09bea26dfea909873da..eb0dfdeaa9494ac3043f13db5cbd1bf0c0dd5751 100644 (file)
@@ -585,7 +585,7 @@ void rebind_evtchn_irq(int evtchn, int irq)
        spin_unlock(&irq_mapping_update_lock);
 
        /* new event channels are always bound to cpu 0 */
-       irq_set_affinity(irq, cpumask_of_cpu(0));
+       irq_set_affinity(irq, cpumask_of(0));
 
        /* Unmask the event channel. */
        enable_irq(irq);
@@ -614,9 +614,9 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 }
 
 
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
+static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
-       unsigned tcpu = first_cpu(dest);
+       unsigned tcpu = cpumask_first(dest);
        rebind_irq_to_cpu(irq, tcpu);
 }
 
index c16d9be1b017c9ad8cdf6289fddad01df22732bd..3bbdb9d023767a142f8917bc793437abf586f211 100644 (file)
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
        if (IS_ERR(anon_inode_inode))
                return -ENODEV;
 
+       if (fops->owner && !try_module_get(fops->owner))
+               return -ENOENT;
+
        error = get_unused_fd_flags(flags);
        if (error < 0)
-               return error;
+               goto err_module;
        fd = error;
 
        /*
@@ -128,6 +131,8 @@ err_dput:
        dput(dentry);
 err_put_unused_fd:
        put_unused_fd(fd);
+err_module:
+       module_put(fops->owner);
        return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
index 54bbf6e04ee8f3bbd652c30254ef6553e38866c4..0e9e2bc0ee96e66d08beb37bb71933ab2ece7636 100644 (file)
@@ -40,6 +40,9 @@
 #ifndef node_to_cpumask
 #define node_to_cpumask(node)  ((void)node, cpu_online_map)
 #endif
+#ifndef cpumask_of_node
+#define cpumask_of_node(node)  ((void)node, cpu_online_mask)
+#endif
 #ifndef node_to_first_cpu
 #define node_to_first_cpu(node)        ((void)(node),0)
 #endif
                                )
 #endif
 
+#ifndef cpumask_of_pcibus
+#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ?            \
+                                cpu_all_mask :                         \
+                                cpumask_of_node(pcibus_to_node(bus)))
+#endif
+
 #endif /* CONFIG_NUMA */
 
-/* returns pointer to cpumask for specified node */
+/*
+ * returns pointer to cpumask for specified node
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
 #ifndef node_to_cpumask_ptr
 
 #define        node_to_cpumask_ptr(v, node)                                    \
index c5dd66916692ad1723fa29bd34089165afef582c..b96a6d2ffbc3f16161cb7fbc8e85de866dfeaaf3 100644 (file)
@@ -63,8 +63,6 @@ extern volatile int cpu_2_physid[NR_CPUS];
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_possible_map;
-extern cpumask_t cpu_present_map;
 
 static __inline__ int hard_smp_processor_id(void)
 {
index ed3a5d473e52b2282c0d09c2c25d406400dc7522..cea153697ec788a3c3d61c2402a3f1188b5b06c7 100644 (file)
@@ -82,13 +82,13 @@ struct clock_event_device {
        int                     shift;
        int                     rating;
        int                     irq;
-       cpumask_t               cpumask;
+       const struct cpumask    *cpumask;
        int                     (*set_next_event)(unsigned long evt,
                                                  struct clock_event_device *);
        void                    (*set_mode)(enum clock_event_mode mode,
                                            struct clock_event_device *);
        void                    (*event_handler)(struct clock_event_device *);
-       void                    (*broadcast)(cpumask_t mask);
+       void                    (*broadcast)(const struct cpumask *mask);
        struct list_head        list;
        enum clock_event_mode   mode;
        ktime_t                 next_event;
index 2befe6513ce4ab31613d26b22b398bbce627598f..8005effc04f19097aa781caf016a666e492adcb1 100644 (file)
@@ -2,6 +2,10 @@
 #error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
 #endif
 
+#if __GNUC_MINOR__ < 2
+# error Sorry, your compiler is too old - please upgrade it.
+#endif
+
 #if __GNUC_MINOR__ >= 3
 # define __used                        __attribute__((__used__))
 #else
index 21e1dd43e52aff50afe75a05d067492407ced320..d4bf52603e6bfbdab95eb81b65bad0dce510356c 100644 (file)
@@ -339,36 +339,6 @@ extern cpumask_t cpu_mask_all;
 #endif
 #define        CPUMASK_PTR(v, m)       cpumask_t *v = &(m->v)
 
-#define cpumask_scnprintf(buf, len, src) \
-                       __cpumask_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpumask_scnprintf(char *buf, int len,
-                                       const cpumask_t *srcp, int nbits)
-{
-       return bitmap_scnprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpumask_parse_user(ubuf, ulen, dst) \
-                       __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
-static inline int __cpumask_parse_user(const char __user *buf, int len,
-                                       cpumask_t *dstp, int nbits)
-{
-       return bitmap_parse_user(buf, len, dstp->bits, nbits);
-}
-
-#define cpulist_scnprintf(buf, len, src) \
-                       __cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpulist_scnprintf(char *buf, int len,
-                                       const cpumask_t *srcp, int nbits)
-{
-       return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
-static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
-{
-       return bitmap_parselist(buf, dstp->bits, nbits);
-}
-
 #define cpu_remap(oldbit, old, new) \
                __cpu_remap((oldbit), &(old), &(new), NR_CPUS)
 static inline int __cpu_remap(int oldbit,
@@ -540,9 +510,6 @@ extern cpumask_t cpu_active_map;
        [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \
 }
 
-/* This produces more efficient code. */
-#define nr_cpumask_bits        NR_CPUS
-
 #else /* NR_CPUS > BITS_PER_LONG */
 
 #define CPU_BITS_ALL                                           \
@@ -550,9 +517,15 @@ extern cpumask_t cpu_active_map;
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD         \
 }
+#endif /* NR_CPUS > BITS_PER_LONG */
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
+ * not all bits may be allocated. */
 #define nr_cpumask_bits        nr_cpu_ids
-#endif /* NR_CPUS > BITS_PER_LONG */
+#else
+#define nr_cpumask_bits        NR_CPUS
+#endif
 
 /* verify cpu argument to cpumask_* operators */
 static inline unsigned int cpumask_check(unsigned int cpu)
@@ -945,6 +918,63 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_of(cpu) (get_cpu_mask(cpu))
 
+/**
+ * cpumask_scnprintf - print a cpumask into a string as comma-separated hex
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpumask_scnprintf(char *buf, int len,
+                                   const struct cpumask *srcp)
+{
+       return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpumask_parse_user - extract a cpumask from a user string
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpumask_parse_user(const char __user *buf, int len,
+                                    struct cpumask *dstp)
+{
+       return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_scnprintf - print a cpumask into a string as comma-separated list
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpulist_scnprintf(char *buf, int len,
+                                   const struct cpumask *srcp)
+{
+       return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_parse_user - extract a cpumask from a user string of ranges
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
+{
+       return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits);
+}
+
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
index 8cc8ef47f5b63ac3f7460214bdebe346a9b0de7d..990355fbc54ee64004f7a2150552b2db2d1e6682 100644 (file)
@@ -111,13 +111,13 @@ extern void enable_irq(unsigned int irq);
 
 extern cpumask_t irq_default_affinity;
 
-extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
 #else /* CONFIG_SMP */
 
-static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
 {
        return -EINVAL;
 }
index d64a6d49bdef0c9d194db7bb555d6718de4a4157..f899b502f18622dce2c509aac5c656ef8150216f 100644 (file)
@@ -113,7 +113,8 @@ struct irq_chip {
        void            (*eoi)(unsigned int irq);
 
        void            (*end)(unsigned int irq);
-       void            (*set_affinity)(unsigned int irq, cpumask_t dest);
+       void            (*set_affinity)(unsigned int irq,
+                                       const struct cpumask *dest);
        int             (*retrigger)(unsigned int irq);
        int             (*set_type)(unsigned int irq, unsigned int flow_type);
        int             (*set_wake)(unsigned int irq, unsigned int on);
index f18b86fa86553ba02ef8cb2df0b61ac116c4fdf5..35525ac63337112e0dd61d5bb87bb494c3e25de2 100644 (file)
@@ -83,6 +83,7 @@ struct kvm_irqchip {
 #define KVM_EXIT_S390_SIEIC       13
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
+#define KVM_EXIT_NMI              16
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
@@ -387,6 +388,14 @@ struct kvm_trace_rec {
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
+#if defined(CONFIG_X86)
+#define KVM_CAP_DEVICE_MSI 20
+#endif
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#if defined(CONFIG_X86)
+#define KVM_CAP_USER_NMI 22
+#endif
 
 /*
  * ioctls for VM fds
@@ -458,6 +467,8 @@ struct kvm_trace_rec {
 #define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_NMI */
+#define KVM_NMI                   _IO(KVMIO,  0x9a)
 
 #define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
 #define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)
@@ -500,10 +511,17 @@ struct kvm_assigned_irq {
        __u32 guest_irq;
        __u32 flags;
        union {
+               struct {
+                       __u32 addr_lo;
+                       __u32 addr_hi;
+                       __u32 data;
+               } guest_msi;
                __u32 reserved[12];
        };
 };
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
 
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI  (1 << 0)
+
 #endif
index bb92be2153bc364cc9c0876236b3867644ea7b85..eafabd5c66b2951234d8d157520771514e9e83f0 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/marker.h>
+#include <linux/msi.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -306,8 +307,14 @@ struct kvm_assigned_dev_kernel {
        int host_busnr;
        int host_devfn;
        int host_irq;
+       bool host_irq_disabled;
        int guest_irq;
-       int irq_requested;
+       struct msi_msg guest_msi;
+#define KVM_ASSIGNED_DEV_GUEST_INTX    (1 << 0)
+#define KVM_ASSIGNED_DEV_GUEST_MSI     (1 << 1)
+#define KVM_ASSIGNED_DEV_HOST_INTX     (1 << 8)
+#define KVM_ASSIGNED_DEV_HOST_MSI      (1 << 9)
+       unsigned long irq_requested_type;
        int irq_source_id;
        struct pci_dev *dev;
        struct kvm *kvm;
@@ -316,8 +323,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-                                    struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
index 8395e715809d382bb7f3a5ed7087173e99cd15d2..158d53d07765888af5c2e7bdc4515373983d930e 100644 (file)
@@ -250,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 extern int runqueue_is_locked(void);
 extern void task_rq_unlock_wait(struct task_struct *p);
 
-extern cpumask_t nohz_cpu_mask;
+extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 #else
@@ -758,20 +758,51 @@ enum cpu_idle_type {
 #define SD_SERIALIZE           1024    /* Only a single load balancing instance */
 #define SD_WAKE_IDLE_FAR       2048    /* Gain latency sacrificing cache hit */
 
-#define BALANCE_FOR_MC_POWER   \
-       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+enum powersavings_balance_level {
+       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+                                        * first for long running threads
+                                        */
+       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+                                        * cpu package for power savings
+                                        */
+       MAX_POWERSAVINGS_BALANCE_LEVELS
+};
 
-#define BALANCE_FOR_PKG_POWER  \
-       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
-        SD_POWERSAVINGS_BALANCE : 0)
+extern int sched_mc_power_savings, sched_smt_power_savings;
 
-#define test_sd_parent(sd, flag)       ((sd->parent &&         \
-                                        (sd->parent->flags & flag)) ? 1 : 0)
+static inline int sd_balance_for_mc_power(void)
+{
+       if (sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
 
+       return 0;
+}
+
+static inline int sd_balance_for_package_power(void)
+{
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
+
+       return 0;
+}
+
+/*
+ * Optimise SD flags for power savings:
+ * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
+ * Keep default SD flags if sched_{smt,mc}_power_saving=0
+ */
+
+static inline int sd_power_saving_flags(void)
+{
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_BALANCE_NEWIDLE;
+
+       return 0;
+}
 
 struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
 
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -784,8 +815,15 @@ struct sched_group {
         * (see include/linux/reciprocal_div.h)
         */
        u32 reciprocal_cpu_power;
+
+       unsigned long cpumask[];
 };
 
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+       return to_cpumask(sg->cpumask);
+}
+
 enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
@@ -809,7 +847,6 @@ struct sched_domain {
        struct sched_domain *parent;    /* top domain must be null terminated */
        struct sched_domain *child;     /* bottom domain must be null terminated */
        struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
        unsigned long min_interval;     /* Minimum balance interval ms */
        unsigned long max_interval;     /* Maximum balance interval ms */
        unsigned int busy_factor;       /* less balancing by factor if busy */
@@ -864,18 +901,35 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_DEBUG
        char *name;
 #endif
+
+       /* span of all CPUs in this domain */
+       unsigned long span[];
 };
 
-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+       return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
+/* Test a flag in parent sched domain */
+static inline int test_sd_parent(struct sched_domain *sd, int flag)
+{
+       if (sd->parent && (sd->parent->flags & flag))
+               return 1;
+
+       return 0;
+}
+
 #else /* CONFIG_SMP */
 
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                        struct sched_domain_attr *dattr_new)
 {
 }
@@ -926,7 +980,7 @@ struct sched_class {
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
        void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
 
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@ -1579,12 +1633,12 @@ extern cputime_t task_gtime(struct task_struct *p);
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
 #else
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
 {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                return -EINVAL;
        return 0;
 }
@@ -2195,10 +2249,8 @@ __trace_special(void *__tr, void *__data,
 }
 #endif
 
-extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
-extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
-
-extern int sched_mc_power_savings, sched_smt_power_savings;
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 extern void normalize_rt_tasks(void);
 
index 0c5b5ac36d8edc06bb6eb68975f8223e50631c05..e632d29f054473c2ca34186185f1d7bb797ef931 100644 (file)
@@ -125,7 +125,8 @@ int arch_update_cpu_topology(void);
                                | SD_WAKE_AFFINE        \
                                | SD_WAKE_BALANCE       \
                                | SD_SHARE_PKG_RESOURCES\
-                               | BALANCE_FOR_MC_POWER, \
+                               | sd_balance_for_mc_power()\
+                               | sd_power_saving_flags(),\
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
 }
@@ -150,7 +151,8 @@ int arch_update_cpu_topology(void);
                                | SD_BALANCE_FORK       \
                                | SD_WAKE_AFFINE        \
                                | SD_WAKE_BALANCE       \
-                               | BALANCE_FOR_PKG_POWER,\
+                               | sd_balance_for_package_power()\
+                               | sd_power_saving_flags(),\
        .last_balance           = jiffies,              \
        .balance_interval       = 1,                    \
 }
index 13627191a60d194de08aaa4b410aa752cfd7cb21..f6281711166d5dbeba4f55121547396d00b6ff2e 100644 (file)
@@ -924,6 +924,15 @@ config KMOD
 
 endif # MODULES
 
+config INIT_ALL_POSSIBLE
+       bool
+       help
+         Back when each arch used to define their own cpu_online_map and
+         cpu_possible_map, some of them chose to initialize cpu_possible_map
+         with all 1s, and others with all 0s.  When they were centralised,
+         it was better to provide this option than to break all the archs
+         and have several arch maintainers persuing me down dark alleys.
+
 config STOP_MACHINE
        bool
        default y
index 8ea32e8d68b05c95b211830deee92d9a7929637b..bae131a1211b55a9051bfe581dd74df8e72dc248 100644 (file)
 cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
-#ifndef CONFIG_SMP
-
 /*
  * Represents all cpu's that are currently online.
  */
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+#ifdef CONFIG_INIT_ALL_POSSIBLE
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+#else
+cpumask_t cpu_possible_map __read_mostly;
+#endif
 EXPORT_SYMBOL(cpu_possible_map);
 
-#else /* CONFIG_SMP */
-
+#ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
index 96c0ba13b8cd06add25de5043547b62cef6ef7b5..39c1a4c1c5a926a58a8ba132f684b97ebdf060bf 100644 (file)
@@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
-               retval = cpulist_parse(buf, trialcs.cpus_allowed);
+               retval = cpulist_parse(buf, &trialcs.cpus_allowed);
                if (retval < 0)
                        return retval;
 
@@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
        mask = cs->cpus_allowed;
        mutex_unlock(&callback_mutex);
 
-       return cpulist_scnprintf(page, PAGE_SIZE, mask);
+       return cpulist_scnprintf(page, PAGE_SIZE, &mask);
 }
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
index 6eb3c7952b6496fc9c5f8da49b982d79866e1504..f63c706d25e15f481f61548dd248d1eaf69702bb 100644 (file)
@@ -46,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-       cpus_setall(desc->affinity);
+       cpumask_setall(&desc->affinity);
 #endif
        spin_unlock_irqrestore(&desc->lock, flags);
 }
index 540f6c49f3fa156b2bd0d61ad2c5090ea0e46013..61c4a9b6216546aac546b4a6ab1427ed4ac2a00a 100644 (file)
@@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq)
  *     @cpumask:       cpumask
  *
  */
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-               desc->affinity = cpumask;
+               cpumask_copy(&desc->affinity, cpumask);
                desc->chip->set_affinity(irq, cpumask);
        } else {
                desc->status |= IRQ_MOVE_PENDING;
-               desc->pending_mask = cpumask;
+               cpumask_copy(&desc->pending_mask, cpumask);
        }
 #else
-       desc->affinity = cpumask;
+       cpumask_copy(&desc->affinity, cpumask);
        desc->chip->set_affinity(irq, cpumask);
 #endif
        desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
  */
 int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 {
-       cpumask_t mask;
-
        if (!irq_can_set_affinity(irq))
                return 0;
 
-       cpus_and(mask, cpu_online_map, irq_default_affinity);
-
        /*
         * Preserve an userspace affinity setup, but make sure that
         * one of the targets is online.
         */
        if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-               if (cpus_intersects(desc->affinity, cpu_online_map))
-                       mask = desc->affinity;
+               if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+                   < nr_cpu_ids)
+                       goto set_affinity;
                else
                        desc->status &= ~IRQ_AFFINITY_SET;
        }
 
-       desc->affinity = mask;
-       desc->chip->set_affinity(irq, mask);
+       cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+set_affinity:
+       desc->chip->set_affinity(irq, &desc->affinity);
 
        return 0;
 }
index 9db681d95814baa2662fdccba0755b09579577cf..bd72329e630c6cc6f3958bf4d70484f9124f4330 100644 (file)
@@ -4,7 +4,6 @@
 void move_masked_irq(int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-       cpumask_t tmp;
 
        if (likely(!(desc->status & IRQ_MOVE_PENDING)))
                return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
 
        desc->status &= ~IRQ_MOVE_PENDING;
 
-       if (unlikely(cpus_empty(desc->pending_mask)))
+       if (unlikely(cpumask_empty(&desc->pending_mask)))
                return;
 
        if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
 
        assert_spin_locked(&desc->lock);
 
-       cpus_and(tmp, desc->pending_mask, cpu_online_map);
-
        /*
         * If there was a valid mask to work with, please
         * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
         * For correct operation this depends on the caller
         * masking the irqs.
         */
-       if (likely(!cpus_empty(tmp))) {
-               desc->chip->set_affinity(irq,tmp);
+       if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+                  < nr_cpu_ids)) {
+               cpumask_and(&desc->affinity,
+                           &desc->pending_mask, cpu_online_mask);
+               desc->chip->set_affinity(irq, &desc->affinity);
        }
-       cpus_clear(desc->pending_mask);
+       cpumask_clear(&desc->pending_mask);
 }
 
 void move_native_irq(int irq)
index f6b3440f05bc50dbe68cdb7026e4e287e227b934..d2c0e5ee53c573019f45942e4d63bb362db42551 100644 (file)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
                const char __user *buffer, size_t count, loff_t *pos)
 {
        unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
-       cpumask_t new_value;
+       cpumask_var_t new_value;
        int err;
 
        if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
                return -EIO;
 
+       if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+               return -ENOMEM;
+
        err = cpumask_parse_user(buffer, count, new_value);
        if (err)
-               return err;
+               goto free_cpumask;
 
-       if (!is_affinity_mask_valid(new_value))
-               return -EINVAL;
+       if (!is_affinity_mask_valid(*new_value)) {
+               err = -EINVAL;
+               goto free_cpumask;
+       }
 
        /*
         * Do not allow disabling IRQs completely - it's a too easy
         * way to make the system unusable accidentally :-) At least
         * one online CPU still has to be targeted.
         */
-       if (!cpus_intersects(new_value, cpu_online_map))
+       if (!cpumask_intersects(new_value, cpu_online_mask)) {
                /* Special case for empty set - allow the architecture
                   code to set default SMP affinity. */
-               return irq_select_affinity_usr(irq) ? -EINVAL : count;
-
-       irq_set_affinity(irq, new_value);
+               err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+       } else {
+               irq_set_affinity(irq, new_value);
+               err = count;
+       }
 
-       return count;
+free_cpumask:
+       free_cpumask_var(new_value);
+       return err;
 }
 
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -95,7 +104,7 @@ static ssize_t default_affinity_write(struct file *file,
        cpumask_t new_value;
        int err;
 
-       err = cpumask_parse_user(buffer, count, new_value);
+       err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
 
index 60adefb59b5e24f4f3d916df1caab686f91bc9f4..4cb7d68fed82aabce7f74da9ecf30f3bb54b77e1 100644 (file)
@@ -442,7 +442,7 @@ void profile_tick(int type)
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
 {
-       int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+       int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
        if (count - len < 2)
                return -EINVAL;
        len += sprintf(page + len, "\n");
@@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file,
        unsigned long full_count = count, err;
        cpumask_t new_value;
 
-       err = cpumask_parse_user(buffer, count, new_value);
+       err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
 
index e503a002f330fb1f7ed74ef996384e74ae294d44..c03ca3e619190c44292aed969c6c58f947e0b35a 100644 (file)
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
                 * unnecessarily.
                 */
                smp_mb();
-               cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+               cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
 
                rcp->signaled = 0;
        }
index fff1c4a20b6538966a0cf2b97a012c045d52b84d..27ba1d642f0f0c4c370e81a61b067874310532d0 100644 (file)
@@ -498,18 +498,26 @@ struct rt_rq {
  */
 struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
 
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
 #endif
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake up cpu nominated by sched_mc balance that will be
+        * used when most cpus are idle in the system indicating overall very
+        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+#endif
 };
 
 /*
@@ -1514,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
        struct sched_domain *sd = data;
        int i;
 
-       for_each_cpu_mask(i, sd->span) {
+       for_each_cpu(i, sched_domain_span(sd)) {
                /*
                 * If there are currently no tasks on the cpu pretend there
                 * is one of average load so that when a new task gets to
@@ -1535,7 +1543,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
 
        return 0;
@@ -2101,15 +2109,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                int i;
 
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
 
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
 
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
 
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -2141,17 +2151,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
 
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
 
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2193,7 +2200,6 @@ static int sched_balance_self(int cpu, int flag)
                update_shares(sd);
 
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
 
@@ -2202,14 +2208,13 @@ static int sched_balance_self(int cpu, int flag)
                        continue;
                }
 
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
 
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
@@ -2218,10 +2223,10 @@ static int sched_balance_self(int cpu, int flag)
 
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@ -2266,7 +2271,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                cpu = task_cpu(p);
 
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
@@ -2315,7 +2320,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@ -2846,7 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
        struct rq *rq;
 
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
 
@@ -2911,7 +2916,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@ -3086,7 +3091,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3122,10 +3127,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
 
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
 
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
 
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3134,13 +3140,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
 
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
 
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
@@ -3251,8 +3252,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) >
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
@@ -3267,8 +3268,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) <
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
@@ -3394,6 +3395,10 @@ out_balanced:
 
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                return group_min;
        }
 #endif
@@ -3407,16 +3412,16 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
 {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
 
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
 
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
 
                rq = cpu_rq(i);
@@ -3446,7 +3451,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
 {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
@@ -3454,7 +3459,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct rq *busiest;
        unsigned long flags;
 
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
 
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -3514,8 +3519,8 @@ redo:
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
@@ -3532,7 +3537,8 @@ redo:
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@ -3607,7 +3613,7 @@ out:
  */
 static int
 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
 {
        struct sched_group *group;
        struct rq *busiest = NULL;
@@ -3616,7 +3622,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
        int sd_idle = 0;
        int all_pinned = 0;
 
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
 
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -3660,17 +3666,71 @@ redo:
                double_unlock_balance(this_rq, busiest);
 
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
 
        if (!ld_moved) {
+               int active_balance = 0;
+
                schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                        return -1;
+
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+
+               /*
+                * The only task running in a non-idle cpu can be moved to this
+                * cpu in an attempt to completely freeup the other CPU
+                * package. The same method used to move task in load_balance()
+                * have been extended for load_balance_newidle() to speedup
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there are no imbalance, then
+                * f_b_g() will return NULL.  However when sched_mc={1,2} then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there are no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+
+               /*
+                * don't kick the migration_thread, if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+
+               double_unlock_balance(this_rq, busiest);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+
        } else
                sd->nr_balance_failed = 0;
 
@@ -3696,7 +3756,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
 
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -3707,7 +3770,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
 
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
@@ -3722,6 +3785,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
 }
 
 /*
@@ -3759,7 +3823,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
 
@@ -3778,10 +3842,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 #ifdef CONFIG_NO_HZ
 static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
 } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
 };
 
 /*
@@ -3809,7 +3872,7 @@ int select_nohz_load_balancer(int stop_tick)
        int cpu = smp_processor_id();
 
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
 
                /*
@@ -3823,7 +3886,7 @@ int select_nohz_load_balancer(int stop_tick)
                }
 
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
@@ -3836,10 +3899,10 @@ int select_nohz_load_balancer(int stop_tick)
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
 
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
 
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3867,7 +3930,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
 
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3892,7 +3959,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                }
 
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -3926,6 +3993,8 @@ out:
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+
+       free_cpumask_var(tmp);
 }
 
 /*
@@ -3950,12 +4019,13 @@ static void run_rebalance_domains(struct softirq_action *h)
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
-               cpumask_t cpus = nohz.cpu_mask;
                struct rq *rq;
                int balance_cpu;
 
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
+
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@ -3993,7 +4063,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                rq->in_nohz_recently = 0;
 
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
 
@@ -4006,7 +4076,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
 
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
@@ -4018,7 +4088,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
@@ -4028,7 +4098,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
         * someone else, then no need raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
 #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@ -5401,10 +5471,9 @@ out_unlock:
        return retval;
 }
 
-long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
 
@@ -5426,6 +5495,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
        get_task_struct(p);
        read_unlock(&tasklist_lock);
 
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
@@ -5434,37 +5511,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
        if (retval)
                goto out_unlock;
 
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
  again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
 
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
 out_unlock:
+       free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
 }
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
 {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
+
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
 }
 
@@ -5477,17 +5558,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
 {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
 
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
 
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
 }
 
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
        int retval;
@@ -5504,7 +5588,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
        if (retval)
                goto out_unlock;
 
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
 
 out_unlock:
        read_unlock(&tasklist_lock);
@@ -5523,19 +5607,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
 {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
 
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
 
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
 
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
 
-       return sizeof(cpumask_t);
+       return ret;
 }
 
 /**
@@ -5877,7 +5966,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        idle->se.exec_start = sched_clock();
 
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
 
        rq->curr = rq->idle = idle;
@@ -5904,9 +5993,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  * indicates which cpus entered this state. This is used
  * in the rcu update to wait only for active cpus. For system
  * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_MASK_NONE.
+ * always be CPU_BITS_NONE.
  */
-cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+cpumask_var_t nohz_cpu_mask;
 
 /*
  * Increase the granularity value when there are more CPUs,
@@ -5961,7 +6050,7 @@ static inline void sched_init_granularity(void)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
        struct migration_req req;
        unsigned long flags;
@@ -5969,13 +6058,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
        int ret = 0;
 
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
 
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
@@ -5983,15 +6072,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
 
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@ -6033,7 +6122,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
 
        on_rq = p->se.on_rq;
@@ -6130,50 +6219,43 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       /* FIXME: Use cpumask_of_node here. */
+       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+       const struct cpumask *nodemask = &_nodemask;
+
+again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
+
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
+
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
 
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
-
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
-
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+
+move:
+       /* It can have affinity changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
 }
 
 /*
@@ -6185,7 +6267,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
 
        local_irq_save(flags);
@@ -6475,7 +6557,7 @@ static void set_rq_online(struct rq *rq)
        if (!rq->online) {
                const struct sched_class *class;
 
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
 
                for_each_class(class) {
@@ -6495,7 +6577,7 @@ static void set_rq_offline(struct rq *rq)
                                class->rq_offline(rq);
                }
 
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
 }
@@ -6536,7 +6618,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
                        set_rq_online(rq);
                }
@@ -6550,7 +6632,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
@@ -6600,7 +6682,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -6639,13 +6721,13 @@ early_initcall(migration_init);
 #ifdef CONFIG_SCHED_DEBUG
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
 {
        struct sched_group *group = sd->groups;
        char str[256];
 
-       cpulist_scnprintf(str, sizeof(str), sd->span);
-       cpus_clear(*groupmask);
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
 
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 
@@ -6659,11 +6741,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
        printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
@@ -6683,31 +6765,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
 
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
 
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-               cpulist_scnprintf(str, sizeof(str), group->cpumask);
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
 
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
 
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
@@ -6715,7 +6798,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
 
        if (!sd) {
@@ -6725,8 +6808,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
@@ -6739,7 +6821,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6747,7 +6829,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
 
        /* Following flags need at least 2 groups */
@@ -6778,7 +6860,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        if (sd_degenerate(parent))
                return 1;
 
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
 
        /* Does parent contain flags not in child? */
@@ -6802,6 +6884,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        return 1;
 }
 
+static void free_rootdomain(struct root_domain *rd)
+{
+       cpupri_cleanup(&rd->cpupri);
+
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+}
+
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
        unsigned long flags;
@@ -6811,38 +6903,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
 
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
 
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
 
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
 
        atomic_inc(&rd->refcount);
        rq->rd = rd;
 
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-static void init_rootdomain(struct root_domain *rd)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
        memset(rd, 0, sizeof(*rd));
 
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto free_rd;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
 
-       cpupri_init(&rd->cpupri);
+free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+free_online:
+       free_cpumask_var(rd->online);
+free_span:
+       free_cpumask_var(rd->span);
+free_rd:
+       kfree(rd);
+       return -ENOMEM;
 }
 
 static void init_defrootdomain(void)
 {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
+
        atomic_set(&def_root_domain.refcount, 1);
 }
 
@@ -6854,7 +6971,10 @@ static struct root_domain *alloc_rootdomain(void)
        if (!rd)
                return NULL;
 
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
 
        return rd;
 }
@@ -6896,19 +7016,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_var_t cpu_isolated_map;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-       static int __initdata ints[NR_CPUS];
-       int i;
-
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
        return 1;
 }
 
@@ -6917,42 +7030,43 @@ __setup("isolcpus=", isolated_cpu_setup);
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- * (due to the fact that we keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
  *
  * init_sched_build_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
 {
        struct sched_group *first = NULL, *last = NULL;
        int i;
 
-       cpus_clear(*covered);
+       cpumask_clear(covered);
 
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
 
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
 
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
 
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
 
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@ -7016,9 +7130,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
-static void sched_domain_node_span(int node, cpumask_t *span)
+static void sched_domain_node_span(int node, struct cpumask *span)
 {
        nodemask_t used_nodes;
+       /* FIXME: use cpumask_of_node() */
        node_to_cpumask_ptr(nodemask, node);
        int i;
 
@@ -7039,19 +7154,34 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
+/*
+ * The cpus mask in sched_group and sched_domain hangs off the end.
+ * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ * for nr_cpu_ids < CONFIG_NR_CPUS.
+ */
+struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+};
+
+struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+};
+
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
-cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -7060,56 +7190,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
  * multi-core sched-domains:
  */
 #ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 #endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
 {
        int group;
 
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
 }
 #elif defined(CONFIG_SCHED_MC)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
 }
 #endif
 
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
-cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
 {
        int group;
 #ifdef CONFIG_SCHED_MC
+       /* FIXME: Use cpu_coregroup_mask. */
        *mask = cpu_coregroup_map(cpu);
        cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
 #else
        group = cpu;
 #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
 }
 
@@ -7123,19 +7252,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
 {
        int group;
+       /* FIXME: use cpumask_of_node */
+       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, pnodemask, cpu_map);
+       group = cpumask_first(nodemask);
 
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
 }
 
@@ -7147,11 +7278,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
 
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
@@ -7168,11 +7299,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
 {
        int cpu, i;
 
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
 
@@ -7181,10 +7313,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
+                       /* FIXME: Use cpumask_of_node */
+                       node_to_cpumask_ptr(pnodemask, i);
 
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
 
                        if (sg == NULL)
@@ -7202,7 +7335,8 @@ next_sg:
        }
 }
 #else /* !CONFIG_NUMA */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
 {
 }
 #endif /* CONFIG_NUMA */
@@ -7228,7 +7362,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
        WARN_ON(!sd || !sd->groups);
 
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
 
        child = sd->child;
@@ -7293,48 +7427,6 @@ SD_INIT_FUNC(CPU)
  SD_INIT_FUNC(MC)
 #endif
 
-/*
- * To minimize stack usage kmalloc room for cpumasks and share the
- * space as the usage in build_sched_domains() dictates.  Used only
- * if the amount of space is significant.
- */
-struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
-
-#ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
-#endif
-};
-
-#if    NR_CPUS > 128
-#define SCHED_CPUMASK_DECLARE(v)       struct allmasks *v
-static inline void sched_cpumask_alloc(struct allmasks **masks)
-{
-       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
-}
-static inline void sched_cpumask_free(struct allmasks *masks)
-{
-       kfree(masks);
-}
-#else
-#define SCHED_CPUMASK_DECLARE(v)       struct allmasks _v, *v = &_v
-static inline void sched_cpumask_alloc(struct allmasks **masks)
-{ }
-static inline void sched_cpumask_free(struct allmasks *masks)
-{ }
-#endif
-
-#define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
-
 static int default_relax_domain_level = -1;
 
 static int __init setup_relax_domain_level(char *str)
@@ -7374,17 +7466,38 @@ static void set_domain_attribute(struct sched_domain *sd,
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int __build_sched_domains(const cpumask_t *cpu_map,
+static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
 {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
 #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
 
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+#endif
+
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+
+#ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
@@ -7392,54 +7505,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
 #endif
 
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
-#ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
-#endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
 
-       /* get space for all scratch cpumask variables */
-       sched_cpumask_alloc(&allmasks);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
 #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
-#endif
-               return -ENOMEM;
-       }
-
-       tmpmask = (cpumask_t *)allmasks;
-
-
-#ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
 #endif
 
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
 
+               /* FIXME: use cpumask_of_node */
                *nodemask = node_to_cpumask(cpu_to_node(i));
                cpus_and(*nodemask, *nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
@@ -7449,18 +7545,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                sd = &per_cpu(node_domains, i);
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
 #endif
 
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
@@ -7468,11 +7565,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               *sched_domain_span(sd) = cpu_coregroup_map(i);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7480,11 +7578,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7493,13 +7591,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
 
                init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7510,13 +7605,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+       for_each_cpu(i, cpu_map) {
+               /* FIXME: Use cpu_coregroup_mask */
                *this_core_map = cpu_coregroup_map(i);
                cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+               if (i != cpumask_first(this_core_map))
                        continue;
 
                init_sched_build_groups(this_core_map, cpu_map,
@@ -7527,12 +7620,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               if (cpumask_empty(nodemask))
                        continue;
 
                init_sched_build_groups(nodemask, cpu_map,
@@ -7543,8 +7634,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
@@ -7553,58 +7642,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
 
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
+               cpumask_clear(covered);
 
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
 
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
 
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
 
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
 
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
+                       /* FIXME: Use cpumask_of_node */
                        node_to_cpumask_ptr(pnodemask, n);
 
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
 
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       if (cpumask_empty(tmpmask))
                                continue;
 
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
@@ -7612,9 +7701,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
@@ -7623,22 +7712,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
        /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
 #endif
 #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
 #endif
 
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
@@ -7650,53 +7739,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        if (sd_allnodes) {
                struct sched_group *sg;
 
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
 #endif
 
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
 #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
 #endif
                cpu_attach_domain(sd, rd, i);
        }
 
-       sched_cpumask_free(allmasks);
-       return 0;
+       err = 0;
+
+free_tmpmask:
+       free_cpumask_var(tmpmask);
+free_send_covered:
+       free_cpumask_var(send_covered);
+free_this_core_map:
+       free_cpumask_var(this_core_map);
+free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+free_nodemask:
+       free_cpumask_var(nodemask);
+free_notcovered:
+#ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+free_covered:
+       free_cpumask_var(covered);
+free_domainspan:
+       free_cpumask_var(domainspan);
+out:
+#endif
+       return err;
+
+free_sched_groups:
+#ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+#endif
+       goto free_tmpmask;
 
 #ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map, tmpmask);
-       sched_cpumask_free(allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
 #endif
 }
 
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const struct cpumask *cpu_map)
 {
        return __build_sched_domains(cpu_map, NULL);
 }
 
-static cpumask_t *doms_cur;    /* current sched domains */
+static struct cpumask *doms_cur;       /* current sched domains */
 static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
                                /* attribues of custom domains in 'doms_cur' */
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask_t) fails, then fallback to a single sched domain,
- * as determined by the single cpumask_t fallback_doms.
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
  */
-static cpumask_t fallback_doms;
+static cpumask_var_t fallback_doms;
 
 /*
  * arch_update_cpu_topology lets virtualized architectures update the
@@ -7713,16 +7827,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
-static int arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const struct cpumask *cpu_map)
 {
        int err;
 
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
@@ -7730,8 +7844,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
        return err;
 }
 
-static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
 {
        free_sched_groups(cpu_map, tmpmask);
 }
@@ -7740,15 +7854,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
-       cpumask_t tmpmask;
+       /* Save because hotplug lock held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
 
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
 }
 
 /* handle null as "default" */
@@ -7773,7 +7888,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
@@ -7787,13 +7902,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * the single partition 'fallback_doms', it also forces the domains
  * to be rebuilt.
  *
- * If doms_new == NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+/* FIXME: Change to struct cpumask *doms_new[] */
+void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
        int i, j, n;
@@ -7812,7 +7928,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                for (j = 0; j < n && !new_topology; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@ -7824,15 +7940,15 @@ match1:
 
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }
 
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@ -7844,7 +7960,7 @@ match2:
        }
 
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
@@ -7873,14 +7989,25 @@ int arch_reinit_sched_domains(void)
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
        int ret;
+       unsigned int level = 0;
 
-       if (buf[0] != '0' && buf[0] != '1')
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+
+       /*
+        * level is always be positive so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE which is 0
+        * What happens on 0 or 1 byte write,
+        * need to check for count as well?
+        */
+
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;
 
        if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
        else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
 
        ret = arch_reinit_sched_domains();
 
@@ -7984,7 +8111,9 @@ static int update_runtime(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 
 #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7993,10 +8122,10 @@ void __init sched_init_smp(void)
 #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
 
@@ -8011,9 +8140,13 @@ void __init sched_init_smp(void)
        init_hrtick();
 
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -8328,6 +8461,15 @@ void __init sched_init(void)
         */
        current->sched_class = &fair_sched_class;
 
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+#endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+#endif /* SMP */
+
        scheduler_running = 1;
 }
 
index 52154fefab7eef9c8302a7da96741bf59d13c6b6..018b7be1db2e57a219066def4b6efb48043605b9 100644 (file)
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
  * Returns: (int)bool - CPUs were found
  */
 int cpupri_find(struct cpupri *cp, struct task_struct *p,
-               cpumask_t *lowest_mask)
+               struct cpumask *lowest_mask)
 {
        int                  idx      = 0;
        int                  task_pri = convert_prio(p->prio);
 
        for_each_cpupri_active(cp->pri_active, idx) {
                struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
-               cpumask_t mask;
 
                if (idx >= task_pri)
                        break;
 
-               cpus_and(mask, p->cpus_allowed, vec->mask);
-
-               if (cpus_empty(mask))
+               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                        continue;
 
-               *lowest_mask = mask;
+               cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
                return 1;
        }
 
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
                vec->count--;
                if (!vec->count)
                        clear_bit(oldpri, cp->pri_active);
-               cpu_clear(cpu, vec->mask);
+               cpumask_clear_cpu(cpu, vec->mask);
 
                spin_unlock_irqrestore(&vec->lock, flags);
        }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
                spin_lock_irqsave(&vec->lock, flags);
 
-               cpu_set(cpu, vec->mask);
+               cpumask_set_cpu(cpu, vec->mask);
                vec->count++;
                if (vec->count == 1)
                        set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 /**
  * cpupri_init - initialize the cpupri structure
  * @cp: The cpupri context
+ * @bootmem: true if allocations need to use bootmem
  *
- * Returns: (void)
+ * Returns: -ENOMEM if memory fails.
  */
-void cpupri_init(struct cpupri *cp)
+int cpupri_init(struct cpupri *cp, bool bootmem)
 {
        int i;
 
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
 
                spin_lock_init(&vec->lock);
                vec->count = 0;
-               cpus_clear(vec->mask);
+               if (bootmem)
+                       alloc_bootmem_cpumask_var(&vec->mask);
+               else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+                       goto cleanup;
        }
 
        for_each_possible_cpu(i)
                cp->cpu_to_pri[i] = CPUPRI_INVALID;
+       return 0;
+
+cleanup:
+       for (i--; i >= 0; i--)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+       return -ENOMEM;
 }
 
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+       int i;
 
+       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
index f25811b0f931fffb7276a10562cb1e366f12dfdb..642a94ef8a0a7fb39f9b341eb292dfad9ab38762 100644 (file)
@@ -14,7 +14,7 @@
 struct cpupri_vec {
        spinlock_t lock;
        int        count;
-       cpumask_ mask;
+       cpumask_var_t mask;
 };
 
 struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
 int  cpupri_find(struct cpupri *cp,
                 struct task_struct *p, cpumask_t *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
-void cpupri_init(struct cpupri *cp);
+int cpupri_init(struct cpupri *cp, bool bootmem);
+void cpupri_cleanup(struct cpupri *cp);
 #else
 #define cpupri_set(cp, cpu, pri) do { } while (0)
 #define cpupri_init() do { } while (0)
index 5ad4440f0fc44798b84417fa56463dd19ad81267..56c0efe902a79bca1c578aa7fcd5e7d5f0df3144 100644 (file)
@@ -1019,16 +1019,33 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
+ * hence we need to mask them out (cpu_active_mask)
  *
  * Returns the CPU we should wake onto.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 static int wake_idle(int cpu, struct task_struct *p)
 {
-       cpumask_t tmp;
        struct sched_domain *sd;
        int i;
+       unsigned int chosen_wakeup_cpu;
+       int this_cpu;
+
+       /*
+        * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
+        * are idle and this is not a kernel thread and this task's affinity
+        * allows it to be moved to preferred cpu, then just move!
+        */
+
+       this_cpu = smp_processor_id();
+       chosen_wakeup_cpu =
+               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
+
+       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
+               idle_cpu(cpu) && idle_cpu(this_cpu) &&
+               p->mm && !(p->flags & PF_KTHREAD) &&
+               cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
+               return chosen_wakeup_cpu;
 
        /*
         * If it is idle, then it is the best cpu to run this task.
@@ -1046,10 +1063,9 @@ static int wake_idle(int cpu, struct task_struct *p)
                if ((sd->flags & SD_WAKE_IDLE)
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
-                       cpus_and(tmp, sd->span, p->cpus_allowed);
-                       cpus_and(tmp, tmp, cpu_active_map);
-                       for_each_cpu_mask_nr(i, tmp) {
-                               if (idle_cpu(i)) {
+                       for_each_cpu_and(i, sched_domain_span(sd),
+                                        &p->cpus_allowed) {
+                               if (cpu_active(i) && idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
                                                       se.nr_wakeups_idle);
@@ -1242,13 +1258,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
         * this_cpu and prev_cpu are present in:
         */
        for_each_domain(this_cpu, sd) {
-               if (cpu_isset(prev_cpu, sd->span)) {
+               if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
                        this_sd = sd;
                        break;
                }
        }
 
-       if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+       if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
                goto out;
 
        /*
index 51d2af3e6191e0680e7acb894dd48a1b26381e51..833b6d44483c57bb4473b9eddf448a2366302b6a 100644 (file)
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
        if (!rq->online)
                return;
 
-       cpu_set(rq->cpu, rq->rd->rto_mask);
+       cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
 
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
-       cpu_clear(rq->cpu, rq->rd->rto_mask);
+       cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 }
 
 #ifdef CONFIG_SMP
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
        return cpu_rq(smp_processor_id())->rd->span;
 }
 #else
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 #endif
 
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
        return rt_rq->rt_throttled;
 }
 
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
 static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
        int i, weight, more = 0;
        u64 rt_period;
 
-       weight = cpus_weight(rd->span);
+       weight = cpumask_weight(rd->span);
 
        spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
-       for_each_cpu_mask_nr(i, rd->span) {
+       for_each_cpu(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                s64 diff;
 
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
                /*
                 * Greedy reclaim, take back as much as we can.
                 */
-               for_each_cpu_mask(i, rd->span) {
+               for_each_cpu(i, rd->span) {
                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                        s64 diff;
 
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
        int i, idle = 1;
-       cpumask_t span;
+       const struct cpumask *span;
 
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return 1;
 
        span = sched_rt_period_mask();
-       for_each_cpu_mask(i, span) {
+       for_each_cpu(i, span) {
                int enqueue = 0;
                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
                struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-       cpumask_t mask;
+       cpumask_var_t mask;
 
        if (rq->curr->rt.nr_cpus_allowed == 1)
                return;
 
-       if (p->rt.nr_cpus_allowed != 1
-           && cpupri_find(&rq->rd->cpupri, p, &mask))
+       if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
                return;
 
-       if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-               return;
+       if (p->rt.nr_cpus_allowed != 1
+           && cpupri_find(&rq->rd->cpupri, p, mask))
+               goto free;
+
+       if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+               goto free;
 
        /*
         * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
         */
        requeue_task_rt(rq, p, 1);
        resched_task(rq->curr);
+free:
+       free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */
@@ -914,7 +919,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
        if (!task_running(rq, p) &&
-           (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+           (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
            (p->rt.nr_cpus_allowed > 1))
                return 1;
        return 0;
@@ -953,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
        return next;
 }
 
-static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
@@ -973,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 static int find_lowest_rq(struct task_struct *task)
 {
        struct sched_domain *sd;
-       cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+       struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
 
@@ -988,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
         * I guess we might want to change cpupri_find() to ignore those
         * in the first place.
         */
-       cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+       cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
        /*
         * At this point we have built a mask of cpus representing the
@@ -998,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
         * We prioritize the last cpu that the task executed on since
         * it is most likely cache-hot in that location.
         */
-       if (cpu_isset(cpu, *lowest_mask))
+       if (cpumask_test_cpu(cpu, lowest_mask))
                return cpu;
 
        /*
@@ -1013,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
                        cpumask_t domain_mask;
                        int       best_cpu;
 
-                       cpus_and(domain_mask, sd->span, *lowest_mask);
+                       cpumask_and(&domain_mask, sched_domain_span(sd),
+                                   lowest_mask);
 
                        best_cpu = pick_optimal_cpu(this_cpu,
                                                    &domain_mask);
@@ -1054,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
-                                    !cpu_isset(lowest_rq->cpu,
-                                               task->cpus_allowed) ||
+                                    !cpumask_test_cpu(lowest_rq->cpu,
+                                                      &task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
 
@@ -1176,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
 
        next = pick_next_task_rt(this_rq);
 
-       for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+       for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
 
@@ -1305,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 
 static void set_cpus_allowed_rt(struct task_struct *p,
-                               const cpumask_t *new_mask)
+                               const struct cpumask *new_mask)
 {
-       int weight = cpus_weight(*new_mask);
+       int weight = cpumask_weight(new_mask);
 
        BUG_ON(!rt_task(p));
 
@@ -1328,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
                update_rt_migration(rq);
        }
 
-       p->cpus_allowed    = *new_mask;
+       cpumask_copy(&p->cpus_allowed, new_mask);
        p->rt.nr_cpus_allowed = weight;
 }
 
@@ -1371,6 +1377,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
        if (!rq->rt.rt_nr_running)
                pull_rt_task(rq);
 }
+
+static inline void init_sched_rt_class(void)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i)
+               alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1541,3 +1555,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
        rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
+
index 3b01098164c8963b47ad04d27f68eb7b711a37dc..f2773b5d12260bbb10c3b293bd4fd1811b483848 100644 (file)
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;
 
-                       cpumask_scnprintf(mask_str, mask_len, sd->span);
+                       cpumask_scnprintf(mask_str, mask_len,
+                                         sched_domain_span(sd));
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
index bd6be76303cf23b4c52b757a93bf1966bdbb4f0a..6d7dc4ec4aa5735dbfd02ae68593c98ae5bf315d 100644 (file)
@@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
        if (!data)
                return -ENOMEM;
        nla_strlcpy(data, na, len);
-       ret = cpulist_parse(data, *mask);
+       ret = cpulist_parse(data, mask);
        kfree(data);
        return ret;
 }
index f8d968063cea38e348f63590ce73d409385cc5e9..ea2f48af83cff4dde64764da811744f74cddb745 100644 (file)
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+       BUG_ON(!dev->cpumask);
+
        /*
         * A nsec2cyc multiplicator of 0 is invalid and we'd crash
         * on it, so fix it up and emit a warning:
index f98a1b7b16e942018ffe9e5998756405695da9a4..9590af2327be382cc030ef79acdf6d1d26bd9599 100644 (file)
@@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask)
                 */
                cpu = first_cpu(mask);
                td = &per_cpu(tick_cpu_device, cpu);
-               td->evtdev->broadcast(mask);
+               td->evtdev->broadcast(&mask);
        }
 }
 
index df12434b43ca09ec5ea9b0c48d679b842feddc9e..f8372be74122aa3f10b3dfc6642ffde2dc297920 100644 (file)
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
  */
 static void tick_setup_device(struct tick_device *td,
                              struct clock_event_device *newdev, int cpu,
-                             const cpumask_t *cpumask)
+                             const struct cpumask *cpumask)
 {
        ktime_t next_event;
        void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
-       if (!cpus_equal(newdev->cpumask, *cpumask))
-               irq_set_affinity(newdev->irq, *cpumask);
+       if (!cpumask_equal(newdev->cpumask, cpumask))
+               irq_set_affinity(newdev->irq, cpumask);
 
        /*
         * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        spin_lock_irqsave(&tick_device_lock, flags);
 
        cpu = smp_processor_id();
-       if (!cpu_isset(cpu, newdev->cpumask))
+       if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
 
        td = &per_cpu(tick_cpu_device, cpu);
        curdev = td->evtdev;
 
        /* cpu local device ? */
-       if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
+       if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
 
                /*
                 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
-               if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
+               if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
                        goto out_bc;
        }
 
index 8f3fc2582d38b7073927e9dc004ef38e16909ab0..76a574bbef97270672dee229b81b77872ba319e6 100644 (file)
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
        if (!ts->tick_stopped)
                return;
 
-       cpu_clear(cpu, nohz_cpu_mask);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
        now = ktime_get();
        ts->idle_waketime = now;
 
@@ -301,7 +301,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 
                if (delta_jiffies > 1)
-                       cpu_set(cpu, nohz_cpu_mask);
+                       cpumask_set_cpu(cpu, nohz_cpu_mask);
 
                /* Skip reprogram of event if its not changed */
                if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
@@ -319,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                                /*
                                 * sched tick not stopped!
                                 */
-                               cpu_clear(cpu, nohz_cpu_mask);
+                               cpumask_clear_cpu(cpu, nohz_cpu_mask);
                                goto out;
                        }
 
@@ -361,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                 * softirq.
                 */
                tick_do_update_jiffies64(ktime_get());
-               cpu_clear(cpu, nohz_cpu_mask);
+               cpumask_clear_cpu(cpu, nohz_cpu_mask);
        }
        raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -439,7 +439,7 @@ void tick_nohz_restart_sched_tick(void)
        select_nohz_load_balancer(0);
        now = ktime_get();
        tick_do_update_jiffies64(now);
-       cpu_clear(cpu, nohz_cpu_mask);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
        /*
         * We stopped the tick in idle. Update process times would miss the
index 4185d5221633f19755efb818a1fb969a69d0d29c..0e91f43b6baf53cac177af4ca02f0939997c1a0c 100644 (file)
@@ -2674,7 +2674,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
 
        mutex_lock(&tracing_cpumask_update_lock);
 
-       len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+       len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
        if (count - len < 2) {
                count = -EINVAL;
                goto out_err;
@@ -2695,7 +2695,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
        int err, cpu;
 
        mutex_lock(&tracing_cpumask_update_lock);
-       err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+       err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
        if (err)
                goto err_unlock;
 
index fd4118e097f0711c9c08cf6e08575678fcc5ec16..2ba43c4a5b075a28cb3dd05ae3f5db46fee23661 100644 (file)
@@ -159,4 +159,11 @@ config CHECK_SIGNATURE
 config HAVE_LMB
        boolean
 
+config CPUMASK_OFFSTACK
+       bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+       help
+         Use dynamic allocation for cpumask_var_t, instead of putting
+         them on the stack.  This is a bit more expensive, but avoids
+         stack overflow.
+
 endmenu
index 6cb7ad10785227f2b889bcff96d6610ddeabe37a..0d861c3154b6eeed56981c33b3505ecca87d5712 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3642,7 +3642,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
                                len < PAGE_SIZE - 60) {
                        len += sprintf(buf + len, " cpus=");
                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
-                                       l->cpus);
+                                       &l->cpus);
                }
 
                if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
index 53772bb46320ae9e1fbf84297a6a4707e5b3e0ed..23b81cf242af739675f01303eabb3cebb71b09a7 100644 (file)
@@ -150,10 +150,11 @@ static int ioapic_inj_irq(struct kvm_ioapic *ioapic,
 static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 {
        kvm_inject_nmi(vcpu);
+       kvm_vcpu_kick(vcpu);
 }
 
-static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-                                      u8 dest_mode)
+u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+                                   u8 dest_mode)
 {
        u32 mask = 0;
        int i;
@@ -207,7 +208,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
                     "vector=%x trig_mode=%x\n",
                     dest, dest_mode, delivery_mode, vector, trig_mode);
 
-       deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
+       deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
+                                                         dest_mode);
        if (!deliver_bitmask) {
                ioapic_debug("no target on destination\n");
                return 0;
index cd7ae7691c9d68eb3c066b6c47bbc4749c85e18d..49c9581d25860edd3ad76ba2991a05b9dc9ba6c8 100644 (file)
@@ -85,5 +85,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+                               u8 dest_mode);
 
 #endif
index 55ad76ee2d0913dbb52fcfa220687c33a3c535ef..aa5d1e5c497ef12e3fe9aa12b79865a2081e4b43 100644 (file)
@@ -61,10 +61,9 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
        hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
 }
 
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-                                    struct kvm_irq_ack_notifier *kian)
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
 {
-       hlist_del(&kian->link);
+       hlist_del_init(&kian->link);
 }
 
 /* The caller must hold kvm->lock mutex */
@@ -73,11 +72,15 @@ int kvm_request_irq_source_id(struct kvm *kvm)
        unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
        int irq_source_id = find_first_zero_bit(bitmap,
                                sizeof(kvm->arch.irq_sources_bitmap));
+
        if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
                printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
-               irq_source_id = -EFAULT;
-       } else
-               set_bit(irq_source_id, bitmap);
+               return -EFAULT;
+       }
+
+       ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+       set_bit(irq_source_id, bitmap);
+
        return irq_source_id;
 }
 
@@ -85,7 +88,9 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 {
        int i;
 
-       if (irq_source_id <= 0 ||
+       ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+
+       if (irq_source_id < 0 ||
            irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
                printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
                return;
index a87f45edfae848e07c295c7623097e235ffe8c78..fc6127cbea1f09e98c7d5e05bcec17e733a5b165 100644 (file)
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#ifdef CONFIG_X86
+#include <asm/msidef.h>
+#endif
+
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 #include "coalesced_mmio.h"
 #endif
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static int msi2intx = 1;
+module_param(msi2intx, bool, 0);
+
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
-static cpumask_t cpus_hardware_enabled;
+static cpumask_var_t cpus_hardware_enabled;
 
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -75,9 +82,60 @@ struct dentry *kvm_debugfs_dir;
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
 
-bool kvm_rebooting;
+static bool kvm_rebooting;
 
 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
+
+#ifdef CONFIG_X86
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev)
+{
+       int vcpu_id;
+       struct kvm_vcpu *vcpu;
+       struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm);
+       int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK)
+                       >> MSI_ADDR_DEST_ID_SHIFT;
+       int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK)
+                       >> MSI_DATA_VECTOR_SHIFT;
+       int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+                               (unsigned long *)&dev->guest_msi.address_lo);
+       int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+                               (unsigned long *)&dev->guest_msi.data);
+       int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
+                               (unsigned long *)&dev->guest_msi.data);
+       u32 deliver_bitmask;
+
+       BUG_ON(!ioapic);
+
+       deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+                               dest_id, dest_mode);
+       /* IOAPIC delivery mode value is the same as MSI here */
+       switch (delivery_mode) {
+       case IOAPIC_LOWEST_PRIORITY:
+               vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+                               deliver_bitmask);
+               if (vcpu != NULL)
+                       kvm_apic_set_irq(vcpu, vector, trig_mode);
+               else
+                       printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
+               break;
+       case IOAPIC_FIXED:
+               for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+                       if (!(deliver_bitmask & (1 << vcpu_id)))
+                               continue;
+                       deliver_bitmask &= ~(1 << vcpu_id);
+                       vcpu = ioapic->kvm->vcpus[vcpu_id];
+                       if (vcpu)
+                               kvm_apic_set_irq(vcpu, vector, trig_mode);
+               }
+               break;
+       default:
+               printk(KERN_INFO "kvm: unsupported MSI delivery mode\n");
+       }
+}
+#else
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {}
+#endif
+
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
                                                      int assigned_dev_id)
 {
@@ -104,9 +162,16 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
         * finer-grained lock, update this
         */
        mutex_lock(&assigned_dev->kvm->lock);
-       kvm_set_irq(assigned_dev->kvm,
-                   assigned_dev->irq_source_id,
-                   assigned_dev->guest_irq, 1);
+       if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX)
+               kvm_set_irq(assigned_dev->kvm,
+                           assigned_dev->irq_source_id,
+                           assigned_dev->guest_irq, 1);
+       else if (assigned_dev->irq_requested_type &
+                               KVM_ASSIGNED_DEV_GUEST_MSI) {
+               assigned_device_msi_dispatch(assigned_dev);
+               enable_irq(assigned_dev->host_irq);
+               assigned_dev->host_irq_disabled = false;
+       }
        mutex_unlock(&assigned_dev->kvm->lock);
        kvm_put_kvm(assigned_dev->kvm);
 }
@@ -117,8 +182,12 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
                (struct kvm_assigned_dev_kernel *) dev_id;
 
        kvm_get_kvm(assigned_dev->kvm);
+
        schedule_work(&assigned_dev->interrupt_work);
+
        disable_irq_nosync(irq);
+       assigned_dev->host_irq_disabled = true;
+
        return IRQ_HANDLED;
 }
 
@@ -132,19 +201,32 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
        dev = container_of(kian, struct kvm_assigned_dev_kernel,
                           ack_notifier);
+
        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-       enable_irq(dev->host_irq);
+
+       /* The guest irq may be shared so this ack may be
+        * from another device.
+        */
+       if (dev->host_irq_disabled) {
+               enable_irq(dev->host_irq);
+               dev->host_irq_disabled = false;
+       }
 }
 
-static void kvm_free_assigned_device(struct kvm *kvm,
-                                    struct kvm_assigned_dev_kernel
-                                    *assigned_dev)
+static void kvm_free_assigned_irq(struct kvm *kvm,
+                                 struct kvm_assigned_dev_kernel *assigned_dev)
 {
-       if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
-               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+       if (!irqchip_in_kernel(kvm))
+               return;
+
+       kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 
-       kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
-       kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+       if (assigned_dev->irq_source_id != -1)
+               kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+       assigned_dev->irq_source_id = -1;
+
+       if (!assigned_dev->irq_requested_type)
+               return;
 
        if (cancel_work_sync(&assigned_dev->interrupt_work))
                /* We had pending work. That means we will have to take
@@ -152,6 +234,23 @@ static void kvm_free_assigned_device(struct kvm *kvm,
                 */
                kvm_put_kvm(kvm);
 
+       free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+       if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+               pci_disable_msi(assigned_dev->dev);
+
+       assigned_dev->irq_requested_type = 0;
+}
+
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+                                    struct kvm_assigned_dev_kernel
+                                    *assigned_dev)
+{
+       kvm_free_assigned_irq(kvm, assigned_dev);
+
+       pci_reset_function(assigned_dev->dev);
+
        pci_release_regions(assigned_dev->dev);
        pci_disable_device(assigned_dev->dev);
        pci_dev_put(assigned_dev->dev);
@@ -174,6 +273,95 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
        }
 }
 
+static int assigned_device_update_intx(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *adev,
+                       struct kvm_assigned_irq *airq)
+{
+       adev->guest_irq = airq->guest_irq;
+       adev->ack_notifier.gsi = airq->guest_irq;
+
+       if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
+               return 0;
+
+       if (irqchip_in_kernel(kvm)) {
+               if (!msi2intx &&
+                   adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
+                       free_irq(adev->host_irq, (void *)kvm);
+                       pci_disable_msi(adev->dev);
+               }
+
+               if (!capable(CAP_SYS_RAWIO))
+                       return -EPERM;
+
+               if (airq->host_irq)
+                       adev->host_irq = airq->host_irq;
+               else
+                       adev->host_irq = adev->dev->irq;
+
+               /* Even though this is PCI, we don't want to use shared
+                * interrupts. Sharing host devices with guest-assigned devices
+                * on the same interrupt line is not a happy situation: there
+                * are going to be long delays in accepting, acking, etc.
+                */
+               if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
+                               0, "kvm_assigned_intx_device", (void *)adev))
+                       return -EIO;
+       }
+
+       adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
+                                  KVM_ASSIGNED_DEV_HOST_INTX;
+       return 0;
+}
+
+#ifdef CONFIG_X86
+static int assigned_device_update_msi(struct kvm *kvm,
+                       struct kvm_assigned_dev_kernel *adev,
+                       struct kvm_assigned_irq *airq)
+{
+       int r;
+
+       if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
+               /* x86 don't care upper address of guest msi message addr */
+               adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
+               adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
+               adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
+               adev->guest_msi.data = airq->guest_msi.data;
+               adev->ack_notifier.gsi = -1;
+       } else if (msi2intx) {
+               adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
+               adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
+               adev->guest_irq = airq->guest_irq;
+               adev->ack_notifier.gsi = airq->guest_irq;
+       }
+
+       if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+               return 0;
+
+       if (irqchip_in_kernel(kvm)) {
+               if (!msi2intx) {
+                       if (adev->irq_requested_type &
+                                       KVM_ASSIGNED_DEV_HOST_INTX)
+                               free_irq(adev->host_irq, (void *)adev);
+
+                       r = pci_enable_msi(adev->dev);
+                       if (r)
+                               return r;
+               }
+
+               adev->host_irq = adev->dev->irq;
+               if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
+                               "kvm_assigned_msi_device", (void *)adev))
+                       return -EIO;
+       }
+
+       if (!msi2intx)
+               adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;
+
+       adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
+       return 0;
+}
+#endif
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
                                   struct kvm_assigned_irq
                                   *assigned_irq)
@@ -190,49 +378,68 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
                return -EINVAL;
        }
 
-       if (match->irq_requested) {
-               match->guest_irq = assigned_irq->guest_irq;
-               match->ack_notifier.gsi = assigned_irq->guest_irq;
-               mutex_unlock(&kvm->lock);
-               return 0;
-       }
+       if (!match->irq_requested_type) {
+               INIT_WORK(&match->interrupt_work,
+                               kvm_assigned_dev_interrupt_work_handler);
+               if (irqchip_in_kernel(kvm)) {
+                       /* Register ack nofitier */
+                       match->ack_notifier.gsi = -1;
+                       match->ack_notifier.irq_acked =
+                                       kvm_assigned_dev_ack_irq;
+                       kvm_register_irq_ack_notifier(kvm,
+                                       &match->ack_notifier);
+
+                       /* Request IRQ source ID */
+                       r = kvm_request_irq_source_id(kvm);
+                       if (r < 0)
+                               goto out_release;
+                       else
+                               match->irq_source_id = r;
 
-       INIT_WORK(&match->interrupt_work,
-                 kvm_assigned_dev_interrupt_work_handler);
+#ifdef CONFIG_X86
+                       /* Determine host device irq type, we can know the
+                        * result from dev->msi_enabled */
+                       if (msi2intx)
+                               pci_enable_msi(match->dev);
+#endif
+               }
+       }
 
-       if (irqchip_in_kernel(kvm)) {
-               if (!capable(CAP_SYS_RAWIO)) {
-                       r = -EPERM;
+       if ((!msi2intx &&
+            (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) ||
+           (msi2intx && match->dev->msi_enabled)) {
+#ifdef CONFIG_X86
+               r = assigned_device_update_msi(kvm, match, assigned_irq);
+               if (r) {
+                       printk(KERN_WARNING "kvm: failed to enable "
+                                       "MSI device!\n");
                        goto out_release;
                }
-
-               if (assigned_irq->host_irq)
-                       match->host_irq = assigned_irq->host_irq;
-               else
-                       match->host_irq = match->dev->irq;
-               match->guest_irq = assigned_irq->guest_irq;
-               match->ack_notifier.gsi = assigned_irq->guest_irq;
-               match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-               kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
-               r = kvm_request_irq_source_id(kvm);
-               if (r < 0)
+#else
+               r = -ENOTTY;
+#endif
+       } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
+               /* Host device IRQ 0 means don't support INTx */
+               if (!msi2intx) {
+                       printk(KERN_WARNING
+                              "kvm: wait device to enable MSI!\n");
+                       r = 0;
+               } else {
+                       printk(KERN_WARNING
+                              "kvm: failed to enable MSI device!\n");
+                       r = -ENOTTY;
                        goto out_release;
-               else
-                       match->irq_source_id = r;
-
-               /* Even though this is PCI, we don't want to use shared
-                * interrupts. Sharing host devices with guest-assigned devices
-                * on the same interrupt line is not a happy situation: there
-                * are going to be long delays in accepting, acking, etc.
-                */
-               if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
-                               "kvm_assigned_device", (void *)match)) {
-                       r = -EIO;
+               }
+       } else {
+               /* Non-sharing INTx mode */
+               r = assigned_device_update_intx(kvm, match, assigned_irq);
+               if (r) {
+                       printk(KERN_WARNING "kvm: failed to enable "
+                                       "INTx device!\n");
                        goto out_release;
                }
        }
 
-       match->irq_requested = true;
        mutex_unlock(&kvm->lock);
        return r;
 out_release:
@@ -283,11 +490,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
                       __func__);
                goto out_disable;
        }
+
+       pci_reset_function(dev);
+
        match->assigned_dev_id = assigned_dev->assigned_dev_id;
        match->host_busnr = assigned_dev->busnr;
        match->host_devfn = assigned_dev->devfn;
        match->dev = dev;
-
+       match->irq_source_id = -1;
        match->kvm = kvm;
 
        list_add(&match->list, &kvm->arch.assigned_dev_head);
@@ -355,57 +565,48 @@ static void ack_flush(void *_completed)
 {
 }
 
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
        int i, cpu, me;
-       cpumask_t cpus;
+       cpumask_var_t cpus;
+       bool called = true;
        struct kvm_vcpu *vcpu;
 
+       if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
+               cpumask_clear(cpus);
+
        me = get_cpu();
-       cpus_clear(cpus);
        for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                vcpu = kvm->vcpus[i];
                if (!vcpu)
                        continue;
-               if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+               if (test_and_set_bit(req, &vcpu->requests))
                        continue;
                cpu = vcpu->cpu;
-               if (cpu != -1 && cpu != me)
-                       cpu_set(cpu, cpus);
+               if (cpus != NULL && cpu != -1 && cpu != me)
+                       cpumask_set_cpu(cpu, cpus);
        }
-       if (cpus_empty(cpus))
-               goto out;
-       ++kvm->stat.remote_tlb_flush;
-       smp_call_function_mask(cpus, ack_flush, NULL, 1);
-out:
+       if (unlikely(cpus == NULL))
+               smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
+       else if (!cpumask_empty(cpus))
+               smp_call_function_many(cpus, ack_flush, NULL, 1);
+       else
+               called = false;
        put_cpu();
+       free_cpumask_var(cpus);
+       return called;
 }
 
-void kvm_reload_remote_mmus(struct kvm *kvm)
+void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-       int i, cpu, me;
-       cpumask_t cpus;
-       struct kvm_vcpu *vcpu;
-
-       me = get_cpu();
-       cpus_clear(cpus);
-       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-               vcpu = kvm->vcpus[i];
-               if (!vcpu)
-                       continue;
-               if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
-                       continue;
-               cpu = vcpu->cpu;
-               if (cpu != -1 && cpu != me)
-                       cpu_set(cpu, cpus);
-       }
-       if (cpus_empty(cpus))
-               goto out;
-       smp_call_function_mask(cpus, ack_flush, NULL, 1);
-out:
-       put_cpu();
+       if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+               ++kvm->stat.remote_tlb_flush;
 }
 
+void kvm_reload_remote_mmus(struct kvm *kvm)
+{
+       make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
+}
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
@@ -710,6 +911,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
                goto out;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                goto out;
+       if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
+               goto out;
        if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
                goto out;
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -821,7 +1024,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
                goto out_free;
        }
 
-       kvm_free_physmem_slot(&old, &new);
+       kvm_free_physmem_slot(&old, npages ? &new : NULL);
+       /* Slot deletion case: we have to update the current slot */
+       if (!npages)
+               *memslot = old;
 #ifdef CONFIG_DMAR
        /* map the pages in iommu page table */
        r = kvm_iommu_map_pages(kvm, base_gfn, npages);
@@ -918,7 +1124,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
        int i;
 
@@ -931,11 +1137,12 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
        }
        return NULL;
 }
+EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
        gfn = unalias_gfn(kvm, gfn);
-       return __gfn_to_memslot(kvm, gfn);
+       return gfn_to_memslot_unaliased(kvm, gfn);
 }
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -959,7 +1166,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
        struct kvm_memory_slot *slot;
 
        gfn = unalias_gfn(kvm, gfn);
-       slot = __gfn_to_memslot(kvm, gfn);
+       slot = gfn_to_memslot_unaliased(kvm, gfn);
        if (!slot)
                return bad_hva();
        return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
@@ -1210,7 +1417,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
        struct kvm_memory_slot *memslot;
 
        gfn = unalias_gfn(kvm, gfn);
-       memslot = __gfn_to_memslot(kvm, gfn);
+       memslot = gfn_to_memslot_unaliased(kvm, gfn);
        if (memslot && memslot->dirty_bitmap) {
                unsigned long rel_gfn = gfn - memslot->base_gfn;
 
@@ -1295,7 +1502,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
        return 0;
 }
 
-static const struct file_operations kvm_vcpu_fops = {
+static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .compat_ioctl   = kvm_vcpu_ioctl,
@@ -1689,7 +1896,7 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-static const struct file_operations kvm_vm_fops = {
+static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .compat_ioctl   = kvm_vm_ioctl,
@@ -1711,6 +1918,18 @@ static int kvm_dev_ioctl_create_vm(void)
        return fd;
 }
 
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+       switch (arg) {
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+               return 1;
+       default:
+               break;
+       }
+       return kvm_dev_ioctl_check_extension(arg);
+}
+
 static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
 {
@@ -1730,7 +1949,7 @@ static long kvm_dev_ioctl(struct file *filp,
                r = kvm_dev_ioctl_create_vm();
                break;
        case KVM_CHECK_EXTENSION:
-               r = kvm_dev_ioctl_check_extension(arg);
+               r = kvm_dev_ioctl_check_extension_generic(arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
                r = -EINVAL;
@@ -1771,9 +1990,9 @@ static void hardware_enable(void *junk)
 {
        int cpu = raw_smp_processor_id();
 
-       if (cpu_isset(cpu, cpus_hardware_enabled))
+       if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
-       cpu_set(cpu, cpus_hardware_enabled);
+       cpumask_set_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_enable(NULL);
 }
 
@@ -1781,9 +2000,9 @@ static void hardware_disable(void *junk)
 {
        int cpu = raw_smp_processor_id();
 
-       if (!cpu_isset(cpu, cpus_hardware_enabled))
+       if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
-       cpu_clear(cpu, cpus_hardware_enabled);
+       cpumask_clear_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_disable(NULL);
 }
 
@@ -2017,9 +2236,14 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
        bad_pfn = page_to_pfn(bad_page);
 
+       if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+               r = -ENOMEM;
+               goto out_free_0;
+       }
+
        r = kvm_arch_hardware_setup();
        if (r < 0)
-               goto out_free_0;
+               goto out_free_0a;
 
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu,
@@ -2053,6 +2277,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
        }
 
        kvm_chardev_ops.owner = module;
+       kvm_vm_fops.owner = module;
+       kvm_vcpu_fops.owner = module;
 
        r = misc_register(&kvm_dev);
        if (r) {
@@ -2062,6 +2288,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;
+#ifndef CONFIG_X86
+       msi2intx = 0;
+#endif
 
        return 0;
 
@@ -2078,6 +2307,8 @@ out_free_2:
        on_each_cpu(hardware_disable, NULL, 1);
 out_free_1:
        kvm_arch_hardware_unsetup();
+out_free_0a:
+       free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        __free_page(bad_page);
 out:
@@ -2101,6 +2332,7 @@ void kvm_exit(void)
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_exit_debug();
+       free_cpumask_var(cpus_hardware_enabled);
        __free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
index 41dcc845f78c34d65a9d13e72cc7023a8a14f142..f59874446440cab9fc51df1a55ac1a552f5c47ce 100644 (file)
@@ -252,6 +252,7 @@ void kvm_trace_cleanup(void)
                        struct kvm_trace_probe *p = &kvm_trace_probes[i];
                        marker_probe_unregister(p->name, p->probe_func, p);
                }
+               marker_synchronize_unregister();
 
                relay_close(kt->rchan);
                debugfs_remove(kt->lost_file);