#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvc-console.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/msr-index.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/pgalloc.h>
#include "xen-ops.h"
#include "mmu.h"
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
/*
* Note about cr3 (pagetable base) values:
*
static void __init xen_banner(void)
{
+ unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
+ struct xen_extraversion extra;
+ HYPERVISOR_xen_version(XENVER_extraversion, &extra);
+
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
pv_info.name);
- printk(KERN_INFO "Hypervisor signature: %s%s\n",
- xen_start_info->magic,
+ printk(KERN_INFO "Xen version: %d.%d%s%s\n",
+ version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
return HYPERVISOR_get_debugreg(reg);
}
-static unsigned long xen_save_fl(void)
+static void xen_leave_lazy(void)
{
- struct vcpu_info *vcpu;
- unsigned long flags;
-
- vcpu = x86_read_percpu(xen_vcpu);
-
- /* flag has opposite sense of mask */
- flags = !vcpu->evtchn_upcall_mask;
-
- /* convert to IF type flag
- -0 -> 0x00000000
- -1 -> 0xffffffff
- */
- return (-flags) & X86_EFLAGS_IF;
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ xen_mc_flush();
}
-static void xen_restore_fl(unsigned long flags)
+static unsigned long xen_store_tr(void)
{
- struct vcpu_info *vcpu;
-
- /* convert from IF type flag */
- flags = !(flags & X86_EFLAGS_IF);
-
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = flags;
- preempt_enable_no_resched();
-
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
-
- if (flags == 0) {
- preempt_check_resched();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
- }
+ return 0;
}
-static void xen_irq_disable(void)
+/*
+ * Set the page permissions for a particular virtual address. If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
{
- /* There's a one instruction preempt window here. We need to
- make sure we're don't switch CPUs between getting the vcpu
- pointer and updating the mask. */
- preempt_disable();
- x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
- preempt_enable_no_resched();
-}
+ int level;
+ pte_t *ptep;
+ pte_t pte;
+ unsigned long pfn;
+ struct page *page;
-static void xen_irq_enable(void)
-{
- struct vcpu_info *vcpu;
+ ptep = lookup_address((unsigned long)v, &level);
+ BUG_ON(ptep == NULL);
- /* We don't need to worry about being preempted here, since
- either a) interrupts are disabled, so no preemption, or b)
- the caller is confused and is trying to re-enable interrupts
- on an indeterminate processor. */
+ pfn = pte_pfn(*ptep);
+ page = pfn_to_page(pfn);
- vcpu = x86_read_percpu(xen_vcpu);
- vcpu->evtchn_upcall_mask = 0;
+ pte = pfn_pte(pfn, prot);
- /* Doesn't matter if we get preempted here, because any
- pending event will get dealt with anyway. */
+ if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+ BUG();
- barrier(); /* unmask then check (avoid races) */
- if (unlikely(vcpu->evtchn_upcall_pending))
- force_evtchn_callback();
-}
+ if (!PageHighMem(page)) {
+ void *av = __va(PFN_PHYS(pfn));
-static void xen_safe_halt(void)
-{
- /* Blocking includes an implicit local_irq_enable(). */
- if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
- BUG();
+ if (av != v)
+ if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+ BUG();
+ } else
+ kmap_flush_unused();
}
-static void xen_halt(void)
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
- if (irqs_disabled())
- HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
- else
- xen_safe_halt();
-}
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
-static void xen_leave_lazy(void)
-{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
- xen_mc_flush();
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
-static unsigned long xen_store_tr(void)
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
- return 0;
+ const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+ int i;
+
+ for(i = 0; i < entries; i += entries_per_page)
+ set_aliased_prot(ldt + i, PAGE_KERNEL);
}
static void xen_set_ldt(const void *addr, unsigned entries)
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
- xen_mc_batch();
-
- load_TLS_descriptor(t, cpu, 0);
- load_TLS_descriptor(t, cpu, 1);
- load_TLS_descriptor(t, cpu, 2);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone,
* it means we're in a context switch, and %gs has just been
* Either way, it has been saved, and the new value will get
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
+ *
+ * On x86_64, this hack is not used for %gs, because gs points
+ * to KERNEL_GS_BASE (and uses it for PDA references), so we
+ * must not zero %gs on x86_64
+ *
+ * For x86_64, we need to zero %fs, otherwise we may get an
+ * exception between the new %fs descriptor being loaded and
+ * %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
loadsegment(gs, 0);
+#else
+ loadsegment(fs, 0);
+#endif
+ }
+
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+ BUG();
}
+#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
{
- unsigned long lp = (unsigned long)&dt[entrynum];
- xmaddr_t mach_lp = virt_to_machine(lp);
+ xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
u64 entry = *(u64 *)ptr;
preempt_disable();
}
static void xen_load_sp0(struct tss_struct *tss,
- struct thread_struct *thread)
+ struct thread_struct *thread)
{
struct multicall_space mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
x86_write_percpu(xen_current_cr3, (unsigned long)v);
}
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
struct mmuext_op *op;
struct multicall_space mcs;
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ unsigned long mfn;
- BUG_ON(preemptible());
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
- mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
+ WARN_ON(mfn == 0 && kernel);
- /* Update while interrupts are disabled, so its atomic with
- respect to ipis */
- x86_write_percpu(xen_cr3, cr3);
+ mcs = __xen_mc_entry(sizeof(*op));
op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
+ op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
op->arg1.mfn = mfn;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- /* Update xen_update_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
+ if (kernel) {
+ x86_write_percpu(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+ BUG_ON(preemptible());
+
+ xen_mc_batch(); /* disables interrupts */
+
+ /* Update while interrupts are disabled, so its atomic with
+ respect to ipis */
+ x86_write_percpu(xen_cr3, cr3);
+
+ __xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+ int ret;
+
+ ret = 0;
+
+ switch(msr) {
+#ifdef CONFIG_X86_64
+ unsigned which;
+ u64 base;
+
+ case MSR_FS_BASE: which = SEGBASE_FS; goto set;
+ case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
+ case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
+
+ set:
+ base = ((u64)high << 32) | low;
+ if (HYPERVISOR_set_segment_base(which, base) != 0)
+ ret = -EFAULT;
+ break;
+#endif
+
+ case MSR_STAR:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ case MSR_SYSCALL_MASK:
+ case MSR_IA32_SYSENTER_CS:
+ case MSR_IA32_SYSENTER_ESP:
+ case MSR_IA32_SYSENTER_EIP:
+ /* Fast syscall setup is all done in hypercalls, so
+ these are all ignored. Stub them out here to stop
+ Xen console noise. */
+ break;
+
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
+
+ return ret;
+}
+
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
+
+ BUG_ON(page->private != 0);
+
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
+}
+
/* This should never happen until we're OK to use struct page */
static void xen_release_ptpage(u32 pfn, unsigned level)
{
pv_mmu_ops.release_pud = xen_release_pud;
#endif
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_mark_init_mm_pinned();
}
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
}
__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
}
static const struct pv_info xen_info __initdata = {
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
+ .write_msr = xen_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+ .usergs_sysret32 = xen_sysret32,
+ .usergs_sysret64 = xen_sysret64,
+#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = xen_load_gs_index,
+#endif
+
+ .alloc_ldt = xen_alloc_ldt,
+ .free_ldt = xen_free_ldt,
.store_gdt = native_store_gdt,
.store_idt = native_store_idt,
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
+ /* Xen takes care of %gs when switching to usermode for us */
+ .swapgs = paravirt_nop,
+
.lazy_mode = {
.enter = paravirt_enter_lazy_cpu,
.leave = xen_leave_lazy,
},
};
-static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = xen_init_IRQ,
- .save_fl = xen_save_fl,
- .restore_fl = xen_restore_fl,
- .irq_disable = xen_irq_disable,
- .irq_enable = xen_irq_enable,
- .safe_halt = xen_safe_halt,
- .halt = xen_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
-};
-
static const struct pv_apic_ops xen_apic_ops __initdata = {
#ifdef CONFIG_X86_LOCAL_APIC
.apic_write = xen_apic_write,
- .apic_write_atomic = xen_apic_write,
.apic_read = xen_apic_read,
.setup_boot_clock = paravirt_nop,
.setup_secondary_clock = paravirt_nop,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
.pte_val = xen_pte_val,
- .pte_flags = native_pte_val,
+ .pte_flags = native_pte_flags,
.pgd_val = xen_pgd_val,
.make_pte = xen_make_pte,
{
phys_addr_t paddr;
- maddr &= PTE_MASK;
+ maddr &= PTE_PFN_MASK;
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
return paddr;
BUG();
}
-/*
- * Identity map, in addition to plain kernel map. This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Switch over */
pgd = init_level4_pgt;
- xen_write_cr3(__pa(pgd));
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(pgd));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
reserve_early(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
pv_init_ops = xen_init_ops;
pv_time_ops = xen_time_ops;
pv_cpu_ops = xen_cpu_ops;
- pv_irq_ops = xen_irq_ops;
pv_apic_ops = xen_apic_ops;
pv_mmu_ops = xen_mmu_ops;
+ xen_init_irq_ops();
+
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
if (!is_initial_xendomain()) {
add_preferred_console("xenboot", 0, NULL);