arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/bug.h>
  44
  45 #include <asm/pgtable.h>
  46 #include <asm/tlbflush.h>
  47 #include <asm/mmu_context.h>
  48 #include <asm/paravirt.h>
  49 #include <asm/linkage.h>
  50
  51 #include <asm/xen/hypercall.h>
  52 #include <asm/xen/hypervisor.h>
  53
  54 #include <xen/page.h>
  55 #include <xen/interface/xen.h>
  56
  57 #include "multicalls.h"
  58 #include "mmu.h"
  59
/* Number of unsigned long p2m entries that fit in a single page */
#define P2M_ENTRIES_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long))
/* Number of p2m pages needed to cover MAX_DOMAIN_PAGES pfns */
#define TOP_ENTRIES             (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)

/*
 * Placeholder for holes in the address space: one shared page-sized
 * array in which every entry is ~0UL.
 */
static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
                { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };

 /*
  * Array of pointers to pages containing p2m entries.  Every slot
  * starts out pointing at p2m_missing.
  */
static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
                { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };

/* Arrays of p2m arrays expressed in mfns used for save/restore */
static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;

/*
 * One mfn per page-sized slice of p2m_top_mfn[]; filled in, and its
 * own mfn handed to the hypervisor, by xen_setup_mfn_list_list().
 */
static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
        __page_aligned_bss;
  76
  77 static inline unsigned p2m_top_index(unsigned long pfn)
  78 {
  79         BUG_ON(pfn >= MAX_DOMAIN_PAGES);
  80         return pfn / P2M_ENTRIES_PER_PAGE;
  81 }
  82
  83 static inline unsigned p2m_index(unsigned long pfn)
  84 {
  85         return pfn % P2M_ENTRIES_PER_PAGE;
  86 }
  87
  88 /* Build the parallel p2m_top_mfn structures */
  89 void xen_setup_mfn_list_list(void)
  90 {
  91         unsigned pfn, idx;
  92
  93         for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
  94                 unsigned topidx = p2m_top_index(pfn);
  95
  96                 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
  97         }
  98
  99         for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
 100                 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 101                 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 102         }
 103
 104         BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 105
 106         HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
 107                 virt_to_mfn(p2m_top_mfn_list);
 108         HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
 109 }
 110
 111 /* Set up p2m_top to point to the domain-builder provided p2m pages */
 112 void __init xen_build_dynamic_phys_to_machine(void)
 113 {
 114         unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
 115         unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
 116         unsigned pfn;
 117
 118         for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
 119                 unsigned topidx = p2m_top_index(pfn);
 120
 121                 p2m_top[topidx] = &mfn_list[pfn];
 122         }
 123 }
 124
 125 unsigned long get_phys_to_machine(unsigned long pfn)
 126 {
 127         unsigned topidx, idx;
 128
 129         if (unlikely(pfn >= MAX_DOMAIN_PAGES))
 130                 return INVALID_P2M_ENTRY;
 131
 132         topidx = p2m_top_index(pfn);
 133         idx = p2m_index(pfn);
 134         return p2m_top[topidx][idx];
 135 }
 136 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 137
 138 static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
 139 {
 140         unsigned long *p;
 141         unsigned i;
 142
 143         p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
 144         BUG_ON(p == NULL);
 145
 146         for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 147                 p[i] = INVALID_P2M_ENTRY;
 148
 149         if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
 150                 free_page((unsigned long)p);
 151         else
 152                 *mfnp = virt_to_mfn(p);
 153 }
 154
 155 void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 156 {
 157         unsigned topidx, idx;
 158
 159         if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
 160                 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
 161                 return;
 162         }
 163
 164         if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 165                 BUG_ON(mfn != INVALID_P2M_ENTRY);
 166                 return;
 167         }
 168
 169         topidx = p2m_top_index(pfn);
 170         if (p2m_top[topidx] == p2m_missing) {
 171                 /* no need to allocate a page to store an invalid entry */
 172                 if (mfn == INVALID_P2M_ENTRY)
 173                         return;
 174                 alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
 175         }
 176
 177         idx = p2m_index(pfn);
 178         p2m_top[topidx][idx] = mfn;
 179 }
 180
 181 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
 182 {
 183         unsigned int level;
 184         pte_t *pte = lookup_address(address, &level);
 185         unsigned offset = address & ~PAGE_MASK;
 186
 187         BUG_ON(pte == NULL);
 188
 189         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 190 }
 191
 192 void make_lowmem_page_readonly(void *vaddr)
 193 {
 194         pte_t *pte, ptev;
 195         unsigned long address = (unsigned long)vaddr;
 196         unsigned int level;
 197
 198         pte = lookup_address(address, &level);
 199         BUG_ON(pte == NULL);
 200
 201         ptev = pte_wrprotect(*pte);
 202
 203         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 204                 BUG();
 205 }
 206
 207 void make_lowmem_page_readwrite(void *vaddr)
 208 {
 209         pte_t *pte, ptev;
 210         unsigned long address = (unsigned long)vaddr;
 211         unsigned int level;
 212
 213         pte = lookup_address(address, &level);
 214         BUG_ON(pte == NULL);
 215
 216         ptev = pte_mkwrite(*pte);
 217
 218         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 219                 BUG();
 220 }
 221
 222
 223 static bool page_pinned(void *ptr)
 224 {
 225         struct page *page = virt_to_page(ptr);
 226
 227         return PagePinned(page);
 228 }
 229
 230 static void extend_mmu_update(const struct mmu_update *update)
 231 {
 232         struct multicall_space mcs;
 233         struct mmu_update *u;
 234
 235         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 236
 237         if (mcs.mc != NULL)
 238                 mcs.mc->args[1]++;
 239         else {
 240                 mcs = __xen_mc_entry(sizeof(*u));
 241                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 242         }
 243
 244         u = mcs.args;
 245         *u = *update;
 246 }
 247
 248 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 249 {
 250         struct mmu_update u;
 251
 252         preempt_disable();
 253
 254         xen_mc_batch();
 255
 256         u.ptr = virt_to_machine(ptr).maddr;
 257         u.val = pmd_val_ma(val);
 258         extend_mmu_update(&u);
 259
 260         xen_mc_issue(PARAVIRT_LAZY_MMU);
 261
 262         preempt_enable();
 263 }
 264
 265 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 266 {
 267         /* If page is not pinned, we can just update the entry
 268            directly */
 269         if (!page_pinned(ptr)) {
 270                 *ptr = val;
 271                 return;
 272         }
 273
 274         xen_set_pmd_hyper(ptr, val);
 275 }
 276
 277 /*
 278  * Associate a virtual page frame with a given physical page frame
 279  * and protection flags for that frame.
 280  */
 281 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 282 {
 283         pgd_t *pgd;
 284         pud_t *pud;
 285         pmd_t *pmd;
 286         pte_t *pte;
 287
 288         pgd = swapper_pg_dir + pgd_index(vaddr);
 289         if (pgd_none(*pgd)) {
 290                 BUG();
 291                 return;
 292         }
 293         pud = pud_offset(pgd, vaddr);
 294         if (pud_none(*pud)) {
 295                 BUG();
 296                 return;
 297         }
 298         pmd = pmd_offset(pud, vaddr);
 299         if (pmd_none(*pmd)) {
 300                 BUG();
 301                 return;
 302         }
 303         pte = pte_offset_kernel(pmd, vaddr);
 304         /* <mfn,flags> stored as-is, to permit clearing entries */
 305         xen_set_pte(pte, mfn_pte(mfn, flags));
 306
 307         /*
 308          * It's enough to flush this one mapping.
 309          * (PGE mappings get flushed as well)
 310          */
 311         __flush_tlb_one(vaddr);
 312 }
 313
 314 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 315                     pte_t *ptep, pte_t pteval)
 316 {
 317         /* updates to init_mm may be done without lock */
 318         if (mm == &init_mm)
 319                 preempt_disable();
 320
 321         if (mm == current->mm || mm == &init_mm) {
 322                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 323                         struct multicall_space mcs;
 324                         mcs = xen_mc_entry(0);
 325
 326                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 327                         xen_mc_issue(PARAVIRT_LAZY_MMU);
 328                         goto out;
 329                 } else
 330                         if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 331                                 goto out;
 332         }
 333         xen_set_pte(ptep, pteval);
 334
 335 out:
 336         if (mm == &init_mm)
 337                 preempt_enable();
 338 }
 339
 340 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 341 {
 342         /* Just return the pte as-is.  We preserve the bits on commit */
 343         return *ptep;
 344 }
 345
 346 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 347                                  pte_t *ptep, pte_t pte)
 348 {
 349         struct mmu_update u;
 350
 351         xen_mc_batch();
 352
 353         u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 354         u.val = pte_val_ma(pte);
 355         extend_mmu_update(&u);
 356
 357         xen_mc_issue(PARAVIRT_LAZY_MMU);
 358 }
 359
 360 /* Assume pteval_t is equivalent to all the other *val_t types. */
 361 static pteval_t pte_mfn_to_pfn(pteval_t val)
 362 {
 363         if (val & _PAGE_PRESENT) {
 364                 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
 365                 pteval_t flags = val & ~PTE_MASK;
 366                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 367         }
 368
 369         return val;
 370 }
 371
 372 static pteval_t pte_pfn_to_mfn(pteval_t val)
 373 {
 374         if (val & _PAGE_PRESENT) {
 375                 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
 376                 pteval_t flags = val & ~PTE_MASK;
 377                 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 378         }
 379
 380         return val;
 381 }
 382
 383 pteval_t xen_pte_val(pte_t pte)
 384 {
 385         return pte_mfn_to_pfn(pte.pte);
 386 }
 387
 388 pgdval_t xen_pgd_val(pgd_t pgd)
 389 {
 390         return pte_mfn_to_pfn(pgd.pgd);
 391 }
 392
 393 pte_t xen_make_pte(pteval_t pte)
 394 {
 395         pte = pte_pfn_to_mfn(pte);
 396         return native_make_pte(pte);
 397 }
 398
 399 pgd_t xen_make_pgd(pgdval_t pgd)
 400 {
 401         pgd = pte_pfn_to_mfn(pgd);
 402         return native_make_pgd(pgd);
 403 }
 404
 405 pmdval_t xen_pmd_val(pmd_t pmd)
 406 {
 407         return pte_mfn_to_pfn(pmd.pmd);
 408 }
 409
 410 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 411 {
 412         struct mmu_update u;
 413
 414         preempt_disable();
 415
 416         xen_mc_batch();
 417
 418         u.ptr = virt_to_machine(ptr).maddr;
 419         u.val = pud_val_ma(val);
 420         extend_mmu_update(&u);
 421
 422         xen_mc_issue(PARAVIRT_LAZY_MMU);
 423
 424         preempt_enable();
 425 }
 426
 427 void xen_set_pud(pud_t *ptr, pud_t val)
 428 {
 429         /* If page is not pinned, we can just update the entry
 430            directly */
 431         if (!page_pinned(ptr)) {
 432                 *ptr = val;
 433                 return;
 434         }
 435
 436         xen_set_pud_hyper(ptr, val);
 437 }
 438
/*
 * Store @pte into the pagetable slot @ptep with plain memory writes
 * (no hypercall).
 *
 * Under PAE the entry is written as two halves: pte_high first, then a
 * write barrier, then pte_low.
 * NOTE(review): presumably pte_low holds the present bit, so the entry
 * only becomes live once the high half is already in place - confirm.
 */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_PAE
        ptep->pte_high = pte.pte_high;
        smp_wmb();              /* high half must be visible before low */
        ptep->pte_low = pte.pte_low;
#else
        *ptep = pte;
#endif
}
 449
 450 #ifdef CONFIG_X86_PAE
 451 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 452 {
 453         set_64bit((u64 *)ptep, native_pte_val(pte));
 454 }
 455
/*
 * Clear a PAE pte with plain memory writes.  The two halves are zeroed
 * in the opposite order to xen_set_pte(): pte_low first, then a write
 * barrier, then pte_high.  @mm and @addr are not used.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();              /* make sure low gets written first */
        ptep->pte_high = 0;
}
 462
 463 void xen_pmd_clear(pmd_t *pmdp)
 464 {
 465         set_pmd(pmdp, __pmd(0));
 466 }
 467 #endif  /* CONFIG_X86_PAE */
 468
 469 pmd_t xen_make_pmd(pmdval_t pmd)
 470 {
 471         pmd = pte_pfn_to_mfn(pmd);
 472         return native_make_pmd(pmd);
 473 }
 474
 475 #if PAGETABLE_LEVELS == 4
 476 pudval_t xen_pud_val(pud_t pud)
 477 {
 478         return pte_mfn_to_pfn(pud.pud);
 479 }
 480
 481 pud_t xen_make_pud(pudval_t pud)
 482 {
 483         pud = pte_pfn_to_mfn(pud);
 484
 485         return native_make_pud(pud);
 486 }
 487
 488 void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 489 {
 490         struct mmu_update u;
 491
 492         preempt_disable();
 493
 494         xen_mc_batch();
 495
 496         u.ptr = virt_to_machine(ptr).maddr;
 497         u.val = pgd_val_ma(val);
 498         extend_mmu_update(&u);
 499
 500         xen_mc_issue(PARAVIRT_LAZY_MMU);
 501
 502         preempt_enable();
 503 }
 504
 505 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 506 {
 507         /* If page is not pinned, we can just update the entry
 508            directly */
 509         if (!page_pinned(ptr)) {
 510                 *ptr = val;
 511                 return;
 512         }
 513
 514         xen_set_pgd_hyper(ptr, val);
 515 }
 516 #endif  /* PAGETABLE_LEVELS == 4 */
 517
/*
  (Yet another) pagetable walker.  This one is intended for pinning a
  pagetable.  This means that it walks a pagetable and calls the
  callback function on each page it finds making up the page table,
  at every level.  It walks the entire pagetable, but it only bothers
  pinning pte pages which are below limit.  In the normal case
  this will be TASK_SIZE, but at boot we need to pin up to
  FIXADDR_TOP.  But the important bit is that we don't pin beyond
  there, because then we start getting into Xen's ptes.

  pgd_base: top-level page of the pagetable to walk.
  func:     callback invoked with each pagetable page and its level.
  limit:    upper bound for visiting pte pages; must not exceed
            FIXADDR_TOP (BUG otherwise).

  Returns the bitwise OR of every callback return value.  Returns 0
  without walking anything when the physmap is auto-translated.
  The callback for the pgd page itself is made last.
*/
static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
                    unsigned long limit)
{
        pgd_t *pgd = pgd_base;
        int flush = 0;
        unsigned long addr = 0;
        unsigned long pgd_next;

        BUG_ON(limit > FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        /* pgd level: always covers the range 0..FIXADDR_TOP */
        for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
                pud_t *pud;
                unsigned long pud_limit, pud_next;

                pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);

                /* skip empty top-level entries */
                if (!pgd_val(*pgd))
                        continue;

                pud = pud_offset(pgd, 0);

                /* only report the pud page when that level really exists */
                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(virt_to_page(pud), PT_PUD);

                for (; addr != pud_limit; pud++, addr = pud_next) {
                        pmd_t *pmd;
                        unsigned long pmd_limit;

                        pud_next = pud_addr_end(addr, pud_limit);

                        /* pte pages are only visited up to 'limit' */
                        if (pud_next < limit)
                                pmd_limit = pud_next;
                        else
                                pmd_limit = limit;

                        if (pud_none(*pud))
                                continue;

                        pmd = pmd_offset(pud, 0);

                        /* only report the pmd page when that level really exists */
                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(virt_to_page(pmd), PT_PMD);

                        for (; addr != pmd_limit; pmd++) {
                                /* advance to the end of this pmd's range first */
                                addr += (PAGE_SIZE * PTRS_PER_PTE);
                                /*
                                 * If that end lies beyond pmd_limit, clamp
                                 * and stop without visiting this pte page.
                                 * NOTE(review): the "-1" on both sides
                                 * presumably keeps the compare correct when
                                 * a value wraps to 0 - confirm.
                                 */
                                if ((pmd_limit-1) < (addr-1)) {
                                        addr = pmd_limit;
                                        break;
                                }

                                if (pmd_none(*pmd))
                                        continue;

                                flush |= (*func)(pmd_page(*pmd), PT_PTE);
                        }
                }
        }

        /* finally the top-level page itself */
        flush |= (*func)(virt_to_page(pgd_base), PT_PGD);

        return flush;
}
 593
 594 static spinlock_t *lock_pte(struct page *page)
 595 {
 596         spinlock_t *ptl = NULL;
 597
 598 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
 599         ptl = __pte_lockptr(page);
 600         spin_lock(ptl);
 601 #endif
 602
 603         return ptl;
 604 }
 605
 606 static void do_unlock(void *v)
 607 {
 608         spinlock_t *ptl = v;
 609         spin_unlock(ptl);
 610 }
 611
 612 static void xen_do_pin(unsigned level, unsigned long pfn)
 613 {
 614         struct mmuext_op *op;
 615         struct multicall_space mcs;
 616
 617         mcs = __xen_mc_entry(sizeof(*op));
 618         op = mcs.args;
 619         op->cmd = level;
 620         op->arg1.mfn = pfn_to_mfn(pfn);
 621         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 622 }
 623
 624 static int pin_page(struct page *page, enum pt_level level)
 625 {
 626         unsigned pgfl = TestSetPagePinned(page);
 627         int flush;
 628
 629         if (pgfl)
 630                 flush = 0;              /* already pinned */
 631         else if (PageHighMem(page))
 632                 /* kmaps need flushing if we found an unpinned
 633                    highpage */
 634                 flush = 1;
 635         else {
 636                 void *pt = lowmem_page_address(page);
 637                 unsigned long pfn = page_to_pfn(page);
 638                 struct multicall_space mcs = __xen_mc_entry(0);
 639                 spinlock_t *ptl;
 640
 641                 flush = 0;
 642
 643                 ptl = NULL;
 644                 if (level == PT_PTE)
 645                         ptl = lock_pte(page);
 646
 647                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 648                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 649                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 650
 651                 if (level == PT_PTE)
 652                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 653
 654                 if (ptl) {
 655                         /* Queue a deferred unlock for when this batch
 656                            is completed. */
 657                         xen_mc_callback(do_unlock, ptl);
 658                 }
 659         }
 660
 661         return flush;
 662 }
 663
 664 /* This is called just after a mm has been created, but it has not
 665    been used yet.  We need to make sure that its pagetable is all
 666    read-only, and can be pinned. */
 667 void xen_pgd_pin(pgd_t *pgd)
 668 {
 669         xen_mc_batch();
 670
 671         if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
 672                 /* re-enable interrupts for kmap_flush_unused */
 673                 xen_mc_issue(0);
 674                 kmap_flush_unused();
 675                 xen_mc_batch();
 676         }
 677
 678         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 679         xen_mc_issue(0);
 680 }
 681
 682 /*
 683  * On save, we need to pin all pagetables to make sure they get their
 684  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 685  * them (unpinned pgds are not currently in use, probably because the
 686  * process is under construction or destruction).
 687  */
 688 void xen_mm_pin_all(void)
 689 {
 690         unsigned long flags;
 691         struct page *page;
 692
 693         spin_lock_irqsave(&pgd_lock, flags);
 694
 695         list_for_each_entry(page, &pgd_list, lru) {
 696                 if (!PagePinned(page)) {
 697                         xen_pgd_pin((pgd_t *)page_address(page));
 698                         SetPageSavePinned(page);
 699                 }
 700         }
 701
 702         spin_unlock_irqrestore(&pgd_lock, flags);
 703 }
 704
 705 /*
 706  * The init_mm pagetable is really pinned as soon as its created, but
 707  * that's before we have page structures to store the bits.  So do all
 708  * the book-keeping now.
 709  */
 710 static __init int mark_pinned(struct page *page, enum pt_level level)
 711 {
 712         SetPagePinned(page);
 713         return 0;
 714 }
 715
 716 void __init xen_mark_init_mm_pinned(void)
 717 {
 718         pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
 719 }
 720
 721 static int unpin_page(struct page *page, enum pt_level level)
 722 {
 723         unsigned pgfl = TestClearPagePinned(page);
 724
 725         if (pgfl && !PageHighMem(page)) {
 726                 void *pt = lowmem_page_address(page);
 727                 unsigned long pfn = page_to_pfn(page);
 728                 spinlock_t *ptl = NULL;
 729                 struct multicall_space mcs;
 730
 731                 if (level == PT_PTE) {
 732                         ptl = lock_pte(page);
 733
 734                         xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 735                 }
 736
 737                 mcs = __xen_mc_entry(0);
 738
 739                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 740                                         pfn_pte(pfn, PAGE_KERNEL),
 741                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 742
 743                 if (ptl) {
 744                         /* unlock when batch completed */
 745                         xen_mc_callback(do_unlock, ptl);
 746                 }
 747         }
 748
 749         return 0;               /* never need to flush on unpin */
 750 }
 751
 752 /* Release a pagetables pages back as normal RW */
 753 static void xen_pgd_unpin(pgd_t *pgd)
 754 {
 755         xen_mc_batch();
 756
 757         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 758
 759         pgd_walk(pgd, unpin_page, TASK_SIZE);
 760
 761         xen_mc_issue(0);
 762 }
 763
 764 /*
 765  * On resume, undo any pinning done at save, so that the rest of the
 766  * kernel doesn't see any unexpected pinned pagetables.
 767  */
 768 void xen_mm_unpin_all(void)
 769 {
 770         unsigned long flags;
 771         struct page *page;
 772
 773         spin_lock_irqsave(&pgd_lock, flags);
 774
 775         list_for_each_entry(page, &pgd_list, lru) {
 776                 if (PageSavePinned(page)) {
 777                         BUG_ON(!PagePinned(page));
 778                         printk("unpinning pinned %p\n", page_address(page));
 779                         xen_pgd_unpin((pgd_t *)page_address(page));
 780                         ClearPageSavePinned(page);
 781                 }
 782         }
 783
 784         spin_unlock_irqrestore(&pgd_lock, flags);
 785 }
 786
 787 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 788 {
 789         spin_lock(&next->page_table_lock);
 790         xen_pgd_pin(next->pgd);
 791         spin_unlock(&next->page_table_lock);
 792 }
 793
 794 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 795 {
 796         spin_lock(&mm->page_table_lock);
 797         xen_pgd_pin(mm->pgd);
 798         spin_unlock(&mm->page_table_lock);
 799 }
 800
 801
 802 #ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it.

   This is the per-cpu callback that drop_mm_ref() runs via
   smp_call_function_mask(); info is the mm being torn down. */
static void drop_other_mm_ref(void *info)
{
        struct mm_struct *mm = info;
        struct mm_struct *active_mm;

        /* fetch this cpu's notion of the active mm */
#ifdef CONFIG_X86_64
        active_mm = read_pda(active_mm);
#else
        active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
#endif

        /* this cpu is still using mm as its active mm: make it leave */
        if (active_mm == mm)
                leave_mm(smp_processor_id());

        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
        if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
                load_cr3(swapper_pg_dir);
                arch_flush_lazy_cpu_mode();
        }
}
 826
 827 static void drop_mm_ref(struct mm_struct *mm)
 828 {
 829         cpumask_t mask;
 830         unsigned cpu;
 831
 832         if (current->active_mm == mm) {
 833                 if (current->mm == mm)
 834                         load_cr3(swapper_pg_dir);
 835                 else
 836                         leave_mm(smp_processor_id());
 837                 arch_flush_lazy_cpu_mode();
 838         }
 839
 840         /* Get the "official" set of cpus referring to our pagetable. */
 841         mask = mm->cpu_vm_mask;
 842
 843         /* It's possible that a vcpu may have a stale reference to our
 844            cr3, because its in lazy mode, and it hasn't yet flushed
 845            its set of pending hypercalls yet.  In this case, we can
 846            look at its actual current cr3 value, and force it to flush
 847            if needed. */
 848         for_each_online_cpu(cpu) {
 849                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
 850                         cpu_set(cpu, mask);
 851         }
 852
 853         if (!cpus_empty(mask))
 854                 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 855 }
 856 #else
 857 static void drop_mm_ref(struct mm_struct *mm)
 858 {
 859         if (current->active_mm == mm)
 860                 load_cr3(swapper_pg_dir);
 861 }
 862 #endif
 863
 864 /*
 865  * While a process runs, Xen pins its pagetables, which means that the
 866  * hypervisor forces it to be read-only, and it controls all updates
 867  * to it.  This means that all pagetable updates have to go via the
 868  * hypervisor, which is moderately expensive.
 869  *
 870  * Since we're pulling the pagetable down, we switch to use init_mm,
 871  * unpin old process pagetable and mark it all read-write, which
 872  * allows further operations on it to be simple memory accesses.
 873  *
 874  * The only subtle point is that another CPU may be still using the
 875  * pagetable because of lazy tlb flushing.  This means we need need to
 876  * switch all CPUs off this pagetable before we can unpin it.
 877  */
 878 void xen_exit_mmap(struct mm_struct *mm)
 879 {
 880         get_cpu();              /* make sure we don't move around */
 881         drop_mm_ref(mm);
 882         put_cpu();
 883
 884         spin_lock(&mm->page_table_lock);
 885
 886         /* pgd may not be pinned in the error exit path of execve */
 887         if (page_pinned(mm->pgd))
 888                 xen_pgd_unpin(mm->pgd);
 889
 890         spin_unlock(&mm->page_table_lock);
 891 }