mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91
  92 #include <asm/tlbflush.h>
  93 #include <asm/uaccess.h>
  94
  95 /* Internal flags */
  96 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  97 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  98 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  99
 100 static struct kmem_cache *policy_cache;
 101 static struct kmem_cache *sn_cache;
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 enum zone_type policy_zone = 0;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 static const struct mempolicy_operations {
 113         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 114         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 115 } mpol_ops[MPOL_MAX];
 116
 117 /* Check that the nodemask contains at least one populated zone */
 118 static int is_valid_nodemask(const nodemask_t *nodemask)
 119 {
 120         int nd, k;
 121
 122         /* Check that there is something useful in this mask */
 123         k = policy_zone;
 124
 125         for_each_node_mask(nd, *nodemask) {
 126                 struct zone *z;
 127
 128                 for (k = 0; k <= policy_zone; k++) {
 129                         z = &NODE_DATA(nd)->node_zones[k];
 130                         if (z->present_pages > 0)
 131                                 return 1;
 132                 }
 133         }
 134
 135         return 0;
 136 }
 137
 138 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 139 {
 140         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 141 }
 142
 143 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 144                                    const nodemask_t *rel)
 145 {
 146         nodemask_t tmp;
 147         nodes_fold(tmp, *orig, nodes_weight(*rel));
 148         nodes_onto(*ret, tmp, *rel);
 149 }
 150
 151 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 152 {
 153         if (nodes_empty(*nodes))
 154                 return -EINVAL;
 155         pol->v.nodes = *nodes;
 156         return 0;
 157 }
 158
 159 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 160 {
 161         if (!nodes)
 162                 pol->v.preferred_node = -1;     /* local allocation */
 163         else if (nodes_empty(*nodes))
 164                 return -EINVAL;                 /*  no allowed nodes */
 165         else
 166                 pol->v.preferred_node = first_node(*nodes);
 167         return 0;
 168 }
 169
 170 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 171 {
 172         if (!is_valid_nodemask(nodes))
 173                 return -EINVAL;
 174         pol->v.nodes = *nodes;
 175         return 0;
 176 }
 177
 178 /* Create a new policy */
 179 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 180                                   nodemask_t *nodes)
 181 {
 182         struct mempolicy *policy;
 183         nodemask_t cpuset_context_nmask;
 184         int localalloc = 0;
 185         int ret;
 186
 187         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 188                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 189
 190         if (mode == MPOL_DEFAULT)
 191                 return NULL;
 192         if (!nodes || nodes_empty(*nodes)) {
 193                 if (mode != MPOL_PREFERRED)
 194                         return ERR_PTR(-EINVAL);
 195                 localalloc = 1; /* special case:  no mode flags */
 196         }
 197         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 198         if (!policy)
 199                 return ERR_PTR(-ENOMEM);
 200         atomic_set(&policy->refcnt, 1);
 201         policy->policy = mode;
 202
 203         if (!localalloc) {
 204                 policy->flags = flags;
 205                 cpuset_update_task_memory_state();
 206                 if (flags & MPOL_F_RELATIVE_NODES)
 207                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 208                                                &cpuset_current_mems_allowed);
 209                 else
 210                         nodes_and(cpuset_context_nmask, *nodes,
 211                                   cpuset_current_mems_allowed);
 212                 if (mpol_store_user_nodemask(policy))
 213                         policy->w.user_nodemask = *nodes;
 214                 else
 215                         policy->w.cpuset_mems_allowed =
 216                                                 cpuset_mems_allowed(current);
 217         }
 218
 219         ret = mpol_ops[mode].create(policy,
 220                                 localalloc ? NULL : &cpuset_context_nmask);
 221         if (ret < 0) {
 222                 kmem_cache_free(policy_cache, policy);
 223                 return ERR_PTR(ret);
 224         }
 225         return policy;
 226 }
 227
 228 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 229 {
 230 }
 231
 232 static void mpol_rebind_nodemask(struct mempolicy *pol,
 233                                  const nodemask_t *nodes)
 234 {
 235         nodemask_t tmp;
 236
 237         if (pol->flags & MPOL_F_STATIC_NODES)
 238                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 239         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 240                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 241         else {
 242                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 243                             *nodes);
 244                 pol->w.cpuset_mems_allowed = *nodes;
 245         }
 246
 247         pol->v.nodes = tmp;
 248         if (!node_isset(current->il_next, tmp)) {
 249                 current->il_next = next_node(current->il_next, tmp);
 250                 if (current->il_next >= MAX_NUMNODES)
 251                         current->il_next = first_node(tmp);
 252                 if (current->il_next >= MAX_NUMNODES)
 253                         current->il_next = numa_node_id();
 254         }
 255 }
 256
 257 static void mpol_rebind_preferred(struct mempolicy *pol,
 258                                   const nodemask_t *nodes)
 259 {
 260         nodemask_t tmp;
 261
 262         /*
 263          * check 'STATIC_NODES first, as preferred_node == -1 may be
 264          * a temporary, "fallback" state for this policy.
 265          */
 266         if (pol->flags & MPOL_F_STATIC_NODES) {
 267                 int node = first_node(pol->w.user_nodemask);
 268
 269                 if (node_isset(node, *nodes))
 270                         pol->v.preferred_node = node;
 271                 else
 272                         pol->v.preferred_node = -1;
 273         } else if (pol->v.preferred_node == -1) {
 274                 return; /* no remap required for explicit local alloc */
 275         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 276                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 277                 pol->v.preferred_node = first_node(tmp);
 278         } else {
 279                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 280                                                    pol->w.cpuset_mems_allowed,
 281                                                    *nodes);
 282                 pol->w.cpuset_mems_allowed = *nodes;
 283         }
 284 }
 285
 286 /* Migrate a policy to a different set of nodes */
 287 static void mpol_rebind_policy(struct mempolicy *pol,
 288                                const nodemask_t *newmask)
 289 {
 290         if (!pol)
 291                 return;
 292         if (!mpol_store_user_nodemask(pol) &&
 293             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 294                 return;
 295         mpol_ops[pol->policy].rebind(pol, newmask);
 296 }
 297
 298 /*
 299  * Wrapper for mpol_rebind_policy() that just requires task
 300  * pointer, and updates task mempolicy.
 301  */
 302
 303 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 304 {
 305         mpol_rebind_policy(tsk->mempolicy, new);
 306 }
 307
 308 /*
 309  * Rebind each vma in mm to new nodemask.
 310  *
 311  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 312  */
 313
 314 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 315 {
 316         struct vm_area_struct *vma;
 317
 318         down_write(&mm->mmap_sem);
 319         for (vma = mm->mmap; vma; vma = vma->vm_next)
 320                 mpol_rebind_policy(vma->vm_policy, new);
 321         up_write(&mm->mmap_sem);
 322 }
 323
 324 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 325         [MPOL_DEFAULT] = {
 326                 .rebind = mpol_rebind_default,
 327         },
 328         [MPOL_INTERLEAVE] = {
 329                 .create = mpol_new_interleave,
 330                 .rebind = mpol_rebind_nodemask,
 331         },
 332         [MPOL_PREFERRED] = {
 333                 .create = mpol_new_preferred,
 334                 .rebind = mpol_rebind_preferred,
 335         },
 336         [MPOL_BIND] = {
 337                 .create = mpol_new_bind,
 338                 .rebind = mpol_rebind_nodemask,
 339         },
 340 };
 341
 342 static void gather_stats(struct page *, void *, int pte_dirty);
 343 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 344                                 unsigned long flags);
 345
 346 /* Scan through pages checking if pages follow certain conditions. */
 347 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 348                 unsigned long addr, unsigned long end,
 349                 const nodemask_t *nodes, unsigned long flags,
 350                 void *private)
 351 {
 352         pte_t *orig_pte;
 353         pte_t *pte;
 354         spinlock_t *ptl;
 355
 356         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 357         do {
 358                 struct page *page;
 359                 int nid;
 360
 361                 if (!pte_present(*pte))
 362                         continue;
 363                 page = vm_normal_page(vma, addr, *pte);
 364                 if (!page)
 365                         continue;
 366                 /*
 367                  * The check for PageReserved here is important to avoid
 368                  * handling zero pages and other pages that may have been
 369                  * marked special by the system.
 370                  *
 371                  * If the PageReserved would not be checked here then f.e.
 372                  * the location of the zero page could have an influence
 373                  * on MPOL_MF_STRICT, zero pages would be counted for
 374                  * the per node stats, and there would be useless attempts
 375                  * to put zero pages on the migration list.
 376                  */
 377                 if (PageReserved(page))
 378                         continue;
 379                 nid = page_to_nid(page);
 380                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 381                         continue;
 382
 383                 if (flags & MPOL_MF_STATS)
 384                         gather_stats(page, private, pte_dirty(*pte));
 385                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 386                         migrate_page_add(page, private, flags);
 387                 else
 388                         break;
 389         } while (pte++, addr += PAGE_SIZE, addr != end);
 390         pte_unmap_unlock(orig_pte, ptl);
 391         return addr != end;
 392 }
 393
 394 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 395                 unsigned long addr, unsigned long end,
 396                 const nodemask_t *nodes, unsigned long flags,
 397                 void *private)
 398 {
 399         pmd_t *pmd;
 400         unsigned long next;
 401
 402         pmd = pmd_offset(pud, addr);
 403         do {
 404                 next = pmd_addr_end(addr, end);
 405                 if (pmd_none_or_clear_bad(pmd))
 406                         continue;
 407                 if (check_pte_range(vma, pmd, addr, next, nodes,
 408                                     flags, private))
 409                         return -EIO;
 410         } while (pmd++, addr = next, addr != end);
 411         return 0;
 412 }
 413
 414 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 415                 unsigned long addr, unsigned long end,
 416                 const nodemask_t *nodes, unsigned long flags,
 417                 void *private)
 418 {
 419         pud_t *pud;
 420         unsigned long next;
 421
 422         pud = pud_offset(pgd, addr);
 423         do {
 424                 next = pud_addr_end(addr, end);
 425                 if (pud_none_or_clear_bad(pud))
 426                         continue;
 427                 if (check_pmd_range(vma, pud, addr, next, nodes,
 428                                     flags, private))
 429                         return -EIO;
 430         } while (pud++, addr = next, addr != end);
 431         return 0;
 432 }
 433
 434 static inline int check_pgd_range(struct vm_area_struct *vma,
 435                 unsigned long addr, unsigned long end,
 436                 const nodemask_t *nodes, unsigned long flags,
 437                 void *private)
 438 {
 439         pgd_t *pgd;
 440         unsigned long next;
 441
 442         pgd = pgd_offset(vma->vm_mm, addr);
 443         do {
 444                 next = pgd_addr_end(addr, end);
 445                 if (pgd_none_or_clear_bad(pgd))
 446                         continue;
 447                 if (check_pud_range(vma, pgd, addr, next, nodes,
 448                                     flags, private))
 449                         return -EIO;
 450         } while (pgd++, addr = next, addr != end);
 451         return 0;
 452 }
 453
 454 /*
 455  * Check if all pages in a range are on a set of nodes.
 456  * If pagelist != NULL then isolate pages from the LRU and
 457  * put them on the pagelist.
 458  */
 459 static struct vm_area_struct *
 460 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 461                 const nodemask_t *nodes, unsigned long flags, void *private)
 462 {
 463         int err;
 464         struct vm_area_struct *first, *vma, *prev;
 465
 466         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 467
 468                 err = migrate_prep();
 469                 if (err)
 470                         return ERR_PTR(err);
 471         }
 472
 473         first = find_vma(mm, start);
 474         if (!first)
 475                 return ERR_PTR(-EFAULT);
 476         prev = NULL;
 477         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 478                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 479                         if (!vma->vm_next && vma->vm_end < end)
 480                                 return ERR_PTR(-EFAULT);
 481                         if (prev && prev->vm_end < vma->vm_start)
 482                                 return ERR_PTR(-EFAULT);
 483                 }
 484                 if (!is_vm_hugetlb_page(vma) &&
 485                     ((flags & MPOL_MF_STRICT) ||
 486                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 487                                 vma_migratable(vma)))) {
 488                         unsigned long endvma = vma->vm_end;
 489
 490                         if (endvma > end)
 491                                 endvma = end;
 492                         if (vma->vm_start > start)
 493                                 start = vma->vm_start;
 494                         err = check_pgd_range(vma, start, endvma, nodes,
 495                                                 flags, private);
 496                         if (err) {
 497                                 first = ERR_PTR(err);
 498                                 break;
 499                         }
 500                 }
 501                 prev = vma;
 502         }
 503         return first;
 504 }
 505
 506 /* Apply policy to a single VMA */
 507 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 508 {
 509         int err = 0;
 510         struct mempolicy *old = vma->vm_policy;
 511
 512         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 513                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 514                  vma->vm_ops, vma->vm_file,
 515                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 516
 517         if (vma->vm_ops && vma->vm_ops->set_policy)
 518                 err = vma->vm_ops->set_policy(vma, new);
 519         if (!err) {
 520                 mpol_get(new);
 521                 vma->vm_policy = new;
 522                 mpol_free(old);
 523         }
 524         return err;
 525 }
 526
 527 /* Step 2: apply policy to a range and do splits. */
 528 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 529                        unsigned long end, struct mempolicy *new)
 530 {
 531         struct vm_area_struct *next;
 532         int err;
 533
 534         err = 0;
 535         for (; vma && vma->vm_start < end; vma = next) {
 536                 next = vma->vm_next;
 537                 if (vma->vm_start < start)
 538                         err = split_vma(vma->vm_mm, vma, start, 1);
 539                 if (!err && vma->vm_end > end)
 540                         err = split_vma(vma->vm_mm, vma, end, 0);
 541                 if (!err)
 542                         err = policy_vma(vma, new);
 543                 if (err)
 544                         break;
 545         }
 546         return err;
 547 }
 548
 549 /*
 550  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 551  * mempolicy.  Allows more rapid checking of this (combined perhaps
 552  * with other PF_* flag bits) on memory allocation hot code paths.
 553  *
 554  * If called from outside this file, the task 'p' should -only- be
 555  * a newly forked child not yet visible on the task list, because
 556  * manipulating the task flags of a visible task is not safe.
 557  *
 558  * The above limitation is why this routine has the funny name
 559  * mpol_fix_fork_child_flag().
 560  *
 561  * It is also safe to call this with a task pointer of current,
 562  * which the static wrapper mpol_set_task_struct_flag() does,
 563  * for use within this file.
 564  */
 565
 566 void mpol_fix_fork_child_flag(struct task_struct *p)
 567 {
 568         if (p->mempolicy)
 569                 p->flags |= PF_MEMPOLICY;
 570         else
 571                 p->flags &= ~PF_MEMPOLICY;
 572 }
 573
 574 static void mpol_set_task_struct_flag(void)
 575 {
 576         mpol_fix_fork_child_flag(current);
 577 }
 578
 579 /* Set the process memory policy */
 580 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 581                              nodemask_t *nodes)
 582 {
 583         struct mempolicy *new;
 584
 585         new = mpol_new(mode, flags, nodes);
 586         if (IS_ERR(new))
 587                 return PTR_ERR(new);
 588         mpol_free(current->mempolicy);
 589         current->mempolicy = new;
 590         mpol_set_task_struct_flag();
 591         if (new && new->policy == MPOL_INTERLEAVE &&
 592             nodes_weight(new->v.nodes))
 593                 current->il_next = first_node(new->v.nodes);
 594         return 0;
 595 }
 596
 597 /* Fill a zone bitmap for a policy */
 598 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 599 {
 600         nodes_clear(*nodes);
 601         switch (p->policy) {
 602         case MPOL_DEFAULT:
 603                 break;
 604         case MPOL_BIND:
 605                 /* Fall through */
 606         case MPOL_INTERLEAVE:
 607                 *nodes = p->v.nodes;
 608                 break;
 609         case MPOL_PREFERRED:
 610                 /* or use current node instead of memory_map? */
 611                 if (p->v.preferred_node < 0)
 612                         *nodes = node_states[N_HIGH_MEMORY];
 613                 else
 614                         node_set(p->v.preferred_node, *nodes);
 615                 break;
 616         default:
 617                 BUG();
 618         }
 619 }
 620
 621 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 622 {
 623         struct page *p;
 624         int err;
 625
 626         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 627         if (err >= 0) {
 628                 err = page_to_nid(p);
 629                 put_page(p);
 630         }
 631         return err;
 632 }
 633
 634 /* Retrieve NUMA policy */
 635 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 636                              unsigned long addr, unsigned long flags)
 637 {
 638         int err;
 639         struct mm_struct *mm = current->mm;
 640         struct vm_area_struct *vma = NULL;
 641         struct mempolicy *pol = current->mempolicy;
 642
 643         cpuset_update_task_memory_state();
 644         if (flags &
 645                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 646                 return -EINVAL;
 647
 648         if (flags & MPOL_F_MEMS_ALLOWED) {
 649                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 650                         return -EINVAL;
 651                 *policy = 0;    /* just so it's initialized */
 652                 *nmask  = cpuset_current_mems_allowed;
 653                 return 0;
 654         }
 655
 656         if (flags & MPOL_F_ADDR) {
 657                 down_read(&mm->mmap_sem);
 658                 vma = find_vma_intersection(mm, addr, addr+1);
 659                 if (!vma) {
 660                         up_read(&mm->mmap_sem);
 661                         return -EFAULT;
 662                 }
 663                 if (vma->vm_ops && vma->vm_ops->get_policy)
 664                         pol = vma->vm_ops->get_policy(vma, addr);
 665                 else
 666                         pol = vma->vm_policy;
 667         } else if (addr)
 668                 return -EINVAL;
 669
 670         if (!pol)
 671                 pol = &default_policy;
 672
 673         if (flags & MPOL_F_NODE) {
 674                 if (flags & MPOL_F_ADDR) {
 675                         err = lookup_node(mm, addr);
 676                         if (err < 0)
 677                                 goto out;
 678                         *policy = err;
 679                 } else if (pol == current->mempolicy &&
 680                                 pol->policy == MPOL_INTERLEAVE) {
 681                         *policy = current->il_next;
 682                 } else {
 683                         err = -EINVAL;
 684                         goto out;
 685                 }
 686         } else
 687                 *policy = pol->policy | pol->flags;
 688
 689         if (vma) {
 690                 up_read(&current->mm->mmap_sem);
 691                 vma = NULL;
 692         }
 693
 694         err = 0;
 695         if (nmask)
 696                 get_zonemask(pol, nmask);
 697
 698  out:
 699         if (vma)
 700                 up_read(&current->mm->mmap_sem);
 701         return err;
 702 }
 703
 704 #ifdef CONFIG_MIGRATION
 705 /*
 706  * page migration
 707  */
 708 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 709                                 unsigned long flags)
 710 {
 711         /*
 712          * Avoid migrating a page that is shared with others.
 713          */
 714         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 715                 isolate_lru_page(page, pagelist);
 716 }
 717
 718 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 719 {
 720         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 721 }
 722
 723 /*
 724  * Migrate pages from one node to a target node.
 725  * Returns error or the number of pages not migrated.
 726  */
 727 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 728                            int flags)
 729 {
 730         nodemask_t nmask;
 731         LIST_HEAD(pagelist);
 732         int err = 0;
 733
 734         nodes_clear(nmask);
 735         node_set(source, nmask);
 736
 737         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 738                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 739
 740         if (!list_empty(&pagelist))
 741                 err = migrate_pages(&pagelist, new_node_page, dest);
 742
 743         return err;
 744 }
 745
 746 /*
 747  * Move pages between the two nodesets so as to preserve the physical
 748  * layout as much as possible.
 749  *
 750  * Returns the number of page that could not be moved.
 751  */
 752 int do_migrate_pages(struct mm_struct *mm,
 753         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 754 {
 755         LIST_HEAD(pagelist);
 756         int busy = 0;
 757         int err = 0;
 758         nodemask_t tmp;
 759
 760         down_read(&mm->mmap_sem);
 761
 762         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 763         if (err)
 764                 goto out;
 765
 766 /*
 767  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 768  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 769  * bit in 'tmp', and return that <source, dest> pair for migration.
 770  * The pair of nodemasks 'to' and 'from' define the map.
 771  *
 772  * If no pair of bits is found that way, fallback to picking some
 773  * pair of 'source' and 'dest' bits that are not the same.  If the
 774  * 'source' and 'dest' bits are the same, this represents a node
 775  * that will be migrating to itself, so no pages need move.
 776  *
 777  * If no bits are left in 'tmp', or if all remaining bits left
 778  * in 'tmp' correspond to the same bit in 'to', return false
 779  * (nothing left to migrate).
 780  *
 781  * This lets us pick a pair of nodes to migrate between, such that
 782  * if possible the dest node is not already occupied by some other
 783  * source node, minimizing the risk of overloading the memory on a
 784  * node that would happen if we migrated incoming memory to a node
 785  * before migrating outgoing memory source that same node.
 786  *
 787  * A single scan of tmp is sufficient.  As we go, we remember the
 788  * most recent <s, d> pair that moved (s != d).  If we find a pair
 789  * that not only moved, but what's better, moved to an empty slot
 790  * (d is not set in tmp), then we break out then, with that pair.
 791  * Otherwise when we finish scannng from_tmp, we at least have the
 792  * most recent <s, d> pair that moved.  If we get all the way through
 793  * the scan of tmp without finding any node that moved, much less
 794  * moved to an empty node, then there is nothing left worth migrating.
 795  */
 796
 797         tmp = *from_nodes;
 798         while (!nodes_empty(tmp)) {
 799                 int s,d;
 800                 int source = -1;
 801                 int dest = 0;
 802
 803                 for_each_node_mask(s, tmp) {
 804                         d = node_remap(s, *from_nodes, *to_nodes);
 805                         if (s == d)
 806                                 continue;
 807
 808                         source = s;     /* Node moved. Memorize */
 809                         dest = d;
 810
 811                         /* dest not in remaining from nodes? */
 812                         if (!node_isset(dest, tmp))
 813                                 break;
 814                 }
 815                 if (source == -1)
 816                         break;
 817
 818                 node_clear(source, tmp);
 819                 err = migrate_to_node(mm, source, dest, flags);
 820                 if (err > 0)
 821                         busy += err;
 822                 if (err < 0)
 823                         break;
 824         }
 825 out:
 826         up_read(&mm->mmap_sem);
 827         if (err < 0)
 828                 return err;
 829         return busy;
 830
 831 }
 832
 833 /*
 834  * Allocate a new page for page migration based on vma policy.
 835  * Start assuming that page is mapped by vma pointed to by @private.
 836  * Search forward from there, if not.  N.B., this assumes that the
 837  * list of pages handed to migrate_pages()--which is how we get here--
 838  * is in virtual address order.
 839  */
 840 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 841 {
 842         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 843         unsigned long uninitialized_var(address);
 844
 845         while (vma) {
 846                 address = page_address_in_vma(page, vma);
 847                 if (address != -EFAULT)
 848                         break;
 849                 vma = vma->vm_next;
 850         }
 851
 852         /*
 853          * if !vma, alloc_page_vma() will use task or system default policy
 854          */
 855         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 856 }
 857 #else
 858
 859 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 860                                 unsigned long flags)
 861 {
 862 }
 863
 864 int do_migrate_pages(struct mm_struct *mm,
 865         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 866 {
 867         return -ENOSYS;
 868 }
 869
 870 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 871 {
 872         return NULL;
 873 }
 874 #endif
 875
 876 static long do_mbind(unsigned long start, unsigned long len,
 877                      unsigned short mode, unsigned short mode_flags,
 878                      nodemask_t *nmask, unsigned long flags)
 879 {
 880         struct vm_area_struct *vma;
 881         struct mm_struct *mm = current->mm;
 882         struct mempolicy *new;
 883         unsigned long end;
 884         int err;
 885         LIST_HEAD(pagelist);
 886
 887         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 888                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 889                 return -EINVAL;
 890         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 891                 return -EPERM;
 892
 893         if (start & ~PAGE_MASK)
 894                 return -EINVAL;
 895
 896         if (mode == MPOL_DEFAULT)
 897                 flags &= ~MPOL_MF_STRICT;
 898
 899         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 900         end = start + len;
 901
 902         if (end < start)
 903                 return -EINVAL;
 904         if (end == start)
 905                 return 0;
 906
 907         new = mpol_new(mode, mode_flags, nmask);
 908         if (IS_ERR(new))
 909                 return PTR_ERR(new);
 910
 911         /*
 912          * If we are using the default policy then operation
 913          * on discontinuous address spaces is okay after all
 914          */
 915         if (!new)
 916                 flags |= MPOL_MF_DISCONTIG_OK;
 917
 918         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 919                  start, start + len, mode, mode_flags,
 920                  nmask ? nodes_addr(*nmask)[0] : -1);
 921
 922         down_write(&mm->mmap_sem);
 923         vma = check_range(mm, start, end, nmask,
 924                           flags | MPOL_MF_INVERT, &pagelist);
 925
 926         err = PTR_ERR(vma);
 927         if (!IS_ERR(vma)) {
 928                 int nr_failed = 0;
 929
 930                 err = mbind_range(vma, start, end, new);
 931
 932                 if (!list_empty(&pagelist))
 933                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 934                                                 (unsigned long)vma);
 935
 936                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 937                         err = -EIO;
 938         }
 939
 940         up_write(&mm->mmap_sem);
 941         mpol_free(new);
 942         return err;
 943 }
 944
 945 /*
 946  * User space interface with variable sized bitmaps for nodelists.
 947  */
 948
 949 /* Copy a node mask from user space. */
 950 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 951                      unsigned long maxnode)
 952 {
 953         unsigned long k;
 954         unsigned long nlongs;
 955         unsigned long endmask;
 956
 957         --maxnode;
 958         nodes_clear(*nodes);
 959         if (maxnode == 0 || !nmask)
 960                 return 0;
 961         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 962                 return -EINVAL;
 963
 964         nlongs = BITS_TO_LONGS(maxnode);
 965         if ((maxnode % BITS_PER_LONG) == 0)
 966                 endmask = ~0UL;
 967         else
 968                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 969
 970         /* When the user specified more nodes than supported just check
 971            if the non supported part is all zero. */
 972         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 973                 if (nlongs > PAGE_SIZE/sizeof(long))
 974                         return -EINVAL;
 975                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 976                         unsigned long t;
 977                         if (get_user(t, nmask + k))
 978                                 return -EFAULT;
 979                         if (k == nlongs - 1) {
 980                                 if (t & endmask)
 981                                         return -EINVAL;
 982                         } else if (t)
 983                                 return -EINVAL;
 984                 }
 985                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 986                 endmask = ~0UL;
 987         }
 988
 989         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 990                 return -EFAULT;
 991         nodes_addr(*nodes)[nlongs-1] &= endmask;
 992         return 0;
 993 }
 994
 995 /* Copy a kernel node mask to user space */
 996 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 997                               nodemask_t *nodes)
 998 {
 999         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1000         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1001
1002         if (copy > nbytes) {
1003                 if (copy > PAGE_SIZE)
1004                         return -EINVAL;
1005                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1006                         return -EFAULT;
1007                 copy = nbytes;
1008         }
1009         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1010 }
1011
1012 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1013                         unsigned long mode,
1014                         unsigned long __user *nmask, unsigned long maxnode,
1015                         unsigned flags)
1016 {
1017         nodemask_t nodes;
1018         int err;
1019         unsigned short mode_flags;
1020
1021         mode_flags = mode & MPOL_MODE_FLAGS;
1022         mode &= ~MPOL_MODE_FLAGS;
1023         if (mode >= MPOL_MAX)
1024                 return -EINVAL;
1025         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1026             (mode_flags & MPOL_F_RELATIVE_NODES))
1027                 return -EINVAL;
1028         err = get_nodes(&nodes, nmask, maxnode);
1029         if (err)
1030                 return err;
1031         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1032 }
1033
1034 /* Set the process memory policy */
1035 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1036                 unsigned long maxnode)
1037 {
1038         int err;
1039         nodemask_t nodes;
1040         unsigned short flags;
1041
1042         flags = mode & MPOL_MODE_FLAGS;
1043         mode &= ~MPOL_MODE_FLAGS;
1044         if ((unsigned int)mode >= MPOL_MAX)
1045                 return -EINVAL;
1046         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1047                 return -EINVAL;
1048         err = get_nodes(&nodes, nmask, maxnode);
1049         if (err)
1050                 return err;
1051         return do_set_mempolicy(mode, flags, &nodes);
1052 }
1053
1054 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1055                 const unsigned long __user *old_nodes,
1056                 const unsigned long __user *new_nodes)
1057 {
1058         struct mm_struct *mm;
1059         struct task_struct *task;
1060         nodemask_t old;
1061         nodemask_t new;
1062         nodemask_t task_nodes;
1063         int err;
1064
1065         err = get_nodes(&old, old_nodes, maxnode);
1066         if (err)
1067                 return err;
1068
1069         err = get_nodes(&new, new_nodes, maxnode);
1070         if (err)
1071                 return err;
1072
1073         /* Find the mm_struct */
1074         read_lock(&tasklist_lock);
1075         task = pid ? find_task_by_vpid(pid) : current;
1076         if (!task) {
1077                 read_unlock(&tasklist_lock);
1078                 return -ESRCH;
1079         }
1080         mm = get_task_mm(task);
1081         read_unlock(&tasklist_lock);
1082
1083         if (!mm)
1084                 return -EINVAL;
1085
1086         /*
1087          * Check if this process has the right to modify the specified
1088          * process. The right exists if the process has administrative
1089          * capabilities, superuser privileges or the same
1090          * userid as the target process.
1091          */
1092         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1093             (current->uid != task->suid) && (current->uid != task->uid) &&
1094             !capable(CAP_SYS_NICE)) {
1095                 err = -EPERM;
1096                 goto out;
1097         }
1098
1099         task_nodes = cpuset_mems_allowed(task);
1100         /* Is the user allowed to access the target nodes? */
1101         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1102                 err = -EPERM;
1103                 goto out;
1104         }
1105
1106         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1107                 err = -EINVAL;
1108                 goto out;
1109         }
1110
1111         err = security_task_movememory(task);
1112         if (err)
1113                 goto out;
1114
1115         err = do_migrate_pages(mm, &old, &new,
1116                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1117 out:
1118         mmput(mm);
1119         return err;
1120 }
1121
1122
1123 /* Retrieve NUMA policy */
1124 asmlinkage long sys_get_mempolicy(int __user *policy,
1125                                 unsigned long __user *nmask,
1126                                 unsigned long maxnode,
1127                                 unsigned long addr, unsigned long flags)
1128 {
1129         int err;
1130         int uninitialized_var(pval);
1131         nodemask_t nodes;
1132
1133         if (nmask != NULL && maxnode < MAX_NUMNODES)
1134                 return -EINVAL;
1135
1136         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1137
1138         if (err)
1139                 return err;
1140
1141         if (policy && put_user(pval, policy))
1142                 return -EFAULT;
1143
1144         if (nmask)
1145                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1146
1147         return err;
1148 }
1149
1150 #ifdef CONFIG_COMPAT
1151
1152 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1153                                      compat_ulong_t __user *nmask,
1154                                      compat_ulong_t maxnode,
1155                                      compat_ulong_t addr, compat_ulong_t flags)
1156 {
1157         long err;
1158         unsigned long __user *nm = NULL;
1159         unsigned long nr_bits, alloc_size;
1160         DECLARE_BITMAP(bm, MAX_NUMNODES);
1161
1162         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1163         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1164
1165         if (nmask)
1166                 nm = compat_alloc_user_space(alloc_size);
1167
1168         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1169
1170         if (!err && nmask) {
1171                 err = copy_from_user(bm, nm, alloc_size);
1172                 /* ensure entire bitmap is zeroed */
1173                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1174                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1175         }
1176
1177         return err;
1178 }
1179
1180 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1181                                      compat_ulong_t maxnode)
1182 {
1183         long err = 0;
1184         unsigned long __user *nm = NULL;
1185         unsigned long nr_bits, alloc_size;
1186         DECLARE_BITMAP(bm, MAX_NUMNODES);
1187
1188         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1189         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1190
1191         if (nmask) {
1192                 err = compat_get_bitmap(bm, nmask, nr_bits);
1193                 nm = compat_alloc_user_space(alloc_size);
1194                 err |= copy_to_user(nm, bm, alloc_size);
1195         }
1196
1197         if (err)
1198                 return -EFAULT;
1199
1200         return sys_set_mempolicy(mode, nm, nr_bits+1);
1201 }
1202
1203 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1204                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1205                              compat_ulong_t maxnode, compat_ulong_t flags)
1206 {
1207         long err = 0;
1208         unsigned long __user *nm = NULL;
1209         unsigned long nr_bits, alloc_size;
1210         nodemask_t bm;
1211
1212         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1213         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1214
1215         if (nmask) {
1216                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1217                 nm = compat_alloc_user_space(alloc_size);
1218                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1219         }
1220
1221         if (err)
1222                 return -EFAULT;
1223
1224         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1225 }
1226
1227 #endif
1228
1229 /*
1230  * get_vma_policy(@task, @vma, @addr)
1231  * @task - task for fallback if vma policy == default
1232  * @vma   - virtual memory area whose policy is sought
1233  * @addr  - address in @vma for shared policy lookup
1234  *
1235  * Returns effective policy for a VMA at specified address.
1236  * Falls back to @task or system default policy, as necessary.
1237  * Returned policy has extra reference count if shared, vma,
1238  * or some other task's policy [show_numa_maps() can pass
1239  * @task != current].  It is the caller's responsibility to
1240  * free the reference in these cases.
1241  */
1242 static struct mempolicy * get_vma_policy(struct task_struct *task,
1243                 struct vm_area_struct *vma, unsigned long addr)
1244 {
1245         struct mempolicy *pol = task->mempolicy;
1246         int shared_pol = 0;
1247
1248         if (vma) {
1249                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1250                         pol = vma->vm_ops->get_policy(vma, addr);
1251                         shared_pol = 1; /* if pol non-NULL, add ref below */
1252                 } else if (vma->vm_policy &&
1253                                 vma->vm_policy->policy != MPOL_DEFAULT)
1254                         pol = vma->vm_policy;
1255         }
1256         if (!pol)
1257                 pol = &default_policy;
1258         else if (!shared_pol && pol != current->mempolicy)
1259                 mpol_get(pol);  /* vma or other task's policy */
1260         return pol;
1261 }
1262
1263 /* Return a nodemask representing a mempolicy */
1264 static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1265 {
1266         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1267         if (unlikely(policy->policy == MPOL_BIND) &&
1268                         gfp_zone(gfp) >= policy_zone &&
1269                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1270                 return &policy->v.nodes;
1271
1272         return NULL;
1273 }
1274
1275 /* Return a zonelist representing a mempolicy */
1276 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1277 {
1278         int nd;
1279
1280         switch (policy->policy) {
1281         case MPOL_PREFERRED:
1282                 nd = policy->v.preferred_node;
1283                 if (nd < 0)
1284                         nd = numa_node_id();
1285                 break;
1286         case MPOL_BIND:
1287                 /*
1288                  * Normally, MPOL_BIND allocations node-local are node-local
1289                  * within the allowed nodemask. However, if __GFP_THISNODE is
1290                  * set and the current node is part of the mask, we use the
1291                  * the zonelist for the first node in the mask instead.
1292                  */
1293                 nd = numa_node_id();
1294                 if (unlikely(gfp & __GFP_THISNODE) &&
1295                                 unlikely(!node_isset(nd, policy->v.nodes)))
1296                         nd = first_node(policy->v.nodes);
1297                 break;
1298         case MPOL_INTERLEAVE: /* should not happen */
1299         case MPOL_DEFAULT:
1300                 nd = numa_node_id();
1301                 break;
1302         default:
1303                 nd = 0;
1304                 BUG();
1305         }
1306         return node_zonelist(nd, gfp);
1307 }
1308
1309 /* Do dynamic interleaving for a process */
1310 static unsigned interleave_nodes(struct mempolicy *policy)
1311 {
1312         unsigned nid, next;
1313         struct task_struct *me = current;
1314
1315         nid = me->il_next;
1316         next = next_node(nid, policy->v.nodes);
1317         if (next >= MAX_NUMNODES)
1318                 next = first_node(policy->v.nodes);
1319         if (next < MAX_NUMNODES)
1320                 me->il_next = next;
1321         return nid;
1322 }
1323
1324 /*
1325  * Depending on the memory policy provide a node from which to allocate the
1326  * next slab entry.
1327  */
1328 unsigned slab_node(struct mempolicy *policy)
1329 {
1330         unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1331
1332         switch (pol) {
1333         case MPOL_INTERLEAVE:
1334                 return interleave_nodes(policy);
1335
1336         case MPOL_BIND: {
1337                 /*
1338                  * Follow bind policy behavior and start allocation at the
1339                  * first node.
1340                  */
1341                 struct zonelist *zonelist;
1342                 struct zone *zone;
1343                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1344                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1345                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1346                                                         &policy->v.nodes,
1347                                                         &zone);
1348                 return zone->node;
1349         }
1350
1351         case MPOL_PREFERRED:
1352                 if (policy->v.preferred_node >= 0)
1353                         return policy->v.preferred_node;
1354                 /* Fall through */
1355
1356         default:
1357                 return numa_node_id();
1358         }
1359 }
1360
1361 /* Do static interleaving for a VMA with known offset. */
1362 static unsigned offset_il_node(struct mempolicy *pol,
1363                 struct vm_area_struct *vma, unsigned long off)
1364 {
1365         unsigned nnodes = nodes_weight(pol->v.nodes);
1366         unsigned target;
1367         int c;
1368         int nid = -1;
1369
1370         if (!nnodes)
1371                 return numa_node_id();
1372         target = (unsigned int)off % nnodes;
1373         c = 0;
1374         do {
1375                 nid = next_node(nid, pol->v.nodes);
1376                 c++;
1377         } while (c <= target);
1378         return nid;
1379 }
1380
1381 /* Determine a node number for interleave */
1382 static inline unsigned interleave_nid(struct mempolicy *pol,
1383                  struct vm_area_struct *vma, unsigned long addr, int shift)
1384 {
1385         if (vma) {
1386                 unsigned long off;
1387
1388                 /*
1389                  * for small pages, there is no difference between
1390                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1391                  * for huge pages, since vm_pgoff is in units of small
1392                  * pages, we need to shift off the always 0 bits to get
1393                  * a useful offset.
1394                  */
1395                 BUG_ON(shift < PAGE_SHIFT);
1396                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1397                 off += (addr - vma->vm_start) >> shift;
1398                 return offset_il_node(pol, vma, off);
1399         } else
1400                 return interleave_nodes(pol);
1401 }
1402
1403 #ifdef CONFIG_HUGETLBFS
1404 /*
1405  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1406  * @vma = virtual memory area whose policy is sought
1407  * @addr = address in @vma for shared policy lookup and interleave policy
1408  * @gfp_flags = for requested zone
1409  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1410  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1411  *
1412  * Returns a zonelist suitable for a huge page allocation.
1413  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1414  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1415  * If it is also a policy for which get_vma_policy() returns an extra
1416  * reference, we must hold that reference until after the allocation.
1417  * In that case, return policy via @mpol so hugetlb allocation can drop
1418  * the reference. For non-'BIND referenced policies, we can/do drop the
1419  * reference here, so the caller doesn't need to know about the special case
1420  * for default and current task policy.
1421  */
1422 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1423                                 gfp_t gfp_flags, struct mempolicy **mpol,
1424                                 nodemask_t **nodemask)
1425 {
1426         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1427         struct zonelist *zl;
1428
1429         *mpol = NULL;           /* probably no unref needed */
1430         *nodemask = NULL;       /* assume !MPOL_BIND */
1431         if (pol->policy == MPOL_BIND) {
1432                         *nodemask = &pol->v.nodes;
1433         } else if (pol->policy == MPOL_INTERLEAVE) {
1434                 unsigned nid;
1435
1436                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1437                 if (unlikely(pol != &default_policy &&
1438                                 pol != current->mempolicy))
1439                         __mpol_free(pol);       /* finished with pol */
1440                 return node_zonelist(nid, gfp_flags);
1441         }
1442
1443         zl = zonelist_policy(GFP_HIGHUSER, pol);
1444         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1445                 if (pol->policy != MPOL_BIND)
1446                         __mpol_free(pol);       /* finished with pol */
1447                 else
1448                         *mpol = pol;    /* unref needed after allocation */
1449         }
1450         return zl;
1451 }
1452 #endif
1453
1454 /* Allocate a page in interleaved policy.
1455    Own path because it needs to do special accounting. */
1456 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1457                                         unsigned nid)
1458 {
1459         struct zonelist *zl;
1460         struct page *page;
1461
1462         zl = node_zonelist(nid, gfp);
1463         page = __alloc_pages(gfp, order, zl);
1464         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1465                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1466         return page;
1467 }
1468
1469 /**
1470  *      alloc_page_vma  - Allocate a page for a VMA.
1471  *
1472  *      @gfp:
1473  *      %GFP_USER    user allocation.
1474  *      %GFP_KERNEL  kernel allocations,
1475  *      %GFP_HIGHMEM highmem/user allocations,
1476  *      %GFP_FS      allocation should not call back into a file system.
1477  *      %GFP_ATOMIC  don't sleep.
1478  *
1479  *      @vma:  Pointer to VMA or NULL if not available.
1480  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1481  *
1482  *      This function allocates a page from the kernel page pool and applies
1483  *      a NUMA policy associated with the VMA or the current process.
1484  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1485  *      mm_struct of the VMA to prevent it from going away. Should be used for
1486  *      all allocations for pages that will be mapped into
1487  *      user space. Returns NULL when no page can be allocated.
1488  *
1489  *      Should be called with the mm_sem of the vma hold.
1490  */
1491 struct page *
1492 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1493 {
1494         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1495         struct zonelist *zl;
1496
1497         cpuset_update_task_memory_state();
1498
1499         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1500                 unsigned nid;
1501
1502                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1503                 if (unlikely(pol != &default_policy &&
1504                                 pol != current->mempolicy))
1505                         __mpol_free(pol);       /* finished with pol */
1506                 return alloc_page_interleave(gfp, 0, nid);
1507         }
1508         zl = zonelist_policy(gfp, pol);
1509         if (pol != &default_policy && pol != current->mempolicy) {
1510                 /*
1511                  * slow path: ref counted policy -- shared or vma
1512                  */
1513                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1514                                                 zl, nodemask_policy(gfp, pol));
1515                 __mpol_free(pol);
1516                 return page;
1517         }
1518         /*
1519          * fast path:  default or task policy
1520          */
1521         return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1522 }
1523
1524 /**
1525  *      alloc_pages_current - Allocate pages.
1526  *
1527  *      @gfp:
1528  *              %GFP_USER   user allocation,
1529  *              %GFP_KERNEL kernel allocation,
1530  *              %GFP_HIGHMEM highmem allocation,
1531  *              %GFP_FS     don't call back into a file system.
1532  *              %GFP_ATOMIC don't sleep.
1533  *      @order: Power of two of allocation size in pages. 0 is a single page.
1534  *
1535  *      Allocate a page from the kernel page pool.  When not in
1536  *      interrupt context and apply the current process NUMA policy.
1537  *      Returns NULL when no page can be allocated.
1538  *
1539  *      Don't call cpuset_update_task_memory_state() unless
1540  *      1) it's ok to take cpuset_sem (can WAIT), and
1541  *      2) allocating for current task (not interrupt).
1542  */
1543 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1544 {
1545         struct mempolicy *pol = current->mempolicy;
1546
1547         if ((gfp & __GFP_WAIT) && !in_interrupt())
1548                 cpuset_update_task_memory_state();
1549         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1550                 pol = &default_policy;
1551         if (pol->policy == MPOL_INTERLEAVE)
1552                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1553         return __alloc_pages_nodemask(gfp, order,
1554                         zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1555 }
1556 EXPORT_SYMBOL(alloc_pages_current);
1557
1558 /*
1559  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1560  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1561  * with the mems_allowed returned by cpuset_mems_allowed().  This
1562  * keeps mempolicies cpuset relative after its cpuset moves.  See
1563  * further kernel/cpuset.c update_nodemask().
1564  */
1565
1566 /* Slow path of a mempolicy copy */
1567 struct mempolicy *__mpol_copy(struct mempolicy *old)
1568 {
1569         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1570
1571         if (!new)
1572                 return ERR_PTR(-ENOMEM);
1573         if (current_cpuset_is_being_rebound()) {
1574                 nodemask_t mems = cpuset_mems_allowed(current);
1575                 mpol_rebind_policy(old, &mems);
1576         }
1577         *new = *old;
1578         atomic_set(&new->refcnt, 1);
1579         return new;
1580 }
1581
1582 static int mpol_match_intent(const struct mempolicy *a,
1583                              const struct mempolicy *b)
1584 {
1585         if (a->flags != b->flags)
1586                 return 0;
1587         if (!mpol_store_user_nodemask(a))
1588                 return 1;
1589         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1590 }
1591
1592 /* Slow path of a mempolicy comparison */
1593 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1594 {
1595         if (!a || !b)
1596                 return 0;
1597         if (a->policy != b->policy)
1598                 return 0;
1599         if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1600                 return 0;
1601         switch (a->policy) {
1602         case MPOL_DEFAULT:
1603                 return 1;
1604         case MPOL_BIND:
1605                 /* Fall through */
1606         case MPOL_INTERLEAVE:
1607                 return nodes_equal(a->v.nodes, b->v.nodes);
1608         case MPOL_PREFERRED:
1609                 return a->v.preferred_node == b->v.preferred_node;
1610         default:
1611                 BUG();
1612                 return 0;
1613         }
1614 }
1615
1616 /* Slow path of a mpol destructor. */
1617 void __mpol_free(struct mempolicy *p)
1618 {
1619         if (!atomic_dec_and_test(&p->refcnt))
1620                 return;
1621         p->policy = MPOL_DEFAULT;
1622         kmem_cache_free(policy_cache, p);
1623 }
1624
1625 /*
1626  * Shared memory backing store policy support.
1627  *
1628  * Remember policies even when nobody has shared memory mapped.
1629  * The policies are kept in Red-Black tree linked from the inode.
1630  * They are protected by the sp->lock spinlock, which should be held
1631  * for any accesses to the tree.
1632  */
1633
1634 /* lookup first element intersecting start-end */
1635 /* Caller holds sp->lock */
1636 static struct sp_node *
1637 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1638 {
1639         struct rb_node *n = sp->root.rb_node;
1640
1641         while (n) {
1642                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1643
1644                 if (start >= p->end)
1645                         n = n->rb_right;
1646                 else if (end <= p->start)
1647                         n = n->rb_left;
1648                 else
1649                         break;
1650         }
1651         if (!n)
1652                 return NULL;
1653         for (;;) {
1654                 struct sp_node *w = NULL;
1655                 struct rb_node *prev = rb_prev(n);
1656                 if (!prev)
1657                         break;
1658                 w = rb_entry(prev, struct sp_node, nd);
1659                 if (w->end <= start)
1660                         break;
1661                 n = prev;
1662         }
1663         return rb_entry(n, struct sp_node, nd);
1664 }
1665
1666 /* Insert a new shared policy into the list. */
1667 /* Caller holds sp->lock */
1668 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1669 {
1670         struct rb_node **p = &sp->root.rb_node;
1671         struct rb_node *parent = NULL;
1672         struct sp_node *nd;
1673
1674         while (*p) {
1675                 parent = *p;
1676                 nd = rb_entry(parent, struct sp_node, nd);
1677                 if (new->start < nd->start)
1678                         p = &(*p)->rb_left;
1679                 else if (new->end > nd->end)
1680                         p = &(*p)->rb_right;
1681                 else
1682                         BUG();
1683         }
1684         rb_link_node(&new->nd, parent, p);
1685         rb_insert_color(&new->nd, &sp->root);
1686         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1687                  new->policy ? new->policy->policy : 0);
1688 }
1689
1690 /* Find shared policy intersecting idx */
1691 struct mempolicy *
1692 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1693 {
1694         struct mempolicy *pol = NULL;
1695         struct sp_node *sn;
1696
1697         if (!sp->root.rb_node)
1698                 return NULL;
1699         spin_lock(&sp->lock);
1700         sn = sp_lookup(sp, idx, idx+1);
1701         if (sn) {
1702                 mpol_get(sn->policy);
1703                 pol = sn->policy;
1704         }
1705         spin_unlock(&sp->lock);
1706         return pol;
1707 }
1708
1709 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1710 {
1711         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1712         rb_erase(&n->nd, &sp->root);
1713         mpol_free(n->policy);
1714         kmem_cache_free(sn_cache, n);
1715 }
1716
1717 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1718                                 struct mempolicy *pol)
1719 {
1720         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1721
1722         if (!n)
1723                 return NULL;
1724         n->start = start;
1725         n->end = end;
1726         mpol_get(pol);
1727         n->policy = pol;
1728         return n;
1729 }
1730
1731 /* Replace a policy range. */
1732 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1733                                  unsigned long end, struct sp_node *new)
1734 {
1735         struct sp_node *n, *new2 = NULL;
1736
1737 restart:
1738         spin_lock(&sp->lock);
1739         n = sp_lookup(sp, start, end);
1740         /* Take care of old policies in the same range. */
1741         while (n && n->start < end) {
1742                 struct rb_node *next = rb_next(&n->nd);
1743                 if (n->start >= start) {
1744                         if (n->end <= end)
1745                                 sp_delete(sp, n);
1746                         else
1747                                 n->start = end;
1748                 } else {
1749                         /* Old policy spanning whole new range. */
1750                         if (n->end > end) {
1751                                 if (!new2) {
1752                                         spin_unlock(&sp->lock);
1753                                         new2 = sp_alloc(end, n->end, n->policy);
1754                                         if (!new2)
1755                                                 return -ENOMEM;
1756                                         goto restart;
1757                                 }
1758                                 n->end = start;
1759                                 sp_insert(sp, new2);
1760                                 new2 = NULL;
1761                                 break;
1762                         } else
1763                                 n->end = start;
1764                 }
1765                 if (!next)
1766                         break;
1767                 n = rb_entry(next, struct sp_node, nd);
1768         }
1769         if (new)
1770                 sp_insert(sp, new);
1771         spin_unlock(&sp->lock);
1772         if (new2) {
1773                 mpol_free(new2->policy);
1774                 kmem_cache_free(sn_cache, new2);
1775         }
1776         return 0;
1777 }
1778
1779 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1780                         unsigned short flags, nodemask_t *policy_nodes)
1781 {
1782         info->root = RB_ROOT;
1783         spin_lock_init(&info->lock);
1784
1785         if (policy != MPOL_DEFAULT) {
1786                 struct mempolicy *newpol;
1787
1788                 /* Falls back to MPOL_DEFAULT on any error */
1789                 newpol = mpol_new(policy, flags, policy_nodes);
1790                 if (!IS_ERR(newpol)) {
1791                         /* Create pseudo-vma that contains just the policy */
1792                         struct vm_area_struct pvma;
1793
1794                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1795                         /* Policy covers entire file */
1796                         pvma.vm_end = TASK_SIZE;
1797                         mpol_set_shared_policy(info, &pvma, newpol);
1798                         mpol_free(newpol);
1799                 }
1800         }
1801 }
1802
1803 int mpol_set_shared_policy(struct shared_policy *info,
1804                         struct vm_area_struct *vma, struct mempolicy *npol)
1805 {
1806         int err;
1807         struct sp_node *new = NULL;
1808         unsigned long sz = vma_pages(vma);
1809
1810         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1811                  vma->vm_pgoff,
1812                  sz, npol ? npol->policy : -1,
1813                  npol ? npol->flags : -1,
1814                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1815
1816         if (npol) {
1817                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1818                 if (!new)
1819                         return -ENOMEM;
1820         }
1821         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1822         if (err && new)
1823                 kmem_cache_free(sn_cache, new);
1824         return err;
1825 }
1826
1827 /* Free a backing policy store on inode delete. */
1828 void mpol_free_shared_policy(struct shared_policy *p)
1829 {
1830         struct sp_node *n;
1831         struct rb_node *next;
1832
1833         if (!p->root.rb_node)
1834                 return;
1835         spin_lock(&p->lock);
1836         next = rb_first(&p->root);
1837         while (next) {
1838                 n = rb_entry(next, struct sp_node, nd);
1839                 next = rb_next(&n->nd);
1840                 rb_erase(&n->nd, &p->root);
1841                 mpol_free(n->policy);
1842                 kmem_cache_free(sn_cache, n);
1843         }
1844         spin_unlock(&p->lock);
1845 }
1846
1847 /* assumes fs == KERNEL_DS */
1848 void __init numa_policy_init(void)
1849 {
1850         nodemask_t interleave_nodes;
1851         unsigned long largest = 0;
1852         int nid, prefer = 0;
1853
1854         policy_cache = kmem_cache_create("numa_policy",
1855                                          sizeof(struct mempolicy),
1856                                          0, SLAB_PANIC, NULL);
1857
1858         sn_cache = kmem_cache_create("shared_policy_node",
1859                                      sizeof(struct sp_node),
1860                                      0, SLAB_PANIC, NULL);
1861
1862         /*
1863          * Set interleaving policy for system init. Interleaving is only
1864          * enabled across suitably sized nodes (default is >= 16MB), or
1865          * fall back to the largest node if they're all smaller.
1866          */
1867         nodes_clear(interleave_nodes);
1868         for_each_node_state(nid, N_HIGH_MEMORY) {
1869                 unsigned long total_pages = node_present_pages(nid);
1870
1871                 /* Preserve the largest node */
1872                 if (largest < total_pages) {
1873                         largest = total_pages;
1874                         prefer = nid;
1875                 }
1876
1877                 /* Interleave this node? */
1878                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1879                         node_set(nid, interleave_nodes);
1880         }
1881
1882         /* All too small, use the largest */
1883         if (unlikely(nodes_empty(interleave_nodes)))
1884                 node_set(prefer, interleave_nodes);
1885
1886         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1887                 printk("numa_policy_init: interleaving failed\n");
1888 }
1889
1890 /* Reset policy of current process to default */
1891 void numa_default_policy(void)
1892 {
1893         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1894 }
1895
1896 /*
1897  * Display pages allocated per node and memory policy via /proc.
1898  */
1899 static const char * const policy_types[] =
1900         { "default", "prefer", "bind", "interleave" };
1901
1902 /*
1903  * Convert a mempolicy into a string.
1904  * Returns the number of characters in buffer (if positive)
1905  * or an error (negative)
1906  */
1907 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1908 {
1909         char *p = buffer;
1910         int l;
1911         nodemask_t nodes;
1912         unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1913         unsigned short flags = pol ? pol->flags : 0;
1914
1915         switch (mode) {
1916         case MPOL_DEFAULT:
1917                 nodes_clear(nodes);
1918                 break;
1919
1920         case MPOL_PREFERRED:
1921                 nodes_clear(nodes);
1922                 node_set(pol->v.preferred_node, nodes);
1923                 break;
1924
1925         case MPOL_BIND:
1926                 /* Fall through */
1927         case MPOL_INTERLEAVE:
1928                 nodes = pol->v.nodes;
1929                 break;
1930
1931         default:
1932                 BUG();
1933                 return -EFAULT;
1934         }
1935
1936         l = strlen(policy_types[mode]);
1937         if (buffer + maxlen < p + l + 1)
1938                 return -ENOSPC;
1939
1940         strcpy(p, policy_types[mode]);
1941         p += l;
1942
1943         if (flags) {
1944                 int need_bar = 0;
1945
1946                 if (buffer + maxlen < p + 2)
1947                         return -ENOSPC;
1948                 *p++ = '=';
1949
1950                 if (flags & MPOL_F_STATIC_NODES)
1951                         p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1952                 if (flags & MPOL_F_RELATIVE_NODES)
1953                         p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1954         }
1955
1956         if (!nodes_empty(nodes)) {
1957                 if (buffer + maxlen < p + 2)
1958                         return -ENOSPC;
1959                 *p++ = '=';
1960                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1961         }
1962         return p - buffer;
1963 }
1964
1965 struct numa_maps {
1966         unsigned long pages;
1967         unsigned long anon;
1968         unsigned long active;
1969         unsigned long writeback;
1970         unsigned long mapcount_max;
1971         unsigned long dirty;
1972         unsigned long swapcache;
1973         unsigned long node[MAX_NUMNODES];
1974 };
1975
1976 static void gather_stats(struct page *page, void *private, int pte_dirty)
1977 {
1978         struct numa_maps *md = private;
1979         int count = page_mapcount(page);
1980
1981         md->pages++;
1982         if (pte_dirty || PageDirty(page))
1983                 md->dirty++;
1984
1985         if (PageSwapCache(page))
1986                 md->swapcache++;
1987
1988         if (PageActive(page))
1989                 md->active++;
1990
1991         if (PageWriteback(page))
1992                 md->writeback++;
1993
1994         if (PageAnon(page))
1995                 md->anon++;
1996
1997         if (count > md->mapcount_max)
1998                 md->mapcount_max = count;
1999
2000         md->node[page_to_nid(page)]++;
2001 }
2002
2003 #ifdef CONFIG_HUGETLB_PAGE
2004 static void check_huge_range(struct vm_area_struct *vma,
2005                 unsigned long start, unsigned long end,
2006                 struct numa_maps *md)
2007 {
2008         unsigned long addr;
2009         struct page *page;
2010
2011         for (addr = start; addr < end; addr += HPAGE_SIZE) {
2012                 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
2013                 pte_t pte;
2014
2015                 if (!ptep)
2016                         continue;
2017
2018                 pte = *ptep;
2019                 if (pte_none(pte))
2020                         continue;
2021
2022                 page = pte_page(pte);
2023                 if (!page)
2024                         continue;
2025
2026                 gather_stats(page, md, pte_dirty(*ptep));
2027         }
2028 }
2029 #else
2030 static inline void check_huge_range(struct vm_area_struct *vma,
2031                 unsigned long start, unsigned long end,
2032                 struct numa_maps *md)
2033 {
2034 }
2035 #endif
2036
2037 int show_numa_map(struct seq_file *m, void *v)
2038 {
2039         struct proc_maps_private *priv = m->private;
2040         struct vm_area_struct *vma = v;
2041         struct numa_maps *md;
2042         struct file *file = vma->vm_file;
2043         struct mm_struct *mm = vma->vm_mm;
2044         struct mempolicy *pol;
2045         int n;
2046         char buffer[50];
2047
2048         if (!mm)
2049                 return 0;
2050
2051         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2052         if (!md)
2053                 return 0;
2054
2055         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2056         mpol_to_str(buffer, sizeof(buffer), pol);
2057         /*
2058          * unref shared or other task's mempolicy
2059          */
2060         if (pol != &default_policy && pol != current->mempolicy)
2061                 __mpol_free(pol);
2062
2063         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2064
2065         if (file) {
2066                 seq_printf(m, " file=");
2067                 seq_path(m, &file->f_path, "\n\t= ");
2068         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2069                 seq_printf(m, " heap");
2070         } else if (vma->vm_start <= mm->start_stack &&
2071                         vma->vm_end >= mm->start_stack) {
2072                 seq_printf(m, " stack");
2073         }
2074
2075         if (is_vm_hugetlb_page(vma)) {
2076                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2077                 seq_printf(m, " huge");
2078         } else {
2079                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2080                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2081         }
2082
2083         if (!md->pages)
2084                 goto out;
2085
2086         if (md->anon)
2087                 seq_printf(m," anon=%lu",md->anon);
2088
2089         if (md->dirty)
2090                 seq_printf(m," dirty=%lu",md->dirty);
2091
2092         if (md->pages != md->anon && md->pages != md->dirty)
2093                 seq_printf(m, " mapped=%lu", md->pages);
2094
2095         if (md->mapcount_max > 1)
2096                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2097
2098         if (md->swapcache)
2099                 seq_printf(m," swapcache=%lu", md->swapcache);
2100
2101         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2102                 seq_printf(m," active=%lu", md->active);
2103
2104         if (md->writeback)
2105                 seq_printf(m," writeback=%lu", md->writeback);
2106
2107         for_each_node_state(n, N_HIGH_MEMORY)
2108                 if (md->node[n])
2109                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2110 out:
2111         seq_putc(m, '\n');
2112         kfree(md);
2113
2114         if (m->count < m->size)
2115                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2116         return 0;
2117 }