mm/percpu.c

   1 /*
   2  * linux/mm/percpu.c - percpu memory allocator
   3  *
   4  * Copyright (C) 2009           SUSE Linux Products GmbH
   5  * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
   6  *
   7  * This file is released under the GPLv2.
   8  *
   9  * This is percpu allocator which can handle both static and dynamic
  10  * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
  11  * chunk consists of num_possible_cpus() units and the first chunk
  12  * is used for static percpu variables in the kernel image (special
  13  * boot time alloc/init handling necessary as these areas need to be
  14  * brought up before allocation services are running).  Unit grows as
  15  * necessary and all units grow or shrink in unison.  When a chunk is
  16  * filled up, another chunk is allocated.  ie. in vmalloc area
  17  *
  18  *  c0                           c1                         c2
  19  *  -------------------          -------------------        ------------
  20  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
  21  *  -------------------  ......  -------------------  ....  ------------
  22  *
  23  * Allocation is done in offset-size areas of single unit space.  Ie,
  24  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  25  * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
  26  * percpu base registers UNIT_SIZE apart.
  27  *
  28  * There are usually many small percpu allocations, many of them as
  29  * small as 4 bytes.  The allocator organizes chunks into lists
  30  * according to free size and tries to allocate from the fullest one.
  31  * Each chunk keeps the maximum contiguous area size hint which is
  32  * guaranteed to be equal to or larger than the maximum contiguous
  33  * area in the chunk.  This helps the allocator not to iterate the
  34  * chunk maps unnecessarily.
  35  *
  36  * Allocation state in each chunk is kept using an array of integers
  37  * on chunk->map.  A positive value in the map represents a free
  38  * region and negative allocated.  Allocation inside a chunk is done
  39  * by scanning this map sequentially and serving the first matching
  40  * entry.  This is mostly copied from the percpu_modalloc() allocator.
  41  * Chunks are also linked into a rb tree to ease address to chunk
  42  * mapping during free.
  43  *
  44  * To use this allocator, arch code should do the following.
  45  *
  46  * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
  47  *
  48  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  49  *   regular address to percpu pointer and back
  50  *
  51  * - use pcpu_setup_first_chunk() during percpu area initialization to
  52  *   setup the first chunk containing the kernel static percpu area
  53  */
  54
  55 #include <linux/bitmap.h>
  56 #include <linux/bootmem.h>
  57 #include <linux/list.h>
  58 #include <linux/mm.h>
  59 #include <linux/module.h>
  60 #include <linux/mutex.h>
  61 #include <linux/percpu.h>
  62 #include <linux/pfn.h>
  63 #include <linux/rbtree.h>
  64 #include <linux/slab.h>
  65 #include <linux/vmalloc.h>
  66
  67 #include <asm/cacheflush.h>
  68 #include <asm/tlbflush.h>
  69
  70 #define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 shares the same slot */
  71 #define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */
  72
  73 struct pcpu_chunk {
  74         struct list_head        list;           /* linked to pcpu_slot lists */
  75         struct rb_node          rb_node;        /* key is chunk->vm->addr */
  76         int                     free_size;      /* free bytes in the chunk */
  77         int                     contig_hint;    /* max contiguous size hint */
  78         struct vm_struct        *vm;            /* mapped vmalloc region */
  79         int                     map_used;       /* # of map entries used */
  80         int                     map_alloc;      /* # of map entries allocated */
  81         int                     *map;           /* allocation map */
  82         bool                    immutable;      /* no [de]population allowed */
  83         struct page             **page;         /* points to page array */
  84         struct page             *page_ar[];     /* #cpus * UNIT_PAGES */
  85 };
  86
  87 static int pcpu_unit_pages __read_mostly;
  88 static int pcpu_unit_size __read_mostly;
  89 static int pcpu_chunk_size __read_mostly;
  90 static int pcpu_nr_slots __read_mostly;
  91 static size_t pcpu_chunk_struct_size __read_mostly;
  92
  93 /* the address of the first chunk which starts with the kernel static area */
  94 void *pcpu_base_addr __read_mostly;
  95 EXPORT_SYMBOL_GPL(pcpu_base_addr);
  96
  97 /*
  98  * One mutex to rule them all.
  99  *
 100  * The following mutex is grabbed in the outermost public alloc/free
 101  * interface functions and released only when the operation is
 102  * complete.  As such, every function in this file other than the
 103  * outermost functions are called under pcpu_mutex.
 104  *
 105  * It can easily be switched to use spinlock such that only the area
 106  * allocation and page population commit are protected with it doing
 107  * actual [de]allocation without holding any lock.  However, given
 108  * what this allocator does, I think it's better to let them run
 109  * sequentially.
 110  */
 111 static DEFINE_MUTEX(pcpu_mutex);
 112
 113 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 114 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
 115
 116 static int __pcpu_size_to_slot(int size)
 117 {
 118         int highbit = fls(size);        /* size is in bytes */
 119         return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
 120 }
 121
 122 static int pcpu_size_to_slot(int size)
 123 {
 124         if (size == pcpu_unit_size)
 125                 return pcpu_nr_slots - 1;
 126         return __pcpu_size_to_slot(size);
 127 }
 128
 129 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 130 {
 131         if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
 132                 return 0;
 133
 134         return pcpu_size_to_slot(chunk->free_size);
 135 }
 136
 137 static int pcpu_page_idx(unsigned int cpu, int page_idx)
 138 {
 139         return cpu * pcpu_unit_pages + page_idx;
 140 }
 141
 142 static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
 143                                       unsigned int cpu, int page_idx)
 144 {
 145         return &chunk->page[pcpu_page_idx(cpu, page_idx)];
 146 }
 147
 148 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 149                                      unsigned int cpu, int page_idx)
 150 {
 151         return (unsigned long)chunk->vm->addr +
 152                 (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
 153 }
 154
 155 static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 156                                      int page_idx)
 157 {
 158         return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 159 }
 160
 161 /**
 162  * pcpu_realloc - versatile realloc
 163  * @p: the current pointer (can be NULL for new allocations)
 164  * @size: the current size in bytes (can be 0 for new allocations)
 165  * @new_size: the wanted new size in bytes (can be 0 for free)
 166  *
 167  * More robust realloc which can be used to allocate, resize or free a
 168  * memory area of arbitrary size.  If the needed size goes over
 169  * PAGE_SIZE, kernel VM is used.
 170  *
 171  * RETURNS:
 172  * The new pointer on success, NULL on failure.
 173  */
 174 static void *pcpu_realloc(void *p, size_t size, size_t new_size)
 175 {
 176         void *new;
 177
 178         if (new_size <= PAGE_SIZE)
 179                 new = kmalloc(new_size, GFP_KERNEL);
 180         else
 181                 new = vmalloc(new_size);
 182         if (new_size && !new)
 183                 return NULL;
 184
 185         memcpy(new, p, min(size, new_size));
 186         if (new_size > size)
 187                 memset(new + size, 0, new_size - size);
 188
 189         if (size <= PAGE_SIZE)
 190                 kfree(p);
 191         else
 192                 vfree(p);
 193
 194         return new;
 195 }
 196
 197 /**
 198  * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 199  * @chunk: chunk of interest
 200  * @oslot: the previous slot it was on
 201  *
 202  * This function is called after an allocation or free changed @chunk.
 203  * New slot according to the changed state is determined and @chunk is
 204  * moved to the slot.
 205  */
 206 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 207 {
 208         int nslot = pcpu_chunk_slot(chunk);
 209
 210         if (oslot != nslot) {
 211                 if (oslot < nslot)
 212                         list_move(&chunk->list, &pcpu_slot[nslot]);
 213                 else
 214                         list_move_tail(&chunk->list, &pcpu_slot[nslot]);
 215         }
 216 }
 217
 218 static struct rb_node **pcpu_chunk_rb_search(void *addr,
 219                                              struct rb_node **parentp)
 220 {
 221         struct rb_node **p = &pcpu_addr_root.rb_node;
 222         struct rb_node *parent = NULL;
 223         struct pcpu_chunk *chunk;
 224
 225         while (*p) {
 226                 parent = *p;
 227                 chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
 228
 229                 if (addr < chunk->vm->addr)
 230                         p = &(*p)->rb_left;
 231                 else if (addr > chunk->vm->addr)
 232                         p = &(*p)->rb_right;
 233                 else
 234                         break;
 235         }
 236
 237         if (parentp)
 238                 *parentp = parent;
 239         return p;
 240 }
 241
 242 /**
 243  * pcpu_chunk_addr_search - search for chunk containing specified address
 244  * @addr: address to search for
 245  *
 246  * Look for chunk which might contain @addr.  More specifically, it
 247  * searchs for the chunk with the highest start address which isn't
 248  * beyond @addr.
 249  *
 250  * RETURNS:
 251  * The address of the found chunk.
 252  */
 253 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 254 {
 255         struct rb_node *n, *parent;
 256         struct pcpu_chunk *chunk;
 257
 258         n = *pcpu_chunk_rb_search(addr, &parent);
 259         if (!n) {
 260                 /* no exactly matching chunk, the parent is the closest */
 261                 n = parent;
 262                 BUG_ON(!n);
 263         }
 264         chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 265
 266         if (addr < chunk->vm->addr) {
 267                 /* the parent was the next one, look for the previous one */
 268                 n = rb_prev(n);
 269                 BUG_ON(!n);
 270                 chunk = rb_entry(n, struct pcpu_chunk, rb_node);
 271         }
 272
 273         return chunk;
 274 }
 275
 276 /**
 277  * pcpu_chunk_addr_insert - insert chunk into address rb tree
 278  * @new: chunk to insert
 279  *
 280  * Insert @new into address rb tree.
 281  */
 282 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 283 {
 284         struct rb_node **p, *parent;
 285
 286         p = pcpu_chunk_rb_search(new->vm->addr, &parent);
 287         BUG_ON(*p);
 288         rb_link_node(&new->rb_node, parent, p);
 289         rb_insert_color(&new->rb_node, &pcpu_addr_root);
 290 }
 291
 292 /**
 293  * pcpu_split_block - split a map block
 294  * @chunk: chunk of interest
 295  * @i: index of map block to split
 296  * @head: head size in bytes (can be 0)
 297  * @tail: tail size in bytes (can be 0)
 298  *
 299  * Split the @i'th map block into two or three blocks.  If @head is
 300  * non-zero, @head bytes block is inserted before block @i moving it
 301  * to @i+1 and reducing its size by @head bytes.
 302  *
 303  * If @tail is non-zero, the target block, which can be @i or @i+1
 304  * depending on @head, is reduced by @tail bytes and @tail byte block
 305  * is inserted after the target block.
 306  *
 307  * RETURNS:
 308  * 0 on success, -errno on failure.
 309  */
 310 static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
 311 {
 312         int nr_extra = !!head + !!tail;
 313         int target = chunk->map_used + nr_extra;
 314
 315         /* reallocation required? */
 316         if (chunk->map_alloc < target) {
 317                 int new_alloc;
 318                 int *new;
 319
 320                 new_alloc = PCPU_DFL_MAP_ALLOC;
 321                 while (new_alloc < target)
 322                         new_alloc *= 2;
 323
 324                 if (chunk->map_alloc < PCPU_DFL_MAP_ALLOC) {
 325                         /*
 326                          * map_alloc smaller than the default size
 327                          * indicates that the chunk is one of the
 328                          * first chunks and still using static map.
 329                          * Allocate a dynamic one and copy.
 330                          */
 331                         new = pcpu_realloc(NULL, 0, new_alloc * sizeof(new[0]));
 332                         if (new)
 333                                 memcpy(new, chunk->map,
 334                                        chunk->map_alloc * sizeof(new[0]));
 335                 } else
 336                         new = pcpu_realloc(chunk->map,
 337                                            chunk->map_alloc * sizeof(new[0]),
 338                                            new_alloc * sizeof(new[0]));
 339                 if (!new)
 340                         return -ENOMEM;
 341
 342                 chunk->map_alloc = new_alloc;
 343                 chunk->map = new;
 344         }
 345
 346         /* insert a new subblock */
 347         memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 348                 sizeof(chunk->map[0]) * (chunk->map_used - i));
 349         chunk->map_used += nr_extra;
 350
 351         if (head) {
 352                 chunk->map[i + 1] = chunk->map[i] - head;
 353                 chunk->map[i++] = head;
 354         }
 355         if (tail) {
 356                 chunk->map[i++] -= tail;
 357                 chunk->map[i] = tail;
 358         }
 359         return 0;
 360 }
 361
 362 /**
 363  * pcpu_alloc_area - allocate area from a pcpu_chunk
 364  * @chunk: chunk of interest
 365  * @size: wanted size in bytes
 366  * @align: wanted align
 367  *
 368  * Try to allocate @size bytes area aligned at @align from @chunk.
 369  * Note that this function only allocates the offset.  It doesn't
 370  * populate or map the area.
 371  *
 372  * RETURNS:
 373  * Allocated offset in @chunk on success, -errno on failure.
 374  */
 375 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 376 {
 377         int oslot = pcpu_chunk_slot(chunk);
 378         int max_contig = 0;
 379         int i, off;
 380
 381         for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 382                 bool is_last = i + 1 == chunk->map_used;
 383                 int head, tail;
 384
 385                 /* extra for alignment requirement */
 386                 head = ALIGN(off, align) - off;
 387                 BUG_ON(i == 0 && head != 0);
 388
 389                 if (chunk->map[i] < 0)
 390                         continue;
 391                 if (chunk->map[i] < head + size) {
 392                         max_contig = max(chunk->map[i], max_contig);
 393                         continue;
 394                 }
 395
 396                 /*
 397                  * If head is small or the previous block is free,
 398                  * merge'em.  Note that 'small' is defined as smaller
 399                  * than sizeof(int), which is very small but isn't too
 400                  * uncommon for percpu allocations.
 401                  */
 402                 if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
 403                         if (chunk->map[i - 1] > 0)
 404                                 chunk->map[i - 1] += head;
 405                         else {
 406                                 chunk->map[i - 1] -= head;
 407                                 chunk->free_size -= head;
 408                         }
 409                         chunk->map[i] -= head;
 410                         off += head;
 411                         head = 0;
 412                 }
 413
 414                 /* if tail is small, just keep it around */
 415                 tail = chunk->map[i] - head - size;
 416                 if (tail < sizeof(int))
 417                         tail = 0;
 418
 419                 /* split if warranted */
 420                 if (head || tail) {
 421                         if (pcpu_split_block(chunk, i, head, tail))
 422                                 return -ENOMEM;
 423                         if (head) {
 424                                 i++;
 425                                 off += head;
 426                                 max_contig = max(chunk->map[i - 1], max_contig);
 427                         }
 428                         if (tail)
 429                                 max_contig = max(chunk->map[i + 1], max_contig);
 430                 }
 431
 432                 /* update hint and mark allocated */
 433                 if (is_last)
 434                         chunk->contig_hint = max_contig; /* fully scanned */
 435                 else
 436                         chunk->contig_hint = max(chunk->contig_hint,
 437                                                  max_contig);
 438
 439                 chunk->free_size -= chunk->map[i];
 440                 chunk->map[i] = -chunk->map[i];
 441
 442                 pcpu_chunk_relocate(chunk, oslot);
 443                 return off;
 444         }
 445
 446         chunk->contig_hint = max_contig;        /* fully scanned */
 447         pcpu_chunk_relocate(chunk, oslot);
 448
 449         /*
 450          * Tell the upper layer that this chunk has no area left.
 451          * Note that this is not an error condition but a notification
 452          * to upper layer that it needs to look at other chunks.
 453          * -ENOSPC is chosen as it isn't used in memory subsystem and
 454          * matches the meaning in a way.
 455          */
 456         return -ENOSPC;
 457 }
 458
 459 /**
 460  * pcpu_free_area - free area to a pcpu_chunk
 461  * @chunk: chunk of interest
 462  * @freeme: offset of area to free
 463  *
 464  * Free area starting from @freeme to @chunk.  Note that this function
 465  * only modifies the allocation map.  It doesn't depopulate or unmap
 466  * the area.
 467  */
 468 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 469 {
 470         int oslot = pcpu_chunk_slot(chunk);
 471         int i, off;
 472
 473         for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
 474                 if (off == freeme)
 475                         break;
 476         BUG_ON(off != freeme);
 477         BUG_ON(chunk->map[i] > 0);
 478
 479         chunk->map[i] = -chunk->map[i];
 480         chunk->free_size += chunk->map[i];
 481
 482         /* merge with previous? */
 483         if (i > 0 && chunk->map[i - 1] >= 0) {
 484                 chunk->map[i - 1] += chunk->map[i];
 485                 chunk->map_used--;
 486                 memmove(&chunk->map[i], &chunk->map[i + 1],
 487                         (chunk->map_used - i) * sizeof(chunk->map[0]));
 488                 i--;
 489         }
 490         /* merge with next? */
 491         if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
 492                 chunk->map[i] += chunk->map[i + 1];
 493                 chunk->map_used--;
 494                 memmove(&chunk->map[i + 1], &chunk->map[i + 2],
 495                         (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
 496         }
 497
 498         chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
 499         pcpu_chunk_relocate(chunk, oslot);
 500 }
 501
 502 /**
 503  * pcpu_unmap - unmap pages out of a pcpu_chunk
 504  * @chunk: chunk of interest
 505  * @page_start: page index of the first page to unmap
 506  * @page_end: page index of the last page to unmap + 1
 507  * @flush: whether to flush cache and tlb or not
 508  *
 509  * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 510  * If @flush is true, vcache is flushed before unmapping and tlb
 511  * after.
 512  */
 513 static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 514                        bool flush)
 515 {
 516         unsigned int last = num_possible_cpus() - 1;
 517         unsigned int cpu;
 518
 519         /* unmap must not be done on immutable chunk */
 520         WARN_ON(chunk->immutable);
 521
 522         /*
 523          * Each flushing trial can be very expensive, issue flush on
 524          * the whole region at once rather than doing it for each cpu.
 525          * This could be an overkill but is more scalable.
 526          */
 527         if (flush)
 528                 flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
 529                                    pcpu_chunk_addr(chunk, last, page_end));
 530
 531         for_each_possible_cpu(cpu)
 532                 unmap_kernel_range_noflush(
 533                                 pcpu_chunk_addr(chunk, cpu, page_start),
 534                                 (page_end - page_start) << PAGE_SHIFT);
 535
 536         /* ditto as flush_cache_vunmap() */
 537         if (flush)
 538                 flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
 539                                        pcpu_chunk_addr(chunk, last, page_end));
 540 }
 541
 542 /**
 543  * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 544  * @chunk: chunk to depopulate
 545  * @off: offset to the area to depopulate
 546  * @size: size of the area to depopulate in bytes
 547  * @flush: whether to flush cache and tlb or not
 548  *
 549  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 550  * from @chunk.  If @flush is true, vcache is flushed before unmapping
 551  * and tlb after.
 552  */
 553 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 554                                   bool flush)
 555 {
 556         int page_start = PFN_DOWN(off);
 557         int page_end = PFN_UP(off + size);
 558         int unmap_start = -1;
 559         int uninitialized_var(unmap_end);
 560         unsigned int cpu;
 561         int i;
 562
 563         for (i = page_start; i < page_end; i++) {
 564                 for_each_possible_cpu(cpu) {
 565                         struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 566
 567                         if (!*pagep)
 568                                 continue;
 569
 570                         __free_page(*pagep);
 571
 572                         /*
 573                          * If it's partial depopulation, it might get
 574                          * populated or depopulated again.  Mark the
 575                          * page gone.
 576                          */
 577                         *pagep = NULL;
 578
 579                         unmap_start = unmap_start < 0 ? i : unmap_start;
 580                         unmap_end = i + 1;
 581                 }
 582         }
 583
 584         if (unmap_start >= 0)
 585                 pcpu_unmap(chunk, unmap_start, unmap_end, flush);
 586 }
 587
 588 /**
 589  * pcpu_map - map pages into a pcpu_chunk
 590  * @chunk: chunk of interest
 591  * @page_start: page index of the first page to map
 592  * @page_end: page index of the last page to map + 1
 593  *
 594  * For each cpu, map pages [@page_start,@page_end) into @chunk.
 595  * vcache is flushed afterwards.
 596  */
 597 static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 598 {
 599         unsigned int last = num_possible_cpus() - 1;
 600         unsigned int cpu;
 601         int err;
 602
 603         /* map must not be done on immutable chunk */
 604         WARN_ON(chunk->immutable);
 605
 606         for_each_possible_cpu(cpu) {
 607                 err = map_kernel_range_noflush(
 608                                 pcpu_chunk_addr(chunk, cpu, page_start),
 609                                 (page_end - page_start) << PAGE_SHIFT,
 610                                 PAGE_KERNEL,
 611                                 pcpu_chunk_pagep(chunk, cpu, page_start));
 612                 if (err < 0)
 613                         return err;
 614         }
 615
 616         /* flush at once, please read comments in pcpu_unmap() */
 617         flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
 618                          pcpu_chunk_addr(chunk, last, page_end));
 619         return 0;
 620 }
 621
 622 /**
 623  * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 624  * @chunk: chunk of interest
 625  * @off: offset to the area to populate
 626  * @size: size of the area to populate in bytes
 627  *
 628  * For each cpu, populate and map pages [@page_start,@page_end) into
 629  * @chunk.  The area is cleared on return.
 630  */
 631 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 632 {
 633         const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 634         int page_start = PFN_DOWN(off);
 635         int page_end = PFN_UP(off + size);
 636         int map_start = -1;
 637         int uninitialized_var(map_end);
 638         unsigned int cpu;
 639         int i;
 640
 641         for (i = page_start; i < page_end; i++) {
 642                 if (pcpu_chunk_page_occupied(chunk, i)) {
 643                         if (map_start >= 0) {
 644                                 if (pcpu_map(chunk, map_start, map_end))
 645                                         goto err;
 646                                 map_start = -1;
 647                         }
 648                         continue;
 649                 }
 650
 651                 map_start = map_start < 0 ? i : map_start;
 652                 map_end = i + 1;
 653
 654                 for_each_possible_cpu(cpu) {
 655                         struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
 656
 657                         *pagep = alloc_pages_node(cpu_to_node(cpu),
 658                                                   alloc_mask, 0);
 659                         if (!*pagep)
 660                                 goto err;
 661                 }
 662         }
 663
 664         if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
 665                 goto err;
 666
 667         for_each_possible_cpu(cpu)
 668                 memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
 669                        size);
 670
 671         return 0;
 672 err:
 673         /* likely under heavy memory pressure, give memory back */
 674         pcpu_depopulate_chunk(chunk, off, size, true);
 675         return -ENOMEM;
 676 }
 677
 678 static void free_pcpu_chunk(struct pcpu_chunk *chunk)
 679 {
 680         if (!chunk)
 681                 return;
 682         if (chunk->vm)
 683                 free_vm_area(chunk->vm);
 684         pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
 685         kfree(chunk);
 686 }
 687
 688 static struct pcpu_chunk *alloc_pcpu_chunk(void)
 689 {
 690         struct pcpu_chunk *chunk;
 691
 692         chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
 693         if (!chunk)
 694                 return NULL;
 695
 696         chunk->map = pcpu_realloc(NULL, 0,
 697                                   PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
 698         chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 699         chunk->map[chunk->map_used++] = pcpu_unit_size;
 700         chunk->page = chunk->page_ar;
 701
 702         chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
 703         if (!chunk->vm) {
 704                 free_pcpu_chunk(chunk);
 705                 return NULL;
 706         }
 707
 708         INIT_LIST_HEAD(&chunk->list);
 709         chunk->free_size = pcpu_unit_size;
 710         chunk->contig_hint = pcpu_unit_size;
 711
 712         return chunk;
 713 }
 714
 715 /**
 716  * __alloc_percpu - allocate percpu area
 717  * @size: size of area to allocate in bytes
 718  * @align: alignment of area (max PAGE_SIZE)
 719  *
 720  * Allocate percpu area of @size bytes aligned at @align.  Might
 721  * sleep.  Might trigger writeouts.
 722  *
 723  * RETURNS:
 724  * Percpu pointer to the allocated area on success, NULL on failure.
 725  */
 726 void *__alloc_percpu(size_t size, size_t align)
 727 {
 728         void *ptr = NULL;
 729         struct pcpu_chunk *chunk;
 730         int slot, off;
 731
 732         if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 733                 WARN(true, "illegal size (%zu) or align (%zu) for "
 734                      "percpu allocation\n", size, align);
 735                 return NULL;
 736         }
 737
 738         mutex_lock(&pcpu_mutex);
 739
 740         /* allocate area */
 741         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 742                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 743                         if (size > chunk->contig_hint)
 744                                 continue;
 745                         off = pcpu_alloc_area(chunk, size, align);
 746                         if (off >= 0)
 747                                 goto area_found;
 748                         if (off != -ENOSPC)
 749                                 goto out_unlock;
 750                 }
 751         }
 752
 753         /* hmmm... no space left, create a new chunk */
 754         chunk = alloc_pcpu_chunk();
 755         if (!chunk)
 756                 goto out_unlock;
 757         pcpu_chunk_relocate(chunk, -1);
 758         pcpu_chunk_addr_insert(chunk);
 759
 760         off = pcpu_alloc_area(chunk, size, align);
 761         if (off < 0)
 762                 goto out_unlock;
 763
 764 area_found:
 765         /* populate, map and clear the area */
 766         if (pcpu_populate_chunk(chunk, off, size)) {
 767                 pcpu_free_area(chunk, off);
 768                 goto out_unlock;
 769         }
 770
 771         ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
 772 out_unlock:
 773         mutex_unlock(&pcpu_mutex);
 774         return ptr;
 775 }
 776 EXPORT_SYMBOL_GPL(__alloc_percpu);
 777
 778 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 779 {
 780         WARN_ON(chunk->immutable);
 781         pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
 782         list_del(&chunk->list);
 783         rb_erase(&chunk->rb_node, &pcpu_addr_root);
 784         free_pcpu_chunk(chunk);
 785 }
 786
 787 /**
 788  * free_percpu - free percpu area
 789  * @ptr: pointer to area to free
 790  *
 791  * Free percpu area @ptr.  Might sleep.
 792  */
 793 void free_percpu(void *ptr)
 794 {
 795         void *addr = __pcpu_ptr_to_addr(ptr);
 796         struct pcpu_chunk *chunk;
 797         int off;
 798
 799         if (!ptr)
 800                 return;
 801
 802         mutex_lock(&pcpu_mutex);
 803
 804         chunk = pcpu_chunk_addr_search(addr);
 805         off = addr - chunk->vm->addr;
 806
 807         pcpu_free_area(chunk, off);
 808
 809         /* the chunk became fully free, kill one if there are other free ones */
 810         if (chunk->free_size == pcpu_unit_size) {
 811                 struct pcpu_chunk *pos;
 812
 813                 list_for_each_entry(pos,
 814                                     &pcpu_slot[pcpu_chunk_slot(chunk)], list)
 815                         if (pos != chunk) {
 816                                 pcpu_kill_chunk(pos);
 817                                 break;
 818                         }
 819         }
 820
 821         mutex_unlock(&pcpu_mutex);
 822 }
 823 EXPORT_SYMBOL_GPL(free_percpu);
 824
 825 /**
 826  * pcpu_setup_first_chunk - initialize the first percpu chunk
 827  * @get_page_fn: callback to fetch page pointer
 828  * @static_size: the size of static percpu area in bytes
 829  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
 830  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 831  * @base_addr: mapped address, NULL for auto
 832  * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
 833  *
 834  * Initialize the first percpu chunk which contains the kernel static
 835  * perpcu area.  This function is to be called from arch percpu area
 836  * setup path.  The first two parameters are mandatory.  The rest are
 837  * optional.
 838  *
 839  * @get_page_fn() should return pointer to percpu page given cpu
 840  * number and page number.  It should at least return enough pages to
 841  * cover the static area.  The returned pages for static area should
 842  * have been initialized with valid data.  If @unit_size is specified,
 843  * it can also return pages after the static area.  NULL return
 844  * indicates end of pages for the cpu.  Note that @get_page_fn() must
 845  * return the same number of pages for all cpus.
 846  *
 847  * @unit_size, if non-negative, specifies unit size and must be
 848  * aligned to PAGE_SIZE and equal to or larger than @static_size +
 849  * @dyn_size.
 850  *
 851  * @dyn_size, if non-negative, limits the number of bytes available
 852  * for dynamic allocation in the first chunk.  Specifying non-negative
 853  * value make percpu leave alone the area beyond @static_size +
 854  * @dyn_size.
 855  *
 856  * Non-null @base_addr means that the caller already allocated virtual
 857  * region for the first chunk and mapped it.  percpu must not mess
 858  * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 859  * @populate_pte_fn doesn't make any sense.
 860  *
 861  * @populate_pte_fn is used to populate the pagetable.  NULL means the
 862  * caller already populated the pagetable.
 863  *
 864  * RETURNS:
 865  * The determined pcpu_unit_size which can be used to initialize
 866  * percpu access.
 867  */
 868 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 869                                      size_t static_size,
 870                                      ssize_t unit_size, ssize_t dyn_size,
 871                                      void *base_addr,
 872                                      pcpu_populate_pte_fn_t populate_pte_fn)
 873 {
 874         static struct vm_struct first_vm;
 875         static int smap[2];
 876         struct pcpu_chunk *schunk;
 877         unsigned int cpu;
 878         int nr_pages;
 879         int err, i;
 880
 881         /* santiy checks */
 882         BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
 883         BUG_ON(!static_size);
 884         if (unit_size >= 0) {
 885                 BUG_ON(unit_size < static_size +
 886                                    (dyn_size >= 0 ? dyn_size : 0));
 887                 BUG_ON(unit_size & ~PAGE_MASK);
 888         } else {
 889                 BUG_ON(dyn_size >= 0);
 890                 BUG_ON(base_addr);
 891         }
 892         BUG_ON(base_addr && populate_pte_fn);
 893
 894         if (unit_size >= 0)
 895                 pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 896         else
 897                 pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
 898                                         PFN_UP(static_size));
 899
 900         pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 901         pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
 902         pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
 903                 + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 904
 905         if (dyn_size < 0)
 906                 dyn_size = pcpu_unit_size - static_size;
 907
 908         /*
 909          * Allocate chunk slots.  The additional last slot is for
 910          * empty chunks.
 911          */
 912         pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
 913         pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
 914         for (i = 0; i < pcpu_nr_slots; i++)
 915                 INIT_LIST_HEAD(&pcpu_slot[i]);
 916
 917         /* init static chunk */
 918         schunk = alloc_bootmem(pcpu_chunk_struct_size);
 919         INIT_LIST_HEAD(&schunk->list);
 920         schunk->vm = &first_vm;
 921         schunk->map = smap;
 922         schunk->map_alloc = ARRAY_SIZE(smap);
 923         schunk->page = schunk->page_ar;
 924         schunk->free_size = dyn_size;
 925         schunk->contig_hint = schunk->free_size;
 926
 927         schunk->map[schunk->map_used++] = -static_size;
 928         if (schunk->free_size)
 929                 schunk->map[schunk->map_used++] = schunk->free_size;
 930
 931         /* allocate vm address */
 932         first_vm.flags = VM_ALLOC;
 933         first_vm.size = pcpu_chunk_size;
 934
 935         if (!base_addr)
 936                 vm_area_register_early(&first_vm, PAGE_SIZE);
 937         else {
 938                 /*
 939                  * Pages already mapped.  No need to remap into
 940                  * vmalloc area.  In this case the static chunk can't
 941                  * be mapped or unmapped by percpu and is marked
 942                  * immutable.
 943                  */
 944                 first_vm.addr = base_addr;
 945                 schunk->immutable = true;
 946         }
 947
 948         /* assign pages */
 949         nr_pages = -1;
 950         for_each_possible_cpu(cpu) {
 951                 for (i = 0; i < pcpu_unit_pages; i++) {
 952                         struct page *page = get_page_fn(cpu, i);
 953
 954                         if (!page)
 955                                 break;
 956                         *pcpu_chunk_pagep(schunk, cpu, i) = page;
 957                 }
 958
 959                 BUG_ON(i < PFN_UP(static_size));
 960
 961                 if (nr_pages < 0)
 962                         nr_pages = i;
 963                 else
 964                         BUG_ON(nr_pages != i);
 965         }
 966
 967         /* map them */
 968         if (populate_pte_fn) {
 969                 for_each_possible_cpu(cpu)
 970                         for (i = 0; i < nr_pages; i++)
 971                                 populate_pte_fn(pcpu_chunk_addr(schunk,
 972                                                                 cpu, i));
 973
 974                 err = pcpu_map(schunk, 0, nr_pages);
 975                 if (err)
 976                         panic("failed to setup static percpu area, err=%d\n",
 977                               err);
 978         }
 979
 980         /* link the first chunk in */
 981         pcpu_chunk_relocate(schunk, -1);
 982         pcpu_chunk_addr_insert(schunk);
 983
 984         /* we're done */
 985         pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
 986         return pcpu_unit_size;
 987 }