percpu: implement new dynamic percpu allocator

author Tejun Heo <tj@kernel.org>

Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)

committer Tejun Heo <tj@kernel.org>

Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)
author Tejun Heo <tj@kernel.org>
Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)
committer Tejun Heo <tj@kernel.org>
Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h

index d99e24ae18117aa0502453e8213cc6cccfe2c7cb..18080995ff3efda8a5dac47859029fff51363e23 100644 (file)
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,23 +76,37 @@
  
  #ifdef CONFIG_SMP
  
-struct percpu_data {
-       void *ptrs[1];
-};
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
  
-#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+extern void *pcpu_base_addr;
  
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+
+extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+                                      struct page **pages, size_t cpu_size);
  /*
   * Use this to get to a cpu's version of the per-cpu object
   * dynamically allocated. Non-atomic access to the current CPU's
   * version should probably be combined with get_cpu()/put_cpu().
   */
+#define per_cpu_ptr(ptr, cpu)  SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+struct percpu_data {
+       void *ptrs[1];
+};
+
+#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+
  #define per_cpu_ptr(ptr, cpu)                                          \
  ({                                                                     \
          struct percpu_data *__p = __percpu_disguise(ptr);              \
          (__typeof__(ptr))__p->ptrs[(cpu)];                             \
  })
  
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
  extern void *__alloc_percpu(size_t size, size_t align);
  extern void free_percpu(void *__pdata);
  
diff --git a/kernel/module.c b/kernel/module.c

index 52b3497b8748e9a6bdc14ecd135701521af4af07..1f0657ae555b15eadbb10658a02665bad204e828 100644 (file)
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
  #include <linux/tracepoint.h>
  #include <linux/ftrace.h>
  #include <linux/async.h>
+#include <linux/percpu.h>
  
  #if 0
  #define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
  }
  
  #ifdef CONFIG_SMP
+
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+                            const char *name)
+{
+       void *ptr;
+
+       if (align > PAGE_SIZE) {
+               printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+                      name, align, PAGE_SIZE);
+               align = PAGE_SIZE;
+       }
+
+       ptr = __alloc_percpu(size, align);
+       if (!ptr)
+               printk(KERN_WARNING
+                      "Could not allocate %lu bytes percpu data\n", size);
+       return ptr;
+}
+
+static void percpu_modfree(void *freeme)
+{
+       free_percpu(freeme);
+}
+
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
  /* Number of blocks used and allocated. */
  static unsigned int pcpu_num_used, pcpu_num_allocated;
  /* Size of each block.  -ve means used. */
@@ -499,6 +528,8 @@ static int percpu_modinit(void)
  }
  __initcall(percpu_modinit);
  
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
  static unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                  Elf_Shdr *sechdrs,
                                  const char *secstrings)
diff --git a/mm/Makefile b/mm/Makefile

index 72255be57f89160cf8ff12e702c013db310d6ef4..818569b68f4652e623f5d2bd4ed91fe68fcea2b4 100644 (file)
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
  obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
  obj-$(CONFIG_FS_XIP) += filemap_xip.o
  obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
  obj-$(CONFIG_SMP) += allocpercpu.o
+endif
  obj-$(CONFIG_QUICKLIST) += quicklist.o
  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/percpu.c b/mm/percpu.c

new file mode 100644 (file)

index 0000000..4617d97
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,890 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009          SUSE Linux Products GmbH
+ * Copyright (C) 2009          Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * chunk is consisted of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running).  Unit grows as
+ * necessary and all units grow or shrink in unison.  When a chunk is
+ * filled up, another chunk is allocated.  ie. in vmalloc area
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations many of them as
+ * small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be eqaul to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  A positive value in the map represents a free
+ * region and negative allocated.  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the followings.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_static() during percpu area initialization to
+ *   setup kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define PCPU_MIN_UNIT_PAGES_SHIFT      4       /* also max alloc size */
+#define PCPU_SLOT_BASE_SHIFT           5       /* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC             16      /* start a map with 16 ents */
+
+struct pcpu_chunk {
+       struct list_head        list;           /* linked to pcpu_slot lists */
+       struct rb_node          rb_node;        /* key is chunk->vm->addr */
+       int                     free_size;      /* free bytes in the chunk */
+       int                     contig_hint;    /* max contiguous size hint */
+       struct vm_struct        *vm;            /* mapped vmalloc region */
+       int                     map_used;       /* # of map entries used */
+       int                     map_alloc;      /* # of map entries allocated */
+       int                     *map;           /* allocation map */
+       struct page             *page[];        /* #cpus * UNIT_PAGES */
+};
+
+static int pcpu_unit_pages_shift;
+static int pcpu_unit_pages;
+static int pcpu_unit_shift;
+static int pcpu_unit_size;
+static int pcpu_chunk_size;
+static int pcpu_nr_slots;
+static size_t pcpu_chunk_struct_size;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+/* the size of kernel static area */
+static int pcpu_static_size;
+
+/*
+ * One mutex to rule them all.
+ *
+ * The following mutex is grabbed in the outermost public alloc/free
+ * interface functions and released only when the operation is
+ * complete.  As such, every function in this file other than the
+ * outermost functions are called under pcpu_mutex.
+ *
+ * It can easily be switched to use spinlock such that only the area
+ * allocation and page population commit are protected with it doing
+ * actual [de]allocation without holding any lock.  However, given
+ * what this allocator does, I think it's better to let them run
+ * sequentially.
+ */
+static DEFINE_MUTEX(pcpu_mutex);
+
+static struct list_head *pcpu_slot;            /* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT;        /* chunks by address */
+
+static int pcpu_size_to_slot(int size)
+{
+       int highbit = fls(size);
+       return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+       if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+               return 0;
+
+       return pcpu_size_to_slot(chunk->free_size);
+}
+
+static int pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+       return (cpu << pcpu_unit_pages_shift) + page_idx;
+}
+
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+                                     unsigned int cpu, int page_idx)
+{
+       return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+                                    unsigned int cpu, int page_idx)
+{
+       return (unsigned long)chunk->vm->addr +
+               (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+}
+
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+                                    int page_idx)
+{
+       return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+}
+
+/**
+ * pcpu_realloc - versatile realloc
+ * @p: the current pointer (can be NULL for new allocations)
+ * @size: the current size (can be 0 for new allocations)
+ * @new_size: the wanted new size (can be 0 for free)
+ *
+ * More robust realloc which can be used to allocate, resize or free a
+ * memory area of arbitrary size.  If the needed size goes over
+ * PAGE_SIZE, kernel VM is used.
+ *
+ * RETURNS:
+ * The new pointer on success, NULL on failure.
+ */
+static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+{
+       void *new;
+
+       if (new_size <= PAGE_SIZE)
+               new = kmalloc(new_size, GFP_KERNEL);
+       else
+               new = vmalloc(new_size);
+       if (new_size && !new)
+               return NULL;
+
+       memcpy(new, p, min(size, new_size));
+       if (new_size > size)
+               memset(new + size, 0, new_size - size);
+
+       if (size <= PAGE_SIZE)
+               kfree(p);
+       else
+               vfree(p);
+
+       return new;
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+       int nslot = pcpu_chunk_slot(chunk);
+
+       if (oslot != nslot) {
+               if (oslot < nslot)
+                       list_move(&chunk->list, &pcpu_slot[nslot]);
+               else
+                       list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+       }
+}
+
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+                                            struct rb_node **parentp)
+{
+       struct rb_node **p = &pcpu_addr_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct pcpu_chunk *chunk;
+
+       while (*p) {
+               parent = *p;
+               chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+
+               if (addr < chunk->vm->addr)
+                       p = &(*p)->rb_left;
+               else if (addr > chunk->vm->addr)
+                       p = &(*p)->rb_right;
+               else
+                       break;
+       }
+
+       if (parentp)
+               *parentp = parent;
+       return p;
+}
+
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr.  More specifically, it
+ * searchs for the chunk with the highest start address which isn't
+ * beyond @addr.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+       struct rb_node *n, *parent;
+       struct pcpu_chunk *chunk;
+
+       n = *pcpu_chunk_rb_search(addr, &parent);
+       if (!n) {
+               /* no exactly matching chunk, the parent is the closest */
+               n = parent;
+               BUG_ON(!n);
+       }
+       chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+
+       if (addr < chunk->vm->addr) {
+               /* the parent was the next one, look for the previous one */
+               n = rb_prev(n);
+               BUG_ON(!n);
+               chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+       }
+
+       return chunk;
+}
+
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+       struct rb_node **p, *parent;
+
+       p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+       BUG_ON(*p);
+       rb_link_node(&new->rb_node, parent, p);
+       rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size (can be 0)
+ * @tail: tail size (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks.  If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+{
+       int nr_extra = !!head + !!tail;
+       int target = chunk->map_used + nr_extra;
+
+       /* reallocation required? */
+       if (chunk->map_alloc < target) {
+               int new_alloc = chunk->map_alloc;
+               int *new;
+
+               while (new_alloc < target)
+                       new_alloc *= 2;
+
+               new = pcpu_realloc(chunk->map,
+                                  chunk->map_alloc * sizeof(new[0]),
+                                  new_alloc * sizeof(new[0]));
+               if (!new)
+                       return -ENOMEM;
+
+               chunk->map_alloc = new_alloc;
+               chunk->map = new;
+       }
+
+       /* insert a new subblock */
+       memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+               sizeof(chunk->map[0]) * (chunk->map_used - i));
+       chunk->map_used += nr_extra;
+
+       if (head) {
+               chunk->map[i + 1] = chunk->map[i] - head;
+               chunk->map[i++] = head;
+       }
+       if (tail) {
+               chunk->map[i++] -= tail;
+               chunk->map[i] = tail;
+       }
+       return 0;
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -errno on failure.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+       int oslot = pcpu_chunk_slot(chunk);
+       int max_contig = 0;
+       int i, off;
+
+       /*
+        * The static chunk initially doesn't have map attached
+        * because kmalloc wasn't available during init.  Give it one.
+        */
+       if (unlikely(!chunk->map)) {
+               chunk->map = pcpu_realloc(NULL, 0,
+                               PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+               if (!chunk->map)
+                       return -ENOMEM;
+
+               chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+               chunk->map[chunk->map_used++] = -pcpu_static_size;
+               if (chunk->free_size)
+                       chunk->map[chunk->map_used++] = chunk->free_size;
+       }
+
+       for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+               bool is_last = i + 1 == chunk->map_used;
+               int head, tail;
+
+               /* extra for alignment requirement */
+               head = ALIGN(off, align) - off;
+               BUG_ON(i == 0 && head != 0);
+
+               if (chunk->map[i] < 0)
+                       continue;
+               if (chunk->map[i] < head + size) {
+                       max_contig = max(chunk->map[i], max_contig);
+                       continue;
+               }
+
+               /*
+                * If head is small or the previous block is free,
+                * merge'em.  Note that 'small' is defined as smaller
+                * than sizeof(int), which is very small but isn't too
+                * uncommon for percpu allocations.
+                */
+               if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+                       if (chunk->map[i - 1] > 0)
+                               chunk->map[i - 1] += head;
+                       else {
+                               chunk->map[i - 1] -= head;
+                               chunk->free_size -= head;
+                       }
+                       chunk->map[i] -= head;
+                       off += head;
+                       head = 0;
+               }
+
+               /* if tail is small, just keep it around */
+               tail = chunk->map[i] - head - size;
+               if (tail < sizeof(int))
+                       tail = 0;
+
+               /* split if warranted */
+               if (head || tail) {
+                       if (pcpu_split_block(chunk, i, head, tail))
+                               return -ENOMEM;
+                       if (head) {
+                               i++;
+                               off += head;
+                               max_contig = max(chunk->map[i - 1], max_contig);
+                       }
+                       if (tail)
+                               max_contig = max(chunk->map[i + 1], max_contig);
+               }
+
+               /* update hint and mark allocated */
+               if (is_last)
+                       chunk->contig_hint = max_contig; /* fully scanned */
+               else
+                       chunk->contig_hint = max(chunk->contig_hint,
+                                                max_contig);
+
+               chunk->free_size -= chunk->map[i];
+               chunk->map[i] = -chunk->map[i];
+
+               pcpu_chunk_relocate(chunk, oslot);
+               return off;
+       }
+
+       chunk->contig_hint = max_contig;        /* fully scanned */
+       pcpu_chunk_relocate(chunk, oslot);
+
+       /*
+        * Tell the upper layer that this chunk has no area left.
+        * Note that this is not an error condition but a notification
+        * to upper layer that it needs to look at other chunks.
+        * -ENOSPC is chosen as it isn't used in memory subsystem and
+        * matches the meaning in a way.
+        */
+       return -ENOSPC;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk.  Note that this function
+ * only modifies the allocation map.  It doesn't depopulate or unmap
+ * the area.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+       int oslot = pcpu_chunk_slot(chunk);
+       int i, off;
+
+       for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+               if (off == freeme)
+                       break;
+       BUG_ON(off != freeme);
+       BUG_ON(chunk->map[i] > 0);
+
+       chunk->map[i] = -chunk->map[i];
+       chunk->free_size += chunk->map[i];
+
+       /* merge with previous? */
+       if (i > 0 && chunk->map[i - 1] >= 0) {
+               chunk->map[i - 1] += chunk->map[i];
+               chunk->map_used--;
+               memmove(&chunk->map[i], &chunk->map[i + 1],
+                       (chunk->map_used - i) * sizeof(chunk->map[0]));
+               i--;
+       }
+       /* merge with next? */
+       if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+               chunk->map[i] += chunk->map[i + 1];
+               chunk->map_used--;
+               memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+                       (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+       }
+
+       chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+       pcpu_chunk_relocate(chunk, oslot);
+}
+
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * If @flush is true, vcache is flushed before unmapping and tlb
+ * after.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+                      bool flush)
+{
+       unsigned int last = num_possible_cpus() - 1;
+       unsigned int cpu;
+
+       /*
+        * Each flushing trial can be very expensive, issue flush on
+        * the whole region at once rather than doing it for each cpu.
+        * This could be an overkill but is more scalable.
+        */
+       if (flush)
+               flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+                                  pcpu_chunk_addr(chunk, last, page_end));
+
+       for_each_possible_cpu(cpu)
+               unmap_kernel_range_noflush(
+                               pcpu_chunk_addr(chunk, cpu, page_start),
+                               (page_end - page_start) << PAGE_SHIFT);
+
+       /* ditto as flush_cache_vunmap() */
+       if (flush)
+               flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+                                      pcpu_chunk_addr(chunk, last, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk.  If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off,
+                                 size_t size, bool flush)
+{
+       int page_start = PFN_DOWN(off);
+       int page_end = PFN_UP(off + size);
+       int unmap_start = -1;
+       int uninitialized_var(unmap_end);
+       unsigned int cpu;
+       int i;
+
+       for (i = page_start; i < page_end; i++) {
+               for_each_possible_cpu(cpu) {
+                       struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+                       if (!*pagep)
+                               continue;
+
+                       __free_page(*pagep);
+
+                       /*
+                        * If it's partial depopulation, it might get
+                        * populated or depopulated again.  Mark the
+                        * page gone.
+                        */
+                       *pagep = NULL;
+
+                       unmap_start = unmap_start < 0 ? i : unmap_start;
+                       unmap_end = i + 1;
+               }
+       }
+
+       if (unmap_start >= 0)
+               pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.
+ * vcache is flushed afterwards.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+       unsigned int last = num_possible_cpus() - 1;
+       unsigned int cpu;
+       int err;
+
+       for_each_possible_cpu(cpu) {
+               err = map_kernel_range_noflush(
+                               pcpu_chunk_addr(chunk, cpu, page_start),
+                               (page_end - page_start) << PAGE_SHIFT,
+                               PAGE_KERNEL,
+                               pcpu_chunk_pagep(chunk, cpu, page_start));
+               if (err < 0)
+                       return err;
+       }
+
+       /* flush at once, please read comments in pcpu_unmap() */
+       flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+                        pcpu_chunk_addr(chunk, last, page_end));
+       return 0;
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.  The area is cleared on return.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+       const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+       int page_start = PFN_DOWN(off);
+       int page_end = PFN_UP(off + size);
+       int map_start = -1;
+       int map_end;
+       unsigned int cpu;
+       int i;
+
+       for (i = page_start; i < page_end; i++) {
+               if (pcpu_chunk_page_occupied(chunk, i)) {
+                       if (map_start >= 0) {
+                               if (pcpu_map(chunk, map_start, map_end))
+                                       goto err;
+                               map_start = -1;
+                       }
+                       continue;
+               }
+
+               map_start = map_start < 0 ? i : map_start;
+               map_end = i + 1;
+
+               for_each_possible_cpu(cpu) {
+                       struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+                       *pagep = alloc_pages_node(cpu_to_node(cpu),
+                                                 alloc_mask, 0);
+                       if (!*pagep)
+                               goto err;
+               }
+       }
+
+       if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+               goto err;
+
+       for_each_possible_cpu(cpu)
+               memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0,
+                      size);
+
+       return 0;
+err:
+       /* likely under heavy memory pressure, give memory back */
+       pcpu_depopulate_chunk(chunk, off, size, true);
+       return -ENOMEM;
+}
+
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+       if (!chunk)
+               return;
+       if (chunk->vm)
+               free_vm_area(chunk->vm);
+       pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+       kfree(chunk);
+}
+
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+       struct pcpu_chunk *chunk;
+
+       chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+       if (!chunk)
+               return NULL;
+
+       chunk->map = pcpu_realloc(NULL, 0,
+                                 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+       chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+       chunk->map[chunk->map_used++] = pcpu_unit_size;
+
+       chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+       if (!chunk->vm) {
+               free_pcpu_chunk(chunk);
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&chunk->list);
+       chunk->free_size = pcpu_unit_size;
+       chunk->contig_hint = pcpu_unit_size;
+
+       return chunk;
+}
+
+/**
+ * __alloc_percpu - allocate percpu area
+ * @size: size of area to allocate
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+       void *ptr = NULL;
+       struct pcpu_chunk *chunk;
+       int slot, off;
+
+       if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT ||
+                    align > PAGE_SIZE)) {
+               WARN(true, "illegal size (%zu) or align (%zu) for "
+                    "percpu allocation\n", size, align);
+               return NULL;
+       }
+
+       mutex_lock(&pcpu_mutex);
+
+       /* allocate area */
+       for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+               list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+                       if (size > chunk->contig_hint)
+                               continue;
+                       off = pcpu_alloc_area(chunk, size, align);
+                       if (off >= 0)
+                               goto area_found;
+                       if (off != -ENOSPC)
+                               goto out_unlock;
+               }
+       }
+
+       /* hmmm... no space left, create a new chunk */
+       chunk = alloc_pcpu_chunk();
+       if (!chunk)
+               goto out_unlock;
+       pcpu_chunk_relocate(chunk, -1);
+       pcpu_chunk_addr_insert(chunk);
+
+       off = pcpu_alloc_area(chunk, size, align);
+       if (off < 0)
+               goto out_unlock;
+
+area_found:
+       /* populate, map and clear the area */
+       if (pcpu_populate_chunk(chunk, off, size)) {
+               pcpu_free_area(chunk, off);
+               goto out_unlock;
+       }
+
+       ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+out_unlock:
+       mutex_unlock(&pcpu_mutex);
+       return ptr;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+{
+       pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+       list_del(&chunk->list);
+       rb_erase(&chunk->rb_node, &pcpu_addr_root);
+       free_pcpu_chunk(chunk);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.  Might sleep.
+ */
+void free_percpu(void *ptr)
+{
+       void *addr = __pcpu_ptr_to_addr(ptr);
+       struct pcpu_chunk *chunk;
+       int off;
+
+       if (!ptr)
+               return;
+
+       mutex_lock(&pcpu_mutex);
+
+       chunk = pcpu_chunk_addr_search(addr);
+       off = addr - chunk->vm->addr;
+
+       pcpu_free_area(chunk, off);
+
+       /* the chunk became fully free, kill one if there are other free ones */
+       if (chunk->free_size == pcpu_unit_size) {
+               struct pcpu_chunk *pos;
+
+               list_for_each_entry(pos,
+                                   &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+                       if (pos != chunk) {
+                               pcpu_kill_chunk(pos);
+                               break;
+                       }
+       }
+
+       mutex_unlock(&pcpu_mutex);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * pcpu_setup_static - initialize kernel static percpu area
+ * @populate_pte_fn: callback to allocate pagetable
+ * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
+ *
+ * Initialize kernel static percpu area.  The caller should allocate
+ * all the necessary pages and pass them in @pages.
+ * @populate_pte_fn() is called on each page to be used for percpu
+ * mapping and is responsible for making sure all the necessary page
+ * tables for the page is allocated.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access.
+ */
+size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+                               struct page **pages, size_t cpu_size)
+{
+       static struct vm_struct static_vm;
+       struct pcpu_chunk *static_chunk;
+       int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE);
+       unsigned int cpu;
+       int err, i;
+
+       pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT,
+                                     order_base_2(cpu_size) - PAGE_SHIFT);
+
+       pcpu_static_size = cpu_size;
+       pcpu_unit_pages = 1 << pcpu_unit_pages_shift;
+       pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift;
+       pcpu_unit_size = 1 << pcpu_unit_shift;
+       pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+       pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
+       pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+               + (1 << pcpu_unit_pages_shift) * sizeof(struct page *);
+
+       /* allocate chunk slots */
+       pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+       for (i = 0; i < pcpu_nr_slots; i++)
+               INIT_LIST_HEAD(&pcpu_slot[i]);
+
+       /* init and register vm area */
+       static_vm.flags = VM_ALLOC;
+       static_vm.size = pcpu_chunk_size;
+       vm_area_register_early(&static_vm);
+
+       /* init static_chunk */
+       static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
+       INIT_LIST_HEAD(&static_chunk->list);
+       static_chunk->vm = &static_vm;
+       static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+       static_chunk->contig_hint = static_chunk->free_size;
+
+       /* assign pages and map them */
+       for_each_possible_cpu(cpu) {
+               for (i = 0; i < nr_cpu_pages; i++) {
+                       *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++;
+                       populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i));
+               }
+       }
+
+       err = pcpu_map(static_chunk, 0, nr_cpu_pages);
+       if (err)
+               panic("failed to setup static percpu area, err=%d\n", err);
+
+       /* link static_chunk in */
+       pcpu_chunk_relocate(static_chunk, -1);
+       pcpu_chunk_addr_insert(static_chunk);
+
+       /* we're done */
+       pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+       return pcpu_unit_size;
+}
author	Tejun Heo <tj@kernel.org>
	Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)
committer	Tejun Heo <tj@kernel.org>
	Fri, 20 Feb 2009 07:29:08 +0000 (16:29 +0900)
include/linux/percpu.h		patch \| blob \| history
kernel/module.c		patch \| blob \| history
mm/Makefile		patch \| blob \| history
mm/percpu.c	[new file with mode: 0644]	patch \| blob