]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/commitdiff
Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc...
author Ingo Molnar <mingo@elte.hu>
Tue, 24 Feb 2009 20:52:45 +0000 (21:52 +0100)
committer Ingo Molnar <mingo@elte.hu>
Tue, 24 Feb 2009 20:52:45 +0000 (21:52 +0100)
Conflicts:
arch/x86/include/asm/pgtable.h

25 files changed:
arch/alpha/mm/init.c
arch/avr32/Kconfig
arch/x86/Kconfig
arch/x86/include/asm/mmzone_32.h
arch/x86/include/asm/percpu.h
arch/x86/include/asm/pgtable.h
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/setup_percpu.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
block/blktrace.c
drivers/acpi/processor_perflib.c
include/linux/bootmem.h
include/linux/percpu.h
include/linux/vmalloc.h
kernel/module.c
kernel/sched.c
kernel/stop_machine.c
mm/Makefile
mm/allocpercpu.c
mm/bootmem.c
mm/percpu.c [new file with mode: 0644]
mm/vmalloc.c
net/ipv4/af_inet.c

index 5d7a16eab312619f94cc48515715ae81b298b296..91eddd8505df3d30c0e137f9cf5d5c2b8f6d53e8 100644 (file)
@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
 
        if (alpha_using_srm) {
                static struct vm_struct console_remap_vm;
-               unsigned long vaddr = VMALLOC_START;
+               unsigned long nr_pages = 0;
+               unsigned long vaddr;
                unsigned long i, j;
 
+               /* calculate needed size: sum the page counts of all CRB maps */
+               for (i = 0; i < crb->map_entries; ++i)
+                       nr_pages += crb->map[i].count;
+
+               /* register the vm area */
+               console_remap_vm.flags = VM_ALLOC;
+               console_remap_vm.size = nr_pages << PAGE_SHIFT;
+               vm_area_register_early(&console_remap_vm, PAGE_SIZE);
+
+               /*
+                * Map starting at the area's address; .addr is expected to
+                * have been filled in by vm_area_register_early() above.
+                */
+               vaddr = (unsigned long)console_remap_vm.addr;
+
                /* Set up the third level PTEs and update the virtual
                   addresses of the CRB entries.  */
                for (i = 0; i < crb->map_entries; ++i) {
@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
                                vaddr += PAGE_SIZE;
                        }
                }
-
-               /* Let vmalloc know that we've allocated some space.  */
-               console_remap_vm.flags = VM_ALLOC;
-               console_remap_vm.addr = (void *) VMALLOC_START;
-               console_remap_vm.size = vaddr - VMALLOC_START;
-               vmlist = &console_remap_vm;
        }
 
        callback_init_done = 1;
index b189680d18b0493d7f9d12f265442563bec2f11d..05fe3053dcaec7f725059d78e7a6988788c557ff 100644 (file)
@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
 config QUICKLIST
        def_bool y
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
        def_bool n
 
 config ARCH_HAVE_MEMORY_PRESENT
index 5e2919c0ff922a86c5c965c38a6628ba15f49edf..8015641478bdacfb79d5c6ec54341b1ac35b0914 100644 (file)
@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
        def_bool y
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+       def_bool y
+
 config HAVE_CPUMASK_OF_CPU_MAP
        def_bool X86_64_SMP
 
@@ -1122,7 +1125,7 @@ config NODES_SHIFT
          Specify the maximum number of NUMA Nodes available on the target
          system.  Increases memory reserved to accomodate various tables.
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
        def_bool y
        depends on X86_32 && NUMA
 
index 105fb90a063527d761ed47350d553ecb755bd3cb..eeacf67de49e45938bf8a839f1539fc25d9ca094 100644 (file)
@@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
-       reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-       __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-       __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-                               __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-       __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-       __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-       __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-                               __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-       __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x)                                   \
-({                                                                     \
-       struct pglist_data  __maybe_unused                      \
-                               *__alloc_bootmem_node__pgdat = (pgdat); \
-       __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,        \
-                                               __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_pages_node(pgdat, x)                             \
-({                                                                     \
-       struct pglist_data  __maybe_unused                      \
-                               *__alloc_bootmem_node__pgdat = (pgdat); \
-       __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,              \
-                                               __pa(MAX_DMA_ADDRESS)); \
-})
-#define alloc_bootmem_low_pages_node(pgdat, x)                         \
+/* always use node 0 for bootmem on this numa platform */
+#define alloc_bootmem_core(__bdata, size, align, goal, limit)          \
 ({                                                                     \
-       struct pglist_data  __maybe_unused                      \
-                               *__alloc_bootmem_node__pgdat = (pgdat); \
-       __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);          \
+       bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata);  \
+       __alloc_bootmem_core(NODE_DATA(0)->bdata,                       \
+                            (size), (align), (goal), (limit));         \
 })
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
index aee103b26d01778c987e7f828876b876473a3bfd..8f1d2fbec1d4ecfebdd3053f79f1157b80c799e9 100644 (file)
 #else /* ...!ASSEMBLY */
 
 #include <linux/stringify.h>
+#include <asm/sections.h>
+
+/*
+ * Convert between an address and a percpu pointer.  A percpu pointer
+ * is the address shifted by (__per_cpu_start - pcpu_base_addr); the
+ * two macros are exact inverses of each other.  Each expansion is
+ * fully parenthesized so the leading cast cannot interact with
+ * operators surrounding the macro invocation.
+ */
+#define __addr_to_pcpu_ptr(addr)                                       \
+       ((void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
+                 + (unsigned long)__per_cpu_start))
+#define __pcpu_ptr_to_addr(ptr)                                                \
+       ((void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr  \
+                 - (unsigned long)__per_cpu_start))
 
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)                "%%"__stringify(__percpu_seg)":%P" #x
index 1c097a3a6669345e8dcbaa3f9a166c1a34e4740e..d0812e155f1d60da04c614bee8b97e2e33869876 100644 (file)
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
        return 1;
 }
 
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 #endif /* __ASSEMBLY__ */
 
 #ifdef CONFIG_X86_32
index 4b1c319d30c368592e990663fb1c8efd5abea963..22590cf688aedd45f9165c5ee6d9ea571f20a61c 100644 (file)
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
        if (!data)
                return -ENOMEM;
 
-       data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+       data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
        per_cpu(drv_data, cpu) = data;
 
        if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
index 9dc6b2b24275cdc77b88fb8da2c3d7daebbf45b2..3b09634a51535768cb3f99781dfa58c6a16b6ba4 100644 (file)
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
        struct thread_info      tinfo;
        u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
 
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
        u32 *isp, arg1, arg2;
 
        curctx = (union irq_ctx *) current_thread_info();
-       irqctx = hardirq_ctx[smp_processor_id()];
+       irqctx = __get_cpu_var(hardirq_ctx);
 
        /*
         * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
        union irq_ctx *irqctx;
 
-       if (hardirq_ctx[cpu])
+       if (per_cpu(hardirq_ctx, cpu))
                return;
 
-       irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+       irqctx = &per_cpu(hardirq_stack, cpu);
        irqctx->tinfo.task              = NULL;
        irqctx->tinfo.exec_domain       = NULL;
        irqctx->tinfo.cpu               = cpu;
        irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
 
-       hardirq_ctx[cpu] = irqctx;
+       per_cpu(hardirq_ctx, cpu) = irqctx;
 
-       irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+       irqctx = &per_cpu(softirq_stack, cpu);
        irqctx->tinfo.task              = NULL;
        irqctx->tinfo.exec_domain       = NULL;
        irqctx->tinfo.cpu               = cpu;
        irqctx->tinfo.preempt_count     = 0;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
 
-       softirq_ctx[cpu] = irqctx;
+       per_cpu(softirq_ctx, cpu) = irqctx;
 
        printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-              cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+              cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
 void irq_ctx_exit(int cpu)
 {
-       hardirq_ctx[cpu] = NULL;
+       per_cpu(hardirq_ctx, cpu) = NULL;
 }
 
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 
        if (local_softirq_pending()) {
                curctx = current_thread_info();
-               irqctx = softirq_ctx[smp_processor_id()];
+               irqctx = __get_cpu_var(softirq_ctx);
                irqctx->tinfo.task = curctx->task;
                irqctx->tinfo.previous_esp = current_stack_pointer;
 
index d992e6cff73023f43a4c2a3d8ed7d95a7d28f6c1..2d946a8f78b9192d7d2cffc10d05719367412f23 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/**
+ * pcpu_need_numa - tell whether percpu allocation must be NUMA aware
+ *
+ * Walks all possible cpus and compares the node data of each cpu's
+ * early node with that of the previously visited cpu.  Without
+ * CONFIG_NEED_MULTIPLE_NODES the answer is always false.
+ *
+ * RETURNS:
+ * true if a cpu on an online node has node data different from the
+ * previously visited cpu's; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+       pg_data_t *prev = NULL;
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu) {
+               int nid = early_cpu_to_node(cpu);
+               pg_data_t *cur = NODE_DATA(nid);
+
+               /* a second, distinct node showed up */
+               if (prev && cur && prev != cur && node_online(nid))
+                       return true;
+
+               prev = cur;
+       }
+#endif
+       return false;
+}
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+                                       unsigned long align)
+{
+       const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+       int node = early_cpu_to_node(cpu);
+       void *ptr;
+
+       if (!node_online(node) || !NODE_DATA(node)) {
+               ptr = __alloc_bootmem_nopanic(size, align, goal);
+               pr_info("cpu %d has no node %d or node-local memory\n",
+                       cpu, node);
+               pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+                        cpu, size, __pa(ptr));
+       } else {
+               ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+                                                  size, align, goal);
+               pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+                        "%016lx\n", cpu, size, node, __pa(ptr));
+       }
+       return ptr;
+#else
+       return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+/*
+ * Page lookup callback for the remap allocator: return the page that
+ * backs page number @pageno of @cpu's area, or NULL when @pageno lies
+ * at or beyond pcpur_size.
+ */
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+       size_t byte_off = (size_t)pageno << PAGE_SHIFT;
+
+       if (byte_off < pcpur_size)
+               return virt_to_page(pcpur_ptrs[cpu] + byte_off);
+
+       return NULL;
+}
+
+/**
+ * setup_pcpu_remap - set up the first percpu chunk via large-page remap
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Allocates one PMD_SIZE bootmem area per possible cpu, keeps the
+ * first pcpur_size bytes of each and returns the rest to bootmem,
+ * copies the static percpu data in, maps every area with a large-page
+ * PMD entry into a vm area obtained from vm_area_register_early() and
+ * hands the result to pcpu_setup_first_chunk().
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk() when the allocator
+ * is used, -EINVAL if it is not applicable, -ENOMEM if a bootmem
+ * allocation fails.
+ */
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+       static struct vm_struct vm;
+       size_t ptrs_size;
+       unsigned int cpu;
+       ssize_t ret;
+
+       /*
+        * If large page isn't supported, there's no benefit in doing
+        * this.  Also, on non-NUMA, embedding is better, so only go
+        * on when NUMA actually has to be considered.
+        */
+       if (!cpu_has_pse || !pcpu_need_numa())
+               return -EINVAL;
+
+       /*
+        * Currently supports only single page.  Supporting multiple
+        * pages won't be too difficult if it ever becomes necessary.
+        */
+       pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+       if (pcpur_size > PMD_SIZE) {
+               pr_warning("PERCPU: static data is larger than large page, "
+                          "can't use large page\n");
+               return -EINVAL;
+       }
+
+       /*
+        * allocate pointer array and alloc large pages
+        *
+        * NOTE(review): the array is sized by num_possible_cpus() but
+        * indexed by cpu id - confirm this holds for sparse cpu masks.
+        */
+       ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+       pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+       for_each_possible_cpu(cpu) {
+               pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+               if (!pcpur_ptrs[cpu])
+                       goto enomem;
+
+               /*
+                * Only use pcpur_size bytes and give back the rest.
+                *
+                * Ingo: The 2MB up-rounding bootmem is needed to make
+                * sure the partial 2MB page is still fully RAM - it's
+                * not well-specified to have a PAT-incompatible area
+                * (unmapped RAM, device memory, etc.) in that hole.
+                */
+               free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+                            PMD_SIZE - pcpur_size);
+
+               memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+       }
+
+       /* allocate address and map */
+       vm.flags = VM_ALLOC;
+       vm.size = num_possible_cpus() * PMD_SIZE;
+       vm_area_register_early(&vm, PMD_SIZE);
+
+       for_each_possible_cpu(cpu) {
+               pmd_t *pmd;
+
+               pmd = populate_extra_pmd((unsigned long)vm.addr
+                                        + cpu * PMD_SIZE);
+               set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+                                    PAGE_KERNEL_LARGE));
+       }
+
+       /* we're ready, commit */
+       pr_info("PERCPU: Remapped at %p with large pages, static data "
+               "%zu bytes\n", vm.addr, static_size);
+
+       ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+                                    pcpur_size - static_size, vm.addr, NULL);
+       goto out_free_ar;
+
+enomem:
+       /*
+        * Every area allocated so far already had its tail beyond
+        * pcpur_size given back in the loop above; release only the
+        * part still held so that no range is freed twice.
+        */
+       for_each_possible_cpu(cpu)
+               if (pcpur_ptrs[cpu])
+                       free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size);
+       ret = -ENOMEM;
+out_free_ar:
+       free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+       return ret;
+}
+#else
+/*
+ * Stub used when CONFIG_NEED_MULTIPLE_NODES is not set: the remap
+ * allocator is not built, so always decline with -EINVAL.
+ */
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+       return -EINVAL;
+}
+#endif
+
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
+ * bootmem allocator and used as-is without being mapped into vmalloc
+ * area.  This enables the first chunk to piggy back on the linear
+ * physical PMD mapping and doesn't add any additional pressure to
+ * TLB.
+ */
+static void *pcpue_ptr __initdata;
+static size_t pcpue_unit_size __initdata;
+
+/*
+ * Page lookup callback for the embedding allocator: units are laid
+ * out back to back from pcpue_ptr, pcpue_unit_size bytes per cpu.
+ */
+static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+{
+       void *unit = pcpue_ptr + cpu * pcpue_unit_size;
+
+       return virt_to_page(unit + ((size_t)pageno << PAGE_SHIFT));
+}
+
+/**
+ * setup_pcpu_embed - set up the first percpu chunk embedded in bootmem
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Sizes one unit per possible cpu, allocates all units as a single
+ * bootmem area, seeds every unit with the static percpu data and
+ * passes the area to pcpu_setup_first_chunk().
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk(), -EINVAL if PSE is
+ * missing or NUMA has to be considered, -ENOMEM if the bootmem
+ * allocation fails.
+ */
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+       unsigned int cpu;
+
+       /* without large page support there's no benefit in doing this */
+       if (!cpu_has_pse)
+               return -EINVAL;
+
+       /* embedding allocation doesn't play well with NUMA */
+       if (pcpu_need_numa())
+               return -EINVAL;
+
+       /* unit size: static area plus dynamic reserve, at least the minimum */
+       pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
+       pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
+
+       /* a single allocation holds the units of all possible cpus */
+       pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
+                                      PAGE_SIZE);
+       if (pcpue_ptr == NULL)
+               return -ENOMEM;
+
+       /* seed each cpu's unit with the static percpu data */
+       for_each_possible_cpu(cpu) {
+               void *unit = pcpue_ptr + cpu * pcpue_unit_size;
+
+               memcpy(unit, __per_cpu_load, static_size);
+       }
+
+       /* we're ready, commit */
+       pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
+               pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+
+       return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+                                     pcpue_unit_size,
+                                     pcpue_unit_size - static_size, pcpue_ptr,
+                                     NULL);
+}
+
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+/*
+ * Page lookup callback for the 4k allocator: pcpu4k_pages stores
+ * pcpu4k_nr_static_pages entries per cpu; page numbers at or beyond
+ * that count have no page and yield NULL.
+ */
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+       if (pageno >= pcpu4k_nr_static_pages)
+               return NULL;
+
+       return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+}
+
+/*
+ * PTE population callback handed to pcpu_setup_first_chunk() by the
+ * 4k allocator; forwards to populate_extra_pte() and discards the
+ * returned pointer.
+ */
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+       populate_extra_pte(addr);
+}
+
+/**
+ * setup_pcpu_4k - set up the first percpu chunk from individual 4k pages
+ * @static_size: size of the static percpu area in bytes
+ *
+ * Allocates PFN_UP(@static_size) pages for every possible cpu, fills
+ * them from __per_cpu_load, records them in pcpu4k_pages and passes
+ * pcpu4k_get_page()/pcpu4k_populate_pte() to pcpu_setup_first_chunk().
+ * The page pointer array is released again before returning.
+ *
+ * RETURNS:
+ * The value returned by pcpu_setup_first_chunk(), or -ENOMEM if a
+ * page allocation fails.
+ */
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+       size_t pages_size;
+       unsigned int cpu;
+       int i, j;
+       ssize_t ret;
+
+       pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+       /* unaligned allocations can't be freed, round up to page size */
+       pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+                              * sizeof(pcpu4k_pages[0]));
+       pcpu4k_pages = alloc_bootmem(pages_size);
+
+       /* allocate and copy */
+       j = 0;  /* number of pages stored in pcpu4k_pages so far */
+       for_each_possible_cpu(cpu)
+               for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+                       void *ptr;
+
+                       ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+                       if (!ptr)
+                               goto enomem;
+
+                       /*
+                        * NOTE(review): a whole page is copied even for the
+                        * last page, which may read past static_size bytes
+                        * of __per_cpu_load - confirm that is acceptable.
+                        */
+                       memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+                       pcpu4k_pages[j++] = virt_to_page(ptr);
+               }
+
+       /* we're ready, commit */
+       pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+               pcpu4k_nr_static_pages, static_size);
+
+       ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
+                                    pcpu4k_populate_pte);
+       goto out_free_ar;
+
+enomem:
+       /* unwind: give back every page recorded before the failure */
+       while (--j >= 0)
+               free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+       ret = -ENOMEM;
+out_free_ar:
+       /* the pointer array is released on both success and failure */
+       free_bootmem(__pa(pcpu4k_pages), pages_size);
+       return ret;
+}
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-       ssize_t size;
-       char *ptr;
-       int cpu;
-
-       /* Copy section for each CPU (we discard the original) */
-       size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+       size_t static_size = __per_cpu_end - __per_cpu_start;
+       unsigned int cpu;
+       unsigned long delta;
+       size_t pcpu_unit_size;
+       ssize_t ret;
 
        pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-       pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+       /*
+        * Allocate percpu area.  If PSE is supported, try to make use
+        * of large page mappings.  Please read comments on top of
+        * each allocator for details.
+        */
+       ret = setup_pcpu_remap(static_size);
+       if (ret < 0)
+               ret = setup_pcpu_embed(static_size);
+       if (ret < 0)
+               ret = setup_pcpu_4k(static_size);
+       if (ret < 0)
+               panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+                     static_size, ret);
 
-       for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-               ptr = alloc_bootmem_pages(size);
-#else
-               int node = early_cpu_to_node(cpu);
-               if (!node_online(node) || !NODE_DATA(node)) {
-                       ptr = alloc_bootmem_pages(size);
-                       pr_info("cpu %d has no node %d or node-local memory\n",
-                               cpu, node);
-                       pr_debug("per cpu data for cpu%d at %016lx\n",
-                                cpu, __pa(ptr));
-               } else {
-                       ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-                       pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-                               cpu, node, __pa(ptr));
-               }
-#endif
+       pcpu_unit_size = ret;
 
-               memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
-               per_cpu_offset(cpu) = ptr - __per_cpu_start;
+       /* alrighty, percpu areas up and running */
+       delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+       for_each_possible_cpu(cpu) {
+               per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
                per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                per_cpu(cpu_number, cpu) = cpu;
                setup_percpu_segment(cpu);
index 06708ee94aa407d1e773afee5b1e1abc102b6635..ef0bb941cdf518a8ea4fc1f74a5cd81423e29b64 100644 (file)
@@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
        return pte_offset_kernel(pmd, 0);
 }
 
+/*
+ * Return the PMD entry for @vaddr below swapper_pg_dir, going through
+ * one_md_table_init() for the covering PGD entry.
+ */
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+       return one_md_table_init(swapper_pg_dir + pgd_index(vaddr))
+               + pmd_index(vaddr);
+}
+
+/*
+ * Return the PTE entry for @vaddr, going through populate_extra_pmd()
+ * and one_page_table_init() for the levels above it.
+ */
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+       pmd_t *pmd = populate_extra_pmd(vaddr);
+
+       return one_page_table_init(pmd) + pte_index(vaddr);
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
                                           unsigned long vaddr, pte_t *lastpte)
 {
index e6d36b490250bed6952f2d93de3d1f337c3cf7f4..7d4e76da3368a87e78944b8432a1f3bfa45d2061 100644 (file)
@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
        return ptr;
 }
 
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
 {
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
+       if (pgd_none(*pgd)) {
+               pud_t *pud = (pud_t *)spp_getpage();
+               pgd_populate(&init_mm, pgd, pud);
+               if (pud != pud_offset(pgd, 0))
+                       printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+                              pud, pud_offset(pgd, 0));
+       }
+       return pud_offset(pgd, vaddr);
+}
 
-       pud = pud_page + pud_index(vaddr);
+static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
+{
        if (pud_none(*pud)) {
-               pmd = (pmd_t *) spp_getpage();
+               pmd_t *pmd = (pmd_t *) spp_getpage();
                pud_populate(&init_mm, pud, pmd);
-               if (pmd != pmd_offset(pud, 0)) {
+               if (pmd != pmd_offset(pud, 0))
                        printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-                               pmd, pmd_offset(pud, 0));
-                       return;
-               }
+                              pmd, pmd_offset(pud, 0));
        }
-       pmd = pmd_offset(pud, vaddr);
+       return pmd_offset(pud, vaddr);
+}
+
+static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
        if (pmd_none(*pmd)) {
-               pte = (pte_t *) spp_getpage();
+               pte_t *pte = (pte_t *) spp_getpage();
                pmd_populate_kernel(&init_mm, pmd, pte);
-               if (pte != pte_offset_kernel(pmd, 0)) {
+               if (pte != pte_offset_kernel(pmd, 0))
                        printk(KERN_ERR "PAGETABLE BUG #02!\n");
-                       return;
-               }
        }
+       return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pud = pud_page + pud_index(vaddr);
+       pmd = fill_pmd(pud, vaddr);
+       pte = fill_pte(pmd, vaddr);
 
-       pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, new_pte);
 
        /*
@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
        __flush_tlb_one(vaddr);
 }
 
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
        pgd_t *pgd;
        pud_t *pud_page;
@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
        set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
+/*
+ * Return the PMD entry for kernel address @vaddr, passing through
+ * fill_pud() and fill_pmd() for the intermediate levels.
+ */
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+       pud_t *pud = fill_pud(pgd_offset_k(vaddr), vaddr);
+
+       return fill_pmd(pud, vaddr);
+}
+
+/*
+ * Return the PTE entry for kernel address @vaddr, passing through
+ * populate_extra_pmd() and fill_pte() for the levels above it.
+ */
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+       return fill_pte(populate_extra_pmd(vaddr), vaddr);
+}
+
 /*
  * Create large page table mappings for a range of physical addresses.
  */
index 7cf9d1ff45a015e0d8fe7be2546c6fb7f95f590e..028120a0965aa9a723196d1f245012151459ae3a 100644 (file)
@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (!bt->sequence)
                goto err;
 
-       bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+       bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
        if (!bt->msg_data)
                goto err;
 
index 9cc769b587ff7bbe061ae0c5ab5cc66d67e62ae5..68fd3d2927997efe09ced4d0df8120afa8adc7ce 100644 (file)
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
                        continue;
                }
 
-               if (!performance || !percpu_ptr(performance, i)) {
+               if (!performance || !per_cpu_ptr(performance, i)) {
                        retval = -EINVAL;
                        continue;
                }
 
-               pr->performance = percpu_ptr(performance, i);
+               pr->performance = per_cpu_ptr(performance, i);
                cpumask_set_cpu(i, pr->performance->shared_cpu_map);
                if (acpi_processor_get_psd(pr)) {
                        retval = -EINVAL;
index 95837bfb52561dded74d03982bb252bdae4776f5..455d83219fae5f3e40bdf83575da34ea9c864da7 100644 (file)
@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
 #define BOOTMEM_DEFAULT                0
 #define BOOTMEM_EXCLUSIVE      (1<<0)
 
+extern int reserve_bootmem(unsigned long addr,
+                          unsigned long size,
+                          int flags);
 extern int reserve_bootmem_node(pg_data_t *pgdat,
-                                unsigned long physaddr,
-                                unsigned long size,
-                                int flags);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
-#endif
+                               unsigned long physaddr,
+                               unsigned long size,
+                               int flags);
 
-extern void *__alloc_bootmem_nopanic(unsigned long size,
+extern void *__alloc_bootmem(unsigned long size,
                             unsigned long align,
                             unsigned long goal);
-extern void *__alloc_bootmem(unsigned long size,
+extern void *__alloc_bootmem_nopanic(unsigned long size,
                                     unsigned long align,
                                     unsigned long goal);
-extern void *__alloc_bootmem_low(unsigned long size,
-                                unsigned long align,
-                                unsigned long goal);
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
                                  unsigned long size,
                                  unsigned long align,
                                  unsigned long goal);
+extern void *__alloc_bootmem_low(unsigned long size,
+                                unsigned long align,
+                                unsigned long goal);
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
                                      unsigned long size,
                                      unsigned long align,
                                      unsigned long goal);
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
+
 #define alloc_bootmem(x) \
        __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_nopanic(x) \
        __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-       __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
        __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_nopanic(x) \
        __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-       __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_pages_node(pgdat, x) \
        __alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
+       __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+
+#define alloc_bootmem_low(x) \
+       __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
+#define alloc_bootmem_low_pages(x) \
+       __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_low_pages_node(pgdat, x) \
        __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
                                   int flags);
index 3577ffd90d45f6fc284571834614a199f4076dc0..910beb0abea23769064a675c9013037b42c6a4fb 100644 (file)
 
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+/* minimum unit size, also is the maximum supported allocation size */
+#define PCPU_MIN_UNIT_SIZE             (16UL << PAGE_SHIFT)
+
+/*
+ * PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
+ * back on the first chunk if arch is manually allocating and mapping
+ * it for faster access (as a part of large page mapping for example).
+ * Note that dynamic percpu allocator covers both static and dynamic
+ * areas, so these values are bigger than PERCPU_MODULE_RESERVE.
+ *
+ * On typical configuration with modules, the following values leave
+ * about 8k of free space on the first chunk after boot on both x86_32
+ * and 64 when module support is enabled.  When module support is
+ * disabled, it's much tighter.
+ */
+#ifndef PERCPU_DYNAMIC_RESERVE
+#  if BITS_PER_LONG > 32
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE   (6 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE   (4 << PAGE_SHIFT)
+#    endif
+#  else
+#    ifdef CONFIG_MODULES
+#      define PERCPU_DYNAMIC_RESERVE   (4 << PAGE_SHIFT)
+#    else
+#      define PERCPU_DYNAMIC_RESERVE   (2 << PAGE_SHIFT)
+#    endif
+#  endif
+#endif /* PERCPU_DYNAMIC_RESERVE */
+
+extern void *pcpu_base_addr;
+
+typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+
+extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+                                       size_t static_size, size_t unit_size,
+                                       size_t free_size, void *base_addr,
+                                       pcpu_populate_pte_fn_t populate_pte_fn);
+
+/*
+ * Use this to get to a cpu's version of the per-cpu object
+ * dynamically allocated. Non-atomic access to the current CPU's
+ * version should probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)  SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 struct percpu_data {
        void *ptrs[1];
 };
 
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/* 
- * Use this to get to a cpu's version of the per-cpu object dynamically
- * allocated. Non-atomic access to the current CPU's version should
- * probably be combined with get_cpu()/put_cpu().
- */ 
-#define percpu_ptr(ptr, cpu)                              \
-({                                                        \
-        struct percpu_data *__p = __percpu_disguise(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];               \
+
+#define per_cpu_ptr(ptr, cpu)                                          \
+({                                                                     \
+        struct percpu_data *__p = __percpu_disguise(ptr);              \
+        (__typeof__(ptr))__p->ptrs[(cpu)];                             \
 })
 
-extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
-extern void percpu_free(void *__pdata);
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+extern void *__alloc_percpu(size_t size, size_t align);
+extern void free_percpu(void *__pdata);
 
 #else /* CONFIG_SMP */
 
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
-static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+/*
+ * !CONFIG_SMP variant: the object is a single zeroed kmalloc area and
+ * per_cpu_ptr() hands the pointer back unchanged.  @align is only
+ * sanity-checked below; it is not enforced.
+ */
+static inline void *__alloc_percpu(size_t size, size_t align)
 {
+       /*
+        * Can't easily make larger alignment work with kmalloc.  WARN
+        * on it.  Larger alignment should only be used for module
+        * percpu sections on SMP for which this path isn't used.
+        */
+       WARN_ON_ONCE(align > __alignof__(unsigned long long));
-       return kzalloc(size, gfp);
+       /* the gfp parameter is gone from the signature; always GFP_KERNEL */
+       return kzalloc(size, GFP_KERNEL);
 }
 
-static inline void percpu_free(void *__pdata)
+static inline void free_percpu(void *p)
 {
-       kfree(__pdata);
+       kfree(p);
 }
 
 #endif /* CONFIG_SMP */
 
-#define percpu_alloc_mask(size, gfp, mask) \
-       __percpu_alloc_mask((size), (gfp), &(mask))
-
-#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
-
-/* (legacy) interface for use without CPU hotplug handling */
-
-#define __alloc_percpu(size)   percpu_alloc_mask((size), GFP_KERNEL, \
-                                                 cpu_possible_map)
-#define alloc_percpu(type)     (type *)__alloc_percpu(sizeof(type))
-#define free_percpu(ptr)       percpu_free((ptr))
-#define per_cpu_ptr(ptr, cpu)  percpu_ptr((ptr), (cpu))
+#define alloc_percpu(type)     (type *)__alloc_percpu(sizeof(type), \
+                                                      __alignof__(type))
 
 #endif /* __LINUX_PERCPU_H */
index 9c0890c7a06a357dd62e12cde059ca86764f8efa..a43ebec3a7b92692f0cc970a075e2462b705e270 100644 (file)
@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
 
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
                        struct page ***pages);
+extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
+                                   pgprot_t prot, struct page **pages);
+extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
 
 /* Allocate/destroy a 'vmalloc' VM area. */
@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
  */
 extern rwlock_t vmlist_lock;
 extern struct vm_struct *vmlist;
+extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
 
 #endif /* _LINUX_VMALLOC_H */
index ba22484a987eb96a9d2bff0cdeede4b62a1d7156..1f0657ae555b15eadbb10658a02665bad204e828 100644 (file)
@@ -51,6 +51,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/percpu.h>
 
 #if 0
 #define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
 }
 
 #ifdef CONFIG_SMP
+
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+/*
+ * Allocate the percpu area for a module through the dynamic percpu
+ * allocator.  @name is only used in the alignment warning.  Returns
+ * the percpu pointer, or NULL (after logging a warning) on failure.
+ */
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+                            const char *name)
+{
+       void *pcpu;
+
+       /* clamp over-sized alignment requests to one page, but say so */
+       if (align > PAGE_SIZE) {
+               printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+                      name, align, PAGE_SIZE);
+               align = PAGE_SIZE;
+       }
+
+       pcpu = __alloc_percpu(size, align);
+       if (pcpu)
+               return pcpu;
+
+       printk(KERN_WARNING
+              "Could not allocate %lu bytes percpu data\n", size);
+       return NULL;
+}
+
+/* Release a module percpu area obtained from percpu_modalloc(). */
+static void percpu_modfree(void *freeme)
+{
+       free_percpu(freeme);
+}
+
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
 /* Size of each block.  -ve means used. */
@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
        }
 }
 
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-                                Elf_Shdr *sechdrs,
-                                const char *secstrings)
-{
-       return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-}
-
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               memcpy(pcpudest + per_cpu_offset(cpu), from, size);
-}
-
 static int percpu_modinit(void)
 {
        pcpu_num_used = 2;
@@ -513,7 +527,26 @@ static int percpu_modinit(void)
        return 0;
 }
 __initcall(percpu_modinit);
+
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+/*
+ * Look up the module's ".data.percpu" section and return whatever
+ * find_sec() reports for it.
+ * NOTE(review): presumably a section index with 0 meaning "not
+ * present" -- confirm against find_sec().
+ */
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+                                Elf_Shdr *sechdrs,
+                                const char *secstrings)
+{
+       return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+/*
+ * Copy @size bytes of initial data from @from into every possible
+ * cpu's instance of the percpu area @pcpudest.
+ */
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               void *dst = pcpudest + per_cpu_offset(cpu);
+
+               memcpy(dst, from, size);
+       }
+}
+
 #else /* ... !CONFIG_SMP */
+
 static inline void *percpu_modalloc(unsigned long size, unsigned long align,
                                    const char *name)
 {
@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
 }
+
 #endif /* CONFIG_SMP */
 
 #define MODINFO_ATTR(field)    \
index 7d97ff7c447804cacde6adc114cde868296d67ac..0e5c38e1c8b5cdad3e2ab022fa8db8f79b88316c 100644 (file)
@@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
 
 #ifndef CONFIG_64BIT
@@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 
 #ifndef CONFIG_64BIT
        /*
@@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        ca = task_ca(tsk);
 
        for (; ca; ca = ca->parent) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+               u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }
index 0cd415ee62a262ccadea6b592a7402eb89fc4797..74541ca49536fda5dab61d473f8ce9d6aa3f3254 100644 (file)
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
         * doesn't hit this CPU until we're ready. */
        get_cpu();
        for_each_online_cpu(i) {
-               sm_work = percpu_ptr(stop_machine_work, i);
+               sm_work = per_cpu_ptr(stop_machine_work, i);
                INIT_WORK(sm_work, stop_cpu);
                queue_work_on(i, stop_machine_wq, sm_work);
        }
index 72255be57f89160cf8ff12e702c013db310d6ef4..818569b68f4652e623f5d2bd4ed91fe68fcea2b4 100644 (file)
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
 obj-$(CONFIG_SMP) += allocpercpu.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
index 4297bc41bfd2aefc986d3d3957cf83e37a63cd4a..3653c570232bc5fd643a6c9cdccf4c05360525c7 100644 (file)
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
        __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * __alloc_percpu - initial setup of per-cpu data
  * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
+ * @align: alignment (only checked: warns once if it exceeds that of
+ *        unsigned long long; it is not passed on to the allocation)
  *
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
+ * Allocate dynamic percpu area.  Percpu objects are populated with
+ * zeroed buffers.
  */
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
 {
        /*
         * We allocate whole cache lines to avoid false sharing
         */
        size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-       void *pdata = kzalloc(sz, gfp);
+       void *pdata = kzalloc(sz, GFP_KERNEL);
        void *__pdata = __percpu_disguise(pdata);
 
+       /*
+        * Can't easily make larger alignment work with kmalloc.  WARN
+        * on it.  Larger alignment should only be used for module
+        * percpu sections on SMP for which this path isn't used.
+        */
+       WARN_ON_ONCE(align > __alignof__(unsigned long long));
+
        if (unlikely(!pdata))
                return NULL;
-       if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+       if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
+                                          &cpu_possible_map)))
                return __pdata;
        kfree(pdata);
        return NULL;
 }
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
 
 /**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
  * @__pdata: object to clean up
  *
  * We simply clean up any per-cpu object left. No need for the client to
  * track and specify through a bis mask which per-cpu objects are to free.
  */
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
 {
        if (unlikely(!__pdata))
                return;
        __percpu_depopulate_mask(__pdata, &cpu_possible_map);
+       /* undo the pointer disguise applied at allocation time */
        kfree(__percpu_disguise(__pdata));
 }
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);
index 51a0ccf61e0e97284cf3be1d05d4f43feb293d93..d7140c008ba8b6786d7480d3cb4068dcc6a3ce1b 100644 (file)
@@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
 
 static int bootmem_debug;
 
+/*
+ * If an arch needs to apply workarounds to bootmem allocation, it can
+ * set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
+ * __alloc_bootmem_core().
+ */
+#ifndef CONFIG_HAVE_ARCH_BOOTMEM
+#define alloc_bootmem_core(bdata, size, align, goal, limit)            \
+       __alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
+#endif
+
 static int __init bootmem_debug_setup(char *buf)
 {
        bootmem_debug = 1;
@@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
        return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
 }
 
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 /**
  * reserve_bootmem - mark a page range as usable
  * @addr: starting address of the range
@@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
 
        return mark_bootmem(start, end, 1, flags);
 }
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
 static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
                        unsigned long step)
@@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
        return ALIGN(base + off, align) - base;
 }
 
-static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
+static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
                                unsigned long size, unsigned long align,
                                unsigned long goal, unsigned long limit)
 {
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644 (file)
index 0000000..5954e7a
--- /dev/null
@@ -0,0 +1,979 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009          SUSE Linux Products GmbH
+ * Copyright (C) 2009          Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * chunk consists of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running).  Unit grows as
+ * necessary and all units grow or shrink in unison.  When a chunk is
+ * filled up, another chunk is allocated.  ie. in vmalloc area
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations, many of them as
+ * small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be equal to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  A positive value in the map represents a free
+ * region and negative allocated.  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the following.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ *   setup the first chunk containing the kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define PCPU_SLOT_BASE_SHIFT           5       /* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC             16      /* start a map with 16 ents */
+
+/*
+ * Bookkeeping for one percpu chunk: free-space accounting, the
+ * allocation map (positive entry = free region, negative = allocated)
+ * and the pages backing every unit of the chunk.
+ */
+struct pcpu_chunk {
+       struct list_head        list;           /* linked to pcpu_slot lists */
+       struct rb_node          rb_node;        /* key is chunk->vm->addr */
+       int                     free_size;      /* free bytes in the chunk */
+       int                     contig_hint;    /* max contiguous size hint */
+       struct vm_struct        *vm;            /* mapped vmalloc region */
+       int                     map_used;       /* # of map entries used */
+       int                     map_alloc;      /* # of map entries allocated */
+       int                     *map;           /* allocation map */
+       bool                    immutable;      /* no [de]population allowed */
+       struct page             *page[];        /* #cpus * UNIT_PAGES */
+};
+
+/* per-cpu stride, in pages, used when indexing chunk->page[] */
+static int pcpu_unit_pages __read_mostly;
+/* a chunk whose free size equals this is filed in the last slot */
+static int pcpu_unit_size __read_mostly;
+/* NOTE(review): presumably unit size times number of units -- confirm */
+static int pcpu_chunk_size __read_mostly;
+/* slot count; the highest slot index used is pcpu_nr_slots - 1 */
+static int pcpu_nr_slots __read_mostly;
+/* NOTE(review): presumably size of pcpu_chunk incl. page[] -- confirm */
+static size_t pcpu_chunk_struct_size __read_mostly;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __read_mostly;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+/* the size of kernel static area */
+static int pcpu_static_size __read_mostly;
+
+/*
+ * One mutex to rule them all.
+ *
+ * The following mutex is grabbed in the outermost public alloc/free
+ * interface functions and released only when the operation is
+ * complete.  As such, every function in this file other than the
+ * outermost functions are called under pcpu_mutex.
+ *
+ * It can easily be switched to use spinlock such that only the area
+ * allocation and page population commit are protected with it doing
+ * actual [de]allocation without holding any lock.  However, given
+ * what this allocator does, I think it's better to let them run
+ * sequentially.
+ */
+static DEFINE_MUTEX(pcpu_mutex);
+
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT;        /* chunks by address */
+
+/* Map a size in bytes to a slot number; the result is never below 1. */
+static int __pcpu_size_to_slot(int size)
+{
+       int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;
+
+       return max(slot, 1);
+}
+
+/*
+ * Slot for a given free size.  A size of exactly one whole unit maps
+ * to the topmost slot; everything else goes through
+ * __pcpu_size_to_slot().
+ */
+static int pcpu_size_to_slot(int size)
+{
+       return size == pcpu_unit_size ? pcpu_nr_slots - 1
+                                     : __pcpu_size_to_slot(size);
+}
+
+/*
+ * Slot @chunk belongs to.  Chunks which can't fit even an int, going
+ * by either free_size or contig_hint, all live in slot 0.
+ */
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+       if (chunk->free_size >= sizeof(int) &&
+           chunk->contig_hint >= sizeof(int))
+               return pcpu_size_to_slot(chunk->free_size);
+
+       return 0;
+}
+
+/*
+ * Flat index of page @page_idx of @cpu's unit: units are laid out
+ * back to back, pcpu_unit_pages entries each.
+ */
+static int pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+       return cpu * pcpu_unit_pages + page_idx;
+}
+
+/* Entry of @chunk's page array describing page @page_idx of @cpu's unit. */
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+                                     unsigned int cpu, int page_idx)
+{
+       int idx = pcpu_page_idx(cpu, page_idx);
+
+       return chunk->page + idx;
+}
+
+/* Address of page @page_idx of @cpu's unit inside @chunk's vm area. */
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+                                    unsigned int cpu, int page_idx)
+{
+       unsigned long base = (unsigned long)chunk->vm->addr;
+       int byte_off = pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT;
+
+       return base + byte_off;
+}
+
+/*
+ * Whether page @page_idx of @chunk has a page attached.  Only cpu 0's
+ * entry is inspected.
+ * NOTE(review): presumably valid because all units are populated in
+ * unison -- confirm against the populate path.
+ */
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+                                    int page_idx)
+{
+       return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+}
+
+/**
+ * pcpu_realloc - versatile realloc
+ * @p: the current pointer (can be NULL for new allocations)
+ * @size: the current size in bytes (can be 0 for new allocations)
+ * @new_size: the wanted new size in bytes (can be 0 for free)
+ *
+ * Allocate, resize or free a memory area of arbitrary size.  Areas of
+ * up to PAGE_SIZE come from kmalloc, larger ones from vmalloc.  The
+ * old contents are carried over and any newly gained tail is zeroed.
+ *
+ * RETURNS:
+ * The new pointer on success, NULL on failure.
+ */
+static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+{
+       void *new;
+
+       new = new_size > PAGE_SIZE ? vmalloc(new_size)
+                                  : kmalloc(new_size, GFP_KERNEL);
+       if (!new && new_size)
+               return NULL;
+
+       /* carry over the old contents and zero whatever was grown */
+       memcpy(new, p, min(size, new_size));
+       if (size < new_size)
+               memset(new + size, 0, new_size - size);
+
+       /* release the old area with the allocator matching its size */
+       if (size > PAGE_SIZE)
+               vfree(p);
+       else
+               kfree(p);
+
+       return new;
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * Called after an allocation or free changed @chunk.  The slot
+ * matching the new state is computed and, if it differs from @oslot,
+ * @chunk is moved there: to the head of the list when moving to a
+ * higher slot, to the tail otherwise.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+       int nslot = pcpu_chunk_slot(chunk);
+
+       if (nslot == oslot)
+               return;
+
+       if (nslot > oslot)
+               list_move(&chunk->list, &pcpu_slot[nslot]);
+       else
+               list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+}
+
+/*
+ * Walk the address rb tree comparing @addr against each chunk's
+ * vm->addr.  Returns the link slot where the walk stopped: the slot
+ * pointing at the exactly matching node, or the empty (NULL) slot
+ * where a node for @addr would be linked.  If @parentp is non-NULL it
+ * receives the last node visited (NULL for an empty tree).
+ */
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+                                            struct rb_node **parentp)
+{
+       struct rb_node **p = &pcpu_addr_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct pcpu_chunk *chunk;
+
+       while (*p) {
+               parent = *p;
+               chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+
+               if (addr < chunk->vm->addr)
+                       p = &(*p)->rb_left;
+               else if (addr > chunk->vm->addr)
+                       p = &(*p)->rb_right;
+               else
+                       break;
+       }
+
+       if (parentp)
+               *parentp = parent;
+       return p;
+}
+
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr.  More specifically, it
+ * searches for the chunk with the highest start address which isn't
+ * beyond @addr.  BUGs if the tree is empty or if no chunk starts at
+ * or below @addr.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+       struct rb_node *n, *parent;
+       struct pcpu_chunk *chunk;
+
+       n = *pcpu_chunk_rb_search(addr, &parent);
+       if (!n) {
+               /* no exactly matching chunk, the parent is the closest */
+               n = parent;
+               BUG_ON(!n);
+       }
+       chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+
+       if (addr < chunk->vm->addr) {
+               /* the parent was the next one, look for the previous one */
+               n = rb_prev(n);
+               BUG_ON(!n);
+               chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+       }
+
+       return chunk;
+}
+
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree, keyed by @new->vm->addr.  BUGs if
+ * a chunk with the same start address is already in the tree.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+       struct rb_node **p, *parent;
+
+       p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+       /* a non-empty slot means an exact match, i.e. a duplicate key */
+       BUG_ON(*p);
+       rb_link_node(&new->rb_node, parent, p);
+       rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size in bytes (can be 0)
+ * @tail: tail size in bytes (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks.  If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * The map is grown (by doubling) first if it can't hold the extra
+ * entries; on allocation failure the map is left untouched.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+{
+       /* one extra map entry for each of head and tail that is non-zero */
+       int nr_extra = !!head + !!tail;
+       int target = chunk->map_used + nr_extra;
+
+       /* reallocation required? */
+       if (chunk->map_alloc < target) {
+               int new_alloc = chunk->map_alloc;
+               int *new;
+
+               while (new_alloc < target)
+                       new_alloc *= 2;
+
+               new = pcpu_realloc(chunk->map,
+                                  chunk->map_alloc * sizeof(new[0]),
+                                  new_alloc * sizeof(new[0]));
+               if (!new)
+                       return -ENOMEM;
+
+               chunk->map_alloc = new_alloc;
+               chunk->map = new;
+       }
+
+       /* insert a new subblock */
+       memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+               sizeof(chunk->map[0]) * (chunk->map_used - i));
+       chunk->map_used += nr_extra;
+
+       /*
+        * map[i] still holds the original block size here; the memmove
+        * above only copied it further up.
+        */
+       if (head) {
+               chunk->map[i + 1] = chunk->map[i] - head;
+               chunk->map[i++] = head;
+       }
+       if (tail) {
+               chunk->map[i++] -= tail;
+               chunk->map[i] = tail;
+       }
+       return 0;
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size in bytes
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * The map is scanned first-fit.  chunk->contig_hint is refreshed from
+ * what the scan saw and @chunk is moved to the slot matching its new
+ * state before returning, except on the -ENOMEM paths.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -errno on failure.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+       int oslot = pcpu_chunk_slot(chunk);
+       int max_contig = 0;
+       int i, off;
+
+       /*
+        * The static chunk initially doesn't have map attached
+        * because kmalloc wasn't available during init.  Give it one.
+        */
+       if (unlikely(!chunk->map)) {
+               chunk->map = pcpu_realloc(NULL, 0,
+                               PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+               if (!chunk->map)
+                       return -ENOMEM;
+
+               chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+               /* static area is one allocated block, the rest one free block */
+               chunk->map[chunk->map_used++] = -pcpu_static_size;
+               if (chunk->free_size)
+                       chunk->map[chunk->map_used++] = chunk->free_size;
+       }
+
+       /* off tracks the byte offset of block i within the unit */
+       for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+               /* sampled before any split below changes map_used */
+               bool is_last = i + 1 == chunk->map_used;
+               int head, tail;
+
+               /* extra for alignment requirement */
+               head = ALIGN(off, align) - off;
+               /*
+                * off is 0 for the first block, so head must be 0 there;
+                * this also guards the map[i - 1] accesses below.
+                */
+               BUG_ON(i == 0 && head != 0);
+
+               if (chunk->map[i] < 0)
+                       continue;
+               if (chunk->map[i] < head + size) {
+                       max_contig = max(chunk->map[i], max_contig);
+                       continue;
+               }
+
+               /*
+                * If head is small or the previous block is free,
+                * merge'em.  Note that 'small' is defined as smaller
+                * than sizeof(int), which is very small but isn't too
+                * uncommon for percpu allocations.
+                */
+               if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+                       if (chunk->map[i - 1] > 0)
+                               chunk->map[i - 1] += head;
+                       else {
+                               /*
+                                * previous block is allocated (negative):
+                                * grow it, the head bytes stop being free
+                                */
+                               chunk->map[i - 1] -= head;
+                               chunk->free_size -= head;
+                       }
+                       chunk->map[i] -= head;
+                       off += head;
+                       head = 0;
+               }
+
+               /* if tail is small, just keep it around */
+               tail = chunk->map[i] - head - size;
+               if (tail < sizeof(int))
+                       tail = 0;
+
+               /* split if warranted */
+               if (head || tail) {
+                       if (pcpu_split_block(chunk, i, head, tail))
+                               return -ENOMEM;
+                       if (head) {
+                               /* step over the new head block */
+                               i++;
+                               off += head;
+                               max_contig = max(chunk->map[i - 1], max_contig);
+                       }
+                       if (tail)
+                               max_contig = max(chunk->map[i + 1], max_contig);
+               }
+
+               /* update hint and mark allocated */
+               if (is_last)
+                       chunk->contig_hint = max_contig; /* fully scanned */
+               else
+                       chunk->contig_hint = max(chunk->contig_hint,
+                                                max_contig);
+
+               /* allocated blocks are stored as negative sizes */
+               chunk->free_size -= chunk->map[i];
+               chunk->map[i] = -chunk->map[i];
+
+               pcpu_chunk_relocate(chunk, oslot);
+               return off;
+       }
+
+       chunk->contig_hint = max_contig;        /* fully scanned */
+       pcpu_chunk_relocate(chunk, oslot);
+
+       /*
+        * Tell the upper layer that this chunk has no area left.
+        * Note that this is not an error condition but a notification
+        * to upper layer that it needs to look at other chunks.
+        * -ENOSPC is chosen as it isn't used in memory subsystem and
+        * matches the meaning in a way.
+        */
+       return -ENOSPC;
+}
+
+/*
+ * Fold map entry @idx + 1 of @chunk into entry @idx and close the gap
+ * this leaves in the allocation map.
+ */
+static void pcpu_fold_next_entry(struct pcpu_chunk *chunk, int idx)
+{
+       chunk->map[idx] += chunk->map[idx + 1];
+       chunk->map_used--;
+       memmove(&chunk->map[idx + 1], &chunk->map[idx + 2],
+               (chunk->map_used - (idx + 1)) * sizeof(chunk->map[0]));
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Turn the allocated map entry that starts at offset @freeme back
+ * into a free one and coalesce it with free neighbours.  Only the
+ * allocation map, the free size and the contig hint are touched; the
+ * area is neither depopulated nor unmapped here.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+       int oslot = pcpu_chunk_slot(chunk);
+       int idx = 0;
+       int pos = 0;
+
+       /* locate the map entry which begins at @freeme */
+       while (idx < chunk->map_used && pos != freeme) {
+               pos += abs(chunk->map[idx]);
+               idx++;
+       }
+       BUG_ON(pos != freeme);
+       BUG_ON(chunk->map[idx] > 0);
+
+       /* allocated entries are kept negated, flip the sign back */
+       chunk->free_size -= chunk->map[idx];
+       chunk->map[idx] = -chunk->map[idx];
+
+       /* coalesce with a free predecessor */
+       if (idx > 0 && chunk->map[idx - 1] >= 0) {
+               idx--;
+               pcpu_fold_next_entry(chunk, idx);
+       }
+       /* coalesce with a free successor */
+       if (idx + 1 < chunk->map_used && chunk->map[idx + 1] >= 0)
+               pcpu_fold_next_entry(chunk, idx);
+
+       chunk->contig_hint = max(chunk->map[idx], chunk->contig_hint);
+       pcpu_chunk_relocate(chunk, oslot);
+}
+
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * Remove the mappings of pages [@page_start,@page_end) of @chunk for
+ * every possible cpu.  With @flush set, the vcache is flushed before
+ * the unmapping and the tlb afterwards.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+                      bool flush)
+{
+       unsigned int last_cpu = num_possible_cpus() - 1;
+       unsigned long nr_bytes = (page_end - page_start) << PAGE_SHIFT;
+       unsigned int cpu;
+
+       /* unmap must not be done on immutable chunk */
+       WARN_ON(chunk->immutable);
+
+       /*
+        * A single flush is expensive, so one call spanning the units
+        * of all cpus is issued instead of one call per cpu.  This may
+        * flush more than strictly needed but scales better.
+        */
+       if (flush)
+               flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+                                  pcpu_chunk_addr(chunk, last_cpu, page_end));
+
+       for_each_possible_cpu(cpu) {
+               unsigned long addr = pcpu_chunk_addr(chunk, cpu, page_start);
+
+               unmap_kernel_range_noflush(addr, nr_bytes);
+       }
+
+       /* same single-call approach as for flush_cache_vunmap() above */
+       if (flush)
+               flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+                                      pcpu_chunk_addr(chunk, last_cpu,
+                                                      page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, free every populated page among the pages covering
+ * [@off,@off + @size) of @chunk and unmap the page range in which
+ * pages were freed.  @flush is handed on to pcpu_unmap().
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
+                                 bool flush)
+{
+       int first_pg = PFN_DOWN(off);
+       int end_pg = PFN_UP(off + size);
+       int unmap_start = -1;
+       int uninitialized_var(unmap_end);
+       unsigned int cpu;
+       int pg;
+
+       for (pg = first_pg; pg < end_pg; pg++) {
+               for_each_possible_cpu(cpu) {
+                       struct page **slot = pcpu_chunk_pagep(chunk, cpu, pg);
+
+                       if (*slot) {
+                               __free_page(*slot);
+
+                               /*
+                                * The chunk may be populated or
+                                * depopulated again after a partial
+                                * depopulation, so record that the
+                                * page is gone.
+                                */
+                               *slot = NULL;
+
+                               /* remember the span which needs unmapping */
+                               if (unmap_start < 0)
+                                       unmap_start = pg;
+                               unmap_end = pg + 1;
+                       }
+               }
+       }
+
+       /* unmap_end is only valid once unmap_start has been set */
+       if (unmap_start >= 0)
+               pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * Map pages [@page_start,@page_end) of @chunk for every possible cpu
+ * and flush the vcache once all cpus are done.
+ *
+ * RETURNS:
+ * 0 on success, the negative error code of the failed mapping
+ * otherwise.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+       unsigned int last_cpu = num_possible_cpus() - 1;
+       unsigned long nr_bytes = (page_end - page_start) << PAGE_SHIFT;
+       unsigned int cpu;
+       int ret;
+
+       /* map must not be done on immutable chunk */
+       WARN_ON(chunk->immutable);
+
+       for_each_possible_cpu(cpu) {
+               unsigned long addr = pcpu_chunk_addr(chunk, cpu, page_start);
+               struct page **pages = pcpu_chunk_pagep(chunk, cpu, page_start);
+
+               ret = map_kernel_range_noflush(addr, nr_bytes, PAGE_KERNEL,
+                                              pages);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* one flush spanning all cpus, see the comments in pcpu_unmap() */
+       flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+                        pcpu_chunk_addr(chunk, last_cpu, page_end));
+       return 0;
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, allocate and map the not yet occupied pages among
+ * the pages covering [@off,@off + @size) of @chunk.  The area is
+ * cleared on return.
+ *
+ * RETURNS:
+ * 0 on success, -ENOMEM if page allocation or mapping failed.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+       const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+       int page_start = PFN_DOWN(off);
+       int page_end = PFN_UP(off + size);
+       int map_start = -1;
+       /*
+        * map_end is only read while map_start >= 0, at which point it
+        * has been assigned.  gcc can't see that, so annotate it the
+        * same way pcpu_depopulate_chunk() annotates unmap_end.
+        */
+       int uninitialized_var(map_end);
+       unsigned int cpu;
+       int i;
+
+       for (i = page_start; i < page_end; i++) {
+               if (pcpu_chunk_page_occupied(chunk, i)) {
+                       /* an occupied page ends the pending run, map it */
+                       if (map_start >= 0) {
+                               if (pcpu_map(chunk, map_start, map_end))
+                                       goto err;
+                               map_start = -1;
+                       }
+                       continue;
+               }
+
+               /* start or extend the run of pages which need mapping */
+               map_start = map_start < 0 ? i : map_start;
+               map_end = i + 1;
+
+               for_each_possible_cpu(cpu) {
+                       struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+                       *pagep = alloc_pages_node(cpu_to_node(cpu),
+                                                 alloc_mask, 0);
+                       if (!*pagep)
+                               goto err;
+               }
+       }
+
+       /* map the run which reaches up to the end of the area, if any */
+       if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+               goto err;
+
+       for_each_possible_cpu(cpu)
+               memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
+                      size);
+
+       return 0;
+err:
+       /*
+        * likely under heavy memory pressure, give memory back
+        *
+        * NOTE(review): pcpu_depopulate_chunk() frees every populated
+        * page in the range, which appears to include pages that were
+        * already occupied before this call and skipped above.
+        * Confirm that no other allocation can still be using them.
+        */
+       pcpu_depopulate_chunk(chunk, off, size, true);
+       return -ENOMEM;
+}
+
+/*
+ * Release @chunk together with its vm area and allocation map.
+ * A NULL @chunk is ignored.
+ */
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+       size_t map_bytes;
+
+       if (!chunk)
+               return;
+
+       if (chunk->vm)
+               free_vm_area(chunk->vm);
+
+       /* a new size of zero makes pcpu_realloc() give the map back */
+       map_bytes = chunk->map_alloc * sizeof(chunk->map[0]);
+       pcpu_realloc(chunk->map, map_bytes, 0);
+
+       kfree(chunk);
+}
+
+/*
+ * Allocate a new, completely free chunk: the chunk structure, its
+ * allocation map holding a single free entry of pcpu_unit_size bytes
+ * and a vm area of pcpu_chunk_size bytes.  No pages are populated.
+ *
+ * RETURNS:
+ * Pointer to the new chunk on success, NULL if any allocation failed.
+ */
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+       struct pcpu_chunk *chunk;
+
+       chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+       if (!chunk)
+               return NULL;
+
+       chunk->map = pcpu_realloc(NULL, 0,
+                                 PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+       /* the map is written right below, don't dereference a failed alloc */
+       if (!chunk->map) {
+               kfree(chunk);
+               return NULL;
+       }
+       chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+       chunk->map[chunk->map_used++] = pcpu_unit_size;
+
+       /*
+        * The second argument of get_vm_area() takes VM_* area flags,
+        * not a gfp mask.  Use VM_ALLOC like the static chunk does.
+        */
+       chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
+       if (!chunk->vm) {
+               free_pcpu_chunk(chunk);
+               return NULL;
+       }
+
+       INIT_LIST_HEAD(&chunk->list);
+       chunk->free_size = pcpu_unit_size;
+       chunk->contig_hint = pcpu_unit_size;
+
+       return chunk;
+}
+
+/**
+ * __alloc_percpu - allocate percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+       void *ptr = NULL;       /* stays NULL on every failure path */
+       struct pcpu_chunk *chunk;
+       int slot, off;
+
+       /* reject requests the allocator can't serve before taking the lock */
+       if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+               WARN(true, "illegal size (%zu) or align (%zu) for "
+                    "percpu allocation\n", size, align);
+               return NULL;
+       }
+
+       mutex_lock(&pcpu_mutex);
+
+       /*
+        * allocate area: walk the slots starting from the one matching
+        * @size and try each chunk whose contig hint is large enough
+        */
+       for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+               list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+                       if (size > chunk->contig_hint)
+                               continue;
+                       off = pcpu_alloc_area(chunk, size, align);
+                       if (off >= 0)
+                               goto area_found;
+                       /* -ENOSPC only means "try the next chunk" */
+                       if (off != -ENOSPC)
+                               goto out_unlock;
+               }
+       }
+
+       /* hmmm... no space left, create a new chunk */
+       chunk = alloc_pcpu_chunk();
+       if (!chunk)
+               goto out_unlock;
+       /* link the new chunk in before carving the area out of it */
+       pcpu_chunk_relocate(chunk, -1);
+       pcpu_chunk_addr_insert(chunk);
+
+       off = pcpu_alloc_area(chunk, size, align);
+       if (off < 0)
+               goto out_unlock;
+
+area_found:
+       /* populate, map and clear the area */
+       if (pcpu_populate_chunk(chunk, off, size)) {
+               /* hand the area back to the allocation map on failure */
+               pcpu_free_area(chunk, off);
+               goto out_unlock;
+       }
+
+       ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+out_unlock:
+       mutex_unlock(&pcpu_mutex);
+       return ptr;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+/*
+ * Destroy @chunk: free and unmap all of its pages without flushing,
+ * unlink it from its slot list and from pcpu_addr_root, then free the
+ * chunk itself.  Warns if called on an immutable chunk.
+ */
+static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+{
+       WARN_ON(chunk->immutable);
+       /* depopulate the whole unit; flush == false skips cache/tlb flushes */
+       pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+       list_del(&chunk->list);
+       rb_erase(&chunk->rb_node, &pcpu_addr_root);
+       free_pcpu_chunk(chunk);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Give the percpu area @ptr back to its chunk.  A NULL @ptr is
+ * ignored.  Might sleep.
+ */
+void free_percpu(void *ptr)
+{
+       void *addr = __pcpu_ptr_to_addr(ptr);
+       struct pcpu_chunk *chunk;
+
+       if (!ptr)
+               return;
+
+       mutex_lock(&pcpu_mutex);
+
+       chunk = pcpu_chunk_addr_search(addr);
+       pcpu_free_area(chunk, addr - chunk->vm->addr);
+
+       /*
+        * If this made the chunk fully free and another chunk shares
+        * its slot, kill one of the others.
+        */
+       if (chunk->free_size == pcpu_unit_size) {
+               struct pcpu_chunk *other;
+
+               list_for_each_entry(other,
+                                   &pcpu_slot[pcpu_chunk_slot(chunk)], list) {
+                       if (other == chunk)
+                               continue;
+                       pcpu_kill_chunk(other);
+                       break;
+               }
+       }
+
+       mutex_unlock(&pcpu_mutex);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * pcpu_setup_first_chunk - initialize the first percpu chunk
+ * @get_page_fn: callback to fetch page pointer
+ * @static_size: the size of static percpu area in bytes
+ * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
+ * @free_size: free size in bytes, 0 for auto
+ * @base_addr: mapped address, NULL for auto
+ * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
+ *
+ * Initialize the first percpu chunk which contains the kernel static
+ * percpu area.  This function is to be called from arch percpu area
+ * setup path.  The first two parameters are mandatory.  The rest are
+ * optional.
+ *
+ * @get_page_fn() should return pointer to percpu page given cpu
+ * number and page number.  It should at least return enough pages to
+ * cover the static area.  The returned pages for static area should
+ * have been initialized with valid data.  If @unit_size is specified,
+ * it can also return pages after the static area.  NULL return
+ * indicates end of pages for the cpu.  Note that @get_page_fn() must
+ * return the same number of pages for all cpus.
+ *
+ * @unit_size, if non-zero, determines unit size and must be aligned
+ * to PAGE_SIZE and equal to or larger than @static_size + @free_size.
+ *
+ * @free_size determines the number of free bytes after the static
+ * area in the first chunk.  If zero, whatever left is available.
+ * Specifying a non-zero value makes percpu leave the area after
+ * @static_size + @free_size alone.
+ *
+ * Non-null @base_addr means that the caller already allocated virtual
+ * region for the first chunk and mapped it.  percpu must not mess
+ * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
+ * @populate_pte_fn doesn't make any sense.
+ *
+ * @populate_pte_fn is used to populate the pagetable.  NULL means the
+ * caller already populated the pagetable.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access.
+ */
+size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
+                                    size_t static_size, size_t unit_size,
+                                    size_t free_size, void *base_addr,
+                                    pcpu_populate_pte_fn_t populate_pte_fn)
+{
+       static struct vm_struct static_vm;
+       struct pcpu_chunk *static_chunk;
+       unsigned int cpu;
+       int nr_pages;
+       int err, i;
+
+       /* sanity checks */
+       BUG_ON(!static_size);
+       BUG_ON(!unit_size && free_size);
+       BUG_ON(unit_size && unit_size < static_size + free_size);
+       BUG_ON(unit_size & ~PAGE_MASK);
+       BUG_ON(base_addr && !unit_size);
+       BUG_ON(base_addr && populate_pte_fn);
+
+       /*
+        * An explicit @unit_size wins.  Otherwise use whichever is
+        * larger, the minimum unit size or the static area.
+        */
+       if (unit_size)
+               pcpu_unit_pages = unit_size >> PAGE_SHIFT;
+       else
+               pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
+                                       PFN_UP(static_size));
+
+       /* derive the global sizes from the unit size in pages */
+       pcpu_static_size = static_size;
+       pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
+       pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+       pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+               + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
+
+       /*
+        * Allocate chunk slots.  The additional last slot is for
+        * empty chunks.
+        */
+       pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+       pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+       for (i = 0; i < pcpu_nr_slots; i++)
+               INIT_LIST_HEAD(&pcpu_slot[i]);
+
+       /* init static_chunk */
+       static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
+       INIT_LIST_HEAD(&static_chunk->list);
+       static_chunk->vm = &static_vm;
+
+       /* a zero @free_size means everything after the static area is free */
+       if (free_size)
+               static_chunk->free_size = free_size;
+       else
+               static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+
+       static_chunk->contig_hint = static_chunk->free_size;
+
+       /* allocate vm address */
+       static_vm.flags = VM_ALLOC;
+       static_vm.size = pcpu_chunk_size;
+
+       if (!base_addr)
+               vm_area_register_early(&static_vm, PAGE_SIZE);
+       else {
+               /*
+                * Pages already mapped.  No need to remap into
+                * vmalloc area.  In this case the static chunk can't
+                * be mapped or unmapped by percpu and is marked
+                * immutable.
+                */
+               static_vm.addr = base_addr;
+               static_chunk->immutable = true;
+       }
+
+       /*
+        * assign pages: fetch pages for each cpu until @get_page_fn()
+        * returns NULL or the unit is full
+        */
+       nr_pages = -1;
+       for_each_possible_cpu(cpu) {
+               for (i = 0; i < pcpu_unit_pages; i++) {
+                       struct page *page = get_page_fn(cpu, i);
+
+                       if (!page)
+                               break;
+                       *pcpu_chunk_pagep(static_chunk, cpu, i) = page;
+               }
+
+               /* the static area must be fully backed by pages */
+               BUG_ON(i < PFN_UP(pcpu_static_size));
+
+               /* every cpu must have supplied the same number of pages */
+               if (nr_pages < 0)
+                       nr_pages = i;
+               else
+                       BUG_ON(nr_pages != i);
+       }
+
+       /* map them, only if the caller asked for pagetable population */
+       if (populate_pte_fn) {
+               for_each_possible_cpu(cpu)
+                       for (i = 0; i < nr_pages; i++)
+                               populate_pte_fn(pcpu_chunk_addr(static_chunk,
+                                                               cpu, i));
+
+               err = pcpu_map(static_chunk, 0, nr_pages);
+               if (err)
+                       panic("failed to setup static percpu area, err=%d\n",
+                             err);
+       }
+
+       /* link static_chunk in */
+       pcpu_chunk_relocate(static_chunk, -1);
+       pcpu_chunk_addr_insert(static_chunk);
+
+       /* we're done */
+       pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+       return pcpu_unit_size;
+}
index 903cad46e796b94bde12306efbce62b532be3713..fb6f59935fb2eef70084e467bce65ef6a5692221 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
+#include <linux/pfn.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
  *
  * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
  */
-static int vmap_page_range(unsigned long start, unsigned long end,
-                               pgprot_t prot, struct page **pages)
+static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+                                  pgprot_t prot, struct page **pages)
 {
        pgd_t *pgd;
        unsigned long next;
@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
-       flush_cache_vmap(start, end);
 
        if (unlikely(err))
                return err;
        return nr;
 }
 
+/*
+ * Flushing variant of vmap_page_range_noflush(): set up the mappings
+ * for [@start,@end), then flush the cache for the same range whether
+ * or not the mapping succeeded.  The return value is passed through.
+ */
+static int vmap_page_range(unsigned long start, unsigned long end,
+                          pgprot_t prot, struct page **pages)
+{
+       int nr = vmap_page_range_noflush(start, end, prot, pages);
+
+       flush_cache_vmap(start, end);
+
+       return nr;
+}
+
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
        /*
@@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 }
 EXPORT_SYMBOL(vm_map_ram);
 
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * Register a kernel vm area before vmalloc_init() has been called.
+ * On entry @vm->size and @vm->flags should hold proper values and all
+ * other fields should be zero.  The area is placed at the next
+ * @align aligned address above the previously registered early areas,
+ * starting from VMALLOC_START, and @vm is linked into vmlist.  On
+ * return, vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+       static size_t vm_init_off __initdata;
+       unsigned long base = ALIGN(VMALLOC_START + vm_init_off, align);
+
+       vm->addr = (void *)base;
+
+       /* the next early area begins at the following page boundary */
+       vm_init_off = PFN_ALIGN(base + vm->size) - VMALLOC_START;
+
+       /* push @vm onto the head of vmlist */
+       vm->next = vmlist;
+       vmlist = vm;
+}
+
 void __init vmalloc_init(void)
 {
        struct vmap_area *va;
@@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
        vmap_initialized = true;
 }
 
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is
+ * responsible for calling flush_cache_vmap() on the newly mapped
+ * areas after calling this function.
+ *
+ * RETURNS:
+ * The number of pages mapped on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+                            pgprot_t prot, struct page **pages)
+{
+       unsigned long end = addr + size;
+
+       /* cache flushing is left to the caller */
+       return vmap_page_range_noflush(addr, end, prot, pages);
+}
+
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
+ * specify should have been allocated using get_vm_area() and its
+ * friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing.  The caller is
+ * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
+ * before calling this function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
+{
+       unsigned long end = addr + size;
+
+       /* cache and tlb flushing is left to the caller */
+       vunmap_page_range(addr, end);
+}
+
+/**
+ * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
+ * @addr: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Similar to unmap_kernel_range_noflush() but flushes vcache before
+ * the unmapping and tlb after.
+ */
 void unmap_kernel_range(unsigned long addr, unsigned long size)
 {
        unsigned long end = addr + size;
index 743f5542d65a6385c7ae423ddb49d4c69daaeaee..3a3dad8013548d5581ae8e9250f8e818a2a3a122 100644 (file)
@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
 int snmp_mib_init(void *ptr[2], size_t mibsize)
 {
        BUG_ON(ptr == NULL);
-       ptr[0] = __alloc_percpu(mibsize);
+       ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
        if (!ptr[0])
                goto err0;
-       ptr[1] = __alloc_percpu(mibsize);
+       ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
        if (!ptr[1])
                goto err1;
        return 0;