[PATCH] slab debug and ARCH_SLAB_MINALIGN don't get along

[linux-2.6-omap-h63xx.git] / mm / slab.c
diff --git a/mm/slab.c b/mm/slab.c

index 13b5050f84cce61121d6b163a57f306e38e27727..ff60a94142f995aceaa5e684cf6ed55c3138189d 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -86,7 +86,6 @@
   *     All object allocations for a node occur from node specific slab lists.
   */
  
-#include       <linux/config.h>
  #include       <linux/slab.h>
  #include       <linux/mm.h>
  #include       <linux/poison.h>
@@ -314,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache,
  static void free_block(struct kmem_cache *cachep, void **objpp, int len,
                         int node);
  static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
+static void cache_reap(struct work_struct *unused);
  
  /*
   * This function must be completely optimized away if a constant is passed to
@@ -735,14 +734,6 @@ static inline void init_lock_keys(void)
  static DEFINE_MUTEX(cache_chain_mutex);
  static struct list_head cache_chain;
  
-/*
- * vm_enough_memory() looks at this to determine how many slab-allocated pages
- * are possibly freeable under pressure
- *
- * SLAB_RECLAIM_ACCOUNT turns this on per-slab
- */
-atomic_t slab_reclaim_pages;
-
  /*
   * chicken and egg problem: delay the per-cpu array allocation
   * until the general caches are up.
@@ -762,7 +753,7 @@ int slab_is_available(void)
         return g_cpucache_up == FULL;
  }
  
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
  
  static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
  {
@@ -892,7 +883,7 @@ static void init_reap_node(int cpu)
         if (node == MAX_NUMNODES)
                 node = first_node(node_online_map);
  
-       __get_cpu_var(reap_node) = node;
+       per_cpu(reap_node, cpu) = node;
  }
  
  static void next_reap_node(void)
@@ -925,16 +916,16 @@ static void next_reap_node(void)
   */
  static void __devinit start_cpu_timer(int cpu)
  {
-       struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+       struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
  
         /*
          * When this gets called from do_initcalls via cpucache_init(),
          * init_workqueues() has already run, so keventd will be setup
          * at that time.
          */
-       if (keventd_up() && reap_work->func == NULL) {
+       if (keventd_up() && reap_work->work.func == NULL) {
                 init_reap_node(cpu);
-               INIT_WORK(reap_work, cache_reap, NULL);
+               INIT_DELAYED_WORK(reap_work, cache_reap);
                 schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
         }
  }
@@ -980,7 +971,39 @@ static int transfer_objects(struct array_cache *to,
         return nr;
  }
  
-#ifdef CONFIG_NUMA
+#ifndef CONFIG_NUMA
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+       return (struct array_cache **)BAD_ALIEN_MAGIC;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+       return 0;
+}
+
+static inline void *alternate_node_alloc(struct kmem_cache *cachep,
+               gfp_t flags)
+{
+       return NULL;
+}
+
+static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+                gfp_t flags, int nodeid)
+{
+       return NULL;
+}
+
+#else  /* CONFIG_NUMA */
+
  static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
  static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
  
@@ -1083,15 +1106,18 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
         int nodeid = slabp->nodeid;
         struct kmem_list3 *l3;
         struct array_cache *alien = NULL;
+       int node;
+
+       node = numa_node_id();
  
         /*
          * Make sure we are not freeing a object from another node to the array
          * cache on this cpu.
          */
-       if (likely(slabp->nodeid == numa_node_id()))
+       if (likely(slabp->nodeid == node))
                 return 0;
  
-       l3 = cachep->nodelists[numa_node_id()];
+       l3 = cachep->nodelists[node];
         STATS_INC_NODEFREES(cachep);
         if (l3->alien && l3->alien[nodeid]) {
                 alien = l3->alien[nodeid];
@@ -1109,26 +1135,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
         }
         return 1;
  }
-
-#else
-
-#define drain_alien_cache(cachep, alien) do { } while (0)
-#define reap_alien(cachep, l3) do { } while (0)
-
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
-{
-       return (struct array_cache **)BAD_ALIEN_MAGIC;
-}
-
-static inline void free_alien_cache(struct array_cache **ac_ptr)
-{
-}
-
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
-{
-       return 0;
-}
-
  #endif
  
  static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1322,7 +1328,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
  {
         struct kmem_list3 *ptr;
  
-       BUG_ON(cachep->nodelists[nodeid] != list);
         ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
         BUG_ON(!ptr);
  
@@ -1349,6 +1354,7 @@ void __init kmem_cache_init(void)
         struct cache_names *names;
         int i;
         int order;
+       int node;
  
         for (i = 0; i < NUM_INIT_LISTS; i++) {
                 kmem_list3_init(&initkmem_list3[i]);
@@ -1383,12 +1389,14 @@ void __init kmem_cache_init(void)
          * 6) Resize the head arrays of the kmalloc caches to their final sizes.
          */
  
+       node = numa_node_id();
+
         /* 1) create the cache_cache */
         INIT_LIST_HEAD(&cache_chain);
         list_add(&cache_cache.next, &cache_chain);
         cache_cache.colour_off = cache_line_size();
         cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-       cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
+       cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
  
         cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
                                         cache_line_size());
@@ -1493,19 +1501,18 @@ void __init kmem_cache_init(void)
         }
         /* 5) Replace the bootstrap kmem_list3's */
         {
-               int node;
+               int nid;
+
                 /* Replace the static kmem_list3 structures for the boot cpu */
-               init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
-                         numa_node_id());
+               init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
  
-               for_each_online_node(node) {
+               for_each_online_node(nid) {
                         init_list(malloc_sizes[INDEX_AC].cs_cachep,
-                                 &initkmem_list3[SIZE_AC + node], node);
+                                 &initkmem_list3[SIZE_AC + nid], nid);
  
                         if (INDEX_AC != INDEX_L3) {
                                 init_list(malloc_sizes[INDEX_L3].cs_cachep,
-                                         &initkmem_list3[SIZE_L3 + node],
-                                         node);
+                                         &initkmem_list3[SIZE_L3 + nid], nid);
                         }
                 }
         }
@@ -1572,7 +1579,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
          */
         flags |= __GFP_COMP;
  #endif
-       flags |= cachep->gfpflags;
+
+       /*
+        * Under NUMA we want memory on the indicated node. We will handle
+        * the needed fallback ourselves since we want to serve from our
+        * per node object lists first for other nodes.
+        */
+       flags |= cachep->gfpflags | GFP_THISNODE;
  
         page = alloc_pages_node(nodeid, flags, cachep->gfporder);
         if (!page)
@@ -1580,8 +1593,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
  
         nr_pages = (1 << cachep->gfporder);
         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-               atomic_add(nr_pages, &slab_reclaim_pages);
-       add_zone_page_state(page_zone(page), NR_SLAB, nr_pages);
+               add_zone_page_state(page_zone(page),
+                       NR_SLAB_RECLAIMABLE, nr_pages);
+       else
+               add_zone_page_state(page_zone(page),
+                       NR_SLAB_UNRECLAIMABLE, nr_pages);
         for (i = 0; i < nr_pages; i++)
                 __SetPageSlab(page + i);
         return page_address(page);
@@ -1596,7 +1612,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
         struct page *page = virt_to_page(addr);
         const unsigned long nr_freed = i;
  
-       sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed);
+       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
+               sub_zone_page_state(page_zone(page),
+                               NR_SLAB_RECLAIMABLE, nr_freed);
+       else
+               sub_zone_page_state(page_zone(page),
+                               NR_SLAB_UNRECLAIMABLE, nr_freed);
         while (i--) {
                 BUG_ON(!PageSlab(page));
                 __ClearPageSlab(page);
@@ -1605,8 +1626,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
         if (current->reclaim_state)
                 current->reclaim_state->reclaimed_slab += nr_freed;
         free_pages((unsigned long)addr, cachep->gfporder);
-       if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
-               atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
  }
  
  static void kmem_rcu_free(struct rcu_head *head)
@@ -1667,10 +1686,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
  static void dump_line(char *data, int offset, int limit)
  {
         int i;
+       unsigned char error = 0;
+       int bad_count = 0;
+
         printk(KERN_ERR "%03x:", offset);
-       for (i = 0; i < limit; i++)
+       for (i = 0; i < limit; i++) {
+               if (data[offset + i] != POISON_FREE) {
+                       error = data[offset + i];
+                       bad_count++;
+               }
                 printk(" %02x", (unsigned char)data[offset + i]);
+       }
         printk("\n");
+
+       if (bad_count == 1) {
+               error ^= POISON_FREE;
+               if (!(error & (error - 1))) {
+                       printk(KERN_ERR "Single bit error detected. Probably "
+                                       "bad RAM.\n");
+#ifdef CONFIG_X86
+                       printk(KERN_ERR "Run memtest86+ or a similar memory "
+                                       "test tool.\n");
+#else
+                       printk(KERN_ERR "Run a memory test tool.\n");
+#endif
+               }
+       }
  }
  #endif
  
@@ -2156,18 +2197,17 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
                 ralign = BYTES_PER_WORD;
  
-       /* 2) arch mandated alignment: disables debug if necessary */
+       /* 2) arch mandated alignment */
         if (ralign < ARCH_SLAB_MINALIGN) {
                 ralign = ARCH_SLAB_MINALIGN;
-               if (ralign > BYTES_PER_WORD)
-                       flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
         }
-       /* 3) caller mandated alignment: disables debug if necessary */
+       /* 3) caller mandated alignment */
         if (ralign < align) {
                 ralign = align;
-               if (ralign > BYTES_PER_WORD)
-                       flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
         }
+       /* disable debug if necessary */
+       if (ralign > BYTES_PER_WORD)
+               flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
         /*
          * 4) Store it.
          */
@@ -2444,7 +2484,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
   * @cachep: the cache to destroy
   *
   * Remove a struct kmem_cache object from the slab cache.
- * Returns 0 on success.
   *
   * It is expected this function will be called by a module when it is
   * unloaded.  This will remove the cache completely, and avoid a duplicate
@@ -2456,7 +2495,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
   * The caller must guarantee that noone will allocate memory from the cache
   * during the kmem_cache_destroy().
   */
-int kmem_cache_destroy(struct kmem_cache *cachep)
+void kmem_cache_destroy(struct kmem_cache *cachep)
  {
         BUG_ON(!cachep || in_interrupt());
  
@@ -2477,7 +2516,7 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
                 list_add(&cachep->next, &cache_chain);
                 mutex_unlock(&cache_chain_mutex);
                 unlock_cpu_hotplug();
-               return 1;
+               return;
         }
  
         if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
@@ -2485,7 +2524,6 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
  
         __kmem_cache_destroy(cachep);
         unlock_cpu_hotplug();
-       return 0;
  }
  EXPORT_SYMBOL(kmem_cache_destroy);
  
@@ -2883,6 +2921,9 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
         int batchcount;
         struct kmem_list3 *l3;
         struct array_cache *ac;
+       int node;
+
+       node = numa_node_id();
  
         check_irq_off();
         ac = cpu_cache_get(cachep);
@@ -2896,7 +2937,7 @@ retry:
                  */
                 batchcount = BATCHREFILL_LIMIT;
         }
-       l3 = cachep->nodelists[numa_node_id()];
+       l3 = cachep->nodelists[node];
  
         BUG_ON(ac->avail > 0 || !l3);
         spin_lock(&l3->list_lock);
@@ -2926,7 +2967,7 @@ retry:
                         STATS_SET_HIGH(cachep);
  
                         ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
-                                                           numa_node_id());
+                                                           node);
                 }
                 check_slabp(cachep, slabp);
  
@@ -2945,7 +2986,7 @@ alloc_done:
  
         if (unlikely(!ac->avail)) {
                 int x;
-               x = cache_grow(cachep, flags, numa_node_id());
+               x = cache_grow(cachep, flags, node);
  
                 /* cache_grow can reenable interrupts, then ac could change. */
                 ac = cpu_cache_get(cachep);
@@ -3021,6 +3062,12 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
  
                 cachep->ctor(objp, cachep, ctor_flags);
         }
+       /* 0 disables the check; the cast must be pointer-sized on 64-bit. */
+       if (ARCH_SLAB_MINALIGN &&
+           ((unsigned long)objp & (ARCH_SLAB_MINALIGN - 1))) {
+               printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+                      objp, (int)ARCH_SLAB_MINALIGN);
+       }
         return objp;
  }
  #else
@@ -3032,14 +3079,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
         void *objp;
         struct array_cache *ac;
  
-#ifdef CONFIG_NUMA
-       if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
-               objp = alternate_node_alloc(cachep, flags);
-               if (objp != NULL)
-                       return objp;
-       }
-#endif
-
         check_irq_off();
         ac = cpu_cache_get(cachep);
         if (likely(ac->avail)) {
@@ -3057,12 +3096,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
                                                 gfp_t flags, void *caller)
  {
         unsigned long save_flags;
-       void *objp;
+       void *objp = NULL;
  
         cache_alloc_debugcheck_before(cachep, flags);
  
         local_irq_save(save_flags);
-       objp = ____cache_alloc(cachep, flags);
+
+       if (unlikely(NUMA_BUILD &&
+                       current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
+               objp = alternate_node_alloc(cachep, flags);
+
+       if (!objp)
+               objp = ____cache_alloc(cachep, flags);
+       /*
+        * We may just have run out of memory on the local node.
+        * __cache_alloc_node() knows how to locate memory on other nodes
+        */
+       if (NUMA_BUILD && !objp)
+               objp = __cache_alloc_node(cachep, flags, numa_node_id());
         local_irq_restore(save_flags);
         objp = cache_alloc_debugcheck_after(cachep, flags, objp,
                                             caller);
@@ -3081,7 +3132,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
  {
         int nid_alloc, nid_here;
  
-       if (in_interrupt())
+       if (in_interrupt() || (flags & __GFP_THISNODE))
                 return NULL;
         nid_alloc = nid_here = numa_node_id();
         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3093,6 +3144,31 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
         return NULL;
  }
  
+/*
+ * Fall back to other nodes when the wanted node has no objects and no memory.
+ * Mimics the page allocator: walk the policy layer's zonelist, honour cpuset
+ * constraints, and try each usable node with __GFP_THISNODE so the attempt
+ * cannot recurse into this fallback. Returns an object, or NULL on failure.
+ */
+static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
+{
+       struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
+                                       ->node_zonelists[gfp_zone(flags)];
+       struct zone **z;
+       void *obj = NULL;
+
+       for (z = zonelist->zones; *z && !obj; z++) {
+               int nid = zone_to_nid(*z);
+
+               if (zone_idx(*z) <= ZONE_NORMAL &&
+                               cpuset_zone_allowed(*z, flags) &&
+                               cache->nodelists[nid])
+                       obj = __cache_alloc_node(cache,
+                                       flags | __GFP_THISNODE, nid);
+       }
+       return obj;
+}
+
  /*
   * A interface to enable slab creation on nodeid
   */
@@ -3146,11 +3222,15 @@ retry:
  must_grow:
         spin_unlock(&l3->list_lock);
         x = cache_grow(cachep, flags, nodeid);
+       if (x)
+               goto retry;
  
-       if (!x)
-               return NULL;
+       if (!(flags & __GFP_THISNODE))
+               /* Unable to grow the cache. Fall back to other nodes. */
+               return fallback_alloc(cachep, flags);
+
+       return NULL;
  
-       goto retry;
  done:
         return obj;
  }
@@ -3422,22 +3502,25 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
  }
  
  
+#ifdef CONFIG_DEBUG_SLAB
  void *__kmalloc(size_t size, gfp_t flags)
  {
-#ifndef CONFIG_DEBUG_SLAB
-       return __do_kmalloc(size, flags, NULL);
-#else
         return __do_kmalloc(size, flags, __builtin_return_address(0));
-#endif
  }
  EXPORT_SYMBOL(__kmalloc);
  
-#ifdef CONFIG_DEBUG_SLAB
  void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
  {
         return __do_kmalloc(size, flags, caller);
  }
  EXPORT_SYMBOL(__kmalloc_track_caller);
+
+#else
+void *__kmalloc(size_t size, gfp_t flags)
+{
+       return __do_kmalloc(size, flags, NULL);
+}
+EXPORT_SYMBOL(__kmalloc);
  #endif
  
  /**
@@ -3737,7 +3820,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
   * If we cannot acquire the cache chain mutex then just give up - we'll try
   * again on the next iteration.
   */
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *unused)
  {
         struct kmem_cache *searchp;
         struct kmem_list3 *l3;