]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - mm/memcontrol.c
memcg: fix a race when setting memory.swappiness
[linux-2.6-omap-h63xx.git] / mm / memcontrol.c
index e2996b80601f8fd20a3bb9d1216ebf3017d11afa..4d0ea3ceba6d2e40431f64f7e8d8c0f3c0022165 100644 (file)
@@ -358,6 +358,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
                return;
 
        pc = lookup_page_cgroup(page);
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
        smp_rmb();
        /* unused page is not rotated. */
        if (!PageCgroupUsed(pc))
@@ -374,7 +378,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
        if (mem_cgroup_disabled())
                return;
        pc = lookup_page_cgroup(page);
-       /* barrier to sync with "charge" */
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
        smp_rmb();
        if (!PageCgroupUsed(pc))
                return;
@@ -559,6 +566,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
                return NULL;
 
        pc = lookup_page_cgroup(page);
+       /*
+        * Used bit is set without atomic ops but after smp_wmb().
+        * For making pc->mem_cgroup visible, insert smp_rmb() here.
+        */
+       smp_rmb();
+       if (!PageCgroupUsed(pc))
+               return NULL;
+
        mz = page_cgroup_zoneinfo(pc);
        if (!mz)
                return NULL;
@@ -618,7 +633,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  * called with hierarchy_mutex held
  */
 static struct mem_cgroup *
-mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
 {
        struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
 
@@ -629,19 +644,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
                /*
                 * Walk down to children
                 */
-               mem_cgroup_put(curr);
                cgroup = list_entry(curr_cgroup->children.next,
                                                struct cgroup, sibling);
                curr = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(curr);
                goto done;
        }
 
 visit_parent:
        if (curr_cgroup == root_cgroup) {
-               mem_cgroup_put(curr);
-               curr = root_mem;
-               mem_cgroup_get(curr);
+               /* caller handles NULL case */
+               curr = NULL;
                goto done;
        }
 
@@ -649,11 +661,9 @@ visit_parent:
         * Goto next sibling
         */
        if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
-               mem_cgroup_put(curr);
                cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
                                                sibling);
                curr = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(curr);
                goto done;
        }
 
@@ -664,7 +674,6 @@ visit_parent:
        goto visit_parent;
 
 done:
-       root_mem->last_scanned_child = curr;
        return curr;
 }
 
@@ -674,40 +683,46 @@ done:
  * that to reclaim free pages from.
  */
 static struct mem_cgroup *
-mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
 {
        struct cgroup *cgroup;
-       struct mem_cgroup *ret;
+       struct mem_cgroup *orig, *next;
        bool obsolete;
 
-       obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
-
        /*
         * Scan all children under the mem_cgroup mem
         */
        mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+
+       orig = root_mem->last_scanned_child;
+       obsolete = mem_cgroup_is_obsolete(orig);
+
        if (list_empty(&root_mem->css.cgroup->children)) {
-               ret = root_mem;
+               /*
+                * root_mem might have children before and last_scanned_child
+                * may point to one of them. We put it later.
+                */
+               if (orig)
+                       VM_BUG_ON(!obsolete);
+               next = NULL;
                goto done;
        }
 
-       if (!root_mem->last_scanned_child || obsolete) {
-
-               if (obsolete && root_mem->last_scanned_child)
-                       mem_cgroup_put(root_mem->last_scanned_child);
-
+       if (!orig || obsolete) {
                cgroup = list_first_entry(&root_mem->css.cgroup->children,
                                struct cgroup, sibling);
-               ret = mem_cgroup_from_cont(cgroup);
-               mem_cgroup_get(ret);
+               next = mem_cgroup_from_cont(cgroup);
        } else
-               ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
-                                               root_mem);
+               next = __mem_cgroup_get_next_node(orig, root_mem);
 
 done:
-       root_mem->last_scanned_child = ret;
+       if (next)
+               mem_cgroup_get(next);
+       root_mem->last_scanned_child = next;
+       if (orig)
+               mem_cgroup_put(orig);
        mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
-       return ret;
+       return (next) ? next : root_mem;
 }
 
 static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
@@ -758,28 +773,25 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
         * but there might be left over accounting, even after children
         * have left.
         */
-       ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
+       ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
                                           get_swappiness(root_mem));
        if (mem_cgroup_check_under_limit(root_mem))
-               return 0;
+               return 1;       /* indicate reclaim has succeeded */
        if (!root_mem->use_hierarchy)
                return ret;
 
-       next_mem = mem_cgroup_get_first_node(root_mem);
+       next_mem = mem_cgroup_get_next_node(root_mem);
 
        while (next_mem != root_mem) {
                if (mem_cgroup_is_obsolete(next_mem)) {
-                       mem_cgroup_put(next_mem);
-                       next_mem = mem_cgroup_get_first_node(root_mem);
+                       next_mem = mem_cgroup_get_next_node(root_mem);
                        continue;
                }
-               ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
+               ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
                                                   get_swappiness(next_mem));
                if (mem_cgroup_check_under_limit(root_mem))
-                       return 0;
-               mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
-               next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
-               mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+                       return 1;       /* indicate reclaim has succeeded */
+               next_mem = mem_cgroup_get_next_node(root_mem);
        }
        return ret;
 }
@@ -863,6 +875,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
                ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
                                                        noswap);
+               if (ret)
+                       continue;
 
                /*
                 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -979,14 +993,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        if (pc->mem_cgroup != from)
                goto out;
 
-       css_put(&from->css);
        res_counter_uncharge(&from->res, PAGE_SIZE);
        mem_cgroup_charge_statistics(from, pc, false);
        if (do_swap_account)
                res_counter_uncharge(&from->memsw, PAGE_SIZE);
+       css_put(&from->css);
+
+       css_get(&to->css);
        pc->mem_cgroup = to;
        mem_cgroup_charge_statistics(to, pc, true);
-       css_get(&to->css);
        ret = 0;
 out:
        unlock_page_cgroup(pc);
@@ -1019,8 +1034,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        if (ret || !parent)
                return ret;
 
-       if (!get_page_unless_zero(page))
-               return -EBUSY;
+       if (!get_page_unless_zero(page)) {
+               ret = -EBUSY;
+               goto uncharge;
+       }
 
        ret = isolate_lru_page(page);
 
@@ -1029,19 +1046,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 
        ret = mem_cgroup_move_account(pc, child, parent);
 
-       /* drop extra refcnt by try_charge() (move_account increment one) */
-       css_put(&parent->css);
        putback_lru_page(page);
        if (!ret) {
                put_page(page);
+               /* drop extra refcnt by try_charge() */
+               css_put(&parent->css);
                return 0;
        }
-       /* uncharge if move fails */
+
 cancel:
+       put_page(page);
+uncharge:
+       /* drop extra refcnt by try_charge() */
+       css_put(&parent->css);
+       /* uncharge if move fails */
        res_counter_uncharge(&parent->res, PAGE_SIZE);
        if (do_swap_account)
                res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-       put_page(page);
        return ret;
 }
 
@@ -1971,6 +1992,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
        struct mem_cgroup *parent;
+
        if (val > 100)
                return -EINVAL;
 
@@ -1978,15 +2000,22 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
                return -EINVAL;
 
        parent = mem_cgroup_from_cont(cgrp->parent);
+
+       cgroup_lock();
+
        /* If under hierarchy, only empty-root can set this value */
        if ((parent->use_hierarchy) ||
-           (memcg->use_hierarchy && !list_empty(&cgrp->children)))
+           (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+               cgroup_unlock();
                return -EINVAL;
+       }
 
        spin_lock(&memcg->reclaim_param_lock);
        memcg->swappiness = val;
        spin_unlock(&memcg->reclaim_param_lock);
 
+       cgroup_unlock();
+
        return 0;
 }
 
@@ -2181,7 +2210,7 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
-static struct cgroup_subsys_state *
+static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
        struct mem_cgroup *mem, *parent;
@@ -2232,7 +2261,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
                                struct cgroup *cont)
 {
-       mem_cgroup_put(mem_cgroup_from_cont(cont));
+       struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+       struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
+
+       if (last_scanned_child) {
+               VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
+               mem_cgroup_put(last_scanned_child);
+       }
+       mem_cgroup_put(mem);
 }
 
 static int mem_cgroup_populate(struct cgroup_subsys *ss,