#define do_swap_account (0)
#endif
+static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */
/*
* Statistics for memory cgroup.
*/
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
+
+ struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
*/
struct mem_cgroup_lru_info info;
+ /*
+ * Protects the reclaim-related members (prev_priority, swappiness).
+ */
+ spinlock_t reclaim_param_lock;
+
int prev_priority; /* for recording reclaim priority */
- int obsolete;
+
+ /*
+ * While reclaiming in a hierarchy, we cache the last child we
+ * reclaimed from. Protected by hierarchy_mutex.
+ */
+ struct mem_cgroup *last_scanned_child;
+ /*
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+ unsigned long last_oom_jiffies;
atomic_t refcnt;
+
+ unsigned int swappiness;
+
/*
* statistics. This must be placed at the end of memcg.
*/
0, /* FORCE */
};
-
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
+ if (!mem)
+ return NULL;
+
return mem_cgroup_zoneinfo(mem, nid, zid);
}
struct mem_cgroup, css);
}
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *mem = NULL;
+ /*
+ * Look up the mem_cgroup of mm->owner and pin it with css_tryget().
+ * Returns NULL when the owner has no mem_cgroup; otherwise the caller
+ * is expected to css_put() the returned memcg when done with it.
+ *
+ * Because we have no locks, mm->owner may be in the middle of being
+ * moved to another cgroup. We use css_tryget() here, redoing the
+ * lookup until it succeeds, even if this looks pessimistic (rather
+ * than adding locks here).
+ */
+ rcu_read_lock();
+ do {
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ break;
+ } while (!css_tryget(&mem->css));
+ rcu_read_unlock();
+ return mem;
+}
+
+/*
+ * A memcg is obsolete when there is none (NULL) or when its css has
+ * already been removed.
+ */
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
+{
+	return !mem || css_is_removed(&mem->css);
+}
+
/*
* Following LRU functions are allowed to be used without PCG_LOCK.
* Operations are called by routine of global LRU independently from memcg.
struct mem_cgroup *mem;
struct mem_cgroup_per_zone *mz;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
/* can happen while we handle swapcache. */
- if (list_empty(&pc->lru))
+ if (list_empty(&pc->lru) || !pc->mem_cgroup)
return;
+ /*
+ * We don't check PCG_USED bit. It's cleared when the "page" is finally
+ * removed from global LRU.
+ */
mz = page_cgroup_zoneinfo(pc);
mem = pc->mem_cgroup;
MEM_CGROUP_ZSTAT(mz, lru) -= 1;
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
+ /*
+ * Used bit is set without atomic ops but after smp_wmb().
+ * For making pc->mem_cgroup visible, insert smp_rmb() here.
+ */
smp_rmb();
/* unused page is not rotated. */
if (!PageCgroupUsed(pc))
struct page_cgroup *pc;
struct mem_cgroup_per_zone *mz;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
- /* barrier to sync with "charge" */
+ /*
+ * Used bit is set without atomic ops but after smp_wmb().
+ * For making pc->mem_cgroup visible, insert smp_rmb() here.
+ */
smp_rmb();
if (!PageCgroupUsed(pc))
return;
MEM_CGROUP_ZSTAT(mz, lru) += 1;
list_add(&pc->lru, &mz->lists[lru]);
}
+
/*
- * To add swapcache into LRU. Be careful to all this function.
- * zone->lru_lock shouldn't be held and irq must not be disabled.
+ * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
+ * the LRU, because the page may be reused after it's fully uncharged (because
+ * of SwapCache behavior). To handle that, unlink the page_cgroup from the LRU
+ * when charging it again. This function is only used to charge SwapCache. It's
+ * done under lock_page() and it is expected that zone->lru_lock is never held.
*/
-static void mem_cgroup_lru_fixup(struct page *page)
+static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
- if (!isolate_lru_page(page))
- putback_lru_page(page);
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ /*
+ * Forget old LRU when this page_cgroup is *not* used. This Used bit
+ * is guarded by lock_page() because the page is SwapCache.
+ * The test and the unlink both run with zone->lru_lock held and
+ * interrupts disabled.
+ */
+ if (!PageCgroupUsed(pc))
+ mem_cgroup_del_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ /*
+ * Link when the page is linked to LRU but page_cgroup isn't. Both
+ * conditions are tested, and the page_cgroup is added, with
+ * zone->lru_lock held and interrupts disabled.
+ */
+ if (PageLRU(page) && list_empty(&pc->lru))
+ mem_cgroup_add_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
+
void mem_cgroup_move_lists(struct page *page,
enum lru_list from, enum lru_list to)
{
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
mem_cgroup_del_lru_list(page, from);
mem_cgroup_add_lru_list(page, to);
*/
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
- return mem->prev_priority;
+ int prev_priority;
+
+ spin_lock(&mem->reclaim_param_lock); /* guards mem->prev_priority */
+ prev_priority = mem->prev_priority;
+ spin_unlock(&mem->reclaim_param_lock);
+
+ return prev_priority; /* snapshot taken under the lock */
}
void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock); /* prev_priority is only lowered here */
if (priority < mem->prev_priority)
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock); /* unconditional overwrite under the lock */
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
-/*
- * Calculate # of pages to be scanned in this priority/zone.
- * See also vmscan.c
- *
- * priority starts from "DEF_PRIORITY" and decremented in each loop.
- * (see include/linux/mmzone.h)
- */
+/*
+ * Compute the inactive ratio for @memcg's anonymous LRU lists: with gb
+ * the combined inactive + active anon size in gigabytes, the ratio is
+ * int_sqrt(10 * gb), or 1 when the total is below one gigabyte.
+ *
+ * When @present_pages is not NULL, the inactive count is stored in
+ * present_pages[0] and the active count in present_pages[1].
+ */
+static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+{
+	unsigned long nr_inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
+	unsigned long nr_active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
+	unsigned long size_gb = (nr_inactive + nr_active) >> (30 - PAGE_SHIFT);
+
+	if (present_pages) {
+		present_pages[0] = nr_inactive;
+		present_pages[1] = nr_active;
+	}
+
+	return size_gb ? int_sqrt(10 * size_gb) : 1;
+}
+
+/*
+ * Return 1 when @memcg's inactive anon list, scaled by the inactive
+ * ratio from calc_inactive_ratio(), is smaller than its active anon
+ * list; return 0 otherwise.
+ */
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+	unsigned long nr_pages[2];	/* [0] inactive anon, [1] active anon */
+	unsigned long ratio = calc_inactive_ratio(memcg, nr_pages);
+
+	return nr_pages[0] * ratio < nr_pages[1];
+}
+
+/*
+ * Number of pages @memcg has on LRU list @lru in @zone, read from the
+ * per-zone counter of that memcg.
+ */
+unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
+				       struct zone *zone,
+				       enum lru_list lru)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	mz = mem_cgroup_zoneinfo(memcg, zone->zone_pgdat->node_id,
+				 zone_idx(zone));
+	return MEM_CGROUP_ZSTAT(mz, lru);
+}
-long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
- int priority, enum lru_list lru)
+struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
+ struct zone *zone)
{
- long nr_pages;
int nid = zone->zone_pgdat->node_id;
int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+
+ return &mz->reclaim_stat; /* mz is not NULL-checked here */
+}
- nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
+struct zone_reclaim_stat *
+mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup_per_zone *mz;
- return (nr_pages >> priority);
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Used bit is set without atomic ops but after smp_wmb().
+ * For making pc->mem_cgroup visible, insert smp_rmb() here.
+ * A page_cgroup whose Used bit is clear yields NULL, as does a
+ * missing per-zone info below.
+ */
+ smp_rmb();
+ if (!PageCgroupUsed(pc))
+ return NULL;
+
+ mz = page_cgroup_zoneinfo(pc);
+ if (!mz)
+ return NULL;
+
+ return &mz->reclaim_stat;
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
return nr_taken;
}
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member) /* member: res or memsw */
+
+/*
+ * Return the memcg that follows @curr in a depth-first walk of the cgroup
+ * subtree rooted at @root_mem, or NULL once the walk has climbed back to
+ * the root (the caller handles the NULL case). This routine should be
+ * called with hierarchy_mutex held.
+ */
+static struct mem_cgroup *
+__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+{
+	struct cgroup *pos = curr->css.cgroup;
+	struct cgroup *top = root_mem->css.cgroup;
+
+	/* Descend first: a node with children is followed by its first child. */
+	if (!list_empty(&pos->children))
+		return mem_cgroup_from_cont(list_entry(pos->children.next,
+						       struct cgroup, sibling));
+
+	/*
+	 * Otherwise move to the next sibling, climbing towards the root
+	 * until some node on the way up has one.
+	 */
+	for (; pos != top; pos = pos->parent) {
+		struct list_head *next = pos->sibling.next;
+
+		if (next != &pos->parent->children)
+			return mem_cgroup_from_cont(list_entry(next,
+						struct cgroup, sibling));
+	}
+
+	return NULL;
+}
+
+/*
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @root_mem and
+ * use that to reclaim free pages from.
+ *
+ * The node handed back is cached in root_mem->last_scanned_child with a
+ * reference taken by mem_cgroup_get(); the reference on the previously
+ * cached node is dropped. When there is no next node, the cache is
+ * cleared and root_mem itself is returned.
+ */
+static struct mem_cgroup *
+mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
+{
+ struct cgroup *cgroup;
+ struct mem_cgroup *orig, *next;
+ bool obsolete;
+
+ /*
+ * Scan all children under the mem_cgroup mem
+ */
+ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+
+ orig = root_mem->last_scanned_child;
+ obsolete = mem_cgroup_is_obsolete(orig);
+
+ if (list_empty(&root_mem->css.cgroup->children)) {
+ /*
+ * root_mem might have children before and last_scanned_child
+ * may point to one of them. We put it later.
+ */
+ if (orig)
+ VM_BUG_ON(!obsolete);
+ next = NULL;
+ goto done;
+ }
+
+ if (!orig || obsolete) {
+ cgroup = list_first_entry(&root_mem->css.cgroup->children,
+ struct cgroup, sibling);
+ next = mem_cgroup_from_cont(cgroup);
+ } else
+ next = __mem_cgroup_get_next_node(orig, root_mem);
+
+done:
+ if (next)
+ mem_cgroup_get(next);
+ root_mem->last_scanned_child = next;
+ if (orig)
+ mem_cgroup_put(orig);
+ mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+ return (next) ? next : root_mem;
+}
+
+/*
+ * True when @mem is under its memory limit and, if swap accounting is
+ * active, also under its mem+swap limit.
+ */
+static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
+{
+	if (!res_counter_check_under_limit(&mem->res))
+		return false;
+	if (!do_swap_account)
+		return true;
+	return res_counter_check_under_limit(&mem->memsw);
+}
+
+/*
+ * Swappiness to use when reclaiming from @memcg: the global vm_swappiness
+ * for the root cgroup, otherwise the per-memcg value read under
+ * reclaim_param_lock.
+ */
+static unsigned int get_swappiness(struct mem_cgroup *memcg)
+{
+	unsigned int val;
+
+	/* The root cgroup has no parent and follows the global setting. */
+	if (!memcg->css.cgroup->parent)
+		return vm_swappiness;
+
+	spin_lock(&memcg->reclaim_param_lock);
+	val = memcg->swappiness;
+	spin_unlock(&memcg->reclaim_param_lock);
+
+	return val;
+}
+
+/*
+ * Reclaim memory from @root_mem and, when it uses hierarchical accounting,
+ * from the groups below it. The walk resumes after the child that was
+ * reclaimed from last time (mem_cgroup_get_next_node() remembers it), so
+ * no child is penalized merely for its position in the children list.
+ *
+ * @root_mem is the ancestor we are reclaiming on behalf of. Returns 0 as
+ * soon as @root_mem is back under its limit, otherwise the result of the
+ * most recent try_to_free_mem_cgroup_pages() call.
+ */
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+					    gfp_t gfp_mask, bool noswap)
+{
+	struct mem_cgroup *victim;
+	int progress;
+
+	/*
+	 * Always start with the root group itself, without looking at
+	 * whether it has children: there might be left over accounting
+	 * even after the children have left.
+	 */
+	progress = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
+						get_swappiness(root_mem));
+	if (mem_cgroup_check_under_limit(root_mem))
+		return 0;
+	if (!root_mem->use_hierarchy)
+		return progress;
+
+	/* mem_cgroup_get_next_node() hands back root_mem when the walk ends. */
+	for (victim = mem_cgroup_get_next_node(root_mem);
+	     victim != root_mem;
+	     victim = mem_cgroup_get_next_node(root_mem)) {
+		if (mem_cgroup_is_obsolete(victim))
+			continue;
+		progress = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+					noswap, get_swappiness(victim));
+		if (mem_cgroup_check_under_limit(root_mem))
+			return 0;
+	}
+	return progress;
+}
+
+/*
+ * Tell whether the memcg of the owner of @task's mm (init_mm when the
+ * task has no mm) had last_oom_jiffies stamped less than HZ/10 jiffies
+ * ago.
+ */
+bool mem_cgroup_oom_called(struct task_struct *task)
+{
+	struct mm_struct *mm;
+	struct mem_cgroup *memcg;
+	bool recent;
+
+	rcu_read_lock();
+	mm = task->mm;
+	if (!mm)
+		mm = &init_mm;
+	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	recent = memcg &&
+		 time_before(jiffies, memcg->last_oom_jiffies + HZ/10);
+	rcu_read_unlock();
+
+	return recent;
+}
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct res_counter *fail_res;
+
+ if (unlikely(test_thread_flag(TIF_MEMDIE))) {
+ /* Don't account this! */
+ *memcg = NULL;
+ return 0;
+ }
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- if (likely(!*memcg)) {
- rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem)) {
- rcu_read_unlock();
- return 0;
- }
- /*
- * For every charge from the cgroup, increment reference count
- */
- css_get(&mem->css);
+ mem = *memcg;
+ if (likely(!mem)) {
+ mem = try_get_mem_cgroup_from_mm(mm);
*memcg = mem;
- rcu_read_unlock();
} else {
- mem = *memcg;
css_get(&mem->css);
}
+ if (unlikely(!mem))
+ return 0;
+
+ VM_BUG_ON(mem_cgroup_is_obsolete(mem));
while (1) {
int ret;
bool noswap = false;
- ret = res_counter_charge(&mem->res, PAGE_SIZE);
+ ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
- ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
+ ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
+ &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
res_counter_uncharge(&mem->res, PAGE_SIZE);
noswap = true;
- }
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+ memsw);
+ } else
+ /* mem counter fails */
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+ res);
+
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
- if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
- continue;
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
+ noswap);
/*
* try_to_free_mem_cgroup_pages() might not give us a full
* current usage of the cgroup before giving up
*
*/
- if (!do_swap_account &&
- res_counter_check_under_limit(&mem->res))
- continue;
- if (do_swap_account &&
- res_counter_check_under_limit(&mem->memsw))
+ if (mem_cgroup_check_under_limit(mem_over_limit))
continue;
if (!nr_retries--) {
- if (oom)
- mem_cgroup_out_of_memory(mem, gfp_mask);
+ if (oom) {
+ mutex_lock(&memcg_tasklist);
+ mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
+ mutex_unlock(&memcg_tasklist);
+ mem_over_limit->last_oom_jiffies = jiffies;
+ }
goto nomem;
}
}
return -ENOMEM;
}
-/**
- * mem_cgroup_try_charge - get charge of PAGE_SIZE.
- * @mm: an mm_struct which is charged against. (when *memcg is NULL)
- * @gfp_mask: gfp_mask for reclaim.
- * @memcg: a pointer to memory cgroup which is charged against.
- *
- * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
- * memory cgroup from @mm is got and stored in *memcg.
- *
- * Returns 0 if success. -ENOMEM at failure.
- * This call can invoke OOM-Killer.
- */
-
-int mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t mask, struct mem_cgroup **memcg)
+static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
{
- return __mem_cgroup_try_charge(mm, mask, memcg, true);
+ struct mem_cgroup *mem;
+ swp_entry_t ent;
+
+ if (!PageSwapCache(page))
+ return NULL;
+
+ ent.val = page_private(page); /* swap entry of this SwapCache page */
+ mem = lookup_swap_cgroup(ent); /* memcg recorded for that entry, if any */
+ if (!mem)
+ return NULL;
+ if (!css_tryget(&mem->css)) /* could not pin the memcg's css */
+ return NULL;
+ return mem; /* callers css_put() this when done */
}
/*
- * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
+ * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
* USED state. If already USED, uncharge and return.
*/
if (pc->mem_cgroup != from)
goto out;
- css_put(&from->css);
res_counter_uncharge(&from->res, PAGE_SIZE);
mem_cgroup_charge_statistics(from, pc, false);
if (do_swap_account)
res_counter_uncharge(&from->memsw, PAGE_SIZE);
+ css_put(&from->css);
+
+ css_get(&to->css);
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, pc, true);
- css_get(&to->css);
ret = 0;
out:
unlock_page_cgroup(pc);
ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
- if (ret)
+ if (ret || !parent)
return ret;
- if (!get_page_unless_zero(page))
- return -EBUSY;
+ if (!get_page_unless_zero(page)) {
+ ret = -EBUSY;
+ goto uncharge;
+ }
ret = isolate_lru_page(page);
ret = mem_cgroup_move_account(pc, child, parent);
- /* drop extra refcnt by try_charge() (move_account increment one) */
- css_put(&parent->css);
putback_lru_page(page);
if (!ret) {
put_page(page);
+ /* drop extra refcnt by try_charge() */
+ css_put(&parent->css);
return 0;
}
- /* uncharge if move fails */
+
cancel:
+ put_page(page);
+uncharge:
+ /* drop extra refcnt by try_charge() */
+ css_put(&parent->css);
+ /* uncharge if move fails */
res_counter_uncharge(&parent->res, PAGE_SIZE);
if (do_swap_account)
res_counter_uncharge(&parent->memsw, PAGE_SIZE);
- put_page(page);
return ret;
}
mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
- if (ret)
+ if (ret || !mem)
return ret;
__mem_cgroup_commit_charge(mem, pc, ctype);
int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return 0;
if (PageCompound(page))
return 0;
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- if (mem_cgroup_subsys.disabled)
+ struct mem_cgroup *mem = NULL;
+ int ret;
+
+ if (mem_cgroup_disabled())
return 0;
if (PageCompound(page))
return 0;
* For GFP_NOWAIT case, the page may be pre-charged before calling
* add_to_page_cache(). (See shmem.c) check it here and avoid to call
* charge twice. (It works but has to pay a bit larger cost.)
+ * And when the page is SwapCache, it should take swap information
+ * into account. This is under lock_page() now.
*/
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
unlock_page_cgroup(pc);
}
- if (unlikely(!mm))
+ if (do_swap_account && PageSwapCache(page)) {
+ mem = try_get_mem_cgroup_from_swapcache(page);
+ if (mem)
+ mm = NULL;
+ else
+ mem = NULL;
+ /* SwapCache may be still linked to LRU now. */
+ mem_cgroup_lru_del_before_commit_swapcache(page);
+ }
+
+ if (unlikely(!mm && !mem))
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
- else
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+
+ ret = mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+ if (mem)
+ css_put(&mem->css);
+ if (PageSwapCache(page))
+ mem_cgroup_lru_add_after_commit_swapcache(page);
+
+ if (do_swap_account && !ret && PageSwapCache(page)) {
+ swp_entry_t ent = {.val = page_private(page)};
+ /* avoid double counting */
+ mem = swap_cgroup_record(ent, NULL);
+ if (mem) {
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ mem_cgroup_put(mem);
+ }
+ }
+ return ret;
}
+/*
+ * During swap-in (try_charge -> commit or cancel), the page is locked.
+ * When try_charge() returns successfully, one refcnt to the memcg, not yet
+ * associated with a struct page_cgroup, is acquired. This refcnt will be
+ * consumed by "commit()" or removed by "cancel()".
+ */
int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask, struct mem_cgroup **ptr)
{
struct mem_cgroup *mem;
- swp_entry_t ent;
+ int ret;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return 0;
if (!do_swap_account)
goto charge_cur_mm;
-
/*
* A racing thread's fault, or swapoff, may have already updated
* the pte, and even removed page from swap cache: return success
*/
if (!PageSwapCache(page))
return 0;
-
- ent.val = page_private(page);
-
- mem = lookup_swap_cgroup(ent);
- if (!mem || mem->obsolete)
+ mem = try_get_mem_cgroup_from_swapcache(page);
+ if (!mem)
goto charge_cur_mm;
*ptr = mem;
- return __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ /*
+ * Drop extra refcnt from tryget; it was only needed to keep the
+ * memcg pinned across the __mem_cgroup_try_charge() call above.
+ */
+ css_put(&mem->css);
+ return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
return __mem_cgroup_try_charge(mm, mask, ptr, true);
}
-#ifdef CONFIG_SWAP
-
-int mem_cgroup_cache_charge_swapin(struct page *page,
- struct mm_struct *mm, gfp_t mask, bool locked)
-{
- int ret = 0;
-
- if (mem_cgroup_subsys.disabled)
- return 0;
- if (unlikely(!mm))
- mm = &init_mm;
- if (!locked)
- lock_page(page);
- /*
- * If not locked, the page can be dropped from SwapCache until
- * we reach here.
- */
- if (PageSwapCache(page)) {
- struct mem_cgroup *mem = NULL;
- swp_entry_t ent;
-
- ent.val = page_private(page);
- if (do_swap_account) {
- mem = lookup_swap_cgroup(ent);
- if (mem && mem->obsolete)
- mem = NULL;
- if (mem)
- mm = NULL;
- }
- ret = mem_cgroup_charge_common(page, mm, mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
-
- if (!ret && do_swap_account) {
- /* avoid double counting */
- mem = swap_cgroup_record(ent, NULL);
- if (mem) {
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
- mem_cgroup_put(mem);
- }
- }
- }
- if (!locked)
- unlock_page(page);
- /* add this page(page_cgroup) to the LRU we want. */
- mem_cgroup_lru_fixup(page);
-
- return ret;
-}
-#endif
-
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
{
struct page_cgroup *pc;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
if (!ptr)
return;
pc = lookup_page_cgroup(page);
+ mem_cgroup_lru_del_before_commit_swapcache(page); /* unlink if pc is !Used */
__mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ mem_cgroup_lru_add_after_commit_swapcache(page); /* relink if page is on LRU */
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
- * Fix it by uncharging from memsw. This SwapCache is stable
- * because we're still under lock_page().
+ * Fix it by uncharging from memsw. Basically, this SwapCache is stable
+ * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
+ * may call delete_from_swap_cache() before we reach here, hence the
+ * PageSwapCache() check below.
*/
- if (do_swap_account) {
+ if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
struct mem_cgroup *memcg;
memcg = swap_cgroup_record(ent, NULL);
if (memcg) {
- /* If memcg is obsolete, memcg can be != ptr */
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_put(memcg);
}
}
/* add this page(page_cgroup) to the LRU we want. */
- mem_cgroup_lru_fixup(page);
+
}
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
{
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return;
if (!mem)
return;
struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return NULL;
if (PageSwapCache(page))
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
+ /*
+ * pc->mem_cgroup is not cleared here. It will be accessed when it's
+ * freed from LRU. This is safe because uncharged page is expected not
+ * to be reused (freed soon). Exception is SwapCache, it's handled by
+ * special functions.
+ */
mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
- css_put(&mem->css);
+ /* at swapout, this memcg will be accessed to record to swap */
+ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+ css_put(&mem->css);
return mem;
swap_cgroup_record(ent, memcg);
mem_cgroup_get(memcg);
}
+ if (memcg)
+ css_put(&memcg->css);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
struct mem_cgroup *mem = NULL;
int ret = 0;
- if (mem_cgroup_subsys.disabled)
+ if (mem_cgroup_disabled())
return 0;
pc = lookup_page_cgroup(page);
unlock_page_cgroup(pc);
if (mem) {
- ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
css_put(&mem->css);
}
*ptr = mem;
* This is typically used for page reclaiming for shmem for reducing side
* effect of page allocation from shmem, which is used by some mem_cgroup.
*/
-int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
+int mem_cgroup_shrink_usage(struct page *page,
+ struct mm_struct *mm,
+ gfp_t gfp_mask)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
int progress = 0;
int retry = MEM_CGROUP_RECLAIM_RETRIES;
- if (mem_cgroup_subsys.disabled)
- return 0;
- if (!mm)
+ if (mem_cgroup_disabled())
return 0;
-
- rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem)) {
- rcu_read_unlock();
+ if (page)
+ mem = try_get_mem_cgroup_from_swapcache(page);
+ if (!mem && mm)
+ mem = try_get_mem_cgroup_from_mm(mm);
+ if (unlikely(!mem))
return 0;
- }
- css_get(&mem->css);
- rcu_read_unlock();
do {
- progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
- progress += res_counter_check_under_limit(&mem->res);
+ progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
+ progress += mem_cgroup_check_under_limit(mem);
} while (!progress && --retry);
css_put(&mem->css);
if (!ret)
break;
- progress = try_to_free_mem_cgroup_pages(memcg,
- GFP_HIGHUSER_MOVABLE, false);
+ progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+ false);
if (!progress) retry_count--;
}
+
return ret;
}
break;
oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
- try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+ mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
if (curusage >= oldusage)
retry_count--;
}
spin_unlock_irqrestore(&zone->lru_lock, flags);
- ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
+ ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
break;
ret = -EINTR;
goto out;
}
- progress = try_to_free_mem_cgroup_pages(mem,
- GFP_HIGHUSER_MOVABLE, false);
+ progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
+ false, get_swappiness(mem));
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
}
+/* Read handler for "use_hierarchy": report the memcg's use_hierarchy flag. */
+static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+	return mem->use_hierarchy;
+}
+
+/*
+ * Write handler for "use_hierarchy".
+ *
+ * Only 0 and 1 are accepted, and only when the parent (if any) does not
+ * itself use hierarchy: -EINVAL otherwise. The flag can only be changed
+ * while the cgroup has no children: -EBUSY otherwise. For the root cgroup
+ * there is no parent memcg, so only the no-children rule applies.
+ * Returns 0 on success.
+ */
+static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
+					u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	struct mem_cgroup *parent_mem = NULL;
+	int retval;
+
+	if (cont->parent)
+		parent_mem = mem_cgroup_from_cont(cont->parent);
+
+	/* The checks and the update are all done under the cgroup lock. */
+	cgroup_lock();
+	if ((parent_mem && parent_mem->use_hierarchy) || val > 1) {
+		retval = -EINVAL;
+	} else if (!list_empty(&cont->children)) {
+		retval = -EBUSY;
+	} else {
+		mem->use_hierarchy = val;
+		retval = 0;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
return ret;
}
+/*
+ * Compute the effective limits of @memcg. Without use_hierarchy these are
+ * simply its own memory and mem+swap limits. With use_hierarchy, each is
+ * the minimum over @memcg and its ancestors, stopping at the first
+ * ancestor that does not use hierarchy. Results are stored through
+ * @mem_limit and @memsw_limit.
+ */
+static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
+		unsigned long long *mem_limit, unsigned long long *memsw_limit)
+{
+	unsigned long long lim = res_counter_read_u64(&memcg->res, RES_LIMIT);
+	unsigned long long swlim = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	unsigned long long cur;
+	struct cgroup *pos;
+
+	if (memcg->use_hierarchy) {
+		for (pos = memcg->css.cgroup->parent; pos; pos = pos->parent) {
+			struct mem_cgroup *ancestor = mem_cgroup_from_cont(pos);
+
+			if (!ancestor->use_hierarchy)
+				break;
+			cur = res_counter_read_u64(&ancestor->res, RES_LIMIT);
+			if (cur < lim)
+				lim = cur;
+			cur = res_counter_read_u64(&ancestor->memsw, RES_LIMIT);
+			if (cur < swlim)
+				swlim = cur;
+		}
+	}
+
+	*mem_limit = lim;
+	*memsw_limit = swlim;
+}
+
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
struct mem_cgroup *mem;
cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
}
+ {
+ unsigned long long limit, memsw_limit;
+ memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+ cb->fill(cb, "hierarchical_memory_limit", limit);
+ if (do_swap_account)
+ cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
+ }
+
+#ifdef CONFIG_DEBUG_VM
+ cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
+
+ {
+ int nid, zid;
+ struct mem_cgroup_per_zone *mz;
+ unsigned long recent_rotated[2] = {0, 0};
+ unsigned long recent_scanned[2] = {0, 0};
+
+ for_each_online_node(nid)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+
+ recent_rotated[0] +=
+ mz->reclaim_stat.recent_rotated[0];
+ recent_rotated[1] +=
+ mz->reclaim_stat.recent_rotated[1];
+ recent_scanned[0] +=
+ mz->reclaim_stat.recent_scanned[0];
+ recent_scanned[1] +=
+ mz->reclaim_stat.recent_scanned[1];
+ }
+ cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
+ cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
+ cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
+ cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
+ }
+#endif
+
+ return 0;
+}
+
+/* Read handler for "swappiness": report what get_swappiness() yields. */
+static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	return get_swappiness(mem_cgroup_from_cont(cgrp));
+}
+
+/*
+ * Write handler for "swappiness".
+ *
+ * Rejects values above 100 and any write to the root cgroup with -EINVAL.
+ * Under hierarchy only an empty hierarchy root may set the value: the
+ * write also fails with -EINVAL when the parent uses hierarchy, or when
+ * this memcg uses hierarchy and already has children. Returns 0 on
+ * success.
+ */
+static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
+				       u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent;
+
+	if (val > 100)
+		return -EINVAL;
+
+	if (cgrp->parent == NULL)
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(cgrp->parent);
+
+	/*
+	 * mem_cgroup_hierarchy_write() flips use_hierarchy under
+	 * cgroup_lock(). Take the same lock so that the hierarchy check
+	 * and the swappiness update below cannot race with it.
+	 */
+	cgroup_lock();
+
+	/* If under hierarchy, only empty-root can set this value */
+	if ((parent->use_hierarchy) ||
+	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+		cgroup_unlock();
+		return -EINVAL;
+	}
+
+	spin_lock(&memcg->reclaim_param_lock);
+	memcg->swappiness = val;
+	spin_unlock(&memcg->reclaim_param_lock);
+
+	cgroup_unlock();
+
return 0;
}
.name = "force_empty",
.trigger = mem_cgroup_force_empty_write,
},
+ {
+ .name = "use_hierarchy",
+ .write_u64 = mem_cgroup_hierarchy_write,
+ .read_u64 = mem_cgroup_hierarchy_read,
+ },
+ {
+ .name = "swappiness",
+ .read_u64 = mem_cgroup_swappiness_read,
+ .write_u64 = mem_cgroup_swappiness_write,
+ },
};
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
* the number of reference from swap_cgroup and free mem_cgroup when
* it goes down to 0.
*
- * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
- * entry which points to this memcg will be ignore at swapin.
- *
* Removal of cgroup itself succeeds regardless of refs from swap.
*/
-static void mem_cgroup_free(struct mem_cgroup *mem)
+static void __mem_cgroup_free(struct mem_cgroup *mem)
{
int node;
- if (atomic_read(&mem->refcnt) > 0)
- return;
-
-
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
static void mem_cgroup_put(struct mem_cgroup *mem)
{
- if (atomic_dec_and_test(&mem->refcnt)) {
- if (!mem->obsolete)
- return;
- mem_cgroup_free(mem);
- }
+ if (atomic_dec_and_test(&mem->refcnt)) /* refcnt dropped to zero */
+ __mem_cgroup_free(mem);
}
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static void __init enable_swap_cgroup(void)
{
- if (!mem_cgroup_subsys.disabled && really_do_swap_account)
+ if (!mem_cgroup_disabled() && really_do_swap_account) /* only when memcg is enabled */
do_swap_account = 1;
}
#else
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem, *parent;
int node;
mem = mem_cgroup_alloc();
if (!mem)
return ERR_PTR(-ENOMEM);
- res_counter_init(&mem->res);
- res_counter_init(&mem->memsw);
-
for_each_node_state(node, N_POSSIBLE)
if (alloc_mem_cgroup_per_zone_info(mem, node))
goto free_out;
/* root ? */
- if (cont->parent == NULL)
+ if (cont->parent == NULL) {
enable_swap_cgroup();
+ parent = NULL;
+ } else {
+ parent = mem_cgroup_from_cont(cont->parent);
+ mem->use_hierarchy = parent->use_hierarchy; /* inherited from parent */
+ }
+ if (parent && parent->use_hierarchy) { /* hand parent's counters to init */
+ res_counter_init(&mem->res, &parent->res);
+ res_counter_init(&mem->memsw, &parent->memsw);
+ } else {
+ res_counter_init(&mem->res, NULL);
+ res_counter_init(&mem->memsw, NULL);
+ }
+ mem->last_scanned_child = NULL;
+ spin_lock_init(&mem->reclaim_param_lock);
+
+ if (parent)
+ mem->swappiness = get_swappiness(parent); /* start from parent's value */
+ atomic_set(&mem->refcnt, 1); /* dropped by mem_cgroup_put() in mem_cgroup_destroy() */
return &mem->css;
free_out:
- for_each_node_state(node, N_POSSIBLE)
- free_mem_cgroup_per_zone_info(mem, node);
- mem_cgroup_free(mem);
+ __mem_cgroup_free(mem);
return ERR_PTR(-ENOMEM);
}
struct cgroup *cont)
{
struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
- mem->obsolete = 1;
mem_cgroup_force_empty(mem, false);
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
struct cgroup *cont)
{
- mem_cgroup_free(mem_cgroup_from_cont(cont));
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
+
+ if (last_scanned_child) {
+ VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
+ mem_cgroup_put(last_scanned_child); /* ref taken in mem_cgroup_get_next_node() */
+ }
+ mem_cgroup_put(mem); /* initial ref set in mem_cgroup_create() */
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
struct cgroup *old_cont,
struct task_struct *p)
{
- struct mm_struct *mm;
- struct mem_cgroup *mem, *old_mem;
-
- mm = get_task_mm(p);
- if (mm == NULL)
- return;
-
- mem = mem_cgroup_from_cont(cont);
- old_mem = mem_cgroup_from_cont(old_cont);
-
+ mutex_lock(&memcg_tasklist);
/*
- * Only thread group leaders are allowed to migrate, the mm_struct is
- * in effect owned by the leader
+ * FIXME: It's better to move charges of this process from old
+ * memcg to new memcg. But it's just on TODO-List now.
*/
- if (!thread_group_leader(p))
- goto out;
-
-out:
- mmput(mm);
+ mutex_unlock(&memcg_tasklist);
}
struct cgroup_subsys mem_cgroup_subsys = {