/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>

#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
static struct kmem_cache *page_cgroup_cache __read_mostly;
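/*
 * Maximum number of reclaim retries used when charging, shrinking usage
 * or resizing the limit; see mem_cgroup_charge_common(),
 * mem_cgroup_shrink_usage() and mem_cgroup_resize_limit() below.
 */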
#define MEM_CGROUP_RECLAIM_RETRIES	5
/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
/*
 * For accounting in irq-disabled context there is no need to bump the
 * preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
	stat->count[idx] += val;
static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
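/*
 * For example, MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) is the number of
 * active anonymous pages currently charged to this cgroup in the zone
 * described by mz; it is updated under mz->lru_lock by the LRU list
 * helpers below.
 */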
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
/*
 * The memory controller data structure.  The memory controller controls
 * both page cache and RSS per cgroup.  We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for
 * clock-pro, to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller.  Reclaim will begin
 * when we hit the water mark.  Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup which is at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	int	prev_priority;	/* for recording reclaim priority */

	struct mem_cgroup_stat stat;
static struct mem_cgroup init_mem_cgroup;
/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock.  We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).  But since
 * bit_spin_lock doesn't actually set that lock bit in a non-debug
 * uniprocessor kernel, we should avoid setting it here too.
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
#define PAGE_CGROUP_LOCK	0x0
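/*
 * For example, with a page_cgroup allocated at 0xffff880012345678, the
 * value stored in page->page_cgroup on SMP is 0xffff880012345679
 * (pointer | PAGE_CGROUP_LOCK); page_get_page_cgroup() masks the low
 * bit off again before use.  This relies on page_cgroup_cache returning
 * at least 2-byte-aligned objects.
 */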
/*
 * A page_cgroup page is associated with every page descriptor.  The
 * page_cgroup helps us identify information about the cgroup
 */
	struct list_head lru;		/* per cgroup LRU list */
	struct mem_cgroup *mem_cgroup;

	/* flags for mem_cgroup */
	PCG_CACHE, /* charged as cache */
	/* flags for LRU placement */
	PCG_ACTIVE, /* page is active in this cgroup */
	PCG_FILE, /* page is file system backed */
	PCG_UNEVICTABLE, /* page is unevictable */
#define TESTPCGFLAG(uname, lname)			\
static inline int PageCgroup##uname(struct page_cgroup *pc)	\
	{ return test_bit(PCG_##lname, &pc->flags); }

#define SETPCGFLAG(uname, lname)			\
static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
	{ set_bit(PCG_##lname, &pc->flags); }

#define CLEARPCGFLAG(uname, lname)			\
static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
	{ clear_bit(PCG_##lname, &pc->flags); }
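/*
 * For example, TESTPCGFLAG(Cache, CACHE) below expands to:
 *
 *	static inline int PageCgroupCache(struct page_cgroup *pc)
 *	{ return test_bit(PCG_CACHE, &pc->flags); }
 */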
/* Cache flag is set only once (at allocation) */
TESTPCGFLAG(Cache, CACHE)

/* LRU management flags (from global-lru definition) */
TESTPCGFLAG(File, FILE)
SETPCGFLAG(File, FILE)
CLEARPCGFLAG(File, FILE)

TESTPCGFLAG(Active, ACTIVE)
SETPCGFLAG(Active, ACTIVE)
CLEARPCGFLAG(Active, ACTIVE)

TESTPCGFLAG(Unevictable, UNEVICTABLE)
SETPCGFLAG(Unevictable, UNEVICTABLE)
CLEARPCGFLAG(Unevictable, UNEVICTABLE)
static int page_cgroup_nid(struct page_cgroup *pc)
	return page_to_nid(pc->page);

static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
	return page_zonenum(pc->page);
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */

static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	((1 << PCG_CACHE) | (1 << PCG_FILE)),
	((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
/*
 * Always modified under the lru lock, so there is no need to
 * preempt_disable().
 */
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					struct page_cgroup *pc,
	int val = (charge)? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;

	VM_BUG_ON(!irqs_disabled());

	cpustat = &stat->cpustat[smp_processor_id()];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	return mem_cgroup_zoneinfo(mem, nid, zid);
static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
	struct mem_cgroup_per_zone *mz;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
static inline int page_cgroup_locked(struct page *page)
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);

static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
	VM_BUG_ON(!page_cgroup_locked(page));
	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);

struct page_cgroup *page_get_page_cgroup(struct page *page)
	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);

static void lock_page_cgroup(struct page *page)
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);

static int try_lock_page_cgroup(struct page *page)
	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);

static void unlock_page_cgroup(struct page *page)
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc)
	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
		if (PageCgroupActive(pc))
		if (PageCgroupFile(pc))

	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);

static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
				struct page_cgroup *pc)
	if (PageCgroupUnevictable(pc))
		lru = LRU_UNEVICTABLE;
		if (PageCgroupActive(pc))
		if (PageCgroupFile(pc))

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
	int active = PageCgroupActive(pc);
	int file = PageCgroupFile(pc);
	int unevictable = PageCgroupUnevictable(pc);
	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
				(LRU_FILE * !!file + !!active);
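	/*
	 * With this indexing, (file, active) selects:
	 * (0,0) -> LRU_INACTIVE_ANON, (0,1) -> LRU_ACTIVE_ANON,
	 * (1,0) -> LRU_INACTIVE_FILE, (1,1) -> LRU_ACTIVE_FILE,
	 * while unevictable pages always use LRU_UNEVICTABLE.
	 */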
	MEM_CGROUP_ZSTAT(mz, from) -= 1;
	/*
	 * This is done under mz->lru_lock, but other flags, which are not
	 * related to the LRU, may be modified outside of the lock.
	 * We have to use atomic set/clear flags.
	 */
	if (is_unevictable_lru(lru)) {
		ClearPageCgroupActive(pc);
		SetPageCgroupUnevictable(pc);
		if (is_active_lru(lru))
			SetPageCgroupActive(pc);
			ClearPageCgroupActive(pc);
		ClearPageCgroupUnevictable(pc);

	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_move(&pc->lru, &mz->lists[lru]);
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
	ret = task->mm && mm_match_cgroup(task->mm, mem);

/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_subsys.disabled)

	/*
	 * We cannot lock_page_cgroup while holding zone's lru_lock,
	 * because other holders of lock_page_cgroup can be interrupted
	 * with an attempt to rotate_reclaimable_page.  But we cannot
	 * safely get to page_cgroup without it, so just try_lock it:
	 * mem_cgroup_isolate_pages allows for pages left on the wrong list.
	 */
	if (!try_lock_page_cgroup(page))

	pc = page_get_page_cgroup(page);
		mz = page_cgroup_zoneinfo(pc);
		spin_lock_irqsave(&mz->lru_lock, flags);
		__mem_cgroup_move_lists(pc, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
	unlock_page_cgroup(page);
/*
 * Calculate mapped_ratio under the memory controller.  This will be used
 * in vmscan.c to determine whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
	/*
	 * usage is recorded in bytes.  But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
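/*
 * For example, with 400 pages of total usage of which 100 pages are RSS,
 * this returns (100 * 100) / 401 = 24, i.e. roughly a quarter of the
 * charged pages are mapped.
 */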
/*
 * prev_priority control... this will be used in the memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
	return mem->prev_priority;

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
	mem->prev_priority = priority;
/*
 * Calculate # of pages to be scanned in this priority/zone.
 *
 * priority starts from "DEF_PRIORITY" and is decremented in each loop.
 * (see include/linux/mmzone.h)
 */
long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
					int priority, enum lru_list lru)
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);

	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);

	return (nr_pages >> priority);
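/*
 * For example, with 4096 inactive file pages on this cgroup's per-zone
 * LRU and priority 12 (DEF_PRIORITY), this asks for 4096 >> 12 = 1 page;
 * at priority 0 the whole list (4096 pages) becomes a scan candidate.
 */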
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active, int file)
	unsigned long nr_taken = 0;
	struct list_head *src;
	struct page_cgroup *pc, *tmp;
	int nid = z->zone_pgdat->node_id;
	int zid = zone_idx(z);
	struct mem_cgroup_per_zone *mz;
	int lru = LRU_FILE * !!file + !!active;

	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
	src = &mz->lists[lru];

	spin_lock(&mz->lru_lock);
	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
		if (scan >= nr_to_scan)

		if (unlikely(!PageLRU(page)))

		/*
		 * TODO: play better with lumpy reclaim, grabbing anything.
		 */
		if (PageUnevictable(page) ||
		    (PageActive(page) && !active) ||
		    (!PageActive(page) && active)) {
			__mem_cgroup_move_lists(pc, page_lru(page));

		list_move(&pc->lru, &pc_list);
		if (__isolate_lru_page(page, mode, file) == 0) {
			list_move(&page->lru, dst);

	list_splice(&pc_list, src);
	spin_unlock(&mz->lru_lock);
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
				gfp_t gfp_mask, enum charge_type ctype,
				struct mem_cgroup *memcg)
	struct mem_cgroup *mem;
	struct page_cgroup *pc;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup_per_zone *mz;

	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
	if (unlikely(pc == NULL))

	/*
	 * We always charge the cgroup the mm_struct belongs to.
	 * The mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates.  It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (likely(!memcg)) {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem)) {
			kmem_cache_free(page_cgroup_cache, pc);
		/*
		 * For every charge from the cgroup, increment reference count
		 */
		css_get(&memcg->css);
	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
		if (!(gfp_mask & __GFP_WAIT))

		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim.  Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))

			mem_cgroup_out_of_memory(mem, gfp_mask);

	pc->mem_cgroup = mem;
	/*
	 * If a page is accounted as page cache, insert it into the inactive
	 * list.  If anon, insert it into the active list.
	 */
	pc->flags = pcg_default_flags[ctype];

	lock_page_cgroup(page);
	if (unlikely(page_get_page_cgroup(page))) {
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		kmem_cache_free(page_cgroup_cache, pc);
	page_assign_page_cgroup(page, pc);

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_add_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	unlock_page_cgroup(page);

	kmem_cache_free(page_cgroup_cache, pc);
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
	if (mem_cgroup_subsys.disabled)

	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has an address_space.
	 * But page->mapping may hold an out-of-use anon_vma pointer;
	 * detect that by the PageAnon() check.  A newly-mapped-anon's
	 * page->mapping is NULL.
	 */
	if (page_mapped(page) || (page->mapping && !PageAnon(page)))

	return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
	if (mem_cgroup_subsys.disabled)

	/*
	 * Corner case handling.  This is usually called from
	 * add_to_page_cache().  But some filesystems (shmem) precharge the
	 * page before calling it and then call add_to_page_cache() with
	 * GFP_NOWAIT.
	 *
	 * For the GFP_NOWAIT case, the page may be pre-charged before
	 * calling add_to_page_cache() (see shmem.c); check it here and
	 * avoid charging twice.  (It works, but pays a slightly larger cost.)
	 */
	if (!(gfp_mask & __GFP_WAIT)) {
		struct page_cgroup *pc;

		lock_page_cgroup(page);
		pc = page_get_page_cgroup(page);
			VM_BUG_ON(pc->page != page);
			VM_BUG_ON(!pc->mem_cgroup);
			unlock_page_cgroup(page);
		unlock_page_cgroup(page);

	if (page_is_file_cache(page))
		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);

		return mem_cgroup_charge_common(page, mm, gfp_mask,
				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
/*
 * uncharge if !page_mapped(page)
 */
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_subsys.disabled)

	/*
	 * Check if our page_cgroup is valid
	 */
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);

	VM_BUG_ON(pc->page != page);

	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
	    && ((PageCgroupCache(pc) || page_mapped(page))))

	mz = page_cgroup_zoneinfo(pc);
	spin_lock_irqsave(&mz->lru_lock, flags);
	__mem_cgroup_remove_list(mz, pc);
	spin_unlock_irqrestore(&mz->lru_lock, flags);

	page_assign_page_cgroup(page, NULL);
	unlock_page_cgroup(page);

	mem = pc->mem_cgroup;
	res_counter_uncharge(&mem->res, PAGE_SIZE);

	kmem_cache_free(page_cgroup_cache, pc);

	unlock_page_cgroup(page);
void mem_cgroup_uncharge_page(struct page *page)
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);

void mem_cgroup_uncharge_cache_page(struct page *page)
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
/*
 * Before starting migration, account against the new page.
 */
int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;

	if (mem_cgroup_subsys.disabled)

	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
		mem = pc->mem_cgroup;
		if (PageCgroupCache(pc)) {
			if (page_is_file_cache(page))
				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
	unlock_page_cgroup(page);
		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
/* remove the redundant charge if migration failed */
void mem_cgroup_end_migration(struct page *newpage)
	/*
	 * On success, page->mapping is not NULL.
	 * Special rollback care is necessary when
	 * 1. migration fails (newpage->mapping is cleared in this case).
	 * 2. the newpage was moved but not remapped again because the task
	 *    exits and the newpage is obsolete.  In this case, the new page
	 *    may be a swapcache.  So, we just call mem_cgroup_uncharge_page()
	 *    always to avoid a mess.  The page_cgroup will be removed if
	 *    unnecessary.  File cache pages are still on the radix-tree.
	 *    Don't care about them.
	 */
	if (!newpage->mapping)
		__mem_cgroup_uncharge_common(newpage,
				MEM_CGROUP_CHARGE_TYPE_FORCE);
	else if (PageAnon(newpage))
		mem_cgroup_uncharge_page(newpage);
/*
 * A call to try to shrink memory usage under the specified resource
 * controller.  This is typically used for page reclaim on behalf of shmem,
 * to reduce the side effects of page allocation from shmem, which is used
 * by some memory cgroups.
 */
int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
	struct mem_cgroup *mem;
	int retry = MEM_CGROUP_RECLAIM_RETRIES;

	if (mem_cgroup_subsys.disabled)

	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!mem)) {

		progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		progress += res_counter_check_under_limit(&mem->res);
	} while (!progress && --retry);
int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;

	while (res_counter_set_limit(&memcg->res, val)) {
		if (signal_pending(current)) {
		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
/*
 * This routine traverses the page_cgroups on the given list and drops them
 * all.  *And* this routine doesn't reclaim the pages themselves, it just
 * removes the page_cgroups.
 */
#define FORCE_UNCHARGE_BATCH	(128)
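/*
 * Uncharging is done in batches of FORCE_UNCHARGE_BATCH so that draining a
 * very long per-zone list does not hold the lru_lock or the CPU for
 * excessive stretches; the lock (and irqs) are dropped around each
 * uncharge in the loop below.
 */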
static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
			struct mem_cgroup_per_zone *mz,
	struct page_cgroup *pc;
	int count = FORCE_UNCHARGE_BATCH;
	struct list_head *list;

	list = &mz->lists[lru];

	spin_lock_irqsave(&mz->lru_lock, flags);
	while (!list_empty(list)) {
		pc = list_entry(list->prev, struct page_cgroup, lru);
		spin_unlock_irqrestore(&mz->lru_lock, flags);
		/*
		 * Check if this page is on the LRU.  A !LRU page can be
		 * found if it's under page migration.
		 */
			__mem_cgroup_uncharge_common(page,
					MEM_CGROUP_CHARGE_TYPE_FORCE);
				count = FORCE_UNCHARGE_BATCH;
		spin_lock_irqsave(&mz->lru_lock, flags);
	spin_unlock_irqrestore(&mz->lru_lock, flags);
/*
 * Make the mem_cgroup's charge 0 if there is no task.
 * This enables deleting this mem_cgroup.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *mem)
	/*
	 * page reclaim code (kswapd etc..) will move pages between
	 * active_list <-> inactive_list while we don't take a lock.
	 * So, we have to do a loop here until all lists are empty.
	 */
	while (mem->res.usage > 0) {
		if (atomic_read(&mem->css.cgroup->count) > 0)
		for_each_node_state(node, N_POSSIBLE)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				struct mem_cgroup_per_zone *mz;

				mz = mem_cgroup_zoneinfo(mem, node, zid);
					mem_cgroup_force_empty_list(mem, mz, l);
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
	return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,

/*
 * The user of this function is...
 */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long long val;

	switch (cft->private) {
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
			ret = mem_cgroup_resize_limit(memcg, val);
		ret = -EINVAL; /* should be BUG() ? */
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_cont(cont);
		res_counter_reset_max(&mem->res);
		res_counter_reset_failcnt(&mem->res);

static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
static const struct mem_cgroup_stat_desc {
} mem_cgroup_stat_desc[] = {
	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
	[MEM_CGROUP_STAT_PGPGIN_COUNT] = { "pgpgin", 1, },
	[MEM_CGROUP_STAT_PGPGOUT_COUNT] = { "pgpgout", 1, },
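/*
 * cache/rss are kept in pages and scaled to bytes via .unit == PAGE_SIZE
 * when reported; e.g. a per-cgroup cache count of 3 pages is reported by
 * mem_control_stat_show() as "cache 12288" on a 4K-page system.
 * pgpgin/pgpgout are event counts and are reported unscaled.
 */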
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct cgroup_map_cb *cb)
	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
	struct mem_cgroup_stat *stat = &mem_cont->stat;

	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
		val = mem_cgroup_read_stat(stat, i);
		val *= mem_cgroup_stat_desc[i].unit;
		cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
	/* showing # of active pages */
		unsigned long active_anon, inactive_anon;
		unsigned long active_file, inactive_file;
		unsigned long unevictable;

		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
		active_file = mem_cgroup_get_all_zonestat(mem_cont,
		unevictable = mem_cgroup_get_all_zonestat(mem_cont,

		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
static struct cftype mem_cgroup_files[] = {
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read_u64 = mem_cgroup_read,

		.name = "max_usage_in_bytes",
		.private = RES_MAX_USAGE,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,

		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write_string = mem_cgroup_write,
		.read_u64 = mem_cgroup_read,

		.private = RES_FAILCNT,
		.trigger = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read,

		.name = "force_empty",
		.trigger = mem_force_empty_write,

		.read_map = mem_control_stat_show,
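	/*
	 * These entries show up as memory.* files in the cgroup filesystem.
	 * Typical usage from userspace (assuming the memory controller is
	 * mounted at /cgroups):
	 *
	 *	mkdir /cgroups/grp0
	 *	echo $$ > /cgroups/grp0/tasks
	 *	echo 64M > /cgroups/grp0/memory.limit_in_bytes
	 *	cat /cgroups/grp0/memory.usage_in_bytes
	 */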
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined.  It's better to use a memory hotplug
	 *       callback.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

	mem->info.nodeinfo[node] = pn;
	memset(pn, 0, sizeof(*pn));

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		spin_lock_init(&mz->lru_lock);
			INIT_LIST_HEAD(&mz->lists[l]);
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
	kfree(mem->info.nodeinfo[node]);
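/*
 * struct mem_cgroup embeds a per-possible-CPU, cacheline-aligned statistics
 * array, so it can exceed a page on large NR_CPUS configurations.
 * mem_cgroup_alloc() below therefore falls back to vmalloc() once the
 * structure no longer fits in a single page, presumably to avoid depending
 * on higher-order allocations.
 */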
static struct mem_cgroup *mem_cgroup_alloc(void)
	struct mem_cgroup *mem;

	if (sizeof(*mem) < PAGE_SIZE)
		mem = kmalloc(sizeof(*mem), GFP_KERNEL);
		mem = vmalloc(sizeof(*mem));

		memset(mem, 0, sizeof(*mem));

static void mem_cgroup_free(struct mem_cgroup *mem)
	if (sizeof(*mem) < PAGE_SIZE)
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
	struct mem_cgroup *mem;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
		mem = mem_cgroup_alloc();
			return ERR_PTR(-ENOMEM);

	res_counter_init(&mem->res);

	for_each_node_state(node, N_POSSIBLE)
		if (alloc_mem_cgroup_per_zone_info(mem, node))

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);
	if (cont->parent != NULL)
		mem_cgroup_free(mem);
	return ERR_PTR(-ENOMEM);
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
					struct cgroup *cont)
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	mem_cgroup_force_empty(mem);

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);

	for_each_node_state(node, N_POSSIBLE)
		free_mem_cgroup_per_zone_info(mem, node);

	mem_cgroup_free(mem_cgroup_from_cont(cont));
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
	return cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	/*
	 * Only thread group leaders are allowed to migrate; the mm_struct is
	 * in effect owned by the leader.
	 */
	if (!thread_group_leader(p))
struct cgroup_subsys mem_cgroup_subsys = {
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,