mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  */
  19
  20 #include <linux/res_counter.h>
  21 #include <linux/memcontrol.h>
  22 #include <linux/cgroup.h>
  23 #include <linux/mm.h>
  24 #include <linux/smp.h>
  25 #include <linux/page-flags.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bit_spinlock.h>
  28 #include <linux/rcupdate.h>
  29 #include <linux/slab.h>
  30 #include <linux/swap.h>
  31 #include <linux/spinlock.h>
  32 #include <linux/fs.h>
  33 #include <linux/seq_file.h>
  34 #include <linux/vmalloc.h>
  35 #include <linux/mm_inline.h>
  36
  37 #include <asm/uaccess.h>
  38
  39 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  40 static struct kmem_cache *page_cgroup_cache __read_mostly;
  41 #define MEM_CGROUP_RECLAIM_RETRIES      5
  42
  43 /*
  44  * Statistics for memory cgroup.
  45  */
  46 enum mem_cgroup_stat_index {
  47         /*
  48          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  49          */
  50         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
  51         MEM_CGROUP_STAT_RSS,       /* # of pages charged as rss */
  52         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
  53         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  54
  55         MEM_CGROUP_STAT_NSTATS,
  56 };
  57
  58 struct mem_cgroup_stat_cpu {
  59         s64 count[MEM_CGROUP_STAT_NSTATS];
  60 } ____cacheline_aligned_in_smp;
  61
  62 struct mem_cgroup_stat {
  63         struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
  64 };
  65
  66 /*
  67  * For accounting under irq disable, no need for increment preempt count.
  68  */
  69 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
  70                 enum mem_cgroup_stat_index idx, int val)
  71 {
  72         stat->count[idx] += val;
  73 }
  74
  75 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
  76                 enum mem_cgroup_stat_index idx)
  77 {
  78         int cpu;
  79         s64 ret = 0;
  80         for_each_possible_cpu(cpu)
  81                 ret += stat->cpustat[cpu].count[idx];
  82         return ret;
  83 }
  84
  85 /*
  86  * per-zone information in memory controller.
  87  */
  88 struct mem_cgroup_per_zone {
  89         /*
  90          * spin_lock to protect the per cgroup LRU
  91          */
  92         spinlock_t              lru_lock;
  93         struct list_head        lists[NR_LRU_LISTS];
  94         unsigned long           count[NR_LRU_LISTS];
  95 };
  96 /* Macro for accessing counter */
  97 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
  98
  99 struct mem_cgroup_per_node {
 100         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 101 };
 102
 103 struct mem_cgroup_lru_info {
 104         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 105 };
 106
 107 /*
 108  * The memory controller data structure. The memory controller controls both
 109  * page cache and RSS per cgroup. We would eventually like to provide
 110  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 111  * to help the administrator determine what knobs to tune.
 112  *
 113  * TODO: Add a water mark for the memory controller. Reclaim will begin when
 114  * we hit the water mark. May be even add a low water mark, such that
 115  * no reclaim occurs from a cgroup at it's low water mark, this is
 116  * a feature that will be implemented much later in the future.
 117  */
 118 struct mem_cgroup {
 119         struct cgroup_subsys_state css;
 120         /*
 121          * the counter to account for memory usage
 122          */
 123         struct res_counter res;
 124         /*
 125          * Per cgroup active and inactive list, similar to the
 126          * per zone LRU lists.
 127          */
 128         struct mem_cgroup_lru_info info;
 129
 130         int     prev_priority;  /* for recording reclaim priority */
 131         /*
 132          * statistics.
 133          */
 134         struct mem_cgroup_stat stat;
 135 };
 136 static struct mem_cgroup init_mem_cgroup;
 137
 138 /*
 139  * We use the lower bit of the page->page_cgroup pointer as a bit spin
 140  * lock.  We need to ensure that page->page_cgroup is at least two
 141  * byte aligned (based on comments from Nick Piggin).  But since
 142  * bit_spin_lock doesn't actually set that lock bit in a non-debug
 143  * uniprocessor kernel, we should avoid setting it here too.
 144  */
 145 #define PAGE_CGROUP_LOCK_BIT    0x0
 146 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
 147 #define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
 148 #else
 149 #define PAGE_CGROUP_LOCK        0x0
 150 #endif
 151
 152 /*
 153  * A page_cgroup page is associated with every page descriptor. The
 154  * page_cgroup helps us identify information about the cgroup
 155  */
 156 struct page_cgroup {
 157         struct list_head lru;           /* per cgroup LRU list */
 158         struct page *page;
 159         struct mem_cgroup *mem_cgroup;
 160         unsigned long flags;
 161 };
 162
 163 enum {
 164         /* flags for mem_cgroup */
 165         PCG_CACHE, /* charged as cache */
 166         /* flags for LRU placement */
 167         PCG_ACTIVE, /* page is active in this cgroup */
 168         PCG_FILE, /* page is file system backed */
 169         PCG_UNEVICTABLE, /* page is unevictableable */
 170 };
 171
 172 #define TESTPCGFLAG(uname, lname)                       \
 173 static inline int PageCgroup##uname(struct page_cgroup *pc)     \
 174         { return test_bit(PCG_##lname, &pc->flags); }
 175
 176 #define SETPCGFLAG(uname, lname)                        \
 177 static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
 178         { set_bit(PCG_##lname, &pc->flags);  }
 179
 180 #define CLEARPCGFLAG(uname, lname)                      \
 181 static inline void ClearPageCgroup##uname(struct page_cgroup *pc)       \
 182         { clear_bit(PCG_##lname, &pc->flags);  }
 183
 184
 185 /* Cache flag is set only once (at allocation) */
 186 TESTPCGFLAG(Cache, CACHE)
 187
 188 /* LRU management flags (from global-lru definition) */
 189 TESTPCGFLAG(File, FILE)
 190 SETPCGFLAG(File, FILE)
 191 CLEARPCGFLAG(File, FILE)
 192
 193 TESTPCGFLAG(Active, ACTIVE)
 194 SETPCGFLAG(Active, ACTIVE)
 195 CLEARPCGFLAG(Active, ACTIVE)
 196
 197 TESTPCGFLAG(Unevictable, UNEVICTABLE)
 198 SETPCGFLAG(Unevictable, UNEVICTABLE)
 199 CLEARPCGFLAG(Unevictable, UNEVICTABLE)
 200
 201 static int page_cgroup_nid(struct page_cgroup *pc)
 202 {
 203         return page_to_nid(pc->page);
 204 }
 205
 206 static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
 207 {
 208         return page_zonenum(pc->page);
 209 }
 210
 211 enum charge_type {
 212         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 213         MEM_CGROUP_CHARGE_TYPE_MAPPED,
 214         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
 215         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
 216         NR_CHARGE_TYPE,
 217 };
 218
 219 static const unsigned long
 220 pcg_default_flags[NR_CHARGE_TYPE] = {
 221         ((1 << PCG_CACHE) | (1 << PCG_FILE)),
 222         ((1 << PCG_ACTIVE)),
 223         ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
 224         0,
 225 };
 226
 227 /*
 228  * Always modified under lru lock. Then, not necessary to preempt_disable()
 229  */
 230 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 231                                          struct page_cgroup *pc,
 232                                          bool charge)
 233 {
 234         int val = (charge)? 1 : -1;
 235         struct mem_cgroup_stat *stat = &mem->stat;
 236         struct mem_cgroup_stat_cpu *cpustat;
 237
 238         VM_BUG_ON(!irqs_disabled());
 239
 240         cpustat = &stat->cpustat[smp_processor_id()];
 241         if (PageCgroupCache(pc))
 242                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 243         else
 244                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 245
 246         if (charge)
 247                 __mem_cgroup_stat_add_safe(cpustat,
 248                                 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 249         else
 250                 __mem_cgroup_stat_add_safe(cpustat,
 251                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 252 }
 253
 254 static struct mem_cgroup_per_zone *
 255 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 256 {
 257         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 258 }
 259
 260 static struct mem_cgroup_per_zone *
 261 page_cgroup_zoneinfo(struct page_cgroup *pc)
 262 {
 263         struct mem_cgroup *mem = pc->mem_cgroup;
 264         int nid = page_cgroup_nid(pc);
 265         int zid = page_cgroup_zid(pc);
 266
 267         return mem_cgroup_zoneinfo(mem, nid, zid);
 268 }
 269
 270 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
 271                                         enum lru_list idx)
 272 {
 273         int nid, zid;
 274         struct mem_cgroup_per_zone *mz;
 275         u64 total = 0;
 276
 277         for_each_online_node(nid)
 278                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 279                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
 280                         total += MEM_CGROUP_ZSTAT(mz, idx);
 281                 }
 282         return total;
 283 }
 284
 285 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 286 {
 287         return container_of(cgroup_subsys_state(cont,
 288                                 mem_cgroup_subsys_id), struct mem_cgroup,
 289                                 css);
 290 }
 291
 292 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 293 {
 294         /*
 295          * mm_update_next_owner() may clear mm->owner to NULL
 296          * if it races with swapoff, page migration, etc.
 297          * So this can be called with p == NULL.
 298          */
 299         if (unlikely(!p))
 300                 return NULL;
 301
 302         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 303                                 struct mem_cgroup, css);
 304 }
 305
 306 static inline int page_cgroup_locked(struct page *page)
 307 {
 308         return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 309 }
 310
 311 static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 312 {
 313         VM_BUG_ON(!page_cgroup_locked(page));
 314         page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
 315 }
 316
 317 struct page_cgroup *page_get_page_cgroup(struct page *page)
 318 {
 319         return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
 320 }
 321
 322 static void lock_page_cgroup(struct page *page)
 323 {
 324         bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 325 }
 326
 327 static int try_lock_page_cgroup(struct page *page)
 328 {
 329         return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 330 }
 331
 332 static void unlock_page_cgroup(struct page *page)
 333 {
 334         bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 335 }
 336
 337 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 338                         struct page_cgroup *pc)
 339 {
 340         int lru = LRU_BASE;
 341
 342         if (PageCgroupUnevictable(pc))
 343                 lru = LRU_UNEVICTABLE;
 344         else {
 345                 if (PageCgroupActive(pc))
 346                         lru += LRU_ACTIVE;
 347                 if (PageCgroupFile(pc))
 348                         lru += LRU_FILE;
 349         }
 350
 351         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 352
 353         mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 354         list_del(&pc->lru);
 355 }
 356
 357 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 358                                 struct page_cgroup *pc)
 359 {
 360         int lru = LRU_BASE;
 361
 362         if (PageCgroupUnevictable(pc))
 363                 lru = LRU_UNEVICTABLE;
 364         else {
 365                 if (PageCgroupActive(pc))
 366                         lru += LRU_ACTIVE;
 367                 if (PageCgroupFile(pc))
 368                         lru += LRU_FILE;
 369         }
 370
 371         MEM_CGROUP_ZSTAT(mz, lru) += 1;
 372         list_add(&pc->lru, &mz->lists[lru]);
 373
 374         mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 375 }
 376
 377 static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 378 {
 379         struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
 380         int active    = PageCgroupActive(pc);
 381         int file      = PageCgroupFile(pc);
 382         int unevictable = PageCgroupUnevictable(pc);
 383         enum lru_list from = unevictable ? LRU_UNEVICTABLE :
 384                                 (LRU_FILE * !!file + !!active);
 385
 386         if (lru == from)
 387                 return;
 388
 389         MEM_CGROUP_ZSTAT(mz, from) -= 1;
 390         /*
 391          * However this is done under mz->lru_lock, another flags, which
 392          * are not related to LRU, will be modified from out-of-lock.
 393          * We have to use atomic set/clear flags.
 394          */
 395         if (is_unevictable_lru(lru)) {
 396                 ClearPageCgroupActive(pc);
 397                 SetPageCgroupUnevictable(pc);
 398         } else {
 399                 if (is_active_lru(lru))
 400                         SetPageCgroupActive(pc);
 401                 else
 402                         ClearPageCgroupActive(pc);
 403                 ClearPageCgroupUnevictable(pc);
 404         }
 405
 406         MEM_CGROUP_ZSTAT(mz, lru) += 1;
 407         list_move(&pc->lru, &mz->lists[lru]);
 408 }
 409
 410 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 411 {
 412         int ret;
 413
 414         task_lock(task);
 415         ret = task->mm && mm_match_cgroup(task->mm, mem);
 416         task_unlock(task);
 417         return ret;
 418 }
 419
 420 /*
 421  * This routine assumes that the appropriate zone's lru lock is already held
 422  */
 423 void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 424 {
 425         struct page_cgroup *pc;
 426         struct mem_cgroup_per_zone *mz;
 427         unsigned long flags;
 428
 429         if (mem_cgroup_subsys.disabled)
 430                 return;
 431
 432         /*
 433          * We cannot lock_page_cgroup while holding zone's lru_lock,
 434          * because other holders of lock_page_cgroup can be interrupted
 435          * with an attempt to rotate_reclaimable_page.  But we cannot
 436          * safely get to page_cgroup without it, so just try_lock it:
 437          * mem_cgroup_isolate_pages allows for page left on wrong list.
 438          */
 439         if (!try_lock_page_cgroup(page))
 440                 return;
 441
 442         pc = page_get_page_cgroup(page);
 443         if (pc) {
 444                 mz = page_cgroup_zoneinfo(pc);
 445                 spin_lock_irqsave(&mz->lru_lock, flags);
 446                 __mem_cgroup_move_lists(pc, lru);
 447                 spin_unlock_irqrestore(&mz->lru_lock, flags);
 448         }
 449         unlock_page_cgroup(page);
 450 }
 451
 452 /*
 453  * Calculate mapped_ratio under memory controller. This will be used in
 454  * vmscan.c for deteremining we have to reclaim mapped pages.
 455  */
 456 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 457 {
 458         long total, rss;
 459
 460         /*
 461          * usage is recorded in bytes. But, here, we assume the number of
 462          * physical pages can be represented by "long" on any arch.
 463          */
 464         total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
 465         rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 466         return (int)((rss * 100L) / total);
 467 }
 468
 469 /*
 470  * prev_priority control...this will be used in memory reclaim path.
 471  */
 472 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
 473 {
 474         return mem->prev_priority;
 475 }
 476
 477 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
 478 {
 479         if (priority < mem->prev_priority)
 480                 mem->prev_priority = priority;
 481 }
 482
 483 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
 484 {
 485         mem->prev_priority = priority;
 486 }
 487
 488 /*
 489  * Calculate # of pages to be scanned in this priority/zone.
 490  * See also vmscan.c
 491  *
 492  * priority starts from "DEF_PRIORITY" and decremented in each loop.
 493  * (see include/linux/mmzone.h)
 494  */
 495
 496 long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 497                                         int priority, enum lru_list lru)
 498 {
 499         long nr_pages;
 500         int nid = zone->zone_pgdat->node_id;
 501         int zid = zone_idx(zone);
 502         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 503
 504         nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 505
 506         return (nr_pages >> priority);
 507 }
 508
 509 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 510                                         struct list_head *dst,
 511                                         unsigned long *scanned, int order,
 512                                         int mode, struct zone *z,
 513                                         struct mem_cgroup *mem_cont,
 514                                         int active, int file)
 515 {
 516         unsigned long nr_taken = 0;
 517         struct page *page;
 518         unsigned long scan;
 519         LIST_HEAD(pc_list);
 520         struct list_head *src;
 521         struct page_cgroup *pc, *tmp;
 522         int nid = z->zone_pgdat->node_id;
 523         int zid = zone_idx(z);
 524         struct mem_cgroup_per_zone *mz;
 525         int lru = LRU_FILE * !!file + !!active;
 526
 527         BUG_ON(!mem_cont);
 528         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 529         src = &mz->lists[lru];
 530
 531         spin_lock(&mz->lru_lock);
 532         scan = 0;
 533         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 534                 if (scan >= nr_to_scan)
 535                         break;
 536                 page = pc->page;
 537
 538                 if (unlikely(!PageLRU(page)))
 539                         continue;
 540
 541                 /*
 542                  * TODO: play better with lumpy reclaim, grabbing anything.
 543                  */
 544                 if (PageUnevictable(page) ||
 545                     (PageActive(page) && !active) ||
 546                     (!PageActive(page) && active)) {
 547                         __mem_cgroup_move_lists(pc, page_lru(page));
 548                         continue;
 549                 }
 550
 551                 scan++;
 552                 list_move(&pc->lru, &pc_list);
 553
 554                 if (__isolate_lru_page(page, mode, file) == 0) {
 555                         list_move(&page->lru, dst);
 556                         nr_taken++;
 557                 }
 558         }
 559
 560         list_splice(&pc_list, src);
 561         spin_unlock(&mz->lru_lock);
 562
 563         *scanned = scan;
 564         return nr_taken;
 565 }
 566
 567 /*
 568  * Charge the memory controller for page usage.
 569  * Return
 570  * 0 if the charge was successful
 571  * < 0 if the cgroup is over its limit
 572  */
 573 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 574                                 gfp_t gfp_mask, enum charge_type ctype,
 575                                 struct mem_cgroup *memcg)
 576 {
 577         struct mem_cgroup *mem;
 578         struct page_cgroup *pc;
 579         unsigned long flags;
 580         unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 581         struct mem_cgroup_per_zone *mz;
 582
 583         pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
 584         if (unlikely(pc == NULL))
 585                 goto err;
 586
 587         /*
 588          * We always charge the cgroup the mm_struct belongs to.
 589          * The mm_struct's mem_cgroup changes on task migration if the
 590          * thread group leader migrates. It's possible that mm is not
 591          * set, if so charge the init_mm (happens for pagecache usage).
 592          */
 593         if (likely(!memcg)) {
 594                 rcu_read_lock();
 595                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 596                 if (unlikely(!mem)) {
 597                         rcu_read_unlock();
 598                         kmem_cache_free(page_cgroup_cache, pc);
 599                         return 0;
 600                 }
 601                 /*
 602                  * For every charge from the cgroup, increment reference count
 603                  */
 604                 css_get(&mem->css);
 605                 rcu_read_unlock();
 606         } else {
 607                 mem = memcg;
 608                 css_get(&memcg->css);
 609         }
 610
 611         while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
 612                 if (!(gfp_mask & __GFP_WAIT))
 613                         goto out;
 614
 615                 if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
 616                         continue;
 617
 618                 /*
 619                  * try_to_free_mem_cgroup_pages() might not give us a full
 620                  * picture of reclaim. Some pages are reclaimed and might be
 621                  * moved to swap cache or just unmapped from the cgroup.
 622                  * Check the limit again to see if the reclaim reduced the
 623                  * current usage of the cgroup before giving up
 624                  */
 625                 if (res_counter_check_under_limit(&mem->res))
 626                         continue;
 627
 628                 if (!nr_retries--) {
 629                         mem_cgroup_out_of_memory(mem, gfp_mask);
 630                         goto out;
 631                 }
 632         }
 633
 634         pc->mem_cgroup = mem;
 635         pc->page = page;
 636         /*
 637          * If a page is accounted as a page cache, insert to inactive list.
 638          * If anon, insert to active list.
 639          */
 640         pc->flags = pcg_default_flags[ctype];
 641
 642         lock_page_cgroup(page);
 643         if (unlikely(page_get_page_cgroup(page))) {
 644                 unlock_page_cgroup(page);
 645                 res_counter_uncharge(&mem->res, PAGE_SIZE);
 646                 css_put(&mem->css);
 647                 kmem_cache_free(page_cgroup_cache, pc);
 648                 goto done;
 649         }
 650         page_assign_page_cgroup(page, pc);
 651
 652         mz = page_cgroup_zoneinfo(pc);
 653         spin_lock_irqsave(&mz->lru_lock, flags);
 654         __mem_cgroup_add_list(mz, pc);
 655         spin_unlock_irqrestore(&mz->lru_lock, flags);
 656
 657         unlock_page_cgroup(page);
 658 done:
 659         return 0;
 660 out:
 661         css_put(&mem->css);
 662         kmem_cache_free(page_cgroup_cache, pc);
 663 err:
 664         return -ENOMEM;
 665 }
 666
 667 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 668 {
 669         if (mem_cgroup_subsys.disabled)
 670                 return 0;
 671
 672         /*
 673          * If already mapped, we don't have to account.
 674          * If page cache, page->mapping has address_space.
 675          * But page->mapping may have out-of-use anon_vma pointer,
 676          * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
 677          * is NULL.
 678          */
 679         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
 680                 return 0;
 681         if (unlikely(!mm))
 682                 mm = &init_mm;
 683         return mem_cgroup_charge_common(page, mm, gfp_mask,
 684                                 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 685 }
 686
 687 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 688                                 gfp_t gfp_mask)
 689 {
 690         if (mem_cgroup_subsys.disabled)
 691                 return 0;
 692
 693         /*
 694          * Corner case handling. This is called from add_to_page_cache()
 695          * in usual. But some FS (shmem) precharges this page before calling it
 696          * and call add_to_page_cache() with GFP_NOWAIT.
 697          *
 698          * For GFP_NOWAIT case, the page may be pre-charged before calling
 699          * add_to_page_cache(). (See shmem.c) check it here and avoid to call
 700          * charge twice. (It works but has to pay a bit larger cost.)
 701          */
 702         if (!(gfp_mask & __GFP_WAIT)) {
 703                 struct page_cgroup *pc;
 704
 705                 lock_page_cgroup(page);
 706                 pc = page_get_page_cgroup(page);
 707                 if (pc) {
 708                         VM_BUG_ON(pc->page != page);
 709                         VM_BUG_ON(!pc->mem_cgroup);
 710                         unlock_page_cgroup(page);
 711                         return 0;
 712                 }
 713                 unlock_page_cgroup(page);
 714         }
 715
 716         if (unlikely(!mm))
 717                 mm = &init_mm;
 718
 719         if (page_is_file_cache(page))
 720                 return mem_cgroup_charge_common(page, mm, gfp_mask,
 721                                 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
 722         else
 723                 return mem_cgroup_charge_common(page, mm, gfp_mask,
 724                                 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 725 }
 726
 727 /*
 728  * uncharge if !page_mapped(page)
 729  */
 730 static void
 731 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 732 {
 733         struct page_cgroup *pc;
 734         struct mem_cgroup *mem;
 735         struct mem_cgroup_per_zone *mz;
 736         unsigned long flags;
 737
 738         if (mem_cgroup_subsys.disabled)
 739                 return;
 740
 741         /*
 742          * Check if our page_cgroup is valid
 743          */
 744         lock_page_cgroup(page);
 745         pc = page_get_page_cgroup(page);
 746         if (unlikely(!pc))
 747                 goto unlock;
 748
 749         VM_BUG_ON(pc->page != page);
 750
 751         if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
 752             && ((PageCgroupCache(pc) || page_mapped(page))))
 753                 goto unlock;
 754
 755         mz = page_cgroup_zoneinfo(pc);
 756         spin_lock_irqsave(&mz->lru_lock, flags);
 757         __mem_cgroup_remove_list(mz, pc);
 758         spin_unlock_irqrestore(&mz->lru_lock, flags);
 759
 760         page_assign_page_cgroup(page, NULL);
 761         unlock_page_cgroup(page);
 762
 763         mem = pc->mem_cgroup;
 764         res_counter_uncharge(&mem->res, PAGE_SIZE);
 765         css_put(&mem->css);
 766
 767         kmem_cache_free(page_cgroup_cache, pc);
 768         return;
 769 unlock:
 770         unlock_page_cgroup(page);
 771 }
 772
 773 void mem_cgroup_uncharge_page(struct page *page)
 774 {
 775         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 776 }
 777
 778 void mem_cgroup_uncharge_cache_page(struct page *page)
 779 {
 780         VM_BUG_ON(page_mapped(page));
 781         VM_BUG_ON(page->mapping);
 782         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 783 }
 784
 785 /*
 786  * Before starting migration, account against new page.
 787  */
 788 int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 789 {
 790         struct page_cgroup *pc;
 791         struct mem_cgroup *mem = NULL;
 792         enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
 793         int ret = 0;
 794
 795         if (mem_cgroup_subsys.disabled)
 796                 return 0;
 797
 798         lock_page_cgroup(page);
 799         pc = page_get_page_cgroup(page);
 800         if (pc) {
 801                 mem = pc->mem_cgroup;
 802                 css_get(&mem->css);
 803                 if (PageCgroupCache(pc)) {
 804                         if (page_is_file_cache(page))
 805                                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 806                         else
 807                                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 808                 }
 809         }
 810         unlock_page_cgroup(page);
 811         if (mem) {
 812                 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 813                         ctype, mem);
 814                 css_put(&mem->css);
 815         }
 816         return ret;
 817 }
 818
 819 /* remove redundant charge if migration failed*/
 820 void mem_cgroup_end_migration(struct page *newpage)
 821 {
 822         /*
 823          * At success, page->mapping is not NULL.
 824          * special rollback care is necessary when
 825          * 1. at migration failure. (newpage->mapping is cleared in this case)
 826          * 2. the newpage was moved but not remapped again because the task
 827          *    exits and the newpage is obsolete. In this case, the new page
 828          *    may be a swapcache. So, we just call mem_cgroup_uncharge_page()
 829          *    always for avoiding mess. The  page_cgroup will be removed if
 830          *    unnecessary. File cache pages is still on radix-tree. Don't
 831          *    care it.
 832          */
 833         if (!newpage->mapping)
 834                 __mem_cgroup_uncharge_common(newpage,
 835                                          MEM_CGROUP_CHARGE_TYPE_FORCE);
 836         else if (PageAnon(newpage))
 837                 mem_cgroup_uncharge_page(newpage);
 838 }
 839
 840 /*
 841  * A call to try to shrink memory usage under specified resource controller.
 842  * This is typically used for page reclaiming for shmem for reducing side
 843  * effect of page allocation from shmem, which is used by some mem_cgroup.
 844  */
 845 int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 846 {
 847         struct mem_cgroup *mem;
 848         int progress = 0;
 849         int retry = MEM_CGROUP_RECLAIM_RETRIES;
 850
 851         if (mem_cgroup_subsys.disabled)
 852                 return 0;
 853         if (!mm)
 854                 return 0;
 855
 856         rcu_read_lock();
 857         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 858         if (unlikely(!mem)) {
 859                 rcu_read_unlock();
 860                 return 0;
 861         }
 862         css_get(&mem->css);
 863         rcu_read_unlock();
 864
 865         do {
 866                 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
 867                 progress += res_counter_check_under_limit(&mem->res);
 868         } while (!progress && --retry);
 869
 870         css_put(&mem->css);
 871         if (!retry)
 872                 return -ENOMEM;
 873         return 0;
 874 }
 875
 876 int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 877 {
 878
 879         int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
 880         int progress;
 881         int ret = 0;
 882
 883         while (res_counter_set_limit(&memcg->res, val)) {
 884                 if (signal_pending(current)) {
 885                         ret = -EINTR;
 886                         break;
 887                 }
 888                 if (!retry_count) {
 889                         ret = -EBUSY;
 890                         break;
 891                 }
 892                 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
 893                 if (!progress)
 894                         retry_count--;
 895         }
 896         return ret;
 897 }
 898
 899
 900 /*
 901  * This routine traverse page_cgroup in given list and drop them all.
 902  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
 903  */
 904 #define FORCE_UNCHARGE_BATCH    (128)
 905 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 906                             struct mem_cgroup_per_zone *mz,
 907                             enum lru_list lru)
 908 {
 909         struct page_cgroup *pc;
 910         struct page *page;
 911         int count = FORCE_UNCHARGE_BATCH;
 912         unsigned long flags;
 913         struct list_head *list;
 914
 915         list = &mz->lists[lru];
 916
 917         spin_lock_irqsave(&mz->lru_lock, flags);
 918         while (!list_empty(list)) {
 919                 pc = list_entry(list->prev, struct page_cgroup, lru);
 920                 page = pc->page;
 921                 get_page(page);
 922                 spin_unlock_irqrestore(&mz->lru_lock, flags);
 923                 /*
 924                  * Check if this page is on LRU. !LRU page can be found
 925                  * if it's under page migration.
 926                  */
 927                 if (PageLRU(page)) {
 928                         __mem_cgroup_uncharge_common(page,
 929                                         MEM_CGROUP_CHARGE_TYPE_FORCE);
 930                         put_page(page);
 931                         if (--count <= 0) {
 932                                 count = FORCE_UNCHARGE_BATCH;
 933                                 cond_resched();
 934                         }
 935                 } else
 936                         cond_resched();
 937                 spin_lock_irqsave(&mz->lru_lock, flags);
 938         }
 939         spin_unlock_irqrestore(&mz->lru_lock, flags);
 940 }
 941
 942 /*
 943  * make mem_cgroup's charge to be 0 if there is no task.
 944  * This enables deleting this mem_cgroup.
 945  */
 946 static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 947 {
 948         int ret = -EBUSY;
 949         int node, zid;
 950
 951         css_get(&mem->css);
 952         /*
 953          * page reclaim code (kswapd etc..) will move pages between
 954          * active_list <-> inactive_list while we don't take a lock.
 955          * So, we have to do loop here until all lists are empty.
 956          */
 957         while (mem->res.usage > 0) {
 958                 if (atomic_read(&mem->css.cgroup->count) > 0)
 959                         goto out;
 960                 for_each_node_state(node, N_POSSIBLE)
 961                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 962                                 struct mem_cgroup_per_zone *mz;
 963                                 enum lru_list l;
 964                                 mz = mem_cgroup_zoneinfo(mem, node, zid);
 965                                 for_each_lru(l)
 966                                         mem_cgroup_force_empty_list(mem, mz, l);
 967                         }
 968         }
 969         ret = 0;
 970 out:
 971         css_put(&mem->css);
 972         return ret;
 973 }
 974
 975 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 976 {
 977         return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
 978                                     cft->private);
 979 }
 980 /*
 981  * The user of this function is...
 982  * RES_LIMIT.
 983  */
 984 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 985                             const char *buffer)
 986 {
 987         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 988         unsigned long long val;
 989         int ret;
 990
 991         switch (cft->private) {
 992         case RES_LIMIT:
 993                 /* This function does all necessary parse...reuse it */
 994                 ret = res_counter_memparse_write_strategy(buffer, &val);
 995                 if (!ret)
 996                         ret = mem_cgroup_resize_limit(memcg, val);
 997                 break;
 998         default:
 999                 ret = -EINVAL; /* should be BUG() ? */
1000                 break;
1001         }
1002         return ret;
1003 }
1004
1005 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1006 {
1007         struct mem_cgroup *mem;
1008
1009         mem = mem_cgroup_from_cont(cont);
1010         switch (event) {
1011         case RES_MAX_USAGE:
1012                 res_counter_reset_max(&mem->res);
1013                 break;
1014         case RES_FAILCNT:
1015                 res_counter_reset_failcnt(&mem->res);
1016                 break;
1017         }
1018         return 0;
1019 }
1020
/*
 * trigger handler for "force_empty": force-empties the cgroup's mem_cgroup
 * and returns the result of mem_cgroup_force_empty(). @event is not used.
 */
static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
{
        struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

        return mem_cgroup_force_empty(memcg);
}
1025
1026 static const struct mem_cgroup_stat_desc {
1027         const char *msg;
1028         u64 unit;
1029 } mem_cgroup_stat_desc[] = {
1030         [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1031         [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1032         [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1033         [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1034 };
1035
1036 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1037                                  struct cgroup_map_cb *cb)
1038 {
1039         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1040         struct mem_cgroup_stat *stat = &mem_cont->stat;
1041         int i;
1042
1043         for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1044                 s64 val;
1045
1046                 val = mem_cgroup_read_stat(stat, i);
1047                 val *= mem_cgroup_stat_desc[i].unit;
1048                 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1049         }
1050         /* showing # of active pages */
1051         {
1052                 unsigned long active_anon, inactive_anon;
1053                 unsigned long active_file, inactive_file;
1054                 unsigned long unevictable;
1055
1056                 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1057                                                 LRU_INACTIVE_ANON);
1058                 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1059                                                 LRU_ACTIVE_ANON);
1060                 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1061                                                 LRU_INACTIVE_FILE);
1062                 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1063                                                 LRU_ACTIVE_FILE);
1064                 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1065                                                         LRU_UNEVICTABLE);
1066
1067                 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1068                 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1069                 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1070                 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1071                 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1072
1073         }
1074         return 0;
1075 }
1076
1077 static struct cftype mem_cgroup_files[] = {
1078         {
1079                 .name = "usage_in_bytes",
1080                 .private = RES_USAGE,
1081                 .read_u64 = mem_cgroup_read,
1082         },
1083         {
1084                 .name = "max_usage_in_bytes",
1085                 .private = RES_MAX_USAGE,
1086                 .trigger = mem_cgroup_reset,
1087                 .read_u64 = mem_cgroup_read,
1088         },
1089         {
1090                 .name = "limit_in_bytes",
1091                 .private = RES_LIMIT,
1092                 .write_string = mem_cgroup_write,
1093                 .read_u64 = mem_cgroup_read,
1094         },
1095         {
1096                 .name = "failcnt",
1097                 .private = RES_FAILCNT,
1098                 .trigger = mem_cgroup_reset,
1099                 .read_u64 = mem_cgroup_read,
1100         },
1101         {
1102                 .name = "force_empty",
1103                 .trigger = mem_force_empty_write,
1104         },
1105         {
1106                 .name = "stat",
1107                 .read_map = mem_control_stat_show,
1108         },
1109 };
1110
1111 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1112 {
1113         struct mem_cgroup_per_node *pn;
1114         struct mem_cgroup_per_zone *mz;
1115         enum lru_list l;
1116         int zone, tmp = node;
1117         /*
1118          * This routine is called against possible nodes.
1119          * But it's BUG to call kmalloc() against offline node.
1120          *
1121          * TODO: this routine can waste much memory for nodes which will
1122          *       never be onlined. It's better to use memory hotplug callback
1123          *       function.
1124          */
1125         if (!node_state(node, N_NORMAL_MEMORY))
1126                 tmp = -1;
1127         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
1128         if (!pn)
1129                 return 1;
1130
1131         mem->info.nodeinfo[node] = pn;
1132         memset(pn, 0, sizeof(*pn));
1133
1134         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1135                 mz = &pn->zoneinfo[zone];
1136                 spin_lock_init(&mz->lru_lock);
1137                 for_each_lru(l)
1138                         INIT_LIST_HEAD(&mz->lists[l]);
1139         }
1140         return 0;
1141 }
1142
1143 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1144 {
1145         kfree(mem->info.nodeinfo[node]);
1146 }
1147
1148 static struct mem_cgroup *mem_cgroup_alloc(void)
1149 {
1150         struct mem_cgroup *mem;
1151
1152         if (sizeof(*mem) < PAGE_SIZE)
1153                 mem = kmalloc(sizeof(*mem), GFP_KERNEL);
1154         else
1155                 mem = vmalloc(sizeof(*mem));
1156
1157         if (mem)
1158                 memset(mem, 0, sizeof(*mem));
1159         return mem;
1160 }
1161
1162 static void mem_cgroup_free(struct mem_cgroup *mem)
1163 {
1164         if (sizeof(*mem) < PAGE_SIZE)
1165                 kfree(mem);
1166         else
1167                 vfree(mem);
1168 }
1169
1170
1171 static struct cgroup_subsys_state *
1172 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1173 {
1174         struct mem_cgroup *mem;
1175         int node;
1176
1177         if (unlikely((cont->parent) == NULL)) {
1178                 mem = &init_mem_cgroup;
1179                 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1180         } else {
1181                 mem = mem_cgroup_alloc();
1182                 if (!mem)
1183                         return ERR_PTR(-ENOMEM);
1184         }
1185
1186         res_counter_init(&mem->res);
1187
1188         for_each_node_state(node, N_POSSIBLE)
1189                 if (alloc_mem_cgroup_per_zone_info(mem, node))
1190                         goto free_out;
1191
1192         return &mem->css;
1193 free_out:
1194         for_each_node_state(node, N_POSSIBLE)
1195                 free_mem_cgroup_per_zone_info(mem, node);
1196         if (cont->parent != NULL)
1197                 mem_cgroup_free(mem);
1198         return ERR_PTR(-ENOMEM);
1199 }
1200
/*
 * .pre_destroy callback: force-empty the cgroup's mem_cgroup. The result of
 * mem_cgroup_force_empty() is discarded, as this callback returns void.
 */
static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
{
        mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
}
1207
1208 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1209                                 struct cgroup *cont)
1210 {
1211         int node;
1212         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1213
1214         for_each_node_state(node, N_POSSIBLE)
1215                 free_mem_cgroup_per_zone_info(mem, node);
1216
1217         mem_cgroup_free(mem_cgroup_from_cont(cont));
1218 }
1219
1220 static int mem_cgroup_populate(struct cgroup_subsys *ss,
1221                                 struct cgroup *cont)
1222 {
1223         return cgroup_add_files(cont, ss, mem_cgroup_files,
1224                                         ARRAY_SIZE(mem_cgroup_files));
1225 }
1226
1227 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1228                                 struct cgroup *cont,
1229                                 struct cgroup *old_cont,
1230                                 struct task_struct *p)
1231 {
1232         struct mm_struct *mm;
1233         struct mem_cgroup *mem, *old_mem;
1234
1235         mm = get_task_mm(p);
1236         if (mm == NULL)
1237                 return;
1238
1239         mem = mem_cgroup_from_cont(cont);
1240         old_mem = mem_cgroup_from_cont(old_cont);
1241
1242         /*
1243          * Only thread group leaders are allowed to migrate, the mm_struct is
1244          * in effect owned by the leader
1245          */
1246         if (!thread_group_leader(p))
1247                 goto out;
1248
1249 out:
1250         mmput(mm);
1251 }
1252
1253 struct cgroup_subsys mem_cgroup_subsys = {
1254         .name = "memory",
1255         .subsys_id = mem_cgroup_subsys_id,
1256         .create = mem_cgroup_create,
1257         .pre_destroy = mem_cgroup_pre_destroy,
1258         .destroy = mem_cgroup_destroy,
1259         .populate = mem_cgroup_populate,
1260         .attach = mem_cgroup_move_task,
1261         .early_init = 0,
1262 };