Merge commit 'v2.6.28-rc2' into core/locking
author    Ingo Molnar <mingo@elte.hu>
Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
committer Ingo Molnar <mingo@elte.hu>
Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
Conflicts:
arch/um/include/asm/system.h

arch/um/include/asm/system.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uaccess_32.h
arch/x86/include/asm/uaccess_64.h
arch/x86/lib/usercopy_32.c
include/linux/kernel.h
kernel/sched.c
mm/memory.c

index f1ea4da34fadb8d75c2fb740d5a1464ecc6e9adb,753346e2cdfd64d5d472f3388014952f5d0ca4ff..ae5f94d6317d584c051fbcea83533431e4b635d7
@@@ -1,19 -1,7 +1,7 @@@
  #ifndef __UM_SYSTEM_GENERIC_H
  #define __UM_SYSTEM_GENERIC_H
  
- #include "asm/arch/system.h"
- #undef switch_to
- #undef raw_local_irq_save
- #undef raw_local_irq_restore
- #undef raw_local_irq_disable
- #undef raw_local_irq_enable
- #undef raw_local_save_flags
- #undef raw_local_irq_restore
- #undef raw_local_irq_enable
- #undef raw_local_irq_disable
- #undef raw_local_irq_save
- #undef irqs_disabled
+ #include "sysdep/system.h"
  
  extern void *switch_to(void *prev, void *next, void *last);
  
@@@ -23,21 -11,21 +11,21 @@@ extern int get_signals(void)
  extern void block_signals(void);
  extern void unblock_signals(void);
  
 -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
 +#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
                                     (flags) = get_signals(); } while(0)
 -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
 +#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
                                      set_signals(flags); } while(0)
  
 -#define local_irq_save(flags) do { local_save_flags(flags); \
 -                                   local_irq_disable(); } while(0)
 +#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
 +                                   raw_local_irq_disable(); } while(0)
  
 -#define local_irq_enable() unblock_signals()
 -#define local_irq_disable() block_signals()
 +#define raw_local_irq_enable() unblock_signals()
 +#define raw_local_irq_disable() block_signals()
  
  #define irqs_disabled()                 \
  ({                                      \
          unsigned long flags;            \
 -        local_save_flags(flags);        \
 +        raw_local_save_flags(flags);        \
          (flags == 0);                   \
  })
  
index dc8edb5c46593dad80aa46618e9e8fa3ff296152,35c54921b2e434cdd95bbd223077f2b1155d0f1f..99192bb55a53bf68afc30efeae89b9b40b5f4b4d
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef _ASM_UACCES_H_
- #define _ASM_UACCES_H_
+ #ifndef _ASM_X86_UACCESS_H
+ #define _ASM_X86_UACCESS_H
  /*
   * User space memory access functions
   */
@@@ -157,7 -157,6 +157,7 @@@ extern int __get_user_bad(void)
        int __ret_gu;                                                   \
        unsigned long __val_gu;                                         \
        __chk_user_ptr(ptr);                                            \
 +      might_fault();                                                  \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
                __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@@ -242,7 -241,6 +242,7 @@@ extern void __put_user_8(void)
        int __ret_pu;                                           \
        __typeof__(*(ptr)) __pu_val;                            \
        __chk_user_ptr(ptr);                                    \
 +      might_fault();                                          \
        __pu_val = x;                                           \
        switch (sizeof(*(ptr))) {                               \
        case 1:                                                 \
@@@ -452,5 -450,5 +452,5 @@@ extern struct movsl_mask 
  # include "uaccess_64.h"
  #endif
  
- #endif
+ #endif /* _ASM_X86_UACCESS_H */
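
With might_fault() now at the top of get_user()/put_user(), every user-copy fast path documents that it may sleep and, under CONFIG_PROVE_LOCKING (see the kernel.h and mm/memory.c hunks below), also records a read of mmap_sem with lockdep. A hypothetical call site, just to illustrate what the annotation checks (not part of this diff):

/* Hypothetical driver helper. */
static int read_user_value(int __user *uptr, int *out)
{
        /*
         * get_user() may fault and therefore sleep; calling this helper
         * from atomic context, or while holding mmap_sem in a conflicting
         * way, is now reported even if the page happens to be resident.
         */
        if (get_user(*out, uptr))
                return -EFAULT;
        return 0;
}
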
  
index d10e842ec3eed2d9cc97fcee885b2cc9be17a0b5,d095a3aeea1b44d3063f0165c956d0a156c6ca38..5e06259e90e5a736539948ae22e1de25444ba485
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef __i386_UACCESS_H
- #define __i386_UACCESS_H
+ #ifndef _ASM_X86_UACCESS_32_H
+ #define _ASM_X86_UACCESS_32_H
  
  /*
   * User space memory access functions
@@@ -82,8 -82,8 +82,8 @@@ __copy_to_user_inatomic(void __user *to
  static __always_inline unsigned long __must_check
  __copy_to_user(void __user *to, const void *from, unsigned long n)
  {
 -       might_sleep();
 -       return __copy_to_user_inatomic(to, from, n);
 +      might_fault();
 +      return __copy_to_user_inatomic(to, from, n);
  }
  
  static __always_inline unsigned long
@@@ -137,7 -137,7 +137,7 @@@ __copy_from_user_inatomic(void *to, con
  static __always_inline unsigned long
  __copy_from_user(void *to, const void __user *from, unsigned long n)
  {
 -      might_sleep();
 +      might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
  
  static __always_inline unsigned long __copy_from_user_nocache(void *to,
                                const void __user *from, unsigned long n)
  {
 -      might_sleep();
 +      might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
  
@@@ -215,4 -215,4 +215,4 @@@ long strnlen_user(const char __user *st
  unsigned long __must_check clear_user(void __user *mem, unsigned long len);
  unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
  
- #endif /* __i386_UACCESS_H */
+ #endif /* _ASM_X86_UACCESS_32_H */
index 13fd56fbc3aba9791d0fc5b9655f311eb4c8436c,664f15280f14354dc057e1d97954db6baab4b959..543ba883cc66200ff0e2206aec36b434b63d3695
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef __X86_64_UACCESS_H
- #define __X86_64_UACCESS_H
+ #ifndef _ASM_X86_UACCESS_64_H
+ #define _ASM_X86_UACCESS_64_H
  
  /*
   * User space memory access functions
@@@ -7,6 -7,7 +7,7 @@@
  #include <linux/compiler.h>
  #include <linux/errno.h>
  #include <linux/prefetch.h>
+ #include <linux/lockdep.h>
  #include <asm/page.h>
  
  /*
@@@ -28,8 -29,6 +29,8 @@@ static __always_inline __must_chec
  int __copy_from_user(void *dst, const void __user *src, unsigned size)
  {
        int ret = 0;
 +
 +      might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
@@@ -72,8 -71,6 +73,8 @@@ static __always_inline __must_chec
  int __copy_to_user(void __user *dst, const void *src, unsigned size)
  {
        int ret = 0;
 +
 +      might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@@ -116,8 -113,6 +117,8 @@@ static __always_inline __must_chec
  int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
  {
        int ret = 0;
 +
 +      might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst,
                                         (__force void *)src, size);
@@@ -204,4 -199,4 +205,4 @@@ static inline int __copy_from_user_inat
  unsigned long
  copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
  
- #endif /* __X86_64_UACCESS_H */
+ #endif /* _ASM_X86_UACCESS_64_H */
index fab5faba1d3e5ff4d4f1778a3692d0a51c4475fc,9e68075544f6dbb5e9a6fbc002e3a2df3091dc6c..4a20b2f9a381a360b46246c2c21c941258c1367a
  #include <asm/uaccess.h>
  #include <asm/mmx.h>
  
+ #ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+  * Alignment at which movsl is preferred for bulk memory copies.
+  */
+ struct movsl_mask movsl_mask __read_mostly;
+ #endif
  static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
  {
  #ifdef CONFIG_X86_INTEL_USERCOPY
@@@ -32,7 -39,7 +39,7 @@@
  #define __do_strncpy_from_user(dst, src, count, res)                     \
  do {                                                                     \
        int __d0, __d1, __d2;                                              \
 -      might_sleep();                                                     \
 +      might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testl %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@@ -119,7 -126,7 +126,7 @@@ EXPORT_SYMBOL(strncpy_from_user)
  #define __do_clear_user(addr,size)                                    \
  do {                                                                  \
        int __d0;                                                       \
 -      might_sleep();                                                  \
 +      might_fault();                                                  \
        __asm__ __volatile__(                                           \
                "0:     rep; stosl\n"                                   \
                "       movl %2,%0\n"                                   \
  unsigned long
  clear_user(void __user *to, unsigned long n)
  {
 -      might_sleep();
 +      might_fault();
        if (access_ok(VERIFY_WRITE, to, n))
                __do_clear_user(to, n);
        return n;
@@@ -190,7 -197,7 +197,7 @@@ long strnlen_user(const char __user *s
        unsigned long mask = -__addr_ok(s);
        unsigned long res, tmp;
  
 -      might_sleep();
 +      might_fault();
  
        __asm__ __volatile__(
                "       testl %0, %0\n"
diff --combined include/linux/kernel.h
index e580ec095765ed1041e9fa7c61c2c527b5826744,396a350b87a60b79935a9b738c34783581319a2b..fa2853b49f70a4bb26e255c679f69eefb83fc6c8
@@@ -16,6 -16,7 +16,7 @@@
  #include <linux/log2.h>
  #include <linux/typecheck.h>
  #include <linux/ratelimit.h>
+ #include <linux/dynamic_printk.h>
  #include <asm/byteorder.h>
  #include <asm/bug.h>
  
@@@ -140,15 -141,6 +141,15 @@@ extern int _cond_resched(void)
                (__x < 0) ? -__x : __x;         \
        })
  
 +#ifdef CONFIG_PROVE_LOCKING
 +void might_fault(void);
 +#else
 +static inline void might_fault(void)
 +{
 +      might_sleep();
 +}
 +#endif
 +
  extern struct atomic_notifier_head panic_notifier_list;
  extern long (*panic_blink)(long time);
  NORET_TYPE void panic(const char * fmt, ...)
@@@ -191,7 -183,7 +192,7 @@@ extern int vsscanf(const char *, const 
  
  extern int get_option(char **str, int *pint);
  extern char *get_options(const char *str, int nints, int *ints);
- extern unsigned long long memparse(char *ptr, char **retptr);
+ extern unsigned long long memparse(const char *ptr, char **retptr);
  
  extern int core_kernel_text(unsigned long addr);
  extern int __kernel_text_address(unsigned long addr);
@@@ -199,6 -191,30 +200,30 @@@ extern int kernel_text_address(unsigne
  struct pid;
  extern struct pid *session_of_pgrp(struct pid *pgrp);
  
+ /*
+  * FW_BUG
+  * Add this to a message where you are sure the firmware is buggy or behaves
+  * really stupid or out of spec. Be aware that the responsible BIOS developer
+  * should be able to fix this issue or at least get a concrete idea of the
+  * problem by reading your message without the need of looking at the kernel
+  * code.
+  * 
+  * Use it for definite and high priority BIOS bugs.
+  *
+  * FW_WARN
+  * Use it for not that clear (e.g. could the kernel messed up things already?)
+  * and medium priority BIOS bugs.
+  *
+  * FW_INFO
+  * Use this one if you want to tell the user or vendor about something
+  * suspicious, but generally harmless related to the firmware.
+  *
+  * Use it for information or very low priority BIOS bugs.
+  */
+ #define FW_BUG                "[Firmware Bug]: "
+ #define FW_WARN               "[Firmware Warn]: "
+ #define FW_INFO               "[Firmware Info]: "
  #ifdef CONFIG_PRINTK
  asmlinkage int vprintk(const char *fmt, va_list args)
        __attribute__ ((format (printf, 1, 0)));
@@@ -222,6 -238,9 +247,9 @@@ static inline bool printk_timed_ratelim
                { return false; }
  #endif
  
+ extern int printk_needs_cpu(int cpu);
+ extern void printk_tick(void);
  extern void asmlinkage __attribute__((format(printf, 1, 2)))
        early_printk(const char *fmt, ...);
  
@@@ -244,9 -263,10 +272,10 @@@ extern int oops_in_progress;             /* If set
  extern int panic_timeout;
  extern int panic_on_oops;
  extern int panic_on_unrecovered_nmi;
- extern int tainted;
  extern const char *print_tainted(void);
- extern void add_taint(unsigned);
+ extern void add_taint(unsigned flag);
+ extern int test_taint(unsigned flag);
+ extern unsigned long get_taint(void);
  extern int root_mountflags;
  
  /* Values used for system_state */
@@@ -259,16 -279,17 +288,17 @@@ extern enum system_states 
        SYSTEM_SUSPEND_DISK,
  } system_state;
  
- #define TAINT_PROPRIETARY_MODULE      (1<<0)
- #define TAINT_FORCED_MODULE           (1<<1)
- #define TAINT_UNSAFE_SMP              (1<<2)
- #define TAINT_FORCED_RMMOD            (1<<3)
- #define TAINT_MACHINE_CHECK           (1<<4)
- #define TAINT_BAD_PAGE                        (1<<5)
- #define TAINT_USER                    (1<<6)
- #define TAINT_DIE                     (1<<7)
- #define TAINT_OVERRIDDEN_ACPI_TABLE   (1<<8)
- #define TAINT_WARN                    (1<<9)
+ #define TAINT_PROPRIETARY_MODULE      0
+ #define TAINT_FORCED_MODULE           1
+ #define TAINT_UNSAFE_SMP              2
+ #define TAINT_FORCED_RMMOD            3
+ #define TAINT_MACHINE_CHECK           4
+ #define TAINT_BAD_PAGE                        5
+ #define TAINT_USER                    6
+ #define TAINT_DIE                     7
+ #define TAINT_OVERRIDDEN_ACPI_TABLE   8
+ #define TAINT_WARN                    9
+ #define TAINT_CRAP                    10
  
  extern void dump_stack(void) __cold;
  
@@@ -312,8 -333,12 +342,12 @@@ static inline char *pack_hex_byte(char 
  #define pr_info(fmt, arg...) \
        printk(KERN_INFO fmt, ##arg)
  
- #ifdef DEBUG
  /* If you are writing a driver, please use dev_dbg instead */
+ #if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+ #define pr_debug(fmt, ...) do { \
+       dynamic_pr_debug(fmt, ##__VA_ARGS__); \
+       } while (0)
+ #elif defined(DEBUG)
  #define pr_debug(fmt, arg...) \
        printk(KERN_DEBUG fmt, ##arg)
  #else
@@@ -495,4 -520,9 +529,9 @@@ struct sysinfo 
  #define NUMA_BUILD 0
  #endif
  
+ /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+ #ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+ #endif
  #endif
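
Two of the kernel.h changes above are easy to misread: the FW_* strings are plain printk prefixes, and the TAINT_* constants now name bit positions handed to add_taint()/test_taint() rather than pre-shifted masks. Hypothetical call sites (not part of this diff) to make the intended usage concrete:

printk(KERN_ERR FW_BUG "BIOS reports an overlapping e820 region\n");

add_taint(TAINT_OVERRIDDEN_ACPI_TABLE);
if (test_taint(TAINT_OVERRIDDEN_ACPI_TABLE))
        pr_info("running with an overridden ACPI table\n");
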
diff --combined kernel/sched.c
index ec3bd1f398b34ea37ca6f5e75152c42d99723b8b,6625c3c4b10d06c3f76c371becd615d174244902..0a4dc3b1300b07d7284f626a3c572f186d8c257c
@@@ -55,6 -55,7 +55,7 @@@
  #include <linux/cpuset.h>
  #include <linux/percpu.h>
  #include <linux/kthread.h>
+ #include <linux/proc_fs.h>
  #include <linux/seq_file.h>
  #include <linux/sysctl.h>
  #include <linux/syscalls.h>
@@@ -71,6 -72,7 +72,7 @@@
  #include <linux/debugfs.h>
  #include <linux/ctype.h>
  #include <linux/ftrace.h>
+ #include <trace/sched.h>
  
  #include <asm/tlb.h>
  #include <asm/irq_regs.h>
@@@ -201,14 -203,19 +203,19 @@@ void init_rt_bandwidth(struct rt_bandwi
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+ }
+ static inline int rt_bandwidth_enabled(void)
+ {
+       return sysctl_sched_rt_runtime >= 0;
  }
  
  static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  {
        ktime_t now;
  
-       if (rt_b->rt_runtime == RUNTIME_INF)
+       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                return;
  
        if (hrtimer_active(&rt_b->rt_period_timer))
  
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -298,9 -304,9 +304,9 @@@ static DEFINE_PER_CPU(struct cfs_rq, in
  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
  static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  #endif /* CONFIG_RT_GROUP_SCHED */
- #else /* !CONFIG_FAIR_GROUP_SCHED */
+ #else /* !CONFIG_USER_SCHED */
  #define root_task_group init_task_group
- #endif /* CONFIG_FAIR_GROUP_SCHED */
+ #endif /* CONFIG_USER_SCHED */
  
  /* task_group_lock serializes add/remove of task groups and also changes to
   * a task group's cpu shares.
@@@ -604,9 -610,9 +610,9 @@@ struct rq 
  
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  
- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
  {
-       rq->curr->sched_class->check_preempt_curr(rq, p);
+       rq->curr->sched_class->check_preempt_curr(rq, p, sync);
  }
  
  static inline int cpu_of(struct rq *rq)
@@@ -812,6 -818,13 +818,13 @@@ const_debug unsigned int sysctl_sched_n
   */
  unsigned int sysctl_sched_shares_ratelimit = 250000;
  
+ /*
+  * Inject some fuzzyness into changing the per-cpu group shares
+  * this avoids remote rq-locks at the expense of fairness.
+  * default: 4
+  */
+ unsigned int sysctl_sched_shares_thresh = 4;
  /*
   * period over which we measure -rt task cpu usage in us.
   * default: 1s
@@@ -1058,7 -1071,7 +1071,7 @@@ static void hrtick_start(struct rq *rq
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
  
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
  
        if (rq == this_rq()) {
                hrtimer_restart(timer);
@@@ -1087,7 -1100,7 +1100,7 @@@ hotplug_hrtick(struct notifier_block *n
        return NOTIFY_DONE;
  }
  
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
  {
        hotcpu_notifier(hotplug_hrtick, 0);
  }
@@@ -1102,7 -1115,7 +1115,7 @@@ static void hrtick_start(struct rq *rq
        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
  }
  
- static void init_hrtick(void)
+ static inline void init_hrtick(void)
  {
  }
  #endif /* CONFIG_SMP */
@@@ -1119,9 -1132,9 +1132,9 @@@ static void init_rq_hrtick(struct rq *r
  
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
- #else
+ #else /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
  {
  }
@@@ -1133,7 -1146,7 +1146,7 @@@ static inline void init_rq_hrtick(struc
  static inline void init_hrtick(void)
  {
  }
- #endif
+ #endif        /* CONFIG_SCHED_HRTICK */
  
  /*
   * resched_task - mark a task 'to be rescheduled now'.
@@@ -1380,38 -1393,24 +1393,24 @@@ static inline void dec_cpu_load(struct 
        update_load_sub(&rq->load, load);
  }
  
- #ifdef CONFIG_SMP
- static unsigned long source_load(int cpu, int type);
- static unsigned long target_load(int cpu, int type);
- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-       return rq->avg_load_per_task;
- }
- #ifdef CONFIG_FAIR_GROUP_SCHED
- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+ typedef int (*tg_visitor)(struct task_group *, void *);
  
  /*
   * Iterate the full tree, calling @down when first entering a node and @up when
   * leaving it for the final time.
   */
- static void
- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
  {
        struct task_group *parent, *child;
+       int ret;
  
        rcu_read_lock();
        parent = &root_task_group;
  down:
-       (*down)(parent, cpu, sd);
+       ret = (*down)(parent, data);
+       if (ret)
+               goto out_unlock;
        list_for_each_entry_rcu(child, &parent->children, siblings) {
                parent = child;
                goto down;
  up:
                continue;
        }
-       (*up)(parent, cpu, sd);
+       ret = (*up)(parent, data);
+       if (ret)
+               goto out_unlock;
  
        child = parent;
        parent = parent->parent;
        if (parent)
                goto up;
+ out_unlock:
        rcu_read_unlock();
+       return ret;
+ }
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+       return 0;
+ }
+ #endif
+ #ifdef CONFIG_SMP
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       return rq->avg_load_per_task;
  }
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
  /*
   * Calculate and set the cpu's group shares.
   */
  static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
-                         unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+                       unsigned long sd_shares, unsigned long sd_rq_weight)
  {
        int boost = 0;
        unsigned long shares;
         *
         */
        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
-       /*
-        * record the actual number of shares, not the boosted amount.
-        */
-       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (abs(shares - tg->se[cpu]->load.weight) >
+                       sysctl_sched_shares_thresh) {
+               struct rq *rq = cpu_rq(cpu);
+               unsigned long flags;
  
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
+               spin_lock_irqsave(&rq->lock, flags);
+               /*
+                * record the actual number of shares, not the boosted amount.
+                */
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+               tg->cfs_rq[cpu]->rq_weight = rq_weight;
  
-       __set_se_shares(tg->se[cpu], shares);
+               __set_se_shares(tg->se[cpu], shares);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
  }
  
  /*
   * This needs to be done in a bottom-up fashion because the rq weight of a
   * parent group depends on the shares of its child groups.
   */
- static void
- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_shares_up(struct task_group *tg, void *data)
  {
        unsigned long rq_weight = 0;
        unsigned long shares = 0;
+       struct sched_domain *sd = data;
        int i;
  
        for_each_cpu_mask(i, sd->span) {
        if (!rq_weight)
                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
  
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *rq = cpu_rq(i);
-               unsigned long flags;
+       for_each_cpu_mask(i, sd->span)
+               update_group_shares_cpu(tg, i, shares, rq_weight);
  
-               spin_lock_irqsave(&rq->lock, flags);
-               __update_group_shares_cpu(tg, i, shares, rq_weight);
-               spin_unlock_irqrestore(&rq->lock, flags);
-       }
+       return 0;
  }
  
  /*
   * This needs to be done in a top-down fashion because the load of a child
   * group is a fraction of its parents load.
   */
- static void
- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_load_down(struct task_group *tg, void *data)
  {
        unsigned long load;
+       long cpu = (long)data;
  
        if (!tg->parent) {
                load = cpu_rq(cpu)->load.weight;
        }
  
        tg->cfs_rq[cpu]->h_load = load;
- }
  
- static void
- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
- {
+       return 0;
  }
  
  static void update_shares(struct sched_domain *sd)
  
        if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+               walk_tg_tree(tg_nop, tg_shares_up, sd);
        }
  }
  
@@@ -1561,9 -1585,9 +1585,9 @@@ static void update_shares_locked(struc
        spin_lock(&rq->lock);
  }
  
- static void update_h_load(int cpu)
+ static void update_h_load(long cpu)
  {
-       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
  }
  
  #else
@@@ -1918,14 -1942,12 +1942,12 @@@ unsigned long wait_task_inactive(struc
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(rq, p);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
-               if (!match_state || p->state == match_state) {
-                       ncsw = p->nivcsw + p->nvcsw;
-                       if (unlikely(!ncsw))
-                               ncsw = 1;
-               }
+               if (!match_state || p->state == match_state)
+                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                task_rq_unlock(rq, &flags);
  
                /*
@@@ -2282,10 -2304,8 +2304,8 @@@ out_activate
        success = 1;
  
  out_running:
-       trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       trace_sched_wakeup(rq, p);
+       check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
@@@ -2417,10 -2437,8 +2437,8 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
-       trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       trace_sched_wakeup_new(rq, p);
+       check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
                p->sched_class->task_wake_up(rq, p);
@@@ -2592,11 -2610,7 +2610,7 @@@ context_switch(struct rq *rq, struct ta
        struct mm_struct *mm, *oldmm;
  
        prepare_task_switch(rq, prev, next);
-       trace_mark(kernel_sched_schedule,
-               "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
-               prev->pid, next->pid, prev->state,
-               rq, prev, next);
+       trace_sched_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@@ -2836,6 -2850,7 +2850,7 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
+       trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -2880,7 -2895,7 +2895,7 @@@ static void pull_task(struct rq *src_rq
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
-       check_preempt_curr(this_rq, p);
+       check_preempt_curr(this_rq, p, 0);
  }
  
  /*
@@@ -4037,23 -4052,26 +4052,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
  EXPORT_PER_CPU_SYMBOL(kstat);
  
  /*
-  * Return p->sum_exec_runtime plus any more ns on the sched_clock
-  * that have not yet been banked in case the task is currently running.
+  * Return any ns on the sched_clock that have not yet been banked in
+  * @p in case that task is currently running.
   */
- unsigned long long task_sched_runtime(struct task_struct *p)
+ unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
-       u64 ns, delta_exec;
        struct rq *rq;
+       u64 ns = 0;
  
        rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime;
        if (task_current(rq, p)) {
+               u64 delta_exec;
                update_rq_clock(rq);
                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
-                       ns += delta_exec;
+                       ns = delta_exec;
        }
        task_rq_unlock(rq, &flags);
  
        return ns;
@@@ -4070,6 -4088,7 +4088,7 @@@ void account_user_time(struct task_stru
        cputime64_t tmp;
  
        p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4094,6 -4113,7 +4113,7 @@@ static void account_guest_time(struct t
        tmp = cputime_to_cputime64(cputime);
  
        p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
        cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4129,6 -4149,7 +4149,7 @@@ void account_system_time(struct task_st
        }
  
        p->stime = cputime_add(p->stime, cputime);
+       account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@@ -4170,6 -4191,7 +4191,7 @@@ void account_steal_time(struct task_str
  
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
+               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@@ -4305,7 -4327,7 +4327,7 @@@ void __kprobes sub_preempt_count(int va
        /*
         * Underflow?
         */
 -      if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 +       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@@ -4426,12 -4448,8 +4448,8 @@@ need_resched_nonpreemptible
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
  
-       /*
-        * Do the rq-clock update outside the rq lock:
-        */
-       local_irq_disable();
+       spin_lock_irq(&rq->lock);
        update_rq_clock(rq);
-       spin_lock(&rq->lock);
        clear_tsk_need_resched(prev);
  
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@@ -4627,6 -4645,15 +4645,15 @@@ __wake_up_sync(wait_queue_head_t *q, un
  }
  EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
  
+ /**
+  * complete: - signals a single thread waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up a single thread waiting on this completion. Threads will be
+  * awakened in the same order in which they were queued.
+  *
+  * See also complete_all(), wait_for_completion() and related routines.
+  */
  void complete(struct completion *x)
  {
        unsigned long flags;
  }
  EXPORT_SYMBOL(complete);
  
+ /**
+  * complete_all: - signals all threads waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up all threads waiting on this particular completion event.
+  */
  void complete_all(struct completion *x)
  {
        unsigned long flags;
@@@ -4658,10 -4691,7 +4691,7 @@@ do_wait_for_common(struct completion *x
                wait.flags |= WQ_FLAG_EXCLUSIVE;
                __add_wait_queue_tail(&x->wait, &wait);
                do {
-                       if ((state == TASK_INTERRUPTIBLE &&
-                            signal_pending(current)) ||
-                           (state == TASK_KILLABLE &&
-                            fatal_signal_pending(current))) {
+                       if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
@@@ -4689,12 -4719,31 +4719,31 @@@ wait_for_common(struct completion *x, l
        return timeout;
  }
  
+ /**
+  * wait_for_completion: - waits for completion of a task
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It is NOT
+  * interruptible and there is no timeout.
+  *
+  * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+  * and interrupt capability. Also see complete().
+  */
  void __sched wait_for_completion(struct completion *x)
  {
        wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_for_completion);
  
+ /**
+  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. The timeout is in jiffies. It is not
+  * interruptible.
+  */
  unsigned long __sched
  wait_for_completion_timeout(struct completion *x, unsigned long timeout)
  {
  }
  EXPORT_SYMBOL(wait_for_completion_timeout);
  
+ /**
+  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits for completion of a specific task to be signaled. It is
+  * interruptible.
+  */
  int __sched wait_for_completion_interruptible(struct completion *x)
  {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_for_completion_interruptible);
  
+ /**
+  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+  */
  unsigned long __sched
  wait_for_completion_interruptible_timeout(struct completion *x,
                                          unsigned long timeout)
  }
  EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
  
+ /**
+  * wait_for_completion_killable: - waits for completion of a task (killable)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It can be
+  * interrupted by a kill signal.
+  */
  int __sched wait_for_completion_killable(struct completion *x)
  {
        long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@@ -5121,7 -5192,8 +5192,8 @@@ recheck
                 * Do not allow realtime tasks into groups that have no runtime
                 * assigned.
                 */
-               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0)
                        return -EPERM;
  #endif
  
@@@ -5957,7 -6029,7 +6029,7 @@@ static int __migrate_task(struct task_s
        set_task_cpu(p, dest_cpu);
        if (on_rq) {
                activate_task(rq_dest, p, 0);
-               check_preempt_curr(rq_dest, p);
+               check_preempt_curr(rq_dest, p, 0);
        }
  done:
        ret = 1;
@@@ -6282,7 -6354,7 +6354,7 @@@ set_table_entry(struct ctl_table *entry
  static struct ctl_table *
  sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
-       struct ctl_table *table = sd_alloc_ctl_entry(12);
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
  
        if (table == NULL)
                return NULL;
                sizeof(int), 0644, proc_dointvec_minmax);
        set_table_entry(&table[10], "flags", &sd->flags,
                sizeof(int), 0644, proc_dointvec_minmax);
-       /* &table[11] is terminator */
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
  
        return table;
  }
@@@ -7194,13 -7268,21 +7268,21 @@@ static void init_sched_groups_power(in
   * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
   */
  
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(sd, type)               sd->name = #type
+ #else
+ # define SD_INIT_NAME(sd, type)               do { } while (0)
+ #endif
  #define       SD_INIT(sd, type)       sd_init_##type(sd)
  #define SD_INIT_FUNC(type)    \
  static noinline void sd_init_##type(struct sched_domain *sd)  \
  {                                                             \
        memset(sd, 0, sizeof(*sd));                             \
        *sd = SD_##type##_INIT;                                 \
        sd->level = SD_LV_##type;                               \
+       SD_INIT_NAME(sd, type);                                 \
  }
  
  SD_INIT_FUNC(CPU)
@@@ -8242,20 -8324,25 +8324,25 @@@ void __might_sleep(char *file, int line
  #ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */
  
-       if ((in_atomic() || irqs_disabled()) &&
-           system_state == SYSTEM_RUNNING && !oops_in_progress) {
-               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-                       return;
-               prev_jiffy = jiffies;
-               printk(KERN_ERR "BUG: sleeping function called from invalid"
-                               " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
-               debug_show_held_locks(current);
-               if (irqs_disabled())
-                       print_irqtrace_events(current);
-               dump_stack();
-       }
+       if ((!in_atomic() && !irqs_disabled()) ||
+                   system_state != SYSTEM_RUNNING || oops_in_progress)
+               return;
+       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+               return;
+       prev_jiffy = jiffies;
+       printk(KERN_ERR
+               "BUG: sleeping function called from invalid context at %s:%d\n",
+                       file, line);
+       printk(KERN_ERR
+               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(),
+                       current->pid, current->comm);
+       debug_show_held_locks(current);
+       if (irqs_disabled())
+               print_irqtrace_events(current);
+       dump_stack();
  #endif
  }
  EXPORT_SYMBOL(__might_sleep);
@@@ -8753,73 -8840,95 +8840,95 @@@ static DEFINE_MUTEX(rt_constraints_mute
  static unsigned long to_ratio(u64 period, u64 runtime)
  {
        if (runtime == RUNTIME_INF)
-               return 1ULL << 16;
+               return 1ULL << 20;
  
-       return div64_u64(runtime << 16, period);
+       return div64_u64(runtime << 20, period);
  }
  
- #ifdef CONFIG_CGROUP_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
  {
-       struct task_group *tgi, *parent = tg->parent;
-       unsigned long total = 0;
+       struct task_struct *g, *p;
  
-       if (!parent) {
-               if (global_rt_period() < period)
-                       return 0;
+       do_each_thread(g, p) {
+               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                       return 1;
+       } while_each_thread(g, p);
  
-               return to_ratio(period, runtime) <
-                       to_ratio(global_rt_period(), global_rt_runtime());
-       }
+       return 0;
+ }
  
-       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-               return 0;
+ struct rt_schedulable_data {
+       struct task_group *tg;
+       u64 rt_period;
+       u64 rt_runtime;
+ };
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-               if (tgi == tg)
-                       continue;
+ static int tg_schedulable(struct task_group *tg, void *data)
+ {
+       struct rt_schedulable_data *d = data;
+       struct task_group *child;
+       unsigned long total, sum = 0;
+       u64 period, runtime;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       runtime = tg->rt_bandwidth.rt_runtime;
+       if (tg == d->tg) {
+               period = d->rt_period;
+               runtime = d->rt_runtime;
        }
-       rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) <=
-               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-                               parent->rt_bandwidth.rt_runtime);
- }
- #elif defined CONFIG_USER_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
-       struct task_group *tgi;
-       unsigned long total = 0;
-       unsigned long global_ratio =
-               to_ratio(global_rt_period(), global_rt_runtime());
+       /*
+        * Cannot have more runtime than the period.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &task_groups, list) {
-               if (tgi == tg)
-                       continue;
+       /*
+        * Ensure we don't starve existing RT tasks.
+        */
+       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+               return -EBUSY;
  
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       total = to_ratio(period, runtime);
+       /*
+        * Nobody can have more than the global setting allows.
+        */
+       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+               return -EINVAL;
+       /*
+        * The sum of our children's runtime should not exceed our own.
+        */
+       list_for_each_entry_rcu(child, &tg->children, siblings) {
+               period = ktime_to_ns(child->rt_bandwidth.rt_period);
+               runtime = child->rt_bandwidth.rt_runtime;
+               if (child == d->tg) {
+                       period = d->rt_period;
+                       runtime = d->rt_runtime;
+               }
+               sum += to_ratio(period, runtime);
        }
-       rcu_read_unlock();
  
-       return total + to_ratio(period, runtime) < global_ratio;
+       if (sum > total)
+               return -EINVAL;
+       return 0;
  }
- #endif
  
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_struct *g, *p;
-       do_each_thread(g, p) {
-               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                       return 1;
-       } while_each_thread(g, p);
-       return 0;
+       struct rt_schedulable_data data = {
+               .tg = tg,
+               .rt_period = period,
+               .rt_runtime = runtime,
+       };
+       return walk_tg_tree(tg_schedulable, tg_nop, &data);
  }
  
  static int tg_set_bandwidth(struct task_group *tg,
  
        mutex_lock(&rt_constraints_mutex);
        read_lock(&tasklist_lock);
-       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-               err = -EBUSY;
-               goto unlock;
-       }
-       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-               err = -EINVAL;
+       err = __rt_schedulable(tg, rt_period, rt_runtime);
+       if (err)
                goto unlock;
-       }
  
        spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
        tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@@ -8905,19 -9009,25 +9009,25 @@@ long sched_group_rt_period(struct task_
  
  static int sched_rt_global_constraints(void)
  {
-       struct task_group *tg = &root_task_group;
-       u64 rt_runtime, rt_period;
+       u64 runtime, period;
        int ret = 0;
  
        if (sysctl_sched_rt_period <= 0)
                return -EINVAL;
  
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
+       runtime = global_rt_runtime();
+       period = global_rt_period();
+       /*
+        * Sanity check on the sysctl variables.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
  
        mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(tg, rt_period, rt_runtime))
-               ret = -EINVAL;
+       read_lock(&tasklist_lock);
+       ret = __rt_schedulable(NULL, 0, 0);
+       read_unlock(&tasklist_lock);
        mutex_unlock(&rt_constraints_mutex);
  
        return ret;
@@@ -8991,7 -9101,6 +9101,6 @@@ cpu_cgroup_create(struct cgroup_subsys 
  
        if (!cgrp->parent) {
                /* This is early initialization for the top cgroup */
-               init_task_group.css.cgroup = cgrp;
                return &init_task_group.css;
        }
  
        if (IS_ERR(tg))
                return ERR_PTR(-ENOMEM);
  
-       /* Bind the cgroup to task_group object we just created */
-       tg->css.cgroup = cgrp;
        return &tg->css;
  }
  
diff --combined mm/memory.c
index b8fdf4e5e65b52a812413ffc4797edf4e056f42c,164951c473058a25c081d5e47260d872068cdbb7..fc031d68327e5fad33130b15d50a020b7b9b31a8
@@@ -1129,12 -1129,17 +1129,17 @@@ static inline int use_zero_page(struct 
        return !vma->vm_ops || !vma->vm_ops->fault;
  }
  
- int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-               unsigned long start, int len, int write, int force,
+ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, int len, int flags,
                struct page **pages, struct vm_area_struct **vmas)
  {
        int i;
-       unsigned int vm_flags;
+       unsigned int vm_flags = 0;
+       int write = !!(flags & GUP_FLAGS_WRITE);
+       int force = !!(flags & GUP_FLAGS_FORCE);
+       int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
  
        if (len <= 0)
                return 0;
                        pud_t *pud;
                        pmd_t *pmd;
                        pte_t *pte;
-                       if (write) /* user gate pages are read-only */
+                       /* user gate pages are read-only */
+                       if (!ignore && write)
                                return i ? : -EFAULT;
                        if (pg > TASK_SIZE)
                                pgd = pgd_offset_k(pg);
                        continue;
                }
  
-               if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
-                               || !(vm_flags & vma->vm_flags))
+               if (!vma ||
+                   (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+                   (!ignore && !(vm_flags & vma->vm_flags)))
                        return i ? : -EFAULT;
  
                if (is_vm_hugetlb_page(vma)) {
        } while (len);
        return i;
  }
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, int len, int write, int force,
+               struct page **pages, struct vm_area_struct **vmas)
+ {
+       int flags = 0;
+       if (write)
+               flags |= GUP_FLAGS_WRITE;
+       if (force)
+               flags |= GUP_FLAGS_FORCE;
+       return __get_user_pages(tsk, mm,
+                               start, len, flags,
+                               pages, vmas);
+ }
  EXPORT_SYMBOL(get_user_pages);
  
  pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@@ -1296,18 -1321,14 +1321,14 @@@ static int insert_page(struct vm_area_s
        pte_t *pte;
        spinlock_t *ptl;
  
-       retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
-       if (retval)
-               goto out;
        retval = -EINVAL;
        if (PageAnon(page))
-               goto out_uncharge;
+               goto out;
        retval = -ENOMEM;
        flush_dcache_page(page);
        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
-               goto out_uncharge;
+               goto out;
        retval = -EBUSY;
        if (!pte_none(*pte))
                goto out_unlock;
        return retval;
  out_unlock:
        pte_unmap_unlock(pte, ptl);
- out_uncharge:
-       mem_cgroup_uncharge_page(page);
  out:
        return retval;
  }
@@@ -1858,6 -1877,15 +1877,15 @@@ gotten
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (!new_page)
                goto oom;
+       /*
+        * Don't let another task, with possibly unlocked vma,
+        * keep the mlocked page.
+        */
+       if (vma->vm_flags & VM_LOCKED) {
+               lock_page(old_page);    /* for LRU manipulation */
+               clear_page_mlock(old_page);
+               unlock_page(old_page);
+       }
        cow_user_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
  
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               set_pte_at(mm, address, page_table, entry);
-               update_mmu_cache(vma, address, entry);
-               lru_cache_add_active(new_page);
+               SetPageSwapBacked(new_page);
+               lru_cache_add_active_or_unevictable(new_page, vma);
                page_add_new_anon_rmap(new_page, vma, address);
  
+ //TODO:  is this safe?  do_anonymous_page() does it this way.
+               set_pte_at(mm, address, page_table, entry);
+               update_mmu_cache(vma, address, entry);
                if (old_page) {
                        /*
                         * Only after switching the pte to the new page may
@@@ -2288,16 -2318,17 +2318,17 @@@ static int do_swap_page(struct mm_struc
                count_vm_event(PGMAJFAULT);
        }
  
+       mark_page_accessed(page);
+       lock_page(page);
+       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                ret = VM_FAULT_OOM;
+               unlock_page(page);
                goto out;
        }
  
-       mark_page_accessed(page);
-       lock_page(page);
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        /*
         * Back out if somebody else already faulted in this pte.
         */
        page_add_anon_rmap(page, vma, address);
  
        swap_free(entry);
-       if (vm_swap_full())
+       if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                remove_exclusive_swap_page(page);
        unlock_page(page);
  
@@@ -2382,7 -2413,8 +2413,8 @@@ static int do_anonymous_page(struct mm_
        if (!pte_none(*page_table))
                goto release;
        inc_mm_counter(mm, anon_rss);
-       lru_cache_add_active(page);
+       SetPageSwapBacked(page);
+       lru_cache_add_active_or_unevictable(page, vma);
        page_add_new_anon_rmap(page, vma, address);
        set_pte_at(mm, address, page_table, entry);
  
@@@ -2423,6 -2455,7 +2455,7 @@@ static int __do_fault(struct mm_struct 
        struct page *page;
        pte_t entry;
        int anon = 0;
+       int charged = 0;
        struct page *dirty_page = NULL;
        struct vm_fault vmf;
        int ret;
                                ret = VM_FAULT_OOM;
                                goto out;
                        }
+                       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+                               ret = VM_FAULT_OOM;
+                               page_cache_release(page);
+                               goto out;
+                       }
+                       charged = 1;
+                       /*
+                        * Don't let another task, with possibly unlocked vma,
+                        * keep the mlocked page.
+                        */
+                       if (vma->vm_flags & VM_LOCKED)
+                               clear_page_mlock(vmf.page);
                        copy_user_highpage(page, vmf.page, address, vma);
                        __SetPageUptodate(page);
                } else {
  
        }
  
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-               ret = VM_FAULT_OOM;
-               goto out;
-       }
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
  
        /*
                entry = mk_pte(page, vma->vm_page_prot);
                if (flags & FAULT_FLAG_WRITE)
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               set_pte_at(mm, address, page_table, entry);
                if (anon) {
-                         inc_mm_counter(mm, anon_rss);
-                         lru_cache_add_active(page);
-                         page_add_new_anon_rmap(page, vma, address);
+                       inc_mm_counter(mm, anon_rss);
+                       SetPageSwapBacked(page);
+                       lru_cache_add_active_or_unevictable(page, vma);
+                       page_add_new_anon_rmap(page, vma, address);
                } else {
                        inc_mm_counter(mm, file_rss);
                        page_add_file_rmap(page);
                                get_page(dirty_page);
                        }
                }
+ //TODO:  is this safe?  do_anonymous_page() does it this way.
+               set_pte_at(mm, address, page_table, entry);
  
                /* no need to invalidate: a not-present page won't be cached */
                update_mmu_cache(vma, address, entry);
        } else {
-               mem_cgroup_uncharge_page(page);
+               if (charged)
+                       mem_cgroup_uncharge_page(page);
                if (anon)
                        page_cache_release(page);
                else
@@@ -2772,19 -2815,9 +2815,9 @@@ int make_pages_present(unsigned long ad
        len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
        ret = get_user_pages(current, current->mm, addr,
                        len, write, 0, NULL, NULL);
-       if (ret < 0) {
-               /*
-                  SUS require strange return value to mlock
-                   - invalid addr generate to ENOMEM.
-                   - out of memory should generate EAGAIN.
-               */
-               if (ret == -EFAULT)
-                       ret = -ENOMEM;
-               else if (ret == -ENOMEM)
-                       ret = -EAGAIN;
+       if (ret < 0)
                return ret;
-       }
-       return ret == len ? 0 : -ENOMEM;
+       return ret == len ? 0 : -EFAULT;
  }
  
  #if !defined(__HAVE_ARCH_GATE_AREA)
@@@ -3016,18 -3049,3 +3049,18 @@@ void print_vma_addr(char *prefix, unsig
        }
        up_read(&current->mm->mmap_sem);
  }
 +
 +#ifdef CONFIG_PROVE_LOCKING
 +void might_fault(void)
 +{
 +      might_sleep();
 +      /*
 +       * it would be nicer only to annotate paths which are not under
 +       * pagefault_disable, however that requires a larger audit and
 +       * providing helpers like get_user_atomic.
 +       */
 +      if (!in_atomic() && current->mm)
 +              might_lock_read(&current->mm->mmap_sem);
 +}
 +EXPORT_SYMBOL(might_fault);
 +#endif
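
Taken together with the uaccess hunks above, the new might_fault() turns "this copy may fault" into something lockdep can check even when no fault actually happens. A hypothetical example of the bug class it is meant to flag (not code from this diff): copying from user space while holding mmap_sem for writing, which can deadlock because the fault path takes mmap_sem again.

/* Hypothetical (buggy) helper: lockdep now warns here via might_fault(). */
static int buggy_read_under_mmap_sem(struct mm_struct *mm,
                                     unsigned long __user *arg,
                                     unsigned long *out)
{
        int err = 0;

        down_write(&mm->mmap_sem);
        if (copy_from_user(out, arg, sizeof(*out)))
                err = -EFAULT;
        up_write(&mm->mmap_sem);
        return err;
}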