Merge commit 'v2.6.28-rc2' into core/locking

author Ingo Molnar <mingo@elte.hu>
Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
committer Ingo Molnar <mingo@elte.hu>
Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
diff --combined arch/um/include/asm/system.h

index f1ea4da34fadb8d75c2fb740d5a1464ecc6e9adb,753346e2cdfd64d5d472f3388014952f5d0ca4ff..ae5f94d6317d584c051fbcea83533431e4b635d7
--- 1/include/asm-um/system-generic.h
--- 2/arch/um/include/asm/system.h
+++ b/arch/um/include/asm/system.h
@@@ -1,19 -1,7 +1,7 @@@
   #ifndef __UM_SYSTEM_GENERIC_H
   #define __UM_SYSTEM_GENERIC_H
   
- #include "asm/arch/system.h"
- 
- #undef switch_to
- #undef raw_local_irq_save
- #undef raw_local_irq_restore
- #undef raw_local_irq_disable
- #undef raw_local_irq_enable
- #undef raw_local_save_flags
- #undef raw_local_irq_restore
- #undef raw_local_irq_enable
- #undef raw_local_irq_disable
- #undef raw_local_irq_save
- #undef irqs_disabled
+ #include "sysdep/system.h"
   
   extern void *switch_to(void *prev, void *next, void *last);
   
@@@ -23,21 -11,21 +11,21 @@@ extern int get_signals(void)
   extern void block_signals(void);
   extern void unblock_signals(void);
   
- -#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
+ +#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
                                      (flags) = get_signals(); } while(0)
- -#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+ +#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
                                       set_signals(flags); } while(0)
   
- -#define local_irq_save(flags) do { local_save_flags(flags); \
- -                                   local_irq_disable(); } while(0)
+ +#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
+ +                                   raw_local_irq_disable(); } while(0)
   
- -#define local_irq_enable() unblock_signals()
- -#define local_irq_disable() block_signals()
+ +#define raw_local_irq_enable() unblock_signals()
+ +#define raw_local_irq_disable() block_signals()
   
   #define irqs_disabled()                 \
   ({                                      \
           unsigned long flags;            \
- -        local_save_flags(flags);        \
+ +        raw_local_save_flags(flags);        \
           (flags == 0);                   \
   })
   
diff --combined arch/x86/include/asm/uaccess.h

index dc8edb5c46593dad80aa46618e9e8fa3ff296152,35c54921b2e434cdd95bbd223077f2b1155d0f1f..99192bb55a53bf68afc30efeae89b9b40b5f4b4d
--- 1/include/asm-x86/uaccess.h
--- 2/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef _ASM_UACCES_H_
- #define _ASM_UACCES_H_
+ #ifndef _ASM_X86_UACCESS_H
+ #define _ASM_X86_UACCESS_H
   /*
    * User space memory access functions
    */
@@@ -157,7 -157,6 +157,7 @@@ extern int __get_user_bad(void)
         int __ret_gu;                                                   \
         unsigned long __val_gu;                                         \
         __chk_user_ptr(ptr);                                            \
+ +      might_fault();                                                  \
         switch (sizeof(*(ptr))) {                                       \
         case 1:                                                         \
                 __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@@ -242,7 -241,6 +242,7 @@@ extern void __put_user_8(void)
         int __ret_pu;                                           \
         __typeof__(*(ptr)) __pu_val;                            \
         __chk_user_ptr(ptr);                                    \
+ +      might_fault();                                          \
         __pu_val = x;                                           \
         switch (sizeof(*(ptr))) {                               \
         case 1:                                                 \
@@@ -452,5 -450,5 +452,5 @@@ extern struct movsl_mask 
   # include "uaccess_64.h"
   #endif
   
- #endif
+ #endif /* _ASM_X86_UACCESS_H */
   
diff --combined arch/x86/include/asm/uaccess_32.h

index d10e842ec3eed2d9cc97fcee885b2cc9be17a0b5,d095a3aeea1b44d3063f0165c956d0a156c6ca38..5e06259e90e5a736539948ae22e1de25444ba485
--- 1/include/asm-x86/uaccess_32.h
--- 2/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef __i386_UACCESS_H
- #define __i386_UACCESS_H
+ #ifndef _ASM_X86_UACCESS_32_H
+ #define _ASM_X86_UACCESS_32_H
   
   /*
    * User space memory access functions
@@@ -82,8 -82,8 +82,8 @@@ __copy_to_user_inatomic(void __user *to
   static __always_inline unsigned long __must_check
   __copy_to_user(void __user *to, const void *from, unsigned long n)
   {
- -       might_sleep();
- -       return __copy_to_user_inatomic(to, from, n);
+ +      might_fault();
+ +      return __copy_to_user_inatomic(to, from, n);
   }
   
   static __always_inline unsigned long
@@@ -137,7 -137,7 +137,7 @@@ __copy_from_user_inatomic(void *to, con
   static __always_inline unsigned long
   __copy_from_user(void *to, const void __user *from, unsigned long n)
   {
- -      might_sleep();
+ +      might_fault();
         if (__builtin_constant_p(n)) {
                 unsigned long ret;
   
@@@ -159,7 -159,7 +159,7 @@@
   static __always_inline unsigned long __copy_from_user_nocache(void *to,
                                 const void __user *from, unsigned long n)
   {
- -      might_sleep();
+ +      might_fault();
         if (__builtin_constant_p(n)) {
                 unsigned long ret;
   
@@@ -215,4 -215,4 +215,4 @@@ long strnlen_user(const char __user *st
   unsigned long __must_check clear_user(void __user *mem, unsigned long len);
   unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
   
- #endif /* __i386_UACCESS_H */
+ #endif /* _ASM_X86_UACCESS_32_H */
diff --combined arch/x86/include/asm/uaccess_64.h

index 13fd56fbc3aba9791d0fc5b9655f311eb4c8436c,664f15280f14354dc057e1d97954db6baab4b959..543ba883cc66200ff0e2206aec36b434b63d3695
--- 1/include/asm-x86/uaccess_64.h
--- 2/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@@ -1,5 -1,5 +1,5 @@@
- #ifndef __X86_64_UACCESS_H
- #define __X86_64_UACCESS_H
+ #ifndef _ASM_X86_UACCESS_64_H
+ #define _ASM_X86_UACCESS_64_H
   
   /*
    * User space memory access functions
@@@ -7,6 -7,7 +7,7 @@@
   #include <linux/compiler.h>
   #include <linux/errno.h>
   #include <linux/prefetch.h>
+ #include <linux/lockdep.h>
   #include <asm/page.h>
   
   /*
@@@ -28,8 -29,6 +29,8 @@@ static __always_inline __must_chec
   int __copy_from_user(void *dst, const void __user *src, unsigned size)
   {
         int ret = 0;
+ +
+ +      might_fault();
         if (!__builtin_constant_p(size))
                 return copy_user_generic(dst, (__force void *)src, size);
         switch (size) {
@@@ -72,8 -71,6 +73,8 @@@ static __always_inline __must_chec
   int __copy_to_user(void __user *dst, const void *src, unsigned size)
   {
         int ret = 0;
+ +
+ +      might_fault();
         if (!__builtin_constant_p(size))
                 return copy_user_generic((__force void *)dst, src, size);
         switch (size) {
@@@ -116,8 -113,6 +117,8 @@@ static __always_inline __must_chec
   int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
   {
         int ret = 0;
+ +
+ +      might_fault();
         if (!__builtin_constant_p(size))
                 return copy_user_generic((__force void *)dst,
                                          (__force void *)src, size);
@@@ -204,4 -199,4 +205,4 @@@ static inline int __copy_from_user_inat
   unsigned long
   copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
   
- #endif /* __X86_64_UACCESS_H */
+ #endif /* _ASM_X86_UACCESS_64_H */
diff --combined arch/x86/lib/usercopy_32.c

index fab5faba1d3e5ff4d4f1778a3692d0a51c4475fc,9e68075544f6dbb5e9a6fbc002e3a2df3091dc6c..4a20b2f9a381a360b46246c2c21c941258c1367a
--- 1/arch/x86/lib/usercopy_32.c
--- 2/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@@ -14,6 -14,13 +14,13 @@@
   #include <asm/uaccess.h>
   #include <asm/mmx.h>
   
+ #ifdef CONFIG_X86_INTEL_USERCOPY
+ /*
+  * Alignment at which movsl is preferred for bulk memory copies.
+  */
+ struct movsl_mask movsl_mask __read_mostly;
+ #endif
+ 
   static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
   {
   #ifdef CONFIG_X86_INTEL_USERCOPY
@@@ -32,7 -39,7 +39,7 @@@
   #define __do_strncpy_from_user(dst, src, count, res)                     \
   do {                                                                     \
         int __d0, __d1, __d2;                                              \
- -      might_sleep();                                                     \
+ +      might_fault();                                                     \
         __asm__ __volatile__(                                              \
                 "       testl %1,%1\n"                                     \
                 "       jz 2f\n"                                           \
@@@ -119,7 -126,7 +126,7 @@@ EXPORT_SYMBOL(strncpy_from_user)
   #define __do_clear_user(addr,size)                                    \
   do {                                                                  \
         int __d0;                                                       \
- -      might_sleep();                                                  \
+ +      might_fault();                                                  \
         __asm__ __volatile__(                                           \
                 "0:     rep; stosl\n"                                   \
                 "       movl %2,%0\n"                                   \
@@@ -148,7 -155,7 +155,7 @@@
   unsigned long
   clear_user(void __user *to, unsigned long n)
   {
- -      might_sleep();
+ +      might_fault();
         if (access_ok(VERIFY_WRITE, to, n))
                 __do_clear_user(to, n);
         return n;
@@@ -190,7 -197,7 +197,7 @@@ long strnlen_user(const char __user *s
         unsigned long mask = -__addr_ok(s);
         unsigned long res, tmp;
   
- -      might_sleep();
+ +      might_fault();
   
         __asm__ __volatile__(
                 "       testl %0, %0\n"
diff --combined include/linux/kernel.h

index e580ec095765ed1041e9fa7c61c2c527b5826744,396a350b87a60b79935a9b738c34783581319a2b..fa2853b49f70a4bb26e255c679f69eefb83fc6c8
--- 1/include/linux/kernel.h
--- 2/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@@ -16,6 -16,7 +16,7 @@@
   #include <linux/log2.h>
   #include <linux/typecheck.h>
   #include <linux/ratelimit.h>
+ #include <linux/dynamic_printk.h>
   #include <asm/byteorder.h>
   #include <asm/bug.h>
   
@@@ -140,15 -141,6 +141,15 @@@ extern int _cond_resched(void)
                 (__x < 0) ? -__x : __x;         \
         })
   
+ +#ifdef CONFIG_PROVE_LOCKING
+ +void might_fault(void);
+ +#else
+ +static inline void might_fault(void)
+ +{
+ +      might_sleep();
+ +}
+ +#endif
+ +
   extern struct atomic_notifier_head panic_notifier_list;
   extern long (*panic_blink)(long time);
   NORET_TYPE void panic(const char * fmt, ...)
@@@ -191,7 -183,7 +192,7 @@@ extern int vsscanf(const char *, const 
   
   extern int get_option(char **str, int *pint);
   extern char *get_options(const char *str, int nints, int *ints);
- extern unsigned long long memparse(char *ptr, char **retptr);
+ extern unsigned long long memparse(const char *ptr, char **retptr);
   
   extern int core_kernel_text(unsigned long addr);
   extern int __kernel_text_address(unsigned long addr);
@@@ -199,6 -191,30 +200,30 @@@ extern int kernel_text_address(unsigne
   struct pid;
   extern struct pid *session_of_pgrp(struct pid *pgrp);
   
+ /*
+  * FW_BUG
+  * Add this to a message where you are sure the firmware is buggy or behaves
+  * really stupid or out of spec. Be aware that the responsible BIOS developer
+  * should be able to fix this issue or at least get a concrete idea of the
+  * problem by reading your message without the need of looking at the kernel
+  * code.
+  * 
+  * Use it for definite and high priority BIOS bugs.
+  *
+  * FW_WARN
+  * Use it for not that clear (e.g. could the kernel messed up things already?)
+  * and medium priority BIOS bugs.
+  *
+  * FW_INFO
+  * Use this one if you want to tell the user or vendor about something
+  * suspicious, but generally harmless related to the firmware.
+  *
+  * Use it for information or very low priority BIOS bugs.
+  */
+ #define FW_BUG                "[Firmware Bug]: "
+ #define FW_WARN               "[Firmware Warn]: "
+ #define FW_INFO               "[Firmware Info]: "
+ 
   #ifdef CONFIG_PRINTK
   asmlinkage int vprintk(const char *fmt, va_list args)
         __attribute__ ((format (printf, 1, 0)));
@@@ -222,6 -238,9 +247,9 @@@ static inline bool printk_timed_ratelim
                 { return false; }
   #endif
   
+ extern int printk_needs_cpu(int cpu);
+ extern void printk_tick(void);
+ 
   extern void asmlinkage __attribute__((format(printf, 1, 2)))
         early_printk(const char *fmt, ...);
   
@@@ -244,9 -263,10 +272,10 @@@ extern int oops_in_progress;             /* If set
   extern int panic_timeout;
   extern int panic_on_oops;
   extern int panic_on_unrecovered_nmi;
- extern int tainted;
   extern const char *print_tainted(void);
- extern void add_taint(unsigned);
+ extern void add_taint(unsigned flag);
+ extern int test_taint(unsigned flag);
+ extern unsigned long get_taint(void);
   extern int root_mountflags;
   
   /* Values used for system_state */
@@@ -259,16 -279,17 +288,17 @@@ extern enum system_states 
         SYSTEM_SUSPEND_DISK,
   } system_state;
   
- #define TAINT_PROPRIETARY_MODULE      (1<<0)
- #define TAINT_FORCED_MODULE           (1<<1)
- #define TAINT_UNSAFE_SMP              (1<<2)
- #define TAINT_FORCED_RMMOD            (1<<3)
- #define TAINT_MACHINE_CHECK           (1<<4)
- #define TAINT_BAD_PAGE                        (1<<5)
- #define TAINT_USER                    (1<<6)
- #define TAINT_DIE                     (1<<7)
- #define TAINT_OVERRIDDEN_ACPI_TABLE   (1<<8)
- #define TAINT_WARN                    (1<<9)
+ #define TAINT_PROPRIETARY_MODULE      0
+ #define TAINT_FORCED_MODULE           1
+ #define TAINT_UNSAFE_SMP              2
+ #define TAINT_FORCED_RMMOD            3
+ #define TAINT_MACHINE_CHECK           4
+ #define TAINT_BAD_PAGE                        5
+ #define TAINT_USER                    6
+ #define TAINT_DIE                     7
+ #define TAINT_OVERRIDDEN_ACPI_TABLE   8
+ #define TAINT_WARN                    9
+ #define TAINT_CRAP                    10
   
   extern void dump_stack(void) __cold;
   
@@@ -312,8 -333,12 +342,12 @@@ static inline char *pack_hex_byte(char 
   #define pr_info(fmt, arg...) \
         printk(KERN_INFO fmt, ##arg)
   
- #ifdef DEBUG
   /* If you are writing a driver, please use dev_dbg instead */
+ #if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+ #define pr_debug(fmt, ...) do { \
+       dynamic_pr_debug(fmt, ##__VA_ARGS__); \
+       } while (0)
+ #elif defined(DEBUG)
   #define pr_debug(fmt, arg...) \
         printk(KERN_DEBUG fmt, ##arg)
   #else
@@@ -495,4 -520,9 +529,9 @@@ struct sysinfo 
   #define NUMA_BUILD 0
   #endif
   
+ /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
+ #ifdef CONFIG_FTRACE_MCOUNT_RECORD
+ # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
+ #endif
+ 
   #endif
diff --combined kernel/sched.c

index ec3bd1f398b34ea37ca6f5e75152c42d99723b8b,6625c3c4b10d06c3f76c371becd615d174244902..0a4dc3b1300b07d7284f626a3c572f186d8c257c
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -55,6 -55,7 +55,7 @@@
   #include <linux/cpuset.h>
   #include <linux/percpu.h>
   #include <linux/kthread.h>
+ #include <linux/proc_fs.h>
   #include <linux/seq_file.h>
   #include <linux/sysctl.h>
   #include <linux/syscalls.h>
@@@ -71,6 -72,7 +72,7 @@@
   #include <linux/debugfs.h>
   #include <linux/ctype.h>
   #include <linux/ftrace.h>
+ #include <trace/sched.h>
   
   #include <asm/tlb.h>
   #include <asm/irq_regs.h>
@@@ -201,14 -203,19 +203,19 @@@ void init_rt_bandwidth(struct rt_bandwi
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
+ }
+ 
+ static inline int rt_bandwidth_enabled(void)
+ {
+       return sysctl_sched_rt_runtime >= 0;
   }
   
   static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
   {
         ktime_t now;
   
-       if (rt_b->rt_runtime == RUNTIME_INF)
+       if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                 return;
   
         if (hrtimer_active(&rt_b->rt_period_timer))
@@@ -221,9 -228,8 +228,8 @@@
   
                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
   }
@@@ -298,9 -304,9 +304,9 @@@ static DEFINE_PER_CPU(struct cfs_rq, in
   static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
   static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
   #endif /* CONFIG_RT_GROUP_SCHED */
- #else /* !CONFIG_FAIR_GROUP_SCHED */
+ #else /* !CONFIG_USER_SCHED */
   #define root_task_group init_task_group
- #endif /* CONFIG_FAIR_GROUP_SCHED */
+ #endif /* CONFIG_USER_SCHED */
   
   /* task_group_lock serializes add/remove of task groups and also changes to
    * a task group's cpu shares.
@@@ -604,9 -610,9 +610,9 @@@ struct rq 
   
   static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
   
- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
   {
-       rq->curr->sched_class->check_preempt_curr(rq, p);
+       rq->curr->sched_class->check_preempt_curr(rq, p, sync);
   }
   
   static inline int cpu_of(struct rq *rq)
@@@ -812,6 -818,13 +818,13 @@@ const_debug unsigned int sysctl_sched_n
    */
   unsigned int sysctl_sched_shares_ratelimit = 250000;
   
+ /*
+  * Inject some fuzzyness into changing the per-cpu group shares
+  * this avoids remote rq-locks at the expense of fairness.
+  * default: 4
+  */
+ unsigned int sysctl_sched_shares_thresh = 4;
+ 
   /*
    * period over which we measure -rt task cpu usage in us.
    * default: 1s
@@@ -1058,7 -1071,7 +1071,7 @@@ static void hrtick_start(struct rq *rq
         struct hrtimer *timer = &rq->hrtick_timer;
         ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
   
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
   
         if (rq == this_rq()) {
                 hrtimer_restart(timer);
@@@ -1087,7 -1100,7 +1100,7 @@@ hotplug_hrtick(struct notifier_block *n
         return NOTIFY_DONE;
   }
   
- static void init_hrtick(void)
+ static __init void init_hrtick(void)
   {
         hotcpu_notifier(hotplug_hrtick, 0);
   }
@@@ -1102,7 -1115,7 +1115,7 @@@ static void hrtick_start(struct rq *rq
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
   }
   
- static void init_hrtick(void)
+ static inline void init_hrtick(void)
   {
   }
   #endif /* CONFIG_SMP */
@@@ -1119,9 -1132,9 +1132,9 @@@ static void init_rq_hrtick(struct rq *r
   
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
   }
- #else
+ #else /* CONFIG_SCHED_HRTICK */
   static inline void hrtick_clear(struct rq *rq)
   {
   }
@@@ -1133,7 -1146,7 +1146,7 @@@ static inline void init_rq_hrtick(struc
   static inline void init_hrtick(void)
   {
   }
- #endif
+ #endif        /* CONFIG_SCHED_HRTICK */
   
   /*
    * resched_task - mark a task 'to be rescheduled now'.
@@@ -1380,38 -1393,24 +1393,24 @@@ static inline void dec_cpu_load(struct 
         update_load_sub(&rq->load, load);
   }
   
- #ifdef CONFIG_SMP
- static unsigned long source_load(int cpu, int type);
- static unsigned long target_load(int cpu, int type);
- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
- 
- static unsigned long cpu_avg_load_per_task(int cpu)
- {
-       struct rq *rq = cpu_rq(cpu);
- 
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
- 
-       return rq->avg_load_per_task;
- }
- 
- #ifdef CONFIG_FAIR_GROUP_SCHED
- 
- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+ typedef int (*tg_visitor)(struct task_group *, void *);
   
   /*
    * Iterate the full tree, calling @down when first entering a node and @up when
    * leaving it for the final time.
    */
- static void
- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
   {
         struct task_group *parent, *child;
+       int ret;
   
         rcu_read_lock();
         parent = &root_task_group;
   down:
-       (*down)(parent, cpu, sd);
+       ret = (*down)(parent, data);
+       if (ret)
+               goto out_unlock;
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@@ -1419,23 -1418,51 +1418,51 @@@
   up:
                 continue;
         }
-       (*up)(parent, cpu, sd);
+       ret = (*up)(parent, data);
+       if (ret)
+               goto out_unlock;
   
         child = parent;
         parent = parent->parent;
         if (parent)
                 goto up;
+ out_unlock:
         rcu_read_unlock();
+ 
+       return ret;
+ }
+ 
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+       return 0;
+ }
+ #endif
+ 
+ #ifdef CONFIG_SMP
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+ 
+ static unsigned long cpu_avg_load_per_task(int cpu)
+ {
+       struct rq *rq = cpu_rq(cpu);
+ 
+       if (rq->nr_running)
+               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ 
+       return rq->avg_load_per_task;
   }
   
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ 
   static void __set_se_shares(struct sched_entity *se, unsigned long shares);
   
   /*
    * Calculate and set the cpu's group shares.
    */
   static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
-                         unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+                       unsigned long sd_shares, unsigned long sd_rq_weight)
   {
         int boost = 0;
         unsigned long shares;
@@@ -1466,19 -1493,23 +1493,23 @@@
          *
          */
         shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
   
-       /*
-        * record the actual number of shares, not the boosted amount.
-        */
-       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (abs(shares - tg->se[cpu]->load.weight) >
+                       sysctl_sched_shares_thresh) {
+               struct rq *rq = cpu_rq(cpu);
+               unsigned long flags;
   
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
+               spin_lock_irqsave(&rq->lock, flags);
+               /*
+                * record the actual number of shares, not the boosted amount.
+                */
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+               tg->cfs_rq[cpu]->rq_weight = rq_weight;
   
-       __set_se_shares(tg->se[cpu], shares);
+               __set_se_shares(tg->se[cpu], shares);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
   }
   
   /*
@@@ -1486,11 -1517,11 +1517,11 @@@
    * This needs to be done in a bottom-up fashion because the rq weight of a
    * parent group depends on the shares of its child groups.
    */
- static void
- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_shares_up(struct task_group *tg, void *data)
   {
         unsigned long rq_weight = 0;
         unsigned long shares = 0;
+       struct sched_domain *sd = data;
         int i;
   
         for_each_cpu_mask(i, sd->span) {
@@@ -1507,14 -1538,10 +1538,10 @@@
         if (!rq_weight)
                 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
   
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *rq = cpu_rq(i);
-               unsigned long flags;
+       for_each_cpu_mask(i, sd->span)
+               update_group_shares_cpu(tg, i, shares, rq_weight);
   
-               spin_lock_irqsave(&rq->lock, flags);
-               __update_group_shares_cpu(tg, i, shares, rq_weight);
-               spin_unlock_irqrestore(&rq->lock, flags);
-       }
+       return 0;
   }
   
   /*
@@@ -1522,10 -1549,10 +1549,10 @@@
    * This needs to be done in a top-down fashion because the load of a child
    * group is a fraction of its parents load.
    */
- static void
- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+ static int tg_load_down(struct task_group *tg, void *data)
   {
         unsigned long load;
+       long cpu = (long)data;
   
         if (!tg->parent) {
                 load = cpu_rq(cpu)->load.weight;
@@@ -1536,11 -1563,8 +1563,8 @@@
         }
   
         tg->cfs_rq[cpu]->h_load = load;
- }
   
- static void
- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
- {
+       return 0;
   }
   
   static void update_shares(struct sched_domain *sd)
@@@ -1550,7 -1574,7 +1574,7 @@@
   
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                 sd->last_update = now;
-               walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+               walk_tg_tree(tg_nop, tg_shares_up, sd);
         }
   }
   
@@@ -1561,9 -1585,9 +1585,9 @@@ static void update_shares_locked(struc
         spin_lock(&rq->lock);
   }
   
- static void update_h_load(int cpu)
+ static void update_h_load(long cpu)
   {
-       walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+       walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
   }
   
   #else
@@@ -1918,14 -1942,12 +1942,12 @@@ unsigned long wait_task_inactive(struc
                  * just go back and repeat.
                  */
                 rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(rq, p);
                 running = task_running(rq, p);
                 on_rq = p->se.on_rq;
                 ncsw = 0;
-               if (!match_state || p->state == match_state) {
-                       ncsw = p->nivcsw + p->nvcsw;
-                       if (unlikely(!ncsw))
-                               ncsw = 1;
-               }
+               if (!match_state || p->state == match_state)
+                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                 task_rq_unlock(rq, &flags);
   
                 /*
@@@ -2282,10 -2304,8 +2304,8 @@@ out_activate
         success = 1;
   
   out_running:
-       trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       trace_sched_wakeup(rq, p);
+       check_preempt_curr(rq, p, sync);
   
         p->state = TASK_RUNNING;
   #ifdef CONFIG_SMP
@@@ -2417,10 -2437,8 +2437,8 @@@ void wake_up_new_task(struct task_struc
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
-       check_preempt_curr(rq, p);
+       trace_sched_wakeup_new(rq, p);
+       check_preempt_curr(rq, p, 0);
   #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
                 p->sched_class->task_wake_up(rq, p);
@@@ -2592,11 -2610,7 +2610,7 @@@ context_switch(struct rq *rq, struct ta
         struct mm_struct *mm, *oldmm;
   
         prepare_task_switch(rq, prev, next);
-       trace_mark(kernel_sched_schedule,
-               "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
-               prev->pid, next->pid, prev->state,
-               rq, prev, next);
+       trace_sched_switch(rq, prev, next);
         mm = next->mm;
         oldmm = prev->active_mm;
         /*
@@@ -2836,6 -2850,7 +2850,7 @@@ static void sched_migrate_task(struct t
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
   
+       trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@@ -2880,7 -2895,7 +2895,7 @@@ static void pull_task(struct rq *src_rq
          * Note that idle threads have a prio of MAX_PRIO, for this test
          * to be always true for them.
          */
-       check_preempt_curr(this_rq, p);
+       check_preempt_curr(this_rq, p, 0);
   }
   
   /*
@@@ -4037,23 -4052,26 +4052,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
   EXPORT_PER_CPU_SYMBOL(kstat);
   
   /*
-  * Return p->sum_exec_runtime plus any more ns on the sched_clock
-  * that have not yet been banked in case the task is currently running.
+  * Return any ns on the sched_clock that have not yet been banked in
+  * @p in case that task is currently running.
    */
- unsigned long long task_sched_runtime(struct task_struct *p)
+ unsigned long long task_delta_exec(struct task_struct *p)
   {
         unsigned long flags;
-       u64 ns, delta_exec;
         struct rq *rq;
+       u64 ns = 0;
   
         rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime;
+ 
         if (task_current(rq, p)) {
+               u64 delta_exec;
+ 
                 update_rq_clock(rq);
                 delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
-                       ns += delta_exec;
+                       ns = delta_exec;
         }
+ 
         task_rq_unlock(rq, &flags);
   
         return ns;
@@@ -4070,6 -4088,7 +4088,7 @@@ void account_user_time(struct task_stru
         cputime64_t tmp;
   
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
   
         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@@ -4094,6 -4113,7 +4113,7 @@@ static void account_guest_time(struct t
         tmp = cputime_to_cputime64(cputime);
   
         p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
   
         cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4129,6 -4149,7 +4149,7 @@@ void account_system_time(struct task_st
         }
   
         p->stime = cputime_add(p->stime, cputime);
+       account_group_system_time(p, cputime);
   
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@@ -4170,6 -4191,7 +4191,7 @@@ void account_steal_time(struct task_str
   
         if (p == rq->idle) {
                 p->stime = cputime_add(p->stime, steal);
+               account_group_system_time(p, steal);
                 if (atomic_read(&rq->nr_iowait) > 0)
                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                 else
@@@ -4305,7 -4327,7 +4327,7 @@@ void __kprobes sub_preempt_count(int va
         /*
          * Underflow?
          */
- -      if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ +       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                 return;
         /*
          * Is the spinlock portion underflowing?
@@@ -4426,12 -4448,8 +4448,8 @@@ need_resched_nonpreemptible
         if (sched_feat(HRTICK))
                 hrtick_clear(rq);
   
-       /*
-        * Do the rq-clock update outside the rq lock:
-        */
-       local_irq_disable();
+       spin_lock_irq(&rq->lock);
         update_rq_clock(rq);
-       spin_lock(&rq->lock);
         clear_tsk_need_resched(prev);
   
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@@ -4627,6 -4645,15 +4645,15 @@@ __wake_up_sync(wait_queue_head_t *q, un
   }
   EXPORT_SYMBOL_GPL(__wake_up_sync);    /* For internal use only */
   
+ /**
+  * complete: - signals a single thread waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up a single thread waiting on this completion. Threads will be
+  * awakened in the same order in which they were queued.
+  *
+  * See also complete_all(), wait_for_completion() and related routines.
+  */
   void complete(struct completion *x)
   {
         unsigned long flags;
@@@ -4638,6 -4665,12 +4665,12 @@@
   }
   EXPORT_SYMBOL(complete);
   
+ /**
+  * complete_all: - signals all threads waiting on this completion
+  * @x:  holds the state of this particular completion
+  *
+  * This will wake up all threads waiting on this particular completion event.
+  */
   void complete_all(struct completion *x)
   {
         unsigned long flags;
@@@ -4658,10 -4691,7 +4691,7 @@@ do_wait_for_common(struct completion *x
                 wait.flags |= WQ_FLAG_EXCLUSIVE;
                 __add_wait_queue_tail(&x->wait, &wait);
                 do {
-                       if ((state == TASK_INTERRUPTIBLE &&
-                            signal_pending(current)) ||
-                           (state == TASK_KILLABLE &&
-                            fatal_signal_pending(current))) {
+                       if (signal_pending_state(state, current)) {
                                 timeout = -ERESTARTSYS;
                                 break;
                         }
@@@ -4689,12 -4719,31 +4719,31 @@@ wait_for_common(struct completion *x, l
         return timeout;
   }
   
+ /**
+  * wait_for_completion: - waits for completion of a task
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It is NOT
+  * interruptible and there is no timeout.
+  *
+  * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+  * and interrupt capability. Also see complete().
+  */
   void __sched wait_for_completion(struct completion *x)
   {
         wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
   }
   EXPORT_SYMBOL(wait_for_completion);
   
+ /**
+  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. The timeout is in jiffies. It is not
+  * interruptible.
+  */
   unsigned long __sched
   wait_for_completion_timeout(struct completion *x, unsigned long timeout)
   {
@@@ -4702,6 -4751,13 +4751,13 @@@
   }
   EXPORT_SYMBOL(wait_for_completion_timeout);
   
+ /**
+  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits for completion of a specific task to be signaled. It is
+  * interruptible.
+  */
   int __sched wait_for_completion_interruptible(struct completion *x)
   {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@@ -4711,6 -4767,14 +4767,14 @@@
   }
   EXPORT_SYMBOL(wait_for_completion_interruptible);
   
+ /**
+  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+  * @x:  holds the state of this particular completion
+  * @timeout:  timeout value in jiffies
+  *
+  * This waits for either a completion of a specific task to be signaled or for a
+  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+  */
   unsigned long __sched
   wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
@@@ -4719,6 -4783,13 +4783,13 @@@
   }
   EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
   
+ /**
+  * wait_for_completion_killable: - waits for completion of a task (killable)
+  * @x:  holds the state of this particular completion
+  *
+  * This waits to be signaled for completion of a specific task. It can be
+  * interrupted by a kill signal.
+  */
   int __sched wait_for_completion_killable(struct completion *x)
   {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@@ -5121,7 -5192,8 +5192,8 @@@ recheck
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
                  */
-               if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+               if (rt_bandwidth_enabled() && rt_policy(policy) &&
+                               task_group(p)->rt_bandwidth.rt_runtime == 0)
                         return -EPERM;
   #endif
   
@@@ -5957,7 -6029,7 +6029,7 @@@ static int __migrate_task(struct task_s
         set_task_cpu(p, dest_cpu);
         if (on_rq) {
                 activate_task(rq_dest, p, 0);
-               check_preempt_curr(rq_dest, p);
+               check_preempt_curr(rq_dest, p, 0);
         }
   done:
         ret = 1;
@@@ -6282,7 -6354,7 +6354,7 @@@ set_table_entry(struct ctl_table *entry
   static struct ctl_table *
   sd_alloc_ctl_domain_table(struct sched_domain *sd)
   {
-       struct ctl_table *table = sd_alloc_ctl_entry(12);
+       struct ctl_table *table = sd_alloc_ctl_entry(13);
   
         if (table == NULL)
                 return NULL;
@@@ -6310,7 -6382,9 +6382,9 @@@
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
-       /* &table[11] is terminator */
+       set_table_entry(&table[11], "name", sd->name,
+               CORENAME_MAX_SIZE, 0444, proc_dostring);
+       /* &table[12] is terminator */
   
         return table;
   }
@@@ -7194,13 -7268,21 +7268,21 @@@ static void init_sched_groups_power(in
    * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
    */
   
+ #ifdef CONFIG_SCHED_DEBUG
+ # define SD_INIT_NAME(sd, type)               sd->name = #type
+ #else
+ # define SD_INIT_NAME(sd, type)               do { } while (0)
+ #endif
+ 
   #define       SD_INIT(sd, type)       sd_init_##type(sd)
+ 
   #define SD_INIT_FUNC(type)    \
   static noinline void sd_init_##type(struct sched_domain *sd)  \
   {                                                             \
         memset(sd, 0, sizeof(*sd));                             \
         *sd = SD_##type##_INIT;                                 \
         sd->level = SD_LV_##type;                               \
+       SD_INIT_NAME(sd, type);                                 \
   }
   
   SD_INIT_FUNC(CPU)
@@@ -8242,20 -8324,25 +8324,25 @@@ void __might_sleep(char *file, int line
   #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
   
-       if ((in_atomic() || irqs_disabled()) &&
-           system_state == SYSTEM_RUNNING && !oops_in_progress) {
-               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-                       return;
-               prev_jiffy = jiffies;
-               printk(KERN_ERR "BUG: sleeping function called from invalid"
-                               " context at %s:%d\n", file, line);
-               printk("in_atomic():%d, irqs_disabled():%d\n",
-                       in_atomic(), irqs_disabled());
-               debug_show_held_locks(current);
-               if (irqs_disabled())
-                       print_irqtrace_events(current);
-               dump_stack();
-       }
+       if ((!in_atomic() && !irqs_disabled()) ||
+                   system_state != SYSTEM_RUNNING || oops_in_progress)
+               return;
+       if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+               return;
+       prev_jiffy = jiffies;
+ 
+       printk(KERN_ERR
+               "BUG: sleeping function called from invalid context at %s:%d\n",
+                       file, line);
+       printk(KERN_ERR
+               "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+                       in_atomic(), irqs_disabled(),
+                       current->pid, current->comm);
+ 
+       debug_show_held_locks(current);
+       if (irqs_disabled())
+               print_irqtrace_events(current);
+       dump_stack();
   #endif
   }
   EXPORT_SYMBOL(__might_sleep);
@@@ -8753,73 -8840,95 +8840,95 @@@ static DEFINE_MUTEX(rt_constraints_mute
   static unsigned long to_ratio(u64 period, u64 runtime)
   {
         if (runtime == RUNTIME_INF)
-               return 1ULL << 16;
+               return 1ULL << 20;
   
-       return div64_u64(runtime << 16, period);
+       return div64_u64(runtime << 20, period);
   }
   
- #ifdef CONFIG_CGROUP_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
   {
-       struct task_group *tgi, *parent = tg->parent;
-       unsigned long total = 0;
+       struct task_struct *g, *p;
   
-       if (!parent) {
-               if (global_rt_period() < period)
-                       return 0;
+       do_each_thread(g, p) {
+               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+                       return 1;
+       } while_each_thread(g, p);
   
-               return to_ratio(period, runtime) <
-                       to_ratio(global_rt_period(), global_rt_runtime());
-       }
+       return 0;
+ }
   
-       if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-               return 0;
+ struct rt_schedulable_data {
+       struct task_group *tg;
+       u64 rt_period;
+       u64 rt_runtime;
+ };
   
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-               if (tgi == tg)
-                       continue;
+ static int tg_schedulable(struct task_group *tg, void *data)
+ {
+       struct rt_schedulable_data *d = data;
+       struct task_group *child;
+       unsigned long total, sum = 0;
+       u64 period, runtime;
   
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+       runtime = tg->rt_bandwidth.rt_runtime;
+ 
+       if (tg == d->tg) {
+               period = d->rt_period;
+               runtime = d->rt_runtime;
         }
-       rcu_read_unlock();
   
-       return total + to_ratio(period, runtime) <=
-               to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-                               parent->rt_bandwidth.rt_runtime);
- }
- #elif defined CONFIG_USER_SCHED
- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
- {
-       struct task_group *tgi;
-       unsigned long total = 0;
-       unsigned long global_ratio =
-               to_ratio(global_rt_period(), global_rt_runtime());
+       /*
+        * Cannot have more runtime than the period.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
   
-       rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &task_groups, list) {
-               if (tgi == tg)
-                       continue;
+       /*
+        * Ensure we don't starve existing RT tasks.
+        */
+       if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+               return -EBUSY;
   
-               total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-                               tgi->rt_bandwidth.rt_runtime);
+       total = to_ratio(period, runtime);
+ 
+       /*
+        * Nobody can have more than the global setting allows.
+        */
+       if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+               return -EINVAL;
+ 
+       /*
+        * The sum of our children's runtime should not exceed our own.
+        */
+       list_for_each_entry_rcu(child, &tg->children, siblings) {
+               period = ktime_to_ns(child->rt_bandwidth.rt_period);
+               runtime = child->rt_bandwidth.rt_runtime;
+ 
+               if (child == d->tg) {
+                       period = d->rt_period;
+                       runtime = d->rt_runtime;
+               }
+ 
+               sum += to_ratio(period, runtime);
         }
-       rcu_read_unlock();
   
-       return total + to_ratio(period, runtime) < global_ratio;
+       if (sum > total)
+               return -EINVAL;
+ 
+       return 0;
   }
- #endif
   
- /* Must be called with tasklist_lock held */
- static inline int tg_has_rt_tasks(struct task_group *tg)
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
   {
-       struct task_struct *g, *p;
-       do_each_thread(g, p) {
-               if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-                       return 1;
-       } while_each_thread(g, p);
-       return 0;
+       struct rt_schedulable_data data = {
+               .tg = tg,
+               .rt_period = period,
+               .rt_runtime = runtime,
+       };
+ 
+       return walk_tg_tree(tg_schedulable, tg_nop, &data);
   }
   
   static int tg_set_bandwidth(struct task_group *tg,
@@@ -8829,14 -8938,9 +8938,9 @@@
   
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
-       if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-               err = -EBUSY;
-               goto unlock;
-       }
-       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-               err = -EINVAL;
+       err = __rt_schedulable(tg, rt_period, rt_runtime);
+       if (err)
                 goto unlock;
-       }
   
         spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@@ -8905,19 -9009,25 +9009,25 @@@ long sched_group_rt_period(struct task_
   
   static int sched_rt_global_constraints(void)
   {
-       struct task_group *tg = &root_task_group;
-       u64 rt_runtime, rt_period;
+       u64 runtime, period;
         int ret = 0;
   
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
   
-       rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-       rt_runtime = tg->rt_bandwidth.rt_runtime;
+       runtime = global_rt_runtime();
+       period = global_rt_period();
+ 
+       /*
+        * Sanity check on the sysctl variables.
+        */
+       if (runtime > period && runtime != RUNTIME_INF)
+               return -EINVAL;
   
         mutex_lock(&rt_constraints_mutex);
-       if (!__rt_schedulable(tg, rt_period, rt_runtime))
-               ret = -EINVAL;
+       read_lock(&tasklist_lock);
+       ret = __rt_schedulable(NULL, 0, 0);
+       read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
   
         return ret;
@@@ -8991,7 -9101,6 +9101,6 @@@ cpu_cgroup_create(struct cgroup_subsys 
   
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-               init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }
   
@@@ -9000,9 -9109,6 +9109,6 @@@
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
   
-       /* Bind the cgroup to task_group object we just created */
-       tg->css.cgroup = cgrp;
- 
         return &tg->css;
   }
   
diff --combined mm/memory.c

index b8fdf4e5e65b52a812413ffc4797edf4e056f42c,164951c473058a25c081d5e47260d872068cdbb7..fc031d68327e5fad33130b15d50a020b7b9b31a8
--- 1/mm/memory.c
--- 2/mm/memory.c
+++ b/mm/memory.c
@@@ -1129,12 -1129,17 +1129,17 @@@ static inline int use_zero_page(struct 
         return !vma->vm_ops || !vma->vm_ops->fault;
   }
   
- int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-               unsigned long start, int len, int write, int force,
+ 
+ 
+ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, int len, int flags,
                 struct page **pages, struct vm_area_struct **vmas)
   {
         int i;
-       unsigned int vm_flags;
+       unsigned int vm_flags = 0;
+       int write = !!(flags & GUP_FLAGS_WRITE);
+       int force = !!(flags & GUP_FLAGS_FORCE);
+       int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
   
         if (len <= 0)
                 return 0;
@@@ -1158,7 -1163,9 +1163,9 @@@
                         pud_t *pud;
                         pmd_t *pmd;
                         pte_t *pte;
-                       if (write) /* user gate pages are read-only */
+ 
+                       /* user gate pages are read-only */
+                       if (!ignore && write)
                                 return i ? : -EFAULT;
                         if (pg > TASK_SIZE)
                                 pgd = pgd_offset_k(pg);
@@@ -1190,8 -1197,9 +1197,9 @@@
                         continue;
                 }
   
-               if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
-                               || !(vm_flags & vma->vm_flags))
+               if (!vma ||
+                   (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+                   (!ignore && !(vm_flags & vma->vm_flags)))
                         return i ? : -EFAULT;
   
                 if (is_vm_hugetlb_page(vma)) {
@@@ -1266,6 -1274,23 +1274,23 @@@
         } while (len);
         return i;
   }
+ 
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+               unsigned long start, int len, int write, int force,
+               struct page **pages, struct vm_area_struct **vmas)
+ {
+       int flags = 0;
+ 
+       if (write)
+               flags |= GUP_FLAGS_WRITE;
+       if (force)
+               flags |= GUP_FLAGS_FORCE;
+ 
+       return __get_user_pages(tsk, mm,
+                               start, len, flags,
+                               pages, vmas);
+ }
+ 
   EXPORT_SYMBOL(get_user_pages);
   
   pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@@ -1296,18 -1321,14 +1321,14 @@@ static int insert_page(struct vm_area_s
         pte_t *pte;
         spinlock_t *ptl;
   
-       retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
-       if (retval)
-               goto out;
- 
         retval = -EINVAL;
         if (PageAnon(page))
-               goto out_uncharge;
+               goto out;
         retval = -ENOMEM;
         flush_dcache_page(page);
         pte = get_locked_pte(mm, addr, &ptl);
         if (!pte)
-               goto out_uncharge;
+               goto out;
         retval = -EBUSY;
         if (!pte_none(*pte))
                 goto out_unlock;
@@@ -1323,8 -1344,6 +1344,6 @@@
         return retval;
   out_unlock:
         pte_unmap_unlock(pte, ptl);
- out_uncharge:
-       mem_cgroup_uncharge_page(page);
   out:
         return retval;
   }
@@@ -1858,6 -1877,15 +1877,15 @@@ gotten
         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
         if (!new_page)
                 goto oom;
+       /*
+        * Don't let another task, with possibly unlocked vma,
+        * keep the mlocked page.
+        */
+       if (vma->vm_flags & VM_LOCKED) {
+               lock_page(old_page);    /* for LRU manipulation */
+               clear_page_mlock(old_page);
+               unlock_page(old_page);
+       }
         cow_user_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
   
@@@ -1886,11 -1914,13 +1914,13 @@@
                  * thread doing COW.
                  */
                 ptep_clear_flush_notify(vma, address, page_table);
-               set_pte_at(mm, address, page_table, entry);
-               update_mmu_cache(vma, address, entry);
-               lru_cache_add_active(new_page);
+               SetPageSwapBacked(new_page);
+               lru_cache_add_active_or_unevictable(new_page, vma);
                 page_add_new_anon_rmap(new_page, vma, address);
   
+ //TODO:  is this safe?  do_anonymous_page() does it this way.
+               set_pte_at(mm, address, page_table, entry);
+               update_mmu_cache(vma, address, entry);
                 if (old_page) {
                         /*
                          * Only after switching the pte to the new page may
@@@ -2288,16 -2318,17 +2318,17 @@@ static int do_swap_page(struct mm_struc
                 count_vm_event(PGMAJFAULT);
         }
   
+       mark_page_accessed(page);
+ 
+       lock_page(page);
+       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ 
         if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                 ret = VM_FAULT_OOM;
+               unlock_page(page);
                 goto out;
         }
   
-       mark_page_accessed(page);
-       lock_page(page);
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- 
         /*
          * Back out if somebody else already faulted in this pte.
          */
@@@ -2324,7 -2355,7 +2355,7 @@@
         page_add_anon_rmap(page, vma, address);
   
         swap_free(entry);
-       if (vm_swap_full())
+       if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
                 remove_exclusive_swap_page(page);
         unlock_page(page);
   
@@@ -2382,7 -2413,8 +2413,8 @@@ static int do_anonymous_page(struct mm_
         if (!pte_none(*page_table))
                 goto release;
         inc_mm_counter(mm, anon_rss);
-       lru_cache_add_active(page);
+       SetPageSwapBacked(page);
+       lru_cache_add_active_or_unevictable(page, vma);
         page_add_new_anon_rmap(page, vma, address);
         set_pte_at(mm, address, page_table, entry);
   
@@@ -2423,6 -2455,7 +2455,7 @@@ static int __do_fault(struct mm_struct 
         struct page *page;
         pte_t entry;
         int anon = 0;
+       int charged = 0;
         struct page *dirty_page = NULL;
         struct vm_fault vmf;
         int ret;
@@@ -2463,6 -2496,18 +2496,18 @@@
                                 ret = VM_FAULT_OOM;
                                 goto out;
                         }
+                       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+                               ret = VM_FAULT_OOM;
+                               page_cache_release(page);
+                               goto out;
+                       }
+                       charged = 1;
+                       /*
+                        * Don't let another task, with possibly unlocked vma,
+                        * keep the mlocked page.
+                        */
+                       if (vma->vm_flags & VM_LOCKED)
+                               clear_page_mlock(vmf.page);
                         copy_user_highpage(page, vmf.page, address, vma);
                         __SetPageUptodate(page);
                 } else {
@@@ -2497,11 -2542,6 +2542,6 @@@
   
         }
   
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-               ret = VM_FAULT_OOM;
-               goto out;
-       }
- 
         page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
   
         /*
@@@ -2520,11 -2560,11 +2560,11 @@@
                 entry = mk_pte(page, vma->vm_page_prot);
                 if (flags & FAULT_FLAG_WRITE)
                         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               set_pte_at(mm, address, page_table, entry);
                 if (anon) {
-                         inc_mm_counter(mm, anon_rss);
-                         lru_cache_add_active(page);
-                         page_add_new_anon_rmap(page, vma, address);
+                       inc_mm_counter(mm, anon_rss);
+                       SetPageSwapBacked(page);
+                       lru_cache_add_active_or_unevictable(page, vma);
+                       page_add_new_anon_rmap(page, vma, address);
                 } else {
                         inc_mm_counter(mm, file_rss);
                         page_add_file_rmap(page);
@@@ -2533,11 -2573,14 +2573,14 @@@
                                 get_page(dirty_page);
                         }
                 }
+ //TODO:  is this safe?  do_anonymous_page() does it this way.
+               set_pte_at(mm, address, page_table, entry);
   
                 /* no need to invalidate: a not-present page won't be cached */
                 update_mmu_cache(vma, address, entry);
         } else {
-               mem_cgroup_uncharge_page(page);
+               if (charged)
+                       mem_cgroup_uncharge_page(page);
                 if (anon)
                         page_cache_release(page);
                 else
@@@ -2772,19 -2815,9 +2815,9 @@@ int make_pages_present(unsigned long ad
         len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
         ret = get_user_pages(current, current->mm, addr,
                         len, write, 0, NULL, NULL);
-       if (ret < 0) {
-               /*
-                  SUS require strange return value to mlock
-                   - invalid addr generate to ENOMEM.
-                   - out of memory should generate EAGAIN.
-               */
-               if (ret == -EFAULT)
-                       ret = -ENOMEM;
-               else if (ret == -ENOMEM)
-                       ret = -EAGAIN;
+       if (ret < 0)
                 return ret;
-       }
-       return ret == len ? 0 : -ENOMEM;
+       return ret == len ? 0 : -EFAULT;
   }
   
   #if !defined(__HAVE_ARCH_GATE_AREA)
@@@ -3016,18 -3049,3 +3049,18 @@@ void print_vma_addr(char *prefix, unsig
         }
         up_read(&current->mm->mmap_sem);
   }
+ +
+ +#ifdef CONFIG_PROVE_LOCKING
+ +void might_fault(void)
+ +{
+ +      might_sleep();
+ +      /*
+ +       * it would be nicer only to annotate paths which are not under
+ +       * pagefault_disable, however that requires a larger audit and
+ +       * providing helpers like get_user_atomic.
+ +       */
+ +      if (!in_atomic() && current->mm)
+ +              might_lock_read(&current->mm->mmap_sem);
+ +}
+ +EXPORT_SYMBOL(might_fault);
+ +#endif
author	Ingo Molnar <mingo@elte.hu>
	Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Tue, 28 Oct 2008 15:54:49 +0000 (16:54 +0100)
		1	2
arch/um/include/asm/system.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/uaccess.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/uaccess_32.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/uaccess_64.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/lib/usercopy_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kernel.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memory.c	patch \|	diff1 \|	diff2 \|	blob \| history