Merge branches 'timers/clocksource', 'timers/hrtimers', 'timers/nohz', 'timers/ntp...

author Thomas Gleixner <tglx@linutronix.de>

Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)

committer Thomas Gleixner <tglx@linutronix.de>

Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)
author Thomas Gleixner <tglx@linutronix.de>
Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)
diff --combined drivers/clocksource/acpi_pm.c

index 3df338481004c0bbdf69debbca9dd4d69b6bd026,5ca1d80de182bfa3bb333f0aeca907f7929ed26c,71d2ac4e3f46cc0e33410a2d8f7889ac1ae8de09,5ca1d80de182bfa3bb333f0aeca907f7929ed26c,4eee533f3f4af7e8f5c9d811c7ac671d95d05083,71d2ac4e3f46cc0e33410a2d8f7889ac1ae8de09..c20171078d1d6f475f04a23fc0be9e63101d5d76
--- 1/drivers/clocksource/acpi_pm.c
--- 2/drivers/clocksource/acpi_pm.c
--- 3/drivers/clocksource/acpi_pm.c
--- 4/drivers/clocksource/acpi_pm.c
--- 5/drivers/clocksource/acpi_pm.c
--- 6/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@@@@@@ -21,6 -21,6 -21,7 -21,6 -21,7 -21,7 +21,7 @@@@@@@
       #include <linux/errno.h>
       #include <linux/init.h>
       #include <linux/pci.h>
++ +  #include <linux/delay.h>
       #include <asm/io.h>
       
       /*
@@@@@@@ -151,13 -151,13 -152,13 -151,13 -152,13 -152,13 +152,13 @@@@@@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_S
        */
       static int verify_pmtmr_rate(void)
       {
-- -    u32 value1, value2;
++ +    cycle_t value1, value2;
         unsigned long count, delta;
       
         mach_prepare_counter();
-- -    value1 = read_pmtmr();
++ +    value1 = clocksource_acpi_pm.read();
         mach_countup(&count);
-- -    value2 = read_pmtmr();
++ +    value2 = clocksource_acpi_pm.read();
         delta = (value2 - value1) & ACPI_PM_MASK;
       
         /* Check that the PMTMR delta is within 5% of what we expect */
@@@@@@@ -175,10 -175,10 -176,15 -175,10 -176,13 -176,15 +176,15 @@@@@@@
       #define verify_pmtmr_rate() (0)
       #endif
       
++ +  /* Number of monotonicity checks to perform during initialization */
++ +  #define ACPI_PM_MONOTONICITY_CHECKS 10
++ ++ /* Number of reads we try to get two different values */
++ ++ #define ACPI_PM_READ_CHECKS 10000
++ +  
       static int __init init_acpi_pm_clocksource(void)
       {
-- -    u32 value1, value2;
-- -    unsigned int i;
++ +    cycle_t value1, value2;
-    -   unsigned int i, j, good = 0;
++ ++   unsigned int i, j = 0;
       
         if (!pmtmr_ioport)
                 return -ENODEV;
@@@@@@@ -187,24 -187,24 -193,29 -187,24 -191,32 -193,29 +193,29 @@@@@@@
                                                 clocksource_acpi_pm.shift);
       
         /* "verify" this timing source: */
-- -    value1 = read_pmtmr();
-- -    for (i = 0; i < 10000; i++) {
-- -            value2 = read_pmtmr();
-- -            if (value2 == value1)
-- -                    continue;
-- -            if (value2 > value1)
-- -                    goto pm_good;
-- -            if ((value2 < value1) && ((value2) < 0xFFF))
-- -                    goto pm_good;
-- -            printk(KERN_INFO "PM-Timer had inconsistent results:"
-- -                    " 0x%#x, 0x%#x - aborting.\n", value1, value2);
-- -            return -EINVAL;
++ +    for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) {
++ ++           udelay(100 * j);
++ +            value1 = clocksource_acpi_pm.read();
-    -           for (i = 0; i < 10000; i++) {
++ ++           for (i = 0; i < ACPI_PM_READ_CHECKS; i++) {
++ +                    value2 = clocksource_acpi_pm.read();
++ +                    if (value2 == value1)
++ +                            continue;
++ +                    if (value2 > value1)
-    -                           good++;
++ +                            break;
++ +                    if ((value2 < value1) && ((value2) < 0xFFF))
-    -                           good++;
++ +                            break;
++ +                    printk(KERN_INFO "PM-Timer had inconsistent results:"
++ +                           " 0x%#llx, 0x%#llx - aborting.\n",
++ +                           value1, value2);
++ +                    return -EINVAL;
++ +            }
-    -           udelay(300 * i);
-    -   }
-    - 
-    -   if (good != ACPI_PM_MONOTONICITY_CHECKS) {
-    -           printk(KERN_INFO "PM-Timer failed consistency check "
-    -                  " (0x%#llx) - aborting.\n", value1);
-    -           return -ENODEV;
++ ++           if (i == ACPI_PM_READ_CHECKS) {
++ ++                   printk(KERN_INFO "PM-Timer failed consistency check "
++ ++                          " (0x%#llx) - aborting.\n", value1);
++ ++                   return -ENODEV;
++ ++           }
         }
-- -    printk(KERN_INFO "PM-Timer had no reasonable result:"
-- -                    " 0x%#x - aborting.\n", value1);
-- -    return -ENODEV;
       
-- -  pm_good:
         if (verify_pmtmr_rate() != 0)
                 return -ENODEV;
       
@@@@@@@ -226,12 -226,9 -237,9 -226,9 -238,9 -237,9 +237,12 @@@@@@@ static int __init parse_pmtmr(char *arg
       
         if (strict_strtoul(arg, 16, &base))
                 return -EINVAL;
- -----
+ +++++#ifdef CONFIG_X86_64
+ +++++  if (base > UINT_MAX)
+ +++++          return -ERANGE;
+ +++++#endif
         printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n",
- -----         (unsigned int)pmtmr_ioport, base);
+ +++++         pmtmr_ioport, base);
         pmtmr_ioport = base;
       
         return 1;
diff --combined fs/binfmt_elf.c

index 655ed8d30a86ef7e963414a9cb56d6ed7dc5d3c4,655ed8d30a86ef7e963414a9cb56d6ed7dc5d3c4,c76afa26edf735c7644e32b931ac675cbbf08454,655ed8d30a86ef7e963414a9cb56d6ed7dc5d3c4,a8635f637038f9119623555a8dd70d275cd7342f,c76afa26edf735c7644e32b931ac675cbbf08454..83d72006e29d6f68ebfb52024323221d73d5ab31
--- 1/fs/binfmt_elf.c
--- 2/fs/binfmt_elf.c
--- 3/fs/binfmt_elf.c
--- 4/fs/binfmt_elf.c
--- 5/fs/binfmt_elf.c
--- 6/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@@@@@@ -683,7 -683,7 -683,7 -683,7 -683,7 -683,7 +683,7 @@@@@@@ static int load_elf_binary(struct linux
                          * switch really is going to happen - do this in
                          * flush_thread().      - akpm
                          */
-- --                   SET_PERSONALITY(loc->elf_ex, 0);
++ ++                   SET_PERSONALITY(loc->elf_ex);
       
                         interpreter = open_exec(elf_interpreter);
                         retval = PTR_ERR(interpreter);
@@@@@@@ -734,7 -734,7 -734,7 -734,7 -734,7 -734,7 +734,7 @@@@@@@
                         goto out_free_dentry;
         } else {
                 /* Executables without an interpreter also need a personality  */
-- --           SET_PERSONALITY(loc->elf_ex, 0);
++ ++           SET_PERSONALITY(loc->elf_ex);
         }
       
         /* Flush all traces of the currently running executable */
@@@@@@@ -748,7 -748,7 -748,7 -748,7 -748,7 -748,7 +748,7 @@@@@@@
       
         /* Do this immediately, since STACK_TOP as used in setup_arg_pages
            may depend on the personality.  */
-- --   SET_PERSONALITY(loc->elf_ex, 0);
++ ++   SET_PERSONALITY(loc->elf_ex);
         if (elf_read_implies_exec(loc->elf_ex, executable_stack))
                 current->personality |= READ_IMPLIES_EXEC;
       
@@@@@@@ -1333,20 -1333,20 -1333,20 -1333,20 -1333,15 -1333,20 +1333,15 @@@@@@@ static void fill_prstatus(struct elf_pr
         prstatus->pr_pgrp = task_pgrp_vnr(p);
         prstatus->pr_sid = task_session_vnr(p);
         if (thread_group_leader(p)) {
++++ +          struct task_cputime cputime;
++++ +
                 /*
---- -           * This is the record for the group leader.  Add in the
---- -           * cumulative times of previous dead threads.  This total
---- -           * won't include the time of each live thread whose state
---- -           * is included in the core dump.  The final total reported
---- -           * to our parent process when it calls wait4 will include
---- -           * those sums as well as the little bit more time it takes
---- -           * this and each other thread to finish dying after the
---- -           * core dump synchronization phase.
++++ +           * This is the record for the group leader.  It shows the
++++ +           * group-wide total, not its individual thread total.
                  */
---- -          cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
---- -                             &prstatus->pr_utime);
---- -          cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
---- -                             &prstatus->pr_stime);
++++ +          thread_group_cputime(p, &cputime);
++++ +          cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
++++ +          cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
         } else {
                 cputime_to_timeval(p->utime, &prstatus->pr_utime);
                 cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --combined fs/proc/array.c

index 0d6eb33597c6627f24586407f7e0a820454b3612,0d6eb33597c6627f24586407f7e0a820454b3612,f4bc0e789539f413e080324ab8209575ba349c42,71c9be59c9c2574045d69c33aa024d618b681b4c,933953c4e407db8d3c6b1608c263fb222d4dcd6c,f4bc0e789539f413e080324ab8209575ba349c42..bb9f4b05703de9b587a1b2cbd36ab651d2613da8
--- 1/fs/proc/array.c
--- 2/fs/proc/array.c
--- 3/fs/proc/array.c
--- 4/fs/proc/array.c
--- 5/fs/proc/array.c
--- 6/fs/proc/array.c
+++ b/fs/proc/array.c
@@@@@@@ -86,11 -86,11 -86,6 -86,11 -86,11 -86,6 +86,6 @@@@@@@
       #include <asm/processor.h>
       #include "internal.h"
       
-- -- /* Gcc optimizes away "strlen(x)" for constant x */
-- -- #define ADDBUF(buffer, string) \
-- -- do { memcpy(buffer, string, strlen(string)); \
-- --      buffer += strlen(string); } while (0)
-- -- 
       static inline void task_name(struct seq_file *m, struct task_struct *p)
       {
         int i;
@@@@@@@ -261,7 -261,7 -256,6 -261,7 -261,7 -256,6 +256,6 @@@@@@@ static inline void task_sig(struct seq_
         sigemptyset(&ignored);
         sigemptyset(&caught);
       
-- --   rcu_read_lock();
         if (lock_task_sighand(p, &flags)) {
                 pending = p->pending.signal;
                 shpending = p->signal->shared_pending.signal;
@@@@@@@ -272,7 -272,7 -266,6 -272,7 -272,7 -266,6 +266,6 @@@@@@@
                 qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
                 unlock_task_sighand(p, &flags);
         }
-- --   rcu_read_unlock();
       
         seq_printf(m, "Threads:\t%d\n", num_threads);
         seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@@@@@@ -337,65 -337,65 -330,6 -337,6 -337,6 -330,6 +330,6 @@@@@@@ int proc_pid_status(struct seq_file *m
         return 0;
       }
       
--    /*
--     * Use precise platform statistics if available:
--     */
--    #ifdef CONFIG_VIRT_CPU_ACCOUNTING
--    static cputime_t task_utime(struct task_struct *p)
--    {
--      return p->utime;
--    }
--    
--    static cputime_t task_stime(struct task_struct *p)
--    {
--      return p->stime;
--    }
--    #else
--    static cputime_t task_utime(struct task_struct *p)
--    {
--      clock_t utime = cputime_to_clock_t(p->utime),
--              total = utime + cputime_to_clock_t(p->stime);
--      u64 temp;
--    
--      /*
--       * Use CFS's precise accounting:
--       */
--      temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
--    
--      if (total) {
--              temp *= utime;
--              do_div(temp, total);
--      }
--      utime = (clock_t)temp;
--    
--      p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
--      return p->prev_utime;
--    }
--    
--    static cputime_t task_stime(struct task_struct *p)
--    {
--      clock_t stime;
--    
--      /*
--       * Use CFS's precise accounting. (we subtract utime from
--       * the total, to make sure the total observed by userspace
--       * grows monotonically - apps rely on that):
--       */
--      stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
--                      cputime_to_clock_t(task_utime(p));
--    
--      if (stime >= 0)
--              p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
--    
--      return p->prev_stime;
--    }
--    #endif
--    
--    static cputime_t task_gtime(struct task_struct *p)
--    {
--      return p->gtime;
--    }
--    
       static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                         struct pid *pid, struct task_struct *task, int whole)
       {
@@@@@@@ -454,20 -454,20 -388,20 -395,20 -395,20 -388,20 +388,20 @@@@@@@
       
                 /* add up live thread stats at the group level */
                 if (whole) {
++++ +                  struct task_cputime cputime;
                         struct task_struct *t = task;
                         do {
                                 min_flt += t->min_flt;
                                 maj_flt += t->maj_flt;
---- -                          utime = cputime_add(utime, task_utime(t));
---- -                          stime = cputime_add(stime, task_stime(t));
                                 gtime = cputime_add(gtime, task_gtime(t));
                                 t = next_thread(t);
                         } while (t != task);
       
                         min_flt += sig->min_flt;
                         maj_flt += sig->maj_flt;
---- -                  utime = cputime_add(utime, sig->utime);
---- -                  stime = cputime_add(stime, sig->stime);
++++ +                  thread_group_cputime(task, &cputime);
++++ +                  utime = cputime.utime;
++++ +                  stime = cputime.stime;
                         gtime = cputime_add(gtime, sig->gtime);
                 }
       
diff --combined include/linux/hrtimer.h

index 6d93dce61cbbc25c8b17d2f8f8c5c9d0e864ee9f,8730b60c94327a413f1395f9094983fe9bf37c48,2f245fe63bda5611ad909c1452aa8a79c4f29eb4,6d93dce61cbbc25c8b17d2f8f8c5c9d0e864ee9f,6d93dce61cbbc25c8b17d2f8f8c5c9d0e864ee9f,2f245fe63bda5611ad909c1452aa8a79c4f29eb4..9a4e35cd5f79d80e52bf3a1509e0438bb6a77ec4
--- 1/include/linux/hrtimer.h
--- 2/include/linux/hrtimer.h
--- 3/include/linux/hrtimer.h
--- 4/include/linux/hrtimer.h
--- 5/include/linux/hrtimer.h
--- 6/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@@@@@@ -47,14 -47,14 -47,22 -47,14 -47,14 -47,22 +47,22 @@@@@@@ enum hrtimer_restart 
        *        HRTIMER_CB_IRQSAFE:             Callback may run in hardirq context
        *        HRTIMER_CB_IRQSAFE_NO_RESTART:  Callback may run in hardirq context and
        *                                        does not restart the timer
-- --  *        HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:  Callback must run in hardirq context
-- --  *                                        Special mode for tick emultation
++ ++  *        HRTIMER_CB_IRQSAFE_PERCPU:      Callback must run in hardirq context
++ ++  *                                        Special mode for tick emulation and
++ ++  *                                        scheduler timer. Such timers are per
++ ++  *                                        cpu and not allowed to be migrated on
++ ++  *                                        cpu unplug.
++ ++  *        HRTIMER_CB_IRQSAFE_UNLOCKED:    Callback should run in hardirq context
++ ++  *                                        with timer->base lock unlocked
++ ++  *                                        used for timers which call wakeup to
++ ++  *                                        avoid lock order problems with rq->lock
        */
       enum hrtimer_cb_mode {
         HRTIMER_CB_SOFTIRQ,
         HRTIMER_CB_IRQSAFE,
         HRTIMER_CB_IRQSAFE_NO_RESTART,
-- --   HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
++ ++   HRTIMER_CB_IRQSAFE_PERCPU,
++ ++   HRTIMER_CB_IRQSAFE_UNLOCKED,
       };
       
       /*
@@@@@@@ -67,9 -67,9 -75,10 -67,9 -67,9 -75,10 +75,10 @@@@@@@
        * 0x02           callback function running
        * 0x04           callback pending (high resolution mode)
        *
-- --  * Special case:
++ ++  * Special cases:
        * 0x03           callback function running and enqueued
        *                (was requeued on another CPU)
++ ++  * 0x09           timer was migrated on CPU hotunplug
        * The "callback function running and enqueued" status is only possible on
        * SMP. It happens for example when a posix timer expired and the callback
        * queued a signal. Between dropping the lock which protects the posix timer
@@@@@@@ -87,6 -87,6 -96,7 -87,6 -87,6 -96,7 +96,7 @@@@@@@
       #define HRTIMER_STATE_ENQUEUED    0x01
       #define HRTIMER_STATE_CALLBACK    0x02
       #define HRTIMER_STATE_PENDING     0x04
++ ++ #define HRTIMER_STATE_MIGRATE     0x08
       
       /**
        * struct hrtimer - the basic hrtimer structure
@@@@@@@ -115,12 -115,12 -125,12 -115,12 -115,12 -125,12 +125,12 @@@@@@@ struct hrtimer 
         enum hrtimer_restart            (*function)(struct hrtimer *);
         struct hrtimer_clock_base       *base;
         unsigned long                   state;
- ----  enum hrtimer_cb_mode            cb_mode;
         struct list_head                cb_entry;
+ ++++  enum hrtimer_cb_mode            cb_mode;
       #ifdef CONFIG_TIMER_STATS
+ ++++  int                             start_pid;
         void                            *start_site;
         char                            start_comm[16];
- ----  int                             start_pid;
       #endif
       };
       
@@@@@@@ -145,10 -145,8 -155,10 -145,10 -145,10 -155,10 +155,8 @@@@@@@ struct hrtimer_sleeper 
        * @first:                pointer to the timer node which expires first
        * @resolution:           the resolution of the clock, in nanoseconds
        * @get_time:             function to retrieve the current time of the clock
- ---- * @get_softirq_time:     function to retrieve the current time from the softirq
        * @softirq_time: the time when running the hrtimer queue in the softirq
        * @offset:               offset of this clock to the monotonic base
- ---- * @reprogram:            function to reprogram the timer event
        */
       struct hrtimer_clock_base {
         struct hrtimer_cpu_base *cpu_base;
@@@@@@@ -157,13 -155,9 -167,13 -157,13 -157,13 -167,13 +165,9 @@@@@@@
         struct rb_node          *first;
         ktime_t                 resolution;
         ktime_t                 (*get_time)(void);
- ----  ktime_t                 (*get_softirq_time)(void);
         ktime_t                 softirq_time;
       #ifdef CONFIG_HIGH_RES_TIMERS
         ktime_t                 offset;
- ----  int                     (*reprogram)(struct hrtimer *t,
- ----                                       struct hrtimer_clock_base *b,
- ----                                       ktime_t n);
       #endif
       };
       
diff --combined include/linux/sched.h

index cfb0d87b99fcafb8ff0eb753e0ea489dc20478dd,cfb0d87b99fcafb8ff0eb753e0ea489dc20478dd,c226c7b82946ce1d830853c4fd3b9bad3d92fa0d,3d9120c5ad1589a0da722e514c370c0a3f1c4fe4,23d9d5464544abef91b6bcbb9b3d9eac6ff03d17,c226c7b82946ce1d830853c4fd3b9bad3d92fa0d..81c68fef4431746411d30c9a936cc8652c3d5b81
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
--- 3/include/linux/sched.h
--- 4/include/linux/sched.h
--- 5/include/linux/sched.h
--- 6/include/linux/sched.h
+++ b/include/linux/sched.h
@@@@@@@ -352,7 -352,7 -352,7 -352,7 -352,7 -352,7 +352,7 @@@@@@@ arch_get_unmapped_area_topdown(struct f
       extern void arch_unmap_area(struct mm_struct *, unsigned long);
       extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
       
-- -- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
++ ++ #if USE_SPLIT_PTLOCKS
       /*
        * The mm counters are not protected by its page_table_lock,
        * so must be incremented atomically.
@@@@@@@ -363,7 -363,7 -363,7 -363,7 -363,7 -363,7 +363,7 @@@@@@@
       #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
       #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
       
-- -- #else  /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++ ++ #else  /* !USE_SPLIT_PTLOCKS */
       /*
        * The mm counters are protected by its page_table_lock,
        * so can be incremented directly.
@@@@@@@ -374,7 -374,7 -374,7 -374,7 -374,7 -374,7 +374,7 @@@@@@@
       #define inc_mm_counter(mm, member) (mm)->_##member++
       #define dec_mm_counter(mm, member) (mm)->_##member--
       
-- -- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++ ++ #endif /* !USE_SPLIT_PTLOCKS */
       
       #define get_mm_rss(mm)                                    \
         (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
@@@@@@@ -425,6 -425,6 -425,6 -425,6 -425,39 -425,6 +425,39 @@@@@@@ struct pacct_struct 
         unsigned long           ac_minflt, ac_majflt;
       };
       
++++ +/**
++++ + * struct task_cputime - collected CPU time counts
++++ + * @utime:                time spent in user mode, in &cputime_t units
++++ + * @stime:                time spent in kernel mode, in &cputime_t units
++++ + * @sum_exec_runtime:     total time spent on the CPU, in nanoseconds
++++ + *
++++ + * This structure groups together three kinds of CPU time that are
++++ + * tracked for threads and thread groups.  Most things considering
++++ + * CPU time want to group these counts together and treat all three
++++ + * of them in parallel.
++++ + */
++++ +struct task_cputime {
++++ +  cputime_t utime;
++++ +  cputime_t stime;
++++ +  unsigned long long sum_exec_runtime;
++++ +};
++++ +/* Alternate field names when used to cache expirations. */
++++ +#define prof_exp  stime
++++ +#define virt_exp  utime
++++ +#define sched_exp sum_exec_runtime
++++ +
++++ +/**
++++ + * struct thread_group_cputime - thread group interval timer counts
++++ + * @totals:               thread group interval timers; substructure for
++++ + *                        uniprocessor kernel, per-cpu for SMP kernel.
++++ + *
++++ + * This structure contains the version of task_cputime, above, that is
++++ + * used for thread group CPU clock calculations.
++++ + */
++++ +struct thread_group_cputime {
++++ +  struct task_cputime *totals;
++++ +};
++++ +
       /*
        * NOTE! "signal_struct" does not have it's own
        * locking, because a shared signal_struct always
@@@@@@@ -451,8 -451,8 -451,8 -451,8 -484,8 -451,8 +484,8 @@@@@@@ struct signal_struct 
          * - everyone except group_exit_task is stopped during signal delivery
          *   of fatal signals, group_exit_task processes the signal.
          */
-- --   struct task_struct      *group_exit_task;
         int                     notify_count;
++ ++   struct task_struct      *group_exit_task;
       
         /* thread group stop support, overloads group_exit_code too */
         int                     group_stop_count;
@@@@@@@ -470,6 -470,6 -470,6 -470,6 -503,17 -470,6 +503,17 @@@@@@@
         cputime_t it_prof_expires, it_virt_expires;
         cputime_t it_prof_incr, it_virt_incr;
       
++++ +  /*
++++ +   * Thread group totals for process CPU clocks.
++++ +   * See thread_group_cputime(), et al, for details.
++++ +   */
++++ +  struct thread_group_cputime cputime;
++++ +
++++ +  /* Earliest-expiration cache. */
++++ +  struct task_cputime cputime_expires;
++++ +
++++ +  struct list_head cpu_timers[3];
++++ +
         /* job control IDs */
       
         /*
@@@@@@@ -500,7 -500,7 -500,7 -500,7 -544,7 -500,7 +544,7 @@@@@@@
          * Live threads maintain their own counters and add to these
          * in __exit_signal, except for the group leader.
          */
---- -  cputime_t utime, stime, cutime, cstime;
++++ +  cputime_t cutime, cstime;
         cputime_t gtime;
         cputime_t cgtime;
         unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@@@@@@ -508,14 -508,14 -508,14 -508,14 -552,6 -508,14 +552,6 @@@@@@@
         unsigned long inblock, oublock, cinblock, coublock;
         struct task_io_accounting ioac;
       
---- -  /*
---- -   * Cumulative ns of scheduled CPU time for dead threads in the
---- -   * group, not including a zombie group leader.  (This only differs
---- -   * from jiffies_to_ns(utime + stime) if sched_clock uses something
---- -   * other than jiffies.)
---- -   */
---- -  unsigned long long sum_sched_runtime;
---- -
         /*
          * We don't bother to synchronize most readers of this at all,
          * because there is no reader checking a limit that actually needs
@@@@@@@ -527,8 -527,8 -527,8 -527,8 -563,6 -527,8 +563,6 @@@@@@@
          */
         struct rlimit rlim[RLIM_NLIMITS];
       
---- -  struct list_head cpu_timers[3];
---- -
         /* keep the process-shared keyrings here so that they do the right
          * thing in threads created with CLONE_THREAD */
       #ifdef CONFIG_KEYS
@@@@@@@ -824,6 -824,6 -824,9 -824,6 -858,6 -824,9 +858,9 @@@@@@@ struct sched_domain 
         unsigned int ttwu_move_affine;
         unsigned int ttwu_move_balance;
       #endif
++ ++ #ifdef CONFIG_SCHED_DEBUG
++ ++   char *name;
++ ++ #endif
       };
       
       extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
@@@@@@@ -897,7 -897,7 -900,7 -897,7 -931,7 -900,7 +934,7 @@@@@@@ struct sched_class 
         void (*yield_task) (struct rq *rq);
         int  (*select_task_rq)(struct task_struct *p, int sync);
       
-- --   void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
++ ++   void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
       
         struct task_struct * (*pick_next_task) (struct rq *rq);
         void (*put_prev_task) (struct rq *rq, struct task_struct *p);
@@@@@@@ -1010,8 -1010,8 -1013,8 -1010,8 -1044,8 -1013,8 +1047,8 @@@@@@@ struct sched_entity 
       
       struct sched_rt_entity {
         struct list_head run_list;
-- --   unsigned int time_slice;
         unsigned long timeout;
++ ++   unsigned int time_slice;
         int nr_cpus_allowed;
       
         struct sched_rt_entity *back;
@@@@@@@ -1134,8 -1134,8 -1137,8 -1134,8 -1168,7 -1137,8 +1171,7 @@@@@@@ struct task_struct 
       /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
         unsigned long min_flt, maj_flt;
       
---- -          cputime_t it_prof_expires, it_virt_expires;
---- -  unsigned long long it_sched_expires;
++++ +  struct task_cputime cputime_expires;
         struct list_head cpu_timers[3];
       
       /* process credentials */
@@@@@@@ -1475,6 -1475,6 -1478,10 -1475,10 -1508,10 -1478,10 +1511,10 @@@@@@@ static inline void put_task_struct(stru
                 __put_task_struct(t);
       }
       
++    extern cputime_t task_utime(struct task_struct *p);
++    extern cputime_t task_stime(struct task_struct *p);
++    extern cputime_t task_gtime(struct task_struct *p);
++    
       /*
        * Per process flags
        */
@@@@@@@ -1581,6 -1581,6 -1588,6 -1585,6 -1618,7 -1588,6 +1621,7 @@@@@@@ extern unsigned long long cpu_clock(in
       
       extern unsigned long long
       task_sched_runtime(struct task_struct *task);
++++ +extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
       
       /* sched_exec is called by processes performing an exec */
       #ifdef CONFIG_SMP
@@@@@@@ -2077,6 -2077,6 -2084,6 -2081,6 -2115,30 -2084,6 +2118,30 @@@@@@@ static inline int spin_needbreak(spinlo
       #endif
       }
       
++++ +/*
++++ + * Thread group CPU time accounting.
++++ + */
++++ +
++++ +extern int thread_group_cputime_alloc(struct task_struct *);
++++ +extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
++++ +
++++ +static inline void thread_group_cputime_init(struct signal_struct *sig)
++++ +{
++++ +  sig->cputime.totals = NULL;
++++ +}
++++ +
++++ +static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
++++ +{
++++ +  if (curr->signal->cputime.totals)
++++ +          return 0;
++++ +  return thread_group_cputime_alloc(curr);
++++ +}
++++ +
++++ +static inline void thread_group_cputime_free(struct signal_struct *sig)
++++ +{
++++ +  free_percpu(sig->cputime.totals);
++++ +}
++++ +
       /*
        * Reevaluate whether the task has signals pending delivery.
        * Wake the task if so.
diff --combined include/linux/time.h

index 205f974b9ebf1677e5fd269a5c17fdd7acdeacd0,e15206a7e82ec9278b25fc062585302f9ed71687,51e883df0fa51fe598832747477533fc4303e30a,e15206a7e82ec9278b25fc062585302f9ed71687,1b70b3c293e934536a0a8cf5dd64046767de5c28,51e883df0fa51fe598832747477533fc4303e30a..4f1c9db577079ed3cc688def99e476219b5aa9b0
--- 1/include/linux/time.h
--- 2/include/linux/time.h
--- 3/include/linux/time.h
--- 4/include/linux/time.h
--- 5/include/linux/time.h
--- 6/include/linux/time.h
+++ b/include/linux/time.h
@@@@@@@ -29,6 -29,6 -29,8 -29,6 -29,6 -29,8 +29,8 @@@@@@@ struct timezone 
       
       #ifdef __KERNEL__
       
++ ++ extern struct timezone sys_tz;
++ ++ 
       /* Parameters used to convert the timespec values: */
       #define MSEC_PER_SEC      1000L
       #define USEC_PER_MSEC     1000L
@@@@@@@ -117,7 -117,6 -119,6 -117,6 -117,6 -119,6 +119,7 @@@@@@@ extern int do_setitimer(int which, stru
       extern unsigned int alarm_setitimer(unsigned int seconds);
       extern int do_getitimer(int which, struct itimerval *value);
       extern void getnstimeofday(struct timespec *tv);
+ +++++extern void getrawmonotonic(struct timespec *ts);
       extern void getboottime(struct timespec *ts);
       extern void monotonic_to_bootbased(struct timespec *ts);
       
@@@@@@@ -126,6 -125,6 -127,6 -125,6 -125,9 -127,6 +128,9 @@@@@@@ extern int timekeeping_valid_for_hres(v
       extern void update_wall_time(void);
       extern void update_xtime_cache(u64 nsec);
       
++++ +struct tms;
++++ +extern void do_sys_times(struct tms *);
++++ +
       /**
        * timespec_to_ns - Convert timespec to nanoseconds
        * @ts:           pointer to the timespec variable to be converted
@@@@@@@ -215,7 -214,6 -216,6 -214,6 -217,6 -216,6 +220,7 @@@@@@@ struct itimerval 
       #define CLOCK_MONOTONIC                   1
       #define CLOCK_PROCESS_CPUTIME_ID  2
       #define CLOCK_THREAD_CPUTIME_ID           3
+ +++++#define CLOCK_MONOTONIC_RAW               4
       
       /*
        * The IDs of various hardware clocks:
diff --combined kernel/compat.c

index 32c254a8ab9af07ae3ac102972063956592e0b3b,32c254a8ab9af07ae3ac102972063956592e0b3b,143990e48cb9aab2af2f04d93a372d63dc570882,32c254a8ab9af07ae3ac102972063956592e0b3b,72650e39b3e6bc296368913b71dae42893844acd,143990e48cb9aab2af2f04d93a372d63dc570882..8eafe3eb50d9feb76dce5b289a4a7278321e4d8a
--- 1/kernel/compat.c
--- 2/kernel/compat.c
--- 3/kernel/compat.c
--- 4/kernel/compat.c
--- 5/kernel/compat.c
--- 6/kernel/compat.c
+++ b/kernel/compat.c
@@@@@@@ -23,9 -23,9 -23,67 -23,9 -23,10 -23,67 +23,68 @@@@@@@
       #include <linux/timex.h>
       #include <linux/migrate.h>
       #include <linux/posix-timers.h>
++++ +#include <linux/times.h>
       
       #include <asm/uaccess.h>
       
++ ++ /*
++ ++  * Note that the native side is already converted to a timespec, because
++ ++  * that's what we want anyway.
++ ++  */
++ ++ static int compat_get_timeval(struct timespec *o,
++ ++           struct compat_timeval __user *i)
++ ++ {
++ ++   long usec;
++ ++ 
++ ++   if (get_user(o->tv_sec, &i->tv_sec) ||
++ ++       get_user(usec, &i->tv_usec))
++ ++           return -EFAULT;
++ ++   o->tv_nsec = usec * 1000;
++ ++   return 0;
++ ++ }
++ ++ 
++ ++ static int compat_put_timeval(struct compat_timeval __user *o,
++ ++           struct timeval *i)
++ ++ {
++ ++   return (put_user(i->tv_sec, &o->tv_sec) ||
++ ++           put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
++ ++ }
++ ++ 
++ ++ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
++ ++           struct timezone __user *tz)
++ ++ {
++ ++   if (tv) {
++ ++           struct timeval ktv;
++ ++           do_gettimeofday(&ktv);
++ ++           if (compat_put_timeval(tv, &ktv))
++ ++                   return -EFAULT;
++ ++   }
++ ++   if (tz) {
++ ++           if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
++ ++                   return -EFAULT;
++ ++   }
++ ++ 
++ ++   return 0;
++ ++ }
++ ++ 
++ ++ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
++ ++           struct timezone __user *tz)
++ ++ {
++ ++   struct timespec kts;
++ ++   struct timezone ktz;
++ ++ 
++ ++   if (tv) {
++ ++           if (compat_get_timeval(&kts, tv))
++ ++                   return -EFAULT;
++ ++   }
++ ++   if (tz) {
++ ++           if (copy_from_user(&ktz, tz, sizeof(ktz)))
++ ++                   return -EFAULT;
++ ++   }
++ ++ 
++ ++   return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
++ ++ }
++ ++ 
       int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
       {
         return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@@@@@@ -150,49 -150,49 -208,49 -150,49 -151,23 -208,49 +209,23 @@@@@@@ asmlinkage long compat_sys_setitimer(in
         return 0;
       }
       
++++ +static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
++++ +{
++++ +  return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
++++ +}
++++ +
       asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
       {
---- -  /*
---- -   *      In the SMP world we might just be unlucky and have one of
---- -   *      the times increment as we use it. Since the value is an
---- -   *      atomically safe type this is just fine. Conceptually its
---- -   *      as if the syscall took an instant longer to occur.
---- -   */
         if (tbuf) {
++++ +          struct tms tms;
                 struct compat_tms tmp;
---- -          struct task_struct *tsk = current;
---- -          struct task_struct *t;
---- -          cputime_t utime, stime, cutime, cstime;
---- -
---- -          read_lock(&tasklist_lock);
---- -          utime = tsk->signal->utime;
---- -          stime = tsk->signal->stime;
---- -          t = tsk;
---- -          do {
---- -                  utime = cputime_add(utime, t->utime);
---- -                  stime = cputime_add(stime, t->stime);
---- -                  t = next_thread(t);
---- -          } while (t != tsk);
---- -
---- -          /*
---- -           * While we have tasklist_lock read-locked, no dying thread
---- -           * can be updating current->signal->[us]time.  Instead,
---- -           * we got their counts included in the live thread loop.
---- -           * However, another thread can come in right now and
---- -           * do a wait call that updates current->signal->c[us]time.
---- -           * To make sure we always see that pair updated atomically,
---- -           * we take the siglock around fetching them.
---- -           */
---- -          spin_lock_irq(&tsk->sighand->siglock);
---- -          cutime = tsk->signal->cutime;
---- -          cstime = tsk->signal->cstime;
---- -          spin_unlock_irq(&tsk->sighand->siglock);
---- -          read_unlock(&tasklist_lock);
---- -
---- -          tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
---- -          tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
---- -          tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
---- -          tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
++++ +
++++ +          do_sys_times(&tms);
++++ +          /* Convert our struct tms to the compat version. */
++++ +          tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
++++ +          tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
++++ +          tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
++++ +          tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
                 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
                         return -EFAULT;
         }
diff --combined kernel/exit.c

index 38ec40630149bd07b97f06d406d9398763a6de8c,38ec40630149bd07b97f06d406d9398763a6de8c,0ef4673e351bddd4e738f88941511efa02f062fa,16395644a98ff8c060b2f5fd776fe7abadd61c6a,40036ac04271a016d7946977f0e9655a33183566,0ef4673e351bddd4e738f88941511efa02f062fa..059b38cae3848ea62fc7f03fb5a08c3dfbb53f10
--- 1/kernel/exit.c
--- 2/kernel/exit.c
--- 3/kernel/exit.c
--- 4/kernel/exit.c
--- 5/kernel/exit.c
--- 6/kernel/exit.c
+++ b/kernel/exit.c
@@@@@@@ -112,9 -112,9 -112,9 -112,9 -112,7 -112,9 +112,7 @@@@@@@ static void __exit_signal(struct task_s
                  * We won't ever get here for the group leader, since it
                  * will have been the last reference on the signal_struct.
                  */
--              sig->utime = cputime_add(sig->utime, tsk->utime);
--              sig->stime = cputime_add(sig->stime, tsk->stime);
--              sig->gtime = cputime_add(sig->gtime, tsk->gtime);
-  -- -          sig->utime = cputime_add(sig->utime, task_utime(tsk));
-  -- -          sig->stime = cputime_add(sig->stime, task_stime(tsk));
++              sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
                 sig->min_flt += tsk->min_flt;
                 sig->maj_flt += tsk->maj_flt;
                 sig->nvcsw += tsk->nvcsw;
@@@@@@@ -122,7 -122,7 -122,7 -122,7 -120,6 -122,7 +120,6 @@@@@@@
                 sig->inblock += task_io_get_inblock(tsk);
                 sig->oublock += task_io_get_oublock(tsk);
                 task_io_accounting_add(&sig->ioac, &tsk->ioac);
---- -          sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
                 sig = NULL; /* Marker for below. */
         }
       
@@@@@@@ -583,8 -583,8 -583,6 -583,8 -580,8 -583,6 +580,6 @@@@@@@ mm_need_new_owner(struct mm_struct *mm
          * If there are other users of the mm and the owner (us) is exiting
          * we need to find a new owner to take on the responsibility.
          */
-- --   if (!mm)
-- --           return 0;
         if (atomic_read(&mm->mm_users) <= 1)
                 return 0;
         if (mm->owner != p)
@@@@@@@ -627,29 -627,29 -625,38 -627,29 -624,29 -625,38 +622,38 @@@@@@@ retry
         } while_each_thread(g, c);
       
         read_unlock(&tasklist_lock);
++ ++   /*
++ ++    * We found no owner yet mm_users > 1: this implies that we are
++ ++    * most likely racing with swapoff (try_to_unuse()) or /proc or
++ ++    * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
++ ++    * so that subsystems can understand the callback and take action.
++ ++    */
++ ++   down_write(&mm->mmap_sem);
++ ++   cgroup_mm_owner_callbacks(mm->owner, NULL);
++ ++   mm->owner = NULL;
++ ++   up_write(&mm->mmap_sem);
         return;
       
       assign_new_owner:
         BUG_ON(c == p);
         get_task_struct(c);
++ ++   read_unlock(&tasklist_lock);
++ ++   down_write(&mm->mmap_sem);
         /*
          * The task_lock protects c->mm from changing.
          * We always want mm->owner->mm == mm
          */
         task_lock(c);
-- --   /*
-- --    * Delay read_unlock() till we have the task_lock()
-- --    * to ensure that c does not slip away underneath us
-- --    */
-- --   read_unlock(&tasklist_lock);
         if (c->mm != mm) {
                 task_unlock(c);
++ ++           up_write(&mm->mmap_sem);
                 put_task_struct(c);
                 goto retry;
         }
         cgroup_mm_owner_callbacks(mm->owner, c);
         mm->owner = c;
         task_unlock(c);
++ ++   up_write(&mm->mmap_sem);
         put_task_struct(c);
       }
       #endif /* CONFIG_MM_OWNER */
@@@@@@@ -831,26 -831,26 -838,50 -831,50 -828,50 -838,50 +835,50 @@@@@@@ static void reparent_thread(struct task
        * the child reaper process (ie "init") in our pid
        * space.
        */
++    static struct task_struct *find_new_reaper(struct task_struct *father)
++    {
++      struct pid_namespace *pid_ns = task_active_pid_ns(father);
++      struct task_struct *thread;
++    
++      thread = father;
++      while_each_thread(father, thread) {
++              if (thread->flags & PF_EXITING)
++                      continue;
++              if (unlikely(pid_ns->child_reaper == father))
++                      pid_ns->child_reaper = thread;
++              return thread;
++      }
++    
++      if (unlikely(pid_ns->child_reaper == father)) {
++              write_unlock_irq(&tasklist_lock);
++              if (unlikely(pid_ns == &init_pid_ns))
++                      panic("Attempted to kill init!");
++    
++              zap_pid_ns_processes(pid_ns);
++              write_lock_irq(&tasklist_lock);
++              /*
++               * We can not clear ->child_reaper or leave it alone.
++               * There may be stealth EXIT_DEAD tasks on ->children,
++               * forget_original_parent() must move them somewhere.
++               */
++              pid_ns->child_reaper = init_pid_ns.child_reaper;
++      }
++    
++      return pid_ns->child_reaper;
++    }
++    
       static void forget_original_parent(struct task_struct *father)
       {
--      struct task_struct *p, *n, *reaper = father;
++      struct task_struct *p, *n, *reaper;
         LIST_HEAD(ptrace_dead);
       
         write_lock_irq(&tasklist_lock);
--    
++      reaper = find_new_reaper(father);
         /*
          * First clean up ptrace if we were using it.
          */
         ptrace_exit(father, &ptrace_dead);
       
--      do {
--              reaper = next_thread(reaper);
--              if (reaper == father) {
--                      reaper = task_child_reaper(father);
--                      break;
--              }
--      } while (reaper->flags & PF_EXITING);
--    
         list_for_each_entry_safe(p, n, &father->children, sibling) {
                 p->real_parent = reaper;
                 if (p->parent == father) {
@@@@@@@ -918,8 -918,8 -949,8 -942,8 -939,8 -949,8 +946,8 @@@@@@@ static void exit_notify(struct task_str
       
         /* mt-exec, de_thread() is waiting for us */
         if (thread_group_leader(tsk) &&
--          tsk->signal->notify_count < 0 &&
--          tsk->signal->group_exit_task)
++          tsk->signal->group_exit_task &&
++          tsk->signal->notify_count < 0)
                 wake_up_process(tsk->signal->group_exit_task);
       
         write_unlock_irq(&tasklist_lock);
@@@@@@@ -959,39 -959,39 -990,6 -983,6 -980,6 -990,6 +987,6 @@@@@@@ static void check_stack_usage(void
       static inline void check_stack_usage(void) {}
       #endif
       
--    static inline void exit_child_reaper(struct task_struct *tsk)
--    {
--      if (likely(tsk->group_leader != task_child_reaper(tsk)))
--              return;
--    
--      if (tsk->nsproxy->pid_ns == &init_pid_ns)
--              panic("Attempted to kill init!");
--    
--      /*
--       * @tsk is the last thread in the 'cgroup-init' and is exiting.
--       * Terminate all remaining processes in the namespace and reap them
--       * before exiting @tsk.
--       *
--       * Note that @tsk (last thread of cgroup-init) may not necessarily
--       * be the child-reaper (i.e main thread of cgroup-init) of the
--       * namespace i.e the child_reaper may have already exited.
--       *
--       * Even after a child_reaper exits, we let it inherit orphaned children,
--       * because, pid_ns->child_reaper remains valid as long as there is
--       * at least one living sub-thread in the cgroup init.
--    
--       * This living sub-thread of the cgroup-init will be notified when
--       * a child inherited by the 'child-reaper' exits (do_notify_parent()
--       * uses __group_send_sig_info()). Further, when reaping child processes,
--       * do_wait() iterates over children of all living sub threads.
--    
--       * i.e even though 'child_reaper' thread is listed as the parent of the
--       * orphaned children, any living sub-thread in the cgroup-init can
--       * perform the role of the child_reaper.
--       */
--      zap_pid_ns_processes(tsk->nsproxy->pid_ns);
--    }
--    
       NORET_TYPE void do_exit(long code)
       {
         struct task_struct *tsk = current;
@@@@@@@ -1051,7 -1051,7 -1049,6 -1042,6 -1039,6 -1049,6 +1046,6 @@@@@@@
         }
         group_dead = atomic_dec_and_test(&tsk->signal->live);
         if (group_dead) {
--              exit_child_reaper(tsk);
                 hrtimer_cancel(&tsk->signal->real_timer);
                 exit_itimers(tsk->signal);
         }
@@@@@@@ -1304,6 -1304,6 -1301,6 -1294,6 -1291,7 -1301,6 +1298,7 @@@@@@@ static int wait_task_zombie(struct task
         if (likely(!traced)) {
                 struct signal_struct *psig;
                 struct signal_struct *sig;
++++ +          struct task_cputime cputime;
       
                 /*
                  * The resource counters for the group leader are in its
@@@@@@@ -1319,20 -1319,20 -1316,20 -1309,20 -1307,23 -1316,20 +1314,23 @@@@@@@
                  * need to protect the access to p->parent->signal fields,
                  * as other threads in the parent group can be right
                  * here reaping other children at the same time.
++++ +           *
++++ +           * We use thread_group_cputime() to get times for the thread
++++ +           * group, which consolidates times for all threads in the
++++ +           * group including the group leader.
                  */
                 spin_lock_irq(&p->parent->sighand->siglock);
                 psig = p->parent->signal;
                 sig = p->signal;
++++ +          thread_group_cputime(p, &cputime);
                 psig->cutime =
                         cputime_add(psig->cutime,
---- -                  cputime_add(p->utime,
---- -                  cputime_add(sig->utime,
---- -                              sig->cutime)));
++++ +                  cputime_add(cputime.utime,
++++ +                              sig->cutime));
                 psig->cstime =
                         cputime_add(psig->cstime,
---- -                  cputime_add(p->stime,
---- -                  cputime_add(sig->stime,
---- -                              sig->cstime)));
++++ +                  cputime_add(cputime.stime,
++++ +                              sig->cstime));
                 psig->cgtime =
                         cputime_add(psig->cgtime,
                         cputime_add(p->gtime,
diff --combined kernel/fork.c

index 7ce2ebe847964ecd0701c3c74c18994e3eebcf26,7ce2ebe847964ecd0701c3c74c18994e3eebcf26,30de644a40c4d4d9617d650589f4c90da1e977a2,7ce2ebe847964ecd0701c3c74c18994e3eebcf26,021ae012cc757096bad23977c55f352076e9a231,30de644a40c4d4d9617d650589f4c90da1e977a2..44e64d7ba29b8ecba78bc093ae18b8c0325e4047
--- 1/kernel/fork.c
--- 2/kernel/fork.c
--- 3/kernel/fork.c
--- 4/kernel/fork.c
--- 5/kernel/fork.c
--- 6/kernel/fork.c
+++ b/kernel/fork.c
@@@@@@@ -759,15 -759,15 -759,15 -759,15 -759,44 -759,15 +759,44 @@@@@@@ void __cleanup_sighand(struct sighand_s
                 kmem_cache_free(sighand_cachep, sighand);
       }
       
++++ +
++++ +/*
++++ + * Initialize POSIX timer handling for a thread group.
++++ + */
++++ +static void posix_cpu_timers_init_group(struct signal_struct *sig)
++++ +{
++++ +  /* Thread group counters. */
++++ +  thread_group_cputime_init(sig);
++++ +
++++ +  /* Expiration times and increments. */
++++ +  sig->it_virt_expires = cputime_zero;
++++ +  sig->it_virt_incr = cputime_zero;
++++ +  sig->it_prof_expires = cputime_zero;
++++ +  sig->it_prof_incr = cputime_zero;
++++ +
++++ +  /* Cached expiration times. */
++++ +  sig->cputime_expires.prof_exp = cputime_zero;
++++ +  sig->cputime_expires.virt_exp = cputime_zero;
++++ +  sig->cputime_expires.sched_exp = 0;
++++ +
++++ +  /* The timer lists. */
++++ +  INIT_LIST_HEAD(&sig->cpu_timers[0]);
++++ +  INIT_LIST_HEAD(&sig->cpu_timers[1]);
++++ +  INIT_LIST_HEAD(&sig->cpu_timers[2]);
++++ +}
++++ +
       static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
       {
         struct signal_struct *sig;
         int ret;
       
         if (clone_flags & CLONE_THREAD) {
---- -          atomic_inc(&current->signal->count);
---- -          atomic_inc(&current->signal->live);
---- -          return 0;
++++ +          ret = thread_group_cputime_clone_thread(current);
++++ +          if (likely(!ret)) {
++++ +                  atomic_inc(&current->signal->count);
++++ +                  atomic_inc(&current->signal->live);
++++ +          }
++++ +          return ret;
         }
         sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
         tsk->signal = sig;
@@@@@@@ -795,39 -795,39 -795,40 -795,39 -824,24 -795,40 +824,25 @@@@@@@
         sig->it_real_incr.tv64 = 0;
         sig->real_timer.function = it_real_fn;
       
---- -  sig->it_virt_expires = cputime_zero;
---- -  sig->it_virt_incr = cputime_zero;
---- -  sig->it_prof_expires = cputime_zero;
---- -  sig->it_prof_incr = cputime_zero;
---- -
         sig->leader = 0;        /* session leadership doesn't inherit */
         sig->tty_old_pgrp = NULL;
++ ++   sig->tty = NULL;
       
---- -  sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
++++ +  sig->cutime = sig->cstime = cputime_zero;
         sig->gtime = cputime_zero;
         sig->cgtime = cputime_zero;
         sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
         sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
         sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
         task_io_accounting_init(&sig->ioac);
---- -  sig->sum_sched_runtime = 0;
---- -  INIT_LIST_HEAD(&sig->cpu_timers[0]);
---- -  INIT_LIST_HEAD(&sig->cpu_timers[1]);
---- -  INIT_LIST_HEAD(&sig->cpu_timers[2]);
         taskstats_tgid_init(sig);
       
         task_lock(current->group_leader);
         memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
         task_unlock(current->group_leader);
       
---- -  if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
---- -          /*
---- -           * New sole thread in the process gets an expiry time
---- -           * of the whole CPU time limit.
---- -           */
---- -          tsk->it_prof_expires =
---- -                  secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
---- -  }
++++ +  posix_cpu_timers_init_group(sig);
++++ +
         acct_init_pacct(&sig->pacct);
       
         tty_audit_fork(sig);
@@@@@@@ -837,7 -837,7 -838,8 -837,7 -851,8 -838,8 +852,9 @@@@@@@
       
       void __cleanup_signal(struct signal_struct *sig)
       {
++++ +  thread_group_cputime_free(sig);
         exit_thread_group_keys(sig);
++ ++   tty_kref_put(sig->tty);
         kmem_cache_free(signal_cachep, sig);
       }
       
@@@@@@@ -885,6 -885,6 -887,6 -885,6 -900,19 -887,6 +902,19 @@@@@@@ void mm_init_owner(struct mm_struct *mm
       }
       #endif /* CONFIG_MM_OWNER */
       
++++ +/*
++++ + * Initialize POSIX timer handling for a single task.
++++ + */
++++ +static void posix_cpu_timers_init(struct task_struct *tsk)
++++ +{
++++ +  tsk->cputime_expires.prof_exp = cputime_zero;
++++ +  tsk->cputime_expires.virt_exp = cputime_zero;
++++ +  tsk->cputime_expires.sched_exp = 0;
++++ +  INIT_LIST_HEAD(&tsk->cpu_timers[0]);
++++ +  INIT_LIST_HEAD(&tsk->cpu_timers[1]);
++++ +  INIT_LIST_HEAD(&tsk->cpu_timers[2]);
++++ +}
++++ +
       /*
        * This creates a new process as a copy of the old one,
        * but does not actually start it yet.
@@@@@@@ -995,12 -995,12 -997,12 -995,12 -1023,7 -997,12 +1025,7 @@@@@@@ static struct task_struct *copy_process
         task_io_accounting_init(&p->ioac);
         acct_clear_integrals(p);
       
---- -  p->it_virt_expires = cputime_zero;
---- -  p->it_prof_expires = cputime_zero;
---- -  p->it_sched_expires = 0;
---- -  INIT_LIST_HEAD(&p->cpu_timers[0]);
---- -  INIT_LIST_HEAD(&p->cpu_timers[1]);
---- -  INIT_LIST_HEAD(&p->cpu_timers[2]);
++++ +  posix_cpu_timers_init(p);
       
         p->lock_depth = -1;             /* -1 = no lock */
         do_posix_clock_monotonic_gettime(&p->start_time);
@@@@@@@ -1201,21 -1201,21 -1203,21 -1201,21 -1224,6 -1203,21 +1226,6 @@@@@@@
         if (clone_flags & CLONE_THREAD) {
                 p->group_leader = current->group_leader;
                 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
---- -
---- -          if (!cputime_eq(current->signal->it_virt_expires,
---- -                          cputime_zero) ||
---- -              !cputime_eq(current->signal->it_prof_expires,
---- -                          cputime_zero) ||
---- -              current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
---- -              !list_empty(&current->signal->cpu_timers[0]) ||
---- -              !list_empty(&current->signal->cpu_timers[1]) ||
---- -              !list_empty(&current->signal->cpu_timers[2])) {
---- -                  /*
---- -                   * Have child wake up on its first tick to check
---- -                   * for process CPU timers.
---- -                   */
---- -                  p->it_prof_expires = jiffies_to_cputime(1);
---- -          }
         }
       
         if (likely(p->pid)) {
@@@@@@@ -1227,7 -1227,7 -1229,8 -1227,7 -1235,7 -1229,8 +1237,8 @@@@@@@
                                 p->nsproxy->pid_ns->child_reaper = p;
       
                         p->signal->leader_pid = pid;
-- --                   p->signal->tty = current->signal->tty;
++ ++                   tty_kref_put(p->signal->tty);
++ ++                   p->signal->tty = tty_kref_get(current->signal->tty);
                         set_task_pgrp(p, task_pgrp_nr(current));
                         set_task_session(p, task_session_nr(current));
                         attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --combined kernel/hrtimer.c

index b8e4dce80a748dc06c94ed6cb43225903c7f99c1,4d761d50c5297826dba26745970359435a4957f0,cdec83e722fa1b80ee0af0f828d8e47532431a20,b8e4dce80a748dc06c94ed6cb43225903c7f99c1,b8e4dce80a748dc06c94ed6cb43225903c7f99c1,cdec83e722fa1b80ee0af0f828d8e47532431a20..95978f48e039fcbd7e7e233224a7e3f4a0b2f884
--- 1/kernel/hrtimer.c
--- 2/kernel/hrtimer.c
--- 3/kernel/hrtimer.c
--- 4/kernel/hrtimer.c
--- 5/kernel/hrtimer.c
--- 6/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@@@@@@ -672,13 -672,13 -672,14 -672,13 -672,13 -672,14 +672,14 @@@@@@@ static inline int hrtimer_enqueue_repro
                          */
                         BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
                         return 1;
-- --           case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
++ ++           case HRTIMER_CB_IRQSAFE_PERCPU:
++ ++           case HRTIMER_CB_IRQSAFE_UNLOCKED:
                         /*
                          * This is solely for the sched tick emulation with
                          * dynamic tick support to ensure that we do not
                          * restart the tick right on the edge and end up with
                          * the tick timer in the softirq ! The calling site
-- --                    * takes care of this.
++ ++                    * takes care of this. Also used for hrtimer sleeper !
                          */
                         debug_hrtimer_deactivate(timer);
                         return 1;
@@@@@@@ -1245,7 -1245,7 -1246,8 -1245,7 -1245,7 -1246,8 +1246,8 @@@@@@@ static void __run_hrtimer(struct hrtime
         timer_stats_account_hrtimer(timer);
       
         fn = timer->function;
-- --   if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
++ ++   if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
++ ++       timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
                 /*
                  * Used for scheduler timers, avoid lock inversion with
                  * rq->lock and tasklist_lock.
@@@@@@@ -1401,9 -1401,7 -1403,9 -1401,9 -1401,9 -1403,9 +1403,7 @@@@@@@ void hrtimer_run_queues(void
                 if (!base->first)
                         continue;
       
- ----          if (base->get_softirq_time)
- ----                  base->softirq_time = base->get_softirq_time();
- ----          else if (gettime) {
+ ++++          if (gettime) {
                         hrtimer_get_softirq_time(cpu_base);
                         gettime = 0;
                 }
@@@@@@@ -1452,7 -1450,7 -1454,7 -1452,7 -1452,7 -1454,7 +1452,7 @@@@@@@ void hrtimer_init_sleeper(struct hrtime
         sl->timer.function = hrtimer_wakeup;
         sl->task = task;
       #ifdef CONFIG_HIGH_RES_TIMERS
-- --   sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++   sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
       #endif
       }
       
@@@@@@@ -1591,49 -1589,50 -1593,122 -1591,49 -1591,49 -1593,122 +1591,123 @@@@@@@ static void __cpuinit init_hrtimers_cpu
       
       #ifdef CONFIG_HOTPLUG_CPU
       
-- -- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-- --                           struct hrtimer_clock_base *new_base)
++ ++ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
++ ++                           struct hrtimer_clock_base *new_base, int dcpu)
       {
         struct hrtimer *timer;
         struct rb_node *node;
++ ++   int raise = 0;
       
         while ((node = rb_first(&old_base->active))) {
                 timer = rb_entry(node, struct hrtimer, node);
                 BUG_ON(hrtimer_callback_running(timer));
                 debug_hrtimer_deactivate(timer);
-- --           __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
++ ++ 
++ ++           /*
++ ++            * Should not happen. Per CPU timers should be
++ ++            * canceled _before_ the migration code is called
++ ++            */
++ ++           if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
++ ++                   __remove_hrtimer(timer, old_base,
++ ++                                    HRTIMER_STATE_INACTIVE, 0);
++ ++                   WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
++ ++                        timer, timer->function, dcpu);
++ ++                   continue;
++ ++           }
++ ++ 
++ ++           /*
++ ++            * Mark it as STATE_MIGRATE not INACTIVE otherwise the
++ ++            * timer could be seen as !active and just vanish away
++ ++            * under us on another CPU
++ ++            */
++ ++           __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
                 timer->base = new_base;
                 /*
                  * Enqueue the timer. Allow reprogramming of the event device
                  */
                 enqueue_hrtimer(timer, new_base, 1);
++ ++ 
++ ++ #ifdef CONFIG_HIGH_RES_TIMERS
++ ++           /*
++ ++            * Happens with high res enabled when the timer was
++ ++            * already expired and the callback mode is
++ ++            * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
++ ++            * enqueue code does not move them to the soft irq
++ ++            * pending list for performance/latency reasons, but
++ ++            * in the migration state, we need to do that
++ ++            * otherwise we end up with a stale timer.
++ ++            */
++ ++           if (timer->state == HRTIMER_STATE_MIGRATE) {
++ ++                   timer->state = HRTIMER_STATE_PENDING;
++ ++                   list_add_tail(&timer->cb_entry,
++ ++                                 &new_base->cpu_base->cb_pending);
++ ++                   raise = 1;
++ ++           }
++ ++ #endif
++ ++           /* Clear the migration state bit */
++ ++           timer->state &= ~HRTIMER_STATE_MIGRATE;
+ +      }
++ ++   return raise;
+ +    }
+ +    
++ ++ #ifdef CONFIG_HIGH_RES_TIMERS
++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
++ ++                              struct hrtimer_cpu_base *new_base)
++ ++ {
++ ++   struct hrtimer *timer;
++ ++   int raise = 0;
++ ++ 
++ ++   while (!list_empty(&old_base->cb_pending)) {
++ ++           timer = list_entry(old_base->cb_pending.next,
++ ++                              struct hrtimer, cb_entry);
++ ++ 
++ ++           __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
++ ++           timer->base = &new_base->clock_base[timer->base->index];
++ ++           list_add_tail(&timer->cb_entry, &new_base->cb_pending);
++ ++           raise = 1;
+  ++   }
++ ++   return raise;
++ ++ }
++ ++ #else
++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
++ ++                              struct hrtimer_cpu_base *new_base)
++ ++ {
++ ++   return 0;
+  ++ }
++ ++ #endif
+  ++ 
       static void migrate_hrtimers(int cpu)
       {
         struct hrtimer_cpu_base *old_base, *new_base;
-- --   int i;
++ ++   int i, raise = 0;
       
         BUG_ON(cpu_online(cpu));
         old_base = &per_cpu(hrtimer_bases, cpu);
         new_base = &get_cpu_var(hrtimer_bases);
       
         tick_cancel_sched_timer(cpu);
- ----
- ----  local_irq_disable();
- ----  spin_lock(&new_base->lock);
+ ++++  /*
+ ++++   * The caller is globally serialized and nobody else
+ ++++   * takes two locks at once, so deadlock is not possible.
+ ++++   */
+ ++++  spin_lock_irq(&new_base->lock);
         spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
       
         for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-- --           migrate_hrtimer_list(&old_base->clock_base[i],
-- --                                &new_base->clock_base[i]);
++ ++           if (migrate_hrtimer_list(&old_base->clock_base[i],
++ ++                                    &new_base->clock_base[i], cpu))
++ ++                   raise = 1;
         }
       
++ ++   if (migrate_hrtimer_pending(old_base, new_base))
++ ++           raise = 1;
++ ++ 
         spin_unlock(&old_base->lock);
- ----  spin_unlock(&new_base->lock);
- ----  local_irq_enable();
+ ++++  spin_unlock_irq(&new_base->lock);
         put_cpu_var(hrtimer_bases);
++ ++ 
++ ++   if (raise)
++ ++           hrtimer_raise_softirq();
       }
       #endif /* CONFIG_HOTPLUG_CPU */
       
diff --combined kernel/posix-timers.c

index d3c66b53dff67fd93bcd562f95c7e515ea99816b,e36d5798cbff427fca02fd8c9a8fb6f615dbd3fe,5131e5471169226ef8db42f20792c8ffdac6d12b,e36d5798cbff427fca02fd8c9a8fb6f615dbd3fe,95451bf7d2ebeb1331694efdfa5e13c84687fd13,5131e5471169226ef8db42f20792c8ffdac6d12b..b931d7cedbfa9fd70a47d07535d3b791661ec39a
--- 1/kernel/posix-timers.c
--- 2/kernel/posix-timers.c
--- 3/kernel/posix-timers.c
--- 4/kernel/posix-timers.c
--- 5/kernel/posix-timers.c
--- 6/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@@@@@@ -222,15 -222,6 -222,6 -222,6 -222,6 -222,6 +222,15 @@@@@@@ static int posix_ktime_get_ts(clockid_
         return 0;
       }
       
+ +++++/*
+ +++++ * Get raw monotonic time for posix timers
+ +++++ */
+ +++++static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+ +++++{
+ +++++  getrawmonotonic(tp);
+ +++++  return 0;
+ +++++}
+ +++++
       /*
        * Initialize everything, well, just everything in Posix clocks/timers ;)
        */
@@@@@@@ -244,15 -235,9 -235,9 -235,9 -235,9 -235,9 +244,15 @@@@@@@ static __init int init_posix_timers(voi
                 .clock_get = posix_ktime_get_ts,
                 .clock_set = do_posix_clock_nosettime,
         };
+ +++++  struct k_clock clock_monotonic_raw = {
+ +++++          .clock_getres = hrtimer_get_res,
+ +++++          .clock_get = posix_get_monotonic_raw,
+ +++++          .clock_set = do_posix_clock_nosettime,
+ +++++  };
       
         register_posix_clock(CLOCK_REALTIME, &clock_realtime);
         register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ +++++  register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
       
         posix_timers_cache = kmem_cache_create("posix_timers_cache",
                                         sizeof (struct k_itimer), 0, SLAB_PANIC,
@@@@@@@ -313,6 -298,6 -298,6 -298,6 -298,7 -298,6 +313,7 @@@@@@@ void do_schedule_next_timer(struct sigi
       
       int posix_timer_event(struct k_itimer *timr, int si_private)
       {
++++ +  int shared, ret;
         /*
          * FIXME: if ->sigq is queued we can race with
          * dequeue_signal()->do_schedule_next_timer().
@@@@@@@ -326,25 -311,25 -311,25 -311,25 -312,10 -311,25 +327,10 @@@@@@@
          */
         timr->sigq->info.si_sys_private = si_private;
       
---- -  timr->sigq->info.si_signo = timr->it_sigev_signo;
---- -  timr->sigq->info.si_code = SI_TIMER;
---- -  timr->sigq->info.si_tid = timr->it_id;
---- -  timr->sigq->info.si_value = timr->it_sigev_value;
---- -
---- -  if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
---- -          struct task_struct *leader;
---- -          int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
---- -
---- -          if (likely(ret >= 0))
---- -                  return ret;
---- -
---- -          timr->it_sigev_notify = SIGEV_SIGNAL;
---- -          leader = timr->it_process->group_leader;
---- -          put_task_struct(timr->it_process);
---- -          timr->it_process = leader;
---- -  }
---- -
---- -  return send_sigqueue(timr->sigq, timr->it_process, 1);
++++ +  shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
++++ +  ret = send_sigqueue(timr->sigq, timr->it_process, shared);
++++ +  /* If we failed to send the signal the timer stops. */
++++ +  return ret > 0;
       }
       EXPORT_SYMBOL_GPL(posix_timer_event);
       
@@@@@@@ -456,7 -441,7 -441,7 -441,7 -427,7 -441,7 +442,7 @@@@@@@ static struct k_itimer * alloc_posix_ti
                 return tmr;
         if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
                 kmem_cache_free(posix_timers_cache, tmr);
-- --           tmr = NULL;
++ ++           return NULL;
         }
         memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
         return tmr;
@@@@@@@ -483,11 -468,11 -468,11 -468,11 -454,9 -468,11 +469,9 @@@@@@@ sys_timer_create(const clockid_t which_
                  struct sigevent __user *timer_event_spec,
                  timer_t __user * created_timer_id)
       {
---- -  int error = 0;
---- -  struct k_itimer *new_timer = NULL;
---- -  int new_timer_id;
---- -  struct task_struct *process = NULL;
---- -  unsigned long flags;
++++ +  struct k_itimer *new_timer;
++++ +  int error, new_timer_id;
++++ +  struct task_struct *process;
         sigevent_t event;
         int it_id_set = IT_ID_NOT_SET;
       
@@@@@@@ -505,12 -490,12 -490,12 -490,12 -474,11 -490,12 +489,11 @@@@@@@
                 goto out;
         }
         spin_lock_irq(&idr_lock);
---- -  error = idr_get_new(&posix_timers_id, (void *) new_timer,
---- -                      &new_timer_id);
++++ +  error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
         spin_unlock_irq(&idr_lock);
---- -  if (error == -EAGAIN)
---- -          goto retry;
---- -  else if (error) {
++++ +  if (error) {
++++ +          if (error == -EAGAIN)
++++ +                  goto retry;
                 /*
                  * Weird looking, but we return EAGAIN if the IDR is
                  * full (proper POSIX return value for this)
@@@@@@@ -541,67 -526,67 -526,67 -526,67 -509,43 -526,67 +524,43 @@@@@@@
                         error = -EFAULT;
                         goto out;
                 }
---- -          new_timer->it_sigev_notify = event.sigev_notify;
---- -          new_timer->it_sigev_signo = event.sigev_signo;
---- -          new_timer->it_sigev_value = event.sigev_value;
---- -
---- -          read_lock(&tasklist_lock);
---- -          if ((process = good_sigevent(&event))) {
---- -                  /*
---- -                   * We may be setting up this process for another
---- -                   * thread.  It may be exiting.  To catch this
---- -                   * case the we check the PF_EXITING flag.  If
---- -                   * the flag is not set, the siglock will catch
---- -                   * him before it is too late (in exit_itimers).
---- -                   *
---- -                   * The exec case is a bit more invloved but easy
---- -                   * to code.  If the process is in our thread
---- -                   * group (and it must be or we would not allow
---- -                   * it here) and is doing an exec, it will cause
---- -                   * us to be killed.  In this case it will wait
---- -                   * for us to die which means we can finish this
---- -                   * linkage with our last gasp. I.e. no code :)
---- -                   */
---- -                  spin_lock_irqsave(&process->sighand->siglock, flags);
---- -                  if (!(process->flags & PF_EXITING)) {
---- -                          new_timer->it_process = process;
---- -                          list_add(&new_timer->list,
---- -                                   &process->signal->posix_timers);
---- -                          if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- -                                  get_task_struct(process);
---- -                          spin_unlock_irqrestore(&process->sighand->siglock, flags);
---- -                  } else {
---- -                          spin_unlock_irqrestore(&process->sighand->siglock, flags);
---- -                          process = NULL;
---- -                  }
---- -          }
---- -          read_unlock(&tasklist_lock);
++++ +          rcu_read_lock();
++++ +          process = good_sigevent(&event);
++++ +          if (process)
++++ +                  get_task_struct(process);
++++ +          rcu_read_unlock();
                 if (!process) {
                         error = -EINVAL;
                         goto out;
                 }
         } else {
---- -          new_timer->it_sigev_notify = SIGEV_SIGNAL;
---- -          new_timer->it_sigev_signo = SIGALRM;
---- -          new_timer->it_sigev_value.sival_int = new_timer->it_id;
++++ +          event.sigev_notify = SIGEV_SIGNAL;
++++ +          event.sigev_signo = SIGALRM;
++++ +          event.sigev_value.sival_int = new_timer->it_id;
                 process = current->group_leader;
---- -          spin_lock_irqsave(&process->sighand->siglock, flags);
---- -          new_timer->it_process = process;
---- -          list_add(&new_timer->list, &process->signal->posix_timers);
---- -          spin_unlock_irqrestore(&process->sighand->siglock, flags);
++++ +          get_task_struct(process);
         }
       
++++ +  new_timer->it_sigev_notify     = event.sigev_notify;
++++ +  new_timer->sigq->info.si_signo = event.sigev_signo;
++++ +  new_timer->sigq->info.si_value = event.sigev_value;
++++ +  new_timer->sigq->info.si_tid   = new_timer->it_id;
++++ +  new_timer->sigq->info.si_code  = SI_TIMER;
++++ +
++++ +  spin_lock_irq(&current->sighand->siglock);
++++ +  new_timer->it_process = process;
++++ +  list_add(&new_timer->list, &current->signal->posix_timers);
++++ +  spin_unlock_irq(&current->sighand->siglock);
++++ +
++++ +  return 0;
         /*
          * In the case of the timer belonging to another task, after
          * the task is unlocked, the timer is owned by the other task
          * and may cease to exist at any time.  Don't use or modify
          * new_timer after the unlock call.
          */
---- -
       out:
---- -  if (error)
---- -          release_posix_timer(new_timer, it_id_set);
---- -
++++ +  release_posix_timer(new_timer, it_id_set);
         return error;
       }
       
@@@@@@@ -612,7 -597,7 -597,7 -597,7 -556,7 -597,7 +571,7 @@@@@@@
        * the find to the timer lock.  To avoid a dead lock, the timer id MUST
        * be release with out holding the timer lock.
        */
---- -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
++++ +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
       {
         struct k_itimer *timr;
         /*
@@@@@@@ -620,23 -605,23 -605,23 -605,23 -564,20 -605,23 +579,20 @@@@@@@
          * flags part over to the timer lock.  Must not let interrupts in
          * while we are moving the lock.
          */
---- -
         spin_lock_irqsave(&idr_lock, *flags);
---- -  timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
++++ +  timr = idr_find(&posix_timers_id, (int)timer_id);
         if (timr) {
                 spin_lock(&timr->it_lock);
---- -
---- -          if ((timr->it_id != timer_id) || !(timr->it_process) ||
---- -                          !same_thread_group(timr->it_process, current)) {
---- -                  spin_unlock(&timr->it_lock);
---- -                  spin_unlock_irqrestore(&idr_lock, *flags);
---- -                  timr = NULL;
---- -          } else
++++ +          if (timr->it_process &&
++++ +              same_thread_group(timr->it_process, current)) {
                         spin_unlock(&idr_lock);
---- -  } else
---- -          spin_unlock_irqrestore(&idr_lock, *flags);
++++ +                  return timr;
++++ +          }
++++ +          spin_unlock(&timr->it_lock);
++++ +  }
++++ +  spin_unlock_irqrestore(&idr_lock, *flags);
       
---- -  return timr;
++++ +  return NULL;
       }
       
       /*
@@@@@@@ -877,8 -862,8 -862,8 -862,8 -818,7 -862,8 +833,7 @@@@@@@ retry_delete
          * This keeps any tasks waiting on the spin lock from thinking
          * they got something (see the lock code above).
          */
---- -  if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- -          put_task_struct(timer->it_process);
++++ +  put_task_struct(timer->it_process);
         timer->it_process = NULL;
       
         unlock_timer(timer, flags);
@@@@@@@ -905,8 -890,8 -890,8 -890,8 -845,7 -890,8 +860,7 @@@@@@@ retry_delete
          * This keeps any tasks waiting on the spin lock from thinking
          * they got something (see the lock code above).
          */
---- -  if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
---- -          put_task_struct(timer->it_process);
++++ +  put_task_struct(timer->it_process);
         timer->it_process = NULL;
       
         unlock_timer(timer, flags);
diff --combined kernel/sched.c

index 9a1ddb84e26d56e7d6b283daab1edbad20ae2042,9a1ddb84e26d56e7d6b283daab1edbad20ae2042,6f230596bd0c1d21a2c68ffbff8207e93dcd65b5,1a5f73c1fcdcd12983c6a2eed9ef66e630dca837,ebb03def564bfe93c17953a7e68fbed55de7abd1,6f230596bd0c1d21a2c68ffbff8207e93dcd65b5..09a8c15748f1e5fcd2cae7d98b47ecfd7e73ba40
--- 1/kernel/sched.c
--- 2/kernel/sched.c
--- 3/kernel/sched.c
--- 4/kernel/sched.c
--- 5/kernel/sched.c
--- 6/kernel/sched.c
+++ b/kernel/sched.c
@@@@@@@ -201,14 -201,14 -201,19 -201,14 -201,14 -201,19 +201,19 @@@@@@@ void init_rt_bandwidth(struct rt_bandwi
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-- --   rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++   rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
++ ++ }
++ ++ 
++ ++ static inline int rt_bandwidth_enabled(void)
++ ++ {
++ ++   return sysctl_sched_rt_runtime >= 0;
       }
       
       static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
       {
         ktime_t now;
       
-- --   if (rt_b->rt_runtime == RUNTIME_INF)
++ ++   if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
                 return;
       
         if (hrtimer_active(&rt_b->rt_period_timer))
@@@@@@@ -298,9 -298,9 -303,9 -298,9 -298,9 -303,9 +303,9 @@@@@@@ static DEFINE_PER_CPU(struct cfs_rq, in
       static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
       static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
       #endif /* CONFIG_RT_GROUP_SCHED */
-- -- #else /* !CONFIG_FAIR_GROUP_SCHED */
++ ++ #else /* !CONFIG_USER_SCHED */
       #define root_task_group init_task_group
-- -- #endif /* CONFIG_FAIR_GROUP_SCHED */
++ ++ #endif /* CONFIG_USER_SCHED */
       
       /* task_group_lock serializes add/remove of task groups and also changes to
        * a task group's cpu shares.
@@@@@@@ -604,9 -604,9 -609,9 -604,9 -604,9 -609,9 +609,9 @@@@@@@ struct rq 
       
       static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
       
-- -- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
++ ++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
       {
-- --   rq->curr->sched_class->check_preempt_curr(rq, p);
++ ++   rq->curr->sched_class->check_preempt_curr(rq, p, sync);
       }
       
       static inline int cpu_of(struct rq *rq)
@@@@@@@ -1087,7 -1087,7 -1092,7 -1087,7 -1087,7 -1092,7 +1092,7 @@@@@@@ hotplug_hrtick(struct notifier_block *n
         return NOTIFY_DONE;
       }
       
-- -- static void init_hrtick(void)
++ ++ static __init void init_hrtick(void)
       {
         hotcpu_notifier(hotplug_hrtick, 0);
       }
@@@@@@@ -1102,7 -1102,7 -1107,7 -1102,7 -1102,7 -1107,7 +1107,7 @@@@@@@ static void hrtick_start(struct rq *rq
         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
       }
       
-- -- static void init_hrtick(void)
++ ++ static inline void init_hrtick(void)
       {
       }
       #endif /* CONFIG_SMP */
@@@@@@@ -1119,9 -1119,9 -1124,9 -1119,9 -1119,9 -1124,9 +1124,9 @@@@@@@ static void init_rq_hrtick(struct rq *r
       
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-- --   rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
++ ++   rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
       }
-- -- #else
++ ++ #else     /* CONFIG_SCHED_HRTICK */
       static inline void hrtick_clear(struct rq *rq)
       {
       }
@@@@@@@ -1133,7 -1133,7 -1138,7 -1133,7 -1133,7 -1138,7 +1138,7 @@@@@@@ static inline void init_rq_hrtick(struc
       static inline void init_hrtick(void)
       {
       }
-- -- #endif
++ ++ #endif    /* CONFIG_SCHED_HRTICK */
       
       /*
        * resched_task - mark a task 'to be rescheduled now'.
@@@@@@@ -1380,38 -1380,38 -1385,24 -1380,38 -1380,38 -1385,24 +1385,24 @@@@@@@ static inline void dec_cpu_load(struct 
         update_load_sub(&rq->load, load);
       }
       
-- -- #ifdef CONFIG_SMP
-- -- static unsigned long source_load(int cpu, int type);
-- -- static unsigned long target_load(int cpu, int type);
-- -- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-- -- 
-- -- static unsigned long cpu_avg_load_per_task(int cpu)
-- -- {
-- --   struct rq *rq = cpu_rq(cpu);
-- -- 
-- --   if (rq->nr_running)
-- --           rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-- -- 
-- --   return rq->avg_load_per_task;
-- -- }
-- -- 
-- -- #ifdef CONFIG_FAIR_GROUP_SCHED
-- -- 
-- -- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
++ ++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
++ ++ typedef int (*tg_visitor)(struct task_group *, void *);
       
       /*
        * Iterate the full tree, calling @down when first entering a node and @up when
        * leaving it for the final time.
        */
-- -- static void
-- -- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
++ ++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
       {
         struct task_group *parent, *child;
++ ++   int ret;
       
         rcu_read_lock();
         parent = &root_task_group;
       down:
-- --   (*down)(parent, cpu, sd);
++ ++   ret = (*down)(parent, data);
++ ++   if (ret)
++ ++           goto out_unlock;
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@@@@@@ -1419,15 -1419,15 -1410,43 -1419,15 -1419,15 -1410,43 +1410,43 @@@@@@@
       up:
                 continue;
         }
-- --   (*up)(parent, cpu, sd);
++ ++   ret = (*up)(parent, data);
++ ++   if (ret)
++ ++           goto out_unlock;
       
         child = parent;
         parent = parent->parent;
         if (parent)
                 goto up;
++ ++ out_unlock:
         rcu_read_unlock();
++ ++ 
++ ++   return ret;
++  + }
++  + 
++ ++ static int tg_nop(struct task_group *tg, void *data)
++ ++ {
++ ++   return 0;
+   +  }
++ ++ #endif
++ ++ 
++ ++ #ifdef CONFIG_SMP
++ ++ static unsigned long source_load(int cpu, int type);
++ ++ static unsigned long target_load(int cpu, int type);
++ ++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
++ ++ 
++ ++ static unsigned long cpu_avg_load_per_task(int cpu)
++ ++ {
++ ++   struct rq *rq = cpu_rq(cpu);
++ ++ 
++ ++   if (rq->nr_running)
++ ++           rq->avg_load_per_task = rq->load.weight / rq->nr_running;
++ ++ 
++ ++   return rq->avg_load_per_task;
++ ++ }
++ ++ 
++ ++ #ifdef CONFIG_FAIR_GROUP_SCHED
+   +  
       static void __set_se_shares(struct sched_entity *se, unsigned long shares);
       
       /*
@@@@@@@ -1486,11 -1486,11 -1505,11 -1486,11 -1486,11 -1505,11 +1505,11 @@@@@@@ __update_group_shares_cpu(struct task_g
        * This needs to be done in a bottom-up fashion because the rq weight of a
        * parent group depends on the shares of its child groups.
        */
-- -- static void
-- -- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
++ ++ static int tg_shares_up(struct task_group *tg, void *data)
       {
         unsigned long rq_weight = 0;
         unsigned long shares = 0;
++ ++   struct sched_domain *sd = data;
         int i;
       
         for_each_cpu_mask(i, sd->span) {
@@@@@@@ -1515,6 -1515,6 -1534,8 -1515,6 -1515,6 -1534,8 +1534,8 @@@@@@@
                 __update_group_shares_cpu(tg, i, shares, rq_weight);
                 spin_unlock_irqrestore(&rq->lock, flags);
         }
++ ++ 
++ ++   return 0;
       }
       
       /*
@@@@@@@ -1522,10 -1522,10 -1543,10 -1522,10 -1522,10 -1543,10 +1543,10 @@@@@@@
        * This needs to be done in a top-down fashion because the load of a child
        * group is a fraction of its parents load.
        */
-- -- static void
-- -- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
++ ++ static int tg_load_down(struct task_group *tg, void *data)
       {
         unsigned long load;
++ ++   long cpu = (long)data;
       
         if (!tg->parent) {
                 load = cpu_rq(cpu)->load.weight;
@@@@@@@ -1536,11 -1536,11 -1557,8 -1536,11 -1536,11 -1557,8 +1557,8 @@@@@@@
         }
       
         tg->cfs_rq[cpu]->h_load = load;
-- -- }
       
-- -- static void
-- -- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-- -- {
++ ++   return 0;
       }
       
       static void update_shares(struct sched_domain *sd)
@@@@@@@ -1550,7 -1550,7 -1568,7 -1550,7 -1550,7 -1568,7 +1568,7 @@@@@@@
       
         if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
                 sd->last_update = now;
-- --           walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
++ ++           walk_tg_tree(tg_nop, tg_shares_up, sd);
         }
       }
       
@@@@@@@ -1561,9 -1561,9 -1579,9 -1561,9 -1561,9 -1579,9 +1579,9 @@@@@@@ static void update_shares_locked(struc
         spin_lock(&rq->lock);
       }
       
-- -- static void update_h_load(int cpu)
++ ++ static void update_h_load(long cpu)
       {
-- --   walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
++ ++   walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
       }
       
       #else
@@@@@@@ -1921,11 -1921,11 -1939,8 -1921,11 -1921,11 -1939,8 +1939,8 @@@@@@@ unsigned long wait_task_inactive(struc
                 running = task_running(rq, p);
                 on_rq = p->se.on_rq;
                 ncsw = 0;
-- --           if (!match_state || p->state == match_state) {
-- --                   ncsw = p->nivcsw + p->nvcsw;
-- --                   if (unlikely(!ncsw))
-- --                           ncsw = 1;
-- --           }
++ ++           if (!match_state || p->state == match_state)
++ ++                   ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
                 task_rq_unlock(rq, &flags);
       
                 /*
@@@@@@@ -2285,7 -2285,7 -2300,7 -2285,7 -2285,7 -2300,7 +2300,7 @@@@@@@ out_running
         trace_mark(kernel_sched_wakeup,
                 "pid %d state %ld ## rq %p task %p rq->curr %p",
                 p->pid, p->state, rq, p, rq->curr);
-- --   check_preempt_curr(rq, p);
++ ++   check_preempt_curr(rq, p, sync);
       
         p->state = TASK_RUNNING;
       #ifdef CONFIG_SMP
@@@@@@@ -2420,7 -2420,7 -2435,7 -2420,7 -2420,7 -2435,7 +2435,7 @@@@@@@ void wake_up_new_task(struct task_struc
         trace_mark(kernel_sched_wakeup_new,
                 "pid %d state %ld ## rq %p task %p rq->curr %p",
                 p->pid, p->state, rq, p, rq->curr);
-- --   check_preempt_curr(rq, p);
++ ++   check_preempt_curr(rq, p, 0);
       #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
                 p->sched_class->task_wake_up(rq, p);
@@@@@@@ -2880,7 -2880,7 -2895,7 -2880,7 -2880,7 -2895,7 +2895,7 @@@@@@@ static void pull_task(struct rq *src_rq
          * Note that idle threads have a prio of MAX_PRIO, for this test
          * to be always true for them.
          */
-- --   check_preempt_curr(this_rq, p);
++ ++   check_preempt_curr(this_rq, p, 0);
       }
       
       /*
@@@@@@@ -4037,23 -4037,23 -4052,23 -4037,23 -4037,26 -4052,23 +4052,26 @@@@@@@ DEFINE_PER_CPU(struct kernel_stat, ksta
       EXPORT_PER_CPU_SYMBOL(kstat);
       
       /*
---- - * Return p->sum_exec_runtime plus any more ns on the sched_clock
---- - * that have not yet been banked in case the task is currently running.
++++ + * Return any ns on the sched_clock that have not yet been banked in
++++ + * @p in case that task is currently running.
        */
---- -unsigned long long task_sched_runtime(struct task_struct *p)
++++ +unsigned long long task_delta_exec(struct task_struct *p)
       {
         unsigned long flags;
---- -  u64 ns, delta_exec;
         struct rq *rq;
++++ +  u64 ns = 0;
       
         rq = task_rq_lock(p, &flags);
---- -  ns = p->se.sum_exec_runtime;
++++ +
         if (task_current(rq, p)) {
++++ +          u64 delta_exec;
++++ +
                 update_rq_clock(rq);
                 delta_exec = rq->clock - p->se.exec_start;
                 if ((s64)delta_exec > 0)
---- -                  ns += delta_exec;
++++ +                  ns = delta_exec;
         }
++++ +
         task_rq_unlock(rq, &flags);
       
         return ns;
@@@@@@@ -4070,6 -4070,6 -4085,6 -4070,6 -4073,7 -4085,6 +4088,7 @@@@@@@ void account_user_time(struct task_stru
         cputime64_t tmp;
       
         p->utime = cputime_add(p->utime, cputime);
++++ +  account_group_user_time(p, cputime);
       
         /* Add user time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@@@@@@ -4094,6 -4094,6 -4109,6 -4094,6 -4098,7 -4109,6 +4113,7 @@@@@@@ static void account_guest_time(struct t
         tmp = cputime_to_cputime64(cputime);
       
         p->utime = cputime_add(p->utime, cputime);
++++ +  account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
       
         cpustat->user = cputime64_add(cpustat->user, tmp);
@@@@@@@ -4129,6 -4129,6 -4144,6 -4129,6 -4134,7 -4144,6 +4149,7 @@@@@@@ void account_system_time(struct task_st
         }
       
         p->stime = cputime_add(p->stime, cputime);
++++ +  account_group_system_time(p, cputime);
       
         /* Add system time to cpustat. */
         tmp = cputime_to_cputime64(cputime);
@@@@@@@ -4170,6 -4170,6 -4185,6 -4170,6 -4176,7 -4185,6 +4191,7 @@@@@@@ void account_steal_time(struct task_str
       
         if (p == rq->idle) {
                 p->stime = cputime_add(p->stime, steal);
++++ +          account_group_system_time(p, steal);
                 if (atomic_read(&rq->nr_iowait) > 0)
                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                 else
@@@@@@@ -4178,6 -4178,6 -4193,65 -4178,65 -4185,65 -4193,65 +4200,65 @@@@@@@
                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
       }
       
++    /*
++     * Use precise platform statistics if available:
++     */
++    #ifdef CONFIG_VIRT_CPU_ACCOUNTING
++    cputime_t task_utime(struct task_struct *p)
++    {
++      return p->utime;
++    }
++    
++    cputime_t task_stime(struct task_struct *p)
++    {
++      return p->stime;
++    }
++    #else
++    cputime_t task_utime(struct task_struct *p)
++    {
++      clock_t utime = cputime_to_clock_t(p->utime),
++              total = utime + cputime_to_clock_t(p->stime);
++      u64 temp;
++    
++      /*
++       * Use CFS's precise accounting:
++       */
++      temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
++    
++      if (total) {
++              temp *= utime;
++              do_div(temp, total);
++      }
++      utime = (clock_t)temp;
++    
++      p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
++      return p->prev_utime;
++    }
++    
++    cputime_t task_stime(struct task_struct *p)
++    {
++      clock_t stime;
++    
++      /*
++       * Use CFS's precise accounting. (we subtract utime from
++       * the total, to make sure the total observed by userspace
++       * grows monotonically - apps rely on that):
++       */
++      stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
++                      cputime_to_clock_t(task_utime(p));
++    
++      if (stime >= 0)
++              p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++    
++      return p->prev_stime;
++    }
++    #endif
++    
++    inline cputime_t task_gtime(struct task_struct *p)
++    {
++      return p->gtime;
++    }
++    
       /*
        * This function gets called by the timer code, with HZ frequency.
        * We call it with interrupts disabled.
@@@@@@@ -4568,6 -4568,6 -4642,15 -4627,6 -4634,6 -4642,15 +4649,15 @@@@@@@ __wake_up_sync(wait_queue_head_t *q, un
       }
       EXPORT_SYMBOL_GPL(__wake_up_sync);        /* For internal use only */
       
++ ++ /**
++ ++  * complete: - signals a single thread waiting on this completion
++ ++  * @x:  holds the state of this particular completion
++ ++  *
++ ++  * This will wake up a single thread waiting on this completion. Threads will be
++ ++  * awakened in the same order in which they were queued.
++ ++  *
++ ++  * See also complete_all(), wait_for_completion() and related routines.
++ ++  */
       void complete(struct completion *x)
       {
         unsigned long flags;
@@@@@@@ -4579,6 -4579,6 -4662,12 -4638,6 -4645,6 -4662,12 +4669,12 @@@@@@@
       }
       EXPORT_SYMBOL(complete);
       
++ ++ /**
++ ++  * complete_all: - signals all threads waiting on this completion
++ ++  * @x:  holds the state of this particular completion
++ ++  *
++ ++  * This will wake up all threads waiting on this particular completion event.
++ ++  */
       void complete_all(struct completion *x)
       {
         unsigned long flags;
@@@@@@@ -4599,10 -4599,10 -4688,7 -4658,10 -4665,10 -4688,7 +4695,7 @@@@@@@ do_wait_for_common(struct completion *x
                 wait.flags |= WQ_FLAG_EXCLUSIVE;
                 __add_wait_queue_tail(&x->wait, &wait);
                 do {
-- --                   if ((state == TASK_INTERRUPTIBLE &&
-- --                        signal_pending(current)) ||
-- --                       (state == TASK_KILLABLE &&
-- --                        fatal_signal_pending(current))) {
++ ++                   if (signal_pending_state(state, current)) {
                                 timeout = -ERESTARTSYS;
                                 break;
                         }
@@@@@@@ -4630,12 -4630,12 -4716,31 -4689,12 -4696,12 -4716,31 +4723,31 @@@@@@@ wait_for_common(struct completion *x, l
         return timeout;
       }
       
++ ++ /**
++ ++  * wait_for_completion: - waits for completion of a task
++ ++  * @x:  holds the state of this particular completion
++ ++  *
++ ++  * This waits to be signaled for completion of a specific task. It is NOT
++ ++  * interruptible and there is no timeout.
++ ++  *
++ ++  * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
++ ++  * and interrupt capability. Also see complete().
++ ++  */
       void __sched wait_for_completion(struct completion *x)
       {
         wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
       }
       EXPORT_SYMBOL(wait_for_completion);
       
++ ++ /**
++ ++  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
++ ++  * @x:  holds the state of this particular completion
++ ++  * @timeout:  timeout value in jiffies
++ ++  *
++ ++  * This waits for either a completion of a specific task to be signaled or for a
++ ++  * specified timeout to expire. The timeout is in jiffies. It is not
++ ++  * interruptible.
++ ++  */
       unsigned long __sched
       wait_for_completion_timeout(struct completion *x, unsigned long timeout)
       {
@@@@@@@ -4643,6 -4643,6 -4748,13 -4702,6 -4709,6 -4748,13 +4755,13 @@@@@@@
       }
       EXPORT_SYMBOL(wait_for_completion_timeout);
       
++ ++ /**
++ ++  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
++ ++  * @x:  holds the state of this particular completion
++ ++  *
++ ++  * This waits for completion of a specific task to be signaled. It is
++ ++  * interruptible.
++ ++  */
       int __sched wait_for_completion_interruptible(struct completion *x)
       {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@@@@@@ -4652,6 -4652,6 -4764,14 -4711,6 -4718,6 -4764,14 +4771,14 @@@@@@@
       }
       EXPORT_SYMBOL(wait_for_completion_interruptible);
       
++ ++ /**
++ ++  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
++ ++  * @x:  holds the state of this particular completion
++ ++  * @timeout:  timeout value in jiffies
++ ++  *
++ ++  * This waits for either a completion of a specific task to be signaled or for a
++ ++  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
++ ++  */
       unsigned long __sched
       wait_for_completion_interruptible_timeout(struct completion *x,
                                           unsigned long timeout)
@@@@@@@ -4660,6 -4660,6 -4780,13 -4719,6 -4726,6 -4780,13 +4787,13 @@@@@@@
       }
       EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
       
++ ++ /**
++ ++  * wait_for_completion_killable: - waits for completion of a task (killable)
++ ++  * @x:  holds the state of this particular completion
++ ++  *
++ ++  * This waits to be signaled for completion of a specific task. It can be
++ ++  * interrupted by a kill signal.
++ ++  */
       int __sched wait_for_completion_killable(struct completion *x)
       {
         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@@@@@@ -5062,7 -5062,7 -5189,8 -5121,7 -5128,7 -5189,8 +5196,8 @@@@@@@ recheck
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
                  */
-- --           if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
++ ++           if (rt_bandwidth_enabled() && rt_policy(policy) &&
++ ++                           task_group(p)->rt_bandwidth.rt_runtime == 0)
                         return -EPERM;
       #endif
       
@@@@@@@ -5898,7 -5898,7 -6026,7 -5957,7 -5964,7 -6026,7 +6033,7 @@@@@@@ static int __migrate_task(struct task_s
         set_task_cpu(p, dest_cpu);
         if (on_rq) {
                 activate_task(rq_dest, p, 0);
-- --           check_preempt_curr(rq_dest, p);
++ ++           check_preempt_curr(rq_dest, p, 0);
         }
       done:
         ret = 1;
@@@@@@@ -6223,7 -6223,7 -6351,7 -6282,7 -6289,7 -6351,7 +6358,7 @@@@@@@ set_table_entry(struct ctl_table *entry
       static struct ctl_table *
       sd_alloc_ctl_domain_table(struct sched_domain *sd)
       {
-- --   struct ctl_table *table = sd_alloc_ctl_entry(12);
++ ++   struct ctl_table *table = sd_alloc_ctl_entry(13);
       
         if (table == NULL)
                 return NULL;
@@@@@@@ -6251,7 -6251,7 -6379,9 -6310,7 -6317,7 -6379,9 +6386,9 @@@@@@@
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
-- --   /* &table[11] is terminator */
++ ++   set_table_entry(&table[11], "name", sd->name,
++ ++           CORENAME_MAX_SIZE, 0444, proc_dostring);
++ ++   /* &table[12] is terminator */
       
         return table;
       }
@@@@@@@ -7135,13 -7135,13 -7265,21 -7194,13 -7201,13 -7265,21 +7272,21 @@@@@@@ static void init_sched_groups_power(in
        * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
        */
       
++ ++ #ifdef CONFIG_SCHED_DEBUG
++ ++ # define SD_INIT_NAME(sd, type)           sd->name = #type
++ ++ #else
++ ++ # define SD_INIT_NAME(sd, type)           do { } while (0)
++ ++ #endif
++ ++ 
       #define   SD_INIT(sd, type)       sd_init_##type(sd)
++ ++ 
       #define SD_INIT_FUNC(type)        \
       static noinline void sd_init_##type(struct sched_domain *sd)      \
       {                                                         \
         memset(sd, 0, sizeof(*sd));                             \
         *sd = SD_##type##_INIT;                                 \
         sd->level = SD_LV_##type;                               \
++ ++   SD_INIT_NAME(sd, type);                                 \
       }
       
       SD_INIT_FUNC(CPU)
@@@@@@@ -7637,24 -7637,24 -7775,27 -7696,24 -7703,27 -7775,27 +7782,27 @@@@@@@ static int dattrs_equal(struct sched_do
        * and partition_sched_domains() will fallback to the single partition
        * 'fallback_doms', it also forces the domains to be rebuilt.
        *
++ +   * If doms_new==NULL it will be replaced with cpu_online_map.
++ +   * ndoms_new==0 is a special case for destroying existing domains.
++ +   * It will not create the default domain.
++ +   *
        * Call with hotplug lock held
        */
       void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                              struct sched_domain_attr *dattr_new)
       {
-- -    int i, j;
++ +    int i, j, n;
       
         mutex_lock(&sched_domains_mutex);
       
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
       
-- -    if (doms_new == NULL)
-- -            ndoms_new = 0;
++ +    n = doms_new ? ndoms_new : 0;
       
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
-- -            for (j = 0; j < ndoms_new; j++) {
++ +            for (j = 0; j < n; j++) {
                         if (cpus_equal(doms_cur[i], doms_new[j])
                             && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
@@@@@@@ -7667,7 -7667,7 -7808,6 -7726,7 -7736,6 -7808,6 +7815,6 @@@@@@@ match1
       
         if (doms_new == NULL) {
                 ndoms_cur = 0;
-- -            ndoms_new = 1;
                 doms_new = &fallback_doms;
                 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
                 dattr_new = NULL;
@@@@@@@ -7704,8 -7704,8 -7844,13 -7763,8 -7772,13 -7844,13 +7851,13 @@@@@@@ match2
       int arch_reinit_sched_domains(void)
       {
         get_online_cpus();
++ +  
++ +    /* Destroy domains first to force the rebuild */
++ +    partition_sched_domains(0, NULL, NULL);
++ +  
         rebuild_sched_domains();
         put_online_cpus();
++ +  
         return 0;
       }
       
@@@@@@@ -7789,7 -7789,7 -7934,7 -7848,7 -7862,7 -7934,7 +7941,7 @@@@@@@ static int update_sched_domains(struct 
         case CPU_ONLINE_FROZEN:
         case CPU_DEAD:
         case CPU_DEAD_FROZEN:
-- -            partition_sched_domains(0, NULL, NULL);
++ +            partition_sched_domains(1, NULL, NULL);
                 return NOTIFY_OK;
       
         default:
@@@@@@@ -8176,20 -8176,20 -8321,25 -8235,20 -8249,20 -8321,25 +8328,25 @@@@@@@ void __might_sleep(char *file, int line
       #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
       
-- --   if ((in_atomic() || irqs_disabled()) &&
-- --       system_state == SYSTEM_RUNNING && !oops_in_progress) {
-- --           if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-- --                   return;
-- --           prev_jiffy = jiffies;
-- --           printk(KERN_ERR "BUG: sleeping function called from invalid"
-- --                           " context at %s:%d\n", file, line);
-- --           printk("in_atomic():%d, irqs_disabled():%d\n",
-- --                   in_atomic(), irqs_disabled());
-- --           debug_show_held_locks(current);
-- --           if (irqs_disabled())
-- --                   print_irqtrace_events(current);
-- --           dump_stack();
-- --   }
++ ++   if ((!in_atomic() && !irqs_disabled()) ||
++ ++               system_state != SYSTEM_RUNNING || oops_in_progress)
++ ++           return;
++ ++   if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
++ ++           return;
++ ++   prev_jiffy = jiffies;
++ ++ 
++ ++   printk(KERN_ERR
++ ++           "BUG: sleeping function called from invalid context at %s:%d\n",
++ ++                   file, line);
++ ++   printk(KERN_ERR
++ ++           "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
++ ++                   in_atomic(), irqs_disabled(),
++ ++                   current->pid, current->comm);
++ ++ 
++ ++   debug_show_held_locks(current);
++ ++   if (irqs_disabled())
++ ++           print_irqtrace_events(current);
++ ++   dump_stack();
       #endif
       }
       EXPORT_SYMBOL(__might_sleep);
@@@@@@@ -8687,73 -8687,73 -8837,95 -8746,73 -8760,73 -8837,95 +8844,95 @@@@@@@ static DEFINE_MUTEX(rt_constraints_mute
       static unsigned long to_ratio(u64 period, u64 runtime)
       {
         if (runtime == RUNTIME_INF)
-- --           return 1ULL << 16;
++ ++           return 1ULL << 20;
       
-- --   return div64_u64(runtime << 16, period);
++ ++   return div64_u64(runtime << 20, period);
       }
       
-- -- #ifdef CONFIG_CGROUP_SCHED
-- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
++ ++ /* Must be called with tasklist_lock held */
++ ++ static inline int tg_has_rt_tasks(struct task_group *tg)
       {
-- --   struct task_group *tgi, *parent = tg->parent;
-- --   unsigned long total = 0;
++ ++   struct task_struct *g, *p;
       
-- --   if (!parent) {
-- --           if (global_rt_period() < period)
-- --                   return 0;
++ ++   do_each_thread(g, p) {
++ ++           if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
++ ++                   return 1;
++ ++   } while_each_thread(g, p);
       
-- --           return to_ratio(period, runtime) <
-- --                   to_ratio(global_rt_period(), global_rt_runtime());
-- --   }
++ ++   return 0;
++ ++ }
       
-- --   if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-- --           return 0;
++ ++ struct rt_schedulable_data {
++ ++   struct task_group *tg;
++ ++   u64 rt_period;
++ ++   u64 rt_runtime;
++ ++ };
       
-- --   rcu_read_lock();
-- --   list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-- --           if (tgi == tg)
-- --                   continue;
++ ++ static int tg_schedulable(struct task_group *tg, void *data)
++ ++ {
++ ++   struct rt_schedulable_data *d = data;
++ ++   struct task_group *child;
++ ++   unsigned long total, sum = 0;
++ ++   u64 period, runtime;
++    
-   --           total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-   --                           tgi->rt_bandwidth.rt_runtime);
++ ++   period = ktime_to_ns(tg->rt_bandwidth.rt_period);
++ ++   runtime = tg->rt_bandwidth.rt_runtime;
+   ++ 
--              total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
--                              tgi->rt_bandwidth.rt_runtime);
++ ++   if (tg == d->tg) {
++ ++           period = d->rt_period;
++ ++           runtime = d->rt_runtime;
         }
-- --   rcu_read_unlock();
       
-- --   return total + to_ratio(period, runtime) <=
-- --           to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-- --                           parent->rt_bandwidth.rt_runtime);
-- -- }
-- -- #elif defined CONFIG_USER_SCHED
-- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-- -- {
-- --   struct task_group *tgi;
-- --   unsigned long total = 0;
-- --   unsigned long global_ratio =
-- --           to_ratio(global_rt_period(), global_rt_runtime());
++ ++   /*
++ ++    * Cannot have more runtime than the period.
++ ++    */
++ ++   if (runtime > period && runtime != RUNTIME_INF)
++ ++           return -EINVAL;
       
-- --   rcu_read_lock();
-- --   list_for_each_entry_rcu(tgi, &task_groups, list) {
-- --           if (tgi == tg)
-- --                   continue;
++ ++   /*
++ ++    * Ensure we don't starve existing RT tasks.
++ ++    */
++ ++   if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
++ ++           return -EBUSY;
++ +  
-    -           total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-    -                           tgi->rt_bandwidth.rt_runtime);
++ ++   total = to_ratio(period, runtime);
++ ++ 
++ ++   /*
++ ++    * Nobody can have more than the global setting allows.
++ ++    */
++ ++   if (total > to_ratio(global_rt_period(), global_rt_runtime()))
++ ++           return -EINVAL;
+   ++ 
--              total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
--                              tgi->rt_bandwidth.rt_runtime);
++ ++   /*
++ ++    * The sum of our children's runtime should not exceed our own.
++ ++    */
++ ++   list_for_each_entry_rcu(child, &tg->children, siblings) {
++ ++           period = ktime_to_ns(child->rt_bandwidth.rt_period);
++ ++           runtime = child->rt_bandwidth.rt_runtime;
++ ++ 
++ ++           if (child == d->tg) {
++ ++                   period = d->rt_period;
++ ++                   runtime = d->rt_runtime;
++ ++           }
++  + 
-   -            total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-   -                            tgi->rt_bandwidth.rt_runtime);
++ ++           sum += to_ratio(period, runtime);
         }
-- --   rcu_read_unlock();
       
-- --   return total + to_ratio(period, runtime) < global_ratio;
++ ++   if (sum > total)
++ ++           return -EINVAL;
++ ++ 
++ ++   return 0;
       }
-- -- #endif
       
-- -- /* Must be called with tasklist_lock held */
-- -- static inline int tg_has_rt_tasks(struct task_group *tg)
++ ++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
       {
-- --   struct task_struct *g, *p;
-- --   do_each_thread(g, p) {
-- --           if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-- --                   return 1;
-- --   } while_each_thread(g, p);
-- --   return 0;
++ ++   struct rt_schedulable_data data = {
++ ++           .tg = tg,
++ ++           .rt_period = period,
++ ++           .rt_runtime = runtime,
++ ++   };
++ ++ 
++ ++   return walk_tg_tree(tg_schedulable, tg_nop, &data);
       }
       
       static int tg_set_bandwidth(struct task_group *tg,
@@@@@@@ -8763,14 -8763,14 -8935,9 -8822,14 -8836,14 -8935,9 +8942,9 @@@@@@@
       
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
-- --   if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-- --           err = -EBUSY;
-- --           goto unlock;
-- --   }
-- --   if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-- --           err = -EINVAL;
++ ++   err = __rt_schedulable(tg, rt_period, rt_runtime);
++ ++   if (err)
                 goto unlock;
-- --   }
       
         spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@@@@@@ -8839,16 -8839,16 -9006,25 -8898,16 -8912,16 -9006,25 +9013,25 @@@@@@@ long sched_group_rt_period(struct task_
       
       static int sched_rt_global_constraints(void)
       {
-- --   struct task_group *tg = &root_task_group;
-- --   u64 rt_runtime, rt_period;
++ ++   u64 runtime, period;
         int ret = 0;
       
-- --   rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-- --   rt_runtime = tg->rt_bandwidth.rt_runtime;
++ ++   if (sysctl_sched_rt_period <= 0)
++ ++           return -EINVAL;
++ ++ 
++ ++   runtime = global_rt_runtime();
++ ++   period = global_rt_period();
++ ++ 
++ ++   /*
++ ++    * Sanity check on the sysctl variables.
++ ++    */
++ ++   if (runtime > period && runtime != RUNTIME_INF)
++ ++           return -EINVAL;
       
         mutex_lock(&rt_constraints_mutex);
-- --   if (!__rt_schedulable(tg, rt_period, rt_runtime))
-- --           ret = -EINVAL;
++ ++   read_lock(&tasklist_lock);
++ ++   ret = __rt_schedulable(NULL, 0, 0);
++ ++   read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);
       
         return ret;
@@@@@@@ -8859,6 -8859,6 -9035,9 -8918,6 -8932,6 -9035,9 +9042,9 @@@@@@@ static int sched_rt_global_constraints(
         unsigned long flags;
         int i;
       
++ ++   if (sysctl_sched_rt_period <= 0)
++ ++           return -EINVAL;
++ ++ 
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@@@@@@ -8919,7 -8919,7 -9098,6 -8978,7 -8992,7 -9098,6 +9105,6 @@@@@@@ cpu_cgroup_create(struct cgroup_subsys 
       
         if (!cgrp->parent) {
                 /* This is early initialization for the top cgroup */
-- --           init_task_group.css.cgroup = cgrp;
                 return &init_task_group.css;
         }
       
@@@@@@@ -8928,9 -8928,9 -9106,6 -8987,9 -9001,9 -9106,6 +9113,6 @@@@@@@
         if (IS_ERR(tg))
                 return ERR_PTR(-ENOMEM);
       
-- --   /* Bind the cgroup to task_group object we just created */
-- --   tg->css.cgroup = cgrp;
-- -- 
         return &tg->css;
       }
       
diff --combined kernel/sched_fair.c

index fb8994c6d4bb4bbe90a71f89341baee3cc6e9806,fb8994c6d4bb4bbe90a71f89341baee3cc6e9806,18fd17172eb66bb567ca4bcc47ca6c0cea923462,fb8994c6d4bb4bbe90a71f89341baee3cc6e9806,99aa31acc544888dc616fc090ec61cec4d1af017,18fd17172eb66bb567ca4bcc47ca6c0cea923462..f604dae71316264445e63b4d09f26a483d61113e
--- 1/kernel/sched_fair.c
--- 2/kernel/sched_fair.c
--- 3/kernel/sched_fair.c
--- 4/kernel/sched_fair.c
--- 5/kernel/sched_fair.c
--- 6/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@@@@@ -408,64 -408,64 -408,6 -408,64 -408,64 -408,6 +408,6 @@@@@@@ static u64 sched_vslice_add(struct cfs_
         return __sched_period(nr_running);
       }
       
-- -- /*
-- --  * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
-- --  * that it favours >=0 over <0.
-- --  *
-- --  *   -20         |
-- --  *               |
-- --  *     0 --------+-------
-- --  *             .'
-- --  *    19     .'
-- --  *
-- --  */
-- -- static unsigned long
-- -- calc_delta_asym(unsigned long delta, struct sched_entity *se)
-- -- {
-- --   struct load_weight lw = {
-- --           .weight = NICE_0_LOAD,
-- --           .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-- --   };
-- -- 
-- --   for_each_sched_entity(se) {
-- --           struct load_weight *se_lw = &se->load;
-- --           unsigned long rw = cfs_rq_of(se)->load.weight;
-- -- 
-- -- #ifdef CONFIG_FAIR_SCHED_GROUP
-- --           struct cfs_rq *cfs_rq = se->my_q;
-- --           struct task_group *tg = NULL
-- -- 
-- --           if (cfs_rq)
-- --                   tg = cfs_rq->tg;
-- -- 
-- --           if (tg && tg->shares < NICE_0_LOAD) {
-- --                   /*
-- --                    * scale shares to what it would have been had
-- --                    * tg->weight been NICE_0_LOAD:
-- --                    *
-- --                    *   weight = 1024 * shares / tg->weight
-- --                    */
-- --                   lw.weight *= se->load.weight;
-- --                   lw.weight /= tg->shares;
-- -- 
-- --                   lw.inv_weight = 0;
-- -- 
-- --                   se_lw = &lw;
-- --                   rw += lw.weight - se->load.weight;
-- --           } else
-- -- #endif
-- -- 
-- --           if (se->load.weight < NICE_0_LOAD) {
-- --                   se_lw = &lw;
-- --                   rw += NICE_0_LOAD - se->load.weight;
-- --           }
-- -- 
-- --           delta = calc_delta_mine(delta, rw, se_lw);
-- --   }
-- -- 
-- --   return delta;
-- -- }
-- -- 
       /*
        * Update the current task's runtime statistics. Skip current tasks that
        * are not in our scheduling class.
@@@@@@@ -507,6 -507,6 -449,6 -507,6 -507,7 -449,6 +449,7 @@@@@@@ static void update_curr(struct cfs_rq *
                 struct task_struct *curtask = task_of(curr);
       
                 cpuacct_charge(curtask, delta_exec);
++++ +          account_group_exec_runtime(curtask, delta_exec);
         }
       }
       
@@@@@@@ -586,11 -586,11 -528,12 -586,11 -587,11 -528,12 +529,12 @@@@@@@ account_entity_enqueue(struct cfs_rq *c
         update_load_add(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-- --   if (entity_is_task(se))
++ ++   if (entity_is_task(se)) {
                 add_cfs_task_weight(cfs_rq, se->load.weight);
++ ++           list_add(&se->group_node, &cfs_rq->tasks);
++ ++   }
         cfs_rq->nr_running++;
         se->on_rq = 1;
-- --   list_add(&se->group_node, &cfs_rq->tasks);
       }
       
       static void
@@@@@@@ -599,11 -599,11 -542,12 -599,11 -600,11 -542,12 +543,12 @@@@@@@ account_entity_dequeue(struct cfs_rq *c
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-- --   if (entity_is_task(se))
++ ++   if (entity_is_task(se)) {
                 add_cfs_task_weight(cfs_rq, -se->load.weight);
++ ++           list_del_init(&se->group_node);
++ ++   }
         cfs_rq->nr_running--;
         se->on_rq = 0;
-- --   list_del_init(&se->group_node);
       }
       
       static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@@@@@@ -1085,7 -1085,7 -1029,6 -1085,7 -1086,7 -1029,6 +1030,6 @@@@@@@ static long effective_load(struct task_
                 long wl, long wg)
       {
         struct sched_entity *se = tg->se[cpu];
-- --   long more_w;
       
         if (!tg->parent)
                 return wl;
@@@@@@@ -1097,18 -1097,18 -1040,17 -1097,18 -1098,18 -1040,17 +1041,17 @@@@@@@
         if (!wl && sched_feat(ASYM_EFF_LOAD))
                 return wl;
       
-- --   /*
-- --    * Instead of using this increment, also add the difference
-- --    * between when the shares were last updated and now.
-- --    */
-- --   more_w = se->my_q->load.weight - se->my_q->rq_weight;
-- --   wl += more_w;
-- --   wg += more_w;
-- -- 
         for_each_sched_entity(se) {
-- -- #define D(n) (likely(n) ? (n) : 1)
-- -- 
                 long S, rw, s, a, b;
++ ++           long more_w;
++ ++ 
++ ++           /*
++ ++            * Instead of using this increment, also add the difference
++ ++            * between when the shares were last updated and now.
++ ++            */
++ ++           more_w = se->my_q->load.weight - se->my_q->rq_weight;
++ ++           wl += more_w;
++ ++           wg += more_w;
       
                 S = se->my_q->tg->shares;
                 s = se->my_q->shares;
@@@@@@@ -1117,7 -1117,7 -1059,11 -1117,7 -1118,7 -1059,11 +1060,11 @@@@@@@
                 a = S*(rw + wl);
                 b = S*rw + s*wg;
       
-- --           wl = s*(a-b)/D(b);
++ ++           wl = s*(a-b);
++ ++ 
++ ++           if (likely(b))
++ ++                   wl /= b;
++ ++ 
                 /*
                  * Assume the group is already running and will
                  * thus already be accounted for in the weight.
@@@@@@@ -1126,7 -1126,7 -1072,6 -1126,7 -1127,7 -1072,6 +1073,6 @@@@@@@
                  * alter the group weight.
                  */
                 wg = 0;
-- -- #undef D
         }
       
         return wl;
@@@@@@@ -1143,7 -1143,7 -1088,7 -1143,7 -1144,7 -1088,7 +1089,7 @@@@@@@ static inline unsigned long effective_l
       #endif
       
       static int
-- -- wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
++ ++ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
             struct task_struct *p, int prev_cpu, int this_cpu, int sync,
             int idx, unsigned long load, unsigned long this_load,
             unsigned int imbalance)
@@@@@@@ -1158,6 -1158,6 -1103,11 -1158,6 -1159,6 -1103,11 +1104,11 @@@@@@@
         if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                 return 0;
       
++ ++   if (!sync && sched_feat(SYNC_WAKEUPS) &&
++ ++       curr->se.avg_overlap < sysctl_sched_migration_cost &&
++ ++       p->se.avg_overlap < sysctl_sched_migration_cost)
++ ++           sync = 1;
++ ++ 
         /*
          * If sync wakeup then subtract the (maximum possible)
          * effect of the currently running task from the load
@@@@@@@ -1182,17 -1182,17 -1132,14 -1182,17 -1183,17 -1132,14 +1133,14 @@@@@@@
          * a reasonable amount of time then attract this newly
          * woken task:
          */
-- --   if (sync && balanced) {
-- --           if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-- --               p->se.avg_overlap < sysctl_sched_migration_cost)
-- --                   return 1;
-- --   }
++ ++   if (sync && balanced)
++ ++           return 1;
       
         schedstat_inc(p, se.nr_wakeups_affine_attempts);
         tl_per_task = cpu_avg_load_per_task(this_cpu);
       
-- --   if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-- --                   balanced) {
++ ++   if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
++ ++                   tl_per_task)) {
                 /*
                  * This domain has SD_WAKE_AFFINE and
                  * p is cache cold in this domain, and
@@@@@@@ -1211,16 -1211,16 -1158,17 -1211,16 -1212,16 -1158,17 +1159,17 @@@@@@@ static int select_task_rq_fair(struct t
         struct sched_domain *sd, *this_sd = NULL;
         int prev_cpu, this_cpu, new_cpu;
         unsigned long load, this_load;
-- --   struct rq *rq, *this_rq;
++ ++   struct rq *this_rq;
         unsigned int imbalance;
         int idx;
       
         prev_cpu        = task_cpu(p);
-- --   rq              = task_rq(p);
         this_cpu        = smp_processor_id();
         this_rq         = cpu_rq(this_cpu);
         new_cpu         = prev_cpu;
       
++ ++   if (prev_cpu == this_cpu)
++ ++           goto out;
         /*
          * 'this_sd' is the first domain that both
          * this_cpu and prev_cpu are present in:
@@@@@@@ -1248,13 -1248,13 -1196,10 -1248,13 -1249,13 -1196,10 +1197,10 @@@@@@@
         load = source_load(prev_cpu, idx);
         this_load = target_load(this_cpu, idx);
       
-- --   if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
++ ++   if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
                                      load, this_load, imbalance))
                 return this_cpu;
       
-- --   if (prev_cpu == this_cpu)
-- --           goto out;
-- -- 
         /*
          * Start passive balancing when half the imbalance_pct
          * limit is reached.
@@@@@@@ -1281,62 -1281,62 -1226,20 -1281,62 -1282,62 -1226,20 +1227,20 @@@@@@@ static unsigned long wakeup_gran(struc
          * + nice tasks.
          */
         if (sched_feat(ASYM_GRAN))
-- --           gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-- --   else
-- --           gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
++ ++           gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
       
         return gran;
       }
       
-- -- /*
-- --  * Should 'se' preempt 'curr'.
-- --  *
-- --  *             |s1
-- --  *        |s2
-- --  *   |s3
-- --  *         g
-- --  *      |<--->|c
-- --  *
-- --  *  w(c, s1) = -1
-- --  *  w(c, s2) =  0
-- --  *  w(c, s3) =  1
-- --  *
-- --  */
-- -- static int
-- -- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-- -- {
-- --   s64 gran, vdiff = curr->vruntime - se->vruntime;
-- -- 
-- --   if (vdiff < 0)
-- --           return -1;
-- -- 
-- --   gran = wakeup_gran(curr);
-- --   if (vdiff > gran)
-- --           return 1;
-- -- 
-- --   return 0;
-- -- }
-- -- 
-- -- /* return depth at which a sched entity is present in the hierarchy */
-- -- static inline int depth_se(struct sched_entity *se)
-- -- {
-- --   int depth = 0;
-- -- 
-- --   for_each_sched_entity(se)
-- --           depth++;
-- -- 
-- --   return depth;
-- -- }
-- -- 
       /*
        * Preempt the current task with a newly woken task if needed:
        */
-- -- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
++ ++ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
       {
         struct task_struct *curr = rq->curr;
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
         struct sched_entity *se = &curr->se, *pse = &p->se;
-- --   int se_depth, pse_depth;
++ ++   s64 delta_exec;
       
         if (unlikely(rt_prio(p->prio))) {
                 update_rq_clock(rq);
@@@@@@@ -1350,6 -1350,6 -1253,13 -1350,6 -1351,6 -1253,13 +1254,13 @@@@@@@
       
         cfs_rq_of(pse)->next = pse;
       
++ ++   /*
++ ++    * We can come here with TIF_NEED_RESCHED already set from new task
++ ++    * wake up path.
++ ++    */
++ ++   if (test_tsk_need_resched(curr))
++ ++           return;
++ ++ 
         /*
          * Batch tasks do not preempt (their preemption is driven by
          * the tick):
@@@@@@@ -1360,33 -1360,33 -1270,15 -1360,33 -1361,33 -1270,15 +1271,15 @@@@@@@
         if (!sched_feat(WAKEUP_PREEMPT))
                 return;
       
-- --   /*
-- --    * preemption test can be made between sibling entities who are in the
-- --    * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-- --    * both tasks until we find their ancestors who are siblings of common
-- --    * parent.
-- --    */
-- -- 
-- --   /* First walk up until both entities are at same depth */
-- --   se_depth = depth_se(se);
-- --   pse_depth = depth_se(pse);
-- -- 
-- --   while (se_depth > pse_depth) {
-- --           se_depth--;
-- --           se = parent_entity(se);
-- --   }
-- -- 
-- --   while (pse_depth > se_depth) {
-- --           pse_depth--;
-- --           pse = parent_entity(pse);
-- --   }
-- -- 
-- --   while (!is_same_group(se, pse)) {
-- --           se = parent_entity(se);
-- --           pse = parent_entity(pse);
++ ++   if (sched_feat(WAKEUP_OVERLAP) && (sync ||
++ ++                   (se->avg_overlap < sysctl_sched_migration_cost &&
++ ++                    pse->avg_overlap < sysctl_sched_migration_cost))) {
++ ++           resched_task(curr);
++ ++           return;
         }
       
-- --   if (wakeup_preempt_entity(se, pse) == 1)
++ ++   delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
++ ++   if (delta_exec > wakeup_gran(pse))
                 resched_task(curr);
       }
       
@@@@@@@ -1445,19 -1445,19 -1337,9 -1445,19 -1446,19 -1337,9 +1338,9 @@@@@@@ __load_balance_iterator(struct cfs_rq *
         if (next == &cfs_rq->tasks)
                 return NULL;
       
-- --   /* Skip over entities that are not tasks */
-- --   do {
-- --           se = list_entry(next, struct sched_entity, group_node);
-- --           next = next->next;
-- --   } while (next != &cfs_rq->tasks && !entity_is_task(se));
-- -- 
-- --   if (next == &cfs_rq->tasks)
-- --           return NULL;
-- -- 
-- --   cfs_rq->balance_iterator = next;
-- -- 
-- --   if (entity_is_task(se))
-- --           p = task_of(se);
++ ++   se = list_entry(next, struct sched_entity, group_node);
++ ++   p = task_of(se);
++ ++   cfs_rq->balance_iterator = next->next;
       
         return p;
       }
@@@@@@@ -1507,7 -1507,7 -1389,7 -1507,7 -1508,7 -1389,7 +1390,7 @@@@@@@ load_balance_fair(struct rq *this_rq, i
         rcu_read_lock();
         update_h_load(busiest_cpu);
       
-- --   list_for_each_entry(tg, &task_groups, list) {
++ ++   list_for_each_entry_rcu(tg, &task_groups, list) {
                 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
                 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
                 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@@@@@@ -1620,10 -1620,10 -1502,10 -1620,10 -1621,10 -1502,10 +1503,10 @@@@@@@ static void task_new_fair(struct rq *rq
                  * 'current' within the tree based on its new key value.
                  */
                 swap(curr->vruntime, se->vruntime);
++ ++           resched_task(rq->curr);
         }
       
         enqueue_task_fair(rq, p, 0);
-- --   resched_task(rq->curr);
       }
       
       /*
@@@@@@@ -1642,7 -1642,7 -1524,7 -1642,7 -1643,7 -1524,7 +1525,7 @@@@@@@ static void prio_changed_fair(struct r
                 if (p->prio > oldprio)
                         resched_task(rq->curr);
         } else
-- --           check_preempt_curr(rq, p);
++ ++           check_preempt_curr(rq, p, 0);
       }
       
       /*
@@@@@@@ -1659,7 -1659,7 -1541,7 -1659,7 -1660,7 -1541,7 +1542,7 @@@@@@@ static void switched_to_fair(struct rq 
         if (running)
                 resched_task(rq->curr);
         else
-- --           check_preempt_curr(rq, p);
++ ++           check_preempt_curr(rq, p, 0);
       }
       
       /* Account for a task changing its policy or group.
diff --combined kernel/sched_rt.c

index 998ba54b4543d876a6fa82f4a41debb981923319,998ba54b4543d876a6fa82f4a41debb981923319,cdf5740ab03e8133c0a2b7713d6c77d2be1f07bf,552310798dadf13e3b2059d5f84e050f7e0926e2,8375e69af36a75439cfab472a355c4fb726b12a6,cdf5740ab03e8133c0a2b7713d6c77d2be1f07bf..b446dc87494fd681fd0a7d7e265402ac09936773
--- 1/kernel/sched_rt.c
--- 2/kernel/sched_rt.c
--- 3/kernel/sched_rt.c
--- 4/kernel/sched_rt.c
--- 5/kernel/sched_rt.c
--- 6/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@@@@@@ -102,12 -102,12 -102,12 -102,12 -102,12 -102,12 +102,12 @@@@@@@ static void dequeue_rt_entity(struct sc
       
       static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
       {
++ ++   struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
         struct sched_rt_entity *rt_se = rt_rq->rt_se;
       
-- --   if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
-- --           struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-- -- 
-- --           enqueue_rt_entity(rt_se);
++ ++   if (rt_rq->rt_nr_running) {
++ ++           if (rt_se && !on_rt_rq(rt_se))
++ ++                   enqueue_rt_entity(rt_se);
                 if (rt_rq->highest_prio < curr->prio)
                         resched_task(curr);
         }
@@@@@@@ -199,6 -199,6 -199,8 -199,8 -199,8 -199,8 +199,8 @@@@@@@ static inline struct rt_rq *group_rt_rq
       
       static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
       {
++      if (rt_rq->rt_nr_running)
++              resched_task(rq_of_rt_rq(rt_rq)->curr);
       }
       
       static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@@@@@@ -229,6 -229,6 -231,9 -231,6 -231,6 -231,9 +231,9 @@@@@@@ static inline struct rt_bandwidth *sche
       #endif /* CONFIG_RT_GROUP_SCHED */
       
       #ifdef CONFIG_SMP
++ ++ /*
++ ++  * We ran out of runtime, see if we can borrow some from our neighbours.
++ ++  */
       static int do_balance_runtime(struct rt_rq *rt_rq)
       {
         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@@@@@@ -248,9 -248,9 -253,18 -250,9 -250,9 -253,18 +253,18 @@@@@@@
                         continue;
       
                 spin_lock(&iter->rt_runtime_lock);
++ ++           /*
++ ++            * Either all rqs have inf runtime and there's nothing to steal
++ ++            * or __disable_runtime() below sets a specific rq to inf to
++ ++            * indicate it's been disabled and disallow stealing.
++ ++            */
                 if (iter->rt_runtime == RUNTIME_INF)
                         goto next;
       
++ ++           /*
++ ++            * From runqueues with spare time, take 1/n part of their
++ ++            * spare time, but no more than our period.
++ ++            */
                 diff = iter->rt_runtime - iter->rt_time;
                 if (diff > 0) {
                         diff = div_u64((u64)diff, weight);
@@@@@@@ -272,6 -272,6 -286,9 -274,6 -274,6 -286,9 +286,9 @@@@@@@ next
         return more;
       }
       
++ ++ /*
++ ++  * Ensure this RQ takes back all the runtime it lent to its neighbours.
++ ++  */
       static void __disable_runtime(struct rq *rq)
       {
         struct root_domain *rd = rq->rd;
@@@@@@@ -287,17 -287,17 -304,33 -289,17 -289,17 -304,33 +304,33 @@@@@@@
       
                 spin_lock(&rt_b->rt_runtime_lock);
                 spin_lock(&rt_rq->rt_runtime_lock);
++ ++           /*
++ ++            * Either we're all inf and nobody needs to borrow, or we're
++ ++            * already disabled and thus have nothing to do, or we have
++ ++            * exactly the right amount of runtime to take out.
++ ++            */
                 if (rt_rq->rt_runtime == RUNTIME_INF ||
                                 rt_rq->rt_runtime == rt_b->rt_runtime)
                         goto balanced;
                 spin_unlock(&rt_rq->rt_runtime_lock);
       
++ ++           /*
++ ++            * Calculate the difference between what we started out with
++ ++            * and what we currently have; that's the amount of runtime
++ ++            * we lent and now have to reclaim.
++ ++            */
                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
       
++ ++           /*
++ ++            * Greedy reclaim, take back as much as we can.
++ ++            */
                 for_each_cpu_mask(i, rd->span) {
                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                         s64 diff;
       
++ ++                   /*
++ ++                    * Can't reclaim from ourselves or disabled runqueues.
++ ++                    */
                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
                                 continue;
       
@@@@@@@ -317,8 -317,8 -350,16 -319,8 -319,8 -350,16 +350,16 @@@@@@@
                 }
       
                 spin_lock(&rt_rq->rt_runtime_lock);
++ ++           /*
++ ++            * We cannot be left wanting - that would mean some runtime
++ ++            * leaked out of the system.
++ ++            */
                 BUG_ON(want);
       balanced:
++ ++           /*
++ ++            * Disable all the borrow logic by pretending we have inf
++ ++            * runtime - in which case borrowing doesn't make sense.
++ ++            */
                 rt_rq->rt_runtime = RUNTIME_INF;
                 spin_unlock(&rt_rq->rt_runtime_lock);
                 spin_unlock(&rt_b->rt_runtime_lock);
@@@@@@@ -341,6 -341,6 -382,9 -343,6 -343,6 -382,9 +382,9 @@@@@@@ static void __enable_runtime(struct rq 
         if (unlikely(!scheduler_running))
                 return;
       
++ ++   /*
++ ++    * Reset each runqueue's bandwidth settings
++ ++    */
         for_each_leaf_rt_rq(rt_rq, rq) {
                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
       
@@@@@@@ -348,6 -348,6 -392,7 -350,6 -350,6 -392,7 +392,7 @@@@@@@
                 spin_lock(&rt_rq->rt_runtime_lock);
                 rt_rq->rt_runtime = rt_b->rt_runtime;
                 rt_rq->rt_time = 0;
++ ++           rt_rq->rt_throttled = 0;
                 spin_unlock(&rt_rq->rt_runtime_lock);
                 spin_unlock(&rt_b->rt_runtime_lock);
         }
@@@@@@@ -386,7 -386,7 -431,7 -388,7 -388,7 -431,7 +431,7 @@@@@@@ static int do_sched_rt_period_timer(str
         int i, idle = 1;
         cpumask_t span;
       
-- --   if (rt_b->rt_runtime == RUNTIME_INF)
++ ++   if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                 return 1;
       
         span = sched_rt_period_mask();
@@@@@@@ -438,9 -438,9 -483,6 -440,6 -440,6 -483,6 +483,6 @@@@@@@ static int sched_rt_runtime_exceeded(st
       {
         u64 runtime = sched_rt_runtime(rt_rq);
       
--      if (runtime == RUNTIME_INF)
--              return 0;
--    
         if (rt_rq->rt_throttled)
                 return rt_rq_throttled(rt_rq);
       
@@@@@@@ -484,16 -484,16 -526,21 -483,18 -483,20 -526,21 +526,23 @@@@@@@ static void update_curr_rt(struct rq *r
         schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
       
         curr->se.sum_exec_runtime += delta_exec;
++++ +  account_group_exec_runtime(curr, delta_exec);
++++ +
         curr->se.exec_start = rq->clock;
         cpuacct_charge(curr, delta_exec);
       
++ ++   if (!rt_bandwidth_enabled())
++ ++           return;
++ ++ 
         for_each_sched_rt_entity(rt_se) {
                 rt_rq = rt_rq_of_se(rt_se);
       
                 spin_lock(&rt_rq->rt_runtime_lock);
--              rt_rq->rt_time += delta_exec;
--              if (sched_rt_runtime_exceeded(rt_rq))
--                      resched_task(curr);
++              if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
++                      rt_rq->rt_time += delta_exec;
++                      if (sched_rt_runtime_exceeded(rt_rq))
++                              resched_task(curr);
++              }
                 spin_unlock(&rt_rq->rt_runtime_lock);
         }
       }
@@@@@@@ -782,7 -782,7 -829,7 -783,7 -785,7 -829,7 +831,7 @@@@@@@ static void check_preempt_equal_prio(st
       /*
        * Preempt the current task with a newly woken task if needed:
        */
-- -- static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
++ ++ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
       {
         if (p->prio < rq->curr->prio) {
                 resched_task(rq->curr);
@@@@@@@ -1411,7 -1411,7 -1458,7 -1412,7 -1414,7 -1458,7 +1460,7 @@@@@@@ static void watchdog(struct rq *rq, str
                 p->rt.timeout++;
                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
                 if (p->rt.timeout > next)
---- -                  p->it_sched_expires = p->se.sum_exec_runtime;
++++ +                  p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
         }
       }
       
diff --combined kernel/softirq.c

index c506f266a6b90ee9d887b9eeecdd45061e74ef1f,c506f266a6b90ee9d887b9eeecdd45061e74ef1f,d410014279e776821fea52a0d4e971c6aae8869b,c506f266a6b90ee9d887b9eeecdd45061e74ef1f,c506f266a6b90ee9d887b9eeecdd45061e74ef1f,83ba21a13bd470cea2815d6792e8ff24af43e727..7110daeb9a90b2b585d97cf0fda94b7eb0b1869f
--- 1/kernel/softirq.c
--- 2/kernel/softirq.c
--- 3/kernel/softirq.c
--- 4/kernel/softirq.c
--- 5/kernel/softirq.c
--- 6/kernel/softirq.c
+++ b/kernel/softirq.c
@@@@@@@ -6,6 -6,6 -6,6 -6,6 -6,6 -6,8 +6,8 @@@@@@@
        *        Distribute under GPLv2.
        *
        *        Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+++++  *
+++++  *        Remote softirq infrastructure is by Jens Axboe.
        */
       
       #include <linux/module.h>
@@@@@@@ -46,7 -46,7 -46,7 -46,7 -46,7 -48,7 +48,7 @@@@@@@ irq_cpustat_t irq_stat[NR_CPUS] ____cac
       EXPORT_SYMBOL(irq_stat);
       #endif
       
-- -- static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
++ ++ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
       
       static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
       
@@@@@@@ -205,7 -205,7 -205,18 -205,7 -205,7 -207,18 +207,18 @@@@@@@ restart
       
         do {
                 if (pending & 1) {
++ ++                   int prev_count = preempt_count();
++ ++ 
                         h->action(h);
++ ++ 
++ ++                   if (unlikely(prev_count != preempt_count())) {
++ ++                           printk(KERN_ERR "huh, entered softirq %td %p"
++ ++                                  "with preempt_count %08x,"
++ ++                                  " exited with %08x?\n", h - softirq_vec,
++ ++                                  h->action, prev_count, preempt_count());
++ ++                           preempt_count() = prev_count;
++ ++                   }
++ ++ 
                         rcu_bh_qsctr_inc(cpu);
                 }
                 h++;
@@@@@@@ -254,16 -254,16 -265,12 -254,16 -254,16 -267,16 +267,12 @@@@@@@ asmlinkage void do_softirq(void
        */
       void irq_enter(void)
       {
-- ---#ifdef CONFIG_NO_HZ
         int cpu = smp_processor_id();
++ +++
         if (idle_cpu(cpu) && !in_interrupt())
-- ---          tick_nohz_stop_idle(cpu);
-- ---#endif
++ +++          tick_check_idle(cpu);
++ +++
         __irq_enter();
-- ---#ifdef CONFIG_NO_HZ
-- ---  if (idle_cpu(cpu))
-- ---          tick_nohz_update_jiffies();
-- ---#endif
       }
       
       #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@@@@@@ -463,17 -463,17 -470,17 -463,17 -463,17 -476,144 +472,144 @@@@@@@ void tasklet_kill(struct tasklet_struc
       
       EXPORT_SYMBOL(tasklet_kill);
       
+++++ DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
+++++ EXPORT_PER_CPU_SYMBOL(softirq_work_list);
+++++ 
+++++ static void __local_trigger(struct call_single_data *cp, int softirq)
+++++ {
+++++   struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
+++++ 
+++++   list_add_tail(&cp->list, head);
+++++ 
+++++   /* Trigger the softirq only if the list was previously empty.  */
+++++   if (head->next == &cp->list)
+++++           raise_softirq_irqoff(softirq);
+++++ }
+++++ 
+++++ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+++++ static void remote_softirq_receive(void *data)
+++++ {
+++++   struct call_single_data *cp = data;
+++++   unsigned long flags;
+++++   int softirq;
+++++ 
+++++   softirq = cp->priv;
+++++ 
+++++   local_irq_save(flags);
+++++   __local_trigger(cp, softirq);
+++++   local_irq_restore(flags);
+++++ }
+++++ 
+++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++   if (cpu_online(cpu)) {
+++++           cp->func = remote_softirq_receive;
+++++           cp->info = cp;
+++++           cp->flags = 0;
+++++           cp->priv = softirq;
+++++ 
+++++           __smp_call_function_single(cpu, cp);
+++++           return 0;
+++++   }
+++++   return 1;
+++++ }
+++++ #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
+++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++   return 1;
+++++ }
+++++ #endif
+++++ 
+++++ /**
+++++  * __send_remote_softirq - try to schedule softirq work on a remote cpu
+++++  * @cp: private SMP call function data area
+++++  * @cpu: the remote cpu
+++++  * @this_cpu: the currently executing cpu
+++++  * @softirq: the softirq for the work
+++++  *
+++++  * Attempt to schedule softirq work on a remote cpu.  If this cannot be
+++++  * done, the work is instead queued up on the local cpu.
+++++  *
+++++  * Interrupts must be disabled.
+++++  */
+++++ void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
+++++ {
+++++   if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
+++++           __local_trigger(cp, softirq);
+++++ }
+++++ EXPORT_SYMBOL(__send_remote_softirq);
+++++ 
+++++ /**
+++++  * send_remote_softirq - try to schedule softirq work on a remote cpu
+++++  * @cp: private SMP call function data area
+++++  * @cpu: the remote cpu
+++++  * @softirq: the softirq for the work
+++++  *
+++++  * Like __send_remote_softirq except that disabling interrupts and
+++++  * computing the current cpu is done for the caller.
+++++  */
+++++ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
+++++ {
+++++   unsigned long flags;
+++++   int this_cpu;
+++++ 
+++++   local_irq_save(flags);
+++++   this_cpu = smp_processor_id();
+++++   __send_remote_softirq(cp, cpu, this_cpu, softirq);
+++++   local_irq_restore(flags);
+++++ }
+++++ EXPORT_SYMBOL(send_remote_softirq);
+++++ 
+++++ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+++++                                          unsigned long action, void *hcpu)
+++++ {
+++++   /*
+++++    * If a CPU goes away, splice its entries to the current CPU
+++++    * and trigger a run of the softirq
+++++    */
+++++   if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+++++           int cpu = (unsigned long) hcpu;
+++++           int i;
+++++ 
+++++           local_irq_disable();
+++++           for (i = 0; i < NR_SOFTIRQS; i++) {
+++++                   struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
+++++                   struct list_head *local_head;
+++++ 
+++++                   if (list_empty(head))
+++++                           continue;
+++++ 
+++++                   local_head = &__get_cpu_var(softirq_work_list[i]);
+++++                   list_splice_init(head, local_head);
+++++                   raise_softirq_irqoff(i);
+++++           }
+++++           local_irq_enable();
+++++   }
+++++ 
+++++   return NOTIFY_OK;
+++++ }
+++++ 
+++++ static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+++++   .notifier_call  = remote_softirq_cpu_notify,
+++++ };
+++++ 
       void __init softirq_init(void)
       {
         int cpu;
       
         for_each_possible_cpu(cpu) {
+++++           int i;
+++++ 
                 per_cpu(tasklet_vec, cpu).tail =
                         &per_cpu(tasklet_vec, cpu).head;
                 per_cpu(tasklet_hi_vec, cpu).tail =
                         &per_cpu(tasklet_hi_vec, cpu).head;
+++++           for (i = 0; i < NR_SOFTIRQS; i++)
+++++                   INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
         }
       
+++++   register_hotcpu_notifier(&remote_softirq_cpu_notifier);
+++++ 
         open_softirq(TASKLET_SOFTIRQ, tasklet_action);
         open_softirq(HI_SOFTIRQ, tasklet_hi_action);
       }
diff --combined kernel/sys.c

index 038a7bc0901d20f90f841c5e4326fc1f2f2b963f,038a7bc0901d20f90f841c5e4326fc1f2f2b963f,0bc8fa3c2288110b49fad4e9eaab2326f52c69f7,038a7bc0901d20f90f841c5e4326fc1f2f2b963f,d046a7a055c2defd0d5d643122ac2ad4f0a31de7,0bc8fa3c2288110b49fad4e9eaab2326f52c69f7..53879cdae483b6371543bdeb94818292e91f07bb
--- 1/kernel/sys.c
--- 2/kernel/sys.c
--- 3/kernel/sys.c
--- 4/kernel/sys.c
--- 5/kernel/sys.c
--- 6/kernel/sys.c
+++ b/kernel/sys.c
@@@@@@@ -853,38 -853,38 -853,38 -853,38 -853,28 -853,38 +853,28 @@@@@@@ asmlinkage long sys_setfsgid(gid_t gid
         return old_fsgid;
       }
       
++++ +void do_sys_times(struct tms *tms)
++++ +{
++++ +  struct task_cputime cputime;
++++ +  cputime_t cutime, cstime;
++++ +
++++ +  spin_lock_irq(&current->sighand->siglock);
++++ +  thread_group_cputime(current, &cputime);
++++ +  cutime = current->signal->cutime;
++++ +  cstime = current->signal->cstime;
++++ +  spin_unlock_irq(&current->sighand->siglock);
++++ +  tms->tms_utime = cputime_to_clock_t(cputime.utime);
++++ +  tms->tms_stime = cputime_to_clock_t(cputime.stime);
++++ +  tms->tms_cutime = cputime_to_clock_t(cutime);
++++ +  tms->tms_cstime = cputime_to_clock_t(cstime);
++++ +}
++++ +
       asmlinkage long sys_times(struct tms __user * tbuf)
       {
---- -  /*
---- -   *      In the SMP world we might just be unlucky and have one of
---- -   *      the times increment as we use it. Since the value is an
---- -   *      atomically safe type this is just fine. Conceptually its
---- -   *      as if the syscall took an instant longer to occur.
---- -   */
         if (tbuf) {
                 struct tms tmp;
---- -          struct task_struct *tsk = current;
---- -          struct task_struct *t;
---- -          cputime_t utime, stime, cutime, cstime;
---- -
---- -          spin_lock_irq(&tsk->sighand->siglock);
---- -          utime = tsk->signal->utime;
---- -          stime = tsk->signal->stime;
---- -          t = tsk;
---- -          do {
---- -                  utime = cputime_add(utime, t->utime);
---- -                  stime = cputime_add(stime, t->stime);
---- -                  t = next_thread(t);
---- -          } while (t != tsk);
---- -
---- -          cutime = tsk->signal->cutime;
---- -          cstime = tsk->signal->cstime;
---- -          spin_unlock_irq(&tsk->sighand->siglock);
---- -
---- -          tmp.tms_utime = cputime_to_clock_t(utime);
---- -          tmp.tms_stime = cputime_to_clock_t(stime);
---- -          tmp.tms_cutime = cputime_to_clock_t(cutime);
---- -          tmp.tms_cstime = cputime_to_clock_t(cstime);
++++ +
++++ +          do_sys_times(&tmp);
                 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
                         return -EFAULT;
         }
@@@@@@@ -1060,9 -1060,9 -1060,7 -1060,9 -1050,9 -1060,7 +1050,7 @@@@@@@ asmlinkage long sys_setsid(void
         group_leader->signal->leader = 1;
         __set_special_pids(sid);
       
-- --   spin_lock(&group_leader->sighand->siglock);
-- --   group_leader->signal->tty = NULL;
-- --   spin_unlock(&group_leader->sighand->siglock);
++ ++   proc_clear_tty(group_leader);
       
         err = session;
       out:
@@@@@@@ -1351,8 -1351,8 -1349,10 -1351,8 -1341,8 -1349,10 +1339,10 @@@@@@@ asmlinkage long sys_sethostname(char __
         down_write(&uts_sem);
         errno = -EFAULT;
         if (!copy_from_user(tmp, name, len)) {
-- --           memcpy(utsname()->nodename, tmp, len);
-- --           utsname()->nodename[len] = 0;
++ ++           struct new_utsname *u = utsname();
++ ++ 
++ ++           memcpy(u->nodename, tmp, len);
++ ++           memset(u->nodename + len, 0, sizeof(u->nodename) - len);
                 errno = 0;
         }
         up_write(&uts_sem);
@@@@@@@ -1364,15 -1364,15 -1364,17 -1364,15 -1354,15 -1364,17 +1354,17 @@@@@@@
       asmlinkage long sys_gethostname(char __user *name, int len)
       {
         int i, errno;
++ ++   struct new_utsname *u;
       
         if (len < 0)
                 return -EINVAL;
         down_read(&uts_sem);
-- --   i = 1 + strlen(utsname()->nodename);
++ ++   u = utsname();
++ ++   i = 1 + strlen(u->nodename);
         if (i > len)
                 i = len;
         errno = 0;
-- --   if (copy_to_user(name, utsname()->nodename, i))
++ ++   if (copy_to_user(name, u->nodename, i))
                 errno = -EFAULT;
         up_read(&uts_sem);
         return errno;
@@@@@@@ -1397,8 -1397,8 -1399,10 -1397,8 -1387,8 -1399,10 +1389,10 @@@@@@@ asmlinkage long sys_setdomainname(char 
         down_write(&uts_sem);
         errno = -EFAULT;
         if (!copy_from_user(tmp, name, len)) {
-- --           memcpy(utsname()->domainname, tmp, len);
-- --           utsname()->domainname[len] = 0;
++ ++           struct new_utsname *u = utsname();
++ ++ 
++ ++           memcpy(u->domainname, tmp, len);
++ ++           memset(u->domainname + len, 0, sizeof(u->domainname) - len);
                 errno = 0;
         }
         up_write(&uts_sem);
@@@@@@@ -1445,21 -1445,21 -1449,29 -1445,21 -1435,20 -1449,29 +1439,28 @@@@@@@ asmlinkage long sys_old_getrlimit(unsig
       asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
       {
         struct rlimit new_rlim, *old_rlim;
---- -  unsigned long it_prof_secs;
         int retval;
       
         if (resource >= RLIM_NLIMITS)
                 return -EINVAL;
         if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
                 return -EFAULT;
-- --   if (new_rlim.rlim_cur > new_rlim.rlim_max)
-- --           return -EINVAL;
         old_rlim = current->signal->rlim + resource;
         if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
             !capable(CAP_SYS_RESOURCE))
                 return -EPERM;
-- --   if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
-- --           return -EPERM;
++ ++ 
++ ++   if (resource == RLIMIT_NOFILE) {
++ ++           if (new_rlim.rlim_max == RLIM_INFINITY)
++ ++                   new_rlim.rlim_max = sysctl_nr_open;
++ ++           if (new_rlim.rlim_cur == RLIM_INFINITY)
++ ++                   new_rlim.rlim_cur = sysctl_nr_open;
++ ++           if (new_rlim.rlim_max > sysctl_nr_open)
++ ++                   return -EPERM;
++ ++   }
++ ++ 
++ ++   if (new_rlim.rlim_cur > new_rlim.rlim_max)
++ ++           return -EINVAL;
       
         retval = security_task_setrlimit(resource, &new_rlim);
         if (retval)
@@@@@@@ -1491,18 -1491,18 -1503,18 -1491,18 -1480,7 -1503,18 +1492,7 @@@@@@@
         if (new_rlim.rlim_cur == RLIM_INFINITY)
                 goto out;
       
---- -  it_prof_secs = cputime_to_secs(current->signal->it_prof_expires);
---- -  if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
---- -          unsigned long rlim_cur = new_rlim.rlim_cur;
---- -          cputime_t cputime;
---- -
---- -          cputime = secs_to_cputime(rlim_cur);
---- -          read_lock(&tasklist_lock);
---- -          spin_lock_irq(&current->sighand->siglock);
---- -          set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
---- -          spin_unlock_irq(&current->sighand->siglock);
---- -          read_unlock(&tasklist_lock);
---- -  }
++++ +  update_rlimit_cpu(new_rlim.rlim_cur);
       out:
         return 0;
       }
@@@@@@@ -1540,11 -1540,11 -1552,11 -1540,11 -1518,8 -1552,11 +1530,8 @@@@@@@
        *
        */
       
---- -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r,
---- -                               cputime_t *utimep, cputime_t *stimep)
++++ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
       {
---- -  *utimep = cputime_add(*utimep, t->utime);
---- -  *stimep = cputime_add(*stimep, t->stime);
         r->ru_nvcsw += t->nvcsw;
         r->ru_nivcsw += t->nivcsw;
         r->ru_minflt += t->min_flt;
@@@@@@@ -1558,12 -1558,12 -1570,12 -1558,12 -1533,13 -1570,12 +1545,13 @@@@@@@ static void k_getrusage(struct task_str
         struct task_struct *t;
         unsigned long flags;
         cputime_t utime, stime;
++++ +  struct task_cputime cputime;
       
         memset((char *) r, 0, sizeof *r);
         utime = stime = cputime_zero;
       
         if (who == RUSAGE_THREAD) {
---- -          accumulate_thread_rusage(p, r, &utime, &stime);
++++ +          accumulate_thread_rusage(p, r);
                 goto out;
         }
       
@@@@@@@ -1586,8 -1586,8 -1598,8 -1586,8 -1562,9 -1598,8 +1574,9 @@@@@@@
                                 break;
       
                 case RUSAGE_SELF:
---- -                  utime = cputime_add(utime, p->signal->utime);
---- -                  stime = cputime_add(stime, p->signal->stime);
++++ +                  thread_group_cputime(p, &cputime);
++++ +                  utime = cputime_add(utime, cputime.utime);
++++ +                  stime = cputime_add(stime, cputime.stime);
                         r->ru_nvcsw += p->signal->nvcsw;
                         r->ru_nivcsw += p->signal->nivcsw;
                         r->ru_minflt += p->signal->min_flt;
@@@@@@@ -1596,7 -1596,7 -1608,7 -1596,7 -1573,7 -1608,7 +1585,7 @@@@@@@
                         r->ru_oublock += p->signal->oublock;
                         t = p;
                         do {
---- -                          accumulate_thread_rusage(t, r, &utime, &stime);
++++ +                          accumulate_thread_rusage(t, r);
                                 t = next_thread(t);
                         } while (t != p);
                         break;
diff --combined kernel/time/ntp.c

index 5125ddd8196ba3cffb2fd683c2fd3abdac0a695b,5125ddd8196ba3cffb2fd683c2fd3abdac0a695b,1ad46f3df6e76cd8994403b1c1ca72c14ec3553b,ddb0465a6baab4d81d79ed7799f463e5f5d9cf55,1ad46f3df6e76cd8994403b1c1ca72c14ec3553b,1ad46f3df6e76cd8994403b1c1ca72c14ec3553b..1a20715bfd6e4854e96e96eb1541f792767aeaa5
--- 1/kernel/time/ntp.c
--- 2/kernel/time/ntp.c
--- 3/kernel/time/ntp.c
--- 4/kernel/time/ntp.c
--- 5/kernel/time/ntp.c
--- 6/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@@@@@@ -10,13 -10,13 -10,13 -10,13 -10,13 -10,13 +10,13 @@@@@@@
       
       #include <linux/mm.h>
       #include <linux/time.h>
--- --#include <linux/timer.h>
       #include <linux/timex.h>
       #include <linux/jiffies.h>
       #include <linux/hrtimer.h>
       #include <linux/capability.h>
       #include <linux/math64.h>
       #include <linux/clocksource.h>
+++ ++#include <linux/workqueue.h>
       #include <asm/timex.h>
       
       /*
@@@@@@@ -218,11 -218,11 -218,11 -218,11 -218,11 -218,11 +218,11 @@@@@@@ void second_overflow(void
       /* Disable the cmos update - used by virtualization and embedded */
       int no_sync_cmos_clock  __read_mostly;
       
--- --static void sync_cmos_clock(unsigned long dummy);
+++ ++static void sync_cmos_clock(struct work_struct *work);
       
--- --static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
+++ ++static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
       
--- --static void sync_cmos_clock(unsigned long dummy)
+++ ++static void sync_cmos_clock(struct work_struct *work)
       {
         struct timespec now, next;
         int fail = 1;
@@@@@@@ -245,7 -245,7 -245,7 -245,7 -245,7 -245,7 +245,7 @@@@@@@
         if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
                 fail = update_persistent_clock(now);
       
-- -    next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
++ +    next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
         if (next.tv_nsec <= 0)
                 next.tv_nsec += NSEC_PER_SEC;
       
@@@@@@@ -258,13 -258,13 -258,13 -258,13 -258,13 -258,13 +258,13 @@@@@@@
                 next.tv_sec++;
                 next.tv_nsec -= NSEC_PER_SEC;
         }
--- --  mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next));
+++ ++  schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
       }
       
       static void notify_cmos_timer(void)
       {
         if (!no_sync_cmos_clock)
--- --          mod_timer(&sync_cmos_timer, jiffies + 1);
+++ ++          schedule_delayed_work(&sync_cmos_work, 0);
       }
       
       #else
@@@@@@@ -277,38 -277,38 -277,38 -277,50 -277,38 -277,38 +277,50 @@@@@@@ static inline void notify_cmos_timer(vo
       int do_adjtimex(struct timex *txc)
       {
         struct timespec ts;
--- --  long save_adjust, sec;
         int result;
       
--- --  /* In order to modify anything, you gotta be super-user! */
--- --  if (txc->modes && !capable(CAP_SYS_TIME))
--- --          return -EPERM;
--- --
--- --  /* Now we validate the data before disabling interrupts */
--- --
--- --  if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
+++ ++  /* Validate the data before disabling interrupts */
+++ ++  if (txc->modes & ADJ_ADJTIME) {
                 /* singleshot must not be used with any other mode bits */
--- --          if (txc->modes & ~ADJ_OFFSET_SS_READ)
+++ ++          if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                         return -EINVAL;
+++ ++          if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+++ ++              !capable(CAP_SYS_TIME))
+++ ++                  return -EPERM;
+++ ++  } else {
+++ ++          /* In order to modify anything, you gotta be super-user! */
+++ ++           if (txc->modes && !capable(CAP_SYS_TIME))
+++ ++                  return -EPERM;
+++ ++
+++ ++          /* if the quartz is off by more than 10% something is VERY wrong! */
+++ ++          if (txc->modes & ADJ_TICK &&
+++ ++              (txc->tick <  900000/USER_HZ ||
+++ ++               txc->tick > 1100000/USER_HZ))
+++ ++                          return -EINVAL;
+++ ++
+++ ++          if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
+++ ++                  hrtimer_cancel(&leap_timer);
         }
       
--- --  /* if the quartz is off by more than 10% something is VERY wrong ! */
--- --  if (txc->modes & ADJ_TICK)
--- --          if (txc->tick <  900000/USER_HZ ||
--- --              txc->tick > 1100000/USER_HZ)
--- --                  return -EINVAL;
--- --
--- --  if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
--- --          hrtimer_cancel(&leap_timer);
         getnstimeofday(&ts);
       
         write_seqlock_irq(&xtime_lock);
       
--- --  /* Save for later - semantics of adjtime is to return old value */
--- --  save_adjust = time_adjust;
--- --
         /* If there are input parameters, then process them */
+++ ++  if (txc->modes & ADJ_ADJTIME) {
+++ ++          long save_adjust = time_adjust;
+++ ++
+++ ++          if (!(txc->modes & ADJ_OFFSET_READONLY)) {
+++ ++                  /* adjtime() is independent from ntp_adjtime() */
+++ ++                  time_adjust = txc->offset;
+++ ++                  ntp_update_frequency();
+++ ++          }
+++ ++          txc->offset = save_adjust;
+++ ++          goto adj_done;
+++ ++  }
         if (txc->modes) {
+++ ++          long sec;
+++ ++
                 if (txc->modes & ADJ_STATUS) {
                         if ((time_status & STA_PLL) &&
                             !(txc->status & STA_PLL)) {
@@@@@@@ -375,13 -375,13 -375,13 -387,8 -375,13 -375,13 +387,8 @@@@@@@
                 if (txc->modes & ADJ_TAI && txc->constant > 0)
                         time_tai = txc->constant;
       
--- --          if (txc->modes & ADJ_OFFSET) {
--- --                  if (txc->modes == ADJ_OFFSET_SINGLESHOT)
--- --                          /* adjtime() is independent from ntp_adjtime() */
--- --                          time_adjust = txc->offset;
--- --                  else
--- --                          ntp_update_offset(txc->offset);
--- --          }
+++ ++          if (txc->modes & ADJ_OFFSET)
+++ ++                  ntp_update_offset(txc->offset);
                 if (txc->modes & ADJ_TICK)
                         tick_usec = txc->tick;
       
@@@@@@@ -389,22 -389,22 -389,22 -396,18 -389,22 -389,22 +396,18 @@@@@@@
                         ntp_update_frequency();
         }
       
+++ ++  txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+++ ++                            NTP_SCALE_SHIFT);
+++ ++  if (!(time_status & STA_NANO))
+++ ++          txc->offset /= NSEC_PER_USEC;
+++ ++
+++ ++adj_done:
         result = time_state;    /* mostly `TIME_OK' */
         if (time_status & (STA_UNSYNC|STA_CLOCKERR))
                 result = TIME_ERROR;
       
--- --  if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
--- --      (txc->modes == ADJ_OFFSET_SS_READ))
--- --          txc->offset = save_adjust;
--- --  else {
--- --          txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
--- --                                    NTP_SCALE_SHIFT);
--- --          if (!(time_status & STA_NANO))
--- --                  txc->offset /= NSEC_PER_USEC;
--- --  }
--- --  txc->freq          = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
--- --                                   (s64)PPM_SCALE_INV,
--- --                                   NTP_SCALE_SHIFT);
+++ ++  txc->freq          = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
+++ ++                                   (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
         txc->maxerror      = time_maxerror;
         txc->esterror      = time_esterror;
         txc->status        = time_status;
diff --combined kernel/time/timekeeping.c

index 5099c95b8aa2677def4d992c6082b46f521d055b,e91c29f961c900d7739c0dc2f27b81c480cdb55c,e91c29f961c900d7739c0dc2f27b81c480cdb55c,5ecbfc39a2684eca0f23988eb1064dff987f5098,e91c29f961c900d7739c0dc2f27b81c480cdb55c,e91c29f961c900d7739c0dc2f27b81c480cdb55c..e7acfb482a680ea248f5268fbb7f158868938e56
--- 1/kernel/time/timekeeping.c
--- 2/kernel/time/timekeeping.c
--- 3/kernel/time/timekeeping.c
--- 4/kernel/time/timekeeping.c
--- 5/kernel/time/timekeeping.c
--- 6/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@@@@@@ -58,26 -58,27 -58,27 -58,27 -58,27 -58,27 +58,26 @@@@@@@ struct clocksource *clock
       
       #ifdef CONFIG_GENERIC_TIME
       /**
- ----- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ +++++ * clocksource_forward_now - update clock to the current time
        *
- ----- * private function, must hold xtime_lock lock when being
- ----- * called. Returns the number of nanoseconds since the
- ----- * last call to update_wall_time() (adjusted by NTP scaling)
+ +++++ * Forward the current clock to update its state since the last call to
+ +++++ * update_wall_time(). This is useful before significant clock changes,
+ +++++ * as it avoids having to deal with this time offset explicitly.
        */
- -----static inline s64 __get_nsec_offset(void)
+ +++++static void clocksource_forward_now(void)
       {
         cycle_t cycle_now, cycle_delta;
- -----  s64 ns_offset;
+ +++++  s64 nsec;
       
- -----  /* read clocksource: */
         cycle_now = clocksource_read(clock);
- -----
- -----  /* calculate the delta since the last update_wall_time: */
         cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ +++++  clock->cycle_last = cycle_now;
       
- -----  /* convert to nanoseconds: */
- -----  ns_offset = cyc2ns(clock, cycle_delta);
+ +++++  nsec = cyc2ns(clock, cycle_delta);
+ +++++  timespec_add_ns(&xtime, nsec);
       
- -----  return ns_offset;
+ +++++  nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+ +++++  clock->raw_time.tv_nsec += nsec;
       }
       
       /**
@@@@@@@ -88,7 -89,6 -89,6 -89,6 -89,6 -89,6 +88,7 @@@@@@@
        */
       void getnstimeofday(struct timespec *ts)
       {
+ +++++  cycle_t cycle_now, cycle_delta;
         unsigned long seq;
         s64 nsecs;
       
@@@@@@@ -96,15 -96,7 -96,7 -96,7 -96,7 -96,7 +96,15 @@@@@@@
                 seq = read_seqbegin(&xtime_lock);
       
                 *ts = xtime;
- -----          nsecs = __get_nsec_offset();
+ +++++
+ +++++          /* read clocksource: */
+ +++++          cycle_now = clocksource_read(clock);
+ +++++
+ +++++          /* calculate the delta since the last update_wall_time: */
+ +++++          cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ +++++
+ +++++          /* convert to nanoseconds: */
+ +++++          nsecs = cyc2ns(clock, cycle_delta);
       
         } while (read_seqretry(&xtime_lock, seq));
       
@@@@@@@ -137,22 -129,22 -129,22 -129,22 -129,22 -129,22 +137,22 @@@@@@@ EXPORT_SYMBOL(do_gettimeofday)
        */
       int do_settimeofday(struct timespec *tv)
       {
+ +++++  struct timespec ts_delta;
         unsigned long flags;
- -----  time_t wtm_sec, sec = tv->tv_sec;
- -----  long wtm_nsec, nsec = tv->tv_nsec;
       
         if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                 return -EINVAL;
       
         write_seqlock_irqsave(&xtime_lock, flags);
       
- -----  nsec -= __get_nsec_offset();
+ +++++  clocksource_forward_now();
+ +++++
+ +++++  ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+ +++++  ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+ +++++  wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
       
- -----  wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- -----  wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+ +++++  xtime = *tv;
       
- -----  set_normalized_timespec(&xtime, sec, nsec);
- -----  set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
         update_xtime_cache(0);
       
         clock->error = 0;
@@@@@@@ -178,19 -170,22 -170,22 -170,22 -170,22 -170,22 +178,19 @@@@@@@ EXPORT_SYMBOL(do_settimeofday)
       static void change_clocksource(void)
       {
         struct clocksource *new;
- -----  cycle_t now;
- -----  u64 nsec;
       
         new = clocksource_get_next();
       
         if (clock == new)
                 return;
       
- -----  new->cycle_last = 0;
- -----  now = clocksource_read(new);
- -----  nsec =  __get_nsec_offset();
- -----  timespec_add_ns(&xtime, nsec);
+ +++++  clocksource_forward_now();
       
- -----  clock = new;
- -----  clock->cycle_last = now;
+ +++++  new->raw_time = clock->raw_time;
       
+ +++++  clock = new;
+ +++++  clock->cycle_last = 0;
+ +++++  clock->cycle_last = clocksource_read(new);
         clock->error = 0;
         clock->xtime_nsec = 0;
         clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@@@@@@ -205,43 -200,10 -200,10 -200,10 -200,10 -200,10 +205,43 @@@@@@@
          */
       }
       #else
+ +++++static inline void clocksource_forward_now(void) { }
       static inline void change_clocksource(void) { }
- -----static inline s64 __get_nsec_offset(void) { return 0; }
       #endif
       
+ +++++/**
+ +++++ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ +++++ * @ts:           pointer to the timespec to be set
+ +++++ *
+ +++++ * Returns the raw monotonic time (completely un-modified by ntp)
+ +++++ */
+ +++++void getrawmonotonic(struct timespec *ts)
+ +++++{
+ +++++  unsigned long seq;
+ +++++  s64 nsecs;
+ +++++  cycle_t cycle_now, cycle_delta;
+ +++++
+ +++++  do {
+ +++++          seq = read_seqbegin(&xtime_lock);
+ +++++
+ +++++          /* read clocksource: */
+ +++++          cycle_now = clocksource_read(clock);
+ +++++
+ +++++          /* calculate the delta since the last update_wall_time: */
+ +++++          cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ +++++
+ +++++          /* convert to nanoseconds: */
+ +++++          nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
+ +++++
+ +++++          *ts = clock->raw_time;
+ +++++
+ +++++  } while (read_seqretry(&xtime_lock, seq));
+ +++++
+ +++++  timespec_add_ns(ts, nsecs);
+ +++++}
+ +++++EXPORT_SYMBOL(getrawmonotonic);
+ +++++
+ +++++
       /**
        * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
        */
@@@@@@@ -303,6 -265,8 -265,8 -265,8 -265,8 -265,8 +303,6 @@@@@@@ void __init timekeeping_init(void
       static int timekeeping_suspended;
       /* time in seconds when suspend began */
       static unsigned long timekeeping_suspend_time;
- -----/* xtime offset when we went into suspend */
- -----static s64 timekeeping_suspend_nsecs;
       
       /**
        * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@@@@@@ -328,6 -292,8 -292,8 -292,8 -292,8 -292,8 +328,6 @@@@@@@ static int timekeeping_resume(struct sy
                 wall_to_monotonic.tv_sec -= sleep_length;
                 total_sleep_time += sleep_length;
         }
- -----  /* Make sure that we have the correct xtime reference */
- -----  timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
         update_xtime_cache(0);
         /* re-base the last cycle value */
         clock->cycle_last = 0;
@@@@@@@ -353,7 -319,8 -319,8 -319,8 -319,8 -319,8 +353,7 @@@@@@@ static int timekeeping_suspend(struct s
         timekeeping_suspend_time = read_persistent_clock();
       
         write_seqlock_irqsave(&xtime_lock, flags);
- -----  /* Get the current xtime offset */
- -----  timekeeping_suspend_nsecs = __get_nsec_offset();
+ +++++  clocksource_forward_now();
         timekeeping_suspended = 1;
         write_sequnlock_irqrestore(&xtime_lock, flags);
       
@@@@@@@ -487,29 -454,23 -454,23 -454,23 -454,23 -454,23 +487,29 @@@@@@@ void update_wall_time(void
       #else
         offset = clock->cycle_interval;
       #endif
--- --  clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+++ ++  clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
       
         /* normally this loop will run just once, however in the
          * case of lost or late ticks, it will accumulate correctly.
          */
         while (offset >= clock->cycle_interval) {
                 /* accumulate one interval */
- -----          clock->xtime_nsec += clock->xtime_interval;
- -----          clock->cycle_last += clock->cycle_interval;
                 offset -= clock->cycle_interval;
+ +++++          clock->cycle_last += clock->cycle_interval;
       
+ +++++          clock->xtime_nsec += clock->xtime_interval;
                 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
                         clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
                         xtime.tv_sec++;
                         second_overflow();
                 }
       
+ +++++          clock->raw_time.tv_nsec += clock->raw_interval;
+ +++++          if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
+ +++++                  clock->raw_time.tv_nsec -= NSEC_PER_SEC;
+ +++++                  clock->raw_time.tv_sec++;
+ +++++          }
+ +++++
                 /* accumulate error between NTP and clock interval */
                 clock->error += tick_length;
                 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@@@@@@ -518,9 -479,9 -479,9 -479,12 -479,9 -479,9 +518,12 @@@@@@@
         /* correct the clock when NTP error is too big */
         clocksource_adjust(offset);
       
--- --  /* store full nanoseconds into xtime */
--- --  xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+++ ++  /* store full nanoseconds into xtime after rounding it up and
+++ ++   * add the remainder to the error difference.
+++ ++   */
+++ ++  xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
         clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+++ ++  clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
       
         update_xtime_cache(cyc2ns(clock, offset));
       
diff --combined kernel/timer.c

index 03bc7f1f159350d2ad29ca45d3b3a28fc275f296,e8019cc3418d442bd94ccba44cdecfead9ef2d41,510fe69351ca2ec19700802b59ac3dbe9d142019,03bc7f1f159350d2ad29ca45d3b3a28fc275f296,03bc7f1f159350d2ad29ca45d3b3a28fc275f296,510fe69351ca2ec19700802b59ac3dbe9d142019..56becf373c589ba90c36a5b6e23df0527b227663
--- 1/kernel/timer.c
--- 2/kernel/timer.c
--- 3/kernel/timer.c
--- 4/kernel/timer.c
--- 5/kernel/timer.c
--- 6/kernel/timer.c
+++ b/kernel/timer.c
@@@@@@@ -978,6 -978,6 -978,7 -978,6 -978,6 -978,7 +978,7 @@@@@@@ void update_process_times(int user_tick
         run_local_timers();
         if (rcu_pending(cpu))
                 rcu_check_callbacks(cpu, user_tick);
++ ++   printk_tick();
         scheduler_tick();
         run_posix_cpu_timers(p);
       }
@@@@@@@ -1435,9 -1435,11 -1436,9 -1435,9 -1435,9 -1436,9 +1436,11 @@@@@@@ static void __cpuinit migrate_timers(in
         BUG_ON(cpu_online(cpu));
         old_base = per_cpu(tvec_bases, cpu);
         new_base = get_cpu_var(tvec_bases);
- ----
- ----  local_irq_disable();
- ----  spin_lock(&new_base->lock);
+ ++++  /*
+ ++++   * The caller is globally serialized and nobody else
+ ++++   * takes two locks at once, deadlock is not possible.
+ ++++   */
+ ++++  spin_lock_irq(&new_base->lock);
         spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
       
         BUG_ON(old_base->running_timer);
@@@@@@@ -1452,8 -1454,7 -1453,8 -1452,8 -1452,8 -1453,8 +1455,7 @@@@@@@
         }
       
         spin_unlock(&old_base->lock);
- ----  spin_unlock(&new_base->lock);
- ----  local_irq_enable();
+ ++++  spin_unlock_irq(&new_base->lock);
         put_cpu_var(tvec_bases);
       }
       #endif /* CONFIG_HOTPLUG_CPU */
diff --combined security/selinux/hooks.c

index 03fc6a81ae32bd783ddd96eca85f118a2ba79bd8,03fc6a81ae32bd783ddd96eca85f118a2ba79bd8,576e511990794eacfad7ad525722c4fa077ae852,03fc6a81ae32bd783ddd96eca85f118a2ba79bd8,69649783c26603e1afcadf85aff62a9a66998ed9,576e511990794eacfad7ad525722c4fa077ae852..3e3fde7c1d2bf2a48af6d19d72e47f3adc95514b
--- 1/security/selinux/hooks.c
--- 2/security/selinux/hooks.c
--- 3/security/selinux/hooks.c
--- 4/security/selinux/hooks.c
--- 5/security/selinux/hooks.c
--- 6/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@@@@@@ -75,6 -75,6 -75,6 -75,6 -75,7 -75,6 +75,7 @@@@@@@
       #include <linux/string.h>
       #include <linux/selinux.h>
       #include <linux/mutex.h>
++++ +#include <linux/posix-timers.h>
       
       #include "avc.h"
       #include "objsec.h"
@@@@@@@ -291,6 -291,6 -291,7 -291,6 -292,6 -291,7 +292,7 @@@@@@@ static void sk_free_security(struct soc
         struct sk_security_struct *ssec = sk->sk_security;
       
         sk->sk_security = NULL;
++ ++   selinux_netlbl_sk_security_free(ssec);
         kfree(ssec);
       }
       
@@@@@@@ -324,7 -324,7 -325,7 -324,7 -325,7 -325,7 +326,7 @@@@@@@ enum 
         Opt_rootcontext = 4,
       };
       
-- -- static match_table_t tokens = {
++ ++ static const match_table_t tokens = {
         {Opt_context, CONTEXT_STR "%s"},
         {Opt_fscontext, FSCONTEXT_STR "%s"},
         {Opt_defcontext, DEFCONTEXT_STR "%s"},
@@@@@@@ -957,7 -957,7 -958,8 -957,7 -958,7 -958,8 +959,8 @@@@@@@ out_err
         return rc;
       }
       
-- -- void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts)
++ ++ static void selinux_write_opts(struct seq_file *m,
++ ++                          struct security_mnt_opts *opts)
       {
         int i;
         char *prefix;
@@@@@@@ -1290,7 -1290,7 -1292,7 -1290,7 -1291,7 -1292,7 +1293,7 @@@@@@@ static int inode_doinit_with_dentry(str
                 /* Default to the fs superblock SID. */
                 isec->sid = sbsec->sid;
       
-- --           if (sbsec->proc) {
++ ++           if (sbsec->proc && !S_ISLNK(inode->i_mode)) {
                         struct proc_inode *proci = PROC_I(inode);
                         if (proci->pde) {
                                 isec->sclass = inode_mode_to_security_class(inode->i_mode);
@@@@@@@ -2120,7 -2120,7 -2122,6 -2120,7 -2121,7 -2122,6 +2123,6 @@@@@@@ static inline void flush_unauthorized_f
         long j = -1;
         int drop_tty = 0;
       
-- --   mutex_lock(&tty_mutex);
         tty = get_current_tty();
         if (tty) {
                 file_list_lock();
@@@@@@@ -2138,8 -2138,8 -2139,8 -2138,8 -2139,8 -2139,8 +2140,8 @@@@@@@
                         }
                 }
                 file_list_unlock();
++ ++           tty_kref_put(tty);
         }
-- --   mutex_unlock(&tty_mutex);
         /* Reset controlling tty. */
         if (drop_tty)
                 no_tty();
@@@@@@@ -2321,13 -2321,13 -2322,13 -2321,13 -2322,7 -2322,13 +2323,7 @@@@@@@ static void selinux_bprm_post_apply_cre
                         initrlim = init_task.signal->rlim+i;
                         rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
                 }
---- -          if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
---- -                  /*
---- -                   * This will cause RLIMIT_CPU calculations
---- -                   * to be refigured.
---- -                   */
---- -                  current->it_prof_expires = jiffies_to_cputime(1);
---- -          }
++++ +          update_rlimit_cpu(rlim->rlim_cur);
         }
       
         /* Wake up the parent if it is waiting so that it can
@@@@@@@ -3548,38 -3548,38 -3549,44 -3548,38 -3543,38 -3549,44 +3544,44 @@@@@@@ out
       #endif /* IPV6 */
       
       static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad,
-- --                        char **addrp, int src, u8 *proto)
++ ++                        char **_addrp, int src, u8 *proto)
       {
-- --   int ret = 0;
++ ++   char *addrp;
++ ++   int ret;
       
         switch (ad->u.net.family) {
         case PF_INET:
                 ret = selinux_parse_skb_ipv4(skb, ad, proto);
-- --           if (ret || !addrp)
-- --                   break;
-- --           *addrp = (char *)(src ? &ad->u.net.v4info.saddr :
-- --                                   &ad->u.net.v4info.daddr);
-- --           break;
++ ++           if (ret)
++ ++                   goto parse_error;
++ ++           addrp = (char *)(src ? &ad->u.net.v4info.saddr :
++ ++                                  &ad->u.net.v4info.daddr);
++ ++           goto okay;
       
       #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
         case PF_INET6:
                 ret = selinux_parse_skb_ipv6(skb, ad, proto);
-- --           if (ret || !addrp)
-- --                   break;
-- --           *addrp = (char *)(src ? &ad->u.net.v6info.saddr :
-- --                                   &ad->u.net.v6info.daddr);
-- --           break;
++ ++           if (ret)
++ ++                   goto parse_error;
++ ++           addrp = (char *)(src ? &ad->u.net.v6info.saddr :
++ ++                                  &ad->u.net.v6info.daddr);
++ ++           goto okay;
       #endif    /* IPV6 */
         default:
-- --           break;
++ ++           addrp = NULL;
++ ++           goto okay;
         }
       
-- --   if (unlikely(ret))
-- --           printk(KERN_WARNING
-- --                  "SELinux: failure in selinux_parse_skb(),"
-- --                  " unable to parse packet\n");
-- -- 
++ ++ parse_error:
++ ++   printk(KERN_WARNING
++ ++          "SELinux: failure in selinux_parse_skb(),"
++ ++          " unable to parse packet\n");
         return ret;
++ ++ 
++ ++ okay:
++ ++   if (_addrp)
++ ++           *_addrp = addrp;
++ ++   return 0;
       }
       
       /**
@@@@@@@ -3794,6 -3794,6 -3801,7 -3794,6 -3789,6 -3801,7 +3796,7 @@@@@@@ out
       
       static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen)
       {
++ ++   struct sock *sk = sock->sk;
         struct inode_security_struct *isec;
         int err;
       
@@@@@@@ -3807,7 -3807,7 -3815,6 -3807,7 -3802,7 -3815,6 +3810,6 @@@@@@@
         isec = SOCK_INODE(sock)->i_security;
         if (isec->sclass == SECCLASS_TCP_SOCKET ||
             isec->sclass == SECCLASS_DCCP_SOCKET) {
-- --           struct sock *sk = sock->sk;
                 struct avc_audit_data ad;
                 struct sockaddr_in *addr4 = NULL;
                 struct sockaddr_in6 *addr6 = NULL;
@@@@@@@ -3841,6 -3841,6 -3848,8 -3841,6 -3836,6 -3848,8 +3843,8 @@@@@@@
                         goto out;
         }
       
++ ++   err = selinux_netlbl_socket_connect(sk, address);
++ ++ 
       out:
         return err;
       }
@@@@@@@ -4070,20 -4070,20 -4079,28 -4070,20 -4065,20 -4079,28 +4074,28 @@@@@@@ static int selinux_sock_rcv_skb_iptable
       }
       
       static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
-- --                                  struct avc_audit_data *ad,
-- --                                  u16 family, char *addrp)
++ ++                                  u16 family)
       {
         int err;
         struct sk_security_struct *sksec = sk->sk_security;
         u32 peer_sid;
         u32 sk_sid = sksec->sid;
++ ++   struct avc_audit_data ad;
++ ++   char *addrp;
++ ++ 
++ ++   AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++   ad.u.net.netif = skb->iif;
++ ++   ad.u.net.family = family;
++ ++   err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
++ ++   if (err)
++ ++           return err;
       
         if (selinux_compat_net)
-- --           err = selinux_sock_rcv_skb_iptables_compat(sk, skb, ad,
++ ++           err = selinux_sock_rcv_skb_iptables_compat(sk, skb, &ad,
                                                            family, addrp);
         else
                 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
-- --                              PACKET__RECV, ad);
++ ++                              PACKET__RECV, &ad);
         if (err)
                 return err;
       
@@@@@@@ -4092,12 -4092,12 -4109,14 -4092,12 -4087,12 -4109,14 +4104,14 @@@@@@@
                 if (err)
                         return err;
                 err = avc_has_perm(sk_sid, peer_sid,
-- --                              SECCLASS_PEER, PEER__RECV, ad);
++ ++                              SECCLASS_PEER, PEER__RECV, &ad);
++ ++           if (err)
++ ++                   selinux_netlbl_err(skb, err, 0);
         } else {
-- --           err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad);
++ ++           err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
                 if (err)
                         return err;
-- --           err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad);
++ ++           err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
         }
       
         return err;
@@@@@@@ -4111,6 -4111,6 -4130,8 -4111,6 -4106,6 -4130,8 +4125,8 @@@@@@@ static int selinux_socket_sock_rcv_skb(
         u32 sk_sid = sksec->sid;
         struct avc_audit_data ad;
         char *addrp;
++ ++   u8 secmark_active;
++ ++   u8 peerlbl_active;
       
         if (family != PF_INET && family != PF_INET6)
                 return 0;
@@@@@@@ -4119,6 -4119,6 -4140,18 -4119,6 -4114,6 -4140,18 +4135,18 @@@@@@@
         if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                 family = PF_INET;
       
++ ++   /* If any sort of compatibility mode is enabled then handoff processing
++ ++    * to the selinux_sock_rcv_skb_compat() function to deal with the
++ ++    * special handling.  We do this in an attempt to keep this function
++ ++    * as fast and as clean as possible. */
++ ++   if (selinux_compat_net || !selinux_policycap_netpeer)
++ ++           return selinux_sock_rcv_skb_compat(sk, skb, family);
++ ++ 
++ ++   secmark_active = selinux_secmark_enabled();
++ ++   peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
++ ++   if (!secmark_active && !peerlbl_active)
++ ++           return 0;
++ ++ 
         AVC_AUDIT_DATA_INIT(&ad, NET);
         ad.u.net.netif = skb->iif;
         ad.u.net.family = family;
@@@@@@@ -4126,15 -4126,15 -4159,7 -4126,15 -4121,15 -4159,7 +4154,7 @@@@@@@
         if (err)
                 return err;
       
-- --   /* If any sort of compatibility mode is enabled then handoff processing
-- --    * to the selinux_sock_rcv_skb_compat() function to deal with the
-- --    * special handling.  We do this in an attempt to keep this function
-- --    * as fast and as clean as possible. */
-- --   if (selinux_compat_net || !selinux_policycap_netpeer)
-- --           return selinux_sock_rcv_skb_compat(sk, skb, &ad,
-- --                                              family, addrp);
-- -- 
-- --   if (netlbl_enabled() || selinux_xfrm_enabled()) {
++ ++   if (peerlbl_active) {
                 u32 peer_sid;
       
                 err = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
@@@@@@@ -4142,13 -4142,13 -4167,17 -4142,13 -4137,13 -4167,17 +4162,17 @@@@@@@
                         return err;
                 err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family,
                                                peer_sid, &ad);
-- --           if (err)
++ ++           if (err) {
++ ++                   selinux_netlbl_err(skb, err, 0);
                         return err;
++ ++           }
                 err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER,
                                    PEER__RECV, &ad);
++ ++           if (err)
++ ++                   selinux_netlbl_err(skb, err, 0);
         }
       
-- --   if (selinux_secmark_enabled()) {
++ ++   if (secmark_active) {
                 err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET,
                                    PACKET__RECV, &ad);
                 if (err)
@@@@@@@ -4207,10 -4207,10 -4236,12 -4207,10 -4202,10 -4236,12 +4231,12 @@@@@@@ static int selinux_socket_getpeersec_dg
         u32 peer_secid = SECSID_NULL;
         u16 family;
       
-- --   if (sock)
++ ++   if (skb && skb->protocol == htons(ETH_P_IP))
++ ++           family = PF_INET;
++ ++   else if (skb && skb->protocol == htons(ETH_P_IPV6))
++ ++           family = PF_INET6;
++ ++   else if (sock)
                 family = sock->sk->sk_family;
-- --   else if (skb && skb->sk)
-- --           family = skb->sk->sk_family;
         else
                 goto out;
       
@@@@@@@ -4268,8 -4268,8 -4299,6 -4268,8 -4263,8 -4299,6 +4294,6 @@@@@@@ static void selinux_sock_graft(struct s
             sk->sk_family == PF_UNIX)
                 isec->sid = sksec->sid;
         sksec->sclass = isec->sclass;
-- -- 
-- --   selinux_netlbl_sock_graft(sk, parent);
       }
       
       static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb,
@@@@@@@ -4277,10 -4277,10 -4306,15 -4277,10 -4272,10 -4306,15 +4301,15 @@@@@@@
       {
         struct sk_security_struct *sksec = sk->sk_security;
         int err;
++ ++   u16 family = sk->sk_family;
         u32 newsid;
         u32 peersid;
       
-- --   err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid);
++ ++   /* handle mapped IPv4 packets arriving via IPv6 sockets */
++ ++   if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
++ ++           family = PF_INET;
++ ++ 
++ ++   err = selinux_skb_peerlbl_sid(skb, family, &peersid);
         if (err)
                 return err;
         if (peersid == SECSID_NULL) {
@@@@@@@ -4315,12 -4315,12 -4349,18 -4315,12 -4310,12 -4349,18 +4344,18 @@@@@@@ static void selinux_inet_csk_clone(stru
         selinux_netlbl_sk_security_reset(newsksec, req->rsk_ops->family);
       }
       
-- -- static void selinux_inet_conn_established(struct sock *sk,
-- --                           struct sk_buff *skb)
++ ++ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
       {
++ ++   u16 family = sk->sk_family;
         struct sk_security_struct *sksec = sk->sk_security;
       
-- --   selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid);
++ ++   /* handle mapped IPv4 packets arriving via IPv6 sockets */
++ ++   if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
++ ++           family = PF_INET;
++ ++ 
++ ++   selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
++ ++ 
++ ++   selinux_netlbl_inet_conn_established(sk, family);
       }
       
       static void selinux_req_classify_flow(const struct request_sock *req,
@@@@@@@ -4370,39 -4370,39 -4410,54 -4370,39 -4365,39 -4410,54 +4405,54 @@@@@@@ out
       static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
                                        u16 family)
       {
++ ++   int err;
         char *addrp;
         u32 peer_sid;
         struct avc_audit_data ad;
         u8 secmark_active;
++ ++   u8 netlbl_active;
         u8 peerlbl_active;
       
         if (!selinux_policycap_netpeer)
                 return NF_ACCEPT;
       
         secmark_active = selinux_secmark_enabled();
-- --   peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled();
++ ++   netlbl_active = netlbl_enabled();
++ ++   peerlbl_active = netlbl_active || selinux_xfrm_enabled();
         if (!secmark_active && !peerlbl_active)
                 return NF_ACCEPT;
       
++ ++   if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
++ ++           return NF_DROP;
++ ++ 
         AVC_AUDIT_DATA_INIT(&ad, NET);
         ad.u.net.netif = ifindex;
         ad.u.net.family = family;
         if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0)
                 return NF_DROP;
       
-- --   if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0)
-- --           return NF_DROP;
-- -- 
-- --   if (peerlbl_active)
-- --           if (selinux_inet_sys_rcv_skb(ifindex, addrp, family,
-- --                                        peer_sid, &ad) != 0)
++ ++   if (peerlbl_active) {
++ ++           err = selinux_inet_sys_rcv_skb(ifindex, addrp, family,
++ ++                                          peer_sid, &ad);
++ ++           if (err) {
++ ++                   selinux_netlbl_err(skb, err, 1);
                         return NF_DROP;
++ ++           }
++ ++   }
       
         if (secmark_active)
                 if (avc_has_perm(peer_sid, skb->secmark,
                                  SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
                         return NF_DROP;
       
++ ++   if (netlbl_active)
++ ++           /* we do this in the FORWARD path and not the POST_ROUTING
++ ++            * path because we want to make sure we apply the necessary
++ ++            * labeling before IPsec is applied so we can leverage AH
++ ++            * protection */
++ ++           if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0)
++ ++                   return NF_DROP;
++ ++ 
         return NF_ACCEPT;
       }
       
@@@@@@@ -4426,6 -4426,6 -4481,37 -4426,6 -4421,6 -4481,37 +4476,37 @@@@@@@ static unsigned int selinux_ipv6_forwar
       }
       #endif    /* IPV6 */
       
++ ++ static unsigned int selinux_ip_output(struct sk_buff *skb,
++ ++                                 u16 family)
++ ++ {
++ ++   u32 sid;
++ ++ 
++ ++   if (!netlbl_enabled())
++ ++           return NF_ACCEPT;
++ ++ 
++ ++   /* we do this in the LOCAL_OUT path and not the POST_ROUTING path
++ ++    * because we want to make sure we apply the necessary labeling
++ ++    * before IPsec is applied so we can leverage AH protection */
++ ++   if (skb->sk) {
++ ++           struct sk_security_struct *sksec = skb->sk->sk_security;
++ ++           sid = sksec->sid;
++ ++   } else
++ ++           sid = SECINITSID_KERNEL;
++ ++   if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0)
++ ++           return NF_DROP;
++ ++ 
++ ++   return NF_ACCEPT;
++ ++ }
++ ++ 
++ ++ static unsigned int selinux_ipv4_output(unsigned int hooknum,
++ ++                                   struct sk_buff *skb,
++ ++                                   const struct net_device *in,
++ ++                                   const struct net_device *out,
++ ++                                   int (*okfn)(struct sk_buff *))
++ ++ {
++ ++   return selinux_ip_output(skb, PF_INET);
++ ++ }
++ ++ 
       static int selinux_ip_postroute_iptables_compat(struct sock *sk,
                                                 int ifindex,
                                                 struct avc_audit_data *ad,
@@@@@@@ -4493,30 -4493,30 -4579,36 -4493,30 -4488,30 -4579,36 +4574,36 @@@@@@@
       
       static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
                                                 int ifindex,
-- --                                           struct avc_audit_data *ad,
-- --                                           u16 family,
-- --                                           char *addrp,
-- --                                           u8 proto)
++ ++                                           u16 family)
       {
         struct sock *sk = skb->sk;
         struct sk_security_struct *sksec;
++ ++   struct avc_audit_data ad;
++ ++   char *addrp;
++ ++   u8 proto;
       
         if (sk == NULL)
                 return NF_ACCEPT;
         sksec = sk->sk_security;
       
++ ++   AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++   ad.u.net.netif = ifindex;
++ ++   ad.u.net.family = family;
++ ++   if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
++ ++           return NF_DROP;
++ ++ 
         if (selinux_compat_net) {
                 if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex,
-- --                                                    ad, family, addrp))
++ ++                                                    &ad, family, addrp))
                         return NF_DROP;
         } else {
                 if (avc_has_perm(sksec->sid, skb->secmark,
-- --                            SECCLASS_PACKET, PACKET__SEND, ad))
++ ++                            SECCLASS_PACKET, PACKET__SEND, &ad))
                         return NF_DROP;
         }
       
         if (selinux_policycap_netpeer)
-- --           if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto))
++ ++           if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto))
                         return NF_DROP;
       
         return NF_ACCEPT;
@@@@@@@ -4530,23 -4530,23 -4622,15 -4530,23 -4525,23 -4622,15 +4617,15 @@@@@@@ static unsigned int selinux_ip_postrout
         struct sock *sk;
         struct avc_audit_data ad;
         char *addrp;
-- --   u8 proto;
         u8 secmark_active;
         u8 peerlbl_active;
       
-- --   AVC_AUDIT_DATA_INIT(&ad, NET);
-- --   ad.u.net.netif = ifindex;
-- --   ad.u.net.family = family;
-- --   if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto))
-- --           return NF_DROP;
-- -- 
         /* If any sort of compatibility mode is enabled then handoff processing
          * to the selinux_ip_postroute_compat() function to deal with the
          * special handling.  We do this in an attempt to keep this function
          * as fast and as clean as possible. */
         if (selinux_compat_net || !selinux_policycap_netpeer)
-- --           return selinux_ip_postroute_compat(skb, ifindex, &ad,
-- --                                              family, addrp, proto);
++ ++           return selinux_ip_postroute_compat(skb, ifindex, family);
       
         /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
          * packet transformation so allow the packet to pass without any checks
@@@@@@@ -4562,21 -4562,21 -4646,45 -4562,21 -4557,21 -4646,45 +4641,45 @@@@@@@
         if (!secmark_active && !peerlbl_active)
                 return NF_ACCEPT;
       
-- --   /* if the packet is locally generated (skb->sk != NULL) then use the
-- --    * socket's label as the peer label, otherwise the packet is being
-- --    * forwarded through this system and we need to fetch the peer label
-- --    * directly from the packet */
++ ++   /* if the packet is being forwarded then get the peer label from the
++ ++    * packet itself; otherwise check to see if it is from a local
++ ++    * application or the kernel, if from an application get the peer label
++ ++    * from the sending socket, otherwise use the kernel's sid */
         sk = skb->sk;
-- --   if (sk) {
++ ++   if (sk == NULL) {
++ ++           switch (family) {
++ ++           case PF_INET:
++ ++                   if (IPCB(skb)->flags & IPSKB_FORWARDED)
++ ++                           secmark_perm = PACKET__FORWARD_OUT;
++ ++                   else
++ ++                           secmark_perm = PACKET__SEND;
++ ++                   break;
++ ++           case PF_INET6:
++ ++                   if (IP6CB(skb)->flags & IP6SKB_FORWARDED)
++ ++                           secmark_perm = PACKET__FORWARD_OUT;
++ ++                   else
++ ++                           secmark_perm = PACKET__SEND;
++ ++                   break;
++ ++           default:
++ ++                   return NF_DROP;
++ ++           }
++ ++           if (secmark_perm == PACKET__FORWARD_OUT) {
++ ++                   if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
++ ++                           return NF_DROP;
++ ++           } else
++ ++                   peer_sid = SECINITSID_KERNEL;
++ ++   } else {
                 struct sk_security_struct *sksec = sk->sk_security;
                 peer_sid = sksec->sid;
                 secmark_perm = PACKET__SEND;
-- --   } else {
-- --           if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
-- --                           return NF_DROP;
-- --           secmark_perm = PACKET__FORWARD_OUT;
         }
       
++ ++   AVC_AUDIT_DATA_INIT(&ad, NET);
++ ++   ad.u.net.netif = ifindex;
++ ++   ad.u.net.family = family;
++ ++   if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
++ ++           return NF_DROP;
++ ++ 
         if (secmark_active)
                 if (avc_has_perm(peer_sid, skb->secmark,
                                  SECCLASS_PACKET, secmark_perm, &ad))
@@@@@@@ -5219,8 -5219,8 -5327,12 -5219,8 -5214,8 -5327,12 +5322,12 @@@@@@@ static int selinux_setprocattr(struct t
       
                 if (sid == 0)
                         return -EINVAL;
-- -- 
-- --           /* Only allow single threaded processes to change context */
++ ++           /*
++ ++            * SELinux allows a context change only in the following cases:
++ ++            *  - Single-threaded processes.
++ ++            *  - Multi-threaded processes changing their context to a more
++ ++            *    restricted domain (defined by a TYPEBOUNDS statement).
++ ++            */
                 if (atomic_read(&p->mm->mm_users) != 1) {
                         struct task_struct *g, *t;
                         struct mm_struct *mm = p->mm;
@@@@@@@ -5228,11 -5228,11 -5340,16 -5228,11 -5223,11 -5340,16 +5335,16 @@@@@@@
                         do_each_thread(g, t) {
                                 if (t->mm == mm && t != p) {
                                         read_unlock(&tasklist_lock);
-- --                                   return -EPERM;
++ ++                                   error = security_bounded_transition(tsec->sid, sid);
++ ++                                   if (!error)
++ ++                                           goto boundary_ok;
++ ++ 
++ ++                                   return error;
                                 }
                         } while_each_thread(g, t);
                         read_unlock(&tasklist_lock);
                 }
++ ++ boundary_ok:
       
                 /* Check permissions for the transition. */
                 error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
@@@@@@@ -5641,6 -5641,6 -5758,13 -5641,6 -5636,6 -5758,13 +5753,13 @@@@@@@ static struct nf_hook_ops selinux_ipv4_
                 .pf =           PF_INET,
                 .hooknum =      NF_INET_FORWARD,
                 .priority =     NF_IP_PRI_SELINUX_FIRST,
++ ++   },
++ ++   {
++ ++           .hook =         selinux_ipv4_output,
++ ++           .owner =        THIS_MODULE,
++ ++           .pf =           PF_INET,
++ ++           .hooknum =      NF_INET_LOCAL_OUT,
++ ++           .priority =     NF_IP_PRI_SELINUX_FIRST,
         }
       };
author	Thomas Gleixner <tglx@linutronix.de>
	Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)
committer	Thomas Gleixner <tglx@linutronix.de>
	Mon, 20 Oct 2008 11:14:06 +0000 (13:14 +0200)
		1	2	3	4	5	6
drivers/clocksource/acpi_pm.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
fs/binfmt_elf.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
fs/proc/array.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
include/linux/hrtimer.h	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
include/linux/time.h	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/compat.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/hrtimer.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/posix-timers.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/sched_fair.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/sched_rt.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/softirq.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/sys.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/time/ntp.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/time/timekeeping.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
kernel/timer.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history
security/selinux/hooks.c	patch \|	diff1 \|	diff2 \|	diff3 \|	diff4 \|	diff5 \|	diff6 \|	blob \| history