From: Thomas Gleixner Date: Mon, 20 Oct 2008 11:14:06 +0000 (+0200) Subject: Merge branches 'timers/clocksource', 'timers/hrtimers', 'timers/nohz', 'timers/ntp... X-Git-Tag: v2.6.28-rc1~82^2 X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=c465a76af658b443075d6efee1c3131257643020;hp=-c;p=linux-2.6-omap-h63xx.git Merge branches 'timers/clocksource', 'timers/hrtimers', 'timers/nohz', 'timers/ntp', 'timers/posixtimers' and 'timers/debug' into v28-timers-for-linus --- c465a76af658b443075d6efee1c3131257643020 diff --combined drivers/clocksource/acpi_pm.c index 3df33848100,5ca1d80de18,71d2ac4e3f4,5ca1d80de18,4eee533f3f4,71d2ac4e3f4..c20171078d1 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@@@@@@ -21,6 -21,6 -21,7 -21,6 -21,7 -21,7 +21,7 @@@@@@@ #include #include #include ++ + #include #include /* @@@@@@@ -151,13 -151,13 -152,13 -151,13 -152,13 -152,13 +152,13 @@@@@@@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_S */ static int verify_pmtmr_rate(void) { -- - u32 value1, value2; ++ + cycle_t value1, value2; unsigned long count, delta; mach_prepare_counter(); -- - value1 = read_pmtmr(); ++ + value1 = clocksource_acpi_pm.read(); mach_countup(&count); -- - value2 = read_pmtmr(); ++ + value2 = clocksource_acpi_pm.read(); delta = (value2 - value1) & ACPI_PM_MASK; /* Check that the PMTMR delta is within 5% of what we expect */ @@@@@@@ -175,10 -175,10 -176,15 -175,10 -176,13 -176,15 +176,15 @@@@@@@ #define verify_pmtmr_rate() (0) #endif ++ + /* Number of monotonicity checks to perform during initialization */ ++ + #define ACPI_PM_MONOTONICITY_CHECKS 10 ++ ++ /* Number of reads we try to get two different values */ ++ ++ #define ACPI_PM_READ_CHECKS 10000 ++ + static int __init init_acpi_pm_clocksource(void) { -- - u32 value1, value2; -- - unsigned int i; ++ + cycle_t value1, value2; - unsigned int i, j, good = 0; ++ ++ unsigned int i, j = 0; if (!pmtmr_ioport) return -ENODEV; @@@@@@@ -187,24 -187,24 -193,29 -187,24 -191,32 -193,29 +193,29 @@@@@@@ clocksource_acpi_pm.shift); /* "verify" this timing source: */ -- - value1 = read_pmtmr(); -- - for (i = 0; i < 10000; i++) { -- - value2 = read_pmtmr(); -- - if (value2 == value1) -- - continue; -- - if (value2 > value1) -- - goto pm_good; -- - if ((value2 < value1) && ((value2) < 0xFFF)) -- - goto pm_good; -- - printk(KERN_INFO "PM-Timer had inconsistent results:" -- - " 0x%#x, 0x%#x - aborting.\n", value1, value2); -- - return -EINVAL; ++ + for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) { ++ ++ udelay(100 * j); ++ + value1 = clocksource_acpi_pm.read(); - for (i = 0; i < 10000; i++) { ++ ++ for (i = 0; i < ACPI_PM_READ_CHECKS; i++) { ++ + value2 = clocksource_acpi_pm.read(); ++ + if (value2 == value1) ++ + continue; ++ + if (value2 > value1) - good++; ++ + break; ++ + if ((value2 < value1) && ((value2) < 0xFFF)) - good++; ++ + break; ++ + printk(KERN_INFO "PM-Timer had inconsistent results:" ++ + " 0x%#llx, 0x%#llx - aborting.\n", ++ + value1, value2); ++ + return -EINVAL; ++ + } - udelay(300 * i); - } - - if (good != ACPI_PM_MONOTONICITY_CHECKS) { - printk(KERN_INFO "PM-Timer failed consistency check " - " (0x%#llx) - aborting.\n", value1); - return -ENODEV; ++ ++ if (i == ACPI_PM_READ_CHECKS) { ++ ++ printk(KERN_INFO "PM-Timer failed consistency check " ++ ++ " (0x%#llx) - aborting.\n", value1); ++ ++ return -ENODEV; ++ ++ } } -- - printk(KERN_INFO "PM-Timer had no reasonable result:" -- - " 0x%#x - aborting.\n", value1); -- - return -ENODEV; -- - pm_good: if (verify_pmtmr_rate() != 0) return -ENODEV; @@@@@@@ 
-226,12 -226,9 -237,9 -226,9 -238,9 -237,9 +237,12 @@@@@@@ static int __init parse_pmtmr(char *arg if (strict_strtoul(arg, 16, &base)) return -EINVAL; ----- +++++#ifdef CONFIG_X86_64 +++++ if (base > UINT_MAX) +++++ return -ERANGE; +++++#endif printk(KERN_INFO "PMTMR IOPort override: 0x%04x -> 0x%04lx\n", ----- (unsigned int)pmtmr_ioport, base); +++++ pmtmr_ioport, base); pmtmr_ioport = base; return 1; diff --combined fs/binfmt_elf.c index 655ed8d30a8,655ed8d30a8,c76afa26edf,655ed8d30a8,a8635f63703,c76afa26edf..83d72006e29 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@@@@@@ -683,7 -683,7 -683,7 -683,7 -683,7 -683,7 +683,7 @@@@@@@ static int load_elf_binary(struct linux * switch really is going to happen - do this in * flush_thread(). - akpm */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); @@@@@@@ -734,7 -734,7 -734,7 -734,7 -734,7 -734,7 +734,7 @@@@@@@ goto out_free_dentry; } else { /* Executables without an interpreter also need a personality */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@@@@@@ -748,7 -748,7 -748,7 -748,7 -748,7 -748,7 +748,7 @@@@@@@ /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ -- -- SET_PERSONALITY(loc->elf_ex, 0); ++ ++ SET_PERSONALITY(loc->elf_ex); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@@@@@@ -1333,20 -1333,20 -1333,20 -1333,20 -1333,15 -1333,20 +1333,15 @@@@@@@ static void fill_prstatus(struct elf_pr prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { ++++ + struct task_cputime cputime; ++++ + /* ---- - * This is the record for the group leader. Add in the ---- - * cumulative times of previous dead threads. This total ---- - * won't include the time of each live thread whose state ---- - * is included in the core dump. The final total reported ---- - * to our parent process when it calls wait4 will include ---- - * those sums as well as the little bit more time it takes ---- - * this and each other thread to finish dying after the ---- - * core dump synchronization phase. ++++ + * This is the record for the group leader. It shows the ++++ + * group-wide total, not its individual thread total. 
*/ ---- - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), ---- - &prstatus->pr_utime); ---- - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), ---- - &prstatus->pr_stime); ++++ + thread_group_cputime(p, &cputime); ++++ + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); ++++ + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --combined fs/proc/array.c index 0d6eb33597c,0d6eb33597c,f4bc0e78953,71c9be59c9c,933953c4e40,f4bc0e78953..bb9f4b05703 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@@@@@@ -86,11 -86,11 -86,6 -86,11 -86,11 -86,6 +86,6 @@@@@@@ #include #include "internal.h" -- -- /* Gcc optimizes away "strlen(x)" for constant x */ -- -- #define ADDBUF(buffer, string) \ -- -- do { memcpy(buffer, string, strlen(string)); \ -- -- buffer += strlen(string); } while (0) -- -- static inline void task_name(struct seq_file *m, struct task_struct *p) { int i; @@@@@@@ -261,7 -261,7 -256,6 -261,7 -261,7 -256,6 +256,6 @@@@@@@ static inline void task_sig(struct seq_ sigemptyset(&ignored); sigemptyset(&caught); -- -- rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; @@@@@@@ -272,7 -272,7 -266,6 -272,7 -272,7 -266,6 +266,6 @@@@@@@ qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } -- -- rcu_read_unlock(); seq_printf(m, "Threads:\t%d\n", num_threads); seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); @@@@@@@ -337,65 -337,65 -330,6 -337,6 -337,6 -330,6 +330,6 @@@@@@@ int proc_pid_status(struct seq_file *m return 0; } -- /* -- * Use precise platform statistics if available: -- */ -- #ifdef CONFIG_VIRT_CPU_ACCOUNTING -- static cputime_t task_utime(struct task_struct *p) -- { -- return p->utime; -- } -- -- static cputime_t task_stime(struct task_struct *p) -- { -- return p->stime; -- } -- #else -- static cputime_t task_utime(struct task_struct *p) -- { -- clock_t utime = cputime_to_clock_t(p->utime), -- total = utime + cputime_to_clock_t(p->stime); -- u64 temp; -- -- /* -- * Use CFS's precise accounting: -- */ -- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); -- -- if (total) { -- temp *= utime; -- do_div(temp, total); -- } -- utime = (clock_t)temp; -- -- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); -- return p->prev_utime; -- } -- -- static cputime_t task_stime(struct task_struct *p) -- { -- clock_t stime; -- -- /* -- * Use CFS's precise accounting. 
(we subtract utime from -- * the total, to make sure the total observed by userspace -- * grows monotonically - apps rely on that): -- */ -- stime = nsec_to_clock_t(p->se.sum_exec_runtime) - -- cputime_to_clock_t(task_utime(p)); -- -- if (stime >= 0) -- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); -- -- return p->prev_stime; -- } -- #endif -- -- static cputime_t task_gtime(struct task_struct *p) -- { -- return p->gtime; -- } -- static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { @@@@@@@ -454,20 -454,20 -388,20 -395,20 -395,20 -388,20 +388,20 @@@@@@@ /* add up live thread stats at the group level */ if (whole) { ++++ + struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; ---- - utime = cputime_add(utime, task_utime(t)); ---- - stime = cputime_add(stime, task_stime(t)); gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; ---- - utime = cputime_add(utime, sig->utime); ---- - stime = cputime_add(stime, sig->stime); ++++ + thread_group_cputime(task, &cputime); ++++ + utime = cputime.utime; ++++ + stime = cputime.stime; gtime = cputime_add(gtime, sig->gtime); } diff --combined include/linux/hrtimer.h index 6d93dce61cb,8730b60c943,2f245fe63bd,6d93dce61cb,6d93dce61cb,2f245fe63bd..9a4e35cd5f7 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@@@@@@ -47,14 -47,14 -47,22 -47,14 -47,14 -47,22 +47,22 @@@@@@@ enum hrtimer_restart * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and * does not restart the timer -- -- * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: Callback must run in hardirq context -- -- * Special mode for tick emultation ++ ++ * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context ++ ++ * Special mode for tick emulation and ++ ++ * scheduler timer. Such timers are per ++ ++ * cpu and not allowed to be migrated on ++ ++ * cpu unplug. ++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context ++ ++ * with timer->base lock unlocked ++ ++ * used for timers which call wakeup to ++ ++ * avoid lock order problems with rq->lock */ enum hrtimer_cb_mode { HRTIMER_CB_SOFTIRQ, HRTIMER_CB_IRQSAFE, HRTIMER_CB_IRQSAFE_NO_RESTART, -- -- HRTIMER_CB_IRQSAFE_NO_SOFTIRQ, ++ ++ HRTIMER_CB_IRQSAFE_PERCPU, ++ ++ HRTIMER_CB_IRQSAFE_UNLOCKED, }; /* @@@@@@@ -67,9 -67,9 -75,10 -67,9 -67,9 -75,10 +75,10 @@@@@@@ * 0x02 callback function running * 0x04 callback pending (high resolution mode) * -- -- * Special case: ++ ++ * Special cases: * 0x03 callback function running and enqueued * (was requeued on another CPU) ++ ++ * 0x09 timer was migrated on CPU hotunplug * The "callback function running and enqueued" status is only possible on * SMP. It happens for example when a posix timer expired and the callback * queued a signal. 
Between dropping the lock which protects the posix timer @@@@@@@ -87,6 -87,6 -96,7 -87,6 -87,6 -96,7 +96,7 @@@@@@@ #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 #define HRTIMER_STATE_PENDING 0x04 ++ ++ #define HRTIMER_STATE_MIGRATE 0x08 /** * struct hrtimer - the basic hrtimer structure @@@@@@@ -115,12 -115,12 -125,12 -115,12 -115,12 -125,12 +125,12 @@@@@@@ struct hrtimer enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; unsigned long state; - ---- enum hrtimer_cb_mode cb_mode; struct list_head cb_entry; + ++++ enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS + ++++ int start_pid; void *start_site; char start_comm[16]; - ---- int start_pid; #endif }; @@@@@@@ -145,10 -145,8 -155,10 -145,10 -145,10 -155,10 +155,8 @@@@@@@ struct hrtimer_sleeper * @first: pointer to the timer node which expires first * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock - ---- * @get_softirq_time: function to retrieve the current time from the softirq * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base - ---- * @reprogram: function to reprogram the timer event */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; @@@@@@@ -157,13 -155,9 -167,13 -157,13 -157,13 -167,13 +165,9 @@@@@@@ struct rb_node *first; ktime_t resolution; ktime_t (*get_time)(void); - ---- ktime_t (*get_softirq_time)(void); ktime_t softirq_time; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t offset; - ---- int (*reprogram)(struct hrtimer *t, - ---- struct hrtimer_clock_base *b, - ---- ktime_t n); #endif }; diff --combined include/linux/sched.h index cfb0d87b99f,cfb0d87b99f,c226c7b8294,3d9120c5ad1,23d9d546454,c226c7b8294..81c68fef443 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@@@@@@ -352,7 -352,7 -352,7 -352,7 -352,7 -352,7 +352,7 @@@@@@@ arch_get_unmapped_area_topdown(struct f extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); -- -- #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS ++ ++ #if USE_SPLIT_PTLOCKS /* * The mm counters are not protected by its page_table_lock, * so must be incremented atomically. @@@@@@@ -363,7 -363,7 -363,7 -363,7 -363,7 -363,7 +363,7 @@@@@@@ #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) -- -- #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++ ++ #else /* !USE_SPLIT_PTLOCKS */ /* * The mm counters are protected by its page_table_lock, * so can be incremented directly. @@@@@@@ -374,7 -374,7 -374,7 -374,7 -374,7 -374,7 +374,7 @@@@@@@ #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -- -- #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++ ++ #endif /* !USE_SPLIT_PTLOCKS */ #define get_mm_rss(mm) \ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) @@@@@@@ -425,6 -425,6 -425,6 -425,6 -425,39 -425,6 +425,39 @@@@@@@ struct pacct_struct unsigned long ac_minflt, ac_majflt; }; ++++ +/** ++++ + * struct task_cputime - collected CPU time counts ++++ + * @utime: time spent in user mode, in &cputime_t units ++++ + * @stime: time spent in kernel mode, in &cputime_t units ++++ + * @sum_exec_runtime: total time spent on the CPU, in nanoseconds ++++ + * ++++ + * This structure groups together three kinds of CPU time that are ++++ + * tracked for threads and thread groups. 
Most things considering ++++ + * CPU time want to group these counts together and treat all three ++++ + * of them in parallel. ++++ + */ ++++ +struct task_cputime { ++++ + cputime_t utime; ++++ + cputime_t stime; ++++ + unsigned long long sum_exec_runtime; ++++ +}; ++++ +/* Alternate field names when used to cache expirations. */ ++++ +#define prof_exp stime ++++ +#define virt_exp utime ++++ +#define sched_exp sum_exec_runtime ++++ + ++++ +/** ++++ + * struct thread_group_cputime - thread group interval timer counts ++++ + * @totals: thread group interval timers; substructure for ++++ + * uniprocessor kernel, per-cpu for SMP kernel. ++++ + * ++++ + * This structure contains the version of task_cputime, above, that is ++++ + * used for thread group CPU clock calculations. ++++ + */ ++++ +struct thread_group_cputime { ++++ + struct task_cputime *totals; ++++ +}; ++++ + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@@@@@@ -451,8 -451,8 -451,8 -451,8 -484,8 -451,8 +484,8 @@@@@@@ struct signal_struct * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ -- -- struct task_struct *group_exit_task; int notify_count; ++ ++ struct task_struct *group_exit_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@@@@@@ -470,6 -470,6 -470,6 -470,6 -503,17 -470,6 +503,17 @@@@@@@ cputime_t it_prof_expires, it_virt_expires; cputime_t it_prof_incr, it_virt_incr; ++++ + /* ++++ + * Thread group totals for process CPU clocks. ++++ + * See thread_group_cputime(), et al, for details. ++++ + */ ++++ + struct thread_group_cputime cputime; ++++ + ++++ + /* Earliest-expiration cache. */ ++++ + struct task_cputime cputime_expires; ++++ + ++++ + struct list_head cpu_timers[3]; ++++ + /* job control IDs */ /* @@@@@@@ -500,7 -500,7 -500,7 -500,7 -544,7 -500,7 +544,7 @@@@@@@ * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ ---- - cputime_t utime, stime, cutime, cstime; ++++ + cputime_t cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@@@@@@ -508,14 -508,14 -508,14 -508,14 -552,6 -508,14 +552,6 @@@@@@@ unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; ---- - /* ---- - * Cumulative ns of scheduled CPU time for dead threads in the ---- - * group, not including a zombie group leader. (This only differs ---- - * from jiffies_to_ns(utime + stime) if sched_clock uses something ---- - * other than jiffies.) 
---- - */ ---- - unsigned long long sum_sched_runtime; ---- - /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs @@@@@@@ -527,8 -527,8 -527,8 -527,8 -563,6 -527,8 +563,6 @@@@@@@ */ struct rlimit rlim[RLIM_NLIMITS]; ---- - struct list_head cpu_timers[3]; ---- - /* keep the process-shared keyrings here so that they do the right * thing in threads created with CLONE_THREAD */ #ifdef CONFIG_KEYS @@@@@@@ -824,6 -824,6 -824,9 -824,6 -858,6 -824,9 +858,9 @@@@@@@ struct sched_domain unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif ++ ++ #ifdef CONFIG_SCHED_DEBUG ++ ++ char *name; ++ ++ #endif }; extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, @@@@@@@ -897,7 -897,7 -900,7 -897,7 -931,7 -900,7 +934,7 @@@@@@@ struct sched_class void (*yield_task) (struct rq *rq); int (*select_task_rq)(struct task_struct *p, int sync); -- -- void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); ++ ++ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); @@@@@@@ -1010,8 -1010,8 -1013,8 -1010,8 -1044,8 -1013,8 +1047,8 @@@@@@@ struct sched_entity struct sched_rt_entity { struct list_head run_list; -- -- unsigned int time_slice; unsigned long timeout; ++ ++ unsigned int time_slice; int nr_cpus_allowed; struct sched_rt_entity *back; @@@@@@@ -1134,8 -1134,8 -1137,8 -1134,8 -1168,7 -1137,8 +1171,7 @@@@@@@ struct task_struct /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; ---- - cputime_t it_prof_expires, it_virt_expires; ---- - unsigned long long it_sched_expires; ++++ + struct task_cputime cputime_expires; struct list_head cpu_timers[3]; /* process credentials */ @@@@@@@ -1475,6 -1475,6 -1478,10 -1475,10 -1508,10 -1478,10 +1511,10 @@@@@@@ static inline void put_task_struct(stru __put_task_struct(t); } ++ extern cputime_t task_utime(struct task_struct *p); ++ extern cputime_t task_stime(struct task_struct *p); ++ extern cputime_t task_gtime(struct task_struct *p); ++ /* * Per process flags */ @@@@@@@ -1581,6 -1581,6 -1588,6 -1585,6 -1618,7 -1588,6 +1621,7 @@@@@@@ extern unsigned long long cpu_clock(in extern unsigned long long task_sched_runtime(struct task_struct *task); ++++ +extern unsigned long long thread_group_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@@@@@@ -2077,6 -2077,6 -2084,6 -2081,6 -2115,30 -2084,6 +2118,30 @@@@@@@ static inline int spin_needbreak(spinlo #endif } ++++ +/* ++++ + * Thread group CPU time accounting. ++++ + */ ++++ + ++++ +extern int thread_group_cputime_alloc(struct task_struct *); ++++ +extern void thread_group_cputime(struct task_struct *, struct task_cputime *); ++++ + ++++ +static inline void thread_group_cputime_init(struct signal_struct *sig) ++++ +{ ++++ + sig->cputime.totals = NULL; ++++ +} ++++ + ++++ +static inline int thread_group_cputime_clone_thread(struct task_struct *curr) ++++ +{ ++++ + if (curr->signal->cputime.totals) ++++ + return 0; ++++ + return thread_group_cputime_alloc(curr); ++++ +} ++++ + ++++ +static inline void thread_group_cputime_free(struct signal_struct *sig) ++++ +{ ++++ + free_percpu(sig->cputime.totals); ++++ +} ++++ + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. 
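The include/linux/sched.h hunks above only introduce the new group-accounting interface (struct task_cputime, thread_group_cputime() and the init/clone/free helpers); the fs/binfmt_elf.c and fs/proc/array.c hunks earlier in this diff show how it is meant to be consumed. As a reading aid only, not part of the patch, here is a minimal sketch of that calling pattern. The helper name report_group_times() is hypothetical; struct task_cputime, thread_group_cputime() and cputime_to_timeval() are the interfaces used in the diff itself.

/*
 * Sketch only -- not part of the patch. report_group_times() is a
 * hypothetical caller that mirrors the fill_prstatus() hunk above.
 * cputime_to_timeval() comes from the arch cputime headers pulled in
 * via <linux/sched.h>.
 */
#include <linux/sched.h>
#include <linux/time.h>

static void report_group_times(struct task_struct *p,
			       struct timeval *ut, struct timeval *st)
{
	struct task_cputime cputime;

	/* One call yields group-wide utime, stime and sum_exec_runtime. */
	thread_group_cputime(p, &cputime);
	cputime_to_timeval(cputime.utime, ut);
	cputime_to_timeval(cputime.stime, st);
}

Callers that previously summed signal->utime/stime plus per-thread times by hand (see the removed loops in kernel/exit.c and kernel/compat.c later in this diff) collapse to this single call.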
diff --combined include/linux/time.h index 205f974b9eb,e15206a7e82,51e883df0fa,e15206a7e82,1b70b3c293e,51e883df0fa..4f1c9db5770 --- a/include/linux/time.h +++ b/include/linux/time.h @@@@@@@ -29,6 -29,6 -29,8 -29,6 -29,6 -29,8 +29,8 @@@@@@@ struct timezone #ifdef __KERNEL__ ++ ++ extern struct timezone sys_tz; ++ ++ /* Parameters used to convert the timespec values: */ #define MSEC_PER_SEC 1000L #define USEC_PER_MSEC 1000L @@@@@@@ -117,7 -117,6 -119,6 -117,6 -117,6 -119,6 +119,7 @@@@@@@ extern int do_setitimer(int which, stru extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); +++++extern void getrawmonotonic(struct timespec *ts); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); @@@@@@@ -126,6 -125,6 -127,6 -125,6 -125,9 -127,6 +128,9 @@@@@@@ extern int timekeeping_valid_for_hres(v extern void update_wall_time(void); extern void update_xtime_cache(u64 nsec); ++++ +struct tms; ++++ +extern void do_sys_times(struct tms *); ++++ + /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: pointer to the timespec variable to be converted @@@@@@@ -215,7 -214,6 -216,6 -214,6 -217,6 -216,6 +220,7 @@@@@@@ struct itimerval #define CLOCK_MONOTONIC 1 #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 +++++#define CLOCK_MONOTONIC_RAW 4 /* * The IDs of various hardware clocks: diff --combined kernel/compat.c index 32c254a8ab9,32c254a8ab9,143990e48cb,32c254a8ab9,72650e39b3e,143990e48cb..8eafe3eb50d --- a/kernel/compat.c +++ b/kernel/compat.c @@@@@@@ -23,9 -23,9 -23,67 -23,9 -23,10 -23,67 +23,68 @@@@@@@ #include #include #include ++++ +#include #include ++ ++ /* ++ ++ * Note that the native side is already converted to a timespec, because ++ ++ * that's what we want anyway. ++ ++ */ ++ ++ static int compat_get_timeval(struct timespec *o, ++ ++ struct compat_timeval __user *i) ++ ++ { ++ ++ long usec; ++ ++ ++ ++ if (get_user(o->tv_sec, &i->tv_sec) || ++ ++ get_user(usec, &i->tv_usec)) ++ ++ return -EFAULT; ++ ++ o->tv_nsec = usec * 1000; ++ ++ return 0; ++ ++ } ++ ++ ++ ++ static int compat_put_timeval(struct compat_timeval __user *o, ++ ++ struct timeval *i) ++ ++ { ++ ++ return (put_user(i->tv_sec, &o->tv_sec) || ++ ++ put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; ++ ++ } ++ ++ ++ ++ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, ++ ++ struct timezone __user *tz) ++ ++ { ++ ++ if (tv) { ++ ++ struct timeval ktv; ++ ++ do_gettimeofday(&ktv); ++ ++ if (compat_put_timeval(tv, &ktv)) ++ ++ return -EFAULT; ++ ++ } ++ ++ if (tz) { ++ ++ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) ++ ++ return -EFAULT; ++ ++ } ++ ++ ++ ++ return 0; ++ ++ } ++ ++ ++ ++ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, ++ ++ struct timezone __user *tz) ++ ++ { ++ ++ struct timespec kts; ++ ++ struct timezone ktz; ++ ++ ++ ++ if (tv) { ++ ++ if (compat_get_timeval(&kts, tv)) ++ ++ return -EFAULT; ++ ++ } ++ ++ if (tz) { ++ ++ if (copy_from_user(&ktz, tz, sizeof(ktz))) ++ ++ return -EFAULT; ++ ++ } ++ ++ ++ ++ return do_sys_settimeofday(tv ? &kts : NULL, tz ? 
&ktz : NULL); ++ ++ } ++ ++ int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || @@@@@@@ -150,49 -150,49 -208,49 -150,49 -151,23 -208,49 +209,23 @@@@@@@ asmlinkage long compat_sys_setitimer(in return 0; } ++++ +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) ++++ +{ ++++ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); ++++ +} ++++ + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { ---- - /* ---- - * In the SMP world we might just be unlucky and have one of ---- - * the times increment as we use it. Since the value is an ---- - * atomically safe type this is just fine. Conceptually its ---- - * as if the syscall took an instant longer to occur. ---- - */ if (tbuf) { ++++ + struct tms tms; struct compat_tms tmp; ---- - struct task_struct *tsk = current; ---- - struct task_struct *t; ---- - cputime_t utime, stime, cutime, cstime; ---- - ---- - read_lock(&tasklist_lock); ---- - utime = tsk->signal->utime; ---- - stime = tsk->signal->stime; ---- - t = tsk; ---- - do { ---- - utime = cputime_add(utime, t->utime); ---- - stime = cputime_add(stime, t->stime); ---- - t = next_thread(t); ---- - } while (t != tsk); ---- - ---- - /* ---- - * While we have tasklist_lock read-locked, no dying thread ---- - * can be updating current->signal->[us]time. Instead, ---- - * we got their counts included in the live thread loop. ---- - * However, another thread can come in right now and ---- - * do a wait call that updates current->signal->c[us]time. ---- - * To make sure we always see that pair updated atomically, ---- - * we take the siglock around fetching them. ---- - */ ---- - spin_lock_irq(&tsk->sighand->siglock); ---- - cutime = tsk->signal->cutime; ---- - cstime = tsk->signal->cstime; ---- - spin_unlock_irq(&tsk->sighand->siglock); ---- - read_unlock(&tasklist_lock); ---- - ---- - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); ---- - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); ---- - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); ---- - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); ++++ + ++++ + do_sys_times(&tms); ++++ + /* Convert our struct tms to the compat version. */ ++++ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); ++++ + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); ++++ + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); ++++ + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --combined kernel/exit.c index 38ec4063014,38ec4063014,0ef4673e351,16395644a98,40036ac0427,0ef4673e351..059b38cae38 --- a/kernel/exit.c +++ b/kernel/exit.c @@@@@@@ -112,9 -112,9 -112,9 -112,9 -112,7 -112,9 +112,7 @@@@@@@ static void __exit_signal(struct task_s * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. 
*/ -- sig->utime = cputime_add(sig->utime, tsk->utime); -- sig->stime = cputime_add(sig->stime, tsk->stime); -- sig->gtime = cputime_add(sig->gtime, tsk->gtime); -- - sig->utime = cputime_add(sig->utime, task_utime(tsk)); -- - sig->stime = cputime_add(sig->stime, task_stime(tsk)); ++ sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@@@@@@ -122,7 -122,7 -122,7 -122,7 -120,6 -122,7 +120,6 @@@@@@@ sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); ---- - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@@@@@@ -583,8 -583,8 -583,6 -583,8 -580,8 -583,6 +580,6 @@@@@@@ mm_need_new_owner(struct mm_struct *mm * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ -- -- if (!mm) -- -- return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@@@@@@ -627,29 -627,29 -625,38 -627,29 -624,29 -625,38 +622,38 @@@@@@@ retry } while_each_thread(g, c); read_unlock(&tasklist_lock); ++ ++ /* ++ ++ * We found no owner yet mm_users > 1: this implies that we are ++ ++ * most likely racing with swapoff (try_to_unuse()) or /proc or ++ ++ * ptrace or page migration (get_task_mm()). Mark owner as NULL, ++ ++ * so that subsystems can understand the callback and take action. ++ ++ */ ++ ++ down_write(&mm->mmap_sem); ++ ++ cgroup_mm_owner_callbacks(mm->owner, NULL); ++ ++ mm->owner = NULL; ++ ++ up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); ++ ++ read_unlock(&tasklist_lock); ++ ++ down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); -- -- /* -- -- * Delay read_unlock() till we have the task_lock() -- -- * to ensure that c does not slip away underneath us -- -- */ -- -- read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); ++ ++ up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); ++ ++ up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@@@@@@ -831,26 -831,26 -838,50 -831,50 -828,50 -838,50 +835,50 @@@@@@@ static void reparent_thread(struct task * the child reaper process (ie "init") in our pid * space. */ ++ static struct task_struct *find_new_reaper(struct task_struct *father) ++ { ++ struct pid_namespace *pid_ns = task_active_pid_ns(father); ++ struct task_struct *thread; ++ ++ thread = father; ++ while_each_thread(father, thread) { ++ if (thread->flags & PF_EXITING) ++ continue; ++ if (unlikely(pid_ns->child_reaper == father)) ++ pid_ns->child_reaper = thread; ++ return thread; ++ } ++ ++ if (unlikely(pid_ns->child_reaper == father)) { ++ write_unlock_irq(&tasklist_lock); ++ if (unlikely(pid_ns == &init_pid_ns)) ++ panic("Attempted to kill init!"); ++ ++ zap_pid_ns_processes(pid_ns); ++ write_lock_irq(&tasklist_lock); ++ /* ++ * We can not clear ->child_reaper or leave it alone. ++ * There may by stealth EXIT_DEAD tasks on ->children, ++ * forget_original_parent() must move them somewhere. 
++ */ ++ pid_ns->child_reaper = init_pid_ns.child_reaper; ++ } ++ ++ return pid_ns->child_reaper; ++ } ++ static void forget_original_parent(struct task_struct *father) { -- struct task_struct *p, *n, *reaper = father; ++ struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); -- ++ reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); -- do { -- reaper = next_thread(reaper); -- if (reaper == father) { -- reaper = task_child_reaper(father); -- break; -- } -- } while (reaper->flags & PF_EXITING); -- list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@@@@@@ -918,8 -918,8 -949,8 -942,8 -939,8 -949,8 +946,8 @@@@@@@ static void exit_notify(struct task_str /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && -- tsk->signal->notify_count < 0 && -- tsk->signal->group_exit_task) ++ tsk->signal->group_exit_task && ++ tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); @@@@@@@ -959,39 -959,39 -990,6 -983,6 -980,6 -990,6 +987,6 @@@@@@@ static void check_stack_usage(void static inline void check_stack_usage(void) {} #endif -- static inline void exit_child_reaper(struct task_struct *tsk) -- { -- if (likely(tsk->group_leader != task_child_reaper(tsk))) -- return; -- -- if (tsk->nsproxy->pid_ns == &init_pid_ns) -- panic("Attempted to kill init!"); -- -- /* -- * @tsk is the last thread in the 'cgroup-init' and is exiting. -- * Terminate all remaining processes in the namespace and reap them -- * before exiting @tsk. -- * -- * Note that @tsk (last thread of cgroup-init) may not necessarily -- * be the child-reaper (i.e main thread of cgroup-init) of the -- * namespace i.e the child_reaper may have already exited. -- * -- * Even after a child_reaper exits, we let it inherit orphaned children, -- * because, pid_ns->child_reaper remains valid as long as there is -- * at least one living sub-thread in the cgroup init. -- -- * This living sub-thread of the cgroup-init will be notified when -- * a child inherited by the 'child-reaper' exits (do_notify_parent() -- * uses __group_send_sig_info()). Further, when reaping child processes, -- * do_wait() iterates over children of all living sub threads. -- -- * i.e even though 'child_reaper' thread is listed as the parent of the -- * orphaned children, any living sub-thread in the cgroup-init can -- * perform the role of the child_reaper. -- */ -- zap_pid_ns_processes(tsk->nsproxy->pid_ns); -- } -- NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@@@@@@ -1051,7 -1051,7 -1049,6 -1042,6 -1039,6 -1049,6 +1046,6 @@@@@@@ } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { -- exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } @@@@@@@ -1304,6 -1304,6 -1301,6 -1294,6 -1291,7 -1301,6 +1298,7 @@@@@@@ static int wait_task_zombie(struct task if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; ++++ + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@@@@@@ -1319,20 -1319,20 -1316,20 -1309,20 -1307,23 -1316,20 +1314,23 @@@@@@@ * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. 
++++ + * ++++ + * We use thread_group_cputime() to get times for the thread ++++ + * group, which consolidates times for all threads in the ++++ + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; ++++ + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, ---- - cputime_add(p->utime, ---- - cputime_add(sig->utime, ---- - sig->cutime))); ++++ + cputime_add(cputime.utime, ++++ + sig->cutime)); psig->cstime = cputime_add(psig->cstime, ---- - cputime_add(p->stime, ---- - cputime_add(sig->stime, ---- - sig->cstime))); ++++ + cputime_add(cputime.stime, ++++ + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --combined kernel/fork.c index 7ce2ebe8479,7ce2ebe8479,30de644a40c,7ce2ebe8479,021ae012cc7,30de644a40c..44e64d7ba29 --- a/kernel/fork.c +++ b/kernel/fork.c @@@@@@@ -759,15 -759,15 -759,15 -759,15 -759,44 -759,15 +759,44 @@@@@@@ void __cleanup_sighand(struct sighand_s kmem_cache_free(sighand_cachep, sighand); } ++++ + ++++ +/* ++++ + * Initialize POSIX timer handling for a thread group. ++++ + */ ++++ +static void posix_cpu_timers_init_group(struct signal_struct *sig) ++++ +{ ++++ + /* Thread group counters. */ ++++ + thread_group_cputime_init(sig); ++++ + ++++ + /* Expiration times and increments. */ ++++ + sig->it_virt_expires = cputime_zero; ++++ + sig->it_virt_incr = cputime_zero; ++++ + sig->it_prof_expires = cputime_zero; ++++ + sig->it_prof_incr = cputime_zero; ++++ + ++++ + /* Cached expiration times. */ ++++ + sig->cputime_expires.prof_exp = cputime_zero; ++++ + sig->cputime_expires.virt_exp = cputime_zero; ++++ + sig->cputime_expires.sched_exp = 0; ++++ + ++++ + /* The timer lists. */ ++++ + INIT_LIST_HEAD(&sig->cpu_timers[0]); ++++ + INIT_LIST_HEAD(&sig->cpu_timers[1]); ++++ + INIT_LIST_HEAD(&sig->cpu_timers[2]); ++++ +} ++++ + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { ---- - atomic_inc(¤t->signal->count); ---- - atomic_inc(¤t->signal->live); ---- - return 0; ++++ + ret = thread_group_cputime_clone_thread(current); ++++ + if (likely(!ret)) { ++++ + atomic_inc(¤t->signal->count); ++++ + atomic_inc(¤t->signal->live); ++++ + } ++++ + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@@@@@@ -795,39 -795,39 -795,40 -795,39 -824,24 -795,40 +824,25 @@@@@@@ sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; ---- - sig->it_virt_expires = cputime_zero; ---- - sig->it_virt_incr = cputime_zero; ---- - sig->it_prof_expires = cputime_zero; ---- - sig->it_prof_incr = cputime_zero; ---- - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; ++ ++ sig->tty = NULL; ---- - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; ++++ + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); ---- - sig->sum_sched_runtime = 0; ---- - INIT_LIST_HEAD(&sig->cpu_timers[0]); ---- - INIT_LIST_HEAD(&sig->cpu_timers[1]); ---- - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); ---- 
- if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { ---- - /* ---- - * New sole thread in the process gets an expiry time ---- - * of the whole CPU time limit. ---- - */ ---- - tsk->it_prof_expires = ---- - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); ---- - } ++++ + posix_cpu_timers_init_group(sig); ++++ + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@@@@@@ -837,7 -837,7 -838,8 -837,7 -851,8 -838,8 +852,9 @@@@@@@ void __cleanup_signal(struct signal_struct *sig) { ++++ + thread_group_cputime_free(sig); exit_thread_group_keys(sig); ++ ++ tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@@@@@@ -885,6 -885,6 -887,6 -885,6 -900,19 -887,6 +902,19 @@@@@@@ void mm_init_owner(struct mm_struct *mm } #endif /* CONFIG_MM_OWNER */ ++++ +/* ++++ + * Initialize POSIX timer handling for a single task. ++++ + */ ++++ +static void posix_cpu_timers_init(struct task_struct *tsk) ++++ +{ ++++ + tsk->cputime_expires.prof_exp = cputime_zero; ++++ + tsk->cputime_expires.virt_exp = cputime_zero; ++++ + tsk->cputime_expires.sched_exp = 0; ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[0]); ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[1]); ++++ + INIT_LIST_HEAD(&tsk->cpu_timers[2]); ++++ +} ++++ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@@@@@@ -995,12 -995,12 -997,12 -995,12 -1023,7 -997,12 +1025,7 @@@@@@@ static struct task_struct *copy_process task_io_accounting_init(&p->ioac); acct_clear_integrals(p); ---- - p->it_virt_expires = cputime_zero; ---- - p->it_prof_expires = cputime_zero; ---- - p->it_sched_expires = 0; ---- - INIT_LIST_HEAD(&p->cpu_timers[0]); ---- - INIT_LIST_HEAD(&p->cpu_timers[1]); ---- - INIT_LIST_HEAD(&p->cpu_timers[2]); ++++ + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@@@@@@ -1201,21 -1201,21 -1203,21 -1201,21 -1224,6 -1203,21 +1226,6 @@@@@@@ if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); ---- - ---- - if (!cputime_eq(current->signal->it_virt_expires, ---- - cputime_zero) || ---- - !cputime_eq(current->signal->it_prof_expires, ---- - cputime_zero) || ---- - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || ---- - !list_empty(¤t->signal->cpu_timers[0]) || ---- - !list_empty(¤t->signal->cpu_timers[1]) || ---- - !list_empty(¤t->signal->cpu_timers[2])) { ---- - /* ---- - * Have child wake up on its first tick to check ---- - * for process CPU timers. 
---- - */ ---- - p->it_prof_expires = jiffies_to_cputime(1); ---- - } } if (likely(p->pid)) { @@@@@@@ -1227,7 -1227,7 -1229,8 -1227,7 -1235,7 -1229,8 +1237,8 @@@@@@@ p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; -- -- p->signal->tty = current->signal->tty; ++ ++ tty_kref_put(p->signal->tty); ++ ++ p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --combined kernel/hrtimer.c index b8e4dce80a7,4d761d50c52,cdec83e722f,b8e4dce80a7,b8e4dce80a7,cdec83e722f..95978f48e03 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@@@@@@ -672,13 -672,13 -672,14 -672,13 -672,13 -672,14 +672,14 @@@@@@@ static inline int hrtimer_enqueue_repro */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; -- -- case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: ++ ++ case HRTIMER_CB_IRQSAFE_PERCPU: ++ ++ case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site -- -- * takes care of this. ++ ++ * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@@@@@@ -1245,7 -1245,7 -1246,8 -1245,7 -1245,7 -1246,8 +1246,8 @@@@@@@ static void __run_hrtimer(struct hrtime timer_stats_account_hrtimer(timer); fn = timer->function; -- -- if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { ++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || ++ ++ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@@@@@@ -1401,9 -1401,7 -1403,9 -1401,9 -1401,9 -1403,9 +1403,7 @@@@@@@ void hrtimer_run_queues(void if (!base->first) continue; - ---- if (base->get_softirq_time) - ---- base->softirq_time = base->get_softirq_time(); - ---- else if (gettime) { + ++++ if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@@@@@@ -1452,7 -1450,7 -1454,7 -1452,7 -1452,7 -1454,7 +1452,7 @@@@@@@ void hrtimer_init_sleeper(struct hrtime sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS -- -- sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@@@@@@ -1591,49 -1589,50 -1593,122 -1591,49 -1591,49 -1593,122 +1591,123 @@@@@@@ static void __cpuinit init_hrtimers_cpu #ifdef CONFIG_HOTPLUG_CPU -- -- static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, -- -- struct hrtimer_clock_base *new_base) ++ ++ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, ++ ++ struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; ++ ++ int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); -- -- __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); ++ ++ ++ ++ /* ++ ++ * Should not happen. 
Per CPU timers should be ++ ++ * canceled _before_ the migration code is called ++ ++ */ ++ ++ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { ++ ++ __remove_hrtimer(timer, old_base, ++ ++ HRTIMER_STATE_INACTIVE, 0); ++ ++ WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", ++ ++ timer, timer->function, dcpu); ++ ++ continue; ++ ++ } ++ ++ ++ ++ /* ++ ++ * Mark it as STATE_MIGRATE not INACTIVE otherwise the ++ ++ * timer could be seen as !active and just vanish away ++ ++ * under us on another CPU ++ ++ */ ++ ++ __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); ++ ++ ++ ++ #ifdef CONFIG_HIGH_RES_TIMERS ++ ++ /* ++ ++ * Happens with high res enabled when the timer was ++ ++ * already expired and the callback mode is ++ ++ * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The ++ ++ * enqueue code does not move them to the soft irq ++ ++ * pending list for performance/latency reasons, but ++ ++ * in the migration state, we need to do that ++ ++ * otherwise we end up with a stale timer. ++ ++ */ ++ ++ if (timer->state == HRTIMER_STATE_MIGRATE) { ++ ++ timer->state = HRTIMER_STATE_PENDING; ++ ++ list_add_tail(&timer->cb_entry, ++ ++ &new_base->cpu_base->cb_pending); ++ ++ raise = 1; ++ ++ } ++ ++ #endif ++ ++ /* Clear the migration state bit */ ++ ++ timer->state &= ~HRTIMER_STATE_MIGRATE; + } ++ ++ return raise; + } + ++ ++ #ifdef CONFIG_HIGH_RES_TIMERS ++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, ++ ++ struct hrtimer_cpu_base *new_base) ++ ++ { ++ ++ struct hrtimer *timer; ++ ++ int raise = 0; ++ ++ ++ ++ while (!list_empty(&old_base->cb_pending)) { ++ ++ timer = list_entry(old_base->cb_pending.next, ++ ++ struct hrtimer, cb_entry); ++ ++ ++ ++ __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); ++ ++ timer->base = &new_base->clock_base[timer->base->index]; ++ ++ list_add_tail(&timer->cb_entry, &new_base->cb_pending); ++ ++ raise = 1; + ++ } ++ ++ return raise; ++ ++ } ++ ++ #else ++ ++ static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, ++ ++ struct hrtimer_cpu_base *new_base) ++ ++ { ++ ++ return 0; + ++ } ++ ++ #endif + ++ static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; -- -- int i; ++ ++ int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - ---- - ---- local_irq_disable(); - ---- spin_lock(&new_base->lock); + ++++ /* + ++++ * The caller is globally serialized and nobody else + ++++ * takes two locks at once, deadlock is not possible. 
+ ++++ */ + ++++ spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { -- -- migrate_hrtimer_list(&old_base->clock_base[i], -- -- &new_base->clock_base[i]); ++ ++ if (migrate_hrtimer_list(&old_base->clock_base[i], ++ ++ &new_base->clock_base[i], cpu)) ++ ++ raise = 1; } ++ ++ if (migrate_hrtimer_pending(old_base, new_base)) ++ ++ raise = 1; ++ ++ spin_unlock(&old_base->lock); - ---- spin_unlock(&new_base->lock); - ---- local_irq_enable(); + ++++ spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); ++ ++ ++ ++ if (raise) ++ ++ hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ diff --combined kernel/posix-timers.c index d3c66b53dff,e36d5798cbf,5131e547116,e36d5798cbf,95451bf7d2e,5131e547116..b931d7cedbf --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@@@@@@ -222,15 -222,6 -222,6 -222,6 -222,6 -222,6 +222,15 @@@@@@@ static int posix_ktime_get_ts(clockid_ return 0; } +++++/* +++++ * Get monotonic time for posix timers +++++ */ +++++static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +++++{ +++++ getrawmonotonic(tp); +++++ return 0; +++++} +++++ /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@@@@@@ -244,15 -235,9 -235,9 -235,9 -235,9 -235,9 +244,15 @@@@@@@ static __init int init_posix_timers(voi .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; +++++ struct k_clock clock_monotonic_raw = { +++++ .clock_getres = hrtimer_get_res, +++++ .clock_get = posix_get_monotonic_raw, +++++ .clock_set = do_posix_clock_nosettime, +++++ }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); +++++ register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@@@@@@ -313,6 -298,6 -298,6 -298,6 -298,7 -298,6 +313,7 @@@@@@@ void do_schedule_next_timer(struct sigi int posix_timer_event(struct k_itimer *timr, int si_private) { ++++ + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@@@@@@ -326,25 -311,25 -311,25 -311,25 -312,10 -311,25 +327,10 @@@@@@@ */ timr->sigq->info.si_sys_private = si_private; ---- - timr->sigq->info.si_signo = timr->it_sigev_signo; ---- - timr->sigq->info.si_code = SI_TIMER; ---- - timr->sigq->info.si_tid = timr->it_id; ---- - timr->sigq->info.si_value = timr->it_sigev_value; ---- - ---- - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { ---- - struct task_struct *leader; ---- - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); ---- - ---- - if (likely(ret >= 0)) ---- - return ret; ---- - ---- - timr->it_sigev_notify = SIGEV_SIGNAL; ---- - leader = timr->it_process->group_leader; ---- - put_task_struct(timr->it_process); ---- - timr->it_process = leader; ---- - } ---- - ---- - return send_sigqueue(timr->sigq, timr->it_process, 1); ++++ + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); ++++ + ret = send_sigqueue(timr->sigq, timr->it_process, shared); ++++ + /* If we failed to send the signal the timer stops. 
*/ ++++ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); @@@@@@@ -456,7 -441,7 -441,7 -441,7 -427,7 -441,7 +442,7 @@@@@@@ static struct k_itimer * alloc_posix_ti return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); -- -- tmr = NULL; ++ ++ return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; @@@@@@@ -483,11 -468,11 -468,11 -468,11 -454,9 -468,11 +469,9 @@@@@@@ sys_timer_create(const clockid_t which_ struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { ---- - int error = 0; ---- - struct k_itimer *new_timer = NULL; ---- - int new_timer_id; ---- - struct task_struct *process = NULL; ---- - unsigned long flags; ++++ + struct k_itimer *new_timer; ++++ + int error, new_timer_id; ++++ + struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@@@@@@ -505,12 -490,12 -490,12 -490,12 -474,11 -490,12 +489,11 @@@@@@@ goto out; } spin_lock_irq(&idr_lock); ---- - error = idr_get_new(&posix_timers_id, (void *) new_timer, ---- - &new_timer_id); ++++ + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); ---- - if (error == -EAGAIN) ---- - goto retry; ---- - else if (error) { ++++ + if (error) { ++++ + if (error == -EAGAIN) ++++ + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@@@@@@ -541,67 -526,67 -526,67 -526,67 -509,43 -526,67 +524,43 @@@@@@@ error = -EFAULT; goto out; } ---- - new_timer->it_sigev_notify = event.sigev_notify; ---- - new_timer->it_sigev_signo = event.sigev_signo; ---- - new_timer->it_sigev_value = event.sigev_value; ---- - ---- - read_lock(&tasklist_lock); ---- - if ((process = good_sigevent(&event))) { ---- - /* ---- - * We may be setting up this process for another ---- - * thread. It may be exiting. To catch this ---- - * case the we check the PF_EXITING flag. If ---- - * the flag is not set, the siglock will catch ---- - * him before it is too late (in exit_itimers). ---- - * ---- - * The exec case is a bit more invloved but easy ---- - * to code. If the process is in our thread ---- - * group (and it must be or we would not allow ---- - * it here) and is doing an exec, it will cause ---- - * us to be killed. In this case it will wait ---- - * for us to die which means we can finish this ---- - * linkage with our last gasp. I.e. 
no code :) ---- - */ ---- - spin_lock_irqsave(&process->sighand->siglock, flags); ---- - if (!(process->flags & PF_EXITING)) { ---- - new_timer->it_process = process; ---- - list_add(&new_timer->list, ---- - &process->signal->posix_timers); ---- - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - get_task_struct(process); ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ---- - } else { ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ---- - process = NULL; ---- - } ---- - } ---- - read_unlock(&tasklist_lock); ++++ + rcu_read_lock(); ++++ + process = good_sigevent(&event); ++++ + if (process) ++++ + get_task_struct(process); ++++ + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; } } else { ---- - new_timer->it_sigev_notify = SIGEV_SIGNAL; ---- - new_timer->it_sigev_signo = SIGALRM; ---- - new_timer->it_sigev_value.sival_int = new_timer->it_id; ++++ + event.sigev_notify = SIGEV_SIGNAL; ++++ + event.sigev_signo = SIGALRM; ++++ + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; ---- - spin_lock_irqsave(&process->sighand->siglock, flags); ---- - new_timer->it_process = process; ---- - list_add(&new_timer->list, &process->signal->posix_timers); ---- - spin_unlock_irqrestore(&process->sighand->siglock, flags); ++++ + get_task_struct(process); } ++++ + new_timer->it_sigev_notify = event.sigev_notify; ++++ + new_timer->sigq->info.si_signo = event.sigev_signo; ++++ + new_timer->sigq->info.si_value = event.sigev_value; ++++ + new_timer->sigq->info.si_tid = new_timer->it_id; ++++ + new_timer->sigq->info.si_code = SI_TIMER; ++++ + ++++ + spin_lock_irq(¤t->sighand->siglock); ++++ + new_timer->it_process = process; ++++ + list_add(&new_timer->list, ¤t->signal->posix_timers); ++++ + spin_unlock_irq(¤t->sighand->siglock); ++++ + ++++ + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ ---- - out: ---- - if (error) ---- - release_posix_timer(new_timer, it_id_set); ---- - ++++ + release_posix_timer(new_timer, it_id_set); return error; } @@@@@@@ -612,7 -597,7 -597,7 -597,7 -556,7 -597,7 +571,7 @@@@@@@ * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ ---- -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) ++++ +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@@@@@@ -620,23 -605,23 -605,23 -605,23 -564,20 -605,23 +579,20 @@@@@@@ * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. 
*/ ---- - spin_lock_irqsave(&idr_lock, *flags); ---- - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); ++++ + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); ---- - ---- - if ((timr->it_id != timer_id) || !(timr->it_process) || ---- - !same_thread_group(timr->it_process, current)) { ---- - spin_unlock(&timr->it_lock); ---- - spin_unlock_irqrestore(&idr_lock, *flags); ---- - timr = NULL; ---- - } else ++++ + if (timr->it_process && ++++ + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); ---- - } else ---- - spin_unlock_irqrestore(&idr_lock, *flags); ++++ + return timr; ++++ + } ++++ + spin_unlock(&timr->it_lock); ++++ + } ++++ + spin_unlock_irqrestore(&idr_lock, *flags); ---- - return timr; ++++ + return NULL; } /* @@@@@@@ -877,8 -862,8 -862,8 -862,8 -818,7 -862,8 +833,7 @@@@@@@ retry_delete * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ ---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - put_task_struct(timer->it_process); ++++ + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@@@@@@ -905,8 -890,8 -890,8 -890,8 -845,7 -890,8 +860,7 @@@@@@@ retry_delete * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ ---- - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) ---- - put_task_struct(timer->it_process); ++++ + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); diff --combined kernel/sched.c index 9a1ddb84e26,9a1ddb84e26,6f230596bd0,1a5f73c1fcd,ebb03def564,6f230596bd0..09a8c15748f --- a/kernel/sched.c +++ b/kernel/sched.c @@@@@@@ -201,14 -201,14 -201,19 -201,14 -201,14 -201,19 +201,19 @@@@@@@ void init_rt_bandwidth(struct rt_bandwi hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; -- -- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; ++ ++ } ++ ++ ++ ++ static inline int rt_bandwidth_enabled(void) ++ ++ { ++ ++ return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; -- -- if (rt_b->rt_runtime == RUNTIME_INF) ++ ++ if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@@@@@@ -298,9 -298,9 -303,9 -298,9 -298,9 -303,9 +303,9 @@@@@@@ static DEFINE_PER_CPU(struct cfs_rq, in static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -- -- #else /* !CONFIG_FAIR_GROUP_SCHED */ ++ ++ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -- -- #endif /* CONFIG_FAIR_GROUP_SCHED */ ++ ++ #endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. 
@@@@@@@ -604,9 -604,9 -609,9 -604,9 -604,9 -609,9 +609,9 @@@@@@@ struct rq static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -- -- static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++ ++ static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { -- -- rq->curr->sched_class->check_preempt_curr(rq, p); ++ ++ rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@@@@@@ -1087,7 -1087,7 -1092,7 -1087,7 -1087,7 -1092,7 +1092,7 @@@@@@@ hotplug_hrtick(struct notifier_block *n return NOTIFY_DONE; } -- -- static void init_hrtick(void) ++ ++ static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@@@@@@ -1102,7 -1102,7 -1107,7 -1102,7 -1102,7 -1107,7 +1107,7 @@@@@@@ static void hrtick_start(struct rq *rq hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -- -- static void init_hrtick(void) ++ ++ static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@@@@@@ -1119,9 -1119,9 -1124,9 -1119,9 -1119,9 -1124,9 +1124,9 @@@@@@@ static void init_rq_hrtick(struct rq *r hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; -- -- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; ++ ++ rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -- -- #else ++ ++ #else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@@@@@@ -1133,7 -1133,7 -1138,7 -1133,7 -1133,7 -1138,7 +1138,7 @@@@@@@ static inline void init_rq_hrtick(struc static inline void init_hrtick(void) { } -- -- #endif ++ ++ #endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@@@@@@ -1380,38 -1380,38 -1385,24 -1380,38 -1380,38 -1385,24 +1385,24 @@@@@@@ static inline void dec_cpu_load(struct update_load_sub(&rq->load, load); } -- -- #ifdef CONFIG_SMP -- -- static unsigned long source_load(int cpu, int type); -- -- static unsigned long target_load(int cpu, int type); -- -- static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); -- -- -- -- static unsigned long cpu_avg_load_per_task(int cpu) -- -- { -- -- struct rq *rq = cpu_rq(cpu); -- -- -- -- if (rq->nr_running) -- -- rq->avg_load_per_task = rq->load.weight / rq->nr_running; -- -- -- -- return rq->avg_load_per_task; -- -- } -- -- -- -- #ifdef CONFIG_FAIR_GROUP_SCHED -- -- -- -- typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); ++ ++ #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) ++ ++ typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. 
*/ -- -- static void -- -- walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) ++ ++ static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; ++ ++ int ret; rcu_read_lock(); parent = &root_task_group; down: -- -- (*down)(parent, cpu, sd); ++ ++ ret = (*down)(parent, data); ++ ++ if (ret) ++ ++ goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@@@@@@ -1419,15 -1419,15 -1410,43 -1419,15 -1419,15 -1410,43 +1410,43 @@@@@@@ up: continue; } -- -- (*up)(parent, cpu, sd); ++ ++ ret = (*up)(parent, data); ++ ++ if (ret) ++ ++ goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; ++ ++ out_unlock: rcu_read_unlock(); ++ ++ ++ ++ return ret; ++ + } ++ + ++ ++ static int tg_nop(struct task_group *tg, void *data) ++ ++ { ++ ++ return 0; + } ++ ++ #endif ++ ++ ++ ++ #ifdef CONFIG_SMP ++ ++ static unsigned long source_load(int cpu, int type); ++ ++ static unsigned long target_load(int cpu, int type); ++ ++ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); ++ ++ ++ ++ static unsigned long cpu_avg_load_per_task(int cpu) ++ ++ { ++ ++ struct rq *rq = cpu_rq(cpu); ++ ++ ++ ++ if (rq->nr_running) ++ ++ rq->avg_load_per_task = rq->load.weight / rq->nr_running; ++ ++ ++ ++ return rq->avg_load_per_task; ++ ++ } ++ ++ ++ ++ #ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@@@@@@ -1486,11 -1486,11 -1505,11 -1486,11 -1486,11 -1505,11 +1505,11 @@@@@@@ __update_group_shares_cpu(struct task_g * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -- -- static void -- -- tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) ++ ++ static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; ++ ++ struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@@@@@@ -1515,6 -1515,6 -1534,8 -1515,6 -1515,6 -1534,8 +1534,8 @@@@@@@ __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } ++ ++ ++ ++ return 0; } /* @@@@@@@ -1522,10 -1522,10 -1543,10 -1522,10 -1522,10 -1543,10 +1543,10 @@@@@@@ * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
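The reworked walk_tg_tree() above calls the @down visitor when it first enters a group, the @up visitor when it leaves the group for the last time, and aborts the whole walk as soon as either visitor returns nonzero. A small userspace analogue of that calling convention, using a hypothetical fixed-size tree instead of the real task_group hierarchy and plain recursion instead of the parent-pointer iteration:

#include <stdio.h>

struct group {
        const char *name;
        struct group *children[4];      /* NULL-terminated, enough for the sketch */
};

typedef int (*visitor_t)(struct group *, void *);

/* Simplified down/up walk with the same early-abort rule as walk_tg_tree(). */
static int walk(struct group *g, visitor_t down, visitor_t up, void *data)
{
        int ret = down(g, data);

        if (ret)
                return ret;
        for (int i = 0; g->children[i]; i++) {
                ret = walk(g->children[i], down, up, data);
                if (ret)
                        return ret;
        }
        return up(g, data);
}

static int print_down(struct group *g, void *data)
{
        printf("down %s\n", g->name);
        return 0;               /* nonzero here would stop the entire walk */
}

static int nop(struct group *g, void *data)
{
        return 0;
}

int main(void)
{
        struct group rt    = { "rt-tasks", { NULL } };
        struct group batch = { "batch",    { NULL } };
        struct group root  = { "root",     { &rt, &batch, NULL } };

        return walk(&root, print_down, nop, NULL);
}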
*/ -- -- static void -- -- tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) ++ ++ static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; ++ ++ long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@@@@@@ -1536,11 -1536,11 -1557,8 -1536,11 -1536,11 -1557,8 +1557,8 @@@@@@@ } tg->cfs_rq[cpu]->h_load = load; -- -- } -- -- static void -- -- tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -- -- { ++ ++ return 0; } static void update_shares(struct sched_domain *sd) @@@@@@@ -1550,7 -1550,7 -1568,7 -1550,7 -1550,7 -1568,7 +1568,7 @@@@@@@ if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; -- -- walk_tg_tree(tg_nop, tg_shares_up, 0, sd); ++ ++ walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@@@@@@ -1561,9 -1561,9 -1579,9 -1561,9 -1561,9 -1579,9 +1579,9 @@@@@@@ static void update_shares_locked(struc spin_lock(&rq->lock); } -- -- static void update_h_load(int cpu) ++ ++ static void update_h_load(long cpu) { -- -- walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); ++ ++ walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@@@@@@ -1921,11 -1921,11 -1939,8 -1921,11 -1921,11 -1939,8 +1939,8 @@@@@@@ unsigned long wait_task_inactive(struc running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; -- -- if (!match_state || p->state == match_state) { -- -- ncsw = p->nivcsw + p->nvcsw; -- -- if (unlikely(!ncsw)) -- -- ncsw = 1; -- -- } ++ ++ if (!match_state || p->state == match_state) ++ ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@@@@@@ -2285,7 -2285,7 -2300,7 -2285,7 -2285,7 -2300,7 +2300,7 @@@@@@@ out_running trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@@@@@@ -2420,7 -2420,7 -2435,7 -2420,7 -2420,7 -2435,7 +2435,7 @@@@@@@ void wake_up_new_task(struct task_struc trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@@@@@@ -2880,7 -2880,7 -2895,7 -2880,7 -2880,7 -2895,7 +2895,7 @@@@@@@ static void pull_task(struct rq *src_rq * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ -- -- check_preempt_curr(this_rq, p); ++ ++ check_preempt_curr(this_rq, p, 0); } /* @@@@@@@ -4037,23 -4037,23 -4052,23 -4037,23 -4037,26 -4052,23 +4052,26 @@@@@@@ DEFINE_PER_CPU(struct kernel_stat, ksta EXPORT_PER_CPU_SYMBOL(kstat); /* ---- - * Return p->sum_exec_runtime plus any more ns on the sched_clock ---- - * that have not yet been banked in case the task is currently running. ++++ + * Return any ns on the sched_clock that have not yet been banked in ++++ + * @p in case that task is currently running. 
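One detail of the wait_task_inactive() change above deserves a note: or-ing LONG_MIN into the sampled nvcsw count lets a plain zero still mean "the task never matched the requested state", while a genuinely sampled count of zero comes back with the top bit set and is therefore non-zero. A tiny userspace illustration of that encoding (not kernel code; the helper name is made up):

#include <stdio.h>

#define SAMPLED_BIT     (~(~0UL >> 1))  /* the MSB, what LONG_MIN supplies in the patch */

static unsigned long sample_ncsw(unsigned long nvcsw, int state_matched)
{
        return state_matched ? (nvcsw | SAMPLED_BIT) : 0;
}

int main(void)
{
        unsigned long a = sample_ncsw(0, 1);    /* matched, count happens to be 0 */
        unsigned long b = sample_ncsw(0, 0);    /* state never matched */

        printf("a sampled=%d count=%lu, b sampled=%d\n",
               a != 0, a & ~SAMPLED_BIT, b != 0);
        return 0;
}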
*/ ---- -unsigned long long task_sched_runtime(struct task_struct *p) ++++ +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; ---- - u64 ns, delta_exec; struct rq *rq; ++++ + u64 ns = 0; rq = task_rq_lock(p, &flags); ---- - ns = p->se.sum_exec_runtime; ++++ + if (task_current(rq, p)) { ++++ + u64 delta_exec; ++++ + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ---- - ns += delta_exec; ++++ + ns = delta_exec; } ++++ + task_rq_unlock(rq, &flags); return ns; @@@@@@@ -4070,6 -4070,6 -4085,6 -4070,6 -4073,7 -4085,6 +4088,7 @@@@@@@ void account_user_time(struct task_stru cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); ++++ + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@ -4094,6 -4094,6 -4109,6 -4094,6 -4098,7 -4109,6 +4113,7 @@@@@@@ static void account_guest_time(struct t tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); ++++ + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@@@@@@ -4129,6 -4129,6 -4144,6 -4129,6 -4134,7 -4144,6 +4149,7 @@@@@@@ void account_system_time(struct task_st } p->stime = cputime_add(p->stime, cputime); ++++ + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@@@@@@ -4170,6 -4170,6 -4185,6 -4170,6 -4176,7 -4185,6 +4191,7 @@@@@@@ void account_steal_time(struct task_str if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); ++++ + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@@@@@@ -4178,6 -4178,6 -4193,65 -4178,65 -4185,65 -4193,65 +4200,65 @@@@@@@ cpustat->steal = cputime64_add(cpustat->steal, tmp); } ++ /* ++ * Use precise platform statistics if available: ++ */ ++ #ifdef CONFIG_VIRT_CPU_ACCOUNTING ++ cputime_t task_utime(struct task_struct *p) ++ { ++ return p->utime; ++ } ++ ++ cputime_t task_stime(struct task_struct *p) ++ { ++ return p->stime; ++ } ++ #else ++ cputime_t task_utime(struct task_struct *p) ++ { ++ clock_t utime = cputime_to_clock_t(p->utime), ++ total = utime + cputime_to_clock_t(p->stime); ++ u64 temp; ++ ++ /* ++ * Use CFS's precise accounting: ++ */ ++ temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); ++ ++ if (total) { ++ temp *= utime; ++ do_div(temp, total); ++ } ++ utime = (clock_t)temp; ++ ++ p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); ++ return p->prev_utime; ++ } ++ ++ cputime_t task_stime(struct task_struct *p) ++ { ++ clock_t stime; ++ ++ /* ++ * Use CFS's precise accounting. (we subtract utime from ++ * the total, to make sure the total observed by userspace ++ * grows monotonically - apps rely on that): ++ */ ++ stime = nsec_to_clock_t(p->se.sum_exec_runtime) - ++ cputime_to_clock_t(task_utime(p)); ++ ++ if (stime >= 0) ++ p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); ++ ++ return p->prev_stime; ++ } ++ #endif ++ ++ inline cputime_t task_gtime(struct task_struct *p) ++ { ++ return p->gtime; ++ } ++ /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
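The task_utime()/task_stime() pair above splits the precisely accounted sum_exec_runtime in proportion to the sampled utime/stime ratio, then clamps the result so the values handed to userspace never move backwards. A standalone sketch of that arithmetic, with plain integers standing in for cputime_t and do_div():

#include <stdio.h>

static unsigned long long prev_utime;   /* mirrors p->prev_utime */

static unsigned long long split_utime(unsigned long long utime,
                                      unsigned long long stime,
                                      unsigned long long sum_exec)
{
        unsigned long long total = utime + stime;
        unsigned long long scaled = sum_exec;

        if (total) {
                scaled *= utime;
                scaled /= total;        /* do_div() in the kernel version */
        }
        if (scaled > prev_utime)        /* keep the reported value monotonic */
                prev_utime = scaled;
        return prev_utime;
}

int main(void)
{
        printf("%llu\n", split_utime(30, 10, 100));     /* 75 */
        printf("%llu\n", split_utime(30, 30, 90));      /* still 75, not 45 */
        return 0;
}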
@@@@@@@ -4568,6 -4568,6 -4642,15 -4627,6 -4634,6 -4642,15 +4649,15 @@@@@@@ __wake_up_sync(wait_queue_head_t *q, un } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ ++ ++ /** ++ ++ * complete: - signals a single thread waiting on this completion ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This will wake up a single thread waiting on this completion. Threads will be ++ ++ * awakened in the same order in which they were queued. ++ ++ * ++ ++ * See also complete_all(), wait_for_completion() and related routines. ++ ++ */ void complete(struct completion *x) { unsigned long flags; @@@@@@@ -4579,6 -4579,6 -4662,12 -4638,6 -4645,6 -4662,12 +4669,12 @@@@@@@ } EXPORT_SYMBOL(complete); ++ ++ /** ++ ++ * complete_all: - signals all threads waiting on this completion ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This will wake up all threads waiting on this particular completion event. ++ ++ */ void complete_all(struct completion *x) { unsigned long flags; @@@@@@@ -4599,10 -4599,10 -4688,7 -4658,10 -4665,10 -4688,7 +4695,7 @@@@@@@ do_wait_for_common(struct completion *x wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { -- -- if ((state == TASK_INTERRUPTIBLE && -- -- signal_pending(current)) || -- -- (state == TASK_KILLABLE && -- -- fatal_signal_pending(current))) { ++ ++ if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@@@@@@ -4630,12 -4630,12 -4716,31 -4689,12 -4696,12 -4716,31 +4723,31 @@@@@@@ wait_for_common(struct completion *x, l return timeout; } ++ ++ /** ++ ++ * wait_for_completion: - waits for completion of a task ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits to be signaled for completion of a specific task. It is NOT ++ ++ * interruptible and there is no timeout. ++ ++ * ++ ++ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout ++ ++ * and interrupt capability. Also see complete(). ++ ++ */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); ++ ++ /** ++ ++ * wait_for_completion_timeout: - waits for completion of a task (w/timeout) ++ ++ * @x: holds the state of this particular completion ++ ++ * @timeout: timeout value in jiffies ++ ++ * ++ ++ * This waits for either a completion of a specific task to be signaled or for a ++ ++ * specified timeout to expire. The timeout is in jiffies. It is not ++ ++ * interruptible. ++ ++ */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@@@@@@ -4643,6 -4643,6 -4748,13 -4702,6 -4709,6 -4748,13 +4755,13 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_timeout); ++ ++ /** ++ ++ * wait_for_completion_interruptible: - waits for completion of a task (w/intr) ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits for completion of a specific task to be signaled. It is ++ ++ * interruptible. 
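As a companion to the new kerneldoc, this is roughly how the documented pair is used from a module. A minimal kernel-style sketch, assuming a module context; the worker thread and all names are purely illustrative:

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
        /* ... perform the one-off setup ... */
        complete(&setup_done);          /* wakes exactly one waiter, FIFO order */
        return 0;
}

static int __init demo_init(void)
{
        struct task_struct *tsk = kthread_run(worker_fn, NULL, "demo-worker");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        /* uninterruptible, no timeout, as described in the kerneldoc above */
        wait_for_completion(&setup_done);
        return 0;
}
module_init(demo_init);
MODULE_LICENSE("GPL");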
++ ++ */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@@@@@@ -4652,6 -4652,6 -4764,14 -4711,6 -4718,6 -4764,14 +4771,14 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible); ++ ++ /** ++ ++ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) ++ ++ * @x: holds the state of this particular completion ++ ++ * @timeout: timeout value in jiffies ++ ++ * ++ ++ * This waits for either a completion of a specific task to be signaled or for a ++ ++ * specified timeout to expire. It is interruptible. The timeout is in jiffies. ++ ++ */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@@@@@@ -4660,6 -4660,6 -4780,13 -4719,6 -4726,6 -4780,13 +4787,13 @@@@@@@ } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); ++ ++ /** ++ ++ * wait_for_completion_killable: - waits for completion of a task (killable) ++ ++ * @x: holds the state of this particular completion ++ ++ * ++ ++ * This waits to be signaled for completion of a specific task. It can be ++ ++ * interrupted by a kill signal. ++ ++ */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@@@@@@ -5062,7 -5062,7 -5189,8 -5121,7 -5128,7 -5189,8 +5196,8 @@@@@@@ recheck * Do not allow realtime tasks into groups that have no runtime * assigned. */ -- -- if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) ++ ++ if (rt_bandwidth_enabled() && rt_policy(policy) && ++ ++ task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@@@@@@ -5898,7 -5898,7 -6026,7 -5957,7 -5964,7 -6026,7 +6033,7 @@@@@@@ static int __migrate_task(struct task_s set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); -- -- check_preempt_curr(rq_dest, p); ++ ++ check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@@@@@@ -6223,7 -6223,7 -6351,7 -6282,7 -6289,7 -6351,7 +6358,7 @@@@@@@ set_table_entry(struct ctl_table *entry static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { -- -- struct ctl_table *table = sd_alloc_ctl_entry(12); ++ ++ struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@@@@@@ -6251,7 -6251,7 -6379,9 -6310,7 -6317,7 -6379,9 +6386,9 @@@@@@@ sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -- -- /* &table[11] is terminator */ ++ ++ set_table_entry(&table[11], "name", sd->name, ++ ++ CORENAME_MAX_SIZE, 0444, proc_dostring); ++ ++ /* &table[12] is terminator */ return table; } @@@@@@@ -7135,13 -7135,13 -7265,21 -7194,13 -7201,13 -7265,21 +7272,21 @@@@@@@ static void init_sched_groups_power(in * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ ++ ++ #ifdef CONFIG_SCHED_DEBUG ++ ++ # define SD_INIT_NAME(sd, type) sd->name = #type ++ ++ #else ++ ++ # define SD_INIT_NAME(sd, type) do { } while (0) ++ ++ #endif ++ ++ #define SD_INIT(sd, type) sd_init_##type(sd) ++ ++ #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ ++ ++ SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@@@@@@ -7637,24 -7637,24 -7775,27 -7696,24 -7703,27 -7775,27 +7782,27 @@@@@@@ static int dattrs_equal(struct sched_do * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', 
it also forces the domains to be rebuilt. * ++ + * If doms_new==NULL it will be replaced with cpu_online_map. ++ + * ndoms_new==0 is a special case for destroying existing domains. ++ + * It will not create the default domain. ++ + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { -- - int i, j; ++ + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); -- - if (doms_new == NULL) -- - ndoms_new = 0; ++ + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { -- - for (j = 0; j < ndoms_new; j++) { ++ + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@@@@@@ -7667,7 -7667,7 -7808,6 -7726,7 -7736,6 -7808,6 +7815,6 @@@@@@@ match1 if (doms_new == NULL) { ndoms_cur = 0; -- - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@@@@@@ -7704,8 -7704,8 -7844,13 -7763,8 -7772,13 -7844,13 +7851,13 @@@@@@@ match2 int arch_reinit_sched_domains(void) { get_online_cpus(); ++ + ++ + /* Destroy domains first to force the rebuild */ ++ + partition_sched_domains(0, NULL, NULL); ++ + rebuild_sched_domains(); put_online_cpus(); ++ + return 0; } @@@@@@@ -7789,7 -7789,7 -7934,7 -7848,7 -7862,7 -7934,7 +7941,7 @@@@@@@ static int update_sched_domains(struct case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: -- - partition_sched_domains(0, NULL, NULL); ++ + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: @@@@@@@ -8176,20 -8176,20 -8321,25 -8235,20 -8249,20 -8321,25 +8328,25 @@@@@@@ void __might_sleep(char *file, int line #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ -- -- if ((in_atomic() || irqs_disabled()) && -- -- system_state == SYSTEM_RUNNING && !oops_in_progress) { -- -- if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- -- return; -- -- prev_jiffy = jiffies; -- -- printk(KERN_ERR "BUG: sleeping function called from invalid" -- -- " context at %s:%d\n", file, line); -- -- printk("in_atomic():%d, irqs_disabled():%d\n", -- -- in_atomic(), irqs_disabled()); -- -- debug_show_held_locks(current); -- -- if (irqs_disabled()) -- -- print_irqtrace_events(current); -- -- dump_stack(); -- -- } ++ ++ if ((!in_atomic() && !irqs_disabled()) || ++ ++ system_state != SYSTEM_RUNNING || oops_in_progress) ++ ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ ++ return; ++ ++ prev_jiffy = jiffies; ++ ++ ++ ++ printk(KERN_ERR ++ ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ ++ file, line); ++ ++ printk(KERN_ERR ++ ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ ++ in_atomic(), irqs_disabled(), ++ ++ current->pid, current->comm); ++ ++ ++ ++ debug_show_held_locks(current); ++ ++ if (irqs_disabled()) ++ ++ print_irqtrace_events(current); ++ ++ dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@@@@@@ -8687,73 -8687,73 -8837,95 -8746,73 -8760,73 -8837,95 +8844,95 @@@@@@@ static DEFINE_MUTEX(rt_constraints_mute static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) -- -- return 1ULL << 16; ++ ++ return 1ULL << 20; -- -- return div64_u64(runtime << 16, period); ++ ++ return div64_u64(runtime << 20, period); } -- -- #ifdef CONFIG_CGROUP_SCHED -- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 
runtime) ++ ++ /* Must be called with tasklist_lock held */ ++ ++ static inline int tg_has_rt_tasks(struct task_group *tg) { -- -- struct task_group *tgi, *parent = tg->parent; -- -- unsigned long total = 0; ++ ++ struct task_struct *g, *p; -- -- if (!parent) { -- -- if (global_rt_period() < period) -- -- return 0; ++ ++ do_each_thread(g, p) { ++ ++ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) ++ ++ return 1; ++ ++ } while_each_thread(g, p); -- -- return to_ratio(period, runtime) < -- -- to_ratio(global_rt_period(), global_rt_runtime()); -- -- } ++ ++ return 0; ++ ++ } -- -- if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) -- -- return 0; ++ ++ struct rt_schedulable_data { ++ ++ struct task_group *tg; ++ ++ u64 rt_period; ++ ++ u64 rt_runtime; ++ ++ }; -- -- rcu_read_lock(); -- -- list_for_each_entry_rcu(tgi, &parent->children, siblings) { -- -- if (tgi == tg) -- -- continue; ++ ++ static int tg_schedulable(struct task_group *tg, void *data) ++ ++ { ++ ++ struct rt_schedulable_data *d = data; ++ ++ struct task_group *child; ++ ++ unsigned long total, sum = 0; ++ ++ u64 period, runtime; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ period = ktime_to_ns(tg->rt_bandwidth.rt_period); ++ ++ runtime = tg->rt_bandwidth.rt_runtime; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ if (tg == d->tg) { ++ ++ period = d->rt_period; ++ ++ runtime = d->rt_runtime; } -- -- rcu_read_unlock(); -- -- return total + to_ratio(period, runtime) <= -- -- to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), -- -- parent->rt_bandwidth.rt_runtime); -- -- } -- -- #elif defined CONFIG_USER_SCHED -- -- static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -- -- { -- -- struct task_group *tgi; -- -- unsigned long total = 0; -- -- unsigned long global_ratio = -- -- to_ratio(global_rt_period(), global_rt_runtime()); ++ ++ /* ++ ++ * Cannot have more runtime than the period. ++ ++ */ ++ ++ if (runtime > period && runtime != RUNTIME_INF) ++ ++ return -EINVAL; -- -- rcu_read_lock(); -- -- list_for_each_entry_rcu(tgi, &task_groups, list) { -- -- if (tgi == tg) -- -- continue; ++ ++ /* ++ ++ * Ensure we don't starve existing RT tasks. ++ ++ */ ++ ++ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) ++ ++ return -EBUSY; ++ + - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); ++ ++ total = to_ratio(period, runtime); ++ ++ ++ ++ /* ++ ++ * Nobody can have more than the global setting allows. ++ ++ */ ++ ++ if (total > to_ratio(global_rt_period(), global_rt_runtime())) ++ ++ return -EINVAL; ++ -- total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), -- tgi->rt_bandwidth.rt_runtime); ++ ++ /* ++ ++ * The sum of our children's runtime should not exceed our own. 
++ ++ */ ++ ++ list_for_each_entry_rcu(child, &tg->children, siblings) { ++ ++ period = ktime_to_ns(child->rt_bandwidth.rt_period); ++ ++ runtime = child->rt_bandwidth.rt_runtime; ++ ++ ++ ++ if (child == d->tg) { ++ ++ period = d->rt_period; ++ ++ runtime = d->rt_runtime; ++ ++ } ++ + - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); ++ ++ sum += to_ratio(period, runtime); } -- -- rcu_read_unlock(); -- -- return total + to_ratio(period, runtime) < global_ratio; ++ ++ if (sum > total) ++ ++ return -EINVAL; ++ ++ ++ ++ return 0; } -- -- #endif -- -- /* Must be called with tasklist_lock held */ -- -- static inline int tg_has_rt_tasks(struct task_group *tg) ++ ++ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { -- -- struct task_struct *g, *p; -- -- do_each_thread(g, p) { -- -- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) -- -- return 1; -- -- } while_each_thread(g, p); -- -- return 0; ++ ++ struct rt_schedulable_data data = { ++ ++ .tg = tg, ++ ++ .rt_period = period, ++ ++ .rt_runtime = runtime, ++ ++ }; ++ ++ ++ ++ return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@@@@@@ -8763,14 -8763,14 -8935,9 -8822,14 -8836,14 -8935,9 +8942,9 @@@@@@@ mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); -- -- if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { -- -- err = -EBUSY; -- -- goto unlock; -- -- } -- -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) { -- -- err = -EINVAL; ++ ++ err = __rt_schedulable(tg, rt_period, rt_runtime); ++ ++ if (err) goto unlock; -- -- } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@@@@@@ -8839,16 -8839,16 -9006,25 -8898,16 -8912,16 -9006,25 +9013,25 @@@@@@@ long sched_group_rt_period(struct task_ static int sched_rt_global_constraints(void) { -- -- struct task_group *tg = &root_task_group; -- -- u64 rt_runtime, rt_period; ++ ++ u64 runtime, period; int ret = 0; -- -- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); -- -- rt_runtime = tg->rt_bandwidth.rt_runtime; ++ ++ if (sysctl_sched_rt_period <= 0) ++ ++ return -EINVAL; ++ ++ ++ ++ runtime = global_rt_runtime(); ++ ++ period = global_rt_period(); ++ ++ ++ ++ /* ++ ++ * Sanity check on the sysctl variables. 
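The schedulability check above works in 20-bit fixed point: to_ratio() turns each (period, runtime) pair into runtime/period scaled by 2^20, and tg_schedulable() rejects a setting once the sum over the children exceeds the parent's own ratio. A standalone arithmetic sketch of that comparison (the microsecond figures are invented for the example):

#include <stdio.h>

#define RATIO_SHIFT     20      /* the scale factor this merge switches to */

static unsigned long long to_ratio(unsigned long long period,
                                   unsigned long long runtime)
{
        return (runtime << RATIO_SHIFT) / period;
}

int main(void)
{
        /* parent allows 950us of every 1000us; children ask for 400us + 600us */
        unsigned long long parent = to_ratio(1000, 950);
        unsigned long long sum = to_ratio(1000, 400) + to_ratio(1000, 600);

        printf("parent=%llu children=%llu -> %s\n", parent, sum,
               sum > parent ? "-EINVAL" : "ok");
        return 0;
}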
++ ++ */ ++ ++ if (runtime > period && runtime != RUNTIME_INF) ++ ++ return -EINVAL; mutex_lock(&rt_constraints_mutex); -- -- if (!__rt_schedulable(tg, rt_period, rt_runtime)) -- -- ret = -EINVAL; ++ ++ read_lock(&tasklist_lock); ++ ++ ret = __rt_schedulable(NULL, 0, 0); ++ ++ read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@@@@@@ -8859,6 -8859,6 -9035,9 -8918,6 -8932,6 -9035,9 +9042,9 @@@@@@@ static int sched_rt_global_constraints( unsigned long flags; int i; ++ ++ if (sysctl_sched_rt_period <= 0) ++ ++ return -EINVAL; ++ ++ spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@@@@@@ -8919,7 -8919,7 -9098,6 -8978,7 -8992,7 -9098,6 +9105,6 @@@@@@@ cpu_cgroup_create(struct cgroup_subsys if (!cgrp->parent) { /* This is early initialization for the top cgroup */ -- -- init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@@@@@@ -8928,9 -8928,9 -9106,6 -8987,9 -9001,9 -9106,6 +9113,6 @@@@@@@ if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); -- -- /* Bind the cgroup to task_group object we just created */ -- -- tg->css.cgroup = cgrp; -- -- return &tg->css; } diff --combined kernel/sched_fair.c index fb8994c6d4b,fb8994c6d4b,18fd17172eb,fb8994c6d4b,99aa31acc54,18fd17172eb..f604dae7131 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@@@@@@ -408,64 -408,64 -408,6 -408,64 -408,64 -408,6 +408,6 @@@@@@@ static u64 sched_vslice_add(struct cfs_ return __sched_period(nr_running); } -- -- /* -- -- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in -- -- * that it favours >=0 over <0. -- -- * -- -- * -20 | -- -- * | -- -- * 0 --------+------- -- -- * .' -- -- * 19 .' -- -- * -- -- */ -- -- static unsigned long -- -- calc_delta_asym(unsigned long delta, struct sched_entity *se) -- -- { -- -- struct load_weight lw = { -- -- .weight = NICE_0_LOAD, -- -- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) -- -- }; -- -- -- -- for_each_sched_entity(se) { -- -- struct load_weight *se_lw = &se->load; -- -- unsigned long rw = cfs_rq_of(se)->load.weight; -- -- -- -- #ifdef CONFIG_FAIR_SCHED_GROUP -- -- struct cfs_rq *cfs_rq = se->my_q; -- -- struct task_group *tg = NULL -- -- -- -- if (cfs_rq) -- -- tg = cfs_rq->tg; -- -- -- -- if (tg && tg->shares < NICE_0_LOAD) { -- -- /* -- -- * scale shares to what it would have been had -- -- * tg->weight been NICE_0_LOAD: -- -- * -- -- * weight = 1024 * shares / tg->weight -- -- */ -- -- lw.weight *= se->load.weight; -- -- lw.weight /= tg->shares; -- -- -- -- lw.inv_weight = 0; -- -- -- -- se_lw = &lw; -- -- rw += lw.weight - se->load.weight; -- -- } else -- -- #endif -- -- -- -- if (se->load.weight < NICE_0_LOAD) { -- -- se_lw = &lw; -- -- rw += NICE_0_LOAD - se->load.weight; -- -- } -- -- -- -- delta = calc_delta_mine(delta, rw, se_lw); -- -- } -- -- -- -- return delta; -- -- } -- -- /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
@@@@@@@ -507,6 -507,6 -449,6 -507,6 -507,7 -449,6 +449,7 @@@@@@@ static void update_curr(struct cfs_rq * struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); ++++ + account_group_exec_runtime(curtask, delta_exec); } } @@@@@@@ -586,11 -586,11 -528,12 -586,11 -587,11 -528,12 +529,12 @@@@@@@ account_entity_enqueue(struct cfs_rq *c update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); -- -- if (entity_is_task(se)) ++ ++ if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); ++ ++ list_add(&se->group_node, &cfs_rq->tasks); ++ ++ } cfs_rq->nr_running++; se->on_rq = 1; -- -- list_add(&se->group_node, &cfs_rq->tasks); } static void @@@@@@@ -599,11 -599,11 -542,12 -599,11 -600,11 -542,12 +543,12 @@@@@@@ account_entity_dequeue(struct cfs_rq *c update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); -- -- if (entity_is_task(se)) ++ ++ if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); ++ ++ list_del_init(&se->group_node); ++ ++ } cfs_rq->nr_running--; se->on_rq = 0; -- -- list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@@@@@@ -1085,7 -1085,7 -1029,6 -1085,7 -1086,7 -1029,6 +1030,6 @@@@@@@ static long effective_load(struct task_ long wl, long wg) { struct sched_entity *se = tg->se[cpu]; -- -- long more_w; if (!tg->parent) return wl; @@@@@@@ -1097,18 -1097,18 -1040,17 -1097,18 -1098,18 -1040,17 +1041,17 @@@@@@@ if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; -- -- /* -- -- * Instead of using this increment, also add the difference -- -- * between when the shares were last updated and now. -- -- */ -- -- more_w = se->my_q->load.weight - se->my_q->rq_weight; -- -- wl += more_w; -- -- wg += more_w; -- -- for_each_sched_entity(se) { -- -- #define D(n) (likely(n) ? (n) : 1) -- -- long S, rw, s, a, b; ++ ++ long more_w; ++ ++ ++ ++ /* ++ ++ * Instead of using this increment, also add the difference ++ ++ * between when the shares were last updated and now. ++ ++ */ ++ ++ more_w = se->my_q->load.weight - se->my_q->rq_weight; ++ ++ wl += more_w; ++ ++ wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@@@@@@ -1117,7 -1117,7 -1059,11 -1117,7 -1118,7 -1059,11 +1060,11 @@@@@@@ a = S*(rw + wl); b = S*rw + s*wg; -- -- wl = s*(a-b)/D(b); ++ ++ wl = s*(a-b); ++ ++ ++ ++ if (likely(b)) ++ ++ wl /= b; ++ ++ /* * Assume the group is already running and will * thus already be accounted for in the weight. @@@@@@@ -1126,7 -1126,7 -1072,6 -1126,7 -1127,7 -1072,6 +1073,6 @@@@@@@ * alter the group weight. 
*/ wg = 0; -- -- #undef D } return wl; @@@@@@@ -1143,7 -1143,7 -1088,7 -1143,7 -1144,7 -1088,7 +1089,7 @@@@@@@ static inline unsigned long effective_l #endif static int -- -- wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, ++ ++ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@@@@@@ -1158,6 -1158,6 -1103,11 -1158,6 -1159,6 -1103,11 +1104,11 @@@@@@@ if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; ++ ++ if (!sync && sched_feat(SYNC_WAKEUPS) && ++ ++ curr->se.avg_overlap < sysctl_sched_migration_cost && ++ ++ p->se.avg_overlap < sysctl_sched_migration_cost) ++ ++ sync = 1; ++ ++ /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@@@@@@ -1182,17 -1182,17 -1132,14 -1182,17 -1183,17 -1132,14 +1133,14 @@@@@@@ * a reasonable amount of time then attract this newly * woken task: */ -- -- if (sync && balanced) { -- -- if (curr->se.avg_overlap < sysctl_sched_migration_cost && -- -- p->se.avg_overlap < sysctl_sched_migration_cost) -- -- return 1; -- -- } ++ ++ if (sync && balanced) ++ ++ return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); -- -- if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || -- -- balanced) { ++ ++ if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= ++ ++ tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@@@@@@ -1211,16 -1211,16 -1158,17 -1211,16 -1212,16 -1158,17 +1159,17 @@@@@@@ static int select_task_rq_fair(struct t struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; -- -- struct rq *rq, *this_rq; ++ ++ struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); -- -- rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; ++ ++ if (prev_cpu == this_cpu) ++ ++ goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@@@@@@ -1248,13 -1248,13 -1196,10 -1248,13 -1249,13 -1196,10 +1197,10 @@@@@@@ load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); -- -- if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, ++ ++ if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; -- -- if (prev_cpu == this_cpu) -- -- goto out; -- -- /* * Start passive balancing when half the imbalance_pct * limit is reached. @@@@@@@ -1281,62 -1281,62 -1226,20 -1281,62 -1282,62 -1226,20 +1227,20 @@@@@@@ static unsigned long wakeup_gran(struc * + nice tasks. */ if (sched_feat(ASYM_GRAN)) -- -- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); -- -- else -- -- gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); ++ ++ gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } -- -- /* -- -- * Should 'se' preempt 'curr'. 
-- -- * -- -- * |s1 -- -- * |s2 -- -- * |s3 -- -- * g -- -- * |<--->|c -- -- * -- -- * w(c, s1) = -1 -- -- * w(c, s2) = 0 -- -- * w(c, s3) = 1 -- -- * -- -- */ -- -- static int -- -- wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -- -- { -- -- s64 gran, vdiff = curr->vruntime - se->vruntime; -- -- -- -- if (vdiff < 0) -- -- return -1; -- -- -- -- gran = wakeup_gran(curr); -- -- if (vdiff > gran) -- -- return 1; -- -- -- -- return 0; -- -- } -- -- -- -- /* return depth at which a sched entity is present in the hierarchy */ -- -- static inline int depth_se(struct sched_entity *se) -- -- { -- -- int depth = 0; -- -- -- -- for_each_sched_entity(se) -- -- depth++; -- -- -- -- return depth; -- -- } -- -- /* * Preempt the current task with a newly woken task if needed: */ -- -- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) ++ ++ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; -- -- int se_depth, pse_depth; ++ ++ s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@@@@@@ -1350,6 -1350,6 -1253,13 -1350,6 -1351,6 -1253,13 +1254,13 @@@@@@@ cfs_rq_of(pse)->next = pse; ++ ++ /* ++ ++ * We can come here with TIF_NEED_RESCHED already set from new task ++ ++ * wake up path. ++ ++ */ ++ ++ if (test_tsk_need_resched(curr)) ++ ++ return; ++ ++ /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@@@@@@ -1360,33 -1360,33 -1270,15 -1360,33 -1361,33 -1270,15 +1271,15 @@@@@@@ if (!sched_feat(WAKEUP_PREEMPT)) return; -- -- /* -- -- * preemption test can be made between sibling entities who are in the -- -- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of -- -- * both tasks until we find their ancestors who are siblings of common -- -- * parent. 
-- -- */ -- -- -- -- /* First walk up until both entities are at same depth */ -- -- se_depth = depth_se(se); -- -- pse_depth = depth_se(pse); -- -- -- -- while (se_depth > pse_depth) { -- -- se_depth--; -- -- se = parent_entity(se); -- -- } -- -- -- -- while (pse_depth > se_depth) { -- -- pse_depth--; -- -- pse = parent_entity(pse); -- -- } -- -- -- -- while (!is_same_group(se, pse)) { -- -- se = parent_entity(se); -- -- pse = parent_entity(pse); ++ ++ if (sched_feat(WAKEUP_OVERLAP) && (sync || ++ ++ (se->avg_overlap < sysctl_sched_migration_cost && ++ ++ pse->avg_overlap < sysctl_sched_migration_cost))) { ++ ++ resched_task(curr); ++ ++ return; } -- -- if (wakeup_preempt_entity(se, pse) == 1) ++ ++ delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ ++ if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@@@@@@ -1445,19 -1445,19 -1337,9 -1445,19 -1446,19 -1337,9 +1338,9 @@@@@@@ __load_balance_iterator(struct cfs_rq * if (next == &cfs_rq->tasks) return NULL; -- -- /* Skip over entities that are not tasks */ -- -- do { -- -- se = list_entry(next, struct sched_entity, group_node); -- -- next = next->next; -- -- } while (next != &cfs_rq->tasks && !entity_is_task(se)); -- -- -- -- if (next == &cfs_rq->tasks) -- -- return NULL; -- -- -- -- cfs_rq->balance_iterator = next; -- -- -- -- if (entity_is_task(se)) -- -- p = task_of(se); ++ ++ se = list_entry(next, struct sched_entity, group_node); ++ ++ p = task_of(se); ++ ++ cfs_rq->balance_iterator = next->next; return p; } @@@@@@@ -1507,7 -1507,7 -1389,7 -1507,7 -1508,7 -1389,7 +1390,7 @@@@@@@ load_balance_fair(struct rq *this_rq, i rcu_read_lock(); update_h_load(busiest_cpu); -- -- list_for_each_entry(tg, &task_groups, list) { ++ ++ list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@@@@@@ -1620,10 -1620,10 -1502,10 -1620,10 -1621,10 -1502,10 +1503,10 @@@@@@@ static void task_new_fair(struct rq *rq * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); ++ ++ resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); -- -- resched_task(rq->curr); } /* @@@@@@@ -1642,7 -1642,7 -1524,7 -1642,7 -1643,7 -1524,7 +1525,7 @@@@@@@ static void prio_changed_fair(struct r if (p->prio > oldprio) resched_task(rq->curr); } else -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); } /* @@@@@@@ -1659,7 -1659,7 -1541,7 -1659,7 -1660,7 -1541,7 +1542,7 @@@@@@@ static void switched_to_fair(struct rq if (running) resched_task(rq->curr); else -- -- check_preempt_curr(rq, p); ++ ++ check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. 
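The new check_preempt_wakeup() above boils down to two tests (leaving the sched_feat() gates aside): preempt right away when the wakeup looks synchronous, i.e. both tasks' avg_overlap is below the migration cost, and otherwise preempt only once the running task has consumed more than one wakeup granularity since it last went on the CPU. A sketch of just that decision logic, with illustrative constants and no scheduler state:

#include <stdio.h>

#define MIGRATION_COST  500000ULL       /* 0.5ms, the usual default, in ns */

static int should_preempt(unsigned long long curr_delta_exec,
                          unsigned long long wakeup_gran,
                          unsigned long long curr_overlap,
                          unsigned long long waker_overlap,
                          int sync)
{
        if (sync || (curr_overlap < MIGRATION_COST &&
                     waker_overlap < MIGRATION_COST))
                return 1;       /* synchronous-looking wakeup: preempt now */

        return curr_delta_exec > wakeup_gran;
}

int main(void)
{
        printf("%d\n", should_preempt(2000000, 1000000, 900000, 900000, 0)); /* 1 */
        printf("%d\n", should_preempt(200000, 1000000, 900000, 900000, 0));  /* 0 */
        return 0;
}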
diff --combined kernel/sched_rt.c index 998ba54b454,998ba54b454,cdf5740ab03,552310798da,8375e69af36,cdf5740ab03..b446dc87494 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@@@@@@ -102,12 -102,12 -102,12 -102,12 -102,12 -102,12 +102,12 @@@@@@@ static void dequeue_rt_entity(struct sc static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { ++ ++ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; -- -- if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { -- -- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; -- -- -- -- enqueue_rt_entity(rt_se); ++ ++ if (rt_rq->rt_nr_running) { ++ ++ if (rt_se && !on_rt_rq(rt_se)) ++ ++ enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@@@@@@ -199,6 -199,6 -199,8 -199,8 -199,8 -199,8 +199,8 @@@@@@@ static inline struct rt_rq *group_rt_rq static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { ++ if (rt_rq->rt_nr_running) ++ resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@@@@@@ -229,6 -229,6 -231,9 -231,6 -231,6 -231,9 +231,9 @@@@@@@ static inline struct rt_bandwidth *sche #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP ++ ++ /* ++ ++ * We ran out of runtime, see if we can borrow some from our neighbours. ++ ++ */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@@@@@@ -248,9 -248,9 -253,18 -250,9 -250,9 -253,18 +253,18 @@@@@@@ continue; spin_lock(&iter->rt_runtime_lock); ++ ++ /* ++ ++ * Either all rqs have inf runtime and there's nothing to steal ++ ++ * or __disable_runtime() below sets a specific rq to inf to ++ ++ * indicate its been disabled and disalow stealing. ++ ++ */ if (iter->rt_runtime == RUNTIME_INF) goto next; ++ ++ /* ++ ++ * From runqueues with spare time, take 1/n part of their ++ ++ * spare time, but no more than our period. ++ ++ */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@@@@@@ -272,6 -272,6 -286,9 -274,6 -274,6 -286,9 +286,9 @@@@@@@ next return more; } ++ ++ /* ++ ++ * Ensure this RQ takes back all the runtime it lend to its neighbours. ++ ++ */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@@@@@@ -287,17 -287,17 -304,33 -289,17 -289,17 -304,33 +304,33 @@@@@@@ spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * Either we're all inf and nobody needs to borrow, or we're ++ ++ * already disabled and thus have nothing to do, or we have ++ ++ * exactly the right amount of runtime to take out. ++ ++ */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * Calculate the difference between what we started out with ++ ++ * and what we current have, that's the amount of runtime ++ ++ * we lend and now have to reclaim. ++ ++ */ want = rt_b->rt_runtime - rt_rq->rt_runtime; ++ ++ /* ++ ++ * Greedy reclaim, take back as much as we can. ++ ++ */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; ++ ++ /* ++ ++ * Can't reclaim from ourselves or disabled runqueues. ++ ++ */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@@@@@@ -317,8 -317,8 -350,16 -319,8 -319,8 -350,16 +350,16 @@@@@@@ } spin_lock(&rt_rq->rt_runtime_lock); ++ ++ /* ++ ++ * We cannot be left wanting - that would mean some runtime ++ ++ * leaked out of the system. 
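The borrowing loop in do_balance_runtime() above takes 1/n of each neighbour's spare runtime (n being the number of CPUs in the root domain) and never lets a runqueue grow past its full period. The shape of that arithmetic in a standalone sketch, with made-up numbers and arbitrary units:

#include <stdio.h>

int main(void)
{
        unsigned long long period = 1000, runtime = 600, rt_time = 580;
        unsigned long long spare[] = { 300, 0, 150 };   /* neighbours' runtime - rt_time */
        int weight = 3;                                 /* CPUs in the root domain */

        for (int i = 0; i < 3; i++) {
                unsigned long long diff = spare[i] / weight;

                if (runtime + diff > period)            /* never exceed the period */
                        diff = period - runtime;
                runtime += diff;
                printf("after cpu%d: runtime=%llu\n", i, runtime);
        }
        printf("headroom now %llu\n", runtime - rt_time);
        return 0;
}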
++ ++ */ BUG_ON(want); balanced: ++ ++ /* ++ ++ * Disable all the borrow logic by pretending we have inf ++ ++ * runtime - in which case borrowing doesn't make sense. ++ ++ */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@@@@@@ -341,6 -341,6 -382,9 -343,6 -343,6 -382,9 +382,9 @@@@@@@ static void __enable_runtime(struct rq if (unlikely(!scheduler_running)) return; ++ ++ /* ++ ++ * Reset each runqueue's bandwidth settings ++ ++ */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@@@@@@ -348,6 -348,6 -392,7 -350,6 -350,6 -392,7 +392,7 @@@@@@@ spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; ++ ++ rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@@@@@@ -386,7 -386,7 -431,7 -388,7 -388,7 -431,7 +431,7 @@@@@@@ static int do_sched_rt_period_timer(str int i, idle = 1; cpumask_t span; -- -- if (rt_b->rt_runtime == RUNTIME_INF) ++ ++ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@@@@@@ -438,9 -438,9 -483,6 -440,6 -440,6 -483,6 +483,6 @@@@@@@ static int sched_rt_runtime_exceeded(st { u64 runtime = sched_rt_runtime(rt_rq); -- if (runtime == RUNTIME_INF) -- return 0; -- if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@@@@@@ -484,16 -484,16 -526,21 -483,18 -483,20 -526,21 +526,23 @@@@@@@ static void update_curr_rt(struct rq *r schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; ++++ + account_group_exec_runtime(curr, delta_exec); ++++ + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); ++ ++ if (!rt_bandwidth_enabled()) ++ ++ return; ++ ++ for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); -- rt_rq->rt_time += delta_exec; -- if (sched_rt_runtime_exceeded(rt_rq)) -- resched_task(curr); ++ if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { ++ rt_rq->rt_time += delta_exec; ++ if (sched_rt_runtime_exceeded(rt_rq)) ++ resched_task(curr); ++ } spin_unlock(&rt_rq->rt_runtime_lock); } } @@@@@@@ -782,7 -782,7 -829,7 -783,7 -785,7 -829,7 +831,7 @@@@@@@ static void check_preempt_equal_prio(st /* * Preempt the current task with a newly woken task if needed: */ -- -- static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) ++ ++ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@@@@@@ -1411,7 -1411,7 -1458,7 -1412,7 -1414,7 -1458,7 +1460,7 @@@@@@@ static void watchdog(struct rq *rq, str p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) ---- - p->it_sched_expires = p->se.sum_exec_runtime; ++++ + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --combined kernel/softirq.c index c506f266a6b,c506f266a6b,d410014279e,c506f266a6b,c506f266a6b,83ba21a13bd..7110daeb9a9 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@@@@@@ -6,6 -6,6 -6,6 -6,6 -6,6 -6,8 +6,8 @@@@@@@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) +++++ * +++++ * Remote softirq infrastructure is by Jens Axboe. 
*/ #include @@@@@@@ -46,7 -46,7 -46,7 -46,7 -46,7 -48,7 +48,7 @@@@@@@ irq_cpustat_t irq_stat[NR_CPUS] ____cac EXPORT_SYMBOL(irq_stat); #endif -- -- static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; ++ ++ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@@@@@@ -205,7 -205,7 -205,18 -205,7 -205,7 -207,18 +207,18 @@@@@@@ restart do { if (pending & 1) { ++ ++ int prev_count = preempt_count(); ++ ++ h->action(h); ++ ++ ++ ++ if (unlikely(prev_count != preempt_count())) { ++ ++ printk(KERN_ERR "huh, entered softirq %td %p" ++ ++ "with preempt_count %08x," ++ ++ " exited with %08x?\n", h - softirq_vec, ++ ++ h->action, prev_count, preempt_count()); ++ ++ preempt_count() = prev_count; ++ ++ } ++ ++ rcu_bh_qsctr_inc(cpu); } h++; @@@@@@@ -254,16 -254,16 -265,12 -254,16 -254,16 -267,16 +267,12 @@@@@@@ asmlinkage void do_softirq(void */ void irq_enter(void) { -- ---#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); ++ +++ if (idle_cpu(cpu) && !in_interrupt()) -- --- tick_nohz_stop_idle(cpu); -- ---#endif ++ +++ tick_check_idle(cpu); ++ +++ __irq_enter(); -- ---#ifdef CONFIG_NO_HZ -- --- if (idle_cpu(cpu)) -- --- tick_nohz_update_jiffies(); -- ---#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@@@@@@ -463,17 -463,17 -470,17 -463,17 -463,17 -476,144 +472,144 @@@@@@@ void tasklet_kill(struct tasklet_struc EXPORT_SYMBOL(tasklet_kill); +++++ DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +++++ EXPORT_PER_CPU_SYMBOL(softirq_work_list); +++++ +++++ static void __local_trigger(struct call_single_data *cp, int softirq) +++++ { +++++ struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); +++++ +++++ list_add_tail(&cp->list, head); +++++ +++++ /* Trigger the softirq only if the list was previously empty. */ +++++ if (head->next == &cp->list) +++++ raise_softirq_irqoff(softirq); +++++ } +++++ +++++ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS +++++ static void remote_softirq_receive(void *data) +++++ { +++++ struct call_single_data *cp = data; +++++ unsigned long flags; +++++ int softirq; +++++ +++++ softirq = cp->priv; +++++ +++++ local_irq_save(flags); +++++ __local_trigger(cp, softirq); +++++ local_irq_restore(flags); +++++ } +++++ +++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ if (cpu_online(cpu)) { +++++ cp->func = remote_softirq_receive; +++++ cp->info = cp; +++++ cp->flags = 0; +++++ cp->priv = softirq; +++++ +++++ __smp_call_function_single(cpu, cp); +++++ return 0; +++++ } +++++ return 1; +++++ } +++++ #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +++++ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ return 1; +++++ } +++++ #endif +++++ +++++ /** +++++ * __send_remote_softirq - try to schedule softirq work on a remote cpu +++++ * @cp: private SMP call function data area +++++ * @cpu: the remote cpu +++++ * @this_cpu: the currently executing cpu +++++ * @softirq: the softirq for the work +++++ * +++++ * Attempt to schedule softirq work on a remote cpu. If this cannot be +++++ * done, the work is instead queued up on the local cpu. +++++ * +++++ * Interrupts must be disabled. 
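A possible caller of this new interface, sketched only to show the intended shape: a completion path hands a finished request back to the CPU that submitted it through the send_remote_softirq() wrapper defined just below. The request structure, the handler-side draining of softirq_work_list[] and the choice of BLOCK_SOFTIRQ are assumptions for the example, not part of this merge:

#include <linux/interrupt.h>
#include <linux/smp.h>

struct demo_req {
        int submit_cpu;                 /* CPU that issued the request */
        struct call_single_data csd;    /* ends up on softirq_work_list[] */
        /* ... payload ... */
};

/* Called from the completion path; may run on any CPU. */
static void demo_complete(struct demo_req *req)
{
        /* falls back to the local CPU when submit_cpu is this CPU or offline */
        send_remote_softirq(&req->csd, req->submit_cpu, BLOCK_SOFTIRQ);
}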
+++++ */ +++++ void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +++++ { +++++ if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) +++++ __local_trigger(cp, softirq); +++++ } +++++ EXPORT_SYMBOL(__send_remote_softirq); +++++ +++++ /** +++++ * send_remote_softirq - try to schedule softirq work on a remote cpu +++++ * @cp: private SMP call function data area +++++ * @cpu: the remote cpu +++++ * @softirq: the softirq for the work +++++ * +++++ * Like __send_remote_softirq except that disabling interrupts and +++++ * computing the current cpu is done for the caller. +++++ */ +++++ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +++++ { +++++ unsigned long flags; +++++ int this_cpu; +++++ +++++ local_irq_save(flags); +++++ this_cpu = smp_processor_id(); +++++ __send_remote_softirq(cp, cpu, this_cpu, softirq); +++++ local_irq_restore(flags); +++++ } +++++ EXPORT_SYMBOL(send_remote_softirq); +++++ +++++ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +++++ unsigned long action, void *hcpu) +++++ { +++++ /* +++++ * If a CPU goes away, splice its entries to the current CPU +++++ * and trigger a run of the softirq +++++ */ +++++ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { +++++ int cpu = (unsigned long) hcpu; +++++ int i; +++++ +++++ local_irq_disable(); +++++ for (i = 0; i < NR_SOFTIRQS; i++) { +++++ struct list_head *head = &per_cpu(softirq_work_list[i], cpu); +++++ struct list_head *local_head; +++++ +++++ if (list_empty(head)) +++++ continue; +++++ +++++ local_head = &__get_cpu_var(softirq_work_list[i]); +++++ list_splice_init(head, local_head); +++++ raise_softirq_irqoff(i); +++++ } +++++ local_irq_enable(); +++++ } +++++ +++++ return NOTIFY_OK; +++++ } +++++ +++++ static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +++++ .notifier_call = remote_softirq_cpu_notify, +++++ }; +++++ void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { +++++ int i; +++++ per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; +++++ for (i = 0; i < NR_SOFTIRQS; i++) +++++ INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } +++++ register_hotcpu_notifier(&remote_softirq_cpu_notifier); +++++ open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --combined kernel/sys.c index 038a7bc0901,038a7bc0901,0bc8fa3c228,038a7bc0901,d046a7a055c,0bc8fa3c228..53879cdae48 --- a/kernel/sys.c +++ b/kernel/sys.c @@@@@@@ -853,38 -853,38 -853,38 -853,38 -853,28 -853,38 +853,28 @@@@@@@ asmlinkage long sys_setfsgid(gid_t gid return old_fsgid; } ++++ +void do_sys_times(struct tms *tms) ++++ +{ ++++ + struct task_cputime cputime; ++++ + cputime_t cutime, cstime; ++++ + ++++ + spin_lock_irq(¤t->sighand->siglock); ++++ + thread_group_cputime(current, &cputime); ++++ + cutime = current->signal->cutime; ++++ + cstime = current->signal->cstime; ++++ + spin_unlock_irq(¤t->sighand->siglock); ++++ + tms->tms_utime = cputime_to_clock_t(cputime.utime); ++++ + tms->tms_stime = cputime_to_clock_t(cputime.stime); ++++ + tms->tms_cutime = cputime_to_clock_t(cutime); ++++ + tms->tms_cstime = cputime_to_clock_t(cstime); ++++ +} ++++ + asmlinkage long sys_times(struct tms __user * tbuf) { ---- - /* ---- - * In the SMP world we might just be unlucky and have one of ---- - * the times increment as we use it. 
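From userspace, the effect of the reworked do_sys_times() is simply what times(2) reports. A small standalone check of the group-wide utime/stime totals (the busy loop is only there to make the numbers non-zero):

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
        struct tms t;
        long hz = sysconf(_SC_CLK_TCK);

        for (volatile unsigned long i = 0; i < 50000000UL; i++)
                ;                       /* burn a little user time */

        if (times(&t) == (clock_t)-1)
                return 1;

        printf("utime %.2fs stime %.2fs (totals for all threads)\n",
               (double)t.tms_utime / hz, (double)t.tms_stime / hz);
        return 0;
}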
Since the value is an ---- - * atomically safe type this is just fine. Conceptually its ---- - * as if the syscall took an instant longer to occur. ---- - */ if (tbuf) { struct tms tmp; ---- - struct task_struct *tsk = current; ---- - struct task_struct *t; ---- - cputime_t utime, stime, cutime, cstime; ---- - ---- - spin_lock_irq(&tsk->sighand->siglock); ---- - utime = tsk->signal->utime; ---- - stime = tsk->signal->stime; ---- - t = tsk; ---- - do { ---- - utime = cputime_add(utime, t->utime); ---- - stime = cputime_add(stime, t->stime); ---- - t = next_thread(t); ---- - } while (t != tsk); ---- - ---- - cutime = tsk->signal->cutime; ---- - cstime = tsk->signal->cstime; ---- - spin_unlock_irq(&tsk->sighand->siglock); ---- - ---- - tmp.tms_utime = cputime_to_clock_t(utime); ---- - tmp.tms_stime = cputime_to_clock_t(stime); ---- - tmp.tms_cutime = cputime_to_clock_t(cutime); ---- - tmp.tms_cstime = cputime_to_clock_t(cstime); ++++ + ++++ + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@@@@@@ -1060,9 -1060,9 -1060,7 -1060,9 -1050,9 -1060,7 +1050,7 @@@@@@@ asmlinkage long sys_setsid(void group_leader->signal->leader = 1; __set_special_pids(sid); -- -- spin_lock(&group_leader->sighand->siglock); -- -- group_leader->signal->tty = NULL; -- -- spin_unlock(&group_leader->sighand->siglock); ++ ++ proc_clear_tty(group_leader); err = session; out: @@@@@@@ -1351,8 -1351,8 -1349,10 -1351,8 -1341,8 -1349,10 +1339,10 @@@@@@@ asmlinkage long sys_sethostname(char __ down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { -- -- memcpy(utsname()->nodename, tmp, len); -- -- utsname()->nodename[len] = 0; ++ ++ struct new_utsname *u = utsname(); ++ ++ ++ ++ memcpy(u->nodename, tmp, len); ++ ++ memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@@@@@@ -1364,15 -1364,15 -1364,17 -1364,15 -1354,15 -1364,17 +1354,17 @@@@@@@ asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; ++ ++ struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); -- -- i = 1 + strlen(utsname()->nodename); ++ ++ u = utsname(); ++ ++ i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; -- -- if (copy_to_user(name, utsname()->nodename, i)) ++ ++ if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@@@@@@ -1397,8 -1397,8 -1399,10 -1397,8 -1387,8 -1399,10 +1389,10 @@@@@@@ asmlinkage long sys_setdomainname(char down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { -- -- memcpy(utsname()->domainname, tmp, len); -- -- utsname()->domainname[len] = 0; ++ ++ struct new_utsname *u = utsname(); ++ ++ ++ ++ memcpy(u->domainname, tmp, len); ++ ++ memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); @@@@@@@ -1445,21 -1445,21 -1449,29 -1445,21 -1435,20 -1449,29 +1439,28 @@@@@@@ asmlinkage long sys_old_getrlimit(unsig asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; ---- - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; -- -- if (new_rlim.rlim_cur > new_rlim.rlim_max) -- -- return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; -- -- if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) -- -- return -EPERM; ++ ++ ++ ++ if 
(resource == RLIMIT_NOFILE) { ++ ++ if (new_rlim.rlim_max == RLIM_INFINITY) ++ ++ new_rlim.rlim_max = sysctl_nr_open; ++ ++ if (new_rlim.rlim_cur == RLIM_INFINITY) ++ ++ new_rlim.rlim_cur = sysctl_nr_open; ++ ++ if (new_rlim.rlim_max > sysctl_nr_open) ++ ++ return -EPERM; ++ ++ } ++ ++ ++ ++ if (new_rlim.rlim_cur > new_rlim.rlim_max) ++ ++ return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) @@@@@@@ -1491,18 -1491,18 -1503,18 -1491,18 -1480,7 -1503,18 +1492,7 @@@@@@@ if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; ---- - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); ---- - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { ---- - unsigned long rlim_cur = new_rlim.rlim_cur; ---- - cputime_t cputime; ---- - ---- - cputime = secs_to_cputime(rlim_cur); ---- - read_lock(&tasklist_lock); ---- - spin_lock_irq(¤t->sighand->siglock); ---- - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); ---- - spin_unlock_irq(¤t->sighand->siglock); ---- - read_unlock(&tasklist_lock); ---- - } ++++ + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@@@@@@ -1540,11 -1540,11 -1552,11 -1540,11 -1518,8 -1552,11 +1530,8 @@@@@@@ * */ ---- -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, ---- - cputime_t *utimep, cputime_t *stimep) ++++ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { ---- - *utimep = cputime_add(*utimep, t->utime); ---- - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@@@@@@ -1558,12 -1558,12 -1570,12 -1558,12 -1533,13 -1570,12 +1545,13 @@@@@@@ static void k_getrusage(struct task_str struct task_struct *t; unsigned long flags; cputime_t utime, stime; ++++ + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { ---- - accumulate_thread_rusage(p, r, &utime, &stime); ++++ + accumulate_thread_rusage(p, r); goto out; } @@@@@@@ -1586,8 -1586,8 -1598,8 -1586,8 -1562,9 -1598,8 +1574,9 @@@@@@@ break; case RUSAGE_SELF: ---- - utime = cputime_add(utime, p->signal->utime); ---- - stime = cputime_add(stime, p->signal->stime); ++++ + thread_group_cputime(p, &cputime); ++++ + utime = cputime_add(utime, cputime.utime); ++++ + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@@@@@@ -1596,7 -1596,7 -1608,7 -1596,7 -1573,7 -1608,7 +1585,7 @@@@@@@ r->ru_oublock += p->signal->oublock; t = p; do { ---- - accumulate_thread_rusage(t, r, &utime, &stime); ++++ + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; diff --combined kernel/time/ntp.c index 5125ddd8196,5125ddd8196,1ad46f3df6e,ddb0465a6ba,1ad46f3df6e,1ad46f3df6e..1a20715bfd6 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@@@@@@ -10,13 -10,13 -10,13 -10,13 -10,13 -10,13 +10,13 @@@@@@@ #include #include --- --#include #include #include #include #include #include #include +++ ++#include #include /* @@@@@@@ -218,11 -218,11 -218,11 -218,11 -218,11 -218,11 +218,11 @@@@@@@ void second_overflow(void /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; --- --static void sync_cmos_clock(unsigned long dummy); +++ ++static void sync_cmos_clock(struct work_struct *work); --- --static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +++ ++static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); --- --static void 
sync_cmos_clock(unsigned long dummy) +++ ++static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@@@@@@ -245,7 -245,7 -245,7 -245,7 -245,7 -245,7 +245,7 @@@@@@@ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); -- - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; ++ + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; @@@@@@@ -258,13 -258,13 -258,13 -258,13 -258,13 -258,13 +258,13 @@@@@@@ next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } --- -- mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +++ ++ schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) --- -- mod_timer(&sync_cmos_timer, jiffies + 1); +++ ++ schedule_delayed_work(&sync_cmos_work, 0); } #else @@@@@@@ -277,38 -277,38 -277,38 -277,50 -277,38 -277,38 +277,50 @@@@@@@ static inline void notify_cmos_timer(vo int do_adjtimex(struct timex *txc) { struct timespec ts; --- -- long save_adjust, sec; int result; --- -- /* In order to modify anything, you gotta be super-user! */ --- -- if (txc->modes && !capable(CAP_SYS_TIME)) --- -- return -EPERM; --- -- --- -- /* Now we validate the data before disabling interrupts */ --- -- --- -- if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { +++ ++ /* Validate the data before disabling interrupts */ +++ ++ if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ --- -- if (txc->modes & ~ADJ_OFFSET_SS_READ) +++ ++ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; +++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY) && +++ ++ !capable(CAP_SYS_TIME)) +++ ++ return -EPERM; +++ ++ } else { +++ ++ /* In order to modify anything, you gotta be super-user! */ +++ ++ if (txc->modes && !capable(CAP_SYS_TIME)) +++ ++ return -EPERM; +++ ++ +++ ++ /* if the quartz is off by more than 10% something is VERY wrong! */ +++ ++ if (txc->modes & ADJ_TICK && +++ ++ (txc->tick < 900000/USER_HZ || +++ ++ txc->tick > 1100000/USER_HZ)) +++ ++ return -EINVAL; +++ ++ +++ ++ if (txc->modes & ADJ_STATUS && time_state != TIME_OK) +++ ++ hrtimer_cancel(&leap_timer); } --- -- /* if the quartz is off by more than 10% something is VERY wrong ! 
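For context (illustrative only, not part of the patch): the ADJ_ADJTIME / ADJ_OFFSET_READONLY split being introduced in do_adjtimex() keeps adjtime()-style one-shot corrections apart from the ntp_adjtime() mode bits, and a pure status read still needs no privilege. A userspace sketch of such a read-only query, assuming glibc's adjtimex() wrapper:

/* Illustrative only -- read-only adjtimex() query; modes == 0 sets no
 * ADJ_* bits, changes nothing and needs no CAP_SYS_TIME. */
#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* pure read */
        int state = adjtimex(&tx);

        if (state == -1) {
                perror("adjtimex");
                return 1;
        }
        printf("state %d, offset %ld, freq %ld, status 0x%x\n",
               state, tx.offset, tx.freq, (unsigned int)tx.status);
        return 0;
}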
*/ --- -- if (txc->modes & ADJ_TICK) --- -- if (txc->tick < 900000/USER_HZ || --- -- txc->tick > 1100000/USER_HZ) --- -- return -EINVAL; --- -- --- -- if (time_state != TIME_OK && txc->modes & ADJ_STATUS) --- -- hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); --- -- /* Save for later - semantics of adjtime is to return old value */ --- -- save_adjust = time_adjust; --- -- /* If there are input parameters, then process them */ +++ ++ if (txc->modes & ADJ_ADJTIME) { +++ ++ long save_adjust = time_adjust; +++ ++ +++ ++ if (!(txc->modes & ADJ_OFFSET_READONLY)) { +++ ++ /* adjtime() is independent from ntp_adjtime() */ +++ ++ time_adjust = txc->offset; +++ ++ ntp_update_frequency(); +++ ++ } +++ ++ txc->offset = save_adjust; +++ ++ goto adj_done; +++ ++ } if (txc->modes) { +++ ++ long sec; +++ ++ if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@@@@@@ -375,13 -375,13 -375,13 -387,8 -375,13 -375,13 +387,8 @@@@@@@ if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; --- -- if (txc->modes & ADJ_OFFSET) { --- -- if (txc->modes == ADJ_OFFSET_SINGLESHOT) --- -- /* adjtime() is independent from ntp_adjtime() */ --- -- time_adjust = txc->offset; --- -- else --- -- ntp_update_offset(txc->offset); --- -- } +++ ++ if (txc->modes & ADJ_OFFSET) +++ ++ ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@@@@@@ -389,22 -389,22 -389,22 -396,18 -389,22 -389,22 +396,18 @@@@@@@ ntp_update_frequency(); } +++ ++ txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, +++ ++ NTP_SCALE_SHIFT); +++ ++ if (!(time_status & STA_NANO)) +++ ++ txc->offset /= NSEC_PER_USEC; +++ ++ +++ ++adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; --- -- if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || --- -- (txc->modes == ADJ_OFFSET_SS_READ)) --- -- txc->offset = save_adjust; --- -- else { --- -- txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, --- -- NTP_SCALE_SHIFT); --- -- if (!(time_status & STA_NANO)) --- -- txc->offset /= NSEC_PER_USEC; --- -- } --- -- txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * --- -- (s64)PPM_SCALE_INV, --- -- NTP_SCALE_SHIFT); +++ ++ txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * +++ ++ (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --combined kernel/time/timekeeping.c index 5099c95b8aa,e91c29f961c,e91c29f961c,5ecbfc39a26,e91c29f961c,e91c29f961c..e7acfb482a6 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@@@@@@ -58,26 -58,27 -58,27 -58,27 -58,27 -58,27 +58,26 @@@@@@@ struct clocksource *clock #ifdef CONFIG_GENERIC_TIME /** ----- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook +++++ * clocksource_forward_now - update clock to the current time * ----- * private function, must hold xtime_lock lock when being ----- * called. Returns the number of nanoseconds since the ----- * last call to update_wall_time() (adjusted by NTP scaling) +++++ * Forward the current clock to update its state since the last call to +++++ * update_wall_time(). This is useful before significant clock changes, +++++ * as it avoids having to deal with this time offset explicitly. 
*/ -----static inline s64 __get_nsec_offset(void) +++++static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; ----- s64 ns_offset; +++++ s64 nsec; ----- /* read clocksource: */ cycle_now = clocksource_read(clock); ----- ----- /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ clock->cycle_last = cycle_now; ----- /* convert to nanoseconds: */ ----- ns_offset = cyc2ns(clock, cycle_delta); +++++ nsec = cyc2ns(clock, cycle_delta); +++++ timespec_add_ns(&xtime, nsec); ----- return ns_offset; +++++ nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; +++++ clock->raw_time.tv_nsec += nsec; } /** @@@@@@@ -88,7 -89,6 -89,6 -89,6 -89,6 -89,6 +88,7 @@@@@@@ */ void getnstimeofday(struct timespec *ts) { +++++ cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@@@@@@ -96,15 -96,7 -96,7 -96,7 -96,7 -96,7 +96,15 @@@@@@@ seq = read_seqbegin(&xtime_lock); *ts = xtime; ----- nsecs = __get_nsec_offset(); +++++ +++++ /* read clocksource: */ +++++ cycle_now = clocksource_read(clock); +++++ +++++ /* calculate the delta since the last update_wall_time: */ +++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ +++++ /* convert to nanoseconds: */ +++++ nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@@@@@@ -137,22 -129,22 -129,22 -129,22 -129,22 -129,22 +137,22 @@@@@@@ EXPORT_SYMBOL(do_gettimeofday) */ int do_settimeofday(struct timespec *tv) { +++++ struct timespec ts_delta; unsigned long flags; ----- time_t wtm_sec, sec = tv->tv_sec; ----- long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); ----- nsec -= __get_nsec_offset(); +++++ clocksource_forward_now(); +++++ +++++ ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; +++++ ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; +++++ wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); ----- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); ----- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); +++++ xtime = *tv; ----- set_normalized_timespec(&xtime, sec, nsec); ----- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@@@@@@ -178,19 -170,22 -170,22 -170,22 -170,22 -170,22 +178,19 @@@@@@@ EXPORT_SYMBOL(do_settimeofday) static void change_clocksource(void) { struct clocksource *new; ----- cycle_t now; ----- u64 nsec; new = clocksource_get_next(); if (clock == new) return; ----- new->cycle_last = 0; ----- now = clocksource_read(new); ----- nsec = __get_nsec_offset(); ----- timespec_add_ns(&xtime, nsec); +++++ clocksource_forward_now(); ----- clock = new; ----- clock->cycle_last = now; +++++ new->raw_time = clock->raw_time; +++++ clock = new; +++++ clock->cycle_last = 0; +++++ clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@@@@@@ -205,43 -200,10 -200,10 -200,10 -200,10 -200,10 +205,43 @@@@@@@ */ } #else +++++static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -----static inline s64 __get_nsec_offset(void) { return 0; } #endif +++++/** +++++ * getrawmonotonic - Returns the raw monotonic time in a timespec +++++ * @ts: pointer to the timespec to be set +++++ * +++++ * Returns the raw monotonic time (completely un-modified by ntp) +++++ */ +++++void getrawmonotonic(struct timespec *ts) +++++{ +++++ 
unsigned long seq; +++++ s64 nsecs; +++++ cycle_t cycle_now, cycle_delta; +++++ +++++ do { +++++ seq = read_seqbegin(&xtime_lock); +++++ +++++ /* read clocksource: */ +++++ cycle_now = clocksource_read(clock); +++++ +++++ /* calculate the delta since the last update_wall_time: */ +++++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +++++ +++++ /* convert to nanoseconds: */ +++++ nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; +++++ +++++ *ts = clock->raw_time; +++++ +++++ } while (read_seqretry(&xtime_lock, seq)); +++++ +++++ timespec_add_ns(ts, nsecs); +++++} +++++EXPORT_SYMBOL(getrawmonotonic); +++++ +++++ /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@@@@@@ -303,6 -265,8 -265,8 -265,8 -265,8 -265,8 +303,6 @@@@@@@ void __init timekeeping_init(void static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -----/* xtime offset when we went into suspend */ -----static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@@@@@@ -328,6 -292,8 -292,8 -292,8 -292,8 -292,8 +328,6 @@@@@@@ static int timekeeping_resume(struct sy wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } ----- /* Make sure that we have the correct xtime reference */ ----- timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@@@@@@ -353,7 -319,8 -319,8 -319,8 -319,8 -319,8 +353,7 @@@@@@@ static int timekeeping_suspend(struct s timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); ----- /* Get the current xtime offset */ ----- timekeeping_suspend_nsecs = __get_nsec_offset(); +++++ clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@@@@@@ -487,29 -454,23 -454,23 -454,23 -454,23 -454,23 +487,29 @@@@@@@ void update_wall_time(void #else offset = clock->cycle_interval; #endif --- -- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; +++ ++ clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ ----- clock->xtime_nsec += clock->xtime_interval; ----- clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; +++++ clock->cycle_last += clock->cycle_interval; +++++ clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } +++++ clock->raw_time.tv_nsec += clock->raw_interval; +++++ if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { +++++ clock->raw_time.tv_nsec -= NSEC_PER_SEC; +++++ clock->raw_time.tv_sec++; +++++ } +++++ /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@@@@@@ -518,9 -479,9 -479,9 -479,12 -479,9 -479,9 +518,12 @@@@@@@ /* correct the clock when NTP error is too big */ clocksource_adjust(offset); --- -- /* store full nanoseconds into xtime */ --- -- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; +++ ++ /* store full nanoseconds into xtime after rounding it up and +++ ++ * add the remainder to the error difference. 
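For context (illustrative only, not part of the patch): the raw_time/raw_interval accumulation added to update_wall_time() above is the bookkeeping behind the new getrawmonotonic(), which this release exposes to userspace as CLOCK_MONOTONIC_RAW. A sketch of reading it, assuming a kernel and libc that already know that clock id:

/* Illustrative only -- compare the NTP-adjusted monotonic clock with the
 * raw, unadjusted one backed by getrawmonotonic(). */
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec mono, raw;

        if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
            clock_gettime(CLOCK_MONOTONIC_RAW, &raw)) {
                perror("clock_gettime");
                return 1;
        }
        printf("CLOCK_MONOTONIC     %ld.%09ld\n", (long)mono.tv_sec, mono.tv_nsec);
        printf("CLOCK_MONOTONIC_RAW %ld.%09ld\n", (long)raw.tv_sec, raw.tv_nsec);
        return 0;
}

Unlike CLOCK_MONOTONIC, the raw clock is never slewed by NTP, so over time the two readings drift apart at the oscillator's error rate.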
+++ ++ */ +++ ++ xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; +++ ++ clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); diff --combined kernel/timer.c index 03bc7f1f159,e8019cc3418,510fe69351c,03bc7f1f159,03bc7f1f159,510fe69351c..56becf373c5 --- a/kernel/timer.c +++ b/kernel/timer.c @@@@@@@ -978,6 -978,6 -978,7 -978,6 -978,6 -978,7 +978,7 @@@@@@@ void update_process_times(int user_tick run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); ++ ++ printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } @@@@@@@ -1435,9 -1435,11 -1436,9 -1435,9 -1435,9 -1436,9 +1436,11 @@@@@@@ static void __cpuinit migrate_timers(in BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - ---- - ---- local_irq_disable(); - ---- spin_lock(&new_base->lock); + ++++ /* + ++++ * The caller is globally serialized and nobody else + ++++ * takes two locks at once, deadlock is not possible. + ++++ */ + ++++ spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@@@@@@ -1452,8 -1454,7 -1453,8 -1452,8 -1452,8 -1453,8 +1455,7 @@@@@@@ } spin_unlock(&old_base->lock); - ---- spin_unlock(&new_base->lock); - ---- local_irq_enable(); + ++++ spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --combined security/selinux/hooks.c index 03fc6a81ae3,03fc6a81ae3,576e5119907,03fc6a81ae3,69649783c26,576e5119907..3e3fde7c1d2 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@@@@@@ -75,6 -75,6 -75,6 -75,6 -75,7 -75,6 +75,7 @@@@@@@ #include #include #include ++++ +#include #include "avc.h" #include "objsec.h" @@@@@@@ -291,6 -291,6 -291,7 -291,6 -292,6 -291,7 +292,7 @@@@@@@ static void sk_free_security(struct soc struct sk_security_struct *ssec = sk->sk_security; sk->sk_security = NULL; ++ ++ selinux_netlbl_sk_security_free(ssec); kfree(ssec); } @@@@@@@ -324,7 -324,7 -325,7 -324,7 -325,7 -325,7 +326,7 @@@@@@@ enum Opt_rootcontext = 4, }; -- -- static match_table_t tokens = { ++ ++ static const match_table_t tokens = { {Opt_context, CONTEXT_STR "%s"}, {Opt_fscontext, FSCONTEXT_STR "%s"}, {Opt_defcontext, DEFCONTEXT_STR "%s"}, @@@@@@@ -957,7 -957,7 -958,8 -957,7 -958,7 -958,8 +959,8 @@@@@@@ out_err return rc; } -- -- void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) ++ ++ static void selinux_write_opts(struct seq_file *m, ++ ++ struct security_mnt_opts *opts) { int i; char *prefix; @@@@@@@ -1290,7 -1290,7 -1292,7 -1290,7 -1291,7 -1292,7 +1293,7 @@@@@@@ static int inode_doinit_with_dentry(str /* Default to the fs superblock SID. */ isec->sid = sbsec->sid; -- -- if (sbsec->proc) { ++ ++ if (sbsec->proc && !S_ISLNK(inode->i_mode)) { struct proc_inode *proci = PROC_I(inode); if (proci->pde) { isec->sclass = inode_mode_to_security_class(inode->i_mode); @@@@@@@ -2120,7 -2120,7 -2122,6 -2120,7 -2121,7 -2122,6 +2123,6 @@@@@@@ static inline void flush_unauthorized_f long j = -1; int drop_tty = 0; -- -- mutex_lock(&tty_mutex); tty = get_current_tty(); if (tty) { file_list_lock(); @@@@@@@ -2138,8 -2138,8 -2139,8 -2138,8 -2139,8 -2139,8 +2140,8 @@@@@@@ } } file_list_unlock(); ++ ++ tty_kref_put(tty); } -- -- mutex_unlock(&tty_mutex); /* Reset controlling tty. 
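Aside on the migrate_timers() hunk above (illustrative userspace analogue only; all names are hypothetical): replacing local_irq_disable() plus spin_lock() with spin_lock_irq() and spin_lock_nested() leans on the stated guarantee that hotplug is globally serialized, so taking the two base locks in a fixed order while splicing pending work across cannot deadlock. The same shape in plain pthreads:

/* Illustrative only -- splice one locked queue into another while holding
 * both locks in a fixed order, as migrate_timers() does for per-CPU bases. */
#include <pthread.h>
#include <stdio.h>

struct base {
        pthread_mutex_t lock;
        int pending;                    /* stand-in for the timer lists */
};

static struct base dead_base = { PTHREAD_MUTEX_INITIALIZER, 3 };
static struct base live_base = { PTHREAD_MUTEX_INITIALIZER, 1 };

static void migrate(struct base *dead, struct base *live)
{
        pthread_mutex_lock(&live->lock);        /* fixed order: live, then dead */
        pthread_mutex_lock(&dead->lock);
        live->pending += dead->pending;         /* "splice" the work across */
        dead->pending = 0;
        pthread_mutex_unlock(&dead->lock);
        pthread_mutex_unlock(&live->lock);
}

int main(void)
{
        migrate(&dead_base, &live_base);
        printf("live base now has %d pending items\n", live_base.pending);
        return 0;
}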
*/ if (drop_tty) no_tty(); @@@@@@@ -2321,13 -2321,13 -2322,13 -2321,13 -2322,7 -2322,13 +2323,7 @@@@@@@ static void selinux_bprm_post_apply_cre initrlim = init_task.signal->rlim+i; rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur); } ---- - if (current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { ---- - /* ---- - * This will cause RLIMIT_CPU calculations ---- - * to be refigured. ---- - */ ---- - current->it_prof_expires = jiffies_to_cputime(1); ---- - } ++++ + update_rlimit_cpu(rlim->rlim_cur); } /* Wake up the parent if it is waiting so that it can @@@@@@@ -3548,38 -3548,38 -3549,44 -3548,38 -3543,38 -3549,44 +3544,44 @@@@@@@ out #endif /* IPV6 */ static int selinux_parse_skb(struct sk_buff *skb, struct avc_audit_data *ad, -- -- char **addrp, int src, u8 *proto) ++ ++ char **_addrp, int src, u8 *proto) { -- -- int ret = 0; ++ ++ char *addrp; ++ ++ int ret; switch (ad->u.net.family) { case PF_INET: ret = selinux_parse_skb_ipv4(skb, ad, proto); -- -- if (ret || !addrp) -- -- break; -- -- *addrp = (char *)(src ? &ad->u.net.v4info.saddr : -- -- &ad->u.net.v4info.daddr); -- -- break; ++ ++ if (ret) ++ ++ goto parse_error; ++ ++ addrp = (char *)(src ? &ad->u.net.v4info.saddr : ++ ++ &ad->u.net.v4info.daddr); ++ ++ goto okay; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case PF_INET6: ret = selinux_parse_skb_ipv6(skb, ad, proto); -- -- if (ret || !addrp) -- -- break; -- -- *addrp = (char *)(src ? &ad->u.net.v6info.saddr : -- -- &ad->u.net.v6info.daddr); -- -- break; ++ ++ if (ret) ++ ++ goto parse_error; ++ ++ addrp = (char *)(src ? &ad->u.net.v6info.saddr : ++ ++ &ad->u.net.v6info.daddr); ++ ++ goto okay; #endif /* IPV6 */ default: -- -- break; ++ ++ addrp = NULL; ++ ++ goto okay; } -- -- if (unlikely(ret)) -- -- printk(KERN_WARNING -- -- "SELinux: failure in selinux_parse_skb()," -- -- " unable to parse packet\n"); -- -- ++ ++ parse_error: ++ ++ printk(KERN_WARNING ++ ++ "SELinux: failure in selinux_parse_skb()," ++ ++ " unable to parse packet\n"); return ret; ++ ++ ++ ++ okay: ++ ++ if (_addrp) ++ ++ *_addrp = addrp; ++ ++ return 0; } /** @@@@@@@ -3794,6 -3794,6 -3801,7 -3794,6 -3789,6 -3801,7 +3796,7 @@@@@@@ out static int selinux_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { ++ ++ struct sock *sk = sock->sk; struct inode_security_struct *isec; int err; @@@@@@@ -3807,7 -3807,7 -3815,6 -3807,7 -3802,7 -3815,6 +3810,6 @@@@@@@ isec = SOCK_INODE(sock)->i_security; if (isec->sclass == SECCLASS_TCP_SOCKET || isec->sclass == SECCLASS_DCCP_SOCKET) { -- -- struct sock *sk = sock->sk; struct avc_audit_data ad; struct sockaddr_in *addr4 = NULL; struct sockaddr_in6 *addr6 = NULL; @@@@@@@ -3841,6 -3841,6 -3848,8 -3841,6 -3836,6 -3848,8 +3843,8 @@@@@@@ goto out; } ++ ++ err = selinux_netlbl_socket_connect(sk, address); ++ ++ out: return err; } @@@@@@@ -4070,20 -4070,20 -4079,28 -4070,20 -4065,20 -4079,28 +4074,28 @@@@@@@ static int selinux_sock_rcv_skb_iptable } static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb, -- -- struct avc_audit_data *ad, -- -- u16 family, char *addrp) ++ ++ u16 family) { int err; struct sk_security_struct *sksec = sk->sk_security; u32 peer_sid; u32 sk_sid = sksec->sid; ++ ++ struct avc_audit_data ad; ++ ++ char *addrp; ++ ++ ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = skb->iif; ++ ++ ad.u.net.family = family; ++ ++ err = selinux_parse_skb(skb, &ad, &addrp, 1, NULL); ++ ++ if (err) ++ ++ return err; if (selinux_compat_net) -- -- err = selinux_sock_rcv_skb_iptables_compat(sk, skb, 
ad, ++ ++ err = selinux_sock_rcv_skb_iptables_compat(sk, skb, &ad, family, addrp); else err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, -- -- PACKET__RECV, ad); ++ ++ PACKET__RECV, &ad); if (err) return err; @@@@@@@ -4092,12 -4092,12 -4109,14 -4092,12 -4087,12 -4109,14 +4104,14 @@@@@@@ if (err) return err; err = avc_has_perm(sk_sid, peer_sid, -- -- SECCLASS_PEER, PEER__RECV, ad); ++ ++ SECCLASS_PEER, PEER__RECV, &ad); ++ ++ if (err) ++ ++ selinux_netlbl_err(skb, err, 0); } else { -- -- err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, ad); ++ ++ err = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad); if (err) return err; -- -- err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, ad); ++ ++ err = selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad); } return err; @@@@@@@ -4111,6 -4111,6 -4130,8 -4111,6 -4106,6 -4130,8 +4125,8 @@@@@@@ static int selinux_socket_sock_rcv_skb( u32 sk_sid = sksec->sid; struct avc_audit_data ad; char *addrp; ++ ++ u8 secmark_active; ++ ++ u8 peerlbl_active; if (family != PF_INET && family != PF_INET6) return 0; @@@@@@@ -4119,6 -4119,6 -4140,18 -4119,6 -4114,6 -4140,18 +4135,18 @@@@@@@ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) family = PF_INET; ++ ++ /* If any sort of compatibility mode is enabled then handoff processing ++ ++ * to the selinux_sock_rcv_skb_compat() function to deal with the ++ ++ * special handling. We do this in an attempt to keep this function ++ ++ * as fast and as clean as possible. */ ++ ++ if (selinux_compat_net || !selinux_policycap_netpeer) ++ ++ return selinux_sock_rcv_skb_compat(sk, skb, family); ++ ++ ++ ++ secmark_active = selinux_secmark_enabled(); ++ ++ peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); ++ ++ if (!secmark_active && !peerlbl_active) ++ ++ return 0; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = skb->iif; ad.u.net.family = family; @@@@@@@ -4126,15 -4126,15 -4159,7 -4126,15 -4121,15 -4159,7 +4154,7 @@@@@@@ if (err) return err; -- -- /* If any sort of compatibility mode is enabled then handoff processing -- -- * to the selinux_sock_rcv_skb_compat() function to deal with the -- -- * special handling. We do this in an attempt to keep this function -- -- * as fast and as clean as possible. 
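Aside (illustrative only; the parser below is hypothetical): the selinux_parse_skb() rework above replaces the shared fall-through return with explicit parse_error/okay labels, so the warning fires only on a genuine parse failure and the address pointer is written exactly once on success. The same goto-label shape as a standalone C program:

/* Illustrative only -- explicit parse_error/okay labels instead of a shared
 * fall-through return code, mirroring the selinux_parse_skb() restructuring. */
#include <stdio.h>
#include <string.h>

static int parse_addr(const char *pkt, const char **addrp)
{
        const char *addr;

        if (strncmp(pkt, "v4:", 3) == 0 || strncmp(pkt, "v6:", 3) == 0) {
                if (pkt[3] == '\0')
                        goto parse_error;       /* family tag but no address */
                addr = pkt + 3;
                goto okay;
        }
        addr = NULL;            /* unknown family: nothing to report, not an error */
        goto okay;

parse_error:
        fprintf(stderr, "unable to parse packet\n");
        return -1;

okay:
        if (addrp)
                *addrp = addr;
        return 0;
}

int main(void)
{
        const char *addr;

        if (parse_addr("v4:192.0.2.1", &addr) == 0 && addr)
                printf("parsed address %s\n", addr);
        parse_addr("v6:", NULL);        /* exercises the parse_error path */
        return 0;
}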
*/ -- -- if (selinux_compat_net || !selinux_policycap_netpeer) -- -- return selinux_sock_rcv_skb_compat(sk, skb, &ad, -- -- family, addrp); -- -- -- -- if (netlbl_enabled() || selinux_xfrm_enabled()) { ++ ++ if (peerlbl_active) { u32 peer_sid; err = selinux_skb_peerlbl_sid(skb, family, &peer_sid); @@@@@@@ -4142,13 -4142,13 -4167,17 -4142,13 -4137,13 -4167,17 +4162,17 @@@@@@@ return err; err = selinux_inet_sys_rcv_skb(skb->iif, addrp, family, peer_sid, &ad); -- -- if (err) ++ ++ if (err) { ++ ++ selinux_netlbl_err(skb, err, 0); return err; ++ ++ } err = avc_has_perm(sk_sid, peer_sid, SECCLASS_PEER, PEER__RECV, &ad); ++ ++ if (err) ++ ++ selinux_netlbl_err(skb, err, 0); } -- -- if (selinux_secmark_enabled()) { ++ ++ if (secmark_active) { err = avc_has_perm(sk_sid, skb->secmark, SECCLASS_PACKET, PACKET__RECV, &ad); if (err) @@@@@@@ -4207,10 -4207,10 -4236,12 -4207,10 -4202,10 -4236,12 +4231,12 @@@@@@@ static int selinux_socket_getpeersec_dg u32 peer_secid = SECSID_NULL; u16 family; -- -- if (sock) ++ ++ if (skb && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ else if (skb && skb->protocol == htons(ETH_P_IPV6)) ++ ++ family = PF_INET6; ++ ++ else if (sock) family = sock->sk->sk_family; -- -- else if (skb && skb->sk) -- -- family = skb->sk->sk_family; else goto out; @@@@@@@ -4268,8 -4268,8 -4299,6 -4268,8 -4263,8 -4299,6 +4294,6 @@@@@@@ static void selinux_sock_graft(struct s sk->sk_family == PF_UNIX) isec->sid = sksec->sid; sksec->sclass = isec->sclass; -- -- -- -- selinux_netlbl_sock_graft(sk, parent); } static int selinux_inet_conn_request(struct sock *sk, struct sk_buff *skb, @@@@@@@ -4277,10 -4277,10 -4306,15 -4277,10 -4272,10 -4306,15 +4301,15 @@@@@@@ { struct sk_security_struct *sksec = sk->sk_security; int err; ++ ++ u16 family = sk->sk_family; u32 newsid; u32 peersid; -- -- err = selinux_skb_peerlbl_sid(skb, sk->sk_family, &peersid); ++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */ ++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ ++ ++ err = selinux_skb_peerlbl_sid(skb, family, &peersid); if (err) return err; if (peersid == SECSID_NULL) { @@@@@@@ -4315,12 -4315,12 -4349,18 -4315,12 -4310,12 -4349,18 +4344,18 @@@@@@@ static void selinux_inet_csk_clone(stru selinux_netlbl_sk_security_reset(newsksec, req->rsk_ops->family); } -- -- static void selinux_inet_conn_established(struct sock *sk, -- -- struct sk_buff *skb) ++ ++ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) { ++ ++ u16 family = sk->sk_family; struct sk_security_struct *sksec = sk->sk_security; -- -- selinux_skb_peerlbl_sid(skb, sk->sk_family, &sksec->peer_sid); ++ ++ /* handle mapped IPv4 packets arriving via IPv6 sockets */ ++ ++ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) ++ ++ family = PF_INET; ++ ++ ++ ++ selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); ++ ++ ++ ++ selinux_netlbl_inet_conn_established(sk, family); } static void selinux_req_classify_flow(const struct request_sock *req, @@@@@@@ -4370,39 -4370,39 -4410,54 -4370,39 -4365,39 -4410,54 +4405,54 @@@@@@@ out static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex, u16 family) { ++ ++ int err; char *addrp; u32 peer_sid; struct avc_audit_data ad; u8 secmark_active; ++ ++ u8 netlbl_active; u8 peerlbl_active; if (!selinux_policycap_netpeer) return NF_ACCEPT; secmark_active = selinux_secmark_enabled(); -- -- peerlbl_active = netlbl_enabled() || selinux_xfrm_enabled(); ++ ++ netlbl_active = netlbl_enabled(); ++ ++ 
peerlbl_active = netlbl_active || selinux_xfrm_enabled(); if (!secmark_active && !peerlbl_active) return NF_ACCEPT; ++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) ++ ++ return NF_DROP; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ad.u.net.netif = ifindex; ad.u.net.family = family; if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL) != 0) return NF_DROP; -- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid) != 0) -- -- return NF_DROP; -- -- -- -- if (peerlbl_active) -- -- if (selinux_inet_sys_rcv_skb(ifindex, addrp, family, -- -- peer_sid, &ad) != 0) ++ ++ if (peerlbl_active) { ++ ++ err = selinux_inet_sys_rcv_skb(ifindex, addrp, family, ++ ++ peer_sid, &ad); ++ ++ if (err) { ++ ++ selinux_netlbl_err(skb, err, 1); return NF_DROP; ++ ++ } ++ ++ } if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, PACKET__FORWARD_IN, &ad)) return NF_DROP; ++ ++ if (netlbl_active) ++ ++ /* we do this in the FORWARD path and not the POST_ROUTING ++ ++ * path because we want to make sure we apply the necessary ++ ++ * labeling before IPsec is applied so we can leverage AH ++ ++ * protection */ ++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, peer_sid) != 0) ++ ++ return NF_DROP; ++ ++ return NF_ACCEPT; } @@@@@@@ -4426,6 -4426,6 -4481,37 -4426,6 -4421,6 -4481,37 +4476,37 @@@@@@@ static unsigned int selinux_ipv6_forwar } #endif /* IPV6 */ ++ ++ static unsigned int selinux_ip_output(struct sk_buff *skb, ++ ++ u16 family) ++ ++ { ++ ++ u32 sid; ++ ++ ++ ++ if (!netlbl_enabled()) ++ ++ return NF_ACCEPT; ++ ++ ++ ++ /* we do this in the LOCAL_OUT path and not the POST_ROUTING path ++ ++ * because we want to make sure we apply the necessary labeling ++ ++ * before IPsec is applied so we can leverage AH protection */ ++ ++ if (skb->sk) { ++ ++ struct sk_security_struct *sksec = skb->sk->sk_security; ++ ++ sid = sksec->sid; ++ ++ } else ++ ++ sid = SECINITSID_KERNEL; ++ ++ if (selinux_netlbl_skbuff_setsid(skb, family, sid) != 0) ++ ++ return NF_DROP; ++ ++ ++ ++ return NF_ACCEPT; ++ ++ } ++ ++ ++ ++ static unsigned int selinux_ipv4_output(unsigned int hooknum, ++ ++ struct sk_buff *skb, ++ ++ const struct net_device *in, ++ ++ const struct net_device *out, ++ ++ int (*okfn)(struct sk_buff *)) ++ ++ { ++ ++ return selinux_ip_output(skb, PF_INET); ++ ++ } ++ ++ static int selinux_ip_postroute_iptables_compat(struct sock *sk, int ifindex, struct avc_audit_data *ad, @@@@@@@ -4493,30 -4493,30 -4579,36 -4493,30 -4488,30 -4579,36 +4574,36 @@@@@@@ static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb, int ifindex, -- -- struct avc_audit_data *ad, -- -- u16 family, -- -- char *addrp, -- -- u8 proto) ++ ++ u16 family) { struct sock *sk = skb->sk; struct sk_security_struct *sksec; ++ ++ struct avc_audit_data ad; ++ ++ char *addrp; ++ ++ u8 proto; if (sk == NULL) return NF_ACCEPT; sksec = sk->sk_security; ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = ifindex; ++ ++ ad.u.net.family = family; ++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) ++ ++ return NF_DROP; ++ ++ if (selinux_compat_net) { if (selinux_ip_postroute_iptables_compat(skb->sk, ifindex, -- -- ad, family, addrp)) ++ ++ &ad, family, addrp)) return NF_DROP; } else { if (avc_has_perm(sksec->sid, skb->secmark, -- -- SECCLASS_PACKET, PACKET__SEND, ad)) ++ ++ SECCLASS_PACKET, PACKET__SEND, &ad)) return NF_DROP; } if (selinux_policycap_netpeer) -- -- if (selinux_xfrm_postroute_last(sksec->sid, skb, ad, proto)) ++ ++ if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto)) return NF_DROP; return 
NF_ACCEPT; @@@@@@@ -4530,23 -4530,23 -4622,15 -4530,23 -4525,23 -4622,15 +4617,15 @@@@@@@ static unsigned int selinux_ip_postrout struct sock *sk; struct avc_audit_data ad; char *addrp; -- -- u8 proto; u8 secmark_active; u8 peerlbl_active; -- -- AVC_AUDIT_DATA_INIT(&ad, NET); -- -- ad.u.net.netif = ifindex; -- -- ad.u.net.family = family; -- -- if (selinux_parse_skb(skb, &ad, &addrp, 0, &proto)) -- -- return NF_DROP; -- -- /* If any sort of compatibility mode is enabled then handoff processing * to the selinux_ip_postroute_compat() function to deal with the * special handling. We do this in an attempt to keep this function * as fast and as clean as possible. */ if (selinux_compat_net || !selinux_policycap_netpeer) -- -- return selinux_ip_postroute_compat(skb, ifindex, &ad, -- -- family, addrp, proto); ++ ++ return selinux_ip_postroute_compat(skb, ifindex, family); /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec * packet transformation so allow the packet to pass without any checks @@@@@@@ -4562,21 -4562,21 -4646,45 -4562,21 -4557,21 -4646,45 +4641,45 @@@@@@@ if (!secmark_active && !peerlbl_active) return NF_ACCEPT; -- -- /* if the packet is locally generated (skb->sk != NULL) then use the -- -- * socket's label as the peer label, otherwise the packet is being -- -- * forwarded through this system and we need to fetch the peer label -- -- * directly from the packet */ ++ ++ /* if the packet is being forwarded then get the peer label from the ++ ++ * packet itself; otherwise check to see if it is from a local ++ ++ * application or the kernel, if from an application get the peer label ++ ++ * from the sending socket, otherwise use the kernel's sid */ sk = skb->sk; -- -- if (sk) { ++ ++ if (sk == NULL) { ++ ++ switch (family) { ++ ++ case PF_INET: ++ ++ if (IPCB(skb)->flags & IPSKB_FORWARDED) ++ ++ secmark_perm = PACKET__FORWARD_OUT; ++ ++ else ++ ++ secmark_perm = PACKET__SEND; ++ ++ break; ++ ++ case PF_INET6: ++ ++ if (IP6CB(skb)->flags & IP6SKB_FORWARDED) ++ ++ secmark_perm = PACKET__FORWARD_OUT; ++ ++ else ++ ++ secmark_perm = PACKET__SEND; ++ ++ break; ++ ++ default: ++ ++ return NF_DROP; ++ ++ } ++ ++ if (secmark_perm == PACKET__FORWARD_OUT) { ++ ++ if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) ++ ++ return NF_DROP; ++ ++ } else ++ ++ peer_sid = SECINITSID_KERNEL; ++ ++ } else { struct sk_security_struct *sksec = sk->sk_security; peer_sid = sksec->sid; secmark_perm = PACKET__SEND; -- -- } else { -- -- if (selinux_skb_peerlbl_sid(skb, family, &peer_sid)) -- -- return NF_DROP; -- -- secmark_perm = PACKET__FORWARD_OUT; } ++ ++ AVC_AUDIT_DATA_INIT(&ad, NET); ++ ++ ad.u.net.netif = ifindex; ++ ++ ad.u.net.family = family; ++ ++ if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL)) ++ ++ return NF_DROP; ++ ++ if (secmark_active) if (avc_has_perm(peer_sid, skb->secmark, SECCLASS_PACKET, secmark_perm, &ad)) @@@@@@@ -5219,8 -5219,8 -5327,12 -5219,8 -5214,8 -5327,12 +5322,12 @@@@@@@ static int selinux_setprocattr(struct t if (sid == 0) return -EINVAL; -- -- -- -- /* Only allow single threaded processes to change context */ ++ ++ /* ++ ++ * SELinux allows to change context in the following case only. ++ ++ * - Single threaded processes. ++ ++ * - Multi threaded processes intend to change its context into ++ ++ * more restricted domain (defined by TYPEBOUNDS statement). 
++ ++ */ if (atomic_read(&p->mm->mm_users) != 1) { struct task_struct *g, *t; struct mm_struct *mm = p->mm; @@@@@@@ -5228,11 -5228,11 -5340,16 -5228,11 -5223,11 -5340,16 +5335,16 @@@@@@@ do_each_thread(g, t) { if (t->mm == mm && t != p) { read_unlock(&tasklist_lock); -- -- return -EPERM; ++ ++ error = security_bounded_transition(tsec->sid, sid); ++ ++ if (!error) ++ ++ goto boundary_ok; ++ ++ ++ ++ return error; } } while_each_thread(g, t); read_unlock(&tasklist_lock); } ++ ++ boundary_ok: /* Check permissions for the transition. */ error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS, @@@@@@@ -5641,6 -5641,6 -5758,13 -5641,6 -5636,6 -5758,13 +5753,13 @@@@@@@ static struct nf_hook_ops selinux_ipv4_ .pf = PF_INET, .hooknum = NF_INET_FORWARD, .priority = NF_IP_PRI_SELINUX_FIRST, ++ ++ }, ++ ++ { ++ ++ .hook = selinux_ipv4_output, ++ ++ .owner = THIS_MODULE, ++ ++ .pf = PF_INET, ++ ++ .hooknum = NF_INET_LOCAL_OUT, ++ ++ .priority = NF_IP_PRI_SELINUX_FIRST, } };
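For context (illustrative sketch only, assuming the 2.6.28-era nf_register_hooks() API rather than SELinux's exact init path; the init function name is hypothetical): an nf_hook_ops array like selinux_ipv4_ops, just extended above with the NF_INET_LOCAL_OUT entry, is registered once at boot, and a mandatory access control layer typically fails hard if its hooks cannot be installed:

/* Illustrative only -- registering the hook array from the hunk above. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>

static int __init example_nf_ip_init(void)
{
        int err;

        err = nf_register_hooks(selinux_ipv4_ops, ARRAY_SIZE(selinux_ipv4_ops));
        if (err)
                panic("example: nf_register_hooks failed (%d)\n", err);
        return 0;
}
__initcall(example_nf_ip_init);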