From: Linus Torvalds
Date: Thu, 23 Oct 2008 16:37:16 +0000 (-0700)
Subject: Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Tag: v2.6.28-rc1~43
X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=133e887f90208d339088dd60cb1d08a72ba27288;hp=-c;p=linux-2.6-omap-h63xx.git

Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: disable the hrtick for now
  sched: revert back to per-rq vruntime
  sched: fair scheduler should not resched rt tasks
  sched: optimize group load balancer
  sched: minor fast-path overhead reduction
  sched: fix the wrong mask_len, cleanup
  sched: kill unused scheduler decl.
  sched: fix the wrong mask_len
  sched: only update rq->clock while holding rq->lock
---

133e887f90208d339088dd60cb1d08a72ba27288
diff --combined include/linux/sched.h
index 5c38db536e0,4f59c8e8597..10bff55b082
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -287,7 -287,6 +287,6 @@@ extern void trap_init(void)
extern void account_process_tick(struct task_struct *task, int user);
extern void update_process_times(int user);
extern void scheduler_tick(void);
- extern void hrtick_resched(void);
extern void sched_show_task(struct task_struct *p);
@@@ -403,21 -402,12 +402,21 @@@ extern int get_dumpable(struct mm_struc
#define MMF_DUMP_MAPPED_PRIVATE 4
#define MMF_DUMP_MAPPED_SHARED 5
#define MMF_DUMP_ELF_HEADERS 6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED 8
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS 5
+#define MMF_DUMP_FILTER_BITS 7
#define MMF_DUMP_FILTER_MASK \
	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
+	((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF 0
+#endif
struct sighand_struct {
	atomic_t count;
@@@ -434,39 -424,6 +433,39 @@@ struct pacct_struct
	unsigned long ac_minflt, ac_majflt;
};
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime: time spent in user mode, in &cputime_t units
+ * @stime: time spent in kernel mode, in &cputime_t units
+ * @sum_exec_runtime: total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are
+ * tracked for threads and thread groups.  Most things considering
+ * CPU time want to group these counts together and treat all three
+ * of them in parallel.
+ */
+struct task_cputime {
+	cputime_t utime;
+	cputime_t stime;
+	unsigned long long sum_exec_runtime;
+};
+/* Alternate field names when used to cache expirations. */
+#define prof_exp stime
+#define virt_exp utime
+#define sched_exp sum_exec_runtime
+
+/**
+ * struct thread_group_cputime - thread group interval timer counts
+ * @totals: thread group interval timers; substructure for
+ *          uniprocessor kernel, per-cpu for SMP kernel.
+ *
+ * This structure contains the version of task_cputime, above, that is
+ * used for thread group CPU clock calculations.
+ */
+struct thread_group_cputime {
+	struct task_cputime *totals;
+};
+
/*
 * NOTE! "signal_struct" does not have it's own
 * locking, because a shared signal_struct always
@@@ -512,17 -469,6 +511,17 @@@ struct signal_struct
	cputime_t it_prof_expires, it_virt_expires;
	cputime_t it_prof_incr, it_virt_incr;
+	/*
+	 * Thread group totals for process CPU clocks.
+	 * See thread_group_cputime(), et al, for details.
+	 */
+	struct thread_group_cputime cputime;
+
+	/* Earliest-expiration cache. */
+	struct task_cputime cputime_expires;
+
+	struct list_head cpu_timers[3];
+
	/* job control IDs */
	/*
@@@ -553,7 -499,7 +552,7 @@@
	 * Live threads maintain their own counters and add to these
	 * in __exit_signal, except for the group leader.
	 */
-	cputime_t utime, stime, cutime, cstime;
+	cputime_t cutime, cstime;
	cputime_t gtime;
	cputime_t cgtime;
	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@@ -561,6 -507,14 +560,6 @@@
	unsigned long inblock, oublock, cinblock, coublock;
	struct task_io_accounting ioac;
-	/*
-	 * Cumulative ns of scheduled CPU time for dead threads in the
-	 * group, not including a zombie group leader.  (This only differs
-	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
-	 * other than jiffies.)
-	 */
-	unsigned long long sum_sched_runtime;
-
	/*
	 * We don't bother to synchronize most readers of this at all,
	 * because there is no reader checking a limit that actually needs
@@@ -572,6 -526,8 +571,6 @@@
	 */
	struct rlimit rlim[RLIM_NLIMITS];
-	struct list_head cpu_timers[3];
-
	/* keep the process-shared keyrings here so that they do the right
	 * thing in threads created with CLONE_THREAD */
#ifdef CONFIG_KEYS
@@@ -1180,7 -1136,8 +1179,7 @@@ struct task_struct
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
	unsigned long min_flt, maj_flt;
-	cputime_t it_prof_expires, it_virt_expires;
-	unsigned long long it_sched_expires;
+	struct task_cputime cputime_expires;
	struct list_head cpu_timers[3];
/* process credentials */
@@@ -1630,7 -1587,6 +1629,7 @@@ extern unsigned long long cpu_clock(in
extern unsigned long long task_sched_runtime(struct task_struct *task);
+extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
@@@ -1665,6 -1621,7 +1664,7 @@@ extern unsigned int sysctl_sched_featur
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
int sched_nr_latency_handler(struct ctl_table *table, int write,
		struct file *file, void __user *buffer, size_t *length,
@@@ -2127,30 -2084,6 +2127,30 @@@ static inline int spin_needbreak(spinlo
#endif
}
+/*
+ * Thread group CPU time accounting.
+ */
+
+extern int thread_group_cputime_alloc(struct task_struct *);
+extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
+
+static inline void thread_group_cputime_init(struct signal_struct *sig)
+{
+	sig->cputime.totals = NULL;
+}
+
+static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
+{
+	if (curr->signal->cputime.totals)
+		return 0;
+	return thread_group_cputime_alloc(curr);
+}
+
+static inline void thread_group_cputime_free(struct signal_struct *sig)
+{
+	free_percpu(sig->cputime.totals);
+}
+
/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
diff --combined kernel/sched.c
index d906f72b42d,11ca3901783..945a97b9600
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@@ -71,7 -71,6 +71,7 @@@
#include
#include
#include
+#include
#include
#include
@@@ -818,6 -817,13 +818,13 @@@ const_debug unsigned int sysctl_sched_n
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
+ /*
+  * Inject some fuzzyness into changing the per-cpu group shares
+  * this avoids remote rq-locks at the expense of fairness.
+  * default: 4
+  */
+ unsigned int sysctl_sched_shares_thresh = 4;
+
/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
@@@ -1454,8 -1460,8 +1461,8 @@@ static void __set_se_shares(struct sche
 * Calculate and set the cpu's group shares.
 */
static void
- __update_group_shares_cpu(struct task_group *tg, int cpu,
- 		unsigned long sd_shares, unsigned long sd_rq_weight)
+ update_group_shares_cpu(struct task_group *tg, int cpu,
+ 		unsigned long sd_shares, unsigned long sd_rq_weight)
{
	int boost = 0;
	unsigned long shares;
@@@ -1486,19 -1492,23 +1493,23 @@@
	 *
	 */
	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+ 	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
- 	/*
- 	 * record the actual number of shares, not the boosted amount.
- 	 */
- 	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- 	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+ 	if (abs(shares - tg->se[cpu]->load.weight) >
+ 			sysctl_sched_shares_thresh) {
+ 		struct rq *rq = cpu_rq(cpu);
+ 		unsigned long flags;
- 	if (shares < MIN_SHARES)
- 		shares = MIN_SHARES;
- 	else if (shares > MAX_SHARES)
- 		shares = MAX_SHARES;
+ 		spin_lock_irqsave(&rq->lock, flags);
+ 		/*
+ 		 * record the actual number of shares, not the boosted amount.
+ 		 */
+ 		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+ 		tg->cfs_rq[cpu]->rq_weight = rq_weight;
- 	__set_se_shares(tg->se[cpu], shares);
+ 		__set_se_shares(tg->se[cpu], shares);
+ 		spin_unlock_irqrestore(&rq->lock, flags);
+ 	}
}
/*
@@@ -1527,14 -1537,8 +1538,8 @@@ static int tg_shares_up(struct task_gro
	if (!rq_weight)
		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
- 	for_each_cpu_mask(i, sd->span) {
- 		struct rq *rq = cpu_rq(i);
- 		unsigned long flags;
-
- 		spin_lock_irqsave(&rq->lock, flags);
- 		__update_group_shares_cpu(tg, i, shares, rq_weight);
- 		spin_unlock_irqrestore(&rq->lock, flags);
- 	}
+ 	for_each_cpu_mask(i, sd->span)
+ 		update_group_shares_cpu(tg, i, shares, rq_weight);
	return 0;
}
@@@ -1937,7 -1941,6 +1942,7 @@@ unsigned long wait_task_inactive(struc
	 * just go back and repeat.
	 */
	rq = task_rq_lock(p, &flags);
+	trace_sched_wait_task(rq, p);
	running = task_running(rq, p);
	on_rq = p->se.on_rq;
	ncsw = 0;
@@@ -2299,7 -2302,9 +2304,7 @@@ out_activate
	success = 1;
out_running:
-	trace_mark(kernel_sched_wakeup,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+	trace_sched_wakeup(rq, p);
	check_preempt_curr(rq, p, sync);
	p->state = TASK_RUNNING;
@@@ -2432,7 -2437,9 +2437,7 @@@ void wake_up_new_task(struct task_struc
		p->sched_class->task_new(rq, p);
		inc_nr_running(rq);
	}
-	trace_mark(kernel_sched_wakeup_new,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+	trace_sched_wakeup_new(rq, p);
	check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
@@@ -2605,7 -2612,11 +2610,7 @@@ context_switch(struct rq *rq, struct ta
	struct mm_struct *mm, *oldmm;
	prepare_task_switch(rq, prev, next);
-	trace_mark(kernel_sched_schedule,
-		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
-		prev->pid, next->pid, prev->state,
-		rq, prev, next);
+	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
@@@ -2845,7 -2856,6 +2850,7 @@@ static void sched_migrate_task(struct t
	    || unlikely(!cpu_active(dest_cpu)))
		goto out;
+	trace_sched_migrate_task(rq, p, dest_cpu);
	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
@@@ -4047,26 -4057,23 +4052,26 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
 */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
{
	unsigned long flags;
-	u64 ns, delta_exec;
	struct rq *rq;
+	u64 ns = 0;
	rq = task_rq_lock(p, &flags);
-	ns = p->se.sum_exec_runtime;
+	if (task_current(rq, p)) {
+		u64 delta_exec;
+
		update_rq_clock(rq);
		delta_exec = rq->clock - p->se.exec_start;
		if ((s64)delta_exec > 0)
-			ns += delta_exec;
+			ns = delta_exec;
	}
+
	task_rq_unlock(rq, &flags);
	return ns;
@@@ -4083,7 -4090,6 +4088,7 @@@ void account_user_time(struct task_stru
	cputime64_t tmp;
	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
	/* Add user time to cpustat. */
	tmp = cputime_to_cputime64(cputime);
@@@ -4108,7 -4114,6 +4113,7 @@@ static void account_guest_time(struct t
	tmp = cputime_to_cputime64(cputime);
	p->utime = cputime_add(p->utime, cputime);
+	account_group_user_time(p, cputime);
	p->gtime = cputime_add(p->gtime, cputime);
	cpustat->user = cputime64_add(cpustat->user, tmp);
@@@ -4144,7 -4149,6 +4149,7 @@@ void account_system_time(struct task_st
	}
	p->stime = cputime_add(p->stime, cputime);
+	account_group_system_time(p, cputime);
	/* Add system time to cpustat.
	 */
	tmp = cputime_to_cputime64(cputime);
@@@ -4186,7 -4190,6 +4191,7 @@@ void account_steal_time(struct task_str
	if (p == rq->idle) {
		p->stime = cputime_add(p->stime, steal);
+		account_group_system_time(p, steal);
		if (atomic_read(&rq->nr_iowait) > 0)
			cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
		else
@@@ -4443,12 -4446,8 +4448,8 @@@ need_resched_nonpreemptible
	if (sched_feat(HRTICK))
		hrtick_clear(rq);
-	/*
-	 * Do the rq-clock update outside the rq lock:
-	 */
-	local_irq_disable();
+	spin_lock_irq(&rq->lock);
	update_rq_clock(rq);
-	spin_lock(&rq->lock);
	clear_tsk_need_resched(prev);
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
diff --combined kernel/sched_fair.c
index f604dae7131,a0aa38b10fd..9573c33688b
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@@ -73,6 -73,8 +73,8 @@@ unsigned int sysctl_sched_wakeup_granul
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+ static const struct sched_class fair_sched_class;
+
/**************************************************************
 * CFS operations on generic schedulable entities:
 */
@@@ -334,7 -336,7 +336,7 @@@ int sched_nr_latency_handler(struct ctl
#endif
/*
- * delta *= w / rw
+ * delta *= P[w / rw]
 */
static inline unsigned long
calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@@ -348,15 -350,13 +350,13 @@@
}
/*
- * delta *= rw / w
+ * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
-	for_each_sched_entity(se) {
-		delta = calc_delta_mine(delta,
-				cfs_rq_of(se)->load.weight, &se->load);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
	return delta;
}
@@@ -386,26 -386,26 +386,26 @@@ static u64 __sched_period(unsigned lon
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
- * s = p*w/rw
+ * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+	unsigned long nr_running = cfs_rq->nr_running;
+
+	if (unlikely(!se->on_rq))
+		nr_running++;
+
+	return calc_delta_weight(__sched_period(nr_running), se);
}
/*
 * We calculate the vruntime slice of a to be inserted task
 *
- * vs = s*rw/w = p
+ * vs = s/w
 */
- static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
-	unsigned long nr_running = cfs_rq->nr_running;
-
-	if (!se->on_rq)
-		nr_running++;
-
-	return __sched_period(nr_running);
+	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
/*
@@@ -449,7 -449,6 +449,7 @@@ static void update_curr(struct cfs_rq *
		struct task_struct *curtask = task_of(curr);
		cpuacct_charge(curtask, delta_exec);
+		account_group_exec_runtime(curtask, delta_exec);
	}
}
@@@ -628,7 -627,7 +628,7 @@@ place_entity(struct cfs_rq *cfs_rq, str
	 * stays open at the end.
	 */
	if (initial && sched_feat(START_DEBIT))
-		vruntime += sched_vslice_add(cfs_rq, se);
+		vruntime += sched_vslice(cfs_rq, se);
	if (!initial) {
		/* sleeps upto a single latency don't count.
		 */
@@@ -748,7 -747,7 +748,7 @@@ pick_next(struct cfs_rq *cfs_rq, struc
	struct rq *rq = rq_of(cfs_rq);
	u64 pair_slice = rq->clock - cfs_rq->pair_start;
-	if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
+	if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
		cfs_rq->pair_start = rq->clock;
		return se;
	}
@@@ -849,11 -848,31 +849,31 @@@ static void hrtick_start_fair(struct r
		hrtick_start(rq, delta);
	}
}
+
+ /*
+  * called from enqueue/dequeue and updates the hrtick when the
+  * current task is from our class and nr_running is low enough
+  * to matter.
+  */
+ static void hrtick_update(struct rq *rq)
+ {
+ 	struct task_struct *curr = rq->curr;
+
+ 	if (curr->sched_class != &fair_sched_class)
+ 		return;
+
+ 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
+ 		hrtick_start_fair(rq, curr);
+ }
#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
+
+ static inline void hrtick_update(struct rq *rq)
+ {
+ }
#endif
/*
@@@ -874,7 -893,7 +894,7 @@@ static void enqueue_task_fair(struct r
		wakeup = 1;
	}
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
}
/*
@@@ -896,7 -915,7 +916,7 @@@ static void dequeue_task_fair(struct r
		sleep = 1;
	}
-	hrtick_start_fair(rq, rq->curr);
+	hrtick_update(rq);
}
/*
@@@ -1002,8 -1021,6 +1022,6 @@@ static inline int wake_idle(int cpu, st
#ifdef CONFIG_SMP
- static const struct sched_class fair_sched_class;
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
diff --combined kernel/sched_stats.h
index b8c156979cf,67579253b53..2df9d297d29
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@@ -9,7 -9,7 +9,7 @@@
static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
-	int mask_len = NR_CPUS/32 * 9;
+	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
	if (mask_str == NULL)
@@@ -270,89 -270,3 +270,89 @@@ sched_info_switch(struct task_struct *p
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ *           thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->utime = cputime_add(times->utime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ *           thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the stime field there.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+					cputime_t cputime)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->stime = cputime_add(times->stime, cputime);
+		put_cpu_no_resched();
+	}
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ *      of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the sum_exec_runtime field there.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					unsigned long long ns)
+{
+	struct signal_struct *sig;
+
+	sig = tsk->signal;
+	if (unlikely(!sig))
+		return;
+	if (sig->cputime.totals) {
+		struct task_cputime *times;
+
+		times = per_cpu_ptr(sig->cputime.totals, get_cpu());
+		times->sum_exec_runtime += ns;
+		put_cpu_no_resched();
+	}
+}
diff --combined kernel/sysctl.c
index b3cc73931d1,3d804f41e64..a13bd4dfaeb
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@@ -274,6 -274,16 +274,16 @@@ static struct ctl_table kern_table[] =
		.mode = 0644,
		.proc_handler = &proc_dointvec,
	},
+ 	{
+ 		.ctl_name = CTL_UNNUMBERED,
+ 		.procname = "sched_shares_thresh",
+ 		.data = &sysctl_sched_shares_thresh,
+ 		.maxlen = sizeof(unsigned int),
+ 		.mode = 0644,
+ 		.proc_handler = &proc_dointvec_minmax,
+ 		.strategy = &sysctl_intvec,
+ 		.extra1 = &zero,
+ 	},
	{
		.ctl_name = CTL_UNNUMBERED,
		.procname = "sched_child_runs_first",
@@@ -833,16 -843,6 +843,16 @@@
		.proc_handler = &proc_dointvec,
	},
#endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "scan_unevictable_pages",
+		.data = &scan_unevictable_pages,
+		.maxlen = sizeof(scan_unevictable_pages),
+		.mode = 0644,
+		.proc_handler = &scan_unevictable_handler,
+	},
+#endif
	/*
	 * NOTE: do not add new entries to this table unless you have read
	 * Documentation/sysctl/ctl_unnumbered.txt
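
For reference, the per-cpu totals that the account_group_*() helpers in kernel/sched_stats.h update are meant to be folded into a single struct task_cputime whenever a whole-group clock value is needed (see the thread_group_cputime() declaration in the include/linux/sched.h hunk above). The sketch below is illustrative only and is not part of this merge; the helper name is made up, and it assumes sig->cputime.totals was allocated by thread_group_cputime_alloc() as declared above:

/*
 * Illustrative sketch only -- not part of the patch.  Folds the per-cpu
 * thread-group totals into one struct task_cputime.
 */
static void sum_thread_group_cputime(struct signal_struct *sig,
				     struct task_cputime *sum)
{
	int cpu;

	sum->utime = cputime_zero;
	sum->stime = cputime_zero;
	sum->sum_exec_runtime = 0;

	if (!sig->cputime.totals)
		return;

	for_each_possible_cpu(cpu) {
		struct task_cputime *tot;

		tot = per_cpu_ptr(sig->cputime.totals, cpu);
		sum->utime = cputime_add(sum->utime, tot->utime);
		sum->stime = cputime_add(sum->stime, tot->stime);
		sum->sum_exec_runtime += tot->sum_exec_runtime;
	}
}

A reader summing like this accepts a small amount of skew from concurrent updates, which matches the trade-off the accounting helpers above make by touching only the local CPU's slot under get_cpu()/put_cpu_no_resched().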
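
As a worked illustration of the mask_len change in kernel/sched_stats.h (again not from the patch, just a standalone userspace demonstration): the old expression NR_CPUS/32 * 9 rounds down, so any NR_CPUS below 32 asks kmalloc() for zero bytes, while DIV_ROUND_UP() reserves nine characters for every started 32-bit word of the cpumask:

/* Standalone demonstration of the rounding difference. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus;

	for (nr_cpus = 8; nr_cpus <= 128; nr_cpus *= 2)
		printf("NR_CPUS=%3d  old=%2d bytes  new=%2d bytes\n",
		       nr_cpus, nr_cpus / 32 * 9,
		       DIV_ROUND_UP(nr_cpus, 32) * 9);
	return 0;
}

For NR_CPUS = 8 or 16 the old formula yields 0 bytes and the new one 9, which is the underallocation the "fix the wrong mask_len" commits in this merge address.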