Merge branch 'sched/latest' of git://git.kernel.org/pub/scm/linux/kernel/git/ghaskins...
author     Ingo Molnar <mingo@elte.hu>  Sun, 11 Jan 2009 03:58:49 +0000 (04:58 +0100)
committer  Ingo Molnar <mingo@elte.hu>  Sun, 11 Jan 2009 03:58:49 +0000 (04:58 +0100)
include/linux/init_task.h
include/linux/sched.h
kernel/sched.c
kernel/sched_rt.c

diff --combined include/linux/init_task.h
index 2f3c2d4ef73b1b0df8b4e96998a9021480d5706e,6851225f44a76a8d5f17d08bf17ca88fc337f284..9d85d9f03d183e50e1ab2f4272f54b28d0327144
@@@ -12,7 -12,6 +12,7 @@@
  #include <net/net_namespace.h>
  
  extern struct files_struct init_files;
 +extern struct fs_struct init_fs;
  
  #define INIT_KIOCTX(name, which_mm) \
  {                                                     \
@@@ -58,6 -57,7 +58,6 @@@ extern struct nsproxy init_nsproxy
        .mnt_ns         = NULL,                                         \
        INIT_NET_NS(net_ns)                                             \
        INIT_IPC_NS(ipc_ns)                                             \
 -      .user_ns        = &init_user_ns,                                \
  }
  
  #define INIT_SIGHAND(sighand) {                                               \
@@@ -113,8 -113,6 +113,8 @@@ extern struct group_info init_groups
  # define CAP_INIT_BSET  CAP_INIT_EFF_SET
  #endif
  
 +extern struct cred init_cred;
 +
  /*
   *  INIT_TASK is used to set up the first task table, touch at
   * your own risk!. Base=0, limit=0x1fffff (=2MB)
                .nr_cpus_allowed = NR_CPUS,                             \
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
        .children       = LIST_HEAD_INIT(tsk.children),                 \
        .sibling        = LIST_HEAD_INIT(tsk.sibling),                  \
        .group_leader   = &tsk,                                         \
 -      .group_info     = &init_groups,                                 \
 -      .cap_effective  = CAP_INIT_EFF_SET,                             \
 -      .cap_inheritable = CAP_INIT_INH_SET,                            \
 -      .cap_permitted  = CAP_FULL_SET,                                 \
 -      .cap_bset       = CAP_INIT_BSET,                                \
 -      .securebits     = SECUREBITS_DEFAULT,                           \
 -      .user           = INIT_USER,                                    \
 +      .real_cred      = &init_cred,                                   \
 +      .cred           = &init_cred,                                   \
 +      .cred_exec_mutex =                                              \
 +               __MUTEX_INITIALIZER(tsk.cred_exec_mutex),              \
        .comm           = "swapper",                                    \
        .thread         = INIT_THREAD,                                  \
        .fs             = &init_fs,                                     \
diff --combined include/linux/sched.h
index 4cae9b81a1f8851d51a5380d8d37fa7ba3ceb529,440cabb2d432b486b87bea3f9d49cd8054c63239..c37c5141037b8cb398acd2712652a02110d5fb5c
@@@ -284,6 -284,7 +284,6 @@@ long io_schedule_timeout(long timeout)
  
  extern void cpu_init (void);
  extern void trap_init(void);
 -extern void account_process_tick(struct task_struct *task, int user);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
  
@@@ -386,9 -387,6 +386,9 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
 +#define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
 +#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
 +
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
  
@@@ -573,6 -571,12 +573,6 @@@ struct signal_struct 
         */
        struct rlimit rlim[RLIM_NLIMITS];
  
 -      /* keep the process-shared keyrings here so that they do the right
 -       * thing in threads created with CLONE_THREAD */
 -#ifdef CONFIG_KEYS
 -      struct key *session_keyring;    /* keyring inherited over fork */
 -      struct key *process_keyring;    /* keyring private to this process */
 -#endif
  #ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;      /* per-process accounting information */
  #endif
@@@ -643,7 -647,6 +643,7 @@@ struct user_struct 
        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        uid_t uid;
 +      struct user_namespace *user_ns;
  
  #ifdef CONFIG_USER_SCHED
        struct task_group *tg;
@@@ -661,7 -664,6 +661,7 @@@ extern struct user_struct *find_user(ui
  extern struct user_struct root_user;
  #define INIT_USER (&root_user)
  
 +
  struct backing_dev_info;
  struct reclaim_state;
  
  struct sched_info {
        /* cumulative counters */
        unsigned long pcount;         /* # of times run on this cpu */
 -      unsigned long long cpu_time,  /* time spent on the cpu */
 -                         run_delay; /* time spent waiting on a runqueue */
 +      unsigned long long run_delay; /* time spent waiting on a runqueue */
  
        /* timestamps */
        unsigned long long last_arrival,/* when we last ran on a cpu */
@@@ -915,6 -918,7 +915,6 @@@ static inline struct cpumask *sched_dom
  
  extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
 -extern int arch_reinit_sched_domains(void);
  
  /* Test a flag in parent sched domain */
  static inline int test_sd_parent(struct sched_domain *sd, int flag)
@@@ -937,7 -941,38 +937,7 @@@ partition_sched_domains(int ndoms_new, 
  #endif        /* !CONFIG_SMP */
  
  struct io_context;                    /* See blkdev.h */
 -#define NGROUPS_SMALL         32
 -#define NGROUPS_PER_BLOCK     ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
 -struct group_info {
 -      int ngroups;
 -      atomic_t usage;
 -      gid_t small_block[NGROUPS_SMALL];
 -      int nblocks;
 -      gid_t *blocks[0];
 -};
 -
 -/*
 - * get_group_info() must be called with the owning task locked (via task_lock())
 - * when task != current.  The reason being that the vast majority of callers are
 - * looking at current->group_info, which can not be changed except by the
 - * current task.  Changing current->group_info requires the task lock, too.
 - */
 -#define get_group_info(group_info) do { \
 -      atomic_inc(&(group_info)->usage); \
 -} while (0)
 -
 -#define put_group_info(group_info) do { \
 -      if (atomic_dec_and_test(&(group_info)->usage)) \
 -              groups_free(group_info); \
 -} while (0)
  
 -extern struct group_info *groups_alloc(int gidsetsize);
 -extern void groups_free(struct group_info *group_info);
 -extern int set_current_groups(struct group_info *group_info);
 -extern int groups_search(struct group_info *group_info, gid_t grp);
 -/* access the groups "array" with this macro */
 -#define GROUP_AT(gi, i) \
 -    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
  
  #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
  extern void prefetch_stack(struct task_struct *t);
@@@ -977,6 -1012,7 +977,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+       int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1143,6 -1179,7 +1144,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
+       struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
         * The buffer to hold the BTS data.
         */
        void *bts_buffer;
 +      size_t bts_size;
  #endif /* CONFIG_X86_PTRACE_BTS */
  
        /* PID/PID hash table linkage. */
        struct list_head cpu_timers[3];
  
  /* process credentials */
 -      uid_t uid,euid,suid,fsuid;
 -      gid_t gid,egid,sgid,fsgid;
 -      struct group_info *group_info;
 -      kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
 -      struct user_struct *user;
 -      unsigned securebits;
 -#ifdef CONFIG_KEYS
 -      unsigned char jit_keyring;      /* default keyring to attach requested keys to */
 -      struct key *request_key_auth;   /* assumed request_key authority */
 -      struct key *thread_keyring;     /* keyring private to this thread */
 -#endif
 +      const struct cred *real_cred;   /* objective and real subjective task
 +                                       * credentials (COW) */
 +      const struct cred *cred;        /* effective (overridable) subjective task
 +                                       * credentials (COW) */
 +      struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
 +
        char comm[TASK_COMM_LEN]; /* executable name excluding path
                                     - access with [gs]et_task_comm (which lock
                                       it with task_lock())
        int (*notifier)(void *priv);
        void *notifier_data;
        sigset_t *notifier_mask;
 -#ifdef CONFIG_SECURITY
 -      void *security;
 -#endif
        struct audit_context *audit_context;
  #ifdef CONFIG_AUDITSYSCALL
        uid_t loginuid;
@@@ -1706,16 -1750,16 +1708,16 @@@ extern void wake_up_idle_cpu(int cpu)
  static inline void wake_up_idle_cpu(int cpu) { }
  #endif
  
 -#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
 +extern unsigned int sysctl_sched_shares_ratelimit;
 +extern unsigned int sysctl_sched_shares_thresh;
 +#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
 -extern unsigned int sysctl_sched_shares_ratelimit;
 -extern unsigned int sysctl_sched_shares_thresh;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
@@@ -1815,6 -1859,7 +1817,6 @@@ static inline struct user_struct *get_u
        return u;
  }
  extern void free_uid(struct user_struct *);
 -extern void switch_uid(struct user_struct *);
  extern void release_uids(struct user_namespace *ns);
  
  #include <asm/current.h>
@@@ -1833,6 -1878,9 +1835,6 @@@ extern void wake_up_new_task(struct tas
  extern void sched_fork(struct task_struct *p, int clone_flags);
  extern void sched_dead(struct task_struct *p);
  
 -extern int in_group_p(gid_t);
 -extern int in_egroup_p(gid_t);
 -
  extern void proc_caches_init(void);
  extern void flush_signals(struct task_struct *);
  extern void ignore_signals(struct task_struct *);
@@@ -1964,8 -2012,6 +1966,8 @@@ static inline unsigned long wait_task_i
  #define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )
  
 +extern bool is_single_threaded(struct task_struct *);
 +
  /*
   * Careful: do_each_thread/while_each_thread is a double loop so
   *          'break' will not work as expected - use goto instead.
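The hunks above add a needs_post_schedule() predicate next to post_schedule() in struct sched_class, and a pushable_tasks plist node to task_struct. What follows is a minimal userspace sketch of the hook pairing, with stand-in types and names (not kernel code): the cheap predicate is sampled while the runqueue lock is still held, and the heavier callback runs only when it returned true.

/* Userspace model of the needs_post_schedule()/post_schedule() pairing;
 * struct rq and struct sched_class here are illustrative stand-ins. */
#include <stdio.h>

struct rq {
	int nr_pushable;		/* e.g. RT tasks queued for pushing */
};

struct sched_class {
	/* cheap predicate, evaluated while rq->lock is still held */
	int  (*needs_post_schedule)(struct rq *rq);
	/* heavier work, run only when the predicate said so */
	void (*post_schedule)(struct rq *rq);
};

static int rt_needs_post_schedule(struct rq *rq)
{
	return rq->nr_pushable > 0;
}

static void rt_post_schedule(struct rq *rq)
{
	printf("pushing %d task(s) to other CPUs\n", rq->nr_pushable);
	rq->nr_pushable = 0;
}

static const struct sched_class rt_class = {
	.needs_post_schedule	= rt_needs_post_schedule,
	.post_schedule		= rt_post_schedule,
};

/* Models finish_task_switch(): sample the predicate first (under the lock
 * in the real code), then run post_schedule() only if it asked for it. */
static void finish_switch(struct rq *rq, const struct sched_class *class)
{
	int post = 0;

	if (class->needs_post_schedule)
		post = class->needs_post_schedule(rq);

	/* ... rq->lock is dropped here in the kernel ... */

	if (post)
		class->post_schedule(rq);
}

int main(void)
{
	struct rq rq = { .nr_pushable = 2 };

	finish_switch(&rq, &rt_class);	/* pushes */
	finish_switch(&rq, &rt_class);	/* nothing to do */
	return 0;
}
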
diff --combined kernel/sched.c
index deb5ac8c12f37c44e71dcc46484149d073430948,24ab80c28765b6b8a6e713c2f864dbaf61cea14f..dd1a1466c1e6a7412ca43022185c8aed0f76c88d
@@@ -209,6 -209,7 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
 -      rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
  }
  
  static inline int rt_bandwidth_enabled(void)
@@@ -360,9 -361,7 +360,9 @@@ static inline struct task_group *task_g
        struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
 -      tg = p->user->tg;
 +      rcu_read_lock();
 +      tg = __task_cred(p)->user->tg;
 +      rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@@ -464,11 -463,15 +464,15 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+               int next; /* next highest */
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+       struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -611,8 -614,6 +615,8 @@@ struct rq 
  #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
 +      unsigned long long rq_cpu_time;
 +      /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@@ -1146,6 -1147,7 +1150,6 @@@ static void init_rq_hrtick(struct rq *r
  
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
 -      rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
  #else /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
@@@ -1607,21 -1609,42 +1611,42 @@@ static inline void update_shares_locked
  
  #endif
  
+ #ifdef CONFIG_PREEMPT
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  * fair double_lock_balance: Safely acquires both rq->locks in a fair
+  * way at the expense of forcing extra atomic operations in all
+  * invocations.  This assures that the double_lock is acquired using the
+  * same underlying policy as the spinlock_t on this architecture, which
+  * reduces latency compared to the unfair variant below.  However, it
+  * also adds more overhead and therefore may reduce throughput.
   */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+       return 1;
+ }
+ #else
+ /*
+  * Unfair double_lock_balance: Optimizes throughput at the expense of
+  * latency by eliminating extra atomic operations when the locks are
+  * already in proper order on entry.  This favors lower cpu-ids and will
+  * grant the double lock to lower cpus over higher ids under contention,
+  * regardless of entry order into the function.
+  */
+ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
+ #endif /* CONFIG_PREEMPT */
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ {
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       return _double_lock_balance(this_rq, busiest);
+ }
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
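The fair/unfair split above can be modeled outside the kernel with pthread mutexes standing in for rq->lock. In this sketch (function names invented, not kernel code) the fair variant always drops its own lock and takes both in a fixed address order, while the unfair variant trylocks first and only falls back to the ordered path on contention.

/* Userspace model of the two double_lock_balance() flavours; both
 * callers are assumed to already hold 'mine'. */
#include <pthread.h>

/* Fair flavour (CONFIG_PREEMPT): always pay for an unlock plus an ordered
 * re-lock, so acquisition follows the lock's own fairness policy. */
static void double_lock_fair(pthread_mutex_t *mine, pthread_mutex_t *busiest)
{
	pthread_mutex_unlock(mine);
	if (mine < busiest) {
		pthread_mutex_lock(mine);
		pthread_mutex_lock(busiest);
	} else {
		pthread_mutex_lock(busiest);
		pthread_mutex_lock(mine);
	}
}

/* Unfair flavour: keep 'mine' held and trylock 'busiest'; only when that
 * fails and the addresses are out of order do we drop and re-acquire.
 * Cheaper when uncontended, but favours the lower lock address. */
static void double_lock_unfair(pthread_mutex_t *mine, pthread_mutex_t *busiest)
{
	if (pthread_mutex_trylock(busiest) != 0) {
		if (busiest < mine) {
			pthread_mutex_unlock(mine);
			pthread_mutex_lock(busiest);
			pthread_mutex_lock(mine);
		} else {
			pthread_mutex_lock(busiest);
		}
	}
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&a);
	double_lock_unfair(&a, &b);	/* both held on return */
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);

	pthread_mutex_lock(&a);
	double_lock_fair(&a, &b);	/* both held on return */
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);
	return 0;
}
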
@@@ -1873,8 -1912,6 +1914,8 @@@ void set_task_cpu(struct task_struct *p
  
        clock_offset = old_rq->clock - new_rq->clock;
  
 +      trace_sched_migrate_task(p, task_cpu(p), new_cpu);
 +
  #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@@ -2281,7 -2318,6 +2322,7 @@@ static int try_to_wake_up(struct task_s
  
        smp_wmb();
        rq = task_rq_lock(p, &flags);
 +      update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@@ -2339,11 -2375,12 +2380,11 @@@ out_activate
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
 -      update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
  
  out_running:
 -      trace_sched_wakeup(rq, p);
 +      trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2445,6 -2482,8 +2486,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_cpu();
  }
  
@@@ -2476,7 -2515,7 +2519,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
 -      trace_sched_wakeup_new(rq, p);
 +      trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2585,6 -2624,12 +2628,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+ #ifdef CONFIG_SMP
+       int post_schedule = 0;
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+ #endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2855,6 -2900,7 +2904,6 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
 -      trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -2984,6 -3030,16 +3033,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
+ #ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+ #endif
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3030,9 -3086,15 +3089,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
+ #ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
+ #endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3715,7 -3777,7 +3780,7 @@@ redo
                 * don't kick the migration_thread, if the curr
                 * task on busiest cpu can't be moved to this_cpu
                 */
 -              if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
 +              if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
                        double_unlock_balance(this_rq, busiest);
                        all_pinned = 1;
                        return ld_moved;
                }
  
                double_unlock_balance(this_rq, busiest);
 +              /*
 +               * Should not call ttwu while holding a rq->lock
 +               */
 +              spin_unlock(&this_rq->lock);
                if (active_balance)
                        wake_up_process(busiest->migration_thread);
 +              spin_lock(&this_rq->lock);
  
        } else
                sd->nr_balance_failed = 0;
@@@ -4155,17 -4212,13 +4220,17 @@@ unsigned long long task_delta_exec(stru
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in user space since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
 -void account_user_time(struct task_struct *p, cputime_t cputime)
 +void account_user_time(struct task_struct *p, cputime_t cputime,
 +                     cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        cputime64_t tmp;
  
 +      /* Add user time to process. */
        p->utime = cputime_add(p->utime, cputime);
 +      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
   * Account guest cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in virtual machine since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
 -static void account_guest_time(struct task_struct *p, cputime_t cputime)
 +static void account_guest_time(struct task_struct *p, cputime_t cputime,
 +                             cputime_t cputime_scaled)
  {
        cputime64_t tmp;
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
  
        tmp = cputime_to_cputime64(cputime);
  
 +      /* Add guest time to process. */
        p->utime = cputime_add(p->utime, cputime);
 +      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
 +      /* Add guest time to cpustat. */
        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
  }
  
 -/*
 - * Account scaled user cpu time to a process.
 - * @p: the process that the cpu time gets accounted to
 - * @cputime: the cpu time spent in user space since the last update
 - */
 -void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
 -{
 -      p->utimescaled = cputime_add(p->utimescaled, cputime);
 -}
 -
  /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @hardirq_offset: the offset to subtract from hardirq_count()
   * @cputime: the cpu time spent in kernel space since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
  void account_system_time(struct task_struct *p, int hardirq_offset,
 -                       cputime_t cputime)
 +                       cputime_t cputime, cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 -      struct rq *rq = this_rq();
        cputime64_t tmp;
  
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 -              account_guest_time(p, cputime);
 +              account_guest_time(p, cputime, cputime_scaled);
                return;
        }
  
 +      /* Add system time to process. */
        p->stime = cputime_add(p->stime, cputime);
 +      p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
        account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 -      else if (p != rq->idle)
 -              cpustat->system = cputime64_add(cpustat->system, tmp);
 -      else if (atomic_read(&rq->nr_iowait) > 0)
 -              cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
 -              cpustat->idle = cputime64_add(cpustat->idle, tmp);
 +              cpustat->system = cputime64_add(cpustat->system, tmp);
 +
        /* Account for system time used */
        acct_update_integrals(p);
  }
  
  /*
 - * Account scaled system cpu time to a process.
 - * @p: the process that the cpu time gets accounted to
 - * @hardirq_offset: the offset to subtract from hardirq_count()
 - * @cputime: the cpu time spent in kernel space since the last update
 + * Account for involuntary wait time.
 + * @steal: the cpu time spent in involuntary wait
   */
 -void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
 +void account_steal_time(cputime_t cputime)
  {
 -      p->stimescaled = cputime_add(p->stimescaled, cputime);
 +      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 +      cputime64_t cputime64 = cputime_to_cputime64(cputime);
 +
 +      cpustat->steal = cputime64_add(cpustat->steal, cputime64);
  }
  
  /*
 - * Account for involuntary wait time.
 - * @p: the process from which the cpu time has been stolen
 - * @steal: the cpu time spent in involuntary wait
 + * Account for idle time.
 + * @cputime: the cpu time spent in idle wait
   */
 -void account_steal_time(struct task_struct *p, cputime_t steal)
 +void account_idle_time(cputime_t cputime)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 -      cputime64_t tmp = cputime_to_cputime64(steal);
 +      cputime64_t cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
  
 -      if (p == rq->idle) {
 -              p->stime = cputime_add(p->stime, steal);
 -              if (atomic_read(&rq->nr_iowait) > 0)
 -                      cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 -              else
 -                      cpustat->idle = cputime64_add(cpustat->idle, tmp);
 -      } else
 -              cpustat->steal = cputime64_add(cpustat->steal, tmp);
 +      if (atomic_read(&rq->nr_iowait) > 0)
 +              cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
 +      else
 +              cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 +}
 +
 +#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 +
 +/*
 + * Account a single tick of cpu time.
 + * @p: the process that the cpu time gets accounted to
 + * @user_tick: indicates if the tick is a user or a system tick
 + */
 +void account_process_tick(struct task_struct *p, int user_tick)
 +{
 +      cputime_t one_jiffy = jiffies_to_cputime(1);
 +      cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
 +      struct rq *rq = this_rq();
 +
 +      if (user_tick)
 +              account_user_time(p, one_jiffy, one_jiffy_scaled);
 +      else if (p != rq->idle)
 +              account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
 +                                  one_jiffy_scaled);
 +      else
 +              account_idle_time(one_jiffy);
 +}
 +
 +/*
 + * Account multiple ticks of steal time.
 + * @p: the process from which the cpu time has been stolen
 + * @ticks: number of stolen ticks
 + */
 +void account_steal_ticks(unsigned long ticks)
 +{
 +      account_steal_time(jiffies_to_cputime(ticks));
 +}
 +
 +/*
 + * Account multiple ticks of idle time.
 + * @ticks: number of stolen ticks
 + */
 +void account_idle_ticks(unsigned long ticks)
 +{
 +      account_idle_time(jiffies_to_cputime(ticks));
  }
  
 +#endif
 +
  /*
   * Use precise platform statistics if available:
   */
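The accounting rework above funnels every timer tick through account_process_tick(), which charges it as user, system, or idle time, while steal time is added separately in whole ticks. A toy userspace model of that bucketing, with illustrative names only (not kernel code):

/* Toy model of the per-tick classification in the hunk above. */
#include <stdio.h>
#include <stdbool.h>

struct cpu_usage_model {
	unsigned long long user, system, idle, steal;
};

static void model_process_tick(struct cpu_usage_model *stat,
			       bool user_tick, bool idle_task)
{
	if (user_tick)
		stat->user++;		/* tick landed in user space */
	else if (!idle_task)
		stat->system++;		/* kernel work on behalf of a task */
	else
		stat->idle++;		/* the idle task was running */
}

static void model_steal_ticks(struct cpu_usage_model *stat, unsigned long ticks)
{
	stat->steal += ticks;		/* time taken away by the hypervisor */
}

int main(void)
{
	struct cpu_usage_model stat = { 0 };

	model_process_tick(&stat, true, false);
	model_process_tick(&stat, false, true);
	model_steal_ticks(&stat, 3);
	printf("user=%llu system=%llu idle=%llu steal=%llu\n",
	       stat.user, stat.system, stat.idle, stat.steal);
	return 0;
}
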
@@@ -5232,22 -5252,6 +5297,22 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
 +/*
 + * check the target process has a UID that matches the current process's
 + */
 +static bool check_same_owner(struct task_struct *p)
 +{
 +      const struct cred *cred = current_cred(), *pcred;
 +      bool match;
 +
 +      rcu_read_lock();
 +      pcred = __task_cred(p);
 +      match = (cred->euid == pcred->euid ||
 +               cred->euid == pcred->uid);
 +      rcu_read_unlock();
 +      return match;
 +}
 +
  static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
  {
@@@ -5307,7 -5311,8 +5372,7 @@@ recheck
                        return -EPERM;
  
                /* can't change other user's priorities */
 -              if ((current->euid != p->euid) &&
 -                  (current->euid != p->uid))
 +              if (!check_same_owner(p))
                        return -EPERM;
        }
  
@@@ -5546,7 -5551,8 +5611,7 @@@ long sched_setaffinity(pid_t pid, cons
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
 -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
 -                      !capable(CAP_SYS_NICE))
 +      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
  
        retval = security_task_setscheduler(p, 0, NULL);
@@@ -6262,7 -6268,9 +6327,7 @@@ static int __migrate_task_irq(struct ta
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
        int dest_cpu;
 -      /* FIXME: Use cpumask_of_node here. */
 -      cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
 -      const struct cpumask *nodemask = &_nodemask;
 +      const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
  
  again:
        /* Look for allowed, online CPU in same node. */
@@@ -6962,7 -6970,7 +7027,7 @@@ static void rq_attach_root(struct rq *r
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
 -static int init_rootdomain(struct root_domain *rd, bool bootmem)
 +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
        }
  
        if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 -              goto free_rd;
 +              goto out;
        if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                goto free_span;
        if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@@ -6991,7 -6999,8 +7056,7 @@@ free_online
        free_cpumask_var(rd->online);
  free_span:
        free_cpumask_var(rd->span);
 -free_rd:
 -      kfree(rd);
 +out:
        return -ENOMEM;
  }
  
@@@ -7172,18 -7181,21 +7237,18 @@@ static int find_next_best_node(int node
  static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
 -      /* FIXME: use cpumask_of_node() */
 -      node_to_cpumask_ptr(nodemask, node);
        int i;
  
 -      cpus_clear(*span);
 +      cpumask_clear(span);
        nodes_clear(used_nodes);
  
 -      cpus_or(*span, *span, *nodemask);
 +      cpumask_or(span, span, cpumask_of_node(node));
        node_set(node, used_nodes);
  
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
  
 -              node_to_cpumask_ptr_next(nodemask, next_node);
 -              cpus_or(*span, *span, *nodemask);
 +              cpumask_or(span, span, cpumask_of_node(next_node));
        }
  }
  #endif /* CONFIG_NUMA */
@@@ -7263,7 -7275,9 +7328,7 @@@ cpu_to_phys_group(int cpu, const struc
  {
        int group;
  #ifdef CONFIG_SCHED_MC
 -      /* FIXME: Use cpu_coregroup_mask. */
 -      *mask = cpu_coregroup_map(cpu);
 -      cpus_and(*mask, *mask, *cpu_map);
 +      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
        cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
@@@ -7293,8 -7307,10 +7358,8 @@@ static int cpu_to_allnodes_group(int cp
                                 struct cpumask *nodemask)
  {
        int group;
 -      /* FIXME: use cpumask_of_node */
 -      node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
 -      cpumask_and(nodemask, pnodemask, cpu_map);
 +      cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
        group = cpumask_first(nodemask);
  
        if (sg)
@@@ -7345,8 -7361,10 +7410,8 @@@ static void free_sched_groups(const str
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
 -                      /* FIXME: Use cpumask_of_node */
 -                      node_to_cpumask_ptr(pnodemask, i);
  
 -                      cpus_and(*nodemask, *pnodemask, *cpu_map);
 +                      cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                        if (cpumask_empty(nodemask))
                                continue;
  
@@@ -7555,7 -7573,9 +7620,7 @@@ static int __build_sched_domains(const 
        for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
  
 -              /* FIXME: use cpumask_of_node */
 -              *nodemask = node_to_cpumask(cpu_to_node(i));
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
  
  #ifdef CONFIG_NUMA
                if (cpumask_weight(cpu_map) >
                sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
 -              *sched_domain_span(sd) = cpu_coregroup_map(i);
 -              cpumask_and(sched_domain_span(sd),
 -                          sched_domain_span(sd), cpu_map);
 +              cpumask_and(sched_domain_span(sd), cpu_map,
 +                                                 cpu_coregroup_mask(i));
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
        for_each_cpu(i, cpu_map) {
 -              /* FIXME: Use cpu_coregroup_mask */
 -              *this_core_map = cpu_coregroup_map(i);
 -              cpus_and(*this_core_map, *this_core_map, *cpu_map);
 +              cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
                if (i != cpumask_first(this_core_map))
                        continue;
  
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
 -              /* FIXME: Use cpumask_of_node */
 -              *nodemask = node_to_cpumask(i);
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask))
                        continue;
  
                struct sched_group *sg, *prev;
                int j;
  
 -              /* FIXME: Use cpumask_of_node */
 -              *nodemask = node_to_cpumask(i);
                cpumask_clear(covered);
 -
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
  
                for (j = 0; j < nr_node_ids; j++) {
                        int n = (i + j) % nr_node_ids;
 -                      /* FIXME: Use cpumask_of_node */
 -                      node_to_cpumask_ptr(pnodemask, n);
  
                        cpumask_complement(notcovered, covered);
                        cpumask_and(tmpmask, notcovered, cpu_map);
                        if (cpumask_empty(tmpmask))
                                break;
  
 -                      cpumask_and(tmpmask, tmpmask, pnodemask);
 +                      cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
                        if (cpumask_empty(tmpmask))
                                continue;
  
@@@ -7991,7 -8021,7 +8056,7 @@@ match2
  }
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -int arch_reinit_sched_domains(void)
 +static void arch_reinit_sched_domains(void)
  {
        get_online_cpus();
  
  
        rebuild_sched_domains();
        put_online_cpus();
 -
 -      return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
 -      int ret;
        unsigned int level = 0;
  
        if (sscanf(buf, "%u", &level) != 1)
        else
                sched_mc_power_savings = level;
  
 -      ret = arch_reinit_sched_domains();
 +      arch_reinit_sched_domains();
  
 -      return ret ? ret : count;
 +      return count;
  }
  
  #ifdef CONFIG_SCHED_MC
@@@ -8061,7 -8094,7 +8126,7 @@@ static SYSDEV_CLASS_ATTR(sched_smt_powe
                   sched_smt_power_savings_store);
  #endif
  
 -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  {
        int err = 0;
  
@@@ -8201,11 -8234,13 +8266,13 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -9455,41 -9490,6 +9522,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
        kfree(ca);
  }
  
 +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 data;
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit read safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      data = *cpuusage;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      data = *cpuusage;
 +#endif
 +
 +      return data;
 +}
 +
 +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit write safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      *cpuusage = val;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      *cpuusage = val;
 +#endif
 +}
 +
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
        u64 totalcpuusage = 0;
        int i;
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 -
 -              /*
 -               * Take rq->lock to make 64-bit addition safe on 32-bit
 -               * platforms.
 -               */
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              totalcpuusage += *cpuusage;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
 +      for_each_present_cpu(i)
 +              totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
        return totalcpuusage;
  }
@@@ -9515,39 -9524,23 +9582,39 @@@ static int cpuusage_write(struct cgrou
                goto out;
        }
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 +      for_each_present_cpu(i)
 +              cpuacct_cpuusage_write(ca, i, 0);
  
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              *cpuusage = 0;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
  out:
        return err;
  }
  
 +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
 +                                 struct seq_file *m)
 +{
 +      struct cpuacct *ca = cgroup_ca(cgroup);
 +      u64 percpu;
 +      int i;
 +
 +      for_each_present_cpu(i) {
 +              percpu = cpuacct_cpuusage_read(ca, i);
 +              seq_printf(m, "%llu ", (unsigned long long) percpu);
 +      }
 +      seq_printf(m, "\n");
 +      return 0;
 +}
 +
  static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
 +      {
 +              .name = "usage_percpu",
 +              .read_seq_string = cpuacct_percpu_seq_read,
 +      },
 +
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
diff --combined kernel/sched_rt.c
index 954e1a81b7967567b617b918b254deaa12a848a4,64a8f0aa117b1595d28ea0c90adcf7de03985bad..18c7b5b3158aaa1a75ce2fa746d70c71c5a5fa92
@@@ -49,6 -49,24 +49,24 @@@ static void update_rt_migration(struct 
                rq->rt.overloaded = 0;
        }
  }
+ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+ {
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_node_init(&p->pushable_tasks, p->prio);
+       plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ }
+ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ {
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ }
+ #else
+ #define enqueue_pushable_task(rq, p) do { } while (0)
+ #define dequeue_pushable_task(rq, p) do { } while (0)
  #endif /* CONFIG_SMP */
  
  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
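enqueue_pushable_task() and dequeue_pushable_task() above keep a per-runqueue list sorted by priority, re-keying a task's node on every re-insertion. A rough userspace analogue, with a plain sorted list standing in for the kernel's plist and invented names (not kernel code):

/* Sorted by priority, lower value = higher priority; re-enqueueing a task
 * re-keys its node with the task's current prio. */
#include <stdio.h>

struct ptask {
	int prio;
	struct ptask *next;
};

/* dequeue_pushable_task(): unlink the node wherever it sits */
static void pushable_del(struct ptask **head, struct ptask *p)
{
	for (struct ptask **pp = head; *pp; pp = &(*pp)->next)
		if (*pp == p) {
			*pp = p->next;
			p->next = NULL;
			return;
		}
}

/* enqueue_pushable_task(): remove, re-key, then sorted re-insert */
static void pushable_add(struct ptask **head, struct ptask *p, int prio)
{
	struct ptask **pp;

	pushable_del(head, p);
	p->prio = prio;
	for (pp = head; *pp && (*pp)->prio <= prio; pp = &(*pp)->next)
		;
	p->next = *pp;
	*pp = p;
}

int main(void)
{
	struct ptask a = { 0 }, b = { 0 }, c = { 0 };
	struct ptask *head = NULL;

	pushable_add(&head, &a, 90);
	pushable_add(&head, &b, 50);	/* becomes the head: best priority */
	pushable_add(&head, &c, 70);
	pushable_del(&head, &b);	/* as pick_next_pushable_task() would */

	for (struct ptask *p = head; p; p = p->next)
		printf("prio %d\n", p->prio);	/* prints 70, then 90 */
	return 0;
}
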
@@@ -77,7 -95,7 +95,7 @@@ static inline u64 sched_rt_period(struc
  }
  
  #define for_each_leaf_rt_rq(rt_rq, rq) \
 -      list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 +      list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  
  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  {
@@@ -108,7 -126,7 +126,7 @@@ static void sched_rt_rq_enqueue(struct 
        if (rt_rq->rt_nr_running) {
                if (rt_se && !on_rt_rq(rt_se))
                        enqueue_rt_entity(rt_se);
-               if (rt_rq->highest_prio < curr->prio)
+               if (rt_rq->highest_prio.curr < curr->prio)
                        resched_task(curr);
        }
  }
@@@ -473,7 -491,7 +491,7 @@@ static inline int rt_se_prio(struct sch
        struct rt_rq *rt_rq = group_rt_rq(rt_se);
  
        if (rt_rq)
-               return rt_rq->highest_prio;
+               return rt_rq->highest_prio.curr;
  #endif
  
        return rt_task_of(rt_se)->prio;
@@@ -547,33 -565,64 +565,64 @@@ static void update_curr_rt(struct rq *r
        }
  }
  
+ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+ static inline int next_prio(struct rq *rq)
+ {
+       struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+       if (next && rt_prio(next->prio))
+               return next->prio;
+       else
+               return MAX_RT_PRIO;
+ }
+ #endif
  static inline
  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
-       WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-       rt_rq->rt_nr_running++;
- #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+       int prio = rt_se_prio(rt_se);
  #ifdef CONFIG_SMP
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
  #endif
  
-               rt_rq->highest_prio = rt_se_prio(rt_se);
+       WARN_ON(!rt_prio(prio));
+       rt_rq->rt_nr_running++;
+ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+       if (prio < rt_rq->highest_prio.curr) {
+               /*
+                * If the new task is higher in priority than anything on the
+                * run-queue, we have a new high that must be published to
+                * the world.  We also know that the previous high becomes
+                * our next-highest.
+                */
+               rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+               rt_rq->highest_prio.curr = prio;
  #ifdef CONFIG_SMP
                if (rq->online)
-                       cpupri_set(&rq->rd->cpupri, rq->cpu,
-                                  rt_se_prio(rt_se));
+                       cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
  #endif
-       }
+       } else if (prio == rt_rq->highest_prio.curr)
+               /*
+                * If the next task is equal in priority to the highest on
+                * the run-queue, then we implicitly know that the next highest
+                * task cannot be any lower than current
+                */
+               rt_rq->highest_prio.next = prio;
+       else if (prio < rt_rq->highest_prio.next)
+               /*
+                * Otherwise, we need to recompute next-highest
+                */
+               rt_rq->highest_prio.next = next_prio(rq);
  #endif
  #ifdef CONFIG_SMP
-       if (rt_se->nr_cpus_allowed > 1) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rt_se->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory++;
-       }
  
-       update_rt_migration(rq_of_rt_rq(rt_rq));
+       update_rt_migration(rq);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
        if (rt_se_boosted(rt_se))
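The inc_rt_tasks() changes above maintain highest_prio.curr and highest_prio.next incrementally: a priority that beats curr demotes the old curr to next, one equal to curr becomes next, one that only beats next replaces next, and removals (in dec_rt_tasks() below) trigger a recomputation. A small userspace model of those rules over a multiset of priorities, with invented names (not kernel code):

/* Model of rt_rq->highest_prio.{curr,next} maintenance.
 * Lower value = higher priority; MODEL_MAX_PRIO means "none queued". */
#include <stdio.h>

#define MODEL_MAX_PRIO	100

static int nr_at[MODEL_MAX_PRIO];	/* queued tasks per priority level */
static int curr_best = MODEL_MAX_PRIO;
static int next_best = MODEL_MAX_PRIO;

/* Priority of the second-best queued task (kernel: next_prio()). */
static int second_best(void)
{
	int first = 1;

	for (int p = 0; p < MODEL_MAX_PRIO; p++) {
		if (!nr_at[p])
			continue;
		if (first && nr_at[p] == 1) {
			first = 0;
			continue;
		}
		return p;	/* second task, or a twin of the best */
	}
	return MODEL_MAX_PRIO;
}

/* Mirrors the inc_rt_tasks() rules in the hunk above. */
static void model_inc(int prio)
{
	nr_at[prio]++;
	if (prio < curr_best) {
		next_best = curr_best;	/* old best becomes next-best */
		curr_best = prio;
	} else if (prio == curr_best) {
		next_best = prio;	/* a twin of the best */
	} else if (prio < next_best) {
		next_best = prio;
	}
}

/* Mirrors the dec_rt_tasks() rules further below. */
static void model_dec(int prio)
{
	nr_at[prio]--;
	if (prio == curr_best && !nr_at[prio]) {
		curr_best = MODEL_MAX_PRIO;
		for (int p = 0; p < MODEL_MAX_PRIO; p++)
			if (nr_at[p]) {
				curr_best = p;
				break;
			}
	}
	if (prio <= next_best)
		next_best = second_best();
}

int main(void)
{
	model_inc(50);
	model_inc(70);
	model_inc(50);
	printf("curr=%d next=%d\n", curr_best, next_best);	/* 50 50 */
	model_dec(50);
	printf("curr=%d next=%d\n", curr_best, next_best);	/* 50 70 */
	return 0;
}
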
@@@ -590,7 -639,8 +639,8 @@@ static inlin
  void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
  #ifdef CONFIG_SMP
-       int highest_prio = rt_rq->highest_prio;
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+       int highest_prio = rt_rq->highest_prio.curr;
  #endif
  
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running--;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        if (rt_rq->rt_nr_running) {
-               struct rt_prio_array *array;
+               int prio = rt_se_prio(rt_se);
  
-               WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-               if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-                       /* recalculate */
-                       array = &rt_rq->active;
-                       rt_rq->highest_prio =
+               WARN_ON(prio < rt_rq->highest_prio.curr);
+               /*
+                * This may have been our highest or next-highest priority
+                * task and therefore we may have some recomputation to do
+                */
+               if (prio == rt_rq->highest_prio.curr) {
+                       struct rt_prio_array *array = &rt_rq->active;
+                       rt_rq->highest_prio.curr =
                                sched_find_first_bit(array->bitmap);
-               } /* otherwise leave rq->highest prio alone */
+               }
+               if (prio <= rt_rq->highest_prio.next)
+                       rt_rq->highest_prio.next = next_prio(rq);
        } else
-               rt_rq->highest_prio = MAX_RT_PRIO;
+               rt_rq->highest_prio.curr = MAX_RT_PRIO;
  #endif
  #ifdef CONFIG_SMP
-       if (rt_se->nr_cpus_allowed > 1) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rt_se->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory--;
-       }
  
-       if (rt_rq->highest_prio != highest_prio) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+               cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
  
-               if (rq->online)
-                       cpupri_set(&rq->rd->cpupri, rq->cpu,
-                                  rt_rq->highest_prio);
-       }
-       update_rt_migration(rq_of_rt_rq(rt_rq));
+       update_rt_migration(rq);
  #endif /* CONFIG_SMP */
  #ifdef CONFIG_RT_GROUP_SCHED
        if (rt_se_boosted(rt_se))
@@@ -718,6 -769,9 +769,9 @@@ static void enqueue_task_rt(struct rq *
  
        enqueue_rt_entity(rt_se);
  
+       if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
        inc_cpu_load(rq, p->se.load.weight);
  }
  
@@@ -728,6 -782,8 +782,8 @@@ static void dequeue_task_rt(struct rq *
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
  
+       dequeue_pushable_task(rq, p);
        dec_cpu_load(rq, p->se.load.weight);
  }
  
@@@ -878,7 -934,7 +934,7 @@@ static struct sched_rt_entity *pick_nex
        return next;
  }
  
- static struct task_struct *pick_next_task_rt(struct rq *rq)
+ static struct task_struct *_pick_next_task_rt(struct rq *rq)
  {
        struct sched_rt_entity *rt_se;
        struct task_struct *p;
  
        p = rt_task_of(rt_se);
        p->se.exec_start = rq->clock;
+       return p;
+ }
+ static struct task_struct *pick_next_task_rt(struct rq *rq)
+ {
+       struct task_struct *p = _pick_next_task_rt(rq);
+       /* The running task is never eligible for pushing */
+       if (p)
+               dequeue_pushable_task(rq, p);
        return p;
  }
  
@@@ -907,6 -975,13 +975,13 @@@ static void put_prev_task_rt(struct rq 
  {
        update_curr_rt(rq);
        p->se.exec_start = 0;
+       /*
+        * The previous task needs to be made eligible for pushing
+        * if it is still active
+        */
+       if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
  }
  
  #ifdef CONFIG_SMP
@@@ -1072,7 -1147,7 +1147,7 @@@ static struct rq *find_lock_lowest_rq(s
                }
  
                /* If this rq is still suitable use it. */
-               if (lowest_rq->rt.highest_prio > task->prio)
+               if (lowest_rq->rt.highest_prio.curr > task->prio)
                        break;
  
                /* try again */
        return lowest_rq;
  }
  
+ static inline int has_pushable_tasks(struct rq *rq)
+ {
+       return !plist_head_empty(&rq->rt.pushable_tasks);
+ }
+ static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ {
+       struct task_struct *p;
+       if (!has_pushable_tasks(rq))
+               return NULL;
+       p = plist_first_entry(&rq->rt.pushable_tasks,
+                             struct task_struct, pushable_tasks);
+       BUG_ON(rq->cpu != task_cpu(p));
+       BUG_ON(task_current(rq, p));
+       BUG_ON(p->rt.nr_cpus_allowed <= 1);
+       BUG_ON(!p->se.on_rq);
+       BUG_ON(!rt_task(p));
+       return p;
+ }
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task
@@@ -1092,13 -1192,11 +1192,11 @@@ static int push_rt_task(struct rq *rq
  {
        struct task_struct *next_task;
        struct rq *lowest_rq;
-       int ret = 0;
-       int paranoid = RT_MAX_TRIES;
  
        if (!rq->rt.overloaded)
                return 0;
  
-       next_task = pick_next_highest_task_rt(rq, -1);
+       next_task = pick_next_pushable_task(rq);
        if (!next_task)
                return 0;
  
                struct task_struct *task;
                /*
                 * find lock_lowest_rq releases rq->lock
-                * so it is possible that next_task has changed.
-                * If it has, then try again.
+                * so it is possible that next_task has migrated.
+                *
+                * We need to make sure that the task is still on the same
+                * run-queue and is also still the next task eligible for
+                * pushing.
                 */
-               task = pick_next_highest_task_rt(rq, -1);
-               if (unlikely(task != next_task) && task && paranoid--) {
-                       put_task_struct(next_task);
-                       next_task = task;
-                       goto retry;
+               task = pick_next_pushable_task(rq);
+               if (task_cpu(next_task) == rq->cpu && task == next_task) {
+                       /*
+                        * If we get here, the task hasn't moved at all, but
+                        * it has failed to push.  We will not try again,
+                        * since the other cpus will pull from us when they
+                        * are ready.
+                        */
+                       dequeue_pushable_task(rq, next_task);
+                       goto out;
                }
-               goto out;
+               if (!task)
+                       /* No more tasks, just exit */
+                       goto out;
+               /*
+                * Something has shifted, try again.
+                */
+               put_task_struct(next_task);
+               next_task = task;
+               goto retry;
        }
  
        deactivate_task(rq, next_task, 0);
  
        double_unlock_balance(rq, lowest_rq);
  
-       ret = 1;
  out:
        put_task_struct(next_task);
  
-       return ret;
+       return 1;
  }
  
- /*
-  * TODO: Currently we just use the second highest prio task on
-  *       the queue, and stop when it can't migrate (or there's
-  *       no more RT tasks).  There may be a case where a lower
-  *       priority RT task has a different affinity than the
-  *       higher RT task. In this case the lower RT task could
-  *       possibly be able to migrate where as the higher priority
-  *       RT task could not.  We currently ignore this issue.
-  *       Enhancements are welcome!
-  */
  static void push_rt_tasks(struct rq *rq)
  {
        /* push_rt_task will return true if it moved an RT */
  static int pull_rt_task(struct rq *this_rq)
  {
        int this_cpu = this_rq->cpu, ret = 0, cpu;
-       struct task_struct *p, *next;
+       struct task_struct *p;
        struct rq *src_rq;
  
        if (likely(!rt_overloaded(this_rq)))
                return 0;
  
-       next = pick_next_task_rt(this_rq);
        for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
  
                src_rq = cpu_rq(cpu);
+               /*
+                * Don't bother taking the src_rq->lock if the next highest
+                * task is known to be lower-priority than our current task.
+                * This may look racy, but if this value is about to go
+                * logically higher, the src_rq will push this task away.
+                * And if it's going logically lower, we do not care
+                */
+               if (src_rq->rt.highest_prio.next >=
+                   this_rq->rt.highest_prio.curr)
+                       continue;
                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
-                * steal our next task - hence we must cause
-                * the caller to recalculate the next task
-                * in that case:
+                * alter this_rq
                 */
-               if (double_lock_balance(this_rq, src_rq)) {
-                       struct task_struct *old_next = next;
-                       next = pick_next_task_rt(this_rq);
-                       if (next != old_next)
-                               ret = 1;
-               }
+               double_lock_balance(this_rq, src_rq);
  
                /*
                 * Are there still pullable RT tasks?
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
-               if (p && (!next || (p->prio < next->prio))) {
+               if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);
  
                         * This is just that p is wakeing up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
-                        * current task on the run queue or
-                        * this_rq next task is lower in prio than
-                        * the current task on that rq.
+                        * current task on the run queue
                         */
-                       if (p->prio < src_rq->curr->prio ||
-                           (next && next->prio < src_rq->curr->prio))
+                       if (p->prio < src_rq->curr->prio)
                                goto skip;
  
                        ret = 1;
                         * case there's an even higher prio task
                         * in another runqueue. (low likelyhood
                         * but possible)
-                        *
-                        * Update next so that we won't pick a task
-                        * on another cpu with a priority lower (or equal)
-                        * than the one we just picked.
                         */
-                       next = p;
                }
   skip:
                double_unlock_balance(this_rq, src_rq);
  static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  {
        /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
                pull_rt_task(rq);
  }
  
+ /*
+  * assumes rq->lock is held
+  */
+ static int needs_post_schedule_rt(struct rq *rq)
+ {
+       return has_pushable_tasks(rq);
+ }
  static void post_schedule_rt(struct rq *rq)
  {
        /*
-        * If we have more than one rt_task queued, then
-        * see if we can push the other rt_tasks off to other CPUS.
-        * Note we may release the rq lock, and since
-        * the lock was owned by prev, we need to release it
-        * first via finish_lock_switch and then reaquire it here.
+        * This is only called if needs_post_schedule_rt() indicates that
+        * we need to push tasks away
         */
-       if (unlikely(rq->rt.overloaded)) {
-               spin_lock_irq(&rq->lock);
-               push_rt_tasks(rq);
-               spin_unlock_irq(&rq->lock);
-       }
+       spin_lock_irq(&rq->lock);
+       push_rt_tasks(rq);
+       spin_unlock_irq(&rq->lock);
  }
  
  /*
@@@ -1288,7 -1389,8 +1389,8 @@@ static void task_wake_up_rt(struct rq *
  {
        if (!task_running(rq, p) &&
            !test_tsk_need_resched(rq->curr) &&
-           rq->rt.overloaded)
+           has_pushable_tasks(rq) &&
+           p->rt.nr_cpus_allowed > 1)
                push_rt_tasks(rq);
  }
  
@@@ -1324,6 -1426,24 +1426,24 @@@ static void set_cpus_allowed_rt(struct 
        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);
  
+               if (!task_current(rq, p)) {
+                       /*
+                        * Make sure we dequeue this task from the pushable list
+                        * before going further.  It will either remain off of
+                        * the list because we are no longer pushable, or it
+                        * will be requeued.
+                        */
+                       if (p->rt.nr_cpus_allowed > 1)
+                               dequeue_pushable_task(rq, p);
+                       /*
+                        * Requeue if our weight is changing and still > 1
+                        */
+                       if (weight > 1)
+                               enqueue_pushable_task(rq, p);
+               }
                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@@ -1346,7 -1466,7 +1466,7 @@@ static void rq_online_rt(struct rq *rq
  
        __enable_runtime(rq);
  
-       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
  }
  
  /* Assumes rq->lock is held */
@@@ -1383,8 -1503,7 +1503,8 @@@ static inline void init_sched_rt_class(
        unsigned int i;
  
        for_each_possible_cpu(i)
 -              alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
 +              alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
 +                                      GFP_KERNEL, cpu_to_node(i));
  }
  #endif /* CONFIG_SMP */
  
@@@ -1438,7 -1557,7 +1558,7 @@@ static void prio_changed_rt(struct rq *
                 * can release the rq lock and p could migrate.
                 * Only reschedule if p is still on the same runqueue.
                 */
-               if (p->prio > rq->rt.highest_prio && rq->curr == p)
+               if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
                        resched_task(p);
  #else
                /* For UP simply resched on drop of prio */
@@@ -1509,6 -1628,9 +1629,9 @@@ static void set_curr_task_rt(struct rq 
        struct task_struct *p = rq->curr;
  
        p->se.exec_start = rq->clock;
+       /* The running task is never eligible for pushing */
+       dequeue_pushable_task(rq, p);
  }
  
  static const struct sched_class rt_sched_class = {
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
+       .needs_post_schedule    = needs_post_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
        .switched_from          = switched_from_rt,