Merge branch 'sched/latest' of git://git.kernel.org/pub/scm/linux/kernel/git/ghaskins...
author     Ingo Molnar <mingo@elte.hu>  Sun, 11 Jan 2009 03:58:49 +0000 (04:58 +0100)
committer  Ingo Molnar <mingo@elte.hu>  Sun, 11 Jan 2009 03:58:49 +0000 (04:58 +0100)
include/linux/init_task.h
include/linux/sched.h
kernel/sched.c
kernel/sched_rt.c

diff --combined include/linux/init_task.h
index 2f3c2d4ef73b1b0df8b4e96998a9021480d5706e,6851225f44a76a8d5f17d08bf17ca88fc337f284..9d85d9f03d183e50e1ab2f4272f54b28d0327144
@@@ -12,7 -12,6 +12,7 @@@
  #include <net/net_namespace.h>
  
  extern struct files_struct init_files;
 +extern struct fs_struct init_fs;
  
  #define INIT_KIOCTX(name, which_mm) \
  {                                                     \
@@@ -58,6 -57,7 +58,6 @@@ extern struct nsproxy init_nsproxy
        .mnt_ns         = NULL,                                         \
        INIT_NET_NS(net_ns)                                             \
        INIT_IPC_NS(ipc_ns)                                             \
 -      .user_ns        = &init_user_ns,                                \
  }
  
  #define INIT_SIGHAND(sighand) {                                               \
@@@ -113,8 -113,6 +113,8 @@@ extern struct group_info init_groups
  # define CAP_INIT_BSET  CAP_INIT_EFF_SET
  #endif
  
 +extern struct cred init_cred;
 +
  /*
   *  INIT_TASK is used to set up the first task table, touch at
   * your own risk!. Base=0, limit=0x1fffff (=2MB)
                .nr_cpus_allowed = NR_CPUS,                             \
        },                                                              \
        .tasks          = LIST_HEAD_INIT(tsk.tasks),                    \
+       .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
        .ptraced        = LIST_HEAD_INIT(tsk.ptraced),                  \
        .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),             \
        .real_parent    = &tsk,                                         \
        .children       = LIST_HEAD_INIT(tsk.children),                 \
        .sibling        = LIST_HEAD_INIT(tsk.sibling),                  \
        .group_leader   = &tsk,                                         \
 -      .group_info     = &init_groups,                                 \
 -      .cap_effective  = CAP_INIT_EFF_SET,                             \
 -      .cap_inheritable = CAP_INIT_INH_SET,                            \
 -      .cap_permitted  = CAP_FULL_SET,                                 \
 -      .cap_bset       = CAP_INIT_BSET,                                \
 -      .securebits     = SECUREBITS_DEFAULT,                           \
 -      .user           = INIT_USER,                                    \
 +      .real_cred      = &init_cred,                                   \
 +      .cred           = &init_cred,                                   \
 +      .cred_exec_mutex =                                              \
 +               __MUTEX_INITIALIZER(tsk.cred_exec_mutex),              \
        .comm           = "swapper",                                    \
        .thread         = INIT_THREAD,                                  \
        .fs             = &init_fs,                                     \
diff --combined include/linux/sched.h
index 4cae9b81a1f8851d51a5380d8d37fa7ba3ceb529,440cabb2d432b486b87bea3f9d49cd8054c63239..c37c5141037b8cb398acd2712652a02110d5fb5c
@@@ -284,6 -284,7 +284,6 @@@ long io_schedule_timeout(long timeout)
  
  extern void cpu_init (void);
  extern void trap_init(void);
 -extern void account_process_tick(struct task_struct *task, int user);
  extern void update_process_times(int user);
  extern void scheduler_tick(void);
  
@@@ -386,9 -387,6 +386,9 @@@ extern void arch_unmap_area_topdown(str
                (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
 +#define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
 +#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
 +
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
  
@@@ -573,6 -571,12 +573,6 @@@ struct signal_struct 
         */
        struct rlimit rlim[RLIM_NLIMITS];
  
 -      /* keep the process-shared keyrings here so that they do the right
 -       * thing in threads created with CLONE_THREAD */
 -#ifdef CONFIG_KEYS
 -      struct key *session_keyring;    /* keyring inherited over fork */
 -      struct key *process_keyring;    /* keyring private to this process */
 -#endif
  #ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;      /* per-process accounting information */
  #endif
@@@ -643,7 -647,6 +643,7 @@@ struct user_struct 
        /* Hash table maintenance information */
        struct hlist_node uidhash_node;
        uid_t uid;
 +      struct user_namespace *user_ns;
  
  #ifdef CONFIG_USER_SCHED
        struct task_group *tg;
@@@ -661,7 -664,6 +661,7 @@@ extern struct user_struct *find_user(ui
  extern struct user_struct root_user;
  #define INIT_USER (&root_user)
  
 +
  struct backing_dev_info;
  struct reclaim_state;
  
  struct sched_info {
        /* cumulative counters */
        unsigned long pcount;         /* # of times run on this cpu */
 -      unsigned long long cpu_time,  /* time spent on the cpu */
 -                         run_delay; /* time spent waiting on a runqueue */
 +      unsigned long long run_delay; /* time spent waiting on a runqueue */
  
        /* timestamps */
        unsigned long long last_arrival,/* when we last ran on a cpu */
@@@ -915,6 -918,7 +915,6 @@@ static inline struct cpumask *sched_dom
  
  extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
 -extern int arch_reinit_sched_domains(void);
  
  /* Test a flag in parent sched domain */
  static inline int test_sd_parent(struct sched_domain *sd, int flag)
@@@ -937,7 -941,38 +937,7 @@@ partition_sched_domains(int ndoms_new, 
  #endif        /* !CONFIG_SMP */
  
  struct io_context;                    /* See blkdev.h */
 -#define NGROUPS_SMALL         32
 -#define NGROUPS_PER_BLOCK     ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
 -struct group_info {
 -      int ngroups;
 -      atomic_t usage;
 -      gid_t small_block[NGROUPS_SMALL];
 -      int nblocks;
 -      gid_t *blocks[0];
 -};
 -
 -/*
 - * get_group_info() must be called with the owning task locked (via task_lock())
 - * when task != current.  The reason being that the vast majority of callers are
 - * looking at current->group_info, which can not be changed except by the
 - * current task.  Changing current->group_info requires the task lock, too.
 - */
 -#define get_group_info(group_info) do { \
 -      atomic_inc(&(group_info)->usage); \
 -} while (0)
 -
 -#define put_group_info(group_info) do { \
 -      if (atomic_dec_and_test(&(group_info)->usage)) \
 -              groups_free(group_info); \
 -} while (0)
  
 -extern struct group_info *groups_alloc(int gidsetsize);
 -extern void groups_free(struct group_info *group_info);
 -extern int set_current_groups(struct group_info *group_info);
 -extern int groups_search(struct group_info *group_info, gid_t grp);
 -/* access the groups "array" with this macro */
 -#define GROUP_AT(gi, i) \
 -    ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
  
  #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
  extern void prefetch_stack(struct task_struct *t);
@@@ -977,6 -1012,7 +977,7 @@@ struct sched_class 
                              struct rq *busiest, struct sched_domain *sd,
                              enum cpu_idle_type idle);
        void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+       int (*needs_post_schedule) (struct rq *this_rq);
        void (*post_schedule) (struct rq *this_rq);
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
@@@ -1143,6 -1179,7 +1144,7 @@@ struct task_struct 
  #endif
  
        struct list_head tasks;
+       struct plist_node pushable_tasks;
  
        struct mm_struct *mm, *active_mm;
  
         * The buffer to hold the BTS data.
         */
        void *bts_buffer;
 +      size_t bts_size;
  #endif /* CONFIG_X86_PTRACE_BTS */
  
        /* PID/PID hash table linkage. */
        struct list_head cpu_timers[3];
  
  /* process credentials */
 -      uid_t uid,euid,suid,fsuid;
 -      gid_t gid,egid,sgid,fsgid;
 -      struct group_info *group_info;
 -      kernel_cap_t   cap_effective, cap_inheritable, cap_permitted, cap_bset;
 -      struct user_struct *user;
 -      unsigned securebits;
 -#ifdef CONFIG_KEYS
 -      unsigned char jit_keyring;      /* default keyring to attach requested keys to */
 -      struct key *request_key_auth;   /* assumed request_key authority */
 -      struct key *thread_keyring;     /* keyring private to this thread */
 -#endif
 +      const struct cred *real_cred;   /* objective and real subjective task
 +                                       * credentials (COW) */
 +      const struct cred *cred;        /* effective (overridable) subjective task
 +                                       * credentials (COW) */
 +      struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
 +
        char comm[TASK_COMM_LEN]; /* executable name excluding path
                                     - access with [gs]et_task_comm (which lock
                                       it with task_lock())
        int (*notifier)(void *priv);
        void *notifier_data;
        sigset_t *notifier_mask;
 -#ifdef CONFIG_SECURITY
 -      void *security;
 -#endif
        struct audit_context *audit_context;
  #ifdef CONFIG_AUDITSYSCALL
        uid_t loginuid;
@@@ -1706,16 -1750,16 +1708,16 @@@ extern void wake_up_idle_cpu(int cpu)
  static inline void wake_up_idle_cpu(int cpu) { }
  #endif
  
 -#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
 +extern unsigned int sysctl_sched_shares_ratelimit;
 +extern unsigned int sysctl_sched_shares_thresh;
 +#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
 -extern unsigned int sysctl_sched_shares_ratelimit;
 -extern unsigned int sysctl_sched_shares_thresh;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
@@@ -1815,6 -1859,7 +1817,6 @@@ static inline struct user_struct *get_u
        return u;
  }
  extern void free_uid(struct user_struct *);
 -extern void switch_uid(struct user_struct *);
  extern void release_uids(struct user_namespace *ns);
  
  #include <asm/current.h>
@@@ -1833,6 -1878,9 +1835,6 @@@ extern void wake_up_new_task(struct tas
  extern void sched_fork(struct task_struct *p, int clone_flags);
  extern void sched_dead(struct task_struct *p);
  
 -extern int in_group_p(gid_t);
 -extern int in_egroup_p(gid_t);
 -
  extern void proc_caches_init(void);
  extern void flush_signals(struct task_struct *);
  extern void ignore_signals(struct task_struct *);
@@@ -1964,8 -2012,6 +1966,8 @@@ static inline unsigned long wait_task_i
  #define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )
  
 +extern bool is_single_threaded(struct task_struct *);
 +
  /*
   * Careful: do_each_thread/while_each_thread is a double loop so
   *          'break' will not work as expected - use goto instead.
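The hunks above add a needs_post_schedule() predicate next to post_schedule() in struct sched_class, and a pushable_tasks plist node to task_struct. What follows is a minimal userspace sketch of the hook pairing, with stand-in types and names (not kernel code): the cheap predicate is sampled while the runqueue lock is still held, and the heavier callback runs only when it returned true.

/* Userspace model of the needs_post_schedule()/post_schedule() pairing;
 * struct rq and struct sched_class here are illustrative stand-ins. */
#include <stdio.h>

struct rq {
	int nr_pushable;		/* e.g. RT tasks queued for pushing */
};

struct sched_class {
	/* cheap predicate, evaluated while rq->lock is still held */
	int  (*needs_post_schedule)(struct rq *rq);
	/* heavier work, run only when the predicate said so */
	void (*post_schedule)(struct rq *rq);
};

static int rt_needs_post_schedule(struct rq *rq)
{
	return rq->nr_pushable > 0;
}

static void rt_post_schedule(struct rq *rq)
{
	printf("pushing %d task(s) to other CPUs\n", rq->nr_pushable);
	rq->nr_pushable = 0;
}

static const struct sched_class rt_class = {
	.needs_post_schedule	= rt_needs_post_schedule,
	.post_schedule		= rt_post_schedule,
};

/* Models finish_task_switch(): sample the predicate first (under the lock
 * in the real code), then run post_schedule() only if it asked for it. */
static void finish_switch(struct rq *rq, const struct sched_class *class)
{
	int post = 0;

	if (class->needs_post_schedule)
		post = class->needs_post_schedule(rq);

	/* ... rq->lock is dropped here in the kernel ... */

	if (post)
		class->post_schedule(rq);
}

int main(void)
{
	struct rq rq = { .nr_pushable = 2 };

	finish_switch(&rq, &rt_class);	/* pushes */
	finish_switch(&rq, &rt_class);	/* nothing to do */
	return 0;
}
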
diff --combined kernel/sched.c
index deb5ac8c12f37c44e71dcc46484149d073430948,24ab80c28765b6b8a6e713c2f864dbaf61cea14f..dd1a1466c1e6a7412ca43022185c8aed0f76c88d
@@@ -209,6 -209,7 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi
        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
 -      rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
  }
  
  static inline int rt_bandwidth_enabled(void)
@@@ -360,9 -361,7 +360,9 @@@ static inline struct task_group *task_g
        struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
 -      tg = p->user->tg;
 +      rcu_read_lock();
 +      tg = __task_cred(p)->user->tg;
 +      rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
@@@ -464,11 -463,15 +464,15 @@@ struct rt_rq 
        struct rt_prio_array active;
        unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+               int next; /* next highest */
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+       struct plist_head pushable_tasks;
  #endif
        int rt_throttled;
        u64 rt_time;
@@@ -611,8 -614,6 +615,8 @@@ struct rq 
  #ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
 +      unsigned long long rq_cpu_time;
 +      /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
        /* sys_sched_yield() stats */
        unsigned int yld_exp_empty;
@@@ -1146,6 -1147,7 +1150,6 @@@ static void init_rq_hrtick(struct rq *r
  
        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
 -      rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
  }
  #else /* CONFIG_SCHED_HRTICK */
  static inline void hrtick_clear(struct rq *rq)
@@@ -1607,21 -1609,42 +1611,42 @@@ static inline void update_shares_locked
  
  #endif
  
+ #ifdef CONFIG_PREEMPT
  /*
-  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  * fair double_lock_balance: Safely acquires both rq->locks in a fair
+  * way at the expense of forcing extra atomic operations in all
+  * invocations.  This assures that the double_lock is acquired using the
+  * same underlying policy as the spinlock_t on this architecture, which
+  * reduces latency compared to the unfair variant below.  However, it
+  * also adds more overhead and therefore may reduce throughput.
   */
- static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+ {
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+       return 1;
+ }
+ #else
+ /*
+  * Unfair double_lock_balance: Optimizes throughput at the expense of
+  * latency by eliminating extra atomic operations when the locks are
+  * already in proper order on entry.  This favors lower cpu-ids and will
+  * grant the double lock to lower cpus over higher ids under contention,
+  * regardless of entry order into the function.
+  */
+ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
  {
        int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
        return ret;
  }
  
+ #endif /* CONFIG_PREEMPT */
+ /*
+  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+  */
+ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ {
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       return _double_lock_balance(this_rq, busiest);
+ }
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
  {
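The fair/unfair split above can be modeled outside the kernel with pthread mutexes standing in for rq->lock. In this sketch (function names invented, not kernel code) the fair variant always drops its own lock and takes both in a fixed address order, while the unfair variant trylocks first and only falls back to the ordered path on contention.

/* Userspace model of the two double_lock_balance() flavours; both
 * callers are assumed to already hold 'mine'. */
#include <pthread.h>

/* Fair flavour (CONFIG_PREEMPT): always pay for an unlock plus an ordered
 * re-lock, so acquisition follows the lock's own fairness policy. */
static void double_lock_fair(pthread_mutex_t *mine, pthread_mutex_t *busiest)
{
	pthread_mutex_unlock(mine);
	if (mine < busiest) {
		pthread_mutex_lock(mine);
		pthread_mutex_lock(busiest);
	} else {
		pthread_mutex_lock(busiest);
		pthread_mutex_lock(mine);
	}
}

/* Unfair flavour: keep 'mine' held and trylock 'busiest'; only when that
 * fails and the addresses are out of order do we drop and re-acquire.
 * Cheaper when uncontended, but favours the lower lock address. */
static void double_lock_unfair(pthread_mutex_t *mine, pthread_mutex_t *busiest)
{
	if (pthread_mutex_trylock(busiest) != 0) {
		if (busiest < mine) {
			pthread_mutex_unlock(mine);
			pthread_mutex_lock(busiest);
			pthread_mutex_lock(mine);
		} else {
			pthread_mutex_lock(busiest);
		}
	}
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&a);
	double_lock_unfair(&a, &b);	/* both held on return */
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);

	pthread_mutex_lock(&a);
	double_lock_fair(&a, &b);	/* both held on return */
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);
	return 0;
}
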
@@@ -1873,8 -1912,6 +1914,8 @@@ void set_task_cpu(struct task_struct *p
  
        clock_offset = old_rq->clock - new_rq->clock;
  
 +      trace_sched_migrate_task(p, task_cpu(p), new_cpu);
 +
  #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
                p->se.wait_start -= clock_offset;
@@@ -2281,7 -2318,6 +2322,7 @@@ static int try_to_wake_up(struct task_s
  
        smp_wmb();
        rq = task_rq_lock(p, &flags);
 +      update_rq_clock(rq);
        old_state = p->state;
        if (!(old_state & state))
                goto out;
@@@ -2339,11 -2375,12 +2380,11 @@@ out_activate
                schedstat_inc(p, se.nr_wakeups_local);
        else
                schedstat_inc(p, se.nr_wakeups_remote);
 -      update_rq_clock(rq);
        activate_task(rq, p, 1);
        success = 1;
  
  out_running:
 -      trace_sched_wakeup(rq, p);
 +      trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
  
        p->state = TASK_RUNNING;
@@@ -2445,6 -2482,8 +2486,8 @@@ void sched_fork(struct task_struct *p, 
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_cpu();
  }
  
@@@ -2476,7 -2515,7 +2519,7 @@@ void wake_up_new_task(struct task_struc
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
 -      trace_sched_wakeup_new(rq, p);
 +      trace_sched_wakeup_new(rq, p, 1);
        check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@@ -2585,6 -2624,12 +2628,12 @@@ static void finish_task_switch(struct r
  {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+ #ifdef CONFIG_SMP
+       int post_schedule = 0;
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+ #endif
  
        rq->prev_mm = NULL;
  
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                current->sched_class->post_schedule(rq);
  #endif
  
@@@ -2855,6 -2900,7 +2904,6 @@@ static void sched_migrate_task(struct t
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
  
 -      trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@@ -2984,6 -3030,16 +3033,16 @@@ next
        pulled++;
        rem_load_move -= p->se.load.weight;
  
+ #ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+ #endif
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@@ -3030,9 -3086,15 +3089,15 @@@ static int move_tasks(struct rq *this_r
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
  
+ #ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
+ #endif
        } while (class && max_load_move > total_load_moved);
  
        return total_load_moved > 0;
@@@ -3715,7 -3777,7 +3780,7 @@@ redo
                 * don't kick the migration_thread, if the curr
                 * task on busiest cpu can't be moved to this_cpu
                 */
 -              if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
 +              if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
                        double_unlock_balance(this_rq, busiest);
                        all_pinned = 1;
                        return ld_moved;
                }
  
                double_unlock_balance(this_rq, busiest);
 +              /*
 +               * Should not call ttwu while holding a rq->lock
 +               */
 +              spin_unlock(&this_rq->lock);
                if (active_balance)
                        wake_up_process(busiest->migration_thread);
 +              spin_lock(&this_rq->lock);
  
        } else
                sd->nr_balance_failed = 0;
@@@ -4155,17 -4212,13 +4220,17 @@@ unsigned long long task_delta_exec(stru
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in user space since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
 -void account_user_time(struct task_struct *p, cputime_t cputime)
 +void account_user_time(struct task_struct *p, cputime_t cputime,
 +                     cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
        cputime64_t tmp;
  
 +      /* Add user time to process. */
        p->utime = cputime_add(p->utime, cputime);
 +      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
  
        /* Add user time to cpustat. */
   * Account guest cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in virtual machine since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
 -static void account_guest_time(struct task_struct *p, cputime_t cputime)
 +static void account_guest_time(struct task_struct *p, cputime_t cputime,
 +                             cputime_t cputime_scaled)
  {
        cputime64_t tmp;
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
  
        tmp = cputime_to_cputime64(cputime);
  
 +      /* Add guest time to process. */
        p->utime = cputime_add(p->utime, cputime);
 +      p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
        account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
  
 +      /* Add guest time to cpustat. */
        cpustat->user = cputime64_add(cpustat->user, tmp);
        cpustat->guest = cputime64_add(cpustat->guest, tmp);
  }
  
 -/*
 - * Account scaled user cpu time to a process.
 - * @p: the process that the cpu time gets accounted to
 - * @cputime: the cpu time spent in user space since the last update
 - */
 -void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
 -{
 -      p->utimescaled = cputime_add(p->utimescaled, cputime);
 -}
 -
  /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @hardirq_offset: the offset to subtract from hardirq_count()
   * @cputime: the cpu time spent in kernel space since the last update
 + * @cputime_scaled: cputime scaled by cpu frequency
   */
  void account_system_time(struct task_struct *p, int hardirq_offset,
 -                       cputime_t cputime)
 +                       cputime_t cputime, cputime_t cputime_scaled)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 -      struct rq *rq = this_rq();
        cputime64_t tmp;
  
        if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 -              account_guest_time(p, cputime);
 +              account_guest_time(p, cputime, cputime_scaled);
                return;
        }
  
 +      /* Add system time to process. */
        p->stime = cputime_add(p->stime, cputime);
 +      p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
        account_group_system_time(p, cputime);
  
        /* Add system time to cpustat. */
                cpustat->irq = cputime64_add(cpustat->irq, tmp);
        else if (softirq_count())
                cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 -      else if (p != rq->idle)
 -              cpustat->system = cputime64_add(cpustat->system, tmp);
 -      else if (atomic_read(&rq->nr_iowait) > 0)
 -              cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
        else
 -              cpustat->idle = cputime64_add(cpustat->idle, tmp);
 +              cpustat->system = cputime64_add(cpustat->system, tmp);
 +
        /* Account for system time used */
        acct_update_integrals(p);
  }
  
  /*
 - * Account scaled system cpu time to a process.
 - * @p: the process that the cpu time gets accounted to
 - * @hardirq_offset: the offset to subtract from hardirq_count()
 - * @cputime: the cpu time spent in kernel space since the last update
 + * Account for involuntary wait time.
 + * @steal: the cpu time spent in involuntary wait
   */
 -void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
 +void account_steal_time(cputime_t cputime)
  {
 -      p->stimescaled = cputime_add(p->stimescaled, cputime);
 +      struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 +      cputime64_t cputime64 = cputime_to_cputime64(cputime);
 +
 +      cpustat->steal = cputime64_add(cpustat->steal, cputime64);
  }
  
  /*
 - * Account for involuntary wait time.
 - * @p: the process from which the cpu time has been stolen
 - * @steal: the cpu time spent in involuntary wait
 + * Account for idle time.
 + * @cputime: the cpu time spent in idle wait
   */
 -void account_steal_time(struct task_struct *p, cputime_t steal)
 +void account_idle_time(cputime_t cputime)
  {
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 -      cputime64_t tmp = cputime_to_cputime64(steal);
 +      cputime64_t cputime64 = cputime_to_cputime64(cputime);
        struct rq *rq = this_rq();
  
 -      if (p == rq->idle) {
 -              p->stime = cputime_add(p->stime, steal);
 -              if (atomic_read(&rq->nr_iowait) > 0)
 -                      cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
 -              else
 -                      cpustat->idle = cputime64_add(cpustat->idle, tmp);
 -      } else
 -              cpustat->steal = cputime64_add(cpustat->steal, tmp);
 +      if (atomic_read(&rq->nr_iowait) > 0)
 +              cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
 +      else
 +              cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 +}
 +
 +#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 +
 +/*
 + * Account a single tick of cpu time.
 + * @p: the process that the cpu time gets accounted to
 + * @user_tick: indicates if the tick is a user or a system tick
 + */
 +void account_process_tick(struct task_struct *p, int user_tick)
 +{
 +      cputime_t one_jiffy = jiffies_to_cputime(1);
 +      cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
 +      struct rq *rq = this_rq();
 +
 +      if (user_tick)
 +              account_user_time(p, one_jiffy, one_jiffy_scaled);
 +      else if (p != rq->idle)
 +              account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
 +                                  one_jiffy_scaled);
 +      else
 +              account_idle_time(one_jiffy);
 +}
 +
 +/*
 + * Account multiple ticks of steal time.
 + * @p: the process from which the cpu time has been stolen
 + * @ticks: number of stolen ticks
 + */
 +void account_steal_ticks(unsigned long ticks)
 +{
 +      account_steal_time(jiffies_to_cputime(ticks));
 +}
 +
 +/*
 + * Account multiple ticks of idle time.
 + * @ticks: number of stolen ticks
 + */
 +void account_idle_ticks(unsigned long ticks)
 +{
 +      account_idle_time(jiffies_to_cputime(ticks));
  }
  
 +#endif
 +
  /*
   * Use precise platform statistics if available:
   */
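The accounting rework above funnels every timer tick through account_process_tick(), which charges it as user, system, or idle time, while steal time is added separately in whole ticks. A toy userspace model of that bucketing, with illustrative names only (not kernel code):

/* Toy model of the per-tick classification in the hunk above. */
#include <stdio.h>
#include <stdbool.h>

struct cpu_usage_model {
	unsigned long long user, system, idle, steal;
};

static void model_process_tick(struct cpu_usage_model *stat,
			       bool user_tick, bool idle_task)
{
	if (user_tick)
		stat->user++;		/* tick landed in user space */
	else if (!idle_task)
		stat->system++;		/* kernel work on behalf of a task */
	else
		stat->idle++;		/* the idle task was running */
}

static void model_steal_ticks(struct cpu_usage_model *stat, unsigned long ticks)
{
	stat->steal += ticks;		/* time taken away by the hypervisor */
}

int main(void)
{
	struct cpu_usage_model stat = { 0 };

	model_process_tick(&stat, true, false);
	model_process_tick(&stat, false, true);
	model_steal_ticks(&stat, 3);
	printf("user=%llu system=%llu idle=%llu steal=%llu\n",
	       stat.user, stat.system, stat.idle, stat.steal);
	return 0;
}
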
@@@ -5232,22 -5252,6 +5297,22 @@@ __setscheduler(struct rq *rq, struct ta
        set_load_weight(p);
  }
  
 +/*
 + * check the target process has a UID that matches the current process's
 + */
 +static bool check_same_owner(struct task_struct *p)
 +{
 +      const struct cred *cred = current_cred(), *pcred;
 +      bool match;
 +
 +      rcu_read_lock();
 +      pcred = __task_cred(p);
 +      match = (cred->euid == pcred->euid ||
 +               cred->euid == pcred->uid);
 +      rcu_read_unlock();
 +      return match;
 +}
 +
  static int __sched_setscheduler(struct task_struct *p, int policy,
                                struct sched_param *param, bool user)
  {
@@@ -5307,7 -5311,8 +5372,7 @@@ recheck
                        return -EPERM;
  
                /* can't change other user's priorities */
 -              if ((current->euid != p->euid) &&
 -                  (current->euid != p->uid))
 +              if (!check_same_owner(p))
                        return -EPERM;
        }
  
@@@ -5546,7 -5551,8 +5611,7 @@@ long sched_setaffinity(pid_t pid, cons
                goto out_free_cpus_allowed;
        }
        retval = -EPERM;
 -      if ((current->euid != p->euid) && (current->euid != p->uid) &&
 -                      !capable(CAP_SYS_NICE))
 +      if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                goto out_unlock;
  
        retval = security_task_setscheduler(p, 0, NULL);
@@@ -6262,7 -6268,9 +6327,7 @@@ static int __migrate_task_irq(struct ta
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
        int dest_cpu;
 -      /* FIXME: Use cpumask_of_node here. */
 -      cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
 -      const struct cpumask *nodemask = &_nodemask;
 +      const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
  
  again:
        /* Look for allowed, online CPU in same node. */
@@@ -6962,7 -6970,7 +7027,7 @@@ static void rq_attach_root(struct rq *r
        spin_unlock_irqrestore(&rq->lock, flags);
  }
  
 -static int init_rootdomain(struct root_domain *rd, bool bootmem)
 +static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
  {
        memset(rd, 0, sizeof(*rd));
  
        }
  
        if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 -              goto free_rd;
 +              goto out;
        if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                goto free_span;
        if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@@ -6991,7 -6999,8 +7056,7 @@@ free_online
        free_cpumask_var(rd->online);
  free_span:
        free_cpumask_var(rd->span);
 -free_rd:
 -      kfree(rd);
 +out:
        return -ENOMEM;
  }
  
@@@ -7172,18 -7181,21 +7237,18 @@@ static int find_next_best_node(int node
  static void sched_domain_node_span(int node, struct cpumask *span)
  {
        nodemask_t used_nodes;
 -      /* FIXME: use cpumask_of_node() */
 -      node_to_cpumask_ptr(nodemask, node);
        int i;
  
 -      cpus_clear(*span);
 +      cpumask_clear(span);
        nodes_clear(used_nodes);
  
 -      cpus_or(*span, *span, *nodemask);
 +      cpumask_or(span, span, cpumask_of_node(node));
        node_set(node, used_nodes);
  
        for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                int next_node = find_next_best_node(node, &used_nodes);
  
 -              node_to_cpumask_ptr_next(nodemask, next_node);
 -              cpus_or(*span, *span, *nodemask);
 +              cpumask_or(span, span, cpumask_of_node(next_node));
        }
  }
  #endif /* CONFIG_NUMA */
@@@ -7263,7 -7275,9 +7328,7 @@@ cpu_to_phys_group(int cpu, const struc
  {
        int group;
  #ifdef CONFIG_SCHED_MC
 -      /* FIXME: Use cpu_coregroup_mask. */
 -      *mask = cpu_coregroup_map(cpu);
 -      cpus_and(*mask, *mask, *cpu_map);
 +      cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
        cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
@@@ -7293,8 -7307,10 +7358,8 @@@ static int cpu_to_allnodes_group(int cp
                                 struct cpumask *nodemask)
  {
        int group;
 -      /* FIXME: use cpumask_of_node */
 -      node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
 -      cpumask_and(nodemask, pnodemask, cpu_map);
 +      cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
        group = cpumask_first(nodemask);
  
        if (sg)
@@@ -7345,8 -7361,10 +7410,8 @@@ static void free_sched_groups(const str
  
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
 -                      /* FIXME: Use cpumask_of_node */
 -                      node_to_cpumask_ptr(pnodemask, i);
  
 -                      cpus_and(*nodemask, *pnodemask, *cpu_map);
 +                      cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                        if (cpumask_empty(nodemask))
                                continue;
  
@@@ -7555,7 -7573,9 +7620,7 @@@ static int __build_sched_domains(const 
        for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
  
 -              /* FIXME: use cpumask_of_node */
 -              *nodemask = node_to_cpumask(cpu_to_node(i));
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
  
  #ifdef CONFIG_NUMA
                if (cpumask_weight(cpu_map) >
                sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
 -              *sched_domain_span(sd) = cpu_coregroup_map(i);
 -              cpumask_and(sched_domain_span(sd),
 -                          sched_domain_span(sd), cpu_map);
 +              cpumask_and(sched_domain_span(sd), cpu_map,
 +                                                 cpu_coregroup_mask(i));
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
  #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
        for_each_cpu(i, cpu_map) {
 -              /* FIXME: Use cpu_coregroup_mask */
 -              *this_core_map = cpu_coregroup_map(i);
 -              cpus_and(*this_core_map, *this_core_map, *cpu_map);
 +              cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
                if (i != cpumask_first(this_core_map))
                        continue;
  
  
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
 -              /* FIXME: Use cpumask_of_node */
 -              *nodemask = node_to_cpumask(i);
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask))
                        continue;
  
                struct sched_group *sg, *prev;
                int j;
  
 -              /* FIXME: Use cpumask_of_node */
 -              *nodemask = node_to_cpumask(i);
                cpumask_clear(covered);
 -
 -              cpus_and(*nodemask, *nodemask, *cpu_map);
 +              cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
  
                for (j = 0; j < nr_node_ids; j++) {
                        int n = (i + j) % nr_node_ids;
 -                      /* FIXME: Use cpumask_of_node */
 -                      node_to_cpumask_ptr(pnodemask, n);
  
                        cpumask_complement(notcovered, covered);
                        cpumask_and(tmpmask, notcovered, cpu_map);
                        if (cpumask_empty(tmpmask))
                                break;
  
 -                      cpumask_and(tmpmask, tmpmask, pnodemask);
 +                      cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
                        if (cpumask_empty(tmpmask))
                                continue;
  
@@@ -7991,7 -8021,7 +8056,7 @@@ match2
  }
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 -int arch_reinit_sched_domains(void)
 +static void arch_reinit_sched_domains(void)
  {
        get_online_cpus();
  
  
        rebuild_sched_domains();
        put_online_cpus();
 -
 -      return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
 -      int ret;
        unsigned int level = 0;
  
        if (sscanf(buf, "%u", &level) != 1)
        else
                sched_mc_power_savings = level;
  
 -      ret = arch_reinit_sched_domains();
 +      arch_reinit_sched_domains();
  
 -      return ret ? ret : count;
 +      return count;
  }
  
  #ifdef CONFIG_SCHED_MC
@@@ -8061,7 -8094,7 +8126,7 @@@ static SYSDEV_CLASS_ATTR(sched_smt_powe
                   sched_smt_power_savings_store);
  #endif
  
 -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  {
        int err = 0;
  
@@@ -8201,11 -8234,13 +8266,13 @@@ static void init_rt_rq(struct rt_rq *rt
        __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
  #endif
  #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
        rt_rq->rt_time = 0;
@@@ -9455,41 -9490,6 +9522,41 @@@ cpuacct_destroy(struct cgroup_subsys *s
        kfree(ca);
  }
  
 +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +      u64 data;
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit read safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      data = *cpuusage;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      data = *cpuusage;
 +#endif
 +
 +      return data;
 +}
 +
 +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 +{
 +      u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
 +
 +#ifndef CONFIG_64BIT
 +      /*
 +       * Take rq->lock to make 64-bit write safe on 32-bit platforms.
 +       */
 +      spin_lock_irq(&cpu_rq(cpu)->lock);
 +      *cpuusage = val;
 +      spin_unlock_irq(&cpu_rq(cpu)->lock);
 +#else
 +      *cpuusage = val;
 +#endif
 +}
 +
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
        u64 totalcpuusage = 0;
        int i;
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 -
 -              /*
 -               * Take rq->lock to make 64-bit addition safe on 32-bit
 -               * platforms.
 -               */
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              totalcpuusage += *cpuusage;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
 +      for_each_present_cpu(i)
 +              totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
        return totalcpuusage;
  }
@@@ -9515,39 -9524,23 +9582,39 @@@ static int cpuusage_write(struct cgrou
                goto out;
        }
  
 -      for_each_possible_cpu(i) {
 -              u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
 +      for_each_present_cpu(i)
 +              cpuacct_cpuusage_write(ca, i, 0);
  
 -              spin_lock_irq(&cpu_rq(i)->lock);
 -              *cpuusage = 0;
 -              spin_unlock_irq(&cpu_rq(i)->lock);
 -      }
  out:
        return err;
  }
  
 +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
 +                                 struct seq_file *m)
 +{
 +      struct cpuacct *ca = cgroup_ca(cgroup);
 +      u64 percpu;
 +      int i;
 +
 +      for_each_present_cpu(i) {
 +              percpu = cpuacct_cpuusage_read(ca, i);
 +              seq_printf(m, "%llu ", (unsigned long long) percpu);
 +      }
 +      seq_printf(m, "\n");
 +      return 0;
 +}
 +
  static struct cftype files[] = {
        {
                .name = "usage",
                .read_u64 = cpuusage_read,
                .write_u64 = cpuusage_write,
        },
 +      {
 +              .name = "usage_percpu",
 +              .read_seq_string = cpuacct_percpu_seq_read,
 +      },
 +
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
diff --combined kernel/sched_rt.c
index 954e1a81b7967567b617b918b254deaa12a848a4,64a8f0aa117b1595d28ea0c90adcf7de03985bad..18c7b5b3158aaa1a75ce2fa746d70c71c5a5fa92
@@@ -49,6 -49,24 +49,24 @@@ static void update_rt_migration(struct 
                rq->rt.overloaded = 0;
        }
  }
+ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+ {
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+       plist_node_init(&p->pushable_tasks, p->prio);
+       plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ }
+ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+ {
+       plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ }
+ #else
+ #define enqueue_pushable_task(rq, p) do { } while (0)
+ #define dequeue_pushable_task(rq, p) do { } while (0)
  #endif /* CONFIG_SMP */
  
  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
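enqueue_pushable_task() and dequeue_pushable_task() above keep a per-runqueue list sorted by priority, re-keying a task's node on every re-insertion. A rough userspace analogue, with a plain sorted list standing in for the kernel's plist and invented names (not kernel code):

/* Sorted by priority, lower value = higher priority; re-enqueueing a task
 * re-keys its node with the task's current prio. */
#include <stdio.h>

struct ptask {
	int prio;
	struct ptask *next;
};

/* dequeue_pushable_task(): unlink the node wherever it sits */
static void pushable_del(struct ptask **head, struct ptask *p)
{
	for (struct ptask **pp = head; *pp; pp = &(*pp)->next)
		if (*pp == p) {
			*pp = p->next;
			p->next = NULL;
			return;
		}
}

/* enqueue_pushable_task(): remove, re-key, then sorted re-insert */
static void pushable_add(struct ptask **head, struct ptask *p, int prio)
{
	struct ptask **pp;

	pushable_del(head, p);
	p->prio = prio;
	for (pp = head; *pp && (*pp)->prio <= prio; pp = &(*pp)->next)
		;
	p->next = *pp;
	*pp = p;
}

int main(void)
{
	struct ptask a = { 0 }, b = { 0 }, c = { 0 };
	struct ptask *head = NULL;

	pushable_add(&head, &a, 90);
	pushable_add(&head, &b, 50);	/* becomes the head: best priority */
	pushable_add(&head, &c, 70);
	pushable_del(&head, &b);	/* as pick_next_pushable_task() would */

	for (struct ptask *p = head; p; p = p->next)
		printf("prio %d\n", p->prio);	/* prints 70, then 90 */
	return 0;
}
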
@@@ -77,7 -95,7 +95,7 @@@ static inline u64 sched_rt_period(struc
  }
  
  #define for_each_leaf_rt_rq(rt_rq, rq) \
 -      list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 +      list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
  
  static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  {
@@@ -108,7 -126,7 +126,7 @@@ static void sched_rt_rq_enqueue(struct 
        if (rt_rq->rt_nr_running) {
                if (rt_se && !on_rt_rq(rt_se))
                        enqueue_rt_entity(rt_se);
-               if (rt_rq->highest_prio < curr->prio)
+               if (rt_rq->highest_prio.curr < curr->prio)
                        resched_task(curr);
        }
  }
@@@ -473,7 -491,7 +491,7 @@@ static inline int rt_se_prio(struct sch
        struct rt_rq *rt_rq = group_rt_rq(rt_se);
  
        if (rt_rq)
-               return rt_rq->highest_prio;
+               return rt_rq->highest_prio.curr;
  #endif
  
        return rt_task_of(rt_se)->prio;
@@@ -547,33 -565,64 +565,64 @@@ static void update_curr_rt(struct rq *r
        }
  }
  
+ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+ static inline int next_prio(struct rq *rq)
+ {
+       struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+       if (next && rt_prio(next->prio))
+               return next->prio;
+       else
+               return MAX_RT_PRIO;
+ }
+ #endif
  static inline
  void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
-       WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-       rt_rq->rt_nr_running++;
- #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+       int prio = rt_se_prio(rt_se);
  #ifdef CONFIG_SMP
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
  #endif
  
-               rt_rq->highest_prio = rt_se_prio(rt_se);
+       WARN_ON(!rt_prio(prio));
+       rt_rq->rt_nr_running++;
+ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+       if (prio < rt_rq->highest_prio.curr) {
+               /*
+                * If the new task is higher in priority than anything on the
+                * run-queue, we have a new high that must be published to
+                * the world.  We also know that the previous high becomes
+                * our next-highest.
+                */
+               rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+               rt_rq->highest_prio.curr = prio;
  #ifdef CONFIG_SMP
                if (rq->online)
-                       cpupri_set(&rq->rd->cpupri, rq->cpu,
-                                  rt_se_prio(rt_se));
+                       cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
  #endif
-       }
+       } else if (prio == rt_rq->highest_prio.curr)
+               /*
+                * If the next task is equal in priority to the highest on
+                * the run-queue, then we implicitly know that the next highest
+                * task cannot be any lower than current
+                */
+               rt_rq->highest_prio.next = prio;
+       else if (prio < rt_rq->highest_prio.next)
+               /*
+                * Otherwise, we need to recompute next-highest
+                */
+               rt_rq->highest_prio.next = next_prio(rq);
  #endif
  #ifdef CONFIG_SMP
-       if (rt_se->nr_cpus_allowed > 1) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rt_se->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory++;
-       }
  
-       update_rt_migration(rq_of_rt_rq(rt_rq));
+       update_rt_migration(rq);
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED
        if (rt_se_boosted(rt_se))
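The inc_rt_tasks() changes above maintain highest_prio.curr and highest_prio.next incrementally: a priority that beats curr demotes the old curr to next, one equal to curr becomes next, one that only beats next replaces next, and removals (in dec_rt_tasks() below) trigger a recomputation. A small userspace model of those rules over a multiset of priorities, with invented names (not kernel code):

/* Model of rt_rq->highest_prio.{curr,next} maintenance.
 * Lower value = higher priority; MODEL_MAX_PRIO means "none queued". */
#include <stdio.h>

#define MODEL_MAX_PRIO	100

static int nr_at[MODEL_MAX_PRIO];	/* queued tasks per priority level */
static int curr_best = MODEL_MAX_PRIO;
static int next_best = MODEL_MAX_PRIO;

/* Priority of the second-best queued task (kernel: next_prio()). */
static int second_best(void)
{
	int first = 1;

	for (int p = 0; p < MODEL_MAX_PRIO; p++) {
		if (!nr_at[p])
			continue;
		if (first && nr_at[p] == 1) {
			first = 0;
			continue;
		}
		return p;	/* second task, or a twin of the best */
	}
	return MODEL_MAX_PRIO;
}

/* Mirrors the inc_rt_tasks() rules in the hunk above. */
static void model_inc(int prio)
{
	nr_at[prio]++;
	if (prio < curr_best) {
		next_best = curr_best;	/* old best becomes next-best */
		curr_best = prio;
	} else if (prio == curr_best) {
		next_best = prio;	/* a twin of the best */
	} else if (prio < next_best) {
		next_best = prio;
	}
}

/* Mirrors the dec_rt_tasks() rules further below. */
static void model_dec(int prio)
{
	nr_at[prio]--;
	if (prio == curr_best && !nr_at[prio]) {
		curr_best = MODEL_MAX_PRIO;
		for (int p = 0; p < MODEL_MAX_PRIO; p++)
			if (nr_at[p]) {
				curr_best = p;
				break;
			}
	}
	if (prio <= next_best)
		next_best = second_best();
}

int main(void)
{
	model_inc(50);
	model_inc(70);
	model_inc(50);
	printf("curr=%d next=%d\n", curr_best, next_best);	/* 50 50 */
	model_dec(50);
	printf("curr=%d next=%d\n", curr_best, next_best);	/* 50 70 */
	return 0;
}
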
@@@ -590,7 -639,8 +639,8 @@@ static inlin
  void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  {
  #ifdef CONFIG_SMP
-       int highest_prio = rt_rq->highest_prio;
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+       int highest_prio = rt_rq->highest_prio.curr;
  #endif
  
        WARN_ON(!rt_prio(rt_se_prio(rt_se)));
        rt_rq->rt_nr_running--;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        if (rt_rq->rt_nr_running) {
-               struct rt_prio_array *array;
+               int prio = rt_se_prio(rt_se);
  
-               WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-               if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-                       /* recalculate */
-                       array = &rt_rq->active;
-                       rt_rq->highest_prio =
+               WARN_ON(prio < rt_rq->highest_prio.curr);
+               /*
+                * This may have been our highest or next-highest priority
+                * task and therefore we may have some recomputation to do
+                */
+               if (prio == rt_rq->highest_prio.curr) {
+                       struct rt_prio_array *array = &rt_rq->active;
+                       rt_rq->highest_prio.curr =
                                sched_find_first_bit(array->bitmap);
-               } /* otherwise leave rq->highest prio alone */
+               }
+               if (prio <= rt_rq->highest_prio.next)
+                       rt_rq->highest_prio.next = next_prio(rq);
        } else
-               rt_rq->highest_prio = MAX_RT_PRIO;
+               rt_rq->highest_prio.curr = MAX_RT_PRIO;
  #endif
  #ifdef CONFIG_SMP
-       if (rt_se->nr_cpus_allowed > 1) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rt_se->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory--;
-       }
  
-       if (rt_rq->highest_prio != highest_prio) {
-               struct rq *rq = rq_of_rt_rq(rt_rq);
+       if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+               cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
  
-               if (rq->online)
-                       cpupri_set(&rq->rd->cpupri, rq->cpu,
-                                  rt_rq->highest_prio);
-       }
-       update_rt_migration(rq_of_rt_rq(rt_rq));
+       update_rt_migration(rq);
  #endif /* CONFIG_SMP */
  #ifdef CONFIG_RT_GROUP_SCHED
        if (rt_se_boosted(rt_se))
@@@ -718,6 -769,9 +769,9 @@@ static void enqueue_task_rt(struct rq *
  
        enqueue_rt_entity(rt_se);
  
+       if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
        inc_cpu_load(rq, p->se.load.weight);
  }
  
@@@ -728,6 -782,8 +782,8 @@@ static void dequeue_task_rt(struct rq *
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se);
  
+       dequeue_pushable_task(rq, p);
        dec_cpu_load(rq, p->se.load.weight);
  }
  
@@@ -878,7 -934,7 +934,7 @@@ static struct sched_rt_entity *pick_nex
        return next;
  }
  
- static struct task_struct *pick_next_task_rt(struct rq *rq)
+ static struct task_struct *_pick_next_task_rt(struct rq *rq)
  {
        struct sched_rt_entity *rt_se;
        struct task_struct *p;
  
        p = rt_task_of(rt_se);
        p->se.exec_start = rq->clock;
+       return p;
+ }
+ static struct task_struct *pick_next_task_rt(struct rq *rq)
+ {
+       struct task_struct *p = _pick_next_task_rt(rq);
+       /* The running task is never eligible for pushing */
+       if (p)
+               dequeue_pushable_task(rq, p);
        return p;
  }
  
@@@ -907,6 -975,13 +975,13 @@@ static void put_prev_task_rt(struct rq 
  {
        update_curr_rt(rq);
        p->se.exec_start = 0;
+       /*
+        * The previous task needs to be made eligible for pushing
+        * if it is still active
+        */
+       if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+               enqueue_pushable_task(rq, p);
  }
  
  #ifdef CONFIG_SMP
@@@ -1072,7 -1147,7 +1147,7 @@@ static struct rq *find_lock_lowest_rq(s
                }
  
                /* If this rq is still suitable use it. */
-               if (lowest_rq->rt.highest_prio > task->prio)
+               if (lowest_rq->rt.highest_prio.curr > task->prio)
                        break;
  
                /* try again */
        return lowest_rq;
  }
  
+ static inline int has_pushable_tasks(struct rq *rq)
+ {
+       return !plist_head_empty(&rq->rt.pushable_tasks);
+ }
+ static struct task_struct *pick_next_pushable_task(struct rq *rq)
+ {
+       struct task_struct *p;
+       if (!has_pushable_tasks(rq))
+               return NULL;
+       p = plist_first_entry(&rq->rt.pushable_tasks,
+                             struct task_struct, pushable_tasks);
+       BUG_ON(rq->cpu != task_cpu(p));
+       BUG_ON(task_current(rq, p));
+       BUG_ON(p->rt.nr_cpus_allowed <= 1);
+       BUG_ON(!p->se.on_rq);
+       BUG_ON(!rt_task(p));
+       return p;
+ }
  /*
   * If the current CPU has more than one RT task, see if the non
   * running task can migrate over to a CPU that is running a task
@@@ -1092,13 -1192,11 +1192,11 @@@ static int push_rt_task(struct rq *rq
  {
        struct task_struct *next_task;
        struct rq *lowest_rq;
-       int ret = 0;
-       int paranoid = RT_MAX_TRIES;
  
        if (!rq->rt.overloaded)
                return 0;
  
-       next_task = pick_next_highest_task_rt(rq, -1);
+       next_task = pick_next_pushable_task(rq);
        if (!next_task)
                return 0;
  
                struct task_struct *task;
                /*
                 * find lock_lowest_rq releases rq->lock
-                * so it is possible that next_task has changed.
-                * If it has, then try again.
+                * so it is possible that next_task has migrated.
+                *
+                * We need to make sure that the task is still on the same
+                * run-queue and is also still the next task eligible for
+                * pushing.
                 */
-               task = pick_next_highest_task_rt(rq, -1);
-               if (unlikely(task != next_task) && task && paranoid--) {
-                       put_task_struct(next_task);
-                       next_task = task;
-                       goto retry;
+               task = pick_next_pushable_task(rq);
+               if (task_cpu(next_task) == rq->cpu && task == next_task) {
+                       /*
+                        * If we get here, the task hasn't moved at all, but
+                        * it has failed to push.  We will not try again,
+                        * since the other cpus will pull from us when they
+                        * are ready.
+                        */
+                       dequeue_pushable_task(rq, next_task);
+                       goto out;
                }
-               goto out;
+               if (!task)
+                       /* No more tasks, just exit */
+                       goto out;
+               /*
+                * Something has shifted, try again.
+                */
+               put_task_struct(next_task);
+               next_task = task;
+               goto retry;
        }
  
        deactivate_task(rq, next_task, 0);
  
        double_unlock_balance(rq, lowest_rq);
  
-       ret = 1;
  out:
        put_task_struct(next_task);
  
-       return ret;
+       return 1;
  }
  
- /*
-  * TODO: Currently we just use the second highest prio task on
-  *       the queue, and stop when it can't migrate (or there's
-  *       no more RT tasks).  There may be a case where a lower
-  *       priority RT task has a different affinity than the
-  *       higher RT task. In this case the lower RT task could
-  *       possibly be able to migrate where as the higher priority
-  *       RT task could not.  We currently ignore this issue.
-  *       Enhancements are welcome!
-  */
  static void push_rt_tasks(struct rq *rq)
  {
        /* push_rt_task will return true if it moved an RT */
  static int pull_rt_task(struct rq *this_rq)
  {
        int this_cpu = this_rq->cpu, ret = 0, cpu;
-       struct task_struct *p, *next;
+       struct task_struct *p;
        struct rq *src_rq;
  
        if (likely(!rt_overloaded(this_rq)))
                return 0;
  
-       next = pick_next_task_rt(this_rq);
        for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
  
                src_rq = cpu_rq(cpu);
+               /*
+                * Don't bother taking the src_rq->lock if the next highest
+                * task is known to be lower-priority than our current task.
+                * This may look racy, but if this value is about to go
+                * logically higher, the src_rq will push this task away.
+                * And if it's going logically lower, we do not care
+                */
+               if (src_rq->rt.highest_prio.next >=
+                   this_rq->rt.highest_prio.curr)
+                       continue;
                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
-                * steal our next task - hence we must cause
-                * the caller to recalculate the next task
-                * in that case:
+                * alter this_rq
                 */
-               if (double_lock_balance(this_rq, src_rq)) {
-                       struct task_struct *old_next = next;
-                       next = pick_next_task_rt(this_rq);
-                       if (next != old_next)
-                               ret = 1;
-               }
+               double_lock_balance(this_rq, src_rq);
  
                /*
                 * Are there still pullable RT tasks?
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
-               if (p && (!next || (p->prio < next->prio))) {
+               if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);
  
                         * This is just that p is wakeing up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
-                        * current task on the run queue or
-                        * this_rq next task is lower in prio than
-                        * the current task on that rq.
+                        * current task on the run queue
                         */
-                       if (p->prio < src_rq->curr->prio ||
-                           (next && next->prio < src_rq->curr->prio))
+                       if (p->prio < src_rq->curr->prio)
                                goto skip;
  
                        ret = 1;
                         * case there's an even higher prio task
                         * in another runqueue. (low likelyhood
                         * but possible)
-                        *
-                        * Update next so that we won't pick a task
-                        * on another cpu with a priority lower (or equal)
-                        * than the one we just picked.
                         */
-                       next = p;
                }
   skip:
                double_unlock_balance(this_rq, src_rq);
  static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  {
        /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
                pull_rt_task(rq);
  }
  
+ /*
+  * assumes rq->lock is held
+  */
+ static int needs_post_schedule_rt(struct rq *rq)
+ {
+       return has_pushable_tasks(rq);
+ }
  static void post_schedule_rt(struct rq *rq)
  {
        /*
-        * If we have more than one rt_task queued, then
-        * see if we can push the other rt_tasks off to other CPUS.
-        * Note we may release the rq lock, and since
-        * the lock was owned by prev, we need to release it
-        * first via finish_lock_switch and then reaquire it here.
+        * This is only called if needs_post_schedule_rt() indicates that
+        * we need to push tasks away
         */
-       if (unlikely(rq->rt.overloaded)) {
-               spin_lock_irq(&rq->lock);
-               push_rt_tasks(rq);
-               spin_unlock_irq(&rq->lock);
-       }
+       spin_lock_irq(&rq->lock);
+       push_rt_tasks(rq);
+       spin_unlock_irq(&rq->lock);
  }
  
  /*
@@@ -1288,7 -1389,8 +1389,8 @@@ static void task_wake_up_rt(struct rq *
  {
        if (!task_running(rq, p) &&
            !test_tsk_need_resched(rq->curr) &&
-           rq->rt.overloaded)
+           has_pushable_tasks(rq) &&
+           p->rt.nr_cpus_allowed > 1)
                push_rt_tasks(rq);
  }
  
@@@ -1324,6 -1426,24 +1426,24 @@@ static void set_cpus_allowed_rt(struct 
        if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);
  
+               if (!task_current(rq, p)) {
+                       /*
+                        * Make sure we dequeue this task from the pushable list
+                        * before going further.  It will either remain off of
+                        * the list because we are no longer pushable, or it
+                        * will be requeued.
+                        */
+                       if (p->rt.nr_cpus_allowed > 1)
+                               dequeue_pushable_task(rq, p);
+                       /*
+                        * Requeue if our weight is changing and still > 1
+                        */
+                       if (weight > 1)
+                               enqueue_pushable_task(rq, p);
+               }
                if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
                } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@@ -1346,7 -1466,7 +1466,7 @@@ static void rq_online_rt(struct rq *rq
  
        __enable_runtime(rq);
  
-       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+       cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
  }
  
  /* Assumes rq->lock is held */
@@@ -1383,8 -1503,7 +1503,8 @@@ static inline void init_sched_rt_class(
        unsigned int i;
  
        for_each_possible_cpu(i)
 -              alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
 +              alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
 +                                      GFP_KERNEL, cpu_to_node(i));
  }
  #endif /* CONFIG_SMP */
  
@@@ -1438,7 -1557,7 +1558,7 @@@ static void prio_changed_rt(struct rq *
                 * can release the rq lock and p could migrate.
                 * Only reschedule if p is still on the same runqueue.
                 */
-               if (p->prio > rq->rt.highest_prio && rq->curr == p)
+               if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
                        resched_task(p);
  #else
                /* For UP simply resched on drop of prio */
@@@ -1509,6 -1628,9 +1629,9 @@@ static void set_curr_task_rt(struct rq 
        struct task_struct *p = rq->curr;
  
        p->se.exec_start = rq->clock;
+       /* The running task is never eligible for pushing */
+       dequeue_pushable_task(rq, p);
  }
  
  static const struct sched_class rt_sched_class = {
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
        .pre_schedule           = pre_schedule_rt,
+       .needs_post_schedule    = needs_post_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
        .switched_from          = switched_from_rt,