Merge branch 'sched/latest' of git://git.kernel.org/pub/scm/linux/kernel/git/ghaskins...

[linux-2.6-omap-h63xx.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 27ba1d642f0f0c4c370e81a61b067874310532d0..dd1a1466c1e6a7412ca43022185c8aed0f76c88d 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -464,11 +464,15 @@ struct rt_rq {
         struct rt_prio_array active;
         unsigned long rt_nr_running;
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       int highest_prio; /* highest queued rt task prio */
+       struct {
+               int curr; /* highest queued rt task prio */
+               int next; /* next highest */
+       } highest_prio;
  #endif
  #ifdef CONFIG_SMP
         unsigned long rt_nr_migratory;
         int overloaded;
+       struct plist_head pushable_tasks;
  #endif
         int rt_throttled;
         u64 rt_time;
@@ -1607,21 +1611,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  
  #endif
  
+#ifdef CONFIG_PREEMPT
+
  /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
   */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       spin_unlock(&this_rq->lock);
+       double_rq_lock(this_rq, busiest);
+
+       return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
  {
         int ret = 0;
  
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
         if (unlikely(!spin_trylock(&busiest->lock))) {
                 if (busiest < this_rq) {
                         spin_unlock(&this_rq->lock);
@@ -1634,6 +1659,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         return ret;
  }
  
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+
+       return _double_lock_balance(this_rq, busiest);
+}
+
  static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(busiest->lock)
  {
@@ -2445,6 +2486,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
         /* Want to start with kernel preemption disabled. */
         task_thread_info(p)->preempt_count = 1;
  #endif
+       plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
         put_cpu();
  }
  
@@ -2585,6 +2628,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
  {
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
+#ifdef CONFIG_SMP
+       int post_schedule = 0;
+
+       if (current->sched_class->needs_post_schedule)
+               post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
  
         rq->prev_mm = NULL;
  
@@ -2603,7 +2652,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
-       if (current->sched_class->post_schedule)
+       if (post_schedule)
                 current->sched_class->post_schedule(rq);
  #endif
  
@@ -2984,6 +3033,16 @@ next:
         pulled++;
         rem_load_move -= p->se.load.weight;
  
+#ifdef CONFIG_PREEMPT
+       /*
+        * NEWIDLE balancing is a source of latency, so preemptible kernels
+        * will stop after the first task is pulled to minimize the critical
+        * section.
+        */
+       if (idle == CPU_NEWLY_IDLE)
+               goto out;
+#endif
+
         /*
          * We only want to steal up to the prescribed amount of weighted load.
          */
@@ -3030,9 +3089,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                 sd, idle, all_pinned, &this_best_prio);
                 class = class->next;
  
+#ifdef CONFIG_PREEMPT
+               /*
+                * NEWIDLE balancing is a source of latency, so preemptible
+                * kernels will stop after the first task is pulled to minimize
+                * the critical section.
+                */
                 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                         break;
-
+#endif
         } while (class && max_load_move > total_load_moved);
  
         return total_load_moved > 0;
@@ -3715,7 +3780,7 @@ redo:
                  * don't kick the migration_thread, if the curr
                  * task on busiest cpu can't be moved to this_cpu
                  */
-               if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+               if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
                         double_unlock_balance(this_rq, busiest);
                         all_pinned = 1;
                         return ld_moved;
@@ -3728,8 +3793,13 @@ redo:
                 }
  
                 double_unlock_balance(this_rq, busiest);
+               /*
+                * Should not call ttwu (try_to_wake_up) while holding a rq->lock
+                */
+               spin_unlock(&this_rq->lock);
                 if (active_balance)
                         wake_up_process(busiest->migration_thread);
+               spin_lock(&this_rq->lock);
  
         } else
                 sd->nr_balance_failed = 0;
@@ -4150,13 +4220,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
   * Account user cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
   */
-void account_user_time(struct task_struct *p, cputime_t cputime)
+void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
  {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
         cputime64_t tmp;
  
+       /* Add user time to process. */
         p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
         account_group_user_time(p, cputime);
  
         /* Add user time to cpustat. */
@@ -4173,51 +4247,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
   * Account guest cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
   */
-static void account_guest_time(struct task_struct *p, cputime_t cputime)
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
  {
         cputime64_t tmp;
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
  
         tmp = cputime_to_cputime64(cputime);
  
+       /* Add guest time to process. */
         p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
         account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
  
+       /* Add guest time to cpustat. */
         cpustat->user = cputime64_add(cpustat->user, tmp);
         cpustat->guest = cputime64_add(cpustat->guest, tmp);
  }
  
-/*
- * Account scaled user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- */
-void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
-{
-       p->utimescaled = cputime_add(p->utimescaled, cputime);
-}
-
  /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
   * @hardirq_offset: the offset to subtract from hardirq_count()
   * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
   */
  void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime)
+                        cputime_t cputime, cputime_t cputime_scaled)
  {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       struct rq *rq = this_rq();
         cputime64_t tmp;
  
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime);
+               account_guest_time(p, cputime, cputime_scaled);
                 return;
         }
  
+       /* Add system time to process. */
         p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
         account_group_system_time(p, cputime);
  
         /* Add system time to cpustat. */
@@ -4226,48 +4297,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
         else if (softirq_count())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       else if (p != rq->idle)
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-       else if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
         else
-               cpustat->idle = cputime64_add(cpustat->idle, tmp);
+               cpustat->system = cputime64_add(cpustat->system, tmp);
+
         /* Account for system time used */
         acct_update_integrals(p);
  }
  
  /*
- * Account scaled system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
   */
-void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+void account_steal_time(cputime_t cputime)
  {
-       p->stimescaled = cputime_add(p->stimescaled, cputime);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+
+       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
  }
  
  /*
- * Account for involuntary wait time.
- * @p: the process from which the cpu time has been stolen
- * @steal: the cpu time spent in involuntary wait
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
   */
-void account_steal_time(struct task_struct *p, cputime_t steal)
+void account_idle_time(cputime_t cputime)
  {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp = cputime_to_cputime64(steal);
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
         struct rq *rq = this_rq();
  
-       if (p == rq->idle) {
-               p->stime = cputime_add(p->stime, steal);
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-               else
-                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
-       } else
-               cpustat->steal = cputime64_add(cpustat->steal, tmp);
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+       else
+               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
  }
  
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+       cputime_t one_jiffy = jiffies_to_cputime(1);
+       cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+       struct rq *rq = this_rq();
+
+       if (user_tick)
+               account_user_time(p, one_jiffy, one_jiffy_scaled);
+       else if (p != rq->idle)
+               account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time (converted to cputime and
+ * passed on to account_steal_time()).
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+       account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of idle ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+       account_idle_time(jiffies_to_cputime(ticks));
+}
+
+#endif
+
  /*
   * Use precise platform statistics if available:
   */
@@ -6220,9 +6327,7 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
         int dest_cpu;
-       /* FIXME: Use cpumask_of_node here. */
-       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
-       const struct cpumask *nodemask = &_nodemask;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
  
  again:
         /* Look for allowed, online CPU in same node. */
@@ -6922,7 +7027,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
         spin_unlock_irqrestore(&rq->lock, flags);
  }
  
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
  {
         memset(rd, 0, sizeof(*rd));
  
@@ -6935,7 +7040,7 @@ static int init_rootdomain(struct root_domain *rd, bool bootmem)
         }
  
         if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
-               goto free_rd;
+               goto out;
         if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
         if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
@@ -6951,8 +7056,7 @@ free_online:
         free_cpumask_var(rd->online);
  free_span:
         free_cpumask_var(rd->span);
-free_rd:
-       kfree(rd);
+out:
         return -ENOMEM;
  }
  
@@ -7133,21 +7237,18 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  static void sched_domain_node_span(int node, struct cpumask *span)
  {
         nodemask_t used_nodes;
-       /* FIXME: use cpumask_of_node() */
-       node_to_cpumask_ptr(nodemask, node);
         int i;
  
-       cpus_clear(*span);
+       cpumask_clear(span);
         nodes_clear(used_nodes);
  
-       cpus_or(*span, *span, *nodemask);
+       cpumask_or(span, span, cpumask_of_node(node));
         node_set(node, used_nodes);
  
         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                 int next_node = find_next_best_node(node, &used_nodes);
  
-               node_to_cpumask_ptr_next(nodemask, next_node);
-               cpus_or(*span, *span, *nodemask);
+               cpumask_or(span, span, cpumask_of_node(next_node));
         }
  }
  #endif /* CONFIG_NUMA */
@@ -7227,9 +7328,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
  {
         int group;
  #ifdef CONFIG_SCHED_MC
-       /* FIXME: Use cpu_coregroup_mask. */
-       *mask = cpu_coregroup_map(cpu);
-       cpus_and(*mask, *mask, *cpu_map);
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
         group = cpumask_first(mask);
  #elif defined(CONFIG_SCHED_SMT)
         cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
@@ -7259,10 +7358,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
                                  struct cpumask *nodemask)
  {
         int group;
-       /* FIXME: use cpumask_of_node */
-       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
  
-       cpumask_and(nodemask, pnodemask, cpu_map);
+       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
         group = cpumask_first(nodemask);
  
         if (sg)
@@ -7313,10 +7410,8 @@ static void free_sched_groups(const struct cpumask *cpu_map,
  
                 for (i = 0; i < nr_node_ids; i++) {
                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
-                       /* FIXME: Use cpumask_of_node */
-                       node_to_cpumask_ptr(pnodemask, i);
  
-                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                         if (cpumask_empty(nodemask))
                                 continue;
  
@@ -7525,9 +7620,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
         for_each_cpu(i, cpu_map) {
                 struct sched_domain *sd = NULL, *p;
  
-               /* FIXME: use cpumask_of_node */
-               *nodemask = node_to_cpumask(cpu_to_node(i));
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
  
  #ifdef CONFIG_NUMA
                 if (cpumask_weight(cpu_map) >
@@ -7568,9 +7661,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 sd = &per_cpu(core_domains, i).sd;
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
-               *sched_domain_span(sd) = cpu_coregroup_map(i);
-               cpumask_and(sched_domain_span(sd),
-                           sched_domain_span(sd), cpu_map);
+               cpumask_and(sched_domain_span(sd), cpu_map,
+                                                  cpu_coregroup_mask(i));
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7606,9 +7698,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  #ifdef CONFIG_SCHED_MC
         /* Set up multi-core groups */
         for_each_cpu(i, cpu_map) {
-               /* FIXME: Use cpu_coregroup_mask */
-               *this_core_map = cpu_coregroup_map(i);
-               cpus_and(*this_core_map, *this_core_map, *cpu_map);
+               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
                 if (i != cpumask_first(this_core_map))
                         continue;
  
@@ -7620,9 +7710,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  
         /* Set up physical groups */
         for (i = 0; i < nr_node_ids; i++) {
-               /* FIXME: Use cpumask_of_node */
-               *nodemask = node_to_cpumask(i);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                 if (cpumask_empty(nodemask))
                         continue;
  
@@ -7644,11 +7732,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 struct sched_group *sg, *prev;
                 int j;
  
-               /* FIXME: Use cpumask_of_node */
-               *nodemask = node_to_cpumask(i);
                 cpumask_clear(covered);
-
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
                 if (cpumask_empty(nodemask)) {
                         sched_group_nodes[i] = NULL;
                         continue;
@@ -7679,8 +7764,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  
                 for (j = 0; j < nr_node_ids; j++) {
                         int n = (i + j) % nr_node_ids;
-                       /* FIXME: Use cpumask_of_node */
-                       node_to_cpumask_ptr(pnodemask, n);
  
                         cpumask_complement(notcovered, covered);
                         cpumask_and(tmpmask, notcovered, cpu_map);
@@ -7688,7 +7771,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                         if (cpumask_empty(tmpmask))
                                 break;
  
-                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
                         if (cpumask_empty(tmpmask))
                                 continue;
  
@@ -7973,7 +8056,7 @@ match2:
  }
  
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-int arch_reinit_sched_domains(void)
+static void arch_reinit_sched_domains(void)
  {
         get_online_cpus();
  
@@ -7982,13 +8065,10 @@ int arch_reinit_sched_domains(void)
  
         rebuild_sched_domains();
         put_online_cpus();
-
-       return 0;
  }
  
  static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  {
-       int ret;
         unsigned int level = 0;
  
         if (sscanf(buf, "%u", &level) != 1)
@@ -8009,9 +8089,9 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
         else
                 sched_mc_power_savings = level;
  
-       ret = arch_reinit_sched_domains();
+       arch_reinit_sched_domains();
  
-       return ret ? ret : count;
+       return count;
  }
  
  #ifdef CONFIG_SCHED_MC
@@ -8046,7 +8126,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
                    sched_smt_power_savings_store);
  #endif
  
-int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
  {
         int err = 0;
  
@@ -8186,11 +8266,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
         __set_bit(MAX_RT_PRIO, array->bitmap);
  
  #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-       rt_rq->highest_prio = MAX_RT_PRIO;
+       rt_rq->highest_prio.curr = MAX_RT_PRIO;
+       rt_rq->highest_prio.next = MAX_RT_PRIO;
  #endif
  #ifdef CONFIG_SMP
         rt_rq->rt_nr_migratory = 0;
         rt_rq->overloaded = 0;
+       plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
  #endif
  
         rt_rq->rt_time = 0;