Merge branches 'sched/rt' and 'sched/urgent' into sched/core

[linux-2.6-omap-h63xx.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 2b703f1fac3a1de8cefd93509cf87ae3813c22c5..1dae85a1221ad7b2214a12bb939b07cd1a14576c 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch);
  DEFINE_TRACE(sched_migrate_task);
  
  #ifdef CONFIG_SMP
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
   * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@ -1326,8 +1329,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
   * slice expiry etc.
   */
  
-#define WEIGHT_IDLEPRIO                2
-#define WMULT_IDLEPRIO         (1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
  
  /*
   * Nice levels are multiplicative, with a gentle 10% change for every
@@ -1745,6 +1748,9 @@ static void update_avg(u64 *avg, u64 sample)
  
  static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  {
+       if (wakeup)
+               p->se.start_runtime = p->se.sum_exec_runtime;
+
         sched_info_queued(p);
         p->sched_class->enqueue_task(rq, p, wakeup);
         p->se.on_rq = 1;
@@ -1752,10 +1758,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
  
  static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
  {
-       if (sleep && p->se.last_wakeup) {
-               update_avg(&p->se.avg_overlap,
-                          p->se.sum_exec_runtime - p->se.last_wakeup);
-               p->se.last_wakeup = 0;
+       if (sleep) {
+               if (p->se.last_wakeup) {
+                       update_avg(&p->se.avg_overlap,
+                               p->se.sum_exec_runtime - p->se.last_wakeup);
+                       p->se.last_wakeup = 0;
+               } else {
+                       update_avg(&p->se.avg_wakeup,
+                               sysctl_sched_wakeup_granularity);
+               }
         }
  
         sched_info_dequeued(p);
@@ -2306,6 +2317,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
         if (!sched_feat(SYNC_WAKEUPS))
                 sync = 0;
  
+       if (!sync) {
+               if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+                         p->se.avg_overlap < sysctl_sched_migration_cost)
+                       sync = 1;
+       } else {
+               if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+                         p->se.avg_overlap >= sysctl_sched_migration_cost)
+                       sync = 0;
+       }
+
  #ifdef CONFIG_SMP
         if (sched_feat(LB_WAKEUP_UPDATE)) {
                 struct sched_domain *sd;
@@ -2385,6 +2406,22 @@ out_activate:
         activate_task(rq, p, 1);
         success = 1;
  
+       /*
+        * Only attribute actual wakeups done by this task.
+        */
+       if (!in_interrupt()) {
+               struct sched_entity *se = &current->se;
+               u64 sample = se->sum_exec_runtime;
+
+               if (se->last_wakeup)
+                       sample -= se->last_wakeup;
+               else
+                       sample -= se->start_runtime;
+               update_avg(&se->avg_wakeup, sample);
+
+               se->last_wakeup = se->sum_exec_runtime;
+       }
+
  out_running:
         trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
@@ -2395,8 +2432,6 @@ out_running:
                 p->sched_class->task_wake_up(rq, p);
  #endif
  out:
-       current->se.last_wakeup = current->se.sum_exec_runtime;
-
         task_rq_unlock(rq, &flags);
  
         return success;
@@ -2426,6 +2461,8 @@ static void __sched_fork(struct task_struct *p)
         p->se.prev_sum_exec_runtime     = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
+       p->se.start_runtime             = 0;
+       p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
  
  #ifdef CONFIG_SCHEDSTATS
         p->se.wait_start                = 0;
@@ -3944,19 +3981,24 @@ int select_nohz_load_balancer(int stop_tick)
         int cpu = smp_processor_id();
  
         if (stop_tick) {
-               cpumask_set_cpu(cpu, nohz.cpu_mask);
                 cpu_rq(cpu)->in_nohz_recently = 1;
  
-               /*
-                * If we are going offline and still the leader, give up!
-                */
-               if (!cpu_active(cpu) &&
-                   atomic_read(&nohz.load_balancer) == cpu) {
+               if (!cpu_active(cpu)) {
+                       if (atomic_read(&nohz.load_balancer) != cpu)
+                               return 0;
+
+                       /*
+                        * If we are going offline and still the leader,
+                        * give up!
+                        */
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
+
                         return 0;
                 }
  
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
+
                 /* time for ilb owner also to sleep */
                 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                         if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4504,7 +4546,7 @@ void __kprobes sub_preempt_count(int val)
         /*
          * Underflow?
          */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
                 return;
         /*
          * Is the spinlock portion underflowing?
@@ -5190,7 +5232,7 @@ int can_nice(const struct task_struct *p, const int nice)
   * sys_setpriority is a more generic, but much slower function that
   * does similar things.
   */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
  {
         long nice, retval;
  
@@ -5497,8 +5539,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
   * @policy: new policy.
   * @param: structure containing the new RT priority.
   */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+               struct sched_param __user *, param)
  {
         /* negative values for policy are not valid */
         if (policy < 0)
@@ -5512,7 +5554,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
   * @pid: the pid in question.
   * @param: structure containing the new RT priority.
   */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  {
         return do_sched_setscheduler(pid, -1, param);
  }
@@ -5521,7 +5563,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
   * sys_sched_getscheduler - get the policy (scheduling class) of a thread
   * @pid: the pid in question.
   */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  {
         struct task_struct *p;
         int retval;
@@ -5546,7 +5588,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
   * @pid: the pid in question.
   * @param: structure containing the RT priority.
   */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
  {
         struct sched_param lp;
         struct task_struct *p;
@@ -5664,8 +5706,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
   * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   * @user_mask_ptr: user-space pointer to the new cpu mask
   */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-                                     unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
  {
         cpumask_var_t new_mask;
         int retval;
@@ -5712,8 +5754,8 @@ out_unlock:
   * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   * @user_mask_ptr: user-space pointer to hold the current cpu mask
   */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-                                     unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+               unsigned long __user *, user_mask_ptr)
  {
         int ret;
         cpumask_var_t mask;
@@ -5742,7 +5784,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
   * This function yields the current CPU to other tasks. If there are no
   * other threads running on this CPU then this function will return.
   */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
  {
         struct rq *rq = this_rq_lock();
  
@@ -5883,7 +5925,7 @@ long __sched io_schedule_timeout(long timeout)
   * this syscall returns the maximum rt_priority that can be used
   * by a given scheduling class.
   */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
  {
         int ret = -EINVAL;
  
@@ -5908,7 +5950,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
   * this syscall returns the minimum rt_priority that can be used
   * by a given scheduling class.
   */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  {
         int ret = -EINVAL;
  
@@ -5933,8 +5975,8 @@ asmlinkage long sys_sched_get_priority_min(int policy)
   * this syscall writes the default timeslice value of a given process
   * into the user-space timespec buffer. A value of '0' means infinity.
   */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+               struct timespec __user *, interval)
  {
         struct task_struct *p;
         unsigned int time_slice;
@@ -7349,10 +7391,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
   * groups, so roll our own. Now each node has its own list of groups which
   * gets dynamically allocated.
   */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
  static struct sched_group ***sched_group_nodes_bycpu;
  
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
  static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@ -7627,7 +7669,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
  #ifdef CONFIG_NUMA
                 if (cpumask_weight(cpu_map) >
                                 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i);
+                       sd = &per_cpu(allnodes_domains, i).sd;
                         SD_INIT(sd, ALLNODES);
                         set_domain_attribute(sd, attr);
                         cpumask_copy(sched_domain_span(sd), cpu_map);
@@ -7637,7 +7679,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 } else
                         p = NULL;
  
-               sd = &per_cpu(node_domains, i);
+               sd = &per_cpu(node_domains, i).sd;
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
@@ -7755,7 +7797,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                 for_each_cpu(j, nodemask) {
                         struct sched_domain *sd;
  
-                       sd = &per_cpu(node_domains, j);
+                       sd = &per_cpu(node_domains, j).sd;
                         sd->groups = sg;
                 }
                 sg->__cpu_power = 0;
@@ -9118,6 +9160,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
                 runtime = d->rt_runtime;
         }
  
+#ifdef CONFIG_USER_SCHED
+       if (tg == &root_task_group) {
+               period = global_rt_period();
+               runtime = global_rt_runtime();
+       }
+#endif
+
         /*
          * Cannot have more runtime than the period.
          */