sched: add new API sched_setscheduler_nocheck: add a flag to control access checks

[linux-2.6-omap-h63xx.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index bfb8ad8ed1717bf95f82ddf7ea8b5b40bb7fbe7b..8d7c246ab864cbc35a0af29aa52da05546d83baa 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock);
  #endif
  
  /*
- * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * The weight of a cfs_rq is the sum of the weights of the entities
+ * queued on that cfs_rq, so the weight of an entity should not be
+ * too large, and neither should the shares value of a task group.
   * (The default weight is 1024 - so there's no practical
   *  limitation from this.)
   */
  #define MIN_SHARES     2
-#define MAX_SHARES     (ULONG_MAX - 1)
+#define MAX_SHARES     (1UL << 18)
  
  static int init_task_group_load = INIT_TASK_GROUP_LOAD;
  #endif
@@ -1124,6 +1127,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
         return HRTIMER_NORESTART;
  }
  
+#ifdef CONFIG_SMP
  static void hotplug_hrtick_disable(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
@@ -1179,6 +1183,7 @@ static void init_hrtick(void)
  {
         hotcpu_notifier(hotplug_hrtick, 0);
  }
+#endif /* CONFIG_SMP */
  
  static void init_rq_hrtick(struct rq *rq)
  {
@@ -1337,8 +1342,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
  {
         u64 tmp;
  
-       if (!lw->inv_weight)
-               lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+       if (!lw->inv_weight) {
+               if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+                       lw->inv_weight = 1;
+               else
+                       lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+                               / (lw->weight+1);
+       }
  
         tmp = (u64)delta_exec * weight;
         /*
@@ -4159,12 +4169,10 @@ need_resched_nonpreemptible:
         clear_tsk_need_resched(prev);
  
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-               if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-                               signal_pending(prev))) {
+               if (unlikely(signal_pending_state(prev->state, prev)))
                         prev->state = TASK_RUNNING;
-               } else {
+               else
                         deactivate_task(rq, prev, 1);
-               }
                 switch_count = &prev->nvcsw;
         }
  
@@ -4738,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
         set_load_weight(p);
  }
  
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
-                      struct sched_param *param)
+static int __sched_setscheduler(struct task_struct *p, int policy,
+                               struct sched_param *param, bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
@@ -4779,7 +4779,7 @@ recheck:
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
-       if (!capable(CAP_SYS_NICE)) {
+       if (user && !capable(CAP_SYS_NICE)) {
                 if (rt_policy(policy)) {
                         unsigned long rlim_rtprio;
  
@@ -4815,7 +4815,8 @@ recheck:
          * Do not allow realtime tasks into groups that have no runtime
          * assigned.
          */
-       if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+       if (user
+           && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
                 return -EPERM;
  #endif
  
@@ -4864,8 +4865,39 @@ recheck:
  
         return 0;
  }
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+                      struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, true);
+}
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+                              struct sched_param *param)
+{
+       return __sched_setscheduler(p, policy, param, false);
+}
+
  static int
  do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  {
@@ -6871,7 +6903,12 @@ static int default_relax_domain_level = -1;
  
  static int __init setup_relax_domain_level(char *str)
  {
-       default_relax_domain_level = simple_strtoul(str, NULL, 0);
+       unsigned long val;
+
+       val = simple_strtoul(str, NULL, 0);
+       if (val < SD_LV_MAX)
+               default_relax_domain_level = val;
+
         return 1;
  }
  __setup("relax_domain_level=", setup_relax_domain_level);
@@ -7229,6 +7266,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  {
  }
  
+/*
+ * Free current domain masks.
+ * Called after all cpus are attached to the NULL domain.
+ */
+static void free_sched_domains(void)
+{
+       ndoms_cur = 0;
+       if (doms_cur != &fallback_doms)
+               kfree(doms_cur);
+       doms_cur = &fallback_doms;
+}
+
  /*
   * Set up scheduler domains and groups. Callers must hold the hotplug lock.
   * For now this just excludes isolated cpus, but could be used to
@@ -7376,6 +7425,7 @@ int arch_reinit_sched_domains(void)
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
         detach_destroy_domains(&cpu_online_map);
+       free_sched_domains();
         err = arch_init_sched_domains(&cpu_online_map);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
@@ -7461,6 +7511,7 @@ static int update_sched_domains(struct notifier_block *nfb,
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
                 detach_destroy_domains(&cpu_online_map);
+               free_sched_domains();
                 return NOTIFY_OK;
  
         case CPU_UP_CANCELED:
@@ -7479,8 +7530,16 @@ static int update_sched_domains(struct notifier_block *nfb,
                 return NOTIFY_DONE;
         }
  
+#ifndef CONFIG_CPUSETS
+       /*
+        * Create default domain partitioning if cpusets are disabled.
+        * Otherwise we let cpusets rebuild the domains based on the
+        * current setup.
+        */
+
         /* The hotplug lock is already held by cpu_up/cpu_down */
         arch_init_sched_domains(&cpu_online_map);
+#endif
  
         return NOTIFY_OK;
  }
@@ -7620,7 +7679,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         else
                 rt_se->rt_rq = parent->my_q;
  
-       rt_se->rt_rq = &rq->rt;
         rt_se->my_q = rt_rq;
         rt_se->parent = parent;
         INIT_LIST_HEAD(&rt_se->run_list);
@@ -8342,7 +8400,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
  #ifdef CONFIG_CGROUP_SCHED
  static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
  {
-       struct task_group *tgi, *parent = tg->parent;
+       struct task_group *tgi, *parent = tg ? tg->parent : NULL;
         unsigned long total = 0;
  
         if (!parent) {