kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20  *              fair scheduling design by Con Kolivas.
  21  *  2007-05-05  Load balancing (smp-nice) and other improvements
  22  *              by Peter Williams
  23  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25  */
  26
  27 #include <linux/mm.h>
  28 #include <linux/module.h>
  29 #include <linux/nmi.h>
  30 #include <linux/init.h>
  31 #include <linux/uaccess.h>
  32 #include <linux/highmem.h>
  33 #include <linux/smp_lock.h>
  34 #include <asm/mmu_context.h>
  35 #include <linux/interrupt.h>
  36 #include <linux/capability.h>
  37 #include <linux/completion.h>
  38 #include <linux/kernel_stat.h>
  39 #include <linux/debug_locks.h>
  40 #include <linux/security.h>
  41 #include <linux/notifier.h>
  42 #include <linux/profile.h>
  43 #include <linux/freezer.h>
  44 #include <linux/vmalloc.h>
  45 #include <linux/blkdev.h>
  46 #include <linux/delay.h>
  47 #include <linux/smp.h>
  48 #include <linux/threads.h>
  49 #include <linux/timer.h>
  50 #include <linux/rcupdate.h>
  51 #include <linux/cpu.h>
  52 #include <linux/cpuset.h>
  53 #include <linux/percpu.h>
  54 #include <linux/kthread.h>
  55 #include <linux/seq_file.h>
  56 #include <linux/sysctl.h>
  57 #include <linux/syscalls.h>
  58 #include <linux/times.h>
  59 #include <linux/tsacct_kern.h>
  60 #include <linux/kprobes.h>
  61 #include <linux/delayacct.h>
  62 #include <linux/reciprocal_div.h>
  63 #include <linux/unistd.h>
  64
  65 #include <asm/tlb.h>
  66
  67 /*
  68  * Scheduler clock - returns current time in nanosec units.
  69  * This is default implementation.
  70  * Architectures and sub-architectures can override this.
  71  */
  72 unsigned long long __attribute__((weak)) sched_clock(void)
  73 {
  74         return (unsigned long long)jiffies * (1000000000 / HZ);
  75 }
  76
  77 /*
  78  * Convert user-nice values [ -20 ... 0 ... 19 ]
  79  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  80  * and back.
     *
     * NICE_TO_PRIO() and PRIO_TO_NICE() are inverses of each other;
     * TASK_NICE() applies the reverse mapping to a task's ->static_prio.
  81  */
  82 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  83 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  84 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  85
  86 /*
  87  * 'User priority' is the nice value converted to something we
  88  * can work with better when scaling various scheduler parameters,
  89  * it's a [ 0 ... 39 ] range.
     *
     * USER_PRIO() takes a priority value, TASK_USER_PRIO() takes a task
     * pointer, and MAX_USER_PRIO is USER_PRIO() applied to MAX_PRIO.
  90  */
  91 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
  92 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
  93 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  94
  95 /*
  96  * Some helpers for converting nanosecond timing to jiffy resolution
     * and back. Both use the integer number of nanoseconds per jiffy,
     * (1000000000 / HZ), so NS_TO_JIFFIES() truncates.
  97  */
  98 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
  99 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
 100
     /*
      * NICE_0_LOAD is the weight stored for nice 0 in prio_to_weight[];
      * NICE_0_SHIFT is an alias for SCHED_LOAD_SHIFT.
      */
 101 #define NICE_0_LOAD             SCHED_LOAD_SCALE
 102 #define NICE_0_SHIFT            SCHED_LOAD_SHIFT
 103
 104 /*
 105  * These are the 'tuning knobs' of the scheduler:
 106  *
 107  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 108  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 109  * Timeslices get refilled after they expire.
     *
     * Both macros below are expressed in jiffies (milliseconds * HZ / 1000).
 110  */
 111 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
 112 #define DEF_TIMESLICE           (100 * HZ / 1000)
 113
 114 #ifdef CONFIG_SMP
 115 /*
 116  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 117  * Since cpu_power is a 'constant', we can use a reciprocal divide.
 118  */
 119 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 120 {
 121         return reciprocal_divide(load, sg->reciprocal_cpu_power);
 122 }
 123
 124 /*
 125  * Each time a sched group cpu_power is changed,
 126  * we must compute its reciprocal value
 127  */
 128 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 129 {
 130         sg->__cpu_power += val;
 131         sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 132 }
 133 #endif
 134
 135 #define SCALE_PRIO(x, prio) \
 136         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 137
 138 /*
 139  * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 140  * to time slice values: [800ms ... 100ms ... 5ms]
 141  */
 142 static unsigned int static_prio_timeslice(int static_prio)
 143 {
 144         if (static_prio == NICE_TO_PRIO(19))
 145                 return 1;
 146
 147         if (static_prio < NICE_TO_PRIO(0))
 148                 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 149         else
 150                 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 151 }
 152
 153 static inline int rt_policy(int policy)
 154 {
 155         if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
 156                 return 1;
 157         return 0;
 158 }
 159
 160 static inline int task_has_rt_policy(struct task_struct *p)
 161 {
 162         return rt_policy(p->policy);
 163 }
 164
 165 /*
 166  * This is the priority-queue data structure of the RT scheduling class:
 167  */
 168 struct rt_prio_array {
 169         DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 170         struct list_head queue[MAX_RT_PRIO];
 171 };
 172
 173 struct load_stat {
 174         struct load_weight load;
 175         u64 load_update_start, load_update_last;
 176         unsigned long delta_fair, delta_exec, delta_stat;
 177 };
 178
 179 /* CFS-related fields in a runqueue */
 180 struct cfs_rq {
 181         struct load_weight load;
 182         unsigned long nr_running;
 183
 184         s64 fair_clock;
 185         u64 exec_clock;
 186         s64 wait_runtime;
 187         u64 sleeper_bonus;
 188         unsigned long wait_runtime_overruns, wait_runtime_underruns;
 189
 190         struct rb_root tasks_timeline;
 191         struct rb_node *rb_leftmost;
 192         struct rb_node *rb_load_balance_curr;
 193 #ifdef CONFIG_FAIR_GROUP_SCHED
 194         /* 'curr' points to currently running entity on this cfs_rq.
 195          * It is set to NULL otherwise (i.e when none are currently running).
 196          */
 197         struct sched_entity *curr;
 198         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 199
 200         /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 201          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 202          * (like users, containers etc.)
 203          *
 204          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 205          * list is used during load balance.
 206          */
 207         struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
 208 #endif
 209 };
 210
 211 /* Real-Time classes' related field in a runqueue: */
 212 struct rt_rq {
 213         struct rt_prio_array active;
 214         int rt_load_balance_idx;
 215         struct list_head *rt_load_balance_head, *rt_load_balance_curr;
 216 };
 217
 218 /*
 219  * This is the main, per-CPU runqueue data structure.
 220  *
 221  * Locking rule: those places that want to lock multiple runqueues
 222  * (such as the load balancing or the thread migration code), lock
 223  * acquire operations must be ordered by ascending &runqueue.
 224  */
 225 struct rq {
 226         spinlock_t lock;        /* runqueue lock */
 227
 228         /*
 229          * nr_running and cpu_load should be in the same cacheline because
 230          * remote CPUs use both these fields when doing load calculation.
 231          */
 232         unsigned long nr_running;
 233         #define CPU_LOAD_IDX_MAX 5
 234         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 235         unsigned char idle_at_tick;
 236 #ifdef CONFIG_NO_HZ
 237         unsigned char in_nohz_recently;
 238 #endif
 239         struct load_stat ls;    /* capture load from *all* tasks on this cpu */
 240         unsigned long nr_load_updates;
 241         u64 nr_switches;
 242
 243         struct cfs_rq cfs;
 244 #ifdef CONFIG_FAIR_GROUP_SCHED
 245         struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
 246 #endif
 247         struct rt_rq  rt;
 248
 249         /*
 250          * This is part of a global counter where only the total sum
 251          * over all CPUs matters. A task can increase this counter on
 252          * one CPU and if it got migrated afterwards it may decrease
 253          * it on another CPU. Always updated under the runqueue lock:
 254          */
 255         unsigned long nr_uninterruptible;
 256
 257         struct task_struct *curr, *idle;
 258         unsigned long next_balance;
 259         struct mm_struct *prev_mm;
 260
 261         u64 clock, prev_clock_raw;
 262         s64 clock_max_delta;
 263
 264         unsigned int clock_warps, clock_overflows;
 265         unsigned int clock_unstable_events;
 266
 267         atomic_t nr_iowait;
 268
 269 #ifdef CONFIG_SMP
 270         struct sched_domain *sd;
 271
 272         /* For active balancing */
 273         int active_balance;
 274         int push_cpu;
 275         int cpu;                /* cpu of this runqueue */
 276
 277         struct task_struct *migration_thread;
 278         struct list_head migration_queue;
 279 #endif
 280
 281 #ifdef CONFIG_SCHEDSTATS
 282         /* latency stats */
 283         struct sched_info rq_sched_info;
 284
 285         /* sys_sched_yield() stats */
 286         unsigned long yld_exp_empty;
 287         unsigned long yld_act_empty;
 288         unsigned long yld_both_empty;
 289         unsigned long yld_cnt;
 290
 291         /* schedule() stats */
 292         unsigned long sched_switch;
 293         unsigned long sched_cnt;
 294         unsigned long sched_goidle;
 295
 296         /* try_to_wake_up() stats */
 297         unsigned long ttwu_cnt;
 298         unsigned long ttwu_local;
 299 #endif
 300         struct lock_class_key rq_lock_key;
 301 };
 302
     /* One struct rq per CPU; reached through cpu_rq() and this_rq(). */
 303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
     /* NOTE(review): no user of this mutex is visible in this part of the file. */
 304 static DEFINE_MUTEX(sched_hotcpu_mutex);
 305
 306 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 307 {
 308         rq->curr->sched_class->check_preempt_curr(rq, p);
 309 }
 310
 311 static inline int cpu_of(struct rq *rq)
 312 {
 313 #ifdef CONFIG_SMP
 314         return rq->cpu;
 315 #else
 316         return 0;
 317 #endif
 318 }
 319
 320 /*
 321  * Per-runqueue clock, as finegrained as the platform can give us:
 322  */
 323 static unsigned long long __rq_clock(struct rq *rq)
 324 {
 325         u64 prev_raw = rq->prev_clock_raw;
 326         u64 now = sched_clock();
 327         s64 delta = now - prev_raw;
 328         u64 clock = rq->clock;
 329
 330         /*
 331          * Protect against sched_clock() occasionally going backwards:
 332          */
 333         if (unlikely(delta < 0)) {
 334                 clock++;
 335                 rq->clock_warps++;
 336         } else {
 337                 /*
 338                  * Catch too large forward jumps too:
 339                  */
 340                 if (unlikely(delta > 2*TICK_NSEC)) {
 341                         clock++;
 342                         rq->clock_overflows++;
 343                 } else {
 344                         if (unlikely(delta > rq->clock_max_delta))
 345                                 rq->clock_max_delta = delta;
 346                         clock += delta;
 347                 }
 348         }
 349
 350         rq->prev_clock_raw = now;
 351         rq->clock = clock;
 352
 353         return clock;
 354 }
 355
 356 static inline unsigned long long rq_clock(struct rq *rq)
 357 {
 358         int this_cpu = smp_processor_id();
 359
 360         if (this_cpu == cpu_of(rq))
 361                 return __rq_clock(rq);
 362
 363         return rq->clock;
 364 }
 365
 366 /*
 367  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 368  * See detach_destroy_domains: synchronize_sched for details.
 369  *
 370  * The domain tree of any CPU may only be accessed from within
 371  * preempt-disabled sections.
     *
     * for_each_domain() walks from the cpu's rq->sd up through ->parent
     * until it reaches NULL.
 372  */
 373 #define for_each_domain(cpu, __sd) \
 374         for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 375
     /*
      * Runqueue accessors:
      *   cpu_rq(cpu)   - runqueue of the given cpu
      *   this_rq()     - runqueue of the current cpu
      *   task_rq(p)    - runqueue of the cpu returned by task_cpu(p)
      *   cpu_curr(cpu) - ->curr task of the given cpu's runqueue
      */
 376 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 377 #define this_rq()               (&__get_cpu_var(runqueues))
 378 #define task_rq(p)              cpu_rq(task_cpu(p))
 379 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 380
 381 /*
 382  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
 383  * clock constructed from sched_clock():
 384  */
 385 unsigned long long cpu_clock(int cpu)
 386 {
 387         unsigned long long now;
 388         unsigned long flags;
 389
 390         local_irq_save(flags);
 391         now = rq_clock(cpu_rq(cpu));
 392         local_irq_restore(flags);
 393
 394         return now;
 395 }
 396
 397 #ifdef CONFIG_FAIR_GROUP_SCHED
 398 /* Change a task's ->cfs_rq if it moves across CPUs */
 399 static inline void set_task_cfs_rq(struct task_struct *p)
 400 {
 401         p->se.cfs_rq = &task_rq(p)->cfs;
 402 }
 403 #else
 404 static inline void set_task_cfs_rq(struct task_struct *p)
 405 {
 406 }
 407 #endif
 408
     /*
      * Default no-op definitions of the context-switch hooks, used when
      * nothing earlier has defined them.
      */
 409 #ifndef prepare_arch_switch
 410 # define prepare_arch_switch(next)      do { } while (0)
 411 #endif
 412 #ifndef finish_arch_switch
 413 # define finish_arch_switch(prev)       do { } while (0)
 414 #endif
 415
 416 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 417 static inline int task_running(struct rq *rq, struct task_struct *p)
 418 {
 419         return rq->curr == p;
 420 }
 421
     /*
      * Nothing to do in this configuration (__ARCH_WANT_UNLOCKED_CTXSW not
      * defined): the runqueue lock is released later, in finish_lock_switch().
      */
 422 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 423 {
 424 }
 425
     /*
      * finish_lock_switch - release the runqueue lock and enable interrupts.
      * Before unlocking, the lock's debug owner and lockdep state are
      * fixed up to refer to the current task.
      */
 426 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 427 {
 428 #ifdef CONFIG_DEBUG_SPINLOCK
 429         /* this is a valid case when another task releases the spinlock */
 430         rq->lock.owner = current;
 431 #endif
 432         /*
 433          * If we are tracking spinlock dependencies then we have to
 434          * fix up the runqueue lock - which gets 'carried over' from
 435          * prev into current:
 436          */
 437         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 438
 439         spin_unlock_irq(&rq->lock);
 440 }
 441
 442 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 443 static inline int task_running(struct rq *rq, struct task_struct *p)
 444 {
 445 #ifdef CONFIG_SMP
 446         return p->oncpu;
 447 #else
 448         return rq->curr == p;
 449 #endif
 450 }
 451
     /*
      * prepare_lock_switch - mark @next as on-cpu (SMP only) and drop the
      * runqueue lock. Interrupts are re-enabled here only when
      * __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined.
      */
 452 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 453 {
 454 #ifdef CONFIG_SMP
 455         /*
 456          * We can optimise this out completely for !SMP, because the
 457          * SMP rebalancing from interrupt is the only thing that cares
 458          * here.
 459          */
 460         next->oncpu = 1;
 461 #endif
 462 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 463         spin_unlock_irq(&rq->lock);
 464 #else
 465         spin_unlock(&rq->lock);
 466 #endif
 467 }
 468
     /*
      * finish_lock_switch - clear @prev's on-cpu flag (SMP only) and, when
      * prepare_lock_switch() left interrupts disabled, enable them again.
      * The write barrier orders earlier stores before ->oncpu is cleared.
      */
 469 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 470 {
 471 #ifdef CONFIG_SMP
 472         /*
 473          * After ->oncpu is cleared, the task can be moved to a different CPU.
 474          * We must ensure this doesn't happen until the switch is completely
 475          * finished.
 476          */
 477         smp_wmb();
 478         prev->oncpu = 0;
 479 #endif
 480 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 481         local_irq_enable();
 482 #endif
 483 }
 484 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 485
 486 /*
 487  * __task_rq_lock - lock the runqueue a given task resides on.
 488  * Must be called interrupts disabled.
 489  */
 490 static inline struct rq *__task_rq_lock(struct task_struct *p)
 491         __acquires(rq->lock)
 492 {
 493         struct rq *rq;
 494
 495 repeat_lock_task:
 496         rq = task_rq(p);
 497         spin_lock(&rq->lock);
 498         if (unlikely(rq != task_rq(p))) {
 499                 spin_unlock(&rq->lock);
 500                 goto repeat_lock_task;
 501         }
 502         return rq;
 503 }
 504
 505 /*
 506  * task_rq_lock - lock the runqueue a given task resides on and disable
 507  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 508  * explicitly disabling preemption.
 509  */
 510 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 511         __acquires(rq->lock)
 512 {
 513         struct rq *rq;
 514
 515 repeat_lock_task:
 516         local_irq_save(*flags);
 517         rq = task_rq(p);
 518         spin_lock(&rq->lock);
 519         if (unlikely(rq != task_rq(p))) {
 520                 spin_unlock_irqrestore(&rq->lock, *flags);
 521                 goto repeat_lock_task;
 522         }
 523         return rq;
 524 }
 525
     /*
      * __task_rq_unlock - release a runqueue lock without touching the
      * interrupt state.
      */
 526 static inline void __task_rq_unlock(struct rq *rq)
 527         __releases(rq->lock)
 528 {
 529         spin_unlock(&rq->lock);
 530 }
 531
     /*
      * task_rq_unlock - release a runqueue lock and restore the interrupt
      * state saved in *@flags.
      */
 532 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
 533         __releases(rq->lock)
 534 {
 535         spin_unlock_irqrestore(&rq->lock, *flags);
 536 }
 537
 538 /*
 539  * this_rq_lock - lock this runqueue and disable interrupts.
     *
     * Interrupts are disabled before this_rq() is evaluated. Returns the
     * locked runqueue; the previous interrupt state is not saved.
 540  */
 541 static inline struct rq *this_rq_lock(void)
 542         __acquires(rq->lock)
 543 {
 544         struct rq *rq;
 545
 546         local_irq_disable();
 547         rq = this_rq();
 548         spin_lock(&rq->lock);
 549
 550         return rq;
 551 }
 552
 553 /*
 554  * CPU frequency is/was unstable - start new by setting prev_clock_raw:
 555  */
 556 void sched_clock_unstable_event(void)
 557 {
 558         unsigned long flags;
 559         struct rq *rq;
 560
 561         rq = task_rq_lock(current, &flags);
 562         rq->prev_clock_raw = sched_clock();
 563         rq->clock_unstable_events++;
 564         task_rq_unlock(rq, &flags);
 565 }
 566
 567 /*
 568  * resched_task - mark a task 'to be rescheduled now'.
 569  *
 570  * On UP this means the setting of the need_resched flag, on SMP it
 571  * might also involve a cross-CPU call to trigger the scheduler on
 572  * the target CPU.
 573  */
 574 #ifdef CONFIG_SMP
 575
     /*
      * Default tsk_is_polling(): test the task's TIF_POLLING_NRFLAG thread
      * flag. Only used when nothing earlier has defined the macro.
      */
 576 #ifndef tsk_is_polling
 577 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 578 #endif
 579
     /*
      * SMP version: set TIF_NEED_RESCHED on @p and, if @p sits on another
      * cpu and is not polling, send that cpu a reschedule IPI. The lock of
      * @p's runqueue must be held.
      */
 580 static void resched_task(struct task_struct *p)
 581 {
 582         int cpu;
 583
 584         assert_spin_locked(&task_rq(p)->lock);
 585
     /* flag already set: nothing more to do */
 586         if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
 587                 return;
 588
 589         set_tsk_thread_flag(p, TIF_NEED_RESCHED);
 590
     /* no cross-CPU call needed when the task is on this cpu */
 591         cpu = task_cpu(p);
 592         if (cpu == smp_processor_id())
 593                 return;
 594
 595         /* NEED_RESCHED must be visible before we test polling */
 596         smp_mb();
 597         if (!tsk_is_polling(p))
 598                 smp_send_reschedule(cpu);
 599 }
 600
 601 static void resched_cpu(int cpu)
 602 {
 603         struct rq *rq = cpu_rq(cpu);
 604         unsigned long flags;
 605
 606         if (!spin_trylock_irqsave(&rq->lock, flags))
 607                 return;
 608         resched_task(cpu_curr(cpu));
 609         spin_unlock_irqrestore(&rq->lock, flags);
 610 }
 611 #else
     /*
      * UP version: just set the need_resched flag on @p. The lock of @p's
      * runqueue must be held.
      */
 612 static inline void resched_task(struct task_struct *p)
 613 {
 614         assert_spin_locked(&task_rq(p)->lock);
 615         set_tsk_need_resched(p);
 616 }
 617 #endif
 618
 619 static u64 div64_likely32(u64 divident, unsigned long divisor)
 620 {
 621 #if BITS_PER_LONG == 32
 622         if (likely(divident <= 0xffffffffULL))
 623                 return (u32)divident / divisor;
 624         do_div(divident, divisor);
 625
 626         return divident;
 627 #else
 628         return divident / divisor;
 629 #endif
 630 }
 631
     /*
      * Constants for the inverse-weight arithmetic in calc_delta_mine():
      * WMULT_CONST is the numerator used to derive lw->inv_weight, and
      * WMULT_SHIFT is the shift applied after multiplying by it.
      * With a 32-bit long, 1UL << 32 is not representable, so ~0UL
      * (2^32 - 1) is used instead.
      */
 632 #if BITS_PER_LONG == 32
 633 # define WMULT_CONST    (~0UL)
 634 #else
 635 # define WMULT_CONST    (1UL << 32)
 636 #endif
 637
 638 #define WMULT_SHIFT     32
 639
 640 static unsigned long
 641 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 642                 struct load_weight *lw)
 643 {
 644         u64 tmp;
 645
 646         if (unlikely(!lw->inv_weight))
 647                 lw->inv_weight = WMULT_CONST / lw->weight;
 648
 649         tmp = (u64)delta_exec * weight;
 650         /*
 651          * Check whether we'd overflow the 64-bit multiplication:
 652          */
 653         if (unlikely(tmp > WMULT_CONST)) {
 654                 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
 655                                 >> (WMULT_SHIFT/2);
 656         } else {
 657                 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
 658         }
 659
 660         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 661 }
 662
 663 static inline unsigned long
 664 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 665 {
 666         return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
 667 }
 668
 669 static void update_load_add(struct load_weight *lw, unsigned long inc)
 670 {
 671         lw->weight += inc;
 672         lw->inv_weight = 0;
 673 }
 674
 675 static void update_load_sub(struct load_weight *lw, unsigned long dec)
 676 {
 677         lw->weight -= dec;
 678         lw->inv_weight = 0;
 679 }
 680
 681 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
 682 {
 683         if (rq->curr != rq->idle && ls->load.weight) {
 684                 ls->delta_exec += ls->delta_stat;
 685                 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
 686                 ls->delta_stat = 0;
 687         }
 688 }
 689
 690 /*
 691  * Update delta_exec, delta_fair fields for rq.
 692  *
 693  * delta_fair clock advances at a rate inversely proportional to
 694  * total load (rq->ls.load.weight) on the runqueue, while
 695  * delta_exec advances at the same rate as wall-clock (provided
 696  * cpu is not idle).
 697  *
 698  * delta_exec / delta_fair is a measure of the (smoothened) load on this
 699  * runqueue over any given interval. This (smoothened) load is used
 700  * during load balance.
 701  *
 702  * This function is called /before/ updating rq->ls.load
 703  * and when switching tasks.
 704  */
 705 static void update_curr_load(struct rq *rq, u64 now)
 706 {
 707         struct load_stat *ls = &rq->ls;
 708         u64 start;
 709
 710         start = ls->load_update_start;
 711         ls->load_update_start = now;
 712         ls->delta_stat += now - start;
 713         /*
 714          * Stagger updates to ls->delta_fair. Very frequent updates
 715          * can be expensive.
 716          */
 717         if (ls->delta_stat >= sysctl_sched_stat_granularity)
 718                 __update_curr_load(rq, ls);
 719 }
 720
 721 /*
 722  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 723  * of tasks with abnormal "nice" values across CPUs the contribution that
 724  * each task makes to its run queue's load is weighted according to its
 725  * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
 726  * scaled version of the new time slice allocation that they receive on time
 727  * slice expiry etc.
 728  */
 729
 730 #define WEIGHT_IDLEPRIO         2
 731 #define WMULT_IDLEPRIO          (1 << 31)
 732
 733 /*
 734  * Nice levels are multiplicative, with a gentle 10% change for every
 735  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 736  * nice 1, it will get ~10% less CPU time than another CPU-bound task
 737  * that remained on nice 0.
 738  *
 739  * The "10% effect" is relative and cumulative: from _any_ nice level,
 740  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 741  * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 742  * If a task goes up by ~10% and another task goes down by ~10% then
 743  * the relative distance between them is ~25%.)
 744  */
 745 static const int prio_to_weight[40] = {
 746 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
 747 /* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
 748 /*   0 */  NICE_0_LOAD /* 1024 */,
 749 /*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
 750 /*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
 751 };
 752
 753 /*
 754  * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
 755  *
 756  * In cases where the weight does not change often, we can use the
 757  * precalculated inverse to speed up arithmetics by turning divisions
 758  * into multiplications:
     *
     * Same indexing as prio_to_weight[]: entry 20 (nice 0) is
     * 2^32 / 1024 = 4194304.
 759  */
 760 static const u32 prio_to_wmult[40] = {
 761 /* -20 */     48356,     60446,     75558,     94446,    118058,
 762 /* -15 */    147573,    184467,    230589,    288233,    360285,
 763 /* -10 */    450347,    562979,    703746,    879575,   1099582,
 764 /*  -5 */   1374389,   1717986,   2147483,   2684354,   3355443,
 765 /*   0 */   4194304,   5244160,   6557201,   8196502,  10250518,
 766 /*   5 */  12782640,  16025997,  19976592,  24970740,  31350126,
 767 /*  10 */  39045157,  49367440,  61356675,  76695844,  95443717,
 768 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 769 };
 770
 771 static inline void
 772 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
 773 {
 774         update_curr_load(rq, now);
 775         update_load_add(&rq->ls.load, p->se.load.weight);
 776 }
 777
 778 static inline void
 779 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
 780 {
 781         update_curr_load(rq, now);
 782         update_load_sub(&rq->ls.load, p->se.load.weight);
 783 }
 784
 785 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 786 {
 787         rq->nr_running++;
 788         inc_load(rq, p, now);
 789 }
 790
 791 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 792 {
 793         rq->nr_running--;
 794         dec_load(rq, p, now);
 795 }
 796
     /* Forward declaration; activate_task() is defined further down in this file. */
 797 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 798
 799 /*
 800  * runqueue iterator, to support SMP load-balancing between different
 801  * scheduling classes, without having to expose their internal data
 802  * structures to the load-balancing proper:
     *
     * start() and next() each take a void * and return a task pointer.
     * NOTE(review): presumably @arg is the value handed to both callbacks,
     * and NULL marks the end of the iteration - confirm against the users.
 803  */
 804 struct rq_iterator {
 805         void *arg;
 806         struct task_struct *(*start)(void *);
 807         struct task_struct *(*next)(void *);
 808 };
 809
     /*
      * Forward declaration, made before the scheduling-class sources are
      * #included below; the definition is not in this part of the file.
      */
 810 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 811                       unsigned long max_nr_move, unsigned long max_load_move,
 812                       struct sched_domain *sd, enum cpu_idle_type idle,
 813                       int *all_pinned, unsigned long *load_moved,
 814                       int this_best_prio, int best_prio, int best_prio_seen,
 815                       struct rq_iterator *iterator);
 816
 817 #include "sched_stats.h"
 818 #include "sched_rt.c"
 819 #include "sched_fair.c"
 820 #include "sched_idletask.c"
 821 #ifdef CONFIG_SCHED_DEBUG
 822 # include "sched_debug.c"
 823 #endif
 824
     /* The highest scheduling class is the real-time class. */
 825 #define sched_class_highest (&rt_sched_class)
 826
 827 static void set_load_weight(struct task_struct *p)
 828 {
 829         task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 830         p->se.wait_runtime = 0;
 831
 832         if (task_has_rt_policy(p)) {
 833                 p->se.load.weight = prio_to_weight[0] * 2;
 834                 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
 835                 return;
 836         }
 837
 838         /*
 839          * SCHED_IDLE tasks get minimal weight:
 840          */
 841         if (p->policy == SCHED_IDLE) {
 842                 p->se.load.weight = WEIGHT_IDLEPRIO;
 843                 p->se.load.inv_weight = WMULT_IDLEPRIO;
 844                 return;
 845         }
 846
 847         p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
 848         p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 849 }
 850
 851 static void
 852 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 853 {
 854         sched_info_queued(p);
 855         p->sched_class->enqueue_task(rq, p, wakeup, now);
 856         p->se.on_rq = 1;
 857 }
 858
 859 static void
 860 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 861 {
 862         p->sched_class->dequeue_task(rq, p, sleep, now);
 863         p->se.on_rq = 0;
 864 }
 865
 866 /*
 867  * __normal_prio - return the priority that is based on the static prio
 868  */
 869 static inline int __normal_prio(struct task_struct *p)
 870 {
 871         return p->static_prio;
 872 }
 873
 874 /*
 875  * Calculate the expected normal priority: i.e. priority
 876  * without taking RT-inheritance into account. Might be
 877  * boosted by interactivity modifiers. Changes upon fork,
 878  * setprio syscalls, and whenever the interactivity
 879  * estimator recalculates.
 880  */
 881 static inline int normal_prio(struct task_struct *p)
 882 {
 883         int prio;
 884
 885         if (task_has_rt_policy(p))
 886                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 887         else
 888                 prio = __normal_prio(p);
 889         return prio;
 890 }
 891
 892 /*
 893  * Calculate the current priority, i.e. the priority
 894  * taken into account by the scheduler. This value might
 895  * be boosted by RT tasks, or might be boosted by
 896  * interactivity modifiers. Will be RT if the task got
 897  * RT-boosted. If not then it returns p->normal_prio.
 898  */
 899 static int effective_prio(struct task_struct *p)
 900 {
 901         p->normal_prio = normal_prio(p);
 902         /*
 903          * If we are RT tasks or we were boosted to RT priority,
 904          * keep the priority unchanged. Otherwise, update priority
 905          * to the normal priority:
 906          */
 907         if (!rt_prio(p->prio))
 908                 return p->normal_prio;
 909         return p->prio;
 910 }
 911
 912 /*
 913  * activate_task - move a task to the runqueue.
 914  */
 915 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 916 {
 917         u64 now = rq_clock(rq);
 918
 919         if (p->state == TASK_UNINTERRUPTIBLE)
 920                 rq->nr_uninterruptible--;
 921
 922         enqueue_task(rq, p, wakeup, now);
 923         inc_nr_running(p, rq, now);
 924 }
 925
 926 /*
 927  * activate_idle_task - move idle task to the _front_ of runqueue.
 928  */
 929 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
 930 {
 931         u64 now = rq_clock(rq);
 932
 933         if (p->state == TASK_UNINTERRUPTIBLE)
 934                 rq->nr_uninterruptible--;
 935
 936         enqueue_task(rq, p, 0, now);
 937         inc_nr_running(p, rq, now);
 938 }
 939
 940 /*
 941  * deactivate_task - remove a task from the runqueue.
 942  */
 943 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 944 {
 945         u64 now = rq_clock(rq);
 946
 947         if (p->state == TASK_UNINTERRUPTIBLE)
 948                 rq->nr_uninterruptible++;
 949
 950         dequeue_task(rq, p, sleep, now);
 951         dec_nr_running(p, rq, now);
 952 }
 953
 954 /**
 955  * task_curr - is this task currently executing on a CPU?
 956  * @p: the task in question.
 957  */
 958 inline int task_curr(const struct task_struct *p)
 959 {
 960         return cpu_curr(task_cpu(p)) == p;
 961 }
 962
 963 /* Used instead of source_load when we know the type == 0 */
 964 unsigned long weighted_cpuload(const int cpu)
 965 {
 966         return cpu_rq(cpu)->ls.load.weight;
 967 }
 968
 969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 970 {
 971 #ifdef CONFIG_SMP
 972         task_thread_info(p)->cpu = cpu;
 973         set_task_cfs_rq(p);
 974 #endif
 975 }
 976
 977 #ifdef CONFIG_SMP
 978
 979 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 980 {
 981         int old_cpu = task_cpu(p);
 982         struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
 983         u64 clock_offset, fair_clock_offset;
 984
 985         clock_offset = old_rq->clock - new_rq->clock;
 986         fair_clock_offset = old_rq->cfs.fair_clock -
 987                                                  new_rq->cfs.fair_clock;
 988         if (p->se.wait_start)
 989                 p->se.wait_start -= clock_offset;
 990         if (p->se.wait_start_fair)
 991                 p->se.wait_start_fair -= fair_clock_offset;
 992         if (p->se.sleep_start)
 993                 p->se.sleep_start -= clock_offset;
 994         if (p->se.block_start)
 995                 p->se.block_start -= clock_offset;
 996         if (p->se.sleep_start_fair)
 997                 p->se.sleep_start_fair -= fair_clock_offset;
 998
 999         __set_task_cpu(p, new_cpu);
1000 }
1001
/*
 * A pending request to move a task to another CPU; filled in and queued
 * by migrate_task().
 */
struct migration_req {
        /* Link in the source runqueue's migration_queue. */
        struct list_head list;

        /* The task to move and the CPU it should be moved to. */
        struct task_struct *task;
        int dest_cpu;

        /* Initialised by migrate_task() when the request is queued. */
        struct completion done;
};
1010
1011 /*
1012  * The task's runqueue lock must be held.
1013  * Returns true if you have to wait for migration thread.
1014  */
1015 static int
1016 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1017 {
1018         struct rq *rq = task_rq(p);
1019
1020         /*
1021          * If the task is not on a runqueue (and not running), then
1022          * it is sufficient to simply update the task's cpu field.
1023          */
1024         if (!p->se.on_rq && !task_running(rq, p)) {
1025                 set_task_cpu(p, dest_cpu);
1026                 return 0;
1027         }
1028
1029         init_completion(&req->done);
1030         req->task = p;
1031         req->dest_cpu = dest_cpu;
1032         list_add(&req->list, &rq->migration_queue);
1033
1034         return 1;
1035 }
1036
1037 /*
1038  * wait_task_inactive - wait for a thread to unschedule.
1039  *
1040  * The caller must ensure that the task *will* unschedule sometime soon,
1041  * else this function might spin for a *long* time. This function can't
1042  * be called with interrupts off, or it may introduce deadlock with
1043  * smp_call_function() if an IPI is sent by the same process we are
1044  * waiting to become inactive.
1045  */
1046 void wait_task_inactive(struct task_struct *p)
1047 {
1048         unsigned long flags;
1049         int running, on_rq;
1050         struct rq *rq;
1051
1052 repeat:
1053         /*
1054          * We do the initial early heuristics without holding
1055          * any task-queue locks at all. We'll only try to get
1056          * the runqueue lock when things look like they will
1057          * work out!
1058          */
1059         rq = task_rq(p);
1060
1061         /*
1062          * If the task is actively running on another CPU
1063          * still, just relax and busy-wait without holding
1064          * any locks.
1065          *
1066          * NOTE! Since we don't hold any locks, it's not
1067          * even sure that "rq" stays as the right runqueue!
1068          * But we don't care, since "task_running()" will
1069          * return false if the runqueue has changed and p
1070          * is actually now running somewhere else!
1071          */
1072         while (task_running(rq, p))
1073                 cpu_relax();
1074
1075         /*
1076          * Ok, time to look more closely! We need the rq
1077          * lock now, to be *sure*. If we're wrong, we'll
1078          * just go back and repeat.
1079          */
1080         rq = task_rq_lock(p, &flags);
1081         running = task_running(rq, p);
1082         on_rq = p->se.on_rq;
1083         task_rq_unlock(rq, &flags);
1084
1085         /*
1086          * Was it really running after all now that we
1087          * checked with the proper locks actually held?
1088          *
1089          * Oops. Go back and try again..
1090          */
1091         if (unlikely(running)) {
1092                 cpu_relax();
1093                 goto repeat;
1094         }
1095
1096         /*
1097          * It's not enough that it's not actively running,
1098          * it must be off the runqueue _entirely_, and not
1099          * preempted!
1100          *
1101          * So if it wa still runnable (but just not actively
1102          * running right now), it's preempted, and we should
1103          * yield - it could be a while.
1104          */
1105         if (unlikely(on_rq)) {
1106                 yield();
1107                 goto repeat;
1108         }
1109
1110         /*
1111          * Ahh, all good. It wasn't running, and it wasn't
1112          * runnable, which means that it will never become
1113          * running in the future either. We're all done!
1114          */
1115 }
1116
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
        int target;

        preempt_disable();
        target = task_cpu(p);
        /* Only a task that is current on some *other* CPU needs an IPI. */
        if (target != smp_processor_id()) {
                if (task_curr(p))
                        smp_send_reschedule(target);
        }
        preempt_enable();
}
1140
1141 /*
1142  * Return a low guess at the load of a migration-source cpu weighted
1143  * according to the scheduling class and "nice" value.
1144  *
1145  * We want to under-estimate the load of migration sources, to
1146  * balance conservatively.
1147  */
1148 static inline unsigned long source_load(int cpu, int type)
1149 {
1150         struct rq *rq = cpu_rq(cpu);
1151         unsigned long total = weighted_cpuload(cpu);
1152
1153         if (type == 0)
1154                 return total;
1155
1156         return min(rq->cpu_load[type-1], total);
1157 }
1158
1159 /*
1160  * Return a high guess at the load of a migration-target cpu weighted
1161  * according to the scheduling class and "nice" value.
1162  */
1163 static inline unsigned long target_load(int cpu, int type)
1164 {
1165         struct rq *rq = cpu_rq(cpu);
1166         unsigned long total = weighted_cpuload(cpu);
1167
1168         if (type == 0)
1169                 return total;
1170
1171         return max(rq->cpu_load[type-1], total);
1172 }
1173
1174 /*
1175  * Return the average load per task on the cpu's run queue
1176  */
1177 static inline unsigned long cpu_avg_load_per_task(int cpu)
1178 {
1179         struct rq *rq = cpu_rq(cpu);
1180         unsigned long total = weighted_cpuload(cpu);
1181         unsigned long n = rq->nr_running;
1182
1183         return n ? total / n : SCHED_LOAD_SCALE;
1184 }
1185
1186 /*
1187  * find_idlest_group finds and returns the least busy CPU group within the
1188  * domain.
1189  */
1190 static struct sched_group *
1191 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1192 {
1193         struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1194         unsigned long min_load = ULONG_MAX, this_load = 0;
1195         int load_idx = sd->forkexec_idx;
1196         int imbalance = 100 + (sd->imbalance_pct-100)/2;
1197
1198         do {
1199                 unsigned long load, avg_load;
1200                 int local_group;
1201                 int i;
1202
1203                 /* Skip over this group if it has no CPUs allowed */
1204                 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1205                         goto nextgroup;
1206
1207                 local_group = cpu_isset(this_cpu, group->cpumask);
1208
1209                 /* Tally up the load of all CPUs in the group */
1210                 avg_load = 0;
1211
1212                 for_each_cpu_mask(i, group->cpumask) {
1213                         /* Bias balancing toward cpus of our domain */
1214                         if (local_group)
1215                                 load = source_load(i, load_idx);
1216                         else
1217                                 load = target_load(i, load_idx);
1218
1219                         avg_load += load;
1220                 }
1221
1222                 /* Adjust by relative CPU power of the group */
1223                 avg_load = sg_div_cpu_power(group,
1224                                 avg_load * SCHED_LOAD_SCALE);
1225
1226                 if (local_group) {
1227                         this_load = avg_load;
1228                         this = group;
1229                 } else if (avg_load < min_load) {
1230                         min_load = avg_load;
1231                         idlest = group;
1232                 }
1233 nextgroup:
1234                 group = group->next;
1235         } while (group != sd->groups);
1236
1237         if (!idlest || 100*this_load < imbalance*min_load)
1238                 return NULL;
1239         return idlest;
1240 }
1241
1242 /*
1243  * find_idlest_cpu - find the idlest cpu among the cpus in group.
1244  */
1245 static int
1246 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1247 {
1248         cpumask_t tmp;
1249         unsigned long load, min_load = ULONG_MAX;
1250         int idlest = -1;
1251         int i;
1252
1253         /* Traverse only the allowed CPUs */
1254         cpus_and(tmp, group->cpumask, p->cpus_allowed);
1255
1256         for_each_cpu_mask(i, tmp) {
1257                 load = weighted_cpuload(i);
1258
1259                 if (load < min_load || (load == min_load && i == this_cpu)) {
1260                         min_load = load;
1261                         idlest = i;
1262                 }
1263         }
1264
1265         return idlest;
1266 }
1267
1268 /*
1269  * sched_balance_self: balance the current task (running on cpu) in domains
1270  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1271  * SD_BALANCE_EXEC.
1272  *
1273  * Balance, ie. select the least loaded group.
1274  *
1275  * Returns the target CPU number, or the same CPU if no balancing is needed.
1276  *
1277  * preempt must be disabled.
1278  */
1279 static int sched_balance_self(int cpu, int flag)
1280 {
1281         struct task_struct *t = current;
1282         struct sched_domain *tmp, *sd = NULL;
1283
1284         for_each_domain(cpu, tmp) {
1285                 /*
1286                  * If power savings logic is enabled for a domain, stop there.
1287                  */
1288                 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1289                         break;
1290                 if (tmp->flags & flag)
1291                         sd = tmp;
1292         }
1293
1294         while (sd) {
1295                 cpumask_t span;
1296                 struct sched_group *group;
1297                 int new_cpu, weight;
1298
1299                 if (!(sd->flags & flag)) {
1300                         sd = sd->child;
1301                         continue;
1302                 }
1303
1304                 span = sd->span;
1305                 group = find_idlest_group(sd, t, cpu);
1306                 if (!group) {
1307                         sd = sd->child;
1308                         continue;
1309                 }
1310
1311                 new_cpu = find_idlest_cpu(group, t, cpu);
1312                 if (new_cpu == -1 || new_cpu == cpu) {
1313                         /* Now try balancing at a lower domain level of cpu */
1314                         sd = sd->child;
1315                         continue;
1316                 }
1317
1318                 /* Now try balancing at a lower domain level of new_cpu */
1319                 cpu = new_cpu;
1320                 sd = NULL;
1321                 weight = cpus_weight(span);
1322                 for_each_domain(cpu, tmp) {
1323                         if (weight <= cpus_weight(tmp->span))
1324                                 break;
1325                         if (tmp->flags & flag)
1326                                 sd = tmp;
1327                 }
1328                 /* while loop will break here if sd == NULL */
1329         }
1330
1331         return cpu;
1332 }
1333
1334 #endif /* CONFIG_SMP */
1335
/*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
 * not idle and an idle cpu is available.  The span of cpus to
 * search starts with cpus closest then further out as needed,
 * so we always favor a closer, idle cpu.
 *
 * Returns the CPU we should wake onto. Without
 * ARCH_HAS_SCHED_WAKE_IDLE this is always @cpu itself.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
        struct sched_domain *sd;
        cpumask_t candidates;
        int i;

        /* If it is idle, then it is the best cpu to run this task. */
        if (idle_cpu(cpu))
                return cpu;

        /*
         * This cpu is also the best, if it has more than one task already.
         * Siblings must be also busy (in most cases) as they didn't already
         * pick up the extra load from this cpu and hence we need not check
         * sibling runqueue info. This will avoid the checks and cache miss
         * penalties associated with that.
         */
        if (cpu_rq(cpu)->nr_running > 1)
                return cpu;

        for_each_domain(cpu, sd) {
                /* Stop at the first domain that does not allow this. */
                if (!(sd->flags & SD_WAKE_IDLE))
                        break;

                cpus_and(candidates, sd->span, p->cpus_allowed);
                for_each_cpu_mask(i, candidates) {
                        if (idle_cpu(i))
                                return i;
                }
        }

        return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
        return cpu;
}
#endif
1382
1383 /***
1384  * try_to_wake_up - wake up a thread
1385  * @p: the to-be-woken-up thread
1386  * @state: the mask of task states that can be woken
1387  * @sync: do a synchronous wakeup?
1388  *
1389  * Put it on the run-queue if it's not already there. The "current"
1390  * thread is always on the run-queue (except when the actual
1391  * re-schedule is in progress), and as such you're allowed to do
1392  * the simpler "current->state = TASK_RUNNING" to mark yourself
1393  * runnable without the overhead of this.
1394  *
1395  * returns failure only if the task is already active.
1396  */
1397 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1398 {
1399         int cpu, this_cpu, success = 0;
1400         unsigned long flags;
1401         long old_state;
1402         struct rq *rq;
1403 #ifdef CONFIG_SMP
1404         struct sched_domain *sd, *this_sd = NULL;
1405         unsigned long load, this_load;
1406         int new_cpu;
1407 #endif
1408
1409         rq = task_rq_lock(p, &flags);
1410         old_state = p->state;
1411         if (!(old_state & state))
1412                 goto out;
1413
1414         if (p->se.on_rq)
1415                 goto out_running;
1416
1417         cpu = task_cpu(p);
1418         this_cpu = smp_processor_id();
1419
1420 #ifdef CONFIG_SMP
1421         if (unlikely(task_running(rq, p)))
1422                 goto out_activate;
1423
1424         new_cpu = cpu;
1425
1426         schedstat_inc(rq, ttwu_cnt);
1427         if (cpu == this_cpu) {
1428                 schedstat_inc(rq, ttwu_local);
1429                 goto out_set_cpu;
1430         }
1431
1432         for_each_domain(this_cpu, sd) {
1433                 if (cpu_isset(cpu, sd->span)) {
1434                         schedstat_inc(sd, ttwu_wake_remote);
1435                         this_sd = sd;
1436                         break;
1437                 }
1438         }
1439
1440         if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1441                 goto out_set_cpu;
1442
1443         /*
1444          * Check for affine wakeup and passive balancing possibilities.
1445          */
1446         if (this_sd) {
1447                 int idx = this_sd->wake_idx;
1448                 unsigned int imbalance;
1449
1450                 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1451
1452                 load = source_load(cpu, idx);
1453                 this_load = target_load(this_cpu, idx);
1454
1455                 new_cpu = this_cpu; /* Wake to this CPU if we can */
1456
1457                 if (this_sd->flags & SD_WAKE_AFFINE) {
1458                         unsigned long tl = this_load;
1459                         unsigned long tl_per_task;
1460
1461                         tl_per_task = cpu_avg_load_per_task(this_cpu);
1462
1463                         /*
1464                          * If sync wakeup then subtract the (maximum possible)
1465                          * effect of the currently running task from the load
1466                          * of the current CPU:
1467                          */
1468                         if (sync)
1469                                 tl -= current->se.load.weight;
1470
1471                         if ((tl <= load &&
1472                                 tl + target_load(cpu, idx) <= tl_per_task) ||
1473                                100*(tl + p->se.load.weight) <= imbalance*load) {
1474                                 /*
1475                                  * This domain has SD_WAKE_AFFINE and
1476                                  * p is cache cold in this domain, and
1477                                  * there is no bad imbalance.
1478                                  */
1479                                 schedstat_inc(this_sd, ttwu_move_affine);
1480                                 goto out_set_cpu;
1481                         }
1482                 }
1483
1484                 /*
1485                  * Start passive balancing when half the imbalance_pct
1486                  * limit is reached.
1487                  */
1488                 if (this_sd->flags & SD_WAKE_BALANCE) {
1489                         if (imbalance*this_load <= 100*load) {
1490                                 schedstat_inc(this_sd, ttwu_move_balance);
1491                                 goto out_set_cpu;
1492                         }
1493                 }
1494         }
1495
1496         new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1497 out_set_cpu:
1498         new_cpu = wake_idle(new_cpu, p);
1499         if (new_cpu != cpu) {
1500                 set_task_cpu(p, new_cpu);
1501                 task_rq_unlock(rq, &flags);
1502                 /* might preempt at this point */
1503                 rq = task_rq_lock(p, &flags);
1504                 old_state = p->state;
1505                 if (!(old_state & state))
1506                         goto out;
1507                 if (p->se.on_rq)
1508                         goto out_running;
1509
1510                 this_cpu = smp_processor_id();
1511                 cpu = task_cpu(p);
1512         }
1513
1514 out_activate:
1515 #endif /* CONFIG_SMP */
1516         activate_task(rq, p, 1);
1517         /*
1518          * Sync wakeups (i.e. those types of wakeups where the waker
1519          * has indicated that it will leave the CPU in short order)
1520          * don't trigger a preemption, if the woken up task will run on
1521          * this cpu. (in this case the 'I will reschedule' promise of
1522          * the waker guarantees that the freshly woken up task is going
1523          * to be considered on this CPU.)
1524          */
1525         if (!sync || cpu != this_cpu)
1526                 check_preempt_curr(rq, p);
1527         success = 1;
1528
1529 out_running:
1530         p->state = TASK_RUNNING;
1531 out:
1532         task_rq_unlock(rq, &flags);
1533
1534         return success;
1535 }
1536
1537 int fastcall wake_up_process(struct task_struct *p)
1538 {
1539         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1540                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1541 }
1542 EXPORT_SYMBOL(wake_up_process);
1543
1544 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1545 {
1546         return try_to_wake_up(p, state, 0);
1547 }
1548
1549 /*
1550  * Perform scheduler related setup for a newly forked process p.
1551  * p is forked by current.
1552  *
1553  * __sched_fork() is basic setup used by init_idle() too:
1554  */
1555 static void __sched_fork(struct task_struct *p)
1556 {
1557         p->se.wait_start_fair           = 0;
1558         p->se.wait_start                = 0;
1559         p->se.exec_start                = 0;
1560         p->se.sum_exec_runtime          = 0;
1561         p->se.delta_exec                = 0;
1562         p->se.delta_fair_run            = 0;
1563         p->se.delta_fair_sleep          = 0;
1564         p->se.wait_runtime              = 0;
1565         p->se.sum_wait_runtime          = 0;
1566         p->se.sum_sleep_runtime         = 0;
1567         p->se.sleep_start               = 0;
1568         p->se.sleep_start_fair          = 0;
1569         p->se.block_start               = 0;
1570         p->se.sleep_max                 = 0;
1571         p->se.block_max                 = 0;
1572         p->se.exec_max                  = 0;
1573         p->se.wait_max                  = 0;
1574         p->se.wait_runtime_overruns     = 0;
1575         p->se.wait_runtime_underruns    = 0;
1576
1577         INIT_LIST_HEAD(&p->run_list);
1578         p->se.on_rq = 0;
1579
1580 #ifdef CONFIG_PREEMPT_NOTIFIERS
1581         INIT_HLIST_HEAD(&p->preempt_notifiers);
1582 #endif
1583
1584         /*
1585          * We mark the process as running here, but have not actually
1586          * inserted it onto the runqueue yet. This guarantees that
1587          * nobody will actually run it, and a signal or other external
1588          * event cannot wake it up and insert it on the runqueue either.
1589          */
1590         p->state = TASK_RUNNING;
1591 }
1592
1593 /*
1594  * fork()/clone()-time setup:
1595  */
1596 void sched_fork(struct task_struct *p, int clone_flags)
1597 {
1598         int cpu = get_cpu();
1599
1600         __sched_fork(p);
1601
1602 #ifdef CONFIG_SMP
1603         cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1604 #endif
1605         __set_task_cpu(p, cpu);
1606
1607         /*
1608          * Make sure we do not leak PI boosting priority to the child:
1609          */
1610         p->prio = current->normal_prio;
1611
1612 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1613         if (likely(sched_info_on()))
1614                 memset(&p->sched_info, 0, sizeof(p->sched_info));
1615 #endif
1616 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1617         p->oncpu = 0;
1618 #endif
1619 #ifdef CONFIG_PREEMPT
1620         /* Want to start with kernel preemption disabled. */
1621         task_thread_info(p)->preempt_count = 1;
1622 #endif
1623         put_cpu();
1624 }
1625
/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 *
 * Consulted by wake_up_new_task(): when this is 0 the new task is
 * always activated via activate_task() instead of being handed to the
 * scheduling class's task_new() hook.
 */
unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1631
1632 /*
1633  * wake_up_new_task - wake up a newly created task for the first time.
1634  *
1635  * This function will do some initial scheduler statistics housekeeping
1636  * that must be done for every newly created context, then puts the task
1637  * on the runqueue and wakes it.
1638  */
1639 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1640 {
1641         unsigned long flags;
1642         struct rq *rq;
1643         int this_cpu;
1644
1645         rq = task_rq_lock(p, &flags);
1646         BUG_ON(p->state != TASK_RUNNING);
1647         this_cpu = smp_processor_id(); /* parent's CPU */
1648
1649         p->prio = effective_prio(p);
1650
1651         if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1652                         task_cpu(p) != this_cpu || !current->se.on_rq) {
1653                 activate_task(rq, p, 0);
1654         } else {
1655                 /*
1656                  * Let the scheduling class do new task startup
1657                  * management (if any):
1658                  */
1659                 p->sched_class->task_new(rq, p);
1660         }
1661         check_preempt_curr(rq, p);
1662         task_rq_unlock(rq, &flags);
1663 }
1664
1665 #ifdef CONFIG_PREEMPT_NOTIFIERS
1666
1667 /**
1668  * preempt_notifier_register - tell me when current is being being preempted & rescheduled
1669  * @notifier: notifier struct to register
1670  */
1671 void preempt_notifier_register(struct preempt_notifier *notifier)
1672 {
1673         hlist_add_head(&notifier->link, &current->preempt_notifiers);
1674 }
1675 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1676
1677 /**
1678  * preempt_notifier_unregister - no longer interested in preemption notifications
1679  * @notifier: notifier struct to unregister
1680  *
1681  * This is safe to call from within a preemption notifier.
1682  */
1683 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1684 {
1685         hlist_del(&notifier->link);
1686 }
1687 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1688
1689 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1690 {
1691         struct preempt_notifier *notifier;
1692         struct hlist_node *node;
1693
1694         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1695                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1696 }
1697
1698 static void
1699 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1700                                  struct task_struct *next)
1701 {
1702         struct preempt_notifier *notifier;
1703         struct hlist_node *node;
1704
1705         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1706                 notifier->ops->sched_out(notifier, next);
1707 }
1708
1709 #else
1710
/* No-op variant used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
1714
/* No-op variant used when CONFIG_PREEMPT_NOTIFIERS is not set. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
                                 struct task_struct *next)
{
}
1720
1721 #endif
1722
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
{
        /* Tell prev's preempt notifiers that it is being switched out. */
        fire_sched_out_preempt_notifiers(prev, next);
        /* Locking setup; undone by finish_lock_switch() after the switch. */
        prepare_lock_switch(rq, next);
        /* Architecture hook; counterpart of finish_arch_switch(). */
        prepare_arch_switch(next);
}
1744
1745 /**
1746  * finish_task_switch - clean up after a task-switch
1747  * @rq: runqueue associated with task-switch
1748  * @prev: the thread we just switched away from.
1749  *
1750  * finish_task_switch must be called after the context switch, paired
1751  * with a prepare_task_switch call before the context switch.
1752  * finish_task_switch will reconcile locking set up by prepare_task_switch,
1753  * and do any other architecture-specific cleanup actions.
1754  *
1755  * Note that we may have delayed dropping an mm in context_switch(). If
1756  * so, we finish that here outside of the runqueue lock.  (Doing it
1757  * with the lock held can cause deadlocks; see schedule() for
1758  * details.)
1759  */
1760 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1761         __releases(rq->lock)
1762 {
1763         struct mm_struct *mm = rq->prev_mm;
1764         long prev_state;
1765
1766         rq->prev_mm = NULL;
1767
1768         /*
1769          * A task struct has one reference for the use as "current".
1770          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1771          * schedule one last time. The schedule call will never return, and
1772          * the scheduled task must drop that reference.
1773          * The test for TASK_DEAD must occur while the runqueue locks are
1774          * still held, otherwise prev could be scheduled on another cpu, die
1775          * there before we look at prev->state, and then the reference would
1776          * be dropped twice.
1777          *              Manfred Spraul <manfred@colorfullife.com>
1778          */
1779         prev_state = prev->state;
1780         finish_arch_switch(prev);
1781         finish_lock_switch(rq, prev);
1782         fire_sched_in_preempt_notifiers(current);
1783         if (mm)
1784                 mmdrop(mm);
1785         if (unlikely(prev_state == TASK_DEAD)) {
1786                 /*
1787                  * Remove function-return probe instances associated with this
1788                  * task and put them back on the free list.
1789                  */
1790                 kprobe_flush_task(prev);
1791                 put_task_struct(prev);
1792         }
1793 }
1794
1795 /**
1796  * schedule_tail - first thing a freshly forked thread must call.
1797  * @prev: the thread we just switched away from.
1798  */
1799 asmlinkage void schedule_tail(struct task_struct *prev)
1800         __releases(rq->lock)
1801 {
1802         struct rq *rq = this_rq();
1803
1804         finish_task_switch(rq, prev);
1805 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1806         /* In this case, finish_task_switch does not reenable preemption */
1807         preempt_enable();
1808 #endif
1809         if (current->set_child_tid)
1810                 put_user(current->pid, current->set_child_tid);
1811 }
1812
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 *
 * @rq:   runqueue the switch is performed on
 * @prev: task being switched out
 * @next: task being switched in
 *
 * The statement order below matters: the address space is switched first,
 * then the lockdep state of rq->lock is released early, then switch_to()
 * changes stacks, and finally finish_task_switch() runs.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_enter_lazy_cpu_mode();

        if (unlikely(!mm)) {
                /*
                 * next has no mm of its own: let it borrow prev's active
                 * mm, pin that mm with an extra mm_count reference and
                 * enter lazy TLB mode instead of switching address space.
                 */
                next->active_mm = oldmm;
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next);
        } else
                switch_mm(oldmm, mm, next);

        if (unlikely(!prev->mm)) {
                /*
                 * prev was itself running on a borrowed mm: detach it and
                 * hand the borrowed mm over to the runqueue.
                 * NOTE(review): presumably finish_task_switch() drops the
                 * reference stashed in rq->prev_mm -- confirm there.
                 */
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);

        barrier();
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
1865
1866 /*
1867  * nr_running, nr_uninterruptible and nr_context_switches:
1868  *
1869  * externally visible scheduler statistics: current number of runnable
1870  * threads, current number of uninterruptible-sleeping threads, total
1871  * number of context switches performed since bootup.
1872  */
1873 unsigned long nr_running(void)
1874 {
1875         unsigned long i, sum = 0;
1876
1877         for_each_online_cpu(i)
1878                 sum += cpu_rq(i)->nr_running;
1879
1880         return sum;
1881 }
1882
1883 unsigned long nr_uninterruptible(void)
1884 {
1885         unsigned long i, sum = 0;
1886
1887         for_each_possible_cpu(i)
1888                 sum += cpu_rq(i)->nr_uninterruptible;
1889
1890         /*
1891          * Since we read the counters lockless, it might be slightly
1892          * inaccurate. Do not allow it to go below zero though:
1893          */
1894         if (unlikely((long)sum < 0))
1895                 sum = 0;
1896
1897         return sum;
1898 }
1899
1900 unsigned long long nr_context_switches(void)
1901 {
1902         int i;
1903         unsigned long long sum = 0;
1904
1905         for_each_possible_cpu(i)
1906                 sum += cpu_rq(i)->nr_switches;
1907
1908         return sum;
1909 }
1910
1911 unsigned long nr_iowait(void)
1912 {
1913         unsigned long i, sum = 0;
1914
1915         for_each_possible_cpu(i)
1916                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1917
1918         return sum;
1919 }
1920
1921 unsigned long nr_active(void)
1922 {
1923         unsigned long i, running = 0, uninterruptible = 0;
1924
1925         for_each_online_cpu(i) {
1926                 running += cpu_rq(i)->nr_running;
1927                 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1928         }
1929
1930         if (unlikely((long)uninterruptible < 0))
1931                 uninterruptible = 0;
1932
1933         return running + uninterruptible;
1934 }
1935
1936 /*
1937  * Update rq->cpu_load[] statistics. This function is usually called every
1938  * scheduler tick (TICK_NSEC).
1939  */
1940 static void update_cpu_load(struct rq *this_rq)
1941 {
1942         u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1943         unsigned long total_load = this_rq->ls.load.weight;
1944         unsigned long this_load =  total_load;
1945         struct load_stat *ls = &this_rq->ls;
1946         u64 now = __rq_clock(this_rq);
1947         int i, scale;
1948
1949         this_rq->nr_load_updates++;
1950         if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1951                 goto do_avg;
1952
1953         /* Update delta_fair/delta_exec fields first */
1954         update_curr_load(this_rq, now);
1955
1956         fair_delta64 = ls->delta_fair + 1;
1957         ls->delta_fair = 0;
1958
1959         exec_delta64 = ls->delta_exec + 1;
1960         ls->delta_exec = 0;
1961
1962         sample_interval64 = now - ls->load_update_last;
1963         ls->load_update_last = now;
1964
1965         if ((s64)sample_interval64 < (s64)TICK_NSEC)
1966                 sample_interval64 = TICK_NSEC;
1967
1968         if (exec_delta64 > sample_interval64)
1969                 exec_delta64 = sample_interval64;
1970
1971         idle_delta64 = sample_interval64 - exec_delta64;
1972
1973         tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1974         tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1975
1976         this_load = (unsigned long)tmp64;
1977
1978 do_avg:
1979
1980         /* Update our load: */
1981         for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1982                 unsigned long old_load, new_load;
1983
1984                 /* scale is effectively 1 << i now, and >> i divides by scale */
1985
1986                 old_load = this_rq->cpu_load[i];
1987                 new_load = this_load;
1988
1989                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1990         }
1991 }
1992
1993 #ifdef CONFIG_SMP
1994
1995 /*
1996  * double_rq_lock - safely lock two runqueues
1997  *
1998  * Note this does not disable interrupts like task_rq_lock,
1999  * you need to do so manually before calling.
2000  */
2001 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2002         __acquires(rq1->lock)
2003         __acquires(rq2->lock)
2004 {
2005         BUG_ON(!irqs_disabled());
2006         if (rq1 == rq2) {
2007                 spin_lock(&rq1->lock);
2008                 __acquire(rq2->lock);   /* Fake it out ;) */
2009         } else {
2010                 if (rq1 < rq2) {
2011                         spin_lock(&rq1->lock);
2012                         spin_lock(&rq2->lock);
2013                 } else {
2014                         spin_lock(&rq2->lock);
2015                         spin_lock(&rq1->lock);
2016                 }
2017         }
2018 }
2019
2020 /*
2021  * double_rq_unlock - safely unlock two runqueues
2022  *
2023  * Note this does not restore interrupts like task_rq_unlock,
2024  * you need to do so manually after calling.
2025  */
2026 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2027         __releases(rq1->lock)
2028         __releases(rq2->lock)
2029 {
2030         spin_unlock(&rq1->lock);
2031         if (rq1 != rq2)
2032                 spin_unlock(&rq2->lock);
2033         else
2034                 __release(rq2->lock);
2035 }
2036
2037 /*
2038  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2039  */
2040 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2041         __releases(this_rq->lock)
2042         __acquires(busiest->lock)
2043         __acquires(this_rq->lock)
2044 {
2045         if (unlikely(!irqs_disabled())) {
2046                 /* printk() doesn't work good under rq->lock */
2047                 spin_unlock(&this_rq->lock);
2048                 BUG_ON(1);
2049         }
2050         if (unlikely(!spin_trylock(&busiest->lock))) {
2051                 if (busiest < this_rq) {
2052                         spin_unlock(&this_rq->lock);
2053                         spin_lock(&busiest->lock);
2054                         spin_lock(&this_rq->lock);
2055                 } else
2056                         spin_lock(&busiest->lock);
2057         }
2058 }
2059
2060 /*
2061  * If dest_cpu is allowed for this process, migrate the task to it.
2062  * This is accomplished by forcing the cpu_allowed mask to only
2063  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
2064  * the cpu_allowed mask is restored.
2065  */
2066 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2067 {
2068         struct migration_req req;
2069         unsigned long flags;
2070         struct rq *rq;
2071
2072         rq = task_rq_lock(p, &flags);
2073         if (!cpu_isset(dest_cpu, p->cpus_allowed)
2074             || unlikely(cpu_is_offline(dest_cpu)))
2075                 goto out;
2076
2077         /* force the process onto the specified CPU */
2078         if (migrate_task(p, dest_cpu, &req)) {
2079                 /* Need to wait for migration thread (might exit: take ref). */
2080                 struct task_struct *mt = rq->migration_thread;
2081
2082                 get_task_struct(mt);
2083                 task_rq_unlock(rq, &flags);
2084                 wake_up_process(mt);
2085                 put_task_struct(mt);
2086                 wait_for_completion(&req.done);
2087
2088                 return;
2089         }
2090 out:
2091         task_rq_unlock(rq, &flags);
2092 }
2093
2094 /*
2095  * sched_exec - execve() is a valuable balancing opportunity, because at
2096  * this point the task has the smallest effective memory and cache footprint.
2097  */
2098 void sched_exec(void)
2099 {
2100         int new_cpu, this_cpu = get_cpu();
2101         new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2102         put_cpu();
2103         if (new_cpu != this_cpu)
2104                 sched_migrate_task(current, new_cpu);
2105 }
2106
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * @src_rq:   runqueue the task is removed from
 * @p:        task being moved
 * @this_rq:  runqueue the task is added to
 * @this_cpu: CPU recorded as the task's new CPU
 *
 * The task is dequeued, re-homed and enqueued, in that order, and then
 * checked against this_rq's current task for preemption.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
                      struct rq *this_rq, int this_cpu)
{
        deactivate_task(src_rq, p, 0);
        set_task_cpu(p, this_cpu);
        activate_task(this_rq, p, 0);
        /*
         * Note that idle threads have a prio of MAX_PRIO, for this test
         * to be always true for them.
         */
        check_preempt_curr(this_rq, p);
}
2123
2124 /*
2125  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2126  */
2127 static
2128 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2129                      struct sched_domain *sd, enum cpu_idle_type idle,
2130                      int *all_pinned)
2131 {
2132         /*
2133          * We do not migrate tasks that are:
2134          * 1) running (obviously), or
2135          * 2) cannot be migrated to this CPU due to cpus_allowed, or
2136          * 3) are cache-hot on their current CPU.
2137          */
2138         if (!cpu_isset(this_cpu, p->cpus_allowed))
2139                 return 0;
2140         *all_pinned = 0;
2141
2142         if (task_running(rq, p))
2143                 return 0;
2144
2145         /*
2146          * Aggressive migration if too many balance attempts have failed:
2147          */
2148         if (sd->nr_balance_failed > sd->cache_nice_tries)
2149                 return 1;
2150
2151         return 1;
2152 }
2153
/*
 * balance_tasks - pull tasks supplied by an iterator from busiest to this_rq.
 *
 * @this_rq, @this_cpu: destination runqueue and its CPU
 * @busiest:            source runqueue
 * @max_nr_move:        upper bound on the number of tasks to pull
 * @max_load_move:      upper bound on the weighted load to pull
 * @sd, @idle:          balancing domain and idle type, passed on to
 *                      can_migrate_task() and used for schedstats
 * @all_pinned:         if non-NULL, receives 1 when the walk started and no
 *                      visited task was allowed on this_cpu, else 0
 * @load_moved:         receives max_load_move minus the remaining budget
 * @this_best_prio:     best (smallest) prio value on the destination so far
 * @best_prio:          best prio value on the source queue
 * @best_prio_seen:     non-zero once a task with best_prio has been skipped
 * @iterator:           start()/next() callbacks yielding candidate tasks
 *
 * Returns the number of tasks pulled.
 */
static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                      unsigned long max_nr_move, unsigned long max_load_move,
                      struct sched_domain *sd, enum cpu_idle_type idle,
                      int *all_pinned, unsigned long *load_moved,
                      int this_best_prio, int best_prio, int best_prio_seen,
                      struct rq_iterator *iterator)
{
        int pulled = 0, pinned = 0, skip_for_load;
        struct task_struct *p;
        long rem_load_move = max_load_move;

        if (max_nr_move == 0 || max_load_move == 0)
                goto out;

        /* Assume everything is pinned until can_migrate_task() says otherwise. */
        pinned = 1;

        /*
         * Start the load-balancing iterator:
         */
        p = iterator->start(iterator->arg);
next:
        if (!p)
                goto out;
        /*
         * To help distribute high priority tasks across CPUs we don't
         * skip a task if it will be the highest priority task (i.e. smallest
         * prio value) on its new queue regardless of its load weight
         */
        skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
                                                         SCHED_LOAD_SCALE_FUZZ;
        /*
         * A too-heavy task that would become the best-prio task here is
         * still skipped only if it is the first best_prio task met on the
         * source queue.
         */
        if (skip_for_load && p->prio < this_best_prio)
                skip_for_load = !best_prio_seen && p->prio == best_prio;
        if (skip_for_load ||
            !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {

                best_prio_seen |= p->prio == best_prio;
                p = iterator->next(iterator->arg);
                goto next;
        }

        pull_task(busiest, p, this_rq, this_cpu);
        pulled++;
        rem_load_move -= p->se.load.weight;

        /*
         * We only want to steal up to the prescribed number of tasks
         * and the prescribed amount of weighted load.
         */
        if (pulled < max_nr_move && rem_load_move > 0) {
                if (p->prio < this_best_prio)
                        this_best_prio = p->prio;
                p = iterator->next(iterator->arg);
                goto next;
        }
out:
        /*
         * Right now, this is the only place pull_task() is called,
         * so we can safely collect pull_task() stats here rather than
         * inside pull_task().
         */
        schedstat_add(sd, lb_gained[idle], pulled);

        if (all_pinned)
                *all_pinned = pinned;
        /* rem_load_move may have gone negative, so this can exceed max_load_move. */
        *load_moved = max_load_move - rem_load_move;
        return pulled;
}
2221
2222 /*
2223  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2224  * load from busiest to this_rq, as part of a balancing operation within
2225  * "domain". Returns the number of tasks moved.
2226  *
2227  * Called with both runqueues locked.
2228  */
2229 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2230                       unsigned long max_nr_move, unsigned long max_load_move,
2231                       struct sched_domain *sd, enum cpu_idle_type idle,
2232                       int *all_pinned)
2233 {
2234         struct sched_class *class = sched_class_highest;
2235         unsigned long load_moved, total_nr_moved = 0, nr_moved;
2236         long rem_load_move = max_load_move;
2237
2238         do {
2239                 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2240                                 max_nr_move, (unsigned long)rem_load_move,
2241                                 sd, idle, all_pinned, &load_moved);
2242                 total_nr_moved += nr_moved;
2243                 max_nr_move -= nr_moved;
2244                 rem_load_move -= load_moved;
2245                 class = class->next;
2246         } while (class && max_nr_move && rem_load_move > 0);
2247
2248         return total_nr_moved;
2249 }
2250
/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 *
 * @sd:        domain whose groups are scanned
 * @this_cpu:  CPU on whose behalf balancing is done
 * @imbalance: out: weighted load to move; set to 0 when NULL is returned
 * @idle:      idle type of this_cpu, selects which cpu_load index is used
 * @sd_idle:   in/out: cleared when any scanned runqueue has running tasks
 * @cpus:      only CPUs set in this mask are taken into account
 * @balance:   if non-NULL, set to 0 when this_cpu is not the CPU chosen to
 *             balance for its local group (not done for CPU_NEWLY_IDLE)
 *
 * Returns the group to pull from, or NULL when no pull is warranted. With
 * CONFIG_SCHED_MC or CONFIG_SCHED_SMT the balanced path may instead return
 * the least loaded non-idle group for power-savings balancing.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
                   int *sd_idle, cpumask_t *cpus, int *balance)
{
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
        unsigned long max_pull;
        unsigned long busiest_load_per_task, busiest_nr_running;
        unsigned long this_load_per_task, this_nr_running;
        int load_idx;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        int power_savings_balance = 1;
        unsigned long leader_nr_running = 0, min_load_per_task = 0;
        unsigned long min_nr_running = ULONG_MAX;
        struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;
        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
                load_idx = sd->newidle_idx;
        else
                load_idx = sd->idle_idx;

        /* Walk the circular list of groups exactly once. */
        do {
                unsigned long load, group_capacity;
                int local_group;
                int i;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;

                local_group = cpu_isset(this_cpu, group->cpumask);

                if (local_group)
                        balance_cpu = first_cpu(group->cpumask);

                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;

                for_each_cpu_mask(i, group->cpumask) {
                        struct rq *rq;

                        if (!cpu_isset(i, *cpus))
                                continue;

                        rq = cpu_rq(i);

                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;

                        /* Bias balancing toward cpus of our domain */
                        if (local_group) {
                                if (idle_cpu(i) && !first_idle_cpu) {
                                        first_idle_cpu = 1;
                                        balance_cpu = i;
                                }

                                load = target_load(i, load_idx);
                        } else
                                load = source_load(i, load_idx);

                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);
                }

                /*
                 * First idle cpu or the first cpu(busiest) in this sched group
                 * is eligible for doing load balancing at this and above
                 * domains. In the newly idle case, we will allow all the cpu's
                 * to do the newly idle load balance.
                 */
                if (idle != CPU_NEWLY_IDLE && local_group &&
                    balance_cpu != this_cpu && balance) {
                        *balance = 0;
                        goto ret;
                }

                total_load += avg_load;
                total_pwr += group->__cpu_power;

                /* Adjust by relative CPU power of the group */
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);

                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

                /*
                 * Record the local group's figures, or remember this group
                 * as busiest if it has the highest load so far and runs more
                 * tasks than its capacity. The *_load_per_task variables
                 * hold group totals here; they are divided further down.
                 */
                if (local_group) {
                        this_load = avg_load;
                        this = group;
                        this_nr_running = sum_nr_running;
                        this_load_per_task = sum_weighted_load;
                } else if (avg_load > max_load &&
                           sum_nr_running > group_capacity) {
                        max_load = avg_load;
                        busiest = group;
                        busiest_nr_running = sum_nr_running;
                        busiest_load_per_task = sum_weighted_load;
                }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
                /*
                 * Busy processors will not participate in power savings
                 * balance.
                 */
                if (idle == CPU_NOT_IDLE ||
                                !(sd->flags & SD_POWERSAVINGS_BALANCE))
                        goto group_next;

                /*
                 * If the local group is idle or completely loaded
                 * no need to do power savings balance at this domain
                 */
                if (local_group && (this_nr_running >= group_capacity ||
                                    !this_nr_running))
                        power_savings_balance = 0;

                /*
                 * If a group is already running at full capacity or idle,
                 * don't include that group in power savings calculations
                 */
                if (!power_savings_balance || sum_nr_running >= group_capacity
                    || !sum_nr_running)
                        goto group_next;

                /*
                 * Calculate the group which has the least non-idle load.
                 * This is the group from where we need to pick up the load
                 * for saving power
                 *
                 * group_min is still NULL on the first pass, but then
                 * min_nr_running is ULONG_MAX, so the first comparison
                 * short-circuits before group_min is dereferenced.
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
                     first_cpu(group->cpumask) <
                     first_cpu(group_min->cpumask))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                                                sum_nr_running;
                }

                /*
                 * Calculate the group which is almost near its
                 * capacity but still has some space to pick up some load
                 * from other group and save more power
                 *
                 * sum_nr_running is non-zero here and leader_nr_running
                 * starts at 0, so group_leader is assigned before the
                 * tie-break can dereference it.
                 */
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
                             first_cpu(group->cpumask) >
                              first_cpu(group_leader->cpumask))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
                }
group_next:
#endif
                group = group->next;
        } while (group != sd->groups);

        if (!busiest || this_load >= max_load || busiest_nr_running == 0)
                goto out_balanced;

        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

        if (this_load >= avg_load ||
                        100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;

        /* From here on busiest_load_per_task really is a per-task average. */
        busiest_load_per_task /= busiest_nr_running;
        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
         * reduce the max loaded cpu below the average load, as either of these
         * actions would just result in more rebalancing later, and ping-pong
         * tasks around. Thus we look for the minimum possible imbalance.
         * Negative imbalances (*we* are more loaded than anyone else) will
         * be counted as no imbalance for these purposes -- we can't fix that
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
        if (max_load <= busiest_load_per_task)
                goto out_balanced;

        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
        if (max_load < avg_load) {
                *imbalance = 0;
                goto small_imbalance;
        }

        /* Don't want to pull so many tasks that a group would go idle */
        max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

        /* How much load to actually move to equalise the imbalance */
        *imbalance = min(max_pull * busiest->__cpu_power,
                                (avg_load - this_load) * this->__cpu_power)
                        / SCHED_LOAD_SCALE;

        /*
         * if *imbalance is less than the average load per runnable task
         * there is no guarantee that any tasks will be moved so we'll have
         * a think about bumping its value to force at least one task to be
         * moved
         */
        if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
                unsigned long tmp, pwr_now, pwr_move;
                unsigned int imbn;

small_imbalance:
                pwr_move = pwr_now = 0;
                imbn = 2;
                if (this_nr_running) {
                        this_load_per_task /= this_nr_running;
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
                        this_load_per_task = SCHED_LOAD_SCALE;

                /* Gap is at least imbn busiest-tasks wide: move one task. */
                if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
                }

                /*
                 * OK, we don't have enough imbalance to justify moving tasks,
                 * however we may be able to increase total CPU power used by
                 * moving them.
                 */

                pwr_now += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load);
                pwr_now += this->__cpu_power *
                                min(this_load_per_task, this_load);
                pwr_now /= SCHED_LOAD_SCALE;

                /* Amount of load we'd subtract */
                tmp = sg_div_cpu_power(busiest,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                if (max_load > tmp)
                        pwr_move += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load - tmp);

                /* Amount of load we'd add */
                if (max_load * busiest->__cpu_power <
                                busiest_load_per_task * SCHED_LOAD_SCALE)
                        tmp = sg_div_cpu_power(this,
                                        max_load * busiest->__cpu_power);
                else
                        tmp = sg_div_cpu_power(this,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                pwr_move += this->__cpu_power *
                                min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;

                /* Move if we gain throughput */
                if (pwr_move <= pwr_now)
                        goto out_balanced;

                *imbalance = busiest_load_per_task;
        }

        return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
                goto ret;

        /*
         * Load is balanced, but if the local group is the power-savings
         * leader, pull from the least loaded non-idle group instead.
         */
        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
                return group_min;
        }
#endif
ret:
        *imbalance = 0;
        return NULL;
}
2541
2542 /*
2543  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2544  */
2545 static struct rq *
2546 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2547                    unsigned long imbalance, cpumask_t *cpus)
2548 {
2549         struct rq *busiest = NULL, *rq;
2550         unsigned long max_load = 0;
2551         int i;
2552
2553         for_each_cpu_mask(i, group->cpumask) {
2554                 unsigned long wl;
2555
2556                 if (!cpu_isset(i, *cpus))
2557                         continue;
2558
2559                 rq = cpu_rq(i);
2560                 wl = weighted_cpuload(i);
2561
2562                 if (rq->nr_running == 1 && wl > imbalance)
2563                         continue;
2564
2565                 if (wl > max_load) {
2566                         max_load = wl;
2567                         busiest = rq;
2568                 }
2569         }
2570
2571         return busiest;
2572 }
2573
/*
 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary;
 * it only needs to be large enough.
 */
#define MAX_PINNED_INTERVAL     512
2579
/*
 * minus_1_or_zero - decrement n without wrapping below zero.
 *
 * Returns n - 1 for n > 0 and 0 for n == 0.
 */
static inline unsigned long minus_1_or_zero(unsigned long n)
{
        if (n == 0)
                return 0;

        return n - 1;
}
2584
2585 /*
2586  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2587  * tasks if there is an imbalance.
2588  */
2589 static int load_balance(int this_cpu, struct rq *this_rq,
2590                         struct sched_domain *sd, enum cpu_idle_type idle,
2591                         int *balance)
2592 {
2593         int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2594         struct sched_group *group;
2595         unsigned long imbalance;
2596         struct rq *busiest;
2597         cpumask_t cpus = CPU_MASK_ALL;
2598         unsigned long flags;
2599
2600         /*
2601          * When power savings policy is enabled for the parent domain, idle
2602          * sibling can pick up load irrespective of busy siblings. In this case,
2603          * let the state of idle sibling percolate up as CPU_IDLE, instead of
2604          * portraying it as CPU_NOT_IDLE.
2605          */
2606         if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2607             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2608                 sd_idle = 1;
2609
2610         schedstat_inc(sd, lb_cnt[idle]);
2611
2612 redo:
2613         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2614                                    &cpus, balance);
2615
2616         if (*balance == 0)
2617                 goto out_balanced;
2618
2619         if (!group) {
2620                 schedstat_inc(sd, lb_nobusyg[idle]);
2621                 goto out_balanced;
2622         }
2623
2624         busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2625         if (!busiest) {
2626                 schedstat_inc(sd, lb_nobusyq[idle]);
2627                 goto out_balanced;
2628         }
2629
2630         BUG_ON(busiest == this_rq);
2631
2632         schedstat_add(sd, lb_imbalance[idle], imbalance);
2633
2634         nr_moved = 0;
2635         if (busiest->nr_running > 1) {
2636                 /*
2637                  * Attempt to move tasks. If find_busiest_group has found
2638                  * an imbalance but busiest->nr_running <= 1, the group is
2639                  * still unbalanced. nr_moved simply stays zero, so it is
2640                  * correctly treated as an imbalance.
2641                  */
2642                 local_irq_save(flags);
2643                 double_rq_lock(this_rq, busiest);
2644                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2645                                       minus_1_or_zero(busiest->nr_running),
2646                                       imbalance, sd, idle, &all_pinned);
2647                 double_rq_unlock(this_rq, busiest);
2648                 local_irq_restore(flags);
2649
2650                 /*
2651                  * some other cpu did the load balance for us.
2652                  */
2653                 if (nr_moved && this_cpu != smp_processor_id())
2654                         resched_cpu(this_cpu);
2655
2656                 /* All tasks on this runqueue were pinned by CPU affinity */
2657                 if (unlikely(all_pinned)) {
2658                         cpu_clear(cpu_of(busiest), cpus);
2659                         if (!cpus_empty(cpus))
2660                                 goto redo;
2661                         goto out_balanced;
2662                 }
2663         }
2664
2665         if (!nr_moved) {
2666                 schedstat_inc(sd, lb_failed[idle]);
2667                 sd->nr_balance_failed++;
2668
2669                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2670
2671                         spin_lock_irqsave(&busiest->lock, flags);
2672
2673                         /* don't kick the migration_thread, if the curr
2674                          * task on busiest cpu can't be moved to this_cpu
2675                          */
2676                         if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2677                                 spin_unlock_irqrestore(&busiest->lock, flags);
2678                                 all_pinned = 1;
2679                                 goto out_one_pinned;
2680                         }
2681
2682                         if (!busiest->active_balance) {
2683                                 busiest->active_balance = 1;
2684                                 busiest->push_cpu = this_cpu;
2685                                 active_balance = 1;
2686                         }
2687                         spin_unlock_irqrestore(&busiest->lock, flags);
2688                         if (active_balance)
2689                                 wake_up_process(busiest->migration_thread);
2690
2691                         /*
2692                          * We've kicked active balancing, reset the failure
2693                          * counter.
2694                          */
2695                         sd->nr_balance_failed = sd->cache_nice_tries+1;
2696                 }
2697         } else
2698                 sd->nr_balance_failed = 0;
2699
2700         if (likely(!active_balance)) {
2701                 /* We were unbalanced, so reset the balancing interval */
2702                 sd->balance_interval = sd->min_interval;
2703         } else {
2704                 /*
2705                  * If we've begun active balancing, start to back off. This
2706                  * case may not be covered by the all_pinned logic if there
2707                  * is only 1 task on the busy runqueue (because we don't call
2708                  * move_tasks).
2709                  */
2710                 if (sd->balance_interval < sd->max_interval)
2711                         sd->balance_interval *= 2;
2712         }
2713
2714         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2715             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2716                 return -1;
2717         return nr_moved;
2718
2719 out_balanced:
2720         schedstat_inc(sd, lb_balanced[idle]);
2721
2722         sd->nr_balance_failed = 0;
2723
2724 out_one_pinned:
2725         /* tune up the balancing interval */
2726         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2727                         (sd->balance_interval < sd->max_interval))
2728                 sd->balance_interval *= 2;
2729
2730         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2731             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2732                 return -1;
2733         return 0;
2734 }
2735
2736 /*
2737  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2738  * tasks if there is an imbalance.
2739  *
2740  * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2741  * this_rq is locked.
2742  */
2743 static int
2744 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2745 {
2746         struct sched_group *group;
2747         struct rq *busiest = NULL;
2748         unsigned long imbalance;
2749         int nr_moved = 0;
2750         int sd_idle = 0;
2751         int all_pinned = 0;
2752         cpumask_t cpus = CPU_MASK_ALL;
2753
2754         /*
2755          * When power savings policy is enabled for the parent domain, idle
2756          * sibling can pick up load irrespective of busy siblings. In this case,
2757          * let the state of idle sibling percolate up as IDLE, instead of
2758          * portraying it as CPU_NOT_IDLE.
2759          */
2760         if (sd->flags & SD_SHARE_CPUPOWER &&
2761             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2762                 sd_idle = 1;
2763
2764         schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2765 redo:
2766         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2767                                    &sd_idle, &cpus, NULL);
2768         if (!group) {
2769                 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2770                 goto out_balanced;
2771         }
2772
2773         busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2774                                 &cpus);
2775         if (!busiest) {
2776                 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2777                 goto out_balanced;
2778         }
2779
2780         BUG_ON(busiest == this_rq);
2781
2782         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2783
2784         nr_moved = 0;
2785         if (busiest->nr_running > 1) {
2786                 /* Attempt to move tasks */
2787                 double_lock_balance(this_rq, busiest);
2788                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2789                                         minus_1_or_zero(busiest->nr_running),
2790                                         imbalance, sd, CPU_NEWLY_IDLE,
2791                                         &all_pinned);
2792                 spin_unlock(&busiest->lock);
2793
2794                 if (unlikely(all_pinned)) {
2795                         cpu_clear(cpu_of(busiest), cpus);
2796                         if (!cpus_empty(cpus))
2797                                 goto redo;
2798                 }
2799         }
2800
2801         if (!nr_moved) {
2802                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2803                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2804                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2805                         return -1;
2806         } else
2807                 sd->nr_balance_failed = 0;
2808
2809         return nr_moved;
2810
2811 out_balanced:
2812         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2813         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2814             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2815                 return -1;
2816         sd->nr_balance_failed = 0;
2817
2818         return 0;
2819 }
2820
2821 /*
2822  * idle_balance is called by schedule() if this_cpu is about to become
2823  * idle. Attempts to pull tasks from other CPUs.
2824  */
2825 static void idle_balance(int this_cpu, struct rq *this_rq)
2826 {
2827         struct sched_domain *sd;
2828         int pulled_task = -1;
2829         unsigned long next_balance = jiffies + HZ;
2830
2831         for_each_domain(this_cpu, sd) {
2832                 unsigned long interval;
2833
2834                 if (!(sd->flags & SD_LOAD_BALANCE))
2835                         continue;
2836
2837                 if (sd->flags & SD_BALANCE_NEWIDLE)
2838                         /* If we've pulled tasks over stop searching: */
2839                         pulled_task = load_balance_newidle(this_cpu,
2840                                                                 this_rq, sd);
2841
2842                 interval = msecs_to_jiffies(sd->balance_interval);
2843                 if (time_after(next_balance, sd->last_balance + interval))
2844                         next_balance = sd->last_balance + interval;
2845                 if (pulled_task)
2846                         break;
2847         }
2848         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2849                 /*
2850                  * We are going idle. next_balance may be set based on
2851                  * a busy processor. So reset next_balance.
2852                  */
2853                 this_rq->next_balance = next_balance;
2854         }
2855 }
2856
2857 /*
2858  * active_load_balance is run by migration threads. It pushes running tasks
2859  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2860  * running on each physical CPU where possible, and avoids physical /
2861  * logical imbalances.
2862  *
2863  * Called with busiest_rq locked.
2864  */
2865 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2866 {
2867         int target_cpu = busiest_rq->push_cpu;
2868         struct sched_domain *sd;
2869         struct rq *target_rq;
2870
2871         /* Is there any task to move? */
2872         if (busiest_rq->nr_running <= 1)
2873                 return;
2874
2875         target_rq = cpu_rq(target_cpu);
2876
2877         /*
2878          * This condition is "impossible", if it occurs
2879          * we need to fix it.  Originally reported by
2880          * Bjorn Helgaas on a 128-cpu setup.
2881          */
2882         BUG_ON(busiest_rq == target_rq);
2883
2884         /* move a task from busiest_rq to target_rq */
2885         double_lock_balance(busiest_rq, target_rq);
2886
2887         /* Search for an sd spanning us and the target CPU. */
2888         for_each_domain(target_cpu, sd) {
2889                 if ((sd->flags & SD_LOAD_BALANCE) &&
2890                     cpu_isset(busiest_cpu, sd->span))
2891                                 break;
2892         }
2893
2894         if (likely(sd)) {
2895                 schedstat_inc(sd, alb_cnt);
2896
2897                 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2898                                ULONG_MAX, sd, CPU_IDLE, NULL))
2899                         schedstat_inc(sd, alb_pushed);
2900                 else
2901                         schedstat_inc(sd, alb_failed);
2902         }
2903         spin_unlock(&target_rq->lock);
2904 }
2905
2906 #ifdef CONFIG_NO_HZ
2907 static struct {
2908         atomic_t load_balancer;
2909         cpumask_t  cpu_mask;
2910 } nohz ____cacheline_aligned = {
2911         .load_balancer = ATOMIC_INIT(-1),
2912         .cpu_mask = CPU_MASK_NONE,
2913 };
2914
2915 /*
2916  * This routine will try to nominate the ilb (idle load balancing)
2917  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2918  * load balancing on behalf of all those cpus. If all the cpus in the system
2919  * go into this tickless mode, then there will be no ilb owner (as there is
2920  * no need for one) and all the cpus will sleep till the next wakeup event
2921  * arrives...
2922  *
2923  * For the ilb owner, tick is not stopped. And this tick will be used
2924  * for idle load balancing. ilb owner will still be part of
2925  * nohz.cpu_mask..
2926  *
2927  * While stopping the tick, this cpu will become the ilb owner if there
2928  * is no other owner. And will be the owner till that cpu becomes busy
2929  * or if all cpus in the system stop their ticks at which point
2930  * there is no need for ilb owner.
2931  *
2932  * When the ilb owner becomes busy, it nominates another owner, during the
2933  * next busy scheduler_tick()
2934  */
2935 int select_nohz_load_balancer(int stop_tick)
2936 {
2937         int cpu = smp_processor_id();
2938
2939         if (stop_tick) {
2940                 cpu_set(cpu, nohz.cpu_mask);
2941                 cpu_rq(cpu)->in_nohz_recently = 1;
2942
2943                 /*
2944                  * If we are going offline and still the leader, give up!
2945                  */
2946                 if (cpu_is_offline(cpu) &&
2947                     atomic_read(&nohz.load_balancer) == cpu) {
2948                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2949                                 BUG();
2950                         return 0;
2951                 }
2952
2953                 /* time for ilb owner also to sleep */
2954                 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2955                         if (atomic_read(&nohz.load_balancer) == cpu)
2956                                 atomic_set(&nohz.load_balancer, -1);
2957                         return 0;
2958                 }
2959
2960                 if (atomic_read(&nohz.load_balancer) == -1) {
2961                         /* make me the ilb owner */
2962                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2963                                 return 1;
2964                 } else if (atomic_read(&nohz.load_balancer) == cpu)
2965                         return 1;
2966         } else {
2967                 if (!cpu_isset(cpu, nohz.cpu_mask))
2968                         return 0;
2969
2970                 cpu_clear(cpu, nohz.cpu_mask);
2971
2972                 if (atomic_read(&nohz.load_balancer) == cpu)
2973                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2974                                 BUG();
2975         }
2976         return 0;
2977 }
2978 #endif
2979
2980 static DEFINE_SPINLOCK(balancing);
2981
2982 /*
2983  * It checks each scheduling domain to see if it is due to be balanced,
2984  * and initiates a balancing operation if so.
2985  *
2986  * Balancing parameters are set up in arch_init_sched_domains.
2987  */
2988 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2989 {
2990         int balance = 1;
2991         struct rq *rq = cpu_rq(cpu);
2992         unsigned long interval;
2993         struct sched_domain *sd;
2994         /* Earliest time when we have to do rebalance again */
2995         unsigned long next_balance = jiffies + 60*HZ;
2996
2997         for_each_domain(cpu, sd) {
2998                 if (!(sd->flags & SD_LOAD_BALANCE))
2999                         continue;
3000
3001                 interval = sd->balance_interval;
3002                 if (idle != CPU_IDLE)
3003                         interval *= sd->busy_factor;
3004
3005                 /* scale ms to jiffies */
3006                 interval = msecs_to_jiffies(interval);
3007                 if (unlikely(!interval))
3008                         interval = 1;
3009                 if (interval > HZ*NR_CPUS/10)
3010                         interval = HZ*NR_CPUS/10;
3011
3012
3013                 if (sd->flags & SD_SERIALIZE) {
3014                         if (!spin_trylock(&balancing))
3015                                 goto out;
3016                 }
3017
3018                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3019                         if (load_balance(cpu, rq, sd, idle, &balance)) {
3020                                 /*
3021                                  * We've pulled tasks over so either we're no
3022                                  * longer idle, or one of our SMT siblings is
3023                                  * not idle.
3024                                  */
3025                                 idle = CPU_NOT_IDLE;
3026                         }
3027                         sd->last_balance = jiffies;
3028                 }
3029                 if (sd->flags & SD_SERIALIZE)
3030                         spin_unlock(&balancing);
3031 out:
3032                 if (time_after(next_balance, sd->last_balance + interval))
3033                         next_balance = sd->last_balance + interval;
3034
3035                 /*
3036                  * Stop the load balance at this level. There is another
3037                  * CPU in our sched group which is doing load balancing more
3038                  * actively.
3039                  */
3040                 if (!balance)
3041                         break;
3042         }
3043         rq->next_balance = next_balance;
3044 }
3045
3046 /*
3047  * run_rebalance_domains is triggered when needed from the scheduler tick.
3048  * In CONFIG_NO_HZ case, the idle load balance owner will do the
3049  * rebalancing for all the cpus for whom scheduler ticks are stopped.
3050  */
3051 static void run_rebalance_domains(struct softirq_action *h)
3052 {
3053         int this_cpu = smp_processor_id();
3054         struct rq *this_rq = cpu_rq(this_cpu);
3055         enum cpu_idle_type idle = this_rq->idle_at_tick ?
3056                                                 CPU_IDLE : CPU_NOT_IDLE;
3057
3058         rebalance_domains(this_cpu, idle);
3059
3060 #ifdef CONFIG_NO_HZ
3061         /*
3062          * If this cpu is the owner for idle load balancing, then do the
3063          * balancing on behalf of the other idle cpus whose ticks are
3064          * stopped.
3065          */
3066         if (this_rq->idle_at_tick &&
3067             atomic_read(&nohz.load_balancer) == this_cpu) {
3068                 cpumask_t cpus = nohz.cpu_mask;
3069                 struct rq *rq;
3070                 int balance_cpu;
3071
3072                 cpu_clear(this_cpu, cpus);
3073                 for_each_cpu_mask(balance_cpu, cpus) {
3074                         /*
3075                          * If this cpu gets work to do, stop the load balancing
3076                          * work being done for other cpus. Next load
3077                          * balancing owner will pick it up.
3078                          */
3079                         if (need_resched())
3080                                 break;
3081
3082                         rebalance_domains(balance_cpu, SCHED_IDLE);
3083
3084                         rq = cpu_rq(balance_cpu);
3085                         if (time_after(this_rq->next_balance, rq->next_balance))
3086                                 this_rq->next_balance = rq->next_balance;
3087                 }
3088         }
3089 #endif
3090 }
3091
3092 /*
3093  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3094  *
3095  * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3096  * idle load balancing owner or decide to stop the periodic load balancing,
3097  * if the whole system is idle.
3098  */
3099 static inline void trigger_load_balance(struct rq *rq, int cpu)
3100 {
3101 #ifdef CONFIG_NO_HZ
3102         /*
3103          * If we were in the nohz mode recently and busy at the current
3104          * scheduler tick, then check if we need to nominate new idle
3105          * load balancer.
3106          */
3107         if (rq->in_nohz_recently && !rq->idle_at_tick) {
3108                 rq->in_nohz_recently = 0;
3109
3110                 if (atomic_read(&nohz.load_balancer) == cpu) {
3111                         cpu_clear(cpu, nohz.cpu_mask);
3112                         atomic_set(&nohz.load_balancer, -1);
3113                 }
3114
3115                 if (atomic_read(&nohz.load_balancer) == -1) {
3116                         /*
3117                          * simple selection for now: Nominate the
3118                          * first cpu in the nohz list to be the next
3119                          * ilb owner.
3120                          *
3121                          * TBD: Traverse the sched domains and nominate
3122                          * the nearest cpu in the nohz.cpu_mask.
3123                          */
3124                         int ilb = first_cpu(nohz.cpu_mask);
3125
3126                         if (ilb != NR_CPUS)
3127                                 resched_cpu(ilb);
3128                 }
3129         }
3130
3131         /*
3132          * If this cpu is idle and doing idle load balancing for all the
3133          * cpus with ticks stopped, is it time for that to stop?
3134          */
3135         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3136             cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3137                 resched_cpu(cpu);
3138                 return;
3139         }
3140
3141         /*
3142          * If this cpu is idle and the idle load balancing is done by
3143          * someone else, then no need raise the SCHED_SOFTIRQ
3144          */
3145         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3146             cpu_isset(cpu, nohz.cpu_mask))
3147                 return;
3148 #endif
3149         if (time_after_eq(jiffies, rq->next_balance))
3150                 raise_softirq(SCHED_SOFTIRQ);
3151 }
3152
3153 #else   /* CONFIG_SMP */
3154
/*
 * UP build: there are no other CPUs to balance against, so going idle
 * requires no work here.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
	/* intentionally empty */
}
3161
3162 /* Avoid "used but not defined" warning on UP */
3163 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3164                       unsigned long max_nr_move, unsigned long max_load_move,
3165                       struct sched_domain *sd, enum cpu_idle_type idle,
3166                       int *all_pinned, unsigned long *load_moved,
3167                       int this_best_prio, int best_prio, int best_prio_seen,
3168                       struct rq_iterator *iterator)
3169 {
3170         *load_moved = 0;
3171
3172         return 0;
3173 }
3174
3175 #endif
3176
/*
 * Per-CPU kernel statistics, exported as a per-cpu symbol.
 * NOTE(review): the account_*_time() functions below update the cpustat
 * member through kstat_this_cpu, which presumably resolves to this
 * variable -- confirm against the kernel_stat header.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3180
3181 /*
3182  * Return p->sum_exec_runtime plus any more ns on the sched_clock
3183  * that have not yet been banked in case the task is currently running.
3184  */
3185 unsigned long long task_sched_runtime(struct task_struct *p)
3186 {
3187         unsigned long flags;
3188         u64 ns, delta_exec;
3189         struct rq *rq;
3190
3191         rq = task_rq_lock(p, &flags);
3192         ns = p->se.sum_exec_runtime;
3193         if (rq->curr == p) {
3194                 delta_exec = rq_clock(rq) - p->se.exec_start;
3195                 if ((s64)delta_exec > 0)
3196                         ns += delta_exec;
3197         }
3198         task_rq_unlock(rq, &flags);
3199
3200         return ns;
3201 }
3202
3203 /*
3204  * Account user cpu time to a process.
3205  * @p: the process that the cpu time gets accounted to
3206  * @hardirq_offset: the offset to subtract from hardirq_count()
3207  * @cputime: the cpu time spent in user space since the last update
3208  */
3209 void account_user_time(struct task_struct *p, cputime_t cputime)
3210 {
3211         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3212         cputime64_t tmp;
3213
3214         p->utime = cputime_add(p->utime, cputime);
3215
3216         /* Add user time to cpustat. */
3217         tmp = cputime_to_cputime64(cputime);
3218         if (TASK_NICE(p) > 0)
3219                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3220         else
3221                 cpustat->user = cputime64_add(cpustat->user, tmp);
3222 }
3223
3224 /*
3225  * Account system cpu time to a process.
3226  * @p: the process that the cpu time gets accounted to
3227  * @hardirq_offset: the offset to subtract from hardirq_count()
3228  * @cputime: the cpu time spent in kernel space since the last update
3229  */
3230 void account_system_time(struct task_struct *p, int hardirq_offset,
3231                          cputime_t cputime)
3232 {
3233         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3234         struct rq *rq = this_rq();
3235         cputime64_t tmp;
3236
3237         p->stime = cputime_add(p->stime, cputime);
3238
3239         /* Add system time to cpustat. */
3240         tmp = cputime_to_cputime64(cputime);
3241         if (hardirq_count() - hardirq_offset)
3242                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3243         else if (softirq_count())
3244                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3245         else if (p != rq->idle)
3246                 cpustat->system = cputime64_add(cpustat->system, tmp);
3247         else if (atomic_read(&rq->nr_iowait) > 0)
3248                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3249         else
3250                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3251         /* Account for system time used */
3252         acct_update_integrals(p);
3253 }
3254
3255 /*
3256  * Account for involuntary wait time.
3257  * @p: the process from which the cpu time has been stolen
3258  * @steal: the cpu time spent in involuntary wait
3259  */
3260 void account_steal_time(struct task_struct *p, cputime_t steal)
3261 {
3262         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3263         cputime64_t tmp = cputime_to_cputime64(steal);
3264         struct rq *rq = this_rq();
3265
3266         if (p == rq->idle) {
3267                 p->stime = cputime_add(p->stime, steal);
3268                 if (atomic_read(&rq->nr_iowait) > 0)
3269                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3270                 else
3271                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
3272         } else
3273                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3274 }
3275
3276 /*
3277  * This function gets called by the timer code, with HZ frequency.
3278  * We call it with interrupts disabled.
3279  *
3280  * It also gets called by the fork code, when changing the parent's
3281  * timeslices.
3282  */
3283 void scheduler_tick(void)
3284 {
3285         int cpu = smp_processor_id();
3286         struct rq *rq = cpu_rq(cpu);
3287         struct task_struct *curr = rq->curr;
3288
3289         spin_lock(&rq->lock);
3290         if (curr != rq->idle) /* FIXME: needed? */
3291                 curr->sched_class->task_tick(rq, curr);
3292         update_cpu_load(rq);
3293         spin_unlock(&rq->lock);
3294
3295 #ifdef CONFIG_SMP
3296         rq->idle_at_tick = idle_cpu(cpu);
3297         trigger_load_balance(rq, cpu);
3298 #endif
3299 }
3300
3301 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3302
3303 void fastcall add_preempt_count(int val)
3304 {
3305         /*
3306          * Underflow?
3307          */
3308         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3309                 return;
3310         preempt_count() += val;
3311         /*
3312          * Spinlock count overflowing soon?
3313          */
3314         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3315                                 PREEMPT_MASK - 10);
3316 }
3317 EXPORT_SYMBOL(add_preempt_count);
3318
3319 void fastcall sub_preempt_count(int val)
3320 {
3321         /*
3322          * Underflow?
3323          */
3324         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3325                 return;
3326         /*
3327          * Is the spinlock portion underflowing?
3328          */
3329         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3330                         !(preempt_count() & PREEMPT_MASK)))
3331                 return;
3332
3333         preempt_count() -= val;
3334 }
3335 EXPORT_SYMBOL(sub_preempt_count);
3336
3337 #endif
3338
3339 /*
3340  * Print scheduling while atomic bug:
3341  */
3342 static noinline void __schedule_bug(struct task_struct *prev)
3343 {
3344         printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3345                 prev->comm, preempt_count(), prev->pid);
3346         debug_show_held_locks(prev);
3347         if (irqs_disabled())
3348                 print_irqtrace_events(prev);
3349         dump_stack();
3350 }
3351
3352 /*
3353  * Various schedule()-time debugging checks and statistics:
3354  */
3355 static inline void schedule_debug(struct task_struct *prev)
3356 {
3357         /*
3358          * Test if we are atomic.  Since do_exit() needs to call into
3359          * schedule() atomically, we ignore that path for now.
3360          * Otherwise, whine if we are scheduling when we should not be.
3361          */
3362         if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3363                 __schedule_bug(prev);
3364
3365         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3366
3367         schedstat_inc(this_rq(), sched_cnt);
3368 }
3369
3370 /*
3371  * Pick up the highest-prio task:
3372  */
3373 static inline struct task_struct *
3374 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3375 {
3376         struct sched_class *class;
3377         struct task_struct *p;
3378
3379         /*
3380          * Optimization: we know that if all tasks are in
3381          * the fair class we can call that function directly:
3382          */
3383         if (likely(rq->nr_running == rq->cfs.nr_running)) {
3384                 p = fair_sched_class.pick_next_task(rq, now);
3385                 if (likely(p))
3386                         return p;
3387         }
3388
3389         class = sched_class_highest;
3390         for ( ; ; ) {
3391                 p = class->pick_next_task(rq, now);
3392                 if (p)
3393                         return p;
3394                 /*
3395                  * Will never be NULL as the idle class always
3396                  * returns a non-NULL p:
3397                  */
3398                 class = class->next;
3399         }
3400 }
3401
/*
 * schedule() is the main scheduler function.
 *
 * It picks the next task for this CPU's runqueue and switches to it when
 * it differs from the task that was running on entry.  The whole sequence
 * is repeated (via the need_resched labels) while TIF_NEED_RESCHED is
 * still set after preemption has been re-enabled.
 */
asmlinkage void __sched schedule(void)
{
        struct task_struct *prev, *next;
        long *switch_count;
        struct rq *rq;
        u64 now;
        int cpu;

need_resched:
        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
        prev = rq->curr;
        /* default counter; replaced by ->nvcsw below if prev is blocking */
        switch_count = &prev->nivcsw;

        release_kernel_lock(prev);
need_resched_nonpreemptible:

        schedule_debug(prev);

        spin_lock_irq(&rq->lock);
        clear_tsk_need_resched(prev);

        /*
         * prev has a non-zero ->state and we were not entered with
         * PREEMPT_ACTIVE set: either cancel the sleep (interruptible
         * sleeper with a signal pending) or take prev off the runqueue.
         * Both outcomes are accounted in ->nvcsw.
         */
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                                unlikely(signal_pending(prev)))) {
                        prev->state = TASK_RUNNING;
                } else {
                        deactivate_task(rq, prev, 1);
                }
                switch_count = &prev->nvcsw;
        }

        /* runqueue is empty: call idle_balance() before picking a task */
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);

        /* one clock sample is shared by put_prev_task() and pick_next_task() */
        now = __rq_clock(rq);
        prev->sched_class->put_prev_task(rq, prev, now);
        next = pick_next_task(rq, prev, now);

        sched_info_switch(prev, next);

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */
        } else
                spin_unlock_irq(&rq->lock);

        /*
         * A negative return from reacquire_kernel_lock() sends us around
         * again.  cpu and rq are re-read first, and the jump target skips
         * preempt_disable() and release_kernel_lock().
         */
        if (unlikely(reacquire_kernel_lock(current) < 0)) {
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
                goto need_resched_nonpreemptible;
        }
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
EXPORT_SYMBOL(schedule);
3467
3468 #ifdef CONFIG_PREEMPT
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable.  Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 *
 * PREEMPT_ACTIVE is added to the preempt count around the schedule() call
 * and removed afterwards; the call is repeated while TIF_NEED_RESCHED is
 * still set.
 */
asmlinkage void __sched preempt_schedule(void)
{
        struct thread_info *ti = current_thread_info();
#ifdef CONFIG_PREEMPT_BKL
        struct task_struct *task = current;
        int saved_lock_depth;
#endif
        /*
         * If there is a non-zero preempt_count or interrupts are disabled,
         * we do not want to preempt the current task.  Just return..
         */
        if (likely(ti->preempt_count || irqs_disabled()))
                return;

need_resched:
        add_preempt_count(PREEMPT_ACTIVE);
        /*
         * We keep the big kernel semaphore locked, but we
         * clear ->lock_depth so that schedule() doesn't
         * auto-release the semaphore:
         */
#ifdef CONFIG_PREEMPT_BKL
        saved_lock_depth = task->lock_depth;
        task->lock_depth = -1;
#endif
        schedule();
#ifdef CONFIG_PREEMPT_BKL
        /* put back the lock depth that was hidden from schedule() */
        task->lock_depth = saved_lock_depth;
#endif
        sub_preempt_count(PREEMPT_ACTIVE);

        /* we could miss a preemption opportunity between schedule and now */
        barrier();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
EXPORT_SYMBOL(preempt_schedule);
3511
/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note that this is called and returns with irqs disabled. This will
 * protect us against recursive calling from irq.
 *
 * Interrupts are enabled only for the duration of the schedule() call
 * itself and are disabled again before PREEMPT_ACTIVE is dropped.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
        struct thread_info *ti = current_thread_info();
#ifdef CONFIG_PREEMPT_BKL
        struct task_struct *task = current;
        int saved_lock_depth;
#endif
        /* Catch callers which need to be fixed */
        BUG_ON(ti->preempt_count || !irqs_disabled());

need_resched:
        add_preempt_count(PREEMPT_ACTIVE);
        /*
         * We keep the big kernel semaphore locked, but we
         * clear ->lock_depth so that schedule() doesn't
         * auto-release the semaphore:
         */
#ifdef CONFIG_PREEMPT_BKL
        saved_lock_depth = task->lock_depth;
        task->lock_depth = -1;
#endif
        local_irq_enable();
        schedule();
        local_irq_disable();
#ifdef CONFIG_PREEMPT_BKL
        /* put back the lock depth that was hidden from schedule() */
        task->lock_depth = saved_lock_depth;
#endif
        sub_preempt_count(PREEMPT_ACTIVE);

        /* we could miss a preemption opportunity between schedule and now */
        barrier();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
3552
3553 #endif /* CONFIG_PREEMPT */
3554
3555 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3556                           void *key)
3557 {
3558         return try_to_wake_up(curr->private, mode, sync);
3559 }
3560 EXPORT_SYMBOL(default_wake_function);
3561
3562 /*
3563  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3564  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3565  * number) then we wake all the non-exclusive tasks and one exclusive task.
3566  *
3567  * There are circumstances in which we can try to wake a task which has already
3568  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3569  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3570  */
3571 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3572                              int nr_exclusive, int sync, void *key)
3573 {
3574         struct list_head *tmp, *next;
3575
3576         list_for_each_safe(tmp, next, &q->task_list) {
3577                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3578                 unsigned flags = curr->flags;
3579
3580                 if (curr->func(curr, mode, sync, key) &&
3581                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3582                         break;
3583         }
3584 }
3585
3586 /**
3587  * __wake_up - wake up threads blocked on a waitqueue.
3588  * @q: the waitqueue
3589  * @mode: which threads
3590  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3591  * @key: is directly passed to the wakeup function
3592  */
3593 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3594                         int nr_exclusive, void *key)
3595 {
3596         unsigned long flags;
3597
3598         spin_lock_irqsave(&q->lock, flags);
3599         __wake_up_common(q, mode, nr_exclusive, 0, key);
3600         spin_unlock_irqrestore(&q->lock, flags);
3601 }
3602 EXPORT_SYMBOL(__wake_up);
3603
3604 /*
3605  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3606  */
3607 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3608 {
3609         __wake_up_common(q, mode, 1, 0, NULL);
3610 }
3611
3612 /**
3613  * __wake_up_sync - wake up threads blocked on a waitqueue.
3614  * @q: the waitqueue
3615  * @mode: which threads
3616  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3617  *
3618  * The sync wakeup differs that the waker knows that it will schedule
3619  * away soon, so while the target thread will be woken up, it will not
3620  * be migrated to another CPU - ie. the two threads are 'synchronized'
3621  * with each other. This can prevent needless bouncing between CPUs.
3622  *
3623  * On UP it can prevent extra preemption.
3624  */
3625 void fastcall
3626 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3627 {
3628         unsigned long flags;
3629         int sync = 1;
3630
3631         if (unlikely(!q))
3632                 return;
3633
3634         if (unlikely(!nr_exclusive))
3635                 sync = 0;
3636
3637         spin_lock_irqsave(&q->lock, flags);
3638         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3639         spin_unlock_irqrestore(&q->lock, flags);
3640 }
3641 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3642
3643 void fastcall complete(struct completion *x)
3644 {
3645         unsigned long flags;
3646
3647         spin_lock_irqsave(&x->wait.lock, flags);
3648         x->done++;
3649         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3650                          1, 0, NULL);
3651         spin_unlock_irqrestore(&x->wait.lock, flags);
3652 }
3653 EXPORT_SYMBOL(complete);
3654
3655 void fastcall complete_all(struct completion *x)
3656 {
3657         unsigned long flags;
3658
3659         spin_lock_irqsave(&x->wait.lock, flags);
3660         x->done += UINT_MAX/2;
3661         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3662                          0, 0, NULL);
3663         spin_unlock_irqrestore(&x->wait.lock, flags);
3664 }
3665 EXPORT_SYMBOL(complete_all);
3666
3667 void fastcall __sched wait_for_completion(struct completion *x)
3668 {
3669         might_sleep();
3670
3671         spin_lock_irq(&x->wait.lock);
3672         if (!x->done) {
3673                 DECLARE_WAITQUEUE(wait, current);
3674
3675                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3676                 __add_wait_queue_tail(&x->wait, &wait);
3677                 do {
3678                         __set_current_state(TASK_UNINTERRUPTIBLE);
3679                         spin_unlock_irq(&x->wait.lock);
3680                         schedule();
3681                         spin_lock_irq(&x->wait.lock);
3682                 } while (!x->done);
3683                 __remove_wait_queue(&x->wait, &wait);
3684         }
3685         x->done--;
3686         spin_unlock_irq(&x->wait.lock);
3687 }
3688 EXPORT_SYMBOL(wait_for_completion);
3689
3690 unsigned long fastcall __sched
3691 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3692 {
3693         might_sleep();
3694
3695         spin_lock_irq(&x->wait.lock);
3696         if (!x->done) {
3697                 DECLARE_WAITQUEUE(wait, current);
3698
3699                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3700                 __add_wait_queue_tail(&x->wait, &wait);
3701                 do {
3702                         __set_current_state(TASK_UNINTERRUPTIBLE);
3703                         spin_unlock_irq(&x->wait.lock);
3704                         timeout = schedule_timeout(timeout);
3705                         spin_lock_irq(&x->wait.lock);
3706                         if (!timeout) {
3707                                 __remove_wait_queue(&x->wait, &wait);
3708                                 goto out;
3709                         }
3710                 } while (!x->done);
3711                 __remove_wait_queue(&x->wait, &wait);
3712         }
3713         x->done--;
3714 out:
3715         spin_unlock_irq(&x->wait.lock);
3716         return timeout;
3717 }
3718 EXPORT_SYMBOL(wait_for_completion_timeout);
3719
3720 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3721 {
3722         int ret = 0;
3723
3724         might_sleep();
3725
3726         spin_lock_irq(&x->wait.lock);
3727         if (!x->done) {
3728                 DECLARE_WAITQUEUE(wait, current);
3729
3730                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3731                 __add_wait_queue_tail(&x->wait, &wait);
3732                 do {
3733                         if (signal_pending(current)) {
3734                                 ret = -ERESTARTSYS;
3735                                 __remove_wait_queue(&x->wait, &wait);
3736                                 goto out;
3737                         }
3738                         __set_current_state(TASK_INTERRUPTIBLE);
3739                         spin_unlock_irq(&x->wait.lock);
3740                         schedule();
3741                         spin_lock_irq(&x->wait.lock);
3742                 } while (!x->done);
3743                 __remove_wait_queue(&x->wait, &wait);
3744         }
3745         x->done--;
3746 out:
3747         spin_unlock_irq(&x->wait.lock);
3748
3749         return ret;
3750 }
3751 EXPORT_SYMBOL(wait_for_completion_interruptible);
3752
3753 unsigned long fastcall __sched
3754 wait_for_completion_interruptible_timeout(struct completion *x,
3755                                           unsigned long timeout)
3756 {
3757         might_sleep();
3758
3759         spin_lock_irq(&x->wait.lock);
3760         if (!x->done) {
3761                 DECLARE_WAITQUEUE(wait, current);
3762
3763                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3764                 __add_wait_queue_tail(&x->wait, &wait);
3765                 do {
3766                         if (signal_pending(current)) {
3767                                 timeout = -ERESTARTSYS;
3768                                 __remove_wait_queue(&x->wait, &wait);
3769                                 goto out;
3770                         }
3771                         __set_current_state(TASK_INTERRUPTIBLE);
3772                         spin_unlock_irq(&x->wait.lock);
3773                         timeout = schedule_timeout(timeout);
3774                         spin_lock_irq(&x->wait.lock);
3775                         if (!timeout) {
3776                                 __remove_wait_queue(&x->wait, &wait);
3777                                 goto out;
3778                         }
3779                 } while (!x->done);
3780                 __remove_wait_queue(&x->wait, &wait);
3781         }
3782         x->done--;
3783 out:
3784         spin_unlock_irq(&x->wait.lock);
3785         return timeout;
3786 }
3787 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3788
/*
 * sleep_on_head - first half of the sleep_on*() helpers.
 * @q: wait queue to join
 * @wait: the caller's wait-queue entry
 * @flags: receives the interrupt state saved by spin_lock_irqsave()
 *
 * Adds @wait to @q under q->lock.  The lock is released with a plain
 * spin_unlock(), so the saved interrupt state is not restored here; it
 * stays in *@flags for sleep_on_tail() to restore.
 */
static inline void
sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
{
        spin_lock_irqsave(&q->lock, *flags);
        __add_wait_queue(q, wait);
        spin_unlock(&q->lock);
}
3796
/*
 * sleep_on_tail - second half of the sleep_on*() helpers.
 * @q: wait queue to leave
 * @wait: the caller's wait-queue entry
 * @flags: interrupt state previously saved by sleep_on_head()
 *
 * Removes @wait from @q under q->lock (taken with spin_lock_irq()) and
 * restores the interrupt state stored in *@flags while unlocking.
 */
static inline void
sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
{
        spin_lock_irq(&q->lock);
        __remove_wait_queue(q, wait);
        spin_unlock_irqrestore(&q->lock, *flags);
}
3804
3805 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3806 {
3807         unsigned long flags;
3808         wait_queue_t wait;
3809
3810         init_waitqueue_entry(&wait, current);
3811
3812         current->state = TASK_INTERRUPTIBLE;
3813
3814         sleep_on_head(q, &wait, &flags);
3815         schedule();
3816         sleep_on_tail(q, &wait, &flags);
3817 }
3818 EXPORT_SYMBOL(interruptible_sleep_on);
3819
3820 long __sched
3821 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3822 {
3823         unsigned long flags;
3824         wait_queue_t wait;
3825
3826         init_waitqueue_entry(&wait, current);
3827
3828         current->state = TASK_INTERRUPTIBLE;
3829
3830         sleep_on_head(q, &wait, &flags);
3831         timeout = schedule_timeout(timeout);
3832         sleep_on_tail(q, &wait, &flags);
3833
3834         return timeout;
3835 }
3836 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3837
3838 void __sched sleep_on(wait_queue_head_t *q)
3839 {
3840         unsigned long flags;
3841         wait_queue_t wait;
3842
3843         init_waitqueue_entry(&wait, current);
3844
3845         current->state = TASK_UNINTERRUPTIBLE;
3846
3847         sleep_on_head(q, &wait, &flags);
3848         schedule();
3849         sleep_on_tail(q, &wait, &flags);
3850 }
3851 EXPORT_SYMBOL(sleep_on);
3852
3853 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3854 {
3855         unsigned long flags;
3856         wait_queue_t wait;
3857
3858         init_waitqueue_entry(&wait, current);
3859
3860         current->state = TASK_UNINTERRUPTIBLE;
3861
3862         sleep_on_head(q, &wait, &flags);
3863         timeout = schedule_timeout(timeout);
3864         sleep_on_tail(q, &wait, &flags);
3865
3866         return timeout;
3867 }
3868 EXPORT_SYMBOL(sleep_on_timeout);
3869
3870 #ifdef CONFIG_RT_MUTEXES
3871
3872 /*
3873  * rt_mutex_setprio - set the current priority of a task
3874  * @p: task
3875  * @prio: prio value (kernel-internal form)
3876  *
3877  * This function changes the 'effective' priority of a task. It does
3878  * not touch ->normal_prio like __setscheduler().
3879  *
3880  * Used by the rt_mutex code to implement priority inheritance logic.
3881  */
3882 void rt_mutex_setprio(struct task_struct *p, int prio)
3883 {
3884         unsigned long flags;
3885         int oldprio, on_rq;
3886         struct rq *rq;
3887         u64 now;
3888
3889         BUG_ON(prio < 0 || prio > MAX_PRIO);
3890
3891         rq = task_rq_lock(p, &flags);
3892         now = rq_clock(rq);
3893
3894         oldprio = p->prio;
3895         on_rq = p->se.on_rq;
3896         if (on_rq)
3897                 dequeue_task(rq, p, 0, now);
3898
3899         if (rt_prio(prio))
3900                 p->sched_class = &rt_sched_class;
3901         else
3902                 p->sched_class = &fair_sched_class;
3903
3904         p->prio = prio;
3905
3906         if (on_rq) {
3907                 enqueue_task(rq, p, 0, now);
3908                 /*
3909                  * Reschedule if we are currently running on this runqueue and
3910                  * our priority decreased, or if we are not currently running on
3911                  * this runqueue and our priority is higher than the current's
3912                  */
3913                 if (task_running(rq, p)) {
3914                         if (p->prio > oldprio)
3915                                 resched_task(rq->curr);
3916                 } else {
3917                         check_preempt_curr(rq, p);
3918                 }
3919         }
3920         task_rq_unlock(rq, &flags);
3921 }
3922
3923 #endif
3924
3925 void set_user_nice(struct task_struct *p, long nice)
3926 {
3927         int old_prio, delta, on_rq;
3928         unsigned long flags;
3929         struct rq *rq;
3930         u64 now;
3931
3932         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3933                 return;
3934         /*
3935          * We have to be careful, if called from sys_setpriority(),
3936          * the task might be in the middle of scheduling on another CPU.
3937          */
3938         rq = task_rq_lock(p, &flags);
3939         now = rq_clock(rq);
3940         /*
3941          * The RT priorities are set via sched_setscheduler(), but we still
3942          * allow the 'normal' nice value to be set - but as expected
3943          * it wont have any effect on scheduling until the task is
3944          * SCHED_FIFO/SCHED_RR:
3945          */
3946         if (task_has_rt_policy(p)) {
3947                 p->static_prio = NICE_TO_PRIO(nice);
3948                 goto out_unlock;
3949         }
3950         on_rq = p->se.on_rq;
3951         if (on_rq) {
3952                 dequeue_task(rq, p, 0, now);
3953                 dec_load(rq, p, now);
3954         }
3955
3956         p->static_prio = NICE_TO_PRIO(nice);
3957         set_load_weight(p);
3958         old_prio = p->prio;
3959         p->prio = effective_prio(p);
3960         delta = p->prio - old_prio;
3961
3962         if (on_rq) {
3963                 enqueue_task(rq, p, 0, now);
3964                 inc_load(rq, p, now);
3965                 /*
3966                  * If the task increased its priority or is running and
3967                  * lowered its priority, then reschedule its CPU:
3968                  */
3969                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3970                         resched_task(rq->curr);
3971         }
3972 out_unlock:
3973         task_rq_unlock(rq, &flags);
3974 }
3975 EXPORT_SYMBOL(set_user_nice);
3976
3977 /*
3978  * can_nice - check if a task can reduce its nice value
3979  * @p: task
3980  * @nice: nice value
3981  */
3982 int can_nice(const struct task_struct *p, const int nice)
3983 {
3984         /* convert nice value [19,-20] to rlimit style value [1,40] */
3985         int nice_rlim = 20 - nice;
3986
3987         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3988                 capable(CAP_SYS_NICE));
3989 }
3990
3991 #ifdef __ARCH_WANT_SYS_NICE
3992
3993 /*
3994  * sys_nice - change the priority of the current process.
3995  * @increment: priority increment
3996  *
3997  * sys_setpriority is a more generic, but much slower function that
3998  * does similar things.
3999  */
4000 asmlinkage long sys_nice(int increment)
4001 {
4002         long nice, retval;
4003
4004         /*
4005          * Setpriority might change our priority at the same moment.
4006          * We don't have to worry. Conceptually one call occurs first
4007          * and we have a single winner.
4008          */
4009         if (increment < -40)
4010                 increment = -40;
4011         if (increment > 40)
4012                 increment = 40;
4013
4014         nice = PRIO_TO_NICE(current->static_prio) + increment;
4015         if (nice < -20)
4016                 nice = -20;
4017         if (nice > 19)
4018                 nice = 19;
4019
4020         if (increment < 0 && !can_nice(current, nice))
4021                 return -EPERM;
4022
4023         retval = security_task_setnice(current, nice);
4024         if (retval)
4025                 return retval;
4026
4027         set_user_nice(current, nice);
4028         return 0;
4029 }
4030
4031 #endif
4032
4033 /**
4034  * task_prio - return the priority value of a given task.
4035  * @p: the task in question.
4036  *
4037  * This is the priority value as seen by users in /proc.
4038  * RT tasks are offset by -200. Normal tasks are centered
4039  * around 0, value goes from -16 to +15.
4040  */
4041 int task_prio(const struct task_struct *p)
4042 {
4043         return p->prio - MAX_RT_PRIO;
4044 }
4045
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Function wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
        int nice = TASK_NICE(p);

        return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
4055
4056 /**
4057  * idle_cpu - is a given cpu idle currently?
4058  * @cpu: the processor in question.
4059  */
4060 int idle_cpu(int cpu)
4061 {
4062         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4063 }
4064
4065 /**
4066  * idle_task - return the idle task for a given cpu.
4067  * @cpu: the processor in question.
4068  */
4069 struct task_struct *idle_task(int cpu)
4070 {
4071         return cpu_rq(cpu)->idle;
4072 }
4073
4074 /**
4075  * find_process_by_pid - find a process with a matching PID value.
4076  * @pid: the pid in question.
4077  */
4078 static inline struct task_struct *find_process_by_pid(pid_t pid)
4079 {
4080         return pid ? find_task_by_pid(pid) : current;
4081 }
4082
4083 /* Actually do priority change: must hold rq lock. */
4084 static void
4085 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4086 {
4087         BUG_ON(p->se.on_rq);
4088
4089         p->policy = policy;
4090         switch (p->policy) {
4091         case SCHED_NORMAL:
4092         case SCHED_BATCH:
4093         case SCHED_IDLE:
4094                 p->sched_class = &fair_sched_class;
4095                 break;
4096         case SCHED_FIFO:
4097         case SCHED_RR:
4098                 p->sched_class = &rt_sched_class;
4099                 break;
4100         }
4101
4102         p->rt_priority = prio;
4103         p->normal_prio = normal_prio(p);
4104         /* we are holding p->pi_lock already */
4105         p->prio = rt_mutex_getprio(p);
4106         set_load_weight(p);
4107 }
4108
/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy; a negative value means "keep the task's current policy".
 * @param: structure containing the new RT priority.
 *
 * Returns 0 on success, -EINVAL for an invalid policy/priority combination,
 * -EPERM when the caller lacks the required privilege, -ESRCH when the
 * task's sighand could not be locked, or the error reported by
 * security_task_setscheduler().
 *
 * NOTE that the task may be already dead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
                       struct sched_param *param)
{
        int retval, oldprio, oldpolicy = -1, on_rq;
        unsigned long flags;
        struct rq *rq;

        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
recheck:
        /* double check policy once rq lock held */
        if (policy < 0)
                policy = oldpolicy = p->policy;
        else if (policy != SCHED_FIFO && policy != SCHED_RR &&
                        policy != SCHED_NORMAL && policy != SCHED_BATCH &&
                        policy != SCHED_IDLE)
                return -EINVAL;
        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are
         * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
         * SCHED_BATCH and SCHED_IDLE is 0.
         * Tasks without an mm are allowed up to MAX_RT_PRIO-1.
         */
        if (param->sched_priority < 0 ||
            (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
            (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                return -EINVAL;
        /* RT policies need a non-zero priority, all others need zero */
        if (rt_policy(policy) != (param->sched_priority != 0))
                return -EINVAL;

        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
        if (!capable(CAP_SYS_NICE)) {
                if (rt_policy(policy)) {
                        unsigned long rlim_rtprio;

                        if (!lock_task_sighand(p, &flags))
                                return -ESRCH;
                        rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
                        unlock_task_sighand(p, &flags);

                        /* can't set/change the rt policy */
                        if (policy != p->policy && !rlim_rtprio)
                                return -EPERM;

                        /* can't increase priority */
                        if (param->sched_priority > p->rt_priority &&
                            param->sched_priority > rlim_rtprio)
                                return -EPERM;
                }
                /*
                 * Like positive nice levels, don't allow tasks to
                 * move out of SCHED_IDLE either:
                 */
                if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
                        return -EPERM;

                /* can't change other user's priorities */
                if ((current->euid != p->euid) &&
                    (current->euid != p->uid))
                        return -EPERM;
        }

        retval = security_task_setscheduler(p, policy, param);
        if (retval)
                return retval;
        /*
         * make sure no PI-waiters arrive (or leave) while we are
         * changing the priority of the task:
         */
        spin_lock_irqsave(&p->pi_lock, flags);
        /*
         * To be able to change p->policy safely, the appropriate
         * runqueue lock must be held.
         */
        rq = __task_rq_lock(p);
        /*
         * recheck policy now with rq lock held: if the "keep current
         * policy" snapshot taken above is stale, drop both locks and redo
         * all the checks.
         */
        if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                policy = oldpolicy = -1;
                __task_rq_unlock(rq);
                spin_unlock_irqrestore(&p->pi_lock, flags);
                goto recheck;
        }
        /* __setscheduler() insists on the task being off the runqueue */
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
        oldprio = p->prio;
        __setscheduler(rq, p, policy, param->sched_priority);
        if (on_rq) {
                activate_task(rq, p, 0);
                /*
                 * Reschedule if we are currently running on this runqueue and
                 * our priority decreased, or if we are not currently running on
                 * this runqueue and our priority is higher than the current's
                 */
                if (task_running(rq, p)) {
                        if (p->prio > oldprio)
                                resched_task(rq->curr);
                } else {
                        check_preempt_curr(rq, p);
                }
        }
        __task_rq_unlock(rq);
        spin_unlock_irqrestore(&p->pi_lock, flags);

        /* called only after both locks have been dropped */
        rt_mutex_adjust_pi(p);

        return 0;
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
4227
4228 static int
4229 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4230 {
4231         struct sched_param lparam;
4232         struct task_struct *p;
4233         int retval;
4234
4235         if (!param || pid < 0)
4236                 return -EINVAL;
4237         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4238                 return -EFAULT;
4239
4240         rcu_read_lock();
4241         retval = -ESRCH;
4242         p = find_process_by_pid(pid);
4243         if (p != NULL)
4244                 retval = sched_setscheduler(p, policy, &lparam);
4245         rcu_read_unlock();
4246
4247         return retval;
4248 }
4249
4250 /**
4251  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4252  * @pid: the pid in question.
4253  * @policy: new policy.
4254  * @param: structure containing the new RT priority.
4255  */
4256 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4257                                        struct sched_param __user *param)
4258 {
4259         /* negative values for policy are not valid */
4260         if (policy < 0)
4261                 return -EINVAL;
4262
4263         return do_sched_setscheduler(pid, policy, param);
4264 }
4265
4266 /**
4267  * sys_sched_setparam - set/change the RT priority of a thread
4268  * @pid: the pid in question.
4269  * @param: structure containing the new RT priority.
4270  */
4271 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4272 {
4273         return do_sched_setscheduler(pid, -1, param);
4274 }
4275
4276 /**
4277  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4278  * @pid: the pid in question.
4279  */
4280 asmlinkage long sys_sched_getscheduler(pid_t pid)
4281 {
4282         struct task_struct *p;
4283         int retval = -EINVAL;
4284
4285         if (pid < 0)
4286                 goto out_nounlock;
4287
4288         retval = -ESRCH;
4289         read_lock(&tasklist_lock);
4290         p = find_process_by_pid(pid);
4291         if (p) {
4292                 retval = security_task_getscheduler(p);
4293                 if (!retval)
4294                         retval = p->policy;
4295         }
4296         read_unlock(&tasklist_lock);
4297
4298 out_nounlock:
4299         return retval;
4300 }
4301
4302 /**
4303  * sys_sched_getscheduler - get the RT priority of a thread
4304  * @pid: the pid in question.
4305  * @param: structure containing the RT priority.
4306  */
4307 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4308 {
4309         struct sched_param lp;
4310         struct task_struct *p;
4311         int retval = -EINVAL;
4312
4313         if (!param || pid < 0)
4314                 goto out_nounlock;
4315
4316         read_lock(&tasklist_lock);
4317         p = find_process_by_pid(pid);
4318         retval = -ESRCH;
4319         if (!p)
4320                 goto out_unlock;
4321
4322         retval = security_task_getscheduler(p);
4323         if (retval)
4324                 goto out_unlock;
4325
4326         lp.sched_priority = p->rt_priority;
4327         read_unlock(&tasklist_lock);
4328
4329         /*
4330          * This one might sleep, we cannot do it with a spinlock held ...
4331          */
4332         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4333
4334 out_nounlock:
4335         return retval;
4336
4337 out_unlock:
4338         read_unlock(&tasklist_lock);
4339         return retval;
4340 }
4341
4342 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4343 {
4344         cpumask_t cpus_allowed;
4345         struct task_struct *p;
4346         int retval;
4347
4348         mutex_lock(&sched_hotcpu_mutex);
4349         read_lock(&tasklist_lock);
4350
4351         p = find_process_by_pid(pid);
4352         if (!p) {
4353                 read_unlock(&tasklist_lock);
4354                 mutex_unlock(&sched_hotcpu_mutex);
4355                 return -ESRCH;
4356         }
4357
4358         /*
4359          * It is not safe to call set_cpus_allowed with the
4360          * tasklist_lock held.  We will bump the task_struct's
4361          * usage count and then drop tasklist_lock.
4362          */
4363         get_task_struct(p);
4364         read_unlock(&tasklist_lock);
4365
4366         retval = -EPERM;
4367         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4368                         !capable(CAP_SYS_NICE))
4369                 goto out_unlock;
4370
4371         retval = security_task_setscheduler(p, 0, NULL);
4372         if (retval)
4373                 goto out_unlock;
4374
4375         cpus_allowed = cpuset_cpus_allowed(p);
4376         cpus_and(new_mask, new_mask, cpus_allowed);
4377         retval = set_cpus_allowed(p, new_mask);
4378
4379 out_unlock:
4380         put_task_struct(p);
4381         mutex_unlock(&sched_hotcpu_mutex);
4382         return retval;
4383 }
4384
4385 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4386                              cpumask_t *new_mask)
4387 {
4388         if (len < sizeof(cpumask_t)) {
4389                 memset(new_mask, 0, sizeof(cpumask_t));
4390         } else if (len > sizeof(cpumask_t)) {
4391                 len = sizeof(cpumask_t);
4392         }
4393         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4394 }
4395
4396 /**
4397  * sys_sched_setaffinity - set the cpu affinity of a process
4398  * @pid: pid of the process
4399  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4400  * @user_mask_ptr: user-space pointer to the new cpu mask
4401  */
4402 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4403                                       unsigned long __user *user_mask_ptr)
4404 {
4405         cpumask_t new_mask;
4406         int retval;
4407
4408         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4409         if (retval)
4410                 return retval;
4411
4412         return sched_setaffinity(pid, new_mask);
4413 }
4414
/*
 * Represents all CPUs present in the system.
 * In systems capable of hotplug, this map could dynamically grow
 * as new CPUs are detected in the system via any platform-specific
 * method, such as ACPI.
 */

cpumask_t cpu_present_map __read_mostly;
EXPORT_SYMBOL(cpu_present_map);

/*
 * Without CONFIG_SMP the online and possible maps are defined here,
 * both initialised to CPU_MASK_ALL.
 */
#ifndef CONFIG_SMP
cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_online_map);

cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
EXPORT_SYMBOL(cpu_possible_map);
#endif
4432
4433 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4434 {
4435         struct task_struct *p;
4436         int retval;
4437
4438         mutex_lock(&sched_hotcpu_mutex);
4439         read_lock(&tasklist_lock);
4440
4441         retval = -ESRCH;
4442         p = find_process_by_pid(pid);
4443         if (!p)
4444                 goto out_unlock;
4445
4446         retval = security_task_getscheduler(p);
4447         if (retval)
4448                 goto out_unlock;
4449
4450         cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4451
4452 out_unlock:
4453         read_unlock(&tasklist_lock);
4454         mutex_unlock(&sched_hotcpu_mutex);
4455         if (retval)
4456                 return retval;
4457
4458         return 0;
4459 }
4460
4461 /**
4462  * sys_sched_getaffinity - get the cpu affinity of a process
4463  * @pid: pid of the process
4464  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4465  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4466  */
4467 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4468                                       unsigned long __user *user_mask_ptr)
4469 {
4470         int ret;
4471         cpumask_t mask;
4472
4473         if (len < sizeof(cpumask_t))
4474                 return -EINVAL;
4475
4476         ret = sched_getaffinity(pid, &mask);
4477         if (ret < 0)
4478                 return ret;
4479
4480         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4481                 return -EFAULT;
4482
4483         return sizeof(cpumask_t);
4484 }
4485
/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Always returns 0.
 */
asmlinkage long sys_sched_yield(void)
{
        /* NOTE(review): presumably returns this CPU's runqueue, locked. */
        struct rq *rq = this_rq_lock();

        schedstat_inc(rq, yld_cnt);
        /*
         * With a single runnable task there is nobody to yield to, so
         * only the statistic is bumped; otherwise the task's scheduling
         * class performs the yield.
         */
        if (unlikely(rq->nr_running == 1))
                schedstat_inc(rq, yld_act_empty);
        else
                current->sched_class->yield_task(rq, current);

        /*
         * Since we are going to call schedule() anyway, there's
         * no need to preempt or enable interrupts:
         */
        __release(rq->lock);
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
        _raw_spin_unlock(&rq->lock);
        preempt_enable_no_resched();

        schedule();

        return 0;
}
4515
/*
 * Common helper of the cond_resched*() variants: call schedule() with
 * PREEMPT_ACTIVE set in the preempt count, repeating for as long as
 * need_resched() is still true afterwards.
 */
static void __cond_resched(void)
{
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
        __might_sleep(__FILE__, __LINE__);
#endif
        /*
         * The BKS might be reacquired before we have dropped
         * PREEMPT_ACTIVE, which could trigger a second
         * cond_resched() call.
         */
        do {
                add_preempt_count(PREEMPT_ACTIVE);
                schedule();
                sub_preempt_count(PREEMPT_ACTIVE);
        } while (need_resched());
}
4532
4533 int __sched cond_resched(void)
4534 {
4535         if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4536                                         system_state == SYSTEM_RUNNING) {
4537                 __cond_resched();
4538                 return 1;
4539         }
4540         return 0;
4541 }
4542 EXPORT_SYMBOL(cond_resched);
4543
/*
 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
 *
 * Returns 1 if the lock was dropped and retaken at least once, 0 otherwise.
 */
int cond_resched_lock(spinlock_t *lock)
{
        int ret = 0;

        /* Briefly release the lock when need_lockbreak() asks for it. */
        if (need_lockbreak(lock)) {
                spin_unlock(lock);
                cpu_relax();
                ret = 1;
                spin_lock(lock);
        }
        if (need_resched() && system_state == SYSTEM_RUNNING) {
                /*
                 * Open-coded unlock: release the lock without rescheduling
                 * so that the only schedule() is the one in __cond_resched().
                 */
                spin_release(&lock->dep_map, 1, _THIS_IP_);
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
                __cond_resched();
                ret = 1;
                spin_lock(lock);
        }
        return ret;
}
EXPORT_SYMBOL(cond_resched_lock);
4573
4574 int __sched cond_resched_softirq(void)
4575 {
4576         BUG_ON(!in_softirq());
4577
4578         if (need_resched() && system_state == SYSTEM_RUNNING) {
4579                 local_bh_enable();
4580                 __cond_resched();
4581                 local_bh_disable();
4582                 return 1;
4583         }
4584         return 0;
4585 }
4586 EXPORT_SYMBOL(cond_resched_softirq);
4587
/**
 * yield - yield the current processor to other threads.
 *
 * This is a shortcut for kernel-space yielding - it marks the
 * thread runnable and calls sys_sched_yield().
 */
void __sched yield(void)
{
        /* Make sure the task is TASK_RUNNING before yielding. */
        set_current_state(TASK_RUNNING);
        sys_sched_yield();
}
EXPORT_SYMBOL(yield);
4600
4601 /*
4602  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4603  * that process accounting knows that this is a task in IO wait state.
4604  *
4605  * But don't do that if it is a deliberate, throttling IO wait (this task
4606  * has set its backing_dev_info: the queue against which it should throttle)
4607  */
4608 void __sched io_schedule(void)
4609 {
4610         struct rq *rq = &__raw_get_cpu_var(runqueues);
4611
4612         delayacct_blkio_start();
4613         atomic_inc(&rq->nr_iowait);
4614         schedule();
4615         atomic_dec(&rq->nr_iowait);
4616         delayacct_blkio_end();
4617 }
4618 EXPORT_SYMBOL(io_schedule);
4619
4620 long __sched io_schedule_timeout(long timeout)
4621 {
4622         struct rq *rq = &__raw_get_cpu_var(runqueues);
4623         long ret;
4624
4625         delayacct_blkio_start();
4626         atomic_inc(&rq->nr_iowait);
4627         ret = schedule_timeout(timeout);
4628         atomic_dec(&rq->nr_iowait);
4629         delayacct_blkio_end();
4630         return ret;
4631 }
4632
4633 /**
4634  * sys_sched_get_priority_max - return maximum RT priority.
4635  * @policy: scheduling class.
4636  *
4637  * this syscall returns the maximum rt_priority that can be used
4638  * by a given scheduling class.
4639  */
4640 asmlinkage long sys_sched_get_priority_max(int policy)
4641 {
4642         int ret = -EINVAL;
4643
4644         switch (policy) {
4645         case SCHED_FIFO:
4646         case SCHED_RR:
4647                 ret = MAX_USER_RT_PRIO-1;
4648                 break;
4649         case SCHED_NORMAL:
4650         case SCHED_BATCH:
4651         case SCHED_IDLE:
4652                 ret = 0;
4653                 break;
4654         }
4655         return ret;
4656 }
4657
4658 /**
4659  * sys_sched_get_priority_min - return minimum RT priority.
4660  * @policy: scheduling class.
4661  *
4662  * this syscall returns the minimum rt_priority that can be used
4663  * by a given scheduling class.
4664  */
4665 asmlinkage long sys_sched_get_priority_min(int policy)
4666 {
4667         int ret = -EINVAL;
4668
4669         switch (policy) {
4670         case SCHED_FIFO:
4671         case SCHED_RR:
4672                 ret = 1;
4673                 break;
4674         case SCHED_NORMAL:
4675         case SCHED_BATCH:
4676         case SCHED_IDLE:
4677                 ret = 0;
4678         }
4679         return ret;
4680 }
4681
4682 /**
4683  * sys_sched_rr_get_interval - return the default timeslice of a process.
4684  * @pid: pid of the process.
4685  * @interval: userspace pointer to the timeslice value.
4686  *
4687  * this syscall writes the default timeslice value of a given process
4688  * into the user-space timespec buffer. A value of '0' means infinity.
4689  */
4690 asmlinkage
4691 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4692 {
4693         struct task_struct *p;
4694         int retval = -EINVAL;
4695         struct timespec t;
4696
4697         if (pid < 0)
4698                 goto out_nounlock;
4699
4700         retval = -ESRCH;
4701         read_lock(&tasklist_lock);
4702         p = find_process_by_pid(pid);
4703         if (!p)
4704                 goto out_unlock;
4705
4706         retval = security_task_getscheduler(p);
4707         if (retval)
4708                 goto out_unlock;
4709
4710         jiffies_to_timespec(p->policy == SCHED_FIFO ?
4711                                 0 : static_prio_timeslice(p->static_prio), &t);
4712         read_unlock(&tasklist_lock);
4713         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4714 out_nounlock:
4715         return retval;
4716 out_unlock:
4717         read_unlock(&tasklist_lock);
4718         return retval;
4719 }
4720
4721 static const char stat_nam[] = "RSDTtZX";
4722
4723 static void show_task(struct task_struct *p)
4724 {
4725         unsigned long free = 0;
4726         unsigned state;
4727
4728         state = p->state ? __ffs(p->state) + 1 : 0;
4729         printk("%-13.13s %c", p->comm,
4730                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4731 #if BITS_PER_LONG == 32
4732         if (state == TASK_RUNNING)
4733                 printk(" running  ");
4734         else
4735                 printk(" %08lx ", thread_saved_pc(p));
4736 #else
4737         if (state == TASK_RUNNING)
4738                 printk("  running task    ");
4739         else
4740                 printk(" %016lx ", thread_saved_pc(p));
4741 #endif
4742 #ifdef CONFIG_DEBUG_STACK_USAGE
4743         {
4744                 unsigned long *n = end_of_stack(p);
4745                 while (!*n)
4746                         n++;
4747                 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4748         }
4749 #endif
4750         printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4751
4752         if (state != TASK_RUNNING)
4753                 show_stack(p, NULL);
4754 }
4755
/*
 * show_state_filter - print a table of tasks to the kernel log.
 * @state_filter: mask of task states to show; 0 shows every task.
 *
 * Walks all threads under tasklist_lock and calls show_task() for each
 * one whose state matches the filter.  The lock dump at the end is only
 * produced when @state_filter is -1.
 */
void show_state_filter(unsigned long state_filter)
{
        struct task_struct *g, *p;

#if BITS_PER_LONG == 32
        printk(KERN_INFO
                "  task                PC stack   pid father\n");
#else
        printk(KERN_INFO
                "  task                        PC stack   pid father\n");
#endif
        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                /*
                 * reset the NMI-timeout, listing all tasks on a slow
                 * console might take a lot of time:
                 */
                touch_nmi_watchdog();
                if (!state_filter || (p->state & state_filter))
                        show_task(p);
        } while_each_thread(g, p);

        touch_all_softlockup_watchdogs();

#ifdef CONFIG_SCHED_DEBUG
        sysrq_sched_debug_show();
#endif
        read_unlock(&tasklist_lock);
        /*
         * Only show locks if all tasks are dumped:
         */
        if (state_filter == -1)
                debug_show_all_locks();
}
4790
/* Put the boot-time idle task @idle into the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
        idle->sched_class = &idle_sched_class;
}
4795
/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: cpu the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __cpuinit init_idle(struct task_struct *idle, int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;

        __sched_fork(idle);
        idle->se.exec_start = sched_clock();

        /* Pin the idle task to @cpu only. */
        idle->prio = idle->normal_prio = MAX_PRIO;
        idle->cpus_allowed = cpumask_of_cpu(cpu);
        __set_task_cpu(idle, cpu);

        /* Install it as both the current and the idle task of the runqueue. */
        spin_lock_irqsave(&rq->lock, flags);
        rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        idle->oncpu = 1;
#endif
        spin_unlock_irqrestore(&rq->lock, flags);

        /* Set the preempt count _outside_ the spinlocks! */
#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
        task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
#else
        task_thread_info(idle)->preempt_count = 0;
#endif
        /*
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
}
4834
/*
 * In a system that switches off the HZ timer, nohz_cpu_mask
 * indicates which cpus have entered this state. This is used
 * in the rcu update to wait only for active cpus. For systems
 * which do not switch off the HZ timer, nohz_cpu_mask should
 * always be CPU_MASK_NONE.
 */
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4843
4844 /*
4845  * Increase the granularity value when there are more CPUs,
4846  * because with more CPUs the 'effective latency' as visible
4847  * to users decreases. But the relationship is not linear,
4848  * so pick a second-best guess by going with the log2 of the
4849  * number of CPUs.
4850  *
4851  * This idea comes from the SD scheduler of Con Kolivas:
4852  */
4853 static inline void sched_init_granularity(void)
4854 {
4855         unsigned int factor = 1 + ilog2(num_online_cpus());
4856         const unsigned long gran_limit = 100000000;
4857
4858         sysctl_sched_granularity *= factor;
4859         if (sysctl_sched_granularity > gran_limit)
4860                 sysctl_sched_granularity = gran_limit;
4861
4862         sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4863         sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4864 }
4865
4866 #ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we queue a struct migration_req structure in the source CPU's
 *    runqueue and wake up that CPU's migration thread.
 * 2) we wait_for_completion() on the request => thread blocks.
 * 3) migration thread wakes up (implicitly it forces the migrated
 *    thread off the CPU)
 * 4) it gets the migration request and checks whether the migrated
 *    task is still in the wrong runqueue.
 * 5) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 6) migration thread complete()s the request.
 * 7) we wake up and the migration is done.
 */
4882
/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely.  The
 * call is not atomic; no spinlocks may be held.
 *
 * Returns 0 on success, or -EINVAL if @new_mask contains no online CPU
 * (in which case the task's mask is left unchanged).
 */
int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
        struct migration_req req;
        unsigned long flags;
        struct rq *rq;
        int ret = 0;

        rq = task_rq_lock(p, &flags);
        /* Refuse a mask that would leave the task with no online CPU. */
        if (!cpus_intersects(new_mask, cpu_online_map)) {
                ret = -EINVAL;
                goto out;
        }

        p->cpus_allowed = new_mask;
        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpu_isset(task_cpu(p), new_mask))
                goto out;

        if (migrate_task(p, any_online_cpu(new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
                return 0;
        }
out:
        task_rq_unlock(rq, &flags);

        return ret;
}
EXPORT_SYMBOL_GPL(set_cpus_allowed);
4924
/*
 * Move (not current) task off this cpu, onto dest cpu.  We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Returns non-zero if task was successfully migrated.  Zero is returned
 * when @dest_cpu is offline, when the task is no longer on @src_cpu, or
 * when @dest_cpu is not in the task's allowed mask.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
        struct rq *rq_dest, *rq_src;
        int ret = 0, on_rq;

        if (unlikely(cpu_is_offline(dest_cpu)))
                return ret;

        rq_src = cpu_rq(src_cpu);
        rq_dest = cpu_rq(dest_cpu);

        double_rq_lock(rq_src, rq_dest);
        /* Already moved. */
        if (task_cpu(p) != src_cpu)
                goto out;
        /* Affinity changed (again). */
        if (!cpu_isset(dest_cpu, p->cpus_allowed))
                goto out;

        /*
         * A queued task is dequeued from the source runqueue, retargeted
         * and requeued on the destination; a task that is not queued only
         * has its CPU updated.
         */
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq_src, p, 0);
        set_task_cpu(p, dest_cpu);
        if (on_rq) {
                activate_task(rq_dest, p, 0);
                check_preempt_curr(rq_dest, p);
        }
        ret = 1;
out:
        double_rq_unlock(rq_src, rq_dest);
        return ret;
}
4968
/*
 * migration_thread - this is a highprio system thread that performs
 * thread migration by bumping thread off CPU then 'pushing' onto
 * another runqueue.
 *
 * @data carries the number of the CPU this thread serves.  The thread
 * loops until kthread_should_stop(), handling one queued migration
 * request per iteration, and always returns 0.
 */
static int migration_thread(void *data)
{
        int cpu = (long)data;
        struct rq *rq;

        rq = cpu_rq(cpu);
        BUG_ON(rq->migration_thread != current);

        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                struct migration_req *req;
                struct list_head *head;

                spin_lock_irq(&rq->lock);

                /* Our CPU went away: stop working and wait to be stopped. */
                if (cpu_is_offline(cpu)) {
                        spin_unlock_irq(&rq->lock);
                        goto wait_to_die;
                }

                if (rq->active_balance) {
                        active_load_balance(rq, cpu);
                        rq->active_balance = 0;
                }

                head = &rq->migration_queue;

                /* Nothing queued: sleep until somebody wakes us up. */
                if (list_empty(head)) {
                        spin_unlock_irq(&rq->lock);
                        schedule();
                        set_current_state(TASK_INTERRUPTIBLE);
                        continue;
                }
                req = list_entry(head->next, struct migration_req, list);
                list_del_init(head->next);

                /*
                 * Only the lock is dropped here; interrupts stay disabled
                 * across __migrate_task() and are re-enabled afterwards.
                 */
                spin_unlock(&rq->lock);
                __migrate_task(req->task, cpu, req->dest_cpu);
                local_irq_enable();

                /* Let the waiter on this request continue. */
                complete(&req->done);
        }
        __set_current_state(TASK_RUNNING);
        return 0;

wait_to_die:
        /* Wait for kthread_stop */
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}
5029
5030 #ifdef CONFIG_HOTPLUG_CPU
/*
 * Figure out where task on dead CPU should go, use force if necessary.
 * NOTE: interrupts should be disabled by the caller
 *
 * Destination candidates are tried in this order: an online CPU on the
 * dead CPU's node that the task is allowed on, any online allowed CPU,
 * and finally any online CPU after resetting the task's affinity to all
 * CPUs.  The whole selection is retried until __migrate_task() succeeds.
 */
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
        unsigned long flags;
        cpumask_t mask;
        struct rq *rq;
        int dest_cpu;

restart:
        /* On same node? */
        mask = node_to_cpumask(cpu_to_node(dead_cpu));
        cpus_and(mask, mask, p->cpus_allowed);
        dest_cpu = any_online_cpu(mask);

        /* On any allowed CPU? */
        if (dest_cpu == NR_CPUS)
                dest_cpu = any_online_cpu(p->cpus_allowed);

        /* No more Mr. Nice Guy. */
        if (dest_cpu == NR_CPUS) {
                rq = task_rq_lock(p, &flags);
                cpus_setall(p->cpus_allowed);
                dest_cpu = any_online_cpu(p->cpus_allowed);
                task_rq_unlock(rq, &flags);

                /*
                 * Don't tell them about moving exiting tasks or
                 * kernel threads (both mm NULL), since they never
                 * leave kernel.
                 */
                if (p->mm && printk_ratelimit())
                        printk(KERN_INFO "process %d (%s) no "
                               "longer affine to cpu%d\n",
                               p->pid, p->comm, dead_cpu);
        }
        if (!__migrate_task(p, dead_cpu, dest_cpu))
                goto restart;
}
5072
/*
 * While a dead CPU has no uninterruptible tasks queued at this point,
 * it might still have a nonzero ->nr_uninterruptible counter, because
 * for performance reasons the counter is not strictly tracking tasks to
 * their home CPUs. So we just add the counter to another CPU's counter,
 * to keep the global sum constant after CPU-down:
 */
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
        /* Any online CPU will do as the new home of the count. */
        struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
        unsigned long flags;

        local_irq_save(flags);
        double_rq_lock(rq_src, rq_dest);
        rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
        rq_src->nr_uninterruptible = 0;
        double_rq_unlock(rq_src, rq_dest);
        local_irq_restore(flags);
}
5092
5093 /* Run through task list and migrate tasks from the dead cpu. */
5094 static void migrate_live_tasks(int src_cpu)
5095 {
5096         struct task_struct *p, *t;
5097
5098         write_lock_irq(&tasklist_lock);
5099
5100         do_each_thread(t, p) {
5101                 if (p == current)
5102                         continue;
5103
5104                 if (task_cpu(p) == src_cpu)
5105                         move_task_off_dead_cpu(src_cpu, p);
5106         } while_each_thread(t, p);
5107
5108         write_unlock_irq(&tasklist_lock);
5109 }
5110
/*
 * Schedules idle task to be the next runnable task on current CPU.
 * It does so by boosting its priority to highest possible and adding it to
 * the _front_ of the runqueue. Used by CPU offline code.
 *
 * Must be called on a CPU that is already marked offline (BUG otherwise).
 */
void sched_idle_next(void)
{
        int this_cpu = smp_processor_id();
        struct rq *rq = cpu_rq(this_cpu);
        struct task_struct *p = rq->idle;
        unsigned long flags;

        /* cpu has to be offline */
        BUG_ON(cpu_online(this_cpu));

        /*
         * Strictly not necessary since rest of the CPUs are stopped by now
         * and interrupts disabled on the current cpu.
         */
        spin_lock_irqsave(&rq->lock, flags);

        /* Boost the idle task to the top SCHED_FIFO priority. */
        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);

        /* Add idle task to the _front_ of its priority queue: */
        activate_idle_task(p, rq);

        spin_unlock_irqrestore(&rq->lock, flags);
}
5139
/*
 * Ensures that the idle task is using init_mm right before its cpu goes
 * offline.
 *
 * Must be called on a CPU that is already marked offline (BUG otherwise).
 */
void idle_task_exit(void)
{
        struct mm_struct *mm = current->active_mm;

        BUG_ON(cpu_online(smp_processor_id()));

        /* Switch away from a borrowed mm, then drop the reference to it. */
        if (mm != &init_mm)
                switch_mm(mm, &init_mm, current);
        mmdrop(mm);
}
5154
/*
 * Move the exiting task @p off the runqueue of @dead_cpu.
 *
 * called under rq->lock with disabled interrupts; the lock is dropped
 * while the task is moved and held again on return.
 */
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
        struct rq *rq = cpu_rq(dead_cpu);

        /* Must be exiting, otherwise would be on tasklist. */
        BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);

        /* Cannot have done final schedule yet: would have vanished. */
        BUG_ON(p->state == TASK_DEAD);

        /* Hold a reference so the task stays valid while the lock is dropped. */
        get_task_struct(p);

        /*
         * Drop lock around migration; if someone else moves it,
         * that's OK.  No task can be added to this CPU, so iteration is
         * fine.
         * NOTE: interrupts should be left disabled  --dev@
         */
        spin_unlock(&rq->lock);
        move_task_off_dead_cpu(dead_cpu, p);
        spin_lock(&rq->lock);

        put_task_struct(p);
}
5180
5181 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5182 static void migrate_dead_tasks(unsigned int dead_cpu)
5183 {
5184         struct rq *rq = cpu_rq(dead_cpu);
5185         struct task_struct *next;
5186
5187         for ( ; ; ) {
5188                 if (!rq->nr_running)
5189                         break;
5190                 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5191                 if (!next)
5192                         break;
5193                 migrate_dead(dead_cpu, next);
5194
5195         }
5196 }
5197 #endif /* CONFIG_HOTPLUG_CPU */
5198
5199 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5200
/*
 * "sched_domain" directory entry; its child table is filled in by
 * init_sched_domain_sysctl().  The zeroed entry ends the table.
 */
static struct ctl_table sd_ctl_dir[] = {
        {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },
        {0,},
};

/* Root table: a "kernel" directory containing sd_ctl_dir. */
static struct ctl_table sd_ctl_root[] = {
        {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, },
        {0,},
};
5210
5211 static struct ctl_table *sd_alloc_ctl_entry(int n)
5212 {
5213         struct ctl_table *entry =
5214                 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
5215
5216         BUG_ON(!entry);
5217         memset(entry, 0, n * sizeof(struct ctl_table));
5218
5219         return entry;
5220 }
5221
/*
 * Fill in the fields of one sysctl table entry.  Only the fields passed
 * as arguments are written; all other fields of @entry are left as they
 * are.
 */
static void
set_table_entry(struct ctl_table *entry, int ctl_name,
                const char *procname, void *data, int maxlen,
                mode_t mode, proc_handler *proc_handler)
{
        entry->ctl_name = ctl_name;
        entry->procname = procname;
        entry->data = data;
        entry->maxlen = maxlen;
        entry->mode = mode;
        entry->proc_handler = proc_handler;
}
5234
5235 static struct ctl_table *
5236 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5237 {
5238         struct ctl_table *table = sd_alloc_ctl_entry(14);
5239
5240         set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
5241                 sizeof(long), 0644, proc_doulongvec_minmax);
5242         set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
5243                 sizeof(long), 0644, proc_doulongvec_minmax);
5244         set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
5245                 sizeof(int), 0644, proc_dointvec_minmax);
5246         set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
5247                 sizeof(int), 0644, proc_dointvec_minmax);
5248         set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
5249                 sizeof(int), 0644, proc_dointvec_minmax);
5250         set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
5251                 sizeof(int), 0644, proc_dointvec_minmax);
5252         set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
5253                 sizeof(int), 0644, proc_dointvec_minmax);
5254         set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
5255                 sizeof(int), 0644, proc_dointvec_minmax);
5256         set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
5257                 sizeof(int), 0644, proc_dointvec_minmax);
5258         set_table_entry(&table[10], 11, "cache_nice_tries",
5259                 &sd->cache_nice_tries,
5260                 sizeof(int), 0644, proc_dointvec_minmax);
5261         set_table_entry(&table[12], 13, "flags", &sd->flags,
5262                 sizeof(int), 0644, proc_dointvec_minmax);
5263
5264         return table;
5265 }
5266
5267 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5268 {
5269         struct ctl_table *entry, *table;
5270         struct sched_domain *sd;
5271         int domain_num = 0, i;
5272         char buf[32];
5273
5274         for_each_domain(cpu, sd)
5275                 domain_num++;
5276         entry = table = sd_alloc_ctl_entry(domain_num + 1);
5277
5278         i = 0;
5279         for_each_domain(cpu, sd) {
5280                 snprintf(buf, 32, "domain%d", i);
5281                 entry->ctl_name = i + 1;
5282                 entry->procname = kstrdup(buf, GFP_KERNEL);
5283                 entry->mode = 0755;
5284                 entry->child = sd_alloc_ctl_domain_table(sd);
5285                 entry++;
5286                 i++;
5287         }
5288         return table;
5289 }
5290
5291 static struct ctl_table_header *sd_sysctl_header;
5292 static void init_sched_domain_sysctl(void)
5293 {
5294         int i, cpu_num = num_online_cpus();
5295         struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5296         char buf[32];
5297
5298         sd_ctl_dir[0].child = entry;
5299
5300         for (i = 0; i < cpu_num; i++, entry++) {
5301                 snprintf(buf, 32, "cpu%d", i);
5302                 entry->ctl_name = i + 1;
5303                 entry->procname = kstrdup(buf, GFP_KERNEL);
5304                 entry->mode = 0755;
5305                 entry->child = sd_alloc_ctl_cpu_table(i);
5306         }
5307         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5308 }
5309 #else
/*
 * No-op variant, compiled when the preprocessor condition guarding the
 * sysctl code above is false: there is nothing to register.
 */
static void init_sched_domain_sysctl(void)
{
}
5313 #endif
5314
5315 /*
5316  * migration_call - callback that gets triggered when a CPU is added.
5317  * Here we can start up the necessary migration thread for the new CPU.
5318  */
5319 static int __cpuinit
5320 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5321 {
5322         struct task_struct *p;
5323         int cpu = (long)hcpu;
5324         unsigned long flags;
5325         struct rq *rq;
5326
5327         switch (action) {
5328         case CPU_LOCK_ACQUIRE:
5329                 mutex_lock(&sched_hotcpu_mutex);
5330                 break;
5331
5332         case CPU_UP_PREPARE:
5333         case CPU_UP_PREPARE_FROZEN:
5334                 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5335                 if (IS_ERR(p))
5336                         return NOTIFY_BAD;
5337                 kthread_bind(p, cpu);
5338                 /* Must be high prio: stop_machine expects to yield to it. */
5339                 rq = task_rq_lock(p, &flags);
5340                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5341                 task_rq_unlock(rq, &flags);
5342                 cpu_rq(cpu)->migration_thread = p;
5343                 break;
5344
5345         case CPU_ONLINE:
5346         case CPU_ONLINE_FROZEN:
5347                 /* Strictly unneccessary, as first user will wake it. */
5348                 wake_up_process(cpu_rq(cpu)->migration_thread);
5349                 break;
5350
5351 #ifdef CONFIG_HOTPLUG_CPU
5352         case CPU_UP_CANCELED:
5353         case CPU_UP_CANCELED_FROZEN:
5354                 if (!cpu_rq(cpu)->migration_thread)
5355                         break;
5356                 /* Unbind it from offline cpu so it can run.  Fall thru. */
5357                 kthread_bind(cpu_rq(cpu)->migration_thread,
5358                              any_online_cpu(cpu_online_map));
5359                 kthread_stop(cpu_rq(cpu)->migration_thread);
5360                 cpu_rq(cpu)->migration_thread = NULL;
5361                 break;
5362
5363         case CPU_DEAD:
5364         case CPU_DEAD_FROZEN:
5365                 migrate_live_tasks(cpu);
5366                 rq = cpu_rq(cpu);
5367                 kthread_stop(rq->migration_thread);
5368                 rq->migration_thread = NULL;
5369                 /* Idle task back to normal (off runqueue, low prio) */
5370                 rq = task_rq_lock(rq->idle, &flags);
5371                 deactivate_task(rq, rq->idle, 0);
5372                 rq->idle->static_prio = MAX_PRIO;
5373                 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5374                 rq->idle->sched_class = &idle_sched_class;
5375                 migrate_dead_tasks(cpu);
5376                 task_rq_unlock(rq, &flags);
5377                 migrate_nr_uninterruptible(rq);
5378                 BUG_ON(rq->nr_running != 0);
5379
5380                 /* No need to migrate the tasks: it was best-effort if
5381                  * they didn't take sched_hotcpu_mutex.  Just wake up
5382                  * the requestors. */
5383                 spin_lock_irq(&rq->lock);
5384                 while (!list_empty(&rq->migration_queue)) {
5385                         struct migration_req *req;
5386
5387                         req = list_entry(rq->migration_queue.next,
5388                                          struct migration_req, list);
5389                         list_del_init(&req->list);
5390                         complete(&req->done);
5391                 }
5392                 spin_unlock_irq(&rq->lock);
5393                 break;
5394 #endif
5395         case CPU_LOCK_RELEASE:
5396                 mutex_unlock(&sched_hotcpu_mutex);
5397                 break;
5398         }
5399         return NOTIFY_OK;
5400 }
5401
/*
 * Register at highest priority so that the task migration performed by
 * migration_call() happens before everything else.
 */
static struct notifier_block __cpuinitdata migration_notifier = {
        .notifier_call = migration_call,
        .priority = 10
};
5409
5410 int __init migration_init(void)
5411 {
5412         void *cpu = (void *)(long)smp_processor_id();
5413         int err;
5414
5415         /* Start one for the boot CPU: */
5416         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5417         BUG_ON(err == NOTIFY_BAD);
5418         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5419         register_cpu_notifier(&migration_notifier);
5420
5421         return 0;
5422 }
5423 #endif
5424
5425 #ifdef CONFIG_SMP
5426
/*
 * Number of possible processor ids.  Initialised to the compile-time
 * limit NR_CPUS and exported via EXPORT_SYMBOL.
 */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
5430
5431 #undef SCHED_DOMAIN_DEBUG
5432 #ifdef SCHED_DOMAIN_DEBUG
5433 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5434 {
5435         int level = 0;
5436
5437         if (!sd) {
5438                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5439                 return;
5440         }
5441
5442         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5443
5444         do {
5445                 int i;
5446                 char str[NR_CPUS];
5447                 struct sched_group *group = sd->groups;
5448                 cpumask_t groupmask;
5449
5450                 cpumask_scnprintf(str, NR_CPUS, sd->span);
5451                 cpus_clear(groupmask);
5452
5453                 printk(KERN_DEBUG);
5454                 for (i = 0; i < level + 1; i++)
5455                         printk(" ");
5456                 printk("domain %d: ", level);
5457
5458                 if (!(sd->flags & SD_LOAD_BALANCE)) {
5459                         printk("does not load-balance\n");
5460                         if (sd->parent)
5461                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5462                                                 " has parent");
5463                         break;
5464                 }
5465
5466                 printk("span %s\n", str);
5467
5468                 if (!cpu_isset(cpu, sd->span))
5469                         printk(KERN_ERR "ERROR: domain->span does not contain "
5470                                         "CPU%d\n", cpu);
5471                 if (!cpu_isset(cpu, group->cpumask))
5472                         printk(KERN_ERR "ERROR: domain->groups does not contain"
5473                                         " CPU%d\n", cpu);
5474
5475                 printk(KERN_DEBUG);
5476                 for (i = 0; i < level + 2; i++)
5477                         printk(" ");
5478                 printk("groups:");
5479                 do {
5480                         if (!group) {
5481                                 printk("\n");
5482                                 printk(KERN_ERR "ERROR: group is NULL\n");
5483                                 break;
5484                         }
5485
5486                         if (!group->__cpu_power) {
5487                                 printk("\n");
5488                                 printk(KERN_ERR "ERROR: domain->cpu_power not "
5489                                                 "set\n");
5490                         }
5491
5492                         if (!cpus_weight(group->cpumask)) {
5493                                 printk("\n");
5494                                 printk(KERN_ERR "ERROR: empty group\n");
5495                         }
5496
5497                         if (cpus_intersects(groupmask, group->cpumask)) {
5498                                 printk("\n");
5499                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
5500                         }
5501
5502                         cpus_or(groupmask, groupmask, group->cpumask);
5503
5504                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5505                         printk(" %s", str);
5506
5507                         group = group->next;
5508                 } while (group != sd->groups);
5509                 printk("\n");
5510
5511                 if (!cpus_equal(sd->span, groupmask))
5512                         printk(KERN_ERR "ERROR: groups don't span "
5513                                         "domain->span\n");
5514
5515                 level++;
5516                 sd = sd->parent;
5517                 if (!sd)
5518                         continue;
5519
5520                 if (!cpus_subset(groupmask, sd->span))
5521                         printk(KERN_ERR "ERROR: parent span is not a superset "
5522                                 "of domain->span\n");
5523
5524         } while (sd);
5525 }
5526 #else
/* Debugging disabled: sched_domain_debug() expands to an empty statement. */
# define sched_domain_debug(sd, cpu) do { } while (0)
5528 #endif
5529
5530 static int sd_degenerate(struct sched_domain *sd)
5531 {
5532         if (cpus_weight(sd->span) == 1)
5533                 return 1;
5534
5535         /* Following flags need at least 2 groups */
5536         if (sd->flags & (SD_LOAD_BALANCE |
5537                          SD_BALANCE_NEWIDLE |
5538                          SD_BALANCE_FORK |
5539                          SD_BALANCE_EXEC |
5540                          SD_SHARE_CPUPOWER |
5541                          SD_SHARE_PKG_RESOURCES)) {
5542                 if (sd->groups != sd->groups->next)
5543                         return 0;
5544         }
5545
5546         /* Following flags don't use groups */
5547         if (sd->flags & (SD_WAKE_IDLE |
5548                          SD_WAKE_AFFINE |
5549                          SD_WAKE_BALANCE))
5550                 return 0;
5551
5552         return 1;
5553 }
5554
5555 static int
5556 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5557 {
5558         unsigned long cflags = sd->flags, pflags = parent->flags;
5559
5560         if (sd_degenerate(parent))
5561                 return 1;
5562
5563         if (!cpus_equal(sd->span, parent->span))
5564                 return 0;
5565
5566         /* Does parent contain flags not in child? */
5567         /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5568         if (cflags & SD_WAKE_AFFINE)
5569                 pflags &= ~SD_WAKE_BALANCE;
5570         /* Flags needing groups don't count if only 1 group in parent */
5571         if (parent->groups == parent->groups->next) {
5572                 pflags &= ~(SD_LOAD_BALANCE |
5573                                 SD_BALANCE_NEWIDLE |
5574                                 SD_BALANCE_FORK |
5575                                 SD_BALANCE_EXEC |
5576                                 SD_SHARE_CPUPOWER |
5577                                 SD_SHARE_PKG_RESOURCES);
5578         }
5579         if (~cflags & pflags)
5580                 return 0;
5581
5582         return 1;
5583 }
5584
5585 /*
5586  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
5587  * hold the hotplug lock.
5588  */
5589 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5590 {
5591         struct rq *rq = cpu_rq(cpu);
5592         struct sched_domain *tmp;
5593
5594         /* Remove the sched domains which do not contribute to scheduling. */
5595         for (tmp = sd; tmp; tmp = tmp->parent) {
5596                 struct sched_domain *parent = tmp->parent;
5597                 if (!parent)
5598                         break;
5599                 if (sd_parent_degenerate(tmp, parent)) {
5600                         tmp->parent = parent->parent;
5601                         if (parent->parent)
5602                                 parent->parent->child = tmp;
5603                 }
5604         }
5605
5606         if (sd && sd_degenerate(sd)) {
5607                 sd = sd->parent;
5608                 if (sd)
5609                         sd->child = NULL;
5610         }
5611
5612         sched_domain_debug(sd, cpu);
5613
5614         rcu_assign_pointer(rq->sd, sd);
5615 }
5616
5617 /* cpus with isolated domains */
5618 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5619
5620 /* Setup the mask of cpus configured for isolated domains */
5621 static int __init isolated_cpu_setup(char *str)
5622 {
5623         int ints[NR_CPUS], i;
5624
5625         str = get_options(str, ARRAY_SIZE(ints), ints);
5626         cpus_clear(cpu_isolated_map);
5627         for (i = 1; i <= ints[0]; i++)
5628                 if (ints[i] < NR_CPUS)
5629                         cpu_set(ints[i], cpu_isolated_map);
5630         return 1;
5631 }
5632
5633 __setup ("isolcpus=", isolated_cpu_setup);
5634
5635 /*
5636  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5637  * to a function which identifies what group(along with sched group) a CPU
5638  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5639  * (due to the fact that we keep track of groups covered with a cpumask_t).
5640  *
5641  * init_sched_build_groups will build a circular linked list of the groups
5642  * covered by the given span, and will set each group's ->cpumask correctly,
5643  * and ->cpu_power to 0.
5644  */
5645 static void
5646 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5647                         int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5648                                         struct sched_group **sg))
5649 {
5650         struct sched_group *first = NULL, *last = NULL;
5651         cpumask_t covered = CPU_MASK_NONE;
5652         int i;
5653
5654         for_each_cpu_mask(i, span) {
5655                 struct sched_group *sg;
5656                 int group = group_fn(i, cpu_map, &sg);
5657                 int j;
5658
5659                 if (cpu_isset(i, covered))
5660                         continue;
5661
5662                 sg->cpumask = CPU_MASK_NONE;
5663                 sg->__cpu_power = 0;
5664
5665                 for_each_cpu_mask(j, span) {
5666                         if (group_fn(j, cpu_map, NULL) != group)
5667                                 continue;
5668
5669                         cpu_set(j, covered);
5670                         cpu_set(j, sg->cpumask);
5671                 }
5672                 if (!first)
5673                         first = sg;
5674                 if (last)
5675                         last->next = sg;
5676                 last = sg;
5677         }
5678         last->next = first;
5679 }
5680
5681 #define SD_NODES_PER_DOMAIN 16
5682
5683 #ifdef CONFIG_NUMA
5684
5685 /**
5686  * find_next_best_node - find the next node to include in a sched_domain
5687  * @node: node whose sched_domain we're building
5688  * @used_nodes: nodes already in the sched_domain
5689  *
5690  * Find the next node to include in a given scheduling domain.  Simply
5691  * finds the closest node not already in the @used_nodes map.
5692  *
5693  * Should use nodemask_t.
5694  */
5695 static int find_next_best_node(int node, unsigned long *used_nodes)
5696 {
5697         int i, n, val, min_val, best_node = 0;
5698
5699         min_val = INT_MAX;
5700
5701         for (i = 0; i < MAX_NUMNODES; i++) {
5702                 /* Start at @node */
5703                 n = (node + i) % MAX_NUMNODES;
5704
5705                 if (!nr_cpus_node(n))
5706                         continue;
5707
5708                 /* Skip already used nodes */
5709                 if (test_bit(n, used_nodes))
5710                         continue;
5711
5712                 /* Simple min distance search */
5713                 val = node_distance(node, n);
5714
5715                 if (val < min_val) {
5716                         min_val = val;
5717                         best_node = n;
5718                 }
5719         }
5720
5721         set_bit(best_node, used_nodes);
5722         return best_node;
5723 }
5724
5725 /**
5726  * sched_domain_node_span - get a cpumask for a node's sched_domain
5727  * @node: node whose cpumask we're constructing
5728  * @size: number of nodes to include in this span
5729  *
5730  * Given a node, construct a good cpumask for its sched_domain to span.  It
5731  * should be one that prevents unnecessary balancing, but also spreads tasks
5732  * out optimally.
5733  */
5734 static cpumask_t sched_domain_node_span(int node)
5735 {
5736         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5737         cpumask_t span, nodemask;
5738         int i;
5739
5740         cpus_clear(span);
5741         bitmap_zero(used_nodes, MAX_NUMNODES);
5742
5743         nodemask = node_to_cpumask(node);
5744         cpus_or(span, span, nodemask);
5745         set_bit(node, used_nodes);
5746
5747         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5748                 int next_node = find_next_best_node(node, used_nodes);
5749
5750                 nodemask = node_to_cpumask(next_node);
5751                 cpus_or(span, span, nodemask);
5752         }
5753
5754         return span;
5755 }
5756 #endif
5757
5758 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5759
5760 /*
5761  * SMT sched-domains:
5762  */
5763 #ifdef CONFIG_SCHED_SMT
5764 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5765 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5766
5767 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5768                             struct sched_group **sg)
5769 {
5770         if (sg)
5771                 *sg = &per_cpu(sched_group_cpus, cpu);
5772         return cpu;
5773 }
5774 #endif
5775
5776 /*
5777  * multi-core sched-domains:
5778  */
5779 #ifdef CONFIG_SCHED_MC
5780 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5781 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5782 #endif
5783
5784 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5785 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5786                              struct sched_group **sg)
5787 {
5788         int group;
5789         cpumask_t mask = cpu_sibling_map[cpu];
5790         cpus_and(mask, mask, *cpu_map);
5791         group = first_cpu(mask);
5792         if (sg)
5793                 *sg = &per_cpu(sched_group_core, group);
5794         return group;
5795 }
5796 #elif defined(CONFIG_SCHED_MC)
5797 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5798                              struct sched_group **sg)
5799 {
5800         if (sg)
5801                 *sg = &per_cpu(sched_group_core, cpu);
5802         return cpu;
5803 }
5804 #endif
5805
5806 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5807 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5808
5809 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5810                              struct sched_group **sg)
5811 {
5812         int group;
5813 #ifdef CONFIG_SCHED_MC
5814         cpumask_t mask = cpu_coregroup_map(cpu);
5815         cpus_and(mask, mask, *cpu_map);
5816         group = first_cpu(mask);
5817 #elif defined(CONFIG_SCHED_SMT)
5818         cpumask_t mask = cpu_sibling_map[cpu];
5819         cpus_and(mask, mask, *cpu_map);
5820         group = first_cpu(mask);
5821 #else
5822         group = cpu;
5823 #endif
5824         if (sg)
5825                 *sg = &per_cpu(sched_group_phys, group);
5826         return group;
5827 }
5828
5829 #ifdef CONFIG_NUMA
5830 /*
5831  * The init_sched_build_groups can't handle what we want to do with node
5832  * groups, so roll our own. Now each node has its own list of groups which
5833  * gets dynamically allocated.
5834  */
5835 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5836 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5837
5838 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5839 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5840
5841 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5842                                  struct sched_group **sg)
5843 {
5844         cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5845         int group;
5846
5847         cpus_and(nodemask, nodemask, *cpu_map);
5848         group = first_cpu(nodemask);
5849
5850         if (sg)
5851                 *sg = &per_cpu(sched_group_allnodes, group);
5852         return group;
5853 }
5854
5855 static void init_numa_sched_groups_power(struct sched_group *group_head)
5856 {
5857         struct sched_group *sg = group_head;
5858         int j;
5859
5860         if (!sg)
5861                 return;
5862 next_sg:
5863         for_each_cpu_mask(j, sg->cpumask) {
5864                 struct sched_domain *sd;
5865
5866                 sd = &per_cpu(phys_domains, j);
5867                 if (j != first_cpu(sd->groups->cpumask)) {
5868                         /*
5869                          * Only add "power" once for each
5870                          * physical package.
5871                          */
5872                         continue;
5873                 }
5874
5875                 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5876         }
5877         sg = sg->next;
5878         if (sg != group_head)
5879                 goto next_sg;
5880 }
5881 #endif
5882
5883 #ifdef CONFIG_NUMA
5884 /* Free memory allocated for various sched_group structures */
5885 static void free_sched_groups(const cpumask_t *cpu_map)
5886 {
5887         int cpu, i;
5888
5889         for_each_cpu_mask(cpu, *cpu_map) {
5890                 struct sched_group **sched_group_nodes
5891                         = sched_group_nodes_bycpu[cpu];
5892
5893                 if (!sched_group_nodes)
5894                         continue;
5895
5896                 for (i = 0; i < MAX_NUMNODES; i++) {
5897                         cpumask_t nodemask = node_to_cpumask(i);
5898                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
5899
5900                         cpus_and(nodemask, nodemask, *cpu_map);
5901                         if (cpus_empty(nodemask))
5902                                 continue;
5903
5904                         if (sg == NULL)
5905                                 continue;
5906                         sg = sg->next;
5907 next_sg:
5908                         oldsg = sg;
5909                         sg = sg->next;
5910                         kfree(oldsg);
5911                         if (oldsg != sched_group_nodes[i])
5912                                 goto next_sg;
5913                 }
5914                 kfree(sched_group_nodes);
5915                 sched_group_nodes_bycpu[cpu] = NULL;
5916         }
5917 }
5918 #else
/* Without CONFIG_NUMA there are no per-node groups here to free. */
static void free_sched_groups(const cpumask_t *cpu_map)
{
}
5922 #endif
5923
5924 /*
5925  * Initialize sched groups cpu_power.
5926  *
5927  * cpu_power indicates the capacity of sched group, which is used while
5928  * distributing the load between different sched groups in a sched domain.
5929  * Typically cpu_power for all the groups in a sched domain will be same unless
5930  * there are asymmetries in the topology. If there are asymmetries, group
5931  * having more cpu_power will pickup more load compared to the group having
5932  * less cpu_power.
5933  *
5934  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5935  * the maximum number of tasks a group can handle in the presence of other idle
5936  * or lightly loaded groups in the same sched domain.
5937  */
5938 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5939 {
5940         struct sched_domain *child;
5941         struct sched_group *group;
5942
5943         WARN_ON(!sd || !sd->groups);
5944
5945         if (cpu != first_cpu(sd->groups->cpumask))
5946                 return;
5947
5948         child = sd->child;
5949
5950         sd->groups->__cpu_power = 0;
5951
5952         /*
5953          * For perf policy, if the groups in child domain share resources
5954          * (for example cores sharing some portions of the cache hierarchy
5955          * or SMT), then set this domain groups cpu_power such that each group
5956          * can handle only one task, when there are other idle groups in the
5957          * same sched domain.
5958          */
5959         if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5960                        (child->flags &
5961                         (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5962                 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5963                 return;
5964         }
5965
5966         /*
5967          * add cpu_power of each child group to this groups cpu_power
5968          */
5969         group = child->groups;
5970         do {
5971                 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5972                 group = group->next;
5973         } while (group != child->groups);
5974 }
5975
5976 /*
5977  * Build sched domains for a given set of cpus and attach the sched domains
5978  * to the individual cpus
5979  */
5980 static int build_sched_domains(const cpumask_t *cpu_map)
5981 {
5982         int i;
5983 #ifdef CONFIG_NUMA
5984         struct sched_group **sched_group_nodes = NULL;
5985         int sd_allnodes = 0;
5986
5987         /*
5988          * Allocate the per-node list of sched groups
5989          */
5990         sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5991                                            GFP_KERNEL);
5992         if (!sched_group_nodes) {
5993                 printk(KERN_WARNING "Can not alloc sched group node list\n");
5994                 return -ENOMEM;
5995         }
5996         sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5997 #endif
5998
5999         /*
6000          * Set up domains for cpus specified by the cpu_map.
6001          */
6002         for_each_cpu_mask(i, *cpu_map) {
6003                 struct sched_domain *sd = NULL, *p;
6004                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6005
6006                 cpus_and(nodemask, nodemask, *cpu_map);
6007
6008 #ifdef CONFIG_NUMA
6009                 if (cpus_weight(*cpu_map) >
6010                                 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6011                         sd = &per_cpu(allnodes_domains, i);
6012                         *sd = SD_ALLNODES_INIT;
6013                         sd->span = *cpu_map;
6014                         cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6015                         p = sd;
6016                         sd_allnodes = 1;
6017                 } else
6018                         p = NULL;
6019
6020                 sd = &per_cpu(node_domains, i);
6021                 *sd = SD_NODE_INIT;
6022                 sd->span = sched_domain_node_span(cpu_to_node(i));
6023                 sd->parent = p;
6024                 if (p)
6025                         p->child = sd;
6026                 cpus_and(sd->span, sd->span, *cpu_map);
6027 #endif
6028
6029                 p = sd;
6030                 sd = &per_cpu(phys_domains, i);
6031                 *sd = SD_CPU_INIT;
6032                 sd->span = nodemask;
6033                 sd->parent = p;
6034                 if (p)
6035                         p->child = sd;
6036                 cpu_to_phys_group(i, cpu_map, &sd->groups);
6037
6038 #ifdef CONFIG_SCHED_MC
6039                 p = sd;
6040                 sd = &per_cpu(core_domains, i);
6041                 *sd = SD_MC_INIT;
6042                 sd->span = cpu_coregroup_map(i);
6043                 cpus_and(sd->span, sd->span, *cpu_map);
6044                 sd->parent = p;
6045                 p->child = sd;
6046                 cpu_to_core_group(i, cpu_map, &sd->groups);
6047 #endif
6048
6049 #ifdef CONFIG_SCHED_SMT
6050                 p = sd;
6051                 sd = &per_cpu(cpu_domains, i);
6052                 *sd = SD_SIBLING_INIT;
6053                 sd->span = cpu_sibling_map[i];
6054                 cpus_and(sd->span, sd->span, *cpu_map);
6055                 sd->parent = p;
6056                 p->child = sd;
6057                 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6058 #endif
6059         }
6060
6061 #ifdef CONFIG_SCHED_SMT
6062         /* Set up CPU (sibling) groups */
6063         for_each_cpu_mask(i, *cpu_map) {
6064                 cpumask_t this_sibling_map = cpu_sibling_map[i];
6065                 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6066                 if (i != first_cpu(this_sibling_map))
6067                         continue;
6068
6069                 init_sched_build_groups(this_sibling_map, cpu_map,
6070                                         &cpu_to_cpu_group);
6071         }
6072 #endif
6073
6074 #ifdef CONFIG_SCHED_MC
6075         /* Set up multi-core groups */
6076         for_each_cpu_mask(i, *cpu_map) {
6077                 cpumask_t this_core_map = cpu_coregroup_map(i);
6078                 cpus_and(this_core_map, this_core_map, *cpu_map);
6079                 if (i != first_cpu(this_core_map))
6080                         continue;
6081                 init_sched_build_groups(this_core_map, cpu_map,
6082                                         &cpu_to_core_group);
6083         }
6084 #endif
6085
6086         /* Set up physical groups */
6087         for (i = 0; i < MAX_NUMNODES; i++) {
6088                 cpumask_t nodemask = node_to_cpumask(i);
6089
6090                 cpus_and(nodemask, nodemask, *cpu_map);
6091                 if (cpus_empty(nodemask))
6092                         continue;
6093
6094                 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6095         }
6096
6097 #ifdef CONFIG_NUMA
6098         /* Set up node groups */
6099         if (sd_allnodes)
6100                 init_sched_build_groups(*cpu_map, cpu_map,
6101                                         &cpu_to_allnodes_group);
6102
6103         for (i = 0; i < MAX_NUMNODES; i++) {
6104                 /* Set up node groups */
6105                 struct sched_group *sg, *prev;
6106                 cpumask_t nodemask = node_to_cpumask(i);
6107                 cpumask_t domainspan;
6108                 cpumask_t covered = CPU_MASK_NONE;
6109                 int j;
6110
6111                 cpus_and(nodemask, nodemask, *cpu_map);
6112                 if (cpus_empty(nodemask)) {
6113                         sched_group_nodes[i] = NULL;
6114                         continue;
6115                 }
6116
6117                 domainspan = sched_domain_node_span(i);
6118                 cpus_and(domainspan, domainspan, *cpu_map);
6119
6120                 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6121                 if (!sg) {
6122                         printk(KERN_WARNING "Can not alloc domain group for "
6123                                 "node %d\n", i);
6124                         goto error;
6125                 }
6126                 sched_group_nodes[i] = sg;
6127                 for_each_cpu_mask(j, nodemask) {
6128                         struct sched_domain *sd;
6129
6130                         sd = &per_cpu(node_domains, j);
6131                         sd->groups = sg;
6132                 }
6133                 sg->__cpu_power = 0;
6134                 sg->cpumask = nodemask;
6135                 sg->next = sg;
6136                 cpus_or(covered, covered, nodemask);
6137                 prev = sg;
6138
6139                 for (j = 0; j < MAX_NUMNODES; j++) {
6140                         cpumask_t tmp, notcovered;
6141                         int n = (i + j) % MAX_NUMNODES;
6142
6143                         cpus_complement(notcovered, covered);
6144                         cpus_and(tmp, notcovered, *cpu_map);
6145                         cpus_and(tmp, tmp, domainspan);
6146                         if (cpus_empty(tmp))
6147                                 break;
6148
6149                         nodemask = node_to_cpumask(n);
6150                         cpus_and(tmp, tmp, nodemask);
6151                         if (cpus_empty(tmp))
6152                                 continue;
6153
6154                         sg = kmalloc_node(sizeof(struct sched_group),
6155                                           GFP_KERNEL, i);
6156                         if (!sg) {
6157                                 printk(KERN_WARNING
6158                                 "Can not alloc domain group for node %d\n", j);
6159                                 goto error;
6160                         }
6161                         sg->__cpu_power = 0;
6162                         sg->cpumask = tmp;
6163                         sg->next = prev->next;
6164                         cpus_or(covered, covered, tmp);
6165                         prev->next = sg;
6166                         prev = sg;
6167                 }
6168         }
6169 #endif
6170
6171         /* Calculate CPU power for physical packages and nodes */
6172 #ifdef CONFIG_SCHED_SMT
6173         for_each_cpu_mask(i, *cpu_map) {
6174                 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6175
6176                 init_sched_groups_power(i, sd);
6177         }
6178 #endif
6179 #ifdef CONFIG_SCHED_MC
6180         for_each_cpu_mask(i, *cpu_map) {
6181                 struct sched_domain *sd = &per_cpu(core_domains, i);
6182
6183                 init_sched_groups_power(i, sd);
6184         }
6185 #endif
6186
6187         for_each_cpu_mask(i, *cpu_map) {
6188                 struct sched_domain *sd = &per_cpu(phys_domains, i);
6189
6190                 init_sched_groups_power(i, sd);
6191         }
6192
6193 #ifdef CONFIG_NUMA
6194         for (i = 0; i < MAX_NUMNODES; i++)
6195                 init_numa_sched_groups_power(sched_group_nodes[i]);
6196
6197         if (sd_allnodes) {
6198                 struct sched_group *sg;
6199
6200                 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6201                 init_numa_sched_groups_power(sg);
6202         }
6203 #endif
6204
6205         /* Attach the domains */
6206         for_each_cpu_mask(i, *cpu_map) {
6207                 struct sched_domain *sd;
6208 #ifdef CONFIG_SCHED_SMT
6209                 sd = &per_cpu(cpu_domains, i);
6210 #elif defined(CONFIG_SCHED_MC)
6211                 sd = &per_cpu(core_domains, i);
6212 #else
6213                 sd = &per_cpu(phys_domains, i);
6214 #endif
6215                 cpu_attach_domain(sd, i);
6216         }
6217
6218         return 0;
6219
6220 #ifdef CONFIG_NUMA
6221 error:
6222         free_sched_groups(cpu_map);
6223         return -ENOMEM;
6224 #endif
6225 }
6226 /*
6227  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
6228  */
6229 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6230 {
6231         cpumask_t cpu_default_map;
6232         int err;
6233
6234         /*
6235          * Setup mask for cpus without special case scheduling requirements.
6236          * For now this just excludes isolated cpus, but could be used to
6237          * exclude other special cases in the future.
6238          */
6239         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6240
6241         err = build_sched_domains(&cpu_default_map);
6242
6243         return err;
6244 }
6245
6246 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6247 {
6248         free_sched_groups(cpu_map);
6249 }
6250
6251 /*
6252  * Detach sched domains from a group of cpus specified in cpu_map
6253  * These cpus will now be attached to the NULL domain
6254  */
6255 static void detach_destroy_domains(const cpumask_t *cpu_map)
6256 {
6257         int i;
6258
6259         for_each_cpu_mask(i, *cpu_map)
6260                 cpu_attach_domain(NULL, i);
6261         synchronize_sched();
6262         arch_destroy_sched_domains(cpu_map);
6263 }
6264
6265 /*
6266  * Partition sched domains as specified by the cpumasks below.
6267  * This attaches all cpus from the cpumasks to the NULL domain,
6268  * waits for a RCU quiescent period, recalculates sched
6269  * domain information and then attaches them back to the
6270  * correct sched domains
6271  * Call with hotplug lock held
6272  */
6273 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6274 {
6275         cpumask_t change_map;
6276         int err = 0;
6277
6278         cpus_and(*partition1, *partition1, cpu_online_map);
6279         cpus_and(*partition2, *partition2, cpu_online_map);
6280         cpus_or(change_map, *partition1, *partition2);
6281
6282         /* Detach sched domains from all of the affected cpus */
6283         detach_destroy_domains(&change_map);
6284         if (!cpus_empty(*partition1))
6285                 err = build_sched_domains(partition1);
6286         if (!err && !cpus_empty(*partition2))
6287                 err = build_sched_domains(partition2);
6288
6289         return err;
6290 }
6291
6292 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6293 int arch_reinit_sched_domains(void)
6294 {
6295         int err;
6296
6297         mutex_lock(&sched_hotcpu_mutex);
6298         detach_destroy_domains(&cpu_online_map);
6299         err = arch_init_sched_domains(&cpu_online_map);
6300         mutex_unlock(&sched_hotcpu_mutex);
6301
6302         return err;
6303 }
6304
6305 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6306 {
6307         int ret;
6308
6309         if (buf[0] != '0' && buf[0] != '1')
6310                 return -EINVAL;
6311
6312         if (smt)
6313                 sched_smt_power_savings = (buf[0] == '1');
6314         else
6315                 sched_mc_power_savings = (buf[0] == '1');
6316
6317         ret = arch_reinit_sched_domains();
6318
6319         return ret ? ret : count;
6320 }
6321
/*
 * Create the power-savings sysfs files under @cls.
 *
 * With CONFIG_SCHED_SMT the SMT attribute is created when smt_capable()
 * is true; with CONFIG_SCHED_MC the MC attribute is created when
 * mc_capable() is true, unless creating the SMT file already failed.
 *
 * Returns 0 on success or the first sysfs_create_file() error.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
        int ret = 0;

#ifdef CONFIG_SCHED_SMT
        if (smt_capable())
                ret = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
        /* Do not even probe for MC support once something has failed. */
        if (ret)
                return ret;

        if (mc_capable())
                ret = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_mc_power_savings.attr);
#endif
        return ret;
}
6338 #endif
6339
6340 #ifdef CONFIG_SCHED_MC
6341 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6342 {
6343         return sprintf(page, "%u\n", sched_mc_power_savings);
6344 }
6345 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6346                                             const char *buf, size_t count)
6347 {
6348         return sched_power_savings_store(buf, count, 0);
6349 }
6350 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6351             sched_mc_power_savings_store);
6352 #endif
6353
6354 #ifdef CONFIG_SCHED_SMT
6355 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6356 {
6357         return sprintf(page, "%u\n", sched_smt_power_savings);
6358 }
6359 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6360                                              const char *buf, size_t count)
6361 {
6362         return sched_power_savings_store(buf, count, 1);
6363 }
6364 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6365             sched_smt_power_savings_store);
6366 #endif
6367
6368 /*
6369  * Force a reinitialization of the sched domains hierarchy.  The domains
6370  * and groups cannot be updated in place without racing with the balancing
6371  * code, so we temporarily attach all running cpus to the NULL domain
6372  * which will prevent rebalancing while the sched domains are recalculated.
6373  */
6374 static int update_sched_domains(struct notifier_block *nfb,
6375                                 unsigned long action, void *hcpu)
6376 {
6377         switch (action) {
6378         case CPU_UP_PREPARE:
6379         case CPU_UP_PREPARE_FROZEN:
6380         case CPU_DOWN_PREPARE:
6381         case CPU_DOWN_PREPARE_FROZEN:
6382                 detach_destroy_domains(&cpu_online_map);
6383                 return NOTIFY_OK;
6384
6385         case CPU_UP_CANCELED:
6386         case CPU_UP_CANCELED_FROZEN:
6387         case CPU_DOWN_FAILED:
6388         case CPU_DOWN_FAILED_FROZEN:
6389         case CPU_ONLINE:
6390         case CPU_ONLINE_FROZEN:
6391         case CPU_DEAD:
6392         case CPU_DEAD_FROZEN:
6393                 /*
6394                  * Fall through and re-initialise the domains.
6395                  */
6396                 break;
6397         default:
6398                 return NOTIFY_DONE;
6399         }
6400
6401         /* The hotplug lock is already held by cpu_up/cpu_down */
6402         arch_init_sched_domains(&cpu_online_map);
6403
6404         return NOTIFY_OK;
6405 }
6406
6407 void __init sched_init_smp(void)
6408 {
6409         cpumask_t non_isolated_cpus;
6410
6411         mutex_lock(&sched_hotcpu_mutex);
6412         arch_init_sched_domains(&cpu_online_map);
6413         cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6414         if (cpus_empty(non_isolated_cpus))
6415                 cpu_set(smp_processor_id(), non_isolated_cpus);
6416         mutex_unlock(&sched_hotcpu_mutex);
6417         /* XXX: Theoretical race here - CPU may be hotplugged now */
6418         hotcpu_notifier(update_sched_domains, 0);
6419
6420         init_sched_domain_sysctl();
6421
6422         /* Move init over to a non-isolated CPU */
6423         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6424                 BUG();
6425         sched_init_granularity();
6426 }
6427 #else
6428 void __init sched_init_smp(void)
6429 {
6430         sched_init_granularity();
6431 }
6432 #endif /* CONFIG_SMP */
6433
/*
 * Tell whether @addr lies inside scheduler code.
 *
 * Returns non-zero when in_lock_functions() accepts @addr or when @addr
 * falls in the half-open range [__sched_text_start, __sched_text_end),
 * and 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
        /* Linker adds these: start and end of __sched functions */
        extern char __sched_text_start[], __sched_text_end[];
        unsigned long text_start = (unsigned long)__sched_text_start;
        unsigned long text_end = (unsigned long)__sched_text_end;

        if (in_lock_functions(addr))
                return 1;

        return addr >= text_start && addr < text_end;
}
6443
6444 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6445 {
6446         cfs_rq->tasks_timeline = RB_ROOT;
6447         cfs_rq->fair_clock = 1;
6448 #ifdef CONFIG_FAIR_GROUP_SCHED
6449         cfs_rq->rq = rq;
6450 #endif
6451 }
6452
6453 void __init sched_init(void)
6454 {
6455         u64 now = sched_clock();
6456         int highest_cpu = 0;
6457         int i, j;
6458
6459         /*
6460          * Link up the scheduling class hierarchy:
6461          */
6462         rt_sched_class.next = &fair_sched_class;
6463         fair_sched_class.next = &idle_sched_class;
6464         idle_sched_class.next = NULL;
6465
6466         for_each_possible_cpu(i) {
6467                 struct rt_prio_array *array;
6468                 struct rq *rq;
6469
6470                 rq = cpu_rq(i);
6471                 spin_lock_init(&rq->lock);
6472                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6473                 rq->nr_running = 0;
6474                 rq->clock = 1;
6475                 init_cfs_rq(&rq->cfs, rq);
6476 #ifdef CONFIG_FAIR_GROUP_SCHED
6477                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6478                 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6479 #endif
6480                 rq->ls.load_update_last = now;
6481                 rq->ls.load_update_start = now;
6482
6483                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6484                         rq->cpu_load[j] = 0;
6485 #ifdef CONFIG_SMP
6486                 rq->sd = NULL;
6487                 rq->active_balance = 0;
6488                 rq->next_balance = jiffies;
6489                 rq->push_cpu = 0;
6490                 rq->cpu = i;
6491                 rq->migration_thread = NULL;
6492                 INIT_LIST_HEAD(&rq->migration_queue);
6493 #endif
6494                 atomic_set(&rq->nr_iowait, 0);
6495
6496                 array = &rq->rt.active;
6497                 for (j = 0; j < MAX_RT_PRIO; j++) {
6498                         INIT_LIST_HEAD(array->queue + j);
6499                         __clear_bit(j, array->bitmap);
6500                 }
6501                 highest_cpu = i;
6502                 /* delimiter for bitsearch: */
6503                 __set_bit(MAX_RT_PRIO, array->bitmap);
6504         }
6505
6506         set_load_weight(&init_task);
6507
6508 #ifdef CONFIG_PREEMPT_NOTIFIERS
6509         INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6510 #endif
6511
6512 #ifdef CONFIG_SMP
6513         nr_cpu_ids = highest_cpu + 1;
6514         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6515 #endif
6516
6517 #ifdef CONFIG_RT_MUTEXES
6518         plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6519 #endif
6520
6521         /*
6522          * The boot idle thread does lazy MMU switching as well:
6523          */
6524         atomic_inc(&init_mm.mm_count);
6525         enter_lazy_tlb(&init_mm, current);
6526
6527         /*
6528          * Make us the idle thread. Technically, schedule() should not be
6529          * called from this thread, however somewhere below it might be,
6530          * but because we are the idle thread, we just pick up running again
6531          * when this runqueue becomes "idle".
6532          */
6533         init_idle(current, smp_processor_id());
6534         /*
6535          * During early bootup we pretend to be a normal task:
6536          */
6537         current->sched_class = &fair_sched_class;
6538 }
6539
6540 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Complain when a function that may sleep is called from a context in
 * which sleeping is not allowed.
 *
 * @file: source file of the call site, printed in the report
 * @line: source line of the call site, printed in the report
 *
 * A report is produced only when in_atomic() or irqs_disabled() is true,
 * the system is in SYSTEM_RUNNING state and no oops is in progress.
 * Reports are rate limited: once one has been printed, further ones are
 * dropped until HZ jiffies have passed.
 *
 * Both lines of the report are printed at KERN_ERR so that they are
 * logged at the same level; a printk() without an explicit level would
 * get the default message level instead and could be filtered out
 * independently of the "BUG:" line it belongs to.
 *
 * The whole body is compiled out when in_atomic is not defined as a macro.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */

        if (!in_atomic() && !irqs_disabled())
                return;
        if (system_state != SYSTEM_RUNNING || oops_in_progress)
                return;

        /* prev_jiffy == 0 means nothing has been reported yet. */
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
        prev_jiffy = jiffies;

        printk(KERN_ERR "BUG: sleeping function called from invalid"
                        " context at %s:%d\n", file, line);
        printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
                in_atomic(), irqs_disabled());
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
6563 #endif
6564
6565 #ifdef CONFIG_MAGIC_SYSRQ
6566 void normalize_rt_tasks(void)
6567 {
6568         struct task_struct *g, *p;
6569         unsigned long flags;
6570         struct rq *rq;
6571         int on_rq;
6572
6573         read_lock_irq(&tasklist_lock);
6574         do_each_thread(g, p) {
6575                 p->se.fair_key                  = 0;
6576                 p->se.wait_runtime              = 0;
6577                 p->se.wait_start_fair           = 0;
6578                 p->se.wait_start                = 0;
6579                 p->se.exec_start                = 0;
6580                 p->se.sleep_start               = 0;
6581                 p->se.sleep_start_fair          = 0;
6582                 p->se.block_start               = 0;
6583                 task_rq(p)->cfs.fair_clock      = 0;
6584                 task_rq(p)->clock               = 0;
6585
6586                 if (!rt_task(p)) {
6587                         /*
6588                          * Renice negative nice level userspace
6589                          * tasks back to 0:
6590                          */
6591                         if (TASK_NICE(p) < 0 && p->mm)
6592                                 set_user_nice(p, 0);
6593                         continue;
6594                 }
6595
6596                 spin_lock_irqsave(&p->pi_lock, flags);
6597                 rq = __task_rq_lock(p);
6598 #ifdef CONFIG_SMP
6599                 /*
6600                  * Do not touch the migration thread:
6601                  */
6602                 if (p == rq->migration_thread)
6603                         goto out_unlock;
6604 #endif
6605
6606                 on_rq = p->se.on_rq;
6607                 if (on_rq)
6608                         deactivate_task(task_rq(p), p, 0);
6609                 __setscheduler(rq, p, SCHED_NORMAL, 0);
6610                 if (on_rq) {
6611                         activate_task(task_rq(p), p, 0);
6612                         resched_task(rq->curr);
6613                 }
6614 #ifdef CONFIG_SMP
6615  out_unlock:
6616 #endif
6617                 __task_rq_unlock(rq);
6618                 spin_unlock_irqrestore(&p->pi_lock, flags);
6619         } while_each_thread(g, p);
6620
6621         read_unlock_irq(&tasklist_lock);
6622 }
6623
6624 #endif /* CONFIG_MAGIC_SYSRQ */
6625
6626 #ifdef CONFIG_IA64
6627 /*
6628  * These functions are only useful for the IA64 MCA handling.
6629  *
6630  * They can only be called when the whole system has been
6631  * stopped - every CPU needs to be quiescent, and no scheduling
6632  * activity can take place. Using them for anything else would
6633  * be a serious bug, and as a result, they aren't even visible
6634  * under any other configuration.
6635  */
6636
/**
 * curr_task - look up the task currently running on a cpu.
 * @cpu: the processor to query.
 *
 * Returns the value of cpu_curr() for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
        return cpu_curr(cpu);
}
6647
6648 /**
6649  * set_curr_task - set the current task for a given cpu.
6650  * @cpu: the processor in question.
6651  * @p: the task pointer to set.
6652  *
6653  * Description: This function must only be used when non-maskable interrupts
6654  * are serviced on a separate stack.  It allows the architecture to switch the
6655  * notion of the current task on a cpu in a non-blocking manner.  This function
6656  * must be called with all CPU's synchronized, and interrupts disabled, the
6657  * and caller must save the original value of the current task (see
6658  * curr_task() above) and restore that value before reenabling interrupts and
6659  * re-starting the system.
6660  *
6661  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6662  */
6663 void set_curr_task(int cpu, struct task_struct *p)
6664 {
6665         cpu_curr(cpu) = p;
6666 }
6667
6668 #endif