/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
        return atomic_read(&rq->rd->rto_count);
}
static inline void rt_set_overload(struct rq *rq)
{
        cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
         */
        wmb();
        atomic_inc(&rq->rd->rto_count);
}
static inline void rt_clear_overload(struct rq *rq)
{
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
        cpu_clear(rq->cpu, rq->rd->rto_mask);
}
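/*
 * A runqueue is "overloaded" when it has more than one runnable RT task
 * and at least one of them can migrate to another CPU.
 */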
static void update_rt_migration(struct rq *rq)
{
        if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
                if (!rq->rt.overloaded) {
                        rt_set_overload(rq);
                        rq->rt.overloaded = 1;
                }
        } else if (rq->rt.overloaded) {
                rt_clear_overload(rq);
                rq->rt.overloaded = 0;
        }
}
#endif /* CONFIG_SMP */
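/*
 * Check whether this rt_rq has used more than its allowed share of the
 * period (sysctl_sched_rt_ratio); if so, mark it throttled.
 */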
static int sched_rt_ratio_exceeded(struct rq *rq, struct rt_rq *rt_rq)
{
        u64 period, ratio;

        if (sysctl_sched_rt_ratio == SCHED_RT_FRAC)
                return 0;

        if (rt_rq->rt_throttled)
                return 1;

        period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
        ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;

        if (rt_rq->rt_time > ratio) {
                rt_rq->rt_throttled = rq->clock + period - rt_rq->rt_time;
                return 1;
        }

        return 0;
}
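/*
 * Account for periods that have elapsed on this runqueue and clear the
 * throttle once its window has expired.
 */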
static void update_sched_rt_period(struct rq *rq)
{
        while (rq->clock > rq->rt_period_expire) {
                u64 period, ratio;

                period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
                ratio = (period * sysctl_sched_rt_ratio) >> SCHED_RT_FRAC_SHIFT;

                rq->rt.rt_time -= min(rq->rt.rt_time, ratio);
                rq->rt_period_expire += period;
        }

        /*
         * When the rt throttle is expired, let them rip.
         * (XXX: use hrtick when available)
         */
        if (rq->rt.rt_throttled && rq->clock > rq->rt.rt_throttled) {
                rq->rt.rt_throttled = 0;
                if (!sched_rt_ratio_exceeded(rq, &rq->rt))
                        resched_task(rq->curr);
        }
}
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        u64 delta_exec;

        if (!task_has_rt_policy(curr))
                return;

        delta_exec = rq->clock - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;

        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

        curr->se.sum_exec_runtime += delta_exec;
        curr->se.exec_start = rq->clock;
        cpuacct_charge(curr, delta_exec);

        rq->rt.rt_time += delta_exec;
        update_sched_rt_period(rq);
        if (sched_rt_ratio_exceeded(rq, &rq->rt))
                resched_task(curr);
}
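/* Account a newly enqueued RT task in the per-runqueue counters. */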
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
        if (p->prio < rq->rt.highest_prio)
                rq->rt.highest_prio = p->prio;
        if (p->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory++;

        update_rt_migration(rq);
#endif /* CONFIG_SMP */
}
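/* Undo the accounting of inc_rt_tasks() when an RT task is dequeued. */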
static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        WARN_ON(!rq->rt.rt_nr_running);
        rq->rt.rt_nr_running--;
#ifdef CONFIG_SMP
        if (rq->rt.rt_nr_running) {
                struct rt_prio_array *array;

                WARN_ON(p->prio < rq->rt.highest_prio);
                if (p->prio == rq->rt.highest_prio) {
                        /* recalculate */
                        array = &rq->rt.active;
                        rq->rt.highest_prio =
                                sched_find_first_bit(array->bitmap);
                } /* otherwise leave rq->rt.highest_prio alone */
        } else
                rq->rt.highest_prio = MAX_RT_PRIO;
        if (p->nr_cpus_allowed > 1)
                rq->rt.rt_nr_migratory--;

        update_rt_migration(rq);
#endif /* CONFIG_SMP */
}
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_add_tail(&p->rt.run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
        inc_cpu_load(rq, p->se.load.weight);

        inc_rt_tasks(p, rq);

        if (wakeup)
                p->rt.timeout = 0;
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
        struct rt_prio_array *array = &rq->rt.active;

        update_curr_rt(rq);

        list_del(&p->rt.run_list);
        if (list_empty(array->queue + p->prio))
                __clear_bit(p->prio, array->bitmap);
        dec_cpu_load(rq, p->se.load.weight);

        dec_rt_tasks(p, rq);
}
/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_move_tail(&p->rt.run_list, array->queue + p->prio);
}

static void
yield_task_rt(struct rq *rq)
{
        requeue_task_rt(rq, rq->curr);
}
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int select_task_rq_rt(struct task_struct *p, int sync)
{
        struct rq *rq = task_rq(p);

        /*
         * If the current task is an RT task, then
         * try to see if we can wake this RT task up on another
         * runqueue. Otherwise simply start this RT task
         * on its current runqueue.
         *
         * We want to avoid overloading runqueues, even if
         * the RT task is of higher priority than the current RT task.
         * RT tasks behave differently than other tasks. If
         * one gets preempted, we try to push it off to another queue.
         * So trying to keep a preempting RT task on the same
         * cache hot CPU will force the running RT task to
         * a cold CPU. So we waste all the cache for the lower
         * RT task in hopes of saving some of an RT task
         * that is just being woken and probably will have
         * cache hot data on its current CPU.
         */
        if (unlikely(rt_task(rq->curr)) &&
            (p->nr_cpus_allowed > 1)) {
                int cpu = find_lowest_rq(p);

                return (cpu == -1) ? task_cpu(p) : cpu;
        }

        /*
         * Otherwise, just let it ride on the affined RQ and the
         * post-schedule router will push the preempted task away.
         */
        return task_cpu(p);
}
#endif /* CONFIG_SMP */
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
        if (p->prio < rq->curr->prio)
                resched_task(rq->curr);
}
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        struct rt_rq *rt_rq = &rq->rt;
        int idx;

        if (sched_rt_ratio_exceeded(rq, rt_rq))
                return NULL;

        idx = sched_find_first_bit(array->bitmap);
        if (idx >= MAX_RT_PRIO)
                return NULL;

        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, rt.run_list);

        next->se.exec_start = rq->clock;

        return next;
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);
        p->se.exec_start = 0;
}
#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
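/*
 * Return 1 if @p is a candidate for migration: it is not currently
 * running, is allowed on @cpu (or on any CPU when cpu < 0), and may
 * run on more than one CPU.
 */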
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
        if (!task_running(rq, p) &&
            (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
            (p->nr_cpus_allowed > 1))
                return 1;
        return 0;
}
/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;

        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;

        idx = sched_find_first_bit(array->bitmap);
        if (unlikely(idx >= MAX_RT_PRIO)) {
                WARN_ON(1); /* rt_nr_running is bad */
                return NULL;
        }

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        next = list_entry(queue->next, struct task_struct, rt.run_list);
        if (unlikely(pick_rt_task(rq, next, cpu)))
                goto out;

        if (queue->next->next != queue) {
                /* same prio task */
                next = list_entry(queue->next->next, struct task_struct,
                                  rt.run_list);
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

 next_idx:
        /* slower, but more flexible */
        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
        if (unlikely(idx >= MAX_RT_PRIO))
                return NULL;

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        list_for_each_entry(next, queue, rt.run_list) {
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

        goto next_idx;

 out:
        return next;
}
static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
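/*
 * Fill *lowest_mask with the online CPUs, allowed for @task, whose
 * runqueues run at the lowest RT priority found; return the number of
 * candidate CPUs (1 if a CPU with no RT tasks was found).
 */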
static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
{
        int lowest_prio = -1;
        int lowest_cpu = -1;
        int count = 0;
        int cpu;

        cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);

        /*
         * Scan each rq for the lowest prio.
         */
        for_each_cpu_mask(cpu, *lowest_mask) {
                struct rq *rq = cpu_rq(cpu);

                /* We look for lowest RT prio or non-rt CPU */
                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
                        /*
                         * if we already found a low RT queue
                         * and now we found this non-rt queue
                         * clear the mask and set our bit.
                         * Otherwise just return the queue as is
                         * and the count==1 will cause the algorithm
                         * to use the first bit found.
                         */
                        if (lowest_cpu != -1) {
                                cpus_clear(*lowest_mask);
                                cpu_set(rq->cpu, *lowest_mask);
                        }
                        return 1;
                }

                /* no locking for now */
                if ((rq->rt.highest_prio > task->prio)
                    && (rq->rt.highest_prio >= lowest_prio)) {
                        if (rq->rt.highest_prio > lowest_prio) {
                                /* new low - clear old data */
                                lowest_prio = rq->rt.highest_prio;
                                lowest_cpu = cpu;
                                count = 0;
                        }
                        count++;
                } else
                        cpu_clear(cpu, *lowest_mask);
        }

        /*
         * Clear out all the set bits that represent
         * runqueues that were of higher prio than
         * the lowest_prio.
         */
        if (lowest_cpu > 0) {
                /*
                 * Perhaps we could add another cpumask op to
                 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
                 * Then that could be optimized to use memset and such.
                 */
                for_each_cpu_mask(cpu, *lowest_mask) {
                        if (cpu >= lowest_cpu)
                                break;
                        cpu_clear(cpu, *lowest_mask);
                }
        }

        return count;
}
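/*
 * Pick the best CPU out of @mask: prefer this_cpu, otherwise take the
 * first CPU in the mask; return -1 if the mask is empty.
 */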
static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
        int first;

        /* "this_cpu" is cheaper to preempt than a remote processor */
        if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
                return this_cpu;

        first = first_cpu(*mask);
        if (first != NR_CPUS)
                return first;

        return -1;
}
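/*
 * Find the runqueue on which @task could run at the lowest priority,
 * preferring CPUs close to its last CPU; return -1 if there is none.
 */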
static int find_lowest_rq(struct task_struct *task)
{
        struct sched_domain *sd;
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
        int count    = find_lowest_cpus(task, lowest_mask);

        if (!count)
                return -1; /* No targets found */

        /*
         * There is no sense in performing an optimal search if only one
         * target is found.
         */
        if (count == 1)
                return first_cpu(*lowest_mask);

        /*
         * At this point we have built a mask of cpus representing the
         * lowest priority tasks in the system. Now we want to elect
         * the best one based on our affinity and topology.
         *
         * We prioritize the last cpu that the task executed on since
         * it is most likely cache-hot in that location.
         */
        if (cpu_isset(cpu, *lowest_mask))
                return cpu;

        /*
         * Otherwise, we consult the sched_domains span maps to figure
         * out which cpu is logically closest to our hot cache data.
         */
        if (this_cpu == cpu)
                this_cpu = -1; /* Skip this_cpu opt if the same */

        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_AFFINE) {
                        cpumask_t domain_mask;
                        int best_cpu;

                        cpus_and(domain_mask, sd->span, *lowest_mask);

                        best_cpu = pick_optimal_cpu(this_cpu,
                                                    &domain_mask);
                        if (best_cpu != -1)
                                return best_cpu;
                }
        }

        /*
         * And finally, if there were no matches within the domains
         * just give the caller *something* to work with from the compatible
         * locations.
         */
        return pick_optimal_cpu(this_cpu, lowest_mask);
}
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
        struct rq *lowest_rq = NULL;
        int tries;
        int cpu;

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                cpu = find_lowest_rq(task);

                if ((cpu == -1) || (cpu == rq->cpu))
                        break;

                lowest_rq = cpu_rq(cpu);

                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
                        /*
                         * We had to unlock the run queue. In
                         * the meantime, the task could have
                         * migrated already or had its affinity changed.
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
                                     !cpu_isset(lowest_rq->cpu,
                                                task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {

                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
                        }
                }

                /* If this rq is still suitable use it. */
                if (lowest_rq->rt.highest_prio > task->prio)
                        break;

                /* try again */
                spin_unlock(&lowest_rq->lock);
                lowest_rq = NULL;
        }

        return lowest_rq;
}
/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *rq)
{
        struct task_struct *next_task;
        struct rq *lowest_rq;
        int ret = 0;
        int paranoid = RT_MAX_TRIES;

        if (!rq->rt.overloaded)
                return 0;

        next_task = pick_next_highest_task_rt(rq, -1);
        if (!next_task)
                return 0;

 retry:
        if (unlikely(next_task == rq->curr)) {
                WARN_ON(1);
                return 0;
        }

        /*
         * It's possible that the next_task slipped in of
         * higher priority than current. If that's the case
         * just reschedule current.
         */
        if (unlikely(next_task->prio < rq->curr->prio)) {
                resched_task(rq->curr);
                return 0;
        }

        /* We might release rq lock */
        get_task_struct(next_task);

        /* find_lock_lowest_rq locks the rq if found */
        lowest_rq = find_lock_lowest_rq(next_task, rq);
        if (!lowest_rq) {
                struct task_struct *task;
                /*
                 * find_lock_lowest_rq releases rq->lock
                 * so it is possible that next_task has changed.
                 * If it has, then try again.
                 */
                task = pick_next_highest_task_rt(rq, -1);
                if (unlikely(task != next_task) && task && paranoid--) {
                        put_task_struct(next_task);
                        next_task = task;
                        goto retry;
                }
                goto out;
        }

        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);

        resched_task(lowest_rq->curr);

        spin_unlock(&lowest_rq->lock);

        ret = 1;
 out:
        put_task_struct(next_task);

        return ret;
}
/*
 * TODO: Currently we just use the second highest prio task on
 *       the queue, and stop when it can't migrate (or there are
 *       no more RT tasks).  There may be a case where a lower
 *       priority RT task has a different affinity than the
 *       higher RT task. In this case the lower RT task could
 *       possibly be able to migrate whereas the higher priority
 *       RT task could not.  We currently ignore this issue.
 *       Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
        /* push_rt_task will return true if it moved an RT */
        while (push_rt_task(rq))
                ;
}
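/*
 * Pull RT tasks from overloaded runqueues if they would preempt
 * whatever this runqueue is about to schedule next.
 */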
static int pull_rt_task(struct rq *this_rq)
{
        int this_cpu = this_rq->cpu, ret = 0, cpu;
        struct task_struct *p, *next;
        struct rq *src_rq;

        if (likely(!rt_overloaded(this_rq)))
                return 0;

        next = pick_next_task_rt(this_rq);

        for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;

                src_rq = cpu_rq(cpu);
                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
                 * steal our next task - hence we must cause
                 * the caller to recalculate the next task
                 * in that case:
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;

                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
                }

                /*
                 * Are there still pullable RT tasks?
                 */
                if (src_rq->rt.rt_nr_running <= 1) {
                        spin_unlock(&src_rq->lock);
                        continue;
                }

                p = pick_next_highest_task_rt(src_rq, this_cpu);

                /*
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
                if (p && (!next || (p->prio < next->prio))) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);

                        /*
                         * There's a chance that p is higher in priority
                         * than what's currently running on its cpu.
                         * This is just that p is waking up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
                         * current task on the run queue or
                         * this_rq's next task is lower in prio than
                         * the current task on that rq.
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
                                goto skip;

                        ret = 1;

                        deactivate_task(src_rq, p, 0);
                        set_task_cpu(p, this_cpu);
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
                         * case there's an even higher prio task
                         * in another runqueue. (low likelihood
                         * but possible)
                         *
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
                         */
                        next = p;
                }
 skip:
                spin_unlock(&src_rq->lock);
        }

        return ret;
}
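/*
 * Balancing hooks around a context switch: pull candidates before we
 * schedule a lower-priority task, push the surplus off afterwards.
 */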
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
        /* Try to pull RT tasks here if we lower this rq's prio */
        if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
}

static void post_schedule_rt(struct rq *rq)
{
        /*
         * If we have more than one rt_task queued, then
         * see if we can push the other rt_tasks off to other CPUs.
         * Note we may release the rq lock, and since
         * the lock was owned by prev, we need to release it
         * first via finish_lock_switch and then reacquire it here.
         */
        if (unlikely(rq->rt.overloaded)) {
                spin_lock_irq(&rq->lock);
                push_rt_tasks(rq);
                spin_unlock_irq(&rq->lock);
        }
}

static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
        if (!task_running(rq, p) &&
            (p->prio >= rq->rt.highest_prio) &&
            rq->rt.overloaded)
                push_rt_tasks(rq);
}
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                unsigned long max_load_move,
                struct sched_domain *sd, enum cpu_idle_type idle,
                int *all_pinned, int *this_best_prio)
{
        /* don't touch RT tasks */
        return 0;
}

static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 struct sched_domain *sd, enum cpu_idle_type idle)
{
        /* don't touch RT tasks */
        return 0;
}
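/*
 * Affinity changes can alter whether a queued RT task is migratable,
 * so keep rt_nr_migratory and the overload state in sync.
 */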
static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
{
        int weight = cpus_weight(*new_mask);

        BUG_ON(!rt_task(p));

        /*
         * Update the migration status of the RQ if we have an RT task
         * which is running AND changing its weight value.
         */
        if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);

                if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
                } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
                        BUG_ON(!rq->rt.rt_nr_migratory);
                        rq->rt.rt_nr_migratory--;
                }

                update_rt_migration(rq);
        }

        p->cpus_allowed    = *new_mask;
        p->nr_cpus_allowed = weight;
}
/* Assumes rq->lock is held */
static void join_domain_rt(struct rq *rq)
{
        if (rq->rt.overloaded)
                rt_set_overload(rq);
}

/* Assumes rq->lock is held */
static void leave_domain_rt(struct rq *rq)
{
        if (rq->rt.overloaded)
                rt_clear_overload(rq);
}
/*
 * When switching from the rt queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p,
                             int running)
{
        /*
         * If there are other RT tasks then we will reschedule
         * and the scheduling of the other RT tasks will handle
         * the balancing. But if we are the last RT task
         * we may need to handle the pulling of RT tasks
         * now.
         */
        if (!rq->rt.rt_nr_running)
                pull_rt_task(rq);
}
#endif /* CONFIG_SMP */
/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p,
                           int running)
{
        int check_resched = 1;

        /*
         * If we are already running, then there's nothing
         * that needs to be done. But if we are not running
         * we may need to preempt the current running task.
         * If that current running task is also an RT task
         * then see if we can move to another run queue.
         */
        if (!running) {
#ifdef CONFIG_SMP
                if (rq->rt.overloaded && push_rt_task(rq) &&
                    /* Don't resched if we changed runqueues */
                    rq != task_rq(p))
                        check_resched = 0;
#endif /* CONFIG_SMP */
                if (check_resched && p->prio < rq->curr->prio)
                        resched_task(rq->curr);
        }
}
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void prio_changed_rt(struct rq *rq, struct task_struct *p,
                            int oldprio, int running)
{
        if (running) {
#ifdef CONFIG_SMP
                /*
                 * If our priority decreases while running, we
                 * may need to pull tasks to this runqueue.
                 */
                if (oldprio < p->prio)
                        pull_rt_task(rq);
                /*
                 * If there's a higher priority task waiting to run
                 * then reschedule.
                 */
                if (p->prio > rq->rt.highest_prio)
                        resched_task(p);
#else
                /* For UP simply resched on drop of prio */
                if (oldprio < p->prio)
                        resched_task(p);
#endif /* CONFIG_SMP */
        } else {
                /*
                 * This task is not running, but if it is
                 * greater than the current running task
                 * then reschedule.
                 */
                if (p->prio < rq->curr->prio)
                        resched_task(rq->curr);
        }
}
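/*
 * Enforce RLIMIT_RTTIME: count the ticks this task has spent running
 * and arm it_sched_expires so the limit can be checked.
 */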
static void watchdog(struct rq *rq, struct task_struct *p)
{
        unsigned long soft, hard;

        if (!p->signal)
                return;

        soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
        hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;

        if (soft != RLIM_INFINITY) {
                unsigned long next;

                p->rt.timeout++;
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
                if (next > p->rt.timeout) {
                        u64 next_time = p->se.sum_exec_runtime;

                        next_time += next * (NSEC_PER_SEC/HZ);
                        if (p->it_sched_expires > next_time)
                                p->it_sched_expires = next_time;
                } else
                        p->it_sched_expires = p->se.sum_exec_runtime;
        }
}
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
        update_curr_rt(rq);

        watchdog(rq, p);

        /*
         * RR tasks need a special form of timeslice management.
         * FIFO tasks have no timeslices.
         */
        if (p->policy != SCHED_RR)
                return;

        if (--p->rt.time_slice)
                return;

        p->rt.time_slice = DEF_TIMESLICE;

        /*
         * Requeue to the end of queue if we are not the only element
         * on the queue:
         */
        if (p->rt.run_list.prev != p->rt.run_list.next) {
                requeue_task_rt(rq, p);
                set_tsk_need_resched(p);
        }
}
static void set_curr_task_rt(struct rq *rq)
{
        struct task_struct *p = rq->curr;

        p->se.exec_start = rq->clock;
}
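/*
 * The RT scheduling class; ->next links to the lower-priority fair class.
 */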
const struct sched_class rt_sched_class = {
        .next                   = &fair_sched_class,
        .enqueue_task           = enqueue_task_rt,
        .dequeue_task           = dequeue_task_rt,
        .yield_task             = yield_task_rt,
#ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
#endif /* CONFIG_SMP */

        .check_preempt_curr     = check_preempt_curr_rt,

        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,

#ifdef CONFIG_SMP
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
        .join_domain            = join_domain_rt,
        .leave_domain           = leave_domain_rt,
        .pre_schedule           = pre_schedule_rt,
        .post_schedule          = post_schedule_rt,
        .task_wake_up           = task_wake_up_rt,
        .switched_from          = switched_from_rt,
#endif

        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,

        .prio_changed           = prio_changed_rt,
        .switched_to            = switched_to_rt,
};