sched: disable standard balancer for RT tasks
[linux-2.6-omap-h63xx.git] / kernel / sched_rt.c
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

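/*
 * Overload tracking: a runqueue is "RT overloaded" when it has more than
 * one runnable RT task, i.e. it holds RT tasks that could run elsewhere.
 * Overloaded CPUs are recorded in rt_overload_mask and counted in
 * rto_count, so other CPUs can cheaply check whether a pull is worth
 * attempting before touching any remote runqueue.
 */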
#ifdef CONFIG_SMP

static cpumask_t rt_overload_mask;
static atomic_t rto_count;

static inline int rt_overloaded(void)
{
        return atomic_read(&rto_count);
}

static inline cpumask_t *rt_overload(void)
{
        return &rt_overload_mask;
}

static inline void rt_set_overload(struct rq *rq)
{
        cpu_set(rq->cpu, rt_overload_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
         * if we should look at the mask. It would be a shame
         * if we looked at the mask, but the mask was not
         * updated yet.
         */
        wmb();
        atomic_inc(&rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
        /* the order here really doesn't matter */
        atomic_dec(&rto_count);
        cpu_clear(rq->cpu, rt_overload_mask);
}
#endif /* CONFIG_SMP */

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        u64 delta_exec;

        if (!task_has_rt_policy(curr))
                return;

        delta_exec = rq->clock - curr->se.exec_start;
        if (unlikely((s64)delta_exec < 0))
                delta_exec = 0;

        schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));

        curr->se.sum_exec_runtime += delta_exec;
        curr->se.exec_start = rq->clock;
        cpuacct_charge(curr, delta_exec);
}

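/*
 * Bookkeeping for the number of queued RT tasks on this runqueue.  On
 * SMP this also tracks the highest (numerically lowest) queued RT
 * priority and flags the runqueue as overloaded once a second RT task
 * shows up.
 */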
static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        rq->rt.rt_nr_running++;
#ifdef CONFIG_SMP
        if (p->prio < rq->rt.highest_prio)
                rq->rt.highest_prio = p->prio;
        if (rq->rt.rt_nr_running > 1)
                rt_set_overload(rq);
#endif /* CONFIG_SMP */
}

static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
{
        WARN_ON(!rt_task(p));
        WARN_ON(!rq->rt.rt_nr_running);
        rq->rt.rt_nr_running--;
#ifdef CONFIG_SMP
        if (rq->rt.rt_nr_running) {
                struct rt_prio_array *array;

                WARN_ON(p->prio < rq->rt.highest_prio);
                if (p->prio == rq->rt.highest_prio) {
                        /* recalculate */
                        array = &rq->rt.active;
                        rq->rt.highest_prio =
                                sched_find_first_bit(array->bitmap);
                } /* otherwise leave rq->rt.highest_prio alone */
        } else
                rq->rt.highest_prio = MAX_RT_PRIO;
        if (rq->rt.rt_nr_running < 2)
                rt_clear_overload(rq);
#endif /* CONFIG_SMP */
}

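/*
 * Add @p to the tail of its priority queue and mark that priority as
 * active in the bitmap.
 */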
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_add_tail(&p->run_list, array->queue + p->prio);
        __set_bit(p->prio, array->bitmap);
        inc_cpu_load(rq, p->se.load.weight);

        inc_rt_tasks(p, rq);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
        struct rt_prio_array *array = &rq->rt.active;

        update_curr_rt(rq);

        list_del(&p->run_list);
        if (list_empty(array->queue + p->prio))
                __clear_bit(p->prio, array->bitmap);
        dec_cpu_load(rq, p->se.load.weight);

        dec_rt_tasks(p, rq);
}

/*
 * Put task to the end of the run list without the overhead of dequeue
 * followed by enqueue.
 */
static void requeue_task_rt(struct rq *rq, struct task_struct *p)
{
        struct rt_prio_array *array = &rq->rt.active;

        list_move_tail(&p->run_list, array->queue + p->prio);
}

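/*
 * sched_yield() for RT tasks: drop to the tail of the current priority
 * queue so other tasks of the same priority get a turn.
 */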
static void
yield_task_rt(struct rq *rq)
{
        requeue_task_rt(rq, rq->curr);
}

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
{
        if (p->prio < rq->curr->prio)
                resched_task(rq->curr);
}

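/*
 * Pick the highest-priority queued RT task: find the first set bit in
 * the priority bitmap and take the head of that queue.
 */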
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;

        idx = sched_find_first_bit(array->bitmap);
        if (idx >= MAX_RT_PRIO)
                return NULL;

        queue = array->queue + idx;
        next = list_entry(queue->next, struct task_struct, run_list);

        next->se.exec_start = rq->clock;

        return next;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);
        p->se.exec_start = 0;
}

#ifdef CONFIG_SMP
/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

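/*
 * Return 1 if @p is a candidate for migration: it must not be running,
 * and if a target CPU is given (cpu >= 0) it must be allowed there.
 */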
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
        if (!task_running(rq, p) &&
            (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
                return 1;
        return 0;
}

/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
                                                     int cpu)
{
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;

        assert_spin_locked(&rq->lock);

        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;

        idx = sched_find_first_bit(array->bitmap);
        if (unlikely(idx >= MAX_RT_PRIO)) {
                WARN_ON(1); /* rt_nr_running is bad */
                return NULL;
        }

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        next = list_entry(queue->next, struct task_struct, run_list);
        if (unlikely(pick_rt_task(rq, next, cpu)))
                goto out;

        if (queue->next->next != queue) {
                /* same prio task */
                next = list_entry(queue->next->next, struct task_struct, run_list);
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

 retry:
        /* slower, but more flexible */
        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
        if (unlikely(idx >= MAX_RT_PRIO))
                return NULL;

        queue = array->queue + idx;
        BUG_ON(list_empty(queue));

        list_for_each_entry(next, queue, run_list) {
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }

        goto retry;

 out:
        return next;
}

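/*
 * Per-CPU scratch cpumask used by find_lock_lowest_rq() when searching
 * for a target runqueue.
 */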
static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task,
                                      struct rq *this_rq)
{
        struct rq *lowest_rq = NULL;
        int cpu;
        int tries;
        cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);

        cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                /*
                 * Scan each rq for the lowest prio.
                 */
                for_each_cpu_mask(cpu, *cpu_mask) {
                        struct rq *rq = &per_cpu(runqueues, cpu);

                        if (cpu == this_rq->cpu)
                                continue;

                        /* We look for lowest RT prio or non-rt CPU */
                        if (rq->rt.highest_prio >= MAX_RT_PRIO) {
                                lowest_rq = rq;
                                break;
                        }

                        /* no locking for now */
                        if (rq->rt.highest_prio > task->prio &&
                            (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
                                lowest_rq = rq;
                        }
                }

                if (!lowest_rq)
                        break;

                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(this_rq, lowest_rq)) {
                        /*
                         * We had to unlock the run queue. In the
                         * meantime, the task could have migrated
                         * already or had its affinity changed.
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != this_rq ||
                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
                                     task_running(this_rq, task) ||
                                     !task->se.on_rq)) {
                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
                        }
                }

                /* If this rq is still suitable use it. */
                if (lowest_rq->rt.highest_prio > task->prio)
                        break;

                /* try again */
                spin_unlock(&lowest_rq->lock);
                lowest_rq = NULL;
        }

        return lowest_rq;
}

/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *this_rq)
{
        struct task_struct *next_task;
        struct rq *lowest_rq;
        int ret = 0;
        int paranoid = RT_MAX_TRIES;

        assert_spin_locked(&this_rq->lock);

        next_task = pick_next_highest_task_rt(this_rq, -1);
        if (!next_task)
                return 0;

 retry:
        if (unlikely(next_task == this_rq->curr)) {
                WARN_ON(1);
                return 0;
        }

        /*
         * It's possible that next_task slipped in with a higher
         * priority than current. If that's the case, just
         * reschedule current.
         */
        if (unlikely(next_task->prio < this_rq->curr->prio)) {
                resched_task(this_rq->curr);
                return 0;
        }

        /* We might release this_rq lock */
        get_task_struct(next_task);

        /* find_lock_lowest_rq locks the rq if found */
        lowest_rq = find_lock_lowest_rq(next_task, this_rq);
        if (!lowest_rq) {
                struct task_struct *task;
                /*
                 * find_lock_lowest_rq releases this_rq->lock,
                 * so it is possible that next_task has changed.
                 * If it has, then try again.
                 */
                task = pick_next_highest_task_rt(this_rq, -1);
                if (unlikely(task != next_task) && task && paranoid--) {
                        put_task_struct(next_task);
                        next_task = task;
                        goto retry;
                }
                goto out;
        }

        assert_spin_locked(&lowest_rq->lock);

        deactivate_task(this_rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);

        resched_task(lowest_rq->curr);

        spin_unlock(&lowest_rq->lock);

        ret = 1;
out:
        put_task_struct(next_task);

        return ret;
}

/*
 * TODO: Currently we just use the second highest prio task on
 *       the queue, and stop when it can't migrate (or there's
 *       no more RT tasks).  There may be a case where a lower
 *       priority RT task has a different affinity than the
 *       higher RT task. In this case the lower RT task could
 *       possibly be able to migrate whereas the higher priority
 *       RT task could not.  We currently ignore this issue.
 *       Enhancements are welcome!
 */
static void push_rt_tasks(struct rq *rq)
{
        /* push_rt_task will return true if it moved an RT */
        while (push_rt_task(rq))
                ;
}

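/*
 * Pull RT tasks from overloaded runqueues: scan the CPUs flagged in the
 * overload mask and migrate over their next-highest RT task when that
 * task would preempt whatever this runqueue is about to run.  The
 * return value indicates that this runqueue's next task may have
 * changed (either because we pulled something or because we had to
 * drop our lock along the way).
 */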
static int pull_rt_task(struct rq *this_rq)
{
        struct task_struct *next;
        struct task_struct *p;
        struct rq *src_rq;
        cpumask_t *rto_cpumask;
        int this_cpu = this_rq->cpu;
        int cpu;
        int ret = 0;

        assert_spin_locked(&this_rq->lock);

        /*
         * If cpusets are used, and we have overlapping
         * run queue cpusets, then this algorithm may not catch all.
         * This is just the price you pay on trying to keep
         * dirtying caches down on large SMP machines.
         */
        if (likely(!rt_overloaded()))
                return 0;

        next = pick_next_task_rt(this_rq);

        rto_cpumask = rt_overload();

        for_each_cpu_mask(cpu, *rto_cpumask) {
                if (this_cpu == cpu)
                        continue;

                src_rq = cpu_rq(cpu);
                if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
                        /*
                         * It is possible that overlapping cpusets
                         * will miss clearing a non-overloaded runqueue.
                         * Clear it now.
                         */
                        if (double_lock_balance(this_rq, src_rq)) {
                                /* unlocked our runqueue lock */
                                struct task_struct *old_next = next;
                                next = pick_next_task_rt(this_rq);
                                if (next != old_next)
                                        ret = 1;
                        }
                        if (likely(src_rq->rt.rt_nr_running <= 1))
                                /*
                                 * Small chance that this_rq->curr changed
                                 * but it's really harmless here.
                                 */
                                rt_clear_overload(this_rq);
                        else
                                /*
                                 * Heh, the src_rq is now overloaded, since
                                 * we already have the src_rq lock, go straight
                                 * to pulling tasks from it.
                                 */
                                goto try_pulling;
                        spin_unlock(&src_rq->lock);
                        continue;
                }

                /*
                 * We can potentially drop this_rq's lock in
                 * double_lock_balance, and another CPU could
                 * steal our next task - hence we must cause
                 * the caller to recalculate the next task
                 * in that case:
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;
                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
                }

                /*
                 * Are there still pullable RT tasks?
                 */
                if (src_rq->rt.rt_nr_running <= 1) {
                        spin_unlock(&src_rq->lock);
                        continue;
                }

 try_pulling:
                p = pick_next_highest_task_rt(src_rq, this_cpu);

                /*
                 * Do we have an RT task that preempts
                 * the to-be-scheduled task?
                 */
                if (p && (!next || (p->prio < next->prio))) {
                        WARN_ON(p == src_rq->curr);
                        WARN_ON(!p->se.on_rq);

                        /*
                         * There's a chance that p is higher in priority
                         * than what's currently running on its cpu.
                         * This is just that p is waking up and hasn't
                         * had a chance to schedule. We only pull
                         * p if it is lower in priority than the
                         * current task on the run queue, or if
                         * this_rq's next task is lower in prio than
                         * the current task on that rq.
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
                                goto bail;

                        ret = 1;

                        deactivate_task(src_rq, p, 0);
                        set_task_cpu(p, this_cpu);
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
                         * case there's an even higher prio task
                         * in another runqueue. (low likelihood
                         * but possible)
                         */

                        /*
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
                         */
                        next = p;

                }
 bail:
                spin_unlock(&src_rq->lock);
        }

        return ret;
}

static void schedule_balance_rt(struct rq *rq,
                                struct task_struct *prev)
{
        /* Try to pull RT tasks here if we lower this rq's prio */
        if (unlikely(rt_task(prev)) &&
            rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
}

static void schedule_tail_balance_rt(struct rq *rq)
{
        /*
         * If we have more than one rt_task queued, then
         * see if we can push the other rt_tasks off to other CPUs.
         * Note we may release the rq lock, and since
         * the lock was owned by prev, we need to release it
         * first via finish_lock_switch and then reacquire it here.
         */
        if (unlikely(rq->rt.rt_nr_running > 1)) {
                spin_lock_irq(&rq->lock);
                push_rt_tasks(rq);
                spin_unlock_irq(&rq->lock);
        }
}

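/*
 * An RT task was just woken on this runqueue but will not preempt the
 * current task (its priority is not higher); try to push it, and any
 * other queued RT tasks, to a CPU running at lower priority.
 */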
static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
{
        if (unlikely(rt_task(p)) &&
            !task_running(rq, p) &&
            (p->prio >= rq->curr->prio))
                push_rt_tasks(rq);
}

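/*
 * The standard (weight-based) load balancer is disabled for the RT
 * class: RT tasks are migrated only by the push/pull logic above, so
 * these hooks do nothing.
 */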
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                unsigned long max_load_move,
                struct sched_domain *sd, enum cpu_idle_type idle,
                int *all_pinned, int *this_best_prio)
{
        /* don't touch RT tasks */
        return 0;
}

static int
move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 struct sched_domain *sd, enum cpu_idle_type idle)
{
        /* don't touch RT tasks */
        return 0;
}
#else /* CONFIG_SMP */
# define schedule_tail_balance_rt(rq)   do { } while (0)
# define schedule_balance_rt(rq, prev)  do { } while (0)
# define wakeup_balance_rt(rq, p)       do { } while (0)
#endif /* CONFIG_SMP */

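/*
 * Per-tick housekeeping: update runtime statistics and, for SCHED_RR
 * tasks, handle timeslice expiry by requeueing them behind other tasks
 * of the same priority.
 */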
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
        update_curr_rt(rq);

        /*
         * RR tasks need a special form of timeslice management.
         * FIFO tasks have no timeslices.
         */
        if (p->policy != SCHED_RR)
                return;

        if (--p->time_slice)
                return;

        p->time_slice = DEF_TIMESLICE;

        /*
         * Requeue to the end of queue if we are not the only element
         * on the queue:
         */
        if (p->run_list.prev != p->run_list.next) {
                requeue_task_rt(rq, p);
                set_tsk_need_resched(p);
        }
}

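/*
 * Reset the exec_start timestamp of the currently running task, so that
 * update_curr_rt() accounts runtime from this point on.
 */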
static void set_curr_task_rt(struct rq *rq)
{
        struct task_struct *p = rq->curr;

        p->se.exec_start = rq->clock;
}

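/*
 * The scheduling class hooks for SCHED_FIFO and SCHED_RR tasks.  The
 * .next link makes the core scheduler fall back to the fair class when
 * no RT task is runnable.
 */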
const struct sched_class rt_sched_class = {
        .next                   = &fair_sched_class,
        .enqueue_task           = enqueue_task_rt,
        .dequeue_task           = dequeue_task_rt,
        .yield_task             = yield_task_rt,

        .check_preempt_curr     = check_preempt_curr_rt,

        .pick_next_task         = pick_next_task_rt,
        .put_prev_task          = put_prev_task_rt,

#ifdef CONFIG_SMP
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
#endif

        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
};