arch/powerpc/platforms/cell/spufs/sched.c

   1 /* sched.c - SPU scheduler.
   2  *
   3  * Copyright (C) IBM 2005
   4  * Author: Mark Nutter <mnutter@us.ibm.com>
   5  *
   6  * 2006-03-31   NUMA domains added.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2, or (at your option)
  11  * any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  21  */
  22
  23 #undef DEBUG
  24
  25 #include <linux/module.h>
  26 #include <linux/errno.h>
  27 #include <linux/sched.h>
  28 #include <linux/kernel.h>
  29 #include <linux/mm.h>
  30 #include <linux/completion.h>
  31 #include <linux/vmalloc.h>
  32 #include <linux/smp.h>
  33 #include <linux/stddef.h>
  34 #include <linux/unistd.h>
  35 #include <linux/numa.h>
  36 #include <linux/mutex.h>
  37 #include <linux/notifier.h>
  38 #include <linux/kthread.h>
  39 #include <linux/pid_namespace.h>
  40 #include <linux/proc_fs.h>
  41 #include <linux/seq_file.h>
  42
  43 #include <asm/io.h>
  44 #include <asm/mmu_context.h>
  45 #include <asm/spu.h>
  46 #include <asm/spu_csa.h>
  47 #include <asm/spu_priv1.h>
  48 #include "spufs.h"
  49
  50 struct spu_prio_array {
  51         DECLARE_BITMAP(bitmap, MAX_PRIO);
  52         struct list_head runq[MAX_PRIO];
  53         spinlock_t runq_lock;
  54         int nr_waiting;
  55 };
  56
  57 static unsigned long spu_avenrun[3];
  58 static struct spu_prio_array *spu_prio;
  59 static struct task_struct *spusched_task;
  60 static struct timer_list spusched_timer;
  61
  62 /*
  63  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
  64  */
  65 #define NORMAL_PRIO             120
  66
  67 /*
  68  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  69  * tick for every 10 CPU scheduler ticks.
  70  */
  71 #define SPUSCHED_TICK           (10)
  72
  73 /*
  74  * These are the 'tuning knobs' of the scheduler:
  75  *
  76  * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
  77  * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
  78  */
  79 #define MIN_SPU_TIMESLICE       max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
  80 #define DEF_SPU_TIMESLICE       (100 * HZ / (1000 * SPUSCHED_TICK))
  81
  82 #define MAX_USER_PRIO           (MAX_PRIO - MAX_RT_PRIO)
  83 #define SCALE_PRIO(x, prio) \
  84         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
  85
  86 /*
  87  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
  88  * [800ms ... 100ms ... 5ms]
  89  *
  90  * The higher a thread's priority, the bigger timeslices
  91  * it gets during one round of execution. But even the lowest
  92  * priority thread gets MIN_TIMESLICE worth of execution time.
  93  */
  94 void spu_set_timeslice(struct spu_context *ctx)
  95 {
  96         if (ctx->prio < NORMAL_PRIO)
  97                 ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
  98         else
  99                 ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
 100 }
 101
 102 /*
 103  * Update scheduling information from the owning thread.
 104  */
 105 void __spu_update_sched_info(struct spu_context *ctx)
 106 {
 107         /*
 108          * 32-Bit assignments are atomic on powerpc, and we don't care about
 109          * memory ordering here because retrieving the controlling thread is
 110          * per definition racy.
 111          */
 112         ctx->tid = current->pid;
 113
 114         /*
 115          * We do our own priority calculations, so we normally want
 116          * ->static_prio to start with. Unfortunately this field
 117          * contains junk for threads with a realtime scheduling
 118          * policy so we have to look at ->prio in this case.
 119          */
 120         if (rt_prio(current->prio))
 121                 ctx->prio = current->prio;
 122         else
 123                 ctx->prio = current->static_prio;
 124         ctx->policy = current->policy;
 125
 126         /*
 127          * A lot of places that don't hold list_mutex poke into
 128          * cpus_allowed, including grab_runnable_context which
 129          * already holds the runq_lock.  So abuse runq_lock
 130          * to protect this field as well.
 131          */
 132         spin_lock(&spu_prio->runq_lock);
 133         ctx->cpus_allowed = current->cpus_allowed;
 134         spin_unlock(&spu_prio->runq_lock);
 135 }
 136
 137 void spu_update_sched_info(struct spu_context *ctx)
 138 {
 139         int node = ctx->spu->node;
 140
 141         mutex_lock(&cbe_spu_info[node].list_mutex);
 142         __spu_update_sched_info(ctx);
 143         mutex_unlock(&cbe_spu_info[node].list_mutex);
 144 }
 145
 146 static int __node_allowed(struct spu_context *ctx, int node)
 147 {
 148         if (nr_cpus_node(node)) {
 149                 cpumask_t mask = node_to_cpumask(node);
 150
 151                 if (cpus_intersects(mask, ctx->cpus_allowed))
 152                         return 1;
 153         }
 154
 155         return 0;
 156 }
 157
 158 static int node_allowed(struct spu_context *ctx, int node)
 159 {
 160         int rval;
 161
 162         spin_lock(&spu_prio->runq_lock);
 163         rval = __node_allowed(ctx, node);
 164         spin_unlock(&spu_prio->runq_lock);
 165
 166         return rval;
 167 }
 168
 169 static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
 170
 171 void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
 172 {
 173         blocking_notifier_call_chain(&spu_switch_notifier,
 174                             ctx ? ctx->object_id : 0, spu);
 175 }
 176
 177 static void notify_spus_active(void)
 178 {
 179         int node;
 180
 181         /*
 182          * Wake up the active spu_contexts.
 183          *
 184          * When the awakened processes see their "notify_active" flag is set,
 185          * they will call spu_switch_notify().
 186          */
 187         for_each_online_node(node) {
 188                 struct spu *spu;
 189
 190                 mutex_lock(&cbe_spu_info[node].list_mutex);
 191                 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 192                         if (spu->alloc_state != SPU_FREE) {
 193                                 struct spu_context *ctx = spu->ctx;
 194                                 set_bit(SPU_SCHED_NOTIFY_ACTIVE,
 195                                         &ctx->sched_flags);
 196                                 mb();
 197                                 wake_up_all(&ctx->stop_wq);
 198                         }
 199                 }
 200                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 201         }
 202 }
 203
 204 int spu_switch_event_register(struct notifier_block * n)
 205 {
 206         int ret;
 207         ret = blocking_notifier_chain_register(&spu_switch_notifier, n);
 208         if (!ret)
 209                 notify_spus_active();
 210         return ret;
 211 }
 212 EXPORT_SYMBOL_GPL(spu_switch_event_register);
 213
 214 int spu_switch_event_unregister(struct notifier_block * n)
 215 {
 216         return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
 217 }
 218 EXPORT_SYMBOL_GPL(spu_switch_event_unregister);
 219
 220 /**
 221  * spu_bind_context - bind spu context to physical spu
 222  * @spu:        physical spu to bind to
 223  * @ctx:        context to bind
 224  */
 225 static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 226 {
 227         pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
 228                  spu->number, spu->node);
 229         spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 230
 231         if (ctx->flags & SPU_CREATE_NOSCHED)
 232                 atomic_inc(&cbe_spu_info[spu->node].reserved_spus);
 233
 234         ctx->stats.slb_flt_base = spu->stats.slb_flt;
 235         ctx->stats.class2_intr_base = spu->stats.class2_intr;
 236
 237         spu->ctx = ctx;
 238         spu->flags = 0;
 239         ctx->spu = spu;
 240         ctx->ops = &spu_hw_ops;
 241         spu->pid = current->pid;
 242         spu->tgid = current->tgid;
 243         spu_associate_mm(spu, ctx->owner);
 244         spu->ibox_callback = spufs_ibox_callback;
 245         spu->wbox_callback = spufs_wbox_callback;
 246         spu->stop_callback = spufs_stop_callback;
 247         spu->mfc_callback = spufs_mfc_callback;
 248         spu->dma_callback = spufs_dma_callback;
 249         mb();
 250         spu_unmap_mappings(ctx);
 251         spu_restore(&ctx->csa, spu);
 252         spu->timestamp = jiffies;
 253         spu_cpu_affinity_set(spu, raw_smp_processor_id());
 254         spu_switch_notify(spu, ctx);
 255         ctx->state = SPU_STATE_RUNNABLE;
 256
 257         spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 258 }
 259
 260 /*
 261  * Must be used with the list_mutex held.
 262  */
 263 static inline int sched_spu(struct spu *spu)
 264 {
 265         BUG_ON(!mutex_is_locked(&cbe_spu_info[spu->node].list_mutex));
 266
 267         return (!spu->ctx || !(spu->ctx->flags & SPU_CREATE_NOSCHED));
 268 }
 269
 270 static void aff_merge_remaining_ctxs(struct spu_gang *gang)
 271 {
 272         struct spu_context *ctx;
 273
 274         list_for_each_entry(ctx, &gang->aff_list_head, aff_list) {
 275                 if (list_empty(&ctx->aff_list))
 276                         list_add(&ctx->aff_list, &gang->aff_list_head);
 277         }
 278         gang->aff_flags |= AFF_MERGED;
 279 }
 280
 281 static void aff_set_offsets(struct spu_gang *gang)
 282 {
 283         struct spu_context *ctx;
 284         int offset;
 285
 286         offset = -1;
 287         list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
 288                                                                 aff_list) {
 289                 if (&ctx->aff_list == &gang->aff_list_head)
 290                         break;
 291                 ctx->aff_offset = offset--;
 292         }
 293
 294         offset = 0;
 295         list_for_each_entry(ctx, gang->aff_ref_ctx->aff_list.prev, aff_list) {
 296                 if (&ctx->aff_list == &gang->aff_list_head)
 297                         break;
 298                 ctx->aff_offset = offset++;
 299         }
 300
 301         gang->aff_flags |= AFF_OFFSETS_SET;
 302 }
 303
 304 static struct spu *aff_ref_location(struct spu_context *ctx, int mem_aff,
 305                  int group_size, int lowest_offset)
 306 {
 307         struct spu *spu;
 308         int node, n;
 309
 310         /*
 311          * TODO: A better algorithm could be used to find a good spu to be
 312          *       used as reference location for the ctxs chain.
 313          */
 314         node = cpu_to_node(raw_smp_processor_id());
 315         for (n = 0; n < MAX_NUMNODES; n++, node++) {
 316                 node = (node < MAX_NUMNODES) ? node : 0;
 317                 if (!node_allowed(ctx, node))
 318                         continue;
 319                 mutex_lock(&cbe_spu_info[node].list_mutex);
 320                 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 321                         if ((!mem_aff || spu->has_mem_affinity) &&
 322                                                         sched_spu(spu)) {
 323                                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 324                                 return spu;
 325                         }
 326                 }
 327                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 328         }
 329         return NULL;
 330 }
 331
 332 static void aff_set_ref_point_location(struct spu_gang *gang)
 333 {
 334         int mem_aff, gs, lowest_offset;
 335         struct spu_context *ctx;
 336         struct spu *tmp;
 337
 338         mem_aff = gang->aff_ref_ctx->flags & SPU_CREATE_AFFINITY_MEM;
 339         lowest_offset = 0;
 340         gs = 0;
 341
 342         list_for_each_entry(tmp, &gang->aff_list_head, aff_list)
 343                 gs++;
 344
 345         list_for_each_entry_reverse(ctx, &gang->aff_ref_ctx->aff_list,
 346                                                                 aff_list) {
 347                 if (&ctx->aff_list == &gang->aff_list_head)
 348                         break;
 349                 lowest_offset = ctx->aff_offset;
 350         }
 351
 352         gang->aff_ref_spu = aff_ref_location(gang->aff_ref_ctx, mem_aff, gs,
 353                                                         lowest_offset);
 354 }
 355
 356 static struct spu *ctx_location(struct spu *ref, int offset, int node)
 357 {
 358         struct spu *spu;
 359
 360         spu = NULL;
 361         if (offset >= 0) {
 362                 list_for_each_entry(spu, ref->aff_list.prev, aff_list) {
 363                         BUG_ON(spu->node != node);
 364                         if (offset == 0)
 365                                 break;
 366                         if (sched_spu(spu))
 367                                 offset--;
 368                 }
 369         } else {
 370                 list_for_each_entry_reverse(spu, ref->aff_list.next, aff_list) {
 371                         BUG_ON(spu->node != node);
 372                         if (offset == 0)
 373                                 break;
 374                         if (sched_spu(spu))
 375                                 offset++;
 376                 }
 377         }
 378
 379         return spu;
 380 }
 381
 382 /*
 383  * affinity_check is called each time a context is going to be scheduled.
 384  * It returns the spu ptr on which the context must run.
 385  */
 386 static int has_affinity(struct spu_context *ctx)
 387 {
 388         struct spu_gang *gang = ctx->gang;
 389
 390         if (list_empty(&ctx->aff_list))
 391                 return 0;
 392
 393         if (!gang->aff_ref_spu) {
 394                 if (!(gang->aff_flags & AFF_MERGED))
 395                         aff_merge_remaining_ctxs(gang);
 396                 if (!(gang->aff_flags & AFF_OFFSETS_SET))
 397                         aff_set_offsets(gang);
 398                 aff_set_ref_point_location(gang);
 399         }
 400
 401         return gang->aff_ref_spu != NULL;
 402 }
 403
 404 /**
 405  * spu_unbind_context - unbind spu context from physical spu
 406  * @spu:        physical spu to unbind from
 407  * @ctx:        context to unbind
 408  */
 409 static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 410 {
 411         pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
 412                  spu->pid, spu->number, spu->node);
 413         spuctx_switch_state(ctx, SPU_UTIL_SYSTEM);
 414
 415         if (spu->ctx->flags & SPU_CREATE_NOSCHED)
 416                 atomic_dec(&cbe_spu_info[spu->node].reserved_spus);
 417
 418         if (ctx->gang){
 419                 mutex_lock(&ctx->gang->aff_mutex);
 420                 if (has_affinity(ctx)) {
 421                         if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
 422                                 ctx->gang->aff_ref_spu = NULL;
 423                 }
 424                 mutex_unlock(&ctx->gang->aff_mutex);
 425         }
 426
 427         spu_switch_notify(spu, NULL);
 428         spu_unmap_mappings(ctx);
 429         spu_save(&ctx->csa, spu);
 430         spu->timestamp = jiffies;
 431         ctx->state = SPU_STATE_SAVED;
 432         spu->ibox_callback = NULL;
 433         spu->wbox_callback = NULL;
 434         spu->stop_callback = NULL;
 435         spu->mfc_callback = NULL;
 436         spu->dma_callback = NULL;
 437         spu_associate_mm(spu, NULL);
 438         spu->pid = 0;
 439         spu->tgid = 0;
 440         ctx->ops = &spu_backing_ops;
 441         spu->flags = 0;
 442         spu->ctx = NULL;
 443
 444         ctx->stats.slb_flt +=
 445                 (spu->stats.slb_flt - ctx->stats.slb_flt_base);
 446         ctx->stats.class2_intr +=
 447                 (spu->stats.class2_intr - ctx->stats.class2_intr_base);
 448
 449         /* This maps the underlying spu state to idle */
 450         spuctx_switch_state(ctx, SPU_UTIL_IDLE_LOADED);
 451         ctx->spu = NULL;
 452 }
 453
 454 /**
 455  * spu_add_to_rq - add a context to the runqueue
 456  * @ctx:       context to add
 457  */
 458 static void __spu_add_to_rq(struct spu_context *ctx)
 459 {
 460         /*
 461          * Unfortunately this code path can be called from multiple threads
 462          * on behalf of a single context due to the way the problem state
 463          * mmap support works.
 464          *
 465          * Fortunately we need to wake up all these threads at the same time
 466          * and can simply skip the runqueue addition for every but the first
 467          * thread getting into this codepath.
 468          *
 469          * It's still quite hacky, and long-term we should proxy all other
 470          * threads through the owner thread so that spu_run is in control
 471          * of all the scheduling activity for a given context.
 472          */
 473         if (list_empty(&ctx->rq)) {
 474                 list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
 475                 set_bit(ctx->prio, spu_prio->bitmap);
 476                 if (!spu_prio->nr_waiting++)
 477                         __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 478         }
 479 }
 480
 481 static void __spu_del_from_rq(struct spu_context *ctx)
 482 {
 483         int prio = ctx->prio;
 484
 485         if (!list_empty(&ctx->rq)) {
 486                 if (!--spu_prio->nr_waiting)
 487                         del_timer(&spusched_timer);
 488                 list_del_init(&ctx->rq);
 489
 490                 if (list_empty(&spu_prio->runq[prio]))
 491                         clear_bit(prio, spu_prio->bitmap);
 492         }
 493 }
 494
 495 static void spu_prio_wait(struct spu_context *ctx)
 496 {
 497         DEFINE_WAIT(wait);
 498
 499         spin_lock(&spu_prio->runq_lock);
 500         prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
 501         if (!signal_pending(current)) {
 502                 __spu_add_to_rq(ctx);
 503                 spin_unlock(&spu_prio->runq_lock);
 504                 mutex_unlock(&ctx->state_mutex);
 505                 schedule();
 506                 mutex_lock(&ctx->state_mutex);
 507                 spin_lock(&spu_prio->runq_lock);
 508                 __spu_del_from_rq(ctx);
 509         }
 510         spin_unlock(&spu_prio->runq_lock);
 511         __set_current_state(TASK_RUNNING);
 512         remove_wait_queue(&ctx->stop_wq, &wait);
 513 }
 514
 515 static struct spu *spu_get_idle(struct spu_context *ctx)
 516 {
 517         struct spu *spu, *aff_ref_spu;
 518         int node, n;
 519
 520         if (ctx->gang) {
 521                 mutex_lock(&ctx->gang->aff_mutex);
 522                 if (has_affinity(ctx)) {
 523                         aff_ref_spu = ctx->gang->aff_ref_spu;
 524                         atomic_inc(&ctx->gang->aff_sched_count);
 525                         mutex_unlock(&ctx->gang->aff_mutex);
 526                         node = aff_ref_spu->node;
 527
 528                         mutex_lock(&cbe_spu_info[node].list_mutex);
 529                         spu = ctx_location(aff_ref_spu, ctx->aff_offset, node);
 530                         if (spu && spu->alloc_state == SPU_FREE)
 531                                 goto found;
 532                         mutex_unlock(&cbe_spu_info[node].list_mutex);
 533
 534                         mutex_lock(&ctx->gang->aff_mutex);
 535                         if (atomic_dec_and_test(&ctx->gang->aff_sched_count))
 536                                 ctx->gang->aff_ref_spu = NULL;
 537                         mutex_unlock(&ctx->gang->aff_mutex);
 538
 539                         return NULL;
 540                 }
 541                 mutex_unlock(&ctx->gang->aff_mutex);
 542         }
 543         node = cpu_to_node(raw_smp_processor_id());
 544         for (n = 0; n < MAX_NUMNODES; n++, node++) {
 545                 node = (node < MAX_NUMNODES) ? node : 0;
 546                 if (!node_allowed(ctx, node))
 547                         continue;
 548
 549                 mutex_lock(&cbe_spu_info[node].list_mutex);
 550                 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 551                         if (spu->alloc_state == SPU_FREE)
 552                                 goto found;
 553                 }
 554                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 555         }
 556
 557         return NULL;
 558
 559  found:
 560         spu->alloc_state = SPU_USED;
 561         mutex_unlock(&cbe_spu_info[node].list_mutex);
 562         pr_debug("Got SPU %d %d\n", spu->number, spu->node);
 563         spu_init_channels(spu);
 564         return spu;
 565 }
 566
 567 /**
 568  * find_victim - find a lower priority context to preempt
 569  * @ctx:        canidate context for running
 570  *
 571  * Returns the freed physical spu to run the new context on.
 572  */
 573 static struct spu *find_victim(struct spu_context *ctx)
 574 {
 575         struct spu_context *victim = NULL;
 576         struct spu *spu;
 577         int node, n;
 578
 579         /*
 580          * Look for a possible preemption candidate on the local node first.
 581          * If there is no candidate look at the other nodes.  This isn't
 582          * exactly fair, but so far the whole spu scheduler tries to keep
 583          * a strong node affinity.  We might want to fine-tune this in
 584          * the future.
 585          */
 586  restart:
 587         node = cpu_to_node(raw_smp_processor_id());
 588         for (n = 0; n < MAX_NUMNODES; n++, node++) {
 589                 node = (node < MAX_NUMNODES) ? node : 0;
 590                 if (!node_allowed(ctx, node))
 591                         continue;
 592
 593                 mutex_lock(&cbe_spu_info[node].list_mutex);
 594                 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list) {
 595                         struct spu_context *tmp = spu->ctx;
 596
 597                         if (tmp && tmp->prio > ctx->prio &&
 598                             (!victim || tmp->prio > victim->prio))
 599                                 victim = spu->ctx;
 600                 }
 601                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 602
 603                 if (victim) {
 604                         /*
 605                          * This nests ctx->state_mutex, but we always lock
 606                          * higher priority contexts before lower priority
 607                          * ones, so this is safe until we introduce
 608                          * priority inheritance schemes.
 609                          */
 610                         if (!mutex_trylock(&victim->state_mutex)) {
 611                                 victim = NULL;
 612                                 goto restart;
 613                         }
 614
 615                         spu = victim->spu;
 616                         if (!spu) {
 617                                 /*
 618                                  * This race can happen because we've dropped
 619                                  * the active list mutex.  No a problem, just
 620                                  * restart the search.
 621                                  */
 622                                 mutex_unlock(&victim->state_mutex);
 623                                 victim = NULL;
 624                                 goto restart;
 625                         }
 626
 627                         mutex_lock(&cbe_spu_info[node].list_mutex);
 628                         cbe_spu_info[node].nr_active--;
 629                         spu_unbind_context(spu, victim);
 630                         mutex_unlock(&cbe_spu_info[node].list_mutex);
 631
 632                         victim->stats.invol_ctx_switch++;
 633                         spu->stats.invol_ctx_switch++;
 634                         mutex_unlock(&victim->state_mutex);
 635                         /*
 636                          * We need to break out of the wait loop in spu_run
 637                          * manually to ensure this context gets put on the
 638                          * runqueue again ASAP.
 639                          */
 640                         wake_up(&victim->stop_wq);
 641                         return spu;
 642                 }
 643         }
 644
 645         return NULL;
 646 }
 647
 648 /**
 649  * spu_activate - find a free spu for a context and execute it
 650  * @ctx:        spu context to schedule
 651  * @flags:      flags (currently ignored)
 652  *
 653  * Tries to find a free spu to run @ctx.  If no free spu is available
 654  * add the context to the runqueue so it gets woken up once an spu
 655  * is available.
 656  */
 657 int spu_activate(struct spu_context *ctx, unsigned long flags)
 658 {
 659         do {
 660                 struct spu *spu;
 661
 662                 /*
 663                  * If there are multiple threads waiting for a single context
 664                  * only one actually binds the context while the others will
 665                  * only be able to acquire the state_mutex once the context
 666                  * already is in runnable state.
 667                  */
 668                 if (ctx->spu)
 669                         return 0;
 670
 671                 spu = spu_get_idle(ctx);
 672                 /*
 673                  * If this is a realtime thread we try to get it running by
 674                  * preempting a lower priority thread.
 675                  */
 676                 if (!spu && rt_prio(ctx->prio))
 677                         spu = find_victim(ctx);
 678                 if (spu) {
 679                         int node = spu->node;
 680
 681                         mutex_lock(&cbe_spu_info[node].list_mutex);
 682                         spu_bind_context(spu, ctx);
 683                         cbe_spu_info[node].nr_active++;
 684                         mutex_unlock(&cbe_spu_info[node].list_mutex);
 685                         return 0;
 686                 }
 687
 688                 spu_prio_wait(ctx);
 689         } while (!signal_pending(current));
 690
 691         return -ERESTARTSYS;
 692 }
 693
 694 /**
 695  * grab_runnable_context - try to find a runnable context
 696  *
 697  * Remove the highest priority context on the runqueue and return it
 698  * to the caller.  Returns %NULL if no runnable context was found.
 699  */
 700 static struct spu_context *grab_runnable_context(int prio, int node)
 701 {
 702         struct spu_context *ctx;
 703         int best;
 704
 705         spin_lock(&spu_prio->runq_lock);
 706         best = find_first_bit(spu_prio->bitmap, prio);
 707         while (best < prio) {
 708                 struct list_head *rq = &spu_prio->runq[best];
 709
 710                 list_for_each_entry(ctx, rq, rq) {
 711                         /* XXX(hch): check for affinity here aswell */
 712                         if (__node_allowed(ctx, node)) {
 713                                 __spu_del_from_rq(ctx);
 714                                 goto found;
 715                         }
 716                 }
 717                 best++;
 718         }
 719         ctx = NULL;
 720  found:
 721         spin_unlock(&spu_prio->runq_lock);
 722         return ctx;
 723 }
 724
 725 static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 726 {
 727         struct spu *spu = ctx->spu;
 728         struct spu_context *new = NULL;
 729
 730         if (spu) {
 731                 new = grab_runnable_context(max_prio, spu->node);
 732                 if (new || force) {
 733                         int node = spu->node;
 734
 735                         mutex_lock(&cbe_spu_info[node].list_mutex);
 736                         spu_unbind_context(spu, ctx);
 737                         spu->alloc_state = SPU_FREE;
 738                         cbe_spu_info[node].nr_active--;
 739                         mutex_unlock(&cbe_spu_info[node].list_mutex);
 740
 741                         ctx->stats.vol_ctx_switch++;
 742                         spu->stats.vol_ctx_switch++;
 743
 744                         if (new)
 745                                 wake_up(&new->stop_wq);
 746                 }
 747
 748         }
 749
 750         return new != NULL;
 751 }
 752
 753 /**
 754  * spu_deactivate - unbind a context from it's physical spu
 755  * @ctx:        spu context to unbind
 756  *
 757  * Unbind @ctx from the physical spu it is running on and schedule
 758  * the highest priority context to run on the freed physical spu.
 759  */
 760 void spu_deactivate(struct spu_context *ctx)
 761 {
 762         __spu_deactivate(ctx, 1, MAX_PRIO);
 763 }
 764
 765 /**
 766  * spu_yield -  yield a physical spu if others are waiting
 767  * @ctx:        spu context to yield
 768  *
 769  * Check if there is a higher priority context waiting and if yes
 770  * unbind @ctx from the physical spu and schedule the highest
 771  * priority context to run on the freed physical spu instead.
 772  */
 773 void spu_yield(struct spu_context *ctx)
 774 {
 775         if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
 776                 mutex_lock(&ctx->state_mutex);
 777                 __spu_deactivate(ctx, 0, MAX_PRIO);
 778                 mutex_unlock(&ctx->state_mutex);
 779         }
 780 }
 781
 782 static noinline void spusched_tick(struct spu_context *ctx)
 783 {
 784         if (ctx->flags & SPU_CREATE_NOSCHED)
 785                 return;
 786         if (ctx->policy == SCHED_FIFO)
 787                 return;
 788
 789         if (--ctx->time_slice)
 790                 return;
 791
 792         /*
 793          * Unfortunately list_mutex ranks outside of state_mutex, so
 794          * we have to trylock here.  If we fail give the context another
 795          * tick and try again.
 796          */
 797         if (mutex_trylock(&ctx->state_mutex)) {
 798                 struct spu *spu = ctx->spu;
 799                 struct spu_context *new;
 800
 801                 new = grab_runnable_context(ctx->prio + 1, spu->node);
 802                 if (new) {
 803                         spu_unbind_context(spu, ctx);
 804                         ctx->stats.invol_ctx_switch++;
 805                         spu->stats.invol_ctx_switch++;
 806                         spu->alloc_state = SPU_FREE;
 807                         cbe_spu_info[spu->node].nr_active--;
 808                         wake_up(&new->stop_wq);
 809                         /*
 810                          * We need to break out of the wait loop in
 811                          * spu_run manually to ensure this context
 812                          * gets put on the runqueue again ASAP.
 813                          */
 814                         wake_up(&ctx->stop_wq);
 815                 }
 816                 spu_set_timeslice(ctx);
 817                 mutex_unlock(&ctx->state_mutex);
 818         } else {
 819                 ctx->time_slice++;
 820         }
 821 }
 822
 823 /**
 824  * count_active_contexts - count nr of active tasks
 825  *
 826  * Return the number of tasks currently running or waiting to run.
 827  *
 828  * Note that we don't take runq_lock / list_mutex here.  Reading
 829  * a single 32bit value is atomic on powerpc, and we don't care
 830  * about memory ordering issues here.
 831  */
 832 static unsigned long count_active_contexts(void)
 833 {
 834         int nr_active = 0, node;
 835
 836         for (node = 0; node < MAX_NUMNODES; node++)
 837                 nr_active += cbe_spu_info[node].nr_active;
 838         nr_active += spu_prio->nr_waiting;
 839
 840         return nr_active;
 841 }
 842
 843 /**
 844  * spu_calc_load - given tick count, update the avenrun load estimates.
 845  * @tick:       tick count
 846  *
 847  * No locking against reading these values from userspace, as for
 848  * the CPU loadavg code.
 849  */
 850 static void spu_calc_load(unsigned long ticks)
 851 {
 852         unsigned long active_tasks; /* fixed-point */
 853         static int count = LOAD_FREQ;
 854
 855         count -= ticks;
 856
 857         if (unlikely(count < 0)) {
 858                 active_tasks = count_active_contexts() * FIXED_1;
 859                 do {
 860                         CALC_LOAD(spu_avenrun[0], EXP_1, active_tasks);
 861                         CALC_LOAD(spu_avenrun[1], EXP_5, active_tasks);
 862                         CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
 863                         count += LOAD_FREQ;
 864                 } while (count < 0);
 865         }
 866 }
 867
 868 static void spusched_wake(unsigned long data)
 869 {
 870         mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 871         wake_up_process(spusched_task);
 872         spu_calc_load(SPUSCHED_TICK);
 873 }
 874
 875 static int spusched_thread(void *unused)
 876 {
 877         struct spu *spu;
 878         int node;
 879
 880         while (!kthread_should_stop()) {
 881                 set_current_state(TASK_INTERRUPTIBLE);
 882                 schedule();
 883                 for (node = 0; node < MAX_NUMNODES; node++) {
 884                         mutex_lock(&cbe_spu_info[node].list_mutex);
 885                         list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
 886                                 if (spu->ctx)
 887                                         spusched_tick(spu->ctx);
 888                         mutex_unlock(&cbe_spu_info[node].list_mutex);
 889                 }
 890         }
 891
 892         return 0;
 893 }
 894
 895 #define LOAD_INT(x) ((x) >> FSHIFT)
 896 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 897
 898 static int show_spu_loadavg(struct seq_file *s, void *private)
 899 {
 900         int a, b, c;
 901
 902         a = spu_avenrun[0] + (FIXED_1/200);
 903         b = spu_avenrun[1] + (FIXED_1/200);
 904         c = spu_avenrun[2] + (FIXED_1/200);
 905
 906         /*
 907          * Note that last_pid doesn't really make much sense for the
 908          * SPU loadavg (it even seems very odd on the CPU side...),
 909          * but we include it here to have a 100% compatible interface.
 910          */
 911         seq_printf(s, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
 912                 LOAD_INT(a), LOAD_FRAC(a),
 913                 LOAD_INT(b), LOAD_FRAC(b),
 914                 LOAD_INT(c), LOAD_FRAC(c),
 915                 count_active_contexts(),
 916                 atomic_read(&nr_spu_contexts),
 917                 current->nsproxy->pid_ns->last_pid);
 918         return 0;
 919 }
 920
 921 static int spu_loadavg_open(struct inode *inode, struct file *file)
 922 {
 923         return single_open(file, show_spu_loadavg, NULL);
 924 }
 925
 926 static const struct file_operations spu_loadavg_fops = {
 927         .open           = spu_loadavg_open,
 928         .read           = seq_read,
 929         .llseek         = seq_lseek,
 930         .release        = single_release,
 931 };
 932
 933 int __init spu_sched_init(void)
 934 {
 935         struct proc_dir_entry *entry;
 936         int err = -ENOMEM, i;
 937
 938         spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
 939         if (!spu_prio)
 940                 goto out;
 941
 942         for (i = 0; i < MAX_PRIO; i++) {
 943                 INIT_LIST_HEAD(&spu_prio->runq[i]);
 944                 __clear_bit(i, spu_prio->bitmap);
 945         }
 946         spin_lock_init(&spu_prio->runq_lock);
 947
 948         setup_timer(&spusched_timer, spusched_wake, 0);
 949
 950         spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 951         if (IS_ERR(spusched_task)) {
 952                 err = PTR_ERR(spusched_task);
 953                 goto out_free_spu_prio;
 954         }
 955
 956         entry = create_proc_entry("spu_loadavg", 0, NULL);
 957         if (!entry)
 958                 goto out_stop_kthread;
 959         entry->proc_fops = &spu_loadavg_fops;
 960
 961         pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
 962                         SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
 963         return 0;
 964
 965  out_stop_kthread:
 966         kthread_stop(spusched_task);
 967  out_free_spu_prio:
 968         kfree(spu_prio);
 969  out:
 970         return err;
 971 }
 972
 973 void spu_sched_exit(void)
 974 {
 975         struct spu *spu;
 976         int node;
 977
 978         remove_proc_entry("spu_loadavg", NULL);
 979
 980         del_timer_sync(&spusched_timer);
 981         kthread_stop(spusched_task);
 982
 983         for (node = 0; node < MAX_NUMNODES; node++) {
 984                 mutex_lock(&cbe_spu_info[node].list_mutex);
 985                 list_for_each_entry(spu, &cbe_spu_info[node].spus, cbe_list)
 986                         if (spu->alloc_state != SPU_FREE)
 987                                 spu->alloc_state = SPU_FREE;
 988                 mutex_unlock(&cbe_spu_info[node].list_mutex);
 989         }
 990         kfree(spu_prio);
 991 }