git.oblomov.eu Git - linux-2.6/blob - arch/powerpc/platforms/cell/spufs/sched.c

   1 /* sched.c - SPU scheduler.
   2  *
   3  * Copyright (C) IBM 2005
   4  * Author: Mark Nutter <mnutter@us.ibm.com>
   5  *
   6  * 2006-03-31   NUMA domains added.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2, or (at your option)
  11  * any later version.
  12  *
  13  * This program is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * along with this program; if not, write to the Free Software
  20  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  21  */
  22
  23 #undef DEBUG
  24
  25 #include <linux/module.h>
  26 #include <linux/errno.h>
  27 #include <linux/sched.h>
  28 #include <linux/kernel.h>
  29 #include <linux/mm.h>
  30 #include <linux/completion.h>
  31 #include <linux/vmalloc.h>
  32 #include <linux/smp.h>
  33 #include <linux/stddef.h>
  34 #include <linux/unistd.h>
  35 #include <linux/numa.h>
  36 #include <linux/mutex.h>
  37 #include <linux/notifier.h>
  38 #include <linux/kthread.h>
  39
  40 #include <asm/io.h>
  41 #include <asm/mmu_context.h>
  42 #include <asm/spu.h>
  43 #include <asm/spu_csa.h>
  44 #include <asm/spu_priv1.h>
  45 #include "spufs.h"
  46
  47 struct spu_prio_array {
  48         DECLARE_BITMAP(bitmap, MAX_PRIO);
  49         struct list_head runq[MAX_PRIO];
  50         spinlock_t runq_lock;
  51         struct list_head active_list[MAX_NUMNODES];
  52         struct mutex active_mutex[MAX_NUMNODES];
  53 };
  54
  55 static struct spu_prio_array *spu_prio;
  56 static struct task_struct *spusched_task;
  57 static struct timer_list spusched_timer;
  58
  59 /*
  60  * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
  61  */
  62 #define NORMAL_PRIO             120
  63
  64 /*
  65  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  66  * tick for every 10 CPU scheduler ticks.
  67  */
  68 #define SPUSCHED_TICK           (10)
  69
  70 /*
  71  * These are the 'tuning knobs' of the scheduler:
  72  *
  73  * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
  74  * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
  75  */
  76 #define MIN_SPU_TIMESLICE       max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
  77 #define DEF_SPU_TIMESLICE       (100 * HZ / (1000 * SPUSCHED_TICK))
  78
  79 #define MAX_USER_PRIO           (MAX_PRIO - MAX_RT_PRIO)
  80 #define SCALE_PRIO(x, prio) \
  81         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
  82
  83 /*
  84  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
  85  * [800ms ... 100ms ... 5ms]
  86  *
  87  * The higher a thread's priority, the bigger timeslices
  88  * it gets during one round of execution. But even the lowest
  89  * priority thread gets MIN_TIMESLICE worth of execution time.
  90  */
  91 void spu_set_timeslice(struct spu_context *ctx)
  92 {
  93         if (ctx->prio < NORMAL_PRIO)
  94                 ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
  95         else
  96                 ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
  97 }
  98
  99 /*
 100  * Update scheduling information from the owning thread.
 101  */
 102 void __spu_update_sched_info(struct spu_context *ctx)
 103 {
 104         /*
 105          * 32-Bit assignment are atomic on powerpc, and we don't care about
 106          * memory ordering here because retriving the controlling thread is
 107          * per defintion racy.
 108          */
 109         ctx->tid = current->pid;
 110
 111         /*
 112          * We do our own priority calculations, so we normally want
 113          * ->static_prio to start with. Unfortunately thies field
 114          * contains junk for threads with a realtime scheduling
 115          * policy so we have to look at ->prio in this case.
 116          */
 117         if (rt_prio(current->prio))
 118                 ctx->prio = current->prio;
 119         else
 120                 ctx->prio = current->static_prio;
 121         ctx->policy = current->policy;
 122
 123         /*
 124          * A lot of places that don't hold active_mutex poke into
 125          * cpus_allowed, including grab_runnable_context which
 126          * already holds the runq_lock.  So abuse runq_lock
 127          * to protect this field aswell.
 128          */
 129         spin_lock(&spu_prio->runq_lock);
 130         ctx->cpus_allowed = current->cpus_allowed;
 131         spin_unlock(&spu_prio->runq_lock);
 132 }
 133
 134 void spu_update_sched_info(struct spu_context *ctx)
 135 {
 136         int node = ctx->spu->node;
 137
 138         mutex_lock(&spu_prio->active_mutex[node]);
 139         __spu_update_sched_info(ctx);
 140         mutex_unlock(&spu_prio->active_mutex[node]);
 141 }
 142
 143 static int __node_allowed(struct spu_context *ctx, int node)
 144 {
 145         if (nr_cpus_node(node)) {
 146                 cpumask_t mask = node_to_cpumask(node);
 147
 148                 if (cpus_intersects(mask, ctx->cpus_allowed))
 149                         return 1;
 150         }
 151
 152         return 0;
 153 }
 154
 155 static int node_allowed(struct spu_context *ctx, int node)
 156 {
 157         int rval;
 158
 159         spin_lock(&spu_prio->runq_lock);
 160         rval = __node_allowed(ctx, node);
 161         spin_unlock(&spu_prio->runq_lock);
 162
 163         return rval;
 164 }
 165
 166 /**
 167  * spu_add_to_active_list - add spu to active list
 168  * @spu:        spu to add to the active list
 169  */
 170 static void spu_add_to_active_list(struct spu *spu)
 171 {
 172         mutex_lock(&spu_prio->active_mutex[spu->node]);
 173         list_add_tail(&spu->list, &spu_prio->active_list[spu->node]);
 174         mutex_unlock(&spu_prio->active_mutex[spu->node]);
 175 }
 176
 177 static void __spu_remove_from_active_list(struct spu *spu)
 178 {
 179         list_del_init(&spu->list);
 180 }
 181
 182 /**
 183  * spu_remove_from_active_list - remove spu from active list
 184  * @spu:       spu to remove from the active list
 185  */
 186 static void spu_remove_from_active_list(struct spu *spu)
 187 {
 188         int node = spu->node;
 189
 190         mutex_lock(&spu_prio->active_mutex[node]);
 191         __spu_remove_from_active_list(spu);
 192         mutex_unlock(&spu_prio->active_mutex[node]);
 193 }
 194
 195 static BLOCKING_NOTIFIER_HEAD(spu_switch_notifier);
 196
 197 static void spu_switch_notify(struct spu *spu, struct spu_context *ctx)
 198 {
 199         blocking_notifier_call_chain(&spu_switch_notifier,
 200                             ctx ? ctx->object_id : 0, spu);
 201 }
 202
 203 int spu_switch_event_register(struct notifier_block * n)
 204 {
 205         return blocking_notifier_chain_register(&spu_switch_notifier, n);
 206 }
 207
 208 int spu_switch_event_unregister(struct notifier_block * n)
 209 {
 210         return blocking_notifier_chain_unregister(&spu_switch_notifier, n);
 211 }
 212
 213 /**
 214  * spu_bind_context - bind spu context to physical spu
 215  * @spu:        physical spu to bind to
 216  * @ctx:        context to bind
 217  */
 218 static void spu_bind_context(struct spu *spu, struct spu_context *ctx)
 219 {
 220         pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid,
 221                  spu->number, spu->node);
 222         spu->ctx = ctx;
 223         spu->flags = 0;
 224         ctx->spu = spu;
 225         ctx->ops = &spu_hw_ops;
 226         spu->pid = current->pid;
 227         spu_associate_mm(spu, ctx->owner);
 228         spu->ibox_callback = spufs_ibox_callback;
 229         spu->wbox_callback = spufs_wbox_callback;
 230         spu->stop_callback = spufs_stop_callback;
 231         spu->mfc_callback = spufs_mfc_callback;
 232         spu->dma_callback = spufs_dma_callback;
 233         mb();
 234         spu_unmap_mappings(ctx);
 235         spu_restore(&ctx->csa, spu);
 236         spu->timestamp = jiffies;
 237         spu_cpu_affinity_set(spu, raw_smp_processor_id());
 238         spu_switch_notify(spu, ctx);
 239         ctx->state = SPU_STATE_RUNNABLE;
 240 }
 241
 242 /**
 243  * spu_unbind_context - unbind spu context from physical spu
 244  * @spu:        physical spu to unbind from
 245  * @ctx:        context to unbind
 246  */
 247 static void spu_unbind_context(struct spu *spu, struct spu_context *ctx)
 248 {
 249         pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
 250                  spu->pid, spu->number, spu->node);
 251
 252         spu_switch_notify(spu, NULL);
 253         spu_unmap_mappings(ctx);
 254         spu_save(&ctx->csa, spu);
 255         spu->timestamp = jiffies;
 256         ctx->state = SPU_STATE_SAVED;
 257         spu->ibox_callback = NULL;
 258         spu->wbox_callback = NULL;
 259         spu->stop_callback = NULL;
 260         spu->mfc_callback = NULL;
 261         spu->dma_callback = NULL;
 262         spu_associate_mm(spu, NULL);
 263         spu->pid = 0;
 264         ctx->ops = &spu_backing_ops;
 265         ctx->spu = NULL;
 266         spu->flags = 0;
 267         spu->ctx = NULL;
 268 }
 269
 270 /**
 271  * spu_add_to_rq - add a context to the runqueue
 272  * @ctx:       context to add
 273  */
 274 static void __spu_add_to_rq(struct spu_context *ctx)
 275 {
 276         int prio = ctx->prio;
 277
 278         list_add_tail(&ctx->rq, &spu_prio->runq[prio]);
 279         set_bit(prio, spu_prio->bitmap);
 280 }
 281
 282 static void __spu_del_from_rq(struct spu_context *ctx)
 283 {
 284         int prio = ctx->prio;
 285
 286         if (!list_empty(&ctx->rq))
 287                 list_del_init(&ctx->rq);
 288         if (list_empty(&spu_prio->runq[prio]))
 289                 clear_bit(prio, spu_prio->bitmap);
 290 }
 291
 292 static void spu_prio_wait(struct spu_context *ctx)
 293 {
 294         DEFINE_WAIT(wait);
 295
 296         spin_lock(&spu_prio->runq_lock);
 297         prepare_to_wait_exclusive(&ctx->stop_wq, &wait, TASK_INTERRUPTIBLE);
 298         if (!signal_pending(current)) {
 299                 __spu_add_to_rq(ctx);
 300                 spin_unlock(&spu_prio->runq_lock);
 301                 mutex_unlock(&ctx->state_mutex);
 302                 schedule();
 303                 mutex_lock(&ctx->state_mutex);
 304                 spin_lock(&spu_prio->runq_lock);
 305                 __spu_del_from_rq(ctx);
 306         }
 307         spin_unlock(&spu_prio->runq_lock);
 308         __set_current_state(TASK_RUNNING);
 309         remove_wait_queue(&ctx->stop_wq, &wait);
 310 }
 311
 312 static struct spu *spu_get_idle(struct spu_context *ctx)
 313 {
 314         struct spu *spu = NULL;
 315         int node = cpu_to_node(raw_smp_processor_id());
 316         int n;
 317
 318         for (n = 0; n < MAX_NUMNODES; n++, node++) {
 319                 node = (node < MAX_NUMNODES) ? node : 0;
 320                 if (!node_allowed(ctx, node))
 321                         continue;
 322                 spu = spu_alloc_node(node);
 323                 if (spu)
 324                         break;
 325         }
 326         return spu;
 327 }
 328
 329 /**
 330  * find_victim - find a lower priority context to preempt
 331  * @ctx:        canidate context for running
 332  *
 333  * Returns the freed physical spu to run the new context on.
 334  */
 335 static struct spu *find_victim(struct spu_context *ctx)
 336 {
 337         struct spu_context *victim = NULL;
 338         struct spu *spu;
 339         int node, n;
 340
 341         /*
 342          * Look for a possible preemption candidate on the local node first.
 343          * If there is no candidate look at the other nodes.  This isn't
 344          * exactly fair, but so far the whole spu schedule tries to keep
 345          * a strong node affinity.  We might want to fine-tune this in
 346          * the future.
 347          */
 348  restart:
 349         node = cpu_to_node(raw_smp_processor_id());
 350         for (n = 0; n < MAX_NUMNODES; n++, node++) {
 351                 node = (node < MAX_NUMNODES) ? node : 0;
 352                 if (!node_allowed(ctx, node))
 353                         continue;
 354
 355                 mutex_lock(&spu_prio->active_mutex[node]);
 356                 list_for_each_entry(spu, &spu_prio->active_list[node], list) {
 357                         struct spu_context *tmp = spu->ctx;
 358
 359                         if (tmp->prio > ctx->prio &&
 360                             (!victim || tmp->prio > victim->prio))
 361                                 victim = spu->ctx;
 362                 }
 363                 mutex_unlock(&spu_prio->active_mutex[node]);
 364
 365                 if (victim) {
 366                         /*
 367                          * This nests ctx->state_mutex, but we always lock
 368                          * higher priority contexts before lower priority
 369                          * ones, so this is safe until we introduce
 370                          * priority inheritance schemes.
 371                          */
 372                         if (!mutex_trylock(&victim->state_mutex)) {
 373                                 victim = NULL;
 374                                 goto restart;
 375                         }
 376
 377                         spu = victim->spu;
 378                         if (!spu) {
 379                                 /*
 380                                  * This race can happen because we've dropped
 381                                  * the active list mutex.  No a problem, just
 382                                  * restart the search.
 383                                  */
 384                                 mutex_unlock(&victim->state_mutex);
 385                                 victim = NULL;
 386                                 goto restart;
 387                         }
 388                         spu_remove_from_active_list(spu);
 389                         spu_unbind_context(spu, victim);
 390                         mutex_unlock(&victim->state_mutex);
 391                         /*
 392                          * We need to break out of the wait loop in spu_run
 393                          * manually to ensure this context gets put on the
 394                          * runqueue again ASAP.
 395                          */
 396                         wake_up(&victim->stop_wq);
 397                         return spu;
 398                 }
 399         }
 400
 401         return NULL;
 402 }
 403
 404 /**
 405  * spu_activate - find a free spu for a context and execute it
 406  * @ctx:        spu context to schedule
 407  * @flags:      flags (currently ignored)
 408  *
 409  * Tries to find a free spu to run @ctx.  If no free spu is available
 410  * add the context to the runqueue so it gets woken up once an spu
 411  * is available.
 412  */
 413 int spu_activate(struct spu_context *ctx, unsigned long flags)
 414 {
 415
 416         if (ctx->spu)
 417                 return 0;
 418
 419         do {
 420                 struct spu *spu;
 421
 422                 spu = spu_get_idle(ctx);
 423                 /*
 424                  * If this is a realtime thread we try to get it running by
 425                  * preempting a lower priority thread.
 426                  */
 427                 if (!spu && rt_prio(ctx->prio))
 428                         spu = find_victim(ctx);
 429                 if (spu) {
 430                         spu_bind_context(spu, ctx);
 431                         spu_add_to_active_list(spu);
 432                         return 0;
 433                 }
 434
 435                 spu_prio_wait(ctx);
 436         } while (!signal_pending(current));
 437
 438         return -ERESTARTSYS;
 439 }
 440
 441 /**
 442  * grab_runnable_context - try to find a runnable context
 443  *
 444  * Remove the highest priority context on the runqueue and return it
 445  * to the caller.  Returns %NULL if no runnable context was found.
 446  */
 447 static struct spu_context *grab_runnable_context(int prio, int node)
 448 {
 449         struct spu_context *ctx;
 450         int best;
 451
 452         spin_lock(&spu_prio->runq_lock);
 453         best = sched_find_first_bit(spu_prio->bitmap);
 454         while (best < prio) {
 455                 struct list_head *rq = &spu_prio->runq[best];
 456
 457                 list_for_each_entry(ctx, rq, rq) {
 458                         /* XXX(hch): check for affinity here aswell */
 459                         if (__node_allowed(ctx, node)) {
 460                                 __spu_del_from_rq(ctx);
 461                                 goto found;
 462                         }
 463                 }
 464                 best++;
 465         }
 466         ctx = NULL;
 467  found:
 468         spin_unlock(&spu_prio->runq_lock);
 469         return ctx;
 470 }
 471
 472 static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio)
 473 {
 474         struct spu *spu = ctx->spu;
 475         struct spu_context *new = NULL;
 476
 477         if (spu) {
 478                 new = grab_runnable_context(max_prio, spu->node);
 479                 if (new || force) {
 480                         spu_remove_from_active_list(spu);
 481                         spu_unbind_context(spu, ctx);
 482                         spu_free(spu);
 483                         if (new)
 484                                 wake_up(&new->stop_wq);
 485                 }
 486
 487         }
 488
 489         return new != NULL;
 490 }
 491
 492 /**
 493  * spu_deactivate - unbind a context from it's physical spu
 494  * @ctx:        spu context to unbind
 495  *
 496  * Unbind @ctx from the physical spu it is running on and schedule
 497  * the highest priority context to run on the freed physical spu.
 498  */
 499 void spu_deactivate(struct spu_context *ctx)
 500 {
 501         /*
 502          * We must never reach this for a nosched context,
 503          * but handle the case gracefull instead of panicing.
 504          */
 505         if (ctx->flags & SPU_CREATE_NOSCHED) {
 506                 WARN_ON(1);
 507                 return;
 508         }
 509
 510         __spu_deactivate(ctx, 1, MAX_PRIO);
 511 }
 512
 513 /**
 514  * spu_yield -  yield a physical spu if others are waiting
 515  * @ctx:        spu context to yield
 516  *
 517  * Check if there is a higher priority context waiting and if yes
 518  * unbind @ctx from the physical spu and schedule the highest
 519  * priority context to run on the freed physical spu instead.
 520  */
 521 void spu_yield(struct spu_context *ctx)
 522 {
 523         if (!(ctx->flags & SPU_CREATE_NOSCHED)) {
 524                 mutex_lock(&ctx->state_mutex);
 525                 __spu_deactivate(ctx, 0, MAX_PRIO);
 526                 mutex_unlock(&ctx->state_mutex);
 527         }
 528 }
 529
 530 static void spusched_tick(struct spu_context *ctx)
 531 {
 532         if (ctx->flags & SPU_CREATE_NOSCHED)
 533                 return;
 534         if (ctx->policy == SCHED_FIFO)
 535                 return;
 536
 537         if (--ctx->time_slice)
 538                 return;
 539
 540         /*
 541          * Unfortunately active_mutex ranks outside of state_mutex, so
 542          * we have to trylock here.  If we fail give the context another
 543          * tick and try again.
 544          */
 545         if (mutex_trylock(&ctx->state_mutex)) {
 546                 struct spu *spu = ctx->spu;
 547                 struct spu_context *new;
 548
 549                 new = grab_runnable_context(ctx->prio + 1, spu->node);
 550                 if (new) {
 551
 552                         __spu_remove_from_active_list(spu);
 553                         spu_unbind_context(spu, ctx);
 554                         spu_free(spu);
 555                         wake_up(&new->stop_wq);
 556                         /*
 557                          * We need to break out of the wait loop in
 558                          * spu_run manually to ensure this context
 559                          * gets put on the runqueue again ASAP.
 560                          */
 561                         wake_up(&ctx->stop_wq);
 562                 }
 563                 spu_set_timeslice(ctx);
 564                 mutex_unlock(&ctx->state_mutex);
 565         } else {
 566                 ctx->time_slice++;
 567         }
 568 }
 569
 570 static void spusched_wake(unsigned long data)
 571 {
 572         mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 573         wake_up_process(spusched_task);
 574 }
 575
 576 static int spusched_thread(void *unused)
 577 {
 578         struct spu *spu, *next;
 579         int node;
 580
 581         setup_timer(&spusched_timer, spusched_wake, 0);
 582         __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 583
 584         while (!kthread_should_stop()) {
 585                 set_current_state(TASK_INTERRUPTIBLE);
 586                 schedule();
 587                 for (node = 0; node < MAX_NUMNODES; node++) {
 588                         mutex_lock(&spu_prio->active_mutex[node]);
 589                         list_for_each_entry_safe(spu, next,
 590                                                  &spu_prio->active_list[node],
 591                                                  list)
 592                                 spusched_tick(spu->ctx);
 593                         mutex_unlock(&spu_prio->active_mutex[node]);
 594                 }
 595         }
 596
 597         del_timer_sync(&spusched_timer);
 598         return 0;
 599 }
 600
 601 int __init spu_sched_init(void)
 602 {
 603         int i;
 604
 605         spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
 606         if (!spu_prio)
 607                 return -ENOMEM;
 608
 609         for (i = 0; i < MAX_PRIO; i++) {
 610                 INIT_LIST_HEAD(&spu_prio->runq[i]);
 611                 __clear_bit(i, spu_prio->bitmap);
 612         }
 613         __set_bit(MAX_PRIO, spu_prio->bitmap);
 614         for (i = 0; i < MAX_NUMNODES; i++) {
 615                 mutex_init(&spu_prio->active_mutex[i]);
 616                 INIT_LIST_HEAD(&spu_prio->active_list[i]);
 617         }
 618         spin_lock_init(&spu_prio->runq_lock);
 619
 620         spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 621         if (IS_ERR(spusched_task)) {
 622                 kfree(spu_prio);
 623                 return PTR_ERR(spusched_task);
 624         }
 625
 626         pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
 627                         SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
 628         return 0;
 629
 630 }
 631
 632 void __exit spu_sched_exit(void)
 633 {
 634         struct spu *spu, *tmp;
 635         int node;
 636
 637         kthread_stop(spusched_task);
 638
 639         for (node = 0; node < MAX_NUMNODES; node++) {
 640                 mutex_lock(&spu_prio->active_mutex[node]);
 641                 list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
 642                                          list) {
 643                         list_del_init(&spu->list);
 644                         spu_free(spu);
 645                 }
 646                 mutex_unlock(&spu_prio->active_mutex[node]);
 647         }
 648         kfree(spu_prio);
 649 }