git.oblomov.eu Git - linux-2.6/blob - kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  */
  20
  21 #include <linux/mm.h>
  22 #include <linux/module.h>
  23 #include <linux/nmi.h>
  24 #include <linux/init.h>
  25 #include <asm/uaccess.h>
  26 #include <linux/highmem.h>
  27 #include <linux/smp_lock.h>
  28 #include <asm/mmu_context.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/capability.h>
  31 #include <linux/completion.h>
  32 #include <linux/kernel_stat.h>
  33 #include <linux/debug_locks.h>
  34 #include <linux/security.h>
  35 #include <linux/notifier.h>
  36 #include <linux/profile.h>
  37 #include <linux/freezer.h>
  38 #include <linux/vmalloc.h>
  39 #include <linux/blkdev.h>
  40 #include <linux/delay.h>
  41 #include <linux/smp.h>
  42 #include <linux/threads.h>
  43 #include <linux/timer.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/cpu.h>
  46 #include <linux/cpuset.h>
  47 #include <linux/percpu.h>
  48 #include <linux/kthread.h>
  49 #include <linux/seq_file.h>
  50 #include <linux/syscalls.h>
  51 #include <linux/times.h>
  52 #include <linux/tsacct_kern.h>
  53 #include <linux/kprobes.h>
  54 #include <linux/delayacct.h>
  55 #include <linux/reciprocal_div.h>
  56
  57 #include <asm/tlb.h>
  58 #include <asm/unistd.h>
  59
  60 /*
  61  * Scheduler clock - returns current time in nanosec units.
  62  * This is default implementation.
  63  * Architectures and sub-architectures can override this.
  64  */
  65 unsigned long long __attribute__((weak)) sched_clock(void)
  66 {
  67         return (unsigned long long)jiffies * (1000000000 / HZ);
  68 }
  69
   70 /*
   71  * Convert user-nice values [ -20 ... 0 ... 19 ]
   72  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
   73  * and back.
       *
       * NICE_TO_PRIO(nice): nice -20 maps to MAX_RT_PRIO; each nice step adds 1.
       * PRIO_TO_NICE(prio): exact inverse of NICE_TO_PRIO().
       * TASK_NICE(p):       nice value computed from p->static_prio.
   74  */
   75 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
   76 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
   77 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  78
   79 /*
   80  * 'User priority' is the nice value converted to something we
   81  * can work with better when scaling various scheduler parameters,
   82  * it's a [ 0 ... 39 ] range.
       *
       * USER_PRIO(p) takes a priority number (not a task) and subtracts
       * MAX_RT_PRIO from it.  TASK_USER_PRIO(p) takes a task and applies
       * USER_PRIO() to p->static_prio.  MAX_USER_PRIO is USER_PRIO(MAX_PRIO),
       * i.e. the number of values in that range.
   83  */
   84 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
   85 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
   86 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
  87
   88 /*
   89  * Some helpers for converting nanosecond timing to jiffy resolution
       *
       * Both use (1000000000 / HZ), the number of nanoseconds in one jiffy.
       * NS_TO_JIFFIES() is a plain division, so with integer operands any
       * partial jiffy is discarded.
   90  */
   91 #define NS_TO_JIFFIES(TIME)     ((TIME) / (1000000000 / HZ))
   92 #define JIFFIES_TO_NS(TIME)     ((TIME) * (1000000000 / HZ))
  93
   94 /*
   95  * These are the 'tuning knobs' of the scheduler:
   96  *
   97  * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
   98  * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
   99  * Timeslices get refilled after they expire.
       *
       * Timeslice values are expressed in jiffies (msecs * HZ / 1000).
       * Derived values: MAX_BONUS is PRIO_BONUS_RATIO percent of
       * MAX_USER_PRIO; MAX_SLEEP_AVG is DEF_TIMESLICE * MAX_BONUS (jiffies);
       * STARVATION_LIMIT is the same value; NS_MAX_SLEEP_AVG is
       * MAX_SLEEP_AVG converted to nanoseconds.
  100  */
  101 #define MIN_TIMESLICE           max(5 * HZ / 1000, 1)
  102 #define DEF_TIMESLICE           (100 * HZ / 1000)
  103 #define ON_RUNQUEUE_WEIGHT       30
  104 #define CHILD_PENALTY            95
  105 #define PARENT_PENALTY          100
  106 #define EXIT_WEIGHT               3
  107 #define PRIO_BONUS_RATIO         25
  108 #define MAX_BONUS               (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
  109 #define INTERACTIVE_DELTA         2
  110 #define MAX_SLEEP_AVG           (DEF_TIMESLICE * MAX_BONUS)
  111 #define STARVATION_LIMIT        (MAX_SLEEP_AVG)
  112 #define NS_MAX_SLEEP_AVG        (JIFFIES_TO_NS(MAX_SLEEP_AVG))
 113
 114 /*
 115  * If a task is 'interactive' then we reinsert it in the active
 116  * array after it has expired its current timeslice. (it will not
 117  * continue to run immediately, it will still roundrobin with
 118  * other interactive tasks.)
 119  *
 120  * This part scales the interactivity limit depending on niceness.
 121  *
 122  * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
 123  * Here are a few examples of different nice levels:
 124  *
 125  *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
 126  *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
 127  *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
 128  *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
 129  *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
 130  *
 131  * (the X axis represents the possible -5 ... 0 ... +5 dynamic
 132  *  priority range a task can explore, a value of '1' means the
 133  *  task is rated interactive.)
 134  *
  135  * I.e. nice +19 tasks can never get 'interactive' enough to be
 136  * reinserted into the active array. And only heavily CPU-hog nice -20
 137  * tasks will be expired. Default nice 0 tasks are somewhere between,
 138  * it takes some effort for them to get interactive, but it's not
 139  * too hard.
 140  */
  141
      /*
       * CURRENT_BONUS(p): p->sleep_avg (nanoseconds) converted to jiffies and
       * scaled by MAX_BONUS / MAX_SLEEP_AVG.
       *
       * GRANULARITY: 10 msecs expressed in jiffies; the GNU "?:" form
       * substitutes 1 when that computes to 0.
       *
       * TIMESLICE_GRANULARITY(p): GRANULARITY * 2^(d - 1), where d is
       * MAX_BONUS - CURRENT_BONUS(p), or 1 when that difference is 0.  On
       * SMP the result is additionally multiplied by num_online_cpus().
       */
  142 #define CURRENT_BONUS(p) \
  143         (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
  144                 MAX_SLEEP_AVG)
  145
  146 #define GRANULARITY     (10 * HZ / 1000 ? : 1)
  147
  148 #ifdef CONFIG_SMP
  149 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
  150                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
  151                         num_online_cpus())
  152 #else
  153 #define TIMESLICE_GRANULARITY(p)        (GRANULARITY * \
  154                 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
  155 #endif
 156
 157 #define SCALE(v1,v1_max,v2_max) \
 158         (v1) * (v2_max) / (v1_max)
  159
      /*
       * DELTA(p): nice-dependent offset.  The task's nice value is shifted
       * by 20, scaled from a 40-step range onto MAX_BONUS, recentred by
       * subtracting the value obtained for nice 0 (20 * MAX_BONUS / 40) and
       * then increased by INTERACTIVE_DELTA.
       */
  160 #define DELTA(p) \
  161         (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
  162                 INTERACTIVE_DELTA)
  163
      /* True when p->prio is at or below p->static_prio - DELTA(p). */
  164 #define TASK_INTERACTIVE(p) \
  165         ((p)->prio <= (p)->static_prio - DELTA(p))
  166
      /*
       * Nanosecond value of
       * MAX_SLEEP_AVG * (MAX_BONUS / 2 + DELTA(p) + 1) / MAX_BONUS - 1 jiffies.
       */
  167 #define INTERACTIVE_SLEEP(p) \
  168         (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
  169                 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
  170
      /* True when p->prio is numerically lower than the prio of rq->curr. */
  171 #define TASK_PREEMPTS_CURR(p, rq) \
  172         ((p)->prio < (rq)->curr->prio)
 173
 174 #define SCALE_PRIO(x, prio) \
 175         max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
 176
 177 static unsigned int static_prio_timeslice(int static_prio)
 178 {
 179         if (static_prio < NICE_TO_PRIO(0))
 180                 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
 181         else
 182                 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
 183 }
 184
 185 #ifdef CONFIG_SMP
 186 /*
 187  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 188  * Since cpu_power is a 'constant', we can use a reciprocal divide.
 189  */
 190 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
 191 {
 192         return reciprocal_divide(load, sg->reciprocal_cpu_power);
 193 }
 194
 195 /*
 196  * Each time a sched group cpu_power is changed,
 197  * we must compute its reciprocal value
 198  */
 199 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 200 {
 201         sg->__cpu_power += val;
 202         sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
 203 }
 204 #endif
 205
 206 /*
 207  * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 208  * to time slice values: [800ms ... 100ms ... 5ms]
 209  *
 210  * The higher a thread's priority, the bigger timeslices
 211  * it gets during one round of execution. But even the lowest
 212  * priority thread gets MIN_TIMESLICE worth of execution time.
 213  */
 214
 215 static inline unsigned int task_timeslice(struct task_struct *p)
 216 {
 217         return static_prio_timeslice(p->static_prio);
 218 }
 219
  220 /*
  221  * These are the runqueue data structures:
       *
       * struct prio_array - one set of per-priority run lists.
       * @nr_active: number of tasks currently queued in this array
       * @bitmap:    one bit per priority, set while queue[prio] is non-empty
       * @queue:     one list of queued tasks per priority level
  222  */
  223
  224 struct prio_array {
  225         unsigned int nr_active;
  226         DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
  227         struct list_head queue[MAX_PRIO];
  228 };
 229
  230 /*
  231  * This is the main, per-CPU runqueue data structure.
  232  *
  233  * Locking rule: code that needs to lock multiple runqueues (such as
  234  * the load balancing or the thread migration code) must acquire the
  235  * locks in ascending order of runqueue address (&runqueue).
  236  */
  237 struct rq {
              /* taken by task_rq_lock(), __task_rq_lock() and this_rq_lock() */
  238         spinlock_t lock;
  239
  240         /*
  241          * nr_running and cpu_load should be in the same cacheline because
  242          * remote CPUs use both these fields when doing load calculation.
  243          */
  244         unsigned long nr_running;
              /* sum of ->load_weight of the tasks counted in nr_running */
  245         unsigned long raw_weighted_load;
  246 #ifdef CONFIG_SMP
  247         unsigned long cpu_load[3];
  248         unsigned char idle_at_tick;
  249 #ifdef CONFIG_NO_HZ
  250         unsigned char in_nohz_recently;
  251 #endif
  252 #endif
  253         unsigned long long nr_switches;
  254
  255         /*
  256          * This is part of a global counter where only the total sum
  257          * over all CPUs matters. A task can increase this counter on
  258          * one CPU and if it got migrated afterwards it may decrease
  259          * it on another CPU. Always updated under the runqueue lock:
  260          */
  261         unsigned long nr_uninterruptible;
  262
  263         unsigned long expired_timestamp;
  264         /* Cached timestamp set by update_cpu_clock() */
  265         unsigned long long most_recent_timestamp;
              /* curr: the task task_running() compares against; idle: idle task */
  266         struct task_struct *curr, *idle;
  267         unsigned long next_balance;
  268         struct mm_struct *prev_mm;
              /* active/expired point at the priority arrays tasks are queued on */
  269         struct prio_array *active, *expired, arrays[2];
  270         int best_expired_prio;
  271         atomic_t nr_iowait;
  272
  273 #ifdef CONFIG_SMP
  274         struct sched_domain *sd;
  275
  276         /* For active balancing */
  277         int active_balance;
  278         int push_cpu;
  279         int cpu;                /* cpu of this runqueue */
  280
  281         struct task_struct *migration_thread;
  282         struct list_head migration_queue;
  283 #endif
  284
  285 #ifdef CONFIG_SCHEDSTATS
  286         /* latency stats */
  287         struct sched_info rq_sched_info;
  288
  289         /* sys_sched_yield() stats */
  290         unsigned long yld_exp_empty;
  291         unsigned long yld_act_empty;
  292         unsigned long yld_both_empty;
  293         unsigned long yld_cnt;
  294
  295         /* schedule() stats */
  296         unsigned long sched_switch;
  297         unsigned long sched_cnt;
  298         unsigned long sched_goidle;
  299
  300         /* try_to_wake_up() stats */
  301         unsigned long ttwu_cnt;
  302         unsigned long ttwu_local;
  303 #endif
  304         struct lock_class_key rq_lock_key;
  305 };
  306
      /*
       * One struct rq per CPU; reached through cpu_rq() and this_rq().
       * NOTE(review): the users of sched_hotcpu_mutex are not visible in
       * this part of the file.
       */
  307 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
  308 static DEFINE_MUTEX(sched_hotcpu_mutex);
 309
 310 static inline int cpu_of(struct rq *rq)
 311 {
 312 #ifdef CONFIG_SMP
 313         return rq->cpu;
 314 #else
 315         return 0;
 316 #endif
 317 }
 318
  319 /*
  320  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  321  * See detach_destroy_domains: synchronize_sched for details.
  322  *
  323  * The domain tree of any CPU may only be accessed from within
  324  * preempt-disabled sections.
       *
       * for_each_domain(cpu, __sd) starts at the runqueue's ->sd pointer and
       * follows ->parent until it becomes NULL.
  325  */
  326 #define for_each_domain(cpu, __sd) \
  327         for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
  328
      /*
       * Runqueue accessors:
       *   cpu_rq(cpu)   - runqueue of the given CPU
       *   this_rq()     - runqueue of the executing CPU
       *   task_rq(p)    - runqueue of the CPU reported by task_cpu(p)
       *   cpu_curr(cpu) - ->curr of that CPU's runqueue
       */
  329 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
  330 #define this_rq()               (&__get_cpu_var(runqueues))
  331 #define task_rq(p)              cpu_rq(task_cpu(p))
  332 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
  333
      /*
       * Default no-op definitions of the context-switch hooks, used only
       * when nothing has defined prepare_arch_switch()/finish_arch_switch()
       * before this point.
       */
  334 #ifndef prepare_arch_switch
  335 # define prepare_arch_switch(next)      do { } while (0)
  336 #endif
  337 #ifndef finish_arch_switch
  338 # define finish_arch_switch(prev)       do { } while (0)
  339 #endif
 340
 341 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 342 static inline int task_running(struct rq *rq, struct task_struct *p)
 343 {
 344         return rq->curr == p;
 345 }
  346
      /*
       * prepare_lock_switch - nothing to do in this configuration
       * (__ARCH_WANT_UNLOCKED_CTXSW not defined).
       */
  347 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
  348 {
  349 }
  350
      /*
       * finish_lock_switch - variant used when __ARCH_WANT_UNLOCKED_CTXSW is
       * not defined.  Fixes up the lock's debug owner and dependency-tracking
       * state for the new current task, then drops rq->lock with
       * spin_unlock_irq().  @prev is unused here.
       */
  351 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  352 {
  353 #ifdef CONFIG_DEBUG_SPINLOCK
  354         /* this is a valid case when another task releases the spinlock */
  355         rq->lock.owner = current;
  356 #endif
  357         /*
  358          * If we are tracking spinlock dependencies then we have to
  359          * fix up the runqueue lock - which gets 'carried over' from
  360          * prev into current:
  361          */
  362         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
  363
  364         spin_unlock_irq(&rq->lock);
  365 }
 366
 367 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 368 static inline int task_running(struct rq *rq, struct task_struct *p)
 369 {
 370 #ifdef CONFIG_SMP
 371         return p->oncpu;
 372 #else
 373         return rq->curr == p;
 374 #endif
 375 }
  376
      /*
       * prepare_lock_switch - __ARCH_WANT_UNLOCKED_CTXSW variant.  Flags
       * @next as being on a CPU (SMP only), then releases rq->lock.  The
       * _irq unlock form is used only when __ARCH_WANT_INTERRUPTS_ON_CTXSW
       * is defined.
       */
  377 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
  378 {
  379 #ifdef CONFIG_SMP
  380         /*
  381          * We can optimise this out completely for !SMP, because the
  382          * SMP rebalancing from interrupt is the only thing that cares
  383          * here.
  384          */
  385         next->oncpu = 1;
  386 #endif
  387 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  388         spin_unlock_irq(&rq->lock);
  389 #else
  390         spin_unlock(&rq->lock);
  391 #endif
  392 }
  393
      /*
       * finish_lock_switch - __ARCH_WANT_UNLOCKED_CTXSW variant.  On SMP,
       * clears prev->oncpu after a write barrier.  Unless
       * __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined, local interrupts are
       * enabled here.  @rq is unused in this variant.
       */
  394 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  395 {
  396 #ifdef CONFIG_SMP
  397         /*
  398          * After ->oncpu is cleared, the task can be moved to a different CPU.
  399          * We must ensure this doesn't happen until the switch is completely
  400          * finished.
  401          */
  402         smp_wmb();
  403         prev->oncpu = 0;
  404 #endif
  405 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
  406         local_irq_enable();
  407 #endif
  408 }
 409 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 410
 411 /*
 412  * __task_rq_lock - lock the runqueue a given task resides on.
 413  * Must be called interrupts disabled.
 414  */
 415 static inline struct rq *__task_rq_lock(struct task_struct *p)
 416         __acquires(rq->lock)
 417 {
 418         struct rq *rq;
 419
 420 repeat_lock_task:
 421         rq = task_rq(p);
 422         spin_lock(&rq->lock);
 423         if (unlikely(rq != task_rq(p))) {
 424                 spin_unlock(&rq->lock);
 425                 goto repeat_lock_task;
 426         }
 427         return rq;
 428 }
 429
 430 /*
 431  * task_rq_lock - lock the runqueue a given task resides on and disable
 432  * interrupts.  Note the ordering: we can safely lookup the task_rq without
 433  * explicitly disabling preemption.
 434  */
 435 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 436         __acquires(rq->lock)
 437 {
 438         struct rq *rq;
 439
 440 repeat_lock_task:
 441         local_irq_save(*flags);
 442         rq = task_rq(p);
 443         spin_lock(&rq->lock);
 444         if (unlikely(rq != task_rq(p))) {
 445                 spin_unlock_irqrestore(&rq->lock, *flags);
 446                 goto repeat_lock_task;
 447         }
 448         return rq;
 449 }
  450
      /*
       * __task_rq_unlock - release @rq->lock; the interrupt state is left
       * untouched.
       */
  451 static inline void __task_rq_unlock(struct rq *rq)
  452         __releases(rq->lock)
  453 {
  454         spin_unlock(&rq->lock);
  455 }
  456
      /*
       * task_rq_unlock - release @rq->lock and restore the interrupt state
       * stored in *@flags.
       */
  457 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
  458         __releases(rq->lock)
  459 {
  460         spin_unlock_irqrestore(&rq->lock, *flags);
  461 }
 462
 463 #ifdef CONFIG_SCHEDSTATS
  464 /*
  465  * Bump this up when changing the output format or the meaning of an existing
  466  * format, so that tools can adapt (or abort).  show_schedstat() prints it
       * on the "version" line.
  467  */
  468 #define SCHEDSTAT_VERSION 14
  469
      /*
       * show_schedstat - seq_file show callback producing the schedstat text.
       *
       * Emits a "version" line and a "timestamp" line (current jiffies),
       * then one "cpuN" line of runqueue counters per online CPU.  On SMP
       * each CPU line is followed by one "domainN" line per sched domain of
       * that CPU.  @v is unused.  Always returns 0.
       */
  470 static int show_schedstat(struct seq_file *seq, void *v)
  471 {
  472         int cpu;
  473
  474         seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
  475         seq_printf(seq, "timestamp %lu\n", jiffies);
  476         for_each_online_cpu(cpu) {
  477                 struct rq *rq = cpu_rq(cpu);
  478 #ifdef CONFIG_SMP
  479                 struct sched_domain *sd;
  480                 int dcnt = 0;
  481 #endif
  482
  483                 /* runqueue-specific stats */
  484                 seq_printf(seq,
  485                     "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
  486                     cpu, rq->yld_both_empty,
  487                     rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
  488                     rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
  489                     rq->ttwu_cnt, rq->ttwu_local,
  490                     rq->rq_sched_info.cpu_time,
  491                     rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
  492
  493                 seq_printf(seq, "\n");
  494
  495 #ifdef CONFIG_SMP
  496                 /* domain-specific stats */
                      /* the domain tree may only be walked with preemption off */
  497                 preempt_disable();
  498                 for_each_domain(cpu, sd) {
  499                         enum cpu_idle_type itype;
  500                         char mask_str[NR_CPUS];
  501
  502                         cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
  503                         seq_printf(seq, "domain%d %s", dcnt++, mask_str);
                              /* eight load-balance counters per idle type */
  504                         for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
  505                                         itype++) {
  506                                 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
  507                                                 "%lu",
  508                                     sd->lb_cnt[itype],
  509                                     sd->lb_balanced[itype],
  510                                     sd->lb_failed[itype],
  511                                     sd->lb_imbalance[itype],
  512                                     sd->lb_gained[itype],
  513                                     sd->lb_hot_gained[itype],
  514                                     sd->lb_nobusyq[itype],
  515                                     sd->lb_nobusyg[itype]);
  516                         }
  517                         seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
  518                             " %lu %lu %lu\n",
  519                             sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
  520                             sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
  521                             sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
  522                             sd->ttwu_wake_remote, sd->ttwu_move_affine,
  523                             sd->ttwu_move_balance);
  524                 }
  525                 preempt_enable();
  526 #endif
  527         }
  528         return 0;
  529 }
 530
 531 static int schedstat_open(struct inode *inode, struct file *file)
 532 {
 533         unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
 534         char *buf = kmalloc(size, GFP_KERNEL);
 535         struct seq_file *m;
 536         int res;
 537
 538         if (!buf)
 539                 return -ENOMEM;
 540         res = single_open(file, show_schedstat, NULL);
 541         if (!res) {
 542                 m = file->private_data;
 543                 m->buf = buf;
 544                 m->size = size;
 545         } else
 546                 kfree(buf);
 547         return res;
 548 }
  549
      /*
       * File operations for the schedstat file: a custom open handler,
       * with read, llseek and release provided by the seq_file helpers.
       */
  550 const struct file_operations proc_schedstat_operations = {
  551         .open    = schedstat_open,
  552         .read    = seq_read,
  553         .llseek  = seq_lseek,
  554         .release = single_release,
  555 };
 556
 557 /*
 558  * Expects runqueue lock to be held for atomicity of update
 559  */
 560 static inline void
 561 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
 562 {
 563         if (rq) {
 564                 rq->rq_sched_info.run_delay += delta_jiffies;
 565                 rq->rq_sched_info.pcnt++;
 566         }
 567 }
 568
 569 /*
 570  * Expects runqueue lock to be held for atomicity of update
 571  */
 572 static inline void
 573 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
 574 {
 575         if (rq)
 576                 rq->rq_sched_info.cpu_time += delta_jiffies;
 577 }
  578 # define schedstat_inc(rq, field)       do { (rq)->field++; } while (0)
  579 # define schedstat_add(rq, field, amt)  do { (rq)->field += (amt); } while (0)
      /* CONFIG_SCHEDSTATS: the two macros above increment / add to (rq)->field. */
  580 #else /* !CONFIG_SCHEDSTATS */
      /*
       * Without CONFIG_SCHEDSTATS the runqueue accounting helpers have empty
       * bodies and the counter macros expand to empty statements.
       */
  581 static inline void
  582 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
  583 {}
  584 static inline void
  585 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
  586 {}
  587 # define schedstat_inc(rq, field)       do { } while (0)
  588 # define schedstat_add(rq, field, amt)  do { } while (0)
  589 #endif
 590
  591 /*
  592  * this_rq_lock - lock this runqueue and disable interrupts.
       *
       * Interrupts are disabled first; only then is the executing CPU's
       * runqueue looked up and its lock taken.  Returns the locked runqueue.
  593  */
  594 static inline struct rq *this_rq_lock(void)
  595         __acquires(rq->lock)
  596 {
  597         struct rq *rq;
  598
  599         local_irq_disable();
  600         rq = this_rq();
  601         spin_lock(&rq->lock);
  602
  603         return rq;
  604 }
 605
 606 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
  607 /*
  608  * Called when a process is dequeued from the active array and given
  609  * the cpu.  We should note that with the exception of interactive
  610  * tasks, the expired queue will become the active queue after the active
  611  * queue is empty, without explicitly dequeuing and requeuing tasks in the
  612  * expired queue.  (Interactive tasks may be requeued directly to the
  613  * active queue, thus delaying tasks in the expired queue from running;
  614  * see scheduler_tick()).
  615  *
  616  * This function is only called from sched_info_arrive(), rather than
  617  * dequeue_task(). Even though a task may be queued and dequeued multiple
  618  * times as it is shuffled about, we're really interested in knowing how
  619  * long it was from the *first* time it was queued to the time that it
  620  * finally hit a cpu.
       *
       * Clears t->sched_info.last_queued, the stamp that sched_info_queued()
       * sets only while it is zero.
  621  */
  622 static inline void sched_info_dequeued(struct task_struct *t)
  623 {
  624         t->sched_info.last_queued = 0;
  625 }
 626
 627 /*
 628  * Called when a task finally hits the cpu.  We can now calculate how
 629  * long it was waiting to run.  We also note when it began so that we
 630  * can keep stats on how long its timeslice is.
 631  */
 632 static void sched_info_arrive(struct task_struct *t)
 633 {
 634         unsigned long now = jiffies, delta_jiffies = 0;
 635
 636         if (t->sched_info.last_queued)
 637                 delta_jiffies = now - t->sched_info.last_queued;
 638         sched_info_dequeued(t);
 639         t->sched_info.run_delay += delta_jiffies;
 640         t->sched_info.last_arrival = now;
 641         t->sched_info.pcnt++;
 642
 643         rq_sched_info_arrive(task_rq(t), delta_jiffies);
 644 }
 645
 646 /*
 647  * Called when a process is queued into either the active or expired
 648  * array.  The time is noted and later used to determine how long we
 649  * had to wait for us to reach the cpu.  Since the expired queue will
 650  * become the active queue after active queue is empty, without dequeuing
 651  * and requeuing any tasks, we are interested in queuing to either. It
 652  * is unusual but not impossible for tasks to be dequeued and immediately
 653  * requeued in the same or another array: this can happen in sched_yield(),
 654  * set_user_nice(), and even load_balance() as it moves tasks from runqueue
 655  * to runqueue.
 656  *
 657  * This function is only called from enqueue_task(), but also only updates
 658  * the timestamp if it is already not set.  It's assumed that
 659  * sched_info_dequeued() will clear that stamp when appropriate.
 660  */
 661 static inline void sched_info_queued(struct task_struct *t)
 662 {
 663         if (unlikely(sched_info_on()))
 664                 if (!t->sched_info.last_queued)
 665                         t->sched_info.last_queued = jiffies;
 666 }
 667
 668 /*
 669  * Called when a process ceases being the active-running process, either
 670  * voluntarily or involuntarily.  Now we can calculate how long we ran.
 671  */
 672 static inline void sched_info_depart(struct task_struct *t)
 673 {
 674         unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
 675
 676         t->sched_info.cpu_time += delta_jiffies;
 677         rq_sched_info_depart(task_rq(t), delta_jiffies);
 678 }
 679
 680 /*
 681  * Called when tasks are switched involuntarily due, typically, to expiring
 682  * their time slice.  (This may also be called when switching to or from
 683  * the idle task.)  We are only called when prev != next.
 684  */
 685 static inline void
 686 __sched_info_switch(struct task_struct *prev, struct task_struct *next)
 687 {
 688         struct rq *rq = task_rq(prev);
 689
 690         /*
 691          * prev now departs the cpu.  It's not interesting to record
 692          * stats about how efficient we were at scheduling the idle
 693          * process, however.
 694          */
 695         if (prev != rq->idle)
 696                 sched_info_depart(prev);
 697
 698         if (next != rq->idle)
 699                 sched_info_arrive(next);
 700 }
 701 static inline void
 702 sched_info_switch(struct task_struct *prev, struct task_struct *next)
 703 {
 704         if (unlikely(sched_info_on()))
 705                 __sched_info_switch(prev, next);
 706 }
  707 #else
      /* Neither CONFIG_SCHEDSTATS nor CONFIG_TASK_DELAY_ACCT: no-op stubs. */
  708 #define sched_info_queued(t)            do { } while (0)
  709 #define sched_info_switch(t, next)      do { } while (0)
 710 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 711
 712 /*
 713  * Adding/removing a task to/from a priority array:
 714  */
 715 static void dequeue_task(struct task_struct *p, struct prio_array *array)
 716 {
 717         array->nr_active--;
 718         list_del(&p->run_list);
 719         if (list_empty(array->queue + p->prio))
 720                 __clear_bit(p->prio, array->bitmap);
 721 }
 722
 723 static void enqueue_task(struct task_struct *p, struct prio_array *array)
 724 {
 725         sched_info_queued(p);
 726         list_add_tail(&p->run_list, array->queue + p->prio);
 727         __set_bit(p->prio, array->bitmap);
 728         array->nr_active++;
 729         p->array = array;
 730 }
 731
 732 /*
 733  * Put task to the end of the run list without the overhead of dequeue
 734  * followed by enqueue.
 735  */
 736 static void requeue_task(struct task_struct *p, struct prio_array *array)
 737 {
 738         list_move_tail(&p->run_list, array->queue + p->prio);
 739 }
 740
 741 static inline void
 742 enqueue_task_head(struct task_struct *p, struct prio_array *array)
 743 {
 744         list_add(&p->run_list, array->queue + p->prio);
 745         __set_bit(p->prio, array->bitmap);
 746         array->nr_active++;
 747         p->array = array;
 748 }
 749
 750 /*
 751  * __normal_prio - return the priority that is based on the static
 752  * priority but is modified by bonuses/penalties.
 753  *
 754  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
 755  * into the -5 ... 0 ... +5 bonus/penalty range.
 756  *
 757  * We use 25% of the full 0...39 priority range so that:
 758  *
 759  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
 760  * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
 761  *
 762  * Both properties are important to certain workloads.
 763  */
 764
 765 static inline int __normal_prio(struct task_struct *p)
 766 {
 767         int bonus, prio;
 768
 769         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 770
 771         prio = p->static_prio - bonus;
 772         if (prio < MAX_RT_PRIO)
 773                 prio = MAX_RT_PRIO;
 774         if (prio > MAX_PRIO-1)
 775                 prio = MAX_PRIO-1;
 776         return prio;
 777 }
 778
 779 /*
 780  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 781  * of tasks with abnormal "nice" values across CPUs the contribution that
 782  * each task makes to its run queue's load is weighted according to its
 783  * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
 784  * scaled version of the new time slice allocation that they receive on time
 785  * slice expiry etc.
 786  */
 787
  788 /*
  789  * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
  790  * If static_prio_timeslice() is ever changed to break this assumption then
  791  * this code will need modification
       *
       * LOAD_WEIGHT(lp):           lp scaled by SCHED_LOAD_SCALE relative to
       *                            the nice-0 timeslice.
       * PRIO_TO_LOAD_WEIGHT(prio): LOAD_WEIGHT of the timeslice for a static
       *                            priority.
       * RTPRIO_TO_LOAD_WEIGHT(rp): the weight of static priority MAX_RT_PRIO
       *                            plus LOAD_WEIGHT(rp).
  792  */
  793 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
  794 #define LOAD_WEIGHT(lp) \
  795         (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
  796 #define PRIO_TO_LOAD_WEIGHT(prio) \
  797         LOAD_WEIGHT(static_prio_timeslice(prio))
  798 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
  799         (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
 800
 801 static void set_load_weight(struct task_struct *p)
 802 {
 803         if (has_rt_policy(p)) {
 804 #ifdef CONFIG_SMP
 805                 if (p == task_rq(p)->migration_thread)
 806                         /*
 807                          * The migration thread does the actual balancing.
 808                          * Giving its load any weight will skew balancing
 809                          * adversely.
 810                          */
 811                         p->load_weight = 0;
 812                 else
 813 #endif
 814                         p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
 815         } else
 816                 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
 817 }
 818
 819 static inline void
 820 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
 821 {
 822         rq->raw_weighted_load += p->load_weight;
 823 }
 824
 825 static inline void
 826 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
 827 {
 828         rq->raw_weighted_load -= p->load_weight;
 829 }
 830
 831 static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
 832 {
 833         rq->nr_running++;
 834         inc_raw_weighted_load(rq, p);
 835 }
 836
 837 static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
 838 {
 839         rq->nr_running--;
 840         dec_raw_weighted_load(rq, p);
 841 }
 842
 843 /*
 844  * Calculate the expected normal priority: i.e. priority
 845  * without taking RT-inheritance into account. Might be
 846  * boosted by interactivity modifiers. Changes upon fork,
 847  * setprio syscalls, and whenever the interactivity
 848  * estimator recalculates.
 849  */
 850 static inline int normal_prio(struct task_struct *p)
 851 {
 852         int prio;
 853
 854         if (has_rt_policy(p))
 855                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 856         else
 857                 prio = __normal_prio(p);
 858         return prio;
 859 }
 860
 861 /*
 862  * Calculate the current priority, i.e. the priority
 863  * taken into account by the scheduler. This value might
 864  * be boosted by RT tasks, or might be boosted by
 865  * interactivity modifiers. Will be RT if the task got
 866  * RT-boosted. If not then it returns p->normal_prio.
 867  */
 868 static int effective_prio(struct task_struct *p)
 869 {
 870         p->normal_prio = normal_prio(p);
 871         /*
 872          * If we are RT tasks or we were boosted to RT priority,
 873          * keep the priority unchanged. Otherwise, update priority
 874          * to the normal priority:
 875          */
 876         if (!rt_prio(p->prio))
 877                 return p->normal_prio;
 878         return p->prio;
 879 }
 880
 881 /*
 882  * __activate_task - move a task to the runqueue.
 883  */
 884 static void __activate_task(struct task_struct *p, struct rq *rq)
 885 {
 886         struct prio_array *target = rq->active;
 887
 888         if (batch_task(p))
 889                 target = rq->expired;
 890         enqueue_task(p, target);
 891         inc_nr_running(p, rq);
 892 }
 893
 894 /*
 895  * __activate_idle_task - move idle task to the _front_ of runqueue.
 896  */
 897 static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
 898 {
 899         enqueue_task_head(p, rq->active);
 900         inc_nr_running(p, rq);
 901 }
 902
/*
 * recalc_task_prio - update the sleep average after a sleep and compute
 * the resulting priority.
 * @p:   the task that has been sleeping
 * @now: current timestamp, in the same units as p->timestamp
 *
 * Credits the time elapsed since p->timestamp to p->sleep_avg, subject to
 * the limits described below and capped at NS_MAX_SLEEP_AVG. Batch tasks
 * get no credit at all.
 *
 * Returns effective_prio(p), which also refreshes p->normal_prio. Note
 * that p->prio is NOT written here; the caller assigns the return value.
 */
static int recalc_task_prio(struct task_struct *p, unsigned long long now)
{
	/* Caller must always ensure 'now >= p->timestamp' */
	/*
	 * NOTE(review): the difference is stored in an unsigned long, so it
	 * is narrowed wherever long is smaller than the operands - confirm
	 * that truncation of very long sleeps is acceptable.
	 */
	unsigned long sleep_time = now - p->timestamp;

	if (batch_task(p))
		sleep_time = 0;

	if (likely(sleep_time > 0)) {
		/*
		 * This ceiling is set to the lowest priority that would allow
		 * a task to be reinserted into the active array on timeslice
		 * completion.
		 */
		unsigned long ceiling = INTERACTIVE_SLEEP(p);

		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
			/*
			 * Prevents user tasks from achieving best priority
			 * with one single large enough sleep.
			 */
			p->sleep_avg = ceiling;
			/*
			 * Using INTERACTIVE_SLEEP() as a ceiling places a
			 * nice(0) task 1ms sleep away from promotion, and
			 * gives it 700ms to round-robin with no chance of
			 * being demoted.  This is more than generous, so
			 * mark this sleep as non-interactive to prevent the
			 * on-runqueue bonus logic from intervening should
			 * this task not receive cpu immediately.
			 */
			p->sleep_type = SLEEP_NONINTERACTIVE;
		} else {
			/*
			 * Tasks waking from uninterruptible sleep are
			 * limited in their sleep_avg rise as they
			 * are likely to be waiting on I/O
			 */
			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
				/* Already at/above the ceiling: no credit. */
				if (p->sleep_avg >= ceiling)
					sleep_time = 0;
				/* Would cross the ceiling: clamp to it. */
				else if (p->sleep_avg + sleep_time >=
					 ceiling) {
						p->sleep_avg = ceiling;
						sleep_time = 0;
				}
			}

			/*
			 * This code gives a bonus to interactive tasks.
			 *
			 * The boost works by updating the 'average sleep time'
			 * value here, based on ->timestamp. The more time a
			 * task spends sleeping, the higher the average gets -
			 * and the higher the priority boost gets as well.
			 */
			p->sleep_avg += sleep_time;

		}
		/* Global upper bound on the sleep average. */
		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
			p->sleep_avg = NS_MAX_SLEEP_AVG;
	}

	return effective_prio(p);
}
 972
/*
 * activate_task - move a task to the runqueue and do priority recalculation
 * @p:     the task to activate
 * @rq:    the runqueue @p is being placed on
 * @local: non-zero when @rq belongs to the CPU executing this code
 *         (try_to_wake_up() passes 'cpu == this_cpu')
 *
 * Update all the scheduling statistics stuff. (sleep average
 * calculation, priority modifiers, etc.)
 *
 * RT tasks skip all of the bookkeeping and are queued directly.
 */
static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
	unsigned long long now;

	/* No sleep_avg/priority recalculation for RT tasks. */
	if (rt_task(p))
		goto out;

	now = sched_clock();
#ifdef CONFIG_SMP
	if (!local) {
		/* Compensate for drifting sched_clock */
		/*
		 * Rebase 'now' from this CPU's clock onto the target
		 * runqueue's, using each runqueue's most_recent_timestamp
		 * as the reference point.
		 */
		struct rq *this_rq = this_rq();
		now = (now - this_rq->most_recent_timestamp)
			+ rq->most_recent_timestamp;
	}
#endif

	/*
	 * Sleep time is in units of nanosecs, so shift by 20 to get a
	 * milliseconds-range estimation of the amount of time that the task
	 * spent sleeping:
	 */
	if (unlikely(prof_on == SLEEP_PROFILING)) {
		if (p->state == TASK_UNINTERRUPTIBLE)
			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
				     (now - p->timestamp) >> 20);
	}

	p->prio = recalc_task_prio(p, now);

	/*
	 * This checks to make sure it's not an uninterruptible task
	 * that is now waking up.
	 */
	if (p->sleep_type == SLEEP_NORMAL) {
		/*
		 * Tasks which were woken up by interrupts (ie. hw events)
		 * are most likely of interactive nature. So we give them
		 * the credit of extending their sleep time to the period
		 * of time they spend on the runqueue, waiting for execution
		 * on a CPU, first time around:
		 */
		if (in_interrupt())
			p->sleep_type = SLEEP_INTERRUPTED;
		else {
			/*
			 * Normal first-time wakeups get a credit too for
			 * on-runqueue time, but it will be weighted down:
			 */
			p->sleep_type = SLEEP_INTERACTIVE;
		}
	}
	/* Record the (possibly rebased) activation time. */
	p->timestamp = now;
out:
	__activate_task(p, rq);
}
1035
1036 /*
1037  * deactivate_task - remove a task from the runqueue.
1038  */
1039 static void deactivate_task(struct task_struct *p, struct rq *rq)
1040 {
1041         dec_nr_running(p, rq);
1042         dequeue_task(p, p->array);
1043         p->array = NULL;
1044 }
1045
1046 /*
1047  * resched_task - mark a task 'to be rescheduled now'.
1048  *
1049  * On UP this means the setting of the need_resched flag, on SMP it
1050  * might also involve a cross-CPU call to trigger the scheduler on
1051  * the target CPU.
1052  */
1053 #ifdef CONFIG_SMP
1054
1055 #ifndef tsk_is_polling
1056 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1057 #endif
1058
/*
 * SMP variant of resched_task(): flag @p as needing a reschedule and, if
 * it lives on another CPU that is not polling the flag, send that CPU a
 * reschedule IPI. The lock of @p's runqueue must be held by the caller.
 */
static void resched_task(struct task_struct *p)
{
	int cpu;

	assert_spin_locked(&task_rq(p)->lock);

	/* Already flagged: nothing more to do. */
	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
		return;

	set_tsk_thread_flag(p, TIF_NEED_RESCHED);

	/* A task on this CPU needs no cross-CPU kick. */
	cpu = task_cpu(p);
	if (cpu == smp_processor_id())
		return;

	/* NEED_RESCHED must be visible before we test polling */
	smp_mb();
	/* A polling task is expected to notice the flag by itself. */
	if (!tsk_is_polling(p))
		smp_send_reschedule(cpu);
}
1079
1080 static void resched_cpu(int cpu)
1081 {
1082         struct rq *rq = cpu_rq(cpu);
1083         unsigned long flags;
1084
1085         if (!spin_trylock_irqsave(&rq->lock, flags))
1086                 return;
1087         resched_task(cpu_curr(cpu));
1088         spin_unlock_irqrestore(&rq->lock, flags);
1089 }
1090 #else
1091 static inline void resched_task(struct task_struct *p)
1092 {
1093         assert_spin_locked(&task_rq(p)->lock);
1094         set_tsk_need_resched(p);
1095 }
1096 #endif
1097
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Returns non-zero when @p is the current task of the CPU it is
 * assigned to, zero otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return p == cpu_curr(cpu);
}
1106
1107 /* Used instead of source_load when we know the type == 0 */
1108 unsigned long weighted_cpuload(const int cpu)
1109 {
1110         return cpu_rq(cpu)->raw_weighted_load;
1111 }
1112
1113 #ifdef CONFIG_SMP
1114
1115 void set_task_cpu(struct task_struct *p, unsigned int cpu)
1116 {
1117         task_thread_info(p)->cpu = cpu;
1118 }
1119
/*
 * A request to move a task to another CPU, filled in and queued by
 * migrate_task().
 */
struct migration_req {
	/* Link on the source runqueue's migration_queue. */
	struct list_head list;

	/* The task to be moved and the CPU it should end up on. */
	struct task_struct *task;
	int dest_cpu;

	/* Initialised when the request is queued; presumably completed
	 * by whoever services the queue - confirm against the consumer. */
	struct completion done;
};
1128
1129 /*
1130  * The task's runqueue lock must be held.
1131  * Returns true if you have to wait for migration thread.
1132  */
1133 static int
1134 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1135 {
1136         struct rq *rq = task_rq(p);
1137
1138         /*
1139          * If the task is not on a runqueue (and not running), then
1140          * it is sufficient to simply update the task's cpu field.
1141          */
1142         if (!p->array && !task_running(rq, p)) {
1143                 set_task_cpu(p, dest_cpu);
1144                 return 0;
1145         }
1146
1147         init_completion(&req->done);
1148         req->task = p;
1149         req->dest_cpu = dest_cpu;
1150         list_add(&req->list, &rq->migration_queue);
1151
1152         return 1;
1153 }
1154
1155 /*
1156  * wait_task_inactive - wait for a thread to unschedule.
1157  *
1158  * The caller must ensure that the task *will* unschedule sometime soon,
1159  * else this function might spin for a *long* time. This function can't
1160  * be called with interrupts off, or it may introduce deadlock with
1161  * smp_call_function() if an IPI is sent by the same process we are
1162  * waiting to become inactive.
1163  */
1164 void wait_task_inactive(struct task_struct *p)
1165 {
1166         unsigned long flags;
1167         struct rq *rq;
1168         struct prio_array *array;
1169         int running;
1170
1171 repeat:
1172         /*
1173          * We do the initial early heuristics without holding
1174          * any task-queue locks at all. We'll only try to get
1175          * the runqueue lock when things look like they will
1176          * work out!
1177          */
1178         rq = task_rq(p);
1179
1180         /*
1181          * If the task is actively running on another CPU
1182          * still, just relax and busy-wait without holding
1183          * any locks.
1184          *
1185          * NOTE! Since we don't hold any locks, it's not
1186          * even sure that "rq" stays as the right runqueue!
1187          * But we don't care, since "task_running()" will
1188          * return false if the runqueue has changed and p
1189          * is actually now running somewhere else!
1190          */
1191         while (task_running(rq, p))
1192                 cpu_relax();
1193
1194         /*
1195          * Ok, time to look more closely! We need the rq
1196          * lock now, to be *sure*. If we're wrong, we'll
1197          * just go back and repeat.
1198          */
1199         rq = task_rq_lock(p, &flags);
1200         running = task_running(rq, p);
1201         array = p->array;
1202         task_rq_unlock(rq, &flags);
1203
1204         /*
1205          * Was it really running after all now that we
1206          * checked with the proper locks actually held?
1207          *
1208          * Oops. Go back and try again..
1209          */
1210         if (unlikely(running)) {
1211                 cpu_relax();
1212                 goto repeat;
1213         }
1214
1215         /*
1216          * It's not enough that it's not actively running,
1217          * it must be off the runqueue _entirely_, and not
1218          * preempted!
1219          *
1220          * So if it wa still runnable (but just not actively
1221          * running right now), it's preempted, and we should
1222          * yield - it could be a while.
1223          */
1224         if (unlikely(array)) {
1225                 yield();
1226                 goto repeat;
1227         }
1228
1229         /*
1230          * Ahh, all good. It wasn't running, and it wasn't
1231          * runnable, which means that it will never become
1232          * running in the future either. We're all done!
1233          */
1234 }
1235
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu, remote;

	preempt_disable();

	cpu = task_cpu(p);
	remote = (cpu != smp_processor_id());

	/* Only a task that is executing on some other CPU needs the IPI. */
	if (remote && task_curr(p))
		smp_send_reschedule(cpu);

	preempt_enable();
}
1259
1260 /*
1261  * Return a low guess at the load of a migration-source cpu weighted
1262  * according to the scheduling class and "nice" value.
1263  *
1264  * We want to under-estimate the load of migration sources, to
1265  * balance conservatively.
1266  */
1267 static inline unsigned long source_load(int cpu, int type)
1268 {
1269         struct rq *rq = cpu_rq(cpu);
1270
1271         if (type == 0)
1272                 return rq->raw_weighted_load;
1273
1274         return min(rq->cpu_load[type-1], rq->raw_weighted_load);
1275 }
1276
1277 /*
1278  * Return a high guess at the load of a migration-target cpu weighted
1279  * according to the scheduling class and "nice" value.
1280  */
1281 static inline unsigned long target_load(int cpu, int type)
1282 {
1283         struct rq *rq = cpu_rq(cpu);
1284
1285         if (type == 0)
1286                 return rq->raw_weighted_load;
1287
1288         return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1289 }
1290
1291 /*
1292  * Return the average load per task on the cpu's run queue
1293  */
1294 static inline unsigned long cpu_avg_load_per_task(int cpu)
1295 {
1296         struct rq *rq = cpu_rq(cpu);
1297         unsigned long n = rq->nr_running;
1298
1299         return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
1300 }
1301
/*
 * find_idlest_group finds and returns the least busy CPU group within the
 * domain.
 *
 * @sd:       the domain whose groups are examined
 * @p:        the task being placed; groups with no CPU in p->cpus_allowed
 *            are ignored
 * @this_cpu: the group containing this CPU is treated as the local group
 *
 * Returns the least loaded non-local group, or NULL when there is no such
 * group or when the local group's load is not sufficiently higher than it
 * (see the imbalance check at the end).
 */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
	unsigned long min_load = ULONG_MAX, this_load = 0;
	int load_idx = sd->forkexec_idx;
	/* Use half of the domain's configured imbalance percentage. */
	int imbalance = 100 + (sd->imbalance_pct-100)/2;

	/* The groups form a circular list; walk it exactly once. */
	do {
		unsigned long load, avg_load;
		int local_group;
		int i;

		/* Skip over this group if it has no CPUs allowed */
		/*
		 * (A goto is used instead of 'continue' because 'continue'
		 * in a do-while jumps to the condition without advancing
		 * 'group'.)
		 */
		if (!cpus_intersects(group->cpumask, p->cpus_allowed))
			goto nextgroup;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;

		for_each_cpu_mask(i, group->cpumask) {
			/* Bias balancing toward cpus of our domain */
			/* low estimate for the local group, high for others */
			if (local_group)
				load = source_load(i, load_idx);
			else
				load = target_load(i, load_idx);

			avg_load += load;
		}

		/* Adjust by relative CPU power of the group */
		avg_load = sg_div_cpu_power(group,
				avg_load * SCHED_LOAD_SCALE);

		if (local_group) {
			this_load = avg_load;
			this = group;
		} else if (avg_load < min_load) {
			min_load = avg_load;
			idlest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	/*
	 * Stay local unless a remote group was found and the local load
	 * exceeds its load by at least the imbalance margin.
	 */
	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;
	return idlest;
}
1357
1358 /*
1359  * find_idlest_cpu - find the idlest cpu among the cpus in group.
1360  */
1361 static int
1362 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1363 {
1364         cpumask_t tmp;
1365         unsigned long load, min_load = ULONG_MAX;
1366         int idlest = -1;
1367         int i;
1368
1369         /* Traverse only the allowed CPUs */
1370         cpus_and(tmp, group->cpumask, p->cpus_allowed);
1371
1372         for_each_cpu_mask(i, tmp) {
1373                 load = weighted_cpuload(i);
1374
1375                 if (load < min_load || (load == min_load && i == this_cpu)) {
1376                         min_load = load;
1377                         idlest = i;
1378                 }
1379         }
1380
1381         return idlest;
1382 }
1383
/*
 * sched_balance_self: balance the current task (running on cpu) in domains
 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
 * SD_BALANCE_EXEC.
 *
 * Balance, ie. select the least loaded group.
 *
 * Returns the target CPU number, or the same CPU if no balancing is needed.
 *
 * preempt must be disabled.
 */
static int sched_balance_self(int cpu, int flag)
{
	struct task_struct *t = current;
	struct sched_domain *tmp, *sd = NULL;

	/*
	 * Pick the last domain in the walk that has 'flag' set; this is
	 * where the search below starts.
	 */
	for_each_domain(cpu, tmp) {
		/*
		 * If power savings logic is enabled for a domain, stop there.
		 */
		if (tmp->flags & SD_POWERSAVINGS_BALANCE)
			break;
		if (tmp->flags & flag)
			sd = tmp;
	}

	/* Descend from that domain, re-picking the CPU at each level. */
	while (sd) {
		cpumask_t span;
		struct sched_group *group;
		int new_cpu, weight;

		/* Domains without 'flag' are simply stepped over. */
		if (!(sd->flags & flag)) {
			sd = sd->child;
			continue;
		}

		span = sd->span;
		group = find_idlest_group(sd, t, cpu);
		/* No better group at this level: try the child domain. */
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = find_idlest_cpu(group, t, cpu);
		if (new_cpu == -1 || new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		sd = NULL;
		weight = cpus_weight(span);
		/*
		 * Among new_cpu's domains that span fewer CPUs than the
		 * one just balanced, pick the last one with 'flag' set.
		 */
		for_each_domain(cpu, tmp) {
			if (weight <= cpus_weight(tmp->span))
				break;
			if (tmp->flags & flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return cpu;
}
1449
1450 #endif /* CONFIG_SMP */
1451
1452 /*
1453  * wake_idle() will wake a task on an idle cpu if task->cpu is
1454  * not idle and an idle cpu is available.  The span of cpus to
1455  * search starts with cpus closest then further out as needed,
1456  * so we always favor a closer, idle cpu.
1457  *
1458  * Returns the CPU we should wake onto.
1459  */
1460 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1461 static int wake_idle(int cpu, struct task_struct *p)
1462 {
1463         cpumask_t tmp;
1464         struct sched_domain *sd;
1465         int i;
1466
1467         /*
1468          * If it is idle, then it is the best cpu to run this task.
1469          *
1470          * This cpu is also the best, if it has more than one task already.
1471          * Siblings must be also busy(in most cases) as they didn't already
1472          * pickup the extra load from this cpu and hence we need not check
1473          * sibling runqueue info. This will avoid the checks and cache miss
1474          * penalities associated with that.
1475          */
1476         if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1477                 return cpu;
1478
1479         for_each_domain(cpu, sd) {
1480                 if (sd->flags & SD_WAKE_IDLE) {
1481                         cpus_and(tmp, sd->span, p->cpus_allowed);
1482                         for_each_cpu_mask(i, tmp) {
1483                                 if (idle_cpu(i))
1484                                         return i;
1485                         }
1486                 }
1487                 else
1488                         break;
1489         }
1490         return cpu;
1491 }
1492 #else
/*
 * Fallback used when ARCH_HAS_SCHED_WAKE_IDLE is not defined: no search
 * for an idle CPU is done and @cpu is returned unchanged.
 */
static inline int wake_idle(int cpu, struct task_struct *p)
{
	return cpu;
}
1497 #endif
1498
/**
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 *
 * More precisely: 1 is returned only when @p was put on a runqueue by
 * this call. 0 is returned when @p's state does not match @state, and
 * also when @p was already queued (in which case its state is still
 * set to TASK_RUNNING).
 */
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	struct rq *rq;
#ifdef CONFIG_SMP
	struct sched_domain *sd, *this_sd = NULL;
	unsigned long load, this_load;
	int new_cpu;
#endif

	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	/* Not in one of the states the caller wants woken. */
	if (!(old_state & state))
		goto out;

	/* Already on a runqueue: only the state needs fixing up. */
	if (p->array)
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	/* Still executing on its CPU: activate it where it is. */
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	schedstat_inc(rq, ttwu_cnt);
	if (cpu == this_cpu) {
		schedstat_inc(rq, ttwu_local);
		goto out_set_cpu;
	}

	/* Find the first domain of this CPU that also spans the task's CPU. */
	for_each_domain(this_cpu, sd) {
		if (cpu_isset(cpu, sd->span)) {
			schedstat_inc(sd, ttwu_wake_remote);
			this_sd = sd;
			break;
		}
	}

	/* The task may not run here at all: keep it on its own CPU. */
	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	/*
	 * Check for affine wakeup and passive balancing possibilities.
	 */
	if (this_sd) {
		int idx = this_sd->wake_idx;
		unsigned int imbalance;

		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;

		load = source_load(cpu, idx);
		this_load = target_load(this_cpu, idx);

		new_cpu = this_cpu; /* Wake to this CPU if we can */

		if (this_sd->flags & SD_WAKE_AFFINE) {
			unsigned long tl = this_load;
			unsigned long tl_per_task;

			tl_per_task = cpu_avg_load_per_task(this_cpu);

			/*
			 * If sync wakeup then subtract the (maximum possible)
			 * effect of the currently running task from the load
			 * of the current CPU:
			 */
			/*
			 * NOTE(review): tl is unsigned; this presumably relies
			 * on current's weight being included in this CPU's
			 * load so the subtraction cannot wrap - confirm.
			 */
			if (sync)
				tl -= current->load_weight;

			if ((tl <= load &&
				tl + target_load(cpu, idx) <= tl_per_task) ||
				100*(tl + p->load_weight) <= imbalance*load) {
				/*
				 * This domain has SD_WAKE_AFFINE and
				 * p is cache cold in this domain, and
				 * there is no bad imbalance.
				 */
				schedstat_inc(this_sd, ttwu_move_affine);
				goto out_set_cpu;
			}
		}

		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		if (this_sd->flags & SD_WAKE_BALANCE) {
			if (imbalance*this_load <= 100*load) {
				schedstat_inc(this_sd, ttwu_move_balance);
				goto out_set_cpu;
			}
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	new_cpu = wake_idle(new_cpu, p);
	if (new_cpu != cpu) {
		/*
		 * Retarget the task, then switch over to the lock of its new
		 * runqueue. The lock is dropped in between, so everything
		 * checked above has to be checked again.
		 */
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
		/* might preempt at this point */
		rq = task_rq_lock(p, &flags);
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (p->array)
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

out_activate:
#endif /* CONFIG_SMP */
	if (old_state == TASK_UNINTERRUPTIBLE) {
		rq->nr_uninterruptible--;
		/*
		 * Tasks on involuntary sleep don't earn
		 * sleep_avg beyond just interactive state.
		 */
		p->sleep_type = SLEEP_NONINTERACTIVE;
	} else

	/*
	 * Tasks that have marked their sleep as noninteractive get
	 * woken up with their sleep average not weighted in an
	 * interactive way.
	 */
		if (old_state & TASK_NONINTERACTIVE)
			p->sleep_type = SLEEP_NONINTERACTIVE;


	/* The last argument tells activate_task() whether rq is local. */
	activate_task(p, rq, cpu == this_cpu);
	/*
	 * Sync wakeups (i.e. those types of wakeups where the waker
	 * has indicated that it will leave the CPU in short order)
	 * don't trigger a preemption, if the woken up task will run on
	 * this cpu. (in this case the 'I will reschedule' promise of
	 * the waker guarantees that the freshly woken up task is going
	 * to be considered on this CPU.)
	 */
	if (!sync || cpu != this_cpu) {
		if (TASK_PREEMPTS_CURR(p, rq))
			resched_task(rq->curr);
	}
	success = 1;

out_running:
	p->state = TASK_RUNNING;
out:
	task_rq_unlock(rq, &flags);

	return success;
}
1672
1673 int fastcall wake_up_process(struct task_struct *p)
1674 {
1675         return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1676                                  TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1677 }
1678 EXPORT_SYMBOL(wake_up_process);
1679
1680 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1681 {
1682         return try_to_wake_up(p, state, 0);
1683 }
1684
1685 static void task_running_tick(struct rq *rq, struct task_struct *p);
1686 /*
1687  * Perform scheduler related setup for a newly forked process p.
1688  * p is forked by current.
1689  */
1690 void fastcall sched_fork(struct task_struct *p, int clone_flags)
1691 {
1692         int cpu = get_cpu();
1693
1694 #ifdef CONFIG_SMP
1695         cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1696 #endif
1697         set_task_cpu(p, cpu);
1698
1699         /*
1700          * We mark the process as running here, but have not actually
1701          * inserted it onto the runqueue yet. This guarantees that
1702          * nobody will actually run it, and a signal or other external
1703          * event cannot wake it up and insert it on the runqueue either.
1704          */
1705         p->state = TASK_RUNNING;
1706
1707         /*
1708          * Make sure we do not leak PI boosting priority to the child:
1709          */
1710         p->prio = current->normal_prio;
1711
1712         INIT_LIST_HEAD(&p->run_list);
1713         p->array = NULL;
1714 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1715         if (unlikely(sched_info_on()))
1716                 memset(&p->sched_info, 0, sizeof(p->sched_info));
1717 #endif
1718 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1719         p->oncpu = 0;
1720 #endif
1721 #ifdef CONFIG_PREEMPT
1722         /* Want to start with kernel preemption disabled. */
1723         task_thread_info(p)->preempt_count = 1;
1724 #endif
1725         /*
1726          * Share the timeslice between parent and child, thus the
1727          * total amount of pending timeslices in the system doesn't change,
1728          * resulting in more scheduling fairness.
1729          */
1730         local_irq_disable();
1731         p->time_slice = (current->time_slice + 1) >> 1;
1732         /*
1733          * The remainder of the first timeslice might be recovered by
1734          * the parent if the child exits early enough.
1735          */
1736         p->first_time_slice = 1;
1737         current->time_slice >>= 1;
1738         p->timestamp = sched_clock();
1739         if (unlikely(!current->time_slice)) {
1740                 /*
1741                  * This case is rare, it happens when the parent has only
1742                  * a single jiffy left from its timeslice. Taking the
1743                  * runqueue lock is not a problem.
1744                  */
1745                 current->time_slice = 1;
1746                 task_running_tick(cpu_rq(cpu), current);
1747         }
1748         local_irq_enable();
1749         put_cpu();
1750 }
1751
1752 /*
1753  * wake_up_new_task - wake up a newly created task for the first time.
1754  *
1755  * This function will do some initial scheduler statistics housekeeping
1756  * that must be done for every newly created context, then puts the task
1757  * on the runqueue and wakes it.
1758  */
1759 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1760 {
1761         struct rq *rq, *this_rq;
1762         unsigned long flags;
1763         int this_cpu, cpu;
1764
1765         rq = task_rq_lock(p, &flags);
1766         BUG_ON(p->state != TASK_RUNNING);
1767         this_cpu = smp_processor_id();
1768         cpu = task_cpu(p);
1769
1770         /*
1771          * We decrease the sleep average of forking parents
1772          * and children as well, to keep max-interactive tasks
1773          * from forking tasks that are max-interactive. The parent
1774          * (current) is done further down, under its lock.
1775          */
1776         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1777                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1778
1779         p->prio = effective_prio(p);
1780
1781         if (likely(cpu == this_cpu)) {
1782                 if (!(clone_flags & CLONE_VM)) {
1783                         /*
1784                          * The VM isn't cloned, so we're in a good position to
1785                          * do child-runs-first in anticipation of an exec. This
1786                          * usually avoids a lot of COW overhead.
1787                          */
1788                         if (unlikely(!current->array))
1789                                 __activate_task(p, rq);
1790                         else {
1791                                 p->prio = current->prio;
1792                                 p->normal_prio = current->normal_prio;
1793                                 list_add_tail(&p->run_list, &current->run_list);
1794                                 p->array = current->array;
1795                                 p->array->nr_active++;
1796                                 inc_nr_running(p, rq);
1797                         }
1798                         set_need_resched();
1799                 } else
1800                         /* Run child last */
1801                         __activate_task(p, rq);
1802                 /*
1803                  * We skip the following code due to cpu == this_cpu
1804                  *
1805                  *   task_rq_unlock(rq, &flags);
1806                  *   this_rq = task_rq_lock(current, &flags);
1807                  */
1808                 this_rq = rq;
1809         } else {
1810                 this_rq = cpu_rq(this_cpu);
1811
1812                 /*
1813                  * Not the local CPU - must adjust timestamp. This should
1814                  * get optimised away in the !CONFIG_SMP case.
1815                  */
1816                 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1817                                         + rq->most_recent_timestamp;
1818                 __activate_task(p, rq);
1819                 if (TASK_PREEMPTS_CURR(p, rq))
1820                         resched_task(rq->curr);
1821
1822                 /*
1823                  * Parent and child are on different CPUs, now get the
1824                  * parent runqueue to update the parent's ->sleep_avg:
1825                  */
1826                 task_rq_unlock(rq, &flags);
1827                 this_rq = task_rq_lock(current, &flags);
1828         }
1829         current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1830                 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1831         task_rq_unlock(this_rq, &flags);
1832 }
1833
/**
 * prepare_task_switch - get ready for an upcoming task switch
 * @rq: the runqueue on which the switch is about to happen
 * @next: the task that will run next.
 *
 * Must be called with the rq lock held and interrupts disabled, and
 * every call has to be matched by a finish_task_switch once the
 * context switch is done.
 *
 * The work is delegated to two hooks: prepare_lock_switch for the
 * locking side, prepare_arch_switch for the architecture side.
 */
static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
	/* Locking setup first, then the architecture hook. */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
1851
1852 /**
1853  * finish_task_switch - clean up after a task-switch
1854  * @rq: runqueue associated with task-switch
1855  * @prev: the thread we just switched away from.
1856  *
1857  * finish_task_switch must be called after the context switch, paired
1858  * with a prepare_task_switch call before the context switch.
1859  * finish_task_switch will reconcile locking set up by prepare_task_switch,
1860  * and do any other architecture-specific cleanup actions.
1861  *
1862  * Note that we may have delayed dropping an mm in context_switch(). If
1863  * so, we finish that here outside of the runqueue lock.  (Doing it
1864  * with the lock held can cause deadlocks; see schedule() for
1865  * details.)
1866  */
1867 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1868         __releases(rq->lock)
1869 {
1870         struct mm_struct *mm = rq->prev_mm;
1871         long prev_state;
1872
1873         rq->prev_mm = NULL;
1874
1875         /*
1876          * A task struct has one reference for the use as "current".
1877          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1878          * schedule one last time. The schedule call will never return, and
1879          * the scheduled task must drop that reference.
1880          * The test for TASK_DEAD must occur while the runqueue locks are
1881          * still held, otherwise prev could be scheduled on another cpu, die
1882          * there before we look at prev->state, and then the reference would
1883          * be dropped twice.
1884          *              Manfred Spraul <manfred@colorfullife.com>
1885          */
1886         prev_state = prev->state;
1887         finish_arch_switch(prev);
1888         finish_lock_switch(rq, prev);
1889         if (mm)
1890                 mmdrop(mm);
1891         if (unlikely(prev_state == TASK_DEAD)) {
1892                 /*
1893                  * Remove function-return probe instances associated with this
1894                  * task and put them back on the free list.
1895                  */
1896                 kprobe_flush_task(prev);
1897                 put_task_struct(prev);
1898         }
1899 }
1900
1901 /**
1902  * schedule_tail - first thing a freshly forked thread must call.
1903  * @prev: the thread we just switched away from.
1904  */
1905 asmlinkage void schedule_tail(struct task_struct *prev)
1906         __releases(rq->lock)
1907 {
1908         struct rq *rq = this_rq();
1909
1910         finish_task_switch(rq, prev);
1911 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1912         /* In this case, finish_task_switch does not reenable preemption */
1913         preempt_enable();
1914 #endif
1915         if (current->set_child_tid)
1916                 put_user(current->pid, current->set_child_tid);
1917 }
1918
1919 /*
1920  * context_switch - switch to the new MM and the new
1921  * thread's register state.
1922  */
1923 static inline struct task_struct *
1924 context_switch(struct rq *rq, struct task_struct *prev,
1925                struct task_struct *next)
1926 {
1927         struct mm_struct *mm = next->mm;
1928         struct mm_struct *oldmm = prev->active_mm;
1929
1930         /*
1931          * For paravirt, this is coupled with an exit in switch_to to
1932          * combine the page table reload and the switch backend into
1933          * one hypercall.
1934          */
1935         arch_enter_lazy_cpu_mode();
1936
1937         if (!mm) {
1938                 next->active_mm = oldmm;
1939                 atomic_inc(&oldmm->mm_count);
1940                 enter_lazy_tlb(oldmm, next);
1941         } else
1942                 switch_mm(oldmm, mm, next);
1943
1944         if (!prev->mm) {
1945                 prev->active_mm = NULL;
1946                 WARN_ON(rq->prev_mm);
1947                 rq->prev_mm = oldmm;
1948         }
1949         /*
1950          * Since the runqueue lock will be released by the next
1951          * task (which is an invalid locking op but in the case
1952          * of the scheduler it's an obvious special-case), so we
1953          * do an early lockdep release here:
1954          */
1955 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1956         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1957 #endif
1958
1959         /* Here we just switch the register state and the stack. */
1960         switch_to(prev, next, prev);
1961
1962         return prev;
1963 }
1964
1965 /*
1966  * nr_running, nr_uninterruptible and nr_context_switches:
1967  *
1968  * externally visible scheduler statistics: current number of runnable
1969  * threads, current number of uninterruptible-sleeping threads, total
1970  * number of context switches performed since bootup.
1971  */
1972 unsigned long nr_running(void)
1973 {
1974         unsigned long i, sum = 0;
1975
1976         for_each_online_cpu(i)
1977                 sum += cpu_rq(i)->nr_running;
1978
1979         return sum;
1980 }
1981
1982 unsigned long nr_uninterruptible(void)
1983 {
1984         unsigned long i, sum = 0;
1985
1986         for_each_possible_cpu(i)
1987                 sum += cpu_rq(i)->nr_uninterruptible;
1988
1989         /*
1990          * Since we read the counters lockless, it might be slightly
1991          * inaccurate. Do not allow it to go below zero though:
1992          */
1993         if (unlikely((long)sum < 0))
1994                 sum = 0;
1995
1996         return sum;
1997 }
1998
1999 unsigned long long nr_context_switches(void)
2000 {
2001         int i;
2002         unsigned long long sum = 0;
2003
2004         for_each_possible_cpu(i)
2005                 sum += cpu_rq(i)->nr_switches;
2006
2007         return sum;
2008 }
2009
2010 unsigned long nr_iowait(void)
2011 {
2012         unsigned long i, sum = 0;
2013
2014         for_each_possible_cpu(i)
2015                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2016
2017         return sum;
2018 }
2019
2020 unsigned long nr_active(void)
2021 {
2022         unsigned long i, running = 0, uninterruptible = 0;
2023
2024         for_each_online_cpu(i) {
2025                 running += cpu_rq(i)->nr_running;
2026                 uninterruptible += cpu_rq(i)->nr_uninterruptible;
2027         }
2028
2029         if (unlikely((long)uninterruptible < 0))
2030                 uninterruptible = 0;
2031
2032         return running + uninterruptible;
2033 }
2034
2035 #ifdef CONFIG_SMP
2036
2037 /*
2038  * Is this task likely cache-hot:
2039  */
2040 static inline int
2041 task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
2042 {
2043         return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
2044 }
2045
2046 /*
2047  * double_rq_lock - safely lock two runqueues
2048  *
2049  * Note this does not disable interrupts like task_rq_lock,
2050  * you need to do so manually before calling.
2051  */
2052 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2053         __acquires(rq1->lock)
2054         __acquires(rq2->lock)
2055 {
2056         BUG_ON(!irqs_disabled());
2057         if (rq1 == rq2) {
2058                 spin_lock(&rq1->lock);
2059                 __acquire(rq2->lock);   /* Fake it out ;) */
2060         } else {
2061                 if (rq1 < rq2) {
2062                         spin_lock(&rq1->lock);
2063                         spin_lock(&rq2->lock);
2064                 } else {
2065                         spin_lock(&rq2->lock);
2066                         spin_lock(&rq1->lock);
2067                 }
2068         }
2069 }
2070
2071 /*
2072  * double_rq_unlock - safely unlock two runqueues
2073  *
2074  * Note this does not restore interrupts like task_rq_unlock,
2075  * you need to do so manually after calling.
2076  */
2077 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2078         __releases(rq1->lock)
2079         __releases(rq2->lock)
2080 {
2081         spin_unlock(&rq1->lock);
2082         if (rq1 != rq2)
2083                 spin_unlock(&rq2->lock);
2084         else
2085                 __release(rq2->lock);
2086 }
2087
2088 /*
2089  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2090  */
2091 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2092         __releases(this_rq->lock)
2093         __acquires(busiest->lock)
2094         __acquires(this_rq->lock)
2095 {
2096         if (unlikely(!irqs_disabled())) {
2097                 /* printk() doesn't work good under rq->lock */
2098                 spin_unlock(&this_rq->lock);
2099                 BUG_ON(1);
2100         }
2101         if (unlikely(!spin_trylock(&busiest->lock))) {
2102                 if (busiest < this_rq) {
2103                         spin_unlock(&this_rq->lock);
2104                         spin_lock(&busiest->lock);
2105                         spin_lock(&this_rq->lock);
2106                 } else
2107                         spin_lock(&busiest->lock);
2108         }
2109 }
2110
2111 /*
2112  * If dest_cpu is allowed for this process, migrate the task to it.
2113  * This is accomplished by forcing the cpu_allowed mask to only
2114  * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
2115  * the cpu_allowed mask is restored.
2116  */
2117 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2118 {
2119         struct migration_req req;
2120         unsigned long flags;
2121         struct rq *rq;
2122
2123         rq = task_rq_lock(p, &flags);
2124         if (!cpu_isset(dest_cpu, p->cpus_allowed)
2125             || unlikely(cpu_is_offline(dest_cpu)))
2126                 goto out;
2127
2128         /* force the process onto the specified CPU */
2129         if (migrate_task(p, dest_cpu, &req)) {
2130                 /* Need to wait for migration thread (might exit: take ref). */
2131                 struct task_struct *mt = rq->migration_thread;
2132
2133                 get_task_struct(mt);
2134                 task_rq_unlock(rq, &flags);
2135                 wake_up_process(mt);
2136                 put_task_struct(mt);
2137                 wait_for_completion(&req.done);
2138
2139                 return;
2140         }
2141 out:
2142         task_rq_unlock(rq, &flags);
2143 }
2144
2145 /*
2146  * sched_exec - execve() is a valuable balancing opportunity, because at
2147  * this point the task has the smallest effective memory and cache footprint.
2148  */
2149 void sched_exec(void)
2150 {
2151         int new_cpu, this_cpu = get_cpu();
2152         new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2153         put_cpu();
2154         if (new_cpu != this_cpu)
2155                 sched_migrate_task(current, new_cpu);
2156 }
2157
2158 /*
2159  * pull_task - move a task from a remote runqueue to the local runqueue.
2160  * Both runqueues must be locked.
2161  */
2162 static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2163                       struct task_struct *p, struct rq *this_rq,
2164                       struct prio_array *this_array, int this_cpu)
2165 {
2166         dequeue_task(p, src_array);
2167         dec_nr_running(p, src_rq);
2168         set_task_cpu(p, this_cpu);
2169         inc_nr_running(p, this_rq);
2170         enqueue_task(p, this_array);
2171         p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2172                                 + this_rq->most_recent_timestamp;
2173         /*
2174          * Note that idle threads have a prio of MAX_PRIO, for this test
2175          * to be always true for them.
2176          */
2177         if (TASK_PREEMPTS_CURR(p, this_rq))
2178                 resched_task(this_rq->curr);
2179 }
2180
2181 /*
2182  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2183  */
2184 static
2185 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2186                      struct sched_domain *sd, enum cpu_idle_type idle,
2187                      int *all_pinned)
2188 {
2189         /*
2190          * We do not migrate tasks that are:
2191          * 1) running (obviously), or
2192          * 2) cannot be migrated to this CPU due to cpus_allowed, or
2193          * 3) are cache-hot on their current CPU.
2194          */
2195         if (!cpu_isset(this_cpu, p->cpus_allowed))
2196                 return 0;
2197         *all_pinned = 0;
2198
2199         if (task_running(rq, p))
2200                 return 0;
2201
2202         /*
2203          * Aggressive migration if:
2204          * 1) task is cache cold, or
2205          * 2) too many balance attempts have failed.
2206          */
2207
2208         if (sd->nr_balance_failed > sd->cache_nice_tries) {
2209 #ifdef CONFIG_SCHEDSTATS
2210                 if (task_hot(p, rq->most_recent_timestamp, sd))
2211                         schedstat_inc(sd, lb_hot_gained[idle]);
2212 #endif
2213                 return 1;
2214         }
2215
2216         if (task_hot(p, rq->most_recent_timestamp, sd))
2217                 return 0;
2218         return 1;
2219 }
2220
2221 #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2222
2223 /*
2224  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2225  * load from busiest to this_rq, as part of a balancing operation within
2226  * "domain". Returns the number of tasks moved.
2227  *
2228  * Called with both runqueues locked.
2229  */
2230 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2231                       unsigned long max_nr_move, unsigned long max_load_move,
2232                       struct sched_domain *sd, enum cpu_idle_type idle,
2233                       int *all_pinned)
2234 {
2235         int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2236             best_prio_seen, skip_for_load;
2237         struct prio_array *array, *dst_array;
2238         struct list_head *head, *curr;
2239         struct task_struct *tmp;
2240         long rem_load_move;
2241
2242         if (max_nr_move == 0 || max_load_move == 0)
2243                 goto out;
2244
2245         rem_load_move = max_load_move;
2246         pinned = 1;
2247         this_best_prio = rq_best_prio(this_rq);
2248         best_prio = rq_best_prio(busiest);
2249         /*
2250          * Enable handling of the case where there is more than one task
2251          * with the best priority.   If the current running task is one
2252          * of those with prio==best_prio we know it won't be moved
2253          * and therefore it's safe to override the skip (based on load) of
2254          * any task we find with that prio.
2255          */
2256         best_prio_seen = best_prio == busiest->curr->prio;
2257
2258         /*
2259          * We first consider expired tasks. Those will likely not be
2260          * executed in the near future, and they are most likely to
2261          * be cache-cold, thus switching CPUs has the least effect
2262          * on them.
2263          */
2264         if (busiest->expired->nr_active) {
2265                 array = busiest->expired;
2266                 dst_array = this_rq->expired;
2267         } else {
2268                 array = busiest->active;
2269                 dst_array = this_rq->active;
2270         }
2271
2272 new_array:
2273         /* Start searching at priority 0: */
2274         idx = 0;
2275 skip_bitmap:
2276         if (!idx)
2277                 idx = sched_find_first_bit(array->bitmap);
2278         else
2279                 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2280         if (idx >= MAX_PRIO) {
2281                 if (array == busiest->expired && busiest->active->nr_active) {
2282                         array = busiest->active;
2283                         dst_array = this_rq->active;
2284                         goto new_array;
2285                 }
2286                 goto out;
2287         }
2288
2289         head = array->queue + idx;
2290         curr = head->prev;
2291 skip_queue:
2292         tmp = list_entry(curr, struct task_struct, run_list);
2293
2294         curr = curr->prev;
2295
2296         /*
2297          * To help distribute high priority tasks accross CPUs we don't
2298          * skip a task if it will be the highest priority task (i.e. smallest
2299          * prio value) on its new queue regardless of its load weight
2300          */
2301         skip_for_load = tmp->load_weight > rem_load_move;
2302         if (skip_for_load && idx < this_best_prio)
2303                 skip_for_load = !best_prio_seen && idx == best_prio;
2304         if (skip_for_load ||
2305             !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2306
2307                 best_prio_seen |= idx == best_prio;
2308                 if (curr != head)
2309                         goto skip_queue;
2310                 idx++;
2311                 goto skip_bitmap;
2312         }
2313
2314         pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2315         pulled++;
2316         rem_load_move -= tmp->load_weight;
2317
2318         /*
2319          * We only want to steal up to the prescribed number of tasks
2320          * and the prescribed amount of weighted load.
2321          */
2322         if (pulled < max_nr_move && rem_load_move > 0) {
2323                 if (idx < this_best_prio)
2324                         this_best_prio = idx;
2325                 if (curr != head)
2326                         goto skip_queue;
2327                 idx++;
2328                 goto skip_bitmap;
2329         }
2330 out:
2331         /*
2332          * Right now, this is the only place pull_task() is called,
2333          * so we can safely collect pull_task() stats here rather than
2334          * inside pull_task().
2335          */
2336         schedstat_add(sd, lb_gained[idle], pulled);
2337
2338         if (all_pinned)
2339                 *all_pinned = pinned;
2340         return pulled;
2341 }
2342
2343 /*
2344  * find_busiest_group finds and returns the busiest CPU group within the
2345  * domain. It calculates and returns the amount of weighted load which
2346  * should be moved to restore balance via the imbalance parameter.
2347  */
2348 static struct sched_group *
2349 find_busiest_group(struct sched_domain *sd, int this_cpu,
2350                    unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle,
2351                    cpumask_t *cpus, int *balance)
2352 {
2353         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2354         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2355         unsigned long max_pull;
2356         unsigned long busiest_load_per_task, busiest_nr_running;
2357         unsigned long this_load_per_task, this_nr_running;
2358         int load_idx;
2359 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2360         int power_savings_balance = 1;
2361         unsigned long leader_nr_running = 0, min_load_per_task = 0;
2362         unsigned long min_nr_running = ULONG_MAX;
2363         struct sched_group *group_min = NULL, *group_leader = NULL;
2364 #endif
2365
2366         max_load = this_load = total_load = total_pwr = 0;
2367         busiest_load_per_task = busiest_nr_running = 0;
2368         this_load_per_task = this_nr_running = 0;
2369         if (idle == CPU_NOT_IDLE)
2370                 load_idx = sd->busy_idx;
2371         else if (idle == CPU_NEWLY_IDLE)
2372                 load_idx = sd->newidle_idx;
2373         else
2374                 load_idx = sd->idle_idx;
2375
2376         do {
2377                 unsigned long load, group_capacity;
2378                 int local_group;
2379                 int i;
2380                 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2381                 unsigned long sum_nr_running, sum_weighted_load;
2382
2383                 local_group = cpu_isset(this_cpu, group->cpumask);
2384
2385                 if (local_group)
2386                         balance_cpu = first_cpu(group->cpumask);
2387
2388                 /* Tally up the load of all CPUs in the group */
2389                 sum_weighted_load = sum_nr_running = avg_load = 0;
2390
2391                 for_each_cpu_mask(i, group->cpumask) {
2392                         struct rq *rq;
2393
2394                         if (!cpu_isset(i, *cpus))
2395                                 continue;
2396
2397                         rq = cpu_rq(i);
2398
2399                         if (*sd_idle && !idle_cpu(i))
2400                                 *sd_idle = 0;
2401
2402                         /* Bias balancing toward cpus of our domain */
2403                         if (local_group) {
2404                                 if (idle_cpu(i) && !first_idle_cpu) {
2405                                         first_idle_cpu = 1;
2406                                         balance_cpu = i;
2407                                 }
2408
2409                                 load = target_load(i, load_idx);
2410                         } else
2411                                 load = source_load(i, load_idx);
2412
2413                         avg_load += load;
2414                         sum_nr_running += rq->nr_running;
2415                         sum_weighted_load += rq->raw_weighted_load;
2416                 }
2417
2418                 /*
2419                  * First idle cpu or the first cpu(busiest) in this sched group
2420                  * is eligible for doing load balancing at this and above
2421                  * domains.
2422                  */
2423                 if (local_group && balance_cpu != this_cpu && balance) {
2424                         *balance = 0;
2425                         goto ret;
2426                 }
2427
2428                 total_load += avg_load;
2429                 total_pwr += group->__cpu_power;
2430
2431                 /* Adjust by relative CPU power of the group */
2432                 avg_load = sg_div_cpu_power(group,
2433                                 avg_load * SCHED_LOAD_SCALE);
2434
2435                 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2436
2437                 if (local_group) {
2438                         this_load = avg_load;
2439                         this = group;
2440                         this_nr_running = sum_nr_running;
2441                         this_load_per_task = sum_weighted_load;
2442                 } else if (avg_load > max_load &&
2443                            sum_nr_running > group_capacity) {
2444                         max_load = avg_load;
2445                         busiest = group;
2446                         busiest_nr_running = sum_nr_running;
2447                         busiest_load_per_task = sum_weighted_load;
2448                 }
2449
2450 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2451                 /*
2452                  * Busy processors will not participate in power savings
2453                  * balance.
2454                  */
2455                 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2456                         goto group_next;
2457
2458                 /*
2459                  * If the local group is idle or completely loaded
2460                  * no need to do power savings balance at this domain
2461                  */
2462                 if (local_group && (this_nr_running >= group_capacity ||
2463                                     !this_nr_running))
2464                         power_savings_balance = 0;
2465
2466                 /*
2467                  * If a group is already running at full capacity or idle,
2468                  * don't include that group in power savings calculations
2469                  */
2470                 if (!power_savings_balance || sum_nr_running >= group_capacity
2471                     || !sum_nr_running)
2472                         goto group_next;
2473
2474                 /*
2475                  * Calculate the group which has the least non-idle load.
2476                  * This is the group from where we need to pick up the load
2477                  * for saving power
2478                  */
2479                 if ((sum_nr_running < min_nr_running) ||
2480                     (sum_nr_running == min_nr_running &&
2481                      first_cpu(group->cpumask) <
2482                      first_cpu(group_min->cpumask))) {
2483                         group_min = group;
2484                         min_nr_running = sum_nr_running;
2485                         min_load_per_task = sum_weighted_load /
2486                                                 sum_nr_running;
2487                 }
2488
2489                 /*
2490                  * Calculate the group which is almost near its
2491                  * capacity but still has some space to pick up some load
2492                  * from other group and save more power
2493                  */
2494                 if (sum_nr_running <= group_capacity - 1) {
2495                         if (sum_nr_running > leader_nr_running ||
2496                             (sum_nr_running == leader_nr_running &&
2497                              first_cpu(group->cpumask) >
2498                               first_cpu(group_leader->cpumask))) {
2499                                 group_leader = group;
2500                                 leader_nr_running = sum_nr_running;
2501                         }
2502                 }
2503 group_next:
2504 #endif
2505                 group = group->next;
2506         } while (group != sd->groups);
2507
2508         if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2509                 goto out_balanced;
2510
2511         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2512
2513         if (this_load >= avg_load ||
2514                         100*max_load <= sd->imbalance_pct*this_load)
2515                 goto out_balanced;
2516
2517         busiest_load_per_task /= busiest_nr_running;
2518         /*
2519          * We're trying to get all the cpus to the average_load, so we don't
2520          * want to push ourselves above the average load, nor do we wish to
2521          * reduce the max loaded cpu below the average load, as either of these
2522          * actions would just result in more rebalancing later, and ping-pong
2523          * tasks around. Thus we look for the minimum possible imbalance.
2524          * Negative imbalances (*we* are more loaded than anyone else) will
2525          * be counted as no imbalance for these purposes -- we can't fix that
2526          * by pulling tasks to us.  Be careful of negative numbers as they'll
2527          * appear as very large values with unsigned longs.
2528          */
2529         if (max_load <= busiest_load_per_task)
2530                 goto out_balanced;
2531
2532         /*
2533          * In the presence of smp nice balancing, certain scenarios can have
2534          * max load less than avg load(as we skip the groups at or below
2535          * its cpu_power, while calculating max_load..)
2536          */
2537         if (max_load < avg_load) {
2538                 *imbalance = 0;
2539                 goto small_imbalance;
2540         }
2541
2542         /* Don't want to pull so many tasks that a group would go idle */
2543         max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2544
2545         /* How much load to actually move to equalise the imbalance */
2546         *imbalance = min(max_pull * busiest->__cpu_power,
2547                                 (avg_load - this_load) * this->__cpu_power)
2548                         / SCHED_LOAD_SCALE;
2549
2550         /*
2551          * if *imbalance is less than the average load per runnable task
2552          * there is no guarantee that any tasks will be moved so we'll have
2553          * a think about bumping its value to force at least one task to be
2554          * moved
2555          */
2556         if (*imbalance < busiest_load_per_task) {
2557                 unsigned long tmp, pwr_now, pwr_move;
2558                 unsigned int imbn;
2559
2560 small_imbalance:
2561                 pwr_move = pwr_now = 0;
2562                 imbn = 2;
2563                 if (this_nr_running) {
2564                         this_load_per_task /= this_nr_running;
2565                         if (busiest_load_per_task > this_load_per_task)
2566                                 imbn = 1;
2567                 } else
2568                         this_load_per_task = SCHED_LOAD_SCALE;
2569
2570                 if (max_load - this_load >= busiest_load_per_task * imbn) {
2571                         *imbalance = busiest_load_per_task;
2572                         return busiest;
2573                 }
2574
2575                 /*
2576                  * OK, we don't have enough imbalance to justify moving tasks,
2577                  * however we may be able to increase total CPU power used by
2578                  * moving them.
2579                  */
2580
2581                 pwr_now += busiest->__cpu_power *
2582                                 min(busiest_load_per_task, max_load);
2583                 pwr_now += this->__cpu_power *
2584                                 min(this_load_per_task, this_load);
2585                 pwr_now /= SCHED_LOAD_SCALE;
2586
2587                 /* Amount of load we'd subtract */
2588                 tmp = sg_div_cpu_power(busiest,
2589                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2590                 if (max_load > tmp)
2591                         pwr_move += busiest->__cpu_power *
2592                                 min(busiest_load_per_task, max_load - tmp);
2593
2594                 /* Amount of load we'd add */
2595                 if (max_load * busiest->__cpu_power <
2596                                 busiest_load_per_task * SCHED_LOAD_SCALE)
2597                         tmp = sg_div_cpu_power(this,
2598                                         max_load * busiest->__cpu_power);
2599                 else
2600                         tmp = sg_div_cpu_power(this,
2601                                 busiest_load_per_task * SCHED_LOAD_SCALE);
2602                 pwr_move += this->__cpu_power *
2603                                 min(this_load_per_task, this_load + tmp);
2604                 pwr_move /= SCHED_LOAD_SCALE;
2605
2606                 /* Move if we gain throughput */
2607                 if (pwr_move <= pwr_now)
2608                         goto out_balanced;
2609
2610                 *imbalance = busiest_load_per_task;
2611         }
2612
2613         return busiest;
2614
2615 out_balanced:
2616 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2617         if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2618                 goto ret;
2619
2620         if (this == group_leader && group_leader != group_min) {
2621                 *imbalance = min_load_per_task;
2622                 return group_min;
2623         }
2624 #endif
2625 ret:
2626         *imbalance = 0;
2627         return NULL;
2628 }
2629
2630 /*
2631  * find_busiest_queue - find the busiest runqueue among the cpus in group.
2632  */
2633 static struct rq *
2634 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2635                    unsigned long imbalance, cpumask_t *cpus)
2636 {
2637         struct rq *busiest = NULL, *rq;
2638         unsigned long max_load = 0;
2639         int i;
2640
2641         for_each_cpu_mask(i, group->cpumask) {
2642
2643                 if (!cpu_isset(i, *cpus))
2644                         continue;
2645
2646                 rq = cpu_rq(i);
2647
2648                 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2649                         continue;
2650
2651                 if (rq->raw_weighted_load > max_load) {
2652                         max_load = rq->raw_weighted_load;
2653                         busiest = rq;
2654                 }
2655         }
2656
2657         return busiest;
2658 }
2659
2660 /*
2661  * Max backoff interval if we encounter pinned tasks. The exact value is
2662  * fairly arbitrary; it only needs to be large enough.
2663  */
2664 #define MAX_PINNED_INTERVAL     512
2665
/* Return n - 1, except that zero stays zero instead of wrapping around. */
static inline unsigned long minus_1_or_zero(unsigned long n)
{
        if (n == 0)
                return 0;

        return n - 1;
}
2670
2671 /*
2672  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2673  * tasks if there is an imbalance.
2674  */
2675 static int load_balance(int this_cpu, struct rq *this_rq,
2676                         struct sched_domain *sd, enum cpu_idle_type idle,
2677                         int *balance)
2678 {
2679         int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2680         struct sched_group *group;
2681         unsigned long imbalance;
2682         struct rq *busiest;
2683         cpumask_t cpus = CPU_MASK_ALL;
2684         unsigned long flags;
2685
2686         /*
2687          * When power savings policy is enabled for the parent domain, idle
2688          * sibling can pick up load irrespective of busy siblings. In this case,
2689          * let the state of idle sibling percolate up as IDLE, instead of
2690          * portraying it as CPU_NOT_IDLE.
2691          */
2692         if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2693             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2694                 sd_idle = 1;
2695
2696         schedstat_inc(sd, lb_cnt[idle]);
2697
2698 redo:
2699         group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2700                                    &cpus, balance);
2701
2702         if (*balance == 0)
2703                 goto out_balanced;
2704
2705         if (!group) {
2706                 schedstat_inc(sd, lb_nobusyg[idle]);
2707                 goto out_balanced;
2708         }
2709
2710         busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2711         if (!busiest) {
2712                 schedstat_inc(sd, lb_nobusyq[idle]);
2713                 goto out_balanced;
2714         }
2715
2716         BUG_ON(busiest == this_rq);
2717
2718         schedstat_add(sd, lb_imbalance[idle], imbalance);
2719
2720         nr_moved = 0;
2721         if (busiest->nr_running > 1) {
2722                 /*
2723                  * Attempt to move tasks. If find_busiest_group has found
2724                  * an imbalance but busiest->nr_running <= 1, the group is
2725                  * still unbalanced. nr_moved simply stays zero, so it is
2726                  * correctly treated as an imbalance.
2727                  */
2728                 local_irq_save(flags);
2729                 double_rq_lock(this_rq, busiest);
2730                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2731                                       minus_1_or_zero(busiest->nr_running),
2732                                       imbalance, sd, idle, &all_pinned);
2733                 double_rq_unlock(this_rq, busiest);
2734                 local_irq_restore(flags);
2735
2736                 /*
2737                  * some other cpu did the load balance for us.
2738                  */
2739                 if (nr_moved && this_cpu != smp_processor_id())
2740                         resched_cpu(this_cpu);
2741
2742                 /* All tasks on this runqueue were pinned by CPU affinity */
2743                 if (unlikely(all_pinned)) {
2744                         cpu_clear(cpu_of(busiest), cpus);
2745                         if (!cpus_empty(cpus))
2746                                 goto redo;
2747                         goto out_balanced;
2748                 }
2749         }
2750
2751         if (!nr_moved) {
2752                 schedstat_inc(sd, lb_failed[idle]);
2753                 sd->nr_balance_failed++;
2754
2755                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2756
2757                         spin_lock_irqsave(&busiest->lock, flags);
2758
2759                         /* don't kick the migration_thread, if the curr
2760                          * task on busiest cpu can't be moved to this_cpu
2761                          */
2762                         if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2763                                 spin_unlock_irqrestore(&busiest->lock, flags);
2764                                 all_pinned = 1;
2765                                 goto out_one_pinned;
2766                         }
2767
2768                         if (!busiest->active_balance) {
2769                                 busiest->active_balance = 1;
2770                                 busiest->push_cpu = this_cpu;
2771                                 active_balance = 1;
2772                         }
2773                         spin_unlock_irqrestore(&busiest->lock, flags);
2774                         if (active_balance)
2775                                 wake_up_process(busiest->migration_thread);
2776
2777                         /*
2778                          * We've kicked active balancing, reset the failure
2779                          * counter.
2780                          */
2781                         sd->nr_balance_failed = sd->cache_nice_tries+1;
2782                 }
2783         } else
2784                 sd->nr_balance_failed = 0;
2785
2786         if (likely(!active_balance)) {
2787                 /* We were unbalanced, so reset the balancing interval */
2788                 sd->balance_interval = sd->min_interval;
2789         } else {
2790                 /*
2791                  * If we've begun active balancing, start to back off. This
2792                  * case may not be covered by the all_pinned logic if there
2793                  * is only 1 task on the busy runqueue (because we don't call
2794                  * move_tasks).
2795                  */
2796                 if (sd->balance_interval < sd->max_interval)
2797                         sd->balance_interval *= 2;
2798         }
2799
2800         if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2801             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2802                 return -1;
2803         return nr_moved;
2804
2805 out_balanced:
2806         schedstat_inc(sd, lb_balanced[idle]);
2807
2808         sd->nr_balance_failed = 0;
2809
2810 out_one_pinned:
2811         /* tune up the balancing interval */
2812         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2813                         (sd->balance_interval < sd->max_interval))
2814                 sd->balance_interval *= 2;
2815
2816         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2817             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2818                 return -1;
2819         return 0;
2820 }
2821
2822 /*
2823  * Check this_cpu to ensure it is balanced within domain. Attempt to move
2824  * tasks if there is an imbalance.
2825  *
2826  * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2827  * this_rq is locked.
2828  */
2829 static int
2830 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2831 {
2832         struct sched_group *group;
2833         struct rq *busiest = NULL;
2834         unsigned long imbalance;
2835         int nr_moved = 0;
2836         int sd_idle = 0;
2837         cpumask_t cpus = CPU_MASK_ALL;
2838
2839         /*
2840          * When power savings policy is enabled for the parent domain, idle
2841          * sibling can pick up load irrespective of busy siblings. In this case,
2842          * let the state of idle sibling percolate up as IDLE, instead of
2843          * portraying it as CPU_NOT_IDLE.
2844          */
2845         if (sd->flags & SD_SHARE_CPUPOWER &&
2846             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2847                 sd_idle = 1;
2848
2849         schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2850 redo:
2851         group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2852                                    &sd_idle, &cpus, NULL);
2853         if (!group) {
2854                 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2855                 goto out_balanced;
2856         }
2857
2858         busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2859                                 &cpus);
2860         if (!busiest) {
2861                 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2862                 goto out_balanced;
2863         }
2864
2865         BUG_ON(busiest == this_rq);
2866
2867         schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2868
2869         nr_moved = 0;
2870         if (busiest->nr_running > 1) {
2871                 /* Attempt to move tasks */
2872                 double_lock_balance(this_rq, busiest);
2873                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2874                                         minus_1_or_zero(busiest->nr_running),
2875                                         imbalance, sd, CPU_NEWLY_IDLE, NULL);
2876                 spin_unlock(&busiest->lock);
2877
2878                 if (!nr_moved) {
2879                         cpu_clear(cpu_of(busiest), cpus);
2880                         if (!cpus_empty(cpus))
2881                                 goto redo;
2882                 }
2883         }
2884
2885         if (!nr_moved) {
2886                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2887                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2888                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2889                         return -1;
2890         } else
2891                 sd->nr_balance_failed = 0;
2892
2893         return nr_moved;
2894
2895 out_balanced:
2896         schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2897         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2898             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2899                 return -1;
2900         sd->nr_balance_failed = 0;
2901
2902         return 0;
2903 }
2904
2905 /*
2906  * idle_balance is called by schedule() if this_cpu is about to become
2907  * idle. Attempts to pull tasks from other CPUs.
2908  */
2909 static void idle_balance(int this_cpu, struct rq *this_rq)
2910 {
2911         struct sched_domain *sd;
2912         int pulled_task = 0;
2913         unsigned long next_balance = jiffies + 60 *  HZ;
2914
2915         for_each_domain(this_cpu, sd) {
2916                 unsigned long interval;
2917
2918                 if (!(sd->flags & SD_LOAD_BALANCE))
2919                         continue;
2920
2921                 if (sd->flags & SD_BALANCE_NEWIDLE)
2922                         /* If we've pulled tasks over stop searching: */
2923                         pulled_task = load_balance_newidle(this_cpu,
2924                                                                 this_rq, sd);
2925
2926                 interval = msecs_to_jiffies(sd->balance_interval);
2927                 if (time_after(next_balance, sd->last_balance + interval))
2928                         next_balance = sd->last_balance + interval;
2929                 if (pulled_task)
2930                         break;
2931         }
2932         if (!pulled_task)
2933                 /*
2934                  * We are going idle. next_balance may be set based on
2935                  * a busy processor. So reset next_balance.
2936                  */
2937                 this_rq->next_balance = next_balance;
2938 }
2939
2940 /*
2941  * active_load_balance is run by migration threads. It pushes running tasks
2942  * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2943  * running on each physical CPU where possible, and avoids physical /
2944  * logical imbalances.
2945  *
2946  * Called with busiest_rq locked.
2947  */
2948 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2949 {
2950         int target_cpu = busiest_rq->push_cpu;
2951         struct sched_domain *sd;
2952         struct rq *target_rq;
2953
2954         /* Is there any task to move? */
2955         if (busiest_rq->nr_running <= 1)
2956                 return;
2957
2958         target_rq = cpu_rq(target_cpu);
2959
2960         /*
2961          * This condition is "impossible", if it occurs
2962          * we need to fix it.  Originally reported by
2963          * Bjorn Helgaas on a 128-cpu setup.
2964          */
2965         BUG_ON(busiest_rq == target_rq);
2966
2967         /* move a task from busiest_rq to target_rq */
2968         double_lock_balance(busiest_rq, target_rq);
2969
2970         /* Search for an sd spanning us and the target CPU. */
2971         for_each_domain(target_cpu, sd) {
2972                 if ((sd->flags & SD_LOAD_BALANCE) &&
2973                     cpu_isset(busiest_cpu, sd->span))
2974                                 break;
2975         }
2976
2977         if (likely(sd)) {
2978                 schedstat_inc(sd, alb_cnt);
2979
2980                 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2981                                RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2982                                NULL))
2983                         schedstat_inc(sd, alb_pushed);
2984                 else
2985                         schedstat_inc(sd, alb_failed);
2986         }
2987         spin_unlock(&target_rq->lock);
2988 }
2989
2990 static void update_load(struct rq *this_rq)
2991 {
2992         unsigned long this_load;
2993         unsigned int i, scale;
2994
2995         this_load = this_rq->raw_weighted_load;
2996
2997         /* Update our load: */
2998         for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2999                 unsigned long old_load, new_load;
3000
3001                 /* scale is effectively 1 << i now, and >> i divides by scale */
3002
3003                 old_load = this_rq->cpu_load[i];
3004                 new_load = this_load;
3005                 /*
3006                  * Round up the averaging division if load is increasing. This
3007                  * prevents us from getting stuck on 9 if the load is 10, for
3008                  * example.
3009                  */
3010                 if (new_load > old_load)
3011                         new_load += scale-1;
3012                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3013         }
3014 }
3015
3016 #ifdef CONFIG_NO_HZ
/*
 * Bookkeeping for idle load balancing (ilb) when CONFIG_NO_HZ is set.
 */
static struct {
        /* cpu number of the current ilb owner, or -1 when there is none */
        atomic_t load_balancer;
        /*
         * cpus that called select_nohz_load_balancer() with stop_tick set
         * and have not been cleared from the mask since
         */
        cpumask_t  cpu_mask;
} nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
        .cpu_mask = CPU_MASK_NONE,
};
3024
3025 /*
3026  * This routine will try to nominate the ilb (idle load balancing)
3027  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3028  * load balancing on behalf of all those cpus. If all the cpus in the system
3029  * go into this tickless mode, then there will be no ilb owner (as there is
3030  * no need for one) and all the cpus will sleep till the next wakeup event
3031  * arrives...
3032  *
3033  * For the ilb owner, tick is not stopped. And this tick will be used
3034  * for idle load balancing. ilb owner will still be part of
3035  * nohz.cpu_mask..
3036  *
3037  * While stopping the tick, this cpu will become the ilb owner if there
3038  * is no other owner. And will be the owner till that cpu becomes busy
3039  * or if all cpus in the system stop their ticks at which point
3040  * there is no need for ilb owner.
3041  *
3042  * When the ilb owner becomes busy, it nominates another owner, during the
3043  * next busy scheduler_tick()
3044  */
3045 int select_nohz_load_balancer(int stop_tick)
3046 {
3047         int cpu = smp_processor_id();
3048
3049         if (stop_tick) {
3050                 cpu_set(cpu, nohz.cpu_mask);
3051                 cpu_rq(cpu)->in_nohz_recently = 1;
3052
3053                 /*
3054                  * If we are going offline and still the leader, give up!
3055                  */
3056                 if (cpu_is_offline(cpu) &&
3057                     atomic_read(&nohz.load_balancer) == cpu) {
3058                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3059                                 BUG();
3060                         return 0;
3061                 }
3062
3063                 /* time for ilb owner also to sleep */
3064                 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3065                         if (atomic_read(&nohz.load_balancer) == cpu)
3066                                 atomic_set(&nohz.load_balancer, -1);
3067                         return 0;
3068                 }
3069
3070                 if (atomic_read(&nohz.load_balancer) == -1) {
3071                         /* make me the ilb owner */
3072                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3073                                 return 1;
3074                 } else if (atomic_read(&nohz.load_balancer) == cpu)
3075                         return 1;
3076         } else {
3077                 if (!cpu_isset(cpu, nohz.cpu_mask))
3078                         return 0;
3079
3080                 cpu_clear(cpu, nohz.cpu_mask);
3081
3082                 if (atomic_read(&nohz.load_balancer) == cpu)
3083                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3084                                 BUG();
3085         }
3086         return 0;
3087 }
3088 #endif
3089
/*
 * Taken with spin_trylock() in rebalance_domains() around the balancing of
 * any domain that has SD_SERIALIZE set.
 */
static DEFINE_SPINLOCK(balancing);
3091
3092 /*
3093  * It checks each scheduling domain to see if it is due to be balanced,
3094  * and initiates a balancing operation if so.
3095  *
3096  * Balancing parameters are set up in arch_init_sched_domains.
3097  */
3098 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3099 {
3100         int balance = 1;
3101         struct rq *rq = cpu_rq(cpu);
3102         unsigned long interval;
3103         struct sched_domain *sd;
3104         /* Earliest time when we have to do rebalance again */
3105         unsigned long next_balance = jiffies + 60*HZ;
3106
3107         for_each_domain(cpu, sd) {
3108                 if (!(sd->flags & SD_LOAD_BALANCE))
3109                         continue;
3110
3111                 interval = sd->balance_interval;
3112                 if (idle != CPU_IDLE)
3113                         interval *= sd->busy_factor;
3114
3115                 /* scale ms to jiffies */
3116                 interval = msecs_to_jiffies(interval);
3117                 if (unlikely(!interval))
3118                         interval = 1;
3119
3120                 if (sd->flags & SD_SERIALIZE) {
3121                         if (!spin_trylock(&balancing))
3122                                 goto out;
3123                 }
3124
3125                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3126                         if (load_balance(cpu, rq, sd, idle, &balance)) {
3127                                 /*
3128                                  * We've pulled tasks over so either we're no
3129                                  * longer idle, or one of our SMT siblings is
3130                                  * not idle.
3131                                  */
3132                                 idle = CPU_NOT_IDLE;
3133                         }
3134                         sd->last_balance = jiffies;
3135                 }
3136                 if (sd->flags & SD_SERIALIZE)
3137                         spin_unlock(&balancing);
3138 out:
3139                 if (time_after(next_balance, sd->last_balance + interval))
3140                         next_balance = sd->last_balance + interval;
3141
3142                 /*
3143                  * Stop the load balance at this level. There is another
3144                  * CPU in our sched group which is doing load balancing more
3145                  * actively.
3146                  */
3147                 if (!balance)
3148                         break;
3149         }
3150         rq->next_balance = next_balance;
3151 }
3152
3153 /*
3154  * run_rebalance_domains is triggered when needed from the scheduler tick.
3155  * In CONFIG_NO_HZ case, the idle load balance owner will do the
3156  * rebalancing for all the cpus for whom scheduler ticks are stopped.
3157  */
3158 static void run_rebalance_domains(struct softirq_action *h)
3159 {
3160         int local_cpu = smp_processor_id();
3161         struct rq *local_rq = cpu_rq(local_cpu);
3162         enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE;
3163
3164         rebalance_domains(local_cpu, idle);
3165
3166 #ifdef CONFIG_NO_HZ
3167         /*
3168          * If this cpu is the owner for idle load balancing, then do the
3169          * balancing on behalf of the other idle cpus whose ticks are
3170          * stopped.
3171          */
3172         if (local_rq->idle_at_tick &&
3173             atomic_read(&nohz.load_balancer) == local_cpu) {
3174                 cpumask_t cpus = nohz.cpu_mask;
3175                 struct rq *rq;
3176                 int balance_cpu;
3177
3178                 cpu_clear(local_cpu, cpus);
3179                 for_each_cpu_mask(balance_cpu, cpus) {
3180                         /*
3181                          * If this cpu gets work to do, stop the load balancing
3182                          * work being done for other cpus. Next load
3183                          * balancing owner will pick it up.
3184                          */
3185                         if (need_resched())
3186                                 break;
3187
3188                         rebalance_domains(balance_cpu, CPU_IDLE);
3189
3190                         rq = cpu_rq(balance_cpu);
3191                         if (time_after(local_rq->next_balance, rq->next_balance))
3192                                 local_rq->next_balance = rq->next_balance;
3193                 }
3194         }
3195 #endif
3196 }
3197
3198 /*
3199  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3200  *
3201  * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3202  * idle load balancing owner or decide to stop the periodic load balancing,
3203  * if the whole system is idle.
3204  */
3205 static inline void trigger_load_balance(int cpu)
3206 {
3207         struct rq *rq = cpu_rq(cpu);
3208 #ifdef CONFIG_NO_HZ
3209         /*
3210          * If we were in the nohz mode recently and busy at the current
3211          * scheduler tick, then check if we need to nominate new idle
3212          * load balancer.
3213          */
3214         if (rq->in_nohz_recently && !rq->idle_at_tick) {
3215                 rq->in_nohz_recently = 0;
3216
3217                 if (atomic_read(&nohz.load_balancer) == cpu) {
3218                         cpu_clear(cpu, nohz.cpu_mask);
3219                         atomic_set(&nohz.load_balancer, -1);
3220                 }
3221
3222                 if (atomic_read(&nohz.load_balancer) == -1) {
3223                         /*
3224                          * simple selection for now: Nominate the
3225                          * first cpu in the nohz list to be the next
3226                          * ilb owner.
3227                          *
3228                          * TBD: Traverse the sched domains and nominate
3229                          * the nearest cpu in the nohz.cpu_mask.
3230                          */
3231                         int ilb = first_cpu(nohz.cpu_mask);
3232
3233                         if (ilb != NR_CPUS)
3234                                 resched_cpu(ilb);
3235                 }
3236         }
3237
3238         /*
3239          * If this cpu is idle and doing idle load balancing for all the
3240          * cpus with ticks stopped, is it time for that to stop?
3241          */
3242         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3243             cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3244                 resched_cpu(cpu);
3245                 return;
3246         }
3247
3248         /*
3249          * If this cpu is idle and the idle load balancing is done by
3250          * someone else, then no need raise the SCHED_SOFTIRQ
3251          */
3252         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3253             cpu_isset(cpu, nohz.cpu_mask))
3254                 return;
3255 #endif
3256         if (time_after_eq(jiffies, rq->next_balance))
3257                 raise_softirq(SCHED_SOFTIRQ);
3258 }
3259 #else
/*
 * on UP we do not need to balance between CPUs: this variant of
 * idle_balance() is an empty stub that ignores both arguments.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
3266 #endif
3267
/*
 * Per-cpu struct kernel_stat instance. The accounting functions in this
 * file update its cpustat member through kstat_this_cpu.
 */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
3271
3272 /*
3273  * This is called on clock ticks and on context switches.
3274  * Bank in p->sched_time the ns elapsed since the last tick or switch.
3275  */
3276 static inline void
3277 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3278 {
3279         p->sched_time += now - p->last_ran;
3280         p->last_ran = rq->most_recent_timestamp = now;
3281 }
3282
3283 /*
3284  * Return current->sched_time plus any more ns on the sched_clock
3285  * that have not yet been banked.
3286  */
3287 unsigned long long current_sched_time(const struct task_struct *p)
3288 {
3289         unsigned long long ns;
3290         unsigned long flags;
3291
3292         local_irq_save(flags);
3293         ns = p->sched_time + sched_clock() - p->last_ran;
3294         local_irq_restore(flags);
3295
3296         return ns;
3297 }
3298
3299 /*
3300  * We place interactive tasks back into the active array, if possible.
3301  *
3302  * To guarantee that this does not starve expired tasks we ignore the
3303  * interactivity of a task if the first expired task had to wait more
3304  * than a 'reasonable' amount of time. This deadline timeout is
3305  * load-dependent, as the frequency of array switched decreases with
3306  * increasing number of running tasks. We also ignore the interactivity
3307  * if a better static_prio task has expired:
3308  */
3309 static inline int expired_starving(struct rq *rq)
3310 {
3311         if (rq->curr->static_prio > rq->best_expired_prio)
3312                 return 1;
3313         if (!STARVATION_LIMIT || !rq->expired_timestamp)
3314                 return 0;
3315         if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3316                 return 1;
3317         return 0;
3318 }
3319
3320 /*
3321  * Account user cpu time to a process.
3322  * @p: the process that the cpu time gets accounted to
3323  * @hardirq_offset: the offset to subtract from hardirq_count()
3324  * @cputime: the cpu time spent in user space since the last update
3325  */
3326 void account_user_time(struct task_struct *p, cputime_t cputime)
3327 {
3328         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3329         cputime64_t tmp;
3330
3331         p->utime = cputime_add(p->utime, cputime);
3332
3333         /* Add user time to cpustat. */
3334         tmp = cputime_to_cputime64(cputime);
3335         if (TASK_NICE(p) > 0)
3336                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3337         else
3338                 cpustat->user = cputime64_add(cpustat->user, tmp);
3339 }
3340
3341 /*
3342  * Account system cpu time to a process.
3343  * @p: the process that the cpu time gets accounted to
3344  * @hardirq_offset: the offset to subtract from hardirq_count()
3345  * @cputime: the cpu time spent in kernel space since the last update
3346  */
3347 void account_system_time(struct task_struct *p, int hardirq_offset,
3348                          cputime_t cputime)
3349 {
3350         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3351         struct rq *rq = this_rq();
3352         cputime64_t tmp;
3353
3354         p->stime = cputime_add(p->stime, cputime);
3355
3356         /* Add system time to cpustat. */
3357         tmp = cputime_to_cputime64(cputime);
3358         if (hardirq_count() - hardirq_offset)
3359                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3360         else if (softirq_count())
3361                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3362         else if (p != rq->idle)
3363                 cpustat->system = cputime64_add(cpustat->system, tmp);
3364         else if (atomic_read(&rq->nr_iowait) > 0)
3365                 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3366         else
3367                 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3368         /* Account for system time used */
3369         acct_update_integrals(p);
3370 }
3371
3372 /*
3373  * Account for involuntary wait time.
3374  * @p: the process from which the cpu time has been stolen
3375  * @steal: the cpu time spent in involuntary wait
3376  */
3377 void account_steal_time(struct task_struct *p, cputime_t steal)
3378 {
3379         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3380         cputime64_t tmp = cputime_to_cputime64(steal);
3381         struct rq *rq = this_rq();
3382
3383         if (p == rq->idle) {
3384                 p->stime = cputime_add(p->stime, steal);
3385                 if (atomic_read(&rq->nr_iowait) > 0)
3386                         cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3387                 else
3388                         cpustat->idle = cputime64_add(cpustat->idle, tmp);
3389         } else
3390                 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3391 }
3392
/*
 * task_running_tick - per-tick timeslice bookkeeping for a running task
 * @rq: the runqueue the tick is accounted on
 * @p: the task that was running when the tick fired
 *
 * Called from scheduler_tick() when the CPU was not idle. Decrements the
 * task's timeslice and, when it reaches zero, refills it and requeues the
 * task (into the expired array, or back into the active array for
 * interactive tasks that are not starving the expired ones). The
 * need-resched flag is set whenever the task should give up the CPU.
 */
static void task_running_tick(struct rq *rq, struct task_struct *p)
{
	/* This first check is done without taking rq->lock. */
	if (p->array != rq->active) {
		/* Task has expired but was not scheduled yet */
		set_tsk_need_resched(p);
		return;
	}
	spin_lock(&rq->lock);
	/*
	 * The task was running during this tick - update the
	 * time slice counter. Note: we do not update a thread's
	 * priority until it either goes to sleep or uses up its
	 * timeslice. This makes it possible for interactive tasks
	 * to use up their timeslices at their highest priority levels.
	 */
	if (rt_task(p)) {
		/*
		 * RR tasks need a special form of timeslice management.
		 * FIFO tasks have no timeslices.
		 */
		if ((p->policy == SCHED_RR) && !--p->time_slice) {
			p->time_slice = task_timeslice(p);
			p->first_time_slice = 0;
			set_tsk_need_resched(p);

			/* put it at the end of the queue: */
			requeue_task(p, rq->active);
		}
		goto out_unlock;
	}
	if (!--p->time_slice) {
		/*
		 * Timeslice used up: take the task off the active array,
		 * recompute its priority and hand it a fresh timeslice
		 * before deciding which array it goes back to.
		 */
		dequeue_task(p, rq->active);
		set_tsk_need_resched(p);
		p->prio = effective_prio(p);
		p->time_slice = task_timeslice(p);
		p->first_time_slice = 0;

		/* Remember when the first task expired since the last reset */
		if (!rq->expired_timestamp)
			rq->expired_timestamp = jiffies;
		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
			enqueue_task(p, rq->expired);
			/* Track the best static_prio in the expired array */
			if (p->static_prio < rq->best_expired_prio)
				rq->best_expired_prio = p->static_prio;
		} else
			enqueue_task(p, rq->active);
	} else {
		/*
		 * Prevent a too long timeslice allowing a task to monopolize
		 * the CPU. We do this by splitting up the timeslice into
		 * smaller pieces.
		 *
		 * Note: this does not mean the task's timeslices expire or
		 * get lost in any way, they just might be preempted by
		 * another task of equal priority. (one with higher
		 * priority would have preempted this task already.) We
		 * requeue this task to the end of the list on this priority
		 * level, which is in essence a round-robin of tasks with
		 * equal priority.
		 *
		 * This only applies to tasks in the interactive
		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
		 */
		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
			(p->array == rq->active)) {

			requeue_task(p, rq->active);
			set_tsk_need_resched(p);
		}
	}
out_unlock:
	spin_unlock(&rq->lock);
}
3467
3468 /*
3469  * This function gets called by the timer code, with HZ frequency.
3470  * We call it with interrupts disabled.
3471  *
3472  * It also gets called by the fork code, when changing the parent's
3473  * timeslices.
3474  */
3475 void scheduler_tick(void)
3476 {
3477         unsigned long long now = sched_clock();
3478         struct task_struct *p = current;
3479         int cpu = smp_processor_id();
3480         int idle_at_tick = idle_cpu(cpu);
3481         struct rq *rq = cpu_rq(cpu);
3482
3483         update_cpu_clock(p, rq, now);
3484
3485         if (!idle_at_tick)
3486                 task_running_tick(rq, p);
3487 #ifdef CONFIG_SMP
3488         update_load(rq);
3489         rq->idle_at_tick = idle_at_tick;
3490         trigger_load_balance(cpu);
3491 #endif
3492 }
3493
3494 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3495
/*
 * add_preempt_count - debugging variant of the preempt counter increment
 * @val: amount to add to preempt_count()
 *
 * Only built with CONFIG_PREEMPT && CONFIG_DEBUG_PREEMPT. Adds @val to the
 * preempt counter, warning (via DEBUG_LOCKS_WARN_ON) about a counter that
 * is already negative or whose PREEMPT_MASK portion is close to
 * overflowing.
 */
void fastcall add_preempt_count(int val)
{
	/*
	 * Underflow?
	 *
	 * The counter is already negative: warn and leave it untouched.
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
	preempt_count() += val;
	/*
	 * Spinlock count overflowing soon?
	 *
	 * Checked after the addition; this only warns, the new value stays.
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
}
EXPORT_SYMBOL(add_preempt_count);
3511
/*
 * sub_preempt_count - debugging variant of the preempt counter decrement
 * @val: amount to subtract from preempt_count()
 *
 * Only built with CONFIG_PREEMPT && CONFIG_DEBUG_PREEMPT. Subtracts @val
 * from the preempt counter unless doing so would underflow it, in which
 * case a warning is raised (via DEBUG_LOCKS_WARN_ON) and the counter is
 * left unchanged.
 */
void fastcall sub_preempt_count(int val)
{
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;
	/*
	 * Is the spinlock portion underflowing?
	 *
	 * A @val smaller than PREEMPT_MASK is taken out of the PREEMPT_MASK
	 * bits, so those bits must not all be zero already.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;

	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
3529
3530 #endif
3531
3532 static inline int interactive_sleep(enum sleep_type sleep_type)
3533 {
3534         return (sleep_type == SLEEP_INTERACTIVE ||
3535                 sleep_type == SLEEP_INTERRUPTED);
3536 }
3537
/*
 * schedule() is the main scheduler function.
 *
 * It selects the next task to run on this CPU and switches to it:
 *
 *  - the outgoing task ('prev') is charged for the time it ran and, if it
 *    is no longer runnable, removed from the runqueue;
 *  - if the runqueue is empty, idle_balance() is tried and, failing that,
 *    the idle task is selected;
 *  - otherwise the active and expired arrays are swapped when the active
 *    one is empty, and the first task of the highest-priority non-empty
 *    queue is selected;
 *  - finally the context switch is performed, and the whole procedure is
 *    repeated for as long as TIF_NEED_RESCHED is set on return.
 */
asmlinkage void __sched schedule(void)
{
	struct task_struct *prev, *next;
	struct prio_array *array;
	struct list_head *queue;
	unsigned long long now;
	unsigned long run_time;
	int cpu, idx, new_prio;
	long *switch_count;
	struct rq *rq;

	/*
	 * Test if we are atomic.  Since do_exit() needs to call into
	 * schedule() atomically, we ignore that path for now.
	 * Otherwise, whine if we are scheduling when we should not be.
	 */
	if (unlikely(in_atomic() && !current->exit_state)) {
		printk(KERN_ERR "BUG: scheduling while atomic: "
			"%s/0x%08x/%d\n",
			current->comm, preempt_count(), current->pid);
		debug_show_held_locks(current);
		if (irqs_disabled())
			print_irqtrace_events(current);
		dump_stack();
	}
	profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
	preempt_disable();
	prev = current;
	release_kernel_lock(prev);
need_resched_nonpreemptible:
	rq = this_rq();

	/*
	 * The idle thread is not allowed to schedule!
	 * Remove this check after it has been exercised a bit.
	 */
	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
		dump_stack();
	}

	schedstat_inc(rq, sched_cnt);
	now = sched_clock();
	/*
	 * run_time is the time since prev->timestamp, clamped to the range
	 * [0, NS_MAX_SLEEP_AVG]. The signed casts catch a clock that appears
	 * to have gone backwards.
	 */
	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
		run_time = now - prev->timestamp;
		if (unlikely((long long)(now - prev->timestamp) < 0))
			run_time = 0;
	} else
		run_time = NS_MAX_SLEEP_AVG;

	/*
	 * Tasks charged proportionately less run_time at high sleep_avg to
	 * delay them losing their interactive status
	 *
	 * (GNU "a ?: b" extension: divide by the bonus, or by 1 if it is 0.)
	 */
	run_time /= (CURRENT_BONUS(prev) ? : 1);

	spin_lock_irq(&rq->lock);

	/*
	 * The switch is counted in ->nivcsw unless prev is leaving in a
	 * non-running state and this is not a PREEMPT_ACTIVE preemption, in
	 * which case it is counted in ->nvcsw.
	 */
	switch_count = &prev->nivcsw;
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		switch_count = &prev->nvcsw;
		/*
		 * An interruptible sleeper with a signal pending must not
		 * go to sleep: make it runnable again instead of taking it
		 * off the runqueue.
		 */
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev))))
			prev->state = TASK_RUNNING;
		else {
			if (prev->state == TASK_UNINTERRUPTIBLE)
				rq->nr_uninterruptible++;
			deactivate_task(prev, rq);
		}
	}

	cpu = smp_processor_id();
	if (unlikely(!rq->nr_running)) {
		/* Nothing runnable here: try idle balancing before going idle */
		idle_balance(cpu, rq);
		if (!rq->nr_running) {
			next = rq->idle;
			rq->expired_timestamp = 0;
			goto switch_tasks;
		}
	}

	array = rq->active;
	if (unlikely(!array->nr_active)) {
		/*
		 * Switch the active and expired arrays.
		 */
		schedstat_inc(rq, sched_switch);
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	/* Pick the head of the first non-empty priority queue */
	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, struct task_struct, run_list);

	/*
	 * For a non-RT task that was woken from an interactive sleep, the
	 * time it spent waiting on the runqueue (scaled by
	 * ON_RUNQUEUE_WEIGHT for SLEEP_INTERACTIVE) is fed into
	 * recalc_task_prio(); the task is requeued if its priority changed.
	 */
	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
		unsigned long long delta = now - next->timestamp;
		if (unlikely((long long)(now - next->timestamp) < 0))
			delta = 0;

		if (next->sleep_type == SLEEP_INTERACTIVE)
			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

		array = next->array;
		new_prio = recalc_task_prio(next, next->timestamp + delta);

		if (unlikely(next->prio != new_prio)) {
			dequeue_task(next, array);
			next->prio = new_prio;
			enqueue_task(next, array);
		}
	}
	next->sleep_type = SLEEP_NORMAL;
switch_tasks:
	if (next == rq->idle)
		schedstat_inc(rq, sched_goidle);
	prefetch(next);
	prefetch_stack(next);
	clear_tsk_need_resched(prev);
	rcu_qsctr_inc(task_cpu(prev));

	update_cpu_clock(prev, rq, now);

	/* Charge prev for the time it ran; sleep_avg never goes below zero */
	prev->sleep_avg -= run_time;
	if ((long)prev->sleep_avg <= 0)
		prev->sleep_avg = 0;
	prev->timestamp = prev->last_ran = now;

	sched_info_switch(prev, next);
	if (likely(prev != next)) {
		next->timestamp = next->last_ran = now;
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		prepare_task_switch(rq, next);
		prev = context_switch(rq, prev, next);
		barrier();
		/*
		 * this_rq must be evaluated again because prev may have moved
		 * CPUs since it called schedule(), thus the 'rq' on its stack
		 * frame will be invalid.
		 */
		finish_task_switch(this_rq(), prev);
	} else
		spin_unlock_irq(&rq->lock);

	prev = current;
	/*
	 * A negative return from reacquire_kernel_lock() sends us back to
	 * pick a task again, without re-doing preempt_disable() and the
	 * kernel lock release.
	 */
	if (unlikely(reacquire_kernel_lock(prev) < 0))
		goto need_resched_nonpreemptible;
	preempt_enable_no_resched();
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}
EXPORT_SYMBOL(schedule);
3701
3702 #ifdef CONFIG_PREEMPT
3703 /*
3704  * this is the entry point to schedule() from in-kernel preemption
3705  * off of preempt_enable.  Kernel preemptions off return from interrupt
3706  * occur there and call schedule directly.
3707  */
3708 asmlinkage void __sched preempt_schedule(void)
3709 {
3710         struct thread_info *ti = current_thread_info();
3711 #ifdef CONFIG_PREEMPT_BKL
3712         struct task_struct *task = current;
3713         int saved_lock_depth;
3714 #endif
3715         /*
3716          * If there is a non-zero preempt_count or interrupts are disabled,
3717          * we do not want to preempt the current task.  Just return..
3718          */
3719         if (likely(ti->preempt_count || irqs_disabled()))
3720                 return;
3721
3722 need_resched:
3723         add_preempt_count(PREEMPT_ACTIVE);
3724         /*
3725          * We keep the big kernel semaphore locked, but we
3726          * clear ->lock_depth so that schedule() doesnt
3727          * auto-release the semaphore:
3728          */
3729 #ifdef CONFIG_PREEMPT_BKL
3730         saved_lock_depth = task->lock_depth;
3731         task->lock_depth = -1;
3732 #endif
3733         schedule();
3734 #ifdef CONFIG_PREEMPT_BKL
3735         task->lock_depth = saved_lock_depth;
3736 #endif
3737         sub_preempt_count(PREEMPT_ACTIVE);
3738
3739         /* we could miss a preemption opportunity between schedule and now */
3740         barrier();
3741         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3742                 goto need_resched;
3743 }
3744 EXPORT_SYMBOL(preempt_schedule);
3745
3746 /*
3747  * this is the entry point to schedule() from kernel preemption
3748  * off of irq context.
3749  * Note, that this is called and return with irqs disabled. This will
3750  * protect us against recursive calling from irq.
3751  */
3752 asmlinkage void __sched preempt_schedule_irq(void)
3753 {
3754         struct thread_info *ti = current_thread_info();
3755 #ifdef CONFIG_PREEMPT_BKL
3756         struct task_struct *task = current;
3757         int saved_lock_depth;
3758 #endif
3759         /* Catch callers which need to be fixed */
3760         BUG_ON(ti->preempt_count || !irqs_disabled());
3761
3762 need_resched:
3763         add_preempt_count(PREEMPT_ACTIVE);
3764         /*
3765          * We keep the big kernel semaphore locked, but we
3766          * clear ->lock_depth so that schedule() doesnt
3767          * auto-release the semaphore:
3768          */
3769 #ifdef CONFIG_PREEMPT_BKL
3770         saved_lock_depth = task->lock_depth;
3771         task->lock_depth = -1;
3772 #endif
3773         local_irq_enable();
3774         schedule();
3775         local_irq_disable();
3776 #ifdef CONFIG_PREEMPT_BKL
3777         task->lock_depth = saved_lock_depth;
3778 #endif
3779         sub_preempt_count(PREEMPT_ACTIVE);
3780
3781         /* we could miss a preemption opportunity between schedule and now */
3782         barrier();
3783         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3784                 goto need_resched;
3785 }
3786
3787 #endif /* CONFIG_PREEMPT */
3788
3789 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3790                           void *key)
3791 {
3792         return try_to_wake_up(curr->private, mode, sync);
3793 }
3794 EXPORT_SYMBOL(default_wake_function);
3795
3796 /*
3797  * The core wakeup function.  Non-exclusive wakeups (nr_exclusive == 0) just
3798  * wake everything up.  If it's an exclusive wakeup (nr_exclusive == small +ve
3799  * number) then we wake all the non-exclusive tasks and one exclusive task.
3800  *
3801  * There are circumstances in which we can try to wake a task which has already
3802  * started to run but is not in state TASK_RUNNING.  try_to_wake_up() returns
3803  * zero in this (rare) case, and we handle it by continuing to scan the queue.
3804  */
3805 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3806                              int nr_exclusive, int sync, void *key)
3807 {
3808         struct list_head *tmp, *next;
3809
3810         list_for_each_safe(tmp, next, &q->task_list) {
3811                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3812                 unsigned flags = curr->flags;
3813
3814                 if (curr->func(curr, mode, sync, key) &&
3815                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3816                         break;
3817         }
3818 }
3819
3820 /**
3821  * __wake_up - wake up threads blocked on a waitqueue.
3822  * @q: the waitqueue
3823  * @mode: which threads
3824  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3825  * @key: is directly passed to the wakeup function
3826  */
3827 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3828                         int nr_exclusive, void *key)
3829 {
3830         unsigned long flags;
3831
3832         spin_lock_irqsave(&q->lock, flags);
3833         __wake_up_common(q, mode, nr_exclusive, 0, key);
3834         spin_unlock_irqrestore(&q->lock, flags);
3835 }
3836 EXPORT_SYMBOL(__wake_up);
3837
3838 /*
3839  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3840  */
3841 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3842 {
3843         __wake_up_common(q, mode, 1, 0, NULL);
3844 }
3845
3846 /**
3847  * __wake_up_sync - wake up threads blocked on a waitqueue.
3848  * @q: the waitqueue
3849  * @mode: which threads
3850  * @nr_exclusive: how many wake-one or wake-many threads to wake up
3851  *
3852  * The sync wakeup differs that the waker knows that it will schedule
3853  * away soon, so while the target thread will be woken up, it will not
3854  * be migrated to another CPU - ie. the two threads are 'synchronized'
3855  * with each other. This can prevent needless bouncing between CPUs.
3856  *
3857  * On UP it can prevent extra preemption.
3858  */
3859 void fastcall
3860 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3861 {
3862         unsigned long flags;
3863         int sync = 1;
3864
3865         if (unlikely(!q))
3866                 return;
3867
3868         if (unlikely(!nr_exclusive))
3869                 sync = 0;
3870
3871         spin_lock_irqsave(&q->lock, flags);
3872         __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3873         spin_unlock_irqrestore(&q->lock, flags);
3874 }
3875 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
3876
3877 void fastcall complete(struct completion *x)
3878 {
3879         unsigned long flags;
3880
3881         spin_lock_irqsave(&x->wait.lock, flags);
3882         x->done++;
3883         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3884                          1, 0, NULL);
3885         spin_unlock_irqrestore(&x->wait.lock, flags);
3886 }
3887 EXPORT_SYMBOL(complete);
3888
3889 void fastcall complete_all(struct completion *x)
3890 {
3891         unsigned long flags;
3892
3893         spin_lock_irqsave(&x->wait.lock, flags);
3894         x->done += UINT_MAX/2;
3895         __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3896                          0, 0, NULL);
3897         spin_unlock_irqrestore(&x->wait.lock, flags);
3898 }
3899 EXPORT_SYMBOL(complete_all);
3900
3901 void fastcall __sched wait_for_completion(struct completion *x)
3902 {
3903         might_sleep();
3904
3905         spin_lock_irq(&x->wait.lock);
3906         if (!x->done) {
3907                 DECLARE_WAITQUEUE(wait, current);
3908
3909                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3910                 __add_wait_queue_tail(&x->wait, &wait);
3911                 do {
3912                         __set_current_state(TASK_UNINTERRUPTIBLE);
3913                         spin_unlock_irq(&x->wait.lock);
3914                         schedule();
3915                         spin_lock_irq(&x->wait.lock);
3916                 } while (!x->done);
3917                 __remove_wait_queue(&x->wait, &wait);
3918         }
3919         x->done--;
3920         spin_unlock_irq(&x->wait.lock);
3921 }
3922 EXPORT_SYMBOL(wait_for_completion);
3923
3924 unsigned long fastcall __sched
3925 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3926 {
3927         might_sleep();
3928
3929         spin_lock_irq(&x->wait.lock);
3930         if (!x->done) {
3931                 DECLARE_WAITQUEUE(wait, current);
3932
3933                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3934                 __add_wait_queue_tail(&x->wait, &wait);
3935                 do {
3936                         __set_current_state(TASK_UNINTERRUPTIBLE);
3937                         spin_unlock_irq(&x->wait.lock);
3938                         timeout = schedule_timeout(timeout);
3939                         spin_lock_irq(&x->wait.lock);
3940                         if (!timeout) {
3941                                 __remove_wait_queue(&x->wait, &wait);
3942                                 goto out;
3943                         }
3944                 } while (!x->done);
3945                 __remove_wait_queue(&x->wait, &wait);
3946         }
3947         x->done--;
3948 out:
3949         spin_unlock_irq(&x->wait.lock);
3950         return timeout;
3951 }
3952 EXPORT_SYMBOL(wait_for_completion_timeout);
3953
3954 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3955 {
3956         int ret = 0;
3957
3958         might_sleep();
3959
3960         spin_lock_irq(&x->wait.lock);
3961         if (!x->done) {
3962                 DECLARE_WAITQUEUE(wait, current);
3963
3964                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3965                 __add_wait_queue_tail(&x->wait, &wait);
3966                 do {
3967                         if (signal_pending(current)) {
3968                                 ret = -ERESTARTSYS;
3969                                 __remove_wait_queue(&x->wait, &wait);
3970                                 goto out;
3971                         }
3972                         __set_current_state(TASK_INTERRUPTIBLE);
3973                         spin_unlock_irq(&x->wait.lock);
3974                         schedule();
3975                         spin_lock_irq(&x->wait.lock);
3976                 } while (!x->done);
3977                 __remove_wait_queue(&x->wait, &wait);
3978         }
3979         x->done--;
3980 out:
3981         spin_unlock_irq(&x->wait.lock);
3982
3983         return ret;
3984 }
3985 EXPORT_SYMBOL(wait_for_completion_interruptible);
3986
3987 unsigned long fastcall __sched
3988 wait_for_completion_interruptible_timeout(struct completion *x,
3989                                           unsigned long timeout)
3990 {
3991         might_sleep();
3992
3993         spin_lock_irq(&x->wait.lock);
3994         if (!x->done) {
3995                 DECLARE_WAITQUEUE(wait, current);
3996
3997                 wait.flags |= WQ_FLAG_EXCLUSIVE;
3998                 __add_wait_queue_tail(&x->wait, &wait);
3999                 do {
4000                         if (signal_pending(current)) {
4001                                 timeout = -ERESTARTSYS;
4002                                 __remove_wait_queue(&x->wait, &wait);
4003                                 goto out;
4004                         }
4005                         __set_current_state(TASK_INTERRUPTIBLE);
4006                         spin_unlock_irq(&x->wait.lock);
4007                         timeout = schedule_timeout(timeout);
4008                         spin_lock_irq(&x->wait.lock);
4009                         if (!timeout) {
4010                                 __remove_wait_queue(&x->wait, &wait);
4011                                 goto out;
4012                         }
4013                 } while (!x->done);
4014                 __remove_wait_queue(&x->wait, &wait);
4015         }
4016         x->done--;
4017 out:
4018         spin_unlock_irq(&x->wait.lock);
4019         return timeout;
4020 }
4021 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4022
4023
/*
 * Building blocks shared by the sleep_on() family below. They are plain
 * statement sequences (not do { } while (0) wrappers) and rely on the
 * enclosing function having a wait_queue_head_t pointer named 'q'.
 */

/*
 * Declares the locals used by the two macros below and initialises a wait
 * queue entry for the current task.
 */
#define SLEEP_ON_VAR					\
	unsigned long flags;				\
	wait_queue_t wait;				\
	init_waitqueue_entry(&wait, current);

/*
 * Queues the entry on 'q'. The lock is taken with irqsave but released
 * with a plain spin_unlock(), so interrupts are still disabled when the
 * macro ends and the saved 'flags' are kept for SLEEP_ON_TAIL.
 */
#define SLEEP_ON_HEAD					\
	spin_lock_irqsave(&q->lock,flags);		\
	__add_wait_queue(q, &wait);			\
	spin_unlock(&q->lock);

/*
 * Removes the entry from 'q' after the sleep and restores the interrupt
 * state that SLEEP_ON_HEAD saved in 'flags'.
 */
#define SLEEP_ON_TAIL					\
	spin_lock_irq(&q->lock);			\
	__remove_wait_queue(q, &wait);			\
	spin_unlock_irqrestore(&q->lock, flags);
4038
4039 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
4040 {
4041         SLEEP_ON_VAR
4042
4043         current->state = TASK_INTERRUPTIBLE;
4044
4045         SLEEP_ON_HEAD
4046         schedule();
4047         SLEEP_ON_TAIL
4048 }
4049 EXPORT_SYMBOL(interruptible_sleep_on);
4050
4051 long fastcall __sched
4052 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4053 {
4054         SLEEP_ON_VAR
4055
4056         current->state = TASK_INTERRUPTIBLE;
4057
4058         SLEEP_ON_HEAD
4059         timeout = schedule_timeout(timeout);
4060         SLEEP_ON_TAIL
4061
4062         return timeout;
4063 }
4064 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4065
4066 void fastcall __sched sleep_on(wait_queue_head_t *q)
4067 {
4068         SLEEP_ON_VAR
4069
4070         current->state = TASK_UNINTERRUPTIBLE;
4071
4072         SLEEP_ON_HEAD
4073         schedule();
4074         SLEEP_ON_TAIL
4075 }
4076 EXPORT_SYMBOL(sleep_on);
4077
4078 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4079 {
4080         SLEEP_ON_VAR
4081
4082         current->state = TASK_UNINTERRUPTIBLE;
4083
4084         SLEEP_ON_HEAD
4085         timeout = schedule_timeout(timeout);
4086         SLEEP_ON_TAIL
4087
4088         return timeout;
4089 }
4090
4091 EXPORT_SYMBOL(sleep_on_timeout);
4092
4093 #ifdef CONFIG_RT_MUTEXES
4094
4095 /*
4096  * rt_mutex_setprio - set the current priority of a task
4097  * @p: task
4098  * @prio: prio value (kernel-internal form)
4099  *
4100  * This function changes the 'effective' priority of a task. It does
4101  * not touch ->normal_prio like __setscheduler().
4102  *
4103  * Used by the rt_mutex code to implement priority inheritance logic.
4104  */
4105 void rt_mutex_setprio(struct task_struct *p, int prio)
4106 {
4107         struct prio_array *array;
4108         unsigned long flags;
4109         struct rq *rq;
4110         int oldprio;
4111
4112         BUG_ON(prio < 0 || prio > MAX_PRIO);
4113
4114         rq = task_rq_lock(p, &flags);
4115
4116         oldprio = p->prio;
4117         array = p->array;
4118         if (array)
4119                 dequeue_task(p, array);
4120         p->prio = prio;
4121
4122         if (array) {
4123                 /*
4124                  * If changing to an RT priority then queue it
4125                  * in the active array!
4126                  */
4127                 if (rt_task(p))
4128                         array = rq->active;
4129                 enqueue_task(p, array);
4130                 /*
4131                  * Reschedule if we are currently running on this runqueue and
4132                  * our priority decreased, or if we are not currently running on
4133                  * this runqueue and our priority is higher than the current's
4134                  */
4135                 if (task_running(rq, p)) {
4136                         if (p->prio > oldprio)
4137                                 resched_task(rq->curr);
4138                 } else if (TASK_PREEMPTS_CURR(p, rq))
4139                         resched_task(rq->curr);
4140         }
4141         task_rq_unlock(rq, &flags);
4142 }
4143
4144 #endif
4145
4146 void set_user_nice(struct task_struct *p, long nice)
4147 {
4148         struct prio_array *array;
4149         int old_prio, delta;
4150         unsigned long flags;
4151         struct rq *rq;
4152
4153         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4154                 return;
4155         /*
4156          * We have to be careful, if called from sys_setpriority(),
4157          * the task might be in the middle of scheduling on another CPU.
4158          */
4159         rq = task_rq_lock(p, &flags);
4160         /*
4161          * The RT priorities are set via sched_setscheduler(), but we still
4162          * allow the 'normal' nice value to be set - but as expected
4163          * it wont have any effect on scheduling until the task is
4164          * not SCHED_NORMAL/SCHED_BATCH:
4165          */
4166         if (has_rt_policy(p)) {
4167                 p->static_prio = NICE_TO_PRIO(nice);
4168                 goto out_unlock;
4169         }
4170         array = p->array;
4171         if (array) {
4172                 dequeue_task(p, array);
4173                 dec_raw_weighted_load(rq, p);
4174         }
4175
4176         p->static_prio = NICE_TO_PRIO(nice);
4177         set_load_weight(p);
4178         old_prio = p->prio;
4179         p->prio = effective_prio(p);
4180         delta = p->prio - old_prio;
4181
4182         if (array) {
4183                 enqueue_task(p, array);
4184                 inc_raw_weighted_load(rq, p);
4185                 /*
4186                  * If the task increased its priority or is running and
4187                  * lowered its priority, then reschedule its CPU:
4188                  */
4189                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4190                         resched_task(rq->curr);
4191         }
4192 out_unlock:
4193         task_rq_unlock(rq, &flags);
4194 }
4195 EXPORT_SYMBOL(set_user_nice);
4196
4197 /*
4198  * can_nice - check if a task can reduce its nice value
4199  * @p: task
4200  * @nice: nice value
4201  */
4202 int can_nice(const struct task_struct *p, const int nice)
4203 {
4204         /* convert nice value [19,-20] to rlimit style value [1,40] */
4205         int nice_rlim = 20 - nice;
4206
4207         return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
4208                 capable(CAP_SYS_NICE));
4209 }
4210
4211 #ifdef __ARCH_WANT_SYS_NICE
4212
4213 /*
4214  * sys_nice - change the priority of the current process.
4215  * @increment: priority increment
4216  *
4217  * sys_setpriority is a more generic, but much slower function that
4218  * does similar things.
4219  */
4220 asmlinkage long sys_nice(int increment)
4221 {
4222         long nice, retval;
4223
4224         /*
4225          * Setpriority might change our priority at the same moment.
4226          * We don't have to worry. Conceptually one call occurs first
4227          * and we have a single winner.
4228          */
4229         if (increment < -40)
4230                 increment = -40;
4231         if (increment > 40)
4232                 increment = 40;
4233
4234         nice = PRIO_TO_NICE(current->static_prio) + increment;
4235         if (nice < -20)
4236                 nice = -20;
4237         if (nice > 19)
4238                 nice = 19;
4239
4240         if (increment < 0 && !can_nice(current, nice))
4241                 return -EPERM;
4242
4243         retval = security_task_setnice(current, nice);
4244         if (retval)
4245                 return retval;
4246
4247         set_user_nice(current, nice);
4248         return 0;
4249 }
4250
4251 #endif
4252
4253 /**
4254  * task_prio - return the priority value of a given task.
4255  * @p: the task in question.
4256  *
4257  * This is the priority value as seen by users in /proc.
4258  * RT tasks are offset by -200. Normal tasks are centered
4259  * around 0, value goes from -16 to +15.
4260  */
4261 int task_prio(const struct task_struct *p)
4262 {
4263         return p->prio - MAX_RT_PRIO;
4264 }
4265
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
4275
4276 /**
4277  * idle_cpu - is a given cpu idle currently?
4278  * @cpu: the processor in question.
4279  */
4280 int idle_cpu(int cpu)
4281 {
4282         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4283 }
4284
4285 /**
4286  * idle_task - return the idle task for a given cpu.
4287  * @cpu: the processor in question.
4288  */
4289 struct task_struct *idle_task(int cpu)
4290 {
4291         return cpu_rq(cpu)->idle;
4292 }
4293
4294 /**
4295  * find_process_by_pid - find a process with a matching PID value.
4296  * @pid: the pid in question.
4297  */
4298 static inline struct task_struct *find_process_by_pid(pid_t pid)
4299 {
4300         return pid ? find_task_by_pid(pid) : current;
4301 }
4302
4303 /* Actually do priority change: must hold rq lock. */
4304 static void __setscheduler(struct task_struct *p, int policy, int prio)
4305 {
4306         BUG_ON(p->array);
4307
4308         p->policy = policy;
4309         p->rt_priority = prio;
4310         p->normal_prio = normal_prio(p);
4311         /* we are holding p->pi_lock already */
4312         p->prio = rt_mutex_getprio(p);
4313         /*
4314          * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4315          */
4316         if (policy == SCHED_BATCH)
4317                 p->sleep_avg = 0;
4318         set_load_weight(p);
4319 }
4320
4321 /**
4322  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4323  * @p: the task in question.
4324  * @policy: new policy.
4325  * @param: structure containing the new RT priority.
4326  *
4327  * NOTE that the task may be already dead.
4328  */
4329 int sched_setscheduler(struct task_struct *p, int policy,
4330                        struct sched_param *param)
4331 {
4332         int retval, oldprio, oldpolicy = -1;
4333         struct prio_array *array;
4334         unsigned long flags;
4335         struct rq *rq;
4336
4337         /* may grab non-irq protected spin_locks */
4338         BUG_ON(in_interrupt());
4339 recheck:
4340         /* double check policy once rq lock held */
4341         if (policy < 0)
4342                 policy = oldpolicy = p->policy;
4343         else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4344                         policy != SCHED_NORMAL && policy != SCHED_BATCH)
4345                 return -EINVAL;
4346         /*
4347          * Valid priorities for SCHED_FIFO and SCHED_RR are
4348          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
4349          * SCHED_BATCH is 0.
4350          */
4351         if (param->sched_priority < 0 ||
4352             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4353             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4354                 return -EINVAL;
4355         if (is_rt_policy(policy) != (param->sched_priority != 0))
4356                 return -EINVAL;
4357
4358         /*
4359          * Allow unprivileged RT tasks to decrease priority:
4360          */
4361         if (!capable(CAP_SYS_NICE)) {
4362                 if (is_rt_policy(policy)) {
4363                         unsigned long rlim_rtprio;
4364                         unsigned long flags;
4365
4366                         if (!lock_task_sighand(p, &flags))
4367                                 return -ESRCH;
4368                         rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4369                         unlock_task_sighand(p, &flags);
4370
4371                         /* can't set/change the rt policy */
4372                         if (policy != p->policy && !rlim_rtprio)
4373                                 return -EPERM;
4374
4375                         /* can't increase priority */
4376                         if (param->sched_priority > p->rt_priority &&
4377                             param->sched_priority > rlim_rtprio)
4378                                 return -EPERM;
4379                 }
4380
4381                 /* can't change other user's priorities */
4382                 if ((current->euid != p->euid) &&
4383                     (current->euid != p->uid))
4384                         return -EPERM;
4385         }
4386
4387         retval = security_task_setscheduler(p, policy, param);
4388         if (retval)
4389                 return retval;
4390         /*
4391          * make sure no PI-waiters arrive (or leave) while we are
4392          * changing the priority of the task:
4393          */
4394         spin_lock_irqsave(&p->pi_lock, flags);
4395         /*
4396          * To be able to change p->policy safely, the apropriate
4397          * runqueue lock must be held.
4398          */
4399         rq = __task_rq_lock(p);
4400         /* recheck policy now with rq lock held */
4401         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4402                 policy = oldpolicy = -1;
4403                 __task_rq_unlock(rq);
4404                 spin_unlock_irqrestore(&p->pi_lock, flags);
4405                 goto recheck;
4406         }
4407         array = p->array;
4408         if (array)
4409                 deactivate_task(p, rq);
4410         oldprio = p->prio;
4411         __setscheduler(p, policy, param->sched_priority);
4412         if (array) {
4413                 __activate_task(p, rq);
4414                 /*
4415                  * Reschedule if we are currently running on this runqueue and
4416                  * our priority decreased, or if we are not currently running on
4417                  * this runqueue and our priority is higher than the current's
4418                  */
4419                 if (task_running(rq, p)) {
4420                         if (p->prio > oldprio)
4421                                 resched_task(rq->curr);
4422                 } else if (TASK_PREEMPTS_CURR(p, rq))
4423                         resched_task(rq->curr);
4424         }
4425         __task_rq_unlock(rq);
4426         spin_unlock_irqrestore(&p->pi_lock, flags);
4427
4428         rt_mutex_adjust_pi(p);
4429
4430         return 0;
4431 }
4432 EXPORT_SYMBOL_GPL(sched_setscheduler);
4433
4434 static int
4435 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4436 {
4437         struct sched_param lparam;
4438         struct task_struct *p;
4439         int retval;
4440
4441         if (!param || pid < 0)
4442                 return -EINVAL;
4443         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4444                 return -EFAULT;
4445
4446         rcu_read_lock();
4447         retval = -ESRCH;
4448         p = find_process_by_pid(pid);
4449         if (p != NULL)
4450                 retval = sched_setscheduler(p, policy, &lparam);
4451         rcu_read_unlock();
4452
4453         return retval;
4454 }
4455
4456 /**
4457  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4458  * @pid: the pid in question.
4459  * @policy: new policy.
4460  * @param: structure containing the new RT priority.
4461  */
4462 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4463                                        struct sched_param __user *param)
4464 {
4465         /* negative values for policy are not valid */
4466         if (policy < 0)
4467                 return -EINVAL;
4468
4469         return do_sched_setscheduler(pid, policy, param);
4470 }
4471
4472 /**
4473  * sys_sched_setparam - set/change the RT priority of a thread
4474  * @pid: the pid in question.
4475  * @param: structure containing the new RT priority.
4476  */
4477 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4478 {
4479         return do_sched_setscheduler(pid, -1, param);
4480 }
4481
4482 /**
4483  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4484  * @pid: the pid in question.
4485  */
4486 asmlinkage long sys_sched_getscheduler(pid_t pid)
4487 {
4488         struct task_struct *p;
4489         int retval = -EINVAL;
4490
4491         if (pid < 0)
4492                 goto out_nounlock;
4493
4494         retval = -ESRCH;
4495         read_lock(&tasklist_lock);
4496         p = find_process_by_pid(pid);
4497         if (p) {
4498                 retval = security_task_getscheduler(p);
4499                 if (!retval)
4500                         retval = p->policy;
4501         }
4502         read_unlock(&tasklist_lock);
4503
4504 out_nounlock:
4505         return retval;
4506 }
4507
4508 /**
4509  * sys_sched_getscheduler - get the RT priority of a thread
4510  * @pid: the pid in question.
4511  * @param: structure containing the RT priority.
4512  */
4513 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4514 {
4515         struct sched_param lp;
4516         struct task_struct *p;
4517         int retval = -EINVAL;
4518
4519         if (!param || pid < 0)
4520                 goto out_nounlock;
4521
4522         read_lock(&tasklist_lock);
4523         p = find_process_by_pid(pid);
4524         retval = -ESRCH;
4525         if (!p)
4526                 goto out_unlock;
4527
4528         retval = security_task_getscheduler(p);
4529         if (retval)
4530                 goto out_unlock;
4531
4532         lp.sched_priority = p->rt_priority;
4533         read_unlock(&tasklist_lock);
4534
4535         /*
4536          * This one might sleep, we cannot do it with a spinlock held ...
4537          */
4538         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4539
4540 out_nounlock:
4541         return retval;
4542
4543 out_unlock:
4544         read_unlock(&tasklist_lock);
4545         return retval;
4546 }
4547
4548 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4549 {
4550         cpumask_t cpus_allowed;
4551         struct task_struct *p;
4552         int retval;
4553
4554         mutex_lock(&sched_hotcpu_mutex);
4555         read_lock(&tasklist_lock);
4556
4557         p = find_process_by_pid(pid);
4558         if (!p) {
4559                 read_unlock(&tasklist_lock);
4560                 mutex_unlock(&sched_hotcpu_mutex);
4561                 return -ESRCH;
4562         }
4563
4564         /*
4565          * It is not safe to call set_cpus_allowed with the
4566          * tasklist_lock held.  We will bump the task_struct's
4567          * usage count and then drop tasklist_lock.
4568          */
4569         get_task_struct(p);
4570         read_unlock(&tasklist_lock);
4571
4572         retval = -EPERM;
4573         if ((current->euid != p->euid) && (current->euid != p->uid) &&
4574                         !capable(CAP_SYS_NICE))
4575                 goto out_unlock;
4576
4577         retval = security_task_setscheduler(p, 0, NULL);
4578         if (retval)
4579                 goto out_unlock;
4580
4581         cpus_allowed = cpuset_cpus_allowed(p);
4582         cpus_and(new_mask, new_mask, cpus_allowed);
4583         retval = set_cpus_allowed(p, new_mask);
4584
4585 out_unlock:
4586         put_task_struct(p);
4587         mutex_unlock(&sched_hotcpu_mutex);
4588         return retval;
4589 }
4590
4591 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4592                              cpumask_t *new_mask)
4593 {
4594         if (len < sizeof(cpumask_t)) {
4595                 memset(new_mask, 0, sizeof(cpumask_t));
4596         } else if (len > sizeof(cpumask_t)) {
4597                 len = sizeof(cpumask_t);
4598         }
4599         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4600 }
4601
4602 /**
4603  * sys_sched_setaffinity - set the cpu affinity of a process
4604  * @pid: pid of the process
4605  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4606  * @user_mask_ptr: user-space pointer to the new cpu mask
4607  */
4608 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4609                                       unsigned long __user *user_mask_ptr)
4610 {
4611         cpumask_t new_mask;
4612         int retval;
4613
4614         retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4615         if (retval)
4616                 return retval;
4617
4618         return sched_setaffinity(pid, new_mask);
4619 }
4620
4621 /*
4622  * Represents all cpu's present in the system
4623  * In systems capable of hotplug, this map could dynamically grow
4624  * as new cpu's are detected in the system via any platform specific
4625  * method, such as ACPI for e.g.
4626  */
4627
4628 cpumask_t cpu_present_map __read_mostly;
4629 EXPORT_SYMBOL(cpu_present_map);
4630
4631 #ifndef CONFIG_SMP
4632 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4633 EXPORT_SYMBOL(cpu_online_map);
4634
4635 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4636 EXPORT_SYMBOL(cpu_possible_map);
4637 #endif
4638
4639 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4640 {
4641         struct task_struct *p;
4642         int retval;
4643
4644         mutex_lock(&sched_hotcpu_mutex);
4645         read_lock(&tasklist_lock);
4646
4647         retval = -ESRCH;
4648         p = find_process_by_pid(pid);
4649         if (!p)
4650                 goto out_unlock;
4651
4652         retval = security_task_getscheduler(p);
4653         if (retval)
4654                 goto out_unlock;
4655
4656         cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4657
4658 out_unlock:
4659         read_unlock(&tasklist_lock);
4660         mutex_unlock(&sched_hotcpu_mutex);
4661         if (retval)
4662                 return retval;
4663
4664         return 0;
4665 }
4666
4667 /**
4668  * sys_sched_getaffinity - get the cpu affinity of a process
4669  * @pid: pid of the process
4670  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4671  * @user_mask_ptr: user-space pointer to hold the current cpu mask
4672  */
4673 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4674                                       unsigned long __user *user_mask_ptr)
4675 {
4676         int ret;
4677         cpumask_t mask;
4678
4679         if (len < sizeof(cpumask_t))
4680                 return -EINVAL;
4681
4682         ret = sched_getaffinity(pid, &mask);
4683         if (ret < 0)
4684                 return ret;
4685
4686         if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4687                 return -EFAULT;
4688
4689         return sizeof(cpumask_t);
4690 }
4691
4692 /**
4693  * sys_sched_yield - yield the current processor to other threads.
4694  *
4695  * This function yields the current CPU by moving the calling thread
4696  * to the expired array. If there are no other threads running on this
4697  * CPU then this function will return.
4698  */
4699 asmlinkage long sys_sched_yield(void)
4700 {
4701         struct rq *rq = this_rq_lock();
4702         struct prio_array *array = current->array, *target = rq->expired;
4703
4704         schedstat_inc(rq, yld_cnt);
4705         /*
4706          * We implement yielding by moving the task into the expired
4707          * queue.
4708          *
4709          * (special rule: RT tasks will just roundrobin in the active
4710          *  array.)
4711          */
4712         if (rt_task(current))
4713                 target = rq->active;
4714
4715         if (array->nr_active == 1) {
4716                 schedstat_inc(rq, yld_act_empty);
4717                 if (!rq->expired->nr_active)
4718                         schedstat_inc(rq, yld_both_empty);
4719         } else if (!rq->expired->nr_active)
4720                 schedstat_inc(rq, yld_exp_empty);
4721
4722         if (array != target) {
4723                 dequeue_task(current, array);
4724                 enqueue_task(current, target);
4725         } else
4726                 /*
4727                  * requeue_task is cheaper so perform that if possible.
4728                  */
4729                 requeue_task(current, array);
4730
4731         /*
4732          * Since we are going to call schedule() anyway, there's
4733          * no need to preempt or enable interrupts:
4734          */
4735         __release(rq->lock);
4736         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4737         _raw_spin_unlock(&rq->lock);
4738         preempt_enable_no_resched();
4739
4740         schedule();
4741
4742         return 0;
4743 }
4744
4745 static void __cond_resched(void)
4746 {
4747 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4748         __might_sleep(__FILE__, __LINE__);
4749 #endif
4750         /*
4751          * The BKS might be reacquired before we have dropped
4752          * PREEMPT_ACTIVE, which could trigger a second
4753          * cond_resched() call.
4754          */
4755         do {
4756                 add_preempt_count(PREEMPT_ACTIVE);
4757                 schedule();
4758                 sub_preempt_count(PREEMPT_ACTIVE);
4759         } while (need_resched());
4760 }
4761
4762 int __sched cond_resched(void)
4763 {
4764         if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4765                                         system_state == SYSTEM_RUNNING) {
4766                 __cond_resched();
4767                 return 1;
4768         }
4769         return 0;
4770 }
4771 EXPORT_SYMBOL(cond_resched);
4772
4773 /*
4774  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4775  * call schedule, and on return reacquire the lock.
4776  *
4777  * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
4778  * operations here to prevent schedule() from being called twice (once via
4779  * spin_unlock(), once by hand).
4780  */
4781 int cond_resched_lock(spinlock_t *lock)
4782 {
4783         int ret = 0;
4784
4785         if (need_lockbreak(lock)) {
4786                 spin_unlock(lock);
4787                 cpu_relax();
4788                 ret = 1;
4789                 spin_lock(lock);
4790         }
4791         if (need_resched() && system_state == SYSTEM_RUNNING) {
4792                 spin_release(&lock->dep_map, 1, _THIS_IP_);
4793                 _raw_spin_unlock(lock);
4794                 preempt_enable_no_resched();
4795                 __cond_resched();
4796                 ret = 1;
4797                 spin_lock(lock);
4798         }
4799         return ret;
4800 }
4801 EXPORT_SYMBOL(cond_resched_lock);
4802
4803 int __sched cond_resched_softirq(void)
4804 {
4805         BUG_ON(!in_softirq());
4806
4807         if (need_resched() && system_state == SYSTEM_RUNNING) {
4808                 local_bh_enable();
4809                 __cond_resched();
4810                 local_bh_disable();
4811                 return 1;
4812         }
4813         return 0;
4814 }
4815 EXPORT_SYMBOL(cond_resched_softirq);
4816
4817 /**
4818  * yield - yield the current processor to other threads.
4819  *
4820  * This is a shortcut for kernel-space yielding - it marks the
4821  * thread runnable and calls sys_sched_yield().
4822  */
4823 void __sched yield(void)
4824 {
4825         set_current_state(TASK_RUNNING);
4826         sys_sched_yield();
4827 }
4828 EXPORT_SYMBOL(yield);
4829
4830 /*
4831  * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
4832  * that process accounting knows that this is a task in IO wait state.
4833  *
4834  * But don't do that if it is a deliberate, throttling IO wait (this task
4835  * has set its backing_dev_info: the queue against which it should throttle)
4836  */
4837 void __sched io_schedule(void)
4838 {
4839         struct rq *rq = &__raw_get_cpu_var(runqueues);
4840
4841         delayacct_blkio_start();
4842         atomic_inc(&rq->nr_iowait);
4843         schedule();
4844         atomic_dec(&rq->nr_iowait);
4845         delayacct_blkio_end();
4846 }
4847 EXPORT_SYMBOL(io_schedule);
4848
4849 long __sched io_schedule_timeout(long timeout)
4850 {
4851         struct rq *rq = &__raw_get_cpu_var(runqueues);
4852         long ret;
4853
4854         delayacct_blkio_start();
4855         atomic_inc(&rq->nr_iowait);
4856         ret = schedule_timeout(timeout);
4857         atomic_dec(&rq->nr_iowait);
4858         delayacct_blkio_end();
4859         return ret;
4860 }
4861
4862 /**
4863  * sys_sched_get_priority_max - return maximum RT priority.
4864  * @policy: scheduling class.
4865  *
4866  * this syscall returns the maximum rt_priority that can be used
4867  * by a given scheduling class.
4868  */
4869 asmlinkage long sys_sched_get_priority_max(int policy)
4870 {
4871         int ret = -EINVAL;
4872
4873         switch (policy) {
4874         case SCHED_FIFO:
4875         case SCHED_RR:
4876                 ret = MAX_USER_RT_PRIO-1;
4877                 break;
4878         case SCHED_NORMAL:
4879         case SCHED_BATCH:
4880                 ret = 0;
4881                 break;
4882         }
4883         return ret;
4884 }
4885
4886 /**
4887  * sys_sched_get_priority_min - return minimum RT priority.
4888  * @policy: scheduling class.
4889  *
4890  * this syscall returns the minimum rt_priority that can be used
4891  * by a given scheduling class.
4892  */
4893 asmlinkage long sys_sched_get_priority_min(int policy)
4894 {
4895         int ret = -EINVAL;
4896
4897         switch (policy) {
4898         case SCHED_FIFO:
4899         case SCHED_RR:
4900                 ret = 1;
4901                 break;
4902         case SCHED_NORMAL:
4903         case SCHED_BATCH:
4904                 ret = 0;
4905         }
4906         return ret;
4907 }
4908
4909 /**
4910  * sys_sched_rr_get_interval - return the default timeslice of a process.
4911  * @pid: pid of the process.
4912  * @interval: userspace pointer to the timeslice value.
4913  *
4914  * this syscall writes the default timeslice value of a given process
4915  * into the user-space timespec buffer. A value of '0' means infinity.
4916  */
4917 asmlinkage
4918 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4919 {
4920         struct task_struct *p;
4921         int retval = -EINVAL;
4922         struct timespec t;
4923
4924         if (pid < 0)
4925                 goto out_nounlock;
4926
4927         retval = -ESRCH;
4928         read_lock(&tasklist_lock);
4929         p = find_process_by_pid(pid);
4930         if (!p)
4931                 goto out_unlock;
4932
4933         retval = security_task_getscheduler(p);
4934         if (retval)
4935                 goto out_unlock;
4936
4937         jiffies_to_timespec(p->policy == SCHED_FIFO ?
4938                                 0 : task_timeslice(p), &t);
4939         read_unlock(&tasklist_lock);
4940         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4941 out_nounlock:
4942         return retval;
4943 out_unlock:
4944         read_unlock(&tasklist_lock);
4945         return retval;
4946 }
4947
4948 static const char stat_nam[] = "RSDTtZX";
4949
4950 static void show_task(struct task_struct *p)
4951 {
4952         unsigned long free = 0;
4953         unsigned state;
4954
4955         state = p->state ? __ffs(p->state) + 1 : 0;
4956         printk("%-13.13s %c", p->comm,
4957                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4958 #if (BITS_PER_LONG == 32)
4959         if (state == TASK_RUNNING)
4960                 printk(" running ");
4961         else
4962                 printk(" %08lX ", thread_saved_pc(p));
4963 #else
4964         if (state == TASK_RUNNING)
4965                 printk("  running task   ");
4966         else
4967                 printk(" %016lx ", thread_saved_pc(p));
4968 #endif
4969 #ifdef CONFIG_DEBUG_STACK_USAGE
4970         {
4971                 unsigned long *n = end_of_stack(p);
4972                 while (!*n)
4973                         n++;
4974                 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4975         }
4976 #endif
4977         printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4978         if (!p->mm)
4979                 printk(" (L-TLB)\n");
4980         else
4981                 printk(" (NOTLB)\n");
4982
4983         if (state != TASK_RUNNING)
4984                 show_stack(p, NULL);
4985 }
4986
4987 void show_state_filter(unsigned long state_filter)
4988 {
4989         struct task_struct *g, *p;
4990
4991 #if (BITS_PER_LONG == 32)
4992         printk("\n"
4993                "                         free                        sibling\n");
4994         printk("  task             PC    stack   pid father child younger older\n");
4995 #else
4996         printk("\n"
4997                "                                 free                        sibling\n");
4998         printk("  task                 PC        stack   pid father child younger older\n");
4999 #endif
5000         read_lock(&tasklist_lock);
5001         do_each_thread(g, p) {
5002                 /*
5003                  * reset the NMI-timeout, listing all files on a slow
5004                  * console might take alot of time:
5005                  */
5006                 touch_nmi_watchdog();
5007                 if (!state_filter || (p->state & state_filter))
5008                         show_task(p);
5009         } while_each_thread(g, p);
5010
5011         touch_all_softlockup_watchdogs();
5012
5013         read_unlock(&tasklist_lock);
5014         /*
5015          * Only show locks if all tasks are dumped:
5016          */
5017         if (state_filter == -1)
5018                 debug_show_all_locks();
5019 }
5020
5021 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5022 {
5023         /* nothing yet */
5024 }
5025
5026 /**
5027  * init_idle - set up an idle thread for a given CPU
5028  * @idle: task in question
5029  * @cpu: cpu the idle task belongs to
5030  *
5031  * NOTE: this function does not set the idle thread's NEED_RESCHED
5032  * flag, to make booting more robust.
5033  */
5034 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5035 {
5036         struct rq *rq = cpu_rq(cpu);
5037         unsigned long flags;
5038
5039         idle->timestamp = sched_clock();
5040         idle->sleep_avg = 0;
5041         idle->array = NULL;
5042         idle->prio = idle->normal_prio = MAX_PRIO;
5043         idle->state = TASK_RUNNING;
5044         idle->cpus_allowed = cpumask_of_cpu(cpu);
5045         set_task_cpu(idle, cpu);
5046
5047         spin_lock_irqsave(&rq->lock, flags);
5048         rq->curr = rq->idle = idle;
5049 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5050         idle->oncpu = 1;
5051 #endif
5052         spin_unlock_irqrestore(&rq->lock, flags);
5053
5054         /* Set the preempt count _outside_ the spinlocks! */
5055 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
5056         task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5057 #else
5058         task_thread_info(idle)->preempt_count = 0;
5059 #endif
5060 }
5061
5062 /*
5063  * In a system that switches off the HZ timer nohz_cpu_mask
5064  * indicates which cpus entered this state. This is used
5065  * in the rcu update to wait only for active cpus. For system
5066  * which do not switch off the HZ timer nohz_cpu_mask should
5067  * always be CPU_MASK_NONE.
5068  */
5069 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5070
5071 #ifdef CONFIG_SMP
5072 /*
5073  * This is how migration works:
5074  *
5075  * 1) we queue a struct migration_req structure in the source CPU's
5076  *    runqueue and wake up that CPU's migration thread.
5077  * 2) we down() the locked semaphore => thread blocks.
5078  * 3) migration thread wakes up (implicitly it forces the migrated
5079  *    thread off the CPU)
5080  * 4) it gets the migration request and checks whether the migrated
5081  *    task is still in the wrong runqueue.
5082  * 5) if it's in the wrong runqueue then the migration thread removes
5083  *    it and puts it into the right queue.
5084  * 6) migration thread up()s the semaphore.
5085  * 7) we wake up and the migration is done.
5086  */
5087
5088 /*
5089  * Change a given task's CPU affinity. Migrate the thread to a
5090  * proper CPU and schedule it away if the CPU it's executing on
5091  * is removed from the allowed bitmask.
5092  *
5093  * NOTE: the caller must have a valid reference to the task, the
5094  * task must not exit() & deallocate itself prematurely.  The
5095  * call is not atomic; no spinlocks may be held.
5096  */
5097 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5098 {
5099         struct migration_req req;
5100         unsigned long flags;
5101         struct rq *rq;
5102         int ret = 0;
5103
5104         rq = task_rq_lock(p, &flags);
5105         if (!cpus_intersects(new_mask, cpu_online_map)) {
5106                 ret = -EINVAL;
5107                 goto out;
5108         }
5109
5110         p->cpus_allowed = new_mask;
5111         /* Can the task run on the task's current CPU? If so, we're done */
5112         if (cpu_isset(task_cpu(p), new_mask))
5113                 goto out;
5114
5115         if (migrate_task(p, any_online_cpu(new_mask), &req)) {
5116                 /* Need help from migration thread: drop lock and wait. */
5117                 task_rq_unlock(rq, &flags);
5118                 wake_up_process(rq->migration_thread);
5119                 wait_for_completion(&req.done);
5120                 tlb_migrate_finish(p->mm);
5121                 return 0;
5122         }
5123 out:
5124         task_rq_unlock(rq, &flags);
5125
5126         return ret;
5127 }
5128 EXPORT_SYMBOL_GPL(set_cpus_allowed);
5129
5130 /*
5131  * Move (not current) task off this cpu, onto dest cpu.  We're doing
5132  * this because either it can't run here any more (set_cpus_allowed()
5133  * away from this CPU, or CPU going down), or because we're
5134  * attempting to rebalance this task on exec (sched_exec).
5135  *
5136  * So we race with normal scheduler movements, but that's OK, as long
5137  * as the task is no longer on this CPU.
5138  *
5139  * Returns non-zero if task was successfully migrated.
5140  */
5141 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5142 {
5143         struct rq *rq_dest, *rq_src;
5144         int ret = 0;
5145
5146         if (unlikely(cpu_is_offline(dest_cpu)))
5147                 return ret;
5148
5149         rq_src = cpu_rq(src_cpu);
5150         rq_dest = cpu_rq(dest_cpu);
5151
5152         double_rq_lock(rq_src, rq_dest);
5153         /* Already moved. */
5154         if (task_cpu(p) != src_cpu)
5155                 goto out;
5156         /* Affinity changed (again). */
5157         if (!cpu_isset(dest_cpu, p->cpus_allowed))
5158                 goto out;
5159
5160         set_task_cpu(p, dest_cpu);
5161         if (p->array) {
5162                 /*
5163                  * Sync timestamp with rq_dest's before activating.
5164                  * The same thing could be achieved by doing this step
5165                  * afterwards, and pretending it was a local activate.
5166                  * This way is cleaner and logically correct.
5167                  */
5168                 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
5169                                 + rq_dest->most_recent_timestamp;
5170                 deactivate_task(p, rq_src);
5171                 __activate_task(p, rq_dest);
5172                 if (TASK_PREEMPTS_CURR(p, rq_dest))
5173                         resched_task(rq_dest->curr);
5174         }
5175         ret = 1;
5176 out:
5177         double_rq_unlock(rq_src, rq_dest);
5178         return ret;
5179 }
5180
5181 /*
5182  * migration_thread - this is a highprio system thread that performs
5183  * thread migration by bumping thread off CPU then 'pushing' onto
5184  * another runqueue.
5185  */
5186 static int migration_thread(void *data)
5187 {
5188         int cpu = (long)data;
5189         struct rq *rq;
5190
5191         rq = cpu_rq(cpu);
5192         BUG_ON(rq->migration_thread != current);
5193
5194         set_current_state(TASK_INTERRUPTIBLE);
5195         while (!kthread_should_stop()) {
5196                 struct migration_req *req;
5197                 struct list_head *head;
5198
5199                 try_to_freeze();
5200
5201                 spin_lock_irq(&rq->lock);
5202
5203                 if (cpu_is_offline(cpu)) {
5204                         spin_unlock_irq(&rq->lock);
5205                         goto wait_to_die;
5206                 }
5207
5208                 if (rq->active_balance) {
5209                         active_load_balance(rq, cpu);
5210                         rq->active_balance = 0;
5211                 }
5212
5213                 head = &rq->migration_queue;
5214
5215                 if (list_empty(head)) {
5216                         spin_unlock_irq(&rq->lock);
5217                         schedule();
5218                         set_current_state(TASK_INTERRUPTIBLE);
5219                         continue;
5220                 }
5221                 req = list_entry(head->next, struct migration_req, list);
5222                 list_del_init(head->next);
5223
5224                 spin_unlock(&rq->lock);
5225                 __migrate_task(req->task, cpu, req->dest_cpu);
5226                 local_irq_enable();
5227
5228                 complete(&req->done);
5229         }
5230         __set_current_state(TASK_RUNNING);
5231         return 0;
5232
5233 wait_to_die:
5234         /* Wait for kthread_stop */
5235         set_current_state(TASK_INTERRUPTIBLE);
5236         while (!kthread_should_stop()) {
5237                 schedule();
5238                 set_current_state(TASK_INTERRUPTIBLE);
5239         }
5240         __set_current_state(TASK_RUNNING);
5241         return 0;
5242 }
5243
5244 #ifdef CONFIG_HOTPLUG_CPU
5245 /*
5246  * Figure out where task on dead CPU should go, use force if neccessary.
5247  * NOTE: interrupts should be disabled by the caller
5248  */
5249 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5250 {
5251         unsigned long flags;
5252         cpumask_t mask;
5253         struct rq *rq;
5254         int dest_cpu;
5255
5256 restart:
5257         /* On same node? */
5258         mask = node_to_cpumask(cpu_to_node(dead_cpu));
5259         cpus_and(mask, mask, p->cpus_allowed);
5260         dest_cpu = any_online_cpu(mask);
5261
5262         /* On any allowed CPU? */
5263         if (dest_cpu == NR_CPUS)
5264                 dest_cpu = any_online_cpu(p->cpus_allowed);
5265
5266         /* No more Mr. Nice Guy. */
5267         if (dest_cpu == NR_CPUS) {
5268                 rq = task_rq_lock(p, &flags);
5269                 cpus_setall(p->cpus_allowed);
5270                 dest_cpu = any_online_cpu(p->cpus_allowed);
5271                 task_rq_unlock(rq, &flags);
5272
5273                 /*
5274                  * Don't tell them about moving exiting tasks or
5275                  * kernel threads (both mm NULL), since they never
5276                  * leave kernel.
5277                  */
5278                 if (p->mm && printk_ratelimit())
5279                         printk(KERN_INFO "process %d (%s) no "
5280                                "longer affine to cpu%d\n",
5281                                p->pid, p->comm, dead_cpu);
5282         }
5283         if (!__migrate_task(p, dead_cpu, dest_cpu))
5284                 goto restart;
5285 }
5286
5287 /*
5288  * While a dead CPU has no uninterruptible tasks queued at this point,
5289  * it might still have a nonzero ->nr_uninterruptible counter, because
5290  * for performance reasons the counter is not stricly tracking tasks to
5291  * their home CPUs. So we just add the counter to another CPU's counter,
5292  * to keep the global sum constant after CPU-down:
5293  */
5294 static void migrate_nr_uninterruptible(struct rq *rq_src)
5295 {
5296         struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5297         unsigned long flags;
5298
5299         local_irq_save(flags);
5300         double_rq_lock(rq_src, rq_dest);
5301         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5302         rq_src->nr_uninterruptible = 0;
5303         double_rq_unlock(rq_src, rq_dest);
5304         local_irq_restore(flags);
5305 }
5306
5307 /* Run through task list and migrate tasks from the dead cpu. */
5308 static void migrate_live_tasks(int src_cpu)
5309 {
5310         struct task_struct *p, *t;
5311
5312         write_lock_irq(&tasklist_lock);
5313
5314         do_each_thread(t, p) {
5315                 if (p == current)
5316                         continue;
5317
5318                 if (task_cpu(p) == src_cpu)
5319                         move_task_off_dead_cpu(src_cpu, p);
5320         } while_each_thread(t, p);
5321
5322         write_unlock_irq(&tasklist_lock);
5323 }
5324
5325 /* Schedules idle task to be the next runnable task on current CPU.
5326  * It does so by boosting its priority to highest possible and adding it to
5327  * the _front_ of the runqueue. Used by CPU offline code.
5328  */
5329 void sched_idle_next(void)
5330 {
5331         int this_cpu = smp_processor_id();
5332         struct rq *rq = cpu_rq(this_cpu);
5333         struct task_struct *p = rq->idle;
5334         unsigned long flags;
5335
5336         /* cpu has to be offline */
5337         BUG_ON(cpu_online(this_cpu));
5338
5339         /*
5340          * Strictly not necessary since rest of the CPUs are stopped by now
5341          * and interrupts disabled on the current cpu.
5342          */
5343         spin_lock_irqsave(&rq->lock, flags);
5344
5345         __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5346
5347         /* Add idle task to the _front_ of its priority queue: */
5348         __activate_idle_task(p, rq);
5349
5350         spin_unlock_irqrestore(&rq->lock, flags);
5351 }
5352
5353 /*
5354  * Ensures that the idle task is using init_mm right before its cpu goes
5355  * offline.
5356  */
5357 void idle_task_exit(void)
5358 {
5359         struct mm_struct *mm = current->active_mm;
5360
5361         BUG_ON(cpu_online(smp_processor_id()));
5362
5363         if (mm != &init_mm)
5364                 switch_mm(mm, &init_mm, current);
5365         mmdrop(mm);
5366 }
5367
5368 /* called under rq->lock with disabled interrupts */
5369 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5370 {
5371         struct rq *rq = cpu_rq(dead_cpu);
5372
5373         /* Must be exiting, otherwise would be on tasklist. */
5374         BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5375
5376         /* Cannot have done final schedule yet: would have vanished. */
5377         BUG_ON(p->state == TASK_DEAD);
5378
5379         get_task_struct(p);
5380
5381         /*
5382          * Drop lock around migration; if someone else moves it,
5383          * that's OK.  No task can be added to this CPU, so iteration is
5384          * fine.
5385          * NOTE: interrupts should be left disabled  --dev@
5386          */
5387         spin_unlock(&rq->lock);
5388         move_task_off_dead_cpu(dead_cpu, p);
5389         spin_lock(&rq->lock);
5390
5391         put_task_struct(p);
5392 }
5393
5394 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5395 static void migrate_dead_tasks(unsigned int dead_cpu)
5396 {
5397         struct rq *rq = cpu_rq(dead_cpu);
5398         unsigned int arr, i;
5399
5400         for (arr = 0; arr < 2; arr++) {
5401                 for (i = 0; i < MAX_PRIO; i++) {
5402                         struct list_head *list = &rq->arrays[arr].queue[i];
5403
5404                         while (!list_empty(list))
5405                                 migrate_dead(dead_cpu, list_entry(list->next,
5406                                              struct task_struct, run_list));
5407                 }
5408         }
5409 }
5410 #endif /* CONFIG_HOTPLUG_CPU */
5411
5412 /*
5413  * migration_call - callback that gets triggered when a CPU is added.
5414  * Here we can start up the necessary migration thread for the new CPU.
5415  */
5416 static int __cpuinit
5417 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5418 {
5419         struct task_struct *p;
5420         int cpu = (long)hcpu;
5421         unsigned long flags;
5422         struct rq *rq;
5423
5424         switch (action) {
5425         case CPU_LOCK_ACQUIRE:
5426                 mutex_lock(&sched_hotcpu_mutex);
5427                 break;
5428
5429         case CPU_UP_PREPARE:
5430         case CPU_UP_PREPARE_FROZEN:
5431                 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5432                 if (IS_ERR(p))
5433                         return NOTIFY_BAD;
5434                 p->flags |= PF_NOFREEZE;
5435                 kthread_bind(p, cpu);
5436                 /* Must be high prio: stop_machine expects to yield to it. */
5437                 rq = task_rq_lock(p, &flags);
5438                 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5439                 task_rq_unlock(rq, &flags);
5440                 cpu_rq(cpu)->migration_thread = p;
5441                 break;
5442
5443         case CPU_ONLINE:
5444         case CPU_ONLINE_FROZEN:
5445                 /* Strictly unneccessary, as first user will wake it. */
5446                 wake_up_process(cpu_rq(cpu)->migration_thread);
5447                 break;
5448
5449 #ifdef CONFIG_HOTPLUG_CPU
5450         case CPU_UP_CANCELED:
5451         case CPU_UP_CANCELED_FROZEN:
5452                 if (!cpu_rq(cpu)->migration_thread)
5453                         break;
5454                 /* Unbind it from offline cpu so it can run.  Fall thru. */
5455                 kthread_bind(cpu_rq(cpu)->migration_thread,
5456                              any_online_cpu(cpu_online_map));
5457                 kthread_stop(cpu_rq(cpu)->migration_thread);
5458                 cpu_rq(cpu)->migration_thread = NULL;
5459                 break;
5460
5461         case CPU_DEAD:
5462         case CPU_DEAD_FROZEN:
5463                 migrate_live_tasks(cpu);
5464                 rq = cpu_rq(cpu);
5465                 kthread_stop(rq->migration_thread);
5466                 rq->migration_thread = NULL;
5467                 /* Idle task back to normal (off runqueue, low prio) */
5468                 rq = task_rq_lock(rq->idle, &flags);
5469                 deactivate_task(rq->idle, rq);
5470                 rq->idle->static_prio = MAX_PRIO;
5471                 __setscheduler(rq->idle, SCHED_NORMAL, 0);
5472                 migrate_dead_tasks(cpu);
5473                 task_rq_unlock(rq, &flags);
5474                 migrate_nr_uninterruptible(rq);
5475                 BUG_ON(rq->nr_running != 0);
5476
5477                 /* No need to migrate the tasks: it was best-effort if
5478                  * they didn't take sched_hotcpu_mutex.  Just wake up
5479                  * the requestors. */
5480                 spin_lock_irq(&rq->lock);
5481                 while (!list_empty(&rq->migration_queue)) {
5482                         struct migration_req *req;
5483
5484                         req = list_entry(rq->migration_queue.next,
5485                                          struct migration_req, list);
5486                         list_del_init(&req->list);
5487                         complete(&req->done);
5488                 }
5489                 spin_unlock_irq(&rq->lock);
5490                 break;
5491 #endif
5492         case CPU_LOCK_RELEASE:
5493                 mutex_unlock(&sched_hotcpu_mutex);
5494                 break;
5495         }
5496         return NOTIFY_OK;
5497 }
5498
5499 /* Register at highest priority so that task migration (migrate_all_tasks)
5500  * happens before everything else.
5501  */
5502 static struct notifier_block __cpuinitdata migration_notifier = {
5503         .notifier_call = migration_call,
5504         .priority = 10
5505 };
5506
5507 int __init migration_init(void)
5508 {
5509         void *cpu = (void *)(long)smp_processor_id();
5510         int err;
5511
5512         /* Start one for the boot CPU: */
5513         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5514         BUG_ON(err == NOTIFY_BAD);
5515         migration_call(&migration_notifier, CPU_ONLINE, cpu);
5516         register_cpu_notifier(&migration_notifier);
5517
5518         return 0;
5519 }
5520 #endif
5521
5522 #ifdef CONFIG_SMP
5523
5524 /* Number of possible processor ids */
5525 int nr_cpu_ids __read_mostly = NR_CPUS;
5526 EXPORT_SYMBOL(nr_cpu_ids);
5527
5528 #undef SCHED_DOMAIN_DEBUG
5529 #ifdef SCHED_DOMAIN_DEBUG
5530 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5531 {
5532         int level = 0;
5533
5534         if (!sd) {
5535                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5536                 return;
5537         }
5538
5539         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5540
5541         do {
5542                 int i;
5543                 char str[NR_CPUS];
5544                 struct sched_group *group = sd->groups;
5545                 cpumask_t groupmask;
5546
5547                 cpumask_scnprintf(str, NR_CPUS, sd->span);
5548                 cpus_clear(groupmask);
5549
5550                 printk(KERN_DEBUG);
5551                 for (i = 0; i < level + 1; i++)
5552                         printk(" ");
5553                 printk("domain %d: ", level);
5554
5555                 if (!(sd->flags & SD_LOAD_BALANCE)) {
5556                         printk("does not load-balance\n");
5557                         if (sd->parent)
5558                                 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5559                                                 " has parent");
5560                         break;
5561                 }
5562
5563                 printk("span %s\n", str);
5564
5565                 if (!cpu_isset(cpu, sd->span))
5566                         printk(KERN_ERR "ERROR: domain->span does not contain "
5567                                         "CPU%d\n", cpu);
5568                 if (!cpu_isset(cpu, group->cpumask))
5569                         printk(KERN_ERR "ERROR: domain->groups does not contain"
5570                                         " CPU%d\n", cpu);
5571
5572                 printk(KERN_DEBUG);
5573                 for (i = 0; i < level + 2; i++)
5574                         printk(" ");
5575                 printk("groups:");
5576                 do {
5577                         if (!group) {
5578                                 printk("\n");
5579                                 printk(KERN_ERR "ERROR: group is NULL\n");
5580                                 break;
5581                         }
5582
5583                         if (!group->__cpu_power) {
5584                                 printk("\n");
5585                                 printk(KERN_ERR "ERROR: domain->cpu_power not "
5586                                                 "set\n");
5587                         }
5588
5589                         if (!cpus_weight(group->cpumask)) {
5590                                 printk("\n");
5591                                 printk(KERN_ERR "ERROR: empty group\n");
5592                         }
5593
5594                         if (cpus_intersects(groupmask, group->cpumask)) {
5595                                 printk("\n");
5596                                 printk(KERN_ERR "ERROR: repeated CPUs\n");
5597                         }
5598
5599                         cpus_or(groupmask, groupmask, group->cpumask);
5600
5601                         cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5602                         printk(" %s", str);
5603
5604                         group = group->next;
5605                 } while (group != sd->groups);
5606                 printk("\n");
5607
5608                 if (!cpus_equal(sd->span, groupmask))
5609                         printk(KERN_ERR "ERROR: groups don't span "
5610                                         "domain->span\n");
5611
5612                 level++;
5613                 sd = sd->parent;
5614                 if (!sd)
5615                         continue;
5616
5617                 if (!cpus_subset(groupmask, sd->span))
5618                         printk(KERN_ERR "ERROR: parent span is not a superset "
5619                                 "of domain->span\n");
5620
5621         } while (sd);
5622 }
5623 #else
5624 # define sched_domain_debug(sd, cpu) do { } while (0)
5625 #endif
5626
5627 static int sd_degenerate(struct sched_domain *sd)
5628 {
5629         if (cpus_weight(sd->span) == 1)
5630                 return 1;
5631
5632         /* Following flags need at least 2 groups */
5633         if (sd->flags & (SD_LOAD_BALANCE |
5634                          SD_BALANCE_NEWIDLE |
5635                          SD_BALANCE_FORK |
5636                          SD_BALANCE_EXEC |
5637                          SD_SHARE_CPUPOWER |
5638                          SD_SHARE_PKG_RESOURCES)) {
5639                 if (sd->groups != sd->groups->next)
5640                         return 0;
5641         }
5642
5643         /* Following flags don't use groups */
5644         if (sd->flags & (SD_WAKE_IDLE |
5645                          SD_WAKE_AFFINE |
5646                          SD_WAKE_BALANCE))
5647                 return 0;
5648
5649         return 1;
5650 }
5651
5652 static int
5653 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5654 {
5655         unsigned long cflags = sd->flags, pflags = parent->flags;
5656
5657         if (sd_degenerate(parent))
5658                 return 1;
5659
5660         if (!cpus_equal(sd->span, parent->span))
5661                 return 0;
5662
5663         /* Does parent contain flags not in child? */
5664         /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5665         if (cflags & SD_WAKE_AFFINE)
5666                 pflags &= ~SD_WAKE_BALANCE;
5667         /* Flags needing groups don't count if only 1 group in parent */
5668         if (parent->groups == parent->groups->next) {
5669                 pflags &= ~(SD_LOAD_BALANCE |
5670                                 SD_BALANCE_NEWIDLE |
5671                                 SD_BALANCE_FORK |
5672                                 SD_BALANCE_EXEC |
5673                                 SD_SHARE_CPUPOWER |
5674                                 SD_SHARE_PKG_RESOURCES);
5675         }
5676         if (~cflags & pflags)
5677                 return 0;
5678
5679         return 1;
5680 }
5681
5682 /*
5683  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
5684  * hold the hotplug lock.
5685  */
5686 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5687 {
5688         struct rq *rq = cpu_rq(cpu);
5689         struct sched_domain *tmp;
5690
5691         /* Remove the sched domains which do not contribute to scheduling. */
5692         for (tmp = sd; tmp; tmp = tmp->parent) {
5693                 struct sched_domain *parent = tmp->parent;
5694                 if (!parent)
5695                         break;
5696                 if (sd_parent_degenerate(tmp, parent)) {
5697                         tmp->parent = parent->parent;
5698                         if (parent->parent)
5699                                 parent->parent->child = tmp;
5700                 }
5701         }
5702
5703         if (sd && sd_degenerate(sd)) {
5704                 sd = sd->parent;
5705                 if (sd)
5706                         sd->child = NULL;
5707         }
5708
5709         sched_domain_debug(sd, cpu);
5710
5711         rcu_assign_pointer(rq->sd, sd);
5712 }
5713
5714 /* cpus with isolated domains */
5715 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5716
5717 /* Setup the mask of cpus configured for isolated domains */
5718 static int __init isolated_cpu_setup(char *str)
5719 {
5720         int ints[NR_CPUS], i;
5721
5722         str = get_options(str, ARRAY_SIZE(ints), ints);
5723         cpus_clear(cpu_isolated_map);
5724         for (i = 1; i <= ints[0]; i++)
5725                 if (ints[i] < NR_CPUS)
5726                         cpu_set(ints[i], cpu_isolated_map);
5727         return 1;
5728 }
5729
5730 __setup ("isolcpus=", isolated_cpu_setup);
5731
5732 /*
5733  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5734  * to a function which identifies what group(along with sched group) a CPU
5735  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
5736  * (due to the fact that we keep track of groups covered with a cpumask_t).
5737  *
5738  * init_sched_build_groups will build a circular linked list of the groups
5739  * covered by the given span, and will set each group's ->cpumask correctly,
5740  * and ->cpu_power to 0.
5741  */
5742 static void
5743 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5744                         int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5745                                         struct sched_group **sg))
5746 {
5747         struct sched_group *first = NULL, *last = NULL;
5748         cpumask_t covered = CPU_MASK_NONE;
5749         int i;
5750
5751         for_each_cpu_mask(i, span) {
5752                 struct sched_group *sg;
5753                 int group = group_fn(i, cpu_map, &sg);
5754                 int j;
5755
5756                 if (cpu_isset(i, covered))
5757                         continue;
5758
5759                 sg->cpumask = CPU_MASK_NONE;
5760                 sg->__cpu_power = 0;
5761
5762                 for_each_cpu_mask(j, span) {
5763                         if (group_fn(j, cpu_map, NULL) != group)
5764                                 continue;
5765
5766                         cpu_set(j, covered);
5767                         cpu_set(j, sg->cpumask);
5768                 }
5769                 if (!first)
5770                         first = sg;
5771                 if (last)
5772                         last->next = sg;
5773                 last = sg;
5774         }
5775         last->next = first;
5776 }
5777
5778 #define SD_NODES_PER_DOMAIN 16
5779
5780 #ifdef CONFIG_NUMA
5781
5782 /**
5783  * find_next_best_node - find the next node to include in a sched_domain
5784  * @node: node whose sched_domain we're building
5785  * @used_nodes: nodes already in the sched_domain
5786  *
5787  * Find the next node to include in a given scheduling domain.  Simply
5788  * finds the closest node not already in the @used_nodes map.
5789  *
5790  * Should use nodemask_t.
5791  */
5792 static int find_next_best_node(int node, unsigned long *used_nodes)
5793 {
5794         int i, n, val, min_val, best_node = 0;
5795
5796         min_val = INT_MAX;
5797
5798         for (i = 0; i < MAX_NUMNODES; i++) {
5799                 /* Start at @node */
5800                 n = (node + i) % MAX_NUMNODES;
5801
5802                 if (!nr_cpus_node(n))
5803                         continue;
5804
5805                 /* Skip already used nodes */
5806                 if (test_bit(n, used_nodes))
5807                         continue;
5808
5809                 /* Simple min distance search */
5810                 val = node_distance(node, n);
5811
5812                 if (val < min_val) {
5813                         min_val = val;
5814                         best_node = n;
5815                 }
5816         }
5817
5818         set_bit(best_node, used_nodes);
5819         return best_node;
5820 }
5821
5822 /**
5823  * sched_domain_node_span - get a cpumask for a node's sched_domain
5824  * @node: node whose cpumask we're constructing
5825  * @size: number of nodes to include in this span
5826  *
5827  * Given a node, construct a good cpumask for its sched_domain to span.  It
5828  * should be one that prevents unnecessary balancing, but also spreads tasks
5829  * out optimally.
5830  */
5831 static cpumask_t sched_domain_node_span(int node)
5832 {
5833         DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5834         cpumask_t span, nodemask;
5835         int i;
5836
5837         cpus_clear(span);
5838         bitmap_zero(used_nodes, MAX_NUMNODES);
5839
5840         nodemask = node_to_cpumask(node);
5841         cpus_or(span, span, nodemask);
5842         set_bit(node, used_nodes);
5843
5844         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5845                 int next_node = find_next_best_node(node, used_nodes);
5846
5847                 nodemask = node_to_cpumask(next_node);
5848                 cpus_or(span, span, nodemask);
5849         }
5850
5851         return span;
5852 }
5853 #endif
5854
5855 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5856
5857 /*
5858  * SMT sched-domains:
5859  */
5860 #ifdef CONFIG_SCHED_SMT
5861 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5862 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5863
5864 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5865                             struct sched_group **sg)
5866 {
5867         if (sg)
5868                 *sg = &per_cpu(sched_group_cpus, cpu);
5869         return cpu;
5870 }
5871 #endif
5872
5873 /*
5874  * multi-core sched-domains:
5875  */
5876 #ifdef CONFIG_SCHED_MC
5877 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5878 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5879 #endif
5880
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
/*
 * Group function for the multi-core level when SMT is also configured:
 * the group id is the first CPU of @cpu's sibling map that is also in
 * @cpu_map.  When @sg is non-NULL it receives that CPU's per-cpu
 * sched_group_core entry.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
                             struct sched_group **sg)
{
        cpumask_t siblings = cpu_sibling_map[cpu];
        int leader;

        cpus_and(siblings, siblings, *cpu_map);
        leader = first_cpu(siblings);

        if (sg != NULL)
                *sg = &per_cpu(sched_group_core, leader);

        return leader;
}
#elif defined(CONFIG_SCHED_MC)
/*
 * Group function for the multi-core level without SMT: every CPU is
 * its own group.  @cpu_map is unused.
 */
static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
                             struct sched_group **sg)
{
        if (sg != NULL)
                *sg = &per_cpu(sched_group_core, cpu);

        return cpu;
}
#endif
5902
5903 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5904 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5905
5906 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5907                              struct sched_group **sg)
5908 {
5909         int group;
5910 #ifdef CONFIG_SCHED_MC
5911         cpumask_t mask = cpu_coregroup_map(cpu);
5912         cpus_and(mask, mask, *cpu_map);
5913         group = first_cpu(mask);
5914 #elif defined(CONFIG_SCHED_SMT)
5915         cpumask_t mask = cpu_sibling_map[cpu];
5916         cpus_and(mask, mask, *cpu_map);
5917         group = first_cpu(mask);
5918 #else
5919         group = cpu;
5920 #endif
5921         if (sg)
5922                 *sg = &per_cpu(sched_group_phys, group);
5923         return group;
5924 }
5925
5926 #ifdef CONFIG_NUMA
5927 /*
5928  * The init_sched_build_groups can't handle what we want to do with node
5929  * groups, so roll our own. Now each node has its own list of groups which
5930  * gets dynamically allocated.
5931  */
5932 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5933 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5934
5935 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5936 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5937
5938 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5939                                  struct sched_group **sg)
5940 {
5941         cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5942         int group;
5943
5944         cpus_and(nodemask, nodemask, *cpu_map);
5945         group = first_cpu(nodemask);
5946
5947         if (sg)
5948                 *sg = &per_cpu(sched_group_allnodes, group);
5949         return group;
5950 }
5951
5952 static void init_numa_sched_groups_power(struct sched_group *group_head)
5953 {
5954         struct sched_group *sg = group_head;
5955         int j;
5956
5957         if (!sg)
5958                 return;
5959 next_sg:
5960         for_each_cpu_mask(j, sg->cpumask) {
5961                 struct sched_domain *sd;
5962
5963                 sd = &per_cpu(phys_domains, j);
5964                 if (j != first_cpu(sd->groups->cpumask)) {
5965                         /*
5966                          * Only add "power" once for each
5967                          * physical package.
5968                          */
5969                         continue;
5970                 }
5971
5972                 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5973         }
5974         sg = sg->next;
5975         if (sg != group_head)
5976                 goto next_sg;
5977 }
5978 #endif
5979
5980 #ifdef CONFIG_NUMA
5981 /* Free memory allocated for various sched_group structures */
5982 static void free_sched_groups(const cpumask_t *cpu_map)
5983 {
5984         int cpu, i;
5985
5986         for_each_cpu_mask(cpu, *cpu_map) {
5987                 struct sched_group **sched_group_nodes
5988                         = sched_group_nodes_bycpu[cpu];
5989
5990                 if (!sched_group_nodes)
5991                         continue;
5992
5993                 for (i = 0; i < MAX_NUMNODES; i++) {
5994                         cpumask_t nodemask = node_to_cpumask(i);
5995                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
5996
5997                         cpus_and(nodemask, nodemask, *cpu_map);
5998                         if (cpus_empty(nodemask))
5999                                 continue;
6000
6001                         if (sg == NULL)
6002                                 continue;
6003                         sg = sg->next;
6004 next_sg:
6005                         oldsg = sg;
6006                         sg = sg->next;
6007                         kfree(oldsg);
6008                         if (oldsg != sched_group_nodes[i])
6009                                 goto next_sg;
6010                 }
6011                 kfree(sched_group_nodes);
6012                 sched_group_nodes_bycpu[cpu] = NULL;
6013         }
6014 }
6015 #else
6016 static void free_sched_groups(const cpumask_t *cpu_map)
6017 {
6018 }
6019 #endif
6020
6021 /*
6022  * Initialize sched groups cpu_power.
6023  *
6024  * cpu_power indicates the capacity of sched group, which is used while
6025  * distributing the load between different sched groups in a sched domain.
6026  * Typically cpu_power for all the groups in a sched domain will be same unless
6027  * there are asymmetries in the topology. If there are asymmetries, group
6028  * having more cpu_power will pickup more load compared to the group having
6029  * less cpu_power.
6030  *
6031  * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
6032  * the maximum number of tasks a group can handle in the presence of other idle
6033  * or lightly loaded groups in the same sched domain.
6034  */
6035 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6036 {
6037         struct sched_domain *child;
6038         struct sched_group *group;
6039
6040         WARN_ON(!sd || !sd->groups);
6041
6042         if (cpu != first_cpu(sd->groups->cpumask))
6043                 return;
6044
6045         child = sd->child;
6046
6047         sd->groups->__cpu_power = 0;
6048
6049         /*
6050          * For perf policy, if the groups in child domain share resources
6051          * (for example cores sharing some portions of the cache hierarchy
6052          * or SMT), then set this domain groups cpu_power such that each group
6053          * can handle only one task, when there are other idle groups in the
6054          * same sched domain.
6055          */
6056         if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6057                        (child->flags &
6058                         (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6059                 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
6060                 return;
6061         }
6062
6063         /*
6064          * add cpu_power of each child group to this groups cpu_power
6065          */
6066         group = child->groups;
6067         do {
6068                 sg_inc_cpu_power(sd->groups, group->__cpu_power);
6069                 group = group->next;
6070         } while (group != child->groups);
6071 }
6072
6073 /*
6074  * Build sched domains for a given set of cpus and attach the sched domains
6075  * to the individual cpus
6076  */
6077 static int build_sched_domains(const cpumask_t *cpu_map)
6078 {
6079         int i;
6080         struct sched_domain *sd;
6081 #ifdef CONFIG_NUMA
6082         struct sched_group **sched_group_nodes = NULL;
6083         int sd_allnodes = 0;
6084
6085         /*
6086          * Allocate the per-node list of sched groups
6087          */
6088         sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
6089                                            GFP_KERNEL);
6090         if (!sched_group_nodes) {
6091                 printk(KERN_WARNING "Can not alloc sched group node list\n");
6092                 return -ENOMEM;
6093         }
6094         sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6095 #endif
6096
6097         /*
6098          * Set up domains for cpus specified by the cpu_map.
6099          */
6100         for_each_cpu_mask(i, *cpu_map) {
6101                 struct sched_domain *sd = NULL, *p;
6102                 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6103
6104                 cpus_and(nodemask, nodemask, *cpu_map);
6105
6106 #ifdef CONFIG_NUMA
6107                 if (cpus_weight(*cpu_map)
6108                                 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6109                         sd = &per_cpu(allnodes_domains, i);
6110                         *sd = SD_ALLNODES_INIT;
6111                         sd->span = *cpu_map;
6112                         cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6113                         p = sd;
6114                         sd_allnodes = 1;
6115                 } else
6116                         p = NULL;
6117
6118                 sd = &per_cpu(node_domains, i);
6119                 *sd = SD_NODE_INIT;
6120                 sd->span = sched_domain_node_span(cpu_to_node(i));
6121                 sd->parent = p;
6122                 if (p)
6123                         p->child = sd;
6124                 cpus_and(sd->span, sd->span, *cpu_map);
6125 #endif
6126
6127                 p = sd;
6128                 sd = &per_cpu(phys_domains, i);
6129                 *sd = SD_CPU_INIT;
6130                 sd->span = nodemask;
6131                 sd->parent = p;
6132                 if (p)
6133                         p->child = sd;
6134                 cpu_to_phys_group(i, cpu_map, &sd->groups);
6135
6136 #ifdef CONFIG_SCHED_MC
6137                 p = sd;
6138                 sd = &per_cpu(core_domains, i);
6139                 *sd = SD_MC_INIT;
6140                 sd->span = cpu_coregroup_map(i);
6141                 cpus_and(sd->span, sd->span, *cpu_map);
6142                 sd->parent = p;
6143                 p->child = sd;
6144                 cpu_to_core_group(i, cpu_map, &sd->groups);
6145 #endif
6146
6147 #ifdef CONFIG_SCHED_SMT
6148                 p = sd;
6149                 sd = &per_cpu(cpu_domains, i);
6150                 *sd = SD_SIBLING_INIT;
6151                 sd->span = cpu_sibling_map[i];
6152                 cpus_and(sd->span, sd->span, *cpu_map);
6153                 sd->parent = p;
6154                 p->child = sd;
6155                 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6156 #endif
6157         }
6158
6159 #ifdef CONFIG_SCHED_SMT
6160         /* Set up CPU (sibling) groups */
6161         for_each_cpu_mask(i, *cpu_map) {
6162                 cpumask_t this_sibling_map = cpu_sibling_map[i];
6163                 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6164                 if (i != first_cpu(this_sibling_map))
6165                         continue;
6166
6167                 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6168         }
6169 #endif
6170
6171 #ifdef CONFIG_SCHED_MC
6172         /* Set up multi-core groups */
6173         for_each_cpu_mask(i, *cpu_map) {
6174                 cpumask_t this_core_map = cpu_coregroup_map(i);
6175                 cpus_and(this_core_map, this_core_map, *cpu_map);
6176                 if (i != first_cpu(this_core_map))
6177                         continue;
6178                 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6179         }
6180 #endif
6181
6182
6183         /* Set up physical groups */
6184         for (i = 0; i < MAX_NUMNODES; i++) {
6185                 cpumask_t nodemask = node_to_cpumask(i);
6186
6187                 cpus_and(nodemask, nodemask, *cpu_map);
6188                 if (cpus_empty(nodemask))
6189                         continue;
6190
6191                 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6192         }
6193
6194 #ifdef CONFIG_NUMA
6195         /* Set up node groups */
6196         if (sd_allnodes)
6197                 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6198
6199         for (i = 0; i < MAX_NUMNODES; i++) {
6200                 /* Set up node groups */
6201                 struct sched_group *sg, *prev;
6202                 cpumask_t nodemask = node_to_cpumask(i);
6203                 cpumask_t domainspan;
6204                 cpumask_t covered = CPU_MASK_NONE;
6205                 int j;
6206
6207                 cpus_and(nodemask, nodemask, *cpu_map);
6208                 if (cpus_empty(nodemask)) {
6209                         sched_group_nodes[i] = NULL;
6210                         continue;
6211                 }
6212
6213                 domainspan = sched_domain_node_span(i);
6214                 cpus_and(domainspan, domainspan, *cpu_map);
6215
6216                 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6217                 if (!sg) {
6218                         printk(KERN_WARNING "Can not alloc domain group for "
6219                                 "node %d\n", i);
6220                         goto error;
6221                 }
6222                 sched_group_nodes[i] = sg;
6223                 for_each_cpu_mask(j, nodemask) {
6224                         struct sched_domain *sd;
6225                         sd = &per_cpu(node_domains, j);
6226                         sd->groups = sg;
6227                 }
6228                 sg->__cpu_power = 0;
6229                 sg->cpumask = nodemask;
6230                 sg->next = sg;
6231                 cpus_or(covered, covered, nodemask);
6232                 prev = sg;
6233
6234                 for (j = 0; j < MAX_NUMNODES; j++) {
6235                         cpumask_t tmp, notcovered;
6236                         int n = (i + j) % MAX_NUMNODES;
6237
6238                         cpus_complement(notcovered, covered);
6239                         cpus_and(tmp, notcovered, *cpu_map);
6240                         cpus_and(tmp, tmp, domainspan);
6241                         if (cpus_empty(tmp))
6242                                 break;
6243
6244                         nodemask = node_to_cpumask(n);
6245                         cpus_and(tmp, tmp, nodemask);
6246                         if (cpus_empty(tmp))
6247                                 continue;
6248
6249                         sg = kmalloc_node(sizeof(struct sched_group),
6250                                           GFP_KERNEL, i);
6251                         if (!sg) {
6252                                 printk(KERN_WARNING
6253                                 "Can not alloc domain group for node %d\n", j);
6254                                 goto error;
6255                         }
6256                         sg->__cpu_power = 0;
6257                         sg->cpumask = tmp;
6258                         sg->next = prev->next;
6259                         cpus_or(covered, covered, tmp);
6260                         prev->next = sg;
6261                         prev = sg;
6262                 }
6263         }
6264 #endif
6265
6266         /* Calculate CPU power for physical packages and nodes */
6267 #ifdef CONFIG_SCHED_SMT
6268         for_each_cpu_mask(i, *cpu_map) {
6269                 sd = &per_cpu(cpu_domains, i);
6270                 init_sched_groups_power(i, sd);
6271         }
6272 #endif
6273 #ifdef CONFIG_SCHED_MC
6274         for_each_cpu_mask(i, *cpu_map) {
6275                 sd = &per_cpu(core_domains, i);
6276                 init_sched_groups_power(i, sd);
6277         }
6278 #endif
6279
6280         for_each_cpu_mask(i, *cpu_map) {
6281                 sd = &per_cpu(phys_domains, i);
6282                 init_sched_groups_power(i, sd);
6283         }
6284
6285 #ifdef CONFIG_NUMA
6286         for (i = 0; i < MAX_NUMNODES; i++)
6287                 init_numa_sched_groups_power(sched_group_nodes[i]);
6288
6289         if (sd_allnodes) {
6290                 struct sched_group *sg;
6291
6292                 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6293                 init_numa_sched_groups_power(sg);
6294         }
6295 #endif
6296
6297         /* Attach the domains */
6298         for_each_cpu_mask(i, *cpu_map) {
6299                 struct sched_domain *sd;
6300 #ifdef CONFIG_SCHED_SMT
6301                 sd = &per_cpu(cpu_domains, i);
6302 #elif defined(CONFIG_SCHED_MC)
6303                 sd = &per_cpu(core_domains, i);
6304 #else
6305                 sd = &per_cpu(phys_domains, i);
6306 #endif
6307                 cpu_attach_domain(sd, i);
6308         }
6309
6310         return 0;
6311
6312 #ifdef CONFIG_NUMA
6313 error:
6314         free_sched_groups(cpu_map);
6315         return -ENOMEM;
6316 #endif
6317 }
6318 /*
6319  * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
6320  */
6321 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6322 {
6323         cpumask_t cpu_default_map;
6324         int err;
6325
6326         /*
6327          * Setup mask for cpus without special case scheduling requirements.
6328          * For now this just excludes isolated cpus, but could be used to
6329          * exclude other special cases in the future.
6330          */
6331         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6332
6333         err = build_sched_domains(&cpu_default_map);
6334
6335         return err;
6336 }
6337
6338 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6339 {
6340         free_sched_groups(cpu_map);
6341 }
6342
6343 /*
6344  * Detach sched domains from a group of cpus specified in cpu_map
6345  * These cpus will now be attached to the NULL domain
6346  */
6347 static void detach_destroy_domains(const cpumask_t *cpu_map)
6348 {
6349         int i;
6350
6351         for_each_cpu_mask(i, *cpu_map)
6352                 cpu_attach_domain(NULL, i);
6353         synchronize_sched();
6354         arch_destroy_sched_domains(cpu_map);
6355 }
6356
6357 /*
6358  * Partition sched domains as specified by the cpumasks below.
6359  * This attaches all cpus from the cpumasks to the NULL domain,
6360  * waits for a RCU quiescent period, recalculates sched
6361  * domain information and then attaches them back to the
6362  * correct sched domains
6363  * Call with hotplug lock held
6364  */
6365 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6366 {
6367         cpumask_t change_map;
6368         int err = 0;
6369
6370         cpus_and(*partition1, *partition1, cpu_online_map);
6371         cpus_and(*partition2, *partition2, cpu_online_map);
6372         cpus_or(change_map, *partition1, *partition2);
6373
6374         /* Detach sched domains from all of the affected cpus */
6375         detach_destroy_domains(&change_map);
6376         if (!cpus_empty(*partition1))
6377                 err = build_sched_domains(partition1);
6378         if (!err && !cpus_empty(*partition2))
6379                 err = build_sched_domains(partition2);
6380
6381         return err;
6382 }
6383
6384 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6385 int arch_reinit_sched_domains(void)
6386 {
6387         int err;
6388
6389         mutex_lock(&sched_hotcpu_mutex);
6390         detach_destroy_domains(&cpu_online_map);
6391         err = arch_init_sched_domains(&cpu_online_map);
6392         mutex_unlock(&sched_hotcpu_mutex);
6393
6394         return err;
6395 }
6396
6397 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6398 {
6399         int ret;
6400
6401         if (buf[0] != '0' && buf[0] != '1')
6402                 return -EINVAL;
6403
6404         if (smt)
6405                 sched_smt_power_savings = (buf[0] == '1');
6406         else
6407                 sched_mc_power_savings = (buf[0] == '1');
6408
6409         ret = arch_reinit_sched_domains();
6410
6411         return ret ? ret : count;
6412 }
6413
/*
 * Create the power-savings sysfs files under @cls.
 *
 * The SMT file is created when configured and smt_capable(); the MC file
 * when configured and mc_capable(), unless creating the SMT file already
 * failed.  Returns 0 or the first sysfs_create_file() error.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
        int err = 0;

#ifdef CONFIG_SCHED_SMT
        if (smt_capable()) {
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_smt_power_savings.attr);
                if (err)
                        return err;
        }
#endif
#ifdef CONFIG_SCHED_MC
        if (mc_capable())
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_mc_power_savings.attr);
#endif
        return err;
}
6430 #endif
6431
#ifdef CONFIG_SCHED_MC
/* Show handler: print the current sched_mc_power_savings value. */
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
        ssize_t len = sprintf(page, "%u\n", sched_mc_power_savings);

        return len;
}

/* Store handler: forward to the common parser, selecting the MC setting. */
static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
                                            const char *buf, size_t count)
{
        return sched_power_savings_store(buf, count, 0);
}

SYSDEV_ATTR(sched_mc_power_savings, 0644,
            sched_mc_power_savings_show,
            sched_mc_power_savings_store);
#endif
6445
#ifdef CONFIG_SCHED_SMT
/* Show handler: print the current sched_smt_power_savings value. */
static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
{
        ssize_t len = sprintf(page, "%u\n", sched_smt_power_savings);

        return len;
}

/* Store handler: forward to the common parser, selecting the SMT setting. */
static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
                                             const char *buf, size_t count)
{
        return sched_power_savings_store(buf, count, 1);
}

SYSDEV_ATTR(sched_smt_power_savings, 0644,
            sched_smt_power_savings_show,
            sched_smt_power_savings_store);
#endif
6459
6460 /*
6461  * Force a reinitialization of the sched domains hierarchy.  The domains
6462  * and groups cannot be updated in place without racing with the balancing
6463  * code, so we temporarily attach all running cpus to the NULL domain
6464  * which will prevent rebalancing while the sched domains are recalculated.
6465  */
6466 static int update_sched_domains(struct notifier_block *nfb,
6467                                 unsigned long action, void *hcpu)
6468 {
6469         switch (action) {
6470         case CPU_UP_PREPARE:
6471         case CPU_UP_PREPARE_FROZEN:
6472         case CPU_DOWN_PREPARE:
6473         case CPU_DOWN_PREPARE_FROZEN:
6474                 detach_destroy_domains(&cpu_online_map);
6475                 return NOTIFY_OK;
6476
6477         case CPU_UP_CANCELED:
6478         case CPU_UP_CANCELED_FROZEN:
6479         case CPU_DOWN_FAILED:
6480         case CPU_DOWN_FAILED_FROZEN:
6481         case CPU_ONLINE:
6482         case CPU_ONLINE_FROZEN:
6483         case CPU_DEAD:
6484         case CPU_DEAD_FROZEN:
6485                 /*
6486                  * Fall through and re-initialise the domains.
6487                  */
6488                 break;
6489         default:
6490                 return NOTIFY_DONE;
6491         }
6492
6493         /* The hotplug lock is already held by cpu_up/cpu_down */
6494         arch_init_sched_domains(&cpu_online_map);
6495
6496         return NOTIFY_OK;
6497 }
6498
6499 void __init sched_init_smp(void)
6500 {
6501         cpumask_t non_isolated_cpus;
6502
6503         mutex_lock(&sched_hotcpu_mutex);
6504         arch_init_sched_domains(&cpu_online_map);
6505         cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6506         if (cpus_empty(non_isolated_cpus))
6507                 cpu_set(smp_processor_id(), non_isolated_cpus);
6508         mutex_unlock(&sched_hotcpu_mutex);
6509         /* XXX: Theoretical race here - CPU may be hotplugged now */
6510         hotcpu_notifier(update_sched_domains, 0);
6511
6512         /* Move init over to a non-isolated CPU */
6513         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6514                 BUG();
6515 }
6516 #else
6517 void __init sched_init_smp(void)
6518 {
6519 }
6520 #endif /* CONFIG_SMP */
6521
/*
 * Report whether @addr lies in scheduler code: either in_lock_functions()
 * accepts it, or it falls inside [__sched_text_start, __sched_text_end).
 * Returns 1 if so, 0 otherwise.
 */
int in_sched_functions(unsigned long addr)
{
        /* Linker adds these: start and end of __sched functions */
        extern char __sched_text_start[], __sched_text_end[];
        unsigned long text_lo = (unsigned long)__sched_text_start;
        unsigned long text_hi = (unsigned long)__sched_text_end;

        if (in_lock_functions(addr))
                return 1;

        return addr >= text_lo && addr < text_hi;
}
6531
6532 void __init sched_init(void)
6533 {
6534         int i, j, k;
6535         int highest_cpu = 0;
6536
6537         for_each_possible_cpu(i) {
6538                 struct prio_array *array;
6539                 struct rq *rq;
6540
6541                 rq = cpu_rq(i);
6542                 spin_lock_init(&rq->lock);
6543                 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6544                 rq->nr_running = 0;
6545                 rq->active = rq->arrays;
6546                 rq->expired = rq->arrays + 1;
6547                 rq->best_expired_prio = MAX_PRIO;
6548
6549 #ifdef CONFIG_SMP
6550                 rq->sd = NULL;
6551                 for (j = 1; j < 3; j++)
6552                         rq->cpu_load[j] = 0;
6553                 rq->active_balance = 0;
6554                 rq->push_cpu = 0;
6555                 rq->cpu = i;
6556                 rq->migration_thread = NULL;
6557                 INIT_LIST_HEAD(&rq->migration_queue);
6558 #endif
6559                 atomic_set(&rq->nr_iowait, 0);
6560
6561                 for (j = 0; j < 2; j++) {
6562                         array = rq->arrays + j;
6563                         for (k = 0; k < MAX_PRIO; k++) {
6564                                 INIT_LIST_HEAD(array->queue + k);
6565                                 __clear_bit(k, array->bitmap);
6566                         }
6567                         // delimiter for bitsearch
6568                         __set_bit(MAX_PRIO, array->bitmap);
6569                 }
6570                 highest_cpu = i;
6571         }
6572
6573         set_load_weight(&init_task);
6574
6575 #ifdef CONFIG_SMP
6576         nr_cpu_ids = highest_cpu + 1;
6577         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6578 #endif
6579
6580 #ifdef CONFIG_RT_MUTEXES
6581         plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6582 #endif
6583
6584         /*
6585          * The boot idle thread does lazy MMU switching as well:
6586          */
6587         atomic_inc(&init_mm.mm_count);
6588         enter_lazy_tlb(&init_mm, current);
6589
6590         /*
6591          * Make us the idle thread. Technically, schedule() should not be
6592          * called from this thread, however somewhere below it might be,
6593          * but because we are the idle thread, we just pick up running again
6594          * when this runqueue becomes "idle".
6595          */
6596         init_idle(current, smp_processor_id());
6597 }
6598
6599 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
/*
 * Debug check for sleeping in a context that must not sleep.
 *
 * @file, @line: location of the caller, printed in the report.
 *
 * When in_atomic() or irqs_disabled() is true while the system is running
 * and no oops is in progress, a report with held locks and a stack dump is
 * printed.  Reports are limited to one per HZ jiffies.  Without an
 * in_atomic definition the function does nothing.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */

        if (!in_atomic() && !irqs_disabled())
                return;
        if (system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        /* prev_jiffy == 0 means nothing has been reported yet. */
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
        prev_jiffy = jiffies;

        printk(KERN_ERR "BUG: sleeping function called from invalid"
                        " context at %s:%d\n", file, line);
        /* Same log level as the line above so both are shown together. */
        printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
                in_atomic(), irqs_disabled());
        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
6622 #endif
6623
6624 #ifdef CONFIG_MAGIC_SYSRQ
6625 void normalize_rt_tasks(void)
6626 {
6627         struct prio_array *array;
6628         struct task_struct *g, *p;
6629         unsigned long flags;
6630         struct rq *rq;
6631
6632         read_lock_irq(&tasklist_lock);
6633
6634         do_each_thread(g, p) {
6635                 if (!rt_task(p))
6636                         continue;
6637
6638                 spin_lock_irqsave(&p->pi_lock, flags);
6639                 rq = __task_rq_lock(p);
6640
6641                 array = p->array;
6642                 if (array)
6643                         deactivate_task(p, task_rq(p));
6644                 __setscheduler(p, SCHED_NORMAL, 0);
6645                 if (array) {
6646                         __activate_task(p, task_rq(p));
6647                         resched_task(rq->curr);
6648                 }
6649
6650                 __task_rq_unlock(rq);
6651                 spin_unlock_irqrestore(&p->pi_lock, flags);
6652         } while_each_thread(g, p);
6653
6654         read_unlock_irq(&tasklist_lock);
6655 }
6656
6657 #endif /* CONFIG_MAGIC_SYSRQ */
6658
6659 #ifdef CONFIG_IA64
6660 /*
6661  * These functions are only useful for the IA64 MCA handling.
6662  *
6663  * They can only be called when the whole system has been
6664  * stopped - every CPU needs to be quiescent, and no scheduling
6665  * activity can take place. Using them for anything else would
6666  * be a serious bug, and as a result, they aren't even visible
6667  * under any other configuration.
6668  */
6669
/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Reads cpu_curr() for @cpu without any locking.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
        return cpu_curr(cpu);
}
6680
6681 /**
6682  * set_curr_task - set the current task for a given cpu.
6683  * @cpu: the processor in question.
6684  * @p: the task pointer to set.
6685  *
6686  * Description: This function must only be used when non-maskable interrupts
6687  * are serviced on a separate stack.  It allows the architecture to switch the
6688  * notion of the current task on a cpu in a non-blocking manner.  This function
6689  * must be called with all CPU's synchronized, and interrupts disabled, the
6690  * and caller must save the original value of the current task (see
6691  * curr_task() above) and restore that value before reenabling interrupts and
6692  * re-starting the system.
6693  *
6694  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6695  */
6696 void set_curr_task(int cpu, struct task_struct *p)
6697 {
6698         cpu_curr(cpu) = p;
6699 }
6700
6701 #endif