git.oblomov.eu Git - linux-2.6/blob - kernel/rcupreempt.c

   1 /*
   2  * Read-Copy Update mechanism for mutual exclusion, realtime implementation
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17  *
  18  * Copyright IBM Corporation, 2006
  19  *
  20  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21  *              With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22  *              for pushing me away from locks and towards counters, and
  23  *              to Suparna Bhattacharya for pushing me completely away
  24  *              from atomic instructions on the read side.
  25  *
  26  *  - Added handling of Dynamic Ticks
  27  *      Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
  28  *                     - Steven Rostedt <srostedt@redhat.com>
  29  *
  30  * Papers:  http://www.rdrop.com/users/paulmck/RCU
  31  *
  32  * Design Document: http://lwn.net/Articles/253651/
  33  *
  34  * For detailed explanation of Read-Copy Update mechanism see -
  35  *              Documentation/RCU/ *.txt
  36  *
  37  */
  38 #include <linux/types.h>
  39 #include <linux/kernel.h>
  40 #include <linux/init.h>
  41 #include <linux/spinlock.h>
  42 #include <linux/smp.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/interrupt.h>
  45 #include <linux/sched.h>
  46 #include <asm/atomic.h>
  47 #include <linux/bitops.h>
  48 #include <linux/module.h>
  49 #include <linux/completion.h>
  50 #include <linux/moduleparam.h>
  51 #include <linux/percpu.h>
  52 #include <linux/notifier.h>
  53 #include <linux/rcupdate.h>
  54 #include <linux/cpu.h>
  55 #include <linux/random.h>
  56 #include <linux/delay.h>
  57 #include <linux/byteorder/swabb.h>
  58 #include <linux/cpumask.h>
  59 #include <linux/rcupreempt_trace.h>
  60
  61 /*
  62  * Macro that prevents the compiler from reordering accesses, but does
  63  * absolutely -nothing- to prevent CPUs from reordering.  This is used
  64  * only to mediate communication between mainline code and hardware
  65  * interrupt and NMI handlers.  Expands to a volatile-qualified lvalue.
  66  */
  67 #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
  68
  69 /*
  70  * PREEMPT_RCU data structures.
  71  */
  72
  73 /*
  74  * GP_STAGES specifies the number of times the state machine has
  75  * to go through all the rcu_try_flip_states (see below)
  76  * in a single Grace Period.
  77  *
  78  * GP in GP_STAGES stands for Grace Period ;)
  79  */
  80 #define GP_STAGES    2
  81 struct rcu_data {
  82         spinlock_t      lock;           /* Protect rcu_data fields. */
  83         long            completed;      /* Number of last completed batch. */
  84         int             waitlistcount;  /* Number of non-empty waitlist[]s. */
  85         struct tasklet_struct rcu_tasklet;
  86         struct rcu_head *nextlist;      /* Callbacks not yet in a wait stage. */
  87         struct rcu_head **nexttail;
  88         struct rcu_head *waitlist[GP_STAGES];   /* One list per GP stage. */
  89         struct rcu_head **waittail[GP_STAGES];
  90         struct rcu_head *donelist;      /* Callbacks aged past every stage. */
  91         struct rcu_head **donetail;
  92         long rcu_flipctr[2];            /* Reader counts, indexed by flip parity. */
  93 #ifdef CONFIG_RCU_TRACE
  94         struct rcupreempt_trace trace;
  95 #endif /* #ifdef CONFIG_RCU_TRACE */
  96 };
  97
  98 /*
  99  * States for rcu_try_flip() and friends.
 100  */
 101
 102 enum rcu_try_flip_states {
 103
 104         /*
 105          * Stay here if nothing is happening. Flip the counter if something
 106          * starts happening. Denoted by "I".
 107          */
 108         rcu_try_flip_idle_state,
 109
 110         /*
 111          * Wait here for all CPUs to notice that the counter has flipped. This
 112          * prevents the old set of counters from ever being incremented once
 113          * we leave this state, which in turn is necessary because we cannot
 114          * test any individual counter for zero -- we can only check the sum.
 115          * Denoted by "A".
 116          */
 117         rcu_try_flip_waitack_state,
 118
 119         /*
 120          * Wait here for the sum of the old per-CPU counters to reach zero.
 121          * Denoted by "Z".
 122          */
 123         rcu_try_flip_waitzero_state,
 124
 125         /*
 126          * Wait here for each of the other CPUs to execute a memory barrier.
 127          * This is necessary to ensure that these other CPUs really have
 128          * completed executing their RCU read-side critical sections, despite
 129          * their CPUs wildly reordering memory. Denoted by "M".
 130          */
 131         rcu_try_flip_waitmb_state,
 132 };
 133 /* Global grace-period state: one instance, unlike the per-CPU rcu_data. */
 134 struct rcu_ctrlblk {
 135         spinlock_t      fliplock;       /* Protect state-machine transitions. */
 136         long            completed;      /* Number of last completed batch. */
 137         enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 138                                                         the rcu state machine */
 139 };
 140
 141 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 142 static struct rcu_ctrlblk rcu_ctrlblk = {
 143         .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 144         .completed = 0,
 145         .rcu_try_flip_state = rcu_try_flip_idle_state,
 146 };
 147
 148 /* Printable names, in the same order as enum rcu_try_flip_states. */
 149 #ifdef CONFIG_RCU_TRACE
 150 static char *rcu_try_flip_state_names[] =
 151         { "idle", "waitack", "waitzero", "waitmb" };
 152 #endif /* #ifdef CONFIG_RCU_TRACE */
 153 /* CPUs that the grace-period state machine polls and sums over. */
 154 static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
 155
 156 /*
 157  * Enum and per-CPU flag to determine when each CPU has seen
 158  * the most recent counter flip.
 159  */
 160
 161 enum rcu_flip_flag_values {
 162         rcu_flip_seen,          /* Steady/initial state, last flip seen. */
 163                                 /* From here, only GP detector can update. */
 164         rcu_flipped             /* Flip just completed, need confirmation. */
 165                                 /* From here, only corresponding CPU can. */
 166 };
 167 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
 168                                                                 = rcu_flip_seen;
 169
 170 /*
 171  * Enum and per-CPU flag to determine when each CPU has executed the
 172  * needed memory barrier to fence in memory references from its last RCU
 173  * read-side critical section in the just-completed grace period.
 174  */
 175
 176 enum rcu_mb_flag_values {
 177         rcu_mb_done,            /* Steady/initial state, no mb()s required. */
 178                                 /* From here, only GP detector can update. */
 179         rcu_mb_needed           /* Counter sum hit zero, need an mb(). */
 180                                 /* From here, only corresponding CPU can. */
 181 };
 182 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
 183                                                                 = rcu_mb_done;
 184
 185 /*
 186  * RCU_DATA_ME: find the current CPU's rcu_data structure.
 187  * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 188  */
 189 #define RCU_DATA_ME()           (&__get_cpu_var(rcu_data))
 190 #define RCU_DATA_CPU(cpu)       (&per_cpu(rcu_data, cpu))
 191 /* NOTE(review): the RCU_TRACE_*() macros below expand with a trailing ';'. */
 192 /*
 193  * Helper macro for tracing when the appropriate rcu_data is not
 194  * cached in a local variable, but where the CPU number is so cached.
 195  */
 196 #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
 197
 198 /*
 199  * Helper macro for tracing when the appropriate rcu_data is not
 200  * cached in a local variable.
 201  */
 202 #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
 203
 204 /*
 205  * Helper macro for tracing when the appropriate rcu_data is pointed
 206  * to by a local variable.
 207  */
 208 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
 209
 210 /*
 211  * Return the number of RCU batches processed thus far.  Useful
 212  * for debug and statistics.  The counter is read without locking.
 213  */
 214 long rcu_batches_completed(void)
 215 {
 216         return rcu_ctrlblk.completed;
 217 }
 218 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 219 /* NOTE(review): rcu_batches_completed_bh is not defined in the visible code. */
 220 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 221 /* Enter an RCU read-side critical section; calls may nest. */
 222 void __rcu_read_lock(void)
 223 {
 224         int idx;
 225         struct task_struct *t = current;
 226         int nesting;
 227
 228         nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 229         if (nesting != 0) {
 230
 231                 /* An earlier rcu_read_lock() covers us, just count it. */
 232
 233                 t->rcu_read_lock_nesting = nesting + 1;
 234
 235         } else {
 236                 unsigned long flags;
 237
 238                 /*
 239                  * We disable interrupts for the following reasons:
 240                  * - If we get scheduling clock interrupt here, and we
 241                  *   end up acking the counter flip, it's like a promise
 242                  *   that we will never increment the old counter again.
 243                  *   Thus we will break that promise if that
 244                  *   scheduling clock interrupt happens between the time
 245                  *   we pick the .completed field and the time that we
 246                  *   increment our counter.
 247                  *
 248                  * - We don't want to be preempted out here.
 249                  *
 250                  * NMIs can still occur, of course, and might themselves
 251                  * contain rcu_read_lock().
 252                  */
 253
 254                 local_irq_save(flags);
 255
 256                 /*
 257                  * Outermost nesting of rcu_read_lock(), so increment
 258                  * the current counter for the current CPU.  Use volatile
 259                  * casts to prevent the compiler from reordering.
 260                  */
 261
 262                 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
 263                 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
 264
 265                 /*
 266                  * Now that the per-CPU counter has been incremented, we
 267                  * are protected from races with rcu_read_lock() invoked
 268                  * from NMI handlers on this CPU.  We can therefore safely
 269                  * increment the nesting counter, relieving further NMIs
 270                  * of the need to increment the per-CPU counter.
 271                  */
 272
 273                 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
 274
 275                 /*
 276                  * Now that we have prevented any NMIs from storing
 277                  * to the ->rcu_flipctr_idx, we can safely use it to
 278                  * remember which counter to decrement in the matching
 279                  * rcu_read_unlock().
 280                  */
 281
 282                 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
 283                 local_irq_restore(flags);
 284         }
 285 }
 286 EXPORT_SYMBOL_GPL(__rcu_read_lock);
 287 /* Leave an RCU read-side critical section entered via __rcu_read_lock(). */
 288 void __rcu_read_unlock(void)
 289 {
 290         int idx;
 291         struct task_struct *t = current;
 292         int nesting;
 293
 294         nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 295         if (nesting > 1) {
 296
 297                 /*
 298                  * We are still protected by the enclosing rcu_read_lock(),
 299                  * so simply decrement the counter.
 300                  */
 301
 302                 t->rcu_read_lock_nesting = nesting - 1;
 303
 304         } else {
 305                 unsigned long flags;
 306
 307                 /*
 308                  * Disable local interrupts to prevent the grace-period
 309                  * detection state machine from seeing us half-done.
 310                  * NMIs can still occur, of course, and might themselves
 311                  * contain rcu_read_lock() and rcu_read_unlock().
 312                  */
 313
 314                 local_irq_save(flags);
 315
 316                 /*
 317                  * Outermost nesting of rcu_read_unlock(), so we must
 318                  * decrement the current counter for the current CPU.
 319                  * This must be done carefully, because NMIs can
 320                  * occur at any point in this code, and any rcu_read_lock()
 321                  * and rcu_read_unlock() pairs in the NMI handlers
 322                  * must interact non-destructively with this code.
 323                  * Lots of volatile casts, and -very- careful ordering.
 324                  *
 325                  * Changes to this code, including this one, must be
 326                  * inspected, validated, and tested extremely carefully!!!
 327                  */
 328
 329                 /*
 330                  * First, pick up the index.
 331                  */
 332
 333                 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
 334
 335                 /*
 336                  * Now that we have fetched the counter index, it is
 337                  * safe to decrement the per-task RCU nesting counter.
 338                  * After this, any interrupts or NMIs will increment and
 339                  * decrement the per-CPU counters.
 340                  */
 341                 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
 342
 343                 /*
 344                  * This task's nesting count has now been decremented.
 345                  * NMIs that occur after this statement will route their
 346                  * rcu_read_lock() calls through its "else" clause, and
 347                  * will thus start incrementing the per-CPU counter on
 348                  * their own.  They will also clobber ->rcu_flipctr_idx,
 349                  * but that is OK, since we have already fetched it.
 350                  */
 351
 352                 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
 353                 local_irq_restore(flags);
 354         }
 355 }
 356 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 357
 358 /*
 359  * If a global counter flip has occurred since the last time that we
 360  * advanced callbacks, advance them.  Hardware interrupts must be
 361  * disabled when calling this function.
 362  */
 363 static void __rcu_advance_callbacks(struct rcu_data *rdp)
 364 {
 365         int cpu;
 366         int i;
 367         int wlc = 0;
 368         /* On a new flip, age lists: waitlist[last] -> done, next -> waitlist[0]. */
 369         if (rdp->completed != rcu_ctrlblk.completed) {
 370                 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
 371                         *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
 372                         rdp->donetail = rdp->waittail[GP_STAGES - 1];
 373                         RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
 374                 }
 375                 for (i = GP_STAGES - 2; i >= 0; i--) {
 376                         if (rdp->waitlist[i] != NULL) {
 377                                 rdp->waitlist[i + 1] = rdp->waitlist[i];
 378                                 rdp->waittail[i + 1] = rdp->waittail[i];
 379                                 wlc++;
 380                         } else {
 381                                 rdp->waitlist[i + 1] = NULL;
 382                                 rdp->waittail[i + 1] =
 383                                         &rdp->waitlist[i + 1];
 384                         }
 385                 }
 386                 if (rdp->nextlist != NULL) {
 387                         rdp->waitlist[0] = rdp->nextlist;
 388                         rdp->waittail[0] = rdp->nexttail;
 389                         wlc++;
 390                         rdp->nextlist = NULL;
 391                         rdp->nexttail = &rdp->nextlist;
 392                         RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
 393                 } else {
 394                         rdp->waitlist[0] = NULL;
 395                         rdp->waittail[0] = &rdp->waitlist[0];
 396                 }
 397                 rdp->waitlistcount = wlc;
 398                 rdp->completed = rcu_ctrlblk.completed;
 399         }
 400
 401         /*
 402          * Check to see if this CPU needs to report that it has seen
 403          * the most recent counter flip, thereby declaring that all
 404          * subsequent rcu_read_lock() invocations will respect this flip.
 405          */
 406
 407         cpu = raw_smp_processor_id();
 408         if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 409                 smp_mb();  /* Subsequent counter accesses must see new value */
 410                 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
 411                 smp_mb();  /* Subsequent RCU read-side critical sections */
 412                            /*  seen -after- acknowledgement. */
 413         }
 414 }
 415
 416 #ifdef CONFIG_NO_HZ
 417 /* dynticks_progress_counter is even in stopped-ticks mode (see rcu_irq_enter). */
 418 DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
 419 static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
 420 static DEFINE_PER_CPU(int, rcu_update_flag);    /* Nonzero: irq hit idle CPU. */
 421
 422 /**
 423  * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
 424  *
 425  * If the CPU was idle with dynamic ticks active, this updates the
 426  * dynticks_progress_counter to let the RCU handling know that the
 427  * CPU is active.
 428  */
 429 void rcu_irq_enter(void)
 430 {
 431         int cpu = smp_processor_id();
 432         /* Already tracking a handler that interrupted idle: count the nesting. */
 433         if (per_cpu(rcu_update_flag, cpu))
 434                 per_cpu(rcu_update_flag, cpu)++;
 435
 436         /*
 437          * Only update if we are coming from a stopped ticks mode
 438          * (dynticks_progress_counter is even).
 439          */
 440         if (!in_interrupt() &&
 441             (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
 442                 /*
 443                  * The following might seem like we could have a race
 444                  * with NMI/SMIs. But this really isn't a problem.
 445                  * Here we do a read/modify/write, and the race happens
 446                  * when an NMI/SMI comes in after the read and before
 447                  * the write. But NMI/SMIs will increment this counter
 448                  * twice before returning, so the zero bit will not
 449                  * be corrupted by the NMI/SMI which is the most important
 450                  * part.
 451                  *
 452                  * The only thing is that we would bring back the counter
 453                  * to a position that it was in during the NMI/SMI.
 454                  * But the zero bit would be set, so the rest of the
 455                  * counter would again be ignored.
 456                  *
 457                  * On return from the IRQ, the counter may have the zero
 458                  * bit be 0 and the counter the same as the return from
 459                  * the NMI/SMI. If the state machine was so unlucky to
 460                  * see that, it still doesn't matter, since all
 461                  * RCU read-side critical sections on this CPU would
 462                  * have already completed.
 463                  */
 464                 per_cpu(dynticks_progress_counter, cpu)++;
 465                 /*
 466                  * The following memory barrier ensures that any
 467                  * rcu_read_lock() primitives in the irq handler
 468                  * are seen by other CPUs to follow the above
 469                  * increment to dynticks_progress_counter. This is
 470                  * required in order for other CPUs to correctly
 471                  * determine when it is safe to advance the RCU
 472                  * grace-period state machine.
 473                  */
 474                 smp_mb(); /* see above block comment. */
 475                 /*
 476                  * Since we can't determine the dynamic tick mode from
 477                  * the dynticks_progress_counter after this routine,
 478                  * we use a second flag to acknowledge that we came
 479                  * from an idle state with ticks stopped.
 480                  */
 481                 per_cpu(rcu_update_flag, cpu)++;
 482                 /*
 483                  * If we take an NMI/SMI now, they will also increment
 484                  * the rcu_update_flag, and will not update the
 485                  * dynticks_progress_counter on exit. That is for
 486                  * this IRQ to do.
 487                  */
 488         }
 489 }
 490
 491 /**
 492  * rcu_irq_exit - Called from exiting Hard irq context.
 493  *
 494  * If the CPU was idle with dynamic ticks active, update the
 495  * dynticks_progress_counter to let the RCU handling be
 496  * aware that the CPU is going back to idle with no ticks.
 497  */
 498 void rcu_irq_exit(void)
 499 {
 500         int cpu = smp_processor_id();
 501
 502         /*
 503          * rcu_update_flag is set if we interrupted the CPU
 504          * when it was idle with ticks stopped.
 505          * Once this occurs, we keep track of interrupt nesting
 506          * because a NMI/SMI could also come in, and we still
 507          * only want the IRQ that started the increment of the
 508          * dynticks_progress_counter to be the one that modifies
 509          * it on exit.
 510          */
 511         if (per_cpu(rcu_update_flag, cpu)) {
 512                 if (--per_cpu(rcu_update_flag, cpu))
 513                         return;
 514
 515                 /* This must match the interrupt nesting */
 516                 WARN_ON(in_interrupt());
 517
 518                 /*
 519                  * If an NMI/SMI happens now we are still
 520                  * protected by the dynticks_progress_counter being odd.
 521                  */
 522
 523                 /*
 524                  * The following memory barrier ensures that any
 525                  * rcu_read_unlock() primitives in the irq handler
 526                  * are seen by other CPUs to precede the following
 527                  * increment to dynticks_progress_counter. This
 528                  * is required in order for other CPUs to determine
 529                  * when it is safe to advance the RCU grace-period
 530                  * state machine.
 531                  */
 532                 smp_mb(); /* see above block comment. */
 533                 per_cpu(dynticks_progress_counter, cpu)++;
 534                 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
 535         }
 536 }
 537 /* Record cpu's dynticks_progress_counter in its rcu_dyntick_snapshot. */
 538 static void dyntick_save_progress_counter(int cpu)
 539 {
 540         per_cpu(rcu_dyntick_snapshot, cpu) =
 541                 per_cpu(dynticks_progress_counter, cpu);
 542 }
 543 /* Return 1 if cpu must explicitly acknowledge the counter flip, else 0. */
 544 static inline int
 545 rcu_try_flip_waitack_needed(int cpu)
 546 {
 547         long curr;
 548         long snap;
 549
 550         curr = per_cpu(dynticks_progress_counter, cpu);
 551         snap = per_cpu(rcu_dyntick_snapshot, cpu);
 552         smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 553
 554         /*
 555          * If the CPU remained in dynticks mode for the entire time
 556          * and didn't take any interrupts, NMIs, SMIs, or whatever,
 557          * then it cannot be in the middle of an rcu_read_lock(), so
 558          * the next rcu_read_lock() it executes must use the new value
 559          * of the counter.  So we can safely pretend that this CPU
 560          * already acknowledged the counter.
 561          */
 562
 563         if ((curr == snap) && ((curr & 0x1) == 0))
 564                 return 0;
 565
 566         /*
 567          * If the CPU passed through (curr - snap > 2) or has entered (curr
 568          * is even) a dynticks idle phase with no active irq handlers, then,
 569          * as above, we can safely pretend that it acknowledged the counter.
 570          */
 571         /* Test curr, not snap: waiting on a now-idle CPU could stall the GP. */
 572         if ((curr - snap) > 2 || (curr & 0x1) == 0)
 573                 return 0;
 574
 575         /* We need this CPU to explicitly acknowledge the counter flip. */
 576
 577         return 1;
 578 }
 579 /* Return 1 if cpu must execute a memory barrier itself, else 0. */
 580 static inline int
 581 rcu_try_flip_waitmb_needed(int cpu)
 582 {
 583         long curr;
 584         long snap;
 585
 586         curr = per_cpu(dynticks_progress_counter, cpu);
 587         snap = per_cpu(rcu_dyntick_snapshot, cpu);
 588         smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
 589
 590         /*
 591          * If the CPU remained in dynticks mode for the entire time
 592          * and didn't take any interrupts, NMIs, SMIs, or whatever,
 593          * then it cannot have executed an RCU read-side critical section
 594          * during that time, so there is no need for it to execute a
 595          * memory barrier.
 596          */
 597
 598         if ((curr == snap) && ((curr & 0x1) == 0))
 599                 return 0;
 600
 601         /*
 602          * If the CPU either entered or exited an outermost interrupt,
 603          * SMI, NMI, or whatever handler, then we know that it executed
 604          * a memory barrier when doing so.  So we don't need another one.
 605          */
 606         if (curr != snap)
 607                 return 0;
 608
 609         /* We need the CPU to execute a memory barrier. */
 610
 611         return 1;
 612 }
 613
 614 #else /* !CONFIG_NO_HZ */
 615 /* No dynticks: never skip waiting for a CPU's ack or memory barrier. */
 616 # define dyntick_save_progress_counter(cpu)     do { } while (0)
 617 # define rcu_try_flip_waitack_needed(cpu)       (1)
 618 # define rcu_try_flip_waitmb_needed(cpu)        (1)
 619
 620 #endif /* CONFIG_NO_HZ */
 621
 622 /*
 623  * Get here when RCU is idle.  Decide whether we need to
 624  * move out of idle state, and return non-zero if so.
 625  * "Straightforward" approach for the moment, might later
 626  * use callback-list lengths, grace-period duration, or
 627  * some such to determine when to exit idle state.
 628  * Might also need a pre-idle test that does not acquire
 629  * the lock, but let's get the simple case working first...
 630  */
 631
 632 static int
 633 rcu_try_flip_idle(void)
 634 {
 635         int cpu;
 636
 637         RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
 638         if (!rcu_pending(smp_processor_id())) {
 639                 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
 640                 return 0;
 641         }
 642
 643         /*
 644          * Do the flip (changes the parity readers use to pick rcu_flipctr[]).
 645          */
 646
 647         RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
 648         rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
 649
 650         /*
 651          * Need a memory barrier so that other CPUs see the new
 652          * counter value before they see the subsequent change of all
 653          * the rcu_flip_flag instances to rcu_flipped.
 654          */
 655
 656         smp_mb();       /* see above block comment. */
 657
 658         /* Now ask each CPU for acknowledgement of the flip, and */
 659         /* snapshot its dynticks counter for the waitack check.  */
 660         for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 661                 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 662                 dyntick_save_progress_counter(cpu);
 663         }
 664
 665         return 1;
 666 }
 667
 668 /*
 669  * Wait for CPUs to acknowledge the flip.  Return 1 once all have, else 0.
 670  */
 671
 672 static int
 673 rcu_try_flip_waitack(void)
 674 {
 675         int cpu;
 676
 677         RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 678         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 679                 if (rcu_try_flip_waitack_needed(cpu) &&
 680                     per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 681                         RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 682                         return 0;
 683                 }
 684
 685         /*
 686          * Make sure our checks above don't bleed into subsequent
 687          * waiting for the sum of the counters to reach zero.
 688          */
 689
 690         smp_mb();       /* see above block comment. */
 691         RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
 692         return 1;
 693 }
 694
 695 /*
 696  * Wait for collective ``last'' counter to reach zero, then tell all CPUs
 697  * to do an end-of-grace-period memory barrier.  Return 1 once requested.
 698  */
 699
 700 static int
 701 rcu_try_flip_waitzero(void)
 702 {
 703         int cpu;
 704         int lastidx = !(rcu_ctrlblk.completed & 0x1);
 705         int sum = 0;
 706
 707         /* Check to see if the sum of the "last" counters is zero. */
 708
 709         RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
 710         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 711                 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 712         if (sum != 0) {
 713                 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
 714                 return 0;
 715         }
 716
 717         /*
 718          * This ensures that the other CPUs see the call for
 719          * memory barriers -after- the sum to zero has been
 720          * detected here
 721          */
 722         smp_mb();  /*  ^^^^^^^^^^^^ */
 723
 724         /* Call for a memory barrier from each CPU. */
 725         for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 726                 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 727                 dyntick_save_progress_counter(cpu);
 728         }
 729
 730         RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 731         return 1;
 732 }
 733
 734 /*
 735  * Wait for all CPUs to do their end-of-grace-period memory barrier.
 736  * Return 1 once all CPUs have done so, 0 if any is still pending.
 737  */
 738
 739 static int
 740 rcu_try_flip_waitmb(void)
 741 {
 742         int cpu;
 743
 744         RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 745         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 746                 if (rcu_try_flip_waitmb_needed(cpu) &&
 747                     per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 748                         RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 749                         return 0;
 750                 }
 751
 752         smp_mb(); /* Ensure that the above checks precede any following flip. */
 753         RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
 754         return 1;
 755 }
 756
 757 /*
 758  * Attempt a single flip of the counters.  Remember, a single flip does
 759  * -not- constitute a grace period.  Instead, the interval between
 760  * at least GP_STAGES consecutive flips is a grace period.
 761  *
 762  * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
 763  * on a large SMP, they might want to use a hierarchical organization of
 764  * the per-CPU-counter pairs.
 765  */
 766 static void rcu_try_flip(void)
 767 {
 768         unsigned long flags;
 769         /* Bail out rather than spin if fliplock is already held. */
 770         RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
 771         if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
 772                 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
 773                 return;
 774         }
 775
 776         /*
 777          * Take the next transition(s) through the RCU grace-period
 778          * flip-counter state machine.
 779          */
 780
 781         switch (rcu_ctrlblk.rcu_try_flip_state) {
 782         case rcu_try_flip_idle_state:
 783                 if (rcu_try_flip_idle())
 784                         rcu_ctrlblk.rcu_try_flip_state =
 785                                 rcu_try_flip_waitack_state;
 786                 break;
 787         case rcu_try_flip_waitack_state:
 788                 if (rcu_try_flip_waitack())
 789                         rcu_ctrlblk.rcu_try_flip_state =
 790                                 rcu_try_flip_waitzero_state;
 791                 break;
 792         case rcu_try_flip_waitzero_state:
 793                 if (rcu_try_flip_waitzero())
 794                         rcu_ctrlblk.rcu_try_flip_state =
 795                                 rcu_try_flip_waitmb_state;
 796                 break;
 797         case rcu_try_flip_waitmb_state:
 798                 if (rcu_try_flip_waitmb())
 799                         rcu_ctrlblk.rcu_try_flip_state =
 800                                 rcu_try_flip_idle_state;
 801         }
 802         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 803 }
 804
 805 /*
 806  * Check to see if this CPU needs to do a memory barrier in order to
 807  * ensure that any prior RCU read-side critical sections have committed
 808  * their counter manipulations and critical-section memory references
 809  * before declaring the grace period to be completed.
 810  */
 811 static void rcu_check_mb(int cpu)
 812 {
 813         if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
 814                 smp_mb();  /* Ensure RCU read-side accesses are visible. */
 815                 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
 816         }
 817 }
 818
 819 void rcu_check_callbacks(int cpu, int user)
 820 {
 821         unsigned long flags;
 822         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 823
 824         rcu_check_mb(cpu);
 825         if (rcu_ctrlblk.completed == rdp->completed)
 826                 rcu_try_flip();
 827         spin_lock_irqsave(&rdp->lock, flags);
 828         RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 829         __rcu_advance_callbacks(rdp);
 830         if (rdp->donelist == NULL) {
 831                 spin_unlock_irqrestore(&rdp->lock, flags);
 832         } else {
 833                 spin_unlock_irqrestore(&rdp->lock, flags);
 834                 raise_softirq(RCU_SOFTIRQ);
 835         }
 836 }
 837
 838 /*
 839  * Needed by dynticks, to make sure all RCU processing has finished
 840  * when we go idle:
 841  */
 842 void rcu_advance_callbacks(int cpu, int user)
 843 {
 844         unsigned long flags;
 845         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 846
 847         if (rcu_ctrlblk.completed == rdp->completed) {
 848                 rcu_try_flip();
 849                 if (rcu_ctrlblk.completed == rdp->completed)
 850                         return;
 851         }
 852         spin_lock_irqsave(&rdp->lock, flags);
 853         RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 854         __rcu_advance_callbacks(rdp);
 855         spin_unlock_irqrestore(&rdp->lock, flags);
 856 }
 857
 858 #ifdef CONFIG_HOTPLUG_CPU
 859 #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
 860                 *dsttail = srclist; \
 861                 if (srclist != NULL) { \
 862                         dsttail = srctail; \
 863                         srclist = NULL; \
 864                         srctail = &srclist;\
 865                 } \
 866         } while (0)
 867
 868 void rcu_offline_cpu(int cpu)
 869 {
 870         int i;
 871         struct rcu_head *list = NULL;
 872         unsigned long flags;
 873         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 874         struct rcu_head **tail = &list;
 875
 876         /*
 877          * Remove all callbacks from the newly dead CPU, retaining order.
 878          * Otherwise rcu_barrier() will fail
 879          */
 880
 881         spin_lock_irqsave(&rdp->lock, flags);
 882         rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
 883         for (i = GP_STAGES - 1; i >= 0; i--)
 884                 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
 885                                                 list, tail);
 886         rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
 887         spin_unlock_irqrestore(&rdp->lock, flags);
 888         rdp->waitlistcount = 0;
 889
 890         /* Disengage the newly dead CPU from the grace-period computation. */
 891
 892         spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 893         rcu_check_mb(cpu);
 894         if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 895                 smp_mb();  /* Subsequent counter accesses must see new value */
 896                 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
 897                 smp_mb();  /* Subsequent RCU read-side critical sections */
 898                            /*  seen -after- acknowledgement. */
 899         }
 900
 901         RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
 902         RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
 903
 904         RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
 905         RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
 906
 907         cpu_clear(cpu, rcu_cpu_online_map);
 908
 909         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 910
 911         /*
 912          * Place the removed callbacks on the current CPU's queue.
 913          * Make them all start a new grace period: simple approach,
 914          * in theory could starve a given set of callbacks, but
 915          * you would need to be doing some serious CPU hotplugging
 916          * to make this happen.  If this becomes a problem, adding
 917          * a synchronize_rcu() to the hotplug path would be a simple
 918          * fix.
 919          */
 920
 921         local_irq_save(flags);
 922         rdp = RCU_DATA_ME();
 923         spin_lock(&rdp->lock);
 924         *rdp->nexttail = list;
 925         if (list)
 926                 rdp->nexttail = tail;
 927         spin_unlock_irqrestore(&rdp->lock, flags);
 928 }
 929
 930 void __devinit rcu_online_cpu(int cpu)
 931 {
 932         unsigned long flags;
 933
 934         spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 935         cpu_set(cpu, rcu_cpu_online_map);
 936         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 937 }
 938
 939 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 940
 941 void rcu_offline_cpu(int cpu)
 942 {
 943 }
 944
 945 void __devinit rcu_online_cpu(int cpu)
 946 {
 947 }
 948
 949 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 950
 951 static void rcu_process_callbacks(struct softirq_action *unused)
 952 {
 953         unsigned long flags;
 954         struct rcu_head *next, *list;
 955         struct rcu_data *rdp;
 956
 957         local_irq_save(flags);
 958         rdp = RCU_DATA_ME();
 959         spin_lock(&rdp->lock);
 960         list = rdp->donelist;
 961         if (list == NULL) {
 962                 spin_unlock_irqrestore(&rdp->lock, flags);
 963                 return;
 964         }
 965         rdp->donelist = NULL;
 966         rdp->donetail = &rdp->donelist;
 967         RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
 968         spin_unlock_irqrestore(&rdp->lock, flags);
 969         while (list) {
 970                 next = list->next;
 971                 list->func(list);
 972                 list = next;
 973                 RCU_TRACE_ME(rcupreempt_trace_invoke);
 974         }
 975 }
 976
 977 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 978 {
 979         unsigned long flags;
 980         struct rcu_data *rdp;
 981
 982         head->func = func;
 983         head->next = NULL;
 984         local_irq_save(flags);
 985         rdp = RCU_DATA_ME();
 986         spin_lock(&rdp->lock);
 987         __rcu_advance_callbacks(rdp);
 988         *rdp->nexttail = head;
 989         rdp->nexttail = &head->next;
 990         RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
 991         spin_unlock(&rdp->lock);
 992         local_irq_restore(flags);
 993 }
 994 EXPORT_SYMBOL_GPL(call_rcu);
 995
 996 /*
 997  * Wait until all currently running preempt_disable() code segments
 998  * (including hardware-irq-disable segments) complete.  Note that
 999  * in -rt this does -not- necessarily result in all currently executing
1000  * interrupt -handlers- having completed.
1001  */
1002 void __synchronize_sched(void)
1003 {
1004         cpumask_t oldmask;
1005         int cpu;
1006
1007         if (sched_getaffinity(0, &oldmask) < 0)
1008                 oldmask = cpu_possible_map;
1009         for_each_online_cpu(cpu) {
1010                 sched_setaffinity(0, cpumask_of_cpu(cpu));
1011                 schedule();
1012         }
1013         sched_setaffinity(0, oldmask);
1014 }
1015 EXPORT_SYMBOL_GPL(__synchronize_sched);
1016
1017 /*
1018  * Check to see if any future RCU-related work will need to be done
1019  * by the current CPU, even if none need be done immediately, returning
1020  * 1 if so.  Assumes that notifiers would take care of handling any
1021  * outstanding requests from the RCU core.
1022  *
1023  * This function is part of the RCU implementation; it is -not-
1024  * an exported member of the RCU API.
1025  */
1026 int rcu_needs_cpu(int cpu)
1027 {
1028         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1029
1030         return (rdp->donelist != NULL ||
1031                 !!rdp->waitlistcount ||
1032                 rdp->nextlist != NULL);
1033 }
1034
1035 int rcu_pending(int cpu)
1036 {
1037         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1038
1039         /* The CPU has at least one callback queued somewhere. */
1040
1041         if (rdp->donelist != NULL ||
1042             !!rdp->waitlistcount ||
1043             rdp->nextlist != NULL)
1044                 return 1;
1045
1046         /* The RCU core needs an acknowledgement from this CPU. */
1047
1048         if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1049             (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1050                 return 1;
1051
1052         /* This CPU has fallen behind the global grace-period number. */
1053
1054         if (rdp->completed != rcu_ctrlblk.completed)
1055                 return 1;
1056
1057         /* Nothing needed from this CPU. */
1058
1059         return 0;
1060 }
1061
1062 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1063                                 unsigned long action, void *hcpu)
1064 {
1065         long cpu = (long)hcpu;
1066
1067         switch (action) {
1068         case CPU_UP_PREPARE:
1069         case CPU_UP_PREPARE_FROZEN:
1070                 rcu_online_cpu(cpu);
1071                 break;
1072         case CPU_UP_CANCELED:
1073         case CPU_UP_CANCELED_FROZEN:
1074         case CPU_DEAD:
1075         case CPU_DEAD_FROZEN:
1076                 rcu_offline_cpu(cpu);
1077                 break;
1078         default:
1079                 break;
1080         }
1081         return NOTIFY_OK;
1082 }
1083
1084 static struct notifier_block __cpuinitdata rcu_nb = {
1085         .notifier_call = rcu_cpu_notify,
1086 };
1087
1088 void __init __rcu_init(void)
1089 {
1090         int cpu;
1091         int i;
1092         struct rcu_data *rdp;
1093
1094         printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1095         for_each_possible_cpu(cpu) {
1096                 rdp = RCU_DATA_CPU(cpu);
1097                 spin_lock_init(&rdp->lock);
1098                 rdp->completed = 0;
1099                 rdp->waitlistcount = 0;
1100                 rdp->nextlist = NULL;
1101                 rdp->nexttail = &rdp->nextlist;
1102                 for (i = 0; i < GP_STAGES; i++) {
1103                         rdp->waitlist[i] = NULL;
1104                         rdp->waittail[i] = &rdp->waitlist[i];
1105                 }
1106                 rdp->donelist = NULL;
1107                 rdp->donetail = &rdp->donelist;
1108                 rdp->rcu_flipctr[0] = 0;
1109                 rdp->rcu_flipctr[1] = 0;
1110         }
1111         register_cpu_notifier(&rcu_nb);
1112
1113         /*
1114          * We don't need protection against CPU-Hotplug here
1115          * since
1116          * a) If a CPU comes online while we are iterating over the
1117          *    cpu_online_map below, we would only end up making a
1118          *    duplicate call to rcu_online_cpu() which sets the corresponding
1119          *    CPU's mask in the rcu_cpu_online_map.
1120          *
1121          * b) A CPU cannot go offline at this point in time since the user
1122          *    does not have access to the sysfs interface, nor do we
1123          *    suspend the system.
1124          */
1125         for_each_online_cpu(cpu)
1126                 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1127
1128         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
1129 }
1130
/*
 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
 *
 * Kept as a thin wrapper that simply calls synchronize_rcu().
 */
void synchronize_kernel(void)
{
	synchronize_rcu();
}
1138
1139 #ifdef CONFIG_RCU_TRACE
1140 long *rcupreempt_flipctr(int cpu)
1141 {
1142         return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1143 }
1144 EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1145
1146 int rcupreempt_flip_flag(int cpu)
1147 {
1148         return per_cpu(rcu_flip_flag, cpu);
1149 }
1150 EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1151
1152 int rcupreempt_mb_flag(int cpu)
1153 {
1154         return per_cpu(rcu_mb_flag, cpu);
1155 }
1156 EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1157
1158 char *rcupreempt_try_flip_state_name(void)
1159 {
1160         return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1161 }
1162 EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1163
1164 struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1165 {
1166         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1167
1168         return &rdp->trace;
1169 }
1170 EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1171
#endif /* #ifdef CONFIG_RCU_TRACE */