git.oblomov.eu Git - linux-2.6/blob - arch/ia64/sn/kernel/xpc_main.c

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
   7  */
   8
   9
  10 /*
  11  * Cross Partition Communication (XPC) support - standard version.
  12  *
  13  *      XPC provides a message passing capability that crosses partition
  14  *      boundaries. This module is made up of two parts:
  15  *
  16  *          partition   This part detects the presence/absence of other
  17  *                      partitions. It provides a heartbeat and monitors
  18  *                      the heartbeats of other partitions.
  19  *
  20  *          channel     This part manages the channels and sends/receives
  21  *                      messages across them to/from other partitions.
  22  *
  23  *      There are a couple of additional functions residing in XP, which
  24  *      provide an interface to XPC for its users.
  25  *
  26  *
  27  *      Caveats:
  28  *
  29  *        . We currently have no way to determine which nasid an IPI came
  30  *          from. Thus, xpc_IPI_send() does a remote AMO write followed by
  31  *          an IPI. The AMO indicates where data is to be pulled from, so
  32  *          after the IPI arrives, the remote partition checks the AMO word.
  33  *          The IPI can actually arrive before the AMO however, so other code
  34  *          must periodically check for this case. Also, remote AMO operations
  35  *          do not reliably time out. Thus we do a remote PIO read solely to
  36  *          know whether the remote partition is down and whether we should
  37  *          stop sending IPIs to it. This remote PIO read operation is set up
  38  *          in a special nofault region so SAL knows to ignore (and cleanup)
  39  *          any errors due to the remote AMO write, PIO read, and/or PIO
  40  *          write operations.
  41  *
  42  *          If/when new hardware solves this IPI problem, we should abandon
  43  *          the current approach.
  44  *
  45  */
  46
  47
  48 #include <linux/kernel.h>
  49 #include <linux/module.h>
  50 #include <linux/init.h>
  51 #include <linux/sched.h>
  52 #include <linux/syscalls.h>
  53 #include <linux/cache.h>
  54 #include <linux/interrupt.h>
  55 #include <linux/slab.h>
  56 #include <asm/sn/intr.h>
  57 #include <asm/sn/sn_sal.h>
  58 #include <asm/uaccess.h>
  59 #include "xpc.h"
  60
  61
  62 /* define two XPC debug device structures to be used with dev_dbg() et al */
  63
  64 struct device_driver xpc_dbg_name = {
  65         .name = "xpc"
  66 };
  67
  68 struct device xpc_part_dbg_subname = {
  69         .bus_id = {0},          /* set to "part" at xpc_init() time */
  70         .driver = &xpc_dbg_name
  71 };
  72
  73 struct device xpc_chan_dbg_subname = {
  74         .bus_id = {0},          /* set to "chan" at xpc_init() time */
  75         .driver = &xpc_dbg_name
  76 };
  77
  78 struct device *xpc_part = &xpc_part_dbg_subname;
  79 struct device *xpc_chan = &xpc_chan_dbg_subname;
  80
  81
  82 /* systune related variables for /proc/sys directories */
  83
  84 static int xpc_hb_min = 1;
  85 static int xpc_hb_max = 10;
  86
  87 static int xpc_hb_check_min = 10;
  88 static int xpc_hb_check_max = 120;
  89
  90 static ctl_table xpc_sys_xpc_hb_dir[] = {
  91         {
  92                 1,
  93                 "hb_interval",
  94                 &xpc_hb_interval,
  95                 sizeof(int),
  96                 0644,
  97                 NULL,
  98                 &proc_dointvec_minmax,
  99                 &sysctl_intvec,
 100                 NULL,
 101                 &xpc_hb_min, &xpc_hb_max
 102         },
 103         {
 104                 2,
 105                 "hb_check_interval",
 106                 &xpc_hb_check_interval,
 107                 sizeof(int),
 108                 0644,
 109                 NULL,
 110                 &proc_dointvec_minmax,
 111                 &sysctl_intvec,
 112                 NULL,
 113                 &xpc_hb_check_min, &xpc_hb_check_max
 114         },
 115         {0}
 116 };
 117 static ctl_table xpc_sys_xpc_dir[] = {
 118         {
 119                 1,
 120                 "hb",
 121                 NULL,
 122                 0,
 123                 0555,
 124                 xpc_sys_xpc_hb_dir
 125         },
 126         {0}
 127 };
 128 static ctl_table xpc_sys_dir[] = {
 129         {
 130                 1,
 131                 "xpc",
 132                 NULL,
 133                 0,
 134                 0555,
 135                 xpc_sys_xpc_dir
 136         },
 137         {0}
 138 };
 139 static struct ctl_table_header *xpc_sysctl;
 140
 141
 142 /* #of IRQs received */
 143 static atomic_t xpc_act_IRQ_rcvd;
 144
 145 /* IRQ handler notifies this wait queue on receipt of an IRQ */
 146 static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
 147
 148 static unsigned long xpc_hb_check_timeout;
 149
 150 /* xpc_hb_checker thread exited notification */
 151 static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);
 152
 153 /* xpc_discovery thread exited notification */
 154 static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);
 155
 156
 157 static struct timer_list xpc_hb_timer;
 158
 159
 160 static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
 161
 162
 163 /*
 164  * Notify the heartbeat check thread that an IRQ has been received.
 165  */
 166 static irqreturn_t
 167 xpc_act_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
 168 {
 169         atomic_inc(&xpc_act_IRQ_rcvd);
 170         wake_up_interruptible(&xpc_act_IRQ_wq);
 171         return IRQ_HANDLED;
 172 }
 173
 174
 175 /*
 176  * Timer to produce the heartbeat.  The timer structures function is
 177  * already set when this is initially called.  A tunable is used to
 178  * specify when the next timeout should occur.
 179  */
 180 static void
 181 xpc_hb_beater(unsigned long dummy)
 182 {
 183         xpc_vars->heartbeat++;
 184
 185         if (jiffies >= xpc_hb_check_timeout) {
 186                 wake_up_interruptible(&xpc_act_IRQ_wq);
 187         }
 188
 189         xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
 190         add_timer(&xpc_hb_timer);
 191 }
 192
 193
 194 /*
 195  * This thread is responsible for nearly all of the partition
 196  * activation/deactivation.
 197  */
 198 static int
 199 xpc_hb_checker(void *ignore)
 200 {
 201         int last_IRQ_count = 0;
 202         int new_IRQ_count;
 203         int force_IRQ=0;
 204
 205
 206         /* this thread was marked active by xpc_hb_init() */
 207
 208         daemonize(XPC_HB_CHECK_THREAD_NAME);
 209
 210         set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
 211
 212         xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
 213
 214         while (!(volatile int) xpc_exiting) {
 215
 216                 /* wait for IRQ or timeout */
 217                 (void) wait_event_interruptible(xpc_act_IRQ_wq,
 218                             (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
 219                                         jiffies >= xpc_hb_check_timeout ||
 220                                                 (volatile int) xpc_exiting));
 221
 222                 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
 223                         "been received\n",
 224                         (int) (xpc_hb_check_timeout - jiffies),
 225                         atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
 226
 227
 228                 /* checking of remote heartbeats is skewed by IRQ handling */
 229                 if (jiffies >= xpc_hb_check_timeout) {
 230                         dev_dbg(xpc_part, "checking remote heartbeats\n");
 231                         xpc_check_remote_hb();
 232
 233                         /*
 234                          * We need to periodically recheck to ensure no
 235                          * IPI/AMO pairs have been missed.  That check
 236                          * must always reset xpc_hb_check_timeout.
 237                          */
 238                         force_IRQ = 1;
 239                 }
 240
 241
 242                 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
 243                 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
 244                         force_IRQ = 0;
 245
 246                         dev_dbg(xpc_part, "found an IRQ to process; will be "
 247                                 "resetting xpc_hb_check_timeout\n");
 248
 249                         last_IRQ_count += xpc_identify_act_IRQ_sender();
 250                         if (last_IRQ_count < new_IRQ_count) {
 251                                 /* retry once to help avoid missing AMO */
 252                                 (void) xpc_identify_act_IRQ_sender();
 253                         }
 254                         last_IRQ_count = new_IRQ_count;
 255
 256                         xpc_hb_check_timeout = jiffies +
 257                                            (xpc_hb_check_interval * HZ);
 258                 }
 259         }
 260
 261         dev_dbg(xpc_part, "heartbeat checker is exiting\n");
 262
 263
 264         /* mark this thread as inactive */
 265         up(&xpc_hb_checker_exited);
 266         return 0;
 267 }
 268
 269
 270 /*
 271  * This thread will attempt to discover other partitions to activate
 272  * based on info provided by SAL. This new thread is short lived and
 273  * will exit once discovery is complete.
 274  */
 275 static int
 276 xpc_initiate_discovery(void *ignore)
 277 {
 278         daemonize(XPC_DISCOVERY_THREAD_NAME);
 279
 280         xpc_discovery();
 281
 282         dev_dbg(xpc_part, "discovery thread is exiting\n");
 283
 284         /* mark this thread as inactive */
 285         up(&xpc_discovery_exited);
 286         return 0;
 287 }
 288
 289
 290 /*
 291  * Establish first contact with the remote partititon. This involves pulling
 292  * the XPC per partition variables from the remote partition and waiting for
 293  * the remote partition to pull ours.
 294  */
 295 static enum xpc_retval
 296 xpc_make_first_contact(struct xpc_partition *part)
 297 {
 298         enum xpc_retval ret;
 299
 300
 301         while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
 302                 if (ret != xpcRetry) {
 303                         XPC_DEACTIVATE_PARTITION(part, ret);
 304                         return ret;
 305                 }
 306
 307                 dev_dbg(xpc_chan, "waiting to make first contact with "
 308                         "partition %d\n", XPC_PARTID(part));
 309
 310                 /* wait a 1/4 of a second or so */
 311                 set_current_state(TASK_INTERRUPTIBLE);
 312                 (void) schedule_timeout(0.25 * HZ);
 313
 314                 if (part->act_state == XPC_P_DEACTIVATING) {
 315                         return part->reason;
 316                 }
 317         }
 318
 319         return xpc_mark_partition_active(part);
 320 }
 321
 322
 323 /*
 324  * The first kthread assigned to a newly activated partition is the one
 325  * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
 326  * that kthread until the partition is brought down, at which time that kthread
 327  * returns back to XPC HB. (The return of that kthread will signify to XPC HB
 328  * that XPC has dismantled all communication infrastructure for the associated
 329  * partition.) This kthread becomes the channel manager for that partition.
 330  *
 331  * Each active partition has a channel manager, who, besides connecting and
 332  * disconnecting channels, will ensure that each of the partition's connected
 333  * channels has the required number of assigned kthreads to get the work done.
 334  */
 335 static void
 336 xpc_channel_mgr(struct xpc_partition *part)
 337 {
 338         while (part->act_state != XPC_P_DEACTIVATING ||
 339                                 atomic_read(&part->nchannels_active) > 0) {
 340
 341                 xpc_process_channel_activity(part);
 342
 343
 344                 /*
 345                  * Wait until we've been requested to activate kthreads or
 346                  * all of the channel's message queues have been torn down or
 347                  * a signal is pending.
 348                  *
 349                  * The channel_mgr_requests is set to 1 after being awakened,
 350                  * This is done to prevent the channel mgr from making one pass
 351                  * through the loop for each request, since he will
 352                  * be servicing all the requests in one pass. The reason it's
 353                  * set to 1 instead of 0 is so that other kthreads will know
 354                  * that the channel mgr is running and won't bother trying to
 355                  * wake him up.
 356                  */
 357                 atomic_dec(&part->channel_mgr_requests);
 358                 (void) wait_event_interruptible(part->channel_mgr_wq,
 359                                 (atomic_read(&part->channel_mgr_requests) > 0 ||
 360                                 (volatile u64) part->local_IPI_amo != 0 ||
 361                                 ((volatile u8) part->act_state ==
 362                                                         XPC_P_DEACTIVATING &&
 363                                 atomic_read(&part->nchannels_active) == 0)));
 364                 atomic_set(&part->channel_mgr_requests, 1);
 365
 366                 // >>> Does it need to wakeup periodically as well? In case we
 367                 // >>> miscalculated the #of kthreads to wakeup or create?
 368         }
 369 }
 370
 371
 372 /*
 373  * When XPC HB determines that a partition has come up, it will create a new
 374  * kthread and that kthread will call this function to attempt to set up the
 375  * basic infrastructure used for Cross Partition Communication with the newly
 376  * upped partition.
 377  *
 378  * The kthread that was created by XPC HB and which setup the XPC
 379  * infrastructure will remain assigned to the partition until the partition
 380  * goes down. At which time the kthread will teardown the XPC infrastructure
 381  * and then exit.
 382  *
 383  * XPC HB will put the remote partition's XPC per partition specific variables
 384  * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
 385  * calling xpc_partition_up().
 386  */
 387 static void
 388 xpc_partition_up(struct xpc_partition *part)
 389 {
 390         DBUG_ON(part->channels != NULL);
 391
 392         dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
 393
 394         if (xpc_setup_infrastructure(part) != xpcSuccess) {
 395                 return;
 396         }
 397
 398         /*
 399          * The kthread that XPC HB called us with will become the
 400          * channel manager for this partition. It will not return
 401          * back to XPC HB until the partition's XPC infrastructure
 402          * has been dismantled.
 403          */
 404
 405         (void) xpc_part_ref(part);      /* this will always succeed */
 406
 407         if (xpc_make_first_contact(part) == xpcSuccess) {
 408                 xpc_channel_mgr(part);
 409         }
 410
 411         xpc_part_deref(part);
 412
 413         xpc_teardown_infrastructure(part);
 414 }
 415
 416
 417 static int
 418 xpc_activating(void *__partid)
 419 {
 420         partid_t partid = (u64) __partid;
 421         struct xpc_partition *part = &xpc_partitions[partid];
 422         unsigned long irq_flags;
 423         struct sched_param param = { sched_priority: MAX_USER_RT_PRIO - 1 };
 424         int ret;
 425
 426
 427         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
 428
 429         spin_lock_irqsave(&part->act_lock, irq_flags);
 430
 431         if (part->act_state == XPC_P_DEACTIVATING) {
 432                 part->act_state = XPC_P_INACTIVE;
 433                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
 434                 part->remote_rp_pa = 0;
 435                 return 0;
 436         }
 437
 438         /* indicate the thread is activating */
 439         DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
 440         part->act_state = XPC_P_ACTIVATING;
 441
 442         XPC_SET_REASON(part, 0, 0);
 443         spin_unlock_irqrestore(&part->act_lock, irq_flags);
 444
 445         dev_dbg(xpc_part, "bringing partition %d up\n", partid);
 446
 447         daemonize("xpc%02d", partid);
 448
 449         /*
 450          * This thread needs to run at a realtime priority to prevent a
 451          * significant performance degradation.
 452          */
 453         ret = sched_setscheduler(current, SCHED_FIFO, &param);
 454         if (ret != 0) {
 455                 dev_warn(xpc_part, "unable to set pid %d to a realtime "
 456                         "priority, ret=%d\n", current->pid, ret);
 457         }
 458
 459         /* allow this thread and its children to run on any CPU */
 460         set_cpus_allowed(current, CPU_MASK_ALL);
 461
 462         /*
 463          * Register the remote partition's AMOs with SAL so it can handle
 464          * and cleanup errors within that address range should the remote
 465          * partition go down. We don't unregister this range because it is
 466          * difficult to tell when outstanding writes to the remote partition
 467          * are finished and thus when it is safe to unregister. This should
 468          * not result in wasted space in the SAL xp_addr_region table because
 469          * we should get the same page for remote_amos_page_pa after module
 470          * reloads and system reboots.
 471          */
 472         if (sn_register_xp_addr_region(part->remote_amos_page_pa,
 473                                                         PAGE_SIZE, 1) < 0) {
 474                 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
 475                         "xp_addr region\n", partid);
 476
 477                 spin_lock_irqsave(&part->act_lock, irq_flags);
 478                 part->act_state = XPC_P_INACTIVE;
 479                 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
 480                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
 481                 part->remote_rp_pa = 0;
 482                 return 0;
 483         }
 484
 485         XPC_ALLOW_HB(partid, xpc_vars);
 486         xpc_IPI_send_activated(part);
 487
 488
 489         /*
 490          * xpc_partition_up() holds this thread and marks this partition as
 491          * XPC_P_ACTIVE by calling xpc_hb_mark_active().
 492          */
 493         (void) xpc_partition_up(part);
 494
 495         xpc_mark_partition_inactive(part);
 496
 497         if (part->reason == xpcReactivating) {
 498                 /* interrupting ourselves results in activating partition */
 499                 xpc_IPI_send_reactivate(part);
 500         }
 501
 502         return 0;
 503 }
 504
 505
 506 void
 507 xpc_activate_partition(struct xpc_partition *part)
 508 {
 509         partid_t partid = XPC_PARTID(part);
 510         unsigned long irq_flags;
 511         pid_t pid;
 512
 513
 514         spin_lock_irqsave(&part->act_lock, irq_flags);
 515
 516         pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);
 517
 518         DBUG_ON(part->act_state != XPC_P_INACTIVE);
 519
 520         if (pid > 0) {
 521                 part->act_state = XPC_P_ACTIVATION_REQ;
 522                 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
 523         } else {
 524                 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
 525         }
 526
 527         spin_unlock_irqrestore(&part->act_lock, irq_flags);
 528 }
 529
 530
 531 /*
 532  * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
 533  * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
 534  * than one partition, we use an AMO_t structure per partition to indicate
 535  * whether a partition has sent an IPI or not.  >>> If it has, then wake up the
 536  * associated kthread to handle it.
 537  *
 538  * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
 539  * running on other partitions.
 540  *
 541  * Noteworthy Arguments:
 542  *
 543  *      irq - Interrupt ReQuest number. NOT USED.
 544  *
 545  *      dev_id - partid of IPI's potential sender.
 546  *
 547  *      regs - processor's context before the processor entered
 548  *             interrupt code. NOT USED.
 549  */
 550 irqreturn_t
 551 xpc_notify_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
 552 {
 553         partid_t partid = (partid_t) (u64) dev_id;
 554         struct xpc_partition *part = &xpc_partitions[partid];
 555
 556
 557         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
 558
 559         if (xpc_part_ref(part)) {
 560                 xpc_check_for_channel_activity(part);
 561
 562                 xpc_part_deref(part);
 563         }
 564         return IRQ_HANDLED;
 565 }
 566
 567
 568 /*
 569  * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
 570  * because the write to their associated IPI amo completed after the IRQ/IPI
 571  * was received.
 572  */
 573 void
 574 xpc_dropped_IPI_check(struct xpc_partition *part)
 575 {
 576         if (xpc_part_ref(part)) {
 577                 xpc_check_for_channel_activity(part);
 578
 579                 part->dropped_IPI_timer.expires = jiffies +
 580                                                         XPC_P_DROPPED_IPI_WAIT;
 581                 add_timer(&part->dropped_IPI_timer);
 582                 xpc_part_deref(part);
 583         }
 584 }
 585
 586
 587 void
 588 xpc_activate_kthreads(struct xpc_channel *ch, int needed)
 589 {
 590         int idle = atomic_read(&ch->kthreads_idle);
 591         int assigned = atomic_read(&ch->kthreads_assigned);
 592         int wakeup;
 593
 594
 595         DBUG_ON(needed <= 0);
 596
 597         if (idle > 0) {
 598                 wakeup = (needed > idle) ? idle : needed;
 599                 needed -= wakeup;
 600
 601                 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
 602                         "channel=%d\n", wakeup, ch->partid, ch->number);
 603
 604                 /* only wakeup the requested number of kthreads */
 605                 wake_up_nr(&ch->idle_wq, wakeup);
 606         }
 607
 608         if (needed <= 0) {
 609                 return;
 610         }
 611
 612         if (needed + assigned > ch->kthreads_assigned_limit) {
 613                 needed = ch->kthreads_assigned_limit - assigned;
 614                 // >>>should never be less than 0
 615                 if (needed <= 0) {
 616                         return;
 617                 }
 618         }
 619
 620         dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
 621                 needed, ch->partid, ch->number);
 622
 623         xpc_create_kthreads(ch, needed);
 624 }
 625
 626
 627 /*
 628  * This function is where XPC's kthreads wait for messages to deliver.
 629  */
 630 static void
 631 xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
 632 {
 633         do {
 634                 /* deliver messages to their intended recipients */
 635
 636                 while ((volatile s64) ch->w_local_GP.get <
 637                                 (volatile s64) ch->w_remote_GP.put &&
 638                                         !((volatile u32) ch->flags &
 639                                                 XPC_C_DISCONNECTING)) {
 640                         xpc_deliver_msg(ch);
 641                 }
 642
 643                 if (atomic_inc_return(&ch->kthreads_idle) >
 644                                                 ch->kthreads_idle_limit) {
 645                         /* too many idle kthreads on this channel */
 646                         atomic_dec(&ch->kthreads_idle);
 647                         break;
 648                 }
 649
 650                 dev_dbg(xpc_chan, "idle kthread calling "
 651                         "wait_event_interruptible_exclusive()\n");
 652
 653                 (void) wait_event_interruptible_exclusive(ch->idle_wq,
 654                                 ((volatile s64) ch->w_local_GP.get <
 655                                         (volatile s64) ch->w_remote_GP.put ||
 656                                 ((volatile u32) ch->flags &
 657                                                 XPC_C_DISCONNECTING)));
 658
 659                 atomic_dec(&ch->kthreads_idle);
 660
 661         } while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
 662 }
 663
 664
 665 static int
 666 xpc_daemonize_kthread(void *args)
 667 {
 668         partid_t partid = XPC_UNPACK_ARG1(args);
 669         u16 ch_number = XPC_UNPACK_ARG2(args);
 670         struct xpc_partition *part = &xpc_partitions[partid];
 671         struct xpc_channel *ch;
 672         int n_needed;
 673
 674
 675         daemonize("xpc%02dc%d", partid, ch_number);
 676
 677         dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
 678                 partid, ch_number);
 679
 680         ch = &part->channels[ch_number];
 681
 682         if (!(ch->flags & XPC_C_DISCONNECTING)) {
 683                 DBUG_ON(!(ch->flags & XPC_C_CONNECTED));
 684
 685                 /* let registerer know that connection has been established */
 686
 687                 if (atomic_read(&ch->kthreads_assigned) == 1) {
 688                         xpc_connected_callout(ch);
 689
 690                         /*
 691                          * It is possible that while the callout was being
 692                          * made that the remote partition sent some messages.
 693                          * If that is the case, we may need to activate
 694                          * additional kthreads to help deliver them. We only
 695                          * need one less than total #of messages to deliver.
 696                          */
 697                         n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
 698                         if (n_needed > 0 &&
 699                                         !(ch->flags & XPC_C_DISCONNECTING)) {
 700                                 xpc_activate_kthreads(ch, n_needed);
 701                         }
 702                 }
 703
 704                 xpc_kthread_waitmsgs(part, ch);
 705         }
 706
 707         if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
 708                         ((ch->flags & XPC_C_CONNECTCALLOUT) ||
 709                                 (ch->reason != xpcUnregistering &&
 710                                         ch->reason != xpcOtherUnregistering))) {
 711                 xpc_disconnected_callout(ch);
 712         }
 713
 714
 715         xpc_msgqueue_deref(ch);
 716
 717         dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
 718                 partid, ch_number);
 719
 720         xpc_part_deref(part);
 721         return 0;
 722 }
 723
 724
 725 /*
 726  * For each partition that XPC has established communications with, there is
 727  * a minimum of one kernel thread assigned to perform any operation that
 728  * may potentially sleep or block (basically the callouts to the asynchronous
 729  * functions registered via xpc_connect()).
 730  *
 731  * Additional kthreads are created and destroyed by XPC as the workload
 732  * demands.
 733  *
 734  * A kthread is assigned to one of the active channels that exists for a given
 735  * partition.
 736  */
 737 void
 738 xpc_create_kthreads(struct xpc_channel *ch, int needed)
 739 {
 740         unsigned long irq_flags;
 741         pid_t pid;
 742         u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
 743
 744
 745         while (needed-- > 0) {
 746                 pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
 747                 if (pid < 0) {
 748                         /* the fork failed */
 749
 750                         if (atomic_read(&ch->kthreads_assigned) <
 751                                                 ch->kthreads_idle_limit) {
 752                                 /*
 753                                  * Flag this as an error only if we have an
 754                                  * insufficient #of kthreads for the channel
 755                                  * to function.
 756                                  *
 757                                  * No xpc_msgqueue_ref() is needed here since
 758                                  * the channel mgr is doing this.
 759                                  */
 760                                 spin_lock_irqsave(&ch->lock, irq_flags);
 761                                 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
 762                                                                 &irq_flags);
 763                                 spin_unlock_irqrestore(&ch->lock, irq_flags);
 764                         }
 765                         break;
 766                 }
 767
 768                 /*
 769                  * The following is done on behalf of the newly created
 770                  * kthread. That kthread is responsible for doing the
 771                  * counterpart to the following before it exits.
 772                  */
 773                 (void) xpc_part_ref(&xpc_partitions[ch->partid]);
 774                 xpc_msgqueue_ref(ch);
 775                 atomic_inc(&ch->kthreads_assigned);
 776                 ch->kthreads_created++; // >>> temporary debug only!!!
 777         }
 778 }
 779
 780
 781 void
 782 xpc_disconnect_wait(int ch_number)
 783 {
 784         partid_t partid;
 785         struct xpc_partition *part;
 786         struct xpc_channel *ch;
 787
 788
 789         /* now wait for all callouts to the caller's function to cease */
 790         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
 791                 part = &xpc_partitions[partid];
 792
 793                 if (xpc_part_ref(part)) {
 794                         ch = &part->channels[ch_number];
 795
 796 // >>> how do we keep from falling into the window between our check and going
 797 // >>> down and coming back up where sema is re-inited?
 798                         if (ch->flags & XPC_C_SETUP) {
 799                                 (void) down(&ch->teardown_sema);
 800                         }
 801
 802                         xpc_part_deref(part);
 803                 }
 804         }
 805 }
 806
 807
 808 static void
 809 xpc_do_exit(void)
 810 {
 811         partid_t partid;
 812         int active_part_count;
 813         struct xpc_partition *part;
 814
 815
 816         /* now it's time to eliminate our heartbeat */
 817         del_timer_sync(&xpc_hb_timer);
 818         xpc_vars->heartbeating_to_mask = 0;
 819
 820         /* indicate to others that our reserved page is uninitialized */
 821         xpc_rsvd_page->vars_pa = 0;
 822
 823         /*
 824          * Ignore all incoming interrupts. Without interupts the heartbeat
 825          * checker won't activate any new partitions that may come up.
 826          */
 827         free_irq(SGI_XPC_ACTIVATE, NULL);
 828
 829         /*
 830          * Cause the heartbeat checker and the discovery threads to exit.
 831          * We don't want them attempting to activate new partitions as we
 832          * try to deactivate the existing ones.
 833          */
 834         xpc_exiting = 1;
 835         wake_up_interruptible(&xpc_act_IRQ_wq);
 836
 837         /* wait for the heartbeat checker thread to mark itself inactive */
 838         down(&xpc_hb_checker_exited);
 839
 840         /* wait for the discovery thread to mark itself inactive */
 841         down(&xpc_discovery_exited);
 842
 843
 844         set_current_state(TASK_INTERRUPTIBLE);
 845         schedule_timeout(0.3 * HZ);
 846         set_current_state(TASK_RUNNING);
 847
 848
 849         /* wait for all partitions to become inactive */
 850
 851         do {
 852                 active_part_count = 0;
 853
 854                 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
 855                         part = &xpc_partitions[partid];
 856                         if (part->act_state != XPC_P_INACTIVE) {
 857                                 active_part_count++;
 858
 859                                 XPC_DEACTIVATE_PARTITION(part, xpcUnloading);
 860                         }
 861                 }
 862
 863                 if (active_part_count) {
 864                         set_current_state(TASK_INTERRUPTIBLE);
 865                         schedule_timeout(0.3 * HZ);
 866                         set_current_state(TASK_RUNNING);
 867                 }
 868
 869         } while (active_part_count > 0);
 870
 871
 872         /* close down protections for IPI operations */
 873         xpc_restrict_IPI_ops();
 874
 875
 876         /* clear the interface to XPC's functions */
 877         xpc_clear_interface();
 878
 879         if (xpc_sysctl) {
 880                 unregister_sysctl_table(xpc_sysctl);
 881         }
 882 }
 883
 884
 885 int __init
 886 xpc_init(void)
 887 {
 888         int ret;
 889         partid_t partid;
 890         struct xpc_partition *part;
 891         pid_t pid;
 892
 893
 894         /*
 895          * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
 896          * both a partition's reserved page and its XPC variables. Its size was
 897          * based on the size of a reserved page. So we need to ensure that the
 898          * XPC variables will fit as well.
 899          */
 900         if (XPC_VARS_ALIGNED_SIZE > XPC_RSVD_PAGE_ALIGNED_SIZE) {
 901                 dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
 902                 return -EPERM;
 903         }
 904         DBUG_ON((u64) xpc_remote_copy_buffer !=
 905                                 L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer));
 906
 907         snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
 908         snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
 909
 910         xpc_sysctl = register_sysctl_table(xpc_sys_dir, 1);
 911
 912         /*
 913          * The first few fields of each entry of xpc_partitions[] need to
 914          * be initialized now so that calls to xpc_connect() and
 915          * xpc_disconnect() can be made prior to the activation of any remote
 916          * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
 917          * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
 918          * PARTITION HAS BEEN ACTIVATED.
 919          */
 920         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
 921                 part = &xpc_partitions[partid];
 922
 923                 DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));
 924
 925                 part->act_IRQ_rcvd = 0;
 926                 spin_lock_init(&part->act_lock);
 927                 part->act_state = XPC_P_INACTIVE;
 928                 XPC_SET_REASON(part, 0, 0);
 929                 part->setup_state = XPC_P_UNSET;
 930                 init_waitqueue_head(&part->teardown_wq);
 931                 atomic_set(&part->references, 0);
 932         }
 933
 934         /*
 935          * Open up protections for IPI operations (and AMO operations on
 936          * Shub 1.1 systems).
 937          */
 938         xpc_allow_IPI_ops();
 939
 940         /*
 941          * Interrupts being processed will increment this atomic variable and
 942          * awaken the heartbeat thread which will process the interrupts.
 943          */
 944         atomic_set(&xpc_act_IRQ_rcvd, 0);
 945
 946         /*
 947          * This is safe to do before the xpc_hb_checker thread has started
 948          * because the handler releases a wait queue.  If an interrupt is
 949          * received before the thread is waiting, it will not go to sleep,
 950          * but rather immediately process the interrupt.
 951          */
 952         ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
 953                                                         "xpc hb", NULL);
 954         if (ret != 0) {
 955                 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
 956                         "errno=%d\n", -ret);
 957
 958                 xpc_restrict_IPI_ops();
 959
 960                 if (xpc_sysctl) {
 961                         unregister_sysctl_table(xpc_sysctl);
 962                 }
 963                 return -EBUSY;
 964         }
 965
 966         /*
 967          * Fill the partition reserved page with the information needed by
 968          * other partitions to discover we are alive and establish initial
 969          * communications.
 970          */
 971         xpc_rsvd_page = xpc_rsvd_page_init();
 972         if (xpc_rsvd_page == NULL) {
 973                 dev_err(xpc_part, "could not setup our reserved page\n");
 974
 975                 free_irq(SGI_XPC_ACTIVATE, NULL);
 976                 xpc_restrict_IPI_ops();
 977
 978                 if (xpc_sysctl) {
 979                         unregister_sysctl_table(xpc_sysctl);
 980                 }
 981                 return -EBUSY;
 982         }
 983
 984
 985         /*
 986          * Set the beating to other partitions into motion.  This is
 987          * the last requirement for other partitions' discovery to
 988          * initiate communications with us.
 989          */
 990         init_timer(&xpc_hb_timer);
 991         xpc_hb_timer.function = xpc_hb_beater;
 992         xpc_hb_beater(0);
 993
 994
 995         /*
 996          * The real work-horse behind xpc.  This processes incoming
 997          * interrupts and monitors remote heartbeats.
 998          */
 999         pid = kernel_thread(xpc_hb_checker, NULL, 0);
1000         if (pid < 0) {
1001                 dev_err(xpc_part, "failed while forking hb check thread\n");
1002
1003                 /* indicate to others that our reserved page is uninitialized */
1004                 xpc_rsvd_page->vars_pa = 0;
1005
1006                 del_timer_sync(&xpc_hb_timer);
1007                 free_irq(SGI_XPC_ACTIVATE, NULL);
1008                 xpc_restrict_IPI_ops();
1009
1010                 if (xpc_sysctl) {
1011                         unregister_sysctl_table(xpc_sysctl);
1012                 }
1013                 return -EBUSY;
1014         }
1015
1016
1017         /*
1018          * Startup a thread that will attempt to discover other partitions to
1019          * activate based on info provided by SAL. This new thread is short
1020          * lived and will exit once discovery is complete.
1021          */
1022         pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
1023         if (pid < 0) {
1024                 dev_err(xpc_part, "failed while forking discovery thread\n");
1025
1026                 /* mark this new thread as a non-starter */
1027                 up(&xpc_discovery_exited);
1028
1029                 xpc_do_exit();
1030                 return -EBUSY;
1031         }
1032
1033
1034         /* set the interface to point at XPC's functions */
1035         xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1036                           xpc_initiate_allocate, xpc_initiate_send,
1037                           xpc_initiate_send_notify, xpc_initiate_received,
1038                           xpc_initiate_partid_to_nasids);
1039
1040         return 0;
1041 }
1042 module_init(xpc_init);
1043
1044
1045 void __exit
1046 xpc_exit(void)
1047 {
1048         xpc_do_exit();
1049 }
1050 module_exit(xpc_exit);
1051
1052
1053 MODULE_AUTHOR("Silicon Graphics, Inc.");
1054 MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1055 MODULE_LICENSE("GPL");
1056
1057 module_param(xpc_hb_interval, int, 0);
1058 MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1059                 "heartbeat increments.");
1060
1061 module_param(xpc_hb_check_interval, int, 0);
1062 MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1063                 "heartbeat checks.");
1064