[linux-2.6] arch/ia64/sn/kernel/xpc_main.c
1 /*
2  * This file is subject to the terms and conditions of the GNU General Public
3  * License.  See the file "COPYING" in the main directory of this archive
4  * for more details.
5  *
6  * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
7  */
8
9
10 /*
11  * Cross Partition Communication (XPC) support - standard version.
12  *
13  *      XPC provides a message passing capability that crosses partition
14  *      boundaries. This module is made up of two parts:
15  *
16  *          partition   This part detects the presence/absence of other
17  *                      partitions. It provides a heartbeat and monitors
18  *                      the heartbeats of other partitions.
19  *
20  *          channel     This part manages the channels and sends/receives
21  *                      messages across them to/from other partitions.
22  *
23  *      There are a couple of additional functions residing in XP, which
24  *      provide an interface to XPC for its users.
25  *
26  *
27  *      Caveats:
28  *
29  *        . We currently have no way to determine which nasid an IPI came
30  *          from. Thus, xpc_IPI_send() does a remote AMO write followed by
31  *          an IPI. The AMO indicates where data is to be pulled from, so
32  *          after the IPI arrives, the remote partition checks the AMO word.
33  *          The IPI can actually arrive before the AMO write, however, so other
34  *          code must periodically check for this case. Also, remote AMO operations
35  *          do not reliably time out. Thus we do a remote PIO read solely to
36  *          know whether the remote partition is down and whether we should
37  *          stop sending IPIs to it. This remote PIO read operation is set up
38  *          in a special nofault region so SAL knows to ignore (and clean up)
39  *          any errors due to the remote AMO write, PIO read, and/or PIO
40  *          write operations.
41  *
42  *          If/when new hardware solves this IPI problem, we should abandon
43  *          the current approach.
44  *
45  */
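
/*
 * A minimal sketch of the ordering described above.  The helper names
 * (amo_write_remote(), ipi_send(), amo_check()) are hypothetical
 * stand-ins for the SN2-specific primitives this driver uses; only the
 * ordering is the point:
 *
 *	// sender
 *	amo_write_remote(amo_word);	// publish where data is to be pulled from
 *	ipi_send(target_nasid);		// then poke the remote partition
 *
 *	// receiver (IRQ handler, plus a periodic fallback)
 *	if (amo_check(amo_word))	// the IPI may beat the AMO write,
 *		pull_data();		// so a timer must recheck this too
 */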
46
47
48 #include <linux/kernel.h>
49 #include <linux/module.h>
50 #include <linux/init.h>
51 #include <linux/sched.h>
52 #include <linux/syscalls.h>
53 #include <linux/cache.h>
54 #include <linux/interrupt.h>
55 #include <linux/slab.h>
56 #include <linux/delay.h>
57 #include <linux/reboot.h>
58 #include <asm/sn/intr.h>
59 #include <asm/sn/sn_sal.h>
60 #include <asm/kdebug.h>
61 #include <asm/uaccess.h>
62 #include "xpc.h"
63
64
65 /* define two XPC debug device structures to be used with dev_dbg() et al */
66
67 struct device_driver xpc_dbg_name = {
68         .name = "xpc"
69 };
70
71 struct device xpc_part_dbg_subname = {
72         .bus_id = {0},          /* set to "part" at xpc_init() time */
73         .driver = &xpc_dbg_name
74 };
75
76 struct device xpc_chan_dbg_subname = {
77         .bus_id = {0},          /* set to "chan" at xpc_init() time */
78         .driver = &xpc_dbg_name
79 };
80
81 struct device *xpc_part = &xpc_part_dbg_subname;
82 struct device *xpc_chan = &xpc_chan_dbg_subname;
83
84
85 /* systune related variables for /proc/sys directories */
86
87 static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
88 static int xpc_hb_min_interval = 1;
89 static int xpc_hb_max_interval = 10;
90
91 static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
92 static int xpc_hb_check_min_interval = 10;
93 static int xpc_hb_check_max_interval = 120;
94
95 int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
96 static int xpc_disengage_request_min_timelimit = 0;
97 static int xpc_disengage_request_max_timelimit = 120;
98
99 static ctl_table xpc_sys_xpc_hb_dir[] = {
100         {
101                 1,
102                 "hb_interval",
103                 &xpc_hb_interval,
104                 sizeof(int),
105                 0644,
106                 NULL,
107                 &proc_dointvec_minmax,
108                 &sysctl_intvec,
109                 NULL,
110                 &xpc_hb_min_interval,
111                 &xpc_hb_max_interval
112         },
113         {
114                 2,
115                 "hb_check_interval",
116                 &xpc_hb_check_interval,
117                 sizeof(int),
118                 0644,
119                 NULL,
120                 &proc_dointvec_minmax,
121                 &sysctl_intvec,
122                 NULL,
123                 &xpc_hb_check_min_interval,
124                 &xpc_hb_check_max_interval
125         },
126         {0}
127 };
128 static ctl_table xpc_sys_xpc_dir[] = {
129         {
130                 1,
131                 "hb",
132                 NULL,
133                 0,
134                 0555,
135                 xpc_sys_xpc_hb_dir
136         },
137         {
138                 2,
139                 "disengage_request_timelimit",
140                 &xpc_disengage_request_timelimit,
141                 sizeof(int),
142                 0644,
143                 NULL,
144                 &proc_dointvec_minmax,
145                 &sysctl_intvec,
146                 NULL,
147                 &xpc_disengage_request_min_timelimit,
148                 &xpc_disengage_request_max_timelimit
149         },
150         {0}
151 };
152 static ctl_table xpc_sys_dir[] = {
153         {
154                 1,
155                 "xpc",
156                 NULL,
157                 0,
158                 0555,
159                 xpc_sys_xpc_dir
160         },
161         {0}
162 };
163 static struct ctl_table_header *xpc_sysctl;
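
/*
 * For reference, the positional initializers above map onto the 2.6-era
 * struct ctl_table fields as follows (designated-initializer form of the
 * first entry; behavior is identical):
 *
 *	{
 *		.ctl_name	= 1,
 *		.procname	= "hb_interval",
 *		.data		= &xpc_hb_interval,
 *		.maxlen		= sizeof(int),
 *		.mode		= 0644,
 *		.child		= NULL,
 *		.proc_handler	= &proc_dointvec_minmax,
 *		.strategy	= &sysctl_intvec,
 *		.de		= NULL,
 *		.extra1		= &xpc_hb_min_interval,
 *		.extra2		= &xpc_hb_max_interval
 *	},
 *
 * The resulting tunable appears as /proc/sys/xpc/hb/hb_interval, clamped
 * by extra1/extra2 via proc_dointvec_minmax().
 */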
164
165
166 /* #of IRQs received */
167 static atomic_t xpc_act_IRQ_rcvd;
168
169 /* IRQ handler notifies this wait queue on receipt of an IRQ */
170 static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
171
172 static unsigned long xpc_hb_check_timeout;
173
174 /* notification that the xpc_hb_checker thread has exited */
175 static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);
176
177 /* notification that the xpc_discovery thread has exited */
178 static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);
179
180
181 static struct timer_list xpc_hb_timer;
182
183
184 static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
185
186
187 static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
188 static struct notifier_block xpc_reboot_notifier = {
189         .notifier_call = xpc_system_reboot,
190 };
191
192 static int xpc_system_die(struct notifier_block *, unsigned long, void *);
193 static struct notifier_block xpc_die_notifier = {
194         .notifier_call = xpc_system_die,
195 };
196
197
198 /*
199  * Timer function to enforce the timelimit on the partition disengage request.
200  */
201 static void
202 xpc_timeout_partition_disengage_request(unsigned long data)
203 {
204         struct xpc_partition *part = (struct xpc_partition *) data;
205
206
207         DBUG_ON(jiffies < part->disengage_request_timeout);
208
209         (void) xpc_partition_disengaged(part);
210
211         DBUG_ON(part->disengage_request_timeout != 0);
212         DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
213 }
214
215
216 /*
217  * Notify the heartbeat check thread that an IRQ has been received.
218  */
219 static irqreturn_t
220 xpc_act_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
221 {
222         atomic_inc(&xpc_act_IRQ_rcvd);
223         wake_up_interruptible(&xpc_act_IRQ_wq);
224         return IRQ_HANDLED;
225 }
226
227
228 /*
229  * Timer function to produce the heartbeat.  The timer structure's
230  * function pointer is already set when this is initially called.  A
231  * tunable specifies when the next timeout should occur.
232  */
233 static void
234 xpc_hb_beater(unsigned long dummy)
235 {
236         xpc_vars->heartbeat++;
237
238         if (jiffies >= xpc_hb_check_timeout) {
239                 wake_up_interruptible(&xpc_act_IRQ_wq);
240         }
241
242         xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
243         add_timer(&xpc_hb_timer);
244 }
245
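/*
 * For example, with xpc_hb_interval=5 and xpc_hb_check_interval=20 (both
 * in seconds), xpc_hb_beater() fires and increments xpc_vars->heartbeat
 * at t=5, 10, 15, 20, ...  At t=20, jiffies reaches xpc_hb_check_timeout,
 * so the beater also wakes xpc_act_IRQ_wq; the xpc_hb_checker thread
 * below then checks the remote heartbeats and re-arms the 20-second
 * check window.
 */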
246
247 /*
248  * This thread is responsible for nearly all of the partition
249  * activation/deactivation.
250  */
251 static int
252 xpc_hb_checker(void *ignore)
253 {
254         int last_IRQ_count = 0;
255         int new_IRQ_count;
256         int force_IRQ = 0;
257
258
259         /* this thread was marked active by xpc_hb_init() */
260
261         daemonize(XPC_HB_CHECK_THREAD_NAME);
262
263         set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
264
265         xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
266
267         while (!(volatile int) xpc_exiting) {
268
269                 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
270                         "been received\n",
271                         (int) (xpc_hb_check_timeout - jiffies),
272                         atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
273
274
275                 /* checking of remote heartbeats is skewed by IRQ handling */
276                 if (jiffies >= xpc_hb_check_timeout) {
277                         dev_dbg(xpc_part, "checking remote heartbeats\n");
278                         xpc_check_remote_hb();
279
280                         /*
281                          * We need to periodically recheck to ensure no
282                          * IPI/AMO pairs have been missed.  That check
283                          * must always reset xpc_hb_check_timeout.
284                          */
285                         force_IRQ = 1;
286                 }
287
288
289                 /* check for outstanding IRQs */
290                 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
291                 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
292                         force_IRQ = 0;
293
294                         dev_dbg(xpc_part, "found an IRQ to process; will be "
295                                 "resetting xpc_hb_check_timeout\n");
296
297                         last_IRQ_count += xpc_identify_act_IRQ_sender();
298                         if (last_IRQ_count < new_IRQ_count) {
299                                 /* retry once to help avoid missing AMO */
300                                 (void) xpc_identify_act_IRQ_sender();
301                         }
302                         last_IRQ_count = new_IRQ_count;
303
304                         xpc_hb_check_timeout = jiffies +
305                                            (xpc_hb_check_interval * HZ);
306                 }
307
308                 /* wait for IRQ or timeout */
309                 (void) wait_event_interruptible(xpc_act_IRQ_wq,
310                             (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
311                                         jiffies >= xpc_hb_check_timeout ||
312                                                 (volatile int) xpc_exiting));
313         }
314
315         dev_dbg(xpc_part, "heartbeat checker is exiting\n");
316
317
318         /* mark this thread as having exited */
319         up(&xpc_hb_checker_exited);
320         return 0;
321 }
322
323
324 /*
325  * This thread will attempt to discover other partitions to activate
326  * based on info provided by SAL. This new thread is short lived and
327  * will exit once discovery is complete.
328  */
329 static int
330 xpc_initiate_discovery(void *ignore)
331 {
332         daemonize(XPC_DISCOVERY_THREAD_NAME);
333
334         xpc_discovery();
335
336         dev_dbg(xpc_part, "discovery thread is exiting\n");
337
338         /* mark this thread as having exited */
339         up(&xpc_discovery_exited);
340         return 0;
341 }
342
343
344 /*
345  * Establish first contact with the remote partition. This involves pulling
346  * the XPC per partition variables from the remote partition and waiting for
347  * the remote partition to pull ours.
348  */
349 static enum xpc_retval
350 xpc_make_first_contact(struct xpc_partition *part)
351 {
352         enum xpc_retval ret;
353
354
355         while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
356                 if (ret != xpcRetry) {
357                         XPC_DEACTIVATE_PARTITION(part, ret);
358                         return ret;
359                 }
360
361                 dev_dbg(xpc_chan, "waiting to make first contact with "
362                         "partition %d\n", XPC_PARTID(part));
363
364                 /* wait a quarter of a second or so */
365                 (void) msleep_interruptible(250);
366
367                 if (part->act_state == XPC_P_DEACTIVATING) {
368                         return part->reason;
369                 }
370         }
371
372         return xpc_mark_partition_active(part);
373 }
374
375
376 /*
377  * The first kthread assigned to a newly activated partition is the one
378  * created by XPC HB, which uses it to call xpc_partition_up(). XPC hangs
379  * on to that kthread until the partition is brought down, at which time
380  * it returns to XPC HB, signifying that XPC has dismantled all
381  * communication infrastructure for the associated partition. This
382  * kthread becomes the channel manager for that partition.
383  *
384  * Each active partition has a channel manager which, besides connecting
385  * and disconnecting channels, ensures that each of the partition's
386  * connected channels has enough assigned kthreads to get the work done.
387  */
388 static void
389 xpc_channel_mgr(struct xpc_partition *part)
390 {
391         while (part->act_state != XPC_P_DEACTIVATING ||
392                         atomic_read(&part->nchannels_active) > 0 ||
393                                         !xpc_partition_disengaged(part)) {
394
395                 xpc_process_channel_activity(part);
396
397
398                 /*
399                  * Wait until we've been requested to activate kthreads or
400                  * all of the channel's message queues have been torn down or
401                  * a signal is pending.
402                  *
403                  * The channel_mgr_requests count is set to 1 after the
404                  * channel mgr is awakened.  This is done to prevent the
405                  * channel mgr from making one pass through the loop for
406                  * each request, since it will be servicing all of the
407                  * requests in one pass.  The reason it's set to 1 instead
408                  * of 0 is so that other kthreads will know that the channel
409                  * mgr is running and won't bother trying to wake it up.
410                  */
411                 atomic_dec(&part->channel_mgr_requests);
412                 (void) wait_event_interruptible(part->channel_mgr_wq,
413                                 (atomic_read(&part->channel_mgr_requests) > 0 ||
414                                 (volatile u64) part->local_IPI_amo != 0 ||
415                                 ((volatile u8) part->act_state ==
416                                                         XPC_P_DEACTIVATING &&
417                                 atomic_read(&part->nchannels_active) == 0 &&
418                                 xpc_partition_disengaged(part))));
419                 atomic_set(&part->channel_mgr_requests, 1);
420
421                 // >>> Does it need to wakeup periodically as well? In case we
422                 // >>> miscalculated the #of kthreads to wakeup or create?
423         }
424 }
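
/*
 * The channel_mgr_requests handling above, reduced to a generic sketch
 * (illustrative only; these names are hypothetical).  Requesters bump the
 * count and wake the manager; the manager batches all outstanding work on
 * each wakeup and leaves the count at 1 so requesters can tell it is
 * already running:
 *
 *	// requester
 *	atomic_inc(&requests);
 *	wake_up(&mgr_wq);
 *
 *	// manager loop body
 *	service_everything();
 *	atomic_dec(&requests);
 *	(void) wait_event_interruptible(mgr_wq, atomic_read(&requests) > 0);
 *	atomic_set(&requests, 1);
 */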
425
426
427 /*
428  * When XPC HB determines that a partition has come up, it will create a new
429  * kthread and that kthread will call this function to attempt to set up the
430  * basic infrastructure used for Cross Partition Communication with the newly
431  * upped partition.
432  *
433  * The kthread that was created by XPC HB and which set up the XPC
434  * infrastructure will remain assigned to the partition until the partition
435  * goes down, at which time the kthread will tear down the XPC
436  * infrastructure and then exit.
437  *
438  * XPC HB will put the remote partition's XPC per partition specific variables
439  * physical address into xpc_partitions[partid].remote_vars_part_pa prior to
440  * calling xpc_partition_up().
441  */
442 static void
443 xpc_partition_up(struct xpc_partition *part)
444 {
445         DBUG_ON(part->channels != NULL);
446
447         dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
448
449         if (xpc_setup_infrastructure(part) != xpcSuccess) {
450                 return;
451         }
452
453         /*
454          * The kthread that XPC HB called us with will become the
455          * channel manager for this partition. It will not return
456          * back to XPC HB until the partition's XPC infrastructure
457          * has been dismantled.
458          */
459
460         (void) xpc_part_ref(part);      /* this will always succeed */
461
462         if (xpc_make_first_contact(part) == xpcSuccess) {
463                 xpc_channel_mgr(part);
464         }
465
466         xpc_part_deref(part);
467
468         xpc_teardown_infrastructure(part);
469 }
470
471
472 static int
473 xpc_activating(void *__partid)
474 {
475         partid_t partid = (u64) __partid;
476         struct xpc_partition *part = &xpc_partitions[partid];
477         unsigned long irq_flags;
478         struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
479         int ret;
480
481
482         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
483
484         spin_lock_irqsave(&part->act_lock, irq_flags);
485
486         if (part->act_state == XPC_P_DEACTIVATING) {
487                 part->act_state = XPC_P_INACTIVE;
488                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
489                 part->remote_rp_pa = 0;
490                 return 0;
491         }
492
493         /* indicate the thread is activating */
494         DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
495         part->act_state = XPC_P_ACTIVATING;
496
497         XPC_SET_REASON(part, 0, 0);
498         spin_unlock_irqrestore(&part->act_lock, irq_flags);
499
500         dev_dbg(xpc_part, "bringing partition %d up\n", partid);
501
502         daemonize("xpc%02d", partid);
503
504         /*
505          * This thread needs to run at a realtime priority to prevent a
506          * significant performance degradation.
507          */
508         ret = sched_setscheduler(current, SCHED_FIFO, &param);
509         if (ret != 0) {
510                 dev_warn(xpc_part, "unable to set pid %d to a realtime "
511                         "priority, ret=%d\n", current->pid, ret);
512         }
513
514         /* allow this thread and its children to run on any CPU */
515         set_cpus_allowed(current, CPU_MASK_ALL);
516
517         /*
518          * Register the remote partition's AMOs with SAL so it can handle
519          * and cleanup errors within that address range should the remote
520          * partition go down. We don't unregister this range because it is
521          * difficult to tell when outstanding writes to the remote partition
522          * are finished and thus when it is safe to unregister. This should
523          * not result in wasted space in the SAL xp_addr_region table because
524          * we should get the same page for remote_amos_page_pa after module
525          * reloads and system reboots.
526          */
527         if (sn_register_xp_addr_region(part->remote_amos_page_pa,
528                                                         PAGE_SIZE, 1) < 0) {
529                 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
530                         "xp_addr region\n", partid);
531
532                 spin_lock_irqsave(&part->act_lock, irq_flags);
533                 part->act_state = XPC_P_INACTIVE;
534                 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
535                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
536                 part->remote_rp_pa = 0;
537                 return 0;
538         }
539
540         xpc_allow_hb(partid, xpc_vars);
541         xpc_IPI_send_activated(part);
542
543
544         /*
545          * xpc_partition_up() holds this thread and marks this partition as
546          * XPC_P_ACTIVE by calling xpc_hb_mark_active().
547          */
548         (void) xpc_partition_up(part);
549
550         xpc_disallow_hb(partid, xpc_vars);
551         xpc_mark_partition_inactive(part);
552
553         if (part->reason == xpcReactivating) {
554                 /* interrupting ourselves results in activating partition */
555                 xpc_IPI_send_reactivate(part);
556         }
557
558         return 0;
559 }
560
561
562 void
563 xpc_activate_partition(struct xpc_partition *part)
564 {
565         partid_t partid = XPC_PARTID(part);
566         unsigned long irq_flags;
567         pid_t pid;
568
569
570         spin_lock_irqsave(&part->act_lock, irq_flags);
571
572         pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);
573
574         DBUG_ON(part->act_state != XPC_P_INACTIVE);
575
576         if (pid > 0) {
577                 part->act_state = XPC_P_ACTIVATION_REQ;
578                 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
579         } else {
580                 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
581         }
582
583         spin_unlock_irqrestore(&part->act_lock, irq_flags);
584 }
585
586
587 /*
588  * Handle the receipt of an SGI_XPC_NOTIFY IRQ by seeing whether the specified
589  * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
590  * than one partition, we use an AMO_t structure per partition to indicate
591  * whether a partition has sent an IPI or not.  >>> If it has, then wake up the
592  * associated kthread to handle it.
593  *
594  * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
595  * running on other partitions.
596  *
597  * Noteworthy Arguments:
598  *
599  *      irq - Interrupt ReQuest number. NOT USED.
600  *
601  *      dev_id - partid of IPI's potential sender.
602  *
603  *      regs - processor's context before the processor entered
604  *             interrupt code. NOT USED.
605  */
606 irqreturn_t
607 xpc_notify_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
608 {
609         partid_t partid = (partid_t) (u64) dev_id;
610         struct xpc_partition *part = &xpc_partitions[partid];
611
612
613         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
614
615         if (xpc_part_ref(part)) {
616                 xpc_check_for_channel_activity(part);
617
618                 xpc_part_deref(part);
619         }
620         return IRQ_HANDLED;
621 }
622
623
624 /*
625  * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
626  * because the write to their associated IPI amo completed after the IRQ/IPI
627  * was received.
628  */
629 void
630 xpc_dropped_IPI_check(struct xpc_partition *part)
631 {
632         if (xpc_part_ref(part)) {
633                 xpc_check_for_channel_activity(part);
634
635                 part->dropped_IPI_timer.expires = jiffies +
636                                                         XPC_P_DROPPED_IPI_WAIT;
637                 add_timer(&part->dropped_IPI_timer);
638                 xpc_part_deref(part);
639         }
640 }
641
642
643 void
644 xpc_activate_kthreads(struct xpc_channel *ch, int needed)
645 {
646         int idle = atomic_read(&ch->kthreads_idle);
647         int assigned = atomic_read(&ch->kthreads_assigned);
648         int wakeup;
649
650
651         DBUG_ON(needed <= 0);
652
653         if (idle > 0) {
654                 wakeup = (needed > idle) ? idle : needed;
655                 needed -= wakeup;
656
657                 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
658                         "channel=%d\n", wakeup, ch->partid, ch->number);
659
660                 /* only wake up the requested number of kthreads */
661                 wake_up_nr(&ch->idle_wq, wakeup);
662         }
663
664         if (needed <= 0) {
665                 return;
666         }
667
668         if (needed + assigned > ch->kthreads_assigned_limit) {
669                 needed = ch->kthreads_assigned_limit - assigned;
670                 // >>>should never be less than 0
671                 if (needed <= 0) {
672                         return;
673                 }
674         }
675
676         dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
677                 needed, ch->partid, ch->number);
678
679         xpc_create_kthreads(ch, needed);
680 }
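
/*
 * Worked example: with idle=2, assigned=4, kthreads_assigned_limit=5 and
 * needed=5, xpc_activate_kthreads() wakes the 2 idle kthreads (needed
 * drops to 3), then clamps against the limit (3 + 4 > 5, so needed
 * becomes 5 - 4 = 1) and creates 1 new kthread.
 */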
681
682
683 /*
684  * This function is where XPC's kthreads wait for messages to deliver.
685  */
686 static void
687 xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
688 {
689         do {
690                 /* deliver messages to their intended recipients */
691
692                 while ((volatile s64) ch->w_local_GP.get <
693                                 (volatile s64) ch->w_remote_GP.put &&
694                                         !((volatile u32) ch->flags &
695                                                 XPC_C_DISCONNECTING)) {
696                         xpc_deliver_msg(ch);
697                 }
698
699                 if (atomic_inc_return(&ch->kthreads_idle) >
700                                                 ch->kthreads_idle_limit) {
701                         /* too many idle kthreads on this channel */
702                         atomic_dec(&ch->kthreads_idle);
703                         break;
704                 }
705
706                 dev_dbg(xpc_chan, "idle kthread calling "
707                         "wait_event_interruptible_exclusive()\n");
708
709                 (void) wait_event_interruptible_exclusive(ch->idle_wq,
710                                 ((volatile s64) ch->w_local_GP.get <
711                                         (volatile s64) ch->w_remote_GP.put ||
712                                 ((volatile u32) ch->flags &
713                                                 XPC_C_DISCONNECTING)));
714
715                 atomic_dec(&ch->kthreads_idle);
716
717         } while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
718 }
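
/*
 * Worked example of the get/put bookkeeping above: w_remote_GP.put
 * advances as the sender queues messages, w_local_GP.get advances as this
 * side delivers them.  With put=7 and get=4, three messages are
 * deliverable, so the while loop above calls xpc_deliver_msg() three
 * times (unless the channel starts disconnecting first).
 */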
719
720
721 static int
722 xpc_daemonize_kthread(void *args)
723 {
724         partid_t partid = XPC_UNPACK_ARG1(args);
725         u16 ch_number = XPC_UNPACK_ARG2(args);
726         struct xpc_partition *part = &xpc_partitions[partid];
727         struct xpc_channel *ch;
728         int n_needed;
729         unsigned long irq_flags;
730
731
732         daemonize("xpc%02dc%d", partid, ch_number);
733
734         dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
735                 partid, ch_number);
736
737         ch = &part->channels[ch_number];
738
739         if (!(ch->flags & XPC_C_DISCONNECTING)) {
740
741                 /* let registerer know that connection has been established */
742
743                 spin_lock_irqsave(&ch->lock, irq_flags);
744                 if (!(ch->flags & XPC_C_CONNECTCALLOUT)) {
745                         ch->flags |= XPC_C_CONNECTCALLOUT;
746                         spin_unlock_irqrestore(&ch->lock, irq_flags);
747
748                         xpc_connected_callout(ch);
749
750                         /*
751                          * It is possible that, while the callout was being
752                          * made, the remote partition sent some messages.
753                          * If that is the case, we may need to activate
754                          * additional kthreads to help deliver them. We only
755                          * need one less than total #of messages to deliver.
756                          */
757                         n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
758                         if (n_needed > 0 &&
759                                         !(ch->flags & XPC_C_DISCONNECTING)) {
760                                 xpc_activate_kthreads(ch, n_needed);
761                         }
762                 } else {
763                         spin_unlock_irqrestore(&ch->lock, irq_flags);
764                 }
765
766                 xpc_kthread_waitmsgs(part, ch);
767         }
768
769         if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
770                 spin_lock_irqsave(&ch->lock, irq_flags);
771                 if ((ch->flags & XPC_C_CONNECTCALLOUT) &&
772                                 !(ch->flags & XPC_C_DISCONNECTCALLOUT)) {
773                         ch->flags |= XPC_C_DISCONNECTCALLOUT;
774                         spin_unlock_irqrestore(&ch->lock, irq_flags);
775
776                         xpc_disconnecting_callout(ch);
777                 } else {
778                         spin_unlock_irqrestore(&ch->lock, irq_flags);
779                 }
780                 if (atomic_dec_return(&part->nchannels_engaged) == 0) {
781                         xpc_mark_partition_disengaged(part);
782                         xpc_IPI_send_disengage(part);
783                 }
784         }
785
786
787         xpc_msgqueue_deref(ch);
788
789         dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
790                 partid, ch_number);
791
792         xpc_part_deref(part);
793         return 0;
794 }
795
796
797 /*
798  * For each partition that XPC has established communications with, there is
799  * a minimum of one kernel thread assigned to perform any operation that
800  * may potentially sleep or block (basically the callouts to the asynchronous
801  * functions registered via xpc_connect()).
802  *
803  * Additional kthreads are created and destroyed by XPC as the workload
804  * demands.
805  *
806  * A kthread is assigned to one of the active channels that exists for a given
807  * partition.
808  */
809 void
810 xpc_create_kthreads(struct xpc_channel *ch, int needed)
811 {
812         unsigned long irq_flags;
813         pid_t pid;
814         u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
815         struct xpc_partition *part = &xpc_partitions[ch->partid];
816
817
818         while (needed-- > 0) {
819
820                 /*
821                  * The following is done on behalf of the newly created
822                  * kthread. That kthread is responsible for doing the
823                  * counterpart to the following before it exits.
824                  */
825                 (void) xpc_part_ref(part);
826                 xpc_msgqueue_ref(ch);
827                 if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
828                     atomic_inc_return(&part->nchannels_engaged) == 1) {
829                         xpc_mark_partition_engaged(part);
830                 }
831
832                 pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
833                 if (pid < 0) {
834                         /* the fork failed */
835                         if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
836                             atomic_dec_return(&part->nchannels_engaged) == 0) {
837                                 xpc_mark_partition_disengaged(part);
838                                 xpc_IPI_send_disengage(part);
839                         }
840                         xpc_msgqueue_deref(ch);
841                         xpc_part_deref(part);
842
843                         if (atomic_read(&ch->kthreads_assigned) <
844                                                 ch->kthreads_idle_limit) {
845                                 /*
846                                  * Flag this as an error only if we have an
847                                  * insufficient #of kthreads for the channel
848                                  * to function.
849                                  *
850                                  * No xpc_msgqueue_ref() is needed here since
851                                  * the channel mgr is doing this.
852                                  */
853                                 spin_lock_irqsave(&ch->lock, irq_flags);
854                                 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
855                                                                 &irq_flags);
856                                 spin_unlock_irqrestore(&ch->lock, irq_flags);
857                         }
858                         break;
859                 }
860
861                 ch->kthreads_created++; // >>> temporary debug only!!!
862         }
863 }
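
/*
 * Reference pairing for each spawned kthread (taken on its behalf above,
 * released by xpc_daemonize_kthread() before it exits):
 *
 *	xpc_part_ref(part)	 <->  xpc_part_deref(part)
 *	xpc_msgqueue_ref(ch)	 <->  xpc_msgqueue_deref(ch)
 *	ch->kthreads_assigned++  <->  ch->kthreads_assigned--
 *
 * part->nchannels_engaged is bumped only when a channel gains its first
 * kthread and dropped when a channel loses its last one; when it reaches
 * zero the partition is marked disengaged.  The kernel_thread() failure
 * path above unwinds the same references.
 */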
864
865
866 void
867 xpc_disconnect_wait(int ch_number)
868 {
869         unsigned long irq_flags;
870         partid_t partid;
871         struct xpc_partition *part;
872         struct xpc_channel *ch;
873         int wakeup_channel_mgr;
874
875
876         /* now wait for all callouts to the caller's function to cease */
877         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
878                 part = &xpc_partitions[partid];
879
880                 if (!xpc_part_ref(part)) {
881                         continue;
882                 }
883
884                 ch = &part->channels[ch_number];
885
886                 if (!(ch->flags & XPC_C_WDISCONNECT)) {
887                         xpc_part_deref(part);
888                         continue;
889                 }
890
891                 (void) down(&ch->wdisconnect_sema);
892
893                 spin_lock_irqsave(&ch->lock, irq_flags);
894                 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
895                 wakeup_channel_mgr = 0;
896
897                 if (ch->delayed_IPI_flags) {
898                         if (part->act_state != XPC_P_DEACTIVATING) {
899                                 spin_lock(&part->IPI_lock);
900                                 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
901                                         ch->number, ch->delayed_IPI_flags);
902                                 spin_unlock(&part->IPI_lock);
903                                 wakeup_channel_mgr = 1;
904                         }
905                         ch->delayed_IPI_flags = 0;
906                 }
907
908                 ch->flags &= ~XPC_C_WDISCONNECT;
909                 spin_unlock_irqrestore(&ch->lock, irq_flags);
910
911                 if (wakeup_channel_mgr) {
912                         xpc_wakeup_channel_mgr(part);
913                 }
914
915                 xpc_part_deref(part);
916         }
917 }
918
919
920 static void
921 xpc_do_exit(enum xpc_retval reason)
922 {
923         partid_t partid;
924         int active_part_count;
925         struct xpc_partition *part;
926         unsigned long printmsg_time;
927
928
929         /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
930         DBUG_ON(xpc_exiting == 1);
931
932         /*
933          * Let the heartbeat checker thread and the discovery thread
934          * (if one is running) know that they should exit. Also wake up
935          * the heartbeat checker thread in case it's sleeping.
936          */
937         xpc_exiting = 1;
938         wake_up_interruptible(&xpc_act_IRQ_wq);
939
940         /* ignore all incoming interrupts */
941         free_irq(SGI_XPC_ACTIVATE, NULL);
942
943         /* wait for the discovery thread to exit */
944         down(&xpc_discovery_exited);
945
946         /* wait for the heartbeat checker thread to exit */
947         down(&xpc_hb_checker_exited);
948
949
950         /* sleep for a third of a second or so */
951         (void) msleep_interruptible(300);
952
953
954         /* wait for all partitions to become inactive */
955
956         printmsg_time = jiffies;
957
958         do {
959                 active_part_count = 0;
960
961                 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
962                         part = &xpc_partitions[partid];
963
964                         if (xpc_partition_disengaged(part) &&
965                                         part->act_state == XPC_P_INACTIVE) {
966                                 continue;
967                         }
968
969                         active_part_count++;
970
971                         XPC_DEACTIVATE_PARTITION(part, reason);
972                 }
973
974                 if (active_part_count == 0) {
975                         break;
976                 }
977
978                 if (jiffies >= printmsg_time) {
979                         dev_info(xpc_part, "waiting for partitions to "
980                                 "deactivate/disengage, active count=%d, remote "
981                                 "engaged=0x%lx\n", active_part_count,
982                                 xpc_partition_engaged(-1UL));
983
984                         printmsg_time = jiffies +
985                                         (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
986                 }
987
988                 /* sleep for a third of a second or so */
989                 (void) msleep_interruptible(300);
990
991         } while (1);
992
993         DBUG_ON(xpc_partition_engaged(-1UL));
994
995
996         /* indicate to others that our reserved page is uninitialized */
997         xpc_rsvd_page->vars_pa = 0;
998
999         /* now it's time to eliminate our heartbeat */
1000         del_timer_sync(&xpc_hb_timer);
1001         DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
1002
1003         /* take ourselves off of the reboot_notifier_list */
1004         (void) unregister_reboot_notifier(&xpc_reboot_notifier);
1005
1006         /* take ourselves off of the die_notifier list */
1007         (void) unregister_die_notifier(&xpc_die_notifier);
1008
1009         /* close down protections for IPI operations */
1010         xpc_restrict_IPI_ops();
1011
1012
1013         /* clear the interface to XPC's functions */
1014         xpc_clear_interface();
1015
1016         if (xpc_sysctl) {
1017                 unregister_sysctl_table(xpc_sysctl);
1018         }
1019 }
1020
1021
1022 /*
1023  * Called when the system is about to be either restarted or halted.
1024  */
1025 static void
1026 xpc_die_disengage(void)
1027 {
1028         struct xpc_partition *part;
1029         partid_t partid;
1030         unsigned long engaged;
1031         long time, print_time, disengage_request_timeout;
1032
1033
1034         /* keep xpc_hb_checker thread from doing anything (just in case) */
1035         xpc_exiting = 1;
1036
1037         xpc_vars->heartbeating_to_mask = 0;  /* indicate we're deactivated */
1038
1039         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1040                 part = &xpc_partitions[partid];
1041
1042                 if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
1043                                                         remote_vars_version)) {
1044
1045                         /* just in case it was left set by an earlier XPC */
1046                         xpc_clear_partition_engaged(1UL << partid);
1047                         continue;
1048                 }
1049
1050                 if (xpc_partition_engaged(1UL << partid) ||
1051                                         part->act_state != XPC_P_INACTIVE) {
1052                         xpc_request_partition_disengage(part);
1053                         xpc_mark_partition_disengaged(part);
1054                         xpc_IPI_send_disengage(part);
1055                 }
1056         }
1057
1058         print_time = rtc_time();
1059         disengage_request_timeout = print_time +
1060                 (xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);
1061
1062         /* wait for all other partitions to disengage from us */
1063
1064         while ((engaged = xpc_partition_engaged(-1UL)) &&
1065                         (time = rtc_time()) < disengage_request_timeout) {
1066
1067                 if (time >= print_time) {
1068                         dev_info(xpc_part, "waiting for remote partitions to "
1069                                 "disengage, engaged=0x%lx\n", engaged);
1070                         print_time = time + (XPC_DISENGAGE_PRINTMSG_INTERVAL *
1071                                                 sn_rtc_cycles_per_second);
1072                 }
1073         }
1074         dev_info(xpc_part, "finished waiting for remote partitions to "
1075                                 "disengage, engaged=0x%lx\n", engaged);
1076 }
1077
1078
1079 /*
1080  * This function is called when the system is being rebooted.
1081  */
1082 static int
1083 xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1084 {
1085         enum xpc_retval reason;
1086
1087
1088         switch (event) {
1089         case SYS_RESTART:
1090                 reason = xpcSystemReboot;
1091                 break;
1092         case SYS_HALT:
1093                 reason = xpcSystemHalt;
1094                 break;
1095         case SYS_POWER_OFF:
1096                 reason = xpcSystemPoweroff;
1097                 break;
1098         default:
1099                 reason = xpcSystemGoingDown;
1100         }
1101
1102         xpc_do_exit(reason);
1103         return NOTIFY_DONE;
1104 }
1105
1106
1107 /*
1108  * This function is called when the system dies (restart, halt, MCA, or INIT).
1109  */
1110 static int
1111 xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1112 {
1113         switch (event) {
1114         case DIE_MACHINE_RESTART:
1115         case DIE_MACHINE_HALT:
1116                 xpc_die_disengage();
1117                 break;
1118         case DIE_MCA_MONARCH_ENTER:
1119         case DIE_INIT_MONARCH_ENTER:
1120                 xpc_vars->heartbeat++;
1121                 xpc_vars->heartbeat_offline = 1;
1122                 break;
1123         case DIE_MCA_MONARCH_LEAVE:
1124         case DIE_INIT_MONARCH_LEAVE:
1125                 xpc_vars->heartbeat++;
1126                 xpc_vars->heartbeat_offline = 0;
1127                 break;
1128         }
1129
1130         return NOTIFY_DONE;
1131 }
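
/*
 * Note on the MCA/INIT cases above: while a monarch processor holds the
 * system, timers do not run and our heartbeat would appear frozen to
 * remote partitions.  Bumping the heartbeat once and setting
 * heartbeat_offline is intended to tell remote heartbeat checkers to
 * tolerate the stall; the MONARCH_LEAVE cases restore normal heartbeat
 * checking.
 */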
1132
1133
1134 int __init
1135 xpc_init(void)
1136 {
1137         int ret;
1138         partid_t partid;
1139         struct xpc_partition *part;
1140         pid_t pid;
1141
1142
1143         if (!ia64_platform_is("sn2")) {
1144                 return -ENODEV;
1145         }
1146
1147         /*
1148          * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
1149          * various portions of a partition's reserved page. Its size is based
1150          * on the size of the reserved page header and part_nasids mask. So we
1151          * need to ensure that the other items will fit as well.
1152          */
1153         if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) {
1154                 dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
1155                 return -EPERM;
1156         }
1157         DBUG_ON((u64) xpc_remote_copy_buffer !=
1158                                 L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer));
1159
1160         snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1161         snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1162
1163         xpc_sysctl = register_sysctl_table(xpc_sys_dir, 1);
1164
1165         /*
1166          * The first few fields of each entry of xpc_partitions[] need to
1167          * be initialized now so that calls to xpc_connect() and
1168          * xpc_disconnect() can be made prior to the activation of any remote
1169          * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
1170          * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
1171          * PARTITION HAS BEEN ACTIVATED.
1172          */
1173         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1174                 part = &xpc_partitions[partid];
1175
1176                 DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));
1177
1178                 part->act_IRQ_rcvd = 0;
1179                 spin_lock_init(&part->act_lock);
1180                 part->act_state = XPC_P_INACTIVE;
1181                 XPC_SET_REASON(part, 0, 0);
1182
1183                 init_timer(&part->disengage_request_timer);
1184                 part->disengage_request_timer.function =
1185                                 xpc_timeout_partition_disengage_request;
1186                 part->disengage_request_timer.data = (unsigned long) part;
1187
1188                 part->setup_state = XPC_P_UNSET;
1189                 init_waitqueue_head(&part->teardown_wq);
1190                 atomic_set(&part->references, 0);
1191         }
1192
1193         /*
1194          * Open up protections for IPI operations (and AMO operations on
1195          * Shub 1.1 systems).
1196          */
1197         xpc_allow_IPI_ops();
1198
1199         /*
1200          * Interrupts being processed will increment this atomic variable and
1201          * awaken the heartbeat thread which will process the interrupts.
1202          */
1203         atomic_set(&xpc_act_IRQ_rcvd, 0);
1204
1205         /*
1206          * This is safe to do before the xpc_hb_checker thread has started
1207          * because the handler releases a wait queue.  If an interrupt is
1208          * received before the thread is waiting, it will not go to sleep,
1209          * but rather immediately process the interrupt.
1210          */
1211         ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1212                                                         "xpc hb", NULL);
1213         if (ret != 0) {
1214                 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
1215                         "errno=%d\n", -ret);
1216
1217                 xpc_restrict_IPI_ops();
1218
1219                 if (xpc_sysctl) {
1220                         unregister_sysctl_table(xpc_sysctl);
1221                 }
1222                 return -EBUSY;
1223         }
1224
1225         /*
1226          * Fill the partition reserved page with the information needed by
1227          * other partitions to discover we are alive and establish initial
1228          * communications.
1229          */
1230         xpc_rsvd_page = xpc_rsvd_page_init();
1231         if (xpc_rsvd_page == NULL) {
1232                 dev_err(xpc_part, "could not setup our reserved page\n");
1233
1234                 free_irq(SGI_XPC_ACTIVATE, NULL);
1235                 xpc_restrict_IPI_ops();
1236
1237                 if (xpc_sysctl) {
1238                         unregister_sysctl_table(xpc_sysctl);
1239                 }
1240                 return -EBUSY;
1241         }
1242
1243
1244         /* add ourselves to the reboot_notifier_list */
1245         ret = register_reboot_notifier(&xpc_reboot_notifier);
1246         if (ret != 0) {
1247                 dev_warn(xpc_part, "can't register reboot notifier\n");
1248         }
1249
1250         /* add ourselves to the die_notifier list (i.e., ia64die_chain) */
1251         ret = register_die_notifier(&xpc_die_notifier);
1252         if (ret != 0) {
1253                 dev_warn(xpc_part, "can't register die notifier\n");
1254         }
1255
1256
1257         /*
1258          * Set the beating to other partitions into motion.  This is
1259          * the last requirement for other partitions' discovery to
1260          * initiate communications with us.
1261          */
1262         init_timer(&xpc_hb_timer);
1263         xpc_hb_timer.function = xpc_hb_beater;
1264         xpc_hb_beater(0);
1265
1266
1267         /*
1268          * The real work-horse behind xpc.  This processes incoming
1269          * interrupts and monitors remote heartbeats.
1270          */
1271         pid = kernel_thread(xpc_hb_checker, NULL, 0);
1272         if (pid < 0) {
1273                 dev_err(xpc_part, "failed while forking hb check thread\n");
1274
1275                 /* indicate to others that our reserved page is uninitialized */
1276                 xpc_rsvd_page->vars_pa = 0;
1277
1278                 /* take ourselves off of the reboot_notifier_list */
1279                 (void) unregister_reboot_notifier(&xpc_reboot_notifier);
1280
1281                 /* take ourselves off of the die_notifier list */
1282                 (void) unregister_die_notifier(&xpc_die_notifier);
1283
1284                 del_timer_sync(&xpc_hb_timer);
1285                 free_irq(SGI_XPC_ACTIVATE, NULL);
1286                 xpc_restrict_IPI_ops();
1287
1288                 if (xpc_sysctl) {
1289                         unregister_sysctl_table(xpc_sysctl);
1290                 }
1291                 return -EBUSY;
1292         }
1293
1294
1295         /*
1296  * Start up a thread that will attempt to discover other partitions to
1297          * activate based on info provided by SAL. This new thread is short
1298          * lived and will exit once discovery is complete.
1299          */
1300         pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
1301         if (pid < 0) {
1302                 dev_err(xpc_part, "failed while forking discovery thread\n");
1303
1304                 /* mark this new thread as a non-starter */
1305                 up(&xpc_discovery_exited);
1306
1307                 xpc_do_exit(xpcUnloading);
1308                 return -EBUSY;
1309         }
1310
1311
1312         /* set the interface to point at XPC's functions */
1313         xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1314                           xpc_initiate_allocate, xpc_initiate_send,
1315                           xpc_initiate_send_notify, xpc_initiate_received,
1316                           xpc_initiate_partid_to_nasids);
1317
1318         return 0;
1319 }
1320 module_init(xpc_init);
1321
1322
1323 void __exit
1324 xpc_exit(void)
1325 {
1326         xpc_do_exit(xpcUnloading);
1327 }
1328 module_exit(xpc_exit);
1329
1330
1331 MODULE_AUTHOR("Silicon Graphics, Inc.");
1332 MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1333 MODULE_LICENSE("GPL");
1334
1335 module_param(xpc_hb_interval, int, 0);
1336 MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1337                 "heartbeat increments.");
1338
1339 module_param(xpc_hb_check_interval, int, 0);
1340 MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1341                 "heartbeat checks.");
1342
1343 module_param(xpc_disengage_request_timelimit, int, 0);
1344 MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
1345                 "for disengage request to complete.");
1346
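
/*
 * Example usage (assuming the module is built as xpc.ko): the parameters
 * above can be set at load time, e.g.
 *
 *	modprobe xpc xpc_hb_interval=5 xpc_hb_check_interval=20
 *
 * The same knobs can also be adjusted at runtime through the sysctl
 * entries registered above under /proc/sys/xpc/.
 */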