arch/ia64/sn/kernel/xpc_main.c
1 /*
2  * This file is subject to the terms and conditions of the GNU General Public
3  * License.  See the file "COPYING" in the main directory of this archive
4  * for more details.
5  *
6  * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
7  */
8
9
10 /*
11  * Cross Partition Communication (XPC) support - standard version.
12  *
13  *      XPC provides a message passing capability that crosses partition
14  *      boundaries. This module is made up of two parts:
15  *
16  *          partition   This part detects the presence/absence of other
17  *                      partitions. It provides a heartbeat and monitors
18  *                      the heartbeats of other partitions.
19  *
20  *          channel     This part manages the channels and sends/receives
21  *                      messages across them to/from other partitions.
22  *
23  *      There are a couple of additional functions residing in XP, which
24  *      provide an interface to XPC for its users.
25  *
26  *
27  *      Caveats:
28  *
29  *        . We currently have no way to determine which nasid an IPI came
30  *          from. Thus, xpc_IPI_send() does a remote AMO write followed by
31  *          an IPI. The AMO indicates where data is to be pulled from, so
32  *          after the IPI arrives, the remote partition checks the AMO word.
33  *          The IPI can actually arrive before the AMO however, so other code
34  *          must periodically check for this case. Also, remote AMO operations
35  *          do not reliably time out. Thus we do a remote PIO read solely to
36  *          know whether the remote partition is down and whether we should
37  *          stop sending IPIs to it. This remote PIO read operation is set up
38  *          in a special nofault region so SAL knows to ignore (and cleanup)
39  *          any errors due to the remote AMO write, PIO read, and/or PIO
40  *          write operations.
41  *
42  *          If/when new hardware solves this IPI problem, we should abandon
43  *          the current approach.
44  *
45  */
46
47
48 #include <linux/kernel.h>
49 #include <linux/module.h>
50 #include <linux/init.h>
51 #include <linux/sched.h>
52 #include <linux/syscalls.h>
53 #include <linux/cache.h>
54 #include <linux/interrupt.h>
55 #include <linux/slab.h>
56 #include <linux/delay.h>
57 #include <linux/reboot.h>
58 #include <asm/sn/intr.h>
59 #include <asm/sn/sn_sal.h>
60 #include <asm/uaccess.h>
61 #include "xpc.h"
62
63
64 /* define two XPC debug device structures to be used with dev_dbg() et al */
65
66 struct device_driver xpc_dbg_name = {
67         .name = "xpc"
68 };
69
70 struct device xpc_part_dbg_subname = {
71         .bus_id = {0},          /* set to "part" at xpc_init() time */
72         .driver = &xpc_dbg_name
73 };
74
75 struct device xpc_chan_dbg_subname = {
76         .bus_id = {0},          /* set to "chan" at xpc_init() time */
77         .driver = &xpc_dbg_name
78 };
79
80 struct device *xpc_part = &xpc_part_dbg_subname;
81 struct device *xpc_chan = &xpc_chan_dbg_subname;
82
83
84 /* systune related variables for /proc/sys directories */
85
86 static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
87 static int xpc_hb_min_interval = 1;
88 static int xpc_hb_max_interval = 10;
89
90 static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
91 static int xpc_hb_check_min_interval = 10;
92 static int xpc_hb_check_max_interval = 120;
93
94 int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
95 static int xpc_disengage_request_min_timelimit = 0;
96 static int xpc_disengage_request_max_timelimit = 120;
97
98 static ctl_table xpc_sys_xpc_hb_dir[] = {
99         {
100                 1,
101                 "hb_interval",
102                 &xpc_hb_interval,
103                 sizeof(int),
104                 0644,
105                 NULL,
106                 &proc_dointvec_minmax,
107                 &sysctl_intvec,
108                 NULL,
109                 &xpc_hb_min_interval,
110                 &xpc_hb_max_interval
111         },
112         {
113                 2,
114                 "hb_check_interval",
115                 &xpc_hb_check_interval,
116                 sizeof(int),
117                 0644,
118                 NULL,
119                 &proc_dointvec_minmax,
120                 &sysctl_intvec,
121                 NULL,
122                 &xpc_hb_check_min_interval,
123                 &xpc_hb_check_max_interval
124         },
125         {0}
126 };
127 static ctl_table xpc_sys_xpc_dir[] = {
128         {
129                 1,
130                 "hb",
131                 NULL,
132                 0,
133                 0555,
134                 xpc_sys_xpc_hb_dir
135         },
136         {
137                 2,
138                 "disengage_request_timelimit",
139                 &xpc_disengage_request_timelimit,
140                 sizeof(int),
141                 0644,
142                 NULL,
143                 &proc_dointvec_minmax,
144                 &sysctl_intvec,
145                 NULL,
146                 &xpc_disengage_request_min_timelimit,
147                 &xpc_disengage_request_max_timelimit
148         },
149         {0}
150 };
151 static ctl_table xpc_sys_dir[] = {
152         {
153                 1,
154                 "xpc",
155                 NULL,
156                 0,
157                 0555,
158                 xpc_sys_xpc_dir
159         },
160         {0}
161 };
162 static struct ctl_table_header *xpc_sysctl;
163
164
165 /* #of IRQs received */
166 static atomic_t xpc_act_IRQ_rcvd;
167
168 /* IRQ handler notifies this wait queue on receipt of an IRQ */
169 static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);
170
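/* jiffies value at which the next check of remote heartbeats is due */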
171 static unsigned long xpc_hb_check_timeout;
172
173 /* notification that the xpc_hb_checker thread has exited */
174 static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);
175
176 /* notification that the xpc_discovery thread has exited */
177 static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);
178
179
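/* timer used to periodically increment our local heartbeat count */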
180 static struct timer_list xpc_hb_timer;
181
182
183 static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
184
185
186 static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
187 static struct notifier_block xpc_reboot_notifier = {
188         .notifier_call = xpc_system_reboot,
189 };
190
191
192 /*
193  * Timer function to enforce the timelimit on the partition disengage request.
194  */
195 static void
196 xpc_timeout_partition_disengage_request(unsigned long data)
197 {
198         struct xpc_partition *part = (struct xpc_partition *) data;
199
200
201         DBUG_ON(jiffies < part->disengage_request_timeout);
202
203         (void) xpc_partition_disengaged(part);
204
205         DBUG_ON(part->disengage_request_timeout != 0);
206         DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
207 }
208
209
210 /*
211  * Notify the heartbeat check thread that an IRQ has been received.
212  */
213 static irqreturn_t
214 xpc_act_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
215 {
216         atomic_inc(&xpc_act_IRQ_rcvd);
217         wake_up_interruptible(&xpc_act_IRQ_wq);
218         return IRQ_HANDLED;
219 }
220
221
222 /*
223  * Timer to produce the heartbeat.  The timer structure's function is
224  * already set when this is initially called.  A tunable is used to
225  * specify when the next timeout should occur.
226  */
227 static void
228 xpc_hb_beater(unsigned long dummy)
229 {
230         xpc_vars->heartbeat++;
231
232         if (jiffies >= xpc_hb_check_timeout) {
233                 wake_up_interruptible(&xpc_act_IRQ_wq);
234         }
235
236         xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
237         add_timer(&xpc_hb_timer);
238 }
239
240
241 /*
242  * This thread is responsible for nearly all of the partition
243  * activation/deactivation.
244  */
245 static int
246 xpc_hb_checker(void *ignore)
247 {
248         int last_IRQ_count = 0;
249         int new_IRQ_count;
250         int force_IRQ=0;
251
252
253         /* this thread was marked active by xpc_hb_init() */
254
255         daemonize(XPC_HB_CHECK_THREAD_NAME);
256
257         set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));
258
259         xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
260
261         while (!(volatile int) xpc_exiting) {
262
263                 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
264                         "been received\n",
265                         (int) (xpc_hb_check_timeout - jiffies),
266                         atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);
267
268
269                 /* checking of remote heartbeats is skewed by IRQ handling */
270                 if (jiffies >= xpc_hb_check_timeout) {
271                         dev_dbg(xpc_part, "checking remote heartbeats\n");
272                         xpc_check_remote_hb();
273
274                         /*
275                          * We need to periodically recheck to ensure no
276                          * IPI/AMO pairs have been missed.  That check
277                          * must always reset xpc_hb_check_timeout.
278                          */
279                         force_IRQ = 1;
280                 }
281
282
283                 /* check for outstanding IRQs */
284                 new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
285                 if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
286                         force_IRQ = 0;
287
288                         dev_dbg(xpc_part, "found an IRQ to process; will be "
289                                 "resetting xpc_hb_check_timeout\n");
290
291                         last_IRQ_count += xpc_identify_act_IRQ_sender();
292                         if (last_IRQ_count < new_IRQ_count) {
293                                 /* retry once to help avoid missing AMO */
294                                 (void) xpc_identify_act_IRQ_sender();
295                         }
296                         last_IRQ_count = new_IRQ_count;
297
298                         xpc_hb_check_timeout = jiffies +
299                                            (xpc_hb_check_interval * HZ);
300                 }
301
302                 /* wait for IRQ or timeout */
303                 (void) wait_event_interruptible(xpc_act_IRQ_wq,
304                             (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
305                                         jiffies >= xpc_hb_check_timeout ||
306                                                 (volatile int) xpc_exiting));
307         }
308
309         dev_dbg(xpc_part, "heartbeat checker is exiting\n");
310
311
312         /* mark this thread as having exited */
313         up(&xpc_hb_checker_exited);
314         return 0;
315 }
316
317
318 /*
319  * This thread will attempt to discover other partitions to activate
320  * based on info provided by SAL. This new thread is short lived and
321  * will exit once discovery is complete.
322  */
323 static int
324 xpc_initiate_discovery(void *ignore)
325 {
326         daemonize(XPC_DISCOVERY_THREAD_NAME);
327
328         xpc_discovery();
329
330         dev_dbg(xpc_part, "discovery thread is exiting\n");
331
332         /* mark this thread as having exited */
333         up(&xpc_discovery_exited);
334         return 0;
335 }
336
337
338 /*
339  * Establish first contact with the remote partition. This involves pulling
340  * the XPC per partition variables from the remote partition and waiting for
341  * the remote partition to pull ours.
342  */
343 static enum xpc_retval
344 xpc_make_first_contact(struct xpc_partition *part)
345 {
346         enum xpc_retval ret;
347
348
349         while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
350                 if (ret != xpcRetry) {
351                         XPC_DEACTIVATE_PARTITION(part, ret);
352                         return ret;
353                 }
354
355                 dev_dbg(xpc_chan, "waiting to make first contact with "
356                         "partition %d\n", XPC_PARTID(part));
357
358                 /* wait 1/4 of a second or so */
359                 (void) msleep_interruptible(250);
360
361                 if (part->act_state == XPC_P_DEACTIVATING) {
362                         return part->reason;
363                 }
364         }
365
366         return xpc_mark_partition_active(part);
367 }
368
369
370 /*
371  * The first kthread assigned to a newly activated partition is the one
372  * created by XPC HB, which uses it to call xpc_partition_up(). XPC hangs on
373  * to that kthread until the partition is brought down, at which time the
374  * kthread returns to XPC HB. (The return of that kthread signifies to XPC HB
375  * that XPC has dismantled all communication infrastructure for the associated
376  * partition.) This kthread becomes the channel manager for that partition.
377  *
378  * Each active partition has a channel manager, who, besides connecting and
379  * disconnecting channels, will ensure that each of the partition's connected
380  * channels has the required number of assigned kthreads to get the work done.
381  */
382 static void
383 xpc_channel_mgr(struct xpc_partition *part)
384 {
385         while (part->act_state != XPC_P_DEACTIVATING ||
386                         atomic_read(&part->nchannels_active) > 0 ||
387                                         !xpc_partition_disengaged(part)) {
388
389                 xpc_process_channel_activity(part);
390
391
392                 /*
393                  * Wait until we've been requested to activate kthreads or
394                  * all of the channel's message queues have been torn down or
395                  * a signal is pending.
396                  *
397                  * The channel_mgr_requests is set to 1 after being awakened.
398                  * This is done to prevent the channel mgr from making one pass
399                  * through the loop for each request, since he will
400                  * be servicing all the requests in one pass. The reason it's
401                  * set to 1 instead of 0 is so that other kthreads will know
402                  * that the channel mgr is running and won't bother trying to
403                  * wake him up.
404                  */
405                 atomic_dec(&part->channel_mgr_requests);
406                 (void) wait_event_interruptible(part->channel_mgr_wq,
407                                 (atomic_read(&part->channel_mgr_requests) > 0 ||
408                                 (volatile u64) part->local_IPI_amo != 0 ||
409                                 ((volatile u8) part->act_state ==
410                                                         XPC_P_DEACTIVATING &&
411                                 atomic_read(&part->nchannels_active) == 0 &&
412                                 xpc_partition_disengaged(part))));
413                 atomic_set(&part->channel_mgr_requests, 1);
414
415                 // >>> Does it need to wakeup periodically as well? In case we
416                 // >>> miscalculated the #of kthreads to wakeup or create?
417         }
418 }
419
420
421 /*
422  * When XPC HB determines that a partition has come up, it will create a new
423  * kthread and that kthread will call this function to attempt to set up the
424  * basic infrastructure used for Cross Partition Communication with the newly
425  * upped partition.
426  *
427  * The kthread that was created by XPC HB and which set up the XPC
428  * infrastructure will remain assigned to the partition until the partition
429  * goes down, at which time the kthread will tear down the XPC infrastructure
430  * and then exit.
431  *
432  * XPC HB will put the physical address of the remote partition's XPC
433  * per-partition variables into xpc_partitions[partid].remote_vars_part_pa
434  * prior to calling xpc_partition_up().
435  */
436 static void
437 xpc_partition_up(struct xpc_partition *part)
438 {
439         DBUG_ON(part->channels != NULL);
440
441         dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));
442
443         if (xpc_setup_infrastructure(part) != xpcSuccess) {
444                 return;
445         }
446
447         /*
448          * The kthread that XPC HB called us with will become the
449          * channel manager for this partition. It will not return
450          * back to XPC HB until the partition's XPC infrastructure
451          * has been dismantled.
452          */
453
454         (void) xpc_part_ref(part);      /* this will always succeed */
455
456         if (xpc_make_first_contact(part) == xpcSuccess) {
457                 xpc_channel_mgr(part);
458         }
459
460         xpc_part_deref(part);
461
462         xpc_teardown_infrastructure(part);
463 }
464
465
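/*
 * Kthread entry point used to activate a partition. It registers the remote
 * partition's AMOs with SAL, enables heartbeating to the partition, notifies
 * the partition (via IPI) that we have activated, and then calls
 * xpc_partition_up(). When that returns, the partition is marked inactive
 * and, if it deactivated because of xpcReactivating, a reactivate IPI is
 * sent.
 */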
466 static int
467 xpc_activating(void *__partid)
468 {
469         partid_t partid = (u64) __partid;
470         struct xpc_partition *part = &xpc_partitions[partid];
471         unsigned long irq_flags;
472         struct sched_param param = { sched_priority: MAX_RT_PRIO - 1 };
473         int ret;
474
475
476         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
477
478         spin_lock_irqsave(&part->act_lock, irq_flags);
479
480         if (part->act_state == XPC_P_DEACTIVATING) {
481                 part->act_state = XPC_P_INACTIVE;
482                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
483                 part->remote_rp_pa = 0;
484                 return 0;
485         }
486
487         /* indicate the thread is activating */
488         DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
489         part->act_state = XPC_P_ACTIVATING;
490
491         XPC_SET_REASON(part, 0, 0);
492         spin_unlock_irqrestore(&part->act_lock, irq_flags);
493
494         dev_dbg(xpc_part, "bringing partition %d up\n", partid);
495
496         daemonize("xpc%02d", partid);
497
498         /*
499          * This thread needs to run at a realtime priority to prevent a
500          * significant performance degradation.
501          */
502         ret = sched_setscheduler(current, SCHED_FIFO, &param);
503         if (ret != 0) {
504                 dev_warn(xpc_part, "unable to set pid %d to a realtime "
505                         "priority, ret=%d\n", current->pid, ret);
506         }
507
508         /* allow this thread and its children to run on any CPU */
509         set_cpus_allowed(current, CPU_MASK_ALL);
510
511         /*
512          * Register the remote partition's AMOs with SAL so it can handle
513          * and cleanup errors within that address range should the remote
514          * partition go down. We don't unregister this range because it is
515          * difficult to tell when outstanding writes to the remote partition
516          * are finished and thus when it is safe to unregister. This should
517          * not result in wasted space in the SAL xp_addr_region table because
518          * we should get the same page for remote_amos_page_pa after module
519          * reloads and system reboots.
520          */
521         if (sn_register_xp_addr_region(part->remote_amos_page_pa,
522                                                         PAGE_SIZE, 1) < 0) {
523                 dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
524                         "xp_addr region\n", partid);
525
526                 spin_lock_irqsave(&part->act_lock, irq_flags);
527                 part->act_state = XPC_P_INACTIVE;
528                 XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
529                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
530                 part->remote_rp_pa = 0;
531                 return 0;
532         }
533
534         xpc_allow_hb(partid, xpc_vars);
535         xpc_IPI_send_activated(part);
536
537
538         /*
539          * xpc_partition_up() holds this thread and marks this partition as
540          * XPC_P_ACTIVE by calling xpc_hb_mark_active().
541          */
542         (void) xpc_partition_up(part);
543
544         xpc_disallow_hb(partid, xpc_vars);
545         xpc_mark_partition_inactive(part);
546
547         if (part->reason == xpcReactivating) {
548                 /* interrupting ourselves results in activating the partition */
549                 xpc_IPI_send_reactivate(part);
550         }
551
552         return 0;
553 }
554
555
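/*
 * Request activation of a partition by spawning an xpc_activating() kthread
 * on its behalf; act_state records whether the kthread was successfully
 * created.
 */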
556 void
557 xpc_activate_partition(struct xpc_partition *part)
558 {
559         partid_t partid = XPC_PARTID(part);
560         unsigned long irq_flags;
561         pid_t pid;
562
563
564         spin_lock_irqsave(&part->act_lock, irq_flags);
565
566         pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);
567
568         DBUG_ON(part->act_state != XPC_P_INACTIVE);
569
570         if (pid > 0) {
571                 part->act_state = XPC_P_ACTIVATION_REQ;
572                 XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
573         } else {
574                 XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
575         }
576
577         spin_unlock_irqrestore(&part->act_lock, irq_flags);
578 }
579
580
581 /*
582  * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
583  * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
584  * than one partition, we use an AMO_t structure per partition to indicate
585  * whether a partition has sent an IPI or not.  >>> If it has, then wake up the
586  * associated kthread to handle it.
587  *
588  * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
589  * running on other partitions.
590  *
591  * Noteworthy Arguments:
592  *
593  *      irq - Interrupt ReQuest number. NOT USED.
594  *
595  *      dev_id - partid of IPI's potential sender.
596  *
597  *      regs - processor's context before the processor entered
598  *             interrupt code. NOT USED.
599  */
600 irqreturn_t
601 xpc_notify_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
602 {
603         partid_t partid = (partid_t) (u64) dev_id;
604         struct xpc_partition *part = &xpc_partitions[partid];
605
606
607         DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);
608
609         if (xpc_part_ref(part)) {
610                 xpc_check_for_channel_activity(part);
611
612                 xpc_part_deref(part);
613         }
614         return IRQ_HANDLED;
615 }
616
617
618 /*
619  * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
620  * because the write to their associated IPI amo completed after the IRQ/IPI
621  * was received.
622  */
623 void
624 xpc_dropped_IPI_check(struct xpc_partition *part)
625 {
626         if (xpc_part_ref(part)) {
627                 xpc_check_for_channel_activity(part);
628
629                 part->dropped_IPI_timer.expires = jiffies +
630                                                         XPC_P_DROPPED_IPI_WAIT;
631                 add_timer(&part->dropped_IPI_timer);
632                 xpc_part_deref(part);
633         }
634 }
635
636
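/*
 * Ensure a channel has enough kthreads to deliver its pending messages: wake
 * up to 'needed' idle kthreads and, if that is insufficient, create new ones
 * subject to the channel's kthreads_assigned_limit.
 */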
637 void
638 xpc_activate_kthreads(struct xpc_channel *ch, int needed)
639 {
640         int idle = atomic_read(&ch->kthreads_idle);
641         int assigned = atomic_read(&ch->kthreads_assigned);
642         int wakeup;
643
644
645         DBUG_ON(needed <= 0);
646
647         if (idle > 0) {
648                 wakeup = (needed > idle) ? idle : needed;
649                 needed -= wakeup;
650
651                 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
652                         "channel=%d\n", wakeup, ch->partid, ch->number);
653
654                 /* only wakeup the requested number of kthreads */
655                 wake_up_nr(&ch->idle_wq, wakeup);
656         }
657
658         if (needed <= 0) {
659                 return;
660         }
661
662         if (needed + assigned > ch->kthreads_assigned_limit) {
663                 needed = ch->kthreads_assigned_limit - assigned;
664                         // >>> should never be less than 0
665                 if (needed <= 0) {
666                         return;
667                 }
668         }
669
670         dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
671                 needed, ch->partid, ch->number);
672
673         xpc_create_kthreads(ch, needed);
674 }
675
676
677 /*
678  * This function is where XPC's kthreads wait for messages to deliver.
679  */
680 static void
681 xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
682 {
683         do {
684                 /* deliver messages to their intended recipients */
685
686                 while ((volatile s64) ch->w_local_GP.get <
687                                 (volatile s64) ch->w_remote_GP.put &&
688                                         !((volatile u32) ch->flags &
689                                                 XPC_C_DISCONNECTING)) {
690                         xpc_deliver_msg(ch);
691                 }
692
693                 if (atomic_inc_return(&ch->kthreads_idle) >
694                                                 ch->kthreads_idle_limit) {
695                         /* too many idle kthreads on this channel */
696                         atomic_dec(&ch->kthreads_idle);
697                         break;
698                 }
699
700                 dev_dbg(xpc_chan, "idle kthread calling "
701                         "wait_event_interruptible_exclusive()\n");
702
703                 (void) wait_event_interruptible_exclusive(ch->idle_wq,
704                                 ((volatile s64) ch->w_local_GP.get <
705                                         (volatile s64) ch->w_remote_GP.put ||
706                                 ((volatile u32) ch->flags &
707                                                 XPC_C_DISCONNECTING)));
708
709                 atomic_dec(&ch->kthreads_idle);
710
711         } while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
712 }
713
714
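/*
 * Entry point for the per-channel kthreads created by xpc_create_kthreads().
 * A kthread makes the connected callout if it is the first to run on the
 * channel, delivers messages until the channel begins disconnecting, and, as
 * the last kthread assigned to the channel, makes the disconnecting callout
 * and disengages the partition once no channels remain engaged.
 */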
715 static int
716 xpc_daemonize_kthread(void *args)
717 {
718         partid_t partid = XPC_UNPACK_ARG1(args);
719         u16 ch_number = XPC_UNPACK_ARG2(args);
720         struct xpc_partition *part = &xpc_partitions[partid];
721         struct xpc_channel *ch;
722         int n_needed;
723         unsigned long irq_flags;
724
725
726         daemonize("xpc%02dc%d", partid, ch_number);
727
728         dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
729                 partid, ch_number);
730
731         ch = &part->channels[ch_number];
732
733         if (!(ch->flags & XPC_C_DISCONNECTING)) {
734
735                 /* let registerer know that connection has been established */
736
737                 spin_lock_irqsave(&ch->lock, irq_flags);
738                 if (!(ch->flags & XPC_C_CONNECTCALLOUT)) {
739                         ch->flags |= XPC_C_CONNECTCALLOUT;
740                         spin_unlock_irqrestore(&ch->lock, irq_flags);
741
742                         xpc_connected_callout(ch);
743
744                         /*
745                          * It is possible that while the callout was being
746                          * made that the remote partition sent some messages.
747                          * If that is the case, we may need to activate
748                          * additional kthreads to help deliver them. We only
749                          * need one less than total #of messages to deliver.
750                          */
751                         n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
752                         if (n_needed > 0 &&
753                                         !(ch->flags & XPC_C_DISCONNECTING)) {
754                                 xpc_activate_kthreads(ch, n_needed);
755                         }
756                 } else {
757                         spin_unlock_irqrestore(&ch->lock, irq_flags);
758                 }
759
760                 xpc_kthread_waitmsgs(part, ch);
761         }
762
763         if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
764                 spin_lock_irqsave(&ch->lock, irq_flags);
765                 if ((ch->flags & XPC_C_CONNECTCALLOUT) &&
766                                 !(ch->flags & XPC_C_DISCONNECTCALLOUT)) {
767                         ch->flags |= XPC_C_DISCONNECTCALLOUT;
768                         spin_unlock_irqrestore(&ch->lock, irq_flags);
769
770                         xpc_disconnecting_callout(ch);
771                 } else {
772                         spin_unlock_irqrestore(&ch->lock, irq_flags);
773                 }
774                 if (atomic_dec_return(&part->nchannels_engaged) == 0) {
775                         xpc_mark_partition_disengaged(part);
776                         xpc_IPI_send_disengage(part);
777                 }
778         }
779
780
781         xpc_msgqueue_deref(ch);
782
783         dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
784                 partid, ch_number);
785
786         xpc_part_deref(part);
787         return 0;
788 }
789
790
791 /*
792  * For each partition that XPC has established communications with, there is
793  * a minimum of one kernel thread assigned to perform any operation that
794  * may potentially sleep or block (basically the callouts to the asynchronous
795  * functions registered via xpc_connect()).
796  *
797  * Additional kthreads are created and destroyed by XPC as the workload
798  * demands.
799  *
800  * A kthread is assigned to one of the active channels that exists for a given
801  * partition.
802  */
803 void
804 xpc_create_kthreads(struct xpc_channel *ch, int needed)
805 {
806         unsigned long irq_flags;
807         pid_t pid;
808         u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
809         struct xpc_partition *part = &xpc_partitions[ch->partid];
810
811
812         while (needed-- > 0) {
813
814                 /*
815                  * The following is done on behalf of the newly created
816                  * kthread. That kthread is responsible for doing the
817                  * counterpart to the following before it exits.
818                  */
819                 (void) xpc_part_ref(part);
820                 xpc_msgqueue_ref(ch);
821                 if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
822                     atomic_inc_return(&part->nchannels_engaged) == 1) {
823                         xpc_mark_partition_engaged(part);
824                 }
825
826                 pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
827                 if (pid < 0) {
828                         /* the fork failed */
829                         if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
830                             atomic_dec_return(&part->nchannels_engaged) == 0) {
831                                 xpc_mark_partition_disengaged(part);
832                                 xpc_IPI_send_disengage(part);
833                         }
834                         xpc_msgqueue_deref(ch);
835                         xpc_part_deref(part);
836
837                         if (atomic_read(&ch->kthreads_assigned) <
838                                                 ch->kthreads_idle_limit) {
839                                 /*
840                                  * Flag this as an error only if we have an
841                                  * insufficient #of kthreads for the channel
842                                  * to function.
843                                  *
844                                  * No xpc_msgqueue_ref() is needed here since
845                                  * the channel mgr is doing this.
846                                  */
847                                 spin_lock_irqsave(&ch->lock, irq_flags);
848                                 XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
849                                                                 &irq_flags);
850                                 spin_unlock_irqrestore(&ch->lock, irq_flags);
851                         }
852                         break;
853                 }
854
855                 ch->kthreads_created++; // >>> temporary debug only!!!
856         }
857 }
858
859
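/*
 * Wait for the disconnect of the given channel to complete on every
 * partition, clear each channel's XPC_C_WDISCONNECT flag, and redeliver any
 * IPI flags that arrived while the disconnect was in progress.
 */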
860 void
861 xpc_disconnect_wait(int ch_number)
862 {
863         unsigned long irq_flags;
864         partid_t partid;
865         struct xpc_partition *part;
866         struct xpc_channel *ch;
867         int wakeup_channel_mgr;
868
869
870         /* now wait for all callouts to the caller's function to cease */
871         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
872                 part = &xpc_partitions[partid];
873
874                 if (!xpc_part_ref(part)) {
875                         continue;
876                 }
877
878                 ch = &part->channels[ch_number];
879
880                 if (!(ch->flags & XPC_C_WDISCONNECT)) {
881                         xpc_part_deref(part);
882                         continue;
883                 }
884
885                 (void) down(&ch->wdisconnect_sema);
886
887                 spin_lock_irqsave(&ch->lock, irq_flags);
888                 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
889                 wakeup_channel_mgr = 0;
890
891                 if (ch->delayed_IPI_flags) {
892                         if (part->act_state != XPC_P_DEACTIVATING) {
893                                 spin_lock(&part->IPI_lock);
894                                 XPC_SET_IPI_FLAGS(part->local_IPI_amo,
895                                         ch->number, ch->delayed_IPI_flags);
896                                 spin_unlock(&part->IPI_lock);
897                                 wakeup_channel_mgr = 1;
898                         }
899                         ch->delayed_IPI_flags = 0;
900                 }
901
902                 ch->flags &= ~XPC_C_WDISCONNECT;
903                 spin_unlock_irqrestore(&ch->lock, irq_flags);
904
905                 if (wakeup_channel_mgr) {
906                         xpc_wakeup_channel_mgr(part);
907                 }
908
909                 xpc_part_deref(part);
910         }
911 }
912
913
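/*
 * Common teardown path for module unload and the reboot notifier: tell the
 * heartbeat checker and discovery threads to exit, deactivate all partitions
 * and wait for them to disengage, then dismantle the remaining XPC state.
 */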
914 static void
915 xpc_do_exit(enum xpc_retval reason)
916 {
917         partid_t partid;
918         int active_part_count;
919         struct xpc_partition *part;
920         unsigned long printmsg_time;
921
922
923         /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
924         DBUG_ON(xpc_exiting == 1);
925
926         /*
927          * Let the heartbeat checker thread and the discovery thread
928          * (if one is running) know that they should exit. Also wake up
929          * the heartbeat checker thread in case it's sleeping.
930          */
931         xpc_exiting = 1;
932         wake_up_interruptible(&xpc_act_IRQ_wq);
933
934         /* ignore all incoming interrupts */
935         free_irq(SGI_XPC_ACTIVATE, NULL);
936
937         /* wait for the discovery thread to exit */
938         down(&xpc_discovery_exited);
939
940         /* wait for the heartbeat checker thread to exit */
941         down(&xpc_hb_checker_exited);
942
943
944         /* sleep for 1/3 of a second or so */
945         (void) msleep_interruptible(300);
946
947
948         /* wait for all partitions to become inactive */
949
950         printmsg_time = jiffies;
951
952         do {
953                 active_part_count = 0;
954
955                 for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
956                         part = &xpc_partitions[partid];
957
958                         if (xpc_partition_disengaged(part) &&
959                                         part->act_state == XPC_P_INACTIVE) {
960                                 continue;
961                         }
962
963                         active_part_count++;
964
965                         XPC_DEACTIVATE_PARTITION(part, reason);
966                 }
967
968                 if (active_part_count == 0) {
969                         break;
970                 }
971
972                 if (jiffies >= printmsg_time) {
973                         dev_info(xpc_part, "waiting for partitions to "
974                                 "deactivate/disengage, active count=%d, remote "
975                                 "engaged=0x%lx\n", active_part_count,
976                                 xpc_partition_engaged(1UL << partid));
977
978                         printmsg_time = jiffies +
979                                         (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
980                 }
981
982                 /* sleep for 1/3 of a second or so */
983                 (void) msleep_interruptible(300);
984
985         } while (1);
986
987         DBUG_ON(xpc_partition_engaged(-1UL));
988
989
990         /* indicate to others that our reserved page is uninitialized */
991         xpc_rsvd_page->vars_pa = 0;
992
993         /* now it's time to eliminate our heartbeat */
994         del_timer_sync(&xpc_hb_timer);
995         DBUG_ON(xpc_vars->heartbeating_to_mask != 0);
996
997         /* take ourselves off of the reboot_notifier_list */
998         (void) unregister_reboot_notifier(&xpc_reboot_notifier);
999
1000         /* close down protections for IPI operations */
1001         xpc_restrict_IPI_ops();
1002
1003
1004         /* clear the interface to XPC's functions */
1005         xpc_clear_interface();
1006
1007         if (xpc_sysctl) {
1008                 unregister_sysctl_table(xpc_sysctl);
1009         }
1010 }
1011
1012
1013 /*
1014  * This function is called when the system is being rebooted.
1015  */
1016 static int
1017 xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1018 {
1019         enum xpc_retval reason;
1020
1021
1022         switch (event) {
1023         case SYS_RESTART:
1024                 reason = xpcSystemReboot;
1025                 break;
1026         case SYS_HALT:
1027                 reason = xpcSystemHalt;
1028                 break;
1029         case SYS_POWER_OFF:
1030                 reason = xpcSystemPoweroff;
1031                 break;
1032         default:
1033                 reason = xpcSystemGoingDown;
1034         }
1035
1036         xpc_do_exit(reason);
1037         return NOTIFY_DONE;
1038 }
1039
1040
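/*
 * Module initialization: set up the debug devices and sysctl entries,
 * initialize the xpc_partitions[] bookkeeping, register the ACTIVATE IRQ
 * handler, publish our reserved page, start the heartbeat, and spawn the
 * heartbeat checker and discovery threads.
 */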
1041 int __init
1042 xpc_init(void)
1043 {
1044         int ret;
1045         partid_t partid;
1046         struct xpc_partition *part;
1047         pid_t pid;
1048
1049
1050         if (!ia64_platform_is("sn2")) {
1051                 return -ENODEV;
1052         }
1053
1054         /*
1055          * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
1056          * various portions of a partition's reserved page. Its size is based
1057          * on the size of the reserved page header and part_nasids mask. So we
1058          * need to ensure that the other items will fit as well.
1059          */
1060         if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) {
1061                 dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
1062                 return -EPERM;
1063         }
1064         DBUG_ON((u64) xpc_remote_copy_buffer !=
1065                                 L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer));
1066
1067         snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
1068         snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");
1069
1070         xpc_sysctl = register_sysctl_table(xpc_sys_dir, 1);
1071
1072         /*
1073          * The first few fields of each entry of xpc_partitions[] need to
1074          * be initialized now so that calls to xpc_connect() and
1075          * xpc_disconnect() can be made prior to the activation of any remote
1076          * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
1077          * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
1078          * PARTITION HAS BEEN ACTIVATED.
1079          */
1080         for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
1081                 part = &xpc_partitions[partid];
1082
1083                 DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));
1084
1085                 part->act_IRQ_rcvd = 0;
1086                 spin_lock_init(&part->act_lock);
1087                 part->act_state = XPC_P_INACTIVE;
1088                 XPC_SET_REASON(part, 0, 0);
1089
1090                 init_timer(&part->disengage_request_timer);
1091                 part->disengage_request_timer.function =
1092                                 xpc_timeout_partition_disengage_request;
1093                 part->disengage_request_timer.data = (unsigned long) part;
1094
1095                 part->setup_state = XPC_P_UNSET;
1096                 init_waitqueue_head(&part->teardown_wq);
1097                 atomic_set(&part->references, 0);
1098         }
1099
1100         /*
1101          * Open up protections for IPI operations (and AMO operations on
1102          * Shub 1.1 systems).
1103          */
1104         xpc_allow_IPI_ops();
1105
1106         /*
1107          * Interrupts being processed will increment this atomic variable and
1108          * awaken the heartbeat thread which will process the interrupts.
1109          */
1110         atomic_set(&xpc_act_IRQ_rcvd, 0);
1111
1112         /*
1113          * This is safe to do before the xpc_hb_checker thread has started
1114          * because the handler simply wakes up a wait queue.  If an interrupt is
1115          * received before the thread is waiting, it will not go to sleep,
1116          * but rather immediately process the interrupt.
1117          */
1118         ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
1119                                                         "xpc hb", NULL);
1120         if (ret != 0) {
1121                 dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
1122                         "errno=%d\n", -ret);
1123
1124                 xpc_restrict_IPI_ops();
1125
1126                 if (xpc_sysctl) {
1127                         unregister_sysctl_table(xpc_sysctl);
1128                 }
1129                 return -EBUSY;
1130         }
1131
1132         /*
1133          * Fill the partition reserved page with the information needed by
1134          * other partitions to discover we are alive and establish initial
1135          * communications.
1136          */
1137         xpc_rsvd_page = xpc_rsvd_page_init();
1138         if (xpc_rsvd_page == NULL) {
1139                 dev_err(xpc_part, "could not setup our reserved page\n");
1140
1141                 free_irq(SGI_XPC_ACTIVATE, NULL);
1142                 xpc_restrict_IPI_ops();
1143
1144                 if (xpc_sysctl) {
1145                         unregister_sysctl_table(xpc_sysctl);
1146                 }
1147                 return -EBUSY;
1148         }
1149
1150
1151         /* add ourselves to the reboot_notifier_list */
1152         ret = register_reboot_notifier(&xpc_reboot_notifier);
1153         if (ret != 0) {
1154                 dev_warn(xpc_part, "can't register reboot notifier\n");
1155         }
1156
1157
1158         /*
1159          * Set the heartbeating to other partitions into motion.  This is
1160          * the last requirement for other partitions' discovery to
1161          * initiate communications with us.
1162          */
1163         init_timer(&xpc_hb_timer);
1164         xpc_hb_timer.function = xpc_hb_beater;
1165         xpc_hb_beater(0);
1166
1167
1168         /*
1169          * The real work-horse behind xpc.  This processes incoming
1170          * interrupts and monitors remote heartbeats.
1171          */
1172         pid = kernel_thread(xpc_hb_checker, NULL, 0);
1173         if (pid < 0) {
1174                 dev_err(xpc_part, "failed while forking hb check thread\n");
1175
1176                 /* indicate to others that our reserved page is uninitialized */
1177                 xpc_rsvd_page->vars_pa = 0;
1178
1179                 /* take ourselves off of the reboot_notifier_list */
1180                 (void) unregister_reboot_notifier(&xpc_reboot_notifier);
1181
1182                 del_timer_sync(&xpc_hb_timer);
1183                 free_irq(SGI_XPC_ACTIVATE, NULL);
1184                 xpc_restrict_IPI_ops();
1185
1186                 if (xpc_sysctl) {
1187                         unregister_sysctl_table(xpc_sysctl);
1188                 }
1189                 return -EBUSY;
1190         }
1191
1192
1193         /*
1194          * Startup a thread that will attempt to discover other partitions to
1195          * activate based on info provided by SAL. This new thread is short
1196          * lived and will exit once discovery is complete.
1197          */
1198         pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
1199         if (pid < 0) {
1200                 dev_err(xpc_part, "failed while forking discovery thread\n");
1201
1202                 /* mark this new thread as a non-starter */
1203                 up(&xpc_discovery_exited);
1204
1205                 xpc_do_exit(xpcUnloading);
1206                 return -EBUSY;
1207         }
1208
1209
1210         /* set the interface to point at XPC's functions */
1211         xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1212                           xpc_initiate_allocate, xpc_initiate_send,
1213                           xpc_initiate_send_notify, xpc_initiate_received,
1214                           xpc_initiate_partid_to_nasids);
1215
1216         return 0;
1217 }
1218 module_init(xpc_init);
1219
1220
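/* module unload: deactivate all partitions and dismantle XPC */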
1221 void __exit
1222 xpc_exit(void)
1223 {
1224         xpc_do_exit(xpcUnloading);
1225 }
1226 module_exit(xpc_exit);
1227
1228
1229 MODULE_AUTHOR("Silicon Graphics, Inc.");
1230 MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1231 MODULE_LICENSE("GPL");
1232
1233 module_param(xpc_hb_interval, int, 0);
1234 MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1235                 "heartbeat increments.");
1236
1237 module_param(xpc_hb_check_interval, int, 0);
1238 MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1239                 "heartbeat checks.");
1240
1241 module_param(xpc_disengage_request_timelimit, int, 0);
1242 MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
1243                 "for disengage request to complete.");
1244