/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
 */


/*
 * Cross Partition Communication (XPC) support - standard version.
 *
 *	XPC provides a message passing capability that crosses partition
 *	boundaries. This module is made up of two parts:
 *
 *	    partition	This part detects the presence/absence of other
 *			partitions. It provides a heartbeat and monitors
 *			the heartbeats of other partitions.
 *
 *	    channel	This part manages the channels and sends/receives
 *			messages across them to/from other partitions.
 *
 *	There are a couple of additional functions residing in XP, which
 *	provide an interface to XPC for its users.
 *
 *
 *	Caveats:
 *
 *	  . We currently have no way to determine which nasid an IPI came
 *	    from. Thus, xpc_IPI_send() does a remote AMO write followed by
 *	    an IPI. The AMO indicates where data is to be pulled from, so
 *	    after the IPI arrives, the remote partition checks the AMO word.
 *	    The IPI can actually arrive before the AMO however, so other code
 *	    must periodically check for this case. Also, remote AMO operations
 *	    do not reliably time out. Thus we do a remote PIO read solely to
 *	    know whether the remote partition is down and whether we should
 *	    stop sending IPIs to it. This remote PIO read operation is set up
 *	    in a special nofault region so SAL knows to ignore (and cleanup)
 *	    any errors due to the remote AMO write, PIO read, and/or PIO
 *	    write operations.
 *
 *	    If/when new hardware solves this IPI problem, we should abandon
 *	    the current approach.
 *
 */


#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/syscalls.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <asm/sn/intr.h>
#include <asm/sn/sn_sal.h>
#include <asm/kdebug.h>
#include <asm/uaccess.h>
#include "xpc.h"


/* define two XPC debug device structures to be used with dev_dbg() et al */

struct device_driver xpc_dbg_name = {
	.name = "xpc"
};

struct device xpc_part_dbg_subname = {
	.bus_id = {0},		/* set to "part" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device xpc_chan_dbg_subname = {
	.bus_id = {0},		/* set to "chan" at xpc_init() time */
	.driver = &xpc_dbg_name
};

struct device *xpc_part = &xpc_part_dbg_subname;
struct device *xpc_chan = &xpc_chan_dbg_subname;


/* systune related variables for /proc/sys directories */

static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
static int xpc_hb_min_interval = 1;
static int xpc_hb_max_interval = 10;

static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
static int xpc_hb_check_min_interval = 10;
static int xpc_hb_check_max_interval = 120;

int xpc_disengage_request_timelimit = XPC_DISENGAGE_REQUEST_DEFAULT_TIMELIMIT;
static int xpc_disengage_request_min_timelimit = 0;
static int xpc_disengage_request_max_timelimit = 120;

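/*
 * The arrays below rely on positional initializers for the 2.6-era
 * ctl_table layout, i.e. { ctl_name, procname, data, maxlen, mode, child,
 * proc_handler, strategy, de, extra1, extra2 }.  Assuming that layout,
 * they export /proc/sys/xpc/hb/hb_interval,
 * /proc/sys/xpc/hb/hb_check_interval and
 * /proc/sys/xpc/disengage_request_timelimit, each clamped by
 * proc_dointvec_minmax() to the *_min_*/*_max_* bounds defined above.
 */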
static ctl_table xpc_sys_xpc_hb_dir[] = {
	{
		1,
		"hb_interval",
		&xpc_hb_interval,
		sizeof(int),
		0644,
		NULL,
		&proc_dointvec_minmax,
		&sysctl_intvec,
		NULL,
		&xpc_hb_min_interval,
		&xpc_hb_max_interval
	},
	{
		2,
		"hb_check_interval",
		&xpc_hb_check_interval,
		sizeof(int),
		0644,
		NULL,
		&proc_dointvec_minmax,
		&sysctl_intvec,
		NULL,
		&xpc_hb_check_min_interval,
		&xpc_hb_check_max_interval
	},
	{0}
};
static ctl_table xpc_sys_xpc_dir[] = {
	{
		1,
		"hb",
		NULL,
		0,
		0555,
		xpc_sys_xpc_hb_dir
	},
	{
		2,
		"disengage_request_timelimit",
		&xpc_disengage_request_timelimit,
		sizeof(int),
		0644,
		NULL,
		&proc_dointvec_minmax,
		&sysctl_intvec,
		NULL,
		&xpc_disengage_request_min_timelimit,
		&xpc_disengage_request_max_timelimit
	},
	{0}
};
static ctl_table xpc_sys_dir[] = {
	{
		1,
		"xpc",
		NULL,
		0,
		0555,
		xpc_sys_xpc_dir
	},
	{0}
};
static struct ctl_table_header *xpc_sysctl;

/* non-zero if any remote partition disengage request timed out */
int xpc_disengage_request_timedout;

/* #of IRQs received */
static atomic_t xpc_act_IRQ_rcvd;

/* IRQ handler notifies this wait queue on receipt of an IRQ */
static DECLARE_WAIT_QUEUE_HEAD(xpc_act_IRQ_wq);

static unsigned long xpc_hb_check_timeout;

/* notification that the xpc_hb_checker thread has exited */
static DECLARE_MUTEX_LOCKED(xpc_hb_checker_exited);

/* notification that the xpc_discovery thread has exited */
static DECLARE_MUTEX_LOCKED(xpc_discovery_exited);
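
/*
 * The two semaphores above start locked (count 0) and act as completions:
 * each thread signals with up() just before it exits, and xpc_do_exit()
 * blocks in down() until both have done so.
 */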


static struct timer_list xpc_hb_timer;


static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);


static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_reboot_notifier = {
	.notifier_call = xpc_system_reboot,
};

static int xpc_system_die(struct notifier_block *, unsigned long, void *);
static struct notifier_block xpc_die_notifier = {
	.notifier_call = xpc_system_die,
};


/*
 * Timer function to enforce the timelimit on the partition disengage request.
 */
static void
xpc_timeout_partition_disengage_request(unsigned long data)
{
	struct xpc_partition *part = (struct xpc_partition *) data;


	DBUG_ON(jiffies < part->disengage_request_timeout);

	(void) xpc_partition_disengaged(part);

	DBUG_ON(part->disengage_request_timeout != 0);
	DBUG_ON(xpc_partition_engaged(1UL << XPC_PARTID(part)) != 0);
}


/*
 * Notify the heartbeat check thread that an IRQ has been received.
 */
static irqreturn_t
xpc_act_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	atomic_inc(&xpc_act_IRQ_rcvd);
	wake_up_interruptible(&xpc_act_IRQ_wq);
	return IRQ_HANDLED;
}


/*
 * Timer to produce the heartbeat.  The timer structure's function field is
 * already set when this is initially called.  A tunable is used to
 * specify when the next timeout should occur.
 */
static void
xpc_hb_beater(unsigned long dummy)
{
	xpc_vars->heartbeat++;

	if (jiffies >= xpc_hb_check_timeout) {
		wake_up_interruptible(&xpc_act_IRQ_wq);
	}

	xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
	add_timer(&xpc_hb_timer);
}
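
/*
 * Note that the beater only bumps the heartbeat word and re-arms itself;
 * checking of remote heartbeats is deferred to xpc_hb_checker() via the
 * wait queue, presumably to keep the work done in timer context minimal.
 */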


/*
 * This thread is responsible for nearly all of the partition
 * activation/deactivation.
 */
static int
xpc_hb_checker(void *ignore)
{
	int last_IRQ_count = 0;
	int new_IRQ_count;
	int force_IRQ = 0;


	/* this thread was marked active by xpc_hb_init() */

	daemonize(XPC_HB_CHECK_THREAD_NAME);

	set_cpus_allowed(current, cpumask_of_cpu(XPC_HB_CHECK_CPU));

	xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);

	while (!(volatile int) xpc_exiting) {

		dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
			"been received\n",
			(int) (xpc_hb_check_timeout - jiffies),
			atomic_read(&xpc_act_IRQ_rcvd) - last_IRQ_count);


		/* checking of remote heartbeats is skewed by IRQ handling */
		if (jiffies >= xpc_hb_check_timeout) {
			dev_dbg(xpc_part, "checking remote heartbeats\n");
			xpc_check_remote_hb();

			/*
			 * We need to periodically recheck to ensure no
			 * IPI/AMO pairs have been missed.  That check
			 * must always reset xpc_hb_check_timeout.
			 */
			force_IRQ = 1;
		}


		/* check for outstanding IRQs */
		new_IRQ_count = atomic_read(&xpc_act_IRQ_rcvd);
		if (last_IRQ_count < new_IRQ_count || force_IRQ != 0) {
			force_IRQ = 0;

			dev_dbg(xpc_part, "found an IRQ to process; will be "
				"resetting xpc_hb_check_timeout\n");

			last_IRQ_count += xpc_identify_act_IRQ_sender();
			if (last_IRQ_count < new_IRQ_count) {
				/* retry once to help avoid missing AMO */
				(void) xpc_identify_act_IRQ_sender();
			}
			last_IRQ_count = new_IRQ_count;

			xpc_hb_check_timeout = jiffies +
					   (xpc_hb_check_interval * HZ);
		}

		/* wait for IRQ or timeout */
		(void) wait_event_interruptible(xpc_act_IRQ_wq,
			    (last_IRQ_count < atomic_read(&xpc_act_IRQ_rcvd) ||
					jiffies >= xpc_hb_check_timeout ||
						(volatile int) xpc_exiting));
	}

	dev_dbg(xpc_part, "heartbeat checker is exiting\n");


	/* mark this thread as having exited */
	up(&xpc_hb_checker_exited);
	return 0;
}


/*
 * This thread will attempt to discover other partitions to activate
 * based on info provided by SAL. This new thread is short-lived and
 * will exit once discovery is complete.
 */
static int
xpc_initiate_discovery(void *ignore)
{
	daemonize(XPC_DISCOVERY_THREAD_NAME);

	xpc_discovery();

	dev_dbg(xpc_part, "discovery thread is exiting\n");

	/* mark this thread as having exited */
	up(&xpc_discovery_exited);
	return 0;
}


/*
 * Establish first contact with the remote partition. This involves pulling
 * the XPC per partition variables from the remote partition and waiting for
 * the remote partition to pull ours.
 */
static enum xpc_retval
xpc_make_first_contact(struct xpc_partition *part)
{
	enum xpc_retval ret;


	while ((ret = xpc_pull_remote_vars_part(part)) != xpcSuccess) {
		if (ret != xpcRetry) {
			XPC_DEACTIVATE_PARTITION(part, ret);
			return ret;
		}

		dev_dbg(xpc_chan, "waiting to make first contact with "
			"partition %d\n", XPC_PARTID(part));

		/* wait a 1/4 of a second or so */
		(void) msleep_interruptible(250);

		if (part->act_state == XPC_P_DEACTIVATING) {
			return part->reason;
		}
	}

	return xpc_mark_partition_active(part);
}


/*
 * The first kthread assigned to a newly activated partition is the one
 * created by XPC HB with which it calls xpc_partition_up(). XPC hangs on to
 * that kthread until the partition is brought down, at which time that
 * kthread returns to XPC HB. (The return of that kthread will signify to XPC
 * HB that XPC has dismantled all communication infrastructure for the
 * associated partition.) This kthread becomes the channel manager for that
 * partition.
 *
 * Each active partition has a channel manager, which, besides connecting and
 * disconnecting channels, ensures that each of the partition's connected
 * channels has the required number of assigned kthreads to get the work done.
 */
static void
xpc_channel_mgr(struct xpc_partition *part)
{
	while (part->act_state != XPC_P_DEACTIVATING ||
			atomic_read(&part->nchannels_active) > 0 ||
					!xpc_partition_disengaged(part)) {

		xpc_process_channel_activity(part);


		/*
		 * Wait until we've been requested to activate kthreads or
		 * all of the channel's message queues have been torn down or
		 * a signal is pending.
		 *
		 * The channel_mgr_requests count is set to 1 after the
		 * channel mgr is awakened.  This prevents it from making one
		 * pass through the loop for each request, since it services
		 * all outstanding requests in a single pass.  The count is
		 * set to 1 instead of 0 so that other kthreads know the
		 * channel mgr is running and won't bother trying to wake
		 * it up.
		 */
		atomic_dec(&part->channel_mgr_requests);
		(void) wait_event_interruptible(part->channel_mgr_wq,
				(atomic_read(&part->channel_mgr_requests) > 0 ||
				(volatile u64) part->local_IPI_amo != 0 ||
				((volatile u8) part->act_state ==
							XPC_P_DEACTIVATING &&
				atomic_read(&part->nchannels_active) == 0 &&
				xpc_partition_disengaged(part))));
		atomic_set(&part->channel_mgr_requests, 1);

		// >>> Does it need to wakeup periodically as well? In case we
		// >>> miscalculated the #of kthreads to wakeup or create?
	}
}


/*
 * When XPC HB determines that a partition has come up, it will create a new
 * kthread and that kthread will call this function to attempt to set up the
 * basic infrastructure used for Cross Partition Communication with the newly
 * upped partition.
 *
 * The kthread that was created by XPC HB and which set up the XPC
 * infrastructure will remain assigned to the partition until the partition
 * goes down, at which time the kthread will tear down the XPC infrastructure
 * and then exit.
 *
 * XPC HB will put the physical address of the remote partition's XPC per
 * partition specific variables into xpc_partitions[partid].remote_vars_part_pa
 * prior to calling xpc_partition_up().
 */
static void
xpc_partition_up(struct xpc_partition *part)
{
	DBUG_ON(part->channels != NULL);

	dev_dbg(xpc_chan, "activating partition %d\n", XPC_PARTID(part));

	if (xpc_setup_infrastructure(part) != xpcSuccess) {
		return;
	}

	/*
	 * The kthread that XPC HB called us with will become the
	 * channel manager for this partition. It will not return
	 * back to XPC HB until the partition's XPC infrastructure
	 * has been dismantled.
	 */

	(void) xpc_part_ref(part);	/* this will always succeed */

	if (xpc_make_first_contact(part) == xpcSuccess) {
		xpc_channel_mgr(part);
	}

	xpc_part_deref(part);

	xpc_teardown_infrastructure(part);
}


static int
xpc_activating(void *__partid)
{
	partid_t partid = (u64) __partid;
	struct xpc_partition *part = &xpc_partitions[partid];
	unsigned long irq_flags;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	int ret;


	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);

	spin_lock_irqsave(&part->act_lock, irq_flags);

	if (part->act_state == XPC_P_DEACTIVATING) {
		part->act_state = XPC_P_INACTIVE;
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	/* indicate the thread is activating */
	DBUG_ON(part->act_state != XPC_P_ACTIVATION_REQ);
	part->act_state = XPC_P_ACTIVATING;

	XPC_SET_REASON(part, 0, 0);
	spin_unlock_irqrestore(&part->act_lock, irq_flags);

	dev_dbg(xpc_part, "bringing partition %d up\n", partid);

	daemonize("xpc%02d", partid);

	/*
	 * This thread needs to run at a realtime priority to prevent a
	 * significant performance degradation.
	 */
	ret = sched_setscheduler(current, SCHED_FIFO, &param);
	if (ret != 0) {
		dev_warn(xpc_part, "unable to set pid %d to a realtime "
			"priority, ret=%d\n", current->pid, ret);
	}

	/* allow this thread and its children to run on any CPU */
	set_cpus_allowed(current, CPU_MASK_ALL);

	/*
	 * Register the remote partition's AMOs with SAL so it can handle
	 * and cleanup errors within that address range should the remote
	 * partition go down. We don't unregister this range because it is
	 * difficult to tell when outstanding writes to the remote partition
	 * are finished and thus when it is safe to unregister. This should
	 * not result in wasted space in the SAL xp_addr_region table because
	 * we should get the same page for remote_amos_page_pa after module
	 * reloads and system reboots.
	 */
	if (sn_register_xp_addr_region(part->remote_amos_page_pa,
							PAGE_SIZE, 1) < 0) {
		dev_warn(xpc_part, "xpc_partition_up(%d) failed to register "
			"xp_addr region\n", partid);

		spin_lock_irqsave(&part->act_lock, irq_flags);
		part->act_state = XPC_P_INACTIVE;
		XPC_SET_REASON(part, xpcPhysAddrRegFailed, __LINE__);
		spin_unlock_irqrestore(&part->act_lock, irq_flags);
		part->remote_rp_pa = 0;
		return 0;
	}

	xpc_allow_hb(partid, xpc_vars);
	xpc_IPI_send_activated(part);


	/*
	 * xpc_partition_up() holds this thread and marks this partition as
	 * XPC_P_ACTIVE by calling xpc_hb_mark_active().
	 */
	(void) xpc_partition_up(part);

	xpc_disallow_hb(partid, xpc_vars);
	xpc_mark_partition_inactive(part);

	if (part->reason == xpcReactivating) {
		/* interrupting ourselves results in activating partition */
		xpc_IPI_send_reactivate(part);
	}

	return 0;
}


void
xpc_activate_partition(struct xpc_partition *part)
{
	partid_t partid = XPC_PARTID(part);
	unsigned long irq_flags;
	pid_t pid;


	spin_lock_irqsave(&part->act_lock, irq_flags);

	pid = kernel_thread(xpc_activating, (void *) ((u64) partid), 0);
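	/*
	 * Note that kernel_thread() takes a single void * argument, so the
	 * partid is passed by value disguised as a pointer; xpc_activating()
	 * casts it straight back to a partid_t rather than dereferencing it.
	 */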

	DBUG_ON(part->act_state != XPC_P_INACTIVE);

	if (pid > 0) {
		part->act_state = XPC_P_ACTIVATION_REQ;
		XPC_SET_REASON(part, xpcCloneKThread, __LINE__);
	} else {
		XPC_SET_REASON(part, xpcCloneKThreadFailed, __LINE__);
	}

	spin_unlock_irqrestore(&part->act_lock, irq_flags);
}


/*
 * Handle the receipt of a SGI_XPC_NOTIFY IRQ by seeing whether the specified
 * partition actually sent it. Since SGI_XPC_NOTIFY IRQs may be shared by more
 * than one partition, we use an AMO_t structure per partition to indicate
 * whether a partition has sent an IPI or not.  >>> If it has, then wake up the
 * associated kthread to handle it.
 *
 * All SGI_XPC_NOTIFY IRQs received by XPC are the result of IPIs sent by XPC
 * running on other partitions.
 *
 * Noteworthy Arguments:
 *
 *	irq - Interrupt ReQuest number. NOT USED.
 *
 *	dev_id - partid of IPI's potential sender.
 *
 *	regs - processor's context before the processor entered
 *	       interrupt code. NOT USED.
 */
irqreturn_t
xpc_notify_IRQ_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	partid_t partid = (partid_t) (u64) dev_id;
	struct xpc_partition *part = &xpc_partitions[partid];


	DBUG_ON(partid <= 0 || partid >= XP_MAX_PARTITIONS);

	if (xpc_part_ref(part)) {
		xpc_check_for_channel_activity(part);

		xpc_part_deref(part);
	}
	return IRQ_HANDLED;
}


/*
 * Check to see if xpc_notify_IRQ_handler() dropped any IPIs on the floor
 * because the write to their associated IPI amo completed after the IRQ/IPI
 * was received.
 */
void
xpc_dropped_IPI_check(struct xpc_partition *part)
{
	if (xpc_part_ref(part)) {
		xpc_check_for_channel_activity(part);

		part->dropped_IPI_timer.expires = jiffies +
							XPC_P_DROPPED_IPI_WAIT;
		add_timer(&part->dropped_IPI_timer);
		xpc_part_deref(part);
	}
}


void
xpc_activate_kthreads(struct xpc_channel *ch, int needed)
{
	int idle = atomic_read(&ch->kthreads_idle);
	int assigned = atomic_read(&ch->kthreads_assigned);
	int wakeup;


	DBUG_ON(needed <= 0);

	if (idle > 0) {
		wakeup = (needed > idle) ? idle : needed;
		needed -= wakeup;

		dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
			"channel=%d\n", wakeup, ch->partid, ch->number);

		/* only wakeup the requested number of kthreads */
		wake_up_nr(&ch->idle_wq, wakeup);
	}

	if (needed <= 0) {
		return;
	}

	if (needed + assigned > ch->kthreads_assigned_limit) {
		needed = ch->kthreads_assigned_limit - assigned;
		// >>>should never be less than 0
		if (needed <= 0) {
			return;
		}
	}

	dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
		needed, ch->partid, ch->number);

	xpc_create_kthreads(ch, needed);
}


/*
 * This function is where XPC's kthreads wait for messages to deliver.
 */
static void
xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
{
	do {
		/* deliver messages to their intended recipients */

		while ((volatile s64) ch->w_local_GP.get <
				(volatile s64) ch->w_remote_GP.put &&
					!((volatile u32) ch->flags &
						XPC_C_DISCONNECTING)) {
			xpc_deliver_msg(ch);
		}

		if (atomic_inc_return(&ch->kthreads_idle) >
						ch->kthreads_idle_limit) {
			/* too many idle kthreads on this channel */
			atomic_dec(&ch->kthreads_idle);
			break;
		}

		dev_dbg(xpc_chan, "idle kthread calling "
			"wait_event_interruptible_exclusive()\n");

		(void) wait_event_interruptible_exclusive(ch->idle_wq,
				((volatile s64) ch->w_local_GP.get <
					(volatile s64) ch->w_remote_GP.put ||
				((volatile u32) ch->flags &
						XPC_C_DISCONNECTING)));

		atomic_dec(&ch->kthreads_idle);

	} while (!((volatile u32) ch->flags & XPC_C_DISCONNECTING));
}


static int
xpc_daemonize_kthread(void *args)
{
	partid_t partid = XPC_UNPACK_ARG1(args);
	u16 ch_number = XPC_UNPACK_ARG2(args);
	struct xpc_partition *part = &xpc_partitions[partid];
	struct xpc_channel *ch;
	int n_needed;
	unsigned long irq_flags;


	daemonize("xpc%02dc%d", partid, ch_number);

	dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
		partid, ch_number);

	ch = &part->channels[ch_number];

	if (!(ch->flags & XPC_C_DISCONNECTING)) {

		/* let registerer know that connection has been established */

		spin_lock_irqsave(&ch->lock, irq_flags);
		if (!(ch->flags & XPC_C_CONNECTCALLOUT)) {
			ch->flags |= XPC_C_CONNECTCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_connected_callout(ch);

			/*
			 * It is possible that the remote partition sent some
			 * messages while the callout was being made. If that
			 * is the case, we may need to activate additional
			 * kthreads to help deliver them. We only need one
			 * less than the total #of messages to deliver.
			 */
			n_needed = ch->w_remote_GP.put - ch->w_local_GP.get - 1;
			if (n_needed > 0 &&
					!(ch->flags & XPC_C_DISCONNECTING)) {
				xpc_activate_kthreads(ch, n_needed);
			}
		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}

		xpc_kthread_waitmsgs(part, ch);
	}

	if (atomic_dec_return(&ch->kthreads_assigned) == 0) {
		spin_lock_irqsave(&ch->lock, irq_flags);
		if ((ch->flags & XPC_C_CONNECTCALLOUT) &&
				!(ch->flags & XPC_C_DISCONNECTCALLOUT)) {
			ch->flags |= XPC_C_DISCONNECTCALLOUT;
			spin_unlock_irqrestore(&ch->lock, irq_flags);

			xpc_disconnect_callout(ch, xpcDisconnecting);
		} else {
			spin_unlock_irqrestore(&ch->lock, irq_flags);
		}
		if (atomic_dec_return(&part->nchannels_engaged) == 0) {
			xpc_mark_partition_disengaged(part);
			xpc_IPI_send_disengage(part);
		}
	}


	xpc_msgqueue_deref(ch);

	dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
		partid, ch_number);

	xpc_part_deref(part);
	return 0;
}


/*
 * For each partition that XPC has established communications with, there is
 * a minimum of one kernel thread assigned to perform any operation that
 * may potentially sleep or block (basically the callouts to the asynchronous
 * functions registered via xpc_connect()).
 *
 * Additional kthreads are created and destroyed by XPC as the workload
 * demands.
 *
 * A kthread is assigned to one of the active channels that exists for a given
 * partition.
 */
void
xpc_create_kthreads(struct xpc_channel *ch, int needed)
{
	unsigned long irq_flags;
	pid_t pid;
	u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
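	/*
	 * XPC_PACK_ARGS() squeezes the partid and channel number into a
	 * single u64 so that both survive the lone void * argument taken by
	 * kernel_thread(); xpc_daemonize_kthread() recovers them with
	 * XPC_UNPACK_ARG1()/XPC_UNPACK_ARG2().
	 */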
	struct xpc_partition *part = &xpc_partitions[ch->partid];


	while (needed-- > 0) {

		/*
		 * The following is done on behalf of the newly created
		 * kthread. That kthread is responsible for doing the
		 * counterpart to the following before it exits.
		 */
		(void) xpc_part_ref(part);
		xpc_msgqueue_ref(ch);
		if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
		    atomic_inc_return(&part->nchannels_engaged) == 1) {
			xpc_mark_partition_engaged(part);
		}

		pid = kernel_thread(xpc_daemonize_kthread, (void *) args, 0);
		if (pid < 0) {
			/* the fork failed */
			if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
			    atomic_dec_return(&part->nchannels_engaged) == 0) {
				xpc_mark_partition_disengaged(part);
				xpc_IPI_send_disengage(part);
			}
			xpc_msgqueue_deref(ch);
			xpc_part_deref(part);

			if (atomic_read(&ch->kthreads_assigned) <
						ch->kthreads_idle_limit) {
				/*
				 * Flag this as an error only if we have an
				 * insufficient #of kthreads for the channel
				 * to function.
				 *
				 * No xpc_msgqueue_ref() is needed here since
				 * the channel mgr is doing this.
				 */
				spin_lock_irqsave(&ch->lock, irq_flags);
				XPC_DISCONNECT_CHANNEL(ch, xpcLackOfResources,
								&irq_flags);
				spin_unlock_irqrestore(&ch->lock, irq_flags);
			}
			break;
		}

		ch->kthreads_created++;	// >>> temporary debug only!!!
	}
}


void
xpc_disconnect_wait(int ch_number)
{
	unsigned long irq_flags;
	partid_t partid;
	struct xpc_partition *part;
	struct xpc_channel *ch;
	int wakeup_channel_mgr;


	/* now wait for all callouts to the caller's function to cease */
	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
		part = &xpc_partitions[partid];

		if (!xpc_part_ref(part)) {
			continue;
		}

		ch = &part->channels[ch_number];

		if (!(ch->flags & XPC_C_WDISCONNECT)) {
			xpc_part_deref(part);
			continue;
		}

		(void) down(&ch->wdisconnect_sema);

		spin_lock_irqsave(&ch->lock, irq_flags);
		DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
		wakeup_channel_mgr = 0;

		if (ch->delayed_IPI_flags) {
			if (part->act_state != XPC_P_DEACTIVATING) {
				spin_lock(&part->IPI_lock);
				XPC_SET_IPI_FLAGS(part->local_IPI_amo,
					ch->number, ch->delayed_IPI_flags);
				spin_unlock(&part->IPI_lock);
				wakeup_channel_mgr = 1;
			}
			ch->delayed_IPI_flags = 0;
		}

		ch->flags &= ~XPC_C_WDISCONNECT;
		spin_unlock_irqrestore(&ch->lock, irq_flags);

		if (wakeup_channel_mgr) {
			xpc_wakeup_channel_mgr(part);
		}

		xpc_part_deref(part);
	}
}


static void
xpc_do_exit(enum xpc_retval reason)
{
	partid_t partid;
	int active_part_count, printed_waiting_msg = 0;
	struct xpc_partition *part;
	unsigned long printmsg_time, disengage_request_timeout = 0;


	/* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
	DBUG_ON(xpc_exiting == 1);

	/*
	 * Let the heartbeat checker thread and the discovery thread
	 * (if one is running) know that they should exit. Also wake up
	 * the heartbeat checker thread in case it's sleeping.
	 */
	xpc_exiting = 1;
	wake_up_interruptible(&xpc_act_IRQ_wq);

	/* ignore all incoming interrupts */
	free_irq(SGI_XPC_ACTIVATE, NULL);

	/* wait for the discovery thread to exit */
	down(&xpc_discovery_exited);

	/* wait for the heartbeat checker thread to exit */
	down(&xpc_hb_checker_exited);


	/* sleep for a 1/3 of a second or so */
	(void) msleep_interruptible(300);


	/* wait for all partitions to become inactive */

	printmsg_time = jiffies + (XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
	xpc_disengage_request_timedout = 0;

	do {
		active_part_count = 0;

		for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
			part = &xpc_partitions[partid];

			if (xpc_partition_disengaged(part) &&
					part->act_state == XPC_P_INACTIVE) {
				continue;
			}

			active_part_count++;

			XPC_DEACTIVATE_PARTITION(part, reason);

			if (part->disengage_request_timeout >
						disengage_request_timeout) {
				disengage_request_timeout =
						part->disengage_request_timeout;
			}
		}

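		/* -1UL is a bitmask selecting every possible partid at once */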
		if (xpc_partition_engaged(-1UL)) {
			if (time_after(jiffies, printmsg_time)) {
				dev_info(xpc_part, "waiting for remote "
					"partitions to disengage, timeout in "
					"%ld seconds\n",
					(disengage_request_timeout - jiffies)
									/ HZ);
				printmsg_time = jiffies +
					(XPC_DISENGAGE_PRINTMSG_INTERVAL * HZ);
				printed_waiting_msg = 1;
			}

		} else if (active_part_count > 0) {
			if (printed_waiting_msg) {
				dev_info(xpc_part, "waiting for local partition"
					" to disengage\n");
				printed_waiting_msg = 0;
			}

		} else {
			if (!xpc_disengage_request_timedout) {
				dev_info(xpc_part, "all partitions have "
					"disengaged\n");
			}
			break;
		}

		/* sleep for a 1/3 of a second or so */
		(void) msleep_interruptible(300);

	} while (1);

	DBUG_ON(xpc_partition_engaged(-1UL));


	/* indicate to others that our reserved page is uninitialized */
	xpc_rsvd_page->vars_pa = 0;

	/* now it's time to eliminate our heartbeat */
	del_timer_sync(&xpc_hb_timer);
	DBUG_ON(xpc_vars->heartbeating_to_mask != 0);

	if (reason == xpcUnloading) {
		/* take ourselves off of the reboot_notifier_list */
		(void) unregister_reboot_notifier(&xpc_reboot_notifier);

		/* take ourselves off of the die_notifier list */
		(void) unregister_die_notifier(&xpc_die_notifier);
	}

	/* close down protections for IPI operations */
	xpc_restrict_IPI_ops();


	/* clear the interface to XPC's functions */
	xpc_clear_interface();

	if (xpc_sysctl) {
		unregister_sysctl_table(xpc_sysctl);
	}
}


/*
 * Called when the system is about to be either restarted or halted.
 */
static void
xpc_die_disengage(void)
{
	struct xpc_partition *part;
	partid_t partid;
	unsigned long engaged;
	long time, printmsg_time, disengage_request_timeout;


	/* keep xpc_hb_checker thread from doing anything (just in case) */
	xpc_exiting = 1;

	xpc_vars->heartbeating_to_mask = 0;  /* indicate we're deactivated */

	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
		part = &xpc_partitions[partid];

		if (!XPC_SUPPORTS_DISENGAGE_REQUEST(part->
							remote_vars_version)) {

			/* just in case it was left set by an earlier XPC */
			xpc_clear_partition_engaged(1UL << partid);
			continue;
		}

		if (xpc_partition_engaged(1UL << partid) ||
					part->act_state != XPC_P_INACTIVE) {
			xpc_request_partition_disengage(part);
			xpc_mark_partition_disengaged(part);
			xpc_IPI_send_disengage(part);
		}
	}

	time = rtc_time();
	printmsg_time = time +
		(XPC_DISENGAGE_PRINTMSG_INTERVAL * sn_rtc_cycles_per_second);
	disengage_request_timeout = time +
		(xpc_disengage_request_timelimit * sn_rtc_cycles_per_second);

	/* wait for all other partitions to disengage from us */

	while (1) {
		engaged = xpc_partition_engaged(-1UL);
		if (!engaged) {
			dev_info(xpc_part, "all partitions have disengaged\n");
			break;
		}

		time = rtc_time();
		if (time >= disengage_request_timeout) {
			for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
				if (engaged & (1UL << partid)) {
					dev_info(xpc_part, "disengage from "
						"remote partition %d timed "
						"out\n", partid);
				}
			}
			break;
		}

		if (time >= printmsg_time) {
			dev_info(xpc_part, "waiting for remote partitions to "
				"disengage, timeout in %ld seconds\n",
				(disengage_request_timeout - time) /
						sn_rtc_cycles_per_second);
			printmsg_time = time +
					(XPC_DISENGAGE_PRINTMSG_INTERVAL *
						sn_rtc_cycles_per_second);
		}
	}
}


/*
 * This function is called when the system is being rebooted.
 */
static int
xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
{
	enum xpc_retval reason;


	switch (event) {
	case SYS_RESTART:
		reason = xpcSystemReboot;
		break;
	case SYS_HALT:
		reason = xpcSystemHalt;
		break;
	case SYS_POWER_OFF:
		reason = xpcSystemPoweroff;
		break;
	default:
		reason = xpcSystemGoingDown;
	}

	xpc_do_exit(reason);
	return NOTIFY_DONE;
}


/*
 * This function is called via the die_notifier chain when the system is
 * about to die (e.g. machine restart/halt or an MCA/INIT event).
 */
static int
xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
{
	switch (event) {
	case DIE_MACHINE_RESTART:
	case DIE_MACHINE_HALT:
		xpc_die_disengage();
		break;
	case DIE_MCA_MONARCH_ENTER:
	case DIE_INIT_MONARCH_ENTER:
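		/*
		 * Heartbeats stall while the monarch processor handles an
		 * MCA/INIT event, so bump the count once more and flag the
		 * heartbeat as offline; remote partitions can then avoid
		 * declaring this partition dead while it sits in the handler.
		 */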
		xpc_vars->heartbeat++;
		xpc_vars->heartbeat_offline = 1;
		break;
	case DIE_MCA_MONARCH_LEAVE:
	case DIE_INIT_MONARCH_LEAVE:
		xpc_vars->heartbeat++;
		xpc_vars->heartbeat_offline = 0;
		break;
	}

	return NOTIFY_DONE;
}


int __init
xpc_init(void)
{
	int ret;
	partid_t partid;
	struct xpc_partition *part;
	pid_t pid;


	if (!ia64_platform_is("sn2")) {
		return -ENODEV;
	}

	/*
	 * xpc_remote_copy_buffer is used as a temporary buffer for bte_copy'ng
	 * various portions of a partition's reserved page. Its size is based
	 * on the size of the reserved page header and part_nasids mask. So we
	 * need to ensure that the other items will fit as well.
	 */
	if (XPC_RP_VARS_SIZE > XPC_RP_HEADER_SIZE + XP_NASID_MASK_BYTES) {
		dev_err(xpc_part, "xpc_remote_copy_buffer is not big enough\n");
		return -EPERM;
	}
	DBUG_ON((u64) xpc_remote_copy_buffer !=
				L1_CACHE_ALIGN((u64) xpc_remote_copy_buffer));

	snprintf(xpc_part->bus_id, BUS_ID_SIZE, "part");
	snprintf(xpc_chan->bus_id, BUS_ID_SIZE, "chan");

	xpc_sysctl = register_sysctl_table(xpc_sys_dir, 1);

	/*
	 * The first few fields of each entry of xpc_partitions[] need to
	 * be initialized now so that calls to xpc_connect() and
	 * xpc_disconnect() can be made prior to the activation of any remote
	 * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
	 * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
	 * PARTITION HAS BEEN ACTIVATED.
	 */
	for (partid = 1; partid < XP_MAX_PARTITIONS; partid++) {
		part = &xpc_partitions[partid];

		DBUG_ON((u64) part != L1_CACHE_ALIGN((u64) part));

		part->act_IRQ_rcvd = 0;
		spin_lock_init(&part->act_lock);
		part->act_state = XPC_P_INACTIVE;
		XPC_SET_REASON(part, 0, 0);

		init_timer(&part->disengage_request_timer);
		part->disengage_request_timer.function =
				xpc_timeout_partition_disengage_request;
		part->disengage_request_timer.data = (unsigned long) part;

		part->setup_state = XPC_P_UNSET;
		init_waitqueue_head(&part->teardown_wq);
		atomic_set(&part->references, 0);
	}

	/*
	 * Open up protections for IPI operations (and AMO operations on
	 * Shub 1.1 systems).
	 */
	xpc_allow_IPI_ops();

	/*
	 * Interrupts being processed will increment this atomic variable and
	 * awaken the heartbeat thread which will process the interrupts.
	 */
	atomic_set(&xpc_act_IRQ_rcvd, 0);

	/*
	 * This is safe to do before the xpc_hb_checker thread has started
	 * because the handler releases a wait queue.  If an interrupt is
	 * received before the thread is waiting, it will not go to sleep,
	 * but rather immediately process the interrupt.
	 */
	ret = request_irq(SGI_XPC_ACTIVATE, xpc_act_IRQ_handler, 0,
							"xpc hb", NULL);
	if (ret != 0) {
		dev_err(xpc_part, "can't register ACTIVATE IRQ handler, "
			"errno=%d\n", -ret);

		xpc_restrict_IPI_ops();

		if (xpc_sysctl) {
			unregister_sysctl_table(xpc_sysctl);
		}
		return -EBUSY;
	}

	/*
	 * Fill the partition reserved page with the information needed by
	 * other partitions to discover we are alive and establish initial
	 * communications.
	 */
	xpc_rsvd_page = xpc_rsvd_page_init();
	if (xpc_rsvd_page == NULL) {
		dev_err(xpc_part, "could not setup our reserved page\n");

		free_irq(SGI_XPC_ACTIVATE, NULL);
		xpc_restrict_IPI_ops();

		if (xpc_sysctl) {
			unregister_sysctl_table(xpc_sysctl);
		}
		return -EBUSY;
	}


	/* add ourselves to the reboot_notifier_list */
	ret = register_reboot_notifier(&xpc_reboot_notifier);
	if (ret != 0) {
		dev_warn(xpc_part, "can't register reboot notifier\n");
	}

	/* add ourselves to the die_notifier list (i.e., ia64die_chain) */
	ret = register_die_notifier(&xpc_die_notifier);
	if (ret != 0) {
		dev_warn(xpc_part, "can't register die notifier\n");
	}


	/*
	 * Set the beating to other partitions into motion.  This is
	 * the last requirement for other partitions' discovery to
	 * initiate communications with us.
	 */
	init_timer(&xpc_hb_timer);
	xpc_hb_timer.function = xpc_hb_beater;
	xpc_hb_beater(0);
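	/*
	 * Calling the beater directly, rather than just arming the timer,
	 * produces the first heartbeat immediately and lets the beater arm
	 * the timer for all subsequent beats.
	 */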


	/*
	 * The real work-horse behind xpc.  This processes incoming
	 * interrupts and monitors remote heartbeats.
	 */
	pid = kernel_thread(xpc_hb_checker, NULL, 0);
	if (pid < 0) {
		dev_err(xpc_part, "failed while forking hb check thread\n");

		/* indicate to others that our reserved page is uninitialized */
		xpc_rsvd_page->vars_pa = 0;

		/* take ourselves off of the reboot_notifier_list */
		(void) unregister_reboot_notifier(&xpc_reboot_notifier);

		/* take ourselves off of the die_notifier list */
		(void) unregister_die_notifier(&xpc_die_notifier);

		del_timer_sync(&xpc_hb_timer);
		free_irq(SGI_XPC_ACTIVATE, NULL);
		xpc_restrict_IPI_ops();

		if (xpc_sysctl) {
			unregister_sysctl_table(xpc_sysctl);
		}
		return -EBUSY;
	}


	/*
	 * Start up a thread that will attempt to discover other partitions
	 * to activate based on info provided by SAL. This new thread is
	 * short-lived and will exit once discovery is complete.
	 */
	pid = kernel_thread(xpc_initiate_discovery, NULL, 0);
	if (pid < 0) {
		dev_err(xpc_part, "failed while forking discovery thread\n");

		/* mark this new thread as a non-starter */
		up(&xpc_discovery_exited);

		xpc_do_exit(xpcUnloading);
		return -EBUSY;
	}


	/* set the interface to point at XPC's functions */
	xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
			  xpc_initiate_allocate, xpc_initiate_send,
			  xpc_initiate_send_notify, xpc_initiate_received,
			  xpc_initiate_partid_to_nasids);

	return 0;
}
module_init(xpc_init);


void __exit
xpc_exit(void)
{
	xpc_do_exit(xpcUnloading);
}
module_exit(xpc_exit);


MODULE_AUTHOR("Silicon Graphics, Inc.");
MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
MODULE_LICENSE("GPL");

module_param(xpc_hb_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
		"heartbeat increments.");

module_param(xpc_hb_check_interval, int, 0);
MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
		"heartbeat checks.");

module_param(xpc_disengage_request_timelimit, int, 0);
MODULE_PARM_DESC(xpc_disengage_request_timelimit, "Number of seconds to wait "
		"for disengage request to complete.");