xref: /dragonfly/sys/kern/lwkt_ipiq.c (revision f02303f9)
1 /*
2  * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.21 2007/01/22 19:37:04 corecode Exp $
35  */
36 
37 /*
38  * This module implements IPI message queueing and the MI portion of IPI
39  * message processing.
40  */
41 
42 #ifdef _KERNEL
43 
44 #include "opt_ddb.h"
45 
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/proc.h>
50 #include <sys/rtprio.h>
51 #include <sys/queue.h>
52 #include <sys/thread2.h>
53 #include <sys/sysctl.h>
54 #include <sys/ktr.h>
55 #include <sys/kthread.h>
56 #include <machine/cpu.h>
57 #include <sys/lock.h>
58 #include <sys/caps.h>
59 
60 #include <vm/vm.h>
61 #include <vm/vm_param.h>
62 #include <vm/vm_kern.h>
63 #include <vm/vm_object.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_pager.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_zone.h>
69 
70 #include <machine/stdarg.h>
71 #include <machine/smp.h>
72 #include <machine/atomic.h>
73 
74 #else
75 
76 #include <sys/stdint.h>
77 #include <libcaps/thread.h>
78 #include <sys/thread.h>
79 #include <sys/msgport.h>
80 #include <sys/errno.h>
81 #include <libcaps/globaldata.h>
82 #include <machine/cpufunc.h>
83 #include <sys/thread2.h>
84 #include <sys/msgport2.h>
85 #include <stdio.h>
86 #include <stdlib.h>
87 #include <string.h>
88 #include <machine/lock.h>
89 #include <machine/cpu.h>
90 #include <machine/atomic.h>
91 
92 #endif
93 
94 #ifdef SMP
95 static __int64_t ipiq_count;	/* total calls to lwkt_send_ipiq*() */
96 static __int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
97 static __int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
98 static __int64_t ipiq_passive;	/* passive IPI messages */
99 static __int64_t ipiq_cscount;	/* number of cpu synchronizations */
100 static int ipiq_optimized = 1;	/* XXX temporary sysctl */
101 #ifdef PANIC_DEBUG
102 static int	panic_ipiq_cpu = -1;
103 static int	panic_ipiq_count = 100;
104 #endif
105 #endif
106 
107 #ifdef _KERNEL
108 
109 #ifdef SMP
110 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
111 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
112 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_avoided, CTLFLAG_RW, &ipiq_avoided, 0, "");
113 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_passive, CTLFLAG_RW, &ipiq_passive, 0, "");
114 SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
115 SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0, "");
116 #ifdef PANIC_DEBUG
117 SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
118 SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
119 #endif
120 
121 #define IPIQ_STRING	"func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
122 #define IPIQ_ARG_SIZE	(sizeof(void *) * 2 + sizeof(int) * 2)
123 
124 #if !defined(KTR_IPIQ)
125 #define KTR_IPIQ	KTR_ALL
126 #endif
127 KTR_INFO_MASTER(ipiq);
128 KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARG_SIZE);
129 KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARG_SIZE);
130 KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARG_SIZE);
131 KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARG_SIZE);
132 KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARG_SIZE);
133 
134 #define logipiq(name, func, arg1, arg2, sgd, dgd)	\
135 	KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
136 
137 #endif	/* SMP */
138 #endif	/* _KERNEL */
139 
140 #ifdef SMP
141 
142 static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
143 				  struct intrframe *frame);
144 static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
145 static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);
146 
147 /*
148  * Send a function execution request to another cpu.  The request is queued
149  * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
150  * possible target cpu.  Only the source cpu appends messages to the FIFO.
151  *
152  * If the FIFO fills up we have to enable interrupts to avoid an APIC
153  * deadlock and process pending IPIQs while waiting for it to empty.
154  * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
155  *
156  * We can safely bump gd_intr_nesting_level because our crit_exit() at the
157  * end will take care of any pending interrupts.
158  *
159  * The actual hardware IPI is avoided if the target cpu is already processing
160  * the queue from a prior IPI.  It is possible to pipeline IPI messages
161  * very quickly between cpus due to the FIFO hysteresis.
162  *
163  * Need not be called from a critical section.
164  */
165 int
166 lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
167 {
168     lwkt_ipiq_t ip;
169     int windex;
170     struct globaldata *gd = mycpu;
171 
172     logipiq(send_norm, func, arg1, arg2, gd, target);
173 
174     if (target == gd) {
175 	func(arg1, arg2, NULL);
176 	return(0);
177     }
178     crit_enter();
179     ++gd->gd_intr_nesting_level;
180 #ifdef INVARIANTS
181     if (gd->gd_intr_nesting_level > 20)
182 	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
183 #endif
184     KKASSERT(curthread->td_pri >= TDPRI_CRIT);
185     ++ipiq_count;
186     ip = &gd->gd_ipiq[target->gd_cpuid];
187 
188     /*
189      * Do not allow the FIFO to become full.  Interrupts must be physically
190      * enabled while we liveloop to avoid deadlocking the APIC.
191      */
192     if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
193 	unsigned int eflags = read_eflags();
194 
195 	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
196 	    cpu_send_ipiq(target->gd_cpuid);
197 	cpu_enable_intr();
198 	++ipiq_fifofull;
199 	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
200 	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
201 	    lwkt_process_ipiq();
202 	}
203 	write_eflags(eflags);
204     }
205 
206     /*
207      * Queue the new message
208      */
209     windex = ip->ip_windex & MAXCPUFIFO_MASK;
210     ip->ip_func[windex] = func;
211     ip->ip_arg1[windex] = arg1;
212     ip->ip_arg2[windex] = arg2;
213     cpu_sfence();
214     ++ip->ip_windex;
215     --gd->gd_intr_nesting_level;
216 
217     /*
218      * signal the target cpu that there is work pending.
219      */
220     if (atomic_poll_acquire_int(&ip->ip_npoll)) {
221 	cpu_send_ipiq(target->gd_cpuid);
222     } else {
223 	if (ipiq_optimized == 0)
224 	    cpu_send_ipiq(target->gd_cpuid);
225 	++ipiq_avoided;
226     }
227     crit_exit();
228     return(ip->ip_windex);
229 }
230 
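/*
 * Example (a sketch): a caller of lwkt_send_ipiq3() supplies a handler
 * matching the ipifunc3_t signature used above; the handler runs on the
 * target cpu from within its IPI processing loop, inside a critical
 * section.  The names example_remote_func, example_softc, target_gd and
 * sc are hypothetical.
 *
 *	static void
 *	example_remote_func(void *arg1, int arg2, struct intrframe *frame)
 *	{
 *	    struct example_softc *sc = arg1;
 *
 *	    sc->ipi_hits += arg2;
 *	}
 *
 *	int seq;
 *
 *	seq = lwkt_send_ipiq3(target_gd, example_remote_func, sc, 1);
 *
 * The returned sequence number may later be handed to lwkt_wait_ipiq()
 * to wait for the remote execution to complete.
 */
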
231 /*
232  * Similar to lwkt_send_ipiq() but this function does not actually initiate
233  * the IPI to the target cpu unless the FIFO has become too full, so it is
234  * very fast.
235  *
236  * This function is used for non-critical IPI messages, such as memory
237  * deallocations.  The queue will typically be flushed by the target cpu at
238  * the next clock interrupt.
239  *
240  * Need not be called from a critical section.
241  */
242 int
243 lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
244 			void *arg1, int arg2)
245 {
246     lwkt_ipiq_t ip;
247     int windex;
248     struct globaldata *gd = mycpu;
249 
250     KKASSERT(target != gd);
251     crit_enter();
252     logipiq(send_pasv, func, arg1, arg2, gd, target);
253     ++gd->gd_intr_nesting_level;
254 #ifdef INVARIANTS
255     if (gd->gd_intr_nesting_level > 20)
256 	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
257 #endif
258     KKASSERT(curthread->td_pri >= TDPRI_CRIT);
259     ++ipiq_count;
260     ++ipiq_passive;
261     ip = &gd->gd_ipiq[target->gd_cpuid];
262 
263     /*
264      * Do not allow the FIFO to become full.  Interrupts must be physically
265      * enabled while we liveloop to avoid deadlocking the APIC.
266      */
267     if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
268 	unsigned int eflags = read_eflags();
269 
270 	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
271 	    cpu_send_ipiq(target->gd_cpuid);
272 	cpu_enable_intr();
273 	++ipiq_fifofull;
274 	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
275 	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
276 	    lwkt_process_ipiq();
277 	}
278 	write_eflags(eflags);
279     }
280 
281     /*
282      * Queue the new message
283      */
284     windex = ip->ip_windex & MAXCPUFIFO_MASK;
285     ip->ip_func[windex] = func;
286     ip->ip_arg1[windex] = arg1;
287     ip->ip_arg2[windex] = arg2;
288     cpu_sfence();
289     ++ip->ip_windex;
290     --gd->gd_intr_nesting_level;
291 
292     /*
293      * Do not signal the target cpu, it will pick up the IPI when it next
294      * polls (typically on the next tick).
295      */
296     crit_exit();
297     return(ip->ip_windex);
298 }
299 
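/*
 * Example (a sketch): the passive form suits work that can tolerate the
 * latency of the target cpu's next poll, such as returning memory to a
 * remote cpu's free list.  The names owner_gd, example_free_remote and
 * chunk are hypothetical.
 *
 *	lwkt_send_ipiq3_passive(owner_gd, example_free_remote, chunk, 0);
 *
 * No hardware IPI is normally generated; the target cpu picks up the
 * queued entry the next time it processes its ipiq, typically on the
 * next clock tick.
 */
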
300 /*
301  * Send an IPI request without blocking, return 0 on success, ENOENT on
302  * failure.  The actual queueing of the hardware IPI may still force us
303  * to spin and process incoming IPIs but that will eventually go away
304  * when we've gotten rid of the other general IPIs.
305  */
306 int
307 lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func,
308 		       void *arg1, int arg2)
309 {
310     lwkt_ipiq_t ip;
311     int windex;
312     struct globaldata *gd = mycpu;
313 
314     logipiq(send_nbio, func, arg1, arg2, gd, target);
315     KKASSERT(curthread->td_pri >= TDPRI_CRIT);
316     if (target == gd) {
317 	func(arg1, arg2, NULL);
318 	return(0);
319     }
320     ++ipiq_count;
321     ip = &gd->gd_ipiq[target->gd_cpuid];
322 
323     if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) {
324 	logipiq(send_fail, func, arg1, arg2, gd, target);
325 	return(ENOENT);
326     }
327     windex = ip->ip_windex & MAXCPUFIFO_MASK;
328     ip->ip_func[windex] = func;
329     ip->ip_arg1[windex] = arg1;
330     ip->ip_arg2[windex] = arg2;
331     cpu_sfence();
332     ++ip->ip_windex;
333 
334     /*
335      * This isn't a passive IPI, we still have to signal the target cpu.
336      */
337     if (atomic_poll_acquire_int(&ip->ip_npoll)) {
338 	cpu_send_ipiq(target->gd_cpuid);
339     } else {
340 	if (ipiq_optimized == 0)
341 	    cpu_send_ipiq(target->gd_cpuid);
342 	else
343 	    ++ipiq_avoided;
344     }
345     return(0);
346 }
347 
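/*
 * Example (a sketch): the nowait form can fail, so callers must be ready
 * to fall back, e.g. by deferring the work or using lwkt_send_ipiq3(),
 * which is allowed to spin.  The names here are hypothetical.
 *
 *	if (lwkt_send_ipiq3_nowait(target_gd, example_remote_func,
 *				   sc, 0) == ENOENT) {
 *		example_defer_locally(sc);
 *	}
 */
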
348 /*
349  * deprecated, used only by fast int forwarding.
350  */
351 int
352 lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
353 {
354     return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
355 }
356 
357 /*
358  * Send a message to several target cpus.  Typically used for scheduling.
359  * The message will not be sent to stopped cpus.
360  */
361 int
362 lwkt_send_ipiq3_mask(u_int32_t mask, ipifunc3_t func, void *arg1, int arg2)
363 {
364     int cpuid;
365     int count = 0;
366 
367     mask &= ~stopped_cpus;
368     while (mask) {
369 	cpuid = bsfl(mask);
370 	lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
371 	mask &= ~(1 << cpuid);
372 	++count;
373     }
374     return(count);
375 }
376 
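/*
 * Example (a sketch): the mask is a bitmask of cpu ids, so broadcasting
 * to every other active cpu might look like the following; the handler
 * and argument names are hypothetical.
 *
 *	u_int32_t mask;
 *
 *	mask = mycpu->gd_other_cpus & smp_active_mask;
 *	lwkt_send_ipiq3_mask(mask, example_remote_func, sc, 0);
 *
 * Stopped cpus are filtered out by lwkt_send_ipiq3_mask() itself.
 */
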
377 /*
378  * Wait for the remote cpu to finish processing a function.
379  *
380  * YYY we have to enable interrupts and process the IPIQ while waiting
381  * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
382  * function to do this!  YYY we really should 'block' here.
383  *
384  * MUST be called from a critical section.  This routine may be called
385  * from an interrupt (for example, if an interrupt wakes a foreign thread
386  * up).
387  */
388 void
389 lwkt_wait_ipiq(globaldata_t target, int seq)
390 {
391     lwkt_ipiq_t ip;
392     int maxc = 100000000;
393 
394     if (target != mycpu) {
395 	ip = &mycpu->gd_ipiq[target->gd_cpuid];
396 	if ((int)(ip->ip_xindex - seq) < 0) {
397 	    unsigned int eflags = read_eflags();
398 	    cpu_enable_intr();
399 	    while ((int)(ip->ip_xindex - seq) < 0) {
400 		crit_enter();
401 		lwkt_process_ipiq();
402 		crit_exit();
403 		if (--maxc == 0)
404 			kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, target->gd_cpuid, ip->ip_xindex - seq);
405 		if (maxc < -1000000)
406 			panic("LWKT_WAIT_IPIQ");
407 		/*
408 		 * xindex may be modified by another cpu, use a load fence
409 		 * to ensure that the loop does not use a speculative value
410 		 * (which may improve performance).
411 		 */
412 		cpu_lfence();
413 	    }
414 	    write_eflags(eflags);
415 	}
416     }
417 }
418 
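/*
 * Example (a sketch): pairing a send with a wait gives a simple
 * synchronous remote call.  Names are hypothetical; per the comment
 * above, the wait must be performed from a critical section.
 *
 *	int seq;
 *
 *	seq = lwkt_send_ipiq3(target_gd, example_remote_func, sc, 0);
 *	crit_enter();
 *	lwkt_wait_ipiq(target_gd, seq);
 *	crit_exit();
 */
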
419 int
420 lwkt_seq_ipiq(globaldata_t target)
421 {
422     lwkt_ipiq_t ip;
423 
424     ip = &mycpu->gd_ipiq[target->gd_cpuid];
425     return(ip->ip_windex);
426 }
427 
428 /*
429  * Called from IPI interrupt (like a fast interrupt), which has placed
430  * us in a critical section.  The MP lock may or may not be held.
431  * May also be called from doreti or splz, or be reentrantly called
432  * indirectly through the ip_func[] we run.
433  *
434  * There are two versions: one where no interrupt frame is available (when
435  * called from the send code and from splz), and one where an interrupt
436  * frame is available.
437  */
438 void
439 lwkt_process_ipiq(void)
440 {
441     globaldata_t gd = mycpu;
442     globaldata_t sgd;
443     lwkt_ipiq_t ip;
444     int n;
445 
446 again:
447     for (n = 0; n < ncpus; ++n) {
448 	if (n != gd->gd_cpuid) {
449 	    sgd = globaldata_find(n);
450 	    ip = sgd->gd_ipiq;
451 	    if (ip != NULL) {
452 		while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
453 		    ;
454 	    }
455 	}
456     }
457     if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
458 	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
459 	    if (gd->gd_curthread->td_cscount == 0)
460 		goto again;
461 	    need_ipiq();
462 	}
463     }
464 }
465 
466 #ifdef _KERNEL
467 void
468 lwkt_process_ipiq_frame(struct intrframe *frame)
469 {
470     globaldata_t gd = mycpu;
471     globaldata_t sgd;
472     lwkt_ipiq_t ip;
473     int n;
474 
475 again:
476     for (n = 0; n < ncpus; ++n) {
477 	if (n != gd->gd_cpuid) {
478 	    sgd = globaldata_find(n);
479 	    ip = sgd->gd_ipiq;
480 	    if (ip != NULL) {
481 		while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame))
482 		    ;
483 	    }
484 	}
485     }
486     if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
487 	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
488 	    if (gd->gd_curthread->td_cscount == 0)
489 		goto again;
490 	    need_ipiq();
491 	}
492     }
493 }
494 #endif
495 
496 static int
497 lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
498 		       struct intrframe *frame)
499 {
500     int ri;
501     int wi;
502     ipifunc3_t copy_func;
503     void *copy_arg1;
504     int copy_arg2;
505 
506     /*
507      * Obtain the current write index, which is modified by a remote cpu.
508      * Issue a load fence to prevent speculative reads of e.g. data written
509      * by the other cpu prior to it updating the index.
510      */
511     KKASSERT(curthread->td_pri >= TDPRI_CRIT);
512     wi = ip->ip_windex;
513     cpu_lfence();
514 
515     /*
516      * Note: xindex is only updated after we are sure the function has
517      * finished execution.  Beware lwkt_process_ipiq() reentrancy!  The
518      * function may send an IPI which may block/drain.
519      *
520      * Note: due to additional IPI operations that the callback function
521      * may make, it is possible for both rindex and windex to advance and
522  * thus for rindex to advance past our cached windex.
523      */
524     while (wi - (ri = ip->ip_rindex) > 0) {
525 	ri &= MAXCPUFIFO_MASK;
526 	copy_func = ip->ip_func[ri];
527 	copy_arg1 = ip->ip_arg1[ri];
528 	copy_arg2 = ip->ip_arg2[ri];
529 	cpu_mfence();
530 	++ip->ip_rindex;
531 	KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) == ((ri + 1) & MAXCPUFIFO_MASK));
532 	logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
533 	copy_func(copy_arg1, copy_arg2, frame);
534 	cpu_sfence();
535 	ip->ip_xindex = ip->ip_rindex;
536 
537 #ifdef PANIC_DEBUG
538 	/*
539 	 * Simulate panics during the processing of an IPI
540 	 */
541 	if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
542 		if (--panic_ipiq_count == 0) {
543 #ifdef DDB
544 			Debugger("PANIC_DEBUG");
545 #else
546 			panic("PANIC_DEBUG");
547 #endif
548 		}
549 	}
550 #endif
551     }
552 
553     /*
554      * Return non-zero if there are more IPI messages pending on this
555      * ipiq.  ip_npoll is left set as long as possible to reduce the
556      * number of IPIs queued by the originating cpu, but must be cleared
557      * *BEFORE* checking windex.
558      */
559     atomic_poll_release_int(&ip->ip_npoll);
560     return(wi != ip->ip_windex);
561 }
562 
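/*
 * Note on the index arithmetic above: ip_rindex, ip_windex and ip_xindex
 * are free-running counters; only the low bits (MAXCPUFIFO_MASK) select a
 * FIFO slot.  The difference ip_windex - ip_rindex is therefore the number
 * of queued-but-unprocessed entries and remains correct across counter
 * wraparound as long as fewer than MAXCPUFIFO entries are ever
 * outstanding.  For example, with 32 bit counters, windex = 0x00000002 and
 * rindex = 0xfffffffe give a difference of 4 pending entries.
 */
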
563 #endif
564 
565 /*
566  * CPU Synchronization Support
567  *
568  * lwkt_cpusync_simple()
569  *
570  *	The function is executed synchronously on the remote cpus before return.
571  *	A lwkt_cpusync_t pointer is passed as an argument.  The data can
572  *	be accessed via arg->cs_data.
573  *
574  *	XXX should I just pass the data as an argument to be consistent?
575  */
576 
577 void
578 lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
579 {
580     struct lwkt_cpusync cmd;
581 
582     cmd.cs_run_func = NULL;
583     cmd.cs_fin1_func = func;
584     cmd.cs_fin2_func = NULL;
585     cmd.cs_data = data;
586     lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
587     if (mask & (1 << mycpu->gd_cpuid))
588 	func(&cmd);
589     lwkt_cpusync_finish(&cmd);
590 }
591 
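/*
 * Example (a sketch): a callback run through lwkt_cpusync_simple() reaches
 * its data via cs_data, as described above.  Because the execution is
 * synchronous, the data may live on the caller's stack.  The names
 * example_sync_func and example_arg are hypothetical.
 *
 *	static void
 *	example_sync_func(lwkt_cpusync_t info)
 *	{
 *	    struct example_arg *ea = info->cs_data;
 *
 *	    ea->counts[mycpu->gd_cpuid]++;
 *	}
 *
 *	struct example_arg ea;
 *
 *	lwkt_cpusync_simple(mycpu->gd_other_cpus | mycpu->gd_cpumask,
 *			    example_sync_func, &ea);
 */
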
592 /*
593  * lwkt_cpusync_fastdata()
594  *
595  *	The function is executed on the remote cpus in tandem with the return.
596  *	The data is directly passed as an argument.  Do not pass pointers to
597  *	temporary storage as the storage might have
598  *	gone poof by the time the target cpu executes
599  *	the function.
600  *
601  *	At the moment lwkt_cpusync is declared on the stack and we must wait
602  *	for all remote cpus to ack in lwkt_cpusync_finish(), but as a future
603  *	optimization we should be able to put a counter in the globaldata
604  *	structure (if it is not otherwise being used) and just poke it and
605  *	return without waiting. XXX
606  */
607 void
608 lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
609 {
610     struct lwkt_cpusync cmd;
611 
612     cmd.cs_run_func = NULL;
613     cmd.cs_fin1_func = NULL;
614     cmd.cs_fin2_func = func;
615     cmd.cs_data = NULL;
616     lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
617     if (mask & (1 << mycpu->gd_cpuid))
618 	func(data);
619     lwkt_cpusync_finish(&cmd);
620 }
621 
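/*
 * Example (a sketch): with the fastdata form the callback receives the
 * data pointer directly, and per the warning above the data must outlive
 * this call (a static or otherwise stable object, not stack storage).
 * The names example_fast_func and example_stable_data are hypothetical.
 *
 *	static void
 *	example_fast_func(void *data)
 *	{
 *	    struct example_stable_data *es = data;
 *
 *	    atomic_add_int(&es->done_count, 1);
 *	}
 *
 *	lwkt_cpusync_fastdata(mask, example_fast_func, &example_stable_data);
 */
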
622 /*
623  * lwkt_cpusync_start()
624  *
625  *	Start synchronization with a set of target cpus, return once they are
626  *	known to be in a synchronization loop.  The target cpus will execute
627  *	poll->cs_run_func() IN TANDEM WITH THE RETURN.
628  *
629  *	XXX future: add lwkt_cpusync_start_quick() and require a call to
630  *	lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
631  *	potentially absorb the IPI latency doing something useful.
632  */
633 void
634 lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
635 {
636     globaldata_t gd = mycpu;
637 
638     poll->cs_count = 0;
639     poll->cs_mask = mask;
640 #ifdef SMP
641     poll->cs_maxcount = lwkt_send_ipiq_mask(
642 		mask & gd->gd_other_cpus & smp_active_mask,
643 		(ipifunc1_t)lwkt_cpusync_remote1, poll);
644 #endif
645     if (mask & gd->gd_cpumask) {
646 	if (poll->cs_run_func)
647 	    poll->cs_run_func(poll);
648     }
649 #ifdef SMP
650     if (poll->cs_maxcount) {
651 	++ipiq_cscount;
652 	++gd->gd_curthread->td_cscount;
653 	while (poll->cs_count != poll->cs_maxcount) {
654 	    crit_enter();
655 	    lwkt_process_ipiq();
656 	    crit_exit();
657 	}
658     }
659 #endif
660 }
661 
662 void
663 lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
664 {
665     globaldata_t gd = mycpu;
666 #ifdef SMP
667     int count;
668 #endif
669 
670     mask &= ~poll->cs_mask;
671     poll->cs_mask |= mask;
672 #ifdef SMP
673     count = lwkt_send_ipiq_mask(
674 		mask & gd->gd_other_cpus & smp_active_mask,
675 		(ipifunc1_t)lwkt_cpusync_remote1, poll);
676 #endif
677     if (mask & gd->gd_cpumask) {
678 	if (poll->cs_run_func)
679 	    poll->cs_run_func(poll);
680     }
681 #ifdef SMP
682     poll->cs_maxcount += count;
683     if (poll->cs_maxcount) {
684 	if (poll->cs_maxcount == count)
685 	    ++gd->gd_curthread->td_cscount;
686 	while (poll->cs_count != poll->cs_maxcount) {
687 	    crit_enter();
688 	    lwkt_process_ipiq();
689 	    crit_exit();
690 	}
691     }
692 #endif
693 }
694 
695 /*
696  * Finish synchronization with a set of target cpus.  The target cpus will
697  * execute cs_fin1_func(poll) prior to this function returning, and will
698  * execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
699  *
700  * If cs_maxcount is non-zero then we are mastering a cpusync with one or
701  * more remote cpus and must account for it in our thread structure.
702  */
703 void
704 lwkt_cpusync_finish(lwkt_cpusync_t poll)
705 {
706     globaldata_t gd = mycpu;
707 
708     poll->cs_count = -1;
709     if (poll->cs_mask & gd->gd_cpumask) {
710 	if (poll->cs_fin1_func)
711 	    poll->cs_fin1_func(poll);
712 	if (poll->cs_fin2_func)
713 	    poll->cs_fin2_func(poll->cs_data);
714     }
715 #ifdef SMP
716     if (poll->cs_maxcount) {
717 	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
718 	    crit_enter();
719 	    lwkt_process_ipiq();
720 	    crit_exit();
721 	}
722 	--gd->gd_curthread->td_cscount;
723     }
724 #endif
725 }
726 
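/*
 * Example (a sketch): the start/finish pair brackets work the caller wants
 * to do while the target cpus sit in the synchronization loop described
 * above.  Only the cs_* fields used elsewhere in this file are real; the
 * other names are hypothetical.
 *
 *	struct lwkt_cpusync cmd;
 *
 *	cmd.cs_run_func = NULL;
 *	cmd.cs_fin1_func = NULL;
 *	cmd.cs_fin2_func = NULL;
 *	cmd.cs_data = NULL;
 *	lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
 *	example_do_work_while_cpus_are_synchronized();
 *	lwkt_cpusync_finish(&cmd);
 */
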
727 #ifdef SMP
728 
729 /*
730  * helper IPI remote messaging function.
731  *
732  * Called on remote cpu when a new cpu synchronization request has been
733  * sent to us.  Execute the run function and adjust cs_count, then requeue
734  * the request so we spin on it.
735  */
736 static void
737 lwkt_cpusync_remote1(lwkt_cpusync_t poll)
738 {
739     atomic_add_int(&poll->cs_count, 1);
740     if (poll->cs_run_func)
741 	poll->cs_run_func(poll);
742     lwkt_cpusync_remote2(poll);
743 }
744 
745 /*
746  * helper IPI remote messaging function.
747  *
748  * Poll for the originator telling us to finish.  If it hasn't, requeue
749  * our request so we spin on it.  When the originator requests that we
750  * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
751  * in tandem with the release.
752  */
753 static void
754 lwkt_cpusync_remote2(lwkt_cpusync_t poll)
755 {
756     if (poll->cs_count < 0) {
757 	cpusync_func2_t savef;
758 	void *saved;
759 
760 	if (poll->cs_fin1_func)
761 	    poll->cs_fin1_func(poll);
762 	if (poll->cs_fin2_func) {
763 	    savef = poll->cs_fin2_func;
764 	    saved = poll->cs_data;
765 	    atomic_add_int(&poll->cs_count, -1);
766 	    savef(saved);
767 	} else {
768 	    atomic_add_int(&poll->cs_count, -1);
769 	}
770     } else {
771 	globaldata_t gd = mycpu;
772 	lwkt_ipiq_t ip;
773 	int wi;
774 
775 	ip = &gd->gd_cpusyncq;
776 	wi = ip->ip_windex & MAXCPUFIFO_MASK;
777 	ip->ip_func[wi] = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
778 	ip->ip_arg1[wi] = poll;
779 	ip->ip_arg2[wi] = 0;
780 	cpu_sfence();
781 	++ip->ip_windex;
782     }
783 }
784 
785 #endif
786