xref: /dragonfly/sys/kern/lwkt_ipiq.c (revision 9bb2a92d)
/*
 * Copyright (c) 2003 Matthew Dillon <dillon@backplane.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.5 2004/03/08 03:03:54 dillon Exp $
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>
#include <sys/caps.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/ipl.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#define THREAD_STACK	(UPAGES * PAGE_SIZE)

#else

#include <sys/stdint.h>
#include <libcaps/thread.h>
#include <sys/thread.h>
#include <sys/msgport.h>
#include <sys/errno.h>
#include <libcaps/globaldata.h>
#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <machine/cpufunc.h>
#include <machine/lock.h>
#include <machine/cpu.h>
#include <machine/atomic.h>

#endif

#ifdef SMP
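/*
 * IPI statistics.  ipiq_count tallies every IPI message queued,
 * ipiq_fifofull counts the number of times a sender had to spin
 * draining a nearly-full FIFO, and ipiq_cscount counts cpu
 * synchronization operations which mastered one or more remote cpus.
 */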
static __int64_t ipiq_count;
static __int64_t ipiq_fifofull;
static __int64_t ipiq_cscount;
#endif

#ifdef _KERNEL

#ifdef SMP
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
#endif

#endif

#ifdef SMP

static int lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO is written only by its owning (source)
 * cpu and drained only by the target cpu, so no locking is required.
 *
 * YYY If the FIFO fills up we have to enable interrupts and process the
 * IPIQ while waiting for it to empty or we may deadlock with another cpu.
 * Create a CPU_*() function to do this!
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * Must be called from a critical section.
 */
int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    if (target == gd) {
	func(arg);
	return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * We always drain before the FIFO becomes full so it should never
     * become full.  We need to leave enough entries to deal with
     * reentrancy.
     */
    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO);
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	unsigned int eflags = read_eflags();
	cpu_enable_intr();
	++ipiq_fifofull;
	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	    lwkt_process_ipiq();
	}
	write_eflags(eflags);
    }
    --gd->gd_intr_nesting_level;
    cpu_send_ipiq(target->gd_cpuid);	/* issues mem barrier if appropriate */
    crit_exit();
    return(ip->ip_windex);
}
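
/*
 * Usage note (illustrative sketch only, not part of this file): from
 * within a critical section a caller can queue a function on a remote
 * cpu and, if it needs the side effects to be visible, wait for the
 * message to be processed.  remote_hello() and the target cpu number
 * are hypothetical.
 *
 *	static void
 *	remote_hello(void *arg)
 *	{
 *	    printf("cpu%d: %s\n", mycpu->gd_cpuid, (char *)arg);
 *	}
 *
 *	int seq;
 *
 *	crit_enter();
 *	seq = lwkt_send_ipiq(globaldata_find(1), remote_hello, "hello");
 *	lwkt_wait_ipiq(globaldata_find(1), seq);
 *	crit_exit();
 */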

/*
 * Send an IPI request passively, return 0 on success and ENOENT on failure.
 * This routine does not recurse through lwkt_process_ipiq() nor does it
 * block trying to queue the actual IPI.  If we successfully queue the
 * message but fail to queue the IPI, we still count it as a success.
 * The occasional small race against a target cpu HLT is recovered at
 * the next clock interrupt.
 */
int
lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    if (target == gd) {
	func(arg);
	return(0);
    }
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO - 1) {
	return(ENOENT);
    }
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    /*
     * passive mode doesn't work yet :-(
     */
#if 1
    cpu_send_ipiq(target->gd_cpuid);
#else
    cpu_send_ipiq_passive(target->gd_cpuid);
#endif
    return(0);
}

/*
 * Deprecated, used only by fast interrupt forwarding.
 */
int
lwkt_send_ipiq_bycpu(int dcpu, ipifunc_t func, void *arg)
{
    return(lwkt_send_ipiq(globaldata_find(dcpu), func, arg));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
{
    int cpuid;
    int count = 0;

    mask &= ~stopped_cpus;
    while (mask) {
	cpuid = bsfl(mask);
	lwkt_send_ipiq(globaldata_find(cpuid), func, arg);
	mask &= ~(1 << cpuid);
	++count;
    }
    return(count);
}

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;
    int maxc = 100000000;

    if (target != mycpu) {
	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	if ((int)(ip->ip_xindex - seq) < 0) {
	    unsigned int eflags = read_eflags();
	    cpu_enable_intr();
	    while ((int)(ip->ip_xindex - seq) < 0) {
		crit_enter();
		lwkt_process_ipiq();
		crit_exit();
		if (--maxc == 0)
			printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, target->gd_cpuid, ip->ip_xindex - seq);
		if (maxc < -1000000)
			panic("LWKT_WAIT_IPIQ");
	    }
	    write_eflags(eflags);
	}
    }
}

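/*
 * Return the current write sequence number for the ipiq directed at the
 * target cpu.  The value may later be handed to lwkt_wait_ipiq() to wait
 * for everything queued up to this point to be processed.
 */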
int
lwkt_seq_ipiq(globaldata_t target)
{
    lwkt_ipiq_t ip;

    ip = &mycpu->gd_ipiq[target->gd_cpuid];
    return(ip->ip_windex);
}

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_func[] we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], NULL))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, NULL)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	    need_ipiq();
	}
    }
}

#ifdef _KERNEL
void
lwkt_process_ipiq_frame(struct intrframe frame)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], &frame))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, &frame)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	    need_ipiq();
	}
    }
}
#endif

static int
lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame)
{
    int ri;
    int wi = ip->ip_windex;
    /*
     * Note: xindex is only updated after we are sure the function has
     * finished execution.  Beware lwkt_process_ipiq() reentrancy!  The
     * function may send an IPI which may block/drain.
     */
    while ((ri = ip->ip_rindex) != wi) {
	ip->ip_rindex = ri + 1;
	ri &= MAXCPUFIFO_MASK;
	ip->ip_func[ri](ip->ip_arg[ri], frame);
	/* YYY memory barrier */
	ip->ip_xindex = ip->ip_rindex;
    }
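    /*
     * Return non-zero if the write index moved while we were draining,
     * meaning additional messages have been queued in the meantime and
     * the caller should loop on us again.
     */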
    return(wi != ip->ip_windex);
}

#else

/*
 * !SMP dummy routines
 */

int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", target->gd_cpuid, func, arg);
    return(0); /* NOT REACHED */
}

void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    panic("lwkt_wait_ipiq: UP box! (%d,%d)", target->gd_cpuid, seq);
}

#endif

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_simple()
 *
 *	The function is executed synchronously on the remote cpus before this
 *	routine returns.  A lwkt_cpusync_t pointer is passed as an argument.
 *	The data can be accessed via arg->cs_data.
 *
 *	XXX should I just pass the data as an argument to be consistent?
 */

void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = func;
    cmd.cs_fin2_func = NULL;
    cmd.cs_data = data;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(&cmd);
    lwkt_cpusync_finish(&cmd);
}
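
/*
 * Usage note (illustrative sketch only): run a handler on every cpu in a
 * mask, handing it a data pointer via cs_data.  flush_handler() and
 * flushinfo are hypothetical.
 *
 *	static void
 *	flush_handler(lwkt_cpusync_t info)
 *	{
 *	    ... operate on info->cs_data ...
 *	}
 *
 *	lwkt_cpusync_simple(mask, flush_handler, &flushinfo);
 */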

/*
 * lwkt_cpusync_fastdata()
 *
 *	The function is executed in tandem with return on remote cpus.
 *	The data is directly passed as an argument.  Do not pass pointers to
 *	temporary storage as the storage might have gone poof by the time
 *	the target cpu executes the function.
 *
 *	At the moment lwkt_cpusync is declared on the stack and we must wait
 *	for all remote cpus to ack in lwkt_cpusync_finish(), but as a future
 *	optimization we should be able to put a counter in the globaldata
 *	structure (if it is not otherwise being used) and just poke it and
 *	return without waiting. XXX
 */
void
lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = NULL;
    cmd.cs_fin2_func = func;
    cmd.cs_data = NULL;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(data);
    lwkt_cpusync_finish(&cmd);
}

/*
 * lwkt_cpusync_start()
 *
 *	Start synchronization with a set of target cpus, return once they are
 *	known to be in a synchronization loop.  The target cpus will execute
 *	poll->cs_run_func() IN TANDEM WITH THE RETURN.
 *
 *	XXX future: add lwkt_cpusync_start_quick() and require a call to
 *	lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
 *	potentially absorb the IPI latency doing something useful.
 */
void
lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = 0;
    poll->cs_mask = mask;
#ifdef SMP
    poll->cs_maxcount = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & (1 << gd->gd_cpuid)) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	++ipiq_cscount;
	++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}

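/*
 * Add additional target cpus to an active cpu synchronization.  Cpus
 * already part of the synchronization are ignored.  cs_run_func, if set,
 * is executed for the newly added cpus (and on the current cpu if it is
 * in the mask), and we do not return until the new targets are known to
 * be in their synchronization loops.
 */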
void
lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;
#ifdef SMP
    int count;
#endif

    mask &= ~poll->cs_mask;
    poll->cs_mask |= mask;
#ifdef SMP
    count = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & (1 << gd->gd_cpuid)) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    poll->cs_maxcount += count;
    if (poll->cs_maxcount) {
	if (poll->cs_maxcount == count)
	    ++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}

/*
 * Finish synchronization with a set of target cpus.  The target cpus will
 * execute cs_fin1_func(poll) prior to this function returning, and will
 * execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
 *
 * If cs_maxcount is non-zero then we are mastering a cpusync with one or
 * more remote cpus and must account for it in our thread structure.
 */
void
lwkt_cpusync_finish(lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = -1;
    if (poll->cs_mask & (1 << gd->gd_cpuid)) {
	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func)
	    poll->cs_fin2_func(poll->cs_data);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
	--gd->gd_curthread->td_cscount;
    }
#endif
}
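
/*
 * Usage note (illustrative sketch only): the start/finish pair lets the
 * caller perform work of its own while the other cpus are held in their
 * synchronization loops.  do_global_update() is a hypothetical stand-in
 * for that work.
 *
 *	struct lwkt_cpusync cmd;
 *
 *	cmd.cs_run_func = NULL;
 *	cmd.cs_fin1_func = NULL;
 *	cmd.cs_fin2_func = NULL;
 *	cmd.cs_data = NULL;
 *	lwkt_cpusync_start(mycpu->gd_other_cpus, &cmd);
 *	do_global_update();	(other cpus are held until the finish below)
 *	lwkt_cpusync_finish(&cmd);
 */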

#ifdef SMP

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Execute the run function and adjust cs_count, then requeue
 * the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t poll)
{
    atomic_add_int(&poll->cs_count, 1);
    if (poll->cs_run_func)
	poll->cs_run_func(poll);
    lwkt_cpusync_remote2(poll);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.  When the originator requests that we
 * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
 * in tandem with the release.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t poll)
{
    if (poll->cs_count < 0) {
	cpusync_func2_t savef;
	void *saved;

	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func) {
	    savef = poll->cs_fin2_func;
	    saved = poll->cs_data;
	    atomic_add_int(&poll->cs_count, -1);
	    savef(saved);
	} else {
	    atomic_add_int(&poll->cs_count, -1);
	}
    } else {
	globaldata_t gd = mycpu;
	lwkt_ipiq_t ip;
	int wi;

	ip = &gd->gd_cpusyncq;
	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_func[wi] = (ipifunc2_t)lwkt_cpusync_remote2;
	ip->ip_arg[wi] = poll;
	++ip->ip_windex;
    }
}

#endif