xref: /dragonfly/sys/kern/kern_timeout.c (revision 65cc0652)
1 /*
2  * Copyright (c) 2004,2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Copyright (c) 1982, 1986, 1991, 1993
36  *	The Regents of the University of California.  All rights reserved.
37  * (c) UNIX System Laboratories, Inc.
38  * All or some portions of this file are derived from material licensed
39  * to the University of California by American Telephone and Telegraph
40  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
41  * the permission of UNIX System Laboratories, Inc.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice, this list of conditions and the following disclaimer.
48  * 2. Redistributions in binary form must reproduce the above copyright
49  *    notice, this list of conditions and the following disclaimer in the
50  *    documentation and/or other materials provided with the distribution.
51  * 3. Neither the name of the University nor the names of its contributors
52  *    may be used to endorse or promote products derived from this software
53  *    without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65  * SUCH DAMAGE.
66  */
67 /*
68  * The original callout mechanism was based on the work of Adam M. Costello
69  * and George Varghese, published in a technical report entitled "Redesigning
70  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
71  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
72  * used in this implementation was published by G. Varghese and T. Lauck in
73  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
74  * the Efficient Implementation of a Timer Facility" in the Proceedings of
75  * the 11th ACM Annual Symposium on Operating Systems Principles,
76  * Austin, Texas Nov 1987.
77  *
78  * The per-cpu augmentation was done by Matthew Dillon.  This file has
79  * essentially been rewritten from scratch by Matt.
80  */
81 
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/callout.h>
85 #include <sys/kernel.h>
86 #include <sys/interrupt.h>
87 #include <sys/thread.h>
88 
89 #include <sys/thread2.h>
90 #include <sys/mplock2.h>
91 
92 #include <vm/vm_extern.h>
93 
94 struct softclock_pcpu {
95 	struct callout_tailq *callwheel;
96 	struct callout * volatile next;
97 	intptr_t running;	/* NOTE! Bit 0 used to flag wakeup */
98 	int softticks;		/* softticks index */
99 	int curticks;		/* per-cpu ticks counter */
100 	int isrunning;
101 	struct thread thread;
102 };
103 
104 typedef struct softclock_pcpu *softclock_pcpu_t;
105 
106 static int cwheelsize;
107 static int cwheelmask;
108 static softclock_pcpu_t softclock_pcpu_ary[MAXCPU];
109 
110 static void softclock_handler(void *arg);
111 static void slotimer_callback(void *arg);
112 static void callout_reset_ipi(void *arg);
113 static void callout_stop_ipi(void *arg, int issync, struct intrframe *frame);
114 
115 static __inline int
116 callout_setclear(struct callout *c, int sflags, int cflags)
117 {
118 	int flags;
119 	int nflags;
120 
121 	for (;;) {
122 		flags = c->c_flags;
123 		cpu_ccfence();
124 		nflags = (flags | sflags) & ~cflags;
125 		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
126 			break;
127 	}
128 	return flags;
129 }
130 
131 static void
132 swi_softclock_setup(void *arg)
133 {
134 	int cpu;
135 	int i;
136 	int target;
137 
138 	/*
139 	 * Figure out how large a callwheel we need.  It must be a power of 2.
140 	 *
141 	 * ncallout is primarily based on available memory; don't explode
142 	 * the allocations if the system has a lot of cpus.
143 	 */
144 	target = ncallout / ncpus + 16;
145 
146 	cwheelsize = 1;
147 	while (cwheelsize < target)
148 		cwheelsize <<= 1;
149 	cwheelmask = cwheelsize - 1;
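
	/*
	 * For example (illustrative numbers only): with ncallout = 16384 and
	 * ncpus = 8, target is 16384 / 8 + 16 = 2064, which rounds up to
	 * cwheelsize = 4096 and cwheelmask = 0xfff.  A callout with
	 * c_time = 12345 then hashes to bucket (12345 & 0xfff) = 57.
	 */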
150 
151 	/*
152 	 * Initialize per-cpu data structures.
153 	 */
154 	for (cpu = 0; cpu < ncpus; ++cpu) {
155 		softclock_pcpu_t sc;
156 		int wheel_sz;
157 
158 		sc = (void *)kmem_alloc3(&kernel_map, sizeof(*sc),
159 					 VM_SUBSYS_GD, KM_CPU(cpu));
160 		memset(sc, 0, sizeof(*sc));
161 		softclock_pcpu_ary[cpu] = sc;
162 
163 		wheel_sz = sizeof(*sc->callwheel) * cwheelsize;
164 		sc->callwheel = (void *)kmem_alloc3(&kernel_map, wheel_sz,
165 						    VM_SUBSYS_GD, KM_CPU(cpu));
166 		memset(sc->callwheel, 0, wheel_sz);
167 		for (i = 0; i < cwheelsize; ++i)
168 			TAILQ_INIT(&sc->callwheel[i]);
169 
170 		/*
171 		 * Mark the softclock handler as being an interrupt thread
172 		 * even though it really isn't, but do not allow it to
173 		 * preempt other threads (do not assign td_preemptable).
174 		 *
175 		 * Kernel code now assumes that callouts do not preempt
176 		 * the cpu they were scheduled on.
177 		 */
178 		lwkt_create(softclock_handler, sc, NULL, &sc->thread,
179 			    TDF_NOSTART | TDF_INTTHREAD,
180 			    cpu, "softclock %d", cpu);
181 	}
182 }
183 
184 /*
185  * Must occur after ncpus has been initialized.
186  */
187 SYSINIT(softclock_setup, SI_BOOT2_SOFTCLOCK, SI_ORDER_SECOND,
188 	swi_softclock_setup, NULL);
189 
190 /*
191  * This routine is called from the hardclock() (basically a FASTint/IPI) on
192  * each cpu in the system.  sc->curticks is this cpu's notion of the timebase.
193  * It IS NOT NECESSARILY SYNCHRONIZED WITH 'ticks'!  sc->softticks is where
194  * the callwheel is currently indexed.
195  *
196  * WARNING!  The MP lock is not necessarily held on call, nor can it be
197  * safely obtained.
198  *
199  * sc->softticks is adjusted by either this routine or our helper thread
200  * depending on whether the helper thread is running or not.
201  */
202 void
203 hardclock_softtick(globaldata_t gd)
204 {
205 	softclock_pcpu_t sc;
206 
207 	sc = softclock_pcpu_ary[gd->gd_cpuid];
208 	++sc->curticks;
209 	if (sc->isrunning)
210 		return;
211 	if (sc->softticks == sc->curticks) {
212 		/*
213 		 * In sync, only wake up the thread if there is something to
214 		 * do.
215 		 */
216 		if (TAILQ_FIRST(&sc->callwheel[sc->softticks & cwheelmask])) {
217 			sc->isrunning = 1;
218 			lwkt_schedule(&sc->thread);
219 		} else {
220 			++sc->softticks;
221 		}
222 	} else {
223 		/*
224 		 * Out of sync, wake up the thread unconditionally so it can
225 		 * catch up.
226 		 */
227 		sc->isrunning = 1;
228 		lwkt_schedule(&sc->thread);
229 	}
230 }
231 
232 /*
233  * This procedure is the main loop of our per-cpu helper thread.  The
234  * sc->isrunning flag prevents us from racing hardclock_softtick() and
235  * a critical section is sufficient to interlock sc->curticks and protect
236  * us from remote IPIs / list removal.
237  *
238  * The thread starts with the MP lock released and not in a critical
239  * section.  The loop itself is MP safe while individual callbacks
240  * may or may not be, so we obtain or release the MP lock as appropriate.
241  */
242 static void
243 softclock_handler(void *arg)
244 {
245 	softclock_pcpu_t sc;
246 	struct callout *c;
247 	struct callout_tailq *bucket;
248 	struct callout slotimer;
249 	int mpsafe = 1;
250 	int flags;
251 
252 	/*
253 	 * Setup pcpu slow clocks which we want to run from the callout
254 	 * thread.
255 	 */
256 	callout_init_mp(&slotimer);
257 	callout_reset(&slotimer, hz * 10, slotimer_callback, &slotimer);
258 
259 	/*
260 	 * Run the callout thread at the same priority as other kernel
261 	 * threads so it can be round-robined.
262 	 */
263 	/*lwkt_setpri_self(TDPRI_SOFT_NORM);*/
264 
265 	/*
266 	 * Loop critical section against ipi operations to this cpu.
267 	 */
268 	sc = arg;
269 	crit_enter();
270 loop:
271 	while (sc->softticks != (int)(sc->curticks + 1)) {
272 		bucket = &sc->callwheel[sc->softticks & cwheelmask];
273 
274 		for (c = TAILQ_FIRST(bucket); c; c = sc->next) {
275 			void (*c_func)(void *);
276 			void *c_arg;
277 			struct lock *c_lk;
278 			int error;
279 
280 			if (c->c_time != sc->softticks) {
281 				sc->next = TAILQ_NEXT(c, c_links.tqe);
282 				continue;
283 			}
284 
285 			/*
286 			 * Synchronize with mpsafe requirements
287 			 */
288 			flags = c->c_flags;
289 			if (flags & CALLOUT_MPSAFE) {
290 				if (mpsafe == 0) {
291 					mpsafe = 1;
292 					rel_mplock();
293 				}
294 			} else {
295 				/*
296 				 * The request might be removed while we
297 				 * are waiting to get the MP lock.  If it
298 				 * was removed, sc->next will point to the
299 				 * next valid request or NULL; loop up.
300 				 */
301 				if (mpsafe) {
302 					mpsafe = 0;
303 					sc->next = c;
304 					get_mplock();
305 					if (c != sc->next)
306 						continue;
307 				}
308 			}
309 
310 			/*
311 			 * Queue protection only exists while we hold the
312 			 * critical section uninterrupted.
313 			 *
314 			 * Adjust sc->next when removing (c) from the queue,
315 			 * note that an IPI on this cpu may make further
316 			 * adjustments to sc->next.
317 			 */
318 			sc->next = TAILQ_NEXT(c, c_links.tqe);
319 			TAILQ_REMOVE(bucket, c, c_links.tqe);
320 
321 			KASSERT((c->c_flags & CALLOUT_DID_INIT) &&
322 				(c->c_flags & CALLOUT_PENDING) &&
323 				CALLOUT_FLAGS_TO_CPU(c->c_flags) ==
324 				mycpu->gd_cpuid,
325 				("callout %p: bad flags %08x", c, c->c_flags));
326 
327 			/*
328 			 * Once CALLOUT_PENDING is cleared, only the IPI_MASK
329 			 * prevents the callout from being moved to another
330 			 * cpu.  However, callout_stop() will also check
331 			 * sc->running on the assigned cpu if CALLOUT_EXECUTED
332 			 * is set.  CALLOUT_EXECUTED implies a callback
333 			 * interlock is needed when cross-cpu.
334 			 */
335 			sc->running = (intptr_t)c;
336 			c_func = c->c_func;
337 			c_arg = c->c_arg;
338 			c_lk = c->c_lk;
339 			c->c_func = NULL;
340 
341 			if ((flags & (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) ==
342 			    (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) {
343 				error = lockmgr(c_lk, LK_EXCLUSIVE |
344 						      LK_CANCELABLE);
345 				if (error == 0) {
346 					flags = callout_setclear(c,
347 							CALLOUT_EXECUTED,
348 							CALLOUT_PENDING |
349 							CALLOUT_WAITING);
350 					crit_exit();
351 					c_func(c_arg);
352 					crit_enter();
353 					lockmgr(c_lk, LK_RELEASE);
354 				} else {
355 					flags = callout_setclear(c,
356 							0,
357 							CALLOUT_PENDING);
358 				}
359 			} else if (flags & CALLOUT_ACTIVE) {
360 				flags = callout_setclear(c,
361 						CALLOUT_EXECUTED,
362 						CALLOUT_PENDING |
363 						CALLOUT_WAITING);
364 				crit_exit();
365 				c_func(c_arg);
366 				crit_enter();
367 			} else {
368 				flags = callout_setclear(c,
369 						0,
370 						CALLOUT_PENDING |
371 						CALLOUT_WAITING);
372 			}
373 
374 			/*
375 			 * Read and clear sc->running.  If bit 0 was set,
376 			 * a callout_stop() is likely blocked waiting for
377 			 * the callback to complete.
378 			 *
379 			 * The setclear above also cleared CALLOUT_WAITING
380 			 * and returned the contents of flags prior to clearing
381 			 * any bits.
382 			 *
383 			 * Interlock wakeup any _stop's waiting on us.  Note
384 			 * that once c_func() was called, the callout
385 			 * structure (c) pointer may no longer be valid.  It
386 			 * can only be used for the wakeup.
387 			 */
388 			if ((atomic_readandclear_ptr(&sc->running) & 1) ||
389 			    (flags & CALLOUT_WAITING)) {
390 				wakeup(c);
391 			}
392 			/* NOTE: list may have changed */
393 		}
394 		++sc->softticks;
395 	}
396 
397 	/*
398 	 * Don't leave us holding the MP lock when we deschedule ourselves.
399 	 */
400 	if (mpsafe == 0) {
401 		mpsafe = 1;
402 		rel_mplock();
403 	}
404 	sc->isrunning = 0;
405 	lwkt_deschedule_self(&sc->thread);	/* == curthread */
406 	lwkt_switch();
407 	goto loop;
408 	/* NOT REACHED */
409 }
410 
411 /*
412  * A very slow system cleanup timer (10 second interval),
413  * per-cpu.
414  */
415 void
416 slotimer_callback(void *arg)
417 {
418 	struct callout *c = arg;
419 
420 	slab_cleanup();
421 	callout_reset(c, hz * 10, slotimer_callback, c);
422 }
423 
424 /*
425  * Start or restart a timeout.  Installs the callout structure on the
426  * callwheel of the current cpu.  Callers may legally pass any value, even
427  * zero or negative, but since the sc->curticks index may have already
428  * been processed, a minimum timeout of 1 tick will be enforced.
429  *
430  * This function will block if the callout is currently queued to a different
431  * cpu or the callback is currently running in another thread.
432  */
433 void
434 callout_reset(struct callout *c, int to_ticks, void (*ftn)(void *), void *arg)
435 {
436 	softclock_pcpu_t sc;
437 	globaldata_t gd;
438 
439 #ifdef INVARIANTS
440         if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
441 		callout_init(c);
442 		kprintf(
443 		    "callout_reset(%p) from %p: callout was not initialized\n",
444 		    c, ((int **)&c)[-1]);
445 		print_backtrace(-1);
446 	}
447 #endif
448 	gd = mycpu;
449 	sc = softclock_pcpu_ary[gd->gd_cpuid];
450 	crit_enter_gd(gd);
451 
452 	/*
453 	 * Our cpu must gain ownership of the callout and cancel anything
454 	 * still running, which is complex.  The easiest way to do it is to
455 	 * issue a callout_stop_sync().  callout_stop_sync() will also
456 	 * handle CALLOUT_EXECUTED (dispatch waiting), and clear it.
457 	 *
458 	 * WARNING: callout_stop_sync()'s return state can race other
459 	 *	    callout_*() calls due to blocking, so we must re-check.
460 	 */
461 	for (;;) {
462 		int flags;
463 		int nflags;
464 
465 		if (c->c_flags & (CALLOUT_ARMED_MASK | CALLOUT_EXECUTED))
466 			callout_stop_sync(c);
467 		flags = c->c_flags & ~(CALLOUT_ARMED_MASK | CALLOUT_EXECUTED);
468 		nflags = (flags & ~CALLOUT_CPU_MASK) |
469 			 CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid) |
470 			 CALLOUT_PENDING |
471 			 CALLOUT_ACTIVE;
472 		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
473 			break;
474 		cpu_pause();
475 	}
476 
477 	/*
478 	 * With the critical section held and PENDING set we now 'own' the
479 	 * callout.
480 	 */
481 	if (to_ticks <= 0)
482 		to_ticks = 1;
483 
484 	c->c_arg = arg;
485 	c->c_func = ftn;
486 	c->c_time = sc->curticks + to_ticks;
487 
488 	TAILQ_INSERT_TAIL(&sc->callwheel[c->c_time & cwheelmask],
489 			  c, c_links.tqe);
490 	crit_exit_gd(gd);
491 }
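
#if 0
/*
 * Illustrative sketch only (hypothetical driver code, not built as part of
 * this file): arm a callout that re-arms itself once per second on the
 * current cpu.  The example_* names are made up for the example; the callout
 * must be initialized with one of the callout_init*() functions before the
 * first callout_reset().
 */
static struct callout example_timer;

static void
example_tick(void *arg)
{
	kprintf("example_tick fired\n");
	callout_reset(&example_timer, hz, example_tick, arg);
}

static void
example_start(void)
{
	callout_init_mp(&example_timer);
	callout_reset(&example_timer, hz, example_tick, NULL);
}
#endif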
492 
493 /*
494  * Setup a callout to run on the specified cpu.  Should generally be used
495  * to run a callout on a specific cpu which does not nominally change.  This
496  * callout_reset() will be issued asynchronously via an IPI.
497  */
498 void
499 callout_reset_bycpu(struct callout *c, int to_ticks, void (*ftn)(void *),
500 		    void *arg, int cpuid)
501 {
502 	globaldata_t gd;
503 	globaldata_t tgd;
504 
505 #ifdef INVARIANTS
506         if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
507 		callout_init(c);
508 		kprintf(
509 		    "callout_reset_bycpu(%p) from %p: callout was not initialized\n",
510 		    c, ((int **)&c)[-1]);
511 		print_backtrace(-1);
512 	}
513 #endif
514 	gd = mycpu;
515 	crit_enter_gd(gd);
516 
517 	tgd = globaldata_find(cpuid);
518 
519 	/*
520 	 * This code is similar to the code in callout_reset() but we assign
521 	 * the callout to the target cpu.  We cannot set PENDING here since
522 	 * we cannot atomically add the callout to the target cpu's queue.
523 	 * However, incrementing the IPI count has the effect of locking
524 	 * the cpu assignment.
525 	 *
526 	 * WARNING: callout_stop_sync()'s return state can race other
527 	 *	    callout_*() calls due to blocking, so we must re-check.
528 	 */
529 	for (;;) {
530 		int flags;
531 		int nflags;
532 
533 		if (c->c_flags & (CALLOUT_ARMED_MASK | CALLOUT_EXECUTED))
534 			callout_stop_sync(c);
535 		flags = c->c_flags & ~(CALLOUT_ARMED_MASK | CALLOUT_EXECUTED);
536 		nflags = (flags & ~(CALLOUT_CPU_MASK |
537 				    CALLOUT_EXECUTED)) |
538 			 CALLOUT_CPU_TO_FLAGS(tgd->gd_cpuid) |
539 			 CALLOUT_ACTIVE;
540 		nflags = nflags + 1;		/* bump IPI count */
541 		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
542 			break;
543 		cpu_pause();
544 	}
545 
546 	/*
547 	 * Since we control our +1 in the IPI count, the target cpu cannot
548 	 * now change until our IPI is processed.
549 	 */
550 	if (to_ticks <= 0)
551 		to_ticks = 1;
552 
553 	c->c_arg = arg;
554 	c->c_func = ftn;
555 	c->c_load = to_ticks;	/* IPI will add curticks */
556 
557 	lwkt_send_ipiq(tgd, callout_reset_ipi, c);
558 	crit_exit_gd(gd);
559 }
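
#if 0
/*
 * Illustrative sketch only (hypothetical, not built as part of this file):
 * pin a periodic callback to each cpu.  callout_reset_bycpu() queues the
 * callout on the target cpu via an IPI; once running there, the callback
 * can re-arm itself with plain callout_reset() and stay on that cpu.  The
 * example_* names are made up for the example.
 */
static struct callout example_pcpu_timer[MAXCPU];

static void
example_pcpu_tick(void *arg)
{
	int cpu = mycpu->gd_cpuid;

	/* executes on the cpu the callout was scheduled on */
	callout_reset(&example_pcpu_timer[cpu], hz * 10,
		      example_pcpu_tick, NULL);
}

static void
example_pcpu_start(void)
{
	int cpu;

	for (cpu = 0; cpu < ncpus; ++cpu) {
		callout_init_mp(&example_pcpu_timer[cpu]);
		callout_reset_bycpu(&example_pcpu_timer[cpu], hz * 10,
				    example_pcpu_tick, NULL, cpu);
	}
}
#endif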
560 
561 /*
562  * Remote IPI for callout_reset_bycpu().  The cpu assignment cannot be
563  * ripped out from under us due to the count in IPI_MASK, but it is possible
564  * that other IPIs executed so we must deal with other flags that might
565  * have been set or cleared.
566  */
567 static void
568 callout_reset_ipi(void *arg)
569 {
570 	struct callout *c = arg;
571 	globaldata_t gd = mycpu;
572 	softclock_pcpu_t sc;
573 	int flags;
574 	int nflags;
575 
576 	sc = softclock_pcpu_ary[gd->gd_cpuid];
577 
578 	for (;;) {
579 		flags = c->c_flags;
580 		cpu_ccfence();
581 		KKASSERT((flags & CALLOUT_IPI_MASK) > 0 &&
582 			 CALLOUT_FLAGS_TO_CPU(flags) == gd->gd_cpuid);
583 
584 		nflags = (flags - 1) & ~(CALLOUT_EXECUTED | CALLOUT_WAITING);
585 		nflags |= CALLOUT_PENDING;
586 
587 		/*
588 		 * Put us on the queue
589 		 */
590 		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
591 			if (flags & CALLOUT_PENDING) {
592 				if (sc->next == c)
593 					sc->next = TAILQ_NEXT(c, c_links.tqe);
594 				TAILQ_REMOVE(
595 					&sc->callwheel[c->c_time & cwheelmask],
596 					c,
597 					c_links.tqe);
598 			}
599 			c->c_time = sc->curticks + c->c_load;
600 			TAILQ_INSERT_TAIL(
601 				&sc->callwheel[c->c_time & cwheelmask],
602 				c, c_links.tqe);
603 			break;
604 		}
605 		/* retry */
606 		cpu_pause();
607 	}
608 
609 	/*
610 	 * Issue wakeup if requested.
611 	 */
612 	if (flags & CALLOUT_WAITING)
613 		wakeup(c);
614 }
615 
616 /*
617  * Stop a running timer and ensure that any running callout completes before
618  * returning.  If the timer is running on another cpu this function may block
619  * to interlock against the callout.  If the callout is currently executing
620  * or blocked in another thread this function may also block to interlock
621  * against the callout.
622  *
623  * The caller must be careful to avoid deadlocks, either by using
624  * callout_init_lk() (which uses the lockmgr lock cancelation feature),
625  * by using tokens and dealing with breaks in the serialization, or by
626  * using the lockmgr lock cancelation feature directly in the callout
627  * callback function.
628  *
629  * callout_stop() returns non-zero if the callout was pending.
630  */
631 static int
632 _callout_stop(struct callout *c, int issync)
633 {
634 	globaldata_t gd = mycpu;
635 	globaldata_t tgd;
636 	softclock_pcpu_t sc;
637 	int flags;
638 	int nflags;
639 	int rc;
640 	int cpuid;
641 
642 #ifdef INVARIANTS
643         if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
644 		callout_init(c);
645 		kprintf(
646 		    "callout_stop(%p) from %p: callout was not initialized\n",
647 		    c, ((int **)&c)[-1]);
648 		print_backtrace(-1);
649 	}
650 #endif
651 	crit_enter_gd(gd);
652 
653 retry:
654 	/*
655 	 * Adjust flags for the required operation.  If the callout is
656 	 * armed on another cpu we break out into the remote-cpu code which
657 	 * will issue an IPI.  If it is not armed we are trivially done,
658 	 * but may still need to test EXECUTED.
659 	 */
660 	for (;;) {
661 		flags = c->c_flags;
662 		cpu_ccfence();
663 
664 		cpuid = CALLOUT_FLAGS_TO_CPU(flags);
665 
666 		/*
667 		 * Armed on remote cpu (break to remote-cpu code)
668 		 */
669 		if ((flags & CALLOUT_ARMED_MASK) && gd->gd_cpuid != cpuid) {
670 			nflags = flags + 1;
671 			if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
672 				/*
673 				 * BREAK TO REMOTE-CPU CODE HERE
674 				 */
675 				break;
676 			}
677 			cpu_pause();
678 			continue;
679 		}
680 
681 		/*
682 		 * Armed or armable on current cpu
683 		 */
684 		if (flags & CALLOUT_IPI_MASK) {
685 			lwkt_process_ipiq();
686 			cpu_pause();
687 			continue;	/* retry */
688 		}
689 
690 		/*
691 		 * If PENDING is set we can remove the callout from our
692 			 * queue and also rely on the side effect that the bit
693 			 * locks the callout to our cpu.
694 		 */
695 		if (flags & CALLOUT_PENDING) {
696 			sc = softclock_pcpu_ary[gd->gd_cpuid];
697 			if (sc->next == c)
698 				sc->next = TAILQ_NEXT(c, c_links.tqe);
699 			TAILQ_REMOVE(
700 				&sc->callwheel[c->c_time & cwheelmask],
701 				c,
702 				c_links.tqe);
703 			c->c_func = NULL;
704 
705 			for (;;) {
706 				flags = c->c_flags;
707 				cpu_ccfence();
708 				nflags = flags & ~(CALLOUT_ACTIVE |
709 						   CALLOUT_EXECUTED |
710 						   CALLOUT_WAITING |
711 						   CALLOUT_PENDING);
712 				if (atomic_cmpset_int(&c->c_flags,
713 						      flags, nflags)) {
714 					goto skip_slow;
715 				}
716 				cpu_pause();
717 			}
718 			/* NOT REACHED */
719 		}
720 
721 		/*
722 		 * If PENDING was not set the callout might not be locked
723 		 * to this cpu.
724 		 */
725 		nflags = flags & ~(CALLOUT_ACTIVE |
726 				   CALLOUT_EXECUTED |
727 				   CALLOUT_WAITING |
728 				   CALLOUT_PENDING);
729 		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
730 			goto skip_slow;
731 		}
732 		cpu_pause();
733 		/* retry */
734 	}
735 
736 	/*
737 	 * Remote cpu path.  We incremented the IPI_MASK count so the callout
738 	 * is now locked to the remote cpu and we can safely send an IPI
739 	 * to it.
740 	 *
741 	 * Once sent, wait for all IPIs to be processed.  If PENDING remains
742 	 * set after all IPIs have been processed, we raced the callout or a
743 	 * callout_reset() and must retry.  Callers expect the callout to
744 	 * be completely stopped upon return, so make sure it is.
745 	 */
746 	tgd = globaldata_find(cpuid);
747 	lwkt_send_ipiq3(tgd, callout_stop_ipi, c, issync);
748 
749 	for (;;) {
750 		flags = c->c_flags;
751 		cpu_ccfence();
752 
753 		if ((flags & CALLOUT_IPI_MASK) == 0)
754 			break;
755 
756 		nflags = flags | CALLOUT_WAITING;
757 		tsleep_interlock(c, 0);
758 		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
759 			tsleep(c, PINTERLOCKED, "cstp1", 0);
760 		}
761 	}
762 	if (flags & CALLOUT_PENDING)
763 		goto retry;
764 
765 	/*
766 	 * Caller expects callout_stop_sync() to clear EXECUTED and return
767 	 * its previous status.
768 	 */
769 	atomic_clear_int(&c->c_flags, CALLOUT_EXECUTED);
770 
771 skip_slow:
772 	if (flags & CALLOUT_WAITING)
773 		wakeup(c);
774 
775 	/*
776 	 * If (issync) we must also wait for any in-progress callbacks to
777 	 * complete, unless the stop is being executed from the callback
778 	 * itself.  The EXECUTED flag is set prior to the callback
779 	 * being made, so our existing flags status already includes it.
780 	 *
781 	 * If auto-lock mode is being used, this is where we cancel any
782 	 * blocked lock that is potentially preventing the target cpu
783 	 * from completing the callback.
784 	 */
785 	while (issync) {
786 		intptr_t *runp;
787 		intptr_t runco;
788 
789 		sc = softclock_pcpu_ary[cpuid];
790 		if (gd->gd_curthread == &sc->thread)	/* stop from cb */
791 			break;
792 		runp = &sc->running;
793 		runco = *runp;
794 		cpu_ccfence();
795 		if ((runco & ~(intptr_t)1) != (intptr_t)c)
796 			break;
797 		if (c->c_flags & CALLOUT_AUTOLOCK)
798 			lockmgr(c->c_lk, LK_CANCEL_BEG);
799 		tsleep_interlock(c, 0);
800 		if (atomic_cmpset_long(runp, runco, runco | 1))
801 			tsleep(c, PINTERLOCKED, "cstp3", 0);
802 		if (c->c_flags & CALLOUT_AUTOLOCK)
803 			lockmgr(c->c_lk, LK_CANCEL_END);
804 	}
805 
806 	crit_exit_gd(gd);
807 	rc = (flags & CALLOUT_EXECUTED) != 0;
808 
809 	return rc;
810 }
811 
812 /*
813  * IPI for stop function.  The callout is locked to the receiving cpu
814  * by the IPI_MASK count.
815  */
816 static void
817 callout_stop_ipi(void *arg, int issync, struct intrframe *frame)
818 {
819 	globaldata_t gd = mycpu;
820 	struct callout *c = arg;
821 	softclock_pcpu_t sc;
822 	int flags;
823 	int nflags;
824 
825 	flags = c->c_flags;
826 	cpu_ccfence();
827 
828 	KKASSERT(CALLOUT_FLAGS_TO_CPU(flags) == gd->gd_cpuid);
829 
830 	/*
831 	 * We can handle the PENDING flag immediately.
832 	 */
833 	if (flags & CALLOUT_PENDING) {
834 		sc = softclock_pcpu_ary[gd->gd_cpuid];
835 		if (sc->next == c)
836 			sc->next = TAILQ_NEXT(c, c_links.tqe);
837 		TAILQ_REMOVE(
838 			&sc->callwheel[c->c_time & cwheelmask],
839 			c,
840 			c_links.tqe);
841 		c->c_func = NULL;
842 	}
843 
844 	/*
845 	 * Transition to the stopped state and decrement the IPI count.
846 	 * Leave the EXECUTED bit alone (the next callout_reset() will
847 	 * have to deal with it).
848 	 */
849 	for (;;) {
850 		flags = c->c_flags;
851 		cpu_ccfence();
852 		nflags = (flags - 1) & ~(CALLOUT_ACTIVE |
853 					 CALLOUT_PENDING |
854 					 CALLOUT_WAITING);
855 
856 		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
857 			break;
858 		cpu_pause();
859 	}
860 	if (flags & CALLOUT_WAITING)
861 		wakeup(c);
862 }
863 
864 int
865 callout_stop(struct callout *c)
866 {
867 	return _callout_stop(c, 0);
868 }
869 
870 int
871 callout_stop_sync(struct callout *c)
872 {
873 	return _callout_stop(c, 1);
874 }
875 
876 void
877 callout_stop_async(struct callout *c)
878 {
879 	_callout_stop(c, 0);
880 }
881 
882 void
883 callout_terminate(struct callout *c)
884 {
885 	_callout_stop(c, 1);
886 	atomic_clear_int(&c->c_flags, CALLOUT_DID_INIT);
887 }
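
#if 0
/*
 * Illustrative sketch only (hypothetical, not built as part of this file):
 * tear down a callout embedded in a dynamically allocated structure before
 * freeing it.  callout_terminate() both synchronously stops the callout and
 * clears CALLOUT_DID_INIT, so the structure must be re-initialized before
 * any reuse.  The example_* names and the M_DEVBUF malloc type are only
 * assumptions for the example.
 */
struct example_softc {
	struct callout	sc_timer;
	/* ... other driver state ... */
};

static void
example_detach(struct example_softc *sc)
{
	callout_terminate(&sc->sc_timer);	/* callback can no longer run */
	kfree(sc, M_DEVBUF);
}
#endif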
888 
889 /*
890  * Prepare a callout structure for use by callout_reset() and/or
891  * callout_stop().
892  *
893  * The MP version of this routine requires that the callback
894  * function installed by callout_reset() be MP safe.
895  *
896  * The LK version of this routine is also MPsafe and will automatically
897  * acquire the specified lock for the duration of the function call,
898  * and release it after the function returns.  In addition, when autolocking
899  * is used, callout_stop() becomes synchronous if the caller owns the lock.
900  * callout_reset(), callout_stop(), and callout_stop_sync() will block
901  * normally instead of spinning when a cpu race occurs.  Lock cancelation
902  * is used to avoid deadlocks against the callout ring dispatch.
903  *
904  * The init functions can be called from any cpu and do not have to be
905  * called from the cpu that the timer will eventually run on.
906  */
907 static __inline void
908 _callout_init(struct callout *c, int flags)
909 {
910 	bzero(c, sizeof *c);
911 	c->c_flags = flags;
912 }
913 
914 void
915 callout_init(struct callout *c)
916 {
917 	_callout_init(c, CALLOUT_DID_INIT);
918 }
919 
920 void
921 callout_init_mp(struct callout *c)
922 {
923 	_callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE);
924 }
925 
926 void
927 callout_init_lk(struct callout *c, struct lock *lk)
928 {
929 	_callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE | CALLOUT_AUTOLOCK);
930 	c->c_lk = lk;
931 }
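
#if 0
/*
 * Illustrative sketch only (hypothetical, not built as part of this file):
 * the auto-lock pattern.  The dispatch code acquires 'example_lk' around the
 * callback, and callout_stop_sync() uses lock cancelation (LK_CANCEL_BEG /
 * LK_CANCEL_END) against the dispatch's LK_CANCELABLE acquisition, so the
 * stop may be issued while holding the lock without deadlocking.  The
 * example_* names are made up for the example.
 */
static struct lock example_lk;
static struct callout example_lk_timer;
static int example_count;

static void
example_lk_tick(void *arg)
{
	/* example_lk is held by the callout dispatch here */
	++example_count;
	callout_reset(&example_lk_timer, hz, example_lk_tick, NULL);
}

static void
example_lk_start(void)
{
	lockinit(&example_lk, "exmplk", 0, 0);
	callout_init_lk(&example_lk_timer, &example_lk);
	callout_reset(&example_lk_timer, hz, example_lk_tick, NULL);
}

static void
example_lk_stop(void)
{
	lockmgr(&example_lk, LK_EXCLUSIVE);
	callout_stop_sync(&example_lk_timer);	/* will not deadlock */
	lockmgr(&example_lk, LK_RELEASE);
}
#endif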
932