/* xref: /dragonfly/sys/kern/kern_timeout.c (revision 0db87cb7) */
/*
 * Copyright (c) 2004,2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * The original callout mechanism was based on the work of Adam M. Costello
 * and George Varghese, published in a technical report entitled "Redesigning
 * the BSD Callout and Timer Facilities" and modified slightly for inclusion
 * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
 * used in this implementation was published by G. Varghese and T. Lauck in
 * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
 * the Efficient Implementation of a Timer Facility" in the Proceedings of
 * the 11th ACM Annual Symposium on Operating Systems Principles,
 * Austin, Texas Nov 1987.
 *
 * The per-cpu augmentation was done by Matthew Dillon.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/interrupt.h>
#include <sys/thread.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

struct softclock_pcpu {
	struct callout_tailq *callwheel;
	struct callout * volatile next;
	intptr_t running;	/* NOTE! Bit 0 used to flag wakeup */
	int softticks;		/* softticks index */
	int curticks;		/* per-cpu ticks counter */
	int isrunning;
	struct thread thread;
};

typedef struct softclock_pcpu *softclock_pcpu_t;

static MALLOC_DEFINE(M_CALLOUT, "callout", "callout structures");
static int cwheelsize;
static int cwheelmask;
static struct softclock_pcpu softclock_pcpu_ary[MAXCPU];

static void softclock_handler(void *arg);
static void slotimer_callback(void *arg);
static void callout_reset_ipi(void *arg);
static void callout_stop_ipi(void *arg, int issync, struct intrframe *frame);


static void
swi_softclock_setup(void *arg)
{
	int cpu;
	int i;
	int target;

	/*
	 * Figure out how large a callwheel we need.  It must be a power of 2.
	 *
	 * ncallout is primarily based on available memory; don't explode
	 * the allocations if the system has a lot of cpus.
	 */
	target = ncallout / ncpus + 16;

	cwheelsize = 1;
	while (cwheelsize < target)
		cwheelsize <<= 1;
	cwheelmask = cwheelsize - 1;

	/*
	 * Initialize per-cpu data structures.
	 */
	for (cpu = 0; cpu < ncpus; ++cpu) {
		softclock_pcpu_t sc;

		sc = &softclock_pcpu_ary[cpu];

		sc->callwheel = kmalloc(sizeof(*sc->callwheel) * cwheelsize,
					M_CALLOUT, M_WAITOK|M_ZERO);
		for (i = 0; i < cwheelsize; ++i)
			TAILQ_INIT(&sc->callwheel[i]);

		/*
		 * Mark the softclock handler as being an interrupt thread
		 * even though it really isn't, but do not allow it to
		 * preempt other threads (do not assign td_preemptable).
		 *
		 * Kernel code now assumes that callouts do not preempt
		 * the cpu they were scheduled on.
		 */
		lwkt_create(softclock_handler, sc, NULL,
			    &sc->thread, TDF_NOSTART | TDF_INTTHREAD,
			    cpu, "softclock %d", cpu);
	}
}
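
/*
 * Because cwheelsize is a power of 2, a tick value selects its callwheel
 * bucket with a simple mask rather than a modulo.  A minimal sketch of the
 * indexing used throughout this file (illustrative only; callwheel_bucket
 * is a placeholder name and is not used elsewhere):
 */
#if 0
static __inline struct callout_tailq *
callwheel_bucket(softclock_pcpu_t sc, int time)
{
	/* all callouts expiring at 'time' hash to this bucket */
	return (&sc->callwheel[time & cwheelmask]);
}
#endif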

/*
 * Must occur after ncpus has been initialized.
 */
SYSINIT(softclock_setup, SI_BOOT2_SOFTCLOCK, SI_ORDER_SECOND,
	swi_softclock_setup, NULL);

/*
 * Clear PENDING and WAITING and, if possible, also clear ARMED.  Returns
 * the flags prior to the clear, atomically (used to check for WAITING).
 *
 * Clearing the cpu association (ARMED) can significantly improve the
 * performance of the next callout_reset*() call.
 */
static __inline
int
callout_unpend_disarm(struct callout *c)
{
	int flags;
	int nflags;

	for (;;) {
		flags = c->c_flags;
		cpu_ccfence();
		nflags = flags & ~(CALLOUT_PENDING | CALLOUT_WAITING);
		if ((flags & CALLOUT_IPI_MASK) == 0)
			nflags &= ~CALLOUT_ARMED;
		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
			break;
		}
		cpu_pause();
		/* retry */
	}
	return flags;
}

/*
 * Clear ARMED after finishing adjustments to the callout, potentially
 * allowing other cpus to take over.  We can only do this if the IPI mask
 * is 0.
 */
static __inline
int
callout_maybe_clear_armed(struct callout *c)
{
	int flags;
	int nflags;

	for (;;) {
		flags = c->c_flags;
		cpu_ccfence();
		if (flags & (CALLOUT_PENDING | CALLOUT_IPI_MASK))
			break;
		nflags = flags & ~CALLOUT_ARMED;
		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
			break;
		cpu_pause();
		/* retry */
	}
	return flags;
}

/*
 * This routine is called from the hardclock() (basically a FASTint/IPI) on
 * each cpu in the system.  sc->curticks is this cpu's notion of the timebase.
 * It IS NOT NECESSARILY SYNCHRONIZED WITH 'ticks'!  sc->softticks is where
 * the callwheel is currently indexed.
 *
 * WARNING!  The MP lock is not necessarily held on call, nor can it be
 * safely obtained.
 *
 * sc->softticks is adjusted by either this routine or our helper thread
 * depending on whether the helper thread is running or not.
 */
void
hardclock_softtick(globaldata_t gd)
{
	softclock_pcpu_t sc;

	sc = &softclock_pcpu_ary[gd->gd_cpuid];
	++sc->curticks;
	if (sc->isrunning)
		return;
	if (sc->softticks == sc->curticks) {
		/*
		 * In sync, only wake up the thread if there is something to
		 * do.
		 */
		if (TAILQ_FIRST(&sc->callwheel[sc->softticks & cwheelmask])) {
			sc->isrunning = 1;
			lwkt_schedule(&sc->thread);
		} else {
			++sc->softticks;
		}
	} else {
		/*
		 * Out of sync, wake up the thread unconditionally so it can
		 * catch up.
		 */
		sc->isrunning = 1;
		lwkt_schedule(&sc->thread);
	}
}
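
/*
 * Worked example (illustrative numbers only): if curticks has advanced to
 * 105 while softticks is still 100, the helper thread below scans buckets
 * 100 through 105 and leaves softticks at 106, i.e. curticks + 1.
 */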

/*
 * This procedure is the main loop of our per-cpu helper thread.  The
 * sc->isrunning flag prevents us from racing hardclock_softtick() and
 * a critical section is sufficient to interlock sc->curticks and protect
 * us from remote IPIs / list removal.
 *
 * The thread starts with the MP lock released and not in a critical
 * section.  The loop itself is MP safe while individual callbacks
 * may or may not be, so we obtain or release the MP lock as appropriate.
 */
static void
softclock_handler(void *arg)
{
	softclock_pcpu_t sc;
	struct callout *c;
	struct callout_tailq *bucket;
	struct callout slotimer;
	int mpsafe = 1;
	int flags;

	/*
	 * Set up pcpu slow clocks which we want to run from the callout
	 * thread.
	 */
	callout_init_mp(&slotimer);
	callout_reset(&slotimer, hz * 10, slotimer_callback, &slotimer);

	/*
	 * Run the callout thread at the same priority as other kernel
	 * threads so it can be round-robined.
	 */
	/*lwkt_setpri_self(TDPRI_SOFT_NORM);*/

	/*
	 * The loop below runs inside a critical section to interlock
	 * against IPI operations targeting this cpu.
	 */
	sc = arg;
	crit_enter();
loop:
	while (sc->softticks != (int)(sc->curticks + 1)) {
		bucket = &sc->callwheel[sc->softticks & cwheelmask];

		for (c = TAILQ_FIRST(bucket); c; c = sc->next) {
			if (c->c_time != sc->softticks) {
				sc->next = TAILQ_NEXT(c, c_links.tqe);
				continue;
			}

			flags = c->c_flags;
			if (flags & CALLOUT_MPSAFE) {
				if (mpsafe == 0) {
					mpsafe = 1;
					rel_mplock();
				}
			} else {
				/*
				 * The request might be removed while we
				 * are waiting to get the MP lock.  If it
				 * was removed, sc->next will point to the
				 * next valid request or NULL; loop up.
				 */
				if (mpsafe) {
					mpsafe = 0;
					sc->next = c;
					get_mplock();
					if (c != sc->next)
						continue;
				}
			}

			/*
			 * Queue protection only exists while we hold the
			 * critical section uninterrupted.
			 *
			 * Adjust sc->next when removing (c) from the queue;
			 * note that an IPI on this cpu may make further
			 * adjustments to sc->next.
			 */
			sc->next = TAILQ_NEXT(c, c_links.tqe);
			TAILQ_REMOVE(bucket, c, c_links.tqe);

			KASSERT((c->c_flags & CALLOUT_ARMED) &&
				(c->c_flags & CALLOUT_PENDING) &&
				CALLOUT_FLAGS_TO_CPU(c->c_flags) ==
				mycpu->gd_cpuid,
				("callout %p: bad flags %08x", c, c->c_flags));

			/*
			 * Once CALLOUT_PENDING is cleared, sc->running
			 * protects the callout structure's existence but
			 * only until we call c_func().  A callout_stop()
			 * or callout_reset() issued from within c_func()
			 * will not block.  The callout can also be kfree()d
			 * by c_func().
			 *
			 * We set EXECUTED before calling c_func() so a
			 * callout_stop() issued from within c_func() returns
			 * the correct status.
			 */
			if ((flags & (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) ==
			    (CALLOUT_AUTOLOCK | CALLOUT_ACTIVE)) {
				void (*c_func)(void *);
				void *c_arg;
				struct lock *c_lk;
				int error;

				/*
				 * NOTE: sc->running must be set prior to
				 *	 CALLOUT_PENDING being cleared to
				 *	 avoid missed CANCELs and *_stop()
				 *	 races.
				 */
				sc->running = (intptr_t)c;
				c_func = c->c_func;
				c_arg = c->c_arg;
				c_lk = c->c_lk;
				c->c_func = NULL;
				KKASSERT(c->c_flags & CALLOUT_DID_INIT);
				flags = callout_unpend_disarm(c);
				error = lockmgr(c_lk, LK_EXCLUSIVE |
						      LK_CANCELABLE);
				if (error == 0) {
					atomic_set_int(&c->c_flags,
						       CALLOUT_EXECUTED);
					crit_exit();
					c_func(c_arg);
					crit_enter();
					lockmgr(c_lk, LK_RELEASE);
				}
			} else if (flags & CALLOUT_ACTIVE) {
				void (*c_func)(void *);
				void *c_arg;

				sc->running = (intptr_t)c;
				c_func = c->c_func;
				c_arg = c->c_arg;
				c->c_func = NULL;
				KKASSERT(c->c_flags & CALLOUT_DID_INIT);
				flags = callout_unpend_disarm(c);
				atomic_set_int(&c->c_flags, CALLOUT_EXECUTED);
				crit_exit();
				c_func(c_arg);
				crit_enter();
			} else {
				flags = callout_unpend_disarm(c);
			}

			/*
			 * Read and clear sc->running.  If bit 0 was set,
			 * a callout_stop() is likely blocked waiting for
			 * the callback to complete.
			 *
			 * The callout_unpend_disarm() above also cleared
			 * CALLOUT_WAITING and returned the contents of
			 * flags prior to clearing any bits.
			 *
			 * Wake up any *_stop() waiting on us (interlocked).
			 * Note that once c_func() was called, the callout
			 * structure (c) pointer may no longer be valid.  It
			 * can only be used for the wakeup.
			 */
			if ((atomic_readandclear_ptr(&sc->running) & 1) ||
			    (flags & CALLOUT_WAITING)) {
				wakeup(c);
			}
			/* NOTE: list may have changed */
		}
		++sc->softticks;
	}

	/*
	 * Don't leave us holding the MP lock when we deschedule ourselves.
	 */
	if (mpsafe == 0) {
		mpsafe = 1;
		rel_mplock();
	}
	sc->isrunning = 0;
	lwkt_deschedule_self(&sc->thread);	/* == curthread */
	lwkt_switch();
	goto loop;
	/* NOT REACHED */
}

/*
 * A very slow system cleanup timer (10 second interval),
 * per-cpu.
 */
void
slotimer_callback(void *arg)
{
	struct callout *c = arg;

	slab_cleanup();
	callout_reset(c, hz * 10, slotimer_callback, c);
}
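
/*
 * slotimer_callback() above illustrates the common self-rearming pattern:
 * a periodic timer simply calls callout_reset() on itself from within its
 * own callback, which is explicitly allowed and will not block.  A minimal
 * sketch for a hypothetical driver (foo_tick, foo_softc and
 * foo_poll_hardware are placeholder names):
 *
 *	static void
 *	foo_tick(void *arg)
 *	{
 *		struct foo_softc *sc = arg;
 *
 *		foo_poll_hardware(sc);
 *		callout_reset(&sc->foo_timer, hz, foo_tick, sc);
 *	}
 */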

/*
 * Start or restart a timeout.  Installs the callout structure on the
 * callwheel.  Callers may legally pass any value, even if 0 or negative,
 * but since the sc->curticks index may have already been processed, a
 * minimum timeout of 1 tick will be enforced.
 *
 * This function will block if the callout is currently queued to a different
 * cpu or the callback is currently running in another thread.
 */
void
callout_reset(struct callout *c, int to_ticks, void (*ftn)(void *), void *arg)
{
	softclock_pcpu_t sc;
	globaldata_t gd;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
		callout_init(c);
		kprintf(
		    "callout_reset(%p) from %p: callout was not initialized\n",
		    c, ((int **)&c)[-1]);
		print_backtrace(-1);
	}
#endif
	gd = mycpu;
	sc = &softclock_pcpu_ary[gd->gd_cpuid];
	crit_enter_gd(gd);

	/*
	 * Our cpu must gain ownership of the callout and cancel anything
	 * still running, which is complex.  The easiest way to do it is to
	 * issue a callout_stop().
	 *
	 * Clearing bits on flags is a way to guarantee they are not set,
	 * as the cmpset atomic op will fail otherwise.  PENDING and ARMED
	 * must not be set; if we find them set we loop up and call
	 * callout_stop_sync() again.
	 */
	for (;;) {
		int flags;
		int nflags;

		callout_stop_sync(c);
		flags = c->c_flags & ~(CALLOUT_PENDING | CALLOUT_ARMED);
		nflags = (flags & ~(CALLOUT_CPU_MASK |
				    CALLOUT_EXECUTED)) |
			 CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid) |
			 CALLOUT_ARMED |
			 CALLOUT_PENDING |
			 CALLOUT_ACTIVE;
		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
			break;
	}

	if (to_ticks <= 0)
		to_ticks = 1;

	c->c_arg = arg;
	c->c_func = ftn;
	c->c_time = sc->curticks + to_ticks;

	TAILQ_INSERT_TAIL(&sc->callwheel[c->c_time & cwheelmask],
			  c, c_links.tqe);
	crit_exit_gd(gd);
}
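
/*
 * Typical one-shot usage sketch for a hypothetical consumer (foo_timer and
 * foo_timeout are placeholder names, not part of this file): initialize
 * once, then arm a timer that fires roughly one second later.
 *
 *	callout_init_mp(&sc->foo_timer);
 *	callout_reset(&sc->foo_timer, hz, foo_timeout, sc);
 *
 * Re-arming an already-armed callout is legal; the previous timeout is
 * implicitly cancelled by the callout_stop_sync() performed above.
 */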

/*
 * Set up a callout to run on the specified cpu.  Should generally be used
 * to run a callout on a specific cpu which does not nominally change.
 */
void
callout_reset_bycpu(struct callout *c, int to_ticks, void (*ftn)(void *),
		    void *arg, int cpuid)
{
	globaldata_t gd;
	globaldata_t tgd;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
		callout_init(c);
		kprintf(
		    "callout_reset(%p) from %p: callout was not initialized\n",
		    c, ((int **)&c)[-1]);
		print_backtrace(-1);
	}
#endif
	gd = mycpu;
	crit_enter_gd(gd);

	tgd = globaldata_find(cpuid);

	/*
	 * Our cpu must temporarily gain ownership of the callout and cancel
	 * anything still running, which is complex.  The easiest way to do
	 * it is to issue a callout_stop().
	 *
	 * Clearing bits on flags (vs nflags) is a way to guarantee they were
	 * not previously set, by forcing the atomic op to fail.  The callout
	 * must not be pending or armed after the stop_sync; if it is we have
	 * to loop up and stop_sync() again.
	 */
	for (;;) {
		int flags;
		int nflags;

		callout_stop_sync(c);
		flags = c->c_flags & ~(CALLOUT_PENDING | CALLOUT_ARMED);
		nflags = (flags & ~(CALLOUT_CPU_MASK |
				    CALLOUT_EXECUTED)) |
			 CALLOUT_CPU_TO_FLAGS(tgd->gd_cpuid) |
			 CALLOUT_ARMED |
			 CALLOUT_ACTIVE;
		nflags = nflags + 1;		/* bump IPI count */
		if (atomic_cmpset_int(&c->c_flags, flags, nflags))
			break;
		cpu_pause();
	}

	/*
	 * Even though we are not the cpu that now owns the callout, bumping
	 * the IPI count (while the callout is not queued to the callwheel)
	 * prevents anyone else from depending on or acting on the contents
	 * of the callout structure.
	 */
	if (to_ticks <= 0)
		to_ticks = 1;

	c->c_arg = arg;
	c->c_func = ftn;
	c->c_load = to_ticks;	/* IPI will add curticks */

	lwkt_send_ipiq(tgd, callout_reset_ipi, c);
	crit_exit_gd(gd);
}
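
/*
 * Usage sketch (hypothetical; foo_timer and foo_poll are placeholder
 * names): pin a timer to cpu 0 so its callback always executes there.
 *
 *	callout_init_mp(&foo_timer);
 *	callout_reset_bycpu(&foo_timer, hz, foo_poll, NULL, 0);
 */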

/*
 * Remote IPI for callout_reset_bycpu().  The operation is performed only
 * on the 1->0 transition of the counter; otherwise there are callout_stop()s
 * pending after us.
 *
 * The IPI counter and PENDING flags must be set atomically with the
 * 1->0 transition.  The ACTIVE flag was set prior to the IPI being
 * sent and we do not want to race a caller on the original cpu trying
 * to deactivate() the flag concurrent with our installation of the
 * callout.
 */
static void
callout_reset_ipi(void *arg)
{
	struct callout *c = arg;
	globaldata_t gd = mycpu;
	globaldata_t tgd;
	int flags;
	int nflags;

	for (;;) {
		flags = c->c_flags;
		cpu_ccfence();
		KKASSERT((flags & CALLOUT_IPI_MASK) > 0);

		/*
		 * We should already be armed for our cpu; if armed to
		 * another cpu, chain the IPI.  If for some reason we are
		 * not armed, we can arm ourselves.
		 */
		if (flags & CALLOUT_ARMED) {
			if (CALLOUT_FLAGS_TO_CPU(flags) != gd->gd_cpuid) {
				tgd = globaldata_find(
						CALLOUT_FLAGS_TO_CPU(flags));
				lwkt_send_ipiq(tgd, callout_reset_ipi, c);
				return;
			}
			nflags = (flags & ~CALLOUT_EXECUTED);
		} else {
			nflags = (flags & ~(CALLOUT_CPU_MASK |
					    CALLOUT_EXECUTED)) |
				 CALLOUT_ARMED |
				 CALLOUT_CPU_TO_FLAGS(gd->gd_cpuid);
		}

		/*
		 * Decrement the IPI count.  On the last IPI, clear the
		 * WAITING and EXECUTED status (the prior WAITING state is
		 * retained in the local flags for the wakeup below) and
		 * mark the callout PENDING.
		 *
		 * NOTE: It is possible for the callout to already have been
		 *	 marked pending due to SMP races.
		 */
		nflags = nflags - 1;
		if ((flags & CALLOUT_IPI_MASK) == 1) {
			nflags &= ~(CALLOUT_WAITING | CALLOUT_EXECUTED);
			nflags |= CALLOUT_PENDING;
		}

		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
			/*
			 * Only install the callout on the 1->0 transition
			 * of the IPI count, and only if PENDING was not
			 * already set.  The latter situation should never
			 * occur but we check anyway.
			 */
			if ((flags & (CALLOUT_PENDING|CALLOUT_IPI_MASK)) == 1) {
				softclock_pcpu_t sc;

				sc = &softclock_pcpu_ary[gd->gd_cpuid];
				c->c_time = sc->curticks + c->c_load;
				TAILQ_INSERT_TAIL(
					&sc->callwheel[c->c_time & cwheelmask],
					c, c_links.tqe);
			}
			break;
		}
		/* retry */
		cpu_pause();
	}

	/*
	 * Issue wakeup if requested.
	 */
	if (flags & CALLOUT_WAITING)
		wakeup(c);
}

/*
 * Stop a running timer and ensure that any running callout completes before
 * returning.  If the timer is running on another cpu this function may block
 * to interlock against the callout.  If the callout is currently executing
 * or blocked in another thread this function may also block to interlock
 * against the callout.
 *
 * The caller must be careful to avoid deadlocks, either by using
 * callout_init_lk() (which uses the lockmgr lock cancelation feature),
 * by using tokens and dealing with breaks in the serialization, or by
 * using the lockmgr lock cancelation feature themselves in the callout
 * callback function.
 *
 * callout_stop() returns non-zero if the callout was pending.
 */
static int
_callout_stop(struct callout *c, int issync)
{
	globaldata_t gd = mycpu;
	globaldata_t tgd;
	softclock_pcpu_t sc;
	int flags;
	int nflags;
	int rc;
	int cpuid;

#ifdef INVARIANTS
        if ((c->c_flags & CALLOUT_DID_INIT) == 0) {
		callout_init(c);
		kprintf(
		    "callout_stop(%p) from %p: callout was not initialized\n",
		    c, ((int **)&c)[-1]);
		print_backtrace(-1);
	}
#endif
	crit_enter_gd(gd);

	/*
	 * Fast path operations:
	 *
	 * If ARMED and owned by our cpu, or not ARMED, and other simple
	 * conditions are met, we can just clear ACTIVE and EXECUTED
	 * and we are done.
	 */
	for (;;) {
		flags = c->c_flags;
		cpu_ccfence();

		cpuid = CALLOUT_FLAGS_TO_CPU(flags);

		/*
		 * Can't handle an armed callout in the fast path if it is
		 * not on the current cpu.  We must atomically increment the
		 * IPI count for the IPI we intend to send and break out of
		 * the fast path to enter the slow path.
		 */
		if (flags & CALLOUT_ARMED) {
			if (gd->gd_cpuid != cpuid) {
				nflags = flags + 1;
				if (atomic_cmpset_int(&c->c_flags,
						      flags, nflags)) {
					/* break to slow path */
					break;
				}
				continue;	/* retry */
			}
		} else {
			cpuid = gd->gd_cpuid;
			KKASSERT((flags & CALLOUT_IPI_MASK) == 0);
			KKASSERT((flags & CALLOUT_PENDING) == 0);
		}

		/*
		 * Process pending IPIs and retry (only if not called from
		 * an IPI).
		 */
		if (flags & CALLOUT_IPI_MASK) {
			lwkt_process_ipiq();
			continue;	/* retry */
		}

		/*
		 * Transition to the stopped state, recover the EXECUTED
		 * status.  If pending we cannot clear ARMED until after
		 * we have removed (c) from the callwheel.
		 *
		 * NOTE: The callout might already not be armed but in this
		 *	 case it should also not be pending.
		 */
		nflags = flags & ~(CALLOUT_ACTIVE |
				   CALLOUT_EXECUTED |
				   CALLOUT_WAITING |
				   CALLOUT_PENDING);

		/* NOTE: IPI_MASK already tested */
		if ((flags & CALLOUT_PENDING) == 0)
			nflags &= ~CALLOUT_ARMED;
		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
			/*
			 * Can only remove from callwheel if currently
			 * pending.
			 */
			if (flags & CALLOUT_PENDING) {
				sc = &softclock_pcpu_ary[gd->gd_cpuid];
				if (sc->next == c)
					sc->next = TAILQ_NEXT(c, c_links.tqe);
				TAILQ_REMOVE(
					&sc->callwheel[c->c_time & cwheelmask],
					c,
					c_links.tqe);
				c->c_func = NULL;

				/*
				 * NOTE: Can't clear ARMED until we have
				 *	 physically removed (c) from the
				 *	 callwheel.
				 *
				 * NOTE: WAITING bit race exists when doing
				 *	 unconditional bit clears.
				 */
				callout_maybe_clear_armed(c);
				if (c->c_flags & CALLOUT_WAITING)
					flags |= CALLOUT_WAITING;
			}

			/*
			 * ARMED has been cleared at this point and (c)
			 * might now be stale.  Only good for wakeup()s.
			 */
			if (flags & CALLOUT_WAITING)
				wakeup(c);

			goto skip_slow;
		}
		/* retry */
	}

	/*
	 * Slow path (and not called via an IPI).
	 *
	 * When ARMED to a different cpu the stop must be processed on that
	 * cpu.  Issue the IPI and wait for completion.  We have already
	 * incremented the IPI count.
	 */
	tgd = globaldata_find(cpuid);
	lwkt_send_ipiq3(tgd, callout_stop_ipi, c, issync);

	for (;;) {
		int flags;
		int nflags;

		flags = c->c_flags;
		cpu_ccfence();
		if ((flags & CALLOUT_IPI_MASK) == 0)	/* fast path */
			break;
		nflags = flags | CALLOUT_WAITING;
		tsleep_interlock(c, 0);
		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
			tsleep(c, PINTERLOCKED, "cstp1", 0);
		}
	}

skip_slow:

	/*
	 * If (issync) we must also wait for any in-progress callbacks to
	 * complete, unless the stop is being executed from the callback
	 * itself.  The EXECUTED flag is set prior to the callback
	 * being made so our existing flags status already has it.
	 *
	 * If auto-lock mode is being used, this is where we cancel any
	 * blocked lock that is potentially preventing the target cpu
	 * from completing the callback.
	 */
	while (issync) {
		intptr_t *runp;
		intptr_t runco;

		sc = &softclock_pcpu_ary[cpuid];
		if (gd->gd_curthread == &sc->thread)	/* stop from cb */
			break;
		runp = &sc->running;
		runco = *runp;
		cpu_ccfence();
		if ((runco & ~(intptr_t)1) != (intptr_t)c)
			break;
		if (c->c_flags & CALLOUT_AUTOLOCK)
			lockmgr(c->c_lk, LK_CANCEL_BEG);
		tsleep_interlock(c, 0);
		if (atomic_cmpset_long(runp, runco, runco | 1))
			tsleep(c, PINTERLOCKED, "cstp3", 0);
		if (c->c_flags & CALLOUT_AUTOLOCK)
			lockmgr(c->c_lk, LK_CANCEL_END);
	}

	crit_exit_gd(gd);
	rc = (flags & CALLOUT_EXECUTED) != 0;

	return rc;
}

static
void
callout_stop_ipi(void *arg, int issync, struct intrframe *frame)
{
	globaldata_t gd = mycpu;
	struct callout *c = arg;
	softclock_pcpu_t sc;

	/*
	 * Only the fast path can run in an IPI.  Chain the stop request
	 * if we are racing cpu changes.
	 */
	for (;;) {
		globaldata_t tgd;
		int flags;
		int nflags;
		int cpuid;

		flags = c->c_flags;
		cpu_ccfence();

		/*
		 * Can't handle an armed callout in the fast path if it is
		 * not on the current cpu.  We must atomically increment the
		 * IPI count and break out of the fast path.
		 *
		 * If called from an IPI we chain the IPI instead.
		 */
		if (flags & CALLOUT_ARMED) {
			cpuid = CALLOUT_FLAGS_TO_CPU(flags);
			if (gd->gd_cpuid != cpuid) {
				tgd = globaldata_find(cpuid);
				lwkt_send_ipiq3(tgd, callout_stop_ipi,
						c, issync);
				break;
			}
		}

		/*
		 * NOTE: As an IPI ourselves we cannot wait for other IPIs
		 *	 to complete, and we are being executed in-order.
		 */

		/*
		 * Transition to the stopped state, recover the EXECUTED
		 * status, decrement the IPI count.  If pending we cannot
		 * clear ARMED until after we have removed (c) from the
		 * callwheel, and only if there are no more IPIs pending.
		 */
		nflags = flags & ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
		nflags = nflags - 1;			/* dec ipi count */
		if ((flags & (CALLOUT_IPI_MASK | CALLOUT_PENDING)) == 1)
			nflags &= ~CALLOUT_ARMED;
		if ((flags & CALLOUT_IPI_MASK) == 1)
			nflags &= ~(CALLOUT_WAITING | CALLOUT_EXECUTED);

		if (atomic_cmpset_int(&c->c_flags, flags, nflags)) {
			/*
			 * Can only remove from callwheel if currently
			 * pending.
			 */
			if (flags & CALLOUT_PENDING) {
				sc = &softclock_pcpu_ary[gd->gd_cpuid];
				if (sc->next == c)
					sc->next = TAILQ_NEXT(c, c_links.tqe);
				TAILQ_REMOVE(
					&sc->callwheel[c->c_time & cwheelmask],
					c,
					c_links.tqe);
				c->c_func = NULL;

				/*
				 * NOTE: Can't clear ARMED until we have
				 *	 physically removed (c) from the
				 *	 callwheel.
				 *
				 * NOTE: WAITING bit race exists when doing
				 *	 unconditional bit clears.
				 */
				callout_maybe_clear_armed(c);
				if (c->c_flags & CALLOUT_WAITING)
					flags |= CALLOUT_WAITING;
			}

			/*
			 * ARMED has been cleared at this point and (c)
			 * might now be stale.  Only good for wakeup()s.
			 */
			if (flags & CALLOUT_WAITING)
				wakeup(c);
			break;
		}
		/* retry */
	}
}

int
callout_stop(struct callout *c)
{
	return _callout_stop(c, 0);
}

int
callout_stop_sync(struct callout *c)
{
	return _callout_stop(c, 1);
}

void
callout_stop_async(struct callout *c)
{
	_callout_stop(c, 0);
}

void
callout_terminate(struct callout *c)
{
	_callout_stop(c, 1);
	atomic_clear_int(&c->c_flags, CALLOUT_DID_INIT);
}
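
/*
 * Teardown sketch for a hypothetical consumer: before freeing a structure
 * that embeds a callout, stop it synchronously so no callback can still be
 * running, or about to run, when the memory is released (foo_timer and
 * M_FOO are placeholder names).
 *
 *	callout_terminate(&sc->foo_timer);
 *	kfree(sc, M_FOO);
 */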

/*
 * Prepare a callout structure for use by callout_reset() and/or
 * callout_stop().
 *
 * The MP version of this routine requires that the callback
 * function installed by callout_reset() be MP safe.
 *
 * The LK version of this routine is also MPsafe and will automatically
 * acquire the specified lock for the duration of the function call,
 * and release it after the function returns.  In addition, when autolocking
 * is used, callout_stop() becomes synchronous if the caller owns the lock.
 * callout_reset(), callout_stop(), and callout_stop_sync() will block
 * normally instead of spinning when a cpu race occurs.  Lock cancelation
 * is used to avoid deadlocks against the callout ring dispatch.
 *
 * The init functions can be called from any cpu and do not have to be
 * called from the cpu that the timer will eventually run on.
 */
static __inline
void
_callout_init(struct callout *c, int flags)
{
	bzero(c, sizeof *c);
	c->c_flags = flags;
}

void
callout_init(struct callout *c)
{
	_callout_init(c, CALLOUT_DID_INIT);
}

void
callout_init_mp(struct callout *c)
{
	_callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE);
}

void
callout_init_lk(struct callout *c, struct lock *lk)
{
	_callout_init(c, CALLOUT_DID_INIT | CALLOUT_MPSAFE | CALLOUT_AUTOLOCK);
	c->c_lk = lk;
}
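
/*
 * Autolock usage sketch (hypothetical; foo_lk, foo_timer and foo_timeout
 * are placeholder names): the callback then runs with foo_lk held, and a
 * callout_stop()/callout_stop_sync() against the callout uses lock
 * cancelation to avoid deadlocking against a callback blocked on the
 * same lock.
 *
 *	(given an already-initialized lockmgr lock foo_lk)
 *	callout_init_lk(&foo_timer, &foo_lk);
 *	callout_reset(&foo_timer, hz, foo_timeout, NULL);
 */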
1051