xref: /dragonfly/sys/kern/kern_timeout.c (revision c9c5aa9e)
1 /*
2  * Copyright (c) 2004,2014,2019-2020 The DragonFly Project.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to The DragonFly Project
6  * by Matthew Dillon <dillon@backplane.com>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * Copyright (c) 1982, 1986, 1991, 1993
37  *	The Regents of the University of California.  All rights reserved.
38  * (c) UNIX System Laboratories, Inc.
39  * All or some portions of this file are derived from material licensed
40  * to the University of California by American Telephone and Telegraph
41  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
42  * the permission of UNIX System Laboratories, Inc.
43  *
44  * Redistribution and use in source and binary forms, with or without
45  * modification, are permitted provided that the following conditions
46  * are met:
47  * 1. Redistributions of source code must retain the above copyright
48  *    notice, this list of conditions and the following disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  * 3. Neither the name of the University nor the names of its contributors
53  *    may be used to endorse or promote products derived from this software
54  *    without specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
66  * SUCH DAMAGE.
67  */
68 /*
69  * The original callout mechanism was based on the work of Adam M. Costello
70  * and George Varghese, published in a technical report entitled "Redesigning
71  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
72  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
73  * used in this implementation was published by G. Varghese and T. Lauck in
74  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
75  * the Efficient Implementation of a Timer Facility" in the Proceedings of
76  * the 11th ACM Annual Symposium on Operating Systems Principles,
77  * Austin, Texas Nov 1987.
78  */
79 
80 #include <sys/param.h>
81 #include <sys/systm.h>
82 #include <sys/spinlock.h>
83 #include <sys/callout.h>
84 #include <sys/kernel.h>
85 #include <sys/malloc.h>
86 #include <sys/interrupt.h>
87 #include <sys/thread.h>
88 #include <sys/sysctl.h>
89 #include <sys/exislock.h>
90 #include <vm/vm_extern.h>
91 #include <machine/atomic.h>
92 
93 #include <sys/spinlock2.h>
94 #include <sys/thread2.h>
95 #include <sys/mplock2.h>
96 #include <sys/exislock2.h>
97 
98 TAILQ_HEAD(colist, _callout);
99 struct softclock_pcpu;
100 
101 /*
102  * DID_INIT	- Sanity check
103  * PREVENTED	- A callback was prevented
104  * RESET	- Callout_reset requested
105  * STOP		- Callout_stop requested
106  * INPROG	- Softclock_handler thread processing in-progress on callout,
107  *		  queue linkage is indeterminant.  Third parties must queue
108  *		  a STOP or CANCEL and await completion.
109  * SET		- Callout is linked to queue (if INPROG not set)
110  * AUTOLOCK	- Lockmgr cancelable interlock (copied from frontend)
111  * MPSAFE	- Callout is MPSAFE (copied from frontend)
112  * CANCEL	- callout_cancel requested
113  * ACTIVE	- active/inactive (frontend only, see documentation).
114  *		  This is *NOT* the same as whether a callout is queued or
115  *		  not.
116  */
117 #define CALLOUT_DID_INIT	0x00000001	/* frontend */
118 #define CALLOUT_PREVENTED	0x00000002	/* backend */
119 #define CALLOUT_FREELIST	0x00000004	/* backend */
120 #define CALLOUT_UNUSED0008	0x00000008
121 #define CALLOUT_UNUSED0010	0x00000010
122 #define CALLOUT_RESET		0x00000020	/* backend */
123 #define CALLOUT_STOP		0x00000040	/* backend */
124 #define CALLOUT_INPROG		0x00000080	/* backend */
125 #define CALLOUT_SET		0x00000100	/* backend */
126 #define CALLOUT_AUTOLOCK	0x00000200	/* both */
127 #define CALLOUT_MPSAFE		0x00000400	/* both */
128 #define CALLOUT_CANCEL		0x00000800	/* backend */
129 #define CALLOUT_ACTIVE		0x00001000	/* frontend */
130 
131 struct wheel {
132 	struct spinlock spin;
133 	struct colist	list;
134 };
135 
136 struct softclock_pcpu {
137 	struct wheel	*callwheel;
138 	struct _callout *running;
139 	struct _callout * volatile next;
140 	struct colist	freelist;
141 	int		softticks;	/* softticks index */
142 	int		curticks;	/* per-cpu ticks counter */
143 	int		isrunning;
144 	struct thread	thread;
145 };
146 
147 typedef struct softclock_pcpu *softclock_pcpu_t;
148 
149 static int callout_debug = 0;
150 SYSCTL_INT(_debug, OID_AUTO, callout_debug, CTLFLAG_RW,
151 	   &callout_debug, 0, "");
152 
153 static MALLOC_DEFINE(M_CALLOUT, "callouts", "softclock callouts");
154 
155 static int cwheelsize;
156 static int cwheelmask;
157 static softclock_pcpu_t softclock_pcpu_ary[MAXCPU];
158 
159 static void softclock_handler(void *arg);
160 static void slotimer_callback(void *arg);
161 
162 /*
163  * Handle pending requests.  No action can be taken if the callout is still
164  * flagged INPROG.  Called from softclock for post-processing and from
165  * various API functions.
166  *
167  * This routine does not block in any way.
168  * Caller must hold c->spin.
169  *
170  * NOTE: Flags can be adjusted without holding c->spin, so atomic ops
171  *	 must be used at all times.
172  *
173  * NOTE: The related (sc) might refer to another cpu.
174  *
175  * NOTE: The cc-vs-c frontend-vs-backend might be disconnected during the
176  *	 operation, but the EXIS lock prevents (c) from being destroyed.
177  */
178 static __inline
179 void
180 _callout_update_spinlocked(struct _callout *c)
181 {
182 	struct wheel *wheel;
183 
184 	if ((c->flags & CALLOUT_INPROG) && curthread != &c->qsc->thread) {
185 		/*
186 		 * If the callout is in-progress the SET queuing state is
187 		 * indeterminant and no action can be taken at this time.
188 		 *
189 		 * (however, recursive calls from the call-back are not
190 		 * indeterminant and must be processed at this time).
191 		 */
192 		/* nop */
193 	} else if (c->flags & CALLOUT_SET) {
194 		/*
195 		 * If the callout is SET it is queued on a callwheel, process
196 		 * various requests relative to it being in this queued state.
197 		 *
198 		 * c->q* fields are stable while we hold c->spin and
199 		 * wheel->spin.
200 		 */
201 		softclock_pcpu_t sc;
202 
203 		sc = c->qsc;
204 		wheel = &sc->callwheel[c->qtick & cwheelmask];
205 		spin_lock(&wheel->spin);
206 
207 		if ((c->flags & CALLOUT_INPROG) &&
208 		    curthread != &c->qsc->thread) {
209 			/*
210 			 * Raced against INPROG getting set by the softclock
211 			 * handler while we were acquiring wheel->spin.  We
212 			 * can do nothing at this time.
213 			 *
214 			 * (however, recursive calls from the call-back are not
215 			 * indeterminant and must be processed at this time).
216 			 */
217 			/* nop */
218 		} else if (c->flags & CALLOUT_CANCEL) {
219 			/*
220 			 * CANCEL requests override everything else.
221 			 */
222 			if (sc->next == c)
223 				sc->next = TAILQ_NEXT(c, entry);
224 			TAILQ_REMOVE(&wheel->list, c, entry);
225 			atomic_clear_int(&c->flags, CALLOUT_SET |
226 						    CALLOUT_STOP |
227 						    CALLOUT_CANCEL |
228 						    CALLOUT_RESET);
229 			atomic_set_int(&c->flags, CALLOUT_PREVENTED);
230 			if (c->waiters)
231 				wakeup(c);
232 		} else if (c->flags & CALLOUT_RESET) {
233 			/*
234 			 * RESET requests reload the callout, potentially
235 			 * to a different cpu.  Once removed from the wheel,
236 			 * the retention of c->spin prevents further races.
237 			 *
238 			 * Leave SET intact.
239 			 */
240 			if (sc->next == c)
241 				sc->next = TAILQ_NEXT(c, entry);
242 			TAILQ_REMOVE(&wheel->list, c, entry);
243 			spin_unlock(&wheel->spin);
244 
245 			atomic_clear_int(&c->flags, CALLOUT_RESET);
246 			sc = c->rsc;
247 			c->qsc = sc;
248 			c->qarg = c->rarg;
249 			c->qfunc = c->rfunc;
250 			c->qtick = c->rtick;
251 
252 			/*
253 			 * Do not queue to a current or past wheel slot or
254 			 * the callout will be lost for ages.  Handle
255 			 * potential races against soft ticks.
256 			 */
257 			wheel = &sc->callwheel[c->qtick & cwheelmask];
258 			spin_lock(&wheel->spin);
259 			while (c->qtick - sc->softticks <= 0) {
260 				c->qtick = sc->softticks + 1;
261 				spin_unlock(&wheel->spin);
262 				wheel = &sc->callwheel[c->qtick & cwheelmask];
263 				spin_lock(&wheel->spin);
264 			}
265 			TAILQ_INSERT_TAIL(&wheel->list, c, entry);
266 		} else if (c->flags & CALLOUT_STOP) {
267 			/*
268 			 * STOP request simply unloads the callout.
269 			 */
270 			if (sc->next == c)
271 				sc->next = TAILQ_NEXT(c, entry);
272 			TAILQ_REMOVE(&wheel->list, c, entry);
273 			atomic_clear_int(&c->flags, CALLOUT_STOP |
274 						    CALLOUT_SET);
275 
276 			atomic_set_int(&c->flags, CALLOUT_PREVENTED);
277 			if (c->waiters)
278 				wakeup(c);
279 		} else {
280 			/*
281 			 * Do nothing if no request is pending.
282 			 */
283 			/* nop */
284 		}
285 		spin_unlock(&wheel->spin);
286 	} else {
287 		/*
288 		 * If the callout is not SET it is not queued to any callwheel,
289 		 * process various requests relative to it not being queued.
290 		 *
291 		 * c->q* fields are stable while we hold c->spin.
292 		 */
293 		if (c->flags & CALLOUT_CANCEL) {
294 			/*
295 			 * CANCEL requests override everything else.
296 			 *
297 			 * There is no state being canceled in this case,
298 			 * so do not set the PREVENTED flag.
299 			 */
300 			atomic_clear_int(&c->flags, CALLOUT_STOP |
301 						    CALLOUT_CANCEL |
302 						    CALLOUT_RESET);
303 			if (c->waiters)
304 				wakeup(c);
305 		} else if (c->flags & CALLOUT_RESET) {
306 			/*
307 			 * RESET requests get queued.  Do not queue to the
308 			 * currently-processing tick.
309 			 */
310 			softclock_pcpu_t sc;
311 
312 			sc = c->rsc;
313 			c->qsc = sc;
314 			c->qarg = c->rarg;
315 			c->qfunc = c->rfunc;
316 			c->qtick = c->rtick;
317 
318 			/*
319 			 * Do not queue to current or past wheel or the
320 			 * callout will be lost for ages.
321 			 */
322 			wheel = &sc->callwheel[c->qtick & cwheelmask];
323 			spin_lock(&wheel->spin);
324 			while (c->qtick - sc->softticks <= 0) {
325 				c->qtick = sc->softticks + 1;
326 				spin_unlock(&wheel->spin);
327 				wheel = &sc->callwheel[c->qtick & cwheelmask];
328 				spin_lock(&wheel->spin);
329 			}
330 			TAILQ_INSERT_TAIL(&wheel->list, c, entry);
331 			atomic_clear_int(&c->flags, CALLOUT_RESET);
332 			atomic_set_int(&c->flags, CALLOUT_SET);
333 			spin_unlock(&wheel->spin);
334 		} else if (c->flags & CALLOUT_STOP) {
335 			/*
336 			 * STOP requests.
337 			 *
338 			 * There is no state being stopped in this case,
339 			 * so do not set the PREVENTED flag.
340 			 */
341 			atomic_clear_int(&c->flags, CALLOUT_STOP);
342 			if (c->waiters)
343 				wakeup(c);
344 		} else {
345 			/*
346 			 * No request pending (someone else processed the
347 			 * request before we could)
348 			 */
349 			/* nop */
350 		}
351 	}
352 }
353 
354 static __inline
355 void
356 _callout_free(struct _callout *c)
357 {
358 	softclock_pcpu_t sc;
359 
360 	sc = softclock_pcpu_ary[mycpu->gd_cpuid];
361 
362 	crit_enter();
363 	exis_terminate(&c->exis);
364 	atomic_set_int(&c->flags, CALLOUT_FREELIST);
365 	atomic_clear_int(&c->flags, CALLOUT_DID_INIT);
366 	TAILQ_INSERT_TAIL(&sc->freelist, c, entry);
367 	crit_exit();
368 }
369 
370 /*
371  * System init
372  */
373 static void
374 swi_softclock_setup(void *arg)
375 {
376 	int cpu;
377 	int i;
378 	int target;
379 
380 	/*
381 	 * Figure out how large a callwheel we need.  It must be a power of 2.
382 	 *
383 	 * ncallout is primarily based on available memory, don't explode
384 	 * the allocations if the system has a lot of cpus.
385 	 */
386 	target = ncallout / ncpus + 16;
387 
388 	cwheelsize = 1;
389 	while (cwheelsize < target)
390 		cwheelsize <<= 1;
391 	cwheelmask = cwheelsize - 1;
392 
393 	/*
394 	 * Initialize per-cpu data structures.
395 	 */
396 	for (cpu = 0; cpu < ncpus; ++cpu) {
397 		softclock_pcpu_t sc;
398 		int wheel_sz;
399 
400 		sc = (void *)kmem_alloc3(&kernel_map, sizeof(*sc),
401 					 VM_SUBSYS_GD, KM_CPU(cpu));
402 		memset(sc, 0, sizeof(*sc));
403 		TAILQ_INIT(&sc->freelist);
404 		softclock_pcpu_ary[cpu] = sc;
405 
406 		wheel_sz = sizeof(*sc->callwheel) * cwheelsize;
407 		sc->callwheel = (void *)kmem_alloc3(&kernel_map, wheel_sz,
408 						    VM_SUBSYS_GD, KM_CPU(cpu));
409 		memset(sc->callwheel, 0, wheel_sz);
410 		for (i = 0; i < cwheelsize; ++i) {
411 			spin_init(&sc->callwheel[i].spin, "wheel");
412 			TAILQ_INIT(&sc->callwheel[i].list);
413 		}
414 
415 		/*
416 		 * Mark the softclock handler as being an interrupt thread
417 		 * even though it really isn't, but do not allow it to
418 		 * preempt other threads (do not assign td_preemptable).
419 		 *
420 		 * Kernel code now assumes that callouts do not preempt
421 		 * the cpu they were scheduled on.
422 		 */
423 		lwkt_create(softclock_handler, sc, NULL, &sc->thread,
424 			    TDF_NOSTART | TDF_INTTHREAD,
425 			    cpu, "softclock %d", cpu);
426 	}
427 }
428 
429 /*
430  * Must occur after ncpus has been initialized.
431  */
432 SYSINIT(softclock_setup, SI_BOOT2_SOFTCLOCK, SI_ORDER_SECOND,
433 	swi_softclock_setup, NULL);
434 
435 /*
436  * This routine is called from the hardclock() (basically a FASTint/IPI) on
437  * each cpu in the system.  sc->curticks is this cpu's notion of the timebase.
438  * It IS NOT NECESSARILY SYNCHRONIZED WITH 'ticks'!  sc->softticks is where
439  * the callwheel is currently indexed.
440  *
441  * sc->softticks is adjusted by either this routine or our helper thread
442  * depending on whether the helper thread is running or not.
443  *
444  * sc->curticks and sc->softticks are adjusted using atomic ops in order
445  * to ensure that remote cpu callout installation does not race the thread.
446  */
447 void
448 hardclock_softtick(globaldata_t gd)
449 {
450 	softclock_pcpu_t sc;
451 	struct wheel *wheel;
452 
453 	sc = softclock_pcpu_ary[gd->gd_cpuid];
454 	atomic_add_int(&sc->curticks, 1);
455 	if (sc->isrunning)
456 		return;
457 	if (sc->softticks == sc->curticks) {
458 		/*
459 		 * In sync, only wakeup the thread if there is something to
460 		 * do.
461 		 */
462 		wheel = &sc->callwheel[sc->softticks & cwheelmask];
463 		spin_lock(&wheel->spin);
464 		if (TAILQ_FIRST(&wheel->list)) {
465 			sc->isrunning = 1;
466 			spin_unlock(&wheel->spin);
467 			lwkt_schedule(&sc->thread);
468 		} else {
469 			atomic_add_int(&sc->softticks, 1);
470 			spin_unlock(&wheel->spin);
471 		}
472 	} else {
473 		/*
474 		 * out of sync, wakeup the thread unconditionally so it can
475 		 * catch up.
476 		 */
477 		sc->isrunning = 1;
478 		lwkt_schedule(&sc->thread);
479 	}
480 }
481 
482 /*
483  * This procedure is the main loop of our per-cpu helper thread.  The
484  * sc->isrunning flag prevents us from racing hardclock_softtick().
485  *
486  * The thread starts with the MP lock released and not in a critical
487  * section.  The loop itself is MP safe while individual callbacks
488  * may or may not be, so we obtain or release the MP lock as appropriate.
489  */
490 static void
491 softclock_handler(void *arg)
492 {
493 	softclock_pcpu_t sc;
494 	struct _callout *c;
495 	struct wheel *wheel;
496 	struct callout slotimer1;
497 	struct _callout slotimer2;
498 	int mpsafe = 1;
499 
500 	/*
501 	 * Setup pcpu slow clocks which we want to run from the callout
502 	 * thread.  This thread starts very early and cannot kmalloc(),
503 	 * so use internal functions to supply the _callout.
504 	 */
505 	_callout_setup_quick(&slotimer1, &slotimer2, hz * 10,
506 			     slotimer_callback, &slotimer1);
507 
508 	/*
509 	 * Run the callout thread at the same priority as other kernel
510 	 * threads so it can be round-robined.
511 	 */
512 	/*lwkt_setpri_self(TDPRI_SOFT_NORM);*/
513 
514 	sc = arg;
515 loop:
516 	while (sc->softticks != (int)(sc->curticks + 1)) {
517 		wheel = &sc->callwheel[sc->softticks & cwheelmask];
518 
519 		spin_lock(&wheel->spin);
520 		sc->next = TAILQ_FIRST(&wheel->list);
521 		while ((c = sc->next) != NULL) {
522 			int error;
523 
524 			/*
525 			 * Match callouts for this tick.
526 			 */
527 			sc->next = TAILQ_NEXT(c, entry);
528 			if (c->qtick != sc->softticks)
529 				continue;
530 
531 			/*
532 			 * Double check the validity of the callout, detect
533 			 * if the originator's structure has been ripped out.
534 			 */
535 			if ((uintptr_t)c->verifier < VM_MAX_USER_ADDRESS) {
536 				spin_unlock(&wheel->spin);
537 				panic("_callout %p verifier %p failed "
538 				      "func %p/%p\n",
539 				      c, c->verifier, c->rfunc, c->qfunc);
540 			}
541 
542 			if (c->verifier->toc != c) {
543 				spin_unlock(&wheel->spin);
544 				panic("_callout %p verifier %p failed "
545 				      "func %p/%p\n",
546 				      c, c->verifier, c->rfunc, c->qfunc);
547 			}
548 
549 			/*
550 			 * The wheel spinlock is sufficient to set INPROG and
551 			 * remove (c) from the list.  Once INPROG is set,
552 			 * other threads can only make limited changes to (c).
553 			 *
554 			 * Setting INPROG masks SET tests in all other
555 			 * conditionals except the 'quick' code (which is
556 			 * always same-cpu and doesn't race).  This means
557 			 * that we can clear SET here without obtaining
558 			 * c->spin.
559 			 */
560 			TAILQ_REMOVE(&wheel->list, c, entry);
561 			atomic_set_int(&c->flags, CALLOUT_INPROG);
562 			atomic_clear_int(&c->flags, CALLOUT_SET);
563 			sc->running = c;
564 			spin_unlock(&wheel->spin);
565 
566 			/*
567 			 * Legacy mplock support
568 			 */
569 			if (c->flags & CALLOUT_MPSAFE) {
570 				if (mpsafe == 0) {
571 					mpsafe = 1;
572 					rel_mplock();
573 				}
574 			} else {
575 				if (mpsafe) {
576 					mpsafe = 0;
577 					get_mplock();
578 				}
579 			}
580 
581 			/*
582 			 * Execute the 'q' function (protected by INPROG)
583 			 */
584 			if (c->flags & (CALLOUT_STOP | CALLOUT_CANCEL)) {
585 				/*
586 				 * Raced a stop or cancel request, do
587 				 * not execute.  The processing code
588 				 * thinks its a normal completion so
589 				 * flag the fact that cancel/stop actually
590 				 * prevented a callout here.
591 				 */
592 				if (c->flags &
593 				    (CALLOUT_CANCEL | CALLOUT_STOP)) {
594 					atomic_set_int(&c->verifier->flags,
595 						       CALLOUT_PREVENTED);
596 				}
597 			} else if (c->flags & CALLOUT_RESET) {
598 				/*
599 				 * A RESET raced, make it seem like it
600 				 * didn't.  Do nothing here and let the
601 				 * update procedure requeue us.
602 				 */
603 			} else if (c->flags & CALLOUT_AUTOLOCK) {
604 				/*
605 				 * Interlocked cancelable call.  If the
606 				 * lock gets canceled we have to flag the
607 				 * fact that the cancel/stop actually
608 				 * prevented the callout here.
609 				 */
610 				error = lockmgr(c->lk, LK_EXCLUSIVE |
611 						       LK_CANCELABLE);
612 				if (error == 0) {
613 					c->qfunc(c->qarg);
614 					lockmgr(c->lk, LK_RELEASE);
615 				} else if (c->flags &
616 					   (CALLOUT_CANCEL | CALLOUT_STOP)) {
617 					atomic_set_int(&c->verifier->flags,
618 						       CALLOUT_PREVENTED);
619 				}
620 			} else {
621 				/*
622 				 * Normal call
623 				 */
624 				c->qfunc(c->qarg);
625 			}
626 
627 			/*
628 			 * INPROG will prevent SET from being set again.
629 			 * Once we clear INPROG, update the callout to
630 			 * handle any pending operations that have built-up.
631 			 */
632 
633 			/*
634 			 * Interlocked clearing of INPROG, then handle any
635 			 * queued request (such as a callout_reset() request).
636 			 */
637 			spin_lock(&c->spin);
638 			atomic_clear_int(&c->flags, CALLOUT_INPROG);
639 			sc->running = NULL;
640 			_callout_update_spinlocked(c);
641 			spin_unlock(&c->spin);
642 
643 			spin_lock(&wheel->spin);
644 		}
645 		spin_unlock(&wheel->spin);
646 		atomic_add_int(&sc->softticks, 1);
647 
648 		/*
649 		 * Clean up any _callout structures which are now allowed
650 		 * to be freed.
651 		 */
652 		crit_enter();
653 		while ((c = TAILQ_FIRST(&sc->freelist)) != NULL) {
654 			if (!exis_freeable(&c->exis))
655 				break;
656 			TAILQ_REMOVE(&sc->freelist, c, entry);
657 			c->flags = 0;
658 			kfree(c, M_CALLOUT);
659 			if (callout_debug)
660 				kprintf("KFREEB %p\n", c);
661 		}
662 		crit_exit();
663 	}
664 
665 	/*
666 	 * Don't leave us holding the MP lock when we deschedule ourselves.
667 	 */
668 	if (mpsafe == 0) {
669 		mpsafe = 1;
670 		rel_mplock();
671 	}
672 
673 	/*
674 	 * Recheck in critical section to interlock against hardlock
675 	 */
676 	crit_enter();
677 	if (sc->softticks == (int)(sc->curticks + 1)) {
678 		sc->isrunning = 0;
679 		lwkt_deschedule_self(&sc->thread);	/* == curthread */
680 		lwkt_switch();
681 	}
682 	crit_exit();
683 	goto loop;
684 	/* NOT REACHED */
685 }
686 
687 /*
688  * A very slow system cleanup timer (10 second interval),
689  * per-cpu.
690  */
691 void
692 slotimer_callback(void *arg)
693 {
694 	struct callout *c = arg;
695 
696 	slab_cleanup();
697 	callout_reset(c, hz * 10, slotimer_callback, c);
698 }
699 
700 /*
701  * API FUNCTIONS
702  */
703 
704 static __inline
705 struct _callout *
706 _callout_gettoc(struct callout *cc)
707 {
708 	globaldata_t gd = mycpu;
709 	struct _callout *c;
710 	softclock_pcpu_t sc;
711 
712 	KKASSERT(cc->flags & CALLOUT_DID_INIT);
713 	exis_hold_gd(gd);
714 	for (;;) {
715 		c = cc->toc;
716 		cpu_ccfence();
717 		if (c) {
718 			KKASSERT(c->verifier == cc);
719 			spin_lock(&c->spin);
720 			break;
721 		}
722 		sc = softclock_pcpu_ary[gd->gd_cpuid];
723 		c = kmalloc(sizeof(*c), M_CALLOUT, M_INTWAIT | M_ZERO);
724 		if (callout_debug)
725 			kprintf("ALLOC %p\n", c);
726 		c->flags = cc->flags;
727 		c->lk = cc->lk;
728 		c->verifier = cc;
729 		exis_init(&c->exis);
730 		spin_init(&c->spin, "calou");
731 		spin_lock(&c->spin);
732 		if (atomic_cmpset_ptr(&cc->toc, NULL, c))
733 			break;
734 		spin_unlock(&c->spin);
735 		c->verifier = NULL;
736 		kfree(c, M_CALLOUT);
737 		if (callout_debug)
738 			kprintf("KFREEA %p\n", c);
739 	}
740 	exis_drop_gd(gd);
741 
742 	/*
743 	 * Return internal __callout with spin-lock held
744 	 */
745 	return c;
746 }
747 
748 /*
749  * Macrod in sys/callout.h for debugging
750  *
751  * WARNING! tsleep() assumes this will not block
752  */
753 void
754 _callout_init(struct callout *cc CALLOUT_DEBUG_ARGS)
755 {
756 	bzero(cc, sizeof(*cc));
757 	cc->flags = CALLOUT_DID_INIT;
758 }
759 
760 void
761 _callout_init_mp(struct callout *cc CALLOUT_DEBUG_ARGS)
762 {
763 	bzero(cc, sizeof(*cc));
764 	cc->flags = CALLOUT_DID_INIT | CALLOUT_MPSAFE;
765 }
766 
767 void
768 _callout_init_lk(struct callout *cc, struct lock *lk CALLOUT_DEBUG_ARGS)
769 {
770 	bzero(cc, sizeof(*cc));
771 	cc->flags = CALLOUT_DID_INIT | CALLOUT_MPSAFE | CALLOUT_AUTOLOCK;
772 	cc->lk = lk;
773 }
774 
775 /*
776  * Start or restart a timeout.  New timeouts can be installed while the
777  * current one is running.
778  *
779  * Start or restart a timeout.  Installs the callout structure on the
780  * callwheel of the current cpu.  Callers may legally pass any value, even
781  * if 0 or negative, but since the sc->curticks index may have already
782  * been processed a minimum timeout of 1 tick will be enforced.
783  *
784  * This function will not deadlock against a running call.
785  *
786  * WARNING! tsleep() assumes this will not block
787  */
788 void
789 callout_reset(struct callout *cc, int to_ticks, void (*ftn)(void *), void *arg)
790 {
791 	softclock_pcpu_t sc;
792 	struct _callout *c;
793 
794 	/*
795 	 * We need to acquire/associate a _callout.
796 	 * gettoc spin-locks (c).
797 	 */
798 	KKASSERT(cc->flags & CALLOUT_DID_INIT);
799 	atomic_set_int(&cc->flags, CALLOUT_ACTIVE);
800 	c = _callout_gettoc(cc);
801 
802 	/*
803 	 * Request a RESET.  This automatically overrides a STOP in
804 	 * _callout_update_spinlocked().
805 	 */
806 	atomic_set_int(&c->flags, CALLOUT_RESET);
807 	sc = softclock_pcpu_ary[mycpu->gd_cpuid];
808 	c->rsc = sc;
809 	c->rtick = sc->curticks + to_ticks;
810 	c->rfunc = ftn;
811 	c->rarg = arg;
812 	_callout_update_spinlocked(c);
813 	spin_unlock(&c->spin);
814 }
815 
816 /*
817  * Same as callout_reset() but the timeout will run on a particular cpu.
818  */
819 void
820 callout_reset_bycpu(struct callout *cc, int to_ticks, void (*ftn)(void *),
821 		    void *arg, int cpuid)
822 {
823 	softclock_pcpu_t sc;
824 	struct _callout *c;
825 
826 	/*
827 	 * We need to acquire/associate a _callout.
828 	 * gettoc spin-locks (c).
829 	 */
830 	KKASSERT(cc->flags & CALLOUT_DID_INIT);
831 	atomic_set_int(&cc->flags, CALLOUT_ACTIVE);
832 	c = _callout_gettoc(cc);
833 
834 	/*
835 	 * Set RESET.  Do not clear STOP here (let the process code do it).
836 	 */
837 	atomic_set_int(&c->flags, CALLOUT_RESET);
838 
839 	sc = softclock_pcpu_ary[cpuid];
840 	c->rsc = sc;
841 	c->rtick = sc->curticks + to_ticks;
842 	c->rfunc = ftn;
843 	c->rarg = arg;
844 	_callout_update_spinlocked(c);
845 	spin_unlock(&c->spin);
846 }
847 
848 /*
849  * Issue synchronous or asynchronous cancel or stop
850  */
851 static __inline
852 int
853 _callout_cancel_or_stop(struct callout *cc, uint32_t flags, int sync)
854 {
855 	globaldata_t gd = mycpu;
856 	struct _callout *c;
857 	int res;
858 
859 	/*
860 	 * Callout is inactive after cancel or stop.  Degenerate case if
861 	 * no _callout is currently associated.
862 	 */
863 	atomic_clear_int(&cc->flags, CALLOUT_ACTIVE);
864 	if (cc->toc == NULL)
865 		return 0;
866 
867 	/*
868 	 * Ensure that the related (c) is not destroyed.  Set the CANCEL
869 	 * or STOP request flag, clear the PREVENTED status flag, and update.
870 	 */
871 	exis_hold_gd(gd);
872 	c = _callout_gettoc(cc);
873 	atomic_clear_int(&c->flags, CALLOUT_PREVENTED);
874 	atomic_set_int(&c->flags, flags);
875 	_callout_update_spinlocked(c);
876 	spin_unlock(&c->spin);
877 
878 	/*
879 	 * If the operation is still in-progress then re-acquire the spin-lock
880 	 * and block if necessary.  Also initiate the lock cancel.
881 	 */
882 	if (sync == 0 || (c->flags & (CALLOUT_INPROG | CALLOUT_SET)) == 0) {
883 		exis_drop_gd(gd);
884 		return 0;
885 	}
886 	if (c->flags & CALLOUT_AUTOLOCK)
887 		lockmgr(c->lk, LK_CANCEL_BEG);
888 	spin_lock(&c->spin);
889 	if ((c->flags & (CALLOUT_INPROG | CALLOUT_SET)) == 0) {
890 		spin_unlock(&c->spin);
891 		if (c->flags & CALLOUT_AUTOLOCK)
892 			lockmgr(c->lk, LK_CANCEL_END);
893 		exis_drop_gd(gd);
894 		return ((c->flags & CALLOUT_PREVENTED) != 0);
895 	}
896 
897 	/*
898 	 * With c->spin held we can synchronously wait completion of our
899 	 * request.
900 	 *
901 	 * If INPROG is set and we are recursing from the callback the
902 	 * function completes immediately.
903 	 */
904 	++c->waiters;
905 	for (;;) {
906 		cpu_ccfence();
907 		if ((c->flags & flags) == 0)
908 			break;
909 		if ((c->flags & CALLOUT_INPROG) &&
910 		    curthread == &c->qsc->thread) {
911 			_callout_update_spinlocked(c);
912 			break;
913 		}
914 		ssleep(c, &c->spin, 0, "costp", 0);
915 	}
916 	--c->waiters;
917 	spin_unlock(&c->spin);
918 	if (c->flags & CALLOUT_AUTOLOCK)
919 		lockmgr(c->lk, LK_CANCEL_END);
920 	res = ((c->flags & CALLOUT_PREVENTED) != 0);
921 	exis_drop_gd(gd);
922 
923 	return res;
924 }
925 
926 /*
927  * Internalized special low-overhead version without normal safety
928  * checks or allocations.  Used by tsleep().
929  *
930  * Must be called from critical section, specify both the external
931  * and internal callout structure and set timeout on the current cpu.
932  */
933 void
934 _callout_setup_quick(struct callout *cc, struct _callout *c, int ticks,
935 		     void (*ftn)(void *), void *arg)
936 {
937 	softclock_pcpu_t sc;
938 	struct wheel *wheel;
939 
940 	/*
941 	 * Request a RESET.  This automatically overrides a STOP in
942 	 * _callout_update_spinlocked().
943 	 */
944 	sc = softclock_pcpu_ary[mycpu->gd_cpuid];
945 
946 	cc->flags = CALLOUT_DID_INIT | CALLOUT_MPSAFE;
947 	cc->toc = c;
948 	cc->lk = NULL;
949 	c->flags = cc->flags | CALLOUT_SET;
950 	c->lk = NULL;
951 	c->verifier = cc;
952 	c->qsc = sc;
953 	c->qtick = sc->curticks + ticks;
954 	c->qfunc = ftn;
955 	c->qarg = arg;
956 	spin_init(&c->spin, "calou");
957 
958 	/*
959 	 * Since we are on the same cpu with a critical section, we can
960 	 * do this with only the wheel spinlock.
961 	 */
962 	if (c->qtick - sc->softticks <= 0)
963 		c->qtick = sc->softticks + 1;
964 	wheel = &sc->callwheel[c->qtick & cwheelmask];
965 
966 	spin_lock(&wheel->spin);
967 	TAILQ_INSERT_TAIL(&wheel->list, c, entry);
968 	spin_unlock(&wheel->spin);
969 }
970 
971 /*
972  * Internalized special low-overhead version without normal safety
973  * checks or allocations.  Used by tsleep().
974  *
975  * Must be called on the same cpu that queued the timeout.
976  * Must be called with a critical section already held.
977  */
978 void
979 _callout_cancel_quick(struct _callout *c)
980 {
981 	softclock_pcpu_t sc;
982 	struct wheel *wheel;
983 
984 	/*
985 	 * Wakeup callouts for tsleep() should never block, so this flag
986 	 * had better never be found set.
987 	 */
988 	KKASSERT((c->flags & CALLOUT_INPROG) == 0);
989 
990 	/*
991 	 * Remove from queue if necessary.  Since we are in a critical
992 	 * section on the same cpu, the queueing status should not change.
993 	 */
994 	if (c->flags & CALLOUT_SET) {
995 		sc = c->qsc;
996 		KKASSERT(sc == softclock_pcpu_ary[mycpu->gd_cpuid]);
997 		wheel = &sc->callwheel[c->qtick & cwheelmask];
998 
999 		/*
1000 		 * NOTE: We must still spin-lock the wheel because other
1001 		 *	 cpus can manipulate the list.
1002 		 */
1003 		spin_lock(&wheel->spin);
1004 		TAILQ_REMOVE(&wheel->list, c, entry);
1005 		c->flags &= ~(CALLOUT_SET | CALLOUT_STOP |
1006 			      CALLOUT_CANCEL | CALLOUT_RESET);
1007 		spin_unlock(&wheel->spin);
1008 	}
1009 	c->verifier = NULL;
1010 }
1011 
1012 /*
1013  * This is a synchronous STOP which cancels the callout.  If AUTOLOCK
1014  * then a CANCEL will be issued to the lock holder.  Unlike STOP, the
1015  * cancel function prevents any new callout_reset()s from being issued
1016  * in addition to canceling the lock.  The lock will also be deactivated.
1017  *
1018  * Returns 0 if the callout was not active (or was active and completed,
1019  *	     but didn't try to start a new timeout).
1020  * Returns 1 if the cancel is responsible for stopping the callout.
1021  */
1022 int
1023 callout_cancel(struct callout *cc)
1024 {
1025 	return _callout_cancel_or_stop(cc, CALLOUT_CANCEL, 1);
1026 }
1027 
1028 /*
1029  * Currently the same as callout_cancel.  Ultimately we may wish the
1030  * drain function to allow a pending callout to proceed, but for now
1031  * we will attempt to to cancel it.
1032  *
1033  * Returns 0 if the callout was not active (or was active and completed,
1034  *	     but didn't try to start a new timeout).
1035  * Returns 1 if the drain is responsible for stopping the callout.
1036  */
1037 int
1038 callout_drain(struct callout *cc)
1039 {
1040 	return _callout_cancel_or_stop(cc, CALLOUT_CANCEL, 1);
1041 }
1042 
1043 /*
1044  * Stops a callout if it is pending or queued, does not block.
1045  * This function does not interlock against a callout that is in-progress.
1046  *
1047  * Returns whether the STOP operation was responsible for removing a
1048  * queued or pending callout.
1049  */
1050 int
1051 callout_stop_async(struct callout *cc)
1052 {
1053 	return _callout_cancel_or_stop(cc, CALLOUT_STOP, 0);
1054 }
1055 
1056 /*
1057  * Callout deactivate merely clears the CALLOUT_ACTIVE bit and stop a
1058  * callout if it is pending or queued.  However this cannot stop a callout
1059  * whos callback is in-progress.
1060  *
1061  *
1062  * This function does not interlock against a callout that is in-progress.
1063  */
1064 void
1065 callout_deactivate(struct callout *cc)
1066 {
1067 	atomic_clear_int(&cc->flags, CALLOUT_ACTIVE);
1068 	callout_stop_async(cc);
1069 }
1070 
1071 /*
1072  * lock-aided callouts are STOPped synchronously using STOP semantics
1073  * (meaning that another thread can start the callout again before we
1074  * return).
1075  *
1076  * non-lock-aided callouts
1077  *
1078  * Stops a callout if it is pending or queued, does not block.
1079  * This function does not interlock against a callout that is in-progress.
1080  */
1081 int
1082 callout_stop(struct callout *cc)
1083 {
1084 	return _callout_cancel_or_stop(cc, CALLOUT_STOP, 1);
1085 }
1086 
1087 /*
1088  * Destroy the callout.  Synchronously cancel any operation in progress,
1089  * clear the INIT flag, and disconnect the internal _callout.  The internal
1090  * callout will be safely freed via EXIS.
1091  *
1092  * Upon return, the callout structure may only be reused if re-initialized.
1093  */
1094 void
1095 callout_terminate(struct callout *cc)
1096 {
1097 	struct _callout *c;
1098 
1099 	exis_hold();
1100 
1101 	_callout_cancel_or_stop(cc, CALLOUT_CANCEL, 1);
1102 	KKASSERT(cc->flags & CALLOUT_DID_INIT);
1103 	atomic_clear_int(&cc->flags, CALLOUT_DID_INIT);
1104 	c = atomic_swap_ptr((void *)&cc->toc, NULL);
1105 	if (c) {
1106 		KKASSERT(c->verifier == cc);
1107 		c->verifier = NULL;
1108 		_callout_free(c);
1109 	}
1110 
1111 	exis_drop();
1112 }
1113 
1114 /*
1115  * Returns whether a callout is queued and the time has not yet
1116  * arrived (the callout is not yet in-progress).
1117  */
1118 int
1119 callout_pending(struct callout *cc)
1120 {
1121 	struct _callout *c;
1122 
1123 	/*
1124 	 * Don't instantiate toc to test pending
1125 	 */
1126 	if (cc->toc == NULL)
1127 		return 0;
1128 	c = _callout_gettoc(cc);
1129 	if ((c->flags & (CALLOUT_SET | CALLOUT_INPROG)) == CALLOUT_SET) {
1130 		spin_unlock(&c->spin);
1131 		return 1;
1132 	}
1133 	spin_unlock(&c->spin);
1134 
1135 	return 0;
1136 }
1137 
1138 /*
1139  * Returns whether a callout is active or not.  A callout is active when
1140  * a timeout is set and remains active upon normal termination, even if
1141  * it does not issue a new timeout.  A callout is inactive if a timeout has
1142  * never been set or if the callout has been stopped or canceled.  The next
1143  * timeout that is set will re-set the active state.
1144  */
1145 int
1146 callout_active(struct callout *cc)
1147 {
1148 	return ((cc->flags & CALLOUT_ACTIVE) ? 1 : 0);
1149 }
1150