xref: /illumos-gate/usr/src/uts/common/os/callout.c (revision c3ea2840)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/callo.h>
27 #include <sys/param.h>
28 #include <sys/types.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/kmem.h>
32 #include <sys/kmem_impl.h>
33 #include <sys/cmn_err.h>
34 #include <sys/callb.h>
35 #include <sys/debug.h>
36 #include <sys/vtrace.h>
37 #include <sys/sysmacros.h>
38 #include <sys/sdt.h>
39 
40 /*
41  * Callout tables.  See timeout(9F) for details.
42  */
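/*
 * Illustrative usage sketch (not part of this file): a typical consumer
 * of the legacy interfaces implemented below might look like this, where
 * my_softc_t, sc and my_watchdog are hypothetical driver names:
 *
 *	static void
 *	my_watchdog(void *arg)
 *	{
 *		my_softc_t *sc = arg;
 *
 *		... check the hardware, possibly rearm with timeout() ...
 *	}
 *
 *	sc->sc_tid = timeout(my_watchdog, sc, drv_usectohz(1000000));
 *	...
 *	(void) untimeout(sc->sc_tid);
 *
 * untimeout() returns the number of ticks that were left, or -1 if the
 * callout had already fired or was executing (in which case it waits for
 * the handler to finish before returning).
 */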
43 static hrtime_t callout_debug_hrtime;		/* debugger entry time */
44 static int callout_min_resolution;		/* Minimum resolution */
45 static callout_table_t *callout_boot_ct;	/* Boot CPU's callout tables */
46 static clock_t callout_max_ticks;		/* max interval */
47 static hrtime_t callout_longterm;		/* longterm nanoseconds */
48 static ulong_t callout_counter_low;		/* callout ID increment */
49 static ulong_t callout_table_bits;		/* number of table bits in ID */
50 static ulong_t callout_table_mask;		/* mask for the table bits */
51 static callout_cache_t *callout_caches;		/* linked list of caches */
52 #pragma align 64(callout_table)
53 static callout_table_t *callout_table;		/* global callout table array */
54 
55 static char *callout_kstat_names[] = {
56 	"callout_timeouts",
57 	"callout_timeouts_pending",
58 	"callout_untimeouts_unexpired",
59 	"callout_untimeouts_executing",
60 	"callout_untimeouts_expired",
61 	"callout_expirations",
62 	"callout_allocations",
63 };
64 
65 #define	CALLOUT_HASH_INSERT(hash, cp, cnext, cprev)	\
66 {							\
67 	callout_hash_t *hashp = &(hash);		\
68 							\
69 	cp->cprev = NULL;				\
70 	cp->cnext = hashp->ch_head;			\
71 	if (hashp->ch_head == NULL)			\
72 		hashp->ch_tail = cp;			\
73 	else						\
74 		cp->cnext->cprev = cp;			\
75 	hashp->ch_head = cp;				\
76 }
77 
78 #define	CALLOUT_HASH_APPEND(hash, cp, cnext, cprev)	\
79 {							\
80 	callout_hash_t *hashp = &(hash);		\
81 							\
82 	cp->cnext = NULL;				\
83 	cp->cprev = hashp->ch_tail;			\
84 	if (hashp->ch_tail == NULL)			\
85 		hashp->ch_head = cp;			\
86 	else						\
87 		cp->cprev->cnext = cp;			\
88 	hashp->ch_tail = cp;				\
89 }
90 
91 #define	CALLOUT_HASH_DELETE(hash, cp, cnext, cprev)	\
92 {							\
93 	callout_hash_t *hashp = &(hash);		\
94 							\
95 	if (cp->cnext == NULL)				\
96 		hashp->ch_tail = cp->cprev;		\
97 	else						\
98 		cp->cnext->cprev = cp->cprev;		\
99 	if (cp->cprev == NULL)				\
100 		hashp->ch_head = cp->cnext;		\
101 	else						\
102 		cp->cprev->cnext = cp->cnext;		\
103 }
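/*
 * A quick sketch of what the macros above do to a callout_hash_t when
 * elements A, B and C are added in that order:
 *
 *	CALLOUT_HASH_INSERT:	head -> C -> B -> A <- tail	(LIFO)
 *	CALLOUT_HASH_APPEND:	head -> A -> B -> C <- tail	(FIFO)
 *
 * Both maintain a doubly linked list with explicit head and tail
 * pointers, so CALLOUT_HASH_DELETE can unlink an element from anywhere
 * in the list in constant time, given only a pointer to it.
 */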
104 
105 /*
106  * These definitions help us queue callouts and callout lists. Here is
107  * the queueing rationale:
108  *
109  *	- callouts are queued in a FIFO manner in the ID hash table.
110  *	  TCP timers are typically cancelled in the same order that they
111  *	  were issued. The FIFO queueing shortens the search for a callout
112  *	  during untimeout().
113  *
114  *	- callouts are queued in a FIFO manner in their callout lists.
115  *	  This ensures that the callouts are executed in the same order that
116  *	  they were queued. This is fair. Plus, it helps to make each
117  *	  callout expiration timely. It also favors cancellations.
118  *
119  *	- callout lists are queued in a LIFO manner in the callout list hash
120  *	  table. This ensures that long term timers stay at the rear of the
121  *	  hash lists.
122  *
123  *	- callout lists are queued in a FIFO manner in the expired callouts
124  *	  list. This ensures that callout lists are executed in the order
125  *	  of expiration.
126  */
127 #define	CALLOUT_APPEND(ct, cp)						\
128 	CALLOUT_HASH_APPEND(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
129 		cp, c_idnext, c_idprev);				\
130 	CALLOUT_HASH_APPEND(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
131 
132 #define	CALLOUT_DELETE(ct, cp)						\
133 	CALLOUT_HASH_DELETE(ct->ct_idhash[CALLOUT_IDHASH(cp->c_xid)],	\
134 		cp, c_idnext, c_idprev);				\
135 	CALLOUT_HASH_DELETE(cp->c_list->cl_callouts, cp, c_clnext, c_clprev)
136 
137 #define	CALLOUT_LIST_INSERT(hash, cl)				\
138 	CALLOUT_HASH_INSERT(hash, cl, cl_next, cl_prev)
139 
140 #define	CALLOUT_LIST_APPEND(hash, cl)				\
141 	CALLOUT_HASH_APPEND(hash, cl, cl_next, cl_prev)
142 
143 #define	CALLOUT_LIST_DELETE(hash, cl)				\
144 	CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
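/*
 * To illustrate, CALLOUT_APPEND links the same callout_t onto two
 * independent lists at once:
 *
 *	ct_idhash[CALLOUT_IDHASH(c_xid)]	via c_idnext/c_idprev
 *	cp->c_list->cl_callouts		via c_clnext/c_clprev
 *
 * The first is searched by untimeout() using the callout ID; the second
 * groups all callouts sharing one expiration so that they can be expired
 * together by callout_list_expire(). CALLOUT_DELETE undoes both links.
 */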
145 
146 /*
147  * Allocate a callout structure.  We try quite hard because we
148  * can't sleep, and if we can't do the allocation, we're toast.
149  * Failing all, we try a KM_PANIC allocation. Note that we never
150  * deallocate a callout. See untimeout() for the reasoning.
151  */
152 static callout_t *
153 callout_alloc(callout_table_t *ct)
154 {
155 	size_t size;
156 	callout_t *cp;
157 
158 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
159 	mutex_exit(&ct->ct_mutex);
160 
161 	cp = kmem_cache_alloc(ct->ct_cache, KM_NOSLEEP);
162 	if (cp == NULL) {
163 		size = sizeof (callout_t);
164 		cp = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
165 	}
166 	cp->c_xid = 0;
167 
168 	mutex_enter(&ct->ct_mutex);
169 	ct->ct_allocations++;
170 	return (cp);
171 }
172 
173 /*
174  * Allocate a callout list structure.  We try quite hard because we
175  * can't sleep, and if we can't do the allocation, we're toast.
176  * Failing all, we try a KM_PANIC allocation. Note that we never
177  * deallocate a callout list.
178  */
179 static void
180 callout_list_alloc(callout_table_t *ct)
181 {
182 	size_t size;
183 	callout_list_t *cl;
184 
185 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
186 	mutex_exit(&ct->ct_mutex);
187 
188 	cl = kmem_cache_alloc(ct->ct_lcache, KM_NOSLEEP);
189 	if (cl == NULL) {
190 		size = sizeof (callout_list_t);
191 		cl = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
192 	}
193 	bzero(cl, sizeof (callout_list_t));
194 
195 	mutex_enter(&ct->ct_mutex);
196 	cl->cl_next = ct->ct_lfree;
197 	ct->ct_lfree = cl;
198 }
199 
200 /*
201  * Find the callout list that corresponds to an expiration. There can
202  * be only one.
203  */
204 static callout_list_t *
205 callout_list_get(callout_table_t *ct, hrtime_t expiration, int hash)
206 {
207 	callout_list_t *cl;
208 
209 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
210 
211 	for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
212 		if (cl->cl_expiration == expiration)
213 			return (cl);
214 	}
215 
216 	return (NULL);
217 }
218 
219 /*
220  * Find the callout list that corresponds to an expiration. There can
221  * be only one. If the callout list is empty, free it. Else, return it.
222  */
223 static callout_list_t *
224 callout_list_check(callout_table_t *ct, hrtime_t expiration, int hash)
225 {
226 	callout_list_t *cl;
227 
228 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
229 
230 	cl = callout_list_get(ct, expiration, hash);
231 	if (cl != NULL) {
232 		if (cl->cl_callouts.ch_head != NULL) {
233 			/*
234 			 * There is exactly one callout list for every
235 			 * unique expiration. So, we are done.
236 			 */
237 			return (cl);
238 		}
239 
240 		CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
241 		cl->cl_next = ct->ct_lfree;
242 		ct->ct_lfree = cl;
243 	}
244 
245 	return (NULL);
246 }
247 
248 /*
249  * Initialize a callout table's heap, if necessary. Preallocate some free
250  * entries so we don't have to check for NULL elsewhere.
251  */
252 static void
253 callout_heap_init(callout_table_t *ct)
254 {
255 	size_t size;
256 
257 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
258 	ASSERT(ct->ct_heap == NULL);
259 
260 	ct->ct_heap_num = 0;
261 	ct->ct_heap_max = CALLOUT_CHUNK;
262 	size = sizeof (hrtime_t) * CALLOUT_CHUNK;
263 	ct->ct_heap = kmem_alloc(size, KM_SLEEP);
264 }
265 
266 /*
267  * Reallocate the heap. We try quite hard because we can't sleep, and if
268  * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
269  * allocation. Note that the heap only expands, it never contracts.
270  */
271 static void
272 callout_heap_expand(callout_table_t *ct)
273 {
274 	size_t max, size, osize;
275 	hrtime_t *heap;
276 
277 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
278 	ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
279 
280 	while (ct->ct_heap_num == ct->ct_heap_max) {
281 		max = ct->ct_heap_max;
282 		mutex_exit(&ct->ct_mutex);
283 
284 		osize = sizeof (hrtime_t) * max;
285 		size = sizeof (hrtime_t) * (max + CALLOUT_CHUNK);
286 		heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
287 
288 		mutex_enter(&ct->ct_mutex);
289 		if (max < ct->ct_heap_max) {
290 			/*
291 			 * Someone beat us to the allocation. Free what we
292 			 * just allocated and proceed.
293 			 */
294 			kmem_free(heap, size);
295 			continue;
296 		}
297 
298 		bcopy(ct->ct_heap, heap, osize);
299 		kmem_free(ct->ct_heap, osize);
300 		ct->ct_heap = heap;
301 		ct->ct_heap_max = size / sizeof (hrtime_t);
302 	}
303 }
304 
305 /*
306  * Move an expiration from the bottom of the heap to its correct place
307  * in the heap. If we reached the root doing this, return 1. Else,
308  * return 0.
309  */
310 static int
311 callout_upheap(callout_table_t *ct)
312 {
313 	int current, parent;
314 	hrtime_t *heap, current_expiration, parent_expiration;
315 
316 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
317 	ASSERT(ct->ct_heap_num >= 1);
318 
319 	if (ct->ct_heap_num == 1) {
320 		return (1);
321 	}
322 
323 	heap = ct->ct_heap;
324 	current = ct->ct_heap_num - 1;
325 
326 	for (;;) {
327 		parent = CALLOUT_HEAP_PARENT(current);
328 		current_expiration = heap[current];
329 		parent_expiration = heap[parent];
330 
331 		/*
332 		 * We have an expiration later than our parent; we're done.
333 		 */
334 		if (current_expiration >= parent_expiration) {
335 			return (0);
336 		}
337 
338 		/*
339 		 * We need to swap with our parent, and continue up the heap.
340 		 */
341 		heap[parent] = current_expiration;
342 		heap[current] = parent_expiration;
343 
344 		/*
345 		 * If we just reached the root, we're done.
346 		 */
347 		if (parent == 0) {
348 			return (1);
349 		}
350 
351 		current = parent;
352 	}
353 	/*NOTREACHED*/
354 }
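/*
 * A small worked example of the sift-up above, assuming the conventional
 * 0-based implicit heap layout (parent of index i is (i - 1) / 2, its
 * children are 2i + 1 and 2i + 2):
 *
 *	heap before inserting 3:	[5, 9, 7, 12]
 *	append at the bottom:		[5, 9, 7, 12, 3]	(index 4)
 *	3 < 9, swap with parent:	[5, 3, 7, 12, 9]	(index 1)
 *	3 < 5, swap with parent:	[3, 5, 7, 12, 9]	(root)
 *
 * Because the new expiration reached the root, it is now the earliest in
 * the table and callout_heap_insert() will reprogram the cyclic.
 */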
355 
356 /*
357  * Insert a new, unique expiration into a callout table's heap.
358  */
359 static void
360 callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
361 {
362 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
363 	ASSERT(ct->ct_heap_num < ct->ct_heap_max);
364 
365 	/*
366 	 * First, copy the expiration to the bottom of the heap.
367 	 */
368 	ct->ct_heap[ct->ct_heap_num] = expiration;
369 	ct->ct_heap_num++;
370 
371 	/*
372 	 * Now, perform an upheap operation. If we reached the root, then
373 	 * the cyclic needs to be reprogrammed as we have an earlier
374 	 * expiration.
375 	 *
376 	 * Also, during the CPR suspend phase, do not reprogram the cyclic.
377 	 * We don't want any callout activity. When the CPR resume phase is
378 	 * entered, the cyclic will be programmed for the earliest expiration
379 	 * in the heap.
380 	 */
381 	if (callout_upheap(ct) && (ct->ct_suspend == 0))
382 		(void) cyclic_reprogram(ct->ct_cyclic, expiration);
383 }
384 
385 /*
386  * Move an expiration from the top of the heap to its correct place
387  * in the heap.
388  */
389 static void
390 callout_downheap(callout_table_t *ct)
391 {
392 	int left, right, current, nelems;
393 	hrtime_t *heap, left_expiration, right_expiration, current_expiration;
394 
395 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
396 	ASSERT(ct->ct_heap_num >= 1);
397 
398 	heap = ct->ct_heap;
399 	current = 0;
400 	nelems = ct->ct_heap_num;
401 
402 	for (;;) {
403 		/*
404 		 * If we don't have a left child (i.e., we're a leaf), we're
405 		 * done.
406 		 */
407 		if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
408 			return;
409 
410 		left_expiration = heap[left];
411 		current_expiration = heap[current];
412 
413 		right = CALLOUT_HEAP_RIGHT(current);
414 
415 		/*
416 		 * Even if we don't have a right child, we still need to compare
417 		 * our expiration against that of our left child.
418 		 */
419 		if (right >= nelems)
420 			goto comp_left;
421 
422 		right_expiration = heap[right];
423 
424 		/*
425 		 * We have both a left and a right child.  We need to compare
426 		 * the expiration of the children to determine which
427 		 * expires earlier.
428 		 */
429 		if (right_expiration < left_expiration) {
430 			/*
431 			 * Our right child is the earlier of our children.
432 			 * We'll now compare our expiration to its expiration.
433 			 * If ours is the earlier one, we're done.
434 			 */
435 			if (current_expiration <= right_expiration)
436 				return;
437 
438 			/*
439 			 * Our right child expires earlier than we do; swap
440 			 * with our right child, and descend right.
441 			 */
442 			heap[right] = current_expiration;
443 			heap[current] = right_expiration;
444 			current = right;
445 			continue;
446 		}
447 
448 comp_left:
449 		/*
450 		 * Our left child is the earlier of our children (or we have
451 		 * no right child).  We'll now compare our expiration
452 		 * to its expiration. If ours is the earlier one, we're done.
453 		 */
454 		if (current_expiration <= left_expiration)
455 			return;
456 
457 		/*
458 		 * Our left child expires earlier than we do; swap with our
459 		 * left child, and descend left.
460 		 */
461 		heap[left] = current_expiration;
462 		heap[current] = left_expiration;
463 		current = left;
464 	}
465 }
466 
467 /*
468  * Delete and handle all past expirations in a callout table's heap.
469  */
470 static void
471 callout_heap_delete(callout_table_t *ct)
472 {
473 	hrtime_t now, expiration;
474 	callout_list_t *cl;
475 	int hash;
476 
477 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
478 
479 	now = gethrtime();
480 
481 	while (ct->ct_heap_num > 0) {
482 		expiration = ct->ct_heap[0];
483 		/*
484 		 * Find the callout list that corresponds to the expiration.
485 		 * If the callout list is empty, callout_list_check()
486 		 * will free the callout list and return NULL.
487 		 */
488 		hash = CALLOUT_CLHASH(expiration);
489 		cl = callout_list_check(ct, expiration, hash);
490 		if (cl != NULL) {
491 			/*
492 			 * If the root of the heap expires in the future, we are
493 			 * done. We are doing this check here instead of at the
494 			 * beginning because we want to first free all the
495 			 * empty callout lists at the top of the heap.
496 			 */
497 			if (expiration > now)
498 				break;
499 
500 			/*
501 			 * Move the callout list for this expiration to the
502 			 * list of expired callout lists. It will be processed
503 			 * by the callout executor.
504 			 */
505 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
506 			CALLOUT_LIST_APPEND(ct->ct_expired, cl);
507 		}
508 
509 		/*
510 		 * Now delete the root. This is done by swapping the root with
511 		 * the last item in the heap and downheaping the item.
512 		 */
513 		ct->ct_heap_num--;
514 		if (ct->ct_heap_num > 0) {
515 			ct->ct_heap[0] = ct->ct_heap[ct->ct_heap_num];
516 			callout_downheap(ct);
517 		}
518 	}
519 
520 	/*
521 	 * If this callout table is empty or callouts have been suspended
522 	 * by CPR, just return. The cyclic has already been programmed to
523 	 * infinity by the cyclic subsystem.
524 	 */
525 	if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
526 		return;
527 
528 	(void) cyclic_reprogram(ct->ct_cyclic, expiration);
529 }
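/*
 * A small worked example of the root deletion above, continuing the
 * heap from the callout_upheap() example:
 *
 *	before:				[3, 5, 7, 12, 9]
 *	move the last entry to root:	[9, 5, 7, 12]
 *	downheap, 9 > 5 (left child):	[5, 9, 7, 12]
 *	9 <= 12, so we are done:	[5, 9, 7, 12]
 *
 * The while loop repeats this as long as the root expiration is in the
 * past, so expired callout lists reach ct_expired in expiration order.
 */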
530 
531 /*
532  * Common function used to create normal and realtime callouts.
533  *
534  * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
535  * there is one restriction on a realtime callout handler - it should not
536  * directly or indirectly acquire cpu_lock. CPU offline waits for pending
537  * cyclic handlers to complete while holding cpu_lock. So, if a realtime
538  * callout handler were to try to get cpu_lock, there would be a deadlock
539  * during CPU offline.
540  */
541 callout_id_t
542 timeout_generic(int type, void (*func)(void *), void *arg,
543 	hrtime_t expiration, hrtime_t resolution, int flags)
544 {
545 	callout_table_t *ct;
546 	callout_t *cp;
547 	callout_id_t id;
548 	callout_list_t *cl;
549 	hrtime_t now, interval;
550 	int hash;
551 
552 	ASSERT(resolution > 0);
553 	ASSERT(func != NULL);
554 
555 	/*
556 	 * Please see comment about minimum resolution in callout_init().
557 	 */
558 	if (resolution < callout_min_resolution)
559 		resolution = callout_min_resolution;
560 
561 	/*
562 	 * We disable kernel preemption so that we remain on the same CPU
563 	 * throughout. If we needed to reprogram the callout table's cyclic,
564 	 * we can avoid X-calls if we are on the same CPU.
565 	 *
566 	 * Note that callout_alloc() releases and reacquires the callout
567 	 * table mutex. While reacquiring the mutex, it is possible for us
568 	 * to go to sleep and later migrate to another CPU. This should be
569 	 * pretty rare, though.
570 	 */
571 	kpreempt_disable();
572 
573 	ct = &callout_table[CALLOUT_TABLE(type, CPU->cpu_seqid)];
574 	mutex_enter(&ct->ct_mutex);
575 
576 	if (ct->ct_cyclic == CYCLIC_NONE) {
577 		mutex_exit(&ct->ct_mutex);
578 		/*
579 		 * The callout table has not yet been initialized fully.
580 		 * So, put this one on the boot callout table which is
581 		 * always initialized.
582 		 */
583 		ct = &callout_boot_ct[type];
584 		mutex_enter(&ct->ct_mutex);
585 	}
586 
587 	if ((cp = ct->ct_free) == NULL)
588 		cp = callout_alloc(ct);
589 	else
590 		ct->ct_free = cp->c_idnext;
591 
592 	cp->c_func = func;
593 	cp->c_arg = arg;
594 
595 	/*
596 	 * Compute the expiration hrtime.
597 	 */
598 	now = gethrtime();
599 	if (flags & CALLOUT_FLAG_ABSOLUTE) {
600 		ASSERT(expiration > 0);
601 		interval = expiration - now;
602 	} else {
603 		interval = expiration;
604 		expiration += now;
605 		ASSERT(expiration > 0);
606 	}
607 	if (flags & CALLOUT_FLAG_ROUNDUP)
608 		expiration += resolution - 1;
609 	expiration = (expiration / resolution) * resolution;
610 	if (expiration <= 0) {
611 		/*
612 		 * expiration hrtime overflow has occurred. Just set the
613 		 * expiration to infinity.
614 		 */
615 		expiration = CY_INFINITY;
616 	}
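	/*
	 * For example, with resolution = 10000000 (10ms in nanoseconds) and
	 * a computed absolute expiration of 123456789, the quantization
	 * above yields:
	 *
	 *	default:		123456789 / 10000000 * 10000000
	 *				    = 120000000 (rounded down)
	 *	CALLOUT_FLAG_ROUNDUP:	(123456789 + 9999999) / 10000000
	 *				    * 10000000 = 130000000 (rounded up)
	 *
	 * so callouts created with the same resolution share a small set of
	 * quantized expirations and, therefore, callout lists.
	 */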
617 
618 	/*
619 	 * Assign an ID to this callout
620 	 */
621 	if (flags & CALLOUT_FLAG_32BIT) {
622 		if (interval > callout_longterm) {
623 			id = (ct->ct_long_id - callout_counter_low);
624 			id |= CALLOUT_COUNTER_HIGH;
625 			ct->ct_long_id = id;
626 		} else {
627 			id = (ct->ct_short_id - callout_counter_low);
628 			id |= CALLOUT_COUNTER_HIGH;
629 			ct->ct_short_id = id;
630 		}
631 	} else {
632 		id = (ct->ct_gen_id - callout_counter_low);
633 		if ((id & CALLOUT_COUNTER_HIGH) == 0) {
634 			id |= CALLOUT_COUNTER_HIGH;
635 			id += CALLOUT_GENERATION_LOW;
636 		}
637 		ct->ct_gen_id = id;
638 	}
639 
640 	cp->c_xid = id;
641 	if (flags & CALLOUT_FLAG_HRESTIME)
642 		cp->c_xid |= CALLOUT_HRESTIME;
643 
644 	hash = CALLOUT_CLHASH(expiration);
645 
646 again:
647 	/*
648 	 * Try to see if a callout list already exists for this expiration.
649 	 * Most of the time, this will be the case.
650 	 */
651 	cl = callout_list_get(ct, expiration, hash);
652 	if (cl == NULL) {
653 		/*
654 		 * Check if we have enough space in the heap to insert one
655 		 * expiration. If not, expand the heap.
656 		 */
657 		if (ct->ct_heap_num == ct->ct_heap_max) {
658 			callout_heap_expand(ct);
659 			/*
660 			 * In the above call, we drop the lock, allocate and
661 			 * reacquire the lock. So, we could have been away
662 			 * for a while. In the meantime, someone could have
663 			 * inserted a callout list with the same expiration.
664 			 * So, the best course is to repeat the steps. This
665 			 * should be an infrequent event.
666 			 */
667 			goto again;
668 		}
669 
670 		/*
671 		 * Check the free list. If we don't find one, we have to
672 		 * take the slow path and allocate from kmem.
673 		 */
674 		if ((cl = ct->ct_lfree) == NULL) {
675 			callout_list_alloc(ct);
676 			/*
677 			 * In the above call, we drop the lock, allocate and
678 			 * reacquire the lock. So, we could have been away
679 			 * for a while. In the meantime, someone could have
680 			 * inserted a callout list with the same expiration.
681 			 * Plus, the heap could have become full. So, the best
682 			 * course is to repeat the steps. This should be an
683 			 * infrequent event.
684 			 */
685 			goto again;
686 		}
687 		ct->ct_lfree = cl->cl_next;
688 		cl->cl_expiration = expiration;
689 
690 		CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
691 
692 		/*
693 		 * This is a new expiration. So, insert it into the heap.
694 		 * This will also reprogram the cyclic, if the expiration
695 		 * propagated to the root of the heap.
696 		 */
697 		callout_heap_insert(ct, expiration);
698 	}
699 	cp->c_list = cl;
700 	CALLOUT_APPEND(ct, cp);
701 
702 	ct->ct_timeouts++;
703 	ct->ct_timeouts_pending++;
704 
705 	mutex_exit(&ct->ct_mutex);
706 
707 	kpreempt_enable();
708 
709 	TRACE_4(TR_FAC_CALLOUT, TR_TIMEOUT,
710 	    "timeout:%K(%p) in %llx expiration, cp %p", func, arg, expiration,
711 	    cp);
712 
713 	return (id);
714 }
715 
716 timeout_id_t
717 timeout(void (*func)(void *), void *arg, clock_t delta)
718 {
719 	ulong_t id;
720 
721 	/*
722 	 * Make sure the callout runs at least 1 tick in the future.
723 	 */
724 	if (delta <= 0)
725 		delta = 1;
726 	else if (delta > callout_max_ticks)
727 		delta = callout_max_ticks;
728 
729 	id =  (ulong_t)timeout_generic(CALLOUT_NORMAL, func, arg,
730 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
731 
732 	return ((timeout_id_t)id);
733 }
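/*
 * To illustrate the conversion above: a legacy call such as
 *
 *	(void) timeout(my_func, my_arg, 5);
 *
 * (with hypothetical my_func and my_arg) becomes
 *
 *	timeout_generic(CALLOUT_NORMAL, my_func, my_arg, TICK_TO_NSEC(5),
 *	    nsec_per_tick, CALLOUT_LEGACY);
 *
 * i.e. a relative expiration of 5 ticks expressed in nanoseconds, at tick
 * resolution, with the legacy flags so that the resulting ID can be
 * returned as a timeout_id_t.
 */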
734 
735 /*
736  * Convenience function that creates a normal callout with default parameters
737  * and returns a full ID.
738  */
739 callout_id_t
740 timeout_default(void (*func)(void *), void *arg, clock_t delta)
741 {
742 	callout_id_t id;
743 
744 	/*
745 	 * Make sure the callout runs at least 1 tick in the future.
746 	 */
747 	if (delta <= 0)
748 		delta = 1;
749 	else if (delta > callout_max_ticks)
750 		delta = callout_max_ticks;
751 
752 	id = timeout_generic(CALLOUT_NORMAL, func, arg, TICK_TO_NSEC(delta),
753 	    nsec_per_tick, 0);
754 
755 	return (id);
756 }
757 
758 timeout_id_t
759 realtime_timeout(void (*func)(void *), void *arg, clock_t delta)
760 {
761 	ulong_t id;
762 
763 	/*
764 	 * Make sure the callout runs at least 1 tick in the future.
765 	 */
766 	if (delta <= 0)
767 		delta = 1;
768 	else if (delta > callout_max_ticks)
769 		delta = callout_max_ticks;
770 
771 	id =  (ulong_t)timeout_generic(CALLOUT_REALTIME, func, arg,
772 	    TICK_TO_NSEC(delta), nsec_per_tick, CALLOUT_LEGACY);
773 
774 	return ((timeout_id_t)id);
775 }
776 
777 /*
778  * Convenience function that creates a realtime callout with default parameters
779  * and returns a full ID.
780  */
781 callout_id_t
782 realtime_timeout_default(void (*func)(void *), void *arg, clock_t delta)
783 {
784 	callout_id_t id;
785 
786 	/*
787 	 * Make sure the callout runs at least 1 tick in the future.
788 	 */
789 	if (delta <= 0)
790 		delta = 1;
791 	else if (delta > callout_max_ticks)
792 		delta = callout_max_ticks;
793 
794 	id = timeout_generic(CALLOUT_REALTIME, func, arg, TICK_TO_NSEC(delta),
795 	    nsec_per_tick, 0);
796 
797 	return (id);
798 }
799 
800 hrtime_t
801 untimeout_generic(callout_id_t id, int nowait)
802 {
803 	callout_table_t *ct;
804 	callout_t *cp;
805 	callout_id_t xid;
806 	callout_list_t *cl;
807 	int hash;
808 	callout_id_t bogus;
809 
810 	ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
811 	hash = CALLOUT_IDHASH(id);
812 
813 	mutex_enter(&ct->ct_mutex);
814 
815 	/*
816 	 * Search the ID hash table for the callout.
817 	 */
818 	for (cp = ct->ct_idhash[hash].ch_head; cp; cp = cp->c_idnext) {
819 
820 		xid = cp->c_xid;
821 
822 		/*
823 		 * Match the ID and generation number.
824 		 */
825 		if ((xid & CALLOUT_ID_MASK) != id)
826 			continue;
827 
828 		cl = cp->c_list;
829 		if ((xid & CALLOUT_EXECUTING) == 0) {
830 			hrtime_t expiration;
831 
832 			/*
833 			 * Delete the callout. If the callout list becomes
834 			 * empty, we don't remove it from the table. This is
835 			 * so it can be reused. If the empty callout list
836 			 * corresponds to the top of the callout heap, we
837 			 * don't reprogram the table cyclic here. This is in
838 			 * order to avoid lots of X-calls to the CPU associated
839 			 * with the callout table.
840 			 */
841 			expiration = cl->cl_expiration;
842 			CALLOUT_DELETE(ct, cp);
843 			cp->c_idnext = ct->ct_free;
844 			ct->ct_free = cp;
845 			ct->ct_untimeouts_unexpired++;
846 			ct->ct_timeouts_pending--;
847 			mutex_exit(&ct->ct_mutex);
848 
849 			expiration -= gethrtime();
850 			TRACE_2(TR_FAC_CALLOUT, TR_UNTIMEOUT,
851 			    "untimeout:ID %lx hrtime left %llx", id,
852 			    expiration);
853 			return (expiration < 0 ? 0 : expiration);
854 		}
855 
856 		ct->ct_untimeouts_executing++;
857 		/*
858 		 * The callout we want to delete is currently executing.
859 		 * The DDI states that we must wait until the callout
860 		 * completes before returning, so we block on cl_done until the
861 		 * callout ID changes (to the old ID if it's on the freelist,
862 		 * or to a new callout ID if it's in use).  This implicitly
863 		 * assumes that callout structures are persistent (they are).
864 		 */
865 		if (cl->cl_executor == curthread) {
866 			/*
867 			 * The timeout handler called untimeout() on itself.
868 			 * Stupid, but legal.  We can't wait for the timeout
869 			 * to complete without deadlocking, so we just return.
870 			 */
871 			mutex_exit(&ct->ct_mutex);
872 			TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_SELF,
873 			    "untimeout_self:ID %x", id);
874 			return (-1);
875 		}
876 		if (nowait == 0) {
877 			/*
878 			 * We need to wait. Indicate that we are waiting by
879 			 * incrementing cl_waiting. This prevents the executor
880 			 * from doing a wakeup on cl_done if there are no
881 			 * waiters.
882 			 */
883 			while (cp->c_xid == xid) {
884 				cl->cl_waiting = 1;
885 				cv_wait(&cl->cl_done, &ct->ct_mutex);
886 			}
887 		}
888 		mutex_exit(&ct->ct_mutex);
889 		TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_EXECUTING,
890 		    "untimeout_executing:ID %lx", id);
891 		return (-1);
892 	}
893 	ct->ct_untimeouts_expired++;
894 
895 	mutex_exit(&ct->ct_mutex);
896 	TRACE_1(TR_FAC_CALLOUT, TR_UNTIMEOUT_BOGUS_ID,
897 	    "untimeout_bogus_id:ID %lx", id);
898 
899 	/*
900 	 * We didn't find the specified callout ID.  This means either
901 	 * (1) the callout already fired, or (2) the caller passed us
902 	 * a bogus value.  Perform a sanity check to detect case (2).
903 	 */
904 	bogus = (CALLOUT_EXECUTING | CALLOUT_HRESTIME | CALLOUT_COUNTER_HIGH);
905 	if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
906 		panic("untimeout: impossible timeout id %llx",
907 		    (unsigned long long)id);
908 
909 	return (-1);
910 }
911 
912 clock_t
913 untimeout(timeout_id_t id_arg)
914 {
915 	hrtime_t hleft;
916 	clock_t tleft;
917 	callout_id_t id;
918 
919 	id = (ulong_t)id_arg;
920 	hleft = untimeout_generic(id, 0);
921 	if (hleft < 0)
922 		tleft = -1;
923 	else if (hleft == 0)
924 		tleft = 0;
925 	else
926 		tleft = NSEC_TO_TICK(hleft);
927 
928 	return (tleft);
929 }
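/*
 * An illustrative caller of the above (sc_tid being a hypothetical saved
 * timeout ID):
 *
 *	clock_t left;
 *
 *	left = untimeout(sc->sc_tid);
 *	if (left < 0) {
 *		... the callout already fired or was executing ...
 *	} else {
 *		... cancelled with "left" ticks remaining ...
 *	}
 *
 * Since untimeout() passes nowait == 0, a handler that is currently
 * executing (and did not call untimeout() on itself) has already
 * completed by the time -1 is returned.
 */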
930 
931 /*
932  * Convenience function to untimeout a timeout with a full ID with default
933  * parameters.
934  */
935 clock_t
936 untimeout_default(callout_id_t id, int nowait)
937 {
938 	hrtime_t hleft;
939 	clock_t tleft;
940 
941 	hleft = untimeout_generic(id, nowait);
942 	if (hleft < 0)
943 		tleft = -1;
944 	else if (hleft == 0)
945 		tleft = 0;
946 	else
947 		tleft = NSEC_TO_TICK(hleft);
948 
949 	return (tleft);
950 }
951 
952 /*
953  * Expire all the callouts queued in the specified callout list.
954  */
955 static void
956 callout_list_expire(callout_table_t *ct, callout_list_t *cl)
957 {
958 	callout_t *cp;
959 
960 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
961 	ASSERT(cl != NULL);
962 
963 	cl->cl_executor = curthread;
964 
965 	while ((cp = cl->cl_callouts.ch_head) != NULL) {
966 		/*
967 		 * Indicate to untimeout() that a callout is
968 		 * being expired by the executor.
969 		 */
970 		cp->c_xid |= CALLOUT_EXECUTING;
971 		mutex_exit(&ct->ct_mutex);
972 
973 		DTRACE_PROBE1(callout__start, callout_t *, cp);
974 		(*cp->c_func)(cp->c_arg);
975 		DTRACE_PROBE1(callout__end, callout_t *, cp);
976 
977 		mutex_enter(&ct->ct_mutex);
978 
979 		ct->ct_expirations++;
980 		ct->ct_timeouts_pending--;
981 		/*
982 		 * Indicate completion for cl_done.
983 		 */
984 		cp->c_xid &= ~CALLOUT_EXECUTING;
985 
986 		/*
987 		 * Delete callout from ID hash table and the callout
988 		 * list, return to freelist, and tell any untimeout() that
989 		 * cares that we're done.
990 		 */
991 		CALLOUT_DELETE(ct, cp);
992 		cp->c_idnext = ct->ct_free;
993 		ct->ct_free = cp;
994 
995 		if (cl->cl_waiting) {
996 			cl->cl_waiting = 0;
997 			cv_broadcast(&cl->cl_done);
998 		}
999 	}
1000 
1001 	cl->cl_executor = NULL;
1002 }
1003 
1004 /*
1005  * Execute all expired callout lists for a callout table.
1006  */
1007 static void
1008 callout_expire(callout_table_t *ct)
1009 {
1010 	callout_list_t *cl, *clnext;
1011 
1012 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1013 
1014 	for (cl = ct->ct_expired.ch_head; (cl != NULL); cl = clnext) {
1015 		/*
1016 		 * Multiple executor threads could be running at the same
1017 		 * time. Each callout list is processed by only one thread.
1018 		 * If this callout list is already being processed by another
1019 		 * executor, go on to the next one.
1020 		 */
1021 		if (cl->cl_executor != NULL) {
1022 			clnext = cl->cl_next;
1023 			continue;
1024 		}
1025 
1026 		/*
1027 		 * Expire all the callouts in this callout list.
1028 		 */
1029 		callout_list_expire(ct, cl);
1030 
1031 		/*
1032 		 * Free the callout list.
1033 		 */
1034 		clnext = cl->cl_next;
1035 		CALLOUT_LIST_DELETE(ct->ct_expired, cl);
1036 		cl->cl_next = ct->ct_lfree;
1037 		ct->ct_lfree = cl;
1038 	}
1039 }
1040 
1041 /*
1042  * The cyclic handlers below process callouts in two steps:
1043  *
1044  *	1. Find all expired callout lists and queue them in a separate
1045  *	   list of expired callouts.
1046  *	2. Execute the expired callout lists.
1047  *
1048  * This is done for two reasons:
1049  *
1050  *	1. We want to quickly find the next earliest expiration to program
1051  *	   the cyclic to and reprogram it. We can do this right at the end
1052  *	   of step 1.
1053  *	2. The realtime cyclic handler expires callouts in place. However,
1054  *	   for normal callouts, callouts are expired by a taskq thread.
1055  *	   So, it is simpler and more robust to have the taskq thread just
1056  *	   do step 2.
1057  */
1058 
1059 /*
1060  * Realtime callout cyclic handler.
1061  */
1062 void
1063 callout_realtime(callout_table_t *ct)
1064 {
1065 	mutex_enter(&ct->ct_mutex);
1066 	callout_heap_delete(ct);
1067 	callout_expire(ct);
1068 	mutex_exit(&ct->ct_mutex);
1069 }
1070 
1071 void
1072 callout_execute(callout_table_t *ct)
1073 {
1074 	mutex_enter(&ct->ct_mutex);
1075 	callout_expire(ct);
1076 	mutex_exit(&ct->ct_mutex);
1077 }
1078 
1079 /*
1080  * Normal callout cyclic handler.
1081  */
1082 void
1083 callout_normal(callout_table_t *ct)
1084 {
1085 	int exec;
1086 
1087 	mutex_enter(&ct->ct_mutex);
1088 	callout_heap_delete(ct);
1089 	exec = (ct->ct_expired.ch_head != NULL);
1090 	mutex_exit(&ct->ct_mutex);
1091 
1092 	if (exec) {
1093 		ASSERT(ct->ct_taskq != NULL);
1094 		(void) taskq_dispatch(ct->ct_taskq,
1095 		    (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
1096 	}
1097 }
1098 
1099 /*
1100  * Suspend callout processing.
1101  */
1102 static void
1103 callout_suspend(void)
1104 {
1105 	int t, f;
1106 	callout_table_t *ct;
1107 
1108 	/*
1109 	 * Traverse every callout table in the system and suspend callout
1110 	 * processing.
1111 	 *
1112 	 * We need to suspend all the tables (including the inactive ones)
1113 	 * so that if a table is made active while the suspend is still on,
1114 	 * the table remains suspended.
1115 	 */
1116 	for (f = 0; f < max_ncpus; f++) {
1117 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1118 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1119 
1120 			mutex_enter(&ct->ct_mutex);
1121 			ct->ct_suspend++;
1122 			if (ct->ct_cyclic == CYCLIC_NONE) {
1123 				mutex_exit(&ct->ct_mutex);
1124 				continue;
1125 			}
1126 			if (ct->ct_suspend == 1)
1127 				(void) cyclic_reprogram(ct->ct_cyclic,
1128 				    CY_INFINITY);
1129 			mutex_exit(&ct->ct_mutex);
1130 		}
1131 	}
1132 }
1133 
1134 static void
1135 callout_adjust(callout_table_t *ct, hrtime_t delta)
1136 {
1137 	int hash, newhash;
1138 	hrtime_t expiration;
1139 	callout_list_t *cl;
1140 	callout_hash_t list;
1141 
1142 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1143 
1144 	/*
1145 	 * In order to adjust the expirations, we null out the heap. Then,
1146 	 * we reinsert adjusted expirations in the heap. Keeps it simple.
1147 	 * Note that the caller has suspended the table (ct_suspend != 0),
1148 	 * so the heap insert does not result in cyclic reprogramming.
1149 	 */
1150 	ct->ct_heap_num = 0;
1151 
1152 	/*
1153 	 * First, remove all the callout lists from the table and string them
1154 	 * in a list.
1155 	 */
1156 	list.ch_head = list.ch_tail = NULL;
1157 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1158 		while ((cl = ct->ct_clhash[hash].ch_head) != NULL) {
1159 			CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
1160 			CALLOUT_LIST_APPEND(list, cl);
1161 		}
1162 	}
1163 
1164 	/*
1165 	 * Now, traverse the callout lists and adjust their expirations.
1166 	 */
1167 	while ((cl = list.ch_head) != NULL) {
1168 		CALLOUT_LIST_DELETE(list, cl);
1169 		/*
1170 		 * Set the new expiration and reinsert in the right
1171 		 * hash bucket.
1172 		 */
1173 		expiration = cl->cl_expiration;
1174 		expiration += delta;
1175 		cl->cl_expiration = expiration;
1176 		newhash = CALLOUT_CLHASH(expiration);
1177 		CALLOUT_LIST_INSERT(ct->ct_clhash[newhash], cl);
1178 		callout_heap_insert(ct, expiration);
1179 	}
1180 }
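/*
 * For example, if the system spent delta = 2 seconds in the debugger, a
 * callout list whose cl_expiration was 100s (in gethrtime() terms) is
 * rewritten above to expire at 102s. Relative timeouts therefore still
 * get their full interval after resume; debugger time is not charged to
 * them.
 */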
1181 
1182 /*
1183  * Resume callout processing.
1184  */
1185 static void
1186 callout_resume(hrtime_t delta)
1187 {
1188 	hrtime_t exp;
1189 	int t, f;
1190 	callout_table_t *ct;
1191 
1192 	/*
1193 	 * Traverse every callout table in the system and resume callout
1194 	 * processing. For active tables, perform any hrtime adjustments
1195 	 * necessary.
1196 	 */
1197 	for (f = 0; f < max_ncpus; f++) {
1198 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1199 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1200 
1201 			mutex_enter(&ct->ct_mutex);
1202 			if (ct->ct_cyclic == CYCLIC_NONE) {
1203 				ct->ct_suspend--;
1204 				mutex_exit(&ct->ct_mutex);
1205 				continue;
1206 			}
1207 
1208 			if (delta)
1209 				callout_adjust(ct, delta);
1210 
1211 			ct->ct_suspend--;
1212 			if (ct->ct_suspend == 0) {
1213 				/*
1214 				 * If the expired list is non-empty, then have
1215 				 * the cyclic expire immediately. Else, program
1216 				 * the cyclic based on the heap.
1217 				 */
1218 				if (ct->ct_expired.ch_head != NULL)
1219 					exp = gethrtime();
1220 				else if (ct->ct_heap_num > 0)
1221 					exp = ct->ct_heap[0];
1222 				else
1223 					exp = 0;
1224 				if (exp != 0)
1225 					(void) cyclic_reprogram(ct->ct_cyclic,
1226 					    exp);
1227 			}
1228 			mutex_exit(&ct->ct_mutex);
1229 		}
1230 	}
1231 }
1232 
1233 /*
1234  * Callback handler used by CPR to stop and resume callouts.
1235  */
1236 /*ARGSUSED*/
1237 static boolean_t
1238 callout_cpr_callb(void *arg, int code)
1239 {
1240 	if (code == CB_CODE_CPR_CHKPT)
1241 		callout_suspend();
1242 	else
1243 		callout_resume(0);
1244 
1245 	return (B_TRUE);
1246 }
1247 
1248 /*
1249  * Callback handler invoked when the debugger is entered or exited.
1250  */
1251 /*ARGSUSED*/
1252 static boolean_t
1253 callout_debug_callb(void *arg, int code)
1254 {
1255 	hrtime_t delta;
1256 
1257 	/*
1258 	 * When the system enters the debugger, make a note of the hrtime.
1259 	 * When it is resumed, compute how long the system was in the
1260 	 * debugger. This interval should not be counted for callouts.
1261 	 */
1262 	if (code == 0) {
1263 		callout_suspend();
1264 		callout_debug_hrtime = gethrtime();
1265 	} else {
1266 		delta = gethrtime() - callout_debug_hrtime;
1267 		callout_resume(delta);
1268 	}
1269 
1270 	return (B_TRUE);
1271 }
1272 
1273 /*
1274  * Move the hrestime callouts to the expired list. Then program the table's
1275  * cyclic to expire immediately so that the callouts can be executed
1276  * immediately.
1277  */
1278 static void
1279 callout_hrestime_one(callout_table_t *ct)
1280 {
1281 	callout_list_t *cl, *ecl;
1282 	callout_t *cp;
1283 	int hash;
1284 
1285 	mutex_enter(&ct->ct_mutex);
1286 	if (ct->ct_heap_num == 0) {
1287 		mutex_exit(&ct->ct_mutex);
1288 		return;
1289 	}
1290 
1291 	if (ct->ct_lfree == NULL)
1292 		callout_list_alloc(ct);
1293 	ecl = ct->ct_lfree;
1294 	ct->ct_lfree = ecl->cl_next;
1295 
1296 	for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
1297 		for (cl = ct->ct_clhash[hash].ch_head; cl; cl = cl->cl_next) {
1298 			for (cp = cl->cl_callouts.ch_head; cp;
1299 			    cp = cp->c_clnext) {
1300 				if ((cp->c_xid & CALLOUT_HRESTIME) == 0)
1301 					continue;
1302 				CALLOUT_HASH_DELETE(cl->cl_callouts, cp,
1303 				    c_clnext, c_clprev);
1304 				cp->c_list = ecl;
1305 				CALLOUT_HASH_APPEND(ecl->cl_callouts, cp,
1306 				    c_clnext, c_clprev);
1307 			}
1308 		}
1309 	}
1310 
1311 	if (ecl->cl_callouts.ch_head != NULL) {
1312 		CALLOUT_LIST_APPEND(ct->ct_expired, ecl);
1313 		if (ct->ct_suspend == 0)
1314 			(void) cyclic_reprogram(ct->ct_cyclic, gethrtime());
1315 	} else {
1316 		ecl->cl_next = ct->ct_lfree;
1317 		ct->ct_lfree = ecl;
1318 	}
1319 	mutex_exit(&ct->ct_mutex);
1320 }
1321 
1322 /*
1323  * This function is called whenever system time (hrestime) is changed
1324  * explicitly. All the HRESTIME callouts must be expired at once.
1325  */
1326 /*ARGSUSED*/
1327 void
1328 callout_hrestime(void)
1329 {
1330 	int t, f;
1331 	callout_table_t *ct;
1332 
1333 	/*
1334 	 * Traverse every callout table in the system and process the hrestime
1335 	 * callouts therein.
1336 	 *
1337 	 * We look at all the tables because we don't know which ones were
1338 	 * onlined and offlined in the past. The offlined tables may still
1339 	 * have active cyclics processing timers somewhere.
1340 	 */
1341 	for (f = 0; f < max_ncpus; f++) {
1342 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1343 			ct = &callout_table[CALLOUT_TABLE(t, f)];
1344 			callout_hrestime_one(ct);
1345 		}
1346 	}
1347 }
1348 
1349 /*
1350  * Create the hash tables for this callout table.
1351  */
1352 static void
1353 callout_hash_init(callout_table_t *ct)
1354 {
1355 	size_t size;
1356 
1357 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1358 	ASSERT((ct->ct_idhash == NULL) && (ct->ct_clhash == NULL));
1359 
1360 	size = sizeof (callout_hash_t) * CALLOUT_BUCKETS;
1361 	ct->ct_idhash = kmem_zalloc(size, KM_SLEEP);
1362 	ct->ct_clhash = kmem_zalloc(size, KM_SLEEP);
1363 }
1364 
1365 /*
1366  * Create per-callout table kstats.
1367  */
1368 static void
1369 callout_kstat_init(callout_table_t *ct)
1370 {
1371 	callout_stat_type_t stat;
1372 	kstat_t *ct_kstats;
1373 	int ndx;
1374 
1375 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1376 	ASSERT(ct->ct_kstats == NULL);
1377 
1378 	ndx = ct - callout_table;
1379 	ct_kstats = kstat_create("unix", ndx, "callout",
1380 	    "misc", KSTAT_TYPE_NAMED, CALLOUT_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1381 
1382 	if (ct_kstats == NULL) {
1383 		cmn_err(CE_WARN, "kstat_create for callout table %p failed",
1384 		    (void *)ct);
1385 	} else {
1386 		ct_kstats->ks_data = ct->ct_kstat_data;
1387 		for (stat = 0; stat < CALLOUT_NUM_STATS; stat++)
1388 			kstat_named_init(&ct->ct_kstat_data[stat],
1389 			    callout_kstat_names[stat], KSTAT_DATA_INT64);
1390 		ct->ct_kstats = ct_kstats;
1391 		kstat_install(ct_kstats);
1392 	}
1393 }
1394 
1395 static void
1396 callout_cyclic_init(callout_table_t *ct)
1397 {
1398 	cyc_handler_t hdlr;
1399 	cyc_time_t when;
1400 	processorid_t seqid;
1401 	int t;
1402 
1403 	ASSERT(MUTEX_HELD(&ct->ct_mutex));
1404 
1405 	t = CALLOUT_TABLE_TYPE(ct);
1406 	seqid = CALLOUT_TABLE_SEQID(ct);
1407 
1408 	/*
1409 	 * Create the taskq thread if the table type is normal.
1410 	 * Realtime tables are handled at PIL1 by a softint
1411 	 * handler.
1412 	 */
1413 	if (t == CALLOUT_NORMAL) {
1414 		ASSERT(ct->ct_taskq == NULL);
1415 		/*
1416 		 * Each callout thread consumes exactly one
1417 		 * task structure while active.  Therefore,
1418 		 * prepopulating with 2 * CALLOUT_THREADS tasks
1419 		 * ensures that there's at least one task per
1420 		 * thread that's either scheduled or on the
1421 		 * freelist.  In turn, this guarantees that
1422 		 * taskq_dispatch() will always either succeed
1423 		 * (because there's a free task structure) or
1424 		 * be unnecessary (because "callout_execute(ct)"
1425 		 * has already been scheduled).
1426 		 */
1427 		ct->ct_taskq =
1428 		    taskq_create_instance("callout_taskq", seqid,
1429 		    CALLOUT_THREADS, maxclsyspri,
1430 		    2 * CALLOUT_THREADS, 2 * CALLOUT_THREADS,
1431 		    TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
1432 	}
1433 
1434 	/*
1435 	 * callouts can only be created in a table whose
1436 	 * cyclic has been initialized.
1437 	 */
1438 	ASSERT(ct->ct_heap_num == 0);
1439 
1440 	/*
1441 	 * Create the callout table cyclics.
1442 	 */
1443 	ASSERT(ct->ct_cyclic == CYCLIC_NONE);
1444 
1445 	hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
1446 	hdlr.cyh_level = CY_LOW_LEVEL;
1447 	hdlr.cyh_arg = ct;
1448 	when.cyt_when = CY_INFINITY;
1449 	when.cyt_interval = CY_INFINITY;
1450 
1451 	ct->ct_cyclic = cyclic_add(&hdlr, &when);
1452 }
1453 
1454 void
1455 callout_cpu_online(cpu_t *cp)
1456 {
1457 	lgrp_handle_t hand;
1458 	callout_cache_t *cache;
1459 	char s[KMEM_CACHE_NAMELEN];
1460 	callout_table_t *ct;
1461 	processorid_t seqid;
1462 	int t;
1463 
1464 	ASSERT(MUTEX_HELD(&cpu_lock));
1465 
1466 	/*
1467 	 * Locate the cache corresponding to the onlined CPU's lgroup.
1468 	 * Note that access to callout_caches is protected by cpu_lock.
1469 	 */
1470 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
1471 	for (cache = callout_caches; cache != NULL; cache = cache->cc_next) {
1472 		if (cache->cc_hand == hand)
1473 			break;
1474 	}
1475 
1476 	/*
1477 	 * If not found, create one. The caches are never destroyed.
1478 	 */
1479 	if (cache == NULL) {
1480 		cache = kmem_alloc(sizeof (callout_cache_t), KM_SLEEP);
1481 		cache->cc_hand = hand;
1482 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_cache%lx",
1483 		    (long)hand);
1484 		cache->cc_cache = kmem_cache_create(s, sizeof (callout_t),
1485 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1486 		(void) snprintf(s, KMEM_CACHE_NAMELEN, "callout_lcache%lx",
1487 		    (long)hand);
1488 		cache->cc_lcache = kmem_cache_create(s, sizeof (callout_list_t),
1489 		    CALLOUT_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
1490 		cache->cc_next = callout_caches;
1491 		callout_caches = cache;
1492 	}
1493 
1494 	seqid = cp->cpu_seqid;
1495 
1496 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1497 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1498 
1499 		mutex_enter(&ct->ct_mutex);
1500 		/*
1501 		 * Store convenience pointers to the kmem caches
1502 		 * in the callout table. These assignments must always be
1503 		 * done because callout tables can map to different physical
1504 		 * CPUs each time.
1505 		 */
1506 		ct->ct_cache = cache->cc_cache;
1507 		ct->ct_lcache = cache->cc_lcache;
1508 
1509 		/*
1510 		 * We use the heap pointer to check if stuff has been
1511 		 * initialized for this callout table.
1512 		 */
1513 		if (ct->ct_heap == NULL) {
1514 			callout_heap_init(ct);
1515 			callout_hash_init(ct);
1516 			callout_kstat_init(ct);
1517 			callout_cyclic_init(ct);
1518 		}
1519 
1520 		mutex_exit(&ct->ct_mutex);
1521 
1522 		/*
1523 		 * Move the cyclic to this CPU by doing a bind.
1524 		 */
1525 		cyclic_bind(ct->ct_cyclic, cp, NULL);
1526 	}
1527 }
1528 
1529 void
1530 callout_cpu_offline(cpu_t *cp)
1531 {
1532 	callout_table_t *ct;
1533 	processorid_t seqid;
1534 	int t;
1535 
1536 	ASSERT(MUTEX_HELD(&cpu_lock));
1537 
1538 	seqid = cp->cpu_seqid;
1539 
1540 	for (t = 0; t < CALLOUT_NTYPES; t++) {
1541 		ct = &callout_table[CALLOUT_TABLE(t, seqid)];
1542 
1543 		/*
1544 		 * Unbind the cyclic. This will allow the cyclic subsystem
1545 		 * to juggle the cyclic during CPU offline.
1546 		 */
1547 		cyclic_bind(ct->ct_cyclic, NULL, NULL);
1548 	}
1549 }
1550 
1551 /*
1552  * This is called to perform per-CPU initialization for slave CPUs at
1553  * boot time.
1554  */
1555 void
1556 callout_mp_init(void)
1557 {
1558 	cpu_t *cp;
1559 
1560 	mutex_enter(&cpu_lock);
1561 
1562 	cp = cpu_active;
1563 	do {
1564 		callout_cpu_online(cp);
1565 	} while ((cp = cp->cpu_next_onln) != cpu_active);
1566 
1567 	mutex_exit(&cpu_lock);
1568 }
1569 
1570 /*
1571  * Initialize all callout tables.  Called at boot time just before clkstart().
1572  */
1573 void
1574 callout_init(void)
1575 {
1576 	int f, t;
1577 	size_t size;
1578 	int table_id;
1579 	callout_table_t *ct;
1580 	long bits, fanout;
1581 	uintptr_t buf;
1582 
1583 	/*
1584 	 * Initialize callout globals.
1585 	 */
1586 	bits = 0;
1587 	for (fanout = 1; (fanout < max_ncpus); fanout <<= 1)
1588 		bits++;
1589 	callout_table_bits = CALLOUT_TYPE_BITS + bits;
1590 	callout_table_mask = (1 << callout_table_bits) - 1;
1591 	callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
1592 	callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
1593 	callout_max_ticks = CALLOUT_MAX_TICKS;
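	/*
	 * For example, with max_ncpus = 64, the loop above executes six
	 * times (fanout reaches 64 after six doublings), so bits = 6 and
	 * callout_table_bits = CALLOUT_TYPE_BITS + 6; callout_table_mask
	 * then selects exactly the ID bits that name the owning table.
	 */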
1594 
1595 	/*
1596 	 * Because of the variability in timing behavior across systems with
1597 	 * different architectures, we cannot allow arbitrarily low
1598 	 * resolutions. The minimum resolution has to be determined in a
1599 	 * platform-specific way. Until then, we define a blanket minimum
1600 	 * resolution for callouts of CALLOUT_MIN_RESOLUTION.
1601 	 *
1602 	 * If, in the future, someone requires lower resolution timers, they
1603 	 * can do one of two things:
1604 	 *
1605 	 *	- Define a lower value for callout_min_resolution. This would
1606 	 *	  affect all clients of the callout subsystem. If this is done
1607 	 *	  via /etc/system, then no code changes are required and it
1608 	 *	  would affect only that customer.
1609 	 *
1610 	 *	- Define a flag to be passed to timeout creation that allows
1611 	 *	  the lower resolution. This involves code changes. But it
1612 	 *	  would affect only the calling module. It is the developer's
1613 	 *	  responsibility to test on all systems and make sure that
1614 	 *	  everything works.
1615 	 */
1616 	if (callout_min_resolution <= 0)
1617 		callout_min_resolution = CALLOUT_MIN_RESOLUTION;
1618 
1619 	/*
1620 	 * Allocate all the callout tables based on max_ncpus. We have chosen
1621 	 * to do boot-time allocation instead of dynamic allocation because:
1622 	 *
1623 	 *	- the size of the callout tables is not too large.
1624 	 *	- there are race conditions involved in making this dynamic.
1625 	 *	- the hash tables that go with the callout tables consume
1626 	 *	  most of the memory and they are only allocated in
1627 	 *	  callout_cpu_online().
1628 	 *
1629 	 * Each CPU has two tables that are consecutive in the array. The first
1630 	 * one is for realtime callouts and the second one is for normal ones.
1631 	 *
1632 	 * We do this alignment dance to make sure that callout table
1633 	 * structures will always be on a cache line boundary.
1634 	 */
1635 	size = sizeof (callout_table_t) * CALLOUT_NTYPES * max_ncpus;
1636 	size += CALLOUT_ALIGN;
1637 	buf = (uintptr_t)kmem_zalloc(size, KM_SLEEP);
1638 	callout_table = (callout_table_t *)P2ROUNDUP(buf, CALLOUT_ALIGN);
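	/*
	 * To illustrate the alignment dance: if kmem_zalloc() happened to
	 * return buf = 0x1028 and CALLOUT_ALIGN is a 64-byte cache line,
	 * then P2ROUNDUP(buf, 64) = 0x1040; the extra CALLOUT_ALIGN bytes
	 * added to the allocation guarantee that the rounded-up array still
	 * fits within it.
	 */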
1639 
1640 	size = sizeof (kstat_named_t) * CALLOUT_NUM_STATS;
1641 	/*
1642 	 * Now, initialize the tables for all the CPUs.
1643 	 */
1644 	for (f = 0; f < max_ncpus; f++) {
1645 		for (t = 0; t < CALLOUT_NTYPES; t++) {
1646 			table_id = CALLOUT_TABLE(t, f);
1647 			ct = &callout_table[table_id];
1648 			ct->ct_type = t;
1649 			mutex_init(&ct->ct_mutex, NULL, MUTEX_DEFAULT, NULL);
1650 			/*
1651 			 * Precompute the base IDs for long and short-term
1652 			 * legacy IDs. This makes ID generation during
1653 			 * timeout() fast.
1654 			 */
1655 			ct->ct_short_id = CALLOUT_SHORT_ID(table_id);
1656 			ct->ct_long_id = CALLOUT_LONG_ID(table_id);
1657 			/*
1658 			 * Precompute the base ID for generation-based IDs.
1659 			 * Note that when the first ID gets allocated, the
1660 			 * ID will wrap. This will cause the generation
1661 			 * number to be incremented to 1.
1662 			 */
1663 			ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
1664 			/*
1665 			 * Initialize the cyclic as NONE. This will get set
1666 			 * during CPU online. This is so that partially
1667 			 * populated systems will only have the required
1668 			 * number of cyclics, not more.
1669 			 */
1670 			ct->ct_cyclic = CYCLIC_NONE;
1671 			ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
1672 		}
1673 	}
1674 
1675 	/*
1676 	 * Add the callback for CPR. This is called during checkpoint
1677 	 * resume to suspend and resume callouts.
1678 	 */
1679 	(void) callb_add(callout_cpr_callb, 0, CB_CL_CPR_CALLOUT,
1680 	    "callout_cpr");
1681 	(void) callb_add(callout_debug_callb, 0, CB_CL_ENTER_DEBUGGER,
1682 	    "callout_debug");
1683 
1684 	/*
1685 	 * Call the per-CPU initialization function for the boot CPU. This
1686 	 * is done here because the function is not called automatically for
1687 	 * the boot CPU from the CPU online/offline hooks. Note that the
1688 	 * CPU lock is taken here by convention.
1689 	 */
1690 	mutex_enter(&cpu_lock);
1691 	callout_boot_ct = &callout_table[CALLOUT_TABLE(0, CPU->cpu_seqid)];
1692 	callout_cpu_online(CPU);
1693 	mutex_exit(&cpu_lock);
1694 }
1695