/*	$OpenBSD: subr_witness.c,v 1.37 2020/03/15 05:58:48 visa Exp $	*/

/*-
 * Copyright (c) 2008 Isilon Systems, Inc.
 * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
 * Copyright (c) 1998 Berkeley Software Design, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from BSDI Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp
 *	and BSDI Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp
 */

/*
 * Implementation of the `witness' lock verifier.  Originally implemented for
 * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
 * classes in FreeBSD.
 */

/*
 *	Main Entry: witness
 *	Pronunciation: 'wit-n&s
 *	Function: noun
 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
 *	    testimony, witness, from 2wit
 *	Date: before 12th century
 *	1 : attestation of a fact or event : TESTIMONY
 *	2 : one that gives evidence; specifically : one who testifies in
 *	    a cause or before a judicial tribunal
 *	3 : one asked to be present at a transaction so as to be able to
 *	    testify to its having taken place
 *	4 : one who has personal knowledge of something
 *	5 a : something serving as evidence or proof : SIGN
 *	  b : public affirmation by word or example of usually
 *	      religious faith or conviction <the heroic witness to divine
 *	      life -- Pilot>
 *	6 capitalized : a member of the Jehovah's Witnesses
 */

/*
 * Special rules concerning Giant and lock orders:
 *
 * 1) Giant must be acquired before any other mutexes.  Stated another way,
 *    no other mutex may be held when Giant is acquired.
 *
 * 2) Giant must be released when blocking on a sleepable lock.
 *
 * This rule is less obvious, but is a result of Giant providing the same
 * semantics as spl().  Basically, when a thread sleeps, it must release
 * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
 * 2).
 *
 * 3) Giant may be acquired before or after sleepable locks.
 *
 * This rule is also not quite as obvious.  Giant may be acquired after
 * a sleepable lock because it is a non-sleepable lock and non-sleepable
 * locks may always be acquired while holding a sleepable lock.  The second
 * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
 * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
 * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
 * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
 * execute.  Thus, acquiring Giant both before and after a sleepable lock
 * will not result in a lock order reversal.
 */
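
/*
 * A minimal sketch of the rules above (illustrative only: the lock names
 * are hypothetical, and "Giant" corresponds to the kernel lock in this
 * port, see is_kernel_lock() below):
 *
 *	rw_enter_write(&sleepable_lock);   sleepable lock first
 *	KERNEL_LOCK();                     rule 3: Giant after a sleepable
 *	                                   lock is fine
 *	mtx_enter(&some_mutex);            rule 1: other mutexes only after
 *	mtx_leave(&some_mutex);            Giant
 *	KERNEL_UNLOCK();
 *	rw_exit_write(&sleepable_lock);
 *
 * Acquiring some_mutex before Giant would violate rule 1; blocking on
 * sleepable_lock while holding Giant is covered by rule 2, under which
 * the blocking thread releases Giant.
 */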

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#ifdef MULTIPROCESSOR
#include <sys/mplock.h>
#endif
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/stacktrace.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/witness.h>

#include <machine/cpu.h>

#include <uvm/uvm_extern.h>	/* uvm_pageboot_alloc */

#ifndef DDB
#error "DDB is required for WITNESS"
#endif

#include <machine/db_machdep.h>

#include <ddb/db_access.h>
#include <ddb/db_var.h>
#include <ddb/db_output.h>

#define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
#define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
#define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */

#ifndef WITNESS_COUNT
#define	WITNESS_COUNT		1536
#endif
#define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
#define	WITNESS_PENDLIST	(1024 + MAXCPUS)

/* Allocate 256 KB of stack data space */
#define	WITNESS_LO_DATA_COUNT	2048

/* Prime, gives load factor of ~2 at full load */
#define	WITNESS_LO_HASH_SIZE	1021

/*
 * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
 * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
 * probably be safe for the most part, but it's still a SWAG.
 */
#define	LOCK_NCHILDREN	5
#define	LOCK_CHILDCOUNT	2048

#define	FULLGRAPH_SBUF_SIZE	512

/*
 * These flags go in the witness relationship matrix and describe the
 * relationship between any two struct witness objects.
 */
#define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
#define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
#define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
#define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
#define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
#define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
#define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
#define	WITNESS_RELATED_MASK						\
	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
#define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
					  * observed. */
#define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
#define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
#define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */

/* Descendant to ancestor flags */
#define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)

/* Ancestor to descendant flags */
#define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
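
/*
 * Worked example of the shifts above: if w1 is a direct parent of w2,
 * w_rmatrix[w1][w2] has WITNESS_PARENT (0x01) set and w_rmatrix[w2][w1]
 * has WITNESS_CHILD (0x04) set.  WITNESS_ATOD(0x01) == 0x04 and
 * WITNESS_DTOA(0x04) == 0x01, so the two directions of the matrix can be
 * checked for consistency against each other, as _isitmyx() does below.
 */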

#define	WITNESS_INDEX_ASSERT(i)						\
	KASSERT((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)

/*
 * Lock classes.  Each lock has a class which describes characteristics
 * common to all types of locks of a given class.
 *
 * Spin locks in general must always protect against preemption, as it is
 * an error to perform any type of context switch while holding a spin lock.
 * Also, for an individual lock to be recursable, its class must allow
 * recursion and the lock itself must explicitly allow recursion.
 */

struct lock_class {
	const		char *lc_name;
	u_int		lc_flags;
};

union lock_stack {
	union lock_stack	*ls_next;
	struct stacktrace	 ls_stack;
};

#define	LC_SLEEPLOCK	0x00000001	/* Sleep lock. */
#define	LC_SPINLOCK	0x00000002	/* Spin lock. */
#define	LC_SLEEPABLE	0x00000004	/* Sleeping allowed with this lock. */
#define	LC_RECURSABLE	0x00000008	/* Locks of this type may recurse. */
#define	LC_UPGRADABLE	0x00000010	/* Upgrades and downgrades permitted. */

/*
 * Lock instances.  A lock instance is the data associated with a lock while
 * it is held by witness.  For example, a lock instance will hold the
 * recursion count of a lock.  Lock instances are held in lists.  Spin locks
 * are held in a per-cpu list while sleep locks are held in a per-thread list.
 */
struct lock_instance {
	struct lock_object	*li_lock;
	union lock_stack	*li_stack;
	u_int			li_flags;
};

/*
 * A simple list type used to build the list of locks held by a thread
 * or CPU.  We can't simply embed the list in struct lock_object since a
 * lock may be held by more than one thread if it is a shared lock.  Locks
 * are added to the head of the list, so we fill up each list entry from
 * "the back" logically.  To ease some of the arithmetic, we actually fill
 * in each list entry the normal way (children[0] then children[1], etc.) but
 * when we traverse the list we read children[count-1] as the first entry
 * down to children[0] as the final entry.
 */
struct lock_list_entry {
	struct lock_list_entry	*ll_next;
	struct lock_instance	ll_children[LOCK_NCHILDREN];
	int			ll_count;
};
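
/*
 * For illustration, walking every held lock from most recently acquired
 * to oldest therefore looks like this (a sketch with a hypothetical
 * inspect() helper; see the loops in witness_checkorder() and
 * witness_thread_exit() for real uses):
 *
 *	struct lock_list_entry *lle;
 *	int i;
 *
 *	for (lle = list; lle != NULL; lle = lle->ll_next)
 *		for (i = lle->ll_count - 1; i >= 0; i--)
 *			inspect(&lle->ll_children[i]);
 */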

/*
 * The main witness structure. One of these per named lock type in the system
 * (for example, "vnode interlock").
 */
struct witness {
	const struct lock_type	*w_type;
	const char		*w_subtype;
	uint32_t		w_index;  /* Index in the relationship matrix */
	struct lock_class	*w_class;
	SIMPLEQ_ENTRY(witness)	w_list;		/* List of all witnesses. */
	SIMPLEQ_ENTRY(witness)	w_typelist;	/* Witnesses of a type. */
	struct witness		*w_hash_next; /* Linked list in hash buckets. */
	uint16_t		w_num_ancestors; /* direct/indirect
						  * ancestor count */
	uint16_t		w_num_descendants; /* direct/indirect
						    * descendant count */
	int16_t			w_ddb_level;
	unsigned		w_acquired:1;
	unsigned		w_displayed:1;
	unsigned		w_reversed:1;
};

SIMPLEQ_HEAD(witness_list, witness);

/*
 * The witness hash table. Keys are witness names (const char *), elements are
 * witness objects (struct witness *).
 */
struct witness_hash {
	struct witness	*wh_array[WITNESS_HASH_SIZE];
	uint32_t	wh_size;
	uint32_t	wh_count;
};

/*
 * Key type for the lock order data hash table.
 */
struct witness_lock_order_key {
	uint16_t	from;
	uint16_t	to;
};

struct witness_lock_order_data {
	struct stacktrace		wlod_stack;
	struct witness_lock_order_key	wlod_key;
	struct witness_lock_order_data	*wlod_next;
};

/*
 * The witness lock order data hash table. Keys are witness index tuples
 * (struct witness_lock_order_key), elements are lock order data objects
 * (struct witness_lock_order_data).
 */
struct witness_lock_order_hash {
	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
	u_int	wloh_size;
	u_int	wloh_count;
};
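
/*
 * For illustration, a lookup of the order "w1 before w2" keys the table
 * with the pair of witness indices and walks one bucket chain (a sketch,
 * where hash() stands in for the hash function used by the table; the
 * real lookup is witness_lock_order_get(), declared below):
 *
 *	struct witness_lock_order_key key = {
 *		.from = w1->w_index,
 *		.to = w2->w_index,
 *	};
 *	struct witness_lock_order_data *data;
 *
 *	data = w_lohash.wloh_array[hash(&key) % WITNESS_LO_HASH_SIZE];
 *	while (data != NULL &&
 *	    !witness_lock_order_key_equal(&data->wlod_key, &key))
 *		data = data->wlod_next;
 */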

struct witness_pendhelp {
	const struct lock_type	*wh_type;
	struct lock_object	*wh_lock;
};

struct witness_cpu {
	struct lock_list_entry	*wc_spinlocks;
	struct lock_list_entry	*wc_lle_cache;
	union lock_stack	*wc_stk_cache;
	unsigned int		 wc_lle_count;
	unsigned int		 wc_stk_count;
} __aligned(CACHELINESIZE);

#define WITNESS_LLE_CACHE_MAX	8
#define WITNESS_STK_CACHE_MAX	(WITNESS_LLE_CACHE_MAX * LOCK_NCHILDREN)

struct witness_cpu witness_cpu[MAXCPUS];

/*
 * Returns 0 if one of the locks is a spin lock and the other is not.
 * Returns 1 otherwise.
 */
static __inline int
witness_lock_type_equal(struct witness *w1, struct witness *w2)
{

	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
}

static __inline int
witness_lock_order_key_equal(const struct witness_lock_order_key *a,
    const struct witness_lock_order_key *b)
{

	return (a->from == b->from && a->to == b->to);
}

static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
		    const char *fname);
static void	adopt(struct witness *parent, struct witness *child);
static struct witness	*enroll(const struct lock_type *, const char *,
			    struct lock_class *);
static struct lock_instance	*find_instance(struct lock_list_entry *list,
				    const struct lock_object *lock);
static int	isitmychild(struct witness *parent, struct witness *child);
static int	isitmydescendant(struct witness *parent, struct witness *child);
static void	itismychild(struct witness *parent, struct witness *child);
#ifdef DDB
static void	db_witness_add_fullgraph(struct witness *parent);
static void	witness_ddb_compute_levels(void);
static void	witness_ddb_display(int(*)(const char *fmt, ...));
static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
		    struct witness *, int indent);
static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
		    struct witness_list *list);
static void	witness_ddb_level_descendants(struct witness *parent, int l);
static void	witness_ddb_list(struct proc *td);
#endif
static int	witness_alloc_stacks(void);
static void	witness_debugger(int dump);
static void	witness_free(struct witness *m);
static struct witness	*witness_get(void);
static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
static struct witness	*witness_hash_get(const struct lock_type *,
		    const char *);
static void	witness_hash_put(struct witness *w);
static void	witness_init_hash_tables(void);
static void	witness_increment_graph_generation(void);
static int	witness_list_locks(struct lock_list_entry **,
		    int (*)(const char *, ...));
static void	witness_lock_list_free(struct lock_list_entry *lle);
static struct lock_list_entry	*witness_lock_list_get(void);
static void	witness_lock_stack_free(union lock_stack *stack);
static union lock_stack		*witness_lock_stack_get(void);
static int	witness_lock_order_add(struct witness *parent,
		    struct witness *child);
static int	witness_lock_order_check(struct witness *parent,
		    struct witness *child);
static struct witness_lock_order_data	*witness_lock_order_get(
					    struct witness *parent,
					    struct witness *child);
static void	witness_list_lock(struct lock_instance *instance,
		    int (*prnt)(const char *fmt, ...));
static void	witness_setflag(struct lock_object *lock, int flag, int set);

/*
 * If set to 0, lock order checking is disabled.  If set to -1,
 * witness is completely disabled.  Otherwise witness performs full
 * lock order checking for all locks; values greater than 1 also
 * print the stack traces of where each lock order was first seen
 * when reporting a reversal.  At runtime, lock order checking may
 * be toggled.  However, witness cannot be reenabled once it is
 * completely disabled.
 */
#ifdef WITNESS_WATCH
static int witness_watch = 3;
#else
static int witness_watch = 2;
#endif

#ifdef WITNESS_LOCKTRACE
static int witness_locktrace = 1;
#else
static int witness_locktrace = 0;
#endif

int witness_count = WITNESS_COUNT;

static struct mutex w_mtx;
static struct rwlock w_ctlock = RWLOCK_INITIALIZER("w_ctlock");

/* w_list */
static struct witness_list w_free = SIMPLEQ_HEAD_INITIALIZER(w_free);
static struct witness_list w_all = SIMPLEQ_HEAD_INITIALIZER(w_all);

/* w_typelist */
static struct witness_list w_spin = SIMPLEQ_HEAD_INITIALIZER(w_spin);
static struct witness_list w_sleep = SIMPLEQ_HEAD_INITIALIZER(w_sleep);

/* lock list */
static struct lock_list_entry *w_lock_list_free = NULL;
static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
static u_int pending_cnt;

static int w_free_cnt, w_spin_cnt, w_sleep_cnt;

static struct witness *w_data;
static uint8_t **w_rmatrix;
static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
static struct witness_hash w_hash;	/* The witness hash table. */

/* The lock order data hash */
static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
static struct witness_lock_order_data *w_lofree = NULL;
static struct witness_lock_order_hash w_lohash;
static int w_max_used_index = 0;
static unsigned int w_generation = 0;

static union lock_stack *w_lock_stack_free;
static unsigned int w_lock_stack_num;

static struct lock_class lock_class_kernel_lock = {
	.lc_name = "kernel_lock",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE
};

static struct lock_class lock_class_sched_lock = {
	.lc_name = "sched_lock",
	.lc_flags = LC_SPINLOCK | LC_RECURSABLE
};

static struct lock_class lock_class_mutex = {
	.lc_name = "mutex",
	.lc_flags = LC_SPINLOCK
};

static struct lock_class lock_class_rwlock = {
	.lc_name = "rwlock",
	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_UPGRADABLE
};

static struct lock_class lock_class_rrwlock = {
	.lc_name = "rrwlock",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE |
	    LC_UPGRADABLE
};

static struct lock_class *lock_classes[] = {
	&lock_class_kernel_lock,
	&lock_class_sched_lock,
	&lock_class_mutex,
	&lock_class_rwlock,
	&lock_class_rrwlock,
};

/*
 * This global is set to 0 once it becomes safe to use the witness code.
 */
static int witness_cold = 1;

/*
 * This global is set to 1 once the static lock orders have been enrolled
 * so that a warning can be issued for any spin locks enrolled later.
 */
static int witness_spin_warn = 0;

/*
 * The WITNESS-enabled diagnostic code.  Note that the witness code
 * assumes that early boot is single-threaded, at least until this
 * routine has completed.
 */
void
witness_initialize(void)
{
	struct lock_object *lock;
	union lock_stack *stacks;
	struct witness *w;
	int i, s;

	w_data = (void *)uvm_pageboot_alloc(sizeof(struct witness) *
	    witness_count);
	memset(w_data, 0, sizeof(struct witness) * witness_count);

	w_rmatrix = (void *)uvm_pageboot_alloc(sizeof(*w_rmatrix) *
	    (witness_count + 1));

	for (i = 0; i < witness_count + 1; i++) {
		w_rmatrix[i] = (void *)uvm_pageboot_alloc(
		    sizeof(*w_rmatrix[i]) * (witness_count + 1));
		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
		    (witness_count + 1));
	}

	mtx_init_flags(&w_mtx, IPL_HIGH, "witness lock", MTX_NOWITNESS);
	for (i = witness_count - 1; i >= 0; i--) {
		w = &w_data[i];
		memset(w, 0, sizeof(*w));
		w_data[i].w_index = i;	/* Witness index never changes. */
		witness_free(w);
	}
	KASSERTMSG(SIMPLEQ_FIRST(&w_free)->w_index == 0,
	    "%s: Invalid list of free witness objects", __func__);

	/* The witness with index 0 is deliberately left unused, to aid debugging. */
	SIMPLEQ_REMOVE_HEAD(&w_free, w_list);
	w_free_cnt--;

	for (i = 0; i < witness_count; i++) {
		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
		    (witness_count + 1));
	}

	if (witness_locktrace) {
		w_lock_stack_num = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
		stacks = (void *)uvm_pageboot_alloc(sizeof(*stacks) *
		    w_lock_stack_num);
	}

	s = splhigh();
	for (i = 0; i < w_lock_stack_num; i++)
		witness_lock_stack_free(&stacks[i]);
	for (i = 0; i < LOCK_CHILDCOUNT; i++)
		witness_lock_list_free(&w_locklistdata[i]);
	splx(s);
	witness_init_hash_tables();
	witness_spin_warn = 1;

	/* Iterate through all locks and add them to witness. */
	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
		lock = pending_locks[i].wh_lock;
		KASSERTMSG(lock->lo_flags & LO_WITNESS,
		    "%s: lock %s is on pending list but not LO_WITNESS",
		    __func__, lock->lo_name);
		lock->lo_witness = enroll(pending_locks[i].wh_type,
		    lock->lo_name, LOCK_CLASS(lock));
	}

	/* Mark the witness code as being ready for use. */
	witness_cold = 0;
}
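
/*
 * A sizing note on the allocations above (arithmetic only, assuming the
 * default WITNESS_COUNT of 1536): w_rmatrix holds (witness_count + 1)
 * rows of (witness_count + 1) one-byte entries each, i.e. 1537 * 1537
 * bytes, roughly 2.3 MB, reserved once at boot via uvm_pageboot_alloc()
 * and never freed.
 */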

void
witness_init(struct lock_object *lock, const struct lock_type *type)
{
	struct lock_class *class;

	/* Various sanity checks. */
	class = LOCK_CLASS(lock);
	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
	    (class->lc_flags & LC_RECURSABLE) == 0)
		panic("%s: lock (%s) %s can not be recursable",
		    __func__, class->lc_name, lock->lo_name);
	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
	    (class->lc_flags & LC_SLEEPABLE) == 0)
		panic("%s: lock (%s) %s can not be sleepable",
		    __func__, class->lc_name, lock->lo_name);
	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
	    (class->lc_flags & LC_UPGRADABLE) == 0)
		panic("%s: lock (%s) %s can not be upgradable",
		    __func__, class->lc_name, lock->lo_name);

	/*
	 * If we shouldn't watch this lock, then just clear lo_witness.
	 * Record the type in case the lock becomes watched later.
	 * Otherwise, if witness_cold is set, then it is too early to
	 * enroll this lock, so defer it to witness_initialize() by adding
	 * it to the pending_locks list.  If it is not too early, then enroll
	 * the lock now.
	 */
	if (witness_watch < 1 || panicstr != NULL || db_active ||
	    (lock->lo_flags & LO_WITNESS) == 0) {
		lock->lo_witness = NULL;
		lock->lo_type = type;
	} else if (witness_cold) {
		pending_locks[pending_cnt].wh_lock = lock;
		pending_locks[pending_cnt++].wh_type = type;
		if (pending_cnt > WITNESS_PENDLIST)
			panic("%s: pending locks list is too small, "
			    "increase WITNESS_PENDLIST",
			    __func__);
	} else
		lock->lo_witness = enroll(type, lock->lo_name, class);
}

static inline int
is_kernel_lock(const struct lock_object *lock)
{
#ifdef MULTIPROCESSOR
	return (lock == &kernel_lock.mpl_lock_obj);
#else
	return (0);
#endif
}

#ifdef DDB
static void
witness_ddb_compute_levels(void)
{
	struct witness *w;

	/*
	 * First clear all levels.
	 */
	SIMPLEQ_FOREACH(w, &w_all, w_list)
		w->w_ddb_level = -1;

	/*
	 * Look for locks with no parents and level all their descendants.
	 */
	SIMPLEQ_FOREACH(w, &w_all, w_list) {

		/* If the witness has ancestors (is not a root), skip it. */
		if (w->w_num_ancestors > 0)
			continue;
		witness_ddb_level_descendants(w, 0);
	}
}

static void
witness_ddb_level_descendants(struct witness *w, int l)
{
	int i;

	if (w->w_ddb_level >= l)
		return;

	w->w_ddb_level = l;
	l++;

	for (i = 1; i <= w_max_used_index; i++) {
		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
			witness_ddb_level_descendants(&w_data[i], l);
	}
}

static void
witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
    struct witness *w, int indent)
{
	int i;

	for (i = 0; i < indent; i++)
		prnt(" ");
	prnt("%s (type: %s, depth: %d)",
	     w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
	if (w->w_displayed) {
		prnt(" -- (already displayed)\n");
		return;
	}
	w->w_displayed = 1;
	if (!w->w_acquired)
		prnt(" -- never acquired\n");
	else
		prnt("\n");
	indent++;
	WITNESS_INDEX_ASSERT(w->w_index);
	for (i = 1; i <= w_max_used_index; i++) {
		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
			witness_ddb_display_descendants(prnt, &w_data[i],
			    indent);
	}
}

static void
witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
    struct witness_list *list)
{
	struct witness *w;

	SIMPLEQ_FOREACH(w, list, w_typelist) {
		if (!w->w_acquired || w->w_ddb_level > 0)
			continue;

		/* This lock has no ancestors - display its descendants. */
		witness_ddb_display_descendants(prnt, w, 0);
	}
}

static void
witness_ddb_display(int(*prnt)(const char *fmt, ...))
{
	struct witness *w;

	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
	witness_ddb_compute_levels();

	/* Clear all the displayed flags. */
	SIMPLEQ_FOREACH(w, &w_all, w_list)
		w->w_displayed = 0;

	/*
	 * First, handle sleep locks which have been acquired at least
	 * once.
	 */
	prnt("Sleep locks:\n");
	witness_ddb_display_list(prnt, &w_sleep);

	/*
	 * Now do spin locks which have been acquired at least once.
	 */
	prnt("\nSpin locks:\n");
	witness_ddb_display_list(prnt, &w_spin);

	/*
	 * Finally, any locks which have not been acquired yet.
	 */
	prnt("\nLocks which were never acquired:\n");
	SIMPLEQ_FOREACH(w, &w_all, w_list) {
		if (w->w_acquired)
			continue;
		prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
		    w->w_class->lc_name, w->w_ddb_level);
	}
}
#endif /* DDB */

int
witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
{

	if (witness_watch < 0 || panicstr != NULL || db_active)
		return (0);

	/* Require locks that witness knows about. */
	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
	    lock2->lo_witness == NULL)
		return (EINVAL);

	MUTEX_ASSERT_UNLOCKED(&w_mtx);
	mtx_enter(&w_mtx);

	/*
	 * If we already have either an explicit or implied lock order that
	 * is the other way around, then return an error.
	 */
	if (witness_watch &&
	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
		mtx_leave(&w_mtx);
		return (EINVAL);
	}

	/* Try to add the new order. */
	itismychild(lock1->lo_witness, lock2->lo_witness);
	mtx_leave(&w_mtx);
	return (0);
}

void
witness_checkorder(struct lock_object *lock, int flags,
    struct lock_object *interlock)
{
	struct lock_list_entry *lock_list, *lle;
	struct lock_instance *lock1, *lock2, *plock;
	struct lock_class *class, *iclass;
	struct proc *p;
	struct witness *w, *w1;
	int i, j, s;

	if (witness_cold || witness_watch < 1 || panicstr != NULL ||
	    db_active || (lock->lo_flags & LO_WITNESS) == 0)
		return;

	w = lock->lo_witness;
	class = LOCK_CLASS(lock);

	if (w == NULL)
		w = lock->lo_witness =
		    enroll(lock->lo_type, lock->lo_name, class);

	p = curproc;

	if (class->lc_flags & LC_SLEEPLOCK) {
		/*
		 * Since spin locks include a critical section, this check
		 * implicitly enforces a lock order of all sleep locks before
		 * all spin locks.
		 */
		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
		if (lock_list != NULL && lock_list->ll_count > 0) {
			panic("acquiring blockable sleep lock with "
			    "spinlock or critical section held (%s) %s",
			    class->lc_name, lock->lo_name);
		}

		/*
		 * If this is the first lock acquired then just return as
		 * no order checking is needed.
		 */
		lock_list = p->p_sleeplocks;
		if (lock_list == NULL || lock_list->ll_count == 0)
			return;
	} else {

		/*
		 * If this is the first lock, just return as no order
		 * checking is needed.
		 */
		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
		if (lock_list == NULL || lock_list->ll_count == 0)
			return;
	}

	s = splhigh();

	/*
	 * Check to see if we are recursing on a lock we already own.  If
	 * so, make sure that we don't mismatch exclusive and shared lock
	 * acquires.
	 */
	lock1 = find_instance(lock_list, lock);
	if (lock1 != NULL) {
		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
		    (flags & LOP_EXCLUSIVE) == 0) {
			printf("witness: shared lock of (%s) %s "
			    "while exclusively locked\n",
			    class->lc_name, lock->lo_name);
			panic("excl->share");
		}
		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
		    (flags & LOP_EXCLUSIVE) != 0) {
			printf("witness: exclusive lock of (%s) %s "
			    "while share locked\n",
			    class->lc_name, lock->lo_name);
			panic("share->excl");
		}
		goto out_splx;
	}

	/* Warn if the interlock is not locked exactly once. */
	if (interlock != NULL) {
		iclass = LOCK_CLASS(interlock);
		lock1 = find_instance(lock_list, interlock);
		if (lock1 == NULL)
			panic("interlock (%s) %s not locked",
			    iclass->lc_name, interlock->lo_name);
		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
			panic("interlock (%s) %s recursed",
			    iclass->lc_name, interlock->lo_name);
	}

	/*
	 * Find the previously acquired lock, but ignore interlocks.
	 */
	plock = &lock_list->ll_children[lock_list->ll_count - 1];
	if (interlock != NULL && plock->li_lock == interlock) {
		if (lock_list->ll_count > 1)
			plock =
			    &lock_list->ll_children[lock_list->ll_count - 2];
		else {
			lle = lock_list->ll_next;

			/*
			 * The interlock is the only lock we hold, so
			 * simply return.
			 */
			if (lle == NULL)
				goto out_splx;
			plock = &lle->ll_children[lle->ll_count - 1];
		}
	}

	/*
	 * Try to perform most checks without a lock.  If this succeeds we
	 * can skip acquiring the lock and return success.  Otherwise we redo
	 * the check with the lock held to handle races with concurrent updates.
	 */
	w1 = plock->li_lock->lo_witness;
	if (witness_lock_order_check(w1, w))
		goto out_splx;

	mtx_enter(&w_mtx);
	if (witness_lock_order_check(w1, w))
		goto out;

	witness_lock_order_add(w1, w);

	/*
	 * Check for duplicate locks of the same type.  Note that we only
	 * have to check for this on the last lock we just acquired.  Any
	 * other cases will be caught as lock order violations.
	 */
	if (w1 == w) {
		i = w->w_index;
		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
			w_rmatrix[i][i] |= WITNESS_REVERSAL;
			w->w_reversed = 1;
			mtx_leave(&w_mtx);
			printf("witness: acquiring duplicate lock of "
			    "same type: \"%s\"\n", w->w_type->lt_name);
			printf(" 1st %s\n", plock->li_lock->lo_name);
			printf(" 2nd %s\n", lock->lo_name);
			witness_debugger(1);
		} else
			mtx_leave(&w_mtx);
		goto out_splx;
	}
	MUTEX_ASSERT_LOCKED(&w_mtx);

	/*
	 * If we know that the lock we are acquiring comes after
	 * the lock we most recently acquired in the lock order tree,
	 * then there is no need for any further checks.
	 */
	if (isitmychild(w1, w))
		goto out;

	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
		for (i = lle->ll_count - 1; i >= 0; i--, j++) {

			KASSERT(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
			lock1 = &lle->ll_children[i];

			/*
			 * Ignore the interlock.
			 */
			if (interlock == lock1->li_lock)
				continue;

			/*
			 * If this lock doesn't undergo witness checking,
			 * then skip it.
			 */
			w1 = lock1->li_lock->lo_witness;
			if (w1 == NULL) {
				KASSERTMSG((lock1->li_lock->lo_flags &
				    LO_WITNESS) == 0,
				    "lock missing witness structure");
				continue;
			}

			/*
			 * If we are locking Giant and this is a sleepable
			 * lock, then skip it.
			 */
			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
			    is_kernel_lock(lock))
				continue;

			/*
			 * If we are locking a sleepable lock and this lock
			 * is Giant, then skip it.
			 */
			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
			    is_kernel_lock(lock1->li_lock))
				continue;

			/*
			 * If we are locking a sleepable lock and this lock
			 * isn't sleepable, we want to treat it as a lock
			 * order violation to enforce a general lock order of
			 * sleepable locks before non-sleepable locks.
			 */
			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
				goto reversal;

			/*
			 * If we are locking Giant and this is a non-sleepable
			 * lock, then treat it as a reversal.
			 */
			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
			    is_kernel_lock(lock))
				goto reversal;

			/*
			 * Check the lock order hierarchy for a reversal.
			 */
			if (!isitmydescendant(w, w1))
				continue;
		reversal:

			/*
			 * We have a lock order violation, check to see if it
			 * is allowed or has already been yelled about.
			 */

			/* Bail if this violation is known */
			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
				goto out;

			/* Record this as a violation */
			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
			w->w_reversed = w1->w_reversed = 1;
			witness_increment_graph_generation();
			mtx_leave(&w_mtx);

			/*
			 * There are known LORs between VNODE locks. They are
			 * not an indication of a bug. VNODE locks are flagged
			 * as such (LO_IS_VNODE) and we don't yell if the LOR
			 * is between 2 VNODE locks.
			 */
			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
				goto out_splx;

			/*
			 * Ok, yell about it.
			 */
			printf("witness: ");
			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
				printf("lock order reversal: "
				    "(sleepable after non-sleepable)\n");
			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
			    && is_kernel_lock(lock))
				printf("lock order reversal: "
				    "(Giant after non-sleepable)\n");
			else
				printf("lock order reversal:\n");

			/*
			 * Try to locate an earlier lock with
			 * witness w in our list.
			 */
			do {
				lock2 = &lle->ll_children[i];
				KASSERT(lock2->li_lock != NULL);
				if (lock2->li_lock->lo_witness == w)
					break;
				if (i == 0 && lle->ll_next != NULL) {
					lle = lle->ll_next;
					i = lle->ll_count - 1;
					KASSERT(i >= 0 && i < LOCK_NCHILDREN);
				} else
					i--;
			} while (i >= 0);
			if (i < 0) {
				printf(" 1st %p %s (%s)\n",
				    lock1->li_lock, lock1->li_lock->lo_name,
				    w1->w_type->lt_name);
				printf(" 2nd %p %s (%s)\n",
				    lock, lock->lo_name, w->w_type->lt_name);
			} else {
				printf(" 1st %p %s (%s)\n",
				    lock2->li_lock, lock2->li_lock->lo_name,
				    lock2->li_lock->lo_witness->w_type->
				      lt_name);
				printf(" 2nd %p %s (%s)\n",
				    lock1->li_lock, lock1->li_lock->lo_name,
				    w1->w_type->lt_name);
				printf(" 3rd %p %s (%s)\n", lock,
				    lock->lo_name, w->w_type->lt_name);
			}
			if (witness_watch > 1) {
				struct witness_lock_order_data *wlod1, *wlod2;

				mtx_enter(&w_mtx);
				wlod1 = witness_lock_order_get(w, w1);
				wlod2 = witness_lock_order_get(w1, w);
				mtx_leave(&w_mtx);

				/*
				 * It is safe to access saved stack traces,
				 * w_type, and w_class without the lock.
				 * Once written, they never change.
				 */

				if (wlod1 != NULL) {
					printf("lock order \"%s\"(%s) -> "
					    "\"%s\"(%s) first seen at:\n",
					    w->w_type->lt_name,
					    w->w_class->lc_name,
					    w1->w_type->lt_name,
					    w1->w_class->lc_name);
					stacktrace_print(
					    &wlod1->wlod_stack, printf);
				}
				if (wlod2 != NULL) {
					printf("lock order \"%s\"(%s) -> "
					    "\"%s\"(%s) first seen at:\n",
					    w1->w_type->lt_name,
					    w1->w_class->lc_name,
					    w->w_type->lt_name,
					    w->w_class->lc_name);
					stacktrace_print(
					    &wlod2->wlod_stack, printf);
				}
			}
			witness_debugger(0);
			goto out_splx;
		}
	}

	/*
	 * If requested, build a new lock order.  However, don't build a new
	 * relationship between a sleepable lock and Giant if it is in the
	 * wrong direction.  The correct lock order is that sleepable locks
	 * always come before Giant.
	 */
	if (flags & LOP_NEWORDER &&
	    !(is_kernel_lock(plock->li_lock) &&
	    (lock->lo_flags & LO_SLEEPABLE) != 0))
		itismychild(plock->li_lock->lo_witness, w);
out:
	mtx_leave(&w_mtx);
out_splx:
	splx(s);
}

void
witness_lock(struct lock_object *lock, int flags)
{
	struct lock_list_entry **lock_list, *lle;
	struct lock_instance *instance;
	struct proc *p;
	struct witness *w;
	int s;

	if (witness_cold || witness_watch < 0 || panicstr != NULL ||
	    db_active || (lock->lo_flags & LO_WITNESS) == 0)
		return;

	w = lock->lo_witness;
	if (w == NULL)
		w = lock->lo_witness =
		    enroll(lock->lo_type, lock->lo_name, LOCK_CLASS(lock));

	p = curproc;

	/* Determine lock list for this lock. */
	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
		lock_list = &p->p_sleeplocks;
	else
		lock_list = &witness_cpu[cpu_number()].wc_spinlocks;

	s = splhigh();

	/* Check to see if we are recursing on a lock we already own. */
	instance = find_instance(*lock_list, lock);
	if (instance != NULL) {
		instance->li_flags++;
		goto out;
	}

	w->w_acquired = 1;

	/* Find the next open lock instance in the list and fill it. */
	lle = *lock_list;
	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
		lle = witness_lock_list_get();
		if (lle == NULL)
			goto out;
		lle->ll_next = *lock_list;
		*lock_list = lle;
	}
	instance = &lle->ll_children[lle->ll_count++];
	instance->li_lock = lock;
	if ((flags & LOP_EXCLUSIVE) != 0)
		instance->li_flags = LI_EXCLUSIVE;
	else
		instance->li_flags = 0;
	instance->li_stack = NULL;
	if (witness_locktrace) {
		instance->li_stack = witness_lock_stack_get();
		if (instance->li_stack != NULL)
			stacktrace_save(&instance->li_stack->ls_stack);
	}
out:
	splx(s);
}

void
witness_upgrade(struct lock_object *lock, int flags)
{
	struct lock_instance *instance;
	struct lock_class *class;
	int s;

	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
	if (lock->lo_witness == NULL || witness_watch < 0 ||
	    panicstr != NULL || db_active)
		return;
	class = LOCK_CLASS(lock);
	if (witness_watch) {
		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
			panic("upgrade of non-upgradable lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
			panic("upgrade of non-sleep lock (%s) %s",
			    class->lc_name, lock->lo_name);
	}
	s = splhigh();
	instance = find_instance(curproc->p_sleeplocks, lock);
	if (instance == NULL) {
		panic("upgrade of unlocked lock (%s) %s",
		    class->lc_name, lock->lo_name);
		goto out;
	}
	if (witness_watch) {
		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
			panic("upgrade of exclusive lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((instance->li_flags & LI_RECURSEMASK) != 0)
			panic("upgrade of recursed lock (%s) %s r=%d",
			    class->lc_name, lock->lo_name,
			    instance->li_flags & LI_RECURSEMASK);
	}
	instance->li_flags |= LI_EXCLUSIVE;
out:
	splx(s);
}

void
witness_downgrade(struct lock_object *lock, int flags)
{
	struct lock_instance *instance;
	struct lock_class *class;
	int s;

	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
	if (lock->lo_witness == NULL || witness_watch < 0 ||
	    panicstr != NULL || db_active)
		return;
	class = LOCK_CLASS(lock);
	if (witness_watch) {
		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
			panic(
			    "downgrade of non-upgradable lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
			panic("downgrade of non-sleep lock (%s) %s",
			    class->lc_name, lock->lo_name);
	}
	s = splhigh();
	instance = find_instance(curproc->p_sleeplocks, lock);
	if (instance == NULL) {
		panic("downgrade of unlocked lock (%s) %s",
		    class->lc_name, lock->lo_name);
		goto out;
	}
	if (witness_watch) {
		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
			panic("downgrade of shared lock (%s) %s",
			    class->lc_name, lock->lo_name);
		if ((instance->li_flags & LI_RECURSEMASK) != 0)
			panic("downgrade of recursed lock (%s) %s r=%d",
			    class->lc_name, lock->lo_name,
			    instance->li_flags & LI_RECURSEMASK);
	}
	instance->li_flags &= ~LI_EXCLUSIVE;
out:
	splx(s);
}

void
witness_unlock(struct lock_object *lock, int flags)
{
	struct lock_list_entry **lock_list, *lle;
	struct lock_instance *instance;
	struct lock_class *class;
	struct proc *p;
	int i, j;
	int s;

	if (witness_cold || lock->lo_witness == NULL ||
	    panicstr != NULL || db_active)
		return;
	p = curproc;
	class = LOCK_CLASS(lock);

	/* Find lock instance associated with this lock. */
	if (class->lc_flags & LC_SLEEPLOCK)
		lock_list = &p->p_sleeplocks;
	else
		lock_list = &witness_cpu[cpu_number()].wc_spinlocks;

	s = splhigh();

	lle = *lock_list;
	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
		for (i = 0; i < (*lock_list)->ll_count; i++) {
			instance = &(*lock_list)->ll_children[i];
			if (instance->li_lock == lock)
				goto found;
		}

	/*
	 * When WITNESS is disabled via witness_watch, locks registered
	 * earlier may still be sitting in the p_sleeplocks queue.
	 * These queues have to be flushed, so search for any remaining
	 * registered locks and remove them.
	 */
	if (witness_watch > 0) {
		panic("lock (%s) %s not locked", class->lc_name, lock->lo_name);
	}
	goto out;

found:

	/* First, check for shared/exclusive mismatches. */
	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
	    (flags & LOP_EXCLUSIVE) == 0) {
		printf("witness: shared unlock of (%s) %s "
		    "while exclusively locked\n",
		    class->lc_name, lock->lo_name);
		panic("excl->ushare");
	}
	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
	    (flags & LOP_EXCLUSIVE) != 0) {
		printf("witness: exclusive unlock of (%s) %s "
		    "while share locked\n", class->lc_name, lock->lo_name);
		panic("share->uexcl");
	}
	/* If we are recursed, unrecurse. */
	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
		instance->li_flags--;
		goto out;
	}
	/* The lock is now being dropped, check for NORELEASE flag */
	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
		printf("witness: forbidden unlock of (%s) %s\n",
		    class->lc_name, lock->lo_name);
		panic("lock marked norelease");
	}

	/* Release the stack buffer, if any. */
	if (instance->li_stack != NULL) {
		witness_lock_stack_free(instance->li_stack);
		instance->li_stack = NULL;
	}

	/* Remove this item from the list. */
	for (j = i; j < (*lock_list)->ll_count - 1; j++)
		(*lock_list)->ll_children[j] =
		    (*lock_list)->ll_children[j + 1];
	(*lock_list)->ll_count--;

	/*
	 * To reduce contention on w_mtx, we always want to keep a head
	 * object in the list so that frequent allocation from the free
	 * witness pool (and the locking that goes with it) is avoided.
	 * To keep the code simple, an empty head object also implies
	 * that no further objects follow in the list, so list ownership
	 * must be handed over to another object if the current head
	 * needs to be freed.
	 */
	if ((*lock_list)->ll_count == 0) {
		if (*lock_list == lle) {
			if (lle->ll_next == NULL)
				goto out;
		} else
			lle = *lock_list;
		*lock_list = lle->ll_next;
		witness_lock_list_free(lle);
	}
out:
	splx(s);
}

void
witness_thread_exit(struct proc *p)
{
	struct lock_list_entry *lle;
	int i, n, s;

	lle = p->p_sleeplocks;
	if (lle == NULL || panicstr != NULL || db_active)
		return;
	if (lle->ll_count != 0) {
		for (n = 0; lle != NULL; lle = lle->ll_next)
			for (i = lle->ll_count - 1; i >= 0; i--) {
				if (n == 0)
					printf("witness: thread %p exiting "
					    "with the following locks held:\n",
					    p);
				n++;
				witness_list_lock(&lle->ll_children[i],
				    printf);
			}
		panic("thread %p cannot exit while holding sleeplocks", p);
	}
	KASSERT(lle->ll_next == NULL);
	s = splhigh();
	witness_lock_list_free(lle);
	splx(s);
}

/*
 * Warn if any locks other than 'lock' are held.  Flags can be passed in to
 * exempt Giant and sleepable locks from the checks as well.  If any
 * non-exempt locks are held, then a supplied message is printed to the
 * output channel along with a list of the offending locks.  If indicated in the
 * flags then a failure results in a panic as well.
 */
int
witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
{
	struct lock_list_entry *lock_list, *lle;
	struct lock_instance *lock1;
	struct proc *p;
	va_list ap;
	int i, n;

	if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
		return (0);
	n = 0;
	p = curproc;
	for (lle = p->p_sleeplocks; lle != NULL; lle = lle->ll_next)
		for (i = lle->ll_count - 1; i >= 0; i--) {
			lock1 = &lle->ll_children[i];
			if (lock1->li_lock == lock)
				continue;
			if (flags & WARN_KERNELOK &&
			    is_kernel_lock(lock1->li_lock))
				continue;
			if (flags & WARN_SLEEPOK &&
			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
				continue;
			if (n == 0) {
				printf("witness: ");
				va_start(ap, fmt);
				vprintf(fmt, ap);
				va_end(ap);
				printf(" with the following %slocks held:\n",
				    (flags & WARN_SLEEPOK) != 0 ?
				    "non-sleepable " : "");
			}
			n++;
			witness_list_lock(lock1, printf);
		}

	lock_list = witness_cpu[cpu_number()].wc_spinlocks;
	if (lock_list != NULL && lock_list->ll_count != 0) {
		/*
		 * At most one spinlock should be held here, and since
		 * the exemption flags above cannot apply to this lock
		 * class, simply check whether the sole spinlock held
		 * is the one curproc is expected to hold.
		 */
		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
		    lock1->li_lock == lock && n == 0)
			return (0);

		printf("witness: ");
		va_start(ap, fmt);
		vprintf(fmt, ap);
		va_end(ap);
		printf(" with the following %slocks held:\n",
		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
		n += witness_list_locks(&lock_list, printf);
	}
	if (n > 0) {
		if (flags & WARN_PANIC)
			panic("%s", __func__);
		else
			witness_debugger(1);
	}
	return (n);
}
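
/*
 * A hypothetical call of the function above: a caller about to sleep
 * might check that only sleepable locks and the kernel lock are held,
 * panicking otherwise (the message string is illustrative):
 *
 *	witness_warn(WARN_KERNELOK | WARN_SLEEPOK | WARN_PANIC, NULL,
 *	    "sleeping");
 */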

static struct witness *
enroll(const struct lock_type *type, const char *subtype,
    struct lock_class *lock_class)
{
	struct witness *w;
	struct witness_list *typelist;

	KASSERT(type != NULL);

	if (witness_watch < 0 || panicstr != NULL || db_active)
		return (NULL);
	if ((lock_class->lc_flags & LC_SPINLOCK)) {
		typelist = &w_spin;
	} else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
		typelist = &w_sleep;
	} else {
		panic("lock class %s is not sleep or spin",
		    lock_class->lc_name);
		return (NULL);
	}

	mtx_enter(&w_mtx);
	w = witness_hash_get(type, subtype);
	if (w)
		goto found;
	if ((w = witness_get()) == NULL)
		return (NULL);
	w->w_type = type;
	w->w_subtype = subtype;
	w->w_class = lock_class;
	SIMPLEQ_INSERT_HEAD(&w_all, w, w_list);
	if (lock_class->lc_flags & LC_SPINLOCK) {
		SIMPLEQ_INSERT_HEAD(&w_spin, w, w_typelist);
		w_spin_cnt++;
	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
		SIMPLEQ_INSERT_HEAD(&w_sleep, w, w_typelist);
		w_sleep_cnt++;
	}

	/* Insert new witness into the hash */
	witness_hash_put(w);
	witness_increment_graph_generation();
	mtx_leave(&w_mtx);
	return (w);
found:
	mtx_leave(&w_mtx);
	if (lock_class != w->w_class)
		panic("lock (%s) %s does not match earlier (%s) lock",
		    type->lt_name, lock_class->lc_name, w->w_class->lc_name);
	return (w);
}

static void
adopt(struct witness *parent, struct witness *child)
{
	int pi, ci, i, j;

	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);

	/* If the relationship is already known, there's no work to be done. */
	if (isitmychild(parent, child))
		return;

	/* When the structure of the graph changes, bump up the generation. */
	witness_increment_graph_generation();

	/*
	 * The hard part ... create the direct relationship, then propagate all
	 * indirect relationships.
	 */
	pi = parent->w_index;
	ci = child->w_index;
	WITNESS_INDEX_ASSERT(pi);
	WITNESS_INDEX_ASSERT(ci);
	KASSERT(pi != ci);
	w_rmatrix[pi][ci] |= WITNESS_PARENT;
	w_rmatrix[ci][pi] |= WITNESS_CHILD;

	/*
	 * If parent was not already an ancestor of child,
	 * then we increment the descendant and ancestor counters.
	 */
	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
		parent->w_num_descendants++;
		child->w_num_ancestors++;
	}

	/*
	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
	 * an ancestor of 'pi' during this loop.
	 */
	for (i = 1; i <= w_max_used_index; i++) {
		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
		    (i != pi))
			continue;

		/* Find each descendant of 'i' and mark it as a descendant. */
		for (j = 1; j <= w_max_used_index; j++) {

			/*
			 * Skip children that are already marked as
			 * descendants of 'i'.
			 */
			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
				continue;

			/*
			 * We are only interested in descendants of 'ci'. Note
			 * that 'ci' itself is counted as a descendant of 'ci'.
			 */
			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
			    (j != ci))
				continue;
			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
			w_data[i].w_num_descendants++;
			w_data[j].w_num_ancestors++;

			/*
			 * Make sure we aren't marking a node as both an
			 * ancestor and descendant. We should have caught
			 * this as a lock order reversal earlier.
			 */
			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
				printf("witness: rmatrix paradox! [%d][%d]=%d "
				    "both ancestor and descendant\n",
				    i, j, w_rmatrix[i][j]);
#ifdef DDB
				db_stack_dump();
#endif
				printf("witness disabled\n");
				witness_watch = -1;
			}
			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
				printf("witness: rmatrix paradox! [%d][%d]=%d "
				    "both ancestor and descendant\n",
				    j, i, w_rmatrix[j][i]);
#ifdef DDB
				db_stack_dump();
#endif
				printf("witness disabled\n");
				witness_watch = -1;
			}
		}
	}
}
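
/*
 * Example of the propagation done above: with an existing order A -> B,
 * calling adopt(B, C) first records B as a direct PARENT of C, then the
 * nested loops visit every ancestor i of B (here A) and every descendant
 * j of C (here just C) and set WITNESS_ANCESTOR / WITNESS_DESCENDANT
 * between them, so the indirect order A -> C is recorded without ever
 * walking the graph again.
 */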

static void
itismychild(struct witness *parent, struct witness *child)
{
	int unlocked;

	KASSERT(child != NULL && parent != NULL);
	if (witness_cold == 0)
		MUTEX_ASSERT_LOCKED(&w_mtx);

	if (!witness_lock_type_equal(parent, child)) {
		if (witness_cold == 0) {
			unlocked = 1;
			mtx_leave(&w_mtx);
		} else {
			unlocked = 0;
		}
		panic(
		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
		    "the same lock type", __func__, parent->w_type->lt_name,
		    parent->w_class->lc_name, child->w_type->lt_name,
		    child->w_class->lc_name);
		if (unlocked)
			mtx_enter(&w_mtx);
	}
	adopt(parent, child);
}

/*
 * Generic code for the isitmy*() functions. The rmask parameter is the
 * expected relationship of w1 to w2.
 */
static int
_isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
{
	unsigned char r1, r2;
	int i1, i2;

	i1 = w1->w_index;
	i2 = w2->w_index;
	WITNESS_INDEX_ASSERT(i1);
	WITNESS_INDEX_ASSERT(i2);
	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;

	/* The flags on one better be the inverse of the flags on the other */
	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
		/* Don't squawk if we're potentially racing with an update. */
		if (w_mtx.mtx_owner != curcpu())
			return (0);
		printf("witness: %s: rmatrix mismatch between %s (index %d) "
		    "and %s (index %d): w_rmatrix[%d][%d] == %x but "
		    "w_rmatrix[%d][%d] == %x\n",
		    fname, w1->w_type->lt_name, i1, w2->w_type->lt_name,
		    i2, i1, i2, r1,
		    i2, i1, r2);
#ifdef DDB
		db_stack_dump();
#endif
		printf("witness disabled\n");
		witness_watch = -1;
	}
	return (r1 & rmask);
}

/*
 * Checks if @child is a direct child of @parent.
 */
static int
isitmychild(struct witness *parent, struct witness *child)
{

	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
}

/*
 * Checks if @descendant is a direct or indirect descendant of @ancestor.
1687  */
1688 static int
1689 isitmydescendant(struct witness *ancestor, struct witness *descendant)
1690 {
1691 
1692 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
1693 	    __func__));
1694 }
1695 
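/*
 * Allocate a witness from the statically preallocated pool.  The pool
 * is filled during initialization; once it runs dry, witness disables
 * itself rather than allocating more memory at run time.
 */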
1696 static struct witness *
1697 witness_get(void)
1698 {
1699 	struct witness *w;
1700 	int index;
1701 
1702 	if (witness_cold == 0)
1703 		MUTEX_ASSERT_LOCKED(&w_mtx);
1704 
1705 	if (witness_watch < 0) {
1706 		mtx_leave(&w_mtx);
1707 		return (NULL);
1708 	}
1709 	if (SIMPLEQ_EMPTY(&w_free)) {
1710 		witness_watch = -1;
1711 		mtx_leave(&w_mtx);
1712 		printf("WITNESS: unable to allocate a new witness object\n");
1713 		return (NULL);
1714 	}
1715 	w = SIMPLEQ_FIRST(&w_free);
1716 	SIMPLEQ_REMOVE_HEAD(&w_free, w_list);
1717 	w_free_cnt--;
1718 	index = w->w_index;
1719 	KASSERT(index > 0 && index == w_max_used_index + 1 &&
1720 	    index < witness_count);
1721 	memset(w, 0, sizeof(*w));
1722 	w->w_index = index;
1723 	if (index > w_max_used_index)
1724 		w_max_used_index = index;
1725 	return (w);
1726 }
1727 
1728 static void
1729 witness_free(struct witness *w)
1730 {
1731 
1732 	SIMPLEQ_INSERT_HEAD(&w_free, w, w_list);
1733 	w_free_cnt++;
1734 }
1735 
1736 static struct lock_list_entry *
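/*
 * Allocate a lock list entry, preferring the current CPU's local cache
 * over the global, w_mtx-protected free list so that the common case
 * takes no lock at all.  If the global list is exhausted, witness shuts
 * itself down and NULL is returned.
 */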
1737 witness_lock_list_get(void)
1738 {
1739 	struct lock_list_entry *lle;
1740 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1741 
1742 	if (witness_watch < 0)
1743 		return (NULL);
1744 
1745 	splassert(IPL_HIGH);
1746 
1747 	if (wcpu->wc_lle_count > 0) {
1748 		lle = wcpu->wc_lle_cache;
1749 		wcpu->wc_lle_cache = lle->ll_next;
1750 		wcpu->wc_lle_count--;
1751 		memset(lle, 0, sizeof(*lle));
1752 		return (lle);
1753 	}
1754 
1755 	mtx_enter(&w_mtx);
1756 	lle = w_lock_list_free;
1757 	if (lle == NULL) {
1758 		witness_watch = -1;
1759 		mtx_leave(&w_mtx);
1760 		printf("%s: witness exhausted\n", __func__);
1761 		return (NULL);
1762 	}
1763 	w_lock_list_free = lle->ll_next;
1764 	mtx_leave(&w_mtx);
1765 	memset(lle, 0, sizeof(*lle));
1766 	return (lle);
1767 }
1768 
1769 static void
1770 witness_lock_list_free(struct lock_list_entry *lle)
1771 {
1772 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1773 
1774 	splassert(IPL_HIGH);
1775 
1776 	if (wcpu->wc_lle_count < WITNESS_LLE_CACHE_MAX) {
1777 		lle->ll_next = wcpu->wc_lle_cache;
1778 		wcpu->wc_lle_cache = lle;
1779 		wcpu->wc_lle_count++;
1780 		return;
1781 	}
1782 
1783 	mtx_enter(&w_mtx);
1784 	lle->ll_next = w_lock_list_free;
1785 	w_lock_list_free = lle;
1786 	mtx_leave(&w_mtx);
1787 }
1788 
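/*
 * The lock stack allocator mirrors the lock list allocator above:
 * a small per-CPU cache in front of a global, w_mtx-protected free
 * list.  Unlike witness_lock_list_get(), running out of stacks is not
 * fatal; callers must cope with a NULL return.
 */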
1789 static union lock_stack *
1790 witness_lock_stack_get(void)
1791 {
1792 	union lock_stack *stack = NULL;
1793 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1794 
1795 	splassert(IPL_HIGH);
1796 
1797 	if (wcpu->wc_stk_count > 0) {
1798 		stack = wcpu->wc_stk_cache;
1799 		wcpu->wc_stk_cache = stack->ls_next;
1800 		wcpu->wc_stk_count--;
1801 		return (stack);
1802 	}
1803 
1804 	mtx_enter(&w_mtx);
1805 	if (w_lock_stack_free != NULL) {
1806 		stack = w_lock_stack_free;
1807 		w_lock_stack_free = stack->ls_next;
1808 	}
1809 	mtx_leave(&w_mtx);
1810 	return (stack);
1811 }
1812 
1813 static void
1814 witness_lock_stack_free(union lock_stack *stack)
1815 {
1816 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1817 
1818 	splassert(IPL_HIGH);
1819 
1820 	if (wcpu->wc_stk_count < WITNESS_STK_CACHE_MAX) {
1821 		stack->ls_next = wcpu->wc_stk_cache;
1822 		wcpu->wc_stk_cache = stack;
1823 		wcpu->wc_stk_count++;
1824 		return;
1825 	}
1826 
1827 	mtx_enter(&w_mtx);
1828 	stack->ls_next = w_lock_stack_free;
1829 	w_lock_stack_free = stack;
1830 	mtx_leave(&w_mtx);
1831 }
1832 
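/*
 * Find the instance of the given lock in a held-lock list, scanning
 * from the most recently acquired entry backwards.
 */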
1833 static struct lock_instance *
1834 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
1835 {
1836 	struct lock_list_entry *lle;
1837 	struct lock_instance *instance;
1838 	int i;
1839 
1840 	for (lle = list; lle != NULL; lle = lle->ll_next) {
1841 		for (i = lle->ll_count - 1; i >= 0; i--) {
1842 			instance = &lle->ll_children[i];
1843 			if (instance->li_lock == lock)
1844 				return (instance);
1845 		}
1846 	}
1847 	return (NULL);
1848 }
1849 
1850 static void
1851 witness_list_lock(struct lock_instance *instance,
1852     int (*prnt)(const char *fmt, ...))
1853 {
1854 	struct lock_object *lock;
1855 
1856 	lock = instance->li_lock;
1857 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
1858 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
1859 	prnt(" r = %d (%p)\n", instance->li_flags & LI_RECURSEMASK, lock);
1860 	if (instance->li_stack != NULL)
1861 		stacktrace_print(&instance->li_stack->ls_stack, prnt);
1862 }
1863 
1864 #ifdef DDB
1865 static int
1866 witness_thread_has_locks(struct proc *p)
1867 {
1868 
1869 	if (p->p_sleeplocks == NULL)
1870 		return (0);
1871 	return (p->p_sleeplocks->ll_count != 0);
1872 }
1873 
1874 static int
1875 witness_process_has_locks(struct process *pr)
1876 {
1877 	struct proc *p;
1878 
1879 	TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
1880 		if (witness_thread_has_locks(p))
1881 			return (1);
1882 	}
1883 	return (0);
1884 }
1885 #endif
1886 
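/*
 * Print every lock instance on the given list using the supplied
 * printf-like function and return the number of locks printed.
 */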
1887 int
1888 witness_list_locks(struct lock_list_entry **lock_list,
1889     int (*prnt)(const char *fmt, ...))
1890 {
1891 	struct lock_list_entry *lle;
1892 	int i, nheld;
1893 
1894 	nheld = 0;
1895 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
1896 		for (i = lle->ll_count - 1; i >= 0; i--) {
1897 			witness_list_lock(&lle->ll_children[i], prnt);
1898 			nheld++;
1899 		}
1900 	return (nheld);
1901 }
1902 
1903 /*
1904  * This is a bit risky at best.  We call this function when we have timed
1905  * out acquiring a spin lock, and we assume that the other CPU is stuck
1906  * with this lock held.  So, we go groveling around in the other CPU's
1907  * per-cpu data to try to find the lock instance for this spin lock to
1908  * see when it was last acquired.
1909  */
1910 void
1911 witness_display_spinlock(struct lock_object *lock, struct proc *owner,
1912     int (*prnt)(const char *fmt, ...))
1913 {
1914 	struct lock_instance *instance;
1915 
1916 	if (owner->p_stat != SONPROC)
1917 		return;
1918 	instance = find_instance(
1919 	    witness_cpu[owner->p_cpu->ci_cpuid].wc_spinlocks, lock);
1920 	if (instance != NULL)
1921 		witness_list_lock(instance, prnt);
1922 }
1923 
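/*
 * Assert that the given lock is in the state the caller expects,
 * panicking on mismatch.  The flags argument is built from the LA_*
 * constants: LA_UNLOCKED, LA_LOCKED, LA_SLOCKED or LA_XLOCKED,
 * optionally combined with LA_RECURSED or LA_NOTRECURSED.  A caller
 * that must hold a lock exclusively and unrecursed would use, given
 * the lock's struct lock_object pointer lo:
 *
 *	witness_assert(lo, LA_XLOCKED | LA_NOTRECURSED);
 */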
1924 void
1925 witness_assert(const struct lock_object *lock, int flags)
1926 {
1927 	struct lock_instance *instance;
1928 	struct lock_class *class;
1929 
1930 	if (lock->lo_witness == NULL || witness_watch < 1 ||
1931 	    panicstr != NULL || db_active)
1932 		return;
1933 	class = LOCK_CLASS(lock);
1934 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
1935 		instance = find_instance(curproc->p_sleeplocks, lock);
1936 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
1937 		instance = find_instance(
1938 		    witness_cpu[cpu_number()].wc_spinlocks, lock);
1939 	else {
1940 		panic("lock (%s) %s is not sleep or spin!",
1941 		    class->lc_name, lock->lo_name);
1942 		return;
1943 	}
1944 	switch (flags) {
1945 	case LA_UNLOCKED:
1946 		if (instance != NULL)
1947 			panic("lock (%s) %s locked",
1948 			    class->lc_name, lock->lo_name);
1949 		break;
1950 	case LA_LOCKED:
1951 	case LA_LOCKED | LA_RECURSED:
1952 	case LA_LOCKED | LA_NOTRECURSED:
1953 	case LA_SLOCKED:
1954 	case LA_SLOCKED | LA_RECURSED:
1955 	case LA_SLOCKED | LA_NOTRECURSED:
1956 	case LA_XLOCKED:
1957 	case LA_XLOCKED | LA_RECURSED:
1958 	case LA_XLOCKED | LA_NOTRECURSED:
1959 		if (instance == NULL) {
1960 			panic("lock (%s) %s not locked",
1961 			    class->lc_name, lock->lo_name);
1962 			break;
1963 		}
1964 		if ((flags & LA_XLOCKED) != 0 &&
1965 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
1966 			panic(
1967 			    "lock (%s) %s not exclusively locked",
1968 			    class->lc_name, lock->lo_name);
1969 		if ((flags & LA_SLOCKED) != 0 &&
1970 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
1971 			panic(
1972 			    "lock (%s) %s exclusively locked",
1973 			    class->lc_name, lock->lo_name);
1974 		if ((flags & LA_RECURSED) != 0 &&
1975 		    (instance->li_flags & LI_RECURSEMASK) == 0)
1976 			panic("lock (%s) %s not recursed",
1977 			    class->lc_name, lock->lo_name);
1978 		if ((flags & LA_NOTRECURSED) != 0 &&
1979 		    (instance->li_flags & LI_RECURSEMASK) != 0)
1980 			panic("lock (%s) %s recursed",
1981 			    class->lc_name, lock->lo_name);
1982 		break;
1983 	default:
1984 		panic("invalid lock assertion");
1986 	}
1987 }
1988 
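/*
 * Set or clear a flag, such as LI_NORELEASE, on the instance of a
 * held lock.  Used by witness_norelease() and witness_releaseok()
 * below.
 */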
1989 static void
1990 witness_setflag(struct lock_object *lock, int flag, int set)
1991 {
1992 	struct lock_list_entry *lock_list;
1993 	struct lock_instance *instance;
1994 	struct lock_class *class;
1995 
1996 	if (lock->lo_witness == NULL || witness_watch < 0 ||
1997 	    panicstr != NULL || db_active)
1998 		return;
1999 	class = LOCK_CLASS(lock);
2000 	if (class->lc_flags & LC_SLEEPLOCK)
2001 		lock_list = curproc->p_sleeplocks;
2002 	else
2003 		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
2004 	instance = find_instance(lock_list, lock);
2005 	if (instance == NULL) {
2006 		panic("%s: lock (%s) %s not locked", __func__,
2007 		    class->lc_name, lock->lo_name);
2008 		return;
2009 	}
2010 
2011 	if (set)
2012 		instance->li_flags |= flag;
2013 	else
2014 		instance->li_flags &= ~flag;
2015 }
2016 
2017 void
2018 witness_norelease(struct lock_object *lock)
2019 {
2020 
2021 	witness_setflag(lock, LI_NORELEASE, 1);
2022 }
2023 
2024 void
2025 witness_releaseok(struct lock_object *lock)
2026 {
2027 
2028 	witness_setflag(lock, LI_NORELEASE, 0);
2029 }
2030 
2031 #ifdef DDB
2032 static void
2033 witness_ddb_list(struct proc *p)
2034 {
2035 	struct witness_cpu *wc = &witness_cpu[cpu_number()];
2036 
2037 	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
2038 	KASSERTMSG(db_active, "%s: not in the debugger", __func__);
2039 
2040 	if (witness_watch < 1)
2041 		return;
2042 
2043 	witness_list_locks(&p->p_sleeplocks, db_printf);
2044 
2045 	/*
2046 	 * We only handle spinlocks if p == curproc.  This is somewhat broken
2047 	 * if p is currently executing on some other CPU and holds spin locks
2048 	 * as we won't display those locks.  If we had a MI way of getting
2049 	 * the per-cpu data for a given cpu then we could use
2050 	 * p->p_cpu to get the list of spinlocks for this thread
2051 	 * and "fix" this.
2052 	 *
2053 	 * That still wouldn't really fix this unless we locked the scheduler
2054 	 * lock or stopped the other CPU to make sure it wasn't changing the
2055 	 * list out from under us.  It is probably best to just not try to
2056 	 * handle threads on other CPUs for now.
2057 	 */
2058 	if (p == curproc && wc->wc_spinlocks != NULL)
2059 		witness_list_locks(&wc->wc_spinlocks, db_printf);
2060 }
2061 
2062 void
2063 db_witness_list(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2064 {
2065 	struct proc *p;
2066 
2067 	if (have_addr)
2068 		p = (struct proc *)addr;
2069 	else
2070 		p = curproc;
2071 	witness_ddb_list(p);
2072 }
2073 
2074 void
2075 db_witness_list_all(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2076 {
2077 	CPU_INFO_ITERATOR cii;
2078 	struct cpu_info *ci;
2079 	struct lock_list_entry *lock_list;
2080 	struct process *pr;
2081 	struct proc *p;
2082 
2083 	CPU_INFO_FOREACH(cii, ci) {
2084 		lock_list = witness_cpu[CPU_INFO_UNIT(ci)].wc_spinlocks;
2085 		if (lock_list == NULL || lock_list->ll_count == 0)
2086 			continue;
2087 		db_printf("CPU %d:\n", CPU_INFO_UNIT(ci));
2088 		witness_list_locks(&lock_list, db_printf);
2089 	}
2090 
2091 	/*
2092 	 * It would be nice to enumerate only the threads and processes that
2093 	 * actually hold sleep locks, but WITNESS does not export such an
2094 	 * index, so scan all of them and filter out the lockless ones.
2095 	 */
2096 	LIST_FOREACH(pr, &allprocess, ps_list) {
2097 		if (!witness_process_has_locks(pr))
2098 			continue;
2099 		TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
2100 			if (!witness_thread_has_locks(p))
2101 				continue;
2102 			db_printf("Process %d (%s) thread %p (%d)\n",
2103 			    pr->ps_pid, pr->ps_comm, p, p->p_tid);
2104 			witness_ddb_list(p);
2105 		}
2106 	}
2107 }
2108 
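/*
 * Print all known lock order reversals, together with the stack traces
 * of where each order was first established when those are available.
 * The traversal drops w_mtx while printing and restarts from scratch
 * whenever the graph generation changes underneath it.
 */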
2109 void
2110 witness_print_badstacks(void)
2111 {
2112 	static struct witness tmp_w1, tmp_w2;
2113 	static struct witness_lock_order_data tmp_data1, tmp_data2;
2114 	struct witness_lock_order_data *data1, *data2;
2115 	struct witness *w1, *w2;
2116 	int generation, i, j;
2117 
2118 	if (witness_watch < 1) {
2119 		db_printf("witness watch is disabled\n");
2120 		return;
2121 	}
2122 	if (witness_cold) {
2123 		db_printf("witness is cold\n");
2124 		return;
2125 	}
2127 
2128 	memset(&tmp_w1, 0, sizeof(tmp_w1));
2129 	memset(&tmp_w2, 0, sizeof(tmp_w2));
2130 	memset(&tmp_data1, 0, sizeof(tmp_data1));
2131 	memset(&tmp_data2, 0, sizeof(tmp_data2));
2132 
2133 restart:
2134 	mtx_enter(&w_mtx);
2135 	generation = w_generation;
2136 	mtx_leave(&w_mtx);
2137 	db_printf("Number of known direct relationships is %d\n",
2138 	    w_lohash.wloh_count);
2139 	for (i = 1; i < w_max_used_index; i++) {
2140 		mtx_enter(&w_mtx);
2141 		if (generation != w_generation) {
2142 			mtx_leave(&w_mtx);
2143 
2144 			/* The graph has changed, try again. */
2145 			db_printf("Lock graph changed, restarting trace.\n");
2146 			goto restart;
2147 		}
2148 
2149 		w1 = &w_data[i];
2150 		if (w1->w_reversed == 0) {
2151 			mtx_leave(&w_mtx);
2152 			continue;
2153 		}
2154 
2155 		/* Copy w1 locally so we can release the spin lock. */
2156 		tmp_w1 = *w1;
2157 		mtx_leave(&w_mtx);
2158 
2159 		if (tmp_w1.w_reversed == 0)
2160 			continue;
2161 		for (j = 1; j < w_max_used_index; j++) {
2162 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
2163 				continue;
2164 
2165 			mtx_enter(&w_mtx);
2166 			if (generation != w_generation) {
2167 				mtx_leave(&w_mtx);
2168 
2169 				/* The graph has changed, try again. */
2170 				db_printf("Lock graph changed, "
2171 				    "restarting trace.\n");
2172 				goto restart;
2173 			}
2174 
2175 			w2 = &w_data[j];
2176 			data1 = witness_lock_order_get(w1, w2);
2177 			data2 = witness_lock_order_get(w2, w1);
2178 
2179 			/*
2180 			 * Copy information locally so we can release the
2181 			 * spin lock.
2182 			 */
2183 			tmp_w2 = *w2;
2184 
2185 			if (data1)
2186 				tmp_data1.wlod_stack = data1->wlod_stack;
2187 			if (data2 && data2 != data1)
2188 				tmp_data2.wlod_stack = data2->wlod_stack;
2189 			mtx_leave(&w_mtx);
2190 
2191 			db_printf("\nLock order reversal between \"%s\"(%s) "
2192 			    "and \"%s\"(%s)!\n",
2193 			    tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
2194 			    tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
2195 			if (data1) {
2196 				db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
2197 				    "first seen at:\n",
2198 				    tmp_w1.w_type->lt_name,
2199 				    tmp_w1.w_class->lc_name,
2200 				    tmp_w2.w_type->lt_name,
2201 				    tmp_w2.w_class->lc_name);
2202 				stacktrace_print(&tmp_data1.wlod_stack,
2203 				    db_printf);
2204 				db_printf("\n");
2205 			}
2206 			if (data2 && data2 != data1) {
2207 				db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
2208 				    "first seen at:\n",
2209 				    tmp_w2.w_type->lt_name,
2210 				    tmp_w2.w_class->lc_name,
2211 				    tmp_w1.w_type->lt_name,
2212 				    tmp_w1.w_class->lc_name);
2213 				stacktrace_print(&tmp_data2.wlod_stack,
2214 				    db_printf);
2215 				db_printf("\n");
2216 			}
2217 		}
2218 	}
2219 	mtx_enter(&w_mtx);
2220 	if (generation != w_generation) {
2221 		mtx_leave(&w_mtx);
2222 
2223 		/*
2224 		 * The graph changed while we were printing stack data,
2225 		 * try again.
2226 		 */
2227 		db_printf("Lock graph changed, restarting trace.\n");
2228 		goto restart;
2229 	}
2230 	mtx_leave(&w_mtx);
2231 }
2232 
2233 void
2234 db_witness_display(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2235 {
2236 	switch (modif[0]) {
2237 	case 'b':
2238 		witness_print_badstacks();
2239 		break;
2240 	default:
2241 		witness_ddb_display(db_printf);
2242 		break;
2243 	}
2244 }
2245 #endif
2246 
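/*
 * Dump the lock order graph, one "parent","child" edge per line.
 * Only witnesses that have been acquired at least once are included.
 */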
2247 void
2248 db_witness_print_fullgraph(void)
2249 {
2250 	struct witness *w;
2252 
2253 	if (witness_watch < 1) {
2254 		db_printf("witness watch is disabled\n");
2255 		return;
2256 	}
2257 	if (witness_cold) {
2258 		db_printf("witness is cold\n");
2259 		return;
2260 	}
2262 
2263 	mtx_enter(&w_mtx);
2264 	SIMPLEQ_FOREACH(w, &w_all, w_list)
2265 		w->w_displayed = 0;
2266 	SIMPLEQ_FOREACH(w, &w_all, w_list)
2267 		db_witness_add_fullgraph(w);
2268 	mtx_leave(&w_mtx);
2269 }
2270 
2271 static void
2272 db_witness_add_fullgraph(struct witness *w)
2273 {
2274 	int i;
2275 
2276 	if (w->w_displayed != 0 || w->w_acquired == 0)
2277 		return;
2278 	w->w_displayed = 1;
2279 
2280 	WITNESS_INDEX_ASSERT(w->w_index);
2281 	for (i = 1; i <= w_max_used_index; i++) {
2282 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
2283 			db_printf("\"%s\",\"%s\"\n", w->w_type->lt_name,
2284 			    w_data[i].w_type->lt_name);
2285 			db_witness_add_fullgraph(&w_data[i]);
2286 		}
2287 	}
2288 }
2289 
2290 /*
2291  * A simple hash function (Bernstein's djb2).  Takes a key pointer and a
2292  * key size.  If size == 0, interprets the key as a string and reads until
2293  * the null terminator.  Otherwise, reads the first size bytes.  Returns
2294  * an unsigned 32-bit hash value computed from the key.
2295  */
2296 static uint32_t
2297 witness_hash_djb2(const uint8_t *key, uint32_t size)
2298 {
2299 	unsigned int hash = 5381;
2300 	int i;
2301 
2302 	/* hash = hash * 33 + key[i] */
2303 	if (size)
2304 		for (i = 0; i < size; i++)
2305 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
2306 	else
2307 		for (i = 0; key[i] != 0; i++)
2308 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
2309 
2310 	return (hash);
2311 }
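/*
 * Illustrative only: with the shift written out as a multiplication,
 * hashing the two-byte string "ab" proceeds as
 *
 *	hash = 5381
 *	hash = 5381   * 33 + 'a' = 177573  + 97 = 177670
 *	hash = 177670 * 33 + 'b' = 5863110 + 98 = 5863208
 *
 * with the result truncated to 32 bits (no truncation occurs here).
 */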
2312 
2314 /*
2315  * Initializes the two witness hash tables. Called exactly once from
2316  * witness_initialize().
2317  */
2318 static void
2319 witness_init_hash_tables(void)
2320 {
2321 	int i;
2322 
2323 	KASSERT(witness_cold);
2324 
2325 	/* Initialize the hash tables. */
2326 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
2327 		w_hash.wh_array[i] = NULL;
2328 
2329 	w_hash.wh_size = WITNESS_HASH_SIZE;
2330 	w_hash.wh_count = 0;
2331 
2332 	/* Initialize the lock order data hash. */
2333 	w_lofree = NULL;
2334 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
2335 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
2336 		w_lodata[i].wlod_next = w_lofree;
2337 		w_lofree = &w_lodata[i];
2338 	}
2339 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
2340 	w_lohash.wloh_count = 0;
2341 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
2342 		w_lohash.wloh_array[i] = NULL;
2343 }
2344 
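/*
 * Look up a witness by its lock type and subtype.  The hash key is
 * derived from the two pointer values, so an identical type/subtype
 * pair always maps to the same bucket.  Returns NULL if no match is
 * found.
 */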
2345 static struct witness *
2346 witness_hash_get(const struct lock_type *type, const char *subtype)
2347 {
2348 	struct witness *w;
2349 	uint32_t hash;
2350 
2351 	KASSERT(type != NULL);
2352 	if (witness_cold == 0)
2353 		MUTEX_ASSERT_LOCKED(&w_mtx);
2354 	hash = (uint32_t)((uintptr_t)type ^ (uintptr_t)subtype) %
2355 	    w_hash.wh_size;
2356 	w = w_hash.wh_array[hash];
2357 	while (w != NULL) {
2358 		if (w->w_type == type && w->w_subtype == subtype)
2359 			goto out;
2360 		w = w->w_hash_next;
2361 	}
2362 
2363 out:
2364 	return (w);
2365 }
2366 
2367 static void
2368 witness_hash_put(struct witness *w)
2369 {
2370 	uint32_t hash;
2371 
2372 	KASSERT(w != NULL);
2373 	KASSERT(w->w_type != NULL);
2374 	if (witness_cold == 0)
2375 		MUTEX_ASSERT_LOCKED(&w_mtx);
2376 	KASSERTMSG(witness_hash_get(w->w_type, w->w_subtype) == NULL,
2377 	    "%s: trying to add a hash entry that already exists!", __func__);
2378 	KASSERTMSG(w->w_hash_next == NULL,
2379 	    "%s: w->w_hash_next != NULL", __func__);
2380 
2381 	hash = (uint32_t)((uintptr_t)w->w_type ^ (uintptr_t)w->w_subtype) %
2382 	    w_hash.wh_size;
2383 	w->w_hash_next = w_hash.wh_array[hash];
2384 	w_hash.wh_array[hash] = w;
2385 	w_hash.wh_count++;
2386 }
2387 
2389 static struct witness_lock_order_data *
2390 witness_lock_order_get(struct witness *parent, struct witness *child)
2391 {
2392 	struct witness_lock_order_data *data = NULL;
2393 	struct witness_lock_order_key key;
2394 	unsigned int hash;
2395 
2396 	KASSERT(parent != NULL && child != NULL);
2397 	key.from = parent->w_index;
2398 	key.to = child->w_index;
2399 	WITNESS_INDEX_ASSERT(key.from);
2400 	WITNESS_INDEX_ASSERT(key.to);
2401 	if ((w_rmatrix[parent->w_index][child->w_index]
2402 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
2403 		goto out;
2404 
2405 	hash = witness_hash_djb2((const uint8_t *)&key,
2406 	    sizeof(key)) % w_lohash.wloh_size;
2407 	data = w_lohash.wloh_array[hash];
2408 	while (data != NULL) {
2409 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
2410 			break;
2411 		data = data->wlod_next;
2412 	}
2413 
2414 out:
2415 	return (data);
2416 }
2417 
2418 /*
2419  * Verify that parent and child have a known relationship, are not the same,
2420  * and child is actually a child of parent.  This is done without w_mtx
2421  * to avoid contention in the common case.
2422  */
2423 static int
2424 witness_lock_order_check(struct witness *parent, struct witness *child)
2425 {
2426 
2427 	if (parent != child &&
2428 	    w_rmatrix[parent->w_index][child->w_index]
2429 	    & WITNESS_LOCK_ORDER_KNOWN &&
2430 	    isitmychild(parent, child))
2431 		return (1);
2432 
2433 	return (0);
2434 }
2435 
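/*
 * Record the order parent -> child as known and save the current stack
 * trace in the lock order hash if a free data slot is available.
 * Returns 0 only when the order is new but no slot could be allocated.
 */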
2436 static int
2437 witness_lock_order_add(struct witness *parent, struct witness *child)
2438 {
2439 	struct witness_lock_order_data *data = NULL;
2440 	struct witness_lock_order_key key;
2441 	unsigned int hash;
2442 
2443 	KASSERT(parent != NULL && child != NULL);
2444 	key.from = parent->w_index;
2445 	key.to = child->w_index;
2446 	WITNESS_INDEX_ASSERT(key.from);
2447 	WITNESS_INDEX_ASSERT(key.to);
2448 	if (w_rmatrix[parent->w_index][child->w_index]
2449 	    & WITNESS_LOCK_ORDER_KNOWN)
2450 		return (1);
2451 
2452 	hash = witness_hash_djb2((const uint8_t *)&key,
2453 	    sizeof(key)) % w_lohash.wloh_size;
2454 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
2455 	data = w_lofree;
2456 	if (data == NULL)
2457 		return (0);
2458 	w_lofree = data->wlod_next;
2459 	data->wlod_next = w_lohash.wloh_array[hash];
2460 	data->wlod_key = key;
2461 	w_lohash.wloh_array[hash] = data;
2462 	w_lohash.wloh_count++;
2463 	stacktrace_save(&data->wlod_stack);
2464 	return (1);
2465 }
2466 
2467 /* Call this whenever the structure of the witness graph changes. */
2468 static void
2469 witness_increment_graph_generation(void)
2470 {
2471 
2472 	if (witness_cold == 0)
2473 		MUTEX_ASSERT_LOCKED(&w_mtx);
2474 	w_generation++;
2475 }
2476 
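/*
 * React to a diagnosed locking error according to witness_watch:
 * level 1 leaves it at the report the caller has already printed,
 * level 2 also dumps the stack (when the caller requests it), and
 * level 3 additionally drops into the kernel debugger.
 */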
2477 static void
2478 witness_debugger(int dump)
2479 {
2480 	switch (witness_watch) {
2481 	case 1:
2482 		break;
2483 	case 2:
2484 		if (dump)
2485 			db_stack_dump();
2486 		break;
2487 	case 3:
2488 		if (dump)
2489 			db_stack_dump();
2490 		db_enter();
2491 		break;
2492 	default:
2493 		panic("witness: locking error");
2494 	}
2495 }
2496 
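/*
 * Make sure enough lock stacks are allocated to back every possible
 * held-lock slot.  Called with w_ctlock write-locked when the
 * kern.witness.locktrace sysctl is being enabled.
 */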
2497 static int
2498 witness_alloc_stacks(void)
2499 {
2500 	union lock_stack *stacks;
2501 	unsigned int i, nstacks = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
2502 
2503 	rw_assert_wrlock(&w_ctlock);
2504 
2505 	if (w_lock_stack_num >= nstacks)
2506 		return (0);
2507 
2508 	nstacks -= w_lock_stack_num;
2509 	stacks = mallocarray(nstacks, sizeof(*stacks), M_WITNESS,
2510 	    M_WAITOK | M_CANFAIL | M_ZERO);
2511 	if (stacks == NULL)
2512 		return (ENOMEM);
2513 
2514 	mtx_enter(&w_mtx);
2515 	for (i = 0; i < nstacks; i++) {
2516 		stacks[i].ls_next = w_lock_stack_free;
2517 		w_lock_stack_free = &stacks[i];
2518 	}
2519 	mtx_leave(&w_mtx);
2520 	w_lock_stack_num += nstacks;
2521 
2522 	return (0);
2523 }
2524 
2525 int
2526 witness_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
2527     void *newp, size_t newlen)
2528 {
2529 	int error, value;
2530 
2531 	if (namelen != 1)
2532 		return (ENOTDIR);
2533 
2534 	rw_enter_write(&w_ctlock);
2535 
2536 	switch (name[0]) {
2537 	case KERN_WITNESS_WATCH:
2538 		error = witness_sysctl_watch(oldp, oldlenp, newp, newlen);
2539 		break;
2540 	case KERN_WITNESS_LOCKTRACE:
2541 		value = witness_locktrace;
2542 		error = sysctl_int(oldp, oldlenp, newp, newlen, &value);
2543 		if (error == 0 && newp != NULL) {
2544 			switch (value) {
2545 			case 1:
2546 				error = witness_alloc_stacks();
2547 				/* FALLTHROUGH */
2548 			case 0:
2549 				if (error == 0)
2550 					witness_locktrace = value;
2551 				break;
2552 			default:
2553 				error = EINVAL;
2554 				break;
2555 			}
2556 		}
2557 		break;
2558 	default:
2559 		error = EOPNOTSUPP;
2560 		break;
2561 	}
2562 
2563 	rw_exit_write(&w_ctlock);
2564 
2565 	return (error);
2566 }
2567 
2568 int
2569 witness_sysctl_watch(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
2570 {
2571 	int error;
2572 	int value;
2573 
2574 	value = witness_watch;
2575 	error = sysctl_int(oldp, oldlenp, newp, newlen, &value);
2576 	if (error == 0 && newp != NULL) {
2577 		if (value >= -1 && value <= 3) {
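			/*
			 * Once witness has shut itself off permanently
			 * (witness_watch < 0), only further negative
			 * values are accepted; re-enabling is refused.
			 */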
2578 			mtx_enter(&w_mtx);
2579 			if (value < 0 || witness_watch >= 0)
2580 				witness_watch = value;
2581 			else
2582 				error = EINVAL;
2583 			mtx_leave(&w_mtx);
2584 		} else
2585 			error = EINVAL;
2586 	}
2587 	return (error);
2588 }
2589 }
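/*
 * Illustrative userland sketch, not part of this file: adjusting the
 * watch level through sysctl(2).  The MIB path { CTL_KERN, KERN_WITNESS,
 * KERN_WITNESS_WATCH } is assumed here; from the shell the equivalent
 * would be "sysctl kern.witness.watch=2".  Guarded with #if 0 so it has
 * no effect on the kernel build.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int mib[3] = { CTL_KERN, KERN_WITNESS, KERN_WITNESS_WATCH };
	int newval = 2;		/* report errors and dump a stack trace */
	int oldval;
	size_t len = sizeof(oldval);

	/* Fetch the previous level and install the new one in one call. */
	if (sysctl(mib, 3, &oldval, &len, &newval, sizeof(newval)) == -1) {
		perror("sysctl");
		return (1);
	}
	printf("kern.witness.watch: %d -> %d\n", oldval, newval);
	return (0);
}
#endif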