1 /*	$OpenBSD: subr_witness.c,v 1.47 2021/03/23 10:22:20 mpi Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 Isilon Systems, Inc.
5  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
6  * Copyright (c) 1998 Berkeley Software Design, Inc.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Berkeley Software Design Inc's name may not be used to endorse or
18  *    promote products derived from this software without specific prior
19  *    written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	from BSDI Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp
34  *	and BSDI Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp
35  */
36 
37 /*
38  * Implementation of the `witness' lock verifier.  Originally implemented for
39  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
40  * classes in FreeBSD.
41  */
42 
43 /*
44  *	Main Entry: witness
45  *	Pronunciation: 'wit-n&s
46  *	Function: noun
47  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
48  *	    testimony, witness, from 2wit
49  *	Date: before 12th century
50  *	1 : attestation of a fact or event : TESTIMONY
51  *	2 : one that gives evidence; specifically : one who testifies in
52  *	    a cause or before a judicial tribunal
53  *	3 : one asked to be present at a transaction so as to be able to
54  *	    testify to its having taken place
55  *	4 : one who has personal knowledge of something
56  *	5 a : something serving as evidence or proof : SIGN
57  *	  b : public affirmation by word or example of usually
58  *	      religious faith or conviction <the heroic witness to divine
59  *	      life -- Pilot>
60  *	6 capitalized : a member of the Jehovah's Witnesses
61  */
62 
63 /*
64  * Special rules concerning Giant and lock orders:
65  *
66  * 1) Giant must be acquired before any other mutexes.  Stated another way,
67  *    no other mutex may be held when Giant is acquired.
68  *
69  * 2) Giant must be released when blocking on a sleepable lock.
70  *
71  * This rule is less obvious, but is a result of Giant providing the same
72  * semantics as spl().  Basically, when a thread sleeps, it must release
73  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
74  * 2).
75  *
76  * 3) Giant may be acquired before or after sleepable locks.
77  *
78  * This rule is also not quite as obvious.  Giant may be acquired after
79  * a sleepable lock because it is a non-sleepable lock and non-sleepable
80  * locks may always be acquired while holding a sleepable lock.  The second
81  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
82  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
83  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
84  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
85  * execute.  Thus, acquiring Giant both before and after a sleepable lock
86  * will not result in a lock order reversal.
87  */
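
/*
 * An illustrative sketch (not compiled) of rules 2) and 3) above, using a
 * hypothetical sleepable rwlock.  In OpenBSD the kernel lock plays the
 * role of Giant; both acquisition orders below are accepted because
 * blocking on the sleepable lock releases the kernel lock.
 */
#if 0
void
example_giant_order(void)
{
	struct rwlock slock;		/* a sleepable lock */

	rw_init(&slock, "slock");

	/* Giant after a sleepable lock... */
	rw_enter_write(&slock);
	KERNEL_LOCK();
	KERNEL_UNLOCK();
	rw_exit_write(&slock);

	/* ...and Giant before one; rule 2) makes this safe. */
	KERNEL_LOCK();
	rw_enter_write(&slock);
	rw_exit_write(&slock);
	KERNEL_UNLOCK();
}
#endif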
88 
89 #include <sys/param.h>
90 #include <sys/systm.h>
91 #include <sys/kernel.h>
92 #include <sys/malloc.h>
93 #ifdef MULTIPROCESSOR
94 #include <sys/mplock.h>
95 #endif
96 #include <sys/mutex.h>
97 #include <sys/percpu.h>
98 #include <sys/proc.h>
99 #include <sys/sched.h>
100 #include <sys/stacktrace.h>
101 #include <sys/stdint.h>
102 #include <sys/sysctl.h>
103 #include <sys/syslog.h>
104 #include <sys/witness.h>
105 
106 #include <machine/cpu.h>
107 
108 #include <uvm/uvm_extern.h>	/* uvm_pageboot_alloc */
109 
110 #ifndef DDB
111 #error "DDB is required for WITNESS"
112 #endif
113 
114 #include <machine/db_machdep.h>
115 
116 #include <ddb/db_access.h>
117 #include <ddb/db_var.h>
118 #include <ddb/db_output.h>
119 
120 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
121 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
122 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
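
/*
 * Note that the recursion count occupies the low 16 bits of li_flags,
 * which is why witness_lock() and witness_unlock() below can recurse and
 * unrecurse with a plain li_flags++ / li_flags-- without disturbing the
 * LI_EXCLUSIVE and LI_NORELEASE bits above.
 */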
123 
124 #ifndef WITNESS_COUNT
125 #define	WITNESS_COUNT		1536
126 #endif
127 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
128 #define	WITNESS_PENDLIST	(1024 + MAXCPUS)
129 
130 /* Allocate 256 KB of stack data space */
131 #define	WITNESS_LO_DATA_COUNT	2048
132 
133 /* Prime, gives load factor of ~2 at full load */
134 #define	WITNESS_LO_HASH_SIZE	1021
135 
136 /*
137  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
138  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
139  * probably be safe for the most part, but it's still a SWAG.
140  */
141 #define	LOCK_NCHILDREN	5
142 #define	LOCK_CHILDCOUNT	2048
143 
144 #define	FULLGRAPH_SBUF_SIZE	512
145 
146 /*
147  * These flags go in the witness relationship matrix and describe the
148  * relationship between any two struct witness objects.
149  */
150 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
151 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
152 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
153 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
154 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
155 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
156 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
157 #define	WITNESS_RELATED_MASK						\
158 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
159 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
160 					  * observed. */
161 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
162 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
163 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
164 
165 /* Descendant to ancestor flags */
166 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
167 
168 /* Ancestor to descendant flags */
169 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
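
/*
 * A worked example of the two conversions above: the descendant-side
 * flags are the ancestor-side flags shifted left by two.  If
 * w_rmatrix[p][c] holds WITNESS_PARENT (0x01), then WITNESS_ATOD(0x01)
 * == WITNESS_CHILD (0x04) is what w_rmatrix[c][p] must hold, and
 * WITNESS_DTOA(0x04) maps it back to 0x01.  _isitmyx() relies on this
 * round trip.
 */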
170 
171 #define	WITNESS_INDEX_ASSERT(i)						\
172 	KASSERT((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
173 
174 /*
175  * Lock classes.  Each lock has a class which describes characteristics
176  * common to all types of locks of a given class.
177  *
178  * Spin locks in general must always protect against preemption, as it is
179  * an error to perform any type of context switch while holding a spin lock.
180  * Also, for an individual lock to be recursable, its class must allow
181  * recursion and the lock itself must explicitly allow recursion.
182  */
183 
184 struct lock_class {
185 	const		char *lc_name;
186 	u_int		lc_flags;
187 };
188 
189 union lock_stack {
190 	union lock_stack	*ls_next;
191 	struct stacktrace	 ls_stack;
192 };
193 
194 #define	LC_SLEEPLOCK	0x00000001	/* Sleep lock. */
195 #define	LC_SPINLOCK	0x00000002	/* Spin lock. */
196 #define	LC_SLEEPABLE	0x00000004	/* Sleeping allowed with this lock. */
197 #define	LC_RECURSABLE	0x00000008	/* Locks of this type may recurse. */
198 #define	LC_UPGRADABLE	0x00000010	/* Upgrades and downgrades permitted. */
199 
200 /*
201  * Lock instances.  A lock instance is the data associated with a lock while
202  * it is held by witness.  For example, a lock instance will hold the
203  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
204  * are held in a per-cpu list while sleep locks are held in a per-thread list.
205  */
206 struct lock_instance {
207 	struct lock_object	*li_lock;
208 	union lock_stack	*li_stack;
209 	u_int			li_flags;
210 };
211 
212 /*
213  * A simple list type used to build the list of locks held by a thread
214  * or CPU.  We can't simply embed the list in struct lock_object since a
215  * lock may be held by more than one thread if it is a shared lock.  Locks
216  * are added to the head of the list, so we fill up each list entry from
217  * "the back" logically.  To ease some of the arithmetic, we actually fill
218  * in each list entry the normal way (children[0] then children[1], etc.) but
219  * when we traverse the list we read children[count-1] as the first entry
220  * down to children[0] as the final entry.
221  */
222 struct lock_list_entry {
223 	struct lock_list_entry	*ll_next;
224 	struct lock_instance	ll_children[LOCK_NCHILDREN];
225 	int			ll_count;
226 };
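
/*
 * A minimal sketch (not compiled) of the traversal order described
 * above; "head" and visit() are hypothetical.  Entries are visited from
 * the most recently acquired lock down to the oldest one.
 */
#if 0
void
example_walk(struct lock_list_entry *head)
{
	struct lock_list_entry *lle;
	int i;

	for (lle = head; lle != NULL; lle = lle->ll_next)
		for (i = lle->ll_count - 1; i >= 0; i--)
			visit(&lle->ll_children[i]);	/* newest first */
}
#endif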
227 
228 /*
229  * The main witness structure. One of these per named lock type in the system
230  * (for example, "vnode interlock").
231  */
232 struct witness {
233 	const struct lock_type	*w_type;
234 	const char		*w_subtype;
235 	uint32_t		w_index;  /* Index in the relationship matrix */
236 	struct lock_class	*w_class;
237 	SLIST_ENTRY(witness)	w_list;		/* List of all witnesses. */
238 	SLIST_ENTRY(witness)	w_typelist;	/* Witnesses of a type. */
239 	SLIST_ENTRY(witness)	w_hash_next;	/* Linked list in
240 						 * hash buckets. */
241 	uint16_t		w_num_ancestors; /* direct/indirect
242 						  * ancestor count */
243 	uint16_t		w_num_descendants; /* direct/indirect
244 						    * descendant count */
245 	int16_t			w_ddb_level;
246 	unsigned		w_acquired:1;
247 	unsigned		w_displayed:1;
248 	unsigned		w_reversed:1;
249 };
250 
251 SLIST_HEAD(witness_list, witness);
252 
253 /*
254  * The witness hash table. Keys are witness names (const char *), elements are
255  * witness objects (struct witness *).
256  */
257 struct witness_hash {
258 	struct witness_list	wh_array[WITNESS_HASH_SIZE];
259 	uint32_t		wh_size;
260 	uint32_t		wh_count;
261 };
262 
263 /*
264  * Key type for the lock order data hash table.
265  */
266 struct witness_lock_order_key {
267 	uint16_t	from;
268 	uint16_t	to;
269 };
270 
271 struct witness_lock_order_data {
272 	struct stacktrace		wlod_stack;
273 	struct witness_lock_order_key	wlod_key;
274 	struct witness_lock_order_data	*wlod_next;
275 };
276 
277 /*
278  * The witness lock order data hash table. Keys are witness index tuples
279  * (struct witness_lock_order_key), elements are lock order data objects
280  * (struct witness_lock_order_data).
281  */
282 struct witness_lock_order_hash {
283 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
284 	u_int	wloh_size;
285 	u_int	wloh_count;
286 };
287 
288 struct witness_pendhelp {
289 	const struct lock_type	*wh_type;
290 	struct lock_object	*wh_lock;
291 };
292 
293 struct witness_cpu {
294 	struct lock_list_entry	*wc_spinlocks;
295 	struct lock_list_entry	*wc_lle_cache;
296 	union lock_stack	*wc_stk_cache;
297 	unsigned int		 wc_lle_count;
298 	unsigned int		 wc_stk_count;
299 } __aligned(CACHELINESIZE);
300 
301 #define WITNESS_LLE_CACHE_MAX	8
302 #define WITNESS_STK_CACHE_MAX	(WITNESS_LLE_CACHE_MAX * LOCK_NCHILDREN)
303 
304 struct witness_cpu witness_cpu[MAXCPUS];
305 
306 /*
307  * Returns 0 if one of the locks is a spin lock and the other is not.
308  * Returns 1 otherwise.
309  */
310 static __inline int
311 witness_lock_type_equal(struct witness *w1, struct witness *w2)
312 {
313 
314 	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
315 		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
316 }
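
/*
 * For example, two mutexes compare equal here, as do a rwlock and a
 * rrwlock (both sleep locks), but a mutex and a rwlock do not: one class
 * carries LC_SPINLOCK and the other LC_SLEEPLOCK.
 */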
317 
318 static __inline int
319 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
320     const struct witness_lock_order_key *b)
321 {
322 
323 	return (a->from == b->from && a->to == b->to);
324 }
325 
326 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
327 		    const char *fname);
328 static void	adopt(struct witness *parent, struct witness *child);
329 static struct witness	*enroll(const struct lock_type *, const char *,
330 			    struct lock_class *);
331 static struct lock_instance	*find_instance(struct lock_list_entry *list,
332 				    const struct lock_object *lock);
333 static int	isitmychild(struct witness *parent, struct witness *child);
334 static int	isitmydescendant(struct witness *parent, struct witness *child);
335 static void	itismychild(struct witness *parent, struct witness *child);
336 #ifdef DDB
337 static void	db_witness_add_fullgraph(struct witness *parent);
338 static void	witness_ddb_compute_levels(void);
339 static void	witness_ddb_display(int(*)(const char *fmt, ...));
340 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
341 		    struct witness *, int indent);
342 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
343 		    struct witness_list *list);
344 static void	witness_ddb_level_descendants(struct witness *parent, int l);
345 static void	witness_ddb_list(struct proc *td);
346 #endif
347 static int	witness_alloc_stacks(void);
348 static void	witness_debugger(int dump);
349 static void	witness_free(struct witness *m);
350 static struct witness	*witness_get(void);
351 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
352 static struct witness	*witness_hash_get(const struct lock_type *,
353 		    const char *);
354 static void	witness_hash_put(struct witness *w);
355 static void	witness_init_hash_tables(void);
356 static void	witness_increment_graph_generation(void);
357 static int	witness_list_locks(struct lock_list_entry **,
358 		    int (*)(const char *, ...));
359 static void	witness_lock_list_free(struct lock_list_entry *lle);
360 static struct lock_list_entry	*witness_lock_list_get(void);
361 static void	witness_lock_stack_free(union lock_stack *stack);
362 static union lock_stack		*witness_lock_stack_get(void);
363 static int	witness_lock_order_add(struct witness *parent,
364 		    struct witness *child);
365 static int	witness_lock_order_check(struct witness *parent,
366 		    struct witness *child);
367 static struct witness_lock_order_data	*witness_lock_order_get(
368 					    struct witness *parent,
369 					    struct witness *child);
370 static void	witness_list_lock(struct lock_instance *instance,
371 		    int (*prnt)(const char *fmt, ...));
372 static void	witness_setflag(struct lock_object *lock, int flag, int set);
373 
374 /*
375  * If set to 0, lock order checking is disabled.  If set to -1,
376  * witness is completely disabled.  Otherwise witness performs full
377  * lock order checking for all locks.  At runtime, lock order checking
378  * may be toggled.  However, witness cannot be reenabled once it is
379  * completely disabled.
380  */
381 #ifdef WITNESS_WATCH
382 static int witness_watch = 3;
383 #else
384 static int witness_watch = 2;
385 #endif
386 
387 #ifdef WITNESS_LOCKTRACE
388 static int witness_locktrace = 1;
389 #else
390 static int witness_locktrace = 0;
391 #endif
392 
393 int witness_count = WITNESS_COUNT;
394 int witness_uninitialized_report = 5;
395 
396 static struct mutex w_mtx;
397 static struct rwlock w_ctlock = RWLOCK_INITIALIZER("w_ctlock");
398 
399 /* w_list */
400 static struct witness_list w_free = SLIST_HEAD_INITIALIZER(w_free);
401 static struct witness_list w_all = SLIST_HEAD_INITIALIZER(w_all);
402 
403 /* w_typelist */
404 static struct witness_list w_spin = SLIST_HEAD_INITIALIZER(w_spin);
405 static struct witness_list w_sleep = SLIST_HEAD_INITIALIZER(w_sleep);
406 
407 /* lock list */
408 static struct lock_list_entry *w_lock_list_free = NULL;
409 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
410 static u_int pending_cnt;
411 
412 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
413 
414 static struct witness *w_data;
415 static uint8_t **w_rmatrix;
416 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
417 static struct witness_hash w_hash;	/* The witness hash table. */
418 
419 /* The lock order data hash */
420 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
421 static struct witness_lock_order_data *w_lofree = NULL;
422 static struct witness_lock_order_hash w_lohash;
423 static int w_max_used_index = 0;
424 static unsigned int w_generation = 0;
425 
426 static union lock_stack *w_lock_stack_free;
427 static unsigned int w_lock_stack_num;
428 
429 static struct lock_class lock_class_kernel_lock = {
430 	.lc_name = "kernel_lock",
431 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE
432 };
433 
434 static struct lock_class lock_class_sched_lock = {
435 	.lc_name = "sched_lock",
436 	.lc_flags = LC_SPINLOCK | LC_RECURSABLE
437 };
438 
439 static struct lock_class lock_class_mutex = {
440 	.lc_name = "mutex",
441 	.lc_flags = LC_SPINLOCK
442 };
443 
444 static struct lock_class lock_class_rwlock = {
445 	.lc_name = "rwlock",
446 	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_UPGRADABLE
447 };
448 
449 static struct lock_class lock_class_rrwlock = {
450 	.lc_name = "rrwlock",
451 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_SLEEPABLE |
452 	    LC_UPGRADABLE
453 };
454 
455 static struct lock_class *lock_classes[] = {
456 	&lock_class_kernel_lock,
457 	&lock_class_sched_lock,
458 	&lock_class_mutex,
459 	&lock_class_rwlock,
460 	&lock_class_rrwlock,
461 };
462 
463 /*
464  * This global is set to 0 once it becomes safe to use the witness code.
465  */
466 static int witness_cold = 1;
467 
468 /*
469  * This global is set to 1 once the static lock orders have been enrolled
470  * so that a warning can be issued for any spin locks enrolled later.
471  */
472 static int witness_spin_warn = 0;
473 
474 /*
475  * The WITNESS-enabled diagnostic code.  Note that the witness code
476  * assumes that early boot is single-threaded, at least until after this
477  * routine is completed.
478  */
479 void
480 witness_initialize(void)
481 {
482 	struct lock_object *lock;
483 	union lock_stack *stacks;
484 	struct witness *w;
485 	int i, s;
486 
487 	w_data = (void *)uvm_pageboot_alloc(sizeof(struct witness) *
488 	    witness_count);
489 	memset(w_data, 0, sizeof(struct witness) * witness_count);
490 
491 	w_rmatrix = (void *)uvm_pageboot_alloc(sizeof(*w_rmatrix) *
492 	    (witness_count + 1));
493 
494 	for (i = 0; i < witness_count + 1; i++) {
495 		w_rmatrix[i] = (void *)uvm_pageboot_alloc(
496 		    sizeof(*w_rmatrix[i]) * (witness_count + 1));
497 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
498 		    (witness_count + 1));
499 	}
500 
501 	mtx_init_flags(&w_mtx, IPL_HIGH, "witness lock", MTX_NOWITNESS);
502 	for (i = witness_count - 1; i >= 0; i--) {
503 		w = &w_data[i];
504 		memset(w, 0, sizeof(*w));
505 		w_data[i].w_index = i;	/* Witness index never changes. */
506 		witness_free(w);
507 	}
508 	KASSERTMSG(SLIST_FIRST(&w_free)->w_index == 0,
509 	    "%s: Invalid list of free witness objects", __func__);
510 
511 	/* Witness with index 0 is left unused, to aid in debugging. */
512 	SLIST_REMOVE_HEAD(&w_free, w_list);
513 	w_free_cnt--;
514 
515 	for (i = 0; i < witness_count; i++) {
516 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) *
517 		    (witness_count + 1));
518 	}
519 
520 	if (witness_locktrace) {
521 		w_lock_stack_num = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
522 		stacks = (void *)uvm_pageboot_alloc(sizeof(*stacks) *
523 		    w_lock_stack_num);
524 	}
525 
526 	s = splhigh();
527 	for (i = 0; i < w_lock_stack_num; i++)
528 		witness_lock_stack_free(&stacks[i]);
529 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
530 		witness_lock_list_free(&w_locklistdata[i]);
531 	splx(s);
532 	witness_init_hash_tables();
533 	witness_spin_warn = 1;
534 
535 	/* Iterate through all locks and add them to witness. */
536 	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
537 		lock = pending_locks[i].wh_lock;
538 		KASSERTMSG(lock->lo_flags & LO_WITNESS,
539 		    "%s: lock %s is on pending list but not LO_WITNESS",
540 		    __func__, lock->lo_name);
541 		lock->lo_witness = enroll(pending_locks[i].wh_type,
542 		    lock->lo_name, LOCK_CLASS(lock));
543 	}
544 
545 	/* Mark the witness code as being ready for use. */
546 	witness_cold = 0;
547 }
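
/*
 * With the default WITNESS_COUNT of 1536, the relationship matrix
 * allocated above holds (1536 + 1) * (1536 + 1) one-byte entries, or
 * roughly 2.3 MB of boot-time memory, in addition to the w_data array
 * of struct witness records.
 */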
548 
549 void
550 witness_init(struct lock_object *lock, const struct lock_type *type)
551 {
552 	struct lock_class *class;
553 
554 	/* Various sanity checks. */
555 	class = LOCK_CLASS(lock);
556 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
557 	    (class->lc_flags & LC_RECURSABLE) == 0)
558 		panic("%s: lock (%s) %s can not be recursable",
559 		    __func__, class->lc_name, lock->lo_name);
560 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
561 	    (class->lc_flags & LC_SLEEPABLE) == 0)
562 		panic("%s: lock (%s) %s can not be sleepable",
563 		    __func__, class->lc_name, lock->lo_name);
564 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
565 	    (class->lc_flags & LC_UPGRADABLE) == 0)
566 		panic("%s: lock (%s) %s can not be upgradable",
567 		    __func__, class->lc_name, lock->lo_name);
568 
569 	/*
570 	 * If we shouldn't watch this lock, then just clear lo_witness.
571 	 * Record the type in case the lock becomes watched later.
572 	 * Otherwise, if witness_cold is set, then it is too early to
573 	 * enroll this lock, so defer it to witness_initialize() by adding
574 	 * it to the pending_locks list.  If it is not too early, then enroll
575 	 * the lock now.
576 	 */
577 	if (witness_watch < 1 || panicstr != NULL || db_active ||
578 	    (lock->lo_flags & LO_WITNESS) == 0) {
579 		lock->lo_witness = NULL;
580 		lock->lo_type = type;
581 	} else if (witness_cold) {
582 		if (pending_cnt >= WITNESS_PENDLIST)
583 			panic("%s: pending locks list is too small, "
584 			    "increase WITNESS_PENDLIST",
585 			    __func__);
586 		pending_locks[pending_cnt].wh_lock = lock;
587 		pending_locks[pending_cnt++].wh_type = type;
588 	} else
589 		lock->lo_witness = enroll(type, lock->lo_name, class);
590 }
591 
592 static inline int
593 is_kernel_lock(const struct lock_object *lock)
594 {
595 #ifdef MULTIPROCESSOR
596 	return (lock == &kernel_lock.mpl_lock_obj);
597 #else
598 	return (0);
599 #endif
600 }
601 
602 #ifdef DDB
603 static void
604 witness_ddb_compute_levels(void)
605 {
606 	struct witness *w;
607 
608 	/*
609 	 * First clear all levels.
610 	 */
611 	SLIST_FOREACH(w, &w_all, w_list)
612 		w->w_ddb_level = -1;
613 
614 	/*
615 	 * Look for locks with no parents and level all their descendants.
616 	 */
617 	SLIST_FOREACH(w, &w_all, w_list) {
618 		/* If the witness has ancestors (is not a root), skip it. */
619 		if (w->w_num_ancestors > 0)
620 			continue;
621 		witness_ddb_level_descendants(w, 0);
622 	}
623 }
624 
625 static void
626 witness_ddb_level_descendants(struct witness *w, int l)
627 {
628 	int i;
629 
630 	if (w->w_ddb_level >= l)
631 		return;
632 
633 	w->w_ddb_level = l;
634 	l++;
635 
636 	for (i = 1; i <= w_max_used_index; i++) {
637 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
638 			witness_ddb_level_descendants(&w_data[i], l);
639 	}
640 }
641 
642 static void
643 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
644     struct witness *w, int indent)
645 {
646 	int i;
647 
648 	for (i = 0; i < indent; i++)
649 		prnt(" ");
650 	prnt("%s (type: %s, depth: %d)",
651 	     w->w_type->lt_name, w->w_class->lc_name, w->w_ddb_level);
652 	if (w->w_displayed) {
653 		prnt(" -- (already displayed)\n");
654 		return;
655 	}
656 	w->w_displayed = 1;
657 	if (!w->w_acquired)
658 		prnt(" -- never acquired\n");
659 	else
660 		prnt("\n");
661 	indent++;
662 	WITNESS_INDEX_ASSERT(w->w_index);
663 	for (i = 1; i <= w_max_used_index; i++) {
664 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
665 			witness_ddb_display_descendants(prnt, &w_data[i],
666 			    indent);
667 	}
668 }
669 
670 static void
671 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
672     struct witness_list *list)
673 {
674 	struct witness *w;
675 
676 	SLIST_FOREACH(w, list, w_typelist) {
677 		if (!w->w_acquired || w->w_ddb_level > 0)
678 			continue;
679 
680 		/* This lock has no ancestors - display its descendants. */
681 		witness_ddb_display_descendants(prnt, w, 0);
682 	}
683 }
684 
685 static void
686 witness_ddb_display(int(*prnt)(const char *fmt, ...))
687 {
688 	struct witness *w;
689 
690 	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
691 	witness_ddb_compute_levels();
692 
693 	/* Clear all the displayed flags. */
694 	SLIST_FOREACH(w, &w_all, w_list)
695 		w->w_displayed = 0;
696 
697 	/*
698 	 * First, handle sleep locks which have been acquired at least
699 	 * once.
700 	 */
701 	prnt("Sleep locks:\n");
702 	witness_ddb_display_list(prnt, &w_sleep);
703 
704 	/*
705 	 * Now do spin locks which have been acquired at least once.
706 	 */
707 	prnt("\nSpin locks:\n");
708 	witness_ddb_display_list(prnt, &w_spin);
709 
710 	/*
711 	 * Finally, any locks which have not been acquired yet.
712 	 */
713 	prnt("\nLocks which were never acquired:\n");
714 	SLIST_FOREACH(w, &w_all, w_list) {
715 		if (w->w_acquired)
716 			continue;
717 		prnt("%s (type: %s, depth: %d)\n", w->w_type->lt_name,
718 		    w->w_class->lc_name, w->w_ddb_level);
719 	}
720 }
721 #endif /* DDB */
722 
723 int
724 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
725 {
726 
727 	if (witness_watch < 0 || panicstr != NULL || db_active)
728 		return (0);
729 
730 	/* Require locks that witness knows about. */
731 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
732 	    lock2->lo_witness == NULL)
733 		return (EINVAL);
734 
735 	MUTEX_ASSERT_UNLOCKED(&w_mtx);
736 	mtx_enter(&w_mtx);
737 
738 	/*
739 	 * If we already have either an explicit or implied lock order that
740 	 * is the other way around, then return an error.
741 	 */
742 	if (witness_watch &&
743 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
744 		mtx_leave(&w_mtx);
745 		return (EINVAL);
746 	}
747 
748 	/* Try to add the new order. */
749 	itismychild(lock1->lo_witness, lock2->lo_witness);
750 	mtx_leave(&w_mtx);
751 	return (0);
752 }
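
/*
 * An illustrative (hypothetical) use of witness_defineorder() above: a
 * subsystem can pre-declare "a before b" so that a later acquisition in
 * the opposite order is reported without the correct order first having
 * to be observed at run time.
 */
#if 0
void
example_defineorder(struct lock_object *a, struct lock_object *b)
{
	/* a and b must both be witness-enrolled locks */
	if (witness_defineorder(a, b) != 0)
		printf("order a -> b conflicts with an existing order\n");
}
#endif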
753 
754 void
755 witness_checkorder(struct lock_object *lock, int flags,
756     struct lock_object *interlock)
757 {
758 	struct lock_list_entry *lock_list, *lle;
759 	struct lock_instance *lock1, *lock2, *plock;
760 	struct lock_class *class, *iclass;
761 	struct proc *p;
762 	struct witness *w, *w1;
763 	int i, j, s;
764 
765 	if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
766 		return;
767 
768 	if ((lock->lo_flags & LO_INITIALIZED) == 0) {
769 		if (witness_uninitialized_report > 0) {
770 			witness_uninitialized_report--;
771 			printf("witness: lock_object uninitialized: %p\n", lock);
772 			witness_debugger(1);
773 		}
774 		lock->lo_flags |= LO_INITIALIZED;
775 	}
776 
777 	if ((lock->lo_flags & LO_WITNESS) == 0)
778 		return;
779 
780 	w = lock->lo_witness;
781 	class = LOCK_CLASS(lock);
782 
783 	if (w == NULL)
784 		w = lock->lo_witness =
785 		    enroll(lock->lo_type, lock->lo_name, class);
786 
787 	p = curproc;
788 
789 	if (class->lc_flags & LC_SLEEPLOCK) {
790 		/*
791 		 * Since spin locks include a critical section, this check
792 		 * implicitly enforces a lock order of all sleep locks before
793 		 * all spin locks.
794 		 */
795 		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
796 		if (lock_list != NULL && lock_list->ll_count > 0) {
797 			panic("acquiring blockable sleep lock with "
798 			    "spinlock or critical section held (%s) %s",
799 			    class->lc_name, lock->lo_name);
800 		}
801 
802 		/*
803 		 * If this is the first lock acquired then just return as
804 		 * no order checking is needed.
805 		 */
806 		lock_list = p->p_sleeplocks;
807 		if (lock_list == NULL || lock_list->ll_count == 0)
808 			return;
809 	} else {
810 
811 		/*
812 		 * If this is the first lock, just return as no order
813 		 * checking is needed.
814 		 */
815 		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
816 		if (lock_list == NULL || lock_list->ll_count == 0)
817 			return;
818 	}
819 
820 	s = splhigh();
821 
822 	/*
823 	 * Check to see if we are recursing on a lock we already own.  If
824 	 * so, make sure that we don't mismatch exclusive and shared lock
825 	 * acquires.
826 	 */
827 	lock1 = find_instance(lock_list, lock);
828 	if (lock1 != NULL) {
829 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
830 		    (flags & LOP_EXCLUSIVE) == 0) {
831 			printf("witness: shared lock of (%s) %s "
832 			    "while exclusively locked\n",
833 			    class->lc_name, lock->lo_name);
834 			panic("excl->share");
835 		}
836 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
837 		    (flags & LOP_EXCLUSIVE) != 0) {
838 			printf("witness: exclusive lock of (%s) %s "
839 			    "while share locked\n",
840 			    class->lc_name, lock->lo_name);
841 			panic("share->excl");
842 		}
843 		goto out_splx;
844 	}
845 
846 	/* Warn if the interlock is not locked exactly once. */
847 	if (interlock != NULL) {
848 		iclass = LOCK_CLASS(interlock);
849 		lock1 = find_instance(lock_list, interlock);
850 		if (lock1 == NULL)
851 			panic("interlock (%s) %s not locked",
852 			    iclass->lc_name, interlock->lo_name);
853 		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
854 			panic("interlock (%s) %s recursed",
855 			    iclass->lc_name, interlock->lo_name);
856 	}
857 
858 	/*
859 	 * Find the previously acquired lock, but ignore interlocks.
860 	 */
861 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
862 	if (interlock != NULL && plock->li_lock == interlock) {
863 		if (lock_list->ll_count > 1)
864 			plock =
865 			    &lock_list->ll_children[lock_list->ll_count - 2];
866 		else {
867 			lle = lock_list->ll_next;
868 
869 			/*
870 			 * The interlock is the only lock we hold, so
871 			 * simply return.
872 			 */
873 			if (lle == NULL)
874 				goto out_splx;
875 			plock = &lle->ll_children[lle->ll_count - 1];
876 		}
877 	}
878 
879 	/*
880 	 * Try to perform most checks without a lock.  If this succeeds we
881 	 * can skip acquiring the lock and return success.  Otherwise we redo
882 	 * the check with the lock held to handle races with concurrent updates.
883 	 */
884 	w1 = plock->li_lock->lo_witness;
885 	if (witness_lock_order_check(w1, w))
886 		goto out_splx;
887 
888 	mtx_enter(&w_mtx);
889 	if (witness_lock_order_check(w1, w))
890 		goto out;
891 
892 	witness_lock_order_add(w1, w);
893 
894 	/*
895 	 * Check for duplicate locks of the same type.  Note that we only
896 	 * have to check for this on the last lock we just acquired.  Any
897 	 * other cases will be caught as lock order violations.
898 	 */
899 	if (w1 == w) {
900 		i = w->w_index;
901 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
902 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
903 			w_rmatrix[i][i] |= WITNESS_REVERSAL;
904 			w->w_reversed = 1;
905 			mtx_leave(&w_mtx);
906 			printf("witness: acquiring duplicate lock of "
907 			    "same type: \"%s\"\n", w->w_type->lt_name);
908 			printf(" 1st %s\n", plock->li_lock->lo_name);
909 			printf(" 2nd %s\n", lock->lo_name);
910 			witness_debugger(1);
911 		} else
912 			mtx_leave(&w_mtx);
913 		goto out_splx;
914 	}
915 	MUTEX_ASSERT_LOCKED(&w_mtx);
916 
917 	/*
918 	 * If we know that the lock we are acquiring comes after
919 	 * the lock we most recently acquired in the lock order tree,
920 	 * then there is no need for any further checks.
921 	 */
922 	if (isitmychild(w1, w))
923 		goto out;
924 
925 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
926 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
927 
928 			KASSERT(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
929 			lock1 = &lle->ll_children[i];
930 
931 			/*
932 			 * Ignore the interlock.
933 			 */
934 			if (interlock == lock1->li_lock)
935 				continue;
936 
937 			/*
938 			 * If this lock doesn't undergo witness checking,
939 			 * then skip it.
940 			 */
941 			w1 = lock1->li_lock->lo_witness;
942 			if (w1 == NULL) {
943 				KASSERTMSG((lock1->li_lock->lo_flags &
944 				    LO_WITNESS) == 0,
945 				    "lock missing witness structure");
946 				continue;
947 			}
948 
949 			/*
950 			 * If we are locking Giant and this is a sleepable
951 			 * lock, then skip it.
952 			 */
953 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
954 			    is_kernel_lock(lock))
955 				continue;
956 
957 			/*
958 			 * If we are locking a sleepable lock and this lock
959 			 * is Giant, then skip it.
960 			 */
961 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
962 			    is_kernel_lock(lock1->li_lock))
963 				continue;
964 
965 			/*
966 			 * If we are locking a sleepable lock and this lock
967 			 * isn't sleepable, we want to treat it as a lock
968 			 * order violation to enforce a general lock order of
969 			 * sleepable locks before non-sleepable locks.
970 			 */
971 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
972 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
973 				goto reversal;
974 
975 			/*
976 			 * If we are locking Giant and this is a non-sleepable
977 			 * lock, then treat it as a reversal.
978 			 */
979 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
980 			    is_kernel_lock(lock))
981 				goto reversal;
982 
983 			/*
984 			 * Check the lock order hierarchy for a reversal.
985 			 */
986 			if (!isitmydescendant(w, w1))
987 				continue;
988 		reversal:
989 
990 			/*
991 			 * We have a lock order violation, check to see if it
992 			 * is allowed or has already been yelled about.
993 			 */
994 
995 			/* Bail if this violation is known */
996 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
997 				goto out;
998 
999 			/* Record this as a violation */
1000 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
1001 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
1002 			w->w_reversed = w1->w_reversed = 1;
1003 			witness_increment_graph_generation();
1004 			mtx_leave(&w_mtx);
1005 
1006 			/*
1007 			 * There are known LORs between VNODE locks. They are
1008 			 * not an indication of a bug. VNODE locks are flagged
1009 			 * as such (LO_IS_VNODE) and we don't yell if the LOR
1010 			 * is between 2 VNODE locks.
1011 			 */
1012 			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
1013 			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
1014 				goto out_splx;
1015 
1016 			/*
1017 			 * Ok, yell about it.
1018 			 */
1019 			printf("witness: ");
1020 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
1021 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
1022 				printf("lock order reversal: "
1023 				    "(sleepable after non-sleepable)\n");
1024 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
1025 			    && is_kernel_lock(lock))
1026 				printf("lock order reversal: "
1027 				    "(Giant after non-sleepable)\n");
1028 			else
1029 				printf("lock order reversal:\n");
1030 
1031 			/*
1032 			 * Try to locate an earlier lock with
1033 			 * witness w in our list.
1034 			 */
1035 			do {
1036 				lock2 = &lle->ll_children[i];
1037 				KASSERT(lock2->li_lock != NULL);
1038 				if (lock2->li_lock->lo_witness == w)
1039 					break;
1040 				if (i == 0 && lle->ll_next != NULL) {
1041 					lle = lle->ll_next;
1042 					i = lle->ll_count - 1;
1043 					KASSERT(i >= 0 && i < LOCK_NCHILDREN);
1044 				} else
1045 					i--;
1046 			} while (i >= 0);
1047 			if (i < 0) {
1048 				printf(" 1st %p %s (%s)\n",
1049 				    lock1->li_lock, lock1->li_lock->lo_name,
1050 				    w1->w_type->lt_name);
1051 				printf(" 2nd %p %s (%s)\n",
1052 				    lock, lock->lo_name, w->w_type->lt_name);
1053 			} else {
1054 				printf(" 1st %p %s (%s)\n",
1055 				    lock2->li_lock, lock2->li_lock->lo_name,
1056 				    lock2->li_lock->lo_witness->w_type->
1057 				      lt_name);
1058 				printf(" 2nd %p %s (%s)\n",
1059 				    lock1->li_lock, lock1->li_lock->lo_name,
1060 				    w1->w_type->lt_name);
1061 				printf(" 3rd %p %s (%s)\n", lock,
1062 				    lock->lo_name, w->w_type->lt_name);
1063 			}
1064 			if (witness_watch > 1) {
1065 				struct witness_lock_order_data *wlod1, *wlod2;
1066 
1067 				mtx_enter(&w_mtx);
1068 				wlod1 = witness_lock_order_get(w, w1);
1069 				wlod2 = witness_lock_order_get(w1, w);
1070 				mtx_leave(&w_mtx);
1071 
1072 				/*
1073 				 * It is safe to access saved stack traces,
1074 				 * w_type, and w_class without the lock.
1075 				 * Once written, they never change.
1076 				 */
1077 
1078 				if (wlod1 != NULL) {
1079 					printf("lock order \"%s\"(%s) -> "
1080 					    "\"%s\"(%s) first seen at:\n",
1081 					    w->w_type->lt_name,
1082 					    w->w_class->lc_name,
1083 					    w1->w_type->lt_name,
1084 					    w1->w_class->lc_name);
1085 					stacktrace_print(
1086 					    &wlod1->wlod_stack, printf);
1087 				} else {
1088 					printf("lock order data "
1089 					    "w2 -> w1 missing\n");
1090 				}
1091 				if (wlod2 != NULL) {
1092 					printf("lock order \"%s\"(%s) -> "
1093 					    "\"%s\"(%s) first seen at:\n",
1094 					    w1->w_type->lt_name,
1095 					    w1->w_class->lc_name,
1096 					    w->w_type->lt_name,
1097 					    w->w_class->lc_name);
1098 					stacktrace_print(
1099 					    &wlod2->wlod_stack, printf);
1100 				} else {
1101 					printf("lock order data "
1102 					    "w1 -> w2 missing\n");
1103 				}
1104 			}
1105 			witness_debugger(0);
1106 			goto out_splx;
1107 		}
1108 	}
1109 
1110 	/*
1111 	 * If requested, build a new lock order.  However, don't build a new
1112 	 * relationship between a sleepable lock and Giant if it is in the
1113 	 * wrong direction.  The correct lock order is that sleepable locks
1114 	 * always come before Giant.
1115 	 */
1116 	if (flags & LOP_NEWORDER &&
1117 	    !(is_kernel_lock(plock->li_lock) &&
1118 	    (lock->lo_flags & LO_SLEEPABLE) != 0))
1119 		itismychild(plock->li_lock->lo_witness, w);
1120 out:
1121 	mtx_leave(&w_mtx);
1122 out_splx:
1123 	splx(s);
1124 }
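
/*
 * A hypothetical two-mutex sequence that the checks above would flag.
 * The first pair of acquisitions records the order a -> b via the
 * LOP_NEWORDER path; the second acquires the pair in reverse, so
 * isitmydescendant() finds b below a in the order tree and a lock order
 * reversal is reported.
 */
#if 0
void
example_reversal(void)
{
	struct mutex a, b;

	mtx_init(&a, IPL_NONE);
	mtx_init(&b, IPL_NONE);

	mtx_enter(&a);
	mtx_enter(&b);		/* establishes the order a -> b */
	mtx_leave(&b);
	mtx_leave(&a);

	mtx_enter(&b);
	mtx_enter(&a);		/* reported as a lock order reversal */
	mtx_leave(&a);
	mtx_leave(&b);
}
#endif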
1125 
1126 void
1127 witness_lock(struct lock_object *lock, int flags)
1128 {
1129 	struct lock_list_entry **lock_list, *lle;
1130 	struct lock_instance *instance;
1131 	struct proc *p;
1132 	struct witness *w;
1133 	int s;
1134 
1135 	if (witness_cold || witness_watch < 0 || panicstr != NULL ||
1136 	    db_active || (lock->lo_flags & LO_WITNESS) == 0)
1137 		return;
1138 
1139 	w = lock->lo_witness;
1140 	if (w == NULL)
1141 		w = lock->lo_witness =
1142 		    enroll(lock->lo_type, lock->lo_name, LOCK_CLASS(lock));
1143 
1144 	p = curproc;
1145 
1146 	/* Determine lock list for this lock. */
1147 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
1148 		lock_list = &p->p_sleeplocks;
1149 	else
1150 		lock_list = &witness_cpu[cpu_number()].wc_spinlocks;
1151 
1152 	s = splhigh();
1153 
1154 	/* Check to see if we are recursing on a lock we already own. */
1155 	instance = find_instance(*lock_list, lock);
1156 	if (instance != NULL) {
1157 		instance->li_flags++;
1158 		goto out;
1159 	}
1160 
1161 	w->w_acquired = 1;
1162 
1163 	/* Find the next open lock instance in the list and fill it. */
1164 	lle = *lock_list;
1165 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
1166 		lle = witness_lock_list_get();
1167 		if (lle == NULL)
1168 			goto out;
1169 		lle->ll_next = *lock_list;
1170 		*lock_list = lle;
1171 	}
1172 	instance = &lle->ll_children[lle->ll_count++];
1173 	instance->li_lock = lock;
1174 	if ((flags & LOP_EXCLUSIVE) != 0)
1175 		instance->li_flags = LI_EXCLUSIVE;
1176 	else
1177 		instance->li_flags = 0;
1178 	instance->li_stack = NULL;
1179 	if (witness_locktrace) {
1180 		instance->li_stack = witness_lock_stack_get();
1181 		if (instance->li_stack != NULL)
1182 			stacktrace_save(&instance->li_stack->ls_stack);
1183 	}
1184 out:
1185 	splx(s);
1186 }
1187 
1188 void
1189 witness_upgrade(struct lock_object *lock, int flags)
1190 {
1191 	struct lock_instance *instance;
1192 	struct lock_class *class;
1193 	int s;
1194 
1195 	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
1196 	if (lock->lo_witness == NULL || witness_watch < 0 ||
1197 	    panicstr != NULL || db_active)
1198 		return;
1199 	class = LOCK_CLASS(lock);
1200 	if (witness_watch) {
1201 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
1202 			panic("upgrade of non-upgradable lock (%s) %s",
1203 			    class->lc_name, lock->lo_name);
1204 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
1205 			panic("upgrade of non-sleep lock (%s) %s",
1206 			    class->lc_name, lock->lo_name);
1207 	}
1208 	s = splhigh();
1209 	instance = find_instance(curproc->p_sleeplocks, lock);
1210 	if (instance == NULL) {
1211 		panic("upgrade of unlocked lock (%s) %s",
1212 		    class->lc_name, lock->lo_name);
1213 		goto out;
1214 	}
1215 	if (witness_watch) {
1216 		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
1217 			panic("upgrade of exclusive lock (%s) %s",
1218 			    class->lc_name, lock->lo_name);
1219 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
1220 			panic("upgrade of recursed lock (%s) %s r=%d",
1221 			    class->lc_name, lock->lo_name,
1222 			    instance->li_flags & LI_RECURSEMASK);
1223 	}
1224 	instance->li_flags |= LI_EXCLUSIVE;
1225 out:
1226 	splx(s);
1227 }
1228 
1229 void
1230 witness_downgrade(struct lock_object *lock, int flags)
1231 {
1232 	struct lock_instance *instance;
1233 	struct lock_class *class;
1234 	int s;
1235 
1236 	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
1237 	if (lock->lo_witness == NULL || witness_watch < 0 ||
1238 	    panicstr != NULL || db_active)
1239 		return;
1240 	class = LOCK_CLASS(lock);
1241 	if (witness_watch) {
1242 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
1243 			panic(
1244 			    "downgrade of non-upgradable lock (%s) %s",
1245 			    class->lc_name, lock->lo_name);
1246 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
1247 			panic("downgrade of non-sleep lock (%s) %s",
1248 			    class->lc_name, lock->lo_name);
1249 	}
1250 	s = splhigh();
1251 	instance = find_instance(curproc->p_sleeplocks, lock);
1252 	if (instance == NULL) {
1253 		panic("downgrade of unlocked lock (%s) %s",
1254 		    class->lc_name, lock->lo_name);
1255 		goto out;
1256 	}
1257 	if (witness_watch) {
1258 		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
1259 			panic("downgrade of shared lock (%s) %s",
1260 			    class->lc_name, lock->lo_name);
1261 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
1262 			panic("downgrade of recursed lock (%s) %s r=%d",
1263 			    class->lc_name, lock->lo_name,
1264 			    instance->li_flags & LI_RECURSEMASK);
1265 	}
1266 	instance->li_flags &= ~LI_EXCLUSIVE;
1267 out:
1268 	splx(s);
1269 }
1270 
1271 void
1272 witness_unlock(struct lock_object *lock, int flags)
1273 {
1274 	struct lock_list_entry **lock_list, *lle;
1275 	struct lock_instance *instance;
1276 	struct lock_class *class;
1277 	struct proc *p;
1278 	int i, j;
1279 	int s;
1280 
1281 	if (witness_cold || lock->lo_witness == NULL ||
1282 	    panicstr != NULL || db_active)
1283 		return;
1284 	p = curproc;
1285 	class = LOCK_CLASS(lock);
1286 
1287 	/* Find lock instance associated with this lock. */
1288 	if (class->lc_flags & LC_SLEEPLOCK)
1289 		lock_list = &p->p_sleeplocks;
1290 	else
1291 		lock_list = &witness_cpu[cpu_number()].wc_spinlocks;
1292 
1293 	s = splhigh();
1294 
1295 	lle = *lock_list;
1296 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
1297 		for (i = 0; i < (*lock_list)->ll_count; i++) {
1298 			instance = &(*lock_list)->ll_children[i];
1299 			if (instance->li_lock == lock)
1300 				goto found;
1301 		}
1302 
1303 	/*
1304 	 * When disabling WITNESS through witness_watch we could end up
1305 	 * with registered locks left in the p_sleeplocks queue.
1306 	 * We have to make sure we flush these queues, so search for any
1307 	 * such leftover locks and remove them.
1308 	 */
1309 	if (witness_watch > 0) {
1310 		panic("lock (%s) %s not locked", class->lc_name, lock->lo_name);
1311 	}
1312 	goto out;
1313 
1314 found:
1315 
1316 	/* First, check for shared/exclusive mismatches. */
1317 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
1318 	    (flags & LOP_EXCLUSIVE) == 0) {
1319 		printf("witness: shared unlock of (%s) %s "
1320 		    "while exclusively locked\n",
1321 		    class->lc_name, lock->lo_name);
1322 		panic("excl->ushare");
1323 	}
1324 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
1325 	    (flags & LOP_EXCLUSIVE) != 0) {
1326 		printf("witness: exclusive unlock of (%s) %s "
1327 		    "while share locked\n", class->lc_name, lock->lo_name);
1328 		panic("share->uexcl");
1329 	}
1330 	/* If we are recursed, unrecurse. */
1331 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
1332 		instance->li_flags--;
1333 		goto out;
1334 	}
1335 	/* The lock is now being dropped, check for NORELEASE flag */
1336 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
1337 		printf("witness: forbidden unlock of (%s) %s\n",
1338 		    class->lc_name, lock->lo_name);
1339 		panic("lock marked norelease");
1340 	}
1341 
1342 	/* Release the stack buffer, if any. */
1343 	if (instance->li_stack != NULL) {
1344 		witness_lock_stack_free(instance->li_stack);
1345 		instance->li_stack = NULL;
1346 	}
1347 
1348 	/* Remove this item from the list. */
1349 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
1350 		(*lock_list)->ll_children[j] =
1351 		    (*lock_list)->ll_children[j + 1];
1352 	(*lock_list)->ll_count--;
1353 
1354 	/*
1355 	 * To reduce contention on w_mtx, we always want to keep a head
1356 	 * object in the list so that frequent allocation from the free
1357 	 * witness pool (and the subsequent locking) is avoided.
1358 	 * To keep the code simple, a fully emptied head object also
1359 	 * means that no further objects remain in the list, so list
1360 	 * ownership needs to be handed over to another object if the
1361 	 * current head needs to be freed.
1362 	 */
1363 	if ((*lock_list)->ll_count == 0) {
1364 		if (*lock_list == lle) {
1365 			if (lle->ll_next == NULL)
1366 				goto out;
1367 		} else
1368 			lle = *lock_list;
1369 		*lock_list = lle->ll_next;
1370 		witness_lock_list_free(lle);
1371 	}
1372 out:
1373 	splx(s);
1374 }
1375 
1376 void
1377 witness_thread_exit(struct proc *p)
1378 {
1379 	struct lock_list_entry *lle;
1380 	int i, n, s;
1381 
1382 	lle = p->p_sleeplocks;
1383 	if (lle == NULL || panicstr != NULL || db_active)
1384 		return;
1385 	if (lle->ll_count != 0) {
1386 		for (n = 0; lle != NULL; lle = lle->ll_next)
1387 			for (i = lle->ll_count - 1; i >= 0; i--) {
1388 				if (n == 0)
1389 					printf("witness: thread %p exiting "
1390 					    "with the following locks held:\n",
1391 					    p);
1392 				n++;
1393 				witness_list_lock(&lle->ll_children[i],
1394 				    printf);
1395 			}
1396 		panic("thread %p cannot exit while holding sleeplocks", p);
1397 	}
1398 	KASSERT(lle->ll_next == NULL);
1399 	s = splhigh();
1400 	witness_lock_list_free(lle);
1401 	splx(s);
1402 }
1403 
1404 /*
1405  * Warn if any locks other than 'lock' are held.  Flags can be passed in to
1406  * exempt Giant and sleepable locks from the checks as well.  If any
1407  * non-exempt locks are held, then a supplied message is printed to the
1408  * output channel along with a list of the offending locks.  If indicated in the
1409  * flags then a failure results in a panic as well.
1410  */
1411 int
1412 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
1413 {
1414 	struct lock_list_entry *lock_list, *lle;
1415 	struct lock_instance *lock1;
1416 	struct proc *p;
1417 	va_list ap;
1418 	int i, n;
1419 
1420 	if (witness_cold || witness_watch < 1 || panicstr != NULL || db_active)
1421 		return (0);
1422 	n = 0;
1423 	p = curproc;
1424 	for (lle = p->p_sleeplocks; lle != NULL; lle = lle->ll_next)
1425 		for (i = lle->ll_count - 1; i >= 0; i--) {
1426 			lock1 = &lle->ll_children[i];
1427 			if (lock1->li_lock == lock)
1428 				continue;
1429 			if (flags & WARN_KERNELOK &&
1430 			    is_kernel_lock(lock1->li_lock))
1431 				continue;
1432 			if (flags & WARN_SLEEPOK &&
1433 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
1434 				continue;
1435 			if (n == 0) {
1436 				printf("witness: ");
1437 				va_start(ap, fmt);
1438 				vprintf(fmt, ap);
1439 				va_end(ap);
1440 				printf(" with the following %slocks held:\n",
1441 				    (flags & WARN_SLEEPOK) != 0 ?
1442 				    "non-sleepable " : "");
1443 			}
1444 			n++;
1445 			witness_list_lock(lock1, printf);
1446 		}
1447 
1448 	lock_list = witness_cpu[cpu_number()].wc_spinlocks;
1449 	if (lock_list != NULL && lock_list->ll_count != 0) {
1450 		/*
1451 		 * We should only have one spinlock and, as long as
1452 		 * the flags cannot match for this lock's class,
1453 		 * check whether the first spinlock is the one curproc
1454 		 * should hold.
1455 		 */
1456 		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
1457 		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
1458 		    lock1->li_lock == lock && n == 0)
1459 			return (0);
1460 
1461 		printf("witness: ");
1462 		va_start(ap, fmt);
1463 		vprintf(fmt, ap);
1464 		va_end(ap);
1465 		printf(" with the following %slocks held:\n",
1466 		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
1467 		n += witness_list_locks(&lock_list, printf);
1468 	}
1469 	if (n > 0) {
1470 		if (flags & WARN_PANIC)
1471 			panic("%s", __func__);
1472 		else
1473 			witness_debugger(1);
1474 	}
1475 	return (n);
1476 }
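
/*
 * An illustrative call of witness_warn() above, as a sleep path might
 * issue it (the context is hypothetical): panic unless only sleepable
 * locks or the kernel lock are still held.
 */
#if 0
	witness_warn(WARN_PANIC | WARN_SLEEPOK | WARN_KERNELOK, NULL,
	    "sleeping");
#endif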
1477 
1478 static struct witness *
1479 enroll(const struct lock_type *type, const char *subtype,
1480     struct lock_class *lock_class)
1481 {
1482 	struct witness *w;
1483 	struct witness_list *typelist;
1484 
1485 	KASSERT(type != NULL);
1486 
1487 	if (witness_watch < 0 || panicstr != NULL || db_active)
1488 		return (NULL);
1489 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
1490 		typelist = &w_spin;
1491 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
1492 		typelist = &w_sleep;
1493 	} else {
1494 		panic("lock class %s is not sleep or spin",
1495 		    lock_class->lc_name);
1496 		return (NULL);
1497 	}
1498 
1499 	mtx_enter(&w_mtx);
1500 	w = witness_hash_get(type, subtype);
1501 	if (w)
1502 		goto found;
1503 	if ((w = witness_get()) == NULL)
1504 		return (NULL);
1505 	w->w_type = type;
1506 	w->w_subtype = subtype;
1507 	w->w_class = lock_class;
1508 	SLIST_INSERT_HEAD(&w_all, w, w_list);
1509 	if (lock_class->lc_flags & LC_SPINLOCK) {
1510 		SLIST_INSERT_HEAD(&w_spin, w, w_typelist);
1511 		w_spin_cnt++;
1512 	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
1513 		SLIST_INSERT_HEAD(&w_sleep, w, w_typelist);
1514 		w_sleep_cnt++;
1515 	}
1516 
1517 	/* Insert new witness into the hash */
1518 	witness_hash_put(w);
1519 	witness_increment_graph_generation();
1520 	mtx_leave(&w_mtx);
1521 	return (w);
1522 found:
1523 	mtx_leave(&w_mtx);
1524 	if (lock_class != w->w_class)
1525 		panic("lock (%s) %s does not match earlier (%s) lock",
1526 		    type->lt_name, lock_class->lc_name, w->w_class->lc_name);
1527 	return (w);
1528 }
1529 
1530 static void
1531 adopt(struct witness *parent, struct witness *child)
1532 {
1533 	int pi, ci, i, j;
1534 
1535 	if (witness_cold == 0)
1536 		MUTEX_ASSERT_LOCKED(&w_mtx);
1537 
1538 	/* If the relationship is already known, there's no work to be done. */
1539 	if (isitmychild(parent, child))
1540 		return;
1541 
1542 	/* When the structure of the graph changes, bump up the generation. */
1543 	witness_increment_graph_generation();
1544 
1545 	/*
1546 	 * The hard part ... create the direct relationship, then propagate all
1547 	 * indirect relationships.
1548 	 */
1549 	pi = parent->w_index;
1550 	ci = child->w_index;
1551 	WITNESS_INDEX_ASSERT(pi);
1552 	WITNESS_INDEX_ASSERT(ci);
1553 	KASSERT(pi != ci);
1554 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
1555 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
1556 
1557 	/*
1558 	 * If parent was not already an ancestor of child,
1559 	 * then we increment the descendant and ancestor counters.
1560 	 */
1561 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
1562 		parent->w_num_descendants++;
1563 		child->w_num_ancestors++;
1564 	}
1565 
1566 	/*
1567 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as
1568 	 * an ancestor of 'pi' during this loop.
1569 	 */
1570 	for (i = 1; i <= w_max_used_index; i++) {
1571 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 &&
1572 		    (i != pi))
1573 			continue;
1574 
1575 		/* Find each descendant of 'i' and mark it as a descendant. */
1576 		for (j = 1; j <= w_max_used_index; j++) {
1577 
1578 			/*
1579 			 * Skip children that are already marked as
1580 			 * descendants of 'i'.
1581 			 */
1582 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
1583 				continue;
1584 
1585 			/*
1586 			 * We are only interested in descendants of 'ci'. Note
1587 			 * that 'ci' itself is counted as a descendant of 'ci'.
1588 			 */
1589 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 &&
1590 			    (j != ci))
1591 				continue;
1592 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
1593 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
1594 			w_data[i].w_num_descendants++;
1595 			w_data[j].w_num_ancestors++;
1596 
1597 			/*
1598 			 * Make sure we aren't marking a node as both an
1599 			 * ancestor and descendant. We should have caught
1600 			 * this as a lock order reversal earlier.
1601 			 */
1602 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
1603 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
1604 				printf("witness: rmatrix paradox! [%d][%d]=%d "
1605 				    "both ancestor and descendant\n",
1606 				    i, j, w_rmatrix[i][j]);
1607 #ifdef DDB
1608 				db_stack_dump();
1609 #endif
1610 				printf("witness disabled\n");
1611 				witness_watch = -1;
1612 			}
1613 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
1614 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
1615 				printf("witness: rmatrix paradox! [%d][%d]=%d "
1616 				    "both ancestor and descendant\n",
1617 				    j, i, w_rmatrix[j][i]);
1618 #ifdef DDB
1619 				db_stack_dump();
1620 #endif
1621 				printf("witness disabled\n");
1622 				witness_watch = -1;
1623 			}
1624 		}
1625 	}
1626 }
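
/*
 * A worked example of the propagation above: suppose A -> B is already
 * known and adopt(B, C) is called.  The direct step marks B as a parent
 * of C; the nested loops then visit A (an ancestor of pi == B) and
 * j == ci == C, and set w_rmatrix[A][C] |= WITNESS_ANCESTOR and
 * w_rmatrix[C][A] |= WITNESS_DESCENDANT, so a later C-before-A
 * acquisition is caught without walking the graph.
 */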
1627 
1628 static void
1629 itismychild(struct witness *parent, struct witness *child)
1630 {
1631 	KASSERT(child != NULL && parent != NULL);
1632 	if (witness_cold == 0)
1633 		MUTEX_ASSERT_LOCKED(&w_mtx);
1634 
1635 	if (!witness_lock_type_equal(parent, child)) {
1636 		if (witness_cold == 0)
1637 			mtx_leave(&w_mtx);
1638 		panic(
1639 		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
1640 		    "the same lock type", __func__, parent->w_type->lt_name,
1641 		    parent->w_class->lc_name, child->w_type->lt_name,
1642 		    child->w_class->lc_name);
1643 	}
1644 	adopt(parent, child);
1645 }
1646 
1647 /*
1648  * Generic code for the isitmy*() functions. The rmask parameter is the
1649  * expected relationship of w1 to w2.
1650  */
1651 static int
1652 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
1653 {
1654 	unsigned char r1, r2;
1655 	int i1, i2;
1656 
1657 	i1 = w1->w_index;
1658 	i2 = w2->w_index;
1659 	WITNESS_INDEX_ASSERT(i1);
1660 	WITNESS_INDEX_ASSERT(i2);
1661 	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
1662 	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
1663 
1664 	/* The flags on one better be the inverse of the flags on the other */
1665 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
1666 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
1667 		/* Don't squawk if we're potentially racing with an update. */
1668 		if (w_mtx.mtx_owner != curcpu())
1669 			return (0);
1670 		printf("witness: %s: rmatrix mismatch between %s (index %d) "
1671 		    "and %s (index %d): w_rmatrix[%d][%d] == %x but "
1672 		    "w_rmatrix[%d][%d] == %x\n",
1673 		    fname, w1->w_type->lt_name, i1, w2->w_type->lt_name,
1674 		    i2, i1, i2, r1,
1675 		    i2, i1, r2);
1676 #ifdef DDB
1677 		db_stack_dump();
1678 #endif
1679 		printf("witness disabled\n");
1680 		witness_watch = -1;
1681 	}
1682 	return (r1 & rmask);
1683 }
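
/*
 * A worked instance of the consistency check above: if w_rmatrix[i1][i2]
 * holds WITNESS_PARENT (0x01), the mirror entry must hold WITNESS_CHILD
 * (0x04), since WITNESS_ATOD(0x01) == 0x04 and WITNESS_DTOA(0x04) ==
 * 0x01.  Any other pairing means the matrix is inconsistent, and witness
 * disables itself rather than report nonsense.
 */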
1684 
1685 /*
1686  * Checks if @child is a direct child of @parent.
1687  */
1688 static int
1689 isitmychild(struct witness *parent, struct witness *child)
1690 {
1691 
1692 	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
1693 }
1694 
1695 /*
1696  * Checks if @descendant is a direct or indirect descendant of @ancestor.
1697  */
1698 static int
1699 isitmydescendant(struct witness *ancestor, struct witness *descendant)
1700 {
1701 
1702 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
1703 	    __func__));
1704 }
1705 
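/*
 * Take a witness object off the free list.  Expects w_mtx to be held.
 * Returns NULL, with w_mtx released, if witness is disabled or the
 * free list is empty; exhausting the free list also disables witness.
 */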
1706 static struct witness *
1707 witness_get(void)
1708 {
1709 	struct witness *w;
1710 	int index;
1711 
1712 	if (witness_cold == 0)
1713 		MUTEX_ASSERT_LOCKED(&w_mtx);
1714 
1715 	if (witness_watch < 0) {
1716 		mtx_leave(&w_mtx);
1717 		return (NULL);
1718 	}
1719 	if (SLIST_EMPTY(&w_free)) {
1720 		witness_watch = -1;
1721 		mtx_leave(&w_mtx);
1722 		printf("WITNESS: unable to allocate a new witness object\n");
1723 		return (NULL);
1724 	}
1725 	w = SLIST_FIRST(&w_free);
1726 	SLIST_REMOVE_HEAD(&w_free, w_list);
1727 	w_free_cnt--;
1728 	index = w->w_index;
1729 	KASSERT(index > 0 && index == w_max_used_index + 1 &&
1730 	    index < witness_count);
1731 	memset(w, 0, sizeof(*w));
1732 	w->w_index = index;
1733 	if (index > w_max_used_index)
1734 		w_max_used_index = index;
1735 	return (w);
1736 }
1737 
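/* Return a witness object to the free list. */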
1738 static void
1739 witness_free(struct witness *w)
1740 {
1741 	SLIST_INSERT_HEAD(&w_free, w, w_list);
1742 	w_free_cnt++;
1743 }
1744 
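/*
 * Allocate a lock list entry, first from the per-CPU cache and then
 * from the global free list.  Exhausting the global free list disables
 * witness.
 */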
1745 static struct lock_list_entry *
1746 witness_lock_list_get(void)
1747 {
1748 	struct lock_list_entry *lle;
1749 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1750 
1751 	if (witness_watch < 0)
1752 		return (NULL);
1753 
1754 	splassert(IPL_HIGH);
1755 
1756 	if (wcpu->wc_lle_count > 0) {
1757 		lle = wcpu->wc_lle_cache;
1758 		wcpu->wc_lle_cache = lle->ll_next;
1759 		wcpu->wc_lle_count--;
1760 		memset(lle, 0, sizeof(*lle));
1761 		return (lle);
1762 	}
1763 
1764 	mtx_enter(&w_mtx);
1765 	lle = w_lock_list_free;
1766 	if (lle == NULL) {
1767 		witness_watch = -1;
1768 		mtx_leave(&w_mtx);
1769 		printf("%s: witness exhausted\n", __func__);
1770 		return (NULL);
1771 	}
1772 	w_lock_list_free = lle->ll_next;
1773 	mtx_leave(&w_mtx);
1774 	memset(lle, 0, sizeof(*lle));
1775 	return (lle);
1776 }
1777 
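/*
 * Return a lock list entry to the per-CPU cache, spilling to the
 * global free list once the cache is full.
 */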
1778 static void
1779 witness_lock_list_free(struct lock_list_entry *lle)
1780 {
1781 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1782 
1783 	splassert(IPL_HIGH);
1784 
1785 	if (wcpu->wc_lle_count < WITNESS_LLE_CACHE_MAX) {
1786 		lle->ll_next = wcpu->wc_lle_cache;
1787 		wcpu->wc_lle_cache = lle;
1788 		wcpu->wc_lle_count++;
1789 		return;
1790 	}
1791 
1792 	mtx_enter(&w_mtx);
1793 	lle->ll_next = w_lock_list_free;
1794 	w_lock_list_free = lle;
1795 	mtx_leave(&w_mtx);
1796 }
1797 
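/*
 * Allocate a stack trace buffer, first from the per-CPU cache and then
 * from the global free list.  Returns NULL if both are empty.
 */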
1798 static union lock_stack *
1799 witness_lock_stack_get(void)
1800 {
1801 	union lock_stack *stack = NULL;
1802 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1803 
1804 	splassert(IPL_HIGH);
1805 
1806 	if (wcpu->wc_stk_count > 0) {
1807 		stack = wcpu->wc_stk_cache;
1808 		wcpu->wc_stk_cache = stack->ls_next;
1809 		wcpu->wc_stk_count--;
1810 		return (stack);
1811 	}
1812 
1813 	mtx_enter(&w_mtx);
1814 	if (w_lock_stack_free != NULL) {
1815 		stack = w_lock_stack_free;
1816 		w_lock_stack_free = stack->ls_next;
1817 	}
1818 	mtx_leave(&w_mtx);
1819 	return (stack);
1820 }
1821 
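/*
 * Return a stack trace buffer to the per-CPU cache, spilling to the
 * global free list once the cache is full.
 */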
1822 static void
1823 witness_lock_stack_free(union lock_stack *stack)
1824 {
1825 	struct witness_cpu *wcpu = &witness_cpu[cpu_number()];
1826 
1827 	splassert(IPL_HIGH);
1828 
1829 	if (wcpu->wc_stk_count < WITNESS_STK_CACHE_MAX) {
1830 		stack->ls_next = wcpu->wc_stk_cache;
1831 		wcpu->wc_stk_cache = stack;
1832 		wcpu->wc_stk_count++;
1833 		return;
1834 	}
1835 
1836 	mtx_enter(&w_mtx);
1837 	stack->ls_next = w_lock_stack_free;
1838 	w_lock_stack_free = stack;
1839 	mtx_leave(&w_mtx);
1840 }
1841 
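/*
 * Search a list of held locks for the instance of the given lock
 * object.  Returns NULL if the lock is not on the list.
 */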
1842 static struct lock_instance *
1843 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
1844 {
1845 	struct lock_list_entry *lle;
1846 	struct lock_instance *instance;
1847 	int i;
1848 
1849 	for (lle = list; lle != NULL; lle = lle->ll_next) {
1850 		for (i = lle->ll_count - 1; i >= 0; i--) {
1851 			instance = &lle->ll_children[i];
1852 			if (instance->li_lock == lock)
1853 				return (instance);
1854 		}
1855 	}
1856 	return (NULL);
1857 }
1858 
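/*
 * Print one held lock instance: its class and name, its recursion
 * count, and, if recorded, the stack trace of its acquisition.
 */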
1859 static void
1860 witness_list_lock(struct lock_instance *instance,
1861     int (*prnt)(const char *fmt, ...))
1862 {
1863 	struct lock_object *lock;
1864 
1865 	lock = instance->li_lock;
1866 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
1867 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
1868 	prnt(" r = %d (%p)\n", instance->li_flags & LI_RECURSEMASK, lock);
1869 	if (instance->li_stack != NULL)
1870 		stacktrace_print(&instance->li_stack->ls_stack, prnt);
1871 }
1872 
1873 #ifdef DDB
1874 static int
1875 witness_thread_has_locks(struct proc *p)
1876 {
1877 
1878 	if (p->p_sleeplocks == NULL)
1879 		return (0);
1880 	return (p->p_sleeplocks->ll_count != 0);
1881 }
1882 
1883 static int
1884 witness_process_has_locks(struct process *pr)
1885 {
1886 	struct proc *p;
1887 
1888 	TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
1889 		if (witness_thread_has_locks(p))
1890 			return (1);
1891 	}
1892 	return (0);
1893 }
1894 #endif
1895 
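/*
 * Print every lock instance on the given list and return the number of
 * locks printed.
 */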
1896 int
1897 witness_list_locks(struct lock_list_entry **lock_list,
1898     int (*prnt)(const char *fmt, ...))
1899 {
1900 	struct lock_list_entry *lle;
1901 	int i, nheld;
1902 
1903 	nheld = 0;
1904 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
1905 		for (i = lle->ll_count - 1; i >= 0; i--) {
1906 			witness_list_lock(&lle->ll_children[i], prnt);
1907 			nheld++;
1908 		}
1909 	return (nheld);
1910 }
1911 
1912 /*
1913  * This is a bit risky at best.  We call this function when we have timed
1914  * out acquiring a spin lock, and we assume that the other CPU is stuck
1915  * with this lock held.  So, we go groveling around in the other CPU's
1916  * per-cpu data to try to find the lock instance for this spin lock to
1917  * see when it was last acquired.
1918  */
1919 void
1920 witness_display_spinlock(struct lock_object *lock, struct proc *owner,
1921     int (*prnt)(const char *fmt, ...))
1922 {
1923 	struct lock_instance *instance;
1924 
1925 	if (owner->p_stat != SONPROC)
1926 		return;
1927 	instance = find_instance(
1928 	    witness_cpu[owner->p_cpu->ci_cpuid].wc_spinlocks, lock);
1929 	if (instance != NULL)
1930 		witness_list_lock(instance, prnt);
1931 }
1932 
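/*
 * Assert that the current thread holds the given lock in the state
 * described by 'flags', or does not hold it at all for LA_UNLOCKED.
 * Any mismatch panics.
 */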
1933 void
1934 witness_assert(const struct lock_object *lock, int flags)
1935 {
1936 	struct lock_instance *instance;
1937 	struct lock_class *class;
1938 
1939 	if (lock->lo_witness == NULL || witness_watch < 1 ||
1940 	    panicstr != NULL || db_active)
1941 		return;
1942 	class = LOCK_CLASS(lock);
1943 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
1944 		instance = find_instance(curproc->p_sleeplocks, lock);
1945 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
1946 		instance = find_instance(
1947 		    witness_cpu[cpu_number()].wc_spinlocks, lock);
1948 	else {
1949 		panic("lock (%s) %s is not sleep or spin!",
1950 		    class->lc_name, lock->lo_name);
1951 		return;
1952 	}
1953 	switch (flags) {
1954 	case LA_UNLOCKED:
1955 		if (instance != NULL)
1956 			panic("lock (%s) %s locked",
1957 			    class->lc_name, lock->lo_name);
1958 		break;
1959 	case LA_LOCKED:
1960 	case LA_LOCKED | LA_RECURSED:
1961 	case LA_LOCKED | LA_NOTRECURSED:
1962 	case LA_SLOCKED:
1963 	case LA_SLOCKED | LA_RECURSED:
1964 	case LA_SLOCKED | LA_NOTRECURSED:
1965 	case LA_XLOCKED:
1966 	case LA_XLOCKED | LA_RECURSED:
1967 	case LA_XLOCKED | LA_NOTRECURSED:
1968 		if (instance == NULL) {
1969 			panic("lock (%s) %s not locked",
1970 			    class->lc_name, lock->lo_name);
1971 			break;
1972 		}
1973 		if ((flags & LA_XLOCKED) != 0 &&
1974 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
1975 			panic(
1976 			    "lock (%s) %s not exclusively locked",
1977 			    class->lc_name, lock->lo_name);
1978 		if ((flags & LA_SLOCKED) != 0 &&
1979 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
1980 			panic(
1981 			    "lock (%s) %s exclusively locked",
1982 			    class->lc_name, lock->lo_name);
1983 		if ((flags & LA_RECURSED) != 0 &&
1984 		    (instance->li_flags & LI_RECURSEMASK) == 0)
1985 			panic("lock (%s) %s not recursed",
1986 			    class->lc_name, lock->lo_name);
1987 		if ((flags & LA_NOTRECURSED) != 0 &&
1988 		    (instance->li_flags & LI_RECURSEMASK) != 0)
1989 			panic("lock (%s) %s recursed",
1990 			    class->lc_name, lock->lo_name);
1991 		break;
1992 	default:
1993 		panic("invalid lock assertion");
1995 	}
1996 }
1997 
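/*
 * Set or clear a flag on a lock instance held by the current thread,
 * panicking if the lock is not held.
 */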
1998 static void
1999 witness_setflag(struct lock_object *lock, int flag, int set)
2000 {
2001 	struct lock_list_entry *lock_list;
2002 	struct lock_instance *instance;
2003 	struct lock_class *class;
2004 
2005 	if (lock->lo_witness == NULL || witness_watch < 0 ||
2006 	    panicstr != NULL || db_active)
2007 		return;
2008 	class = LOCK_CLASS(lock);
2009 	if (class->lc_flags & LC_SLEEPLOCK)
2010 		lock_list = curproc->p_sleeplocks;
2011 	else
2012 		lock_list = witness_cpu[cpu_number()].wc_spinlocks;
2013 	instance = find_instance(lock_list, lock);
2014 	if (instance == NULL) {
2015 		panic("%s: lock (%s) %s not locked", __func__,
2016 		    class->lc_name, lock->lo_name);
2017 		return;
2018 	}
2019 
2020 	if (set)
2021 		instance->li_flags |= flag;
2022 	else
2023 		instance->li_flags &= ~flag;
2024 }
2025 
2026 void
2027 witness_norelease(struct lock_object *lock)
2028 {
2029 
2030 	witness_setflag(lock, LI_NORELEASE, 1);
2031 }
2032 
2033 void
2034 witness_releaseok(struct lock_object *lock)
2035 {
2036 
2037 	witness_setflag(lock, LI_NORELEASE, 0);
2038 }
2039 
2040 #ifdef DDB
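/*
 * Print the locks held by the given thread.  Spin locks are only
 * reported when the thread is curproc; see the comment below.
 */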
2041 static void
2042 witness_ddb_list(struct proc *p)
2043 {
2044 	struct witness_cpu *wc = &witness_cpu[cpu_number()];
2045 
2046 	KASSERTMSG(witness_cold == 0, "%s: witness_cold", __func__);
2047 	KASSERTMSG(db_active, "%s: not in the debugger", __func__);
2048 
2049 	if (witness_watch < 1)
2050 		return;
2051 
2052 	witness_list_locks(&p->p_sleeplocks, db_printf);
2053 
	/*
	 * We only handle spinlocks if p == curproc.  This is somewhat broken
	 * if p is currently executing on some other CPU and holds spin locks
	 * as we won't display those locks.  If we had an MI way of getting
	 * the per-cpu data for a given cpu then we could use p's CPU number
	 * to get that CPU's list of spinlocks and "fix" this.
	 *
	 * That still wouldn't really fix this unless we locked the scheduler
	 * lock or stopped the other CPU to make sure it wasn't changing the
	 * list out from under us.  It is probably best to just not try to
	 * handle threads on other CPUs for now.
	 */
2067 	if (p == curproc && wc->wc_spinlocks != NULL)
2068 		witness_list_locks(&wc->wc_spinlocks, db_printf);
2069 }
2070 
2071 void
2072 db_witness_list(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2073 {
2074 	struct proc *p;
2075 
2076 	if (have_addr)
2077 		p = (struct proc *)addr;
2078 	else
2079 		p = curproc;
2080 	witness_ddb_list(p);
2081 }
2082 
2083 void
2084 db_witness_list_all(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2085 {
2086 	CPU_INFO_ITERATOR cii;
2087 	struct cpu_info *ci;
2088 	struct lock_list_entry *lock_list;
2089 	struct process *pr;
2090 	struct proc *p;
2091 
2092 	CPU_INFO_FOREACH(cii, ci) {
2093 		lock_list = witness_cpu[CPU_INFO_UNIT(ci)].wc_spinlocks;
2094 		if (lock_list == NULL || lock_list->ll_count == 0)
2095 			continue;
2096 		db_printf("CPU %d:\n", CPU_INFO_UNIT(ci));
2097 		witness_list_locks(&lock_list, db_printf);
2098 	}
2099 
	/*
	 * List only the processes and threads that actually hold sleep
	 * locks; witness_process_has_locks() and witness_thread_has_locks()
	 * filter out the rest.
	 */
2105 	LIST_FOREACH(pr, &allprocess, ps_list) {
2106 		if (!witness_process_has_locks(pr))
2107 			continue;
2108 		TAILQ_FOREACH(p, &pr->ps_threads, p_thr_link) {
2109 			if (!witness_thread_has_locks(p))
2110 				continue;
2111 			db_printf("Process %d (%s) thread %p (%d)\n",
2112 			    pr->ps_pid, pr->ps_comm, p, p->p_tid);
2113 			witness_ddb_list(p);
2114 		}
2115 	}
2116 }
2117 
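/*
 * Walk the relationship matrix and print every lock order reversal
 * recorded so far, together with the stack traces where each of the
 * two conflicting orders was first seen.  The scan restarts whenever
 * the lock graph changes underneath it.
 */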
2118 void
2119 witness_print_badstacks(void)
2120 {
2121 	static struct witness tmp_w1, tmp_w2;
2122 	static struct witness_lock_order_data tmp_data1, tmp_data2;
2123 	struct witness_lock_order_data *data1, *data2;
2124 	struct witness *w1, *w2;
	int generation, i, j;
2126 
2127 	if (witness_watch < 1) {
2128 		db_printf("witness watch is disabled\n");
2129 		return;
2130 	}
2131 	if (witness_cold) {
2132 		db_printf("witness is cold\n");
2133 		return;
2134 	}
2136 
2137 	memset(&tmp_w1, 0, sizeof(tmp_w1));
2138 	memset(&tmp_w2, 0, sizeof(tmp_w2));
2139 	memset(&tmp_data1, 0, sizeof(tmp_data1));
2140 	memset(&tmp_data2, 0, sizeof(tmp_data2));
2141 
2142 restart:
2143 	mtx_enter(&w_mtx);
2144 	generation = w_generation;
2145 	mtx_leave(&w_mtx);
2146 	db_printf("Number of known direct relationships is %d\n",
2147 	    w_lohash.wloh_count);
	for (i = 1; i <= w_max_used_index; i++) {
2149 		mtx_enter(&w_mtx);
2150 		if (generation != w_generation) {
2151 			mtx_leave(&w_mtx);
2152 
2153 			/* The graph has changed, try again. */
2154 			db_printf("Lock graph changed, restarting trace.\n");
2155 			goto restart;
2156 		}
2157 
2158 		w1 = &w_data[i];
2159 		if (w1->w_reversed == 0) {
2160 			mtx_leave(&w_mtx);
2161 			continue;
2162 		}
2163 
2164 		/* Copy w1 locally so we can release the spin lock. */
2165 		tmp_w1 = *w1;
2166 		mtx_leave(&w_mtx);
2167 
2168 		if (tmp_w1.w_reversed == 0)
2169 			continue;
		for (j = 1; j <= w_max_used_index; j++) {
2171 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
2172 				continue;
2173 
2174 			mtx_enter(&w_mtx);
2175 			if (generation != w_generation) {
2176 				mtx_leave(&w_mtx);
2177 
2178 				/* The graph has changed, try again. */
2179 				db_printf("Lock graph changed, "
2180 				    "restarting trace.\n");
2181 				goto restart;
2182 			}
2183 
2184 			w2 = &w_data[j];
2185 			data1 = witness_lock_order_get(w1, w2);
2186 			data2 = witness_lock_order_get(w2, w1);
2187 
2188 			/*
2189 			 * Copy information locally so we can release the
2190 			 * spin lock.
2191 			 */
2192 			tmp_w2 = *w2;
2193 
2194 			if (data1)
2195 				tmp_data1.wlod_stack = data1->wlod_stack;
2196 			if (data2 && data2 != data1)
2197 				tmp_data2.wlod_stack = data2->wlod_stack;
2198 			mtx_leave(&w_mtx);
2199 
2200 			db_printf("\nLock order reversal between \"%s\"(%s) "
2201 			    "and \"%s\"(%s)!\n",
2202 			    tmp_w1.w_type->lt_name, tmp_w1.w_class->lc_name,
2203 			    tmp_w2.w_type->lt_name, tmp_w2.w_class->lc_name);
2204 			if (data1) {
2205 				db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
2206 				    "first seen at:\n",
2207 				    tmp_w1.w_type->lt_name,
2208 				    tmp_w1.w_class->lc_name,
2209 				    tmp_w2.w_type->lt_name,
2210 				    tmp_w2.w_class->lc_name);
2211 				stacktrace_print(&tmp_data1.wlod_stack,
2212 				    db_printf);
2213 				db_printf("\n");
2214 			}
2215 			if (data2 && data2 != data1) {
2216 				db_printf("Lock order \"%s\"(%s) -> \"%s\"(%s) "
2217 				    "first seen at:\n",
2218 				    tmp_w2.w_type->lt_name,
2219 				    tmp_w2.w_class->lc_name,
2220 				    tmp_w1.w_type->lt_name,
2221 				    tmp_w1.w_class->lc_name);
2222 				stacktrace_print(&tmp_data2.wlod_stack,
2223 				    db_printf);
2224 				db_printf("\n");
2225 			}
2226 		}
2227 	}
2228 	mtx_enter(&w_mtx);
2229 	if (generation != w_generation) {
2230 		mtx_leave(&w_mtx);
2231 
2232 		/*
2233 		 * The graph changed while we were printing stack data,
2234 		 * try again.
2235 		 */
2236 		db_printf("Lock graph changed, restarting trace.\n");
2237 		goto restart;
2238 	}
2239 	mtx_leave(&w_mtx);
2240 }
2241 
2242 void
2243 db_witness_display(db_expr_t addr, int have_addr, db_expr_t count, char *modif)
2244 {
2245 	switch (modif[0]) {
2246 	case 'b':
2247 		witness_print_badstacks();
2248 		break;
2249 	default:
2250 		witness_ddb_display(db_printf);
2251 		break;
2252 	}
2253 }
2254 #endif
2255 
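/*
 * Print the full lock order graph as quoted "parent","child" name
 * pairs, one edge per line.
 */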
2256 void
2257 db_witness_print_fullgraph(void)
2258 {
2260 	int error;
2261 
2262 	if (witness_watch < 1) {
2263 		db_printf("witness watch is disabled\n");
2264 		return;
2265 	}
2266 	if (witness_cold) {
2267 		db_printf("witness is cold\n");
2268 		return;
2269 	}
2271 
2272 	mtx_enter(&w_mtx);
2273 	SLIST_FOREACH(w, &w_all, w_list)
2274 		w->w_displayed = 0;
2275 	SLIST_FOREACH(w, &w_all, w_list)
2276 		db_witness_add_fullgraph(w);
2277 	mtx_leave(&w_mtx);
2278 }
2279 
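/*
 * Recursively print the edges from 'w' to each of its direct children,
 * skipping witnesses that have already been displayed or were never
 * acquired.
 */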
2280 static void
2281 db_witness_add_fullgraph(struct witness *w)
2282 {
2283 	int i;
2284 
2285 	if (w->w_displayed != 0 || w->w_acquired == 0)
2286 		return;
2287 	w->w_displayed = 1;
2288 
2289 	WITNESS_INDEX_ASSERT(w->w_index);
2290 	for (i = 1; i <= w_max_used_index; i++) {
2291 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
2292 			db_printf("\"%s\",\"%s\"\n", w->w_type->lt_name,
2293 			    w_data[i].w_type->lt_name);
2294 			db_witness_add_fullgraph(&w_data[i]);
2295 		}
2296 	}
2297 }
2298 
2299 /*
2300  * A simple hash function. Takes a key pointer and a key size. If size == 0,
2301  * interprets the key as a string and reads until the null
2302  * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
2303  * hash value computed from the key.
2304  */
2305 static uint32_t
2306 witness_hash_djb2(const uint8_t *key, uint32_t size)
2307 {
	uint32_t hash = 5381;
	uint32_t i;
2310 
2311 	/* hash = hash * 33 + key[i] */
2312 	if (size)
2313 		for (i = 0; i < size; i++)
2314 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
2315 	else
2316 		for (i = 0; key[i] != 0; i++)
2317 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
2318 
2319 	return (hash);
2320 }
2323 /*
2324  * Initializes the two witness hash tables. Called exactly once from
2325  * witness_initialize().
2326  */
2327 static void
2328 witness_init_hash_tables(void)
2329 {
2330 	int i;
2331 
2332 	KASSERT(witness_cold);
2333 
2334 	/* Initialize the hash tables. */
2335 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
2336 		SLIST_INIT(&w_hash.wh_array[i]);
2337 
2338 	w_hash.wh_size = WITNESS_HASH_SIZE;
2339 	w_hash.wh_count = 0;
2340 
2341 	/* Initialize the lock order data hash. */
2342 	w_lofree = NULL;
2343 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
2344 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
2345 		w_lodata[i].wlod_next = w_lofree;
2346 		w_lofree = &w_lodata[i];
2347 	}
2348 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
2349 	w_lohash.wloh_count = 0;
2350 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
2351 		w_lohash.wloh_array[i] = NULL;
2352 }
2353 
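/*
 * Look up the witness matching the given lock type and subtype.
 * Returns NULL if no such witness exists.
 */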
2354 static struct witness *
2355 witness_hash_get(const struct lock_type *type, const char *subtype)
2356 {
2357 	struct witness *w;
2358 	uint32_t hash;
2359 
2360 	KASSERT(type != NULL);
2361 	if (witness_cold == 0)
2362 		MUTEX_ASSERT_LOCKED(&w_mtx);
2363 	hash = (uint32_t)((uintptr_t)type ^ (uintptr_t)subtype) %
2364 	    w_hash.wh_size;
2365 	SLIST_FOREACH(w, &w_hash.wh_array[hash], w_hash_next) {
2366 		if (w->w_type == type && w->w_subtype == subtype)
2367 			goto out;
2368 	}
2369 
2370 out:
2371 	return (w);
2372 }
2373 
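/*
 * Insert a witness into the type/subtype hash.  The entry must not
 * already exist.
 */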
2374 static void
2375 witness_hash_put(struct witness *w)
2376 {
2377 	uint32_t hash;
2378 
2379 	KASSERT(w != NULL);
2380 	KASSERT(w->w_type != NULL);
2381 	if (witness_cold == 0)
2382 		MUTEX_ASSERT_LOCKED(&w_mtx);
2383 	KASSERTMSG(witness_hash_get(w->w_type, w->w_subtype) == NULL,
2384 	    "%s: trying to add a hash entry that already exists!", __func__);
2385 	KASSERTMSG(SLIST_NEXT(w, w_hash_next) == NULL,
2386 	    "%s: w->w_hash_next != NULL", __func__);
2387 
2388 	hash = (uint32_t)((uintptr_t)w->w_type ^ (uintptr_t)w->w_subtype) %
2389 	    w_hash.wh_size;
2390 	SLIST_INSERT_HEAD(&w_hash.wh_array[hash], w, w_hash_next);
2391 	w_hash.wh_count++;
2392 }
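/*
 * Return the lock order data recorded for the ordered pair
 * parent -> child, or NULL if that order has not been seen.
 */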
2395 static struct witness_lock_order_data *
2396 witness_lock_order_get(struct witness *parent, struct witness *child)
2397 {
2398 	struct witness_lock_order_data *data = NULL;
2399 	struct witness_lock_order_key key;
2400 	unsigned int hash;
2401 
2402 	KASSERT(parent != NULL && child != NULL);
2403 	key.from = parent->w_index;
2404 	key.to = child->w_index;
2405 	WITNESS_INDEX_ASSERT(key.from);
2406 	WITNESS_INDEX_ASSERT(key.to);
2407 	if ((w_rmatrix[parent->w_index][child->w_index]
2408 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
2409 		goto out;
2410 
	hash = witness_hash_djb2((const uint8_t *)&key,
	    sizeof(key)) % w_lohash.wloh_size;
2413 	data = w_lohash.wloh_array[hash];
2414 	while (data != NULL) {
2415 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
2416 			break;
2417 		data = data->wlod_next;
2418 	}
2419 
2420 out:
2421 	return (data);
2422 }
2423 
2424 /*
2425  * Verify that parent and child have a known relationship, are not the same,
2426  * and child is actually a child of parent.  This is done without w_mtx
2427  * to avoid contention in the common case.
2428  */
2429 static int
2430 witness_lock_order_check(struct witness *parent, struct witness *child)
2431 {
2432 
2433 	if (parent != child &&
2434 	    w_rmatrix[parent->w_index][child->w_index]
2435 	    & WITNESS_LOCK_ORDER_KNOWN &&
2436 	    isitmychild(parent, child))
2437 		return (1);
2438 
2439 	return (0);
2440 }
2441 
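/*
 * Record parent -> child as a known lock order and save the current
 * stack trace as the point where it was first seen.  Returns 1 if the
 * order is already known or has been recorded, 0 if the lock order
 * data entries are exhausted.
 */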
2442 static int
2443 witness_lock_order_add(struct witness *parent, struct witness *child)
2444 {
2445 	static int lofree_empty_reported = 0;
2446 	struct witness_lock_order_data *data = NULL;
2447 	struct witness_lock_order_key key;
2448 	unsigned int hash;
2449 
2450 	KASSERT(parent != NULL && child != NULL);
2451 	key.from = parent->w_index;
2452 	key.to = child->w_index;
2453 	WITNESS_INDEX_ASSERT(key.from);
2454 	WITNESS_INDEX_ASSERT(key.to);
2455 	if (w_rmatrix[parent->w_index][child->w_index]
2456 	    & WITNESS_LOCK_ORDER_KNOWN)
2457 		return (1);
2458 
	hash = witness_hash_djb2((const uint8_t *)&key,
	    sizeof(key)) % w_lohash.wloh_size;
2461 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
2462 	data = w_lofree;
2463 	if (data == NULL) {
2464 		if (!lofree_empty_reported) {
2465 			lofree_empty_reported = 1;
2466 			printf("witness: out of free lock order entries\n");
2467 		}
2468 		return (0);
2469 	}
2470 	w_lofree = data->wlod_next;
2471 	data->wlod_next = w_lohash.wloh_array[hash];
2472 	data->wlod_key = key;
2473 	w_lohash.wloh_array[hash] = data;
2474 	w_lohash.wloh_count++;
2475 	stacktrace_save_at(&data->wlod_stack, 1);
2476 	return (1);
2477 }
2478 
2479 /* Call this whenever the structure of the witness graph changes. */
2480 static void
2481 witness_increment_graph_generation(void)
2482 {
2483 
2484 	if (witness_cold == 0)
2485 		MUTEX_ASSERT_LOCKED(&w_mtx);
2486 	w_generation++;
2487 }
2488 
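/*
 * Act on a locking error according to witness_watch: level 1 only
 * reports, level 2 also dumps the stack, and level 3 additionally
 * enters the kernel debugger.  Any other value is unexpected and
 * panics.
 */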
2489 static void
2490 witness_debugger(int dump)
2491 {
2492 	switch (witness_watch) {
2493 	case 1:
2494 		break;
2495 	case 2:
2496 		if (dump)
2497 			db_stack_dump();
2498 		break;
2499 	case 3:
2500 		if (dump)
2501 			db_stack_dump();
2502 		db_enter();
2503 		break;
2504 	default:
2505 		panic("witness: locking error");
2506 	}
2507 }
2508 
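/*
 * Preallocate the pool of stack trace buffers used for lock trace
 * recording.  Called with w_ctlock held for writing.
 */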
2509 static int
2510 witness_alloc_stacks(void)
2511 {
2512 	union lock_stack *stacks;
2513 	unsigned int i, nstacks = LOCK_CHILDCOUNT * LOCK_NCHILDREN;
2514 
2515 	rw_assert_wrlock(&w_ctlock);
2516 
2517 	if (w_lock_stack_num >= nstacks)
2518 		return (0);
2519 
2520 	nstacks -= w_lock_stack_num;
2521 	stacks = mallocarray(nstacks, sizeof(*stacks), M_WITNESS,
2522 	    M_WAITOK | M_CANFAIL | M_ZERO);
2523 	if (stacks == NULL)
2524 		return (ENOMEM);
2525 
2526 	mtx_enter(&w_mtx);
2527 	for (i = 0; i < nstacks; i++) {
2528 		stacks[i].ls_next = w_lock_stack_free;
2529 		w_lock_stack_free = &stacks[i];
2530 	}
2531 	mtx_leave(&w_mtx);
2532 	w_lock_stack_num += nstacks;
2533 
2534 	return (0);
2535 }
2536 
2537 int
2538 witness_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
2539     void *newp, size_t newlen)
2540 {
2541 	int error, value;
2542 
2543 	if (namelen != 1)
2544 		return (ENOTDIR);
2545 
2546 	rw_enter_write(&w_ctlock);
2547 
2548 	switch (name[0]) {
2549 	case KERN_WITNESS_WATCH:
2550 		error = witness_sysctl_watch(oldp, oldlenp, newp, newlen);
2551 		break;
2552 	case KERN_WITNESS_LOCKTRACE:
2553 		value = witness_locktrace;
2554 		error = sysctl_int(oldp, oldlenp, newp, newlen, &value);
2555 		if (error == 0 && newp != NULL) {
2556 			switch (value) {
2557 			case 1:
2558 				error = witness_alloc_stacks();
2559 				/* FALLTHROUGH */
2560 			case 0:
2561 				if (error == 0)
2562 					witness_locktrace = value;
2563 				break;
2564 			default:
2565 				error = EINVAL;
2566 				break;
2567 			}
2568 		}
2569 		break;
2570 	default:
2571 		error = EOPNOTSUPP;
2572 		break;
2573 	}
2574 
2575 	rw_exit_write(&w_ctlock);
2576 
2577 	return (error);
2578 }
2579 
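/*
 * Get or set witness_watch.  The level can be lowered at any time, but
 * once witness has been disabled (level -1) it cannot be re-enabled.
 */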
2580 int
2581 witness_sysctl_watch(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
2582 {
2583 	int error;
2584 	int value;
2585 
2586 	value = witness_watch;
2587 	error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
2588 	    &value, -1, 3);
2589 	if (error == 0 && newp != NULL) {
2590 		mtx_enter(&w_mtx);
2591 		if (value < 0 || witness_watch >= 0)
2592 			witness_watch = value;
2593 		else
2594 			error = EINVAL;
2595 		mtx_leave(&w_mtx);
2596 	}
2597 	return (error);
2598 }
2599