/*	$NetBSD: lockstat.c,v 1.30 2022/04/08 10:17:54 andvar Exp $	*/

/*-
 * Copyright (c) 2006, 2007, 2019 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Lock statistics driver, providing kernel support for the lockstat(8)
 * command.
 *
 * We use a global lock word (lockstat_lock) to track device opens.
 * Only one thread can hold the device at a time, providing a global lock.
 *
 * XXX Timings for contention on sleep locks are currently incorrect.
 * XXX Convert this to use timecounters!
 */
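
/*
 * Illustrative sketch (not part of the driver): a consumer such as
 * lockstat(8) is expected to drive this device roughly as follows, using
 * the ioctls handled in lockstat_ioctl() below.  Error handling is
 * omitted and the particular lsenable_t settings are an assumption:
 * le_nbufs == 0 lets the driver pick a default, and a mask covering
 * LB_EVENT_MASK | LB_LOCK_MASK asks for all event and lock types.
 *
 *	int fd = open("/dev/lockstat", O_RDONLY);
 *	int version;
 *	(void)ioctl(fd, IOC_LOCKSTAT_GVERSION, &version);
 *
 *	lsenable_t le = { .le_nbufs = 0,
 *	    .le_mask = LB_EVENT_MASK | LB_LOCK_MASK };
 *	(void)ioctl(fd, IOC_LOCKSTAT_ENABLE, &le);
 *
 *	...run the workload of interest...
 *
 *	lsdisable_t ld;
 *	(void)ioctl(fd, IOC_LOCKSTAT_DISABLE, &ld);
 *	void *buf = malloc(ld.ld_size);
 *	(void)read(fd, buf, ld.ld_size);
 *	close(fd);
 *
 * Each record read back is an lsbuf_t keyed by (lb_lock, lb_callsite),
 * carrying per-event counts in lb_counts[] and cycle totals in
 * lb_times[], and tagged with the owning CPU in lb_cpu.
 */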

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lockstat.c,v 1.30 2022/04/08 10:17:54 andvar Exp $");

#include <sys/types.h>
#include <sys/param.h>

#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/xcall.h>

#include <dev/lockstat.h>

#include "ioconf.h"

#ifndef __HAVE_CPU_COUNTER
#error CPU counters not available
#endif

#if LONG_BIT == 64
#define	LOCKSTAT_HASH_SHIFT	3
#elif LONG_BIT == 32
#define	LOCKSTAT_HASH_SHIFT	2
#endif

#define	LOCKSTAT_MINBUFS	1000
#define	LOCKSTAT_DEFBUFS	20000
#define	LOCKSTAT_MAXBUFS	1000000

#define	LOCKSTAT_HASH_SIZE	128
#define	LOCKSTAT_HASH_MASK	(LOCKSTAT_HASH_SIZE - 1)
#define	LOCKSTAT_HASH(key)	\
	((key >> LOCKSTAT_HASH_SHIFT) & LOCKSTAT_HASH_MASK)
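
/*
 * For example, on an LP64 machine (LOCKSTAT_HASH_SHIFT == 3) a key whose
 * low 10 bits are 0x240 selects bucket (0x240 >> 3) & LOCKSTAT_HASH_MASK
 * == 0x48 == 72.  The shift simply drops low-order address bits that
 * carry little information, since locks are at least word-aligned.
 */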

typedef struct lscpu {
	SLIST_HEAD(, lsbuf)	lc_free;
	u_int			lc_overflow;
	LIST_HEAD(lslist, lsbuf) lc_hash[LOCKSTAT_HASH_SIZE];
} lscpu_t;

typedef struct lslist lslist_t;

void	lockstat_start(lsenable_t *);
int	lockstat_alloc(lsenable_t *);
void	lockstat_init_tables(lsenable_t *);
int	lockstat_stop(lsdisable_t *);
void	lockstat_free(void);

dev_type_open(lockstat_open);
dev_type_close(lockstat_close);
dev_type_read(lockstat_read);
dev_type_ioctl(lockstat_ioctl);

volatile u_int	lockstat_enabled;
volatile u_int	lockstat_dev_enabled;
__cpu_simple_lock_t lockstat_enabled_lock;
uintptr_t	lockstat_csstart;
uintptr_t	lockstat_csend;
uintptr_t	lockstat_csmask;
uintptr_t	lockstat_lamask;
uintptr_t	lockstat_lockstart;
uintptr_t	lockstat_lockend;
__cpu_simple_lock_t lockstat_lock;
lwp_t		*lockstat_lwp;
lsbuf_t		*lockstat_baseb;
size_t		lockstat_sizeb;
int		lockstat_busy;
struct timespec	lockstat_stime;

#ifdef KDTRACE_HOOKS
volatile u_int lockstat_dtrace_enabled;
CTASSERT(LB_NEVENT <= 3);
CTASSERT(LB_NLOCK <= (7 << LB_LOCK_SHIFT));
void
lockstat_probe_stub(uint32_t id, uintptr_t lock, uintptr_t callsite,
    uintptr_t flags, uintptr_t count, uintptr_t cycles)
{
}

uint32_t	lockstat_probemap[LS_NPROBES];
void		(*lockstat_probe_func)(uint32_t, uintptr_t, uintptr_t,
		    uintptr_t, uintptr_t, uintptr_t) = &lockstat_probe_stub;
#endif
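
/*
 * A minimal sketch of how a tracer is expected to hook in here (in the
 * KDTRACE_HOOKS case, the dtrace lockstat provider): it stores a
 * non-zero probe id in lockstat_probemap[] for each event it cares
 * about, points lockstat_probe_func at its own handler, and updates
 * lockstat_dtrace_enabled under LOCKSTAT_ENABLED_UPDATE_BEGIN()/_END()
 * so that the lock primitives start calling lockstat_event().  The
 * names "id" and "my_handler" below are placeholders:
 *
 *	lockstat_probemap[LS_COMPRESS(LB_ADAPTIVE_MUTEX | LB_SPIN)] = id;
 *	lockstat_probe_func = my_handler;
 */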

const struct cdevsw lockstat_cdevsw = {
	.d_open = lockstat_open,
	.d_close = lockstat_close,
	.d_read = lockstat_read,
	.d_write = nowrite,
	.d_ioctl = lockstat_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};

/*
 * Called when the pseudo-driver is attached.
 */
void
lockstatattach(int nunits)
{

	(void)nunits;

	__cpu_simple_lock_init(&lockstat_lock);
	__cpu_simple_lock_init(&lockstat_enabled_lock);
}

/*
 * Prepare the per-CPU tables for use, or clear down tables when tracing is
 * stopped.
 */
void
lockstat_init_tables(lsenable_t *le)
{
	int i, per, slop, cpuno;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	lscpu_t *lc;
	lsbuf_t *lb;

	/* coverity[assert_side_effect] */
	KASSERT(!lockstat_dev_enabled);

	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_lockstat != NULL) {
			kmem_free(ci->ci_lockstat, sizeof(lscpu_t));
			ci->ci_lockstat = NULL;
		}
	}

	if (le == NULL)
		return;

	lb = lockstat_baseb;
	per = le->le_nbufs / ncpu;
	slop = le->le_nbufs - (per * ncpu);
	cpuno = 0;
	for (CPU_INFO_FOREACH(cii, ci)) {
		lc = kmem_alloc(sizeof(*lc), KM_SLEEP);
		lc->lc_overflow = 0;
		ci->ci_lockstat = lc;

		SLIST_INIT(&lc->lc_free);
		for (i = 0; i < LOCKSTAT_HASH_SIZE; i++)
			LIST_INIT(&lc->lc_hash[i]);

		for (i = per; i != 0; i--, lb++) {
			lb->lb_cpu = (uint16_t)cpuno;
			SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist);
		}
		if (slop > 0) {
			lb->lb_cpu = (uint16_t)cpuno;
			SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist);
			lb++;
			slop--;
		}
		cpuno++;
	}
}
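
/*
 * Worked example of the split above (numbers purely illustrative): with
 * le_nbufs == 20000 and ncpu == 8, per == 2500 and slop == 0, so each
 * CPU's free list receives exactly 2500 buffers.  With ncpu == 3,
 * per == 6666 and slop == 2, and the left-over buffers are handed out
 * one per CPU to the first CPUs visited.
 */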

/*
 * Start collecting lock statistics.
 */
void
lockstat_start(lsenable_t *le)
{

	/* coverity[assert_side_effect] */
	KASSERT(!lockstat_dev_enabled);

	lockstat_init_tables(le);

	if ((le->le_flags & LE_CALLSITE) != 0)
		lockstat_csmask = (uintptr_t)-1LL;
	else
		lockstat_csmask = 0;

	if ((le->le_flags & LE_LOCK) != 0)
		lockstat_lamask = (uintptr_t)-1LL;
	else
		lockstat_lamask = 0;

	lockstat_csstart = le->le_csstart;
	lockstat_csend = le->le_csend;
	lockstat_lockstart = le->le_lockstart;
	lockstat_lockend = le->le_lockend;

	/*
	 * Ensure everything is initialized on all CPUs, by issuing a
	 * null xcall with the side effect of a release barrier on this
	 * CPU and an acquire barrier on all other CPUs, before they
	 * can witness any flags set in lockstat_dev_enabled -- this
	 * way we don't need to add any barriers in lockstat_event.
	 */
	xc_barrier(0);

	/*
	 * Start timing after the xcall, so we don't spuriously count
	 * xcall communication time, but before flipping the switch, so
	 * we don't dirty the sample with locks taken in the timecounter.
	 */
	getnanotime(&lockstat_stime);

	LOCKSTAT_ENABLED_UPDATE_BEGIN();
	atomic_store_relaxed(&lockstat_dev_enabled, le->le_mask);
	LOCKSTAT_ENABLED_UPDATE_END();
}
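
/*
 * The ordering established above, in sketch form (CPU0 runs
 * lockstat_start(), CPUn is any other CPU):
 *
 *	CPU0					CPUn
 *	----					----
 *	initialize per-CPU tables
 *	xc_barrier(0)	(release)  ---->	(acquire)
 *	store lockstat_dev_enabled
 *						sees lockstat_dev_enabled != 0
 *						=> ci_lockstat tables visible
 *
 * which is why lockstat_event() below needs no explicit barriers.
 */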

/*
 * Stop collecting lock statistics.
 */
int
lockstat_stop(lsdisable_t *ld)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int cpuno, overflow;
	struct timespec ts;
	int error;
	lwp_t *l;

	/* coverity[assert_side_effect] */
	KASSERT(lockstat_dev_enabled);

	/*
	 * Disable and wait for other CPUs to exit lockstat_event().
	 */
	LOCKSTAT_ENABLED_UPDATE_BEGIN();
	atomic_store_relaxed(&lockstat_dev_enabled, 0);
	LOCKSTAT_ENABLED_UPDATE_END();
	getnanotime(&ts);
	xc_barrier(0);

	/*
	 * Did we run out of buffers while tracing?
	 */
	overflow = 0;
	for (CPU_INFO_FOREACH(cii, ci))
		overflow += ((lscpu_t *)ci->ci_lockstat)->lc_overflow;

	if (overflow != 0) {
		error = EOVERFLOW;
		log(LOG_NOTICE, "lockstat: %d buffer allocations failed\n",
		    overflow);
	} else
		error = 0;

	lockstat_init_tables(NULL);

	/* Run through all LWPs and clear the slate for the next run. */
	mutex_enter(&proc_lock);
	LIST_FOREACH(l, &alllwp, l_list) {
		l->l_pfailaddr = 0;
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
	}
	mutex_exit(&proc_lock);

	if (ld == NULL)
		return error;

	/*
	 * Fill out the disable struct for the caller.
	 */
	timespecsub(&ts, &lockstat_stime, &ld->ld_time);
	ld->ld_size = lockstat_sizeb;

	cpuno = 0;
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (cpuno >= sizeof(ld->ld_freq) / sizeof(ld->ld_freq[0])) {
			log(LOG_WARNING, "lockstat: too many CPUs\n");
			break;
		}
		ld->ld_freq[cpuno++] = cpu_frequency(ci);
	}

	return error;
}

/*
 * Allocate buffers for lockstat_start().
 */
int
lockstat_alloc(lsenable_t *le)
{
	lsbuf_t *lb;
	size_t sz;

	/* coverity[assert_side_effect] */
	KASSERT(!lockstat_dev_enabled);
	lockstat_free();

	sz = sizeof(*lb) * le->le_nbufs;

	lb = kmem_zalloc(sz, KM_SLEEP);

	/* coverity[assert_side_effect] */
	KASSERT(!lockstat_dev_enabled);
	KASSERT(lockstat_baseb == NULL);
	lockstat_sizeb = sz;
	lockstat_baseb = lb;

	return (0);
}

/*
 * Free allocated buffers after tracing has stopped.
 */
void
lockstat_free(void)
{

	/* coverity[assert_side_effect] */
	KASSERT(!lockstat_dev_enabled);

	if (lockstat_baseb != NULL) {
		kmem_free(lockstat_baseb, lockstat_sizeb);
		lockstat_baseb = NULL;
	}
}

/*
 * Main entry point from lock primitives.
 */
void
lockstat_event(uintptr_t lock, uintptr_t callsite, u_int flags, u_int count,
	       uint64_t cycles)
{
	lslist_t *ll;
	lscpu_t *lc;
	lsbuf_t *lb;
	u_int event;
	int s;

#ifdef KDTRACE_HOOKS
	uint32_t id;
	CTASSERT((LS_NPROBES & (LS_NPROBES - 1)) == 0);
	if ((id = atomic_load_relaxed(&lockstat_probemap[LS_COMPRESS(flags)]))
	    != 0)
		(*lockstat_probe_func)(id, lock, callsite, flags, count,
		    cycles);
#endif

	if ((flags & atomic_load_relaxed(&lockstat_dev_enabled)) != flags ||
	    count == 0)
		return;
	if (lock < lockstat_lockstart || lock > lockstat_lockend)
		return;
	if (callsite < lockstat_csstart || callsite > lockstat_csend)
		return;

	callsite &= lockstat_csmask;
	lock &= lockstat_lamask;

	/*
	 * Find the table for this lock+callsite pair, and try to locate a
	 * buffer with the same key.
	 */
	s = splhigh();
	lc = curcpu()->ci_lockstat;
	ll = &lc->lc_hash[LOCKSTAT_HASH(lock ^ callsite)];
	event = (flags & LB_EVENT_MASK) - 1;

	LIST_FOREACH(lb, ll, lb_chain.list) {
		if (lb->lb_lock == lock && lb->lb_callsite == callsite)
			break;
	}

	if (lb != NULL) {
		/*
		 * We found a record.  Move it to the front of the list, as
		 * we're likely to hit it again soon.
		 */
		if (lb != LIST_FIRST(ll)) {
			LIST_REMOVE(lb, lb_chain.list);
			LIST_INSERT_HEAD(ll, lb, lb_chain.list);
		}
		lb->lb_counts[event] += count;
		lb->lb_times[event] += cycles;
	} else if ((lb = SLIST_FIRST(&lc->lc_free)) != NULL) {
		/*
		 * Pinch a new buffer and fill it out.
		 */
		SLIST_REMOVE_HEAD(&lc->lc_free, lb_chain.slist);
		LIST_INSERT_HEAD(ll, lb, lb_chain.list);
		lb->lb_flags = (uint16_t)flags;
		lb->lb_lock = lock;
		lb->lb_callsite = callsite;
		lb->lb_counts[event] = count;
		lb->lb_times[event] = cycles;
	} else {
		/*
		 * We didn't find a buffer and there were none free.
		 * lockstat_stop() will notice later on and report the
		 * error.
		 */
		lc->lc_overflow++;
	}

	splx(s);
}
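
/*
 * A sketch of how a lock primitive is expected to feed this function via
 * the LOCKSTAT_* macros in <dev/lockstat.h>; the macro names and the
 * LB_ADAPTIVE_MUTEX | LB_SPIN combination follow the adaptive-mutex spin
 * path and may differ for other primitives:
 *
 *	LOCKSTAT_TIMER(spintime);
 *	LOCKSTAT_FLAG(lsflag);
 *	LOCKSTAT_ENTER(lsflag);
 *	LOCKSTAT_START_TIMER(lsflag, spintime);
 *	... spin until the mutex is acquired ...
 *	LOCKSTAT_STOP_TIMER(lsflag, spintime);
 *	LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SPIN, 1, spintime);
 *	LOCKSTAT_EXIT(lsflag);
 *
 * which ends up here roughly as lockstat_event((uintptr_t)mtx,
 * (uintptr_t)__builtin_return_address(0),
 * LB_ADAPTIVE_MUTEX | LB_SPIN, 1, spintime).
 */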

/*
 * Accept an open() on /dev/lockstat.
 */
int
lockstat_open(dev_t dev, int flag, int mode, lwp_t *l)
{

	if (!__cpu_simple_lock_try(&lockstat_lock))
		return EBUSY;
	lockstat_lwp = curlwp;
	return 0;
}

/*
 * Accept the last close() on /dev/lockstat.
 */
int
lockstat_close(dev_t dev, int flag, int mode, lwp_t *l)
{

	lockstat_lwp = NULL;
	if (lockstat_dev_enabled) {
		lockstat_stop(NULL);
		lockstat_free();
	}
	__cpu_simple_unlock(&lockstat_lock);
	return 0;
}

/*
 * Handle control operations.
 */
int
lockstat_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
{
	lsenable_t *le;
	int error;

	if (lockstat_lwp != curlwp)
		return EBUSY;

	switch (cmd) {
	case IOC_LOCKSTAT_GVERSION:
		*(int *)data = LS_VERSION;
		error = 0;
		break;

	case IOC_LOCKSTAT_ENABLE:
		le = (lsenable_t *)data;

		if (!cpu_hascounter()) {
			error = ENODEV;
			break;
		}
		if (atomic_load_relaxed(&lockstat_dev_enabled)) {
			error = EBUSY;
			break;
		}

		/*
		 * Sanitize the arguments passed in and set up filtering.
		 */
		if (le->le_nbufs == 0) {
			le->le_nbufs = MIN(LOCKSTAT_DEFBUFS * ncpu,
			    LOCKSTAT_MAXBUFS);
		} else if (le->le_nbufs > LOCKSTAT_MAXBUFS ||
		    le->le_nbufs < LOCKSTAT_MINBUFS) {
			error = EINVAL;
			break;
		}
		if ((le->le_flags & LE_ONE_CALLSITE) == 0) {
			le->le_csstart = 0;
			le->le_csend = le->le_csstart - 1;
		}
		if ((le->le_flags & LE_ONE_LOCK) == 0) {
			le->le_lockstart = 0;
			le->le_lockend = le->le_lockstart - 1;
		}
		if ((le->le_mask & LB_EVENT_MASK) == 0)
			return EINVAL;
		if ((le->le_mask & LB_LOCK_MASK) == 0)
			return EINVAL;

		/*
		 * Start tracing.
		 */
		if ((error = lockstat_alloc(le)) == 0)
			lockstat_start(le);
		break;

	case IOC_LOCKSTAT_DISABLE:
		if (!atomic_load_relaxed(&lockstat_dev_enabled))
			error = EINVAL;
		else
			error = lockstat_stop((lsdisable_t *)data);
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

/*
 * Copy buffers out to user-space.
 */
int
lockstat_read(dev_t dev, struct uio *uio, int flag)
{

	if (curlwp != lockstat_lwp || lockstat_dev_enabled)
		return EBUSY;
	return uiomove(lockstat_baseb, lockstat_sizeb, uio);
}
573