/*	$NetBSD: lockstat.c,v 1.15 2008/04/28 20:23:46 martin Exp $	*/

/*-
 * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Lock statistics driver, providing kernel support for the lockstat(8)
 * command.
 *
 * A global lock word (lockstat_lock) tracks device opens: only one
 * thread may hold the device open at a time, which serializes all
 * control operations.
 *
 * XXX Timings for contention on sleep locks are currently incorrect.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lockstat.c,v 1.15 2008/04/28 20:23:46 martin Exp $");

#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/atomic.h>

#include <dev/lockstat.h>

#include <machine/lock.h>

#ifndef __HAVE_CPU_COUNTER
#error CPU counters not available
#endif

#if LONG_BIT == 64
#define	LOCKSTAT_HASH_SHIFT	3
#elif LONG_BIT == 32
#define	LOCKSTAT_HASH_SHIFT	2
#endif

#define	LOCKSTAT_MINBUFS	1000
#define	LOCKSTAT_DEFBUFS	10000
#define	LOCKSTAT_MAXBUFS	50000

#define	LOCKSTAT_HASH_SIZE	128
#define	LOCKSTAT_HASH_MASK	(LOCKSTAT_HASH_SIZE - 1)
#define	LOCKSTAT_HASH(key)	\
	((key >> LOCKSTAT_HASH_SHIFT) & LOCKSTAT_HASH_MASK)

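/*
 * The shift discards the low bits of the key, which are always zero for
 * naturally aligned lock addresses (8-byte on LP64, 4-byte on ILP32).
 * Illustrative only: on LP64, a key of 0x1234 hashes to bucket
 * (0x1234 >> 3) & 127 == 70.
 */
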
typedef struct lscpu {
	SLIST_HEAD(, lsbuf)	lc_free;
	u_int			lc_overflow;
	LIST_HEAD(lslist, lsbuf) lc_hash[LOCKSTAT_HASH_SIZE];
} lscpu_t;

typedef struct lslist lslist_t;

void	lockstatattach(int);
void	lockstat_start(lsenable_t *);
int	lockstat_alloc(lsenable_t *);
void	lockstat_init_tables(lsenable_t *);
int	lockstat_stop(lsdisable_t *);
void	lockstat_free(void);

dev_type_open(lockstat_open);
dev_type_close(lockstat_close);
dev_type_read(lockstat_read);
dev_type_ioctl(lockstat_ioctl);

volatile u_int	lockstat_enabled;
uintptr_t	lockstat_csstart;
uintptr_t	lockstat_csend;
uintptr_t	lockstat_csmask;
uintptr_t	lockstat_lamask;
uintptr_t	lockstat_lockstart;
uintptr_t	lockstat_lockend;
__cpu_simple_lock_t lockstat_lock;
lwp_t		*lockstat_lwp;
lsbuf_t		*lockstat_baseb;
size_t		lockstat_sizeb;
int		lockstat_busy;
struct timespec	lockstat_stime;

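/*
 * lockstat_enabled and the filter words above are read without locks by
 * lockstat_event() from within the lock primitives themselves; updates
 * are published by lockstat_start()/lockstat_stop() using the explicit
 * memory barriers (membar_sync()/membar_producer()) seen below.
 */
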
const struct cdevsw lockstat_cdevsw = {
	lockstat_open, lockstat_close, lockstat_read, nowrite, lockstat_ioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE
};

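/*
 * A userland consumer (lockstat(8) in practice) drives the device
 * roughly as follows.  This is a sketch with error handling omitted,
 * not a copy of the real lockstat(8) code:
 *
 *	int fd = open("/dev/lockstat", O_RDONLY);
 *	lsenable_t le;
 *	memset(&le, 0, sizeof(le));
 *	le.le_nbufs = 0;		(0 selects LOCKSTAT_DEFBUFS)
 *	le.le_mask = ...;		(LB_* event and lock-type bits)
 *	if (ioctl(fd, IOC_LOCKSTAT_ENABLE, &le) == 0) {
 *		... run the workload of interest ...
 *		lsdisable_t ld;
 *		ioctl(fd, IOC_LOCKSTAT_DISABLE, &ld);
 *		void *buf = malloc(ld.ld_size);
 *		read(fd, buf, ld.ld_size);	(raw array of lsbuf_t)
 *	}
 *	close(fd);
 */
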
/*
 * Called when the pseudo-driver is attached.
 */
void
lockstatattach(int nunits)
{

	(void)nunits;

	__cpu_simple_lock_init(&lockstat_lock);
}

/*
 * Prepare the per-CPU tables for use, or clear down tables when tracing is
 * stopped.
 */
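/*
 * Buffers are split evenly between CPUs, with the remainder ("slop")
 * handed out one apiece to the first CPUs visited.  Hypothetical
 * numbers: with le_nbufs == 10003 and ncpu == 4, per == 2500 and
 * slop == 3, so three CPUs receive 2501 buffers and the fourth 2500.
 */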
void
lockstat_init_tables(lsenable_t *le)
{
	int i, per, slop, cpuno;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	lscpu_t *lc;
	lsbuf_t *lb;

	KASSERT(!lockstat_enabled);

	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_lockstat != NULL) {
			kmem_free(ci->ci_lockstat, sizeof(lscpu_t));
			ci->ci_lockstat = NULL;
		}
	}

	if (le == NULL)
		return;

	lb = lockstat_baseb;
	per = le->le_nbufs / ncpu;
	slop = le->le_nbufs - (per * ncpu);
	cpuno = 0;
	for (CPU_INFO_FOREACH(cii, ci)) {
		lc = kmem_alloc(sizeof(*lc), KM_SLEEP);
		lc->lc_overflow = 0;
		ci->ci_lockstat = lc;

		SLIST_INIT(&lc->lc_free);
		for (i = 0; i < LOCKSTAT_HASH_SIZE; i++)
			LIST_INIT(&lc->lc_hash[i]);

		for (i = per; i != 0; i--, lb++) {
			lb->lb_cpu = (uint16_t)cpuno;
			SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist);
		}
		if (slop-- > 0) {
			lb->lb_cpu = (uint16_t)cpuno;
			SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist);
			lb++;
		}
		cpuno++;
	}
}

/*
 * Start collecting lock statistics.
 */
void
lockstat_start(lsenable_t *le)
{

	KASSERT(!lockstat_enabled);

	lockstat_init_tables(le);

	if ((le->le_flags & LE_CALLSITE) != 0)
		lockstat_csmask = (uintptr_t)-1LL;
	else
		lockstat_csmask = 0;

	if ((le->le_flags & LE_LOCK) != 0)
		lockstat_lamask = (uintptr_t)-1LL;
	else
		lockstat_lamask = 0;

	lockstat_csstart = le->le_csstart;
	lockstat_csend = le->le_csend;
	lockstat_lockstart = le->le_lockstart;
	lockstat_lockend = le->le_lockend;
	membar_sync();
	getnanotime(&lockstat_stime);
	lockstat_enabled = le->le_mask;
	membar_producer();
}

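/*
 * The enable mask gates lockstat_event(): an event whose LB_* flag word
 * is `flags' is recorded only when (flags & lockstat_enabled) == flags,
 * i.e. every bit of the event's flags must be present in the le_mask
 * supplied by the caller.
 */
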
/*
 * Stop collecting lock statistics.
 */
int
lockstat_stop(lsdisable_t *ld)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int cpuno, overflow;
	struct timespec ts;
	int error;
	lwp_t *l;

	KASSERT(lockstat_enabled);

	/*
	 * Set enabled false, force a write barrier, and wait for other CPUs
	 * to exit lockstat_event().
	 */
	lockstat_enabled = 0;
	membar_producer();
	getnanotime(&ts);
	tsleep(&lockstat_stop, PPAUSE, "lockstat", mstohz(10));
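	/*
	 * Nothing ever wakeup()s the &lockstat_stop channel, so the
	 * tsleep() above is simply a ~10ms grace period for CPUs still
	 * inside lockstat_event() to drain out.
	 */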

	/*
	 * Did we run out of buffers while tracing?
	 */
	overflow = 0;
	for (CPU_INFO_FOREACH(cii, ci))
		overflow += ((lscpu_t *)ci->ci_lockstat)->lc_overflow;

	if (overflow != 0) {
		error = EOVERFLOW;
		log(LOG_NOTICE, "lockstat: %u buffer allocations failed\n",
		    overflow);
	} else
		error = 0;

	lockstat_init_tables(NULL);

	/* Run through all LWPs and clear the slate for the next run. */
	mutex_enter(proc_lock);
	LIST_FOREACH(l, &alllwp, l_list) {
		l->l_pfailaddr = 0;
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
	}
	mutex_exit(proc_lock);

	if (ld == NULL)
		return error;

	/*
	 * Fill out the disable struct for the caller.
	 */
	timespecsub(&ts, &lockstat_stime, &ld->ld_time);
	ld->ld_size = lockstat_sizeb;

	cpuno = 0;
	for (CPU_INFO_FOREACH(cii, ci)) {
		if (cpuno >= sizeof(ld->ld_freq) / sizeof(ld->ld_freq[0])) {
			log(LOG_WARNING, "lockstat: too many CPUs\n");
			break;
		}
		ld->ld_freq[cpuno++] = cpu_frequency(ci);
	}

	return error;
}

/*
 * Allocate buffers for lockstat_start().
 */
int
lockstat_alloc(lsenable_t *le)
{
	lsbuf_t *lb;
	size_t sz;

	KASSERT(!lockstat_enabled);
	lockstat_free();

	sz = sizeof(*lb) * le->le_nbufs;

	lb = kmem_zalloc(sz, KM_SLEEP);
	if (lb == NULL)
		return (ENOMEM);

	KASSERT(!lockstat_enabled);
	KASSERT(lockstat_baseb == NULL);
	lockstat_sizeb = sz;
	lockstat_baseb = lb;

	return (0);
}

/*
 * Free allocated buffers after tracing has stopped.
 */
void
lockstat_free(void)
{

	KASSERT(!lockstat_enabled);

	if (lockstat_baseb != NULL) {
		kmem_free(lockstat_baseb, lockstat_sizeb);
		lockstat_baseb = NULL;
	}
}

/*
 * Main entry point from the lock primitives.
 */
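/*
 * Illustrative only (real callers go through the LOCKSTAT_* macros in
 * <dev/lockstat.h>): a primitive that spun `nspins' times over `cycles'
 * CPU cycles waiting for `lock' might report that as
 *
 *	lockstat_event((uintptr_t)lock,
 *	    (uintptr_t)__builtin_return_address(0),
 *	    LB_SPIN_MUTEX | LB_SPIN, nspins, cycles);
 */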
void
lockstat_event(uintptr_t lock, uintptr_t callsite, u_int flags, u_int count,
	       uint64_t cycles)
{
	lslist_t *ll;
	lscpu_t *lc;
	lsbuf_t *lb;
	u_int event;
	int s;

	if ((flags & lockstat_enabled) != flags || count == 0)
		return;
	if (lock < lockstat_lockstart || lock > lockstat_lockend)
		return;
	if (callsite < lockstat_csstart || callsite > lockstat_csend)
		return;

	callsite &= lockstat_csmask;
	lock &= lockstat_lamask;

	/*
	 * Find the table for this lock+callsite pair, and try to locate a
	 * buffer with the same key.
	 */
	s = splhigh();
	lc = curcpu()->ci_lockstat;
	ll = &lc->lc_hash[LOCKSTAT_HASH(lock ^ callsite)];
	event = (flags & LB_EVENT_MASK) - 1;

	LIST_FOREACH(lb, ll, lb_chain.list) {
		if (lb->lb_lock == lock && lb->lb_callsite == callsite)
			break;
	}

	if (lb != NULL) {
		/*
		 * We found a record.  Move it to the front of the list, as
		 * we're likely to hit it again soon.
		 */
		if (lb != LIST_FIRST(ll)) {
			LIST_REMOVE(lb, lb_chain.list);
			LIST_INSERT_HEAD(ll, lb, lb_chain.list);
		}
		lb->lb_counts[event] += count;
		lb->lb_times[event] += cycles;
	} else if ((lb = SLIST_FIRST(&lc->lc_free)) != NULL) {
		/*
		 * Pinch a new buffer and fill it out.
		 */
		SLIST_REMOVE_HEAD(&lc->lc_free, lb_chain.slist);
		LIST_INSERT_HEAD(ll, lb, lb_chain.list);
		lb->lb_flags = (uint16_t)flags;
		lb->lb_lock = lock;
		lb->lb_callsite = callsite;
		lb->lb_counts[event] = count;
		lb->lb_times[event] = cycles;
	} else {
		/*
		 * We didn't find a buffer and there were none free.
		 * lockstat_stop() will notice later on and report the
		 * error.
		 */
		lc->lc_overflow++;
	}

	splx(s);
}

/*
 * Accept an open() on /dev/lockstat.
 */
int
lockstat_open(dev_t dev, int flag, int mode, lwp_t *l)
{

	if (!__cpu_simple_lock_try(&lockstat_lock))
		return EBUSY;
	lockstat_lwp = curlwp;
	return 0;
}

/*
 * Accept the last close() on /dev/lockstat.
 */
int
lockstat_close(dev_t dev, int flag, int mode, lwp_t *l)
{

	lockstat_lwp = NULL;
	__cpu_simple_unlock(&lockstat_lock);
	return 0;
}

/*
 * Handle control operations.
 */
int
lockstat_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
{
	lsenable_t *le;
	int error;

	if (lockstat_lwp != curlwp)
		return EBUSY;

	switch (cmd) {
	case IOC_LOCKSTAT_GVERSION:
		*(int *)data = LS_VERSION;
		error = 0;
		break;

	case IOC_LOCKSTAT_ENABLE:
		le = (lsenable_t *)data;

		if (!cpu_hascounter()) {
			error = ENODEV;
			break;
		}
		if (lockstat_enabled) {
			error = EBUSY;
			break;
		}

		/*
		 * Sanitize the arguments passed in and set up filtering.
		 */
		if (le->le_nbufs == 0)
			le->le_nbufs = LOCKSTAT_DEFBUFS;
		else if (le->le_nbufs > LOCKSTAT_MAXBUFS ||
		    le->le_nbufs < LOCKSTAT_MINBUFS) {
			error = EINVAL;
			break;
		}
		if ((le->le_flags & LE_ONE_CALLSITE) == 0) {
			le->le_csstart = 0;
			le->le_csend = le->le_csstart - 1;
		}
		if ((le->le_flags & LE_ONE_LOCK) == 0) {
			le->le_lockstart = 0;
			le->le_lockend = le->le_lockstart - 1;
		}
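		/*
		 * Note: the subtractions above deliberately wrap, giving
		 * a range of [0, UINTPTR_MAX] so that the bounds checks
		 * in lockstat_event() pass every address.
		 */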
		if ((le->le_mask & LB_EVENT_MASK) == 0 ||
		    (le->le_mask & LB_LOCK_MASK) == 0) {
			error = EINVAL;
			break;
		}

		/*
		 * Start tracing.
		 */
		if ((error = lockstat_alloc(le)) == 0)
			lockstat_start(le);
		break;

	case IOC_LOCKSTAT_DISABLE:
		if (!lockstat_enabled)
			error = EINVAL;
		else
			error = lockstat_stop((lsdisable_t *)data);
		break;

	default:
		error = ENOTTY;
		break;
	}

	return error;
}

/*
 * Copy buffers out to user-space.
 */
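/*
 * The transfer is a raw array of lsbuf_t records; the ld_size returned
 * by IOC_LOCKSTAT_DISABLE, divided by sizeof(lsbuf_t), gives the number
 * of records available.
 */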
int
lockstat_read(dev_t dev, struct uio *uio, int flag)
{

	if (curlwp != lockstat_lwp || lockstat_enabled)
		return EBUSY;
	return uiomove(lockstat_baseb, lockstat_sizeb, uio);
}