xref: /illumos-gate/usr/src/uts/common/dtrace/profile.c (revision e75b2cb0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
28  */
29 
30 #include <sys/errno.h>
31 #include <sys/stat.h>
32 #include <sys/modctl.h>
33 #include <sys/conf.h>
34 #include <sys/systm.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/cpuvar.h>
38 #include <sys/kmem.h>
39 #include <sys/strsubr.h>
40 #include <sys/dtrace.h>
41 #include <sys/cyclic.h>
42 #include <sys/atomic.h>
43 
44 static dev_info_t *profile_devi;
45 static dtrace_provider_id_t profile_id;
46 
/*
 * PROF_ARTIFICIAL_FRAMES is the number of stack frames that sit between the
 * interrupted code and the probe, and which are therefore artifacts of the
 * profile provider itself.  On every platform the call chain is:
 *
 *	profile_fire
 *	cyclic_expire
 *	cyclic_fire
 *	[ cbe ]
 *	[ interrupt code ]
 *
 * x86:   the generic interrupt code contributes five frames, and the
 *        interrupted instruction shows up as a frame of its own, for a
 *        total of 10.
 *
 * SPARC: the compiler turns tail-calls into jumps, so these two frames
 *        disappear:
 *
 *	profile_fire
 *	cyclic_expire
 *
 *        leaving three frames.  On DEBUG kernels cyclic_expire cannot be
 *        tail-call eliminated, so there are four.
 *
 * Ideally the provider would discover the count at run time by firing one
 * of its own probes and walking the resulting stack, but that is complicated
 * and the static values below have not proven brittle.  The profile_aframes
 * tunable is available as a manual override should they be wrong.
 */
#if defined(__x86)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__sparc)
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4
#else
#define	PROF_ARTIFICIAL_FRAMES	3
#endif
#endif
87 
/* Size, in bytes, of the probe-name buffers used by this provider. */
#define	PROF_NAMELEN		15

/* Probe kinds, stored in profile_probe_t.prof_kind. */
#define	PROF_PROFILE		0
#define	PROF_TICK		1

/* Probe-name prefixes that select the kind of probe. */
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
94 
95 typedef struct profile_probe {
96 	char		prof_name[PROF_NAMELEN];
97 	dtrace_id_t	prof_id;
98 	int		prof_kind;
99 	hrtime_t	prof_interval;
100 	cyclic_id_t	prof_cyclic;
101 } profile_probe_t;
102 
103 typedef struct profile_probe_percpu {
104 	hrtime_t	profc_expected;
105 	hrtime_t	profc_interval;
106 	profile_probe_t	*profc_probe;
107 } profile_probe_percpu_t;
108 
109 hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
110 int		profile_aframes = 0;				/* override */
111 
/*
 * Rates (per second) of the profile-<rate> probes created when
 * profile_provide() is asked for all probes.  The table has 20 slots; the
 * trailing ones are zero, and zero slots are skipped.
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};
118 
/*
 * Rates (per second) of the tick-<rate> probes created when
 * profile_provide() is asked for all probes.  The table has 15 slots; the
 * trailing ones are zero, and zero slots are skipped.
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
124 
/*
 * profile_max caps the number of profile probes that may exist at any one
 * time, so that malicious or clumsy users cannot exhaust system resources by
 * creating a slew of profile probes.  It is set when the driver attaches:
 * the profile-max-probes property (profile.conf) is used if present,
 * otherwise PROFILE_MAX_DEFAULT.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default cap on number of probes */

static uint32_t profile_max;	/* cap on number of profile probes */
static uint32_t profile_total;	/* number of profile probes that exist now */
135 
136 static void
137 profile_fire(void *arg)
138 {
139 	profile_probe_percpu_t *pcpu = arg;
140 	profile_probe_t *prof = pcpu->profc_probe;
141 	hrtime_t late;
142 
143 	late = dtrace_gethrtime() - pcpu->profc_expected;
144 	pcpu->profc_expected += pcpu->profc_interval;
145 
146 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
147 	    CPU->cpu_profile_upc, late, 0, 0);
148 }
149 
150 static void
151 profile_tick(void *arg)
152 {
153 	profile_probe_t *prof = arg;
154 
155 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
156 	    CPU->cpu_profile_upc, 0, 0, 0);
157 }
158 
159 static void
160 profile_create(hrtime_t interval, const char *name, int kind)
161 {
162 	profile_probe_t *prof;
163 	int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
164 
165 	if (profile_aframes)
166 		nr_frames = profile_aframes;
167 
168 	if (interval < profile_interval_min)
169 		return;
170 
171 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
172 		return;
173 
174 	atomic_inc_32(&profile_total);
175 	if (profile_total > profile_max) {
176 		atomic_dec_32(&profile_total);
177 		return;
178 	}
179 
180 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
181 	(void) strcpy(prof->prof_name, name);
182 	prof->prof_interval = interval;
183 	prof->prof_cyclic = CYCLIC_NONE;
184 	prof->prof_kind = kind;
185 	prof->prof_id = dtrace_probe_create(profile_id,
186 	    NULL, NULL, name, nr_frames, prof);
187 }
188 
189 /*ARGSUSED*/
190 static void
191 profile_provide(void *arg, const dtrace_probedesc_t *desc)
192 {
193 	int i, j, rate, kind;
194 	hrtime_t val = 0, mult = 1, len;
195 	const char *name, *suffix = NULL;
196 
197 	const struct {
198 		char *prefix;
199 		int kind;
200 	} types[] = {
201 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
202 		{ PROF_PREFIX_TICK, PROF_TICK },
203 		{ NULL, 0 }
204 	};
205 
206 	const struct {
207 		char *name;
208 		hrtime_t mult;
209 	} suffixes[] = {
210 		{ "ns",		NANOSEC / NANOSEC },
211 		{ "nsec",	NANOSEC / NANOSEC },
212 		{ "us",		NANOSEC / MICROSEC },
213 		{ "usec",	NANOSEC / MICROSEC },
214 		{ "ms",		NANOSEC / MILLISEC },
215 		{ "msec",	NANOSEC / MILLISEC },
216 		{ "s",		NANOSEC / SEC },
217 		{ "sec",	NANOSEC / SEC },
218 		{ "m",		NANOSEC * (hrtime_t)60 },
219 		{ "min",	NANOSEC * (hrtime_t)60 },
220 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
221 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
222 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
223 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
224 		{ "hz",		0 },
225 		{ NULL }
226 	};
227 
228 	if (desc == NULL) {
229 		char n[PROF_NAMELEN];
230 
231 		/*
232 		 * If no description was provided, provide all of our probes.
233 		 */
234 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
235 			if ((rate = profile_rates[i]) == 0)
236 				continue;
237 
238 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
239 			    PROF_PREFIX_PROFILE, rate);
240 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
241 		}
242 
243 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
244 			if ((rate = profile_ticks[i]) == 0)
245 				continue;
246 
247 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
248 			    PROF_PREFIX_TICK, rate);
249 			profile_create(NANOSEC / rate, n, PROF_TICK);
250 		}
251 
252 		return;
253 	}
254 
255 	name = desc->dtpd_name;
256 
257 	for (i = 0; types[i].prefix != NULL; i++) {
258 		len = strlen(types[i].prefix);
259 
260 		if (strncmp(name, types[i].prefix, len) != 0)
261 			continue;
262 		break;
263 	}
264 
265 	if (types[i].prefix == NULL)
266 		return;
267 
268 	kind = types[i].kind;
269 	j = strlen(name) - len;
270 
271 	/*
272 	 * We need to start before any time suffix.
273 	 */
274 	for (j = strlen(name); j >= len; j--) {
275 		if (name[j] >= '0' && name[j] <= '9')
276 			break;
277 		suffix = &name[j];
278 	}
279 
280 	ASSERT(suffix != NULL);
281 
282 	/*
283 	 * Now determine the numerical value present in the probe name.
284 	 */
285 	for (; j >= len; j--) {
286 		if (name[j] < '0' || name[j] > '9')
287 			return;
288 
289 		val += (name[j] - '0') * mult;
290 		mult *= (hrtime_t)10;
291 	}
292 
293 	if (val == 0)
294 		return;
295 
296 	/*
297 	 * Look-up the suffix to determine the multiplier.
298 	 */
299 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
300 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
301 			mult = suffixes[i].mult;
302 			break;
303 		}
304 	}
305 
306 	if (suffixes[i].name == NULL && *suffix != '\0')
307 		return;
308 
309 	if (mult == 0) {
310 		/*
311 		 * The default is frequency-per-second.
312 		 */
313 		val = NANOSEC / val;
314 	} else {
315 		val *= mult;
316 	}
317 
318 	profile_create(val, name, kind);
319 }
320 
321 /*ARGSUSED*/
322 static void
323 profile_destroy(void *arg, dtrace_id_t id, void *parg)
324 {
325 	profile_probe_t *prof = parg;
326 
327 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
328 	kmem_free(prof, sizeof (profile_probe_t));
329 
330 	ASSERT(profile_total >= 1);
331 	atomic_dec_32(&profile_total);
332 }
333 
334 /*ARGSUSED*/
335 static void
336 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
337 {
338 	profile_probe_t *prof = arg;
339 	profile_probe_percpu_t *pcpu;
340 
341 	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
342 	pcpu->profc_probe = prof;
343 
344 	hdlr->cyh_func = profile_fire;
345 	hdlr->cyh_arg = pcpu;
346 	hdlr->cyh_level = CY_HIGH_LEVEL;
347 
348 	when->cyt_interval = prof->prof_interval;
349 	when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
350 
351 	pcpu->profc_expected = when->cyt_when;
352 	pcpu->profc_interval = when->cyt_interval;
353 }
354 
355 /*ARGSUSED*/
356 static void
357 profile_offline(void *arg, cpu_t *cpu, void *oarg)
358 {
359 	profile_probe_percpu_t *pcpu = oarg;
360 
361 	ASSERT(pcpu->profc_probe == arg);
362 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
363 }
364 
365 /*ARGSUSED*/
366 static int
367 profile_enable(void *arg, dtrace_id_t id, void *parg)
368 {
369 	profile_probe_t *prof = parg;
370 	cyc_omni_handler_t omni;
371 	cyc_handler_t hdlr;
372 	cyc_time_t when;
373 
374 	ASSERT(prof->prof_interval != 0);
375 	ASSERT(MUTEX_HELD(&cpu_lock));
376 
377 	if (prof->prof_kind == PROF_TICK) {
378 		hdlr.cyh_func = profile_tick;
379 		hdlr.cyh_arg = prof;
380 		hdlr.cyh_level = CY_HIGH_LEVEL;
381 
382 		when.cyt_interval = prof->prof_interval;
383 		when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
384 	} else {
385 		ASSERT(prof->prof_kind == PROF_PROFILE);
386 		omni.cyo_online = profile_online;
387 		omni.cyo_offline = profile_offline;
388 		omni.cyo_arg = prof;
389 	}
390 
391 	if (prof->prof_kind == PROF_TICK) {
392 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
393 	} else {
394 		prof->prof_cyclic = cyclic_add_omni(&omni);
395 	}
396 	return (0);
397 }
398 
399 /*ARGSUSED*/
400 static void
401 profile_disable(void *arg, dtrace_id_t id, void *parg)
402 {
403 	profile_probe_t *prof = parg;
404 
405 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
406 	ASSERT(MUTEX_HELD(&cpu_lock));
407 
408 	cyclic_remove(prof->prof_cyclic);
409 	prof->prof_cyclic = CYCLIC_NONE;
410 }
411 
412 /*ARGSUSED*/
413 static int
414 profile_mode(void *arg, dtrace_id_t id, void *parg)
415 {
416 	profile_probe_t *prof = parg;
417 	int mode;
418 
419 	if (CPU->cpu_profile_pc != 0) {
420 		mode = DTRACE_MODE_KERNEL;
421 	} else {
422 		mode = DTRACE_MODE_USER;
423 	}
424 
425 	if (prof->prof_kind == PROF_TICK) {
426 		mode |= DTRACE_MODE_NOPRIV_RESTRICT;
427 	} else {
428 		ASSERT(prof->prof_kind == PROF_PROFILE);
429 		mode |= DTRACE_MODE_NOPRIV_DROP;
430 	}
431 
432 	return (mode);
433 }
434 
435 static dtrace_pattr_t profile_attr = {
436 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
437 { DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
438 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
439 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
440 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
441 };
442 
443 static dtrace_pops_t profile_pops = {
444 	profile_provide,
445 	NULL,
446 	profile_enable,
447 	profile_disable,
448 	NULL,
449 	NULL,
450 	NULL,
451 	NULL,
452 	profile_mode,
453 	profile_destroy
454 };
455 
456 static int
457 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
458 {
459 	switch (cmd) {
460 	case DDI_ATTACH:
461 		break;
462 	case DDI_RESUME:
463 		return (DDI_SUCCESS);
464 	default:
465 		return (DDI_FAILURE);
466 	}
467 
468 	if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
469 	    DDI_PSEUDO, 0) == DDI_FAILURE ||
470 	    dtrace_register("profile", &profile_attr,
471 	    DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
472 	    &profile_pops, NULL, &profile_id) != 0) {
473 		ddi_remove_minor_node(devi, NULL);
474 		return (DDI_FAILURE);
475 	}
476 
477 	profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
478 	    "profile-max-probes", PROFILE_MAX_DEFAULT);
479 
480 	ddi_report_dev(devi);
481 	profile_devi = devi;
482 	return (DDI_SUCCESS);
483 }
484 
485 static int
486 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
487 {
488 	switch (cmd) {
489 	case DDI_DETACH:
490 		break;
491 	case DDI_SUSPEND:
492 		return (DDI_SUCCESS);
493 	default:
494 		return (DDI_FAILURE);
495 	}
496 
497 	if (dtrace_unregister(profile_id) != 0)
498 		return (DDI_FAILURE);
499 
500 	ddi_remove_minor_node(devi, NULL);
501 	return (DDI_SUCCESS);
502 }
503 
504 /*ARGSUSED*/
505 static int
506 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
507 {
508 	int error;
509 
510 	switch (infocmd) {
511 	case DDI_INFO_DEVT2DEVINFO:
512 		*result = (void *)profile_devi;
513 		error = DDI_SUCCESS;
514 		break;
515 	case DDI_INFO_DEVT2INSTANCE:
516 		*result = (void *)0;
517 		error = DDI_SUCCESS;
518 		break;
519 	default:
520 		error = DDI_FAILURE;
521 	}
522 	return (error);
523 }
524 
525 /*ARGSUSED*/
526 static int
527 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
528 {
529 	return (0);
530 }
531 
532 static struct cb_ops profile_cb_ops = {
533 	profile_open,		/* open */
534 	nodev,			/* close */
535 	nulldev,		/* strategy */
536 	nulldev,		/* print */
537 	nodev,			/* dump */
538 	nodev,			/* read */
539 	nodev,			/* write */
540 	nodev,			/* ioctl */
541 	nodev,			/* devmap */
542 	nodev,			/* mmap */
543 	nodev,			/* segmap */
544 	nochpoll,		/* poll */
545 	ddi_prop_op,		/* cb_prop_op */
546 	0,			/* streamtab  */
547 	D_NEW | D_MP		/* Driver compatibility flag */
548 };
549 
550 static struct dev_ops profile_ops = {
551 	DEVO_REV,		/* devo_rev, */
552 	0,			/* refcnt  */
553 	profile_info,		/* get_dev_info */
554 	nulldev,		/* identify */
555 	nulldev,		/* probe */
556 	profile_attach,		/* attach */
557 	profile_detach,		/* detach */
558 	nodev,			/* reset */
559 	&profile_cb_ops,	/* driver operations */
560 	NULL,			/* bus operations */
561 	nodev,			/* dev power */
562 	ddi_quiesce_not_needed,		/* quiesce */
563 };
564 
565 /*
566  * Module linkage information for the kernel.
567  */
568 static struct modldrv modldrv = {
569 	&mod_driverops,		/* module type (this is a pseudo driver) */
570 	"Profile Interrupt Tracing",	/* name of module */
571 	&profile_ops,		/* driver ops */
572 };
573 
574 static struct modlinkage modlinkage = {
575 	MODREV_1,
576 	(void *)&modldrv,
577 	NULL
578 };
579 
580 int
581 _init(void)
582 {
583 	return (mod_install(&modlinkage));
584 }
585 
586 int
587 _info(struct modinfo *modinfop)
588 {
589 	return (mod_info(&modlinkage, modinfop));
590 }
591 
592 int
593 _fini(void)
594 {
595 	return (mod_remove(&modlinkage));
596 }
597