xref: /freebsd/sys/cddl/dev/profile/profile.c (revision a91a2465)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22  *
23  */
24 
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/conf.h>
33 #include <sys/cpuvar.h>
34 #include <sys/endian.h>
35 #include <sys/fcntl.h>
36 #include <sys/filio.h>
37 #include <sys/kdb.h>
38 #include <sys/kernel.h>
39 #include <sys/kmem.h>
40 #include <sys/kthread.h>
41 #include <sys/limits.h>
42 #include <sys/linker.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/module.h>
46 #include <sys/mutex.h>
47 #include <sys/poll.h>
48 #include <sys/proc.h>
49 #include <sys/selinfo.h>
50 #include <sys/smp.h>
51 #include <sys/sysctl.h>
52 #include <sys/uio.h>
53 #include <sys/unistd.h>
54 #include <machine/cpu.h>
55 #include <machine/stdarg.h>
56 
57 #include <sys/dtrace.h>
58 #include <sys/dtrace_bsd.h>
59 
60 #include <cddl/dev/dtrace/dtrace_cddl.h>
61 
/* Size in bytes of the buffers that hold a profile probe name. */
#define	PROF_NAMELEN		15

/*
 * Probe kinds, each listed next to the probe-name prefix that selects it.
 */
#define	PROF_PROFILE		0
#define	PROF_PREFIX_PROFILE	"profile-"

#define	PROF_TICK		1
#define	PROF_PREFIX_TICK	"tick-"
68 
69 /*
70  * Regardless of platform, there are five artificial frames in the case of the
71  * profile provider:
72  *
73  *	profile_fire
74  *	cyclic_expire
75  *	cyclic_fire
76  *	[ cbe ]
77  *	[ locore ]
78  *
79  * On amd64, there are two frames associated with locore:  one in locore, and
80  * another in common interrupt dispatch code.  (i386 has not been modified to
81  * use this common layer.)  Further, on i386, the interrupted instruction
82  * appears as its own stack frame.  All of this means that we need to add one
83  * frame for amd64, and then take one away for both amd64 and i386.
84  *
85  * All of the above constraints lead to the mess below.  Yes, the profile
86  * provider should ideally figure this out on-the-fly by hitting one of its own
87  * probes and then walking its own stack trace.  This is complicated, however,
88  * and the static definition doesn't seem to be overly brittle.  Still, we
89  * allow for a manual override in case we get it completely wrong.
90  */
/*
 * Per-architecture default for the number of artificial frames; the value
 * seeds profile_aframes.
 */
#if defined(__amd64)
#define	PROF_ARTIFICIAL_FRAMES	10
#elif defined(__i386)
#define	PROF_ARTIFICIAL_FRAMES	6
#elif defined(__powerpc__)
/* Bogus value: it only exists so that the module compiles on powerpc. */
#define	PROF_ARTIFICIAL_FRAMES	8
#elif defined(__arm__)
#define	PROF_ARTIFICIAL_FRAMES	3
#elif defined(__aarch64__)
#define	PROF_ARTIFICIAL_FRAMES	12
#elif defined(__riscv)
#define	PROF_ARTIFICIAL_FRAMES	12
#endif

/* Forward declaration: profile_probe_t points at its per-CPU state. */
struct profile_probe_percpu;
119 
120 typedef struct profile_probe {
121 	char		prof_name[PROF_NAMELEN];
122 	dtrace_id_t	prof_id;
123 	int		prof_kind;
124 #ifdef illumos
125 	hrtime_t	prof_interval;
126 	cyclic_id_t	prof_cyclic;
127 #else
128 	sbintime_t	prof_interval;
129 	struct callout	prof_cyclic;
130 	sbintime_t	prof_expected;
131 	struct profile_probe_percpu **prof_pcpus;
132 #endif
133 } profile_probe_t;
134 
135 typedef struct profile_probe_percpu {
136 	hrtime_t	profc_expected;
137 	hrtime_t	profc_interval;
138 	profile_probe_t	*profc_probe;
139 #ifdef __FreeBSD__
140 	struct callout	profc_cyclic;
141 #endif
142 } profile_probe_percpu_t;
143 
144 static int	profile_unload(void);
145 static void	profile_create(hrtime_t, char *, int);
146 static void	profile_destroy(void *, dtrace_id_t, void *);
147 static void	profile_enable(void *, dtrace_id_t, void *);
148 static void	profile_disable(void *, dtrace_id_t, void *);
149 static void	profile_load(void *);
150 static void	profile_provide(void *, dtrace_probedesc_t *);
151 
/*
 * Frequencies, in firings per second, of the "profile-" probes created when
 * profile_provide() is called without a description.  The table has 20
 * slots; the slots not listed here are zero and are skipped.
 */
static int profile_rates[20] = {
	97, 199, 499, 997, 1999, 4001, 4999
};
158 
/*
 * Frequencies, in firings per second, of the "tick-" probes created when
 * profile_provide() is called without a description.  The table has 15
 * slots; the slots not listed here are zero and are skipped.
 */
static int profile_ticks[15] = {
	1, 10, 100, 500, 1000, 5000
};
164 
165 /*
166  * profile_max defines the upper bound on the number of profile probes that
167  * can exist (this is to prevent malicious or clumsy users from exhausting
168  * system resources by creating a slew of profile probes). At mod load time,
169  * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
170  * present in the profile.conf file.
171  */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */

/* Current number of profile probes. */
static uint32_t profile_total;

/* Maximum number of profile probes; profile_create() will not exceed it. */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;
176 
177 static dtrace_pattr_t profile_attr = {
178 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
179 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
180 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
181 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
182 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
183 };
184 
185 static dtrace_pops_t profile_pops = {
186 	.dtps_provide =		profile_provide,
187 	.dtps_provide_module =	NULL,
188 	.dtps_enable =		profile_enable,
189 	.dtps_disable =		profile_disable,
190 	.dtps_suspend =		NULL,
191 	.dtps_resume =		NULL,
192 	.dtps_getargdesc =	NULL,
193 	.dtps_getargval =	NULL,
194 	.dtps_usermode =	NULL,
195 	.dtps_destroy =		profile_destroy
196 };
197 
198 static dtrace_provider_id_t	profile_id;
199 static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
200 static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;
201 
202 SYSCTL_DECL(_kern_dtrace);
203 SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
204     "DTrace profile parameters");
205 SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
206     0, "Skipped frames for profile provider");
207 
208 static sbintime_t
209 nsec_to_sbt(hrtime_t nsec)
210 {
211 	time_t sec;
212 
213 	/*
214 	 * We need to calculate nsec * 2^32 / 10^9
215 	 * Seconds and nanoseconds are split to avoid overflow.
216 	 */
217 	sec = nsec / NANOSEC;
218 	nsec = nsec % NANOSEC;
219 	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
220 }
221 
222 static hrtime_t
223 sbt_to_nsec(sbintime_t sbt)
224 {
225 
226 	return ((sbt >> 32) * NANOSEC +
227 	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
228 }
229 
230 static void
231 profile_probe(profile_probe_t *prof, hrtime_t late)
232 {
233 	struct thread *td;
234 	struct trapframe *frame;
235 	uintfptr_t pc, upc;
236 
237 	td = curthread;
238 	pc = upc = 0;
239 
240 	/*
241 	 * td_intr_frame can be unset if this is a catch-up event upon waking up
242 	 * from idle sleep. This can only happen on a CPU idle thread. Use a
243 	 * representative arg0 value in this case so that one of the probe
244 	 * arguments is non-zero.
245 	 */
246 	frame = td->td_intr_frame;
247 	if (frame != NULL) {
248 		if (TRAPF_USERMODE(frame))
249 			upc = TRAPF_PC(frame);
250 		else {
251 			pc = TRAPF_PC(frame);
252 			td->t_dtrace_trapframe = frame;
253 		}
254 	} else if (TD_IS_IDLETHREAD(td))
255 		pc = (uintfptr_t)&cpu_idle;
256 
257 	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
258 	td->t_dtrace_trapframe = NULL;
259 }
260 
261 static void
262 profile_fire(void *arg)
263 {
264 	profile_probe_percpu_t *pcpu = arg;
265 	profile_probe_t *prof = pcpu->profc_probe;
266 	hrtime_t late;
267 
268 	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
269 
270 	profile_probe(prof, late);
271 	pcpu->profc_expected += pcpu->profc_interval;
272 	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
273 	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
274 }
275 
276 static void
277 profile_tick(void *arg)
278 {
279 	profile_probe_t *prof = arg;
280 
281 	profile_probe(prof, 0);
282 	prof->prof_expected += prof->prof_interval;
283 	callout_schedule_sbt(&prof->prof_cyclic,
284 	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
285 }
286 
287 static void
288 profile_create(hrtime_t interval, char *name, int kind)
289 {
290 	profile_probe_t *prof;
291 
292 	if (interval < profile_interval_min)
293 		return;
294 
295 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
296 		return;
297 
298 	atomic_add_32(&profile_total, 1);
299 	if (profile_total > profile_max) {
300 		atomic_add_32(&profile_total, -1);
301 		return;
302 	}
303 
304 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
305 	(void) strcpy(prof->prof_name, name);
306 #ifdef illumos
307 	prof->prof_interval = interval;
308 	prof->prof_cyclic = CYCLIC_NONE;
309 #else
310 	prof->prof_interval = nsec_to_sbt(interval);
311 	callout_init(&prof->prof_cyclic, 1);
312 #endif
313 	prof->prof_kind = kind;
314 	prof->prof_id = dtrace_probe_create(profile_id,
315 	    NULL, NULL, name,
316 	    profile_aframes, prof);
317 }
318 
319 /*ARGSUSED*/
320 static void
321 profile_provide(void *arg, dtrace_probedesc_t *desc)
322 {
323 	int i, j, rate, kind;
324 	hrtime_t val = 0, mult = 1, len = 0;
325 	char *name, *suffix = NULL;
326 
327 	const struct {
328 		char *prefix;
329 		int kind;
330 	} types[] = {
331 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
332 		{ PROF_PREFIX_TICK, PROF_TICK },
333 		{ 0, 0 }
334 	};
335 
336 	const struct {
337 		char *name;
338 		hrtime_t mult;
339 	} suffixes[] = {
340 		{ "ns", 	NANOSEC / NANOSEC },
341 		{ "nsec",	NANOSEC / NANOSEC },
342 		{ "us",		NANOSEC / MICROSEC },
343 		{ "usec",	NANOSEC / MICROSEC },
344 		{ "ms",		NANOSEC / MILLISEC },
345 		{ "msec",	NANOSEC / MILLISEC },
346 		{ "s",		NANOSEC / SEC },
347 		{ "sec",	NANOSEC / SEC },
348 		{ "m",		NANOSEC * (hrtime_t)60 },
349 		{ "min",	NANOSEC * (hrtime_t)60 },
350 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
351 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
352 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
353 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
354 		{ "hz",		0 },
355 		{ NULL }
356 	};
357 
358 	if (desc == NULL) {
359 		char n[PROF_NAMELEN];
360 
361 		/*
362 		 * If no description was provided, provide all of our probes.
363 		 */
364 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
365 			if ((rate = profile_rates[i]) == 0)
366 				continue;
367 
368 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
369 			    PROF_PREFIX_PROFILE, rate);
370 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
371 		}
372 
373 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
374 			if ((rate = profile_ticks[i]) == 0)
375 				continue;
376 
377 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
378 			    PROF_PREFIX_TICK, rate);
379 			profile_create(NANOSEC / rate, n, PROF_TICK);
380 		}
381 
382 		return;
383 	}
384 
385 	name = desc->dtpd_name;
386 
387 	for (i = 0; types[i].prefix != NULL; i++) {
388 		len = strlen(types[i].prefix);
389 
390 		if (strncmp(name, types[i].prefix, len) != 0)
391 			continue;
392 		break;
393 	}
394 
395 	if (types[i].prefix == NULL)
396 		return;
397 
398 	kind = types[i].kind;
399 	j = strlen(name) - len;
400 
401 	/*
402 	 * We need to start before any time suffix.
403 	 */
404 	for (j = strlen(name); j >= len; j--) {
405 		if (name[j] >= '0' && name[j] <= '9')
406 			break;
407 		suffix = &name[j];
408 	}
409 
410 	ASSERT(suffix != NULL);
411 
412 	/*
413 	 * Now determine the numerical value present in the probe name.
414 	 */
415 	for (; j >= len; j--) {
416 		if (name[j] < '0' || name[j] > '9')
417 			return;
418 
419 		val += (name[j] - '0') * mult;
420 		mult *= (hrtime_t)10;
421 	}
422 
423 	if (val == 0)
424 		return;
425 
426 	/*
427 	 * Look-up the suffix to determine the multiplier.
428 	 */
429 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
430 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
431 			mult = suffixes[i].mult;
432 			break;
433 		}
434 	}
435 
436 	if (suffixes[i].name == NULL && *suffix != '\0')
437 		return;
438 
439 	if (mult == 0) {
440 		/*
441 		 * The default is frequency-per-second.
442 		 */
443 		val = NANOSEC / val;
444 	} else {
445 		val *= mult;
446 	}
447 
448 	profile_create(val, name, kind);
449 }
450 
451 /* ARGSUSED */
452 static void
453 profile_destroy(void *arg, dtrace_id_t id, void *parg)
454 {
455 	profile_probe_t *prof = parg;
456 
457 #ifdef illumos
458 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
459 #else
460 	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
461 #endif
462 	kmem_free(prof, sizeof (profile_probe_t));
463 
464 	ASSERT(profile_total >= 1);
465 	atomic_add_32(&profile_total, -1);
466 }
467 
468 #ifdef illumos
469 /*ARGSUSED*/
470 static void
471 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
472 {
473 	profile_probe_t *prof = arg;
474 	profile_probe_percpu_t *pcpu;
475 
476 	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
477 	pcpu->profc_probe = prof;
478 
479 	hdlr->cyh_func = profile_fire;
480 	hdlr->cyh_arg = pcpu;
481 
482 	when->cyt_interval = prof->prof_interval;
483 	when->cyt_when = gethrtime() + when->cyt_interval;
484 
485 	pcpu->profc_expected = when->cyt_when;
486 	pcpu->profc_interval = when->cyt_interval;
487 }
488 
489 /*ARGSUSED*/
490 static void
491 profile_offline(void *arg, cpu_t *cpu, void *oarg)
492 {
493 	profile_probe_percpu_t *pcpu = oarg;
494 
495 	ASSERT(pcpu->profc_probe == arg);
496 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
497 }
498 
499 /* ARGSUSED */
500 static void
501 profile_enable(void *arg, dtrace_id_t id, void *parg)
502 {
503 	profile_probe_t *prof = parg;
504 	cyc_omni_handler_t omni;
505 	cyc_handler_t hdlr;
506 	cyc_time_t when;
507 
508 	ASSERT(prof->prof_interval != 0);
509 	ASSERT(MUTEX_HELD(&cpu_lock));
510 
511 	if (prof->prof_kind == PROF_TICK) {
512 		hdlr.cyh_func = profile_tick;
513 		hdlr.cyh_arg = prof;
514 
515 		when.cyt_interval = prof->prof_interval;
516 		when.cyt_when = gethrtime() + when.cyt_interval;
517 	} else {
518 		ASSERT(prof->prof_kind == PROF_PROFILE);
519 		omni.cyo_online = profile_online;
520 		omni.cyo_offline = profile_offline;
521 		omni.cyo_arg = prof;
522 	}
523 
524 	if (prof->prof_kind == PROF_TICK) {
525 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
526 	} else {
527 		prof->prof_cyclic = cyclic_add_omni(&omni);
528 	}
529 }
530 
531 /* ARGSUSED */
532 static void
533 profile_disable(void *arg, dtrace_id_t id, void *parg)
534 {
535 	profile_probe_t *prof = parg;
536 
537 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
538 	ASSERT(MUTEX_HELD(&cpu_lock));
539 
540 	cyclic_remove(prof->prof_cyclic);
541 	prof->prof_cyclic = CYCLIC_NONE;
542 }
543 
544 #else
545 
546 static void
547 profile_enable_omni(profile_probe_t *prof)
548 {
549 	profile_probe_percpu_t *pcpu;
550 	int cpu;
551 
552 	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
553 	CPU_FOREACH(cpu) {
554 		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
555 		prof->prof_pcpus[cpu] = pcpu;
556 		pcpu->profc_probe = prof;
557 		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
558 		pcpu->profc_interval = prof->prof_interval;
559 		callout_init(&pcpu->profc_cyclic, 1);
560 		callout_reset_sbt_on(&pcpu->profc_cyclic,
561 		    pcpu->profc_expected, 0, profile_fire, pcpu,
562 		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
563 	}
564 }
565 
566 static void
567 profile_disable_omni(profile_probe_t *prof)
568 {
569 	profile_probe_percpu_t *pcpu;
570 	int cpu;
571 
572 	ASSERT(prof->prof_pcpus != NULL);
573 	CPU_FOREACH(cpu) {
574 		pcpu = prof->prof_pcpus[cpu];
575 		ASSERT(pcpu->profc_probe == prof);
576 		ASSERT(callout_active(&pcpu->profc_cyclic));
577 		callout_stop(&pcpu->profc_cyclic);
578 		callout_drain(&pcpu->profc_cyclic);
579 		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
580 	}
581 	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
582 	prof->prof_pcpus = NULL;
583 }
584 
585 /* ARGSUSED */
586 static void
587 profile_enable(void *arg, dtrace_id_t id, void *parg)
588 {
589 	profile_probe_t *prof = parg;
590 
591 	if (prof->prof_kind == PROF_TICK) {
592 		prof->prof_expected = sbinuptime() + prof->prof_interval;
593 		callout_reset_sbt(&prof->prof_cyclic,
594 		    prof->prof_expected, 0, profile_tick, prof,
595 		    C_DIRECT_EXEC | C_ABSOLUTE);
596 	} else {
597 		ASSERT(prof->prof_kind == PROF_PROFILE);
598 		profile_enable_omni(prof);
599 	}
600 }
601 
602 /* ARGSUSED */
603 static void
604 profile_disable(void *arg, dtrace_id_t id, void *parg)
605 {
606 	profile_probe_t *prof = parg;
607 
608 	if (prof->prof_kind == PROF_TICK) {
609 		ASSERT(callout_active(&prof->prof_cyclic));
610 		callout_stop(&prof->prof_cyclic);
611 		callout_drain(&prof->prof_cyclic);
612 	} else {
613 		ASSERT(prof->prof_kind == PROF_PROFILE);
614 		profile_disable_omni(prof);
615 	}
616 }
617 #endif
618 
619 static void
620 profile_load(void *dummy)
621 {
622 	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
623 	    NULL, &profile_pops, NULL, &profile_id) != 0)
624 		return;
625 }
626 
627 
628 static int
629 profile_unload(void)
630 {
631 	int error = 0;
632 
633 	if ((error = dtrace_unregister(profile_id)) != 0)
634 		return (error);
635 
636 	return (error);
637 }
638 
639 /* ARGSUSED */
640 static int
641 profile_modevent(module_t mod __unused, int type, void *data __unused)
642 {
643 	int error = 0;
644 
645 	switch (type) {
646 	case MOD_LOAD:
647 		break;
648 
649 	case MOD_UNLOAD:
650 		break;
651 
652 	case MOD_SHUTDOWN:
653 		break;
654 
655 	default:
656 		error = EOPNOTSUPP;
657 		break;
658 
659 	}
660 	return (error);
661 }
662 
663 SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
664 SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);
665 
666 DEV_MODULE(profile, profile_modevent, NULL);
667 MODULE_VERSION(profile, 1);
668 MODULE_DEPEND(profile, dtrace, 1, 1, 1);
669 MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
670