xref: /freebsd/sys/cddl/dev/profile/profile.c (revision 06c3fb27)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
22  *
23  */
24 
25 /*
26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
27  * Use is subject to license terms.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/conf.h>
33 #include <sys/cpuvar.h>
34 #include <sys/endian.h>
35 #include <sys/fcntl.h>
36 #include <sys/filio.h>
37 #include <sys/kdb.h>
38 #include <sys/kernel.h>
39 #include <sys/kmem.h>
40 #include <sys/kthread.h>
41 #include <sys/limits.h>
42 #include <sys/linker.h>
43 #include <sys/lock.h>
44 #include <sys/malloc.h>
45 #include <sys/module.h>
46 #include <sys/mutex.h>
47 #include <sys/poll.h>
48 #include <sys/proc.h>
49 #include <sys/selinfo.h>
50 #include <sys/smp.h>
51 #include <sys/sysctl.h>
52 #include <sys/uio.h>
53 #include <sys/unistd.h>
54 #include <machine/cpu.h>
55 #include <machine/stdarg.h>
56 
57 #include <sys/dtrace.h>
58 #include <sys/dtrace_bsd.h>
59 
60 #define	PROF_NAMELEN		15
61 
62 #define	PROF_PROFILE		0
63 #define	PROF_TICK		1
64 #define	PROF_PREFIX_PROFILE	"profile-"
65 #define	PROF_PREFIX_TICK	"tick-"
66 
67 /*
68  * Regardless of platform, there are five artificial frames in the case of the
69  * profile provider:
70  *
71  *	profile_fire
72  *	cyclic_expire
73  *	cyclic_fire
74  *	[ cbe ]
75  *	[ locore ]
76  *
77  * On amd64, there are two frames associated with locore:  one in locore, and
78  * another in common interrupt dispatch code.  (i386 has not been modified to
79  * use this common layer.)  Further, on i386, the interrupted instruction
80  * appears as its own stack frame.  All of this means that we need to add one
81  * frame for amd64, and then take one away for both amd64 and i386.
82  *
83  * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
85  * probes and then walking its own stack trace.  This is complicated, however,
86  * and the static definition doesn't seem to be overly brittle.  Still, we
87  * allow for a manual override in case we get it completely wrong.
88  */
89 #ifdef __amd64
90 #define	PROF_ARTIFICIAL_FRAMES	10
91 #else
92 #ifdef __i386
93 #define	PROF_ARTIFICIAL_FRAMES	6
94 #endif
95 #endif
96 
97 #ifdef __powerpc__
98 /*
99  * This value is bogus just to make module compilable on powerpc
100  */
101 #define	PROF_ARTIFICIAL_FRAMES	8
102 #endif
103 
104 struct profile_probe_percpu;
105 
106 #ifdef __arm__
107 #define	PROF_ARTIFICIAL_FRAMES	3
108 #endif
109 
110 #ifdef __aarch64__
111 #define	PROF_ARTIFICIAL_FRAMES	12
112 #endif
113 
114 #ifdef __riscv
115 #define	PROF_ARTIFICIAL_FRAMES	12
116 #endif
117 
/*
 * State for a single profile-<n> or tick-<n> probe.  Allocated (zeroed) in
 * profile_create() and released in profile_destroy().
 */
typedef struct profile_probe {
	/* Copy of the probe name given to profile_create(). */
	char		prof_name[PROF_NAMELEN];
	/* Probe id returned by dtrace_probe_create(). */
	dtrace_id_t	prof_id;
	/* PROF_PROFILE or PROF_TICK. */
	int		prof_kind;
#ifdef illumos
	/* Firing interval in nanoseconds and the cyclic that drives it. */
	hrtime_t	prof_interval;
	cyclic_id_t	prof_cyclic;
#else
	/* Firing interval, converted to sbintime_t by nsec_to_sbt(). */
	sbintime_t	prof_interval;
	/* Callout used for PROF_TICK probes. */
	struct callout	prof_cyclic;
	/* Absolute time at which the next tick is expected (PROF_TICK). */
	sbintime_t	prof_expected;
	/* Per-CPU state array for PROF_PROFILE probes; NULL while disabled. */
	struct profile_probe_percpu **prof_pcpus;
#endif
} profile_probe_t;
132 
/*
 * Per-CPU state for a PROF_PROFILE probe; one instance per CPU is passed to
 * profile_fire() as its argument.
 */
typedef struct profile_probe_percpu {
	/*
	 * Expected time of the next firing and the firing interval.  Although
	 * typed hrtime_t, the FreeBSD code stores sbinuptime()-based
	 * (sbintime_t) values here; see profile_enable_omni().
	 */
	hrtime_t	profc_expected;
	hrtime_t	profc_interval;
	/* Back pointer to the owning probe. */
	profile_probe_t	*profc_probe;
#ifdef __FreeBSD__
	/* Callout that fires this probe on its CPU. */
	struct callout	profc_cyclic;
#endif
} profile_probe_percpu_t;
141 
142 static int	profile_unload(void);
143 static void	profile_create(hrtime_t, char *, int);
144 static void	profile_destroy(void *, dtrace_id_t, void *);
145 static void	profile_enable(void *, dtrace_id_t, void *);
146 static void	profile_disable(void *, dtrace_id_t, void *);
147 static void	profile_load(void *);
148 static void	profile_provide(void *, dtrace_probedesc_t *);
149 
/*
 * Rates (firings per second) of the default "profile-<rate>" probes created
 * by profile_provide() when it is called without a probe description.
 * Zero entries are unused slots and are skipped.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
156 
/*
 * Rates (firings per second) of the default "tick-<rate>" probes created by
 * profile_provide() when it is called without a probe description.
 * Zero entries are unused slots and are skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
162 
163 /*
164  * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
166  * system resources by creating a slew of profile probes). At mod load time,
167  * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
168  * present in the profile.conf file.
169  */
170 #define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
171 static uint32_t profile_max = PROFILE_MAX_DEFAULT;
172 					/* maximum number of profile probes */
173 static uint32_t profile_total;		/* current number of profile probes */
174 
/*
 * Stability attributes handed to dtrace_register() in profile_load().
 * NOTE(review): the five rows presumably map, in order, to the provider,
 * module, function, name and args members of dtrace_pattr_t -- confirm
 * against <sys/dtrace.h>.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};
182 
/*
 * Provider operations vector handed to dtrace_register() in profile_load().
 * Only provide, enable, disable and destroy are implemented; every other
 * operation is left NULL.
 */
static dtrace_pops_t profile_pops = {
	.dtps_provide =		profile_provide,
	.dtps_provide_module =	NULL,
	.dtps_enable =		profile_enable,
	.dtps_disable =		profile_disable,
	.dtps_suspend =		NULL,
	.dtps_resume =		NULL,
	.dtps_getargdesc =	NULL,
	.dtps_getargval =	NULL,
	.dtps_usermode =	NULL,
	.dtps_destroy =		profile_destroy
};
195 
/* Provider id filled in by dtrace_register() in profile_load(). */
static dtrace_provider_id_t	profile_id;
/* Shortest interval (in nanoseconds) that profile_create() will accept. */
static hrtime_t			profile_interval_min = NANOSEC / 5000;	/* 5000 hz */
/* Artificial frame count passed to dtrace_probe_create(); see sysctl below. */
static int			profile_aframes = PROF_ARTIFICIAL_FRAMES;

/*
 * kern.dtrace.profile.aframes: read/write override for profile_aframes.
 * The value is read when a probe is created in profile_create().
 */
SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "DTrace profile parameters");
SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
    0, "Skipped frames for profile provider");
205 
206 static sbintime_t
207 nsec_to_sbt(hrtime_t nsec)
208 {
209 	time_t sec;
210 
211 	/*
212 	 * We need to calculate nsec * 2^32 / 10^9
213 	 * Seconds and nanoseconds are split to avoid overflow.
214 	 */
215 	sec = nsec / NANOSEC;
216 	nsec = nsec % NANOSEC;
217 	return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
218 }
219 
220 static hrtime_t
221 sbt_to_nsec(sbintime_t sbt)
222 {
223 
224 	return ((sbt >> 32) * NANOSEC +
225 	    (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
226 }
227 
228 static void
229 profile_probe(profile_probe_t *prof, hrtime_t late)
230 {
231 	struct thread *td;
232 	struct trapframe *frame;
233 	uintfptr_t pc, upc;
234 
235 	td = curthread;
236 	pc = upc = 0;
237 
238 	/*
239 	 * td_intr_frame can be unset if this is a catch-up event upon waking up
240 	 * from idle sleep. This can only happen on a CPU idle thread. Use a
241 	 * representative arg0 value in this case so that one of the probe
242 	 * arguments is non-zero.
243 	 */
244 	frame = td->td_intr_frame;
245 	if (frame != NULL) {
246 		if (TRAPF_USERMODE(frame))
247 			upc = TRAPF_PC(frame);
248 		else
249 			pc = TRAPF_PC(frame);
250 	} else if (TD_IS_IDLETHREAD(td))
251 		pc = (uintfptr_t)&cpu_idle;
252 
253 	dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
254 }
255 
256 static void
257 profile_fire(void *arg)
258 {
259 	profile_probe_percpu_t *pcpu = arg;
260 	profile_probe_t *prof = pcpu->profc_probe;
261 	hrtime_t late;
262 
263 	late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);
264 
265 	profile_probe(prof, late);
266 	pcpu->profc_expected += pcpu->profc_interval;
267 	callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
268 	    pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
269 }
270 
271 static void
272 profile_tick(void *arg)
273 {
274 	profile_probe_t *prof = arg;
275 
276 	profile_probe(prof, 0);
277 	prof->prof_expected += prof->prof_interval;
278 	callout_schedule_sbt(&prof->prof_cyclic,
279 	    prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
280 }
281 
282 static void
283 profile_create(hrtime_t interval, char *name, int kind)
284 {
285 	profile_probe_t *prof;
286 
287 	if (interval < profile_interval_min)
288 		return;
289 
290 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
291 		return;
292 
293 	atomic_add_32(&profile_total, 1);
294 	if (profile_total > profile_max) {
295 		atomic_add_32(&profile_total, -1);
296 		return;
297 	}
298 
299 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
300 	(void) strcpy(prof->prof_name, name);
301 #ifdef illumos
302 	prof->prof_interval = interval;
303 	prof->prof_cyclic = CYCLIC_NONE;
304 #else
305 	prof->prof_interval = nsec_to_sbt(interval);
306 	callout_init(&prof->prof_cyclic, 1);
307 #endif
308 	prof->prof_kind = kind;
309 	prof->prof_id = dtrace_probe_create(profile_id,
310 	    NULL, NULL, name,
311 	    profile_aframes, prof);
312 }
313 
314 /*ARGSUSED*/
315 static void
316 profile_provide(void *arg, dtrace_probedesc_t *desc)
317 {
318 	int i, j, rate, kind;
319 	hrtime_t val = 0, mult = 1, len = 0;
320 	char *name, *suffix = NULL;
321 
322 	const struct {
323 		char *prefix;
324 		int kind;
325 	} types[] = {
326 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
327 		{ PROF_PREFIX_TICK, PROF_TICK },
328 		{ 0, 0 }
329 	};
330 
331 	const struct {
332 		char *name;
333 		hrtime_t mult;
334 	} suffixes[] = {
335 		{ "ns", 	NANOSEC / NANOSEC },
336 		{ "nsec",	NANOSEC / NANOSEC },
337 		{ "us",		NANOSEC / MICROSEC },
338 		{ "usec",	NANOSEC / MICROSEC },
339 		{ "ms",		NANOSEC / MILLISEC },
340 		{ "msec",	NANOSEC / MILLISEC },
341 		{ "s",		NANOSEC / SEC },
342 		{ "sec",	NANOSEC / SEC },
343 		{ "m",		NANOSEC * (hrtime_t)60 },
344 		{ "min",	NANOSEC * (hrtime_t)60 },
345 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
346 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
347 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
348 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
349 		{ "hz",		0 },
350 		{ NULL }
351 	};
352 
353 	if (desc == NULL) {
354 		char n[PROF_NAMELEN];
355 
356 		/*
357 		 * If no description was provided, provide all of our probes.
358 		 */
359 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
360 			if ((rate = profile_rates[i]) == 0)
361 				continue;
362 
363 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
364 			    PROF_PREFIX_PROFILE, rate);
365 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
366 		}
367 
368 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
369 			if ((rate = profile_ticks[i]) == 0)
370 				continue;
371 
372 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
373 			    PROF_PREFIX_TICK, rate);
374 			profile_create(NANOSEC / rate, n, PROF_TICK);
375 		}
376 
377 		return;
378 	}
379 
380 	name = desc->dtpd_name;
381 
382 	for (i = 0; types[i].prefix != NULL; i++) {
383 		len = strlen(types[i].prefix);
384 
385 		if (strncmp(name, types[i].prefix, len) != 0)
386 			continue;
387 		break;
388 	}
389 
390 	if (types[i].prefix == NULL)
391 		return;
392 
393 	kind = types[i].kind;
394 	j = strlen(name) - len;
395 
396 	/*
397 	 * We need to start before any time suffix.
398 	 */
399 	for (j = strlen(name); j >= len; j--) {
400 		if (name[j] >= '0' && name[j] <= '9')
401 			break;
402 		suffix = &name[j];
403 	}
404 
405 	ASSERT(suffix != NULL);
406 
407 	/*
408 	 * Now determine the numerical value present in the probe name.
409 	 */
410 	for (; j >= len; j--) {
411 		if (name[j] < '0' || name[j] > '9')
412 			return;
413 
414 		val += (name[j] - '0') * mult;
415 		mult *= (hrtime_t)10;
416 	}
417 
418 	if (val == 0)
419 		return;
420 
421 	/*
422 	 * Look-up the suffix to determine the multiplier.
423 	 */
424 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
425 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
426 			mult = suffixes[i].mult;
427 			break;
428 		}
429 	}
430 
431 	if (suffixes[i].name == NULL && *suffix != '\0')
432 		return;
433 
434 	if (mult == 0) {
435 		/*
436 		 * The default is frequency-per-second.
437 		 */
438 		val = NANOSEC / val;
439 	} else {
440 		val *= mult;
441 	}
442 
443 	profile_create(val, name, kind);
444 }
445 
/*
 * Provider "destroy" entry point: frees the profile_probe_t created by
 * profile_create() (passed back as 'parg') and drops the probe count.
 * The probe must already be disabled.
 */
/* ARGSUSED */
static void
profile_destroy(void *arg, dtrace_id_t id, void *parg)
{
	profile_probe_t *prof = parg;

	/* The timer must not be armed and no per-CPU state may remain. */
#ifdef illumos
	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
#else
	ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
#endif
	kmem_free(prof, sizeof (profile_probe_t));

	/* Release the slot claimed in profile_create(). */
	ASSERT(profile_total >= 1);
	atomic_add_32(&profile_total, -1);
}
462 
463 #ifdef illumos
/*
 * illumos only: "online" callback installed by profile_enable() for the
 * omnipresent cyclic of a PROF_PROFILE probe.  Allocates the per-CPU state
 * and fills in the handler (profile_fire) and the firing time/interval.
 */
/*ARGSUSED*/
static void
profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
{
	profile_probe_t *prof = arg;
	profile_probe_percpu_t *pcpu;

	pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
	pcpu->profc_probe = prof;

	hdlr->cyh_func = profile_fire;
	hdlr->cyh_arg = pcpu;

	/* First firing is one full interval from now. */
	when->cyt_interval = prof->prof_interval;
	when->cyt_when = gethrtime() + when->cyt_interval;

	/* Remember the schedule in the per-CPU state as well. */
	pcpu->profc_expected = when->cyt_when;
	pcpu->profc_interval = when->cyt_interval;
}
483 
/*
 * illumos only: "offline" callback installed by profile_enable() for the
 * omnipresent cyclic; frees the per-CPU state allocated in
 * profile_online(), which is passed back as 'oarg'.
 */
/*ARGSUSED*/
static void
profile_offline(void *arg, cpu_t *cpu, void *oarg)
{
	profile_probe_percpu_t *pcpu = oarg;

	ASSERT(pcpu->profc_probe == arg);
	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
}
493 
494 /* ARGSUSED */
495 static void
496 profile_enable(void *arg, dtrace_id_t id, void *parg)
497 {
498 	profile_probe_t *prof = parg;
499 	cyc_omni_handler_t omni;
500 	cyc_handler_t hdlr;
501 	cyc_time_t when;
502 
503 	ASSERT(prof->prof_interval != 0);
504 	ASSERT(MUTEX_HELD(&cpu_lock));
505 
506 	if (prof->prof_kind == PROF_TICK) {
507 		hdlr.cyh_func = profile_tick;
508 		hdlr.cyh_arg = prof;
509 
510 		when.cyt_interval = prof->prof_interval;
511 		when.cyt_when = gethrtime() + when.cyt_interval;
512 	} else {
513 		ASSERT(prof->prof_kind == PROF_PROFILE);
514 		omni.cyo_online = profile_online;
515 		omni.cyo_offline = profile_offline;
516 		omni.cyo_arg = prof;
517 	}
518 
519 	if (prof->prof_kind == PROF_TICK) {
520 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
521 	} else {
522 		prof->prof_cyclic = cyclic_add_omni(&omni);
523 	}
524 }
525 
/*
 * illumos only: provider "disable" entry point.  Removes the cyclic added
 * by profile_enable() and marks the probe as having none.  Must be called
 * with cpu_lock held.
 */
/* ARGSUSED */
static void
profile_disable(void *arg, dtrace_id_t id, void *parg)
{
	profile_probe_t *prof = parg;

	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
	ASSERT(MUTEX_HELD(&cpu_lock));

	cyclic_remove(prof->prof_cyclic);
	prof->prof_cyclic = CYCLIC_NONE;
}
538 
539 #else
540 
/*
 * Start a PROF_PROFILE probe: allocate the per-CPU pointer array (indexed
 * by CPU id, mp_maxid + 1 slots) and, for every CPU visited by
 * CPU_FOREACH(), allocate per-CPU state and arm a callout on that CPU with
 * profile_fire() as its handler.
 */
static void
profile_enable_omni(profile_probe_t *prof)
{
	profile_probe_percpu_t *pcpu;
	int cpu;

	/* sizeof(pcpu) is the size of one pointer: this is a pointer array. */
	prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
	CPU_FOREACH(cpu) {
		pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
		prof->prof_pcpus[cpu] = pcpu;
		pcpu->profc_probe = prof;
		/* First firing is one full interval from now. */
		pcpu->profc_expected = sbinuptime() + prof->prof_interval;
		pcpu->profc_interval = prof->prof_interval;
		callout_init(&pcpu->profc_cyclic, 1);
		callout_reset_sbt_on(&pcpu->profc_cyclic,
		    pcpu->profc_expected, 0, profile_fire, pcpu,
		    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
	}
}
560 
/*
 * Stop a PROF_PROFILE probe: for every CPU, stop and drain its callout and
 * free its per-CPU state, then free the pointer array and reset
 * prof_pcpus to NULL (the state profile_destroy() asserts).
 */
static void
profile_disable_omni(profile_probe_t *prof)
{
	profile_probe_percpu_t *pcpu;
	int cpu;

	ASSERT(prof->prof_pcpus != NULL);
	CPU_FOREACH(cpu) {
		pcpu = prof->prof_pcpus[cpu];
		ASSERT(pcpu->profc_probe == prof);
		ASSERT(callout_active(&pcpu->profc_cyclic));
		/* The callout is drained before its storage is freed. */
		callout_stop(&pcpu->profc_cyclic);
		callout_drain(&pcpu->profc_cyclic);
		kmem_free(pcpu, sizeof(profile_probe_percpu_t));
	}
	/* Size must match the allocation in profile_enable_omni(). */
	kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
	prof->prof_pcpus = NULL;
}
579 
580 /* ARGSUSED */
581 static void
582 profile_enable(void *arg, dtrace_id_t id, void *parg)
583 {
584 	profile_probe_t *prof = parg;
585 
586 	if (prof->prof_kind == PROF_TICK) {
587 		prof->prof_expected = sbinuptime() + prof->prof_interval;
588 		callout_reset_sbt(&prof->prof_cyclic,
589 		    prof->prof_expected, 0, profile_tick, prof,
590 		    C_DIRECT_EXEC | C_ABSOLUTE);
591 	} else {
592 		ASSERT(prof->prof_kind == PROF_PROFILE);
593 		profile_enable_omni(prof);
594 	}
595 }
596 
597 /* ARGSUSED */
598 static void
599 profile_disable(void *arg, dtrace_id_t id, void *parg)
600 {
601 	profile_probe_t *prof = parg;
602 
603 	if (prof->prof_kind == PROF_TICK) {
604 		ASSERT(callout_active(&prof->prof_cyclic));
605 		callout_stop(&prof->prof_cyclic);
606 		callout_drain(&prof->prof_cyclic);
607 	} else {
608 		ASSERT(prof->prof_kind == PROF_PROFILE);
609 		profile_disable_omni(prof);
610 	}
611 }
612 #endif
613 
614 static void
615 profile_load(void *dummy)
616 {
617 	if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
618 	    NULL, &profile_pops, NULL, &profile_id) != 0)
619 		return;
620 }
621 
622 
623 static int
624 profile_unload(void)
625 {
626 	int error = 0;
627 
628 	if ((error = dtrace_unregister(profile_id)) != 0)
629 		return (error);
630 
631 	return (error);
632 }
633 
634 /* ARGSUSED */
635 static int
636 profile_modevent(module_t mod __unused, int type, void *data __unused)
637 {
638 	int error = 0;
639 
640 	switch (type) {
641 	case MOD_LOAD:
642 		break;
643 
644 	case MOD_UNLOAD:
645 		break;
646 
647 	case MOD_SHUTDOWN:
648 		break;
649 
650 	default:
651 		error = EOPNOTSUPP;
652 		break;
653 
654 	}
655 	return (error);
656 }
657 
/*
 * Provider registration and teardown are driven by SYSINIT/SYSUNINIT at
 * SI_SUB_DTRACE_PROVIDER rather than by profile_modevent().
 */
SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);

/* Module declaration, version, and dependencies on dtrace and opensolaris. */
DEV_MODULE(profile, profile_modevent, NULL);
MODULE_VERSION(profile, 1);
MODULE_DEPEND(profile, dtrace, 1, 1, 1);
MODULE_DEPEND(profile, opensolaris, 1, 1, 1);
665