1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
28 */
29
30 #include <sys/errno.h>
31 #include <sys/stat.h>
32 #include <sys/modctl.h>
33 #include <sys/conf.h>
34 #include <sys/systm.h>
35 #include <sys/ddi.h>
36 #include <sys/sunddi.h>
37 #include <sys/cpuvar.h>
38 #include <sys/kmem.h>
39 #include <sys/strsubr.h>
40 #include <sys/dtrace.h>
41 #include <sys/cyclic.h>
42 #include <sys/atomic.h>
43
44 static dev_info_t *profile_devi;
45 static dtrace_provider_id_t profile_id;
46
47 /*
48 * Regardless of platform, the stack frames look like this in the case of the
49 * profile provider:
50 *
51 * profile_fire
52 * cyclic_expire
53 * cyclic_fire
54 * [ cbe ]
55 * [ interrupt code ]
56 *
57 * On x86, there are five frames from the generic interrupt code; further, the
58 * interrupted instruction appears as its own stack frame, giving us a total of
59 * 10.
60 *
61 * On SPARC, the picture is further complicated because the compiler
62 * optimizes away tail-calls -- so the following frames are optimized away:
63 *
64 * profile_fire
65 * cyclic_expire
66 *
67 * This gives three frames. However, on DEBUG kernels, the cyclic_expire
68 * frame cannot be tail-call eliminated, yielding four frames in this case.
69 *
70 * All of the above constraints lead to the mess below. Yes, the profile
71 * provider should ideally figure this out on-the-fly by hitting one of its own
72 * probes and then walking its own stack trace. This is complicated, however,
73 * and the static definition doesn't seem to be overly brittle. Still, we
74 * allow for a manual override in case we get it completely wrong.
75 */
/*
 * Per-platform count of artificial frames, following the frame accounting in
 * the comment above.  Note that no value is defined when neither __x86 nor
 * __sparc is set.
 */
#ifdef __x86
#define	PROF_ARTIFICIAL_FRAMES	10
#else
#ifdef __sparc
#ifdef DEBUG
#define	PROF_ARTIFICIAL_FRAMES	4	/* cyclic_expire frame is kept */
#else
#define	PROF_ARTIFICIAL_FRAMES	3	/* tail calls are eliminated */
#endif	/* DEBUG */
#endif	/* __sparc */
#endif	/* __x86 */
87
/*
 * Size, in bytes, of the buffers used to hold a probe name: prof_name in
 * profile_probe_t and the scratch buffer in profile_provide().
 */
#define	PROF_NAMELEN		15

/*
 * The two kinds of probe this provider creates (stored in prof_kind), and
 * the probe-name prefix that selects each kind.
 */
#define	PROF_PROFILE		0
#define	PROF_TICK		1
#define	PROF_PREFIX_PROFILE	"profile-"
#define	PROF_PREFIX_TICK	"tick-"
94
/*
 * Per-probe state.  One of these is allocated by profile_create() for each
 * probe and released by profile_destroy().
 */
typedef struct profile_probe {
	char		prof_name[PROF_NAMELEN];	/* copy of probe name */
	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
	hrtime_t	prof_interval;	/* firing interval, in nanoseconds */
	cyclic_id_t	prof_cyclic;	/* cyclic, or CYCLIC_NONE if disabled */
} profile_probe_t;
102
/*
 * State handed to profile_fire() for a PROF_PROFILE probe.  It is allocated
 * in profile_online() and freed in profile_offline().
 */
typedef struct profile_probe_percpu {
	hrtime_t	profc_expected;	/* time the next firing is due */
	hrtime_t	profc_interval;	/* interval added after each firing */
	profile_probe_t	*profc_probe;	/* probe this state belongs to */
} profile_probe_percpu_t;
108
/*
 * profile_interval_min is the shortest interval, in nanoseconds, that
 * profile_create() will accept; requests for anything shorter are ignored.
 * profile_aframes, when non-zero, replaces the computed number of artificial
 * frames passed to dtrace_probe_create().
 */
hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
int		profile_aframes = 0;				/* override */
111
/*
 * Rates, in firings per second, of the profile-<rate> probes created when
 * profile_provide() is called without a description.  Zero entries are
 * skipped.
 * NOTE(review): the trailing zero slots presumably leave room for patching
 * in additional rates -- confirm.
 */
static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
118
/*
 * Rates, in firings per second, of the tick-<rate> probes created when
 * profile_provide() is called without a description.  Zero entries are
 * skipped.
 */
static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};
124
/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausting
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 *
 * profile_total is incremented in profile_create() and decremented in
 * profile_destroy(); both use atomic operations.
 */
#define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
static uint32_t profile_max;		/* maximum number of profile probes */
static uint32_t profile_total;		/* current number of profile probes */
135
136 static void
profile_fire(void * arg)137 profile_fire(void *arg)
138 {
139 profile_probe_percpu_t *pcpu = arg;
140 profile_probe_t *prof = pcpu->profc_probe;
141 hrtime_t late;
142
143 late = dtrace_gethrtime() - pcpu->profc_expected;
144 pcpu->profc_expected += pcpu->profc_interval;
145
146 dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
147 CPU->cpu_profile_upc, late, 0, 0);
148 }
149
150 static void
profile_tick(void * arg)151 profile_tick(void *arg)
152 {
153 profile_probe_t *prof = arg;
154
155 dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
156 CPU->cpu_profile_upc, 0, 0, 0);
157 }
158
159 static void
profile_create(hrtime_t interval,const char * name,int kind)160 profile_create(hrtime_t interval, const char *name, int kind)
161 {
162 profile_probe_t *prof;
163 int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
164
165 if (profile_aframes)
166 nr_frames = profile_aframes;
167
168 if (interval < profile_interval_min)
169 return;
170
171 if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
172 return;
173
174 atomic_inc_32(&profile_total);
175 if (profile_total > profile_max) {
176 atomic_dec_32(&profile_total);
177 return;
178 }
179
180 prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
181 (void) strcpy(prof->prof_name, name);
182 prof->prof_interval = interval;
183 prof->prof_cyclic = CYCLIC_NONE;
184 prof->prof_kind = kind;
185 prof->prof_id = dtrace_probe_create(profile_id,
186 NULL, NULL, name, nr_frames, prof);
187 }
188
189 /*ARGSUSED*/
190 static void
profile_provide(void * arg,const dtrace_probedesc_t * desc)191 profile_provide(void *arg, const dtrace_probedesc_t *desc)
192 {
193 int i, j, rate, kind;
194 hrtime_t val = 0, mult = 1, len;
195 const char *name, *suffix = NULL;
196
197 const struct {
198 char *prefix;
199 int kind;
200 } types[] = {
201 { PROF_PREFIX_PROFILE, PROF_PROFILE },
202 { PROF_PREFIX_TICK, PROF_TICK },
203 { NULL, 0 }
204 };
205
206 const struct {
207 char *name;
208 hrtime_t mult;
209 } suffixes[] = {
210 { "ns", NANOSEC / NANOSEC },
211 { "nsec", NANOSEC / NANOSEC },
212 { "us", NANOSEC / MICROSEC },
213 { "usec", NANOSEC / MICROSEC },
214 { "ms", NANOSEC / MILLISEC },
215 { "msec", NANOSEC / MILLISEC },
216 { "s", NANOSEC / SEC },
217 { "sec", NANOSEC / SEC },
218 { "m", NANOSEC * (hrtime_t)60 },
219 { "min", NANOSEC * (hrtime_t)60 },
220 { "h", NANOSEC * (hrtime_t)(60 * 60) },
221 { "hour", NANOSEC * (hrtime_t)(60 * 60) },
222 { "d", NANOSEC * (hrtime_t)(24 * 60 * 60) },
223 { "day", NANOSEC * (hrtime_t)(24 * 60 * 60) },
224 { "hz", 0 },
225 { NULL }
226 };
227
228 if (desc == NULL) {
229 char n[PROF_NAMELEN];
230
231 /*
232 * If no description was provided, provide all of our probes.
233 */
234 for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
235 if ((rate = profile_rates[i]) == 0)
236 continue;
237
238 (void) snprintf(n, PROF_NAMELEN, "%s%d",
239 PROF_PREFIX_PROFILE, rate);
240 profile_create(NANOSEC / rate, n, PROF_PROFILE);
241 }
242
243 for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
244 if ((rate = profile_ticks[i]) == 0)
245 continue;
246
247 (void) snprintf(n, PROF_NAMELEN, "%s%d",
248 PROF_PREFIX_TICK, rate);
249 profile_create(NANOSEC / rate, n, PROF_TICK);
250 }
251
252 return;
253 }
254
255 name = desc->dtpd_name;
256
257 for (i = 0; types[i].prefix != NULL; i++) {
258 len = strlen(types[i].prefix);
259
260 if (strncmp(name, types[i].prefix, len) != 0)
261 continue;
262 break;
263 }
264
265 if (types[i].prefix == NULL)
266 return;
267
268 kind = types[i].kind;
269 j = strlen(name) - len;
270
271 /*
272 * We need to start before any time suffix.
273 */
274 for (j = strlen(name); j >= len; j--) {
275 if (name[j] >= '0' && name[j] <= '9')
276 break;
277 suffix = &name[j];
278 }
279
280 ASSERT(suffix != NULL);
281
282 /*
283 * Now determine the numerical value present in the probe name.
284 */
285 for (; j >= len; j--) {
286 if (name[j] < '0' || name[j] > '9')
287 return;
288
289 val += (name[j] - '0') * mult;
290 mult *= (hrtime_t)10;
291 }
292
293 if (val == 0)
294 return;
295
296 /*
297 * Look-up the suffix to determine the multiplier.
298 */
299 for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
300 if (strcasecmp(suffixes[i].name, suffix) == 0) {
301 mult = suffixes[i].mult;
302 break;
303 }
304 }
305
306 if (suffixes[i].name == NULL && *suffix != '\0')
307 return;
308
309 if (mult == 0) {
310 /*
311 * The default is frequency-per-second.
312 */
313 val = NANOSEC / val;
314 } else {
315 val *= mult;
316 }
317
318 profile_create(val, name, kind);
319 }
320
321 /*ARGSUSED*/
322 static void
profile_destroy(void * arg,dtrace_id_t id,void * parg)323 profile_destroy(void *arg, dtrace_id_t id, void *parg)
324 {
325 profile_probe_t *prof = parg;
326
327 ASSERT(prof->prof_cyclic == CYCLIC_NONE);
328 kmem_free(prof, sizeof (profile_probe_t));
329
330 ASSERT(profile_total >= 1);
331 atomic_dec_32(&profile_total);
332 }
333
334 /*ARGSUSED*/
335 static void
profile_online(void * arg,cpu_t * cpu,cyc_handler_t * hdlr,cyc_time_t * when)336 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
337 {
338 profile_probe_t *prof = arg;
339 profile_probe_percpu_t *pcpu;
340
341 pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
342 pcpu->profc_probe = prof;
343
344 hdlr->cyh_func = profile_fire;
345 hdlr->cyh_arg = pcpu;
346 hdlr->cyh_level = CY_HIGH_LEVEL;
347
348 when->cyt_interval = prof->prof_interval;
349 when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
350
351 pcpu->profc_expected = when->cyt_when;
352 pcpu->profc_interval = when->cyt_interval;
353 }
354
355 /*ARGSUSED*/
356 static void
profile_offline(void * arg,cpu_t * cpu,void * oarg)357 profile_offline(void *arg, cpu_t *cpu, void *oarg)
358 {
359 profile_probe_percpu_t *pcpu = oarg;
360
361 ASSERT(pcpu->profc_probe == arg);
362 kmem_free(pcpu, sizeof (profile_probe_percpu_t));
363 }
364
365 /*ARGSUSED*/
366 static int
profile_enable(void * arg,dtrace_id_t id,void * parg)367 profile_enable(void *arg, dtrace_id_t id, void *parg)
368 {
369 profile_probe_t *prof = parg;
370 cyc_omni_handler_t omni;
371 cyc_handler_t hdlr;
372 cyc_time_t when;
373
374 ASSERT(prof->prof_interval != 0);
375 ASSERT(MUTEX_HELD(&cpu_lock));
376
377 if (prof->prof_kind == PROF_TICK) {
378 hdlr.cyh_func = profile_tick;
379 hdlr.cyh_arg = prof;
380 hdlr.cyh_level = CY_HIGH_LEVEL;
381
382 when.cyt_interval = prof->prof_interval;
383 when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
384 } else {
385 ASSERT(prof->prof_kind == PROF_PROFILE);
386 omni.cyo_online = profile_online;
387 omni.cyo_offline = profile_offline;
388 omni.cyo_arg = prof;
389 }
390
391 if (prof->prof_kind == PROF_TICK) {
392 prof->prof_cyclic = cyclic_add(&hdlr, &when);
393 } else {
394 prof->prof_cyclic = cyclic_add_omni(&omni);
395 }
396 return (0);
397 }
398
399 /*ARGSUSED*/
400 static void
profile_disable(void * arg,dtrace_id_t id,void * parg)401 profile_disable(void *arg, dtrace_id_t id, void *parg)
402 {
403 profile_probe_t *prof = parg;
404
405 ASSERT(prof->prof_cyclic != CYCLIC_NONE);
406 ASSERT(MUTEX_HELD(&cpu_lock));
407
408 cyclic_remove(prof->prof_cyclic);
409 prof->prof_cyclic = CYCLIC_NONE;
410 }
411
412 /*ARGSUSED*/
413 static int
profile_mode(void * arg,dtrace_id_t id,void * parg)414 profile_mode(void *arg, dtrace_id_t id, void *parg)
415 {
416 profile_probe_t *prof = parg;
417 int mode;
418
419 if (CPU->cpu_profile_pc != 0) {
420 mode = DTRACE_MODE_KERNEL;
421 } else {
422 mode = DTRACE_MODE_USER;
423 }
424
425 if (prof->prof_kind == PROF_TICK) {
426 mode |= DTRACE_MODE_NOPRIV_RESTRICT;
427 } else {
428 ASSERT(prof->prof_kind == PROF_PROFILE);
429 mode |= DTRACE_MODE_NOPRIV_DROP;
430 }
431
432 return (mode);
433 }
434
/*
 * Stability attributes passed to dtrace_register() for this provider.
 * NOTE(review): the five rows presumably follow the member order of
 * dtrace_pattr_t in <sys/dtrace.h> -- confirm before reordering.
 */
static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
};
442
/*
 * Provider operations vector passed to dtrace_register().  The entries are
 * positional; NULL marks an operation this provider does not supply.
 * NOTE(review): slot order presumably follows dtrace_pops_t in
 * <sys/dtrace.h> -- confirm before reordering.
 */
static dtrace_pops_t profile_pops = {
	profile_provide,
	NULL,
	profile_enable,
	profile_disable,
	NULL,
	NULL,
	NULL,
	NULL,
	profile_mode,
	profile_destroy
};
455
456 static int
profile_attach(dev_info_t * devi,ddi_attach_cmd_t cmd)457 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
458 {
459 switch (cmd) {
460 case DDI_ATTACH:
461 break;
462 case DDI_RESUME:
463 return (DDI_SUCCESS);
464 default:
465 return (DDI_FAILURE);
466 }
467
468 if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
469 DDI_PSEUDO, 0) == DDI_FAILURE ||
470 dtrace_register("profile", &profile_attr,
471 DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
472 &profile_pops, NULL, &profile_id) != 0) {
473 ddi_remove_minor_node(devi, NULL);
474 return (DDI_FAILURE);
475 }
476
477 profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
478 "profile-max-probes", PROFILE_MAX_DEFAULT);
479
480 ddi_report_dev(devi);
481 profile_devi = devi;
482 return (DDI_SUCCESS);
483 }
484
485 static int
profile_detach(dev_info_t * devi,ddi_detach_cmd_t cmd)486 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
487 {
488 switch (cmd) {
489 case DDI_DETACH:
490 break;
491 case DDI_SUSPEND:
492 return (DDI_SUCCESS);
493 default:
494 return (DDI_FAILURE);
495 }
496
497 if (dtrace_unregister(profile_id) != 0)
498 return (DDI_FAILURE);
499
500 ddi_remove_minor_node(devi, NULL);
501 return (DDI_SUCCESS);
502 }
503
504 /*ARGSUSED*/
505 static int
profile_info(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)506 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
507 {
508 int error;
509
510 switch (infocmd) {
511 case DDI_INFO_DEVT2DEVINFO:
512 *result = (void *)profile_devi;
513 error = DDI_SUCCESS;
514 break;
515 case DDI_INFO_DEVT2INSTANCE:
516 *result = (void *)0;
517 error = DDI_SUCCESS;
518 break;
519 default:
520 error = DDI_FAILURE;
521 }
522 return (error);
523 }
524
525 /*ARGSUSED*/
526 static int
profile_open(dev_t * devp,int flag,int otyp,cred_t * cred_p)527 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
528 {
529 return (0);
530 }
531
/*
 * Character/block entry points for the pseudo device.  Only open is backed
 * by a function in this file (profile_open); the remaining slots use the
 * generic nodev/nulldev/nochpoll/ddi_prop_op routines.
 */
static struct cb_ops profile_cb_ops = {
	profile_open,		/* open */
	nodev,			/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab  */
	D_NEW | D_MP		/* Driver compatibility flag */
};
549
/*
 * Device operations for the pseudo driver: getinfo, attach and detach are
 * implemented in this file; the cb_ops above supply the device entry points.
 */
static struct dev_ops profile_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt */
	profile_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	profile_attach,		/* attach */
	profile_detach,		/* detach */
	nodev,			/* reset */
	&profile_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,		/* quiesce */
};
564
/*
 * Module linkage information for the kernel.  modldrv describes this driver
 * and points at its dev_ops; modlinkage is what _init(), _info() and _fini()
 * hand to mod_install(), mod_info() and mod_remove().
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"Profile Interrupt Tracing",	/* name of module */
	&profile_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
579
580 int
_init(void)581 _init(void)
582 {
583 return (mod_install(&modlinkage));
584 }
585
586 int
_info(struct modinfo * modinfop)587 _info(struct modinfo *modinfop)
588 {
589 return (mod_info(&modlinkage, modinfop));
590 }
591
592 int
_fini(void)593 _fini(void)
594 {
595 return (mod_remove(&modlinkage));
596 }
597