1 /* $OpenBSD: dt_dev.c,v 1.13 2021/04/23 07:21:02 bluhm Exp $ */ 2 3 /* 4 * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/systm.h> 21 #include <sys/param.h> 22 #include <sys/device.h> 23 #include <sys/malloc.h> 24 #include <sys/proc.h> 25 26 #include <dev/dt/dtvar.h> 27 28 /* 29 * Number of frames to skip in stack traces. 30 * 31 * The number of frames required to execute dt(4) profiling code 32 * depends on the probe, context, architecture and possibly the 33 * compiler. 34 * 35 * Static probes (tracepoints) are executed in the context of the 36 * current thread and only need to skip frames up to the recording 37 * function. For example the syscall provider: 38 * 39 * dt_prov_syscall_entry+0x141 40 * syscall+0x205 <--- start here 41 * Xsyscall+0x128 42 * 43 * Probes executed in their own context, like the profile provider, 44 * need to skip the frames of that context which are different for 45 * every architecture. For example the profile provider executed 46 * from hardclock(9) on amd64: 47 * 48 * dt_prov_profile_enter+0x6e 49 * hardclock+0x1a9 50 * lapic_clockintr+0x3f 51 * Xresume_lapic_ltimer+0x26 52 * acpicpu_idle+0x1d2 <---- start here. 53 * sched_idle+0x225 54 * proc_trampoline+0x1c 55 */ 56 #if defined(__amd64__) 57 #define DT_FA_PROFILE 5 58 #define DT_FA_STATIC 2 59 #elif defined(__powerpc64__) 60 #define DT_FA_PROFILE 6 61 #define DT_FA_STATIC 2 62 #elif defined(__sparc64__) 63 #define DT_FA_PROFILE 5 64 #define DT_FA_STATIC 1 65 #else 66 #define DT_FA_STATIC 0 67 #define DT_FA_PROFILE 0 68 #endif 69 70 #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */ 71 72 #define DPRINTF(x...) /* nothing */ 73 74 /* 75 * Descriptor associated with each program opening /dev/dt. It is used 76 * to keep track of enabled PCBs. 77 * 78 * Locks used to protect struct members in this file: 79 * m per-softc mutex 80 * K kernel lock 81 */ 82 struct dt_softc { 83 SLIST_ENTRY(dt_softc) ds_next; /* [K] descriptor list */ 84 int ds_unit; /* [I] D_CLONE unique unit */ 85 pid_t ds_pid; /* [I] PID of tracing program */ 86 87 struct mutex ds_mtx; 88 89 struct dt_pcb_list ds_pcbs; /* [K] list of enabled PCBs */ 90 struct dt_evt *ds_bufqueue; /* [K] copy evts to userland */ 91 size_t ds_bufqlen; /* [K] length of the queue */ 92 int ds_recording; /* [K] currently recording? */ 93 int ds_evtcnt; /* [m] # of readable evts */ 94 95 /* Counters */ 96 uint64_t ds_readevt; /* [m] # of events read */ 97 uint64_t ds_dropevt; /* [m] # of events dropped */ 98 }; 99 100 SLIST_HEAD(, dt_softc) dtdev_list; /* [K] list of open /dev/dt nodes */ 101 102 /* 103 * Probes are created during dt_attach() and never modified/freed during 104 * the lifetime of the system. That's why we consider them as [I]mmutable. 105 */ 106 unsigned int dt_nprobes; /* [I] # of probes available */ 107 SIMPLEQ_HEAD(, dt_probe) dt_probe_list; /* [I] list of probes */ 108 109 struct rwlock dt_lock = RWLOCK_INITIALIZER("dtlk"); 110 volatile uint32_t dt_tracing = 0; /* [K] # of processes tracing */ 111 112 int allowdt; 113 114 void dtattach(struct device *, struct device *, void *); 115 int dtopen(dev_t, int, int, struct proc *); 116 int dtclose(dev_t, int, int, struct proc *); 117 int dtread(dev_t, struct uio *, int); 118 int dtioctl(dev_t, u_long, caddr_t, int, struct proc *); 119 120 struct dt_softc *dtlookup(int); 121 122 int dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *); 123 int dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *); 124 int dt_ioctl_record_start(struct dt_softc *); 125 void dt_ioctl_record_stop(struct dt_softc *); 126 int dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *); 127 void dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *); 128 129 int dt_pcb_ring_copy(struct dt_pcb *, struct dt_evt *, size_t, uint64_t *); 130 131 void 132 dtattach(struct device *parent, struct device *self, void *aux) 133 { 134 SLIST_INIT(&dtdev_list); 135 SIMPLEQ_INIT(&dt_probe_list); 136 137 /* Init providers */ 138 dt_nprobes += dt_prov_profile_init(); 139 dt_nprobes += dt_prov_syscall_init(); 140 dt_nprobes += dt_prov_static_init(); 141 142 printf("dt: %u probes\n", dt_nprobes); 143 } 144 145 int 146 dtopen(dev_t dev, int flags, int mode, struct proc *p) 147 { 148 struct dt_softc *sc; 149 int unit = minor(dev); 150 151 if (!allowdt) 152 return EPERM; 153 154 KASSERT(dtlookup(unit) == NULL); 155 156 sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); 157 if (sc == NULL) 158 return ENOMEM; 159 160 /* 161 * Enough space to empty 2 full rings of events in a single read. 162 */ 163 sc->ds_bufqlen = 2 * DT_EVTRING_SIZE; 164 sc->ds_bufqueue = mallocarray(sc->ds_bufqlen, sizeof(*sc->ds_bufqueue), 165 M_DEVBUF, M_WAITOK|M_CANFAIL); 166 if (sc->ds_bufqueue == NULL) 167 goto bad; 168 169 sc->ds_unit = unit; 170 sc->ds_pid = p->p_p->ps_pid; 171 TAILQ_INIT(&sc->ds_pcbs); 172 mtx_init(&sc->ds_mtx, IPL_HIGH); 173 sc->ds_evtcnt = 0; 174 sc->ds_readevt = 0; 175 sc->ds_dropevt = 0; 176 177 SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next); 178 179 DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid); 180 181 return 0; 182 183 bad: 184 free(sc, M_DEVBUF, sizeof(*sc)); 185 return ENOMEM; 186 } 187 188 int 189 dtclose(dev_t dev, int flags, int mode, struct proc *p) 190 { 191 struct dt_softc *sc; 192 int unit = minor(dev); 193 194 sc = dtlookup(unit); 195 KASSERT(sc != NULL); 196 197 DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid); 198 199 SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next); 200 dt_ioctl_record_stop(sc); 201 dt_pcb_purge(&sc->ds_pcbs); 202 203 free(sc->ds_bufqueue, M_DEVBUF, 204 sc->ds_bufqlen * sizeof(*sc->ds_bufqueue)); 205 free(sc, M_DEVBUF, sizeof(*sc)); 206 207 return 0; 208 } 209 210 int 211 dtread(dev_t dev, struct uio *uio, int flags) 212 { 213 struct sleep_state sls; 214 struct dt_softc *sc; 215 struct dt_evt *estq; 216 struct dt_pcb *dp; 217 int error = 0, unit = minor(dev); 218 size_t qlen, count, read = 0; 219 uint64_t dropped = 0; 220 221 sc = dtlookup(unit); 222 KASSERT(sc != NULL); 223 224 count = howmany(uio->uio_resid, sizeof(struct dt_evt)); 225 if (count < 1) 226 return (EMSGSIZE); 227 228 while (!sc->ds_evtcnt) { 229 sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread", 0); 230 error = sleep_finish(&sls, !sc->ds_evtcnt); 231 if (error == EINTR || error == ERESTART) 232 break; 233 } 234 if (error) 235 return error; 236 237 estq = sc->ds_bufqueue; 238 qlen = MIN(sc->ds_bufqlen, count); 239 240 KERNEL_ASSERT_LOCKED(); 241 TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { 242 count = dt_pcb_ring_copy(dp, estq, qlen, &dropped); 243 read += count; 244 estq += count; /* pointer aritmetic */ 245 qlen -= count; 246 if (qlen == 0) 247 break; 248 } 249 if (read > 0) 250 uiomove(sc->ds_bufqueue, read * sizeof(struct dt_evt), uio); 251 252 mtx_enter(&sc->ds_mtx); 253 sc->ds_evtcnt -= read; 254 sc->ds_readevt += read; 255 sc->ds_dropevt += dropped; 256 mtx_leave(&sc->ds_mtx); 257 258 return 0; 259 } 260 261 int 262 dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p) 263 { 264 struct dt_softc *sc; 265 int unit = minor(dev); 266 int on, error = 0; 267 268 sc = dtlookup(unit); 269 KASSERT(sc != NULL); 270 271 switch (cmd) { 272 case DTIOCGPLIST: 273 return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr); 274 case DTIOCGSTATS: 275 return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr); 276 case DTIOCRECORD: 277 case DTIOCPRBENABLE: 278 /* root only ioctl(2) */ 279 break; 280 default: 281 return ENOTTY; 282 } 283 284 if ((error = suser(p)) != 0) 285 return error; 286 287 switch (cmd) { 288 case DTIOCRECORD: 289 on = *(int *)addr; 290 if (on) 291 error = dt_ioctl_record_start(sc); 292 else 293 dt_ioctl_record_stop(sc); 294 break; 295 case DTIOCPRBENABLE: 296 error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr); 297 break; 298 default: 299 KASSERT(0); 300 } 301 302 return error; 303 } 304 305 struct dt_softc * 306 dtlookup(int unit) 307 { 308 struct dt_softc *sc; 309 310 KERNEL_ASSERT_LOCKED(); 311 312 SLIST_FOREACH(sc, &dtdev_list, ds_next) { 313 if (sc->ds_unit == unit) 314 break; 315 } 316 317 return sc; 318 } 319 320 int 321 dtioc_req_isvalid(struct dtioc_req *dtrq) 322 { 323 switch (dtrq->dtrq_filter.dtf_operand) { 324 case DT_OP_NONE: 325 case DT_OP_EQ: 326 case DT_OP_NE: 327 break; 328 default: 329 return 0; 330 } 331 332 switch (dtrq->dtrq_filter.dtf_variable) { 333 case DT_FV_NONE: 334 case DT_FV_PID: 335 case DT_FV_TID: 336 break; 337 default: 338 return 0; 339 } 340 341 return 1; 342 } 343 344 int 345 dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr) 346 { 347 struct dtioc_probe_info info, *dtpi; 348 struct dt_probe *dtp; 349 size_t size; 350 int error = 0; 351 352 size = dtpr->dtpr_size; 353 dtpr->dtpr_size = dt_nprobes * sizeof(*dtpi); 354 if (size == 0) 355 return 0; 356 357 dtpi = dtpr->dtpr_probes; 358 memset(&info, 0, sizeof(info)); 359 SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) { 360 if (size < sizeof(*dtpi)) { 361 error = ENOSPC; 362 break; 363 } 364 info.dtpi_pbn = dtp->dtp_pbn; 365 info.dtpi_nargs = dtp->dtp_nargs; 366 strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name, 367 sizeof(info.dtpi_prov)); 368 strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func)); 369 strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name)); 370 error = copyout(&info, dtpi, sizeof(*dtpi)); 371 if (error) 372 break; 373 size -= sizeof(*dtpi); 374 dtpi++; 375 }; 376 377 return error; 378 } 379 380 int 381 dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst) 382 { 383 mtx_enter(&sc->ds_mtx); 384 dtst->dtst_readevt = sc->ds_readevt; 385 dtst->dtst_dropevt = sc->ds_dropevt; 386 mtx_leave(&sc->ds_mtx); 387 388 return 0; 389 } 390 391 int 392 dt_ioctl_record_start(struct dt_softc *sc) 393 { 394 struct dt_pcb *dp; 395 396 if (sc->ds_recording) 397 return EBUSY; 398 399 KERNEL_ASSERT_LOCKED(); 400 if (TAILQ_EMPTY(&sc->ds_pcbs)) 401 return ENOENT; 402 403 rw_enter_write(&dt_lock); 404 TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { 405 struct dt_probe *dtp = dp->dp_dtp; 406 407 SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); 408 dtp->dtp_recording++; 409 dtp->dtp_prov->dtpv_recording++; 410 } 411 rw_exit_write(&dt_lock); 412 413 sc->ds_recording = 1; 414 dt_tracing++; 415 416 return 0; 417 } 418 419 void 420 dt_ioctl_record_stop(struct dt_softc *sc) 421 { 422 struct dt_pcb *dp; 423 424 KASSERT(suser(curproc) == 0); 425 426 if (!sc->ds_recording) 427 return; 428 429 DPRINTF("dt%d: pid %d disable\n", sc->ds_unit, sc->ds_pid); 430 431 dt_tracing--; 432 sc->ds_recording = 0; 433 434 rw_enter_write(&dt_lock); 435 TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { 436 struct dt_probe *dtp = dp->dp_dtp; 437 438 dtp->dtp_recording--; 439 dtp->dtp_prov->dtpv_recording--; 440 SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); 441 } 442 rw_exit_write(&dt_lock); 443 444 /* Wait until readers cannot access the PCBs. */ 445 smr_barrier(); 446 } 447 448 int 449 dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq) 450 { 451 struct dt_pcb_list plist; 452 struct dt_probe *dtp; 453 int error; 454 455 KASSERT(suser(curproc) == 0); 456 457 if (!dtioc_req_isvalid(dtrq)) 458 return EINVAL; 459 460 SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) { 461 if (dtp->dtp_pbn == dtrq->dtrq_pbn) 462 break; 463 } 464 if (dtp == NULL) 465 return ENOENT; 466 467 TAILQ_INIT(&plist); 468 error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq); 469 if (error) 470 return error; 471 472 DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid, 473 dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS); 474 475 /* Append all PCBs to this instance */ 476 TAILQ_CONCAT(&sc->ds_pcbs, &plist, dp_snext); 477 478 return 0; 479 } 480 481 struct dt_probe * 482 dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv) 483 { 484 struct dt_probe *dtp; 485 486 dtp = malloc(sizeof(*dtp), M_DT, M_NOWAIT|M_ZERO); 487 if (dtp == NULL) 488 return NULL; 489 490 SMR_SLIST_INIT(&dtp->dtp_pcbs); 491 dtp->dtp_prov = dtpv; 492 dtp->dtp_func = func; 493 dtp->dtp_name = name; 494 dtp->dtp_sysnum = -1; 495 496 return dtp; 497 } 498 499 void 500 dt_dev_register_probe(struct dt_probe *dtp) 501 { 502 static uint64_t probe_nb; 503 504 dtp->dtp_pbn = ++probe_nb; 505 SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next); 506 } 507 508 struct dt_pcb * 509 dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc) 510 { 511 struct dt_pcb *dp; 512 513 dp = malloc(sizeof(*dp), M_DT, M_WAITOK|M_CANFAIL|M_ZERO); 514 if (dp == NULL) 515 goto bad; 516 517 dp->dp_ring = mallocarray(DT_EVTRING_SIZE, sizeof(*dp->dp_ring), M_DT, 518 M_WAITOK|M_CANFAIL|M_ZERO); 519 if (dp->dp_ring == NULL) 520 goto bad; 521 522 mtx_init(&dp->dp_mtx, IPL_HIGH); 523 dp->dp_sc = sc; 524 dp->dp_dtp = dtp; 525 return dp; 526 bad: 527 dt_pcb_free(dp); 528 return NULL; 529 } 530 531 void 532 dt_pcb_free(struct dt_pcb *dp) 533 { 534 if (dp == NULL) 535 return; 536 free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); 537 free(dp, M_DT, sizeof(*dp)); 538 } 539 540 void 541 dt_pcb_purge(struct dt_pcb_list *plist) 542 { 543 struct dt_pcb *dp; 544 545 while ((dp = TAILQ_FIRST(plist)) != NULL) { 546 TAILQ_REMOVE(plist, dp, dp_snext); 547 dt_pcb_free(dp); 548 } 549 } 550 551 int 552 dt_pcb_filter(struct dt_pcb *dp) 553 { 554 struct dt_filter *dtf = &dp->dp_filter; 555 struct proc *p = curproc; 556 unsigned int var; 557 int match = 1; 558 559 /* Filter out tracing program. */ 560 if (dp->dp_sc->ds_pid == p->p_p->ps_pid) 561 return 1; 562 563 switch (dtf->dtf_variable) { 564 case DT_FV_PID: 565 var = p->p_p->ps_pid; 566 break; 567 case DT_FV_TID: 568 var = p->p_tid; 569 break; 570 case DT_FV_NONE: 571 break; 572 default: 573 KASSERT(0); 574 } 575 576 switch (dtf->dtf_operand) { 577 case DT_OP_EQ: 578 match = !!(var == dtf->dtf_value); 579 break; 580 case DT_OP_NE: 581 match = !!(var != dtf->dtf_value); 582 break; 583 case DT_OP_NONE: 584 break; 585 default: 586 KASSERT(0); 587 } 588 589 return !match; 590 } 591 592 593 /* 594 * Get a reference to the next free event state from the ring. 595 */ 596 struct dt_evt * 597 dt_pcb_ring_get(struct dt_pcb *dp, int profiling) 598 { 599 struct proc *p = curproc; 600 struct dt_evt *dtev; 601 int distance; 602 603 if (dt_pcb_filter(dp)) 604 return NULL; 605 606 mtx_enter(&dp->dp_mtx); 607 distance = dp->dp_prod - dp->dp_cons; 608 if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) { 609 /* read(2) isn't finished */ 610 dp->dp_dropevt++; 611 mtx_leave(&dp->dp_mtx); 612 return NULL; 613 } 614 615 /* 616 * Save states in next free event slot. 617 */ 618 dtev = &dp->dp_ring[dp->dp_cons]; 619 memset(dtev, 0, sizeof(*dtev)); 620 621 dtev->dtev_pbn = dp->dp_dtp->dtp_pbn; 622 dtev->dtev_cpu = cpu_number(); 623 dtev->dtev_pid = p->p_p->ps_pid; 624 dtev->dtev_tid = p->p_tid; 625 nanotime(&dtev->dtev_tsp); 626 627 if (ISSET(dp->dp_evtflags, DTEVT_EXECNAME)) 628 memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1); 629 630 if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) { 631 if (profiling) 632 stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE); 633 else 634 stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC); 635 } 636 637 return dtev; 638 } 639 640 void 641 dt_pcb_ring_consume(struct dt_pcb *dp, struct dt_evt *dtev) 642 { 643 MUTEX_ASSERT_LOCKED(&dp->dp_mtx); 644 KASSERT(dtev == &dp->dp_ring[dp->dp_cons]); 645 646 dp->dp_cons = (dp->dp_cons + 1) % DT_EVTRING_SIZE; 647 mtx_leave(&dp->dp_mtx); 648 649 mtx_enter(&dp->dp_sc->ds_mtx); 650 dp->dp_sc->ds_evtcnt++; 651 mtx_leave(&dp->dp_sc->ds_mtx); 652 wakeup(dp->dp_sc); 653 } 654 655 /* 656 * Copy at most `qlen' events from `dp', producing the same amount 657 * of free slots. 658 */ 659 int 660 dt_pcb_ring_copy(struct dt_pcb *dp, struct dt_evt *estq, size_t qlen, 661 uint64_t *dropped) 662 { 663 size_t count, copied = 0; 664 unsigned int cons, prod; 665 666 KASSERT(qlen > 0); 667 668 mtx_enter(&dp->dp_mtx); 669 cons = dp->dp_cons; 670 prod = dp->dp_prod; 671 672 if (cons < prod) 673 count = DT_EVTRING_SIZE - prod; 674 else 675 count = cons - prod; 676 677 if (count == 0) 678 goto out; 679 680 *dropped += dp->dp_dropevt; 681 dp->dp_dropevt = 0; 682 683 count = MIN(count, qlen); 684 685 memcpy(&estq[0], &dp->dp_ring[prod], count * sizeof(*estq)); 686 copied += count; 687 688 /* Produce */ 689 prod = (prod + count) % DT_EVTRING_SIZE; 690 691 /* If the queue is full or the ring didn't wrap, stop here. */ 692 if (qlen == copied || prod != 0 || cons == 0) 693 goto out; 694 695 count = MIN(cons, (qlen - copied)); 696 memcpy(&estq[copied], &dp->dp_ring[0], count * sizeof(*estq)); 697 copied += count; 698 prod += count; 699 700 out: 701 dp->dp_prod = prod; 702 mtx_leave(&dp->dp_mtx); 703 return copied; 704 } 705