1 /* $OpenBSD: dt_dev.c,v 1.8 2020/07/04 08:06:07 anton Exp $ */ 2 3 /* 4 * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 #include <sys/systm.h> 21 #include <sys/param.h> 22 #include <sys/device.h> 23 #include <sys/malloc.h> 24 #include <sys/proc.h> 25 26 #include <dev/dt/dtvar.h> 27 28 /* 29 * Number of frames to skip in stack traces. 30 * 31 * The number of frames required to execute dt(4) profiling code 32 * depends on the probe, context, architecture and possibly the 33 * compiler. 34 * 35 * Static probes (tracepoints) are executed in the context of the 36 * current thread and only need to skip frames up to the recording 37 * function. For example the syscall provider: 38 * 39 * dt_prov_syscall_entry+0x141 40 * syscall+0x205 <--- start here 41 * Xsyscall+0x128 42 * 43 * Probes executed in their own context, like the profile provider, 44 * need to skip the frames of that context which are different for 45 * every architecture. For example the profile provider executed 46 * from hardclock(9) on amd64: 47 * 48 * dt_prov_profile_enter+0x6e 49 * hardclock+0x1a9 50 * lapic_clockintr+0x3f 51 * Xresume_lapic_ltimer+0x26 52 * acpicpu_idle+0x1d2 <---- start here. 
 * sched_idle+0x225
 * proc_trampoline+0x1c
 */
#if defined(__amd64__)
#define DT_FA_PROFILE	5
#define DT_FA_STATIC	2
#elif defined(__sparc64__)
#define DT_FA_PROFILE	5
#define DT_FA_STATIC	1
#else
/* Unknown architecture: skip no frame, traces include dt(4) internals. */
#define DT_FA_STATIC	0
#define DT_FA_PROFILE	0
#endif

#define DT_EVTRING_SIZE	16	/* # of slots in per PCB event ring */

#define DPRINTF(x...)	/* nothing */

/*
 * Descriptor associated with each program opening /dev/dt.  It is used
 * to keep track of enabled PCBs.
 *
 * Locks used to protect struct members in this file:
 *	m	per-softc mutex
 *	K	kernel lock
 */
struct dt_softc {
	SLIST_ENTRY(dt_softc)	 ds_next;	/* [K] descriptor list */
	int			 ds_unit;	/* [I] D_CLONE unique unit */
	pid_t			 ds_pid;	/* [I] PID of tracing program */

	struct mutex		 ds_mtx;

	struct dt_pcb_list	 ds_pcbs;	/* [K] list of enabled PCBs */
	struct dt_evt		*ds_bufqueue;	/* [K] copy evts to userland */
	size_t			 ds_bufqlen;	/* [K] length of the queue */
	int			 ds_recording;	/* [K] currently recording? */
	int			 ds_evtcnt;	/* [m] # of readable evts */

	/* Counters */
	uint64_t		 ds_readevt;	/* [m] # of events read */
	uint64_t		 ds_dropevt;	/* [m] # of events dropped */
};

SLIST_HEAD(, dt_softc) dtdev_list;	/* [K] list of open /dev/dt nodes */

/*
 * Probes are created during dt_attach() and never modified/freed during
 * the lifetime of the system.  That's why we consider them as [I]mmutable.
 */
unsigned int			dt_nprobes;	/* [I] # of probes available */
SIMPLEQ_HEAD(, dt_probe)	dt_probe_list;	/* [I] list of probes */

struct rwlock			dt_lock = RWLOCK_INITIALIZER("dtlk");
volatile uint32_t		dt_tracing = 0;	/* [K] # of processes tracing */

void	dtattach(struct device *, struct device *, void *);
int	dtopen(dev_t, int, int, struct proc *);
int	dtclose(dev_t, int, int, struct proc *);
int	dtread(dev_t, struct uio *, int);
int	dtioctl(dev_t, u_long, caddr_t, int, struct proc *);

struct	dt_softc *dtlookup(int);

int	dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *);
int	dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *);
int	dt_ioctl_record_start(struct dt_softc *);
void	dt_ioctl_record_stop(struct dt_softc *);
int	dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *);
void	dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *);

int	dt_pcb_ring_copy(struct dt_pcb *, struct dt_evt *, size_t, uint64_t *);

/*
 * Autoconf attach hook: initialize the global lists and register every
 * probe provider.  Runs once at boot, before any device node is opened.
 */
void
dtattach(struct device *parent, struct device *self, void *aux)
{
	SLIST_INIT(&dtdev_list);
	SIMPLEQ_INIT(&dt_probe_list);

	/* Init providers */
	dt_nprobes += dt_prov_profile_init();
	dt_nprobes += dt_prov_syscall_init();
	dt_nprobes += dt_prov_static_init();

	printf("dt: %u probes\n", dt_nprobes);
}

/*
 * open(2) entry point.  Allocates one dt_softc per open of the cloning
 * device and links it on the global list.  Fails with EPERM unless the
 * kern.allowdt sysctl is set.
 */
int
dtopen(dev_t dev, int flags, int mode, struct proc *p)
{
	struct dt_softc *sc;
	int unit = minor(dev);
	extern int allowdt;

	if (!allowdt)
		return EPERM;

	/* D_CLONE guarantees a fresh minor, so no descriptor may exist yet. */
	KASSERT(dtlookup(unit) == NULL);

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO);
	if (sc == NULL)
		return ENOMEM;

	/*
	 * Enough space to empty 2 full rings of events in a single read.
158 */ 159 sc->ds_bufqlen = 2 * DT_EVTRING_SIZE; 160 sc->ds_bufqueue = mallocarray(sc->ds_bufqlen, sizeof(*sc->ds_bufqueue), 161 M_DEVBUF, M_WAITOK|M_CANFAIL); 162 if (sc->ds_bufqueue == NULL) 163 goto bad; 164 165 sc->ds_unit = unit; 166 sc->ds_pid = p->p_p->ps_pid; 167 TAILQ_INIT(&sc->ds_pcbs); 168 mtx_init(&sc->ds_mtx, IPL_HIGH); 169 sc->ds_evtcnt = 0; 170 sc->ds_readevt = 0; 171 sc->ds_dropevt = 0; 172 173 SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next); 174 175 DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid); 176 177 return 0; 178 179 bad: 180 free(sc, M_DEVBUF, sizeof(*sc)); 181 return ENOMEM; 182 } 183 184 int 185 dtclose(dev_t dev, int flags, int mode, struct proc *p) 186 { 187 struct dt_softc *sc; 188 int unit = minor(dev); 189 190 sc = dtlookup(unit); 191 KASSERT(sc != NULL); 192 193 DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid); 194 195 SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next); 196 dt_ioctl_record_stop(sc); 197 dt_pcb_purge(&sc->ds_pcbs); 198 199 free(sc->ds_bufqueue, M_DEVBUF, 200 sc->ds_bufqlen * sizeof(*sc->ds_bufqueue)); 201 free(sc, M_DEVBUF, sizeof(*sc)); 202 203 return 0; 204 } 205 206 int 207 dtread(dev_t dev, struct uio *uio, int flags) 208 { 209 struct sleep_state sls; 210 struct dt_softc *sc; 211 struct dt_evt *estq; 212 struct dt_pcb *dp; 213 int error, unit = minor(dev); 214 size_t qlen, count, read = 0; 215 uint64_t dropped = 0; 216 217 sc = dtlookup(unit); 218 KASSERT(sc != NULL); 219 220 count = howmany(uio->uio_resid, sizeof(struct dt_evt)); 221 if (count < 1) 222 return (EMSGSIZE); 223 224 while (!sc->ds_evtcnt) { 225 sleep_setup(&sls, sc, PWAIT | PCATCH, "dtread"); 226 sleep_setup_signal(&sls); 227 sleep_finish(&sls, !sc->ds_evtcnt); 228 error = sleep_finish_signal(&sls); 229 if (error == EINTR || error == ERESTART) 230 break; 231 } 232 if (error) 233 return error; 234 235 estq = sc->ds_bufqueue; 236 qlen = MIN(sc->ds_bufqlen, count); 237 238 KERNEL_ASSERT_LOCKED(); 239 TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { 
240 count = dt_pcb_ring_copy(dp, estq, qlen, &dropped); 241 read += count; 242 estq += count; /* pointer aritmetic */ 243 qlen -= count; 244 if (qlen == 0) 245 break; 246 } 247 if (read > 0) 248 uiomove(sc->ds_bufqueue, read * sizeof(struct dt_evt), uio); 249 250 mtx_enter(&sc->ds_mtx); 251 sc->ds_evtcnt -= read; 252 sc->ds_readevt += read; 253 sc->ds_dropevt += dropped; 254 mtx_leave(&sc->ds_mtx); 255 256 return 0; 257 } 258 259 int 260 dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p) 261 { 262 struct dt_softc *sc; 263 int unit = minor(dev); 264 int on, error = 0; 265 266 sc = dtlookup(unit); 267 KASSERT(sc != NULL); 268 269 switch (cmd) { 270 case DTIOCGPLIST: 271 return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr); 272 case DTIOCGSTATS: 273 return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr); 274 case DTIOCRECORD: 275 case DTIOCPRBENABLE: 276 /* root only ioctl(2) */ 277 break; 278 default: 279 return ENOTTY; 280 } 281 282 if ((error = suser(p)) != 0) 283 return error; 284 285 switch (cmd) { 286 case DTIOCRECORD: 287 on = *(int *)addr; 288 if (on) 289 error = dt_ioctl_record_start(sc); 290 else 291 dt_ioctl_record_stop(sc); 292 break; 293 case DTIOCPRBENABLE: 294 error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr); 295 break; 296 default: 297 KASSERT(0); 298 } 299 300 return error; 301 } 302 303 struct dt_softc * 304 dtlookup(int unit) 305 { 306 struct dt_softc *sc; 307 308 KERNEL_ASSERT_LOCKED(); 309 310 SLIST_FOREACH(sc, &dtdev_list, ds_next) { 311 if (sc->ds_unit == unit) 312 break; 313 } 314 315 return sc; 316 } 317 318 int 319 dtioc_req_isvalid(struct dtioc_req *dtrq) 320 { 321 switch (dtrq->dtrq_filter.dtf_operand) { 322 case DT_OP_NONE: 323 case DT_OP_EQ: 324 case DT_OP_NE: 325 break; 326 default: 327 return 0; 328 } 329 330 switch (dtrq->dtrq_filter.dtf_variable) { 331 case DT_FV_NONE: 332 case DT_FV_PID: 333 case DT_FV_TID: 334 break; 335 default: 336 return 0; 337 } 338 339 return 1; 340 } 341 342 int 343 
dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr)
{
	struct dtioc_probe_info info, *dtpi;
	struct dt_probe *dtp;
	size_t size;
	int error = 0;

	/*
	 * A zero-sized request is a size probe: report how much room the
	 * full list needs so userland can allocate it.
	 */
	if (dtpr->dtpr_size == 0) {
		dtpr->dtpr_size = dt_nprobes * sizeof(*dtpi);
		return 0;
	}

	size = dtpr->dtpr_size;
	dtpi = dtpr->dtpr_probes;
	memset(&info, 0, sizeof(info));
	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
		if (size < sizeof(*dtpi)) {
			error = ENOSPC;
			break;
		}
		info.dtpi_pbn = dtp->dtp_pbn;
		info.dtpi_nargs = dtp->dtp_nargs;
		strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name,
		    sizeof(info.dtpi_prov));
		strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func));
		strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name));
		error = copyout(&info, dtpi, sizeof(*dtpi));
		if (error)
			break;
		size -= sizeof(*dtpi);
		dtpi++;
	};

	return error;
}

/* Copy the per-descriptor read/drop counters out to userland. */
int
dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst)
{
	mtx_enter(&sc->ds_mtx);
	dtst->dtst_readevt = sc->ds_readevt;
	dtst->dtst_dropevt = sc->ds_dropevt;
	mtx_leave(&sc->ds_mtx);

	return 0;
}

/*
 * Start recording: publish every enabled PCB on its probe's SMR list so
 * the probe entry points start emitting events.  Fails if a recording
 * is already in progress or if no probe has been enabled.
 */
int
dt_ioctl_record_start(struct dt_softc *sc)
{
	struct dt_pcb *dp;

	if (sc->ds_recording)
		return EBUSY;

	KERNEL_ASSERT_LOCKED();
	if (TAILQ_EMPTY(&sc->ds_pcbs))
		return ENOENT;

	rw_enter_write(&dt_lock);
	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
		struct dt_probe *dtp = dp->dp_dtp;

		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
		dtp->dtp_recording++;
		dtp->dtp_prov->dtpv_recording++;
	}
	rw_exit_write(&dt_lock);

	sc->ds_recording = 1;
	dt_tracing++;

	return 0;
}

/*
 * Stop recording: unpublish every PCB from the probe SMR lists and wait
 * for concurrent readers to drain before the caller may free the PCBs.
 * No-op if this descriptor is not currently recording.
 */
void
dt_ioctl_record_stop(struct dt_softc *sc)
{
	struct dt_pcb *dp;

	KASSERT(suser(curproc) == 0);

	if (!sc->ds_recording)
		return;

	DPRINTF("dt%d: pid %d disable\n", sc->ds_unit,
	    sc->ds_pid);

	dt_tracing--;
	sc->ds_recording = 0;

	rw_enter_write(&dt_lock);
	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
		struct dt_probe *dtp = dp->dp_dtp;

		dtp->dtp_recording--;
		dtp->dtp_prov->dtpv_recording--;
		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
	}
	rw_exit_write(&dt_lock);

	/* Wait until readers cannot access the PCBs. */
	smr_barrier();
}

/*
 * Enable a probe for this descriptor: validate the request, look the
 * probe up by number and let its provider allocate the PCB(s), which
 * are then appended to the descriptor's list.  They start producing
 * events only once DTIOCRECORD is issued.
 */
int
dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq)
{
	struct dt_pcb_list plist;
	struct dt_probe *dtp;
	int error;

	KASSERT(suser(curproc) == 0);

	if (!dtioc_req_isvalid(dtrq))
		return EINVAL;

	SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) {
		if (dtp->dtp_pbn == dtrq->dtrq_pbn)
			break;
	}
	if (dtp == NULL)
		return ENOENT;

	TAILQ_INIT(&plist);
	error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq);
	if (error)
		return error;

	DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid,
	    dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS);

	/* Append all PCBs to this instance */
	TAILQ_CONCAT(&sc->ds_pcbs, &plist, dp_snext);

	return 0;
}

/*
 * Allocate a new probe for provider `dtpv'.  May be called from attach
 * context, hence M_NOWAIT; returns NULL on allocation failure.  The
 * `func' and `name' strings are referenced, not copied — callers must
 * pass storage that outlives the probe.
 */
struct dt_probe *
dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv)
{
	struct dt_probe *dtp;

	dtp = malloc(sizeof(*dtp), M_DT, M_NOWAIT|M_ZERO);
	if (dtp == NULL)
		return NULL;

	SMR_SLIST_INIT(&dtp->dtp_pcbs);
	dtp->dtp_prov = dtpv;
	dtp->dtp_func = func;
	dtp->dtp_name = name;
	dtp->dtp_sysnum = -1;

	return dtp;
}

/*
 * Register a probe: assign the next unique probe number (PBN, starting
 * at 1) and append it to the global immutable probe list.
 */
void
dt_dev_register_probe(struct dt_probe *dtp)
{
	static uint64_t probe_nb;

	dtp->dtp_pbn = ++probe_nb;
	SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next);
}

/*
 * Allocate a PCB (probe control block) and its event ring for probe
 * `dtp' enabled on descriptor `sc'.  Returns NULL on failure.
 */
struct dt_pcb *
dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc)
{
	struct dt_pcb *dp;

	dp = malloc(sizeof(*dp), M_DT,
M_WAITOK|M_CANFAIL|M_ZERO); 513 if (dp == NULL) 514 goto bad; 515 516 dp->dp_ring = mallocarray(DT_EVTRING_SIZE, sizeof(*dp->dp_ring), M_DT, 517 M_WAITOK|M_CANFAIL|M_ZERO); 518 if (dp->dp_ring == NULL) 519 goto bad; 520 521 mtx_init(&dp->dp_mtx, IPL_HIGH); 522 dp->dp_sc = sc; 523 dp->dp_dtp = dtp; 524 return dp; 525 bad: 526 dt_pcb_free(dp); 527 return NULL; 528 } 529 530 void 531 dt_pcb_free(struct dt_pcb *dp) 532 { 533 if (dp == NULL) 534 return; 535 free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); 536 free(dp, M_DT, sizeof(*dp)); 537 } 538 539 void 540 dt_pcb_purge(struct dt_pcb_list *plist) 541 { 542 struct dt_pcb *dp; 543 544 while ((dp = TAILQ_FIRST(plist)) != NULL) { 545 TAILQ_REMOVE(plist, dp, dp_snext); 546 dt_pcb_free(dp); 547 } 548 } 549 550 int 551 dt_pcb_filter(struct dt_pcb *dp) 552 { 553 struct dt_filter *dtf = &dp->dp_filter; 554 struct proc *p = curproc; 555 unsigned int var; 556 int match = 1; 557 558 /* Filter out tracing program. */ 559 if (dp->dp_sc->ds_pid == p->p_p->ps_pid) 560 return 1; 561 562 switch (dtf->dtf_variable) { 563 case DT_FV_PID: 564 var = p->p_p->ps_pid; 565 break; 566 case DT_FV_TID: 567 var = p->p_tid; 568 break; 569 case DT_FV_NONE: 570 break; 571 default: 572 KASSERT(0); 573 } 574 575 switch (dtf->dtf_operand) { 576 case DT_OP_EQ: 577 match = !!(var == dtf->dtf_value); 578 break; 579 case DT_OP_NE: 580 match = !!(var != dtf->dtf_value); 581 break; 582 case DT_OP_NONE: 583 break; 584 default: 585 KASSERT(0); 586 } 587 588 return !match; 589 } 590 591 592 /* 593 * Get a reference to the next free event state from the ring. 
 */
struct dt_evt *
dt_pcb_ring_get(struct dt_pcb *dp, int profiling)
{
	struct proc *p = curproc;
	struct dt_evt *dtev;
	int distance;

	/*
	 * NOTE: on success this function returns with `dp_mtx' held;
	 * the caller must hand the event to dt_pcb_ring_consume(),
	 * which releases it.  On the drop paths the mutex is released
	 * here and NULL is returned.
	 *
	 * Naming caveat: in this ring `dp_cons' is the slot written by
	 * producers and `dp_prod' the slot read by dt_pcb_ring_copy()
	 * — inverted with respect to the usual convention.
	 */
	if (dt_pcb_filter(dp))
		return NULL;

	mtx_enter(&dp->dp_mtx);
	distance = dp->dp_prod - dp->dp_cons;
	if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) {
		/* read(2) isn't finished: ring full, drop the event. */
		dp->dp_dropevt++;
		mtx_leave(&dp->dp_mtx);
		return NULL;
	}

	/*
	 * Save states in next free event slot.
	 */
	dtev = &dp->dp_ring[dp->dp_cons];
	memset(dtev, 0, sizeof(*dtev));

	dtev->dtev_pbn = dp->dp_dtp->dtp_pbn;
	dtev->dtev_cpu = cpu_number();
	dtev->dtev_pid = p->p_p->ps_pid;
	dtev->dtev_tid = p->p_tid;
	nanotime(&dtev->dtev_tsp);

	/* Slot was zeroed above, so the copied name stays NUL-terminated. */
	if (ISSET(dp->dp_evtflags, DTEVT_EXECNAME))
		memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1);

	if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) {
		/* Skip a context-dependent number of frames, see top of file. */
		if (profiling)
			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);
		else
			stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC);
	}

	return dtev;
}

/*
 * Commit the event obtained from dt_pcb_ring_get(): advance the write
 * index, release the PCB mutex and wake up a sleeping reader.
 */
void
dt_pcb_ring_consume(struct dt_pcb *dp, struct dt_evt *dtev)
{
	MUTEX_ASSERT_LOCKED(&dp->dp_mtx);
	KASSERT(dtev == &dp->dp_ring[dp->dp_cons]);

	dp->dp_cons = (dp->dp_cons + 1) % DT_EVTRING_SIZE;
	mtx_leave(&dp->dp_mtx);

	mtx_enter(&dp->dp_sc->ds_mtx);
	dp->dp_sc->ds_evtcnt++;
	mtx_leave(&dp->dp_sc->ds_mtx);
	wakeup(dp->dp_sc);
}

/*
 * Copy at most `qlen' events from `dp', producing the same amount
 * of free slots.
 */
int
dt_pcb_ring_copy(struct dt_pcb *dp, struct dt_evt *estq, size_t qlen,
    uint64_t *dropped)
{
	size_t count, copied = 0;
	unsigned int cons, prod;

	KASSERT(qlen > 0);

	/*
	 * Reader side of the per-PCB ring.  Here `prod' is the index
	 * this reader consumes from and `cons' the index the probe
	 * side writes to (names are inverted, see dt_pcb_ring_get()).
	 * The drop counter is accumulated into `*dropped'.
	 */
	mtx_enter(&dp->dp_mtx);
	cons = dp->dp_cons;
	prod = dp->dp_prod;

	/* Contiguous readable span: up to the wrap point or up to `cons'. */
	if (cons < prod)
		count = DT_EVTRING_SIZE - prod;
	else
		count = cons - prod;

	if (count == 0)
		goto out;

	*dropped += dp->dp_dropevt;
	dp->dp_dropevt = 0;

	count = MIN(count, qlen);

	memcpy(&estq[0], &dp->dp_ring[prod], count * sizeof(*estq));
	copied += count;

	/* Produce */
	prod = (prod + count) % DT_EVTRING_SIZE;

	/* If the queue is full or the ring didn't wrap, stop here. */
	if (qlen == copied || prod != 0 || cons == 0)
		goto out;

	/* Second copy for the wrapped part, from slot 0 up to `cons'. */
	count = MIN(cons, (qlen - copied));
	memcpy(&estq[copied], &dp->dp_ring[0], count * sizeof(*estq));
	copied += count;
	prod += count;

out:
	dp->dp_prod = prod;
	mtx_leave(&dp->dp_mtx);
	return copied;
}