/*	$OpenBSD: kern_clockintr.c,v 1.60 2023/10/11 00:02:25 cheloha Exp $ */
/*
 * Copyright (c) 2003 Dale Rahn <drahn@openbsd.org>
 * Copyright (c) 2020 Mark Kettenis <kettenis@openbsd.org>
 * Copyright (c) 2020-2022 Scott Cheloha <cheloha@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/clockintr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>

void clockintr_hardclock(struct clockintr *, void *, void *);
void clockintr_schedule(struct clockintr *, uint64_t);
void clockintr_schedule_locked(struct clockintr *, uint64_t);
void clockqueue_intrclock_install(struct clockintr_queue *,
    const struct intrclock *);
uint64_t clockqueue_next(const struct clockintr_queue *);
void clockqueue_pend_delete(struct clockintr_queue *, struct clockintr *);
void clockqueue_pend_insert(struct clockintr_queue *, struct clockintr *,
    uint64_t);
void clockqueue_reset_intrclock(struct clockintr_queue *);
void intrclock_rearm(struct intrclock *, uint64_t);
void intrclock_trigger(struct intrclock *);
uint64_t nsec_advance(uint64_t *, uint64_t, uint64_t);

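/*
 * How dispatch and rescheduling interact, in brief: clockintr_dispatch()
 * removes an expired clockintr from the pending queue, copies it into
 * the per-queue shadow (cq_shadow), drops cq_mtx, and runs the callback
 * on the shadow copy. A callback that wants to reschedule itself
 * operates on the shadow: clockintr_advance() and clockintr_schedule()
 * mark it CLST_SHADOW_PENDING and the real clockintr is reinserted
 * after the callback returns. If the running clockintr is cancelled or
 * rescheduled directly in the meantime, CLST_IGNORE_SHADOW is set and
 * the shadow's reschedule is discarded. CLST_PENDING tracks membership
 * on the cq_pend queue.
 */
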
/*
 * Ready the calling CPU for clockintr_dispatch(). If this is our
 * first time here, install the intrclock, if any, and set necessary
 * flags. Advance the schedule as needed.
 */
void
clockintr_cpu_init(const struct intrclock *ic)
{
	uint64_t multiplier = 0;
	struct cpu_info *ci = curcpu();
	struct clockintr_queue *cq = &ci->ci_queue;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int reset_cq_intrclock = 0;

	if (ic != NULL)
		clockqueue_intrclock_install(cq, ic);

	/* TODO: Remove this from struct clockintr_queue. */
	if (cq->cq_hardclock == NULL) {
		cq->cq_hardclock = clockintr_establish(ci, clockintr_hardclock,
		    NULL);
		if (cq->cq_hardclock == NULL)
			panic("%s: failed to establish hardclock", __func__);
	}

	/*
	 * Mask CQ_INTRCLOCK while we're advancing the internal clock
	 * interrupts. We don't want the intrclock to fire until this
	 * thread reaches clockintr_trigger().
	 */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		CLR(cq->cq_flags, CQ_INTRCLOCK);
		reset_cq_intrclock = 1;
	}

	/*
	 * Until we understand scheduler lock contention better, stagger
	 * the hardclock and statclock so they don't all happen at once.
	 * If we have no intrclock it doesn't matter, we have no control
	 * anyway. The primary CPU's starting offset is always zero, so
	 * leave the multiplier zero.
	 */
	if (!CPU_IS_PRIMARY(ci) && reset_cq_intrclock)
		multiplier = CPU_INFO_UNIT(ci);

	/*
	 * The first time we do this, the primary CPU cannot skip any
	 * hardclocks. We can skip hardclocks on subsequent calls because
	 * the global tick value is advanced during inittodr(9) on our
	 * behalf.
	 */
	if (CPU_IS_PRIMARY(ci)) {
		if (cq->cq_hardclock->cl_expiration == 0)
			clockintr_schedule(cq->cq_hardclock, 0);
		else
			clockintr_advance(cq->cq_hardclock, hardclock_period);
	} else {
		if (cq->cq_hardclock->cl_expiration == 0) {
			clockintr_stagger(cq->cq_hardclock, hardclock_period,
			    multiplier, MAXCPUS);
		}
		clockintr_advance(cq->cq_hardclock, hardclock_period);
	}

	/*
	 * We can always advance the statclock. There is no reason to
	 * stagger a randomized statclock.
	 */
	if (!statclock_is_randomized) {
		if (spc->spc_statclock->cl_expiration == 0) {
			clockintr_stagger(spc->spc_statclock, statclock_avg,
			    multiplier, MAXCPUS);
		}
	}
	clockintr_advance(spc->spc_statclock, statclock_avg);

	/*
	 * XXX Need to find a better place to do this. We can't do it in
	 * sched_init_cpu() because initclocks() runs after it.
	 */
	if (spc->spc_itimer->cl_expiration == 0) {
		clockintr_stagger(spc->spc_itimer, hardclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_profclock->cl_expiration == 0) {
		clockintr_stagger(spc->spc_profclock, profclock_period,
		    multiplier, MAXCPUS);
	}
	if (spc->spc_roundrobin->cl_expiration == 0) {
		clockintr_stagger(spc->spc_roundrobin, hardclock_period,
		    multiplier, MAXCPUS);
	}
	clockintr_advance(spc->spc_roundrobin, roundrobin_period);

	if (reset_cq_intrclock)
		SET(cq->cq_flags, CQ_INTRCLOCK);
}

/*
 * If we have an intrclock, trigger it to start the dispatch cycle.
 */
void
clockintr_trigger(void)
{
	struct clockintr_queue *cq = &curcpu()->ci_queue;

	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	if (ISSET(cq->cq_flags, CQ_INTRCLOCK))
		intrclock_trigger(&cq->cq_intrclock);
}

/*
 * Run all expired events scheduled on the calling CPU.
 */
int
clockintr_dispatch(void *frame)
{
	uint64_t lateness, run = 0, start;
	struct cpu_info *ci = curcpu();
	struct clockintr *cl, *shadow;
	struct clockintr_queue *cq = &ci->ci_queue;
	uint32_t ogen;

	if (cq->cq_dispatch != 0)
		panic("%s: recursive dispatch", __func__);
	cq->cq_dispatch = 1;

	splassert(IPL_CLOCK);
	KASSERT(ISSET(cq->cq_flags, CQ_INIT));

	mtx_enter(&cq->cq_mtx);

	/*
	 * If nothing is scheduled or we arrived too early, we have
	 * nothing to do.
	 */
	start = nsecuptime();
	cq->cq_uptime = start;
	if (TAILQ_EMPTY(&cq->cq_pend))
		goto stats;
	if (cq->cq_uptime < clockqueue_next(cq))
		goto rearm;
	lateness = start - clockqueue_next(cq);

	/*
	 * Dispatch expired events.
	 */
	for (;;) {
		cl = TAILQ_FIRST(&cq->cq_pend);
		if (cl == NULL)
			break;
		if (cq->cq_uptime < cl->cl_expiration) {
			/* Double-check the time before giving up. */
			cq->cq_uptime = nsecuptime();
			if (cq->cq_uptime < cl->cl_expiration)
				break;
		}

		/*
		 * This clockintr has expired. Initialize a shadow copy
		 * and execute it.
		 */
		clockqueue_pend_delete(cq, cl);
		shadow = &cq->cq_shadow;
		shadow->cl_expiration = cl->cl_expiration;
		shadow->cl_arg = cl->cl_arg;
		shadow->cl_func = cl->cl_func;
		cq->cq_running = cl;
		mtx_leave(&cq->cq_mtx);

		shadow->cl_func(shadow, frame, shadow->cl_arg);

		mtx_enter(&cq->cq_mtx);
		cq->cq_running = NULL;
		if (ISSET(cl->cl_flags, CLST_IGNORE_SHADOW)) {
			CLR(cl->cl_flags, CLST_IGNORE_SHADOW);
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
		}
		if (ISSET(shadow->cl_flags, CLST_SHADOW_PENDING)) {
			CLR(shadow->cl_flags, CLST_SHADOW_PENDING);
			clockqueue_pend_insert(cq, cl, shadow->cl_expiration);
		}
		run++;
	}

	/*
	 * Dispatch complete.
	 */
rearm:
	/* Rearm the interrupt clock if we have one. */
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (!TAILQ_EMPTY(&cq->cq_pend)) {
			intrclock_rearm(&cq->cq_intrclock,
			    clockqueue_next(cq) - cq->cq_uptime);
		}
	}
stats:
	/* Update our stats. */
	ogen = cq->cq_gen;
	cq->cq_gen = 0;
	membar_producer();
	cq->cq_stat.cs_dispatched += cq->cq_uptime - start;
	if (run > 0) {
		cq->cq_stat.cs_lateness += lateness;
		cq->cq_stat.cs_prompt++;
		cq->cq_stat.cs_run += run;
	} else if (!TAILQ_EMPTY(&cq->cq_pend)) {
		cq->cq_stat.cs_early++;
		cq->cq_stat.cs_earliness += clockqueue_next(cq) - cq->cq_uptime;
	} else
		cq->cq_stat.cs_spurious++;
	membar_producer();
	cq->cq_gen = MAX(1, ogen + 1);

	mtx_leave(&cq->cq_mtx);

	if (cq->cq_dispatch != 1)
		panic("%s: unexpected value: %u", __func__, cq->cq_dispatch);
	cq->cq_dispatch = 0;

	return run > 0;
}

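/*
 * Advance cl's expiration in steps of period until it lies in the
 * future, then reschedule it. Returns the number of periods elapsed.
 * When cl is the shadow copy the caller is the running callback: only
 * the shadow is updated and clockintr_dispatch() reinserts the real
 * clockintr after the callback returns. See clockintr_hardclock() for
 * the usual catch-up pattern.
 */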
uint64_t
clockintr_advance(struct clockintr *cl, uint64_t period)
{
	uint64_t count, expiration;
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		count = nsec_advance(&cl->cl_expiration, period, cq->cq_uptime);
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		expiration = cl->cl_expiration;
		count = nsec_advance(&expiration, period, nsecuptime());
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
	return count;
}

uint64_t
clockintr_advance_random(struct clockintr *cl, uint64_t min, uint32_t mask)
{
	uint64_t count = 0;
	struct clockintr_queue *cq = cl->cl_queue;
	uint32_t off;

	KASSERT(cl == &cq->cq_shadow);

	while (cl->cl_expiration <= cq->cq_uptime) {
		while ((off = (random() & mask)) == 0)
			continue;
		cl->cl_expiration += min + off;
		count++;
	}
	SET(cl->cl_flags, CLST_SHADOW_PENDING);
	return count;
}

void
clockintr_cancel(struct clockintr *cl)
{
	struct clockintr_queue *cq = cl->cl_queue;
	int was_next;

	if (cl == &cq->cq_shadow) {
		CLR(cl->cl_flags, CLST_SHADOW_PENDING);
		return;
	}

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING)) {
		was_next = cl == TAILQ_FIRST(&cq->cq_pend);
		clockqueue_pend_delete(cq, cl);
		if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
			if (was_next && !TAILQ_EMPTY(&cq->cq_pend)) {
				if (cq == &curcpu()->ci_queue)
					clockqueue_reset_intrclock(cq);
			}
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
	mtx_leave(&cq->cq_mtx);
}

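/*
 * Allocate a new clockintr that runs func(cl, frame, arg) on ci and
 * link it into ci's queue. The new clockintr is not scheduled until
 * the caller arms it, e.g. for a hypothetical "myclock" callback run
 * every myclock_period nanoseconds:
 *
 *	cl = clockintr_establish(ci, myclock, NULL);
 *	if (cl == NULL)
 *		panic("failed to establish myclock");
 *	clockintr_advance(cl, myclock_period);
 *
 * Returns NULL if memory cannot be allocated.
 */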
struct clockintr *
clockintr_establish(struct cpu_info *ci,
    void (*func)(struct clockintr *, void *, void *), void *arg)
{
	struct clockintr *cl;
	struct clockintr_queue *cq = &ci->ci_queue;

	cl = malloc(sizeof *cl, M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return NULL;
	cl->cl_arg = arg;
	cl->cl_func = func;
	cl->cl_queue = cq;

	mtx_enter(&cq->cq_mtx);
	TAILQ_INSERT_TAIL(&cq->cq_all, cl, cl_alink);
	mtx_leave(&cq->cq_mtx);
	return cl;
}

void
clockintr_schedule(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	if (cl == &cq->cq_shadow) {
		cl->cl_expiration = expiration;
		SET(cl->cl_flags, CLST_SHADOW_PENDING);
	} else {
		mtx_enter(&cq->cq_mtx);
		clockintr_schedule_locked(cl, expiration);
		mtx_leave(&cq->cq_mtx);
	}
}

void
clockintr_schedule_locked(struct clockintr *cl, uint64_t expiration)
{
	struct clockintr_queue *cq = cl->cl_queue;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);

	if (ISSET(cl->cl_flags, CLST_PENDING))
		clockqueue_pend_delete(cq, cl);
	clockqueue_pend_insert(cq, cl, expiration);
	if (ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		if (cl == TAILQ_FIRST(&cq->cq_pend)) {
			if (cq == &curcpu()->ci_queue)
				clockqueue_reset_intrclock(cq);
		}
	}
	if (cl == cq->cq_running)
		SET(cl->cl_flags, CLST_IGNORE_SHADOW);
}

void
clockintr_stagger(struct clockintr *cl, uint64_t period, uint32_t numer,
    uint32_t denom)
{
	struct clockintr_queue *cq = cl->cl_queue;

	KASSERT(numer < denom);

	mtx_enter(&cq->cq_mtx);
	if (ISSET(cl->cl_flags, CLST_PENDING))
		panic("%s: clock interrupt pending", __func__);
	cl->cl_expiration = period / denom * numer;
	mtx_leave(&cq->cq_mtx);
}

void
clockintr_hardclock(struct clockintr *cl, void *frame, void *arg)
{
	uint64_t count, i;

	count = clockintr_advance(cl, hardclock_period);
	for (i = 0; i < count; i++)
		hardclock(frame);
}

void
clockqueue_init(struct clockintr_queue *cq)
{
	if (ISSET(cq->cq_flags, CQ_INIT))
		return;

	cq->cq_shadow.cl_queue = cq;
	mtx_init(&cq->cq_mtx, IPL_CLOCK);
	TAILQ_INIT(&cq->cq_all);
	TAILQ_INIT(&cq->cq_pend);
	cq->cq_gen = 1;
	SET(cq->cq_flags, CQ_INIT);
}

void
clockqueue_intrclock_install(struct clockintr_queue *cq,
    const struct intrclock *ic)
{
	mtx_enter(&cq->cq_mtx);
	if (!ISSET(cq->cq_flags, CQ_INTRCLOCK)) {
		cq->cq_intrclock = *ic;
		SET(cq->cq_flags, CQ_INTRCLOCK);
	}
	mtx_leave(&cq->cq_mtx);
}

uint64_t
clockqueue_next(const struct clockintr_queue *cq)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	return TAILQ_FIRST(&cq->cq_pend)->cl_expiration;
}

void
clockqueue_pend_delete(struct clockintr_queue *cq, struct clockintr *cl)
{
	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cl->cl_flags, CLST_PENDING));

	TAILQ_REMOVE(&cq->cq_pend, cl, cl_plink);
	CLR(cl->cl_flags, CLST_PENDING);
}

void
clockqueue_pend_insert(struct clockintr_queue *cq, struct clockintr *cl,
    uint64_t expiration)
{
	struct clockintr *elm;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(!ISSET(cl->cl_flags, CLST_PENDING));

	cl->cl_expiration = expiration;
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink) {
		if (cl->cl_expiration < elm->cl_expiration)
			break;
	}
	if (elm == NULL)
		TAILQ_INSERT_TAIL(&cq->cq_pend, cl, cl_plink);
	else
		TAILQ_INSERT_BEFORE(elm, cl, cl_plink);
	SET(cl->cl_flags, CLST_PENDING);
}

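/*
 * Reprogram the interrupt clock to fire at the queue's earliest
 * pending expiration. If that expiration has already passed, trigger
 * the intrclock instead so a dispatch runs right away. Callers only
 * do this for the current CPU's queue; see the curcpu() checks in
 * clockintr_cancel() and clockintr_schedule_locked().
 */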
void
clockqueue_reset_intrclock(struct clockintr_queue *cq)
{
	uint64_t exp, now;

	MUTEX_ASSERT_LOCKED(&cq->cq_mtx);
	KASSERT(ISSET(cq->cq_flags, CQ_INTRCLOCK));

	exp = clockqueue_next(cq);
	now = nsecuptime();
	if (now < exp)
		intrclock_rearm(&cq->cq_intrclock, exp - now);
	else
		intrclock_trigger(&cq->cq_intrclock);
}

void
intrclock_rearm(struct intrclock *ic, uint64_t nsecs)
{
	ic->ic_rearm(ic->ic_cookie, nsecs);
}

void
intrclock_trigger(struct intrclock *ic)
{
	ic->ic_trigger(ic->ic_cookie);
}

/*
 * Advance *next in increments of period until it exceeds now.
 * Returns the number of increments *next was advanced.
 *
 * We check the common cases first to avoid division if possible.
 * This does no overflow checking.
 */
uint64_t
nsec_advance(uint64_t *next, uint64_t period, uint64_t now)
{
	uint64_t elapsed;

	if (now < *next)
		return 0;

	if (now < *next + period) {
		*next += period;
		return 1;
	}

	elapsed = (now - *next) / period + 1;
	*next += period * elapsed;
	return elapsed;
}

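/*
 * Sum the per-CPU dispatch statistics for the KERN_CLOCKINTR_STATS
 * sysctl. The counters are read without taking cq_mtx: cq_gen is a
 * generation number that clockintr_dispatch() zeroes while it updates
 * cq_stat, so the snapshot is retried until a stable, nonzero
 * generation is observed.
 */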
int
sysctl_clockintr(int *name, u_int namelen, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen)
{
	struct clockintr_stat sum, tmp;
	struct clockintr_queue *cq;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	uint32_t gen;

	if (namelen != 1)
		return ENOTDIR;

	switch (name[0]) {
	case KERN_CLOCKINTR_STATS:
		memset(&sum, 0, sizeof sum);
		CPU_INFO_FOREACH(cii, ci) {
			cq = &ci->ci_queue;
			if (!ISSET(cq->cq_flags, CQ_INIT))
				continue;
			do {
				gen = cq->cq_gen;
				membar_consumer();
				tmp = cq->cq_stat;
				membar_consumer();
			} while (gen == 0 || gen != cq->cq_gen);
			sum.cs_dispatched += tmp.cs_dispatched;
			sum.cs_early += tmp.cs_early;
			sum.cs_earliness += tmp.cs_earliness;
			sum.cs_lateness += tmp.cs_lateness;
			sum.cs_prompt += tmp.cs_prompt;
			sum.cs_run += tmp.cs_run;
			sum.cs_spurious += tmp.cs_spurious;
		}
		return sysctl_rdstruct(oldp, oldlenp, newp, &sum, sizeof sum);
	default:
		break;
	}

	return EINVAL;
}

#ifdef DDB

#include <machine/db_machdep.h>

#include <ddb/db_interface.h>
#include <ddb/db_output.h>
#include <ddb/db_sym.h>

void db_show_clockintr(const struct clockintr *, const char *, u_int);
void db_show_clockintr_cpu(struct cpu_info *);

void
db_show_all_clockintr(db_expr_t addr, int haddr, db_expr_t count, char *modif)
{
	struct timespec now;
	struct cpu_info *ci;
	CPU_INFO_ITERATOR cii;
	int width = sizeof(long) * 2 + 2;	/* +2 for "0x" prefix */

	nanouptime(&now);
	db_printf("%20s\n", "UPTIME");
	db_printf("%10lld.%09ld\n", now.tv_sec, now.tv_nsec);
	db_printf("\n");
	db_printf("%20s %5s %3s %*s %s\n",
	    "EXPIRATION", "STATE", "CPU", width, "ARG", "NAME");
	CPU_INFO_FOREACH(cii, ci) {
		if (ISSET(ci->ci_queue.cq_flags, CQ_INIT))
			db_show_clockintr_cpu(ci);
	}
}

void
db_show_clockintr_cpu(struct cpu_info *ci)
{
	struct clockintr *elm;
	struct clockintr_queue *cq = &ci->ci_queue;
	u_int cpu = CPU_INFO_UNIT(ci);

	if (cq->cq_running != NULL)
		db_show_clockintr(cq->cq_running, "run", cpu);
	TAILQ_FOREACH(elm, &cq->cq_pend, cl_plink)
		db_show_clockintr(elm, "pend", cpu);
	TAILQ_FOREACH(elm, &cq->cq_all, cl_alink) {
		if (!ISSET(elm->cl_flags, CLST_PENDING))
			db_show_clockintr(elm, "idle", cpu);
	}
}

void
db_show_clockintr(const struct clockintr *cl, const char *state, u_int cpu)
{
	struct timespec ts;
	char *name;
	db_expr_t offset;
	int width = sizeof(long) * 2;

	NSEC_TO_TIMESPEC(cl->cl_expiration, &ts);
	db_find_sym_and_offset((vaddr_t)cl->cl_func, &name, &offset);
	if (name == NULL)
		name = "?";
	db_printf("%10lld.%09ld %5s %3u 0x%0*lx %s\n",
	    ts.tv_sec, ts.tv_nsec, state, cpu,
	    width, (unsigned long)cl->cl_arg, name);
}

#endif /* DDB */