/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 */
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/uio.h>
#include <sys/priv.h>
#include <sys/kcollect.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <sys/ktr.h>
#include <sys/serialize.h>

#include <sys/signal2.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mutex2.h>

#include <machine/cpu.h>
#include <machine/smp.h>

#include <vm/vm_extern.h>

struct tslpque {
        TAILQ_HEAD(, thread)    queue;
        const volatile void     *ident0;
        const volatile void     *ident1;
        const volatile void     *ident2;
        const volatile void     *ident3;
};

static void sched_setup (void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL);
static void sched_dyninit (void *dummy);
SYSINIT(sched_dyninit, SI_BOOT1_DYNALLOC, SI_ORDER_FIRST, sched_dyninit, NULL);

int     lbolt;
void    *lbolt_syncer;
int     ncpus;
int     ncpus_fit, ncpus_fit_mask;      /* note: mask not cpumask_t */
int     safepri;
int     tsleep_now_works;
int     tsleep_crypto_dump = 0;

MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");

#define __DEALL(ident)  __DEQUALIFY(void *, ident)

#if !defined(KTR_TSLEEP)
#define KTR_TSLEEP      KTR_ALL
#endif
KTR_INFO_MASTER(tsleep);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_beg, 0, "tsleep enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, tsleep_end, 1, "tsleep exit");
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_beg, 2, "wakeup enter %p", const volatile void *ident);
KTR_INFO(KTR_TSLEEP, tsleep, wakeup_end, 3, "wakeup exit");
KTR_INFO(KTR_TSLEEP, tsleep, ilockfail, 4, "interlock failed %p", const volatile void *ident);

#define logtsleep1(name)        KTR_LOG(tsleep_ ## name)
#define logtsleep2(name, val)   KTR_LOG(tsleep_ ## name, val)

struct loadavg averunnable =
        { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes when sampling at
 * 5 second intervals.  The per-sample decay factor is
 * exp(-interval / period), e.g. exp(-5/60) = exp(-1/12) for the
 * one-minute average.
 */
static fixpt_t cexp[3] = {
        0.9200444146293232 * FSCALE,    /* exp(-1/12) */
        0.9834714538216174 * FSCALE,    /* exp(-1/60) */
        0.9944598480048967 * FSCALE,    /* exp(-1/180) */
};

static void     endtsleep (void *);
static void     loadav (void *arg);
static void     schedcpu (void *arg);

static int pctcpu_decay = 10;
SYSCTL_INT(_kern, OID_AUTO, pctcpu_decay, CTLFLAG_RW,
           &pctcpu_decay, 0, "");

/*
 * The kernel uses `FSCALE'; userland (SHOULD) use kern.fscale
 */
int fscale __unused = FSCALE;   /* exported to systat */
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * Issue a wakeup() from userland (debugging)
 */
static int
sysctl_wakeup(SYSCTL_HANDLER_ARGS)
{
        uint64_t ident = 1;
        int error = 0;

        if (req->newptr != NULL) {
                if (priv_check(curthread, PRIV_ROOT))
                        return (EPERM);
                error = SYSCTL_IN(req, &ident, sizeof(ident));
                if (error)
                        return error;
                kprintf("issue wakeup %016jx\n", ident);
                wakeup((void *)(intptr_t)ident);
        }
        if (req->oldptr != NULL) {
                error = SYSCTL_OUT(req, &ident, sizeof(ident));
        }
        return error;
}

static int
sysctl_wakeup_umtx(SYSCTL_HANDLER_ARGS)
{
        uint64_t ident = 1;
        int error = 0;

        if (req->newptr != NULL) {
                if (priv_check(curthread, PRIV_ROOT))
                        return (EPERM);
                error = SYSCTL_IN(req, &ident, sizeof(ident));
                if (error)
                        return error;
                kprintf("issue wakeup %016jx, PDOMAIN_UMTX\n", ident);
                wakeup_domain((void *)(intptr_t)ident, PDOMAIN_UMTX);
        }
        if (req->oldptr != NULL) {
                error = SYSCTL_OUT(req, &ident, sizeof(ident));
        }
        return error;
}

SYSCTL_PROC(_debug, OID_AUTO, wakeup, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0,
            sysctl_wakeup, "Q", "issue wakeup(addr)");
SYSCTL_PROC(_debug, OID_AUTO, wakeup_umtx, CTLTYPE_UQUAD|CTLFLAG_RW, 0, 0,
            sysctl_wakeup_umtx, "Q", "issue wakeup(addr, PDOMAIN_UMTX)");
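/*
 * Illustrative use from userland (root only; the address below is a
 * made-up example).  Writing a 64-bit value to either node causes the
 * kernel to issue the corresponding wakeup on that address:
 *
 *      sysctl debug.wakeup=0xffffffff81234567
 *      sysctl debug.wakeup_umtx=0xffffffff81234567
 */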
/*
 * Recompute process priorities, once a second.
 *
 * Since the userland schedulers are typically event oriented, if the
 * estcpu calculation at wakeup() time is not sufficient to make a
 * process runnable relative to other processes in the system we have
 * a 1-second recalc to help out.
 *
 * This code also allows us to store sysclock_t data in the process structure
 * without fear of an overrun, since sysclock_t is guaranteed to hold
 * several seconds worth of count.
 *
 * WARNING!  callouts can preempt normal threads.  However, they will not
 * preempt a thread holding a spinlock so we *can* safely use spinlocks.
 */
static int schedcpu_stats(struct proc *p, void *data __unused);
static int schedcpu_resource(struct proc *p, void *data __unused);

static void
schedcpu(void *arg)
{
        allproc_scan(schedcpu_stats, NULL, 1);
        allproc_scan(schedcpu_resource, NULL, 1);
        if (mycpu->gd_cpuid == 0) {
                wakeup((caddr_t)&lbolt);
                wakeup(lbolt_syncer);
        }
        callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
}

/*
 * General process statistics once a second
 */
static int
schedcpu_stats(struct proc *p, void *data __unused)
{
        struct lwp *lp;

        /*
         * Threads may not be completely set up if the process is in the
         * SIDL state.
         */
        if (p->p_stat == SIDL)
                return(0);

        PHOLD(p);
        if (lwkt_trytoken(&p->p_token) == FALSE) {
                PRELE(p);
                return(0);
        }

        p->p_swtime++;
        FOREACH_LWP_IN_PROC(lp, p) {
                if (lp->lwp_stat == LSSLEEP) {
                        ++lp->lwp_slptime;
                        if (lp->lwp_slptime == 1)
                                p->p_usched->uload_update(lp);
                }

                /*
                 * Only recalculate processes that are active or have slept
                 * less than 2 seconds.  The schedulers understand this.
                 * Otherwise decay by 50% per second.
                 */
                if (lp->lwp_slptime <= 1) {
                        p->p_usched->recalculate(lp);
                } else {
                        int decay;

                        decay = pctcpu_decay;
                        cpu_ccfence();
                        if (decay <= 1)
                                decay = 1;
                        if (decay > 100)
                                decay = 100;
                        lp->lwp_pctcpu = (lp->lwp_pctcpu * (decay - 1)) /
                                         decay;
                }
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();
        PRELE(p);
        return(0);
}

/*
 * Resource checks.  XXX break out since ksignal/killproc can block,
 * limiting us to one process killed per second.  There is probably
 * a better way.
 */
static int
schedcpu_resource(struct proc *p, void *data __unused)
{
        u_int64_t ttime;
        struct lwp *lp;

        if (p->p_stat == SIDL)
                return(0);

        PHOLD(p);
        if (lwkt_trytoken(&p->p_token) == FALSE) {
                PRELE(p);
                return(0);
        }

        if (p->p_stat == SZOMB || p->p_limit == NULL) {
                lwkt_reltoken(&p->p_token);
                PRELE(p);
                return(0);
        }

        ttime = 0;
        FOREACH_LWP_IN_PROC(lp, p) {
                /*
                 * We may have caught an lp in the middle of being
                 * created, lwp_thread can be NULL.
                 */
                if (lp->lwp_thread) {
                        ttime += lp->lwp_thread->td_sticks;
                        ttime += lp->lwp_thread->td_uticks;
                }
        }

        switch(plimit_testcpulimit(p->p_limit, ttime)) {
        case PLIMIT_TESTCPU_KILL:
                killproc(p, "exceeded maximum CPU limit");
                break;
        case PLIMIT_TESTCPU_XCPU:
                if ((p->p_flags & P_XCPU) == 0) {
                        p->p_flags |= P_XCPU;
                        ksignal(p, SIGXCPU);
                }
                break;
        default:
                break;
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();
        PRELE(p);
        return(0);
}

/*
 * This is only used by ps.  Generate a cpu percentage use over
 * a period of one second.
 */
void
updatepcpu(struct lwp *lp, int cpticks, int ttlticks)
{
        fixpt_t acc;
        int remticks;

        acc = (cpticks << FSHIFT) / ttlticks;
        if (ttlticks >= ESTCPUFREQ) {
                lp->lwp_pctcpu = acc;
        } else {
                remticks = ESTCPUFREQ - ttlticks;
                lp->lwp_pctcpu = (acc * ttlticks + lp->lwp_pctcpu * remticks) /
                                 ESTCPUFREQ;
        }
}
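/*
 * Illustrative arithmetic (not a separate code path): acc is the
 * FSCALE fixed-point ratio cpticks/ttlticks.  When the sample window
 * is shorter than ESTCPUFREQ ticks the new value is a proportional
 * blend of the fresh sample and the previous value:
 *
 *      pctcpu' = (acc * ttlticks + pctcpu * (ESTCPUFREQ - ttlticks)) /
 *                ESTCPUFREQ
 */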
/*
 * Handy macros to calculate hash indices.  LOOKUP() calculates the
 * global cpumask hash index, TCHASHSHIFT() converts that into the
 * pcpu hash index.
 *
 * By making the pcpu hash arrays smaller we save a significant amount
 * of memory at very low cost.  The real cost is in IPIs, which are handled
 * by the much larger global cpumask hash table.
 */
#define LOOKUP_PRIME    66555444443333333ULL
#define LOOKUP(x)       ((((uintptr_t)(x) + ((uintptr_t)(x) >> 18)) ^  \
                          LOOKUP_PRIME) % slpque_tablesize)
#define TCHASHSHIFT(x)  ((x) >> 4)

static uint32_t slpque_tablesize;
static cpumask_t *slpque_cpumasks;

SYSCTL_UINT(_kern, OID_AUTO, slpque_tablesize, CTLFLAG_RD, &slpque_tablesize,
    0, "");

/*
 * This is a dandy function that allows us to interlock tsleep/wakeup
 * operations with unspecified upper level locks, such as lockmgr locks,
 * simply by holding a critical section.  The sequence is:
 *
 *      (acquire upper level lock)
 *      tsleep_interlock(blah)
 *      (release upper level lock)
 *      tsleep(blah, ...)
 *
 * Basically this function queues us on the tsleep queue without actually
 * descheduling us.  When tsleep() is later called with PINTERLOCKED it
 * assumes the thread was already queued, otherwise it queues it there.
 *
 * Thus it is possible to receive the wakeup prior to going to sleep and
 * the race conditions are covered.
 */
static __inline void
_tsleep_interlock(globaldata_t gd, const volatile void *ident, int flags)
{
        thread_t td = gd->gd_curthread;
        struct tslpque *qp;
        uint32_t cid;
        uint32_t gid;

        if (ident == NULL) {
                kprintf("tsleep_interlock: NULL ident %s\n", td->td_comm);
                print_backtrace(5);
        }

        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ) {
                /*
                 * Shortcut if unchanged
                 */
                if (td->td_wchan == ident &&
                    td->td_wdomain == (flags & PDOMAIN_MASK)) {
                        crit_exit_quick(td);
                        return;
                }

                /*
                 * Remove current sleepq
                 */
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                qp = &gd->gd_tsleep_hash[gid];
                TAILQ_REMOVE(&qp->queue, td, td_sleepq);
                if (TAILQ_FIRST(&qp->queue) == NULL) {
                        qp->ident0 = NULL;
                        qp->ident1 = NULL;
                        qp->ident2 = NULL;
                        qp->ident3 = NULL;
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
        } else {
                td->td_flags |= TDF_TSLEEPQ;
        }
        cid = LOOKUP(ident);
        gid = TCHASHSHIFT(cid);
        qp = &gd->gd_tsleep_hash[gid];
        TAILQ_INSERT_TAIL(&qp->queue, td, td_sleepq);
        if (qp->ident0 != ident && qp->ident1 != ident &&
            qp->ident2 != ident && qp->ident3 != ident) {
                if (qp->ident0 == NULL)
                        qp->ident0 = ident;
                else if (qp->ident1 == NULL)
                        qp->ident1 = ident;
                else if (qp->ident2 == NULL)
                        qp->ident2 = ident;
                else if (qp->ident3 == NULL)
                        qp->ident3 = ident;
                else
                        qp->ident0 = (void *)(intptr_t)-1;
        }
        ATOMIC_CPUMASK_ORBIT(slpque_cpumasks[cid], gd->gd_cpuid);
        td->td_wchan = ident;
        td->td_wdomain = flags & PDOMAIN_MASK;
        crit_exit_quick(td);
}

void
tsleep_interlock(const volatile void *ident, int flags)
{
        _tsleep_interlock(mycpu, ident, flags);
}
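/*
 * Illustrative consumer-side pattern (hypothetical softc fields) that
 * shows the race being closed.  A wakeup(&sc->ready) issued by another
 * cpu after the unlock but before the tsleep() is not lost, because
 * the thread is already on the sleep queue:
 *
 *      spin_lock(&sc->spin);
 *      while (sc->ready == 0) {
 *              tsleep_interlock(&sc->ready, 0);
 *              spin_unlock(&sc->spin);
 *              tsleep(&sc->ready, PINTERLOCKED, "scwait", 0);
 *              spin_lock(&sc->spin);
 *      }
 *      spin_unlock(&sc->spin);
 */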
/*
 * Remove thread from sleepq.  Must be called with a critical section held.
 * The thread must not be migrating.
 */
static __inline void
_tsleep_remove(thread_t td)
{
        globaldata_t gd = mycpu;
        struct tslpque *qp;
        uint32_t cid;
        uint32_t gid;

        KKASSERT(td->td_gd == gd && IN_CRITICAL_SECT(td));
        KKASSERT((td->td_flags & TDF_MIGRATING) == 0);
        if (td->td_flags & TDF_TSLEEPQ) {
                td->td_flags &= ~TDF_TSLEEPQ;
                cid = LOOKUP(td->td_wchan);
                gid = TCHASHSHIFT(cid);
                qp = &gd->gd_tsleep_hash[gid];
                TAILQ_REMOVE(&qp->queue, td, td_sleepq);
                if (TAILQ_FIRST(&qp->queue) == NULL) {
                        ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid],
                                               gd->gd_cpuid);
                }
                td->td_wchan = NULL;
                td->td_wdomain = 0;
        }
}

void
tsleep_remove(thread_t td)
{
        _tsleep_remove(td);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * Note that if we are a process, we release_curproc() before messing with
 * the LWKT scheduler.
 *
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 *
 * WARNING!  This code can't block (short of switching away), or bad things
 * will happen.  No getting tokens, no blocking locks, etc.
 */
int
tsleep(const volatile void *ident, int flags, const char *wmesg, int timo)
{
        struct thread *td = curthread;
        struct lwp *lp = td->td_lwp;
        struct proc *p = td->td_proc;           /* may be NULL */
        globaldata_t gd;
        int sig;
        int catch;
        int error;
        int oldpri;
        struct callout thandle;

        /*
         * Currently a severe hack.  Make sure any delayed wakeups
         * are flushed before we sleep or we might deadlock on whatever
         * event we are sleeping on.
         */
        if (td->td_flags & TDF_DELAYED_WAKEUP)
                wakeup_end_delayed();

        /*
         * NOTE: removed KTRPOINT, it could cause races due to blocking
         * even in stable.  Just scrap it for now.
         */
        if (!tsleep_crypto_dump && (tsleep_now_works == 0 || panicstr)) {
                /*
                 * After a panic, or before we actually have an operational
                 * softclock, just give interrupts a chance, then just return;
                 *
                 * don't run any other procs or panic below,
                 * in case this is the idle process and already asleep.
                 */
                splz();
                oldpri = td->td_pri;
                lwkt_setpri_self(safepri);
                lwkt_switch();
                lwkt_setpri_self(oldpri);
                return (0);
        }
        logtsleep2(tsleep_beg, ident);
        gd = td->td_gd;
        KKASSERT(td != &gd->gd_idlethread);     /* you must be kidding! */
        td->td_wakefromcpu = -1;                /* overwritten by _wakeup */

        /*
         * NOTE: all of this occurs on the current cpu, including any
         * callout-based wakeups, so a critical section is a sufficient
         * interlock.
         *
         * The entire sequence through to where we actually sleep must
         * run without breaking the critical section.
         */
        catch = flags & PCATCH;
        error = 0;
        sig = 0;

        crit_enter_quick(td);

        KASSERT(ident != NULL, ("tsleep: no ident"));
        KASSERT(lp == NULL ||
                lp->lwp_stat == LSRUN ||        /* Obvious */
                lp->lwp_stat == LSSTOP,         /* Set in tstop */
                ("tsleep %p %s %d",
                        ident, wmesg, lp->lwp_stat));

        /*
         * We interlock the sleep queue if the caller has not already done
         * it for us.  This must be done before we potentially acquire any
         * tokens or we can lose the wakeup.
         */
        if ((flags & PINTERLOCKED) == 0) {
                _tsleep_interlock(gd, ident, flags);
        }

        /*
         * Setup for the current process (if this is a process).  We must
         * interlock with lwp_token to avoid remote wakeup races via
         * setrunnable()
         */
        if (lp) {
                lwkt_gettoken(&lp->lwp_token);

                /*
                 * If the umbrella process is in the SCORE state then
                 * make sure that the thread is flagged going into a
                 * normal sleep to allow the core dump to proceed, otherwise
                 * the coredump can end up waiting forever.  If the normal
                 * sleep is woken up, the thread will enter a stopped state
                 * upon return to userland.
                 *
                 * We do not want to interrupt or cause a thread exit at
                 * this juncture because that will mess up the state the
                 * coredump is trying to save.
                 */
                if (p->p_stat == SCORE &&
                    (lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                        atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
                        ++p->p_nstopped;
                }

                /*
                 * PCATCH requested.
                 */
                if (catch) {
                        /*
                         * Early termination if PCATCH was set and a
                         * signal is pending, interlocked with the
                         * critical section.
                         *
                         * Early termination only occurs when tsleep() is
                         * entered while in a normal LSRUN state.
                         */
                        if ((sig = CURSIG(lp)) != 0)
                                goto resume;

                        /*
                         * Causes ksignal to wake us up if a signal is
                         * received (interlocked with lp->lwp_token).
                         */
                        lp->lwp_flags |= LWP_SINTR;
                }
        } else {
                KKASSERT(p == NULL);
        }

        /*
         * Make sure the current process has been untangled from
         * the userland scheduler and initialize slptime to start
         * counting.
         *
         * NOTE: td->td_wakefromcpu is pre-set by the release function
         *       for the dfly scheduler, and then adjusted by _wakeup()
         */
        if (lp) {
                p->p_usched->release_curproc(lp);
                lp->lwp_slptime = 0;
        }

        /*
         * For PINTERLOCKED operation, TDF_TSLEEPQ might not be set if
         * a wakeup() was processed before the thread could go to sleep.
         *
         * If TDF_TSLEEPQ is set, make sure the ident matches the recorded
         * ident.  If it does not then the thread slept in between the
         * caller's initial tsleep_interlock() call and the caller's tsleep()
         * call.
         *
         * Extreme loads can cause the sending of an IPI (e.g. wakeup()'s)
         * to process incoming IPIs, thus draining incoming wakeups.
         */
        if ((td->td_flags & TDF_TSLEEPQ) == 0) {
                logtsleep2(ilockfail, ident);
                goto resume;
        } else if (td->td_wchan != ident ||
                   td->td_wdomain != (flags & PDOMAIN_MASK)) {
                logtsleep2(ilockfail, ident);
                goto resume;
        }

        /*
         * Scheduling is blocked while in a critical section.  Coincide
         * the descheduled-by-tsleep flag with the descheduling of the
         * lwkt.
         *
         * The timer callout is localized on our cpu and interlocked by
         * our critical section.
         */
        lwkt_deschedule_self(td);
        td->td_flags |= TDF_TSLEEP_DESCHEDULED;
        td->td_wmesg = wmesg;

        /*
         * Setup the timeout, if any.  The timeout is only operable while
         * the thread is flagged descheduled.
         */
        KKASSERT((td->td_flags & TDF_TIMEOUT) == 0);
        if (timo) {
                callout_init_mp(&thandle);
                callout_reset(&thandle, timo, endtsleep, td);
        }

        /*
         * Beddy bye bye.
         */
        if (lp) {
                /*
                 * Ok, we are sleeping.  Place us in the LSSLEEP state.
                 */
                KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

                /*
                 * tstop() sets LSSTOP, so don't fiddle with that.
                 */
                if (lp->lwp_stat != LSSTOP)
                        lp->lwp_stat = LSSLEEP;
                lp->lwp_ru.ru_nvcsw++;
                p->p_usched->uload_update(lp);
                lwkt_switch();

                /*
                 * And when we are woken up, put us back in LSRUN.  If we
                 * slept for over a second, recalculate our estcpu.
                 */
                lp->lwp_stat = LSRUN;
                if (lp->lwp_slptime) {
                        p->p_usched->uload_update(lp);
                        p->p_usched->recalculate(lp);
                }
                lp->lwp_slptime = 0;
        } else {
                lwkt_switch();
        }

        /*
         * Make sure we haven't switched cpus while we were asleep.  It's
         * not supposed to happen.  Cleanup our temporary flags.
         */
        KKASSERT(gd == td->td_gd);

        /*
         * Cleanup the timeout.  If the timeout has already occurred thandle
         * has already been stopped, otherwise stop thandle.  If the timeout
         * is running (the callout thread must be blocked trying to get
         * lwp_token) then wait for us to get scheduled.
         */
        if (timo) {
                while (td->td_flags & TDF_TIMEOUT_RUNNING) {
                        /* else we won't get rescheduled! */
                        if (lp->lwp_stat != LSSTOP)
                                lp->lwp_stat = LSSLEEP;
                        lwkt_deschedule_self(td);
                        td->td_wmesg = "tsrace";
                        lwkt_switch();
                        kprintf("td %p %s: timeout race\n", td, td->td_comm);
                }
                if (td->td_flags & TDF_TIMEOUT) {
                        td->td_flags &= ~TDF_TIMEOUT;
                        error = EWOULDBLOCK;
                } else {
                        /* does not block when on same cpu */
                        callout_stop(&thandle);
                }
        }
        td->td_flags &= ~TDF_TSLEEP_DESCHEDULED;

        /*
         * Make sure we have been removed from the sleepq.  In most
         * cases this will have been done for us already but it is
         * possible for a scheduling IPI to be in-flight from a
         * previous tsleep/tsleep_interlock() or due to a straight-out
         * call to lwkt_schedule() (in the case of an interrupt thread),
         * causing a spurious wakeup.
         */
        _tsleep_remove(td);
        td->td_wmesg = NULL;

        /*
         * Figure out the correct error return.  If interrupted by a
         * signal we want to return EINTR or ERESTART.
         */
resume:
        if (lp) {
                if (catch && error == 0) {
                        if (sig != 0 || (sig = CURSIG(lp))) {
                                if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
                                        error = EINTR;
                                else
                                        error = ERESTART;
                        }
                }

                lp->lwp_flags &= ~LWP_SINTR;

                /*
                 * Unconditionally set us to LSRUN on resume.  lwp_stat could
                 * be in a weird state due to the goto resume, particularly
                 * when tsleep() is called from tstop().
                 */
                lp->lwp_stat = LSRUN;
                lwkt_reltoken(&lp->lwp_token);
        }
        logtsleep1(tsleep_end);
        crit_exit_quick(td);

        return (error);
}
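/*
 * Illustrative call (hypothetical wait channel): sleep for at most one
 * second, waking early on signals, and handle the return values
 * documented above.  0 means a wakeup arrived, EWOULDBLOCK means the
 * timeout expired, and EINTR/ERESTART report a signal when PCATCH is
 * used:
 *
 *      error = tsleep(&sc->event, PCATCH, "scevt", hz);
 *      if (error == EWOULDBLOCK)
 *              ...                     timed out, poll again
 *      else if (error == EINTR || error == ERESTART)
 *              ...                     interrupted by a signal
 *      else
 *              ...                     the event occurred
 */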
798 * 799 * This routine is fairly important along the critical path, so optimize it 800 * heavily. 801 */ 802 int 803 ssleep(const volatile void *ident, struct spinlock *spin, int flags, 804 const char *wmesg, int timo) 805 { 806 globaldata_t gd = mycpu; 807 int error; 808 809 _tsleep_interlock(gd, ident, flags); 810 spin_unlock_quick(gd, spin); 811 error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); 812 KKASSERT(gd == mycpu); 813 _spin_lock_quick(gd, spin, wmesg); 814 815 return (error); 816 } 817 818 int 819 lksleep(const volatile void *ident, struct lock *lock, int flags, 820 const char *wmesg, int timo) 821 { 822 globaldata_t gd = mycpu; 823 int error; 824 825 _tsleep_interlock(gd, ident, flags); 826 lockmgr(lock, LK_RELEASE); 827 error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); 828 lockmgr(lock, LK_EXCLUSIVE); 829 830 return (error); 831 } 832 833 /* 834 * Interlocked mutex sleep. An exclusively held mutex must be passed 835 * to mtxsleep(). The function will atomically release the mutex 836 * and tsleep on the ident, then reacquire the mutex and return. 837 */ 838 int 839 mtxsleep(const volatile void *ident, struct mtx *mtx, int flags, 840 const char *wmesg, int timo) 841 { 842 globaldata_t gd = mycpu; 843 int error; 844 845 _tsleep_interlock(gd, ident, flags); 846 mtx_unlock(mtx); 847 error = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); 848 mtx_lock_ex_quick(mtx); 849 850 return (error); 851 } 852 853 /* 854 * Interlocked serializer sleep. An exclusively held serializer must 855 * be passed to zsleep(). The function will atomically release 856 * the serializer and tsleep on the ident, then reacquire the serializer 857 * and return. 858 */ 859 int 860 zsleep(const volatile void *ident, struct lwkt_serialize *slz, int flags, 861 const char *wmesg, int timo) 862 { 863 globaldata_t gd = mycpu; 864 int ret; 865 866 ASSERT_SERIALIZED(slz); 867 868 _tsleep_interlock(gd, ident, flags); 869 lwkt_serialize_exit(slz); 870 ret = tsleep(ident, flags | PINTERLOCKED, wmesg, timo); 871 lwkt_serialize_enter(slz); 872 873 return ret; 874 } 875 876 /* 877 * Directly block on the LWKT thread by descheduling it. This 878 * is much faster then tsleep(), but the only legal way to wake 879 * us up is to directly schedule the thread. 880 * 881 * Setting TDF_SINTR will cause new signals to directly schedule us. 882 * 883 * This routine must be called while in a critical section. 884 */ 885 int 886 lwkt_sleep(const char *wmesg, int flags) 887 { 888 thread_t td = curthread; 889 int sig; 890 891 if ((flags & PCATCH) == 0 || td->td_lwp == NULL) { 892 td->td_flags |= TDF_BLOCKED; 893 td->td_wmesg = wmesg; 894 lwkt_deschedule_self(td); 895 lwkt_switch(); 896 td->td_wmesg = NULL; 897 td->td_flags &= ~TDF_BLOCKED; 898 return(0); 899 } 900 if ((sig = CURSIG(td->td_lwp)) != 0) { 901 if (SIGISMEMBER(td->td_proc->p_sigacts->ps_sigintr, sig)) 902 return(EINTR); 903 else 904 return(ERESTART); 905 906 } 907 td->td_flags |= TDF_BLOCKED | TDF_SINTR; 908 td->td_wmesg = wmesg; 909 lwkt_deschedule_self(td); 910 lwkt_switch(); 911 td->td_flags &= ~(TDF_BLOCKED | TDF_SINTR); 912 td->td_wmesg = NULL; 913 return(0); 914 } 915 916 /* 917 * Implement the timeout for tsleep. 918 * 919 * This type of callout timeout is scheduled on the same cpu the process 920 * is sleeping on. Also, at the moment, the MP lock is held. 921 */ 922 static void 923 endtsleep(void *arg) 924 { 925 thread_t td = arg; 926 struct lwp *lp; 927 928 /* 929 * We are going to have to get the lwp_token, which means we might 930 * block. 
/*
 * Implement the timeout for tsleep.
 *
 * This type of callout timeout is scheduled on the same cpu the process
 * is sleeping on.  Also, at the moment, the MP lock is held.
 */
static void
endtsleep(void *arg)
{
        thread_t td = arg;
        struct lwp *lp;

        /*
         * We are going to have to get the lwp_token, which means we might
         * block.  This can race a tsleep getting woken up by other means
         * so set TDF_TIMEOUT_RUNNING to force the tsleep to wait for our
         * processing to complete (sorry tsleep!).
         *
         * We can safely set td_flags because td MUST be on the same cpu
         * as we are.
         */
        KKASSERT(td->td_gd == mycpu);
        crit_enter();
        td->td_flags |= TDF_TIMEOUT_RUNNING | TDF_TIMEOUT;

        /*
         * This can block but TDF_TIMEOUT_RUNNING will prevent the thread
         * from exiting the tsleep on us.  The flag is interlocked by virtue
         * of lp being on the same cpu as we are.
         */
        if ((lp = td->td_lwp) != NULL)
                lwkt_gettoken(&lp->lwp_token);

        KKASSERT(td->td_flags & TDF_TSLEEP_DESCHEDULED);

        if (lp) {
                /*
                 * The callout timer should normally never be set in tstop()
                 * because it passes a timeout of 0.  However, there is a
                 * case during thread exit (which SSTOP's all the threads)
                 * for which tstop() must break out and can (properly) leave
                 * the thread in LSSTOP.
                 */
                KKASSERT(lp->lwp_stat != LSSTOP ||
                         (lp->lwp_mpflags & LWP_MP_WEXIT));
                setrunnable(lp);
                lwkt_reltoken(&lp->lwp_token);
        } else {
                _tsleep_remove(td);
                lwkt_schedule(td);
        }
        KKASSERT(td->td_gd == mycpu);
        td->td_flags &= ~TDF_TIMEOUT_RUNNING;
        crit_exit();
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 * count may be zero or one only.
 *
 * The domain encodes the sleep/wakeup domain, flags, plus the originating
 * cpu.
 *
 * This call may run without the MP lock held.  We can only manipulate thread
 * state on the cpu owning the thread.  We CANNOT manipulate process state
 * at all.
 *
 * _wakeup() can be passed to an IPI so we can't use (const volatile
 * void *ident).
 */
static void
_wakeup(void *ident, int domain)
{
        struct tslpque *qp;
        struct thread *td;
        struct thread *ntd;
        globaldata_t gd;
        cpumask_t mask;
        uint32_t cid;
        uint32_t gid;
        int wids = 0;

        crit_enter();
        logtsleep2(wakeup_beg, ident);
        gd = mycpu;
        cid = LOOKUP(ident);
        gid = TCHASHSHIFT(cid);
        qp = &gd->gd_tsleep_hash[gid];
restart:
        for (td = TAILQ_FIRST(&qp->queue); td != NULL; td = ntd) {
                ntd = TAILQ_NEXT(td, td_sleepq);
                if (td->td_wchan == ident &&
                    td->td_wdomain == (domain & PDOMAIN_MASK)
                ) {
                        KKASSERT(td->td_gd == gd);
                        _tsleep_remove(td);
                        td->td_wakefromcpu = PWAKEUP_DECODE(domain);
                        if (td->td_flags & TDF_TSLEEP_DESCHEDULED) {
                                lwkt_schedule(td);
                                if (domain & PWAKEUP_ONE)
                                        goto done;
                        }
                        goto restart;
                }
                if (td->td_wchan == qp->ident0)
                        wids |= 1;
                else if (td->td_wchan == qp->ident1)
                        wids |= 2;
                else if (td->td_wchan == qp->ident2)
                        wids |= 4;
                else if (td->td_wchan == qp->ident3)
                        wids |= 8;
                else
                        wids |= 16;     /* force ident0 to be retained (-1) */
        }

        /*
         * Because a bunch of cpumask array entries cover the same queue, it
         * is possible for our bit to remain set in some of them and cause
         * spurious wakeup IPIs later on.  Make sure that the bit is cleared
         * when a spurious IPI occurs to prevent further spurious IPIs.
         */
        if (TAILQ_FIRST(&qp->queue) == NULL) {
                ATOMIC_CPUMASK_NANDBIT(slpque_cpumasks[cid], gd->gd_cpuid);
                qp->ident0 = NULL;
                qp->ident1 = NULL;
                qp->ident2 = NULL;
                qp->ident3 = NULL;
        } else {
                if ((wids & 1) == 0) {
                        if ((wids & 16) == 0) {
                                qp->ident0 = NULL;
                        } else {
                                KKASSERT(qp->ident0 == (void *)(intptr_t)-1);
                        }
                }
                if ((wids & 2) == 0)
                        qp->ident1 = NULL;
                if ((wids & 4) == 0)
                        qp->ident2 = NULL;
                if ((wids & 8) == 0)
                        qp->ident3 = NULL;
        }

        /*
         * We finished checking the current cpu but there still may be
         * more work to do.  Either wakeup_one was requested and no matching
         * thread was found, or a normal wakeup was requested and we have
         * to continue checking cpus.
         *
         * It should be noted that this scheme is actually less expensive
         * than the old scheme when waking up multiple threads, since we send
         * only one IPI message per target candidate which may then schedule
         * multiple threads.  Before we could have wound up sending an IPI
         * message for each thread on the target cpu (!= current cpu) that
         * needed to be woken up.
         *
         * NOTE: Wakeups occurring on remote cpus are asynchronous.  This
         *       should be ok since we are passing idents in the IPI rather
         *       than thread pointers.
         *
         * NOTE: We MUST mfence (or use an atomic op) prior to reading
         *       the cpumask, as another cpu may have written to it in
         *       a fashion interlocked with whatever the caller did before
         *       calling wakeup().  Otherwise we might miss the interaction
         *       (kern_mutex.c can cause this problem).
         *
         *       lfence is insufficient as it may allow a written state to
         *       reorder around the cpumask load.
         */
        if ((domain & PWAKEUP_MYCPU) == 0) {
                globaldata_t tgd;
                const volatile void *id0;
                int n;

                cpu_mfence();
                /* cpu_lfence(); */
                mask = slpque_cpumasks[cid];
                CPUMASK_ANDMASK(mask, gd->gd_other_cpus);
                while (CPUMASK_TESTNZERO(mask)) {
                        n = BSRCPUMASK(mask);
                        CPUMASK_NANDBIT(mask, n);
                        tgd = globaldata_find(n);

                        /*
                         * Both ident0 compares must come from a single load
                         * to avoid ident0 update races crossing the two
                         * compares.
                         */
                        qp = &tgd->gd_tsleep_hash[gid];
                        id0 = qp->ident0;
                        cpu_ccfence();
                        if (id0 == (void *)(intptr_t)-1) {
                                lwkt_send_ipiq2(tgd, _wakeup, ident,
                                                domain | PWAKEUP_MYCPU);
                                ++tgd->gd_cnt.v_wakeup_colls;
                        } else if (id0 == ident ||
                                   qp->ident1 == ident ||
                                   qp->ident2 == ident ||
                                   qp->ident3 == ident) {
                                lwkt_send_ipiq2(tgd, _wakeup, ident,
                                                domain | PWAKEUP_MYCPU);
                        }
                }
#if 0
                if (CPUMASK_TESTNZERO(mask)) {
                        lwkt_send_ipiq2_mask(mask, _wakeup, ident,
                                             domain | PWAKEUP_MYCPU);
                }
#endif
        }
done:
        logtsleep1(wakeup_end);
        crit_exit();
}

/*
 * Wakeup all threads tsleep()ing on the specified ident, on all cpus
 */
void
wakeup(const volatile void *ident)
{
        globaldata_t gd = mycpu;
        thread_t td = gd->gd_curthread;

        if (td && (td->td_flags & TDF_DELAYED_WAKEUP)) {
                /*
                 * If we are in a delayed wakeup section, record up to two
                 * wakeups in a per-CPU queue and issue them when we block
                 * or exit the delayed wakeup section.
                 */
                if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[0], NULL, ident))
                        return;
                if (atomic_cmpset_ptr(&gd->gd_delayed_wakeup[1], NULL, ident))
                        return;

                ident = atomic_swap_ptr(__DEQUALIFY(volatile void **,
                                            &gd->gd_delayed_wakeup[1]),
                                        __DEALL(ident));
                ident = atomic_swap_ptr(__DEQUALIFY(volatile void **,
                                            &gd->gd_delayed_wakeup[0]),
                                        __DEALL(ident));
        }

        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, gd->gd_cpuid));
}
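/*
 * Illustrative producer side (hypothetical softc fields) pairing with
 * the consumer pattern shown near tsleep_interlock().  The state is
 * updated under the same lock the sleeper uses before the wakeup is
 * issued:
 *
 *      spin_lock(&sc->spin);
 *      sc->ready = 1;
 *      spin_unlock(&sc->spin);
 *      wakeup(&sc->ready);
 */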
/*
 * Wakeup one thread tsleep()ing on the specified ident, on any cpu.
 */
void
wakeup_one(const volatile void *ident)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_ONE);
}

/*
 * Wakeup threads tsleep()ing on the specified ident on the current cpu
 * only.
 */
void
wakeup_mycpu(const volatile void *ident)
{
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_MYCPU);
}

/*
 * Wakeup one thread tsleep()ing on the specified ident on the current cpu
 * only.
 */
void
wakeup_mycpu_one(const volatile void *ident)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mycpu->gd_cpuid) |
                                PWAKEUP_MYCPU | PWAKEUP_ONE);
}

/*
 * Wakeup all threads tsleep()ing on the specified ident on the specified cpu
 * only.
 */
void
wakeup_oncpu(globaldata_t gd, const volatile void *ident)
{
        globaldata_t mygd = mycpu;

        if (gd == mygd) {
                _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                        PWAKEUP_MYCPU);
        } else {
                lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                                PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                PWAKEUP_MYCPU);
        }
}

/*
 * Wakeup one thread tsleep()ing on the specified ident on the specified cpu
 * only.
 */
void
wakeup_oncpu_one(globaldata_t gd, const volatile void *ident)
{
        globaldata_t mygd = mycpu;

        if (gd == mygd) {
                _wakeup(__DEALL(ident), PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                        PWAKEUP_MYCPU | PWAKEUP_ONE);
        } else {
                lwkt_send_ipiq2(gd, _wakeup, __DEALL(ident),
                                PWAKEUP_ENCODE(0, mygd->gd_cpuid) |
                                PWAKEUP_MYCPU | PWAKEUP_ONE);
        }
}

/*
 * Wakeup all threads waiting on the specified ident that slept using
 * the specified domain, on all cpus.
 */
void
wakeup_domain(const volatile void *ident, int domain)
{
        _wakeup(__DEALL(ident), PWAKEUP_ENCODE(domain, mycpu->gd_cpuid));
}

/*
 * Wakeup one thread waiting on the specified ident that slept using
 * the specified domain, on any cpu.
 */
void
wakeup_domain_one(const volatile void *ident, int domain)
{
        /* XXX potentially round-robin the first responding cpu */
        _wakeup(__DEALL(ident),
                PWAKEUP_ENCODE(domain, mycpu->gd_cpuid) | PWAKEUP_ONE);
}
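/*
 * Illustrative domain usage (umtx-style; uaddr is a hypothetical
 * address).  A sleeper passes the domain in its tsleep() flags and is
 * then only matched by a wakeup issued in the same domain:
 *
 *      tsleep(uaddr, PDOMAIN_UMTX | PCATCH, "umtxsl", timo);
 *      ...
 *      wakeup_domain(uaddr, PDOMAIN_UMTX);
 */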
void
wakeup_start_delayed(void)
{
        globaldata_t gd = mycpu;

        crit_enter();
        gd->gd_curthread->td_flags |= TDF_DELAYED_WAKEUP;
        crit_exit();
}

void
wakeup_end_delayed(void)
{
        globaldata_t gd = mycpu;

        if (gd->gd_curthread->td_flags & TDF_DELAYED_WAKEUP) {
                crit_enter();
                gd->gd_curthread->td_flags &= ~TDF_DELAYED_WAKEUP;
                if (gd->gd_delayed_wakeup[0] || gd->gd_delayed_wakeup[1]) {
                        if (gd->gd_delayed_wakeup[0]) {
                                wakeup(gd->gd_delayed_wakeup[0]);
                                gd->gd_delayed_wakeup[0] = NULL;
                        }
                        if (gd->gd_delayed_wakeup[1]) {
                                wakeup(gd->gd_delayed_wakeup[1]);
                                gd->gd_delayed_wakeup[1] = NULL;
                        }
                }
                crit_exit();
        }
}
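/*
 * Illustrative use of a delayed wakeup section (sketch).  Up to two
 * wakeups issued inside the section are recorded per-cpu and batched;
 * they are replayed when the section ends, or earlier if the thread
 * would block (see the flush at the top of tsleep()):
 *
 *      wakeup_start_delayed();
 *      ...                     code that may call wakeup() several times
 *      wakeup_end_delayed();
 */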
/*
 * setrunnable()
 *
 * Make a process runnable.  lp->lwp_token must be held on call and this
 * function must be called from the cpu owning lp.
 *
 * This only has an effect if we are in LSSTOP or LSSLEEP.
 */
void
setrunnable(struct lwp *lp)
{
        thread_t td = lp->lwp_thread;

        ASSERT_LWKT_TOKEN_HELD(&lp->lwp_token);
        KKASSERT(td->td_gd == mycpu);
        crit_enter();
        if (lp->lwp_stat == LSSTOP)
                lp->lwp_stat = LSSLEEP;
        if (lp->lwp_stat == LSSLEEP) {
                _tsleep_remove(td);
                lwkt_schedule(td);
        } else if (td->td_flags & TDF_SINTR) {
                lwkt_schedule(td);
        }
        crit_exit();
}

/*
 * The process is stopped due to some condition, usually because p_stat is
 * set to SSTOP, but also possibly due to being traced.
 *
 * Caller must hold p->p_token
 *
 * NOTE!  If the caller sets SSTOP, the caller must also clear P_WAITED
 * because the parent may check the child's status before the child actually
 * gets to this routine.
 *
 * This routine is called with the current lwp only, typically just
 * before returning to userland if the process state is detected as
 * possibly being in a stopped state.
 */
void
tstop(void)
{
        struct lwp *lp = curthread->td_lwp;
        struct proc *p = lp->lwp_proc;
        struct proc *q;

        lwkt_gettoken(&lp->lwp_token);
        crit_enter();

        /*
         * If LWP_MP_WSTOP is set, we were sleeping
         * while our process was stopped.  At this point
         * we were already counted as stopped.
         */
        if ((lp->lwp_mpflags & LWP_MP_WSTOP) == 0) {
                /*
                 * If we're the last thread to stop, signal
                 * our parent.
                 */
                p->p_nstopped++;
                atomic_set_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
                wakeup(&p->p_nstopped);
                if (p->p_nstopped == p->p_nthreads) {
                        /*
                         * Token required to interlock kern_wait()
                         */
                        q = p->p_pptr;
                        PHOLD(q);
                        lwkt_gettoken(&q->p_token);
                        p->p_flags &= ~P_WAITED;
                        wakeup(p->p_pptr);
                        if ((q->p_sigacts->ps_flag & PS_NOCLDSTOP) == 0)
                                ksignal(q, SIGCHLD);
                        lwkt_reltoken(&q->p_token);
                        PRELE(q);
                }
        }

        /*
         * Wait here while in a stopped state, interlocked with lwp_token.
         * We must break out if the whole process is trying to exit.
         */
        while (STOPLWP(p, lp)) {
                lp->lwp_stat = LSSTOP;
                tsleep(p, 0, "stop", 0);
        }
        p->p_nstopped--;
        atomic_clear_int(&lp->lwp_mpflags, LWP_MP_WSTOP);
        crit_exit();
        lwkt_reltoken(&lp->lwp_token);
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.  This is a pcpu callout.
 *
 * We segment the lwp scan on a pcpu basis.  This does NOT
 * mean the associated lwps are on this cpu; it is done
 * just to break the work up.
 *
 * The callout on cpu0 rolls up the stats from the other
 * cpus.
 */
static int loadav_count_runnable(struct lwp *p, void *data);

static void
loadav(void *arg)
{
        globaldata_t gd = mycpu;
        struct loadavg *avg;
        int i, nrun;

        nrun = 0;
        alllwp_scan(loadav_count_runnable, &nrun, 1);
        gd->gd_loadav_nrunnable = nrun;
        if (gd->gd_cpuid == 0) {
                avg = &averunnable;
                nrun = 0;
                for (i = 0; i < ncpus; ++i)
                        nrun += globaldata_find(i)->gd_loadav_nrunnable;
                for (i = 0; i < 3; i++) {
                        avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
                            (long)nrun * FSCALE * (FSCALE - cexp[i])) >>
                            FSHIFT;
                }
        }

        /*
         * Schedule the next update to occur after 5 seconds, but add a
         * random variation to avoid synchronisation with processes that
         * run at regular intervals.
         */
        callout_reset(&gd->gd_loadav_callout,
                      hz * 4 + (int)(krandom() % (hz * 2 + 1)),
                      loadav, NULL);
}
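/*
 * Illustrative arithmetic for the update above (FSCALE fixed point):
 * each sample applies an exponentially weighted moving average,
 *
 *      ldavg' = (cexp * ldavg + nrun * FSCALE * (FSCALE - cexp)) >> FSHIFT
 *
 * with per-sample decay factors exp(-5/60), exp(-5/300) and exp(-5/900)
 * for the 1, 5 and 15 minute averages respectively (5 second sampling).
 */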
static int
loadav_count_runnable(struct lwp *lp, void *data)
{
        int *nrunp = data;
        thread_t td;

        switch (lp->lwp_stat) {
        case LSRUN:
                if ((td = lp->lwp_thread) == NULL)
                        break;
                if (td->td_flags & TDF_BLOCKED)
                        break;
                ++*nrunp;
                break;
        default:
                break;
        }
        lwkt_yield();
        return(0);
}

/*
 * Regular data collection
 */
static uint64_t
collect_load_callback(int n)
{
        int fscale = averunnable.fscale;

        return ((averunnable.ldavg[0] * 100 + (fscale >> 1)) / fscale);
}

static void
sched_setup(void *dummy __unused)
{
        globaldata_t save_gd = mycpu;
        globaldata_t gd;
        int n;

        kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
                          KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));

        /*
         * Kick off timeout driven events by calling first time.  We
         * split the work across available cpus to help scale it;
         * it can eat a lot of cpu when there are a lot of processes
         * on the system.
         */
        for (n = 0; n < ncpus; ++n) {
                gd = globaldata_find(n);
                lwkt_setcpu_self(gd);
                callout_init_mp(&gd->gd_loadav_callout);
                callout_init_mp(&gd->gd_schedcpu_callout);
                schedcpu(NULL);
                loadav(NULL);
        }
        lwkt_setcpu_self(save_gd);
}

/*
 * Extremely early initialization, dummy-up the tables so we don't have
 * to conditionalize for NULL in _wakeup() and tsleep_interlock().  Even
 * though the system isn't blocking this early, these functions still
 * try to access the hash table.
 *
 * This setup will be overridden once sched_dyninit() -> sleep_gdinit()
 * is called.
 */
void
sleep_early_gdinit(globaldata_t gd)
{
        static struct tslpque dummy_slpque;
        static cpumask_t dummy_cpumasks;

        slpque_tablesize = 1;
        gd->gd_tsleep_hash = &dummy_slpque;
        slpque_cpumasks = &dummy_cpumasks;
        TAILQ_INIT(&dummy_slpque.queue);
}

/*
 * PCPU initialization.  Called after KMALLOC is operational, by
 * sched_dyninit() for cpu 0, and by mi_gdinit() for other cpus later.
 *
 * WARNING! The pcpu hash table is smaller than the global cpumask
 *          hash table, which can save us a lot of memory when maxproc
 *          is set high.
 */
void
sleep_gdinit(globaldata_t gd)
{
        struct thread *td;
        size_t hash_size;
        uint32_t n;
        uint32_t i;

        /*
         * This shouldn't happen, that is, there shouldn't be any threads
         * waiting on the dummy tsleep queue this early in the boot.
         */
        if (gd->gd_cpuid == 0) {
                struct tslpque *qp = &gd->gd_tsleep_hash[0];

                TAILQ_FOREACH(td, &qp->queue, td_sleepq) {
                        kprintf("SLEEP_GDINIT SWITCH %s\n", td->td_comm);
                }
        }

        /*
         * Note that we have to allocate one extra slot because we are
         * shifting a modulo value.  TCHASHSHIFT(slpque_tablesize - 1) can
         * return the same value as TCHASHSHIFT(slpque_tablesize).
         */
        n = TCHASHSHIFT(slpque_tablesize) + 1;

        hash_size = sizeof(struct tslpque) * n;
        gd->gd_tsleep_hash = (void *)kmem_alloc3(&kernel_map, hash_size,
                                                 VM_SUBSYS_GD,
                                                 KM_CPU(gd->gd_cpuid));
        memset(gd->gd_tsleep_hash, 0, hash_size);
        for (i = 0; i < n; ++i)
                TAILQ_INIT(&gd->gd_tsleep_hash[i].queue);
}

/*
 * Dynamic initialization after the memory system is operational.
 */
static void
sched_dyninit(void *dummy __unused)
{
        int tblsize;
        int tblsize2;
        int n;

        /*
         * Calculate table size for slpque hash.  We want a prime number
         * large enough to avoid overloading slpque_cpumasks when the
         * system has a large number of sleeping processes, which will
         * spam IPIs on wakeup().
         *
         * While it is true this is really a per-lwp factor, generally
         * speaking the maxproc limit is a good metric to go by.
         */
        for (tblsize = maxproc | 1; ; tblsize += 2) {
                if (tblsize % 3 == 0)
                        continue;
                if (tblsize % 5 == 0)
                        continue;
                tblsize2 = (tblsize / 2) | 1;
                for (n = 7; n < tblsize2; n += 2) {
                        if (tblsize % n == 0)
                                break;
                }
                if (n == tblsize2)
                        break;
        }

        /*
         * PIDs are currently limited to 6 digits.  Cap the table size
         * at double this.
         */
        if (tblsize > 2000003)
                tblsize = 2000003;

        slpque_tablesize = tblsize;
        slpque_cpumasks = kmalloc(sizeof(*slpque_cpumasks) * slpque_tablesize,
                                  M_TSLEEP, M_WAITOK | M_ZERO);
        sleep_gdinit(mycpu);
}