/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 * $DragonFly: src/sys/kern/kern_synch.c,v 1.22 2003/09/25 01:47:56 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <sys/xwait.h>

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup (void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
int	ncpus;

static struct callout loadav_callout;

struct loadavg averunnable =
	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */

/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
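/*
 * For reference, each coefficient below is exp(-t/T) in fixed point,
 * where t is the 5 second sampling interval and T is the averaging
 * period in seconds: exp(-5/60) = exp(-1/12) for the 1 minute average,
 * exp(-5/300) = exp(-1/60) for 5 minutes, and exp(-5/900) = exp(-1/180)
 * for 15 minutes.
 */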
static fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

static void	endtsleep (void *);
static void	loadav (void *arg);
static void	maybe_resched (struct proc *chk);
static void	roundrobin (void *arg);
static void	schedcpu (void *arg);
static void	updatepri (struct proc *p);
static void	crit_panicints(void);

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");

/*
 * Arrange to reschedule if necessary by checking to see if the current
 * process is on the highest priority user scheduling queue.  This may
 * be run from an interrupt so we have to follow any preemption chains
 * back to the original process.
 */
static void
maybe_resched(struct proc *chk)
{
	struct proc *cur = lwkt_preempted_proc();

	if (cur == NULL)
		return;

	/*
	 * Check the user queue (realtime, normal, idle).  Lower numbers
	 * indicate higher priority queues.  Lower numbers are also better
	 * for p_priority.
	 */
	if (chk->p_rtprio.type < cur->p_rtprio.type) {
		need_resched();
	} else if (chk->p_rtprio.type == cur->p_rtprio.type) {
		if (chk->p_rtprio.type == RTP_PRIO_NORMAL) {
			if (chk->p_priority / PPQ < cur->p_priority / PPQ)
				need_resched();
		} else {
			if (chk->p_rtprio.prio < cur->p_rtprio.prio)
				need_resched();
		}
	}
}

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
#ifdef SMP

static void
roundrobin_remote(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
}

#endif

static void
roundrobin(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#ifdef SMP
	lwkt_send_ipiq_mask(mycpu->gd_other_cpus, roundrobin_remote, NULL);
#endif
	timeout(roundrobin, NULL, sched_quantum);
}

#ifdef SMP

void
resched_cpus(u_int32_t mask)
{
	lwkt_send_ipiq_mask(mask, roundrobin_remote, NULL);
}

#endif
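/*
 * For illustration, assuming the common hz == 100 configuration:
 * sleepinit() below sets sched_quantum = hz / 10 = 10 ticks, giving
 * the 100ms round-robin quantum mentioned above, and hogticks =
 * 2 * sched_quantum = 20 ticks (200ms).
 */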
/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *	      exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *	      therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *	      ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *	      therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
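/*
 * A worked example of decay_cpu(), assuming a steady load average of
 * 1.0 (i.e. ldavg[0] == FSCALE): loadfactor() yields 2 * FSCALE, so
 * each schedcpu() pass scales p_estcpu by 2/(2 + 1) = 2/3.  Over the
 * 5 * loadavg = 5 second window this compounds to (2/3)**5 =~ 0.13,
 * i.e. roughly 90% of the accumulated cpu usage is forgotten.
 */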
/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	struct proc *curp;
	int realstathz, s;

	curp = lwkt_preempted_proc(); /* YYY temporary hack */

	realstathz = stathz ? stathz : hz;
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		splx(s);
	}
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(struct proc *p)
{
	unsigned int newcpu = p->p_estcpu;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac) {
		p->p_estcpu = 0;
	} else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
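/*
 * Example of the hash above: for an ident of 0x12345678, LOOKUP()
 * shifts right by 8 (giving 0x123456) and masks with TABLESIZE - 1
 * (0x7f), selecting slot 0x56.  Distinct idents can collide, which is
 * harmless: the wakeup code re-checks td_wchan against the actual
 * ident before waking a thread.
 */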
/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * If the process has P_CURPROC set mi_switch() will not re-queue it to
 * the userland scheduler queues because we are in a SSLEEP state.  If
 * we are not the current process then we have to remove ourselves from
 * the scheduler queues.
 *
 * YYY priority now unused
 */
int
tsleep(ident, flags, wmesg, timo)
	void *ident;
	int flags, timo;
	const char *wmesg;
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;		/* may be NULL */
	int s, sig = 0, catch = flags & PCATCH;
	int id = LOOKUP(ident);
	struct callout_handle thandle;

	/*
	 * NOTE: removed KTRPOINT, it could cause races due to blocking
	 * even in stable.  Just scrap it for now.
	 */
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		crit_panicints();
		return (0);
	}
	KKASSERT(td != &mycpu->gd_idlethread);	/* you must be kidding! */
	s = splhigh();
	KASSERT(ident != NULL, ("tsleep: no ident"));
	KASSERT(p == NULL || p->p_stat == SRUN, ("tsleep %p %s %d",
		ident, wmesg, p->p_stat));

	crit_enter();
	td->td_wchan = ident;
	td->td_wmesg = wmesg;
	if (p)
		p->p_slptime = 0;
	lwkt_deschedule_self();
	TAILQ_INSERT_TAIL(&slpque[id], td, td_threadq);
	if (timo)
		thandle = timeout(endtsleep, (void *)td, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, td->td_wchan will be 0 upon return from CURSIG.
	 */
	if (p) {
		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (td->td_wchan) {
					unsleep(td);
					lwkt_schedule_self();
				}
				p->p_stat = SRUN;
				goto resume;
			}
			if (td->td_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		} else {
			sig = 0;
		}

		/*
		 * If we are not the current process we have to remove
		 * ourselves from the run queue.
		 */
		KASSERT(p->p_stat == SRUN, ("PSTAT NOT SRUN %d %d", p->p_pid, p->p_stat));
		/*
		 * If this is the current 'user' process schedule another one.
		 */
		clrrunnable(p, SSLEEP);
		p->p_stats->p_ru.ru_nvcsw++;
		KKASSERT(td->td_release || (p->p_flag & P_CURPROC) == 0);
		mi_switch();
		KASSERT(p->p_stat == SRUN, ("tsleep: stat not srun"));
	} else {
		lwkt_switch();
	}
resume:
	crit_exit();
	if (p)
		p->p_flag &= ~P_SINTR;
	splx(s);
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		if (sig == 0)
			return (EWOULDBLOCK);
	} else if (timo) {
		untimeout(endtsleep, (void *)td, thandle);
	} else if (td->td_wmesg) {
		/*
		 * This can happen if a thread is woken up directly.  Clear
		 * wmesg to avoid debugging confusion.
		 */
		td->td_wmesg = NULL;
	}
	if (p) {
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
	}
	return (0);
}
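/*
 * A sketch of typical tsleep()/wakeup() usage; the softc pointer "sc"
 * and its SC_READY flag are hypothetical:
 *
 *	while ((sc->sc_flags & SC_READY) == 0) {
 *		error = tsleep(sc, PCATCH, "scrdy", 5 * hz);
 *		if (error != 0)
 *			break;
 *	}
 *
 * A non-zero error is EWOULDBLOCK on timeout, or EINTR/ERESTART if a
 * signal was caught (PCATCH).  The producer side sets SC_READY and
 * then calls wakeup(sc).  The wait is written as a loop because a
 * wakeup only indicates that the condition may have changed; it must
 * be re-tested.
 */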
/*
 * Implement the timeout for tsleep.  We interlock against
 * wchan when setting TDF_TIMEOUT.  For processes we remove
 * the sleep if the process is stopped rather than sleeping,
 * so it remains stopped.
 */
static void
endtsleep(void *arg)
{
	thread_t td = arg;
	struct proc *p;
	int s;

	s = splhigh();
	if (td->td_wchan) {
		td->td_flags |= TDF_TIMEOUT;
		if ((p = td->td_proc) != NULL) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(td);
		} else {
			unsleep(td);
			lwkt_schedule(td);
		}
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct thread *td)
{
	int s;

	s = splhigh();
	if (td->td_wchan) {
#if 0
		if (p->p_flag & P_XSLEEP) {
			struct xwait *w = p->p_wchan;
			TAILQ_REMOVE(&w->waitq, p, p_procq);
			p->p_flag &= ~P_XSLEEP;
		} else
#endif
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_threadq);
		td->td_wchan = NULL;
	}
	splx(s);
}

#if 0
/*
 * Make all processes sleeping on the explicit lock structure runnable.
 */
void
xwakeup(struct xwait *w)
{
	struct proc *p;
	int s;

	s = splhigh();
	++w->gen;
	while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
		TAILQ_REMOVE(&w->waitq, p, p_procq);
		KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
		    ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
		p->p_wchan = NULL;
		p->p_flag &= ~P_XSLEEP;
		if (p->p_stat == SSLEEP) {
			/* OPTIMIZED EXPANSION OF setrunnable(p); */
			if (p->p_slptime > 1)
				updatepri(p);
			p->p_slptime = 0;
			p->p_stat = SRUN;
			if (p->p_flag & P_INMEM) {
				setrunqueue(p);
				maybe_resched(p);
			} else {
				p->p_flag |= P_SWAPINREQ;
				wakeup((caddr_t)&proc0);
			}
		}
	}
	splx(s);
}
#endif

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
static void
_wakeup(void *ident, int count)
{
	struct slpquehead *qp;
	struct thread *td;
	struct thread *ntd;
	struct proc *p;
	int s;
	int id = LOOKUP(ident);

	s = splhigh();
	qp = &slpque[id];
restart:
	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
		ntd = TAILQ_NEXT(td, td_threadq);
		if (td->td_wchan == ident) {
			TAILQ_REMOVE(qp, td, td_threadq);
			td->td_wchan = NULL;
			if ((p = td->td_proc) != NULL && p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
					if (p->p_flag & P_CURPROC)
						maybe_resched(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			} else if (p == NULL) {
				lwkt_schedule(td);
			}
			if (--count == 0)
				break;
			goto restart;
		}
	}
	splx(s);
}

void
wakeup(void *ident)
{
	_wakeup(ident, 0);
}

void
wakeup_one(void *ident)
{
	_wakeup(ident, 1);
}
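/*
 * Note on the count argument to _wakeup() above: wakeup() passes 0,
 * so --count never hits zero and every thread sleeping on the ident
 * is woken, while wakeup_one() passes 1 and stops after the first
 * matching thread is made runnable.
 */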
/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;	/* XXX */
	struct rlimit *rlim;
	int x;
	u_int64_t ttime;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();
	clear_resched();

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  Time spent in interrupts is not
	 * included.  YYY 64 bit match is expensive.  Ick.
	 */
	ttime = td->td_sticks + td->td_uticks;
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    ttime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (ttime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.  If we
	 * are in a SSTOPped state we deschedule ourselves.  YYY this needs
	 * to be cleaned up, remember that LWKTs stay on their run queue
	 * which works differently than the user scheduler which removes
	 * the process from the runq when it runs it.
	 */
	mycpu->gd_cnt.v_swtch++;
	if (p->p_stat == SSTOP)
		lwkt_deschedule_self();
	lwkt_switch();

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct proc *p)
{
	int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p->p_thread);	/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	} else {
		maybe_resched(p);
	}
}

/*
 * Change the process state to NOT be runnable, removing it from the run
 * queue.  If P_CURPROC is not set and we are in SRUN the process is on the
 * run queue (if P_INMEM is not set then it isn't, because it is swapped out).
 */
void
clrrunnable(struct proc *p, int stat)
{
	int s;

	s = splhigh();
	switch(p->p_stat) {
	case SRUN:
		if (p->p_flag & P_ONRUNQ)
			remrunqueue(p);
		break;
	default:
		break;
	}
	p->p_stat = stat;
	splx(s);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * YYY real time / idle procs do not use p_priority XXX
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;
	int opq;
	int npq;

	if (p->p_rtprio.type != RTP_PRIO_NORMAL)
		return;
	newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * p->p_nice;
	newpriority = min(newpriority, MAXPRI);
	npq = newpriority / PPQ;
	crit_enter();
	opq = p->p_priority / PPQ;
	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		remrunqueue(p);
		p->p_priority = newpriority;
		setrunqueue(p);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
		p->p_priority = newpriority;
	}
	crit_exit();
	maybe_resched(p);
}
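/*
 * For illustration, assuming the traditional constants (PUSER == 50,
 * INVERSE_ESTCPU_WEIGHT == 8, NICE_WEIGHT == 2): a process with
 * p_estcpu == 80 and p_nice == 0 gets priority 50 + 80/8 + 0 == 60,
 * while the same process reniced to +10 gets 50 + 10 + 20 == 80.
 * Higher numbers are worse, so accumulating cpu time or being niced
 * both push a process toward lower-priority run queues.
 */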
/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 */
static void
loadav(void *arg)
{
	int i, nrun;
	struct loadavg *avg;
	struct proc *p;

	avg = &averunnable;
	nrun = 0;
	FOREACH_PROC_IN_SYSTEM(p) {
		switch (p->p_stat) {
		case SRUN:
		case SIDL:
			nrun++;
		}
	}
	for (i = 0; i < 3; i++)
		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	/*
	 * Schedule the next update to occur after 5 seconds, but add a
	 * random variation to avoid synchronisation with processes that
	 * run at regular intervals.
	 */
	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
	    loadav, NULL);
}

/* ARGSUSED */
static void
sched_setup(dummy)
	void *dummy;
{

	callout_init(&loadav_callout);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
	loadav(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).  The cpu usage
 * estimator ramps up quite quickly when the process is running
 * (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of
 * CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin
 * among other processes.
 */
void
schedclock(p)
	struct proc *p;
{

	p->p_cpticks++;
	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0)
		resetpriority(p);
}

static
void
crit_panicints(void)
{
	int s;
	int cpri;

	s = splhigh();
	cpri = crit_panic_save();
	splx(safepri);
	crit_panic_restore(cpri);
	splx(s);
}