/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 * $DragonFly: src/sys/kern/kern_synch.c,v 1.33 2004/06/10 22:11:35 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <sys/xwait.h>

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup (void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
int	ncpus;
int	ncpus2, ncpus2_shift, ncpus2_mask;

static struct callout loadav_callout;

struct loadavg averunnable =
	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */

/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
static fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

static void	endtsleep (void *);
static void	loadav (void *arg);
static void	roundrobin (void *arg);
static void	schedcpu (void *arg);
static void	updatepri (struct proc *p);
static void	crit_panicints(void);

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 *
 * WARNING!  The MP lock is not held on IPI message remotes.
 */
#ifdef SMP

static void
roundrobin_remote(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_user_resched();
}

#endif

static void
roundrobin(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_user_resched();
#ifdef SMP
	lwkt_send_ipiq_mask(mycpu->gd_other_cpus, roundrobin_remote, NULL);
#endif
	timeout(roundrobin, NULL, sched_quantum);
}

#ifdef SMP

void
resched_cpus(u_int32_t mask)
{
	lwkt_send_ipiq_mask(mask, roundrobin_remote, NULL);
}

#endif

/*
 * The load average is scaled by FSCALE (2048 typ).  The estimated cpu is
 * incremented at a rate of ESTCPUVFREQ per second (40hz typ), but this is
 * divided up across all cpu bound processes running in the system so an
 * individual process will get less under load.  ESTCPULIM typically caps
 * out at ESTCPUMAX (around 376, or 11 nice levels).
 *
 * Generally speaking the decay equation needs to break-even on growth
 * at the limit at all load levels >= 1.0, so if the estimated cpu for
 * a process increases by (ESTCPUVFREQ / load) per second, then the decay
 * should reach this value when estcpu reaches ESTCPUMAX.  That calculation
 * is:
 *
 *	ESTCPUMAX * decay = ESTCPUVFREQ / load
 *	decay = ESTCPUVFREQ / (load * ESTCPUMAX)
 *	decay = estcpu * 0.053 / load
 *
 * If the load is less than 1.0 we assume a load of 1.0.
 */

#define cload(loadav)	((loadav) < FSCALE ? FSCALE : (loadav))
#define decay_cpu(loadav,estcpu)	\
    ((estcpu) * (FSCALE * ESTCPUVFREQ / ESTCPUMAX) / cload(loadav))
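/*
 * Illustrative sketch only (kept out of the build): a worked instance of
 * the decay equation above.  With the typical values quoted in the comment
 * (FSCALE 2048, ESTCPUVFREQ 40, ESTCPUMAX around 376), a process sitting
 * at the estcpu cap under a load of 1.0 decays by roughly ESTCPUVFREQ per
 * second, matching the growth rate, which is the break-even point the
 * comment derives.  The function name is hypothetical.
 */
#if 0
static void
decay_cpu_example(void)
{
	fixpt_t loadav = 1 * FSCALE;		/* load average of 1.0 */
	unsigned int estcpu = ESTCPUMAX;	/* process at the estcpu cap */
	unsigned int ndecay;

	/*
	 * ndecay == ESTCPUMAX * (FSCALE * ESTCPUVFREQ / ESTCPUMAX) / FSCALE,
	 * i.e. approximately ESTCPUVFREQ (about 40, modulo integer
	 * truncation).  Higher loads divide the decay, but growth is
	 * divided by the load as well, so the balance point stays at
	 * ESTCPUMAX for all loads >= 1.0.
	 */
	ndecay = decay_cpu(loadav, estcpu);
}
#endif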
/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, once a second.
 */
/* ARGSUSED */
static void
schedcpu(void *arg)
{
	fixpt_t loadfac = averunnable.ldavg[0];
	struct proc *p;
	int s;
	unsigned int ndecay;

	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (ESTCPUFREQ == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / ESTCPUFREQ;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / ESTCPUFREQ)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		ndecay = decay_cpu(loadfac, p->p_estcpu);
		if (p->p_estcpu > ndecay)
			p->p_estcpu -= ndecay;
		else
			p->p_estcpu = 0;
		resetpriority(p);
		splx(s);
	}
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(struct proc *p)
{
	unsigned int ndecay;

	ndecay = decay_cpu(averunnable.ldavg[0], p->p_estcpu) * p->p_slptime;
	if (p->p_estcpu > ndecay)
		p->p_estcpu -= ndecay;
	else
		p->p_estcpu = 0;
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))
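/*
 * Illustrative sketch only (kept out of the build): how a wait channel
 * address hashes into slpque[].  The address and function name below are
 * made up purely for illustration.
 */
#if 0
static void
lookup_example(void)
{
	void *ident = (void *)0xc0f31240;	/* hypothetical wait channel */
	int id = LOOKUP(ident);

	/*
	 * 0xc0f31240 >> 8 == 0xc0f312, and 0xc0f312 & (TABLESIZE - 1)
	 * == 0x12, so sleeps and wakeups on this channel use slpque[0x12].
	 * The bottom 8 bits are discarded because alignment makes them
	 * poor hash input.
	 */
	KKASSERT(id == 0x12);
}
#endif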
/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes the PCATCH flag, signals are
 * checked before and after sleeping; otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * Note that if we are a process, we release_curproc() before messing with
 * the LWKT scheduler.
 */
int
tsleep(void *ident, int flags, const char *wmesg, int timo)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;		/* may be NULL */
	int sig = 0, catch = flags & PCATCH;
	int id = LOOKUP(ident);
	struct callout_handle thandle;

	/*
	 * NOTE: removed KTRPOINT, it could cause races due to blocking
	 * even in stable.  Just scrap it for now.
	 */
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		crit_panicints();
		return (0);
	}
	KKASSERT(td != &mycpu->gd_idlethread);	/* you must be kidding! */
	crit_enter_quick(td);
	KASSERT(ident != NULL, ("tsleep: no ident"));
	KASSERT(p == NULL || p->p_stat == SRUN, ("tsleep %p %s %d",
		ident, wmesg, p->p_stat));

	td->td_wchan = ident;
	td->td_wmesg = wmesg;
	if (p) {
		if (flags & PNORESCHED)
			td->td_flags |= TDF_NORESCHED;
		release_curproc(p);
		p->p_slptime = 0;
	}
	lwkt_deschedule_self(td);
	TAILQ_INSERT_TAIL(&slpque[id], td, td_threadq);
	if (timo)
		thandle = timeout(endtsleep, (void *)td, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, td->td_wchan will be 0 upon return from CURSIG.
	 */
	if (p) {
		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (td->td_wchan) {
					unsleep(td);
					lwkt_schedule_self(td);
				}
				p->p_stat = SRUN;
				goto resume;
			}
			if (td->td_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		} else {
			sig = 0;
		}

		/*
		 * If we are not the current process we have to remove
		 * ourselves from the run queue.
		 */
		KASSERT(p->p_stat == SRUN, ("PSTAT NOT SRUN %d %d", p->p_pid, p->p_stat));
		/*
		 * If this is the current 'user' process schedule another one.
		 */
		clrrunnable(p, SSLEEP);
		p->p_stats->p_ru.ru_nvcsw++;
		mi_switch(p);
		KASSERT(p->p_stat == SRUN, ("tsleep: stat not srun"));
	} else {
		lwkt_switch();
	}
resume:
	if (p)
		p->p_flag &= ~P_SINTR;
	crit_exit_quick(td);
	td->td_flags &= ~TDF_NORESCHED;
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		if (sig == 0)
			return (EWOULDBLOCK);
	} else if (timo) {
		untimeout(endtsleep, (void *)td, thandle);
	} else if (td->td_wmesg) {
		/*
		 * This can happen if a thread is woken up directly.  Clear
		 * wmesg to avoid debugging confusion.
		 */
		td->td_wmesg = NULL;
	}
	/* inline of iscaught() */
	if (p) {
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
	}
	return (0);
}

/*
 * Implement the timeout for tsleep.  We interlock against
 * wchan when setting TDF_TIMEOUT.  For processes we remove
 * the sleep if the process is stopped rather than sleeping,
 * so it remains stopped.
 */
static void
endtsleep(void *arg)
{
	thread_t td = arg;
	struct proc *p;

	crit_enter();
	if (td->td_wchan) {
		td->td_flags |= TDF_TIMEOUT;
		if ((p = td->td_proc) != NULL) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(td);
		} else {
			unsleep(td);
			lwkt_schedule(td);
		}
	}
	crit_exit();
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(struct thread *td)
{
	crit_enter();
	if (td->td_wchan) {
#if 0
		if (p->p_flag & P_XSLEEP) {
			struct xwait *w = p->p_wchan;
			TAILQ_REMOVE(&w->waitq, p, p_procq);
			p->p_flag &= ~P_XSLEEP;
		} else
#endif
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_threadq);
		td->td_wchan = NULL;
	}
	crit_exit();
}

#if 0
/*
 * Make all processes sleeping on the explicit lock structure runnable.
 */
void
xwakeup(struct xwait *w)
{
	struct proc *p;

	crit_enter();
	++w->gen;
	while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
		TAILQ_REMOVE(&w->waitq, p, p_procq);
		KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
		    ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
		p->p_wchan = NULL;
		p->p_flag &= ~P_XSLEEP;
		if (p->p_stat == SSLEEP) {
			/* OPTIMIZED EXPANSION OF setrunnable(p); */
			if (p->p_slptime > 1)
				updatepri(p);
			p->p_slptime = 0;
			p->p_stat = SRUN;
			if (p->p_flag & P_INMEM) {
				setrunqueue(p);
			} else {
				p->p_flag |= P_SWAPINREQ;
				wakeup((caddr_t)&proc0);
			}
		}
	}
	crit_exit();
}
#endif
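/*
 * Illustrative sketch only (kept out of the build): the typical way a
 * consumer pairs tsleep() with wakeup() on a shared flag, per the tsleep()
 * comment above.  The structure and function names are hypothetical and
 * exist only for this example; a real caller also needs whatever interlock
 * (spl, critical section, or the MP lock) is appropriate to keep the flag
 * test and the sleep atomic with respect to the waker.
 */
#if 0
struct example_event {
	int	ready;			/* set by example_post() */
};

static int
example_wait(struct example_event *ev)
{
	int error;

	crit_enter();
	while (ev->ready == 0) {
		/*
		 * Sleep on the address of the flag, catch signals, and
		 * give up after 5 seconds.  tsleep() returns 0 on wakeup,
		 * EWOULDBLOCK on timeout, EINTR/ERESTART on a signal.
		 */
		error = tsleep(&ev->ready, PCATCH, "exwait", 5 * hz);
		if (error) {
			crit_exit();
			return (error);
		}
	}
	ev->ready = 0;
	crit_exit();
	return (0);
}

static void
example_post(struct example_event *ev)
{
	ev->ready = 1;
	wakeup(&ev->ready);		/* wake every thread sleeping on it */
}
#endif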
/*
 * Make all processes sleeping on the specified identifier runnable.
 */
static void
_wakeup(void *ident, int count)
{
	struct slpquehead *qp;
	struct thread *td;
	struct thread *ntd;
	struct proc *p;
	int id = LOOKUP(ident);

	crit_enter();
	qp = &slpque[id];
restart:
	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
		ntd = TAILQ_NEXT(td, td_threadq);
		if (td->td_wchan == ident) {
			TAILQ_REMOVE(qp, td, td_threadq);
			td->td_wchan = NULL;
			if ((p = td->td_proc) != NULL && p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			} else if (p == NULL) {
				lwkt_schedule(td);
			}
			if (--count == 0)
				break;
			goto restart;
		}
	}
	crit_exit();
}

void
wakeup(void *ident)
{
	_wakeup(ident, 0);
}

void
wakeup_one(void *ident)
{
	_wakeup(ident, 1);
}

/*
 * The machine independent parts of mi_switch().
 *
 * 'p' must be the current process.
 */
void
mi_switch(struct proc *p)
{
	thread_t td = p->p_thread;
	struct rlimit *rlim;
	u_int64_t ttime;

	KKASSERT(td == mycpu->gd_curthread);

	crit_enter_quick(td);

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  Time spent in interrupts is not
	 * included.  YYY 64 bit match is expensive.  Ick.
	 */
	ttime = td->td_sticks + td->td_uticks;
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    ttime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (ttime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.  If we
	 * are in a SSTOPped state we deschedule ourselves.  YYY this needs
	 * to be cleaned up, remember that LWKTs stay on their run queue
	 * which works differently than the user scheduler which removes
	 * the process from the runq when it runs it.
	 */
	mycpu->gd_cnt.v_swtch++;
	if (p->p_stat == SSTOP)
		lwkt_deschedule_self(td);
	lwkt_switch();
	crit_exit_quick(td);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct proc *p)
{
	int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p->p_thread);	/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
}
/*
 * Change the process state to NOT be runnable, removing it from the run
 * queue.
 */
void
clrrunnable(struct proc *p, int stat)
{
	crit_enter_quick(p->p_thread);
	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ))
		remrunqueue(p);
	p->p_stat = stat;
	crit_exit_quick(p->p_thread);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;
	int opq;
	int npq;

	/*
	 * Set p_priority for general process comparisons
	 */
	switch(p->p_rtprio.type) {
	case RTP_PRIO_REALTIME:
		p->p_priority = PRIBASE_REALTIME + p->p_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		break;
	case RTP_PRIO_IDLE:
		p->p_priority = PRIBASE_IDLE + p->p_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		p->p_priority = PRIBASE_THREAD + p->p_rtprio.prio;
		return;
	}

	/*
	 * NORMAL priorities fall through.  These are based on niceness
	 * and cpu use.
	 */
	newpriority = NICE_ADJUST(p->p_nice - PRIO_MIN) +
			p->p_estcpu / ESTCPURAMP;
	newpriority = min(newpriority, MAXPRI);
	npq = newpriority / PPQ;
	crit_enter();
	opq = (p->p_priority & PRIMASK) / PPQ;
	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		remrunqueue(p);
		p->p_priority = PRIBASE_NORMAL + newpriority;
		setrunqueue(p);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
		p->p_priority = PRIBASE_NORMAL + newpriority;
	}
	crit_exit();
}

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 */
static void
loadav(void *arg)
{
	int i, nrun;
	struct loadavg *avg;
	struct proc *p;

	avg = &averunnable;
	nrun = 0;
	FOREACH_PROC_IN_SYSTEM(p) {
		thread_t td;
		if (p->p_flag & P_CP_RELEASED) {
			if ((td = p->p_thread) != NULL) {
				if (td->td_flags & (TDF_RUNQ|TDF_RUNNING))
					nrun++;
			}
		} else {
			switch (p->p_stat) {
			case SRUN:
			case SIDL:
				nrun++;
				break;
			default:
				break;
			}
		}
	}
	for (i = 0; i < 3; i++)
		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	/*
	 * Schedule the next update to occur after 5 seconds, but add a
	 * random variation to avoid synchronisation with processes that
	 * run at regular intervals.
	 */
	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
	    loadav, NULL);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{
	callout_init(&loadav_callout);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
	loadav(NULL);
}
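/*
 * Illustrative sketch only (kept out of the build): one step of the
 * exponential average computed in loadav() above, assuming the typical
 * FSCALE of 2048, a single runnable process, and a previous 1-minute
 * average of zero.  The function name is hypothetical.
 */
#if 0
static void
loadav_example(void)
{
	fixpt_t prev = 0;	/* previous ldavg[0], i.e. a load of 0.00 */
	int nrun = 1;		/* one runnable process this sample */
	fixpt_t next;

	next = (cexp[0] * prev + nrun * FSCALE * (FSCALE - cexp[0])) >> FSHIFT;
	/*
	 * cexp[0] is about 1884 (0.92 * 2048), so next is about 164, or
	 * roughly 0.08 as a load average.  Repeating the step every 5
	 * seconds with nrun == 1 walks ldavg[0] toward 1.0, closing about
	 * 63% of the remaining gap each minute; the 5 and 15 minute
	 * averages do the same with the slower cexp[1] and cexp[2].
	 */
}
#endif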
/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).
 *
 * The cpu usage estimator ramps up quite quickly when the process is
 * running (linearly), and decays away exponentially, at a rate which
 * is proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of CPU
 * time in 5 * loadav seconds.  This causes the system to favor processes
 * which haven't run much recently, and to round-robin among other processes.
 *
 * The actual schedulerclock interrupt rate is ESTCPUFREQ, but we generally
 * want to ramp-up at a faster rate, ESTCPUVFREQ, so p_estcpu is scaled
 * by (ESTCPUVFREQ / ESTCPUFREQ).  You can control the ramp-up/ramp-down
 * rate by adjusting ESTCPUVFREQ in sys/proc.h in integer multiples
 * of ESTCPUFREQ.
 *
 * WARNING! called from a fast-int or an IPI, the MP lock MIGHT NOT BE HELD
 * and we cannot block.
 */
void
schedulerclock(void *dummy)
{
	struct thread *td;
	struct proc *p;

	td = curthread;
	if ((p = td->td_proc) != NULL) {
		p->p_cpticks++;		/* cpticks runs at ESTCPUFREQ */
		p->p_estcpu = ESTCPULIM(p->p_estcpu + ESTCPUVFREQ / ESTCPUFREQ);
		if (try_mplock()) {
			resetpriority(p);
			rel_mplock();
		}
	}
}

static
void
crit_panicints(void)
{
	int s;
	int cpri;

	s = splhigh();
	cpri = crit_panic_save();
	splx(safepri);
	crit_panic_restore(cpri);
	splx(s);
}
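/*
 * Illustrative sketch only (kept out of the build): the ramp-up rate the
 * schedulerclock() comment describes.  A process that is running on every
 * tick gains ESTCPUVFREQ / ESTCPUFREQ estcpu per tick, i.e. ESTCPUVFREQ
 * (about 40 with the typical values quoted earlier) per second, until
 * ESTCPULIM() clamps it; schedcpu() then removes decay_cpu() worth once a
 * second, which is the balance discussed above decay_cpu().  The function
 * name is hypothetical.
 */
#if 0
static void
estcpu_ramp_example(struct proc *p)
{
	int i;

	/* one second worth of scheduler clock ticks for a cpu-bound process */
	for (i = 0; i < ESTCPUFREQ; i++)
		p->p_estcpu = ESTCPULIM(p->p_estcpu + ESTCPUVFREQ / ESTCPUFREQ);
}
#endif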