1 /*- 2 * Copyright (c) 1982, 1986, 1990, 1991, 1993 3 * The Regents of the University of California. All rights reserved. 4 * All rights reserved. 5 * 6 * %sccs.include.redist.c% 7 * 8 * @(#)kern_synch.c 8.1 (Berkeley) 06/10/93 9 */ 10 11 #include <sys/param.h> 12 #include <sys/systm.h> 13 #include <sys/proc.h> 14 #include <sys/kernel.h> 15 #include <sys/buf.h> 16 #include <sys/signalvar.h> 17 #include <sys/resourcevar.h> 18 #include <sys/vmmeter.h> 19 #ifdef KTRACE 20 #include <sys/ktrace.h> 21 #endif 22 23 #include <machine/cpu.h> 24 25 u_char curpri; /* usrpri of curproc */ 26 int lbolt; /* once a second sleep address */ 27 28 /* 29 * Force switch among equal priority processes every 100ms. 30 */ 31 /* ARGSUSED */ 32 void 33 roundrobin(arg) 34 void *arg; 35 { 36 37 need_resched(); 38 timeout(roundrobin, (void *)0, hz / 10); 39 } 40 41 /* 42 * constants for digital decay and forget 43 * 90% of (p_cpu) usage in 5*loadav time 44 * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) 45 * Note that, as ps(1) mentions, this can let percentages 46 * total over 100% (I've seen 137.9% for 3 processes). 47 * 48 * Note that hardclock updates p_cpu and p_cpticks independently. 49 * 50 * We wish to decay away 90% of p_cpu in (5 * loadavg) seconds. 51 * That is, the system wants to compute a value of decay such 52 * that the following for loop: 53 * for (i = 0; i < (5 * loadavg); i++) 54 * p_cpu *= decay; 55 * will compute 56 * p_cpu *= 0.1; 57 * for all values of loadavg: 58 * 59 * Mathematically this loop can be expressed by saying: 60 * decay ** (5 * loadavg) ~= .1 61 * 62 * The system computes decay as: 63 * decay = (2 * loadavg) / (2 * loadavg + 1) 64 * 65 * We wish to prove that the system's computation of decay 66 * will always fulfill the equation: 67 * decay ** (5 * loadavg) ~= .1 68 * 69 * If we compute b as: 70 * b = 2 * loadavg 71 * then 72 * decay = b / (b + 1) 73 * 74 * We now need to prove two things: 75 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) 76 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) 77 * 78 * Facts: 79 * For x close to zero, exp(x) =~ 1 + x, since 80 * exp(x) = 0! + x**1/1! + x**2/2! + ... . 81 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. 82 * For x close to zero, ln(1+x) =~ x, since 83 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 84 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). 85 * ln(.1) =~ -2.30 86 * 87 * Proof of (1): 88 * Solve (factor)**(power) =~ .1 given power (5*loadav): 89 * solving for factor, 90 * ln(factor) =~ (-2.30/5*loadav), or 91 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = 92 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED 93 * 94 * Proof of (2): 95 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): 96 * solving for power, 97 * power*ln(b/(b+1)) =~ -2.30, or 98 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED 99 * 100 * Actual power values for the implemented algorithm are as follows: 101 * loadav: 1 2 3 4 102 * power: 5.68 10.32 14.94 19.55 103 */ 104 105 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ 106 #define loadfactor(loadav) (2 * (loadav)) 107 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) 108 109 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 110 fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 111 112 /* 113 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the 114 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below 115 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). 116 * 117 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: 118 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). 119 * 120 * If you dont want to bother with the faster/more-accurate formula, you 121 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate 122 * (more general) method of calculating the %age of CPU used by a process. 123 */ 124 #define CCPU_SHIFT 11 125 126 /* 127 * Recompute process priorities, once a second 128 */ 129 /* ARGSUSED */ 130 void 131 schedcpu(arg) 132 void *arg; 133 { 134 register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); 135 register struct proc *p; 136 register int s; 137 register unsigned int newcpu; 138 139 wakeup((caddr_t)&lbolt); 140 for (p = (struct proc *)allproc; p != NULL; p = p->p_nxt) { 141 /* 142 * Increment time in/out of memory and sleep time 143 * (if sleeping). We ignore overflow; with 16-bit int's 144 * (remember them?) overflow takes 45 days. 145 */ 146 p->p_time++; 147 if (p->p_stat == SSLEEP || p->p_stat == SSTOP) 148 p->p_slptime++; 149 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; 150 /* 151 * If the process has slept the entire second, 152 * stop recalculating its priority until it wakes up. 153 */ 154 if (p->p_slptime > 1) 155 continue; 156 s = splstatclock(); /* prevent state changes */ 157 /* 158 * p_pctcpu is only for ps. 159 */ 160 #if (FSHIFT >= CCPU_SHIFT) 161 p->p_pctcpu += (hz == 100)? 162 ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): 163 100 * (((fixpt_t) p->p_cpticks) 164 << (FSHIFT - CCPU_SHIFT)) / hz; 165 #else 166 p->p_pctcpu += ((FSCALE - ccpu) * 167 (p->p_cpticks * FSCALE / hz)) >> FSHIFT; 168 #endif 169 p->p_cpticks = 0; 170 newcpu = (u_int) decay_cpu(loadfac, p->p_cpu) + p->p_nice; 171 p->p_cpu = min(newcpu, UCHAR_MAX); 172 setpri(p); 173 if (p->p_pri >= PUSER) { 174 #define PPQ (128 / NQS) /* priorities per queue */ 175 if ((p != curproc) && 176 p->p_stat == SRUN && 177 (p->p_flag & SLOAD) && 178 (p->p_pri / PPQ) != (p->p_usrpri / PPQ)) { 179 remrq(p); 180 p->p_pri = p->p_usrpri; 181 setrq(p); 182 } else 183 p->p_pri = p->p_usrpri; 184 } 185 splx(s); 186 } 187 vmmeter(); 188 if (bclnlist != NULL) 189 wakeup((caddr_t)pageproc); 190 timeout(schedcpu, (void *)0, hz); 191 } 192 193 /* 194 * Recalculate the priority of a process after it has slept for a while. 195 * For all load averages >= 1 and max p_cpu of 255, sleeping for at least 196 * six times the loadfactor will decay p_cpu to zero. 197 */ 198 void 199 updatepri(p) 200 register struct proc *p; 201 { 202 register unsigned int newcpu = p->p_cpu; 203 register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); 204 205 if (p->p_slptime > 5 * loadfac) 206 p->p_cpu = 0; 207 else { 208 p->p_slptime--; /* the first time was done in schedcpu */ 209 while (newcpu && --p->p_slptime) 210 newcpu = (int) decay_cpu(loadfac, newcpu); 211 p->p_cpu = min(newcpu, UCHAR_MAX); 212 } 213 setpri(p); 214 } 215 216 #define SQSIZE 0100 /* Must be power of 2 */ 217 #define HASH(x) (( (int) x >> 5) & (SQSIZE-1)) 218 struct slpque { 219 struct proc *sq_head; 220 struct proc **sq_tailp; 221 } slpque[SQSIZE]; 222 223 /* 224 * During autoconfiguration or after a panic, a sleep will simply 225 * lower the priority briefly to allow interrupts, then return. 226 * The priority to be used (safepri) is machine-dependent, thus this 227 * value is initialized and maintained in the machine-dependent layers. 228 * This priority will typically be 0, or the lowest priority 229 * that is safe for use on the interrupt stack; it can be made 230 * higher to block network software interrupts after panics. 231 */ 232 int safepri; 233 234 /* 235 * General sleep call. 236 * Suspends current process until a wakeup is made on chan. 237 * The process will then be made runnable with priority pri. 238 * Sleeps at most timo/hz seconds (0 means no timeout). 239 * If pri includes PCATCH flag, signals are checked 240 * before and after sleeping, else signals are not checked. 241 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires. 242 * If PCATCH is set and a signal needs to be delivered, 243 * ERESTART is returned if the current system call should be restarted 244 * if possible, and EINTR is returned if the system call should 245 * be interrupted by the signal (return EINTR). 246 */ 247 int 248 tsleep(chan, pri, wmesg, timo) 249 void *chan; 250 int pri; 251 char *wmesg; 252 int timo; 253 { 254 register struct proc *p = curproc; 255 register struct slpque *qp; 256 register s; 257 int sig, catch = pri & PCATCH; 258 extern int cold; 259 void endtsleep __P((void *)); 260 261 #ifdef KTRACE 262 if (KTRPOINT(p, KTR_CSW)) 263 ktrcsw(p->p_tracep, 1, 0); 264 #endif 265 s = splhigh(); 266 if (cold || panicstr) { 267 /* 268 * After a panic, or during autoconfiguration, 269 * just give interrupts a chance, then just return; 270 * don't run any other procs or panic below, 271 * in case this is the idle process and already asleep. 272 */ 273 splx(safepri); 274 splx(s); 275 return (0); 276 } 277 #ifdef DIAGNOSTIC 278 if (chan == NULL || p->p_stat != SRUN || p->p_rlink) 279 panic("tsleep"); 280 #endif 281 p->p_wchan = chan; 282 p->p_wmesg = wmesg; 283 p->p_slptime = 0; 284 p->p_pri = pri & PRIMASK; 285 qp = &slpque[HASH(chan)]; 286 if (qp->sq_head == 0) 287 qp->sq_head = p; 288 else 289 *qp->sq_tailp = p; 290 *(qp->sq_tailp = &p->p_link) = 0; 291 if (timo) 292 timeout(endtsleep, (void *)p, timo); 293 /* 294 * We put ourselves on the sleep queue and start our timeout 295 * before calling CURSIG, as we could stop there, and a wakeup 296 * or a SIGCONT (or both) could occur while we were stopped. 297 * A SIGCONT would cause us to be marked as SSLEEP 298 * without resuming us, thus we must be ready for sleep 299 * when CURSIG is called. If the wakeup happens while we're 300 * stopped, p->p_wchan will be 0 upon return from CURSIG. 301 */ 302 if (catch) { 303 p->p_flag |= SSINTR; 304 if (sig = CURSIG(p)) { 305 if (p->p_wchan) 306 unsleep(p); 307 p->p_stat = SRUN; 308 goto resume; 309 } 310 if (p->p_wchan == 0) { 311 catch = 0; 312 goto resume; 313 } 314 } else 315 sig = 0; 316 p->p_stat = SSLEEP; 317 p->p_stats->p_ru.ru_nvcsw++; 318 swtch(); 319 resume: 320 curpri = p->p_usrpri; 321 splx(s); 322 p->p_flag &= ~SSINTR; 323 if (p->p_flag & STIMO) { 324 p->p_flag &= ~STIMO; 325 if (sig == 0) { 326 #ifdef KTRACE 327 if (KTRPOINT(p, KTR_CSW)) 328 ktrcsw(p->p_tracep, 0, 0); 329 #endif 330 return (EWOULDBLOCK); 331 } 332 } else if (timo) 333 untimeout(endtsleep, (void *)p); 334 if (catch && (sig != 0 || (sig = CURSIG(p)))) { 335 #ifdef KTRACE 336 if (KTRPOINT(p, KTR_CSW)) 337 ktrcsw(p->p_tracep, 0, 0); 338 #endif 339 if (p->p_sigacts->ps_sigintr & sigmask(sig)) 340 return (EINTR); 341 return (ERESTART); 342 } 343 #ifdef KTRACE 344 if (KTRPOINT(p, KTR_CSW)) 345 ktrcsw(p->p_tracep, 0, 0); 346 #endif 347 return (0); 348 } 349 350 /* 351 * Implement timeout for tsleep. 352 * If process hasn't been awakened (wchan non-zero), 353 * set timeout flag and undo the sleep. If proc 354 * is stopped, just unsleep so it will remain stopped. 355 */ 356 void 357 endtsleep(arg) 358 void *arg; 359 { 360 register struct proc *p; 361 int s; 362 363 p = (struct proc *)arg; 364 s = splhigh(); 365 if (p->p_wchan) { 366 if (p->p_stat == SSLEEP) 367 setrun(p); 368 else 369 unsleep(p); 370 p->p_flag |= STIMO; 371 } 372 splx(s); 373 } 374 375 /* 376 * Short-term, non-interruptable sleep. 377 */ 378 void 379 sleep(chan, pri) 380 void *chan; 381 int pri; 382 { 383 register struct proc *p = curproc; 384 register struct slpque *qp; 385 register s; 386 extern int cold; 387 388 #ifdef DIAGNOSTIC 389 if (pri > PZERO) { 390 printf("sleep called with pri %d > PZERO, wchan: %x\n", 391 pri, chan); 392 panic("old sleep"); 393 } 394 #endif 395 s = splhigh(); 396 if (cold || panicstr) { 397 /* 398 * After a panic, or during autoconfiguration, 399 * just give interrupts a chance, then just return; 400 * don't run any other procs or panic below, 401 * in case this is the idle process and already asleep. 402 */ 403 splx(safepri); 404 splx(s); 405 return; 406 } 407 #ifdef DIAGNOSTIC 408 if (chan == NULL || p->p_stat != SRUN || p->p_rlink) 409 panic("sleep"); 410 #endif 411 p->p_wchan = chan; 412 p->p_wmesg = NULL; 413 p->p_slptime = 0; 414 p->p_pri = pri; 415 qp = &slpque[HASH(chan)]; 416 if (qp->sq_head == 0) 417 qp->sq_head = p; 418 else 419 *qp->sq_tailp = p; 420 *(qp->sq_tailp = &p->p_link) = 0; 421 p->p_stat = SSLEEP; 422 p->p_stats->p_ru.ru_nvcsw++; 423 #ifdef KTRACE 424 if (KTRPOINT(p, KTR_CSW)) 425 ktrcsw(p->p_tracep, 1, 0); 426 #endif 427 swtch(); 428 #ifdef KTRACE 429 if (KTRPOINT(p, KTR_CSW)) 430 ktrcsw(p->p_tracep, 0, 0); 431 #endif 432 curpri = p->p_usrpri; 433 splx(s); 434 } 435 436 /* 437 * Remove a process from its wait queue 438 */ 439 void 440 unsleep(p) 441 register struct proc *p; 442 { 443 register struct slpque *qp; 444 register struct proc **hp; 445 int s; 446 447 s = splhigh(); 448 if (p->p_wchan) { 449 hp = &(qp = &slpque[HASH(p->p_wchan)])->sq_head; 450 while (*hp != p) 451 hp = &(*hp)->p_link; 452 *hp = p->p_link; 453 if (qp->sq_tailp == &p->p_link) 454 qp->sq_tailp = hp; 455 p->p_wchan = 0; 456 } 457 splx(s); 458 } 459 460 /* 461 * Wakeup on "chan"; set all processes 462 * sleeping on chan to run state. 463 */ 464 void 465 wakeup(chan) 466 register void *chan; 467 { 468 register struct slpque *qp; 469 register struct proc *p, **q; 470 int s; 471 472 s = splhigh(); 473 qp = &slpque[HASH(chan)]; 474 restart: 475 for (q = &qp->sq_head; p = *q; ) { 476 #ifdef DIAGNOSTIC 477 if (p->p_rlink || p->p_stat != SSLEEP && p->p_stat != SSTOP) 478 panic("wakeup"); 479 #endif 480 if (p->p_wchan == chan) { 481 p->p_wchan = 0; 482 *q = p->p_link; 483 if (qp->sq_tailp == &p->p_link) 484 qp->sq_tailp = q; 485 if (p->p_stat == SSLEEP) { 486 /* OPTIMIZED INLINE EXPANSION OF setrun(p) */ 487 if (p->p_slptime > 1) 488 updatepri(p); 489 p->p_slptime = 0; 490 p->p_stat = SRUN; 491 if (p->p_flag & SLOAD) 492 setrq(p); 493 /* 494 * Since curpri is a usrpri, 495 * p->p_pri is always better than curpri. 496 */ 497 if ((p->p_flag&SLOAD) == 0) 498 wakeup((caddr_t)&proc0); 499 else 500 need_resched(); 501 /* END INLINE EXPANSION */ 502 goto restart; 503 } 504 } else 505 q = &p->p_link; 506 } 507 splx(s); 508 } 509 510 /* 511 * The machine independent parts of swtch(). 512 * Must be called at splstatclock() or higher. 513 */ 514 void 515 swtch() 516 { 517 register struct proc *p = curproc; /* XXX */ 518 register struct rlimit *rlim; 519 register long s, u; 520 struct timeval tv; 521 522 /* 523 * Compute the amount of time during which the current 524 * process was running, and add that to its total so far. 525 */ 526 microtime(&tv); 527 u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec); 528 s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec); 529 if (u < 0) { 530 u += 1000000; 531 s--; 532 } else if (u >= 1000000) { 533 u -= 1000000; 534 s++; 535 } 536 p->p_rtime.tv_usec = u; 537 p->p_rtime.tv_sec = s; 538 539 /* 540 * Check if the process exceeds its cpu resource allocation. 541 * If over max, kill it. In any case, if it has run for more 542 * than 10 minutes, reduce priority to give others a chance. 543 */ 544 rlim = &p->p_rlimit[RLIMIT_CPU]; 545 if (s >= rlim->rlim_cur) { 546 if (s >= rlim->rlim_max) 547 psignal(p, SIGKILL); 548 else { 549 psignal(p, SIGXCPU); 550 if (rlim->rlim_cur < rlim->rlim_max) 551 rlim->rlim_cur += 5; 552 } 553 } 554 if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) { 555 p->p_nice = NZERO + 4; 556 setpri(p); 557 } 558 559 /* 560 * Pick a new current process and record its start time. 561 */ 562 cnt.v_swtch++; 563 cpu_swtch(p); 564 microtime(&runtime); 565 } 566 567 /* 568 * Initialize the (doubly-linked) run queues 569 * to be empty. 570 */ 571 rqinit() 572 { 573 register int i; 574 575 for (i = 0; i < NQS; i++) 576 qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; 577 } 578 579 /* 580 * Change process state to be runnable, 581 * placing it on the run queue if it is in memory, 582 * and awakening the swapper if it isn't in memory. 583 */ 584 void 585 setrun(p) 586 register struct proc *p; 587 { 588 register int s; 589 590 s = splhigh(); 591 switch (p->p_stat) { 592 593 case 0: 594 case SWAIT: 595 case SRUN: 596 case SZOMB: 597 default: 598 panic("setrun"); 599 600 case SSTOP: 601 case SSLEEP: 602 unsleep(p); /* e.g. when sending signals */ 603 break; 604 605 case SIDL: 606 break; 607 } 608 p->p_stat = SRUN; 609 if (p->p_flag & SLOAD) 610 setrq(p); 611 splx(s); 612 if (p->p_slptime > 1) 613 updatepri(p); 614 p->p_slptime = 0; 615 if ((p->p_flag&SLOAD) == 0) 616 wakeup((caddr_t)&proc0); 617 else if (p->p_pri < curpri) 618 need_resched(); 619 } 620 621 /* 622 * Compute priority of process when running in user mode. 623 * Arrange to reschedule if the resulting priority 624 * is better than that of the current process. 625 */ 626 void 627 setpri(p) 628 register struct proc *p; 629 { 630 register unsigned int newpri; 631 632 newpri = PUSER + p->p_cpu / 4 + 2 * p->p_nice; 633 newpri = min(newpri, MAXPRI); 634 p->p_usrpri = newpri; 635 if (newpri < curpri) 636 need_resched(); 637 } 638