/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 05/19/95
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

u_char	curpriority;		/* usrpri of curproc */
int	lbolt;			/* once a second sleep address */

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
void
roundrobin(arg)
	void *arg;
{

	need_resched();
	timeout(roundrobin, NULL, hz / 10);
}
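
/*
 * Editor's note: roundrobin() re-arms its own timeout, so once started it
 * runs every hz / 10 ticks; at the stock hz == 100 that is 10 ticks, i.e.
 * the 100ms quantum mentioned above.
 */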

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	    Note that, as ps(1) mentions, this can let percentages
 *	    total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = x**0/0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */
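
/*
 * A worked check of the decay constant (editor's illustration): with
 * loadav == 1, b == 2 and decay == 2/3.  From the table above,
 * (2/3) ** 5.68 =~ 0.1, so a p_estcpu of 255 decays to roughly 25
 * (10% of its value) after 5.68 one-second decay steps, which is the
 * "90% in 5 * loadav seconds" goal.
 */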

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	11
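
/*
 * Illustrative reduction of the fast path (editor's sketch, assuming
 * FSHIFT == 11, FSCALE == 2048, and hz == 100): the general update is
 *	p_pctcpu += ((FSCALE - ccpu) * (p_cpticks * FSCALE / hz)) >> FSHIFT;
 * and (FSCALE - ccpu) =~ 0.0488 * FSCALE == (100 * FSCALE) / 2048, so the
 * increment collapses to
 *	p_cpticks * FSCALE / 2048 == p_cpticks << (FSHIFT - CCPU_SHIFT)
 * which is exactly the shift taken in schedcpu() below when hz == 100.
 */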

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(arg)
	void *arg;
{
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	register struct proc *p;
	register int s;
	register unsigned int newcpu;

	wakeup((caddr_t)&lbolt);
	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (hz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / hz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / hz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
		p->p_estcpu = min(newcpu, UCHAR_MAX);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
#define	PPQ	(128 / NQS)		/* priorities per queue */
			if ((p != curproc) &&
			    p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrq(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	vmmeter();
	if (bclnlist != NULL)
		wakeup((caddr_t)pageproc);
	timeout(schedcpu, (void *)0, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(p)
	register struct proc *p;
{
	register unsigned int newcpu = p->p_estcpu;
	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = min(newcpu, UCHAR_MAX);
	}
	resetpriority(p);
}
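
/*
 * Worked example (editor's illustration): with loadav == 1.0 the decay
 * factor is 2/3 per second slept, so a maximal p_estcpu of 255 shrinks
 * as 255, 170, 113, 75, 50, ... and integer truncation carries it to
 * zero after about a dozen steps, matching the "six times the
 * loadfactor" bound above (6 * loadfactor(1) == 12).
 */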

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
#define LOOKUP(x)	(((long)(x) >> 8) & (TABLESIZE - 1))
struct slpque {
	struct proc *sq_head;
	struct proc **sq_tailp;
} slpque[TABLESIZE];
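
/*
 * Example bucket computation (editor's illustration; the address is
 * arbitrary): for a wait channel at 0xf0123456,
 *	LOOKUP(0xf0123456) == (0xf0123456 >> 8) & 127 == 0x34 == 52
 * so channels sharing bits 8..14 of their address hash to the same
 * slpque entry and must be distinguished by p_wchan at wakeup time.
 */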

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
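/*
 * Typical use (an editor's sketch; "resource" and the wait message are
 * hypothetical, and PSOCK is just one plausible priority):
 *
 *	error = tsleep((caddr_t)&resource, PSOCK | PCATCH, "rswait", 5 * hz);
 *	if (error == EWOULDBLOCK)
 *		...the five second timeout expired...
 *	else if (error)
 *		...EINTR or ERESTART: a signal arrived...
 *
 * A wakeup((caddr_t)&resource) from another context makes the process
 * runnable again, and tsleep() then returns 0.
 */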
int
tsleep(ident, priority, wmesg, timo)
	void *ident;
	int priority, timo;
	char *wmesg;
{
	register struct proc *p = curproc;
	register struct slpque *qp;
	register int s;
	int sig, catch = priority & PCATCH;
	extern int cold;
	void endtsleep __P((void *));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return (0);
	}
#ifdef DIAGNOSTIC
	if (ident == NULL || p->p_stat != SRUN || p->p_back)
		panic("tsleep");
#endif
	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;
	qp = &slpque[LOOKUP(ident)];
	if (qp->sq_head == 0)
		qp->sq_head = p;
	else
		*qp->sq_tailp = p;
	*(qp->sq_tailp = &p->p_forw) = 0;
	if (timo)
		timeout(endtsleep, (void *)p, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if (sig = CURSIG(p)) {
			if (p->p_wchan)
				unsleep(p);
			p->p_stat = SRUN;
			goto resume;
		}
		if (p->p_wchan == 0) {
			catch = 0;
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
resume:
	curpriority = p->p_usrpri;
	splx(s);
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p->p_tracep, 0, 0);
#endif
			return (EWOULDBLOCK);
		}
	} else if (timo)
		untimeout(endtsleep, (void *)p);
	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p->p_tracep, 0, 0);
#endif
		if (p->p_sigacts->ps_sigintr & sigmask(sig))
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	return (0);
}

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(arg)
	void *arg;
{
	register struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Short-term, non-interruptible sleep.
 */
void
sleep(ident, priority)
	void *ident;
	int priority;
{
	register struct proc *p = curproc;
	register struct slpque *qp;
	register int s;
	extern int cold;

#ifdef DIAGNOSTIC
	if (priority > PZERO) {
		printf("sleep called with priority %d > PZERO, wchan: %x\n",
		    priority, ident);
		panic("old sleep");
	}
#endif
	s = splhigh();
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		splx(safepri);
		splx(s);
		return;
	}
#ifdef DIAGNOSTIC
	if (ident == NULL || p->p_stat != SRUN || p->p_back)
		panic("sleep");
#endif
	p->p_wchan = ident;
	p->p_wmesg = NULL;
	p->p_slptime = 0;
	p->p_priority = priority;
	qp = &slpque[LOOKUP(ident)];
	if (qp->sq_head == 0)
		qp->sq_head = p;
	else
		*qp->sq_tailp = p;
	*(qp->sq_tailp = &p->p_forw) = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 1, 0);
#endif
	mi_switch();
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p->p_tracep, 0, 0);
#endif
	curpriority = p->p_usrpri;
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(p)
	register struct proc *p;
{
	register struct slpque *qp;
	register struct proc **hp;
	int s;

	s = splhigh();
	if (p->p_wchan) {
		hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
		while (*hp != p)
			hp = &(*hp)->p_forw;
		*hp = p->p_forw;
		if (qp->sq_tailp == &p->p_forw)
			qp->sq_tailp = hp;
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(ident)
	register void *ident;
{
	register struct slpque *qp;
	register struct proc *p, **q;
	int s;

	s = splhigh();
	qp = &slpque[LOOKUP(ident)];
restart:
	for (q = &qp->sq_head; p = *q; ) {
#ifdef DIAGNOSTIC
		if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP)
			panic("wakeup");
#endif
		if (p->p_wchan == ident) {
			p->p_wchan = 0;
			*q = p->p_forw;
			if (qp->sq_tailp == &p->p_forw)
				qp->sq_tailp = q;
			if (p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM)
					setrunqueue(p);
				/*
				 * Since curpriority is a user priority,
				 * p->p_priority is always better than
				 * curpriority.
				 */
				if ((p->p_flag & P_INMEM) == 0)
					wakeup((caddr_t)&proc0);
				else
					need_resched();
				/* END INLINE EXPANSION */
				goto restart;
			}
		} else
			q = &p->p_forw;
	}
	splx(s);
}

/*
 * The machine-independent part of a context switch; the machine-dependent
 * part is performed by cpu_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	register struct proc *p = curproc;	/* XXX */
	register struct rlimit *rlim;
	register long s, u;
	struct timeval tv;

#ifdef DEBUG
	if (p->p_simple_locks)
		panic("sleep: holding simple lock");
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		if (s >= rlim->rlim_max)
			psignal(p, SIGKILL);
		else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) {
		p->p_nice = NZERO + 4;
		resetpriority(p);
	}

	/*
	 * Pick a new current process and record its start time.
	 */
	cnt.v_swtch++;
	cpu_switch(p);
	microtime(&runtime);
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	register int i;

	for (i = 0; i < NQS; i++)
		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(p)
	register struct proc *p;
{
	register int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0)
		wakeup((caddr_t)&proc0);
	else if (p->p_priority < curpriority)
		need_resched();
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(p)
	register struct proc *p;
{
	register unsigned int newpriority;

	newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
	newpriority = min(newpriority, MAXPRI);
	p->p_usrpri = newpriority;
	if (newpriority < curpriority)
		need_resched();
}
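
/*
 * Worked example for resetpriority() (editor's illustration; PUSER == 50
 * and MAXPRI == 127 as in the stock param.h, the other numbers are
 * arbitrary): a process with p_estcpu == 40 and p_nice == 0 gets
 *	newpriority = 50 + 40/4 + 2*0 = 60
 * so accumulated CPU time and positive nice values both push the user
 * priority toward MAXPRI, i.e. make it numerically larger and weaker.
 */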