xref: /original-bsd/sys/kern/kern_synch.c (revision 9a35f7df)
1 /*-
2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * %sccs.include.redist.c%
11  *
12  *	@(#)kern_synch.c	8.9 (Berkeley) 05/19/95
13  */
14 
15 #include <sys/param.h>
16 #include <sys/systm.h>
17 #include <sys/proc.h>
18 #include <sys/kernel.h>
19 #include <sys/buf.h>
20 #include <sys/signalvar.h>
21 #include <sys/resourcevar.h>
22 #include <sys/vmmeter.h>
23 #ifdef KTRACE
24 #include <sys/ktrace.h>
25 #endif
26 
27 #include <machine/cpu.h>
28 
29 u_char	curpriority;		/* usrpri of curproc */
30 int	lbolt;			/* once a second sleep address */
31 
32 /*
33  * Force switch among equal priority processes every 100ms.
34  */
35 /* ARGSUSED */
36 void
37 roundrobin(arg)
38 	void *arg;
39 {
40 
41 	need_resched();
42 	timeout(roundrobin, NULL, hz / 10);
43 }
44 
45 /*
46  * Constants for digital decay and forget:
47  *	90% of (p_estcpu) usage in 5 * loadav time
48  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
49  *          Note that, as ps(1) mentions, this can let percentages
50  *          total over 100% (I've seen 137.9% for 3 processes).
51  *
52  * Note that hardclock updates p_estcpu and p_cpticks independently.
53  *
54  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
55  * That is, the system wants to compute a value of decay such
56  * that the following for loop:
57  * 	for (i = 0; i < (5 * loadavg); i++)
58  * 		p_estcpu *= decay;
59  * will compute
60  * 	p_estcpu *= 0.1;
61  * for all values of loadavg:
62  *
63  * Mathematically this loop can be expressed by saying:
64  * 	decay ** (5 * loadavg) ~= .1
65  *
66  * The system computes decay as:
67  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
68  *
69  * We wish to prove that the system's computation of decay
70  * will always fulfill the equation:
71  * 	decay ** (5 * loadavg) ~= .1
72  *
73  * If we compute b as:
74  * 	b = 2 * loadavg
75  * then
76  * 	decay = b / (b + 1)
77  *
78  * We now need to prove two things:
79  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
80  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
81  *
82  * Facts:
83  *         For x close to zero, exp(x) =~ 1 + x, since
84  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
85  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
86  *         For x close to zero, ln(1+x) =~ x, since
87  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
88  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
89  *         ln(.1) =~ -2.30
90  *
91  * Proof of (1):
92  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
93  *	solving for factor,
94  *      ln(factor) =~ (-2.30/5*loadav), or
95  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
96  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
97  *
98  * Proof of (2):
99  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
100  *	solving for power,
101  *      power*ln(b/(b+1)) =~ -2.30, or
102  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
103  *
104  * Actual power values for the implemented algorithm are as follows:
105  *      loadav: 1       2       3       4
106  *      power:  5.68    10.32   14.94   19.55
107  */
108 
109 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
110 #define	loadfactor(loadav)	(2 * (loadav))
111 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
112 
113 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
114 fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */
115 
116 /*
117  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
118  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
119  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
120  *
121  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
122  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
123  *
124  * If you dont want to bother with the faster/more-accurate formula, you
125  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
126  * (more general) method of calculating the %age of CPU used by a process.
127  */
128 #define	CCPU_SHIFT	11
129 
130 /*
131  * Recompute process priorities, every hz ticks.
132  */
133 /* ARGSUSED */
134 void
135 schedcpu(arg)
136 	void *arg;
137 {
138 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
139 	register struct proc *p;
140 	register int s;
141 	register unsigned int newcpu;
142 
143 	wakeup((caddr_t)&lbolt);
144 	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
145 		/*
146 		 * Increment time in/out of memory and sleep time
147 		 * (if sleeping).  We ignore overflow; with 16-bit int's
148 		 * (remember them?) overflow takes 45 days.
149 		 */
150 		p->p_swtime++;
151 		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
152 			p->p_slptime++;
153 		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
154 		/*
155 		 * If the process has slept the entire second,
156 		 * stop recalculating its priority until it wakes up.
157 		 */
158 		if (p->p_slptime > 1)
159 			continue;
160 		s = splstatclock();	/* prevent state changes */
161 		/*
162 		 * p_pctcpu is only for ps.
163 		 */
164 #if	(FSHIFT >= CCPU_SHIFT)
165 		p->p_pctcpu += (hz == 100)?
166 			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
167                 	100 * (((fixpt_t) p->p_cpticks)
168 				<< (FSHIFT - CCPU_SHIFT)) / hz;
169 #else
170 		p->p_pctcpu += ((FSCALE - ccpu) *
171 			(p->p_cpticks * FSCALE / hz)) >> FSHIFT;
172 #endif
173 		p->p_cpticks = 0;
174 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice;
175 		p->p_estcpu = min(newcpu, UCHAR_MAX);
176 		resetpriority(p);
177 		if (p->p_priority >= PUSER) {
178 #define	PPQ	(128 / NQS)		/* priorities per queue */
179 			if ((p != curproc) &&
180 			    p->p_stat == SRUN &&
181 			    (p->p_flag & P_INMEM) &&
182 			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
183 				remrq(p);
184 				p->p_priority = p->p_usrpri;
185 				setrunqueue(p);
186 			} else
187 				p->p_priority = p->p_usrpri;
188 		}
189 		splx(s);
190 	}
191 	vmmeter();
192 	if (bclnlist != NULL)
193 		wakeup((caddr_t)pageproc);
194 	timeout(schedcpu, (void *)0, hz);
195 }
196 
197 /*
198  * Recalculate the priority of a process after it has slept for a while.
199  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
200  * least six times the loadfactor will decay p_estcpu to zero.
201  */
202 void
203 updatepri(p)
204 	register struct proc *p;
205 {
206 	register unsigned int newcpu = p->p_estcpu;
207 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
208 
209 	if (p->p_slptime > 5 * loadfac)
210 		p->p_estcpu = 0;
211 	else {
212 		p->p_slptime--;	/* the first time was done in schedcpu */
213 		while (newcpu && --p->p_slptime)
214 			newcpu = (int) decay_cpu(loadfac, newcpu);
215 		p->p_estcpu = min(newcpu, UCHAR_MAX);
216 	}
217 	resetpriority(p);
218 }
219 
220 /*
221  * We're only looking at 7 bits of the address; everything is
222  * aligned to 4, lots of things are aligned to greater powers
223  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
224  */
225 #define TABLESIZE	128
226 #define LOOKUP(x)	(((long)(x) >> 8) & (TABLESIZE - 1))
227 struct slpque {
228 	struct proc *sq_head;
229 	struct proc **sq_tailp;
230 } slpque[TABLESIZE];
231 
232 /*
233  * During autoconfiguration or after a panic, a sleep will simply
234  * lower the priority briefly to allow interrupts, then return.
235  * The priority to be used (safepri) is machine-dependent, thus this
236  * value is initialized and maintained in the machine-dependent layers.
237  * This priority will typically be 0, or the lowest priority
238  * that is safe for use on the interrupt stack; it can be made
239  * higher to block network software interrupts after panics.
240  */
241 int safepri;
242 
243 /*
244  * General sleep call.  Suspends the current process until a wakeup is
245  * performed on the specified identifier.  The process will then be made
246  * runnable with the specified priority.  Sleeps at most timo/hz seconds
247  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
248  * before and after sleeping, else signals are not checked.  Returns 0 if
249  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
250  * signal needs to be delivered, ERESTART is returned if the current system
251  * call should be restarted if possible, and EINTR is returned if the system
252  * call should be interrupted by the signal (return EINTR).
253  */
254 int
255 tsleep(ident, priority, wmesg, timo)
256 	void *ident;
257 	int priority, timo;
258 	char *wmesg;
259 {
260 	register struct proc *p = curproc;
261 	register struct slpque *qp;
262 	register s;
263 	int sig, catch = priority & PCATCH;
264 	extern int cold;
265 	void endtsleep __P((void *));
266 
267 #ifdef KTRACE
268 	if (KTRPOINT(p, KTR_CSW))
269 		ktrcsw(p->p_tracep, 1, 0);
270 #endif
271 	s = splhigh();
272 	if (cold || panicstr) {
273 		/*
274 		 * After a panic, or during autoconfiguration,
275 		 * just give interrupts a chance, then just return;
276 		 * don't run any other procs or panic below,
277 		 * in case this is the idle process and already asleep.
278 		 */
279 		splx(safepri);
280 		splx(s);
281 		return (0);
282 	}
283 #ifdef DIAGNOSTIC
284 	if (ident == NULL || p->p_stat != SRUN || p->p_back)
285 		panic("tsleep");
286 #endif
287 	p->p_wchan = ident;
288 	p->p_wmesg = wmesg;
289 	p->p_slptime = 0;
290 	p->p_priority = priority & PRIMASK;
291 	qp = &slpque[LOOKUP(ident)];
292 	if (qp->sq_head == 0)
293 		qp->sq_head = p;
294 	else
295 		*qp->sq_tailp = p;
296 	*(qp->sq_tailp = &p->p_forw) = 0;
297 	if (timo)
298 		timeout(endtsleep, (void *)p, timo);
299 	/*
300 	 * We put ourselves on the sleep queue and start our timeout
301 	 * before calling CURSIG, as we could stop there, and a wakeup
302 	 * or a SIGCONT (or both) could occur while we were stopped.
303 	 * A SIGCONT would cause us to be marked as SSLEEP
304 	 * without resuming us, thus we must be ready for sleep
305 	 * when CURSIG is called.  If the wakeup happens while we're
306 	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
307 	 */
308 	if (catch) {
309 		p->p_flag |= P_SINTR;
310 		if (sig = CURSIG(p)) {
311 			if (p->p_wchan)
312 				unsleep(p);
313 			p->p_stat = SRUN;
314 			goto resume;
315 		}
316 		if (p->p_wchan == 0) {
317 			catch = 0;
318 			goto resume;
319 		}
320 	} else
321 		sig = 0;
322 	p->p_stat = SSLEEP;
323 	p->p_stats->p_ru.ru_nvcsw++;
324 	mi_switch();
325 resume:
326 	curpriority = p->p_usrpri;
327 	splx(s);
328 	p->p_flag &= ~P_SINTR;
329 	if (p->p_flag & P_TIMEOUT) {
330 		p->p_flag &= ~P_TIMEOUT;
331 		if (sig == 0) {
332 #ifdef KTRACE
333 			if (KTRPOINT(p, KTR_CSW))
334 				ktrcsw(p->p_tracep, 0, 0);
335 #endif
336 			return (EWOULDBLOCK);
337 		}
338 	} else if (timo)
339 		untimeout(endtsleep, (void *)p);
340 	if (catch && (sig != 0 || (sig = CURSIG(p)))) {
341 #ifdef KTRACE
342 		if (KTRPOINT(p, KTR_CSW))
343 			ktrcsw(p->p_tracep, 0, 0);
344 #endif
345 		if (p->p_sigacts->ps_sigintr & sigmask(sig))
346 			return (EINTR);
347 		return (ERESTART);
348 	}
349 #ifdef KTRACE
350 	if (KTRPOINT(p, KTR_CSW))
351 		ktrcsw(p->p_tracep, 0, 0);
352 #endif
353 	return (0);
354 }
355 
356 /*
357  * Implement timeout for tsleep.
358  * If process hasn't been awakened (wchan non-zero),
359  * set timeout flag and undo the sleep.  If proc
360  * is stopped, just unsleep so it will remain stopped.
361  */
362 void
363 endtsleep(arg)
364 	void *arg;
365 {
366 	register struct proc *p;
367 	int s;
368 
369 	p = (struct proc *)arg;
370 	s = splhigh();
371 	if (p->p_wchan) {
372 		if (p->p_stat == SSLEEP)
373 			setrunnable(p);
374 		else
375 			unsleep(p);
376 		p->p_flag |= P_TIMEOUT;
377 	}
378 	splx(s);
379 }
380 
381 /*
382  * Short-term, non-interruptable sleep.
383  */
384 void
385 sleep(ident, priority)
386 	void *ident;
387 	int priority;
388 {
389 	register struct proc *p = curproc;
390 	register struct slpque *qp;
391 	register s;
392 	extern int cold;
393 
394 #ifdef DIAGNOSTIC
395 	if (priority > PZERO) {
396 		printf("sleep called with priority %d > PZERO, wchan: %x\n",
397 		    priority, ident);
398 		panic("old sleep");
399 	}
400 #endif
401 	s = splhigh();
402 	if (cold || panicstr) {
403 		/*
404 		 * After a panic, or during autoconfiguration,
405 		 * just give interrupts a chance, then just return;
406 		 * don't run any other procs or panic below,
407 		 * in case this is the idle process and already asleep.
408 		 */
409 		splx(safepri);
410 		splx(s);
411 		return;
412 	}
413 #ifdef DIAGNOSTIC
414 	if (ident == NULL || p->p_stat != SRUN || p->p_back)
415 		panic("sleep");
416 #endif
417 	p->p_wchan = ident;
418 	p->p_wmesg = NULL;
419 	p->p_slptime = 0;
420 	p->p_priority = priority;
421 	qp = &slpque[LOOKUP(ident)];
422 	if (qp->sq_head == 0)
423 		qp->sq_head = p;
424 	else
425 		*qp->sq_tailp = p;
426 	*(qp->sq_tailp = &p->p_forw) = 0;
427 	p->p_stat = SSLEEP;
428 	p->p_stats->p_ru.ru_nvcsw++;
429 #ifdef KTRACE
430 	if (KTRPOINT(p, KTR_CSW))
431 		ktrcsw(p->p_tracep, 1, 0);
432 #endif
433 	mi_switch();
434 #ifdef KTRACE
435 	if (KTRPOINT(p, KTR_CSW))
436 		ktrcsw(p->p_tracep, 0, 0);
437 #endif
438 	curpriority = p->p_usrpri;
439 	splx(s);
440 }
441 
442 /*
443  * Remove a process from its wait queue
444  */
445 void
446 unsleep(p)
447 	register struct proc *p;
448 {
449 	register struct slpque *qp;
450 	register struct proc **hp;
451 	int s;
452 
453 	s = splhigh();
454 	if (p->p_wchan) {
455 		hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
456 		while (*hp != p)
457 			hp = &(*hp)->p_forw;
458 		*hp = p->p_forw;
459 		if (qp->sq_tailp == &p->p_forw)
460 			qp->sq_tailp = hp;
461 		p->p_wchan = 0;
462 	}
463 	splx(s);
464 }
465 
466 /*
467  * Make all processes sleeping on the specified identifier runnable.
468  */
469 void
470 wakeup(ident)
471 	register void *ident;
472 {
473 	register struct slpque *qp;
474 	register struct proc *p, **q;
475 	int s;
476 
477 	s = splhigh();
478 	qp = &slpque[LOOKUP(ident)];
479 restart:
480 	for (q = &qp->sq_head; p = *q; ) {
481 #ifdef DIAGNOSTIC
482 		if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP)
483 			panic("wakeup");
484 #endif
485 		if (p->p_wchan == ident) {
486 			p->p_wchan = 0;
487 			*q = p->p_forw;
488 			if (qp->sq_tailp == &p->p_forw)
489 				qp->sq_tailp = q;
490 			if (p->p_stat == SSLEEP) {
491 				/* OPTIMIZED EXPANSION OF setrunnable(p); */
492 				if (p->p_slptime > 1)
493 					updatepri(p);
494 				p->p_slptime = 0;
495 				p->p_stat = SRUN;
496 				if (p->p_flag & P_INMEM)
497 					setrunqueue(p);
498 				/*
499 				 * Since curpriority is a user priority,
500 				 * p->p_priority is always better than
501 				 * curpriority.
502 				 */
503 				if ((p->p_flag & P_INMEM) == 0)
504 					wakeup((caddr_t)&proc0);
505 				else
506 					need_resched();
507 				/* END INLINE EXPANSION */
508 				goto restart;
509 			}
510 		} else
511 			q = &p->p_forw;
512 	}
513 	splx(s);
514 }
515 
516 /*
517  * The machine independent parts of mi_switch().
518  * Must be called at splstatclock() or higher.
519  */
520 void
521 mi_switch()
522 {
523 	register struct proc *p = curproc;	/* XXX */
524 	register struct rlimit *rlim;
525 	register long s, u;
526 	struct timeval tv;
527 
528 #ifdef DEBUG
529 	if (p->p_simple_locks)
530 		panic("sleep: holding simple lock");
531 #endif
532 	/*
533 	 * Compute the amount of time during which the current
534 	 * process was running, and add that to its total so far.
535 	 */
536 	microtime(&tv);
537 	u = p->p_rtime.tv_usec + (tv.tv_usec - runtime.tv_usec);
538 	s = p->p_rtime.tv_sec + (tv.tv_sec - runtime.tv_sec);
539 	if (u < 0) {
540 		u += 1000000;
541 		s--;
542 	} else if (u >= 1000000) {
543 		u -= 1000000;
544 		s++;
545 	}
546 	p->p_rtime.tv_usec = u;
547 	p->p_rtime.tv_sec = s;
548 
549 	/*
550 	 * Check if the process exceeds its cpu resource allocation.
551 	 * If over max, kill it.  In any case, if it has run for more
552 	 * than 10 minutes, reduce priority to give others a chance.
553 	 */
554 	rlim = &p->p_rlimit[RLIMIT_CPU];
555 	if (s >= rlim->rlim_cur) {
556 		if (s >= rlim->rlim_max)
557 			psignal(p, SIGKILL);
558 		else {
559 			psignal(p, SIGXCPU);
560 			if (rlim->rlim_cur < rlim->rlim_max)
561 				rlim->rlim_cur += 5;
562 		}
563 	}
564 	if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) {
565 		p->p_nice = NZERO + 4;
566 		resetpriority(p);
567 	}
568 
569 	/*
570 	 * Pick a new current process and record its start time.
571 	 */
572 	cnt.v_swtch++;
573 	cpu_switch(p);
574 	microtime(&runtime);
575 }
576 
577 /*
578  * Initialize the (doubly-linked) run queues
579  * to be empty.
580  */
581 void
582 rqinit()
583 {
584 	register int i;
585 
586 	for (i = 0; i < NQS; i++)
587 		qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
588 }
589 
590 /*
591  * Change process state to be runnable,
592  * placing it on the run queue if it is in memory,
593  * and awakening the swapper if it isn't in memory.
594  */
595 void
596 setrunnable(p)
597 	register struct proc *p;
598 {
599 	register int s;
600 
601 	s = splhigh();
602 	switch (p->p_stat) {
603 	case 0:
604 	case SRUN:
605 	case SZOMB:
606 	default:
607 		panic("setrunnable");
608 	case SSTOP:
609 	case SSLEEP:
610 		unsleep(p);		/* e.g. when sending signals */
611 		break;
612 
613 	case SIDL:
614 		break;
615 	}
616 	p->p_stat = SRUN;
617 	if (p->p_flag & P_INMEM)
618 		setrunqueue(p);
619 	splx(s);
620 	if (p->p_slptime > 1)
621 		updatepri(p);
622 	p->p_slptime = 0;
623 	if ((p->p_flag & P_INMEM) == 0)
624 		wakeup((caddr_t)&proc0);
625 	else if (p->p_priority < curpriority)
626 		need_resched();
627 }
628 
629 /*
630  * Compute the priority of a process when running in user mode.
631  * Arrange to reschedule if the resulting priority is better
632  * than that of the current process.
633  */
634 void
635 resetpriority(p)
636 	register struct proc *p;
637 {
638 	register unsigned int newpriority;
639 
640 	newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
641 	newpriority = min(newpriority, MAXPRI);
642 	p->p_usrpri = newpriority;
643 	if (newpriority < curpriority)
644 		need_resched();
645 }
646