xref: /dragonfly/sys/kern/usched_bsd4.c (revision 5de36205)
1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.1 2005/06/30 16:38:49 dillon Exp $
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/lock.h>
33 #include <sys/queue.h>
34 #include <sys/proc.h>
35 #include <sys/rtprio.h>
36 #include <sys/thread2.h>
37 #include <sys/uio.h>
38 #include <sys/sysctl.h>
39 #include <sys/resourcevar.h>
40 #include <machine/ipl.h>
41 #include <machine/cpu.h>
42 #include <machine/smp.h>
43 
44 /*
45  * Priorities.  Note that with 32 run queues per scheduler each queue
46  * represents four priority levels.
47  */
48 
49 #define MAXPRI			128
50 #define PRIMASK			(MAXPRI - 1)
51 #define PRIBASE_REALTIME	0
52 #define PRIBASE_NORMAL		MAXPRI
53 #define PRIBASE_IDLE		(MAXPRI * 2)
54 #define PRIBASE_THREAD		(MAXPRI * 3)
55 #define PRIBASE_NULL		(MAXPRI * 4)
56 
57 #define NQS	32			/* 32 run queues. */
58 #define PPQ	(MAXPRI / NQS)		/* priorities per queue */
59 
60 /*
61  * NICEPPQ	- number of nice units per priority queue
62  * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
63  *
64  * ESTCPUPPQ	- number of estcpu units per priority queue
65  * ESTCPUMAX	- number of estcpu units
66  * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
67  *		  100% cpu.
68  */
69 #define NICEPPQ		2
70 #define ESTCPURAMP	4
71 #define ESTCPUPPQ	512
72 #define ESTCPUMAX	(ESTCPUPPQ * NQS)
73 #define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
74 #define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)
75 
76 #define ESTCPULIM(v)	min((v), ESTCPUMAX)
77 
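/*
 * Illustrative sketch (hypothetical helper, not part of the original
 * scheduler): shows how the constants above fit together.  With
 * MAXPRI == 128 and NQS == 32 each run queue covers PPQ == 4 priority
 * levels, so this maps a normal-class user priority to its queue index
 * the same way bsd4_setrunqueue() does further below.
 */
static __inline int
example_normal_rqindex(int priority)
{
	return ((priority & PRIMASK) / PPQ);	/* 0 .. NQS - 1 */
}
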
78 TAILQ_HEAD(rq, proc);
79 
80 #define p_priority	p_usdata.bsd4.priority
81 #define p_rqindex	p_usdata.bsd4.rqindex
82 #define p_origcpu	p_usdata.bsd4.origcpu
83 #define p_estcpu	p_usdata.bsd4.estcpu
84 
85 static void bsd4_acquire_curproc(struct proc *p);
86 static void bsd4_release_curproc(struct proc *p);
87 static void bsd4_select_curproc(globaldata_t gd);
88 static void bsd4_setrunqueue(struct proc *p);
89 static void bsd4_remrunqueue(struct proc *p);
90 static void bsd4_schedulerclock(struct proc *p, sysclock_t period,
91 				sysclock_t cpstamp);
92 static void bsd4_resetpriority(struct proc *p);
93 static void bsd4_forking(struct proc *pp, struct proc *p);
94 static void bsd4_exiting(struct proc *pp, struct proc *p);
95 
96 static void bsd4_recalculate_estcpu(struct proc *p);
97 
98 struct usched usched_bsd4 = {
99 	{ NULL },
100 	"bsd4", "Original DragonFly Scheduler",
101 	bsd4_acquire_curproc,
102 	bsd4_release_curproc,
103 	bsd4_select_curproc,
104 	bsd4_setrunqueue,
105 	bsd4_remrunqueue,
106 	bsd4_schedulerclock,
107 	bsd4_recalculate_estcpu,
108 	bsd4_resetpriority,
109 	bsd4_forking,
110 	bsd4_exiting
111 };
112 
113 /*
114  * We have NQS (32) run queues per scheduling class.  For the normal
115  * class, there are 128 priorities scaled onto these 32 queues.  New
116  * processes are added to the last entry in each queue, and processes
117  * are selected for running by taking them from the head and maintaining
118  * a simple FIFO arrangement.  Realtime and Idle priority processes have
119  * an explicit 0-31 priority which maps directly onto their class queue
120  * index.  When a queue has something in it, the corresponding bit is
121  * set in the queuebits variable, allowing a single read to determine
122  * the state of all 32 queues and then a ffs() to find the first busy
123  * queue.
124  */
125 static struct rq queues[NQS];
126 static struct rq rtqueues[NQS];
127 static struct rq idqueues[NQS];
128 static u_int32_t queuebits;
129 static u_int32_t rtqueuebits;
130 static u_int32_t idqueuebits;
131 static cpumask_t curprocmask = -1;	/* currently running a user process */
132 static cpumask_t rdyprocmask;		/* ready to accept a user process */
133 static int	 runqcount;
134 #ifdef SMP
135 static int	 scancpu;
136 #endif
137 
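/*
 * Illustrative sketch (hypothetical, not used by the scheduler): each of
 * the *queuebits words above encodes "queue N is non-empty" in bit N, so
 * locating the best busy queue is a single find-first-set, which is
 * exactly what chooseproc() below does with bsfl().
 */
static __inline int
example_first_busy_queue(u_int32_t qbits)
{
	if (qbits == 0)
		return (-1);		/* nothing runnable in this class */
	return (bsfl(qbits));		/* index of the lowest set bit */
}
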
138 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
139 #ifdef INVARIANTS
140 static int usched_nonoptimal;
141 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
142         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
143 static int usched_optimal;
144 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
145         &usched_optimal, 0, "acquire_curproc() was optimal");
146 #endif
147 static int usched_debug = -1;
148 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
149 #ifdef SMP
150 static int remote_resched = 1;
151 static int remote_resched_nonaffinity;
152 static int remote_resched_affinity;
153 static int choose_affinity;
154 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
155         &remote_resched, 0, "Resched to another cpu");
156 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
157         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
158 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
159         &remote_resched_affinity, 0, "Number of remote rescheds");
160 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
161         &choose_affinity, 0, "chooseproc() was smart");
162 #endif
163 
164 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
165 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
166         &usched_bsd4_rrinterval, 0, "");
167 static int usched_bsd4_decay = ESTCPUINCR / 2;
168 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
169         &usched_bsd4_decay, 0, "");
170 
171 /*
172  * Initialize the run queues at boot time.
173  */
174 static void
175 rqinit(void *dummy)
176 {
177 	int i;
178 
179 	for (i = 0; i < NQS; i++) {
180 		TAILQ_INIT(&queues[i]);
181 		TAILQ_INIT(&rtqueues[i]);
182 		TAILQ_INIT(&idqueues[i]);
183 	}
184 	curprocmask &= ~1;
185 }
186 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
187 
188 /*
189  * chooseproc() is called when a cpu needs a user process to LWKT schedule.
190  * It selects a user process and returns it.  If chkp is non-NULL and chkp
191  * has a priority better than or equal to that of the process that would
192  * otherwise be chosen, NULL is returned.
193  *
194  * Until we fix the RUNQ code the chkp test has to be strict or we may
195  * bounce between processes trying to acquire the current process designation.
196  */
197 static
198 struct proc *
199 chooseproc(struct proc *chkp)
200 {
201 	struct proc *p;
202 	struct rq *q;
203 	u_int32_t *which;
204 	u_int32_t pri;
205 
206 	if (rtqueuebits) {
207 		pri = bsfl(rtqueuebits);
208 		q = &rtqueues[pri];
209 		which = &rtqueuebits;
210 	} else if (queuebits) {
211 		pri = bsfl(queuebits);
212 		q = &queues[pri];
213 		which = &queuebits;
214 	} else if (idqueuebits) {
215 		pri = bsfl(idqueuebits);
216 		q = &idqueues[pri];
217 		which = &idqueuebits;
218 	} else {
219 		return NULL;
220 	}
221 	p = TAILQ_FIRST(q);
222 	KASSERT(p, ("chooseproc: no proc on busy queue"));
223 
224 	/*
225 	 * If the passed process <chkp> is reasonably close to the selected
226 	 * processed <p>, return NULL (indicating that <chkp> should be kept).
227 	 *
228 	 * Note that we must err on the side of <chkp> to avoid bouncing
229 	 * between threads in the acquire code.
230 	 */
231 	if (chkp) {
232 		if (chkp->p_priority < p->p_priority + PPQ)
233 			return(NULL);
234 	}
235 
236 #ifdef SMP
237 	/*
238 	 * If the chosen process does not reside on this cpu spend a few
239 	 * cycles looking for a better candidate at the same priority level.
240 	 * This is a fallback check, setrunqueue() tries to wakeup the
241 	 * correct cpu and is our front-line affinity.
242 	 */
243 	if (p->p_thread->td_gd != mycpu &&
244 	    (chkp = TAILQ_NEXT(p, p_procq)) != NULL
245 	) {
246 		if (chkp->p_thread->td_gd == mycpu) {
247 			++choose_affinity;
248 			p = chkp;
249 		}
250 	}
251 #endif
252 
253 	TAILQ_REMOVE(q, p, p_procq);
254 	--runqcount;
255 	if (TAILQ_EMPTY(q))
256 		*which &= ~(1 << pri);
257 	KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
258 	p->p_flag &= ~P_ONRUNQ;
259 	return p;
260 }
261 
262 #ifdef SMP
263 /*
264  * called via an ipi message to reschedule on another cpu.
265  */
266 static
267 void
268 need_user_resched_remote(void *dummy)
269 {
270 	need_user_resched();
271 }
272 
273 #endif
274 
275 /*
276  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
277  * user process may represent any user process, including the current
278  * process.
279  *
280  * If P_PASSIVE_ACQ is set, setrunqueue() will not wakeup potential target
281  * cpus in an attempt to keep the process on the current cpu at least for
282  * a little while to take advantage of locality of reference (e.g. fork/exec
283  * or short fork/exit, and uio_yield()).
284  *
285  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
286  * or (user level) preempt on the same cpu that a process was previously
287  * scheduled to.  If we cannot do this but we are at a sufficiently higher
288  * priority than the processes running on other cpus, we will allow the
289  * process to be stolen by another cpu.
290  *
291  * WARNING! a thread can be acquired by another cpu the moment it is put
292  * on the user scheduler's run queue AND we release the MP lock.  Since we
293  * release the MP lock before switching out, another cpu may begin stealing
294  * our current thread before we are completely switched out!  The
295  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
296  * thread before stealing it.
297  *
298  * NOTE on need_user_resched() calls: we have to call need_user_resched()
299  * if the new process is more important than the current process, or if
300  * the new process is the current process and is now less important than
301  * other processes.
302  *
303  * The associated thread must NOT be scheduled.
304  * The process must be runnable.
305  * This must be called at splhigh().
306  */
307 static void
308 bsd4_setrunqueue(struct proc *p)
309 {
310 	struct rq *q;
311 	struct globaldata *gd;
312 	int pri;
313 	int cpuid;
314 	u_int32_t needresched;
315 #ifdef SMP
316 	int count;
317 	cpumask_t mask;
318 #endif
319 
320 	crit_enter();
321 	KASSERT(p->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
322 	KASSERT((p->p_flag & P_ONRUNQ) == 0,
323 	    ("process %d already on runq! flag %08x", p->p_pid, p->p_flag));
324 	KKASSERT((p->p_thread->td_flags & TDF_RUNQ) == 0);
325 
326 	/*
327 	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
328 	 */
329 	gd = p->p_thread->td_gd;
330 	p->p_slptime = 0;
331 
332 	/*
333 	 * We have not been released, make sure that we are not the currently
334 	 * designated process.
335 	 */
336 	KKASSERT(gd->gd_uschedcp != p);
337 
338 	/*
339 	 * Check cpu affinity.  The associated thread is stable at the
340 	 * moment.  Note that we may be checking another cpu here so we
341 	 * have to be careful.  We are currently protected by the BGL.
342 	 *
343 	 * This allows us to avoid actually queueing the process.
344 	 * acquire_curproc() will handle any threads we mistakenly schedule.
345 	 */
346 	cpuid = gd->gd_cpuid;
347 
348 	if ((curprocmask & (1 << cpuid)) == 0) {
349 		curprocmask |= 1 << cpuid;
350 		gd->gd_uschedcp = p;
351 		gd->gd_upri = p->p_priority;
352 		lwkt_schedule(p->p_thread);
353 		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
354 		crit_exit();
355 #ifdef SMP
356 		if (gd != mycpu)
357 			++remote_resched_affinity;
358 #endif
359 		return;
360 	}
361 
362 	/*
363 	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
364 	 * to place this process on the userland scheduler's run queue for
365 	 * action by the target cpu.
366 	 */
367 	++runqcount;
368 	p->p_flag |= P_ONRUNQ;
369 	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
370 		pri = (p->p_priority & PRIMASK) / PPQ;
371 		q = &queues[pri];
372 		queuebits |= 1 << pri;
373 		needresched = (queuebits & ((1 << pri) - 1));
374 	} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
375 		   p->p_rtprio.type == RTP_PRIO_FIFO) {
376 		pri = (u_int8_t)p->p_rtprio.prio;
377 		q = &rtqueues[pri];
378 		rtqueuebits |= 1 << pri;
379 		needresched = (rtqueuebits & ((1 << pri) - 1));
380 	} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
381 		pri = (u_int8_t)p->p_rtprio.prio;
382 		q = &idqueues[pri];
383 		idqueuebits |= 1 << pri;
384 		needresched = (idqueuebits & ((1 << pri) - 1));
385 	} else {
386 		needresched = 0;
387 		panic("setrunqueue: invalid rtprio type");
388 	}
389 	KKASSERT(pri < 32);
390 	p->p_rqindex = pri;		/* remember the queue index */
391 	TAILQ_INSERT_TAIL(q, p, p_procq);
392 
393 #ifdef SMP
394 	/*
395 	 * Either wakeup other cpus' user thread schedulers or request
396 	 * preemption on other cpus (which will also wakeup a HLTed cpu).
397 	 *
398 	 * NOTE!  gd and cpuid may still be our 'hint', not our current
399 	 * cpu info.
400 	 */
401 
402 	count = runqcount;
403 
404 	/*
405 	 * Check cpu affinity for user preemption (when the curprocmask bit
406 	 * is set).  Note that gd_upri is a speculative field (we modify
407 	 * another cpu's gd_upri to avoid sending ipiq storms).
408 	 */
409 	if (gd == mycpu) {
410 		if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
411 			if (p->p_priority < gd->gd_upri - PPQ) {
412 				gd->gd_upri = p->p_priority;
413 				gd->gd_rrcount = 0;
414 				need_user_resched();
415 				--count;
416 			} else if (gd->gd_uschedcp == p && needresched) {
417 				gd->gd_rrcount = 0;
418 				need_user_resched();
419 				--count;
420 			}
421 		}
422 	} else if (remote_resched) {
423 		if (p->p_priority < gd->gd_upri - PPQ) {
424 			gd->gd_upri = p->p_priority;
425 			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
426 			--count;
427 			++remote_resched_affinity;
428 		}
429 	}
430 
431 	/*
432 	 * No affinity, first schedule to any cpus that do not have a current
433 	 * process.  If there is a free cpu we always schedule to it.
434 	 */
435 	if (count &&
436 	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
437 	    (p->p_flag & P_PASSIVE_ACQ) == 0) {
438 		if (!mask)
439 			printf("PROC %d nocpu to schedule it on\n", p->p_pid);
440 		while (mask && count) {
441 			cpuid = bsfl(mask);
442 			KKASSERT((curprocmask & (1 << cpuid)) == 0);
443 			rdyprocmask &= ~(1 << cpuid);
444 			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
445 			--count;
446 			mask &= ~(1 << cpuid);
447 		}
448 	}
449 
450 	/*
451 	 * If there are still runnable processes try to wakeup a random
452 	 * cpu that is running a much lower priority process in order to
453 	 * preempt on it.  Note that gd_upri is only a hint, so we can
454 	 * overwrite it from the wrong cpu.   If we can't find one, we
455 	 * are SOL.
456 	 *
457 	 * We depress the priority check so multiple cpu bound programs
458 	 * do not bounce between cpus.  Remember that the clock interrupt
459 	 * will also cause all cpus to reschedule.
460 	 *
461 	 * We must mask against rdyprocmask or we will race in the boot
462 	 * code (before all cpus have working scheduler helpers), plus
463 	 * some cpus might not be operational and/or not configured to
464 	 * handle user processes.
465 	 */
466 	if (count && remote_resched && ncpus > 1) {
467 		cpuid = scancpu;
468 		do {
469 			if (++cpuid == ncpus)
470 				cpuid = 0;
471 		} while (cpuid == mycpu->gd_cpuid);
472 		scancpu = cpuid;
473 
474 		if (rdyprocmask & (1 << cpuid)) {
475 			gd = globaldata_find(cpuid);
476 
477 			if (p->p_priority < gd->gd_upri - PPQ) {
478 				gd->gd_upri = p->p_priority;
479 				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
480 				++remote_resched_nonaffinity;
481 			}
482 		}
483 	}
484 #else
485 	if ((p->p_thread->td_flags & TDF_NORESCHED) == 0) {
486 		if (p->p_priority < gd->gd_upri - PPQ) {
487 			gd->gd_upri = p->p_priority;
488 			gd->gd_rrcount = 0;
489 			need_user_resched();
490 		} else if (gd->gd_uschedcp == p && needresched) {
491 			gd->gd_rrcount = 0;
492 			need_user_resched();
493 		}
494 	}
495 #endif
496 	crit_exit();
497 }
498 
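/*
 * Illustrative sketch (hypothetical helper): the needresched test used in
 * bsd4_setrunqueue() above.  ((1 << pri) - 1) masks the bits for queues
 * with a lower index than pri, i.e. queues holding better-priority
 * processes, so a non-zero result means the process we just queued has
 * competition ahead of it and a user reschedule may be warranted.
 */
static __inline int
example_better_queue_busy(u_int32_t qbits, int pri)
{
	return ((qbits & ((1 << pri) - 1)) != 0);
}
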
499 /*
500  * remrunqueue() removes a given process from the run queue that it is on,
501  * clearing the queue busy bit if it becomes empty.  This function is called
502  * when a userland process is selected for LWKT scheduling.  Note that
503  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
504  * several userland processes whose threads are scheduled or otherwise in
505  * a special state, and such processes are NOT on the userland scheduler's
506  * run queue.
507  *
508  * This must be called at splhigh().
509  */
510 static void
511 bsd4_remrunqueue(struct proc *p)
512 {
513 	struct rq *q;
514 	u_int32_t *which;
515 	u_int8_t pri;
516 
517 	crit_enter();
518 	KASSERT((p->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
519 	p->p_flag &= ~P_ONRUNQ;
520 	--runqcount;
521 	KKASSERT(runqcount >= 0);
522 	pri = p->p_rqindex;
523 	if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
524 		q = &queues[pri];
525 		which = &queuebits;
526 	} else if (p->p_rtprio.type == RTP_PRIO_REALTIME ||
527 		   p->p_rtprio.type == RTP_PRIO_FIFO) {
528 		q = &rtqueues[pri];
529 		which = &rtqueuebits;
530 	} else if (p->p_rtprio.type == RTP_PRIO_IDLE) {
531 		q = &idqueues[pri];
532 		which = &idqueuebits;
533 	} else {
534 		panic("remrunqueue: invalid rtprio type");
535 	}
536 	TAILQ_REMOVE(q, p, p_procq);
537 	if (TAILQ_EMPTY(q)) {
538 		KASSERT((*which & (1 << pri)) != 0,
539 			("remrunqueue: remove from empty queue"));
540 		*which &= ~(1 << pri);
541 	}
542 	crit_exit();
543 }
544 
545 /*
546  * This routine is called from a systimer IPI.  It MUST be MP-safe and
547  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
548  */
549 static
550 void
551 bsd4_schedulerclock(struct proc *p, sysclock_t period, sysclock_t cpstamp)
552 {
553 	globaldata_t gd = mycpu;
554 
555 	/*
556 	 * Do we need to round-robin?  We round-robin 10 times a second.
557 	 * This should only occur for cpu-bound batch processes.
558 	 */
559 	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
560 		gd->gd_rrcount = 0;
561 		need_user_resched();
562 	}
563 
564 	/*
565 	 * As the process accumulates cpu time p_estcpu is bumped and may
566 	 * push the process into another scheduling queue.  It typically
567 	 * takes 4 ticks to bump the queue.
568 	 */
569 	p->p_estcpu = ESTCPULIM(p->p_estcpu + ESTCPUINCR);
570 
571 	/*
572 	 * Reducing p_origcpu over time causes more of our estcpu to be
573 	 * returned to the parent when we exit.  This is a small tweak
574 	 * for the batch detection heuristic.
575 	 */
576 	if (p->p_origcpu)
577 		--p->p_origcpu;
578 
579 	/* XXX optimize, avoid lock if no reset is required */
580 	if (try_mplock()) {
581 		bsd4_resetpriority(p);
582 		rel_mplock();
583 	}
584 }
585 
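/*
 * Illustrative arithmetic (assuming the default tunables above): with
 * ESTCPUPPQ == 512 and ESTCPURAMP == 4, ESTCPUINCR is 128, so a process
 * running at 100% cpu accumulates one queue's worth of estcpu every
 * ESTCPURAMP scheduler ticks, matching the "typically takes 4 ticks"
 * note in bsd4_schedulerclock() above.
 */
static __inline int
example_ticks_per_queue_hop(void)
{
	return (ESTCPUPPQ / ESTCPUINCR);	/* == ESTCPURAMP == 4 */
}
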
586 /*
587  * Release the current process designation on p.  P MUST BE CURPROC.
588  * Attempt to assign a new current process from the run queue.
589  *
590  * This function is called from exit1(), tsleep(), and the passive
591  * release code setup in <arch>/<arch>/trap.c
592  *
593  * If we do not have or cannot get the MP lock we just wakeup the userland
594  * helper scheduler thread for this cpu to do the work for us.
595  *
596  * WARNING!  The MP lock may be in an unsynchronized state due to the
597  * way get_mplock() works and the fact that this function may be called
598  * from a passive release during a lwkt_switch().   try_mplock() will deal
599  * with this for us but you should be aware that td_mpcount may not be
600  * useable.
601  */
602 static void
603 bsd4_release_curproc(struct proc *p)
604 {
605 	int cpuid;
606 	globaldata_t gd = mycpu;
607 
608 	KKASSERT(p->p_thread->td_gd == gd);
609 	crit_enter();
610 	cpuid = gd->gd_cpuid;
611 
612 	if (gd->gd_uschedcp == p) {
613 		if (try_mplock()) {
614 			/*
615 			 * YYY when the MP lock is not assumed (see else) we
616 			 * will have to check that gd_uschedcp is still == p
617 			 * after acquisition of the MP lock
618 			 */
619 			gd->gd_uschedcp = NULL;
620 			gd->gd_upri = PRIBASE_NULL;
621 			bsd4_select_curproc(gd);
622 			rel_mplock();
623 		} else {
624 			KKASSERT(0);	/* MP LOCK ALWAYS HELD AT THE MOMENT */
625 			gd->gd_uschedcp = NULL;
626 			gd->gd_upri = PRIBASE_NULL;
627 			/* YYY uschedcp and curprocmask */
628 			if (runqcount && (rdyprocmask & (1 << cpuid))) {
629 				rdyprocmask &= ~(1 << cpuid);
630 				lwkt_schedule(&mycpu->gd_schedthread);
631 			}
632 		}
633 	}
634 	crit_exit();
635 }
636 
637 /*
638  * Select a new current process, potentially retaining gd_uschedcp.  However,
639  * be sure to round-robin.  This routine is generally only called if a
640  * reschedule is requested and that typically only occurs if a new process
641  * has a better priority or when we are round-robining.
642  *
643  * NOTE: Must be called with giant held and the current cpu's gd.
644  * NOTE: The caller must handle the situation where it loses a
645  *	uschedcp designation that it previously held, typically by
646  *	calling acquire_curproc() again.
647  * NOTE: May not block
648  */
649 static
650 void
651 bsd4_select_curproc(globaldata_t gd)
652 {
653 	struct proc *np;
654 	int cpuid = gd->gd_cpuid;
655 	void *old;
656 
657 	clear_user_resched();
658 
659 	/*
660 	 * Choose the next designated current user process.
661 	 * Note that we cannot schedule gd_schedthread
662 	 * if runqcount is 0 without creating a scheduling
663 	 * loop.
664 	 *
665 	 * We do not clear the user resched request here,
666 	 * we need to test it later when we re-acquire.
667 	 *
668 	 * NOTE: chooseproc returns NULL if the chosen proc
669 	 * is gd_uschedcp. XXX needs cleanup.
670 	 */
671 	old = gd->gd_uschedcp;
672 	if ((np = chooseproc(gd->gd_uschedcp)) != NULL) {
673 		curprocmask |= 1 << cpuid;
674 		gd->gd_upri = np->p_priority;
675 		gd->gd_uschedcp = np;
676 		lwkt_acquire(np->p_thread);
677 		lwkt_schedule(np->p_thread);
678 	} else if (gd->gd_uschedcp) {
679 		gd->gd_upri = gd->gd_uschedcp->p_priority;
680 		KKASSERT(curprocmask & (1 << cpuid));
681 	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
682 		/*gd->gd_uschedcp = NULL;*/
683 		curprocmask &= ~(1 << cpuid);
684 		rdyprocmask &= ~(1 << cpuid);
685 		lwkt_schedule(&gd->gd_schedthread);
686 	} else {
687 		/*gd->gd_uschedcp = NULL;*/
688 		curprocmask &= ~(1 << cpuid);
689 	}
690 }
691 
692 /*
693  * Acquire the current process designation on the CURRENT process only.
694  * This function is called at kernel-user priority (not userland priority)
695  * when curproc does not match gd_uschedcp.
696  *
697  * Basically we recalculate our estcpu to hopefully give us a more
698  * favorable disposition, setrunqueue, then wait for the curproc
699  * designation to be handed to us (if the setrunqueue didn't do it).
700  */
701 static void
702 bsd4_acquire_curproc(struct proc *p)
703 {
704 	globaldata_t gd = mycpu;
705 
706 	crit_enter();
707 	++p->p_stats->p_ru.ru_nivcsw;
708 
709 	/*
710 	 * Loop until we become the current process.
711 	 */
712 	do {
713 		KKASSERT(p == gd->gd_curthread->td_proc);
714 		bsd4_recalculate_estcpu(p);
715 		lwkt_deschedule_self(gd->gd_curthread);
716 		bsd4_setrunqueue(p);
717 		lwkt_switch();
718 
719 		/*
720 		 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
721 		 */
722 		gd = mycpu;
723 	} while (gd->gd_uschedcp != p);
724 
725 	crit_exit();
726 
727 	/*
728 	 * That's it.  Cleanup, we are done.  The caller can return to
729 	 * user mode now.
730 	 */
731 	KKASSERT((p->p_flag & P_ONRUNQ) == 0);
732 }
733 
734 /*
735  * Compute the priority of a process when running in user mode.
736  * Arrange to reschedule if the resulting priority is better
737  * than that of the current process.
738  */
739 static void
740 bsd4_resetpriority(struct proc *p)
741 {
742 	int newpriority;
743 	int opq;
744 	int npq;
745 
746 	/*
747 	 * Set p_priority for general process comparisons
748 	 */
749 	switch(p->p_rtprio.type) {
750 	case RTP_PRIO_REALTIME:
751 		p->p_priority = PRIBASE_REALTIME + p->p_rtprio.prio;
752 		return;
753 	case RTP_PRIO_NORMAL:
754 		break;
755 	case RTP_PRIO_IDLE:
756 		p->p_priority = PRIBASE_IDLE + p->p_rtprio.prio;
757 		return;
758 	case RTP_PRIO_THREAD:
759 		p->p_priority = PRIBASE_THREAD + p->p_rtprio.prio;
760 		return;
761 	}
762 
763 	/*
764 	 * NORMAL priorities fall through.  These are based on niceness
765 	 * and cpu use.  Lower numbers == higher priorities.
766 	 *
767 	 * Calculate our priority based on our niceness and estimated cpu.
768 	 * Note that the nice value adjusts the baseline, which affects
769 	 * cpu bursts but does not affect overall cpu use between cpu-bound
770 	 * processes.  The use of the nice field in the decay calculation
771 	 * controls the overall cpu use.
772 	 *
773 	 * This isn't an exact calculation.  We fit the full nice and
774 	 * estcpu range into the priority range so the actual PPQ value
775 	 * is incorrect, but it's still a reasonable way to think about it.
776 	 */
777 	newpriority = (p->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
778 	newpriority += p->p_estcpu * PPQ / ESTCPUPPQ;
779 	newpriority = newpriority * MAXPRI /
780 		    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
781 	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
782 	newpriority = MAX(newpriority, 0);		/* sanity */
783 	npq = newpriority / PPQ;
784 	crit_enter();
785 	opq = (p->p_priority & PRIMASK) / PPQ;
786 	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
787 		/*
788 		 * We have to move the process to another queue
789 		 */
790 		bsd4_remrunqueue(p);
791 		p->p_priority = PRIBASE_NORMAL + newpriority;
792 		bsd4_setrunqueue(p);
793 	} else {
794 		/*
795 		 * We can just adjust the priority and it will be picked
796 		 * up later.
797 		 */
798 		KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
799 		p->p_priority = PRIBASE_NORMAL + newpriority;
800 	}
801 	crit_exit();
802 }
803 
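/*
 * Illustrative sketch (hypothetical helper, same arithmetic as
 * bsd4_resetpriority() above): computes the 0..MAXPRI-1 user priority of
 * a normal-class process from its nice value and estcpu.  With the
 * default constants a nice 0 process with zero estcpu maps to priority
 * 24 (queue 6), while the same process at ESTCPUMAX maps to priority
 * 102 (queue 25).
 */
static __inline int
example_normal_priority(int nice, int estcpu)
{
	int pri;

	pri = (nice - PRIO_MIN) * PPQ / NICEPPQ + estcpu * PPQ / ESTCPUPPQ;
	pri = pri * MAXPRI /
	      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
	if (pri > MAXPRI - 1)
		pri = MAXPRI - 1;	/* sanity, as in bsd4_resetpriority() */
	if (pri < 0)
		pri = 0;
	return (pri);
}
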
804 /*
805  * Called from fork1() when a new child process is being created.
806  *
807  * Give the child process an initial estcpu that is more batch-like than
808  * its parent's, and dock the parent for the fork (but do not
809  * reschedule the parent).   This comprises the main part of our batch
810  * detection heuristic for both parallel forking and sequential execs.
811  *
812  * Interactive processes will decay the boosted estcpu quickly while batch
813  * processes will tend to compound it.
814  */
815 static void
816 bsd4_forking(struct proc *pp, struct proc *p)
817 {
818 	p->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);
819 	p->p_origcpu = p->p_estcpu;
820 	pp->p_estcpu = ESTCPULIM(pp->p_estcpu + ESTCPUPPQ);
821 }
822 
823 /*
824  * Called when the parent reaps a child.   Propagate cpu use by the child
825  * back to the parent.
826  */
827 static void
828 bsd4_exiting(struct proc *pp, struct proc *p)
829 {
830 	int delta;
831 
832 	if (pp->p_pid != 1) {
833 		delta = p->p_estcpu - p->p_origcpu;
834 		if (delta > 0)
835 			pp->p_estcpu = ESTCPULIM(pp->p_estcpu + delta);
836 	}
837 }
838 
839 /*
840  * Called from acquire and from kern_synch's one-second timer with a
841  * critical section held.
842  *
843  * Decay p_estcpu based on the number of ticks we haven't been running
844  * and our p_nice.  As the load increases each process observes a larger
845  * number of idle ticks (because other processes are running in them).
846  * This observation leads to a larger correction which tends to make the
847  * system more 'batchy'.
848  *
849  * Note that no recalculation occurs for a process which sleeps and wakes
850  * up in the same tick.  That is, a system doing thousands of context
851  * switches per second will still only do serious estcpu calculations
852  * ESTCPUFREQ times per second.
853  */
854 static
855 void
856 bsd4_recalculate_estcpu(struct proc *p)
857 {
858 	globaldata_t gd = mycpu;
859 	sysclock_t cpbase;
860 	int loadfac;
861 	int ndecay;
862 	int nticks;
863 	int nleft;
864 
865 	/*
866 	 * We have to subtract periodic to get the last schedclock
867 	 * timeout time, otherwise we would get the upcoming timeout.
868 	 * Keep in mind that a process can migrate between cpus and
869 	 * while the scheduler clock should be very close, boundary
870 	 * conditions could lead to a small negative delta.
871 	 */
872 	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
873 
874 	if (p->p_slptime > 1) {
875 		/*
876 		 * Too much time has passed, do a coarse correction.
877 		 */
878 		p->p_estcpu = p->p_estcpu >> 1;
879 		bsd4_resetpriority(p);
880 		p->p_cpbase = cpbase;
881 		p->p_cpticks = 0;
882 	} else if (p->p_cpbase != cpbase) {
883 		/*
884 		 * Adjust estcpu if we are in a different tick.  Don't waste
885 		 * time if we are in the same tick.
886 		 *
887 		 * First calculate the number of ticks in the measurement
888 		 * interval.
889 		 */
890 		nticks = (cpbase - p->p_cpbase) / gd->gd_schedclock.periodic;
891 		updatepcpu(p, p->p_cpticks, nticks);
892 
893 		if ((nleft = nticks - p->p_cpticks) < 0)
894 			nleft = 0;
895 		if (usched_debug == p->p_pid) {
896 			printf("pid %d estcpu %d cpticks %d nticks %d nleft %d",
897 				p->p_pid, p->p_estcpu,
898 				p->p_cpticks, nticks, nleft);
899 		}
900 
901 		/*
902 		 * Calculate a decay value based on ticks remaining scaled
903 		 * down by the instantaneous load and p_nice.
904 		 */
905 		if ((loadfac = runqcount) < 2)
906 			loadfac = 2;
907 		ndecay = nleft * usched_bsd4_decay * 2 *
908 			(PRIO_MAX * 2 - p->p_nice) / (loadfac * PRIO_MAX * 2);
909 
910 		/*
911 		 * Adjust p_estcpu.  Handle a border case where batch jobs
912 		 * can get stalled long enough to decay to zero when they
913 		 * shouldn't.
914 		 */
915 		if (p->p_estcpu > ndecay * 2)
916 			p->p_estcpu -= ndecay;
917 		else
918 			p->p_estcpu >>= 1;
919 
920 		if (usched_debug == p->p_pid)
921 			printf(" ndecay %d estcpu %d\n", ndecay, p->p_estcpu);
922 
923 		bsd4_resetpriority(p);
924 		p->p_cpbase = cpbase;
925 		p->p_cpticks = 0;
926 	}
927 }
928 
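/*
 * Illustrative arithmetic (assuming the default tunables): for a nice 0
 * process on an otherwise idle system (loadfac clamped to 2), the decay
 * computed in bsd4_recalculate_estcpu() above reduces to
 * nleft * usched_bsd4_decay, i.e. each tick spent not running removes 64
 * estcpu units, half of the 128 (ESTCPUINCR) that a tick of running adds.
 * Higher load or a higher nice value shrinks the decay, which is what
 * makes a loaded system behave more 'batchy'.
 */
static __inline int
example_estcpu_decay(int nleft, int nice, int loadfac)
{
	return (nleft * usched_bsd4_decay * 2 *
		(PRIO_MAX * 2 - nice) / (loadfac * PRIO_MAX * 2));
}
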
929 #ifdef SMP
930 
931 /*
932  * For SMP systems a user scheduler helper thread is created for each
933  * cpu and is used to allow one cpu to wakeup another for the purposes of
934  * scheduling userland threads from setrunqueue().  UP systems do not
935  * need the helper since there is only one cpu.  We can't use the idle
936  * thread for this because we need to hold the MP lock.  Additionally,
937  * doing things this way allows us to HLT idle cpus on MP systems.
938  */
939 static void
940 sched_thread(void *dummy)
941 {
942     globaldata_t gd = mycpu;
943     int cpuid = gd->gd_cpuid;		/* doesn't change */
944     u_int32_t cpumask = 1 << cpuid;	/* doesn't change */
945 
946     get_mplock();			/* hold the MP lock */
947     for (;;) {
948 	struct proc *np;
949 
950 	lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
951 	rdyprocmask |= cpumask;
952 	crit_enter_quick(gd->gd_curthread);
953 	if ((curprocmask & cpumask) == 0 && (np = chooseproc(NULL)) != NULL) {
954 	    curprocmask |= cpumask;
955 	    gd->gd_upri = np->p_priority;
956 	    gd->gd_uschedcp = np;
957 	    lwkt_acquire(np->p_thread);
958 	    lwkt_schedule(np->p_thread);
959 	}
960 	crit_exit_quick(gd->gd_curthread);
961 	lwkt_switch();
962     }
963 }
964 
965 /*
966  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
967  * been cleared by rqinit() and we should not mess with it further.
968  */
969 static void
970 sched_thread_cpu_init(void)
971 {
972     int i;
973 
974     if (bootverbose)
975 	printf("start scheduler helpers on cpus:");
976 
977     for (i = 0; i < ncpus; ++i) {
978 	globaldata_t dgd = globaldata_find(i);
979 	cpumask_t mask = 1 << i;
980 
981 	if ((mask & smp_active_mask) == 0)
982 	    continue;
983 
984 	if (bootverbose)
985 	    printf(" %d", i);
986 
987 	lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
988 		    TDF_STOPREQ, i, "usched %d", i);
989 
990 	/*
991 	 * Allow user scheduling on the target cpu.  cpu #0 has already
992 	 * been enabled in rqinit().
993 	 */
994 	if (i)
995 	    curprocmask &= ~mask;
996 	rdyprocmask |= mask;
997     }
998     if (bootverbose)
999 	printf("\n");
1000 }
1001 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
1002 
1003 #endif
1004 
1005