xref: /dragonfly/sys/kern/usched_bsd4.c (revision 2e3ed54d)
1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.6 2005/11/21 21:59:50 dillon Exp $
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/lock.h>
33 #include <sys/queue.h>
34 #include <sys/proc.h>
35 #include <sys/rtprio.h>
36 #include <sys/thread2.h>
37 #include <sys/uio.h>
38 #include <sys/sysctl.h>
39 #include <sys/resourcevar.h>
40 #include <machine/ipl.h>
41 #include <machine/cpu.h>
42 #include <machine/smp.h>
43 
44 /*
45  * Priorities.  Note that with 32 run queues per scheduler each queue
46  * represents four priority levels.
47  */
48 
49 #define MAXPRI			128
50 #define PRIMASK			(MAXPRI - 1)
51 #define PRIBASE_REALTIME	0
52 #define PRIBASE_NORMAL		MAXPRI
53 #define PRIBASE_IDLE		(MAXPRI * 2)
54 #define PRIBASE_THREAD		(MAXPRI * 3)
55 #define PRIBASE_NULL		(MAXPRI * 4)
56 
57 #define NQS	32			/* 32 run queues. */
58 #define PPQ	(MAXPRI / NQS)		/* priorities per queue */
59 
60 /*
61  * NICEPPQ	- number of nice units per priority queue
62  * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
63  *
64  * ESTCPUPPQ	- number of estcpu units per priority queue
65  * ESTCPUMAX	- number of estcpu units
66  * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
67  *		  100% cpu.
68  */
69 #define NICEPPQ		2
70 #define ESTCPURAMP	4
71 #define ESTCPUPPQ	512
72 #define ESTCPUMAX	(ESTCPUPPQ * NQS)
73 #define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
74 #define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)
75 
76 #define ESTCPULIM(v)	min((v), ESTCPUMAX)
77 
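/*
 * Illustrative arithmetic (not part of the scheduler): with the values
 * above, PPQ = 128/32 = 4 priority levels per queue and ESTCPUINCR =
 * 512/4 = 128, so a thread running at 100% cpu accumulates one queue's
 * worth of estcpu (ESTCPUPPQ = 512) every ESTCPURAMP = 4 scheduler ticks,
 * and every NICEPPQ = 2 nice levels likewise correspond to roughly one
 * queue.  The hypothetical helper below only shows how the macros combine;
 * it is a simplified sketch, never compiled and not used anywhere in this
 * file.
 */
#if 0
static __inline int
example_queue_index(int estcpu, int nice)
{
	int pri;

	/* one queue per ESTCPUPPQ of estcpu, one queue per NICEPPQ of nice */
	pri = estcpu / ESTCPUPPQ + (nice - PRIO_MIN) / NICEPPQ;
	if (pri > NQS - 1)
		pri = NQS - 1;
	return (pri);
}
#endif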
78 TAILQ_HEAD(rq, lwp);
79 
80 #define lwp_priority	lwp_usdata.bsd4.priority
81 #define lwp_rqindex	lwp_usdata.bsd4.rqindex
82 #define lwp_origcpu	lwp_usdata.bsd4.origcpu
83 #define lwp_estcpu	lwp_usdata.bsd4.estcpu
84 
85 static void bsd4_acquire_curproc(struct lwp *lp);
86 static void bsd4_release_curproc(struct lwp *lp);
87 static void bsd4_select_curproc(globaldata_t gd);
88 static void bsd4_setrunqueue(struct lwp *lp);
89 static void bsd4_remrunqueue(struct lwp *lp);
90 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
91 				sysclock_t cpstamp);
92 static void bsd4_resetpriority(struct lwp *lp);
93 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
94 static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
95 
96 static void bsd4_recalculate_estcpu(struct lwp *lp);
97 
98 struct usched usched_bsd4 = {
99 	{ NULL },
100 	"bsd4", "Original DragonFly Scheduler",
101 	NULL,			/* default registration */
102 	NULL,			/* default deregistration */
103 	bsd4_acquire_curproc,
104 	bsd4_release_curproc,
105 	bsd4_select_curproc,
106 	bsd4_setrunqueue,
107 	bsd4_remrunqueue,
108 	bsd4_schedulerclock,
109 	bsd4_recalculate_estcpu,
110 	bsd4_resetpriority,
111 	bsd4_forking,
112 	bsd4_exiting,
113 	NULL			/* setcpumask not supported */
114 };
115 
116 /*
117  * We have NQS (32) run queues per scheduling class.  For the normal
118  * class, there are 128 priorities scaled onto these 32 queues.  New
119  * processes are added to the last entry in each queue, and processes
120  * are selected for running by taking them from the head and maintaining
121  * a simple FIFO arrangement.  Realtime and Idle priority processes have
122  * an explicit 0-31 priority which maps directly onto their class queue
123  * index.  When a queue has something in it, the corresponding bit is
124  * set in the queuebits variable, allowing a single read to determine
125  * the state of all 32 queues and then a ffs() to find the first busy
126  * queue.
127  */
128 static struct rq queues[NQS];
129 static struct rq rtqueues[NQS];
130 static struct rq idqueues[NQS];
131 static u_int32_t queuebits;
132 static u_int32_t rtqueuebits;
133 static u_int32_t idqueuebits;
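/*
 * Descriptive note: curprocmask starts out with all bits set so that no
 * cpu will take on a user process until it has been explicitly enabled.
 * rqinit() clears bit 0 for the boot cpu and, on SMP, sched_thread_cpu_init()
 * clears the bits of the remaining cpus once their helper threads exist.
 * rdyprocmask tracks which cpus currently have their scheduler helper
 * parked and willing to be woken.
 */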
134 static cpumask_t curprocmask = -1;	/* currently running a user process */
135 static cpumask_t rdyprocmask;		/* ready to accept a user process */
136 static int	 runqcount;
137 #ifdef SMP
138 static int	 scancpu;
139 #endif
140 
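/*
 * A minimal sketch of the queuebits technique described above, using the
 * same bsfl() find-first-set primitive and TAILQ layout as chooseproc()
 * below.  The helper is hypothetical and never compiled; it is here only
 * to illustrate the bitmask lookup.
 */
#if 0
static __inline struct lwp *
example_pick_from_class(struct rq *rqs, u_int32_t *bits)
{
	struct lwp *lp;
	u_int32_t pri;

	if (*bits == 0)
		return (NULL);			/* all 32 queues are empty */
	pri = bsfl(*bits);			/* lowest set bit == best queue */
	lp = TAILQ_FIRST(&rqs[pri]);
	TAILQ_REMOVE(&rqs[pri], lp, lwp_procq);
	if (TAILQ_EMPTY(&rqs[pri]))
		*bits &= ~(1 << pri);		/* queue drained, clear its bit */
	return (lp);
}
#endif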
141 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
142 #ifdef INVARIANTS
143 static int usched_nonoptimal;
144 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
145         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
146 static int usched_optimal;
147 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
148         &usched_optimal, 0, "acquire_curproc() was optimal");
149 #endif
150 static int usched_debug = -1;
151 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
152 #ifdef SMP
153 static int remote_resched = 1;
154 static int remote_resched_nonaffinity;
155 static int remote_resched_affinity;
156 static int choose_affinity;
157 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
158         &remote_resched, 0, "Resched to another cpu");
159 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
160         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
161 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
162         &remote_resched_affinity, 0, "Number of remote rescheds");
163 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
164         &choose_affinity, 0, "chooseproc() was smart");
165 #endif
166 
167 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
168 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
169         &usched_bsd4_rrinterval, 0, "");
170 static int usched_bsd4_decay = ESTCPUINCR / 2;
171 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
172         &usched_bsd4_decay, 0, "");
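/*
 * Note on the defaults above: (ESTCPUFREQ + 9) / 10 is a round-up division,
 * so with bsd4_schedulerclock() running ESTCPUFREQ times per second the
 * round-robin check fires roughly ten times a second, and usched_bsd4_decay
 * defaults to half of ESTCPUINCR (64 estcpu units with the values above).
 */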
173 
174 /*
175  * Initialize the run queues at boot time.
176  */
177 static void
178 rqinit(void *dummy)
179 {
180 	int i;
181 
182 	for (i = 0; i < NQS; i++) {
183 		TAILQ_INIT(&queues[i]);
184 		TAILQ_INIT(&rtqueues[i]);
185 		TAILQ_INIT(&idqueues[i]);
186 	}
187 	atomic_clear_int(&curprocmask, 1);
188 }
189 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
190 
191 /*
192  * chooseproc() is called when a cpu needs a user process to LWKT schedule;
193  * it selects a user process and returns it.  If chklp is non-NULL and chklp
194  * has a better or equal priority than the process that would otherwise be
195  * chosen, NULL is returned.
196  *
197  * Until we fix the RUNQ code the chklp test has to be strict or we may
198  * bounce between processes trying to acquire the current process designation.
199  */
200 static
201 struct lwp *
202 chooseproc(struct lwp *chklp)
203 {
204 	struct lwp *lp;
205 	struct rq *q;
206 	u_int32_t *which;
207 	u_int32_t pri;
208 
209 	if (rtqueuebits) {
210 		pri = bsfl(rtqueuebits);
211 		q = &rtqueues[pri];
212 		which = &rtqueuebits;
213 	} else if (queuebits) {
214 		pri = bsfl(queuebits);
215 		q = &queues[pri];
216 		which = &queuebits;
217 	} else if (idqueuebits) {
218 		pri = bsfl(idqueuebits);
219 		q = &idqueues[pri];
220 		which = &idqueuebits;
221 	} else {
222 		return NULL;
223 	}
224 	lp = TAILQ_FIRST(q);
225 	KASSERT(lp, ("chooseproc: no lwp on busy queue"));
226 
227 	/*
228 	 * If the passed lwp <chklp> is reasonably close to the selected
229 	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
230 	 *
231 	 * Note that we must err on the side of <chklp> to avoid bouncing
232 	 * between threads in the acquire code.
233 	 */
234 	if (chklp) {
235 		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
236 			return(NULL);
237 	}
238 
239 #ifdef SMP
240 	/*
241 	 * If the chosen lwp does not reside on this cpu spend a few
242 	 * cycles looking for a better candidate at the same priority level.
243 	 * This is a fallback check; setrunqueue() tries to wake up the
244 	 * correct cpu and is our front-line affinity.
245 	 */
246 	if (lp->lwp_thread->td_gd != mycpu &&
247 	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
248 	) {
249 		if (chklp->lwp_thread->td_gd == mycpu) {
250 			++choose_affinity;
251 			lp = chklp;
252 		}
253 	}
254 #endif
255 
256 	TAILQ_REMOVE(q, lp, lwp_procq);
257 	--runqcount;
258 	if (TAILQ_EMPTY(q))
259 		*which &= ~(1 << pri);
260 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
261 	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
262 	return lp;
263 }
264 
265 #ifdef SMP
266 /*
267  * called via an ipi message to reschedule on another cpu.
268  */
269 static
270 void
271 need_user_resched_remote(void *dummy)
272 {
273 	need_user_resched();
274 }
275 
276 #endif
277 
278 /*
279  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
280  * user process may represent any user process, including the current
281  * process.
282  *
283  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
284  * cpus in an attempt to keep the process on the current cpu at least for
285  * a little while to take advantage of locality of reference (e.g. fork/exec
286  * or short fork/exit, and uio_yield()).
287  *
288  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
289  * or (user level) preempt on the same cpu that a process was previously
290  * scheduled to.  If we cannot do this but we are at enough of a higher
291  * priority than the processes running on other cpus, we will allow the
292  * process to be stolen by another cpu.
293  *
294  * WARNING! a thread can be acquired by another cpu the moment it is put
295  * on the user scheduler's run queue AND we release the MP lock.  Since we
296  * release the MP lock before switching out another cpu may begin stealing
297  * our current thread before we are completely switched out!  The
298  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
299  * thread before stealing it.
300  *
301  * NOTE on need_user_resched() calls: we have to call need_user_resched()
302  * if the new process is more important than the current process, or if
303  * the new process is the current process and is now less important than
304  * other processes.
305  *
306  * The associated thread must NOT be scheduled.
307  * The process must be runnable.
308  * This must be called at splhigh().
309  */
310 static void
311 bsd4_setrunqueue(struct lwp *lp)
312 {
313 	struct rq *q;
314 	struct globaldata *gd;
315 	int pri;
316 	int cpuid;
317 	u_int32_t needresched;
318 #ifdef SMP
319 	int count;
320 	cpumask_t mask;
321 #endif
322 
323 	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
324 	crit_enter();
325 	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
326 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
327 	    ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
328 	     lp->lwp_tid, lp->lwp_proc->p_flag));
329 	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
330 
331 	/*
332 	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
333 	 */
334 	gd = lp->lwp_thread->td_gd;
335 
336 	/*
337 	 * Because recalculate is only called once or twice for long sleeps,
338 	 * not every second forever while the process is sleeping, we have
339 	 * to manually call it to resynchronize p_cpbase on wakeup or it
340 	 * will wrap if the process was sleeping long enough (e.g. ~10 min
341 	 * with the ACPI timer) and really mess up the nticks calculation.
342 	 */
343 	if (lp->lwp_slptime) {
344 	    bsd4_recalculate_estcpu(lp);
345 	    lp->lwp_slptime = 0;
346 	}
347 	/*
348 	 * We have not been released, make sure that we are not the currently
349 	 * designated process.
350 	 */
351 	KKASSERT(gd->gd_uschedcp != lp);
352 
353 	/*
354 	 * Check cpu affinity.  The associated thread is stable at the
355 	 * moment.  Note that we may be checking another cpu here so we
356 	 * have to be careful.  We are currently protected by the BGL.
357 	 *
358 	 * This allows us to avoid actually queueing the process.
359 	 * acquire_curproc() will handle any threads we mistakenly schedule.
360 	 */
361 	cpuid = gd->gd_cpuid;
362 
363 	if ((curprocmask & (1 << cpuid)) == 0) {
364 		atomic_set_int(&curprocmask, 1 << cpuid);
365 		gd->gd_uschedcp = lp;
366 		gd->gd_upri = lp->lwp_priority;
367 		lwkt_schedule(lp->lwp_thread);
368 		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
369 		crit_exit();
370 #ifdef SMP
371 		if (gd != mycpu)
372 			++remote_resched_affinity;
373 #endif
374 		return;
375 	}
376 
377 	/*
378 	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
379 	 * to place this process on the userland scheduler's run queue for
380 	 * action by the target cpu.
381 	 */
382 	++runqcount;
383 	lp->lwp_proc->p_flag |= P_ONRUNQ;
384 	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
385 		pri = (lp->lwp_priority & PRIMASK) / PPQ;
386 		q = &queues[pri];
387 		queuebits |= 1 << pri;
388 		needresched = (queuebits & ((1 << pri) - 1));
389 	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
390 		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
391 		pri = (u_int8_t)lp->lwp_rtprio.prio;
392 		q = &rtqueues[pri];
393 		rtqueuebits |= 1 << pri;
394 		needresched = (rtqueuebits & ((1 << pri) - 1));
395 	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
396 		pri = (u_int8_t)lp->lwp_rtprio.prio;
397 		q = &idqueues[pri];
398 		idqueuebits |= 1 << pri;
399 		needresched = (idqueuebits & ((1 << pri) - 1));
400 	} else {
401 		needresched = 0;
402 		panic("setrunqueue: invalid rtprio type");
403 	}
404 	KKASSERT(pri < 32);
405 	lp->lwp_rqindex = pri;		/* remember the queue index */
406 	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
407 
408 #ifdef SMP
409 	/*
410 	 * Either wake up another cpu's user thread scheduler or request
411 	 * preemption on other cpus (which will also wake up a HLTed cpu).
412 	 *
413 	 * NOTE!  gd and cpuid may still be our 'hint', not our current
414 	 * cpu info.
415 	 */
416 
417 	count = runqcount;
418 
419 	/*
420 	 * Check cpu affinity for user preemption (when the curprocmask bit
421 	 * is set).  Note that gd_upri is a speculative field (we modify
422 	 * another cpu's gd_upri to avoid sending ipiq storms).
423 	 */
424 	if (gd == mycpu) {
425 		if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
426 			if (lp->lwp_priority < gd->gd_upri - PPQ) {
427 				gd->gd_upri = lp->lwp_priority;
428 				gd->gd_rrcount = 0;
429 				need_user_resched();
430 				--count;
431 			} else if (gd->gd_uschedcp == lp && needresched) {
432 				gd->gd_rrcount = 0;
433 				need_user_resched();
434 				--count;
435 			}
436 		}
437 	} else if (remote_resched) {
438 		if (lp->lwp_priority < gd->gd_upri - PPQ) {
439 			gd->gd_upri = lp->lwp_priority;
440 			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
441 			--count;
442 			++remote_resched_affinity;
443 		}
444 	}
445 
446 	/*
447 	 * No affinity, first schedule to any cpus that do not have a current
448 	 * process.  If there is a free cpu we always schedule to it.
449 	 */
450 	if (count &&
451 	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
452 	    (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
453 		if (!mask)
454 			printf("lwp %d/%d nocpu to schedule it on\n",
455 			       lp->lwp_proc->p_pid, lp->lwp_tid);
456 		while (mask && count) {
457 			cpuid = bsfl(mask);
458 			KKASSERT((curprocmask & (1 << cpuid)) == 0);
459 			atomic_clear_int(&rdyprocmask, 1 << cpuid);
460 			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
461 			--count;
462 			mask &= ~(1 << cpuid);
463 		}
464 	}
465 
466 	/*
467 	 * If there are still runnable processes try to wakeup a random
468 	 * cpu that is running a much lower priority process in order to
469 	 * preempt on it.  Note that gd_upri is only a hint, so we can
470 	 * overwrite it from the wrong cpu.   If we can't find one, we
471 	 * are SOL.
472 	 *
473 	 * We depress the priority check so multiple cpu bound programs
474 	 * do not bounce between cpus.  Remember that the clock interrupt
475 	 * will also cause all cpus to reschedule.
476 	 *
477 	 * We must mask against rdyprocmask or we will race in the boot
478 	 * code (before all cpus have working scheduler helpers), plus
479 	 * some cpus might not be operational and/or not configured to
480 	 * handle user processes.
481 	 */
482 	if (count && remote_resched && ncpus > 1) {
483 		cpuid = scancpu;
484 		do {
485 			if (++cpuid == ncpus)
486 				cpuid = 0;
487 		} while (cpuid == mycpu->gd_cpuid);
488 		scancpu = cpuid;
489 
490 		if (rdyprocmask & (1 << cpuid)) {
491 			gd = globaldata_find(cpuid);
492 
493 			if (lp->lwp_priority < gd->gd_upri - PPQ) {
494 				gd->gd_upri = lp->lwp_priority;
495 				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
496 				++remote_resched_nonaffinity;
497 			}
498 		}
499 	}
500 #else
501 	if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
502 		if (lp->lwp_priority < gd->gd_upri - PPQ) {
503 			gd->gd_upri = lp->lwp_priority;
504 			gd->gd_rrcount = 0;
505 			need_user_resched();
506 		} else if (gd->gd_uschedcp == lp && needresched) {
507 			gd->gd_rrcount = 0;
508 			need_user_resched();
509 		}
510 	}
511 #endif
512 	crit_exit();
513 }
514 
515 /*
516  * remrunqueue() removes a given process from the run queue that it is on,
517  * clearing the queue busy bit if it becomes empty.  This function is called
518  * when a userland process is selected for LWKT scheduling.  Note that
519  * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
520  * several userland processes whose threads are scheduled or otherwise in
521  * a special state, and such processes are NOT on the userland scheduler's
522  * run queue.
523  *
524  * This must be called at splhigh().
525  */
526 static void
527 bsd4_remrunqueue(struct lwp *lp)
528 {
529 	struct rq *q;
530 	u_int32_t *which;
531 	u_int8_t pri;
532 
533 	ASSERT_MP_LOCK_HELD(lp->lwp_thread);
534 	crit_enter();
535 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
536 	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
537 	--runqcount;
538 	KKASSERT(runqcount >= 0);
539 	pri = lp->lwp_rqindex;
540 	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
541 		q = &queues[pri];
542 		which = &queuebits;
543 	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
544 		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
545 		q = &rtqueues[pri];
546 		which = &rtqueuebits;
547 	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
548 		q = &idqueues[pri];
549 		which = &idqueuebits;
550 	} else {
551 		panic("remrunqueue: invalid rtprio type");
552 	}
553 	TAILQ_REMOVE(q, lp, lwp_procq);
554 	if (TAILQ_EMPTY(q)) {
555 		KASSERT((*which & (1 << pri)) != 0,
556 			("remrunqueue: remove from empty queue"));
557 		*which &= ~(1 << pri);
558 	}
559 	crit_exit();
560 }
561 
562 /*
563  * This routine is called from a systimer IPI.  It MUST be MP-safe and
564  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
565  *
566  * MPSAFE
567  */
568 static
569 void
570 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
571 {
572 	globaldata_t gd = mycpu;
573 
574 	/*
575 	 * Do we need to round-robin?  We round-robin 10 times a second.
576 	 * This should only occur for cpu-bound batch processes.
577 	 */
578 	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
579 		gd->gd_rrcount = 0;
580 		need_user_resched();
581 	}
582 
583 	/*
584 	 * As the process accumulates cpu time p_estcpu is bumped and may
585 	 * push the process into another scheduling queue.  It typically
586 	 * takes 4 ticks to bump the queue.
587 	 */
588 	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
589 
590 	/*
591 	 * Reducing p_origcpu over time causes more of our estcpu to be
592 	 * returned to the parent when we exit.  This is a small tweak
593 	 * for the batch detection heuristic.
594 	 */
595 	if (lp->lwp_origcpu)
596 		--lp->lwp_origcpu;
597 
598 	/* XXX optimize, avoid lock if no reset is required */
599 	if (try_mplock()) {
600 		bsd4_resetpriority(lp);
601 		rel_mplock();
602 	}
603 }
604 
605 /*
606  * Release the current process designation on lp.  lp MUST BE the current lwp.
607  * Attempt to assign a new current process from the run queue.
608  *
609  * This function is called from exit1(), tsleep(), and the passive
610  * release code setup in <arch>/<arch>/trap.c
611  *
612  * If we do not have or cannot get the MP lock we just wakeup the userland
613  * helper scheduler thread for this cpu to do the work for us.
614  *
615  * WARNING!  The MP lock may be in an unsynchronized state due to the
616  * way get_mplock() works and the fact that this function may be called
617  * from a passive release during a lwkt_switch().   try_mplock() will deal
618  * with this for us but you should be aware that td_mpcount may not be
619  * usable.
620  */
621 static void
622 bsd4_release_curproc(struct lwp *lp)
623 {
624 	int cpuid;
625 	globaldata_t gd = mycpu;
626 
627 	KKASSERT(lp->lwp_thread->td_gd == gd);
628 	crit_enter();
629 	cpuid = gd->gd_cpuid;
630 
631 	if (gd->gd_uschedcp == lp) {
632 		if (try_mplock()) {
633 			/*
634 			 * If we can obtain the MP lock we can directly
635 			 * select the next current process.
636 			 *
637 			 * bsd4_select_curproc() will adjust curprocmask
638 			 * for us.
639 			 */
640 			gd->gd_uschedcp = NULL;
641 			gd->gd_upri = PRIBASE_NULL;
642 			bsd4_select_curproc(gd);
643 			rel_mplock();
644 		} else {
645 			/*
646 			 * If we cannot obtain the MP lock schedule our
647 			 * helper thread to select the next current
648 			 * process.
649 			 *
650 			 * This is the only place where we adjust curprocmask
651 			 * and rdyprocmask without holding the MP lock.
652 			 */
653 			gd->gd_uschedcp = NULL;
654 			gd->gd_upri = PRIBASE_NULL;
655 			atomic_clear_int(&curprocmask, 1 << cpuid);
656 			if (runqcount && (rdyprocmask & (1 << cpuid))) {
657 				atomic_clear_int(&rdyprocmask, 1 << cpuid);
658 				lwkt_schedule(&mycpu->gd_schedthread);
659 			}
660 		}
661 	}
662 	crit_exit();
663 }
664 
665 /*
666  * Select a new current process, potentially retaining gd_uschedcp.  However,
667  * be sure to round-robin.  This routine is generally only called if a
668  * reschedule is requested and that typically only occurs if a new process
669  * has a better priority or when we are round-robining.
670  *
671  * NOTE: Must be called with the MP lock (giant) held and with the current cpu's gd.
672  * NOTE: The caller must handle the situation where it loses a
673  *	uschedcp designation that it previously held, typically by
674  *	calling acquire_curproc() again.
675  * NOTE: May not block
676  */
677 static
678 void
679 bsd4_select_curproc(globaldata_t gd)
680 {
681 	struct lwp *nlp;
682 	int cpuid = gd->gd_cpuid;
683 	void *old;
684 
685 	clear_user_resched();
686 	get_mplock();
687 
688 	/*
689 	 * Choose the next designated current user process.
690 	 * Note that we cannot schedule gd_schedthread
691 	 * if runqcount is 0 without creating a scheduling
692 	 * loop.
693 	 *
694 	 * The user resched request was already cleared above; we re-test
695 	 * the situation later when we re-acquire.
696 	 *
697 	 * NOTE: chooseproc returns NULL if the chosen lwp
698 	 * is gd_uschedcp. XXX needs cleanup.
699 	 */
700 	old = gd->gd_uschedcp;
701 	if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
702 		atomic_set_int(&curprocmask, 1 << cpuid);
703 		gd->gd_upri = nlp->lwp_priority;
704 		gd->gd_uschedcp = nlp;
705 		lwkt_acquire(nlp->lwp_thread);
706 		lwkt_schedule(nlp->lwp_thread);
707 	} else if (gd->gd_uschedcp) {
708 		gd->gd_upri = gd->gd_uschedcp->lwp_priority;
709 		KKASSERT(curprocmask & (1 << cpuid));
710 	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
711 		/*gd->gd_uschedcp = NULL;*/
712 		atomic_clear_int(&curprocmask, 1 << cpuid);
713 		atomic_clear_int(&rdyprocmask, 1 << cpuid);
714 		lwkt_schedule(&gd->gd_schedthread);
715 	} else {
716 		/*gd->gd_uschedcp = NULL;*/
717 		atomic_clear_int(&curprocmask, 1 << cpuid);
718 	}
719 	rel_mplock();
720 }
721 
722 /*
723  * Acquire the current process designation on the CURRENT process only.
724  * This function is called at kernel-user priority (not userland priority)
725  * when curlwp does not match gd_uschedcp.
726  *
727  * This function is only called just prior to returning to user mode.
728  *
729  * Basically we recalculate our estcpu to hopefully give us a more
730  * favorable disposition, setrunqueue, then wait for the curlwp
731  * designation to be handed to us (if the setrunqueue didn't do it).
732  *
733  * WARNING! THIS FUNCTION MAY CAUSE THE CURRENT THREAD TO MIGRATE TO
734  * ANOTHER CPU!  Because most of the kernel assumes that no migration will
735  * occur, this function is called only under very controlled circumstances.
736  */
737 static void
738 bsd4_acquire_curproc(struct lwp *lp)
739 {
740 	globaldata_t gd = mycpu;
741 
742 	get_mplock();
743 	crit_enter();
744 
745 	/*
746 	 * Recalculate our priority and put us back on the userland
747 	 * scheduler's runq.
748 	 *
749 	 * Only increment the involuntary context switch count if the
750 	 * setrunqueue call did not immediately schedule us.
751 	 */
752 	KKASSERT(lp == gd->gd_curthread->td_lwp);
753 	bsd4_recalculate_estcpu(lp);
754 	lwkt_deschedule_self(gd->gd_curthread);
755 	bsd4_setrunqueue(lp);
756 	if ((gd->gd_curthread->td_flags & TDF_RUNQ) == 0)
757 		++lp->lwp_stats->p_ru.ru_nivcsw;
758 	lwkt_switch();
759 
760 	/*
761 	 * Because we put ourselves back on the userland scheduler's run
762 	 * queue, WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU
763 	 */
764 	gd = mycpu;
765 
766 	/*
767 	 * We better be the current process when we wake up, and we had
768 	 * better not be on the run queue.
769 	 */
770 	KKASSERT(gd->gd_uschedcp == lp);
771 	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
772 
773 	crit_exit();
774 	rel_mplock();
775 }
776 
777 /*
778  * Compute the priority of a process when running in user mode.
779  * Arrange to reschedule if the resulting priority is better
780  * than that of the current process.
781  */
782 static void
783 bsd4_resetpriority(struct lwp *lp)
784 {
785 	int newpriority;
786 	int opq;
787 	int npq;
788 
789 	ASSERT_MP_LOCK_HELD(curthread);
790 
791 	/*
792 	 * Set p_priority for general process comparisons
793 	 */
794 	switch(lp->lwp_rtprio.type) {
795 	case RTP_PRIO_REALTIME:
796 		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
797 		return;
798 	case RTP_PRIO_NORMAL:
799 		break;
800 	case RTP_PRIO_IDLE:
801 		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
802 		return;
803 	case RTP_PRIO_THREAD:
804 		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
805 		return;
806 	}
807 
808 	/*
809 	 * NORMAL priorities fall through.  These are based on niceness
810 	 * and cpu use.  Lower numbers == higher priorities.
811 	 *
812 	 * Calculate our priority based on our niceness and estimated cpu.
813 	 * Note that the nice value adjusts the baseline, which affects
814 	 * cpu bursts but does not affect overall cpu use between cpu-bound
815 	 * processes.  The use of the nice field in the decay calculation
816 	 * controls the overall cpu use.
817 	 *
818 	 * This isn't an exact calculation.  We fit the full nice and
819 	 * estcpu range into the priority range so the actual PPQ value
820 	 * is incorrect, but it's still a reasonable way to think about it.
821 	 */
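	/*
	 * Worked example (assuming the usual PRIO_MIN == -20, PRIO_MAX == 20,
	 * so PRIO_RANGE == 41): the scale divisor below is 41*4/2 + 16384*4/512
	 * = 82 + 128 = 210.  A nice 0 lwp with estcpu 0 gets (40 * 128) / 210
	 * = 24, i.e. queue 6, while the same lwp at estcpu ESTCPUMAX gets
	 * ((40 + 128) * 128) / 210 = 102, i.e. queue 25.
	 */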
822 	newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
823 	newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
824 	newpriority = newpriority * MAXPRI /
825 		    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
826 	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
827 	newpriority = MAX(newpriority, 0);		/* sanity */
828 	npq = newpriority / PPQ;
829 	crit_enter();
830 	opq = (lp->lwp_priority & PRIMASK) / PPQ;
831 	if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
832 		/*
833 		 * We have to move the process to another queue
834 		 */
835 		bsd4_remrunqueue(lp);
836 		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
837 		bsd4_setrunqueue(lp);
838 	} else {
839 		/*
840 		 * We can just adjust the priority and it will be picked
841 		 * up later.
842 		 */
843 		KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
844 		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
845 	}
846 	crit_exit();
847 }
848 
849 /*
850  * Called from fork1() when a new child process is being created.
851  *
852  * Give the child process an initial estcpu that is more batch-like than
853  * its parent's, and dock the parent for the fork (but do not
854  * reschedule the parent).   This comprises the main part of our batch
855  * detection heuristic for both parallel forking and sequential execs.
856  *
857  * Interactive processes will decay the boosted estcpu quickly while batch
858  * processes will tend to compound it.
859  * XXX lwp should be "spawning" instead of "forking"
860  *
861  * MPSAFE
862  */
863 static void
864 bsd4_forking(struct lwp *plp, struct lwp *lp)
865 {
866 	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
867 	lp->lwp_origcpu = lp->lwp_estcpu;
868 	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
869 }
870 
871 /*
872  * Called when the parent reaps a child.   Propagate cpu use by the child
873  * back to the parent.
874  *
875  * MPSAFE
876  */
877 static void
878 bsd4_exiting(struct lwp *plp, struct lwp *lp)
879 {
880 	int delta;
881 
882 	if (plp->lwp_proc->p_pid != 1) {
883 		delta = lp->lwp_estcpu - lp->lwp_origcpu;
884 		if (delta > 0)
885 			plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
886 	}
887 }
888 
889 /*
890  * Called from acquire and from kern_synch's one-second timer with a
891  * critical section held.
892  *
893  * Decay p_estcpu based on the number of ticks we haven't been running
894  * and our p_nice.  As the load increases each process observes a larger
895  * number of idle ticks (because other processes are running in them).
896  * This observation leads to a larger correction which tends to make the
897  * system more 'batchy'.
898  *
899  * Note that no recalculation occurs for a process which sleeps and wakes
900  * up in the same tick.  That is, a system doing thousands of context
901  * switches per second will still only do serious estcpu calculations
902  * ESTCPUFREQ times per second.
903  */
904 static
905 void
906 bsd4_recalculate_estcpu(struct lwp *lp)
907 {
908 	globaldata_t gd = mycpu;
909 	sysclock_t cpbase;
910 	int loadfac;
911 	int ndecay;
912 	int nticks;
913 	int nleft;
914 
915 	ASSERT_MP_LOCK_HELD(curthread);
916 
917 	/*
918 	 * We have to subtract periodic to get the last schedclock
919 	 * timeout time, otherwise we would get the upcoming timeout.
920 	 * Keep in mind that a process can migrate between cpus and
921 	 * while the scheduler clock should be very close, boundary
922 	 * conditions could lead to a small negative delta.
923 	 */
924 	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
925 
926 	if (lp->lwp_slptime > 1) {
927 		/*
928 		 * Too much time has passed, do a coarse correction.
929 		 */
930 		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
931 		bsd4_resetpriority(lp);
932 		lp->lwp_cpbase = cpbase;
933 		lp->lwp_cpticks = 0;
934 	} else if (lp->lwp_cpbase != cpbase) {
935 		/*
936 		 * Adjust estcpu if we are in a different tick.  Don't waste
937 		 * time if we are in the same tick.
938 		 *
939 		 * First calculate the number of ticks in the measurement
940 		 * interval.
941 		 */
942 		nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
943 		updatepcpu(lp, lp->lwp_cpticks, nticks);
944 
945 		if ((nleft = nticks - lp->lwp_cpticks) < 0)
946 			nleft = 0;
947 		if (usched_debug == lp->lwp_proc->p_pid) {
948 			printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
949 				lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
950 				lp->lwp_cpticks, nticks, nleft);
951 		}
952 
953 		/*
954 		 * Calculate a decay value based on ticks remaining scaled
955 		 * down by the instantaneous load and p_nice.
956 		 */
957 		if ((loadfac = runqcount) < 2)
958 			loadfac = 2;
959 		ndecay = nleft * usched_bsd4_decay * 2 *
960 			(PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
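		/*
		 * Worked example (assuming PRIO_MAX == 20 and the default
		 * usched_bsd4_decay of ESTCPUINCR / 2 == 64): at p_nice 0 on
		 * an otherwise idle system (loadfac == 2) the expression above
		 * reduces to ndecay = nleft * 64, i.e. each idle tick undoes
		 * half of the ESTCPUINCR (128) gained per running tick.
		 * Doubling the load halves the decay, and a positive p_nice
		 * shrinks it further.
		 */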
961 
962 		/*
963 		 * Adjust p_estcpu.  Handle a border case where batch jobs
964 		 * can get stalled long enough to decay to zero when they
965 		 * shouldn't.
966 		 */
967 		if (lp->lwp_estcpu > ndecay * 2)
968 			lp->lwp_estcpu -= ndecay;
969 		else
970 			lp->lwp_estcpu >>= 1;
971 
972 		if (usched_debug == lp->lwp_proc->p_pid)
973 			printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
974 
975 		bsd4_resetpriority(lp);
976 		lp->lwp_cpbase = cpbase;
977 		lp->lwp_cpticks = 0;
978 	}
979 }
980 
981 #ifdef SMP
982 
983 /*
984  * For SMP systems a user scheduler helper thread is created for each
985  * cpu and is used to allow one cpu to wake up another for the purposes of
986  * scheduling userland threads from setrunqueue().  UP systems do not
987  * need the helper since there is only one cpu.  We can't use the idle
988  * thread for this because we need to hold the MP lock.  Additionally,
989  * doing things this way allows us to HLT idle cpus on MP systems.
990  */
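/*
 * Summary of the wakeup handshake implemented below and in setrunqueue()/
 * bsd4_release_curproc() above: a cpu that wants help clears the target's
 * rdyprocmask bit and lwkt_schedule()s that cpu's gd_schedthread.  Each pass
 * through its loop the helper deschedules itself first (the interlock
 * against a missed wakeup), re-advertises itself in rdyprocmask, grabs a
 * process with chooseproc() if its curprocmask bit is clear, and then
 * lwkt_switch()es away until the next wakeup.
 */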
991 static void
992 sched_thread(void *dummy)
993 {
994     globaldata_t gd = mycpu;
995     int cpuid = gd->gd_cpuid;		/* doesn't change */
996     u_int32_t cpumask = 1 << cpuid;	/* doesn't change */
997 
998     ASSERT_MP_LOCK_HELD(curthread);
999     for (;;) {
1000 	struct lwp *nlp;
1001 
1002 	lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
1003 	atomic_set_int(&rdyprocmask, cpumask);
1004 	crit_enter_quick(gd->gd_curthread);
1005 	if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
1006 	    atomic_set_int(&curprocmask, cpumask);
1007 	    gd->gd_upri = nlp->lwp_priority;
1008 	    gd->gd_uschedcp = nlp;
1009 	    lwkt_acquire(nlp->lwp_thread);
1010 	    lwkt_schedule(nlp->lwp_thread);
1011 	}
1012 	crit_exit_quick(gd->gd_curthread);
1013 	lwkt_switch();
1014     }
1015 }
1016 
1017 /*
1018  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
1019  * been cleared by rqinit() and we should not mess with it further.
1020  */
1021 static void
1022 sched_thread_cpu_init(void)
1023 {
1024     int i;
1025 
1026     if (bootverbose)
1027 	printf("start scheduler helpers on cpus:");
1028 
1029     for (i = 0; i < ncpus; ++i) {
1030 	globaldata_t dgd = globaldata_find(i);
1031 	cpumask_t mask = 1 << i;
1032 
1033 	if ((mask & smp_active_mask) == 0)
1034 	    continue;
1035 
1036 	if (bootverbose)
1037 	    printf(" %d", i);
1038 
1039 	lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
1040 		    TDF_STOPREQ, i, "usched %d", i);
1041 
1042 	/*
1043 	 * Allow user scheduling on the target cpu.  cpu #0 has already
1044 	 * been enabled in rqinit().
1045 	 */
1046 	if (i)
1047 	    atomic_clear_int(&curprocmask, mask);
1048 	atomic_set_int(&rdyprocmask, mask);
1049     }
1050     if (bootverbose)
1051 	printf("\n");
1052 }
1053 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
1054 
1055 #endif
1056 
1057