xref: /netbsd/sys/kern/kern_runq.c (revision 35dcdcad)
1 /*	$NetBSD: kern_runq.c,v 1.69 2020/05/23 21:24:41 ad Exp $	*/
2 
3 /*-
4  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
46  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
49  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55  * SUCH DAMAGE.
56  */
57 
58 #include <sys/cdefs.h>
59 __KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.69 2020/05/23 21:24:41 ad Exp $");
60 
61 #include "opt_dtrace.h"
62 
63 #include <sys/param.h>
64 #include <sys/kernel.h>
65 #include <sys/bitops.h>
66 #include <sys/cpu.h>
67 #include <sys/idle.h>
68 #include <sys/intr.h>
69 #include <sys/kmem.h>
70 #include <sys/lwp.h>
71 #include <sys/mutex.h>
72 #include <sys/proc.h>
73 #include <sys/pset.h>
74 #include <sys/sched.h>
75 #include <sys/syscallargs.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/types.h>
79 #include <sys/evcnt.h>
80 #include <sys/atomic.h>
81 
82 /*
83  * Bits per map.
84  */
85 #define	BITMAP_BITS	(32)
86 #define	BITMAP_SHIFT	(5)
87 #define	BITMAP_MSB	(0x80000000U)
88 #define	BITMAP_MASK	(BITMAP_BITS - 1)
89 
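/*
 * Editorial sketch (not part of the original source): how a priority value
 * maps into the run queue bitmap.  Priority "prio" lives in word
 * (prio >> BITMAP_SHIFT); within a word, higher priorities sit toward the
 * least significant bit, so ffs() on a non-zero word recovers the highest
 * queued priority, exactly as sched_dequeue() does below.  The helper
 * names here are hypothetical.
 */
static inline u_int
bitmap_word(pri_t prio)
{
	/* Which 32-bit word of spc_bitmap[] holds this priority. */
	return prio >> BITMAP_SHIFT;
}

static inline uint32_t
bitmap_bit(pri_t prio)
{
	/* MSB-first layout: prio 0 -> bit 31, prio 31 -> bit 0. */
	return BITMAP_MSB >> (prio & BITMAP_MASK);
}

static inline pri_t
bitmap_maxpri(u_int i, uint32_t word)
{
	/* ffs() is 1-based from the LSB, hence BITMAP_BITS - ffs(). */
	return (i << BITMAP_SHIFT) + (BITMAP_BITS - ffs(word));
}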
90 const int	schedppq = 1;
91 
92 static void	*sched_getrq(struct schedstate_percpu *, const pri_t);
93 #ifdef MULTIPROCESSOR
94 static lwp_t *	sched_catchlwp(struct cpu_info *);
95 #endif
96 
97 /*
98  * Preemption control.
99  */
100 #ifdef __HAVE_PREEMPTION
101 # ifdef DEBUG
102 int		sched_kpreempt_pri = 0;
103 # else
104 int		sched_kpreempt_pri = PRI_USER_RT;
105 # endif
106 #else
107 int		sched_kpreempt_pri = 1000;
108 #endif
109 
110 /*
111  * Migration and balancing.
112  */
113 static u_int	cacheht_time;	/* Cache hotness time */
114 static u_int	min_catch;	/* Minimal LWP count for catching */
115 static u_int	skim_interval;	/* Rate limit for stealing LWPs */
116 
117 #ifdef KDTRACE_HOOKS
118 struct lwp *curthread;
119 #endif
120 
121 void
122 runq_init(void)
123 {
124 
125 	/* When pulling from remote packages, the LWP must not have run for 10ms. */
126 	cacheht_time = 10;
127 
128 	/* Minimal count of LWPs for catching */
129 	min_catch = 1;
130 
131 	/* Steal from other CPUs at most every 10ms. */
132 	skim_interval = 10;
133 }
134 
135 void
136 sched_cpuattach(struct cpu_info *ci)
137 {
138 	struct schedstate_percpu *spc;
139 	size_t size;
140 	void *p;
141 	u_int i;
142 
143 	spc = &ci->ci_schedstate;
144 	spc->spc_nextpkg = ci;
145 
146 	if (spc->spc_lwplock == NULL) {
147 		spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
148 	}
149 	if (ci == lwp0.l_cpu) {
150 		/* Initialize the scheduler structure of the primary LWP */
151 		lwp0.l_mutex = spc->spc_lwplock;
152 	}
153 	if (spc->spc_mutex != NULL) {
154 		/* Already initialized. */
155 		return;
156 	}
157 
158 	/* Allocate the run queue */
159 	size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
160 	    coherency_unit;
161 	p = kmem_alloc(size, KM_SLEEP);
162 	spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
163 
164 	/* Initialize run queues */
165 	spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
166 	for (i = 0; i < PRI_COUNT; i++)
167 		TAILQ_INIT(&spc->spc_queue[i]);
168 }
169 
170 /*
171  * Control of the runqueue.
172  */
173 static inline void *
174 sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
175 {
176 
177 	KASSERT(prio < PRI_COUNT);
178 	return &spc->spc_queue[prio];
179 }
180 
181 /*
182  * Put an LWP onto a run queue.  The LWP must be locked by spc_mutex for
183  * l_cpu.
184  */
185 void
186 sched_enqueue(struct lwp *l)
187 {
188 	struct schedstate_percpu *spc;
189 	TAILQ_HEAD(, lwp) *q_head;
190 	const pri_t eprio = lwp_eprio(l);
191 	struct cpu_info *ci;
192 
193 	ci = l->l_cpu;
194 	spc = &ci->ci_schedstate;
195 	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
196 
197 	/* Enqueue the thread */
198 	q_head = sched_getrq(spc, eprio);
199 	if (TAILQ_EMPTY(q_head)) {
200 		u_int i;
201 		uint32_t q;
202 
203 		/* Mark bit */
204 		i = eprio >> BITMAP_SHIFT;
205 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
206 		KASSERT((spc->spc_bitmap[i] & q) == 0);
207 		spc->spc_bitmap[i] |= q;
208 	}
209 
210 	/*
211 	 * Determine run queue position according to POSIX.  XXX Explicitly
212 	 * lowering a thread's priority with pthread_setschedparam() is not
213 	 * handled.
214 	 */
215 	if ((l->l_pflag & LP_PREEMPTING) != 0) {
216 		switch (l->l_class) {
217 		case SCHED_OTHER:
218 			TAILQ_INSERT_TAIL(q_head, l, l_runq);
219 			break;
220 		case SCHED_FIFO:
221 			TAILQ_INSERT_HEAD(q_head, l, l_runq);
222 			break;
223 		case SCHED_RR:
224 			if (getticks() - l->l_rticks >= sched_rrticks) {
225 				TAILQ_INSERT_TAIL(q_head, l, l_runq);
226 			} else {
227 				TAILQ_INSERT_HEAD(q_head, l, l_runq);
228 			}
229 			break;
230 		default:
231 			panic("sched_enqueue: LWP %p has class %d\n",
232 			    l, l->l_class);
233 		}
234 	} else {
235 		TAILQ_INSERT_TAIL(q_head, l, l_runq);
236 	}
237 	spc->spc_flags &= ~SPCF_IDLE;
238 	spc->spc_count++;
239 	if ((l->l_pflag & LP_BOUND) == 0) {
240 		atomic_store_relaxed(&spc->spc_mcount,
241 		    atomic_load_relaxed(&spc->spc_mcount) + 1);
242 	}
243 
244 	/*
245 	 * Update the value of the highest priority in the runqueue,
246 	 * if this thread's priority is higher.
247 	 */
248 	if (eprio > spc->spc_maxpriority)
249 		spc->spc_maxpriority = eprio;
250 
251 	sched_newts(l);
252 }
253 
254 /*
255  * Remove an LWP from the run queue it's on.  The LWP must be in state
256  * LSRUN.
257  */
258 void
259 sched_dequeue(struct lwp *l)
260 {
261 	TAILQ_HEAD(, lwp) *q_head;
262 	struct schedstate_percpu *spc;
263 	const pri_t eprio = lwp_eprio(l);
264 
265 	spc = &l->l_cpu->ci_schedstate;
266 
267 	KASSERT(lwp_locked(l, spc->spc_mutex));
268 	KASSERT(eprio <= spc->spc_maxpriority);
269 	KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
270 	KASSERT(spc->spc_count > 0);
271 
272 	if (spc->spc_migrating == l)
273 		spc->spc_migrating = NULL;
274 
275 	spc->spc_count--;
276 	if ((l->l_pflag & LP_BOUND) == 0) {
277 		atomic_store_relaxed(&spc->spc_mcount,
278 		    atomic_load_relaxed(&spc->spc_mcount) - 1);
279 	}
280 
281 	q_head = sched_getrq(spc, eprio);
282 	TAILQ_REMOVE(q_head, l, l_runq);
283 	if (TAILQ_EMPTY(q_head)) {
284 		u_int i;
285 		uint32_t q;
286 
287 		/* Unmark bit */
288 		i = eprio >> BITMAP_SHIFT;
289 		q = BITMAP_MSB >> (eprio & BITMAP_MASK);
290 		KASSERT((spc->spc_bitmap[i] & q) != 0);
291 		spc->spc_bitmap[i] &= ~q;
292 
293 		/*
294 		 * Update the value of the highest priority in the runqueue, in
295 		 * case this was the last thread at that (highest) priority.
296 		 */
297 		if (eprio != spc->spc_maxpriority)
298 			return;
299 
300 		do {
301 			if (spc->spc_bitmap[i] != 0) {
302 				q = ffs(spc->spc_bitmap[i]);
303 				spc->spc_maxpriority =
304 				    (i << BITMAP_SHIFT) + (BITMAP_BITS - q);
305 				return;
306 			}
307 		} while (i--);
308 
309 		/* If not found - set the lowest value */
310 		spc->spc_maxpriority = 0;
311 	}
312 }
313 
314 /*
315  * Cause a preemption on the given CPU, if the priority "pri" is higher
316  * than that of the running LWP.  If "unlock" is true (and for concurrency
317  * reasons it ideally should be), spc_mutex is dropped before return.
318  */
319 void
320 sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
321 {
322 	struct schedstate_percpu *spc;
323 	u_int o, n, f;
324 	lwp_t *l;
325 
326 	spc = &ci->ci_schedstate;
327 
328 	KASSERT(mutex_owned(spc->spc_mutex));
329 
330 	/*
331 	 * If the priority level we're evaluating wouldn't cause a new LWP
332 	 * to be run on the CPU, then we have nothing to do.
333 	 */
334 	if (pri <= spc->spc_curpriority || !mp_online) {
335 		if (__predict_true(unlock)) {
336 			spc_unlock(ci);
337 		}
338 		return;
339 	}
340 
341 	/*
342 	 * Figure out what kind of preemption we should do.
343 	 */
344 	l = ci->ci_onproc;
345 	if ((l->l_flag & LW_IDLE) != 0) {
346 		f = RESCHED_IDLE | RESCHED_UPREEMPT;
347 	} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
348 		/* We can't currently preempt softints - should be able to. */
349 #ifdef __HAVE_PREEMPTION
350 		f = RESCHED_KPREEMPT;
351 #else
352 		/* Leave door open for test: set kpreempt_pri with sysctl. */
353 		f = RESCHED_UPREEMPT;
354 #endif
355 		/*
356 		 * l_dopreempt must be set with the CPU locked to sync with
357 		 * mi_switch().  It must also be set with an atomic to sync
358 		 * with kpreempt().
359 		 */
360 		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
361 	} else {
362 		f = RESCHED_UPREEMPT;
363 	}
364 	if (ci != curcpu()) {
365 		f |= RESCHED_REMOTE;
366 	}
367 
368 	/*
369 	 * Things can start as soon as ci_want_resched is touched: x86 has
370 	 * an instruction that monitors the memory cell it's in.  Drop the
371 	 * schedstate lock in advance, otherwise the remote CPU can awaken
372 	 * and immediately block on the lock.
373 	 */
374 	if (__predict_true(unlock)) {
375 		spc_unlock(ci);
376 	}
377 
378 	/*
379 	 * The caller almost always has a second scheduler lock held: either
380 	 * the running LWP lock (spc_lwplock), or a sleep queue lock.  That
381 	 * keeps preemption disabled, which among other things ensures all
382 	 * LWPs involved won't be freed while we're here (see lwp_dtor()).
383 	 */
384  	KASSERT(kpreempt_disabled());
385 
386 	for (o = 0;; o = n) {
387 		n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
388 		if (__predict_true(o == n)) {
389 			/*
390 			 * We're the first to set a resched on the CPU.  Try
391 			 * to avoid causing a needless trip through trap()
392 			 * to handle an AST fault, if it's known the LWP
393 			 * will either block or go through userret() soon.
394 			 */
395 			if (l != curlwp || cpu_intr_p()) {
396 				cpu_need_resched(ci, l, f);
397 			}
398 			break;
399 		}
400 		if (__predict_true(
401 		    (n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
402 		    (f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
403 			/* Already in progress, nothing to do. */
404 			break;
405 		}
406 	}
407 }
408 
409 /*
410  * Cause a preemption on the given CPU, if the priority of LWP "l", in state
411  * LSRUN, is higher than that of the running LWP.  If "unlock" is true
412  * (and for concurrency reasons it ideally should be), spc_mutex is
413  * dropped before return.
414  */
415 void
416 sched_resched_lwp(struct lwp *l, bool unlock)
417 {
418 	struct cpu_info *ci = l->l_cpu;
419 
420 	KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
421 	KASSERT(l->l_stat == LSRUN);
422 
423 	sched_resched_cpu(ci, lwp_eprio(l), unlock);
424 }
425 
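/*
 * Editorial sketch of a typical caller pattern (cf. sched_idle_migrate()
 * below): enqueue with the target CPU's run queue locked, then let
 * sched_resched_lwp() drop spc_mutex via its "unlock" argument, so a woken
 * remote CPU does not immediately block on that lock.  Assumes "l" is in
 * state LSRUN and its lwp lock has already been switched to ci's spc_mutex;
 * the helper name is hypothetical.
 */
static void
example_make_runnable(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	spc_lock(ci);			/* take ci's run queue lock */
	/* Assumed: l's lwp lock is already ci's spc_mutex at this point. */
	sched_enqueue(l);		/* put l on ci's run queue */
	sched_resched_lwp(l, true);	/* preempt if needed; drops spc_mutex */
}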
426 /*
427  * Migration and balancing.
428  */
429 
430 #ifdef MULTIPROCESSOR
431 
432 /*
433  * Estimate if LWP is cache-hot.
434  */
435 static inline bool
436 lwp_cache_hot(const struct lwp *l)
437 {
438 
439 	/* Leave new LWPs in peace, determination has already been made. */
440 	if (l->l_stat == LSIDL)
441 		return true;
442 
443 	if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
444 		return false;
445 
446 	return (getticks() - l->l_rticks < mstohz(cacheht_time));
447 }
448 
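/*
 * Editorial note: with the default cacheht_time of 10ms and the traditional
 * hz = 100, the mstohz() conversion above works out to one tick, i.e. an
 * LWP is treated as cache-hot only if it ran within roughly the last
 * clock tick.
 */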
449 /*
450  * Check if LWP can migrate to the chosen CPU.
451  */
452 static inline bool
453 sched_migratable(const struct lwp *l, struct cpu_info *ci)
454 {
455 	const struct schedstate_percpu *spc = &ci->ci_schedstate;
456 	KASSERT(lwp_locked(__UNCONST(l), NULL));
457 
458 	/* Is CPU offline? */
459 	if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
460 		return false;
461 
462 	/* Is affinity set? */
463 	if (__predict_false(l->l_affinity))
464 		return kcpuset_isset(l->l_affinity, cpu_index(ci));
465 
466 	/* Is there a processor-set? */
467 	return (spc->spc_psid == l->l_psid);
468 }
469 
470 /*
471  * A small helper to do round robin through CPU packages.
472  */
473 static struct cpu_info *
474 sched_nextpkg(void)
475 {
476 	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
477 
478 	spc->spc_nextpkg =
479 	    spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
480 
481 	return spc->spc_nextpkg;
482 }
483 
484 /*
485  * Find a CPU to run LWP "l".  Look for the CPU with the lowest priority
486  * thread.  In case of equal priority, prefer first class CPUs, and amongst
487  * the remainder choose the CPU with the fewest runqueue entries.
488  *
489  * Begin the search in the CPU package which "pivot" is a member of.
490  */
491 static struct cpu_info * __noinline
492 sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
493 {
494 	struct cpu_info *bestci, *curci, *outer;
495 	struct schedstate_percpu *bestspc, *curspc;
496 	pri_t bestpri, curpri;
497 
498 	/*
499 	 * If this fails (it shouldn't), run on the given CPU.  This also
500 	 * gives us a weak preference for "pivot" to begin with.
501 	 */
502 	bestci = pivot;
503 	bestspc = &bestci->ci_schedstate;
504 	if (sched_migratable(l, bestci)) {
505 		bestpri = MAX(bestspc->spc_curpriority,
506 		    bestspc->spc_maxpriority);
507 	} else {
508 		/* Invalidate the priority. */
509 		bestpri = PRI_COUNT;
510 	}
511 
512 	/* In the outer loop scroll through all CPU packages. */
513 	pivot = pivot->ci_package1st;
514 	outer = pivot;
515 	do {
516 		/* In the inner loop scroll through all CPUs in package. */
517 		curci = outer;
518 		do {
519 			if (!sched_migratable(l, curci)) {
520 				continue;
521 			}
522 
523 			curspc = &curci->ci_schedstate;
524 
525 			/* If this CPU is idle and 1st class, we're done. */
526 			if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
527 			    (SPCF_IDLE | SPCF_1STCLASS)) {
528 				return curci;
529 			}
530 
531 			curpri = MAX(curspc->spc_curpriority,
532 			    curspc->spc_maxpriority);
533 
534 			if (curpri > bestpri) {
535 				continue;
536 			}
537 			if (curpri == bestpri) {
538 				/* Prefer first class CPUs over others. */
539 				if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
540 				    (bestspc->spc_flags & SPCF_1STCLASS) != 0) {
541 				    	continue;
542 				}
543 				/*
544 				 * Pick the least busy CPU.  Make sure this is not
545 				 * <=, otherwise it defeats the above preference.
546 				 */
547 				if (bestspc->spc_count < curspc->spc_count) {
548 					continue;
549 				}
550 			}
551 
552 			bestpri = curpri;
553 			bestci = curci;
554 			bestspc = curspc;
555 
556 		} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
557 		    curci != outer);
558 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
559 	    outer != pivot);
560 
561 	return bestci;
562 }
563 
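/*
 * Editorial sketch of the circular sibling walk used by sched_bestcpu()
 * above and sched_idle() below: ci_sibling[] links each CPU into a ring per
 * relationship (core, package, first CPU of each package), so following the
 * link until it wraps back to the starting CPU visits every member exactly
 * once.  The helper name is hypothetical.
 */
static void
example_walk_core(struct cpu_info *ci)
{
	struct cpu_info *tci = ci;

	do {
		/* Inspect tci->ci_schedstate here. */
		tci = tci->ci_sibling[CPUREL_CORE];
	} while (tci != ci);
}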
564 /*
565  * Estimate whether the LWP should migrate to another CPU.
566  * Choose and return that CPU, if migration is needed.
567  */
568 struct cpu_info *
569 sched_takecpu(struct lwp *l)
570 {
571 	struct schedstate_percpu *spc, *tspc;
572 	struct cpu_info *ci, *curci, *tci;
573 	pri_t eprio;
574 	int flags;
575 
576 	KASSERT(lwp_locked(l, NULL));
577 
578 	/* If the thread is strictly bound, do not consider other CPUs */
579 	ci = l->l_cpu;
580 	if (l->l_pflag & LP_BOUND)
581 		return ci;
582 
583 	spc = &ci->ci_schedstate;
584 	eprio = lwp_eprio(l);
585 
586 	/*
587 	 * Handle new LWPs.  For vfork() with a timeshared child, make it
588 	 * run on the same CPU as the parent if no other LWPs are queued there.
589 	 * Otherwise scatter far and wide - try for an even distribution
590 	 * across all CPU packages and CPUs.
591 	 */
592 	if (l->l_stat == LSIDL) {
593 		if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
594 			if (sched_migratable(l, curlwp->l_cpu) && eprio >
595 			    curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
596 				return curlwp->l_cpu;
597 			}
598 		} else {
599 			return sched_bestcpu(l, sched_nextpkg());
600 		}
601 		flags = SPCF_IDLE;
602 	} else {
603 		flags = SPCF_IDLE | SPCF_1STCLASS;
604 	}
605 
606 	/*
607 	 * Try to send the LWP back to the first CPU in the same core if
608 	 * idle.  This keeps LWPs clustered in the run queues of 1st class
609 	 * CPUs.  This implies stickiness.  If we didn't find a home for
610 	 * a vfork() child above, try to use any SMT sibling to help out.
611 	 */
612 	tci = ci;
613 	do {
614 		tspc = &tci->ci_schedstate;
615 		if ((tspc->spc_flags & flags) == flags &&
616 		    sched_migratable(l, tci)) {
617 			return tci;
618 		}
619 		tci = tci->ci_sibling[CPUREL_CORE];
620 	} while (tci != ci);
621 
622 	/*
623 	 * Otherwise the LWP is "sticky", i.e.  generally preferring to stay
624 	 * on the same CPU.
625 	 */
626 	if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority ||
627 	    (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
628 		return ci;
629 	}
630 
631 	/*
632 	 * If the current CPU core is idle, run there and avoid the
633 	 * expensive scan of CPUs below.
634 	 */
635 	curci = curcpu();
636 	tci = curci;
637 	do {
638 		tspc = &tci->ci_schedstate;
639 		if ((tspc->spc_flags & flags) == flags &&
640 		    sched_migratable(l, tci)) {
641 			return tci;
642 		}
643 		tci = tci->ci_sibling[CPUREL_CORE];
644 	} while (tci != curci);
645 
646 	/*
647 	 * Didn't find a new home above - happens infrequently.  Start the
648 	 * search in last CPU package that the LWP ran in, but expand to
649 	 * include the whole system if needed.
650 	 */
651 	return sched_bestcpu(l, l->l_cpu);
652 }
653 
654 /*
655  * Try to catch an LWP from the runqueue of another CPU.
656  */
657 static struct lwp *
658 sched_catchlwp(struct cpu_info *ci)
659 {
660 	struct cpu_info *curci = curcpu();
661 	struct schedstate_percpu *spc, *curspc;
662 	TAILQ_HEAD(, lwp) *q_head;
663 	struct lwp *l;
664 	bool gentle;
665 
666 	curspc = &curci->ci_schedstate;
667 	spc = &ci->ci_schedstate;
668 
669 	/*
670 	 * Be more aggressive if this CPU is first class, and the other
671 	 * is not.
672 	 */
673 	gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
674 	    (spc->spc_flags & SPCF_1STCLASS) != 0);
675 
676 	if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
677 	    curspc->spc_psid != spc->spc_psid) {
678 		spc_unlock(ci);
679 		return NULL;
680 	}
681 
682 	/* Take the highest priority thread */
683 	q_head = sched_getrq(spc, spc->spc_maxpriority);
684 	l = TAILQ_FIRST(q_head);
685 
686 	for (;;) {
687 		/* Check the first and next result from the queue */
688 		if (l == NULL) {
689 			break;
690 		}
691 		KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
692 		    ci->ci_data.cpu_name,
693 		    l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
694 
695 		/* Look for threads that are allowed to migrate */
696 		if ((l->l_pflag & LP_BOUND) ||
697 		    (gentle && lwp_cache_hot(l)) ||
698 		    !sched_migratable(l, curci)) {
699 			l = TAILQ_NEXT(l, l_runq);
700 			/* XXX Gap: could walk down priority list. */
701 			continue;
702 		}
703 
704 		/* Grab the thread, and move to the local run queue */
705 		sched_dequeue(l);
706 		l->l_cpu = curci;
707 		lwp_unlock_to(l, curspc->spc_mutex);
708 		sched_enqueue(l);
709 		return l;
710 	}
711 	spc_unlock(ci);
712 
713 	return l;
714 }
715 
716 /*
717  * Called from sched_idle() to handle migration.  Return the CPU that we
718  * pushed the LWP to (may be NULL).
719  */
720 static struct cpu_info *
721 sched_idle_migrate(void)
722 {
723 	struct cpu_info *ci = curcpu(), *tci = NULL;
724 	struct schedstate_percpu *spc, *tspc;
725 	bool dlock = false;
726 
727 	spc = &ci->ci_schedstate;
728 	spc_lock(ci);
729 	for (;;) {
730 		struct lwp *l;
731 
732 		l = spc->spc_migrating;
733 		if (l == NULL)
734 			break;
735 
736 		/*
737 		 * If second attempt, and target CPU has changed,
738 		 * drop the old lock.
739 		 */
740 		if (dlock == true && tci != l->l_target_cpu) {
741 			KASSERT(tci != NULL);
742 			spc_unlock(tci);
743 			dlock = false;
744 		}
745 
746 		/*
747 		 * Nothing to do if destination has changed to the
748 		 * local CPU, or migration was done by other CPU.
749 		 * local CPU, or migration was done by another CPU.
750 		tci = l->l_target_cpu;
751 		if (tci == NULL || tci == ci) {
752 			spc->spc_migrating = NULL;
753 			l->l_target_cpu = NULL;
754 			break;
755 		}
756 		tspc = &tci->ci_schedstate;
757 
758 		/*
759 		 * Double-lock the runqueues.
760 		 * We do that only once.
761 		 */
762 		if (dlock == false) {
763 			dlock = true;
764 			if (ci < tci) {
765 				spc_lock(tci);
766 			} else if (!mutex_tryenter(tspc->spc_mutex)) {
767 				spc_unlock(ci);
768 				spc_lock(tci);
769 				spc_lock(ci);
770 				/* Check the situation again.. */
771 				continue;
772 			}
773 		}
774 
775 		/* Migrate the thread */
776 		KASSERT(l->l_stat == LSRUN);
777 		spc->spc_migrating = NULL;
778 		l->l_target_cpu = NULL;
779 		sched_dequeue(l);
780 		l->l_cpu = tci;
781 		lwp_setlock(l, tspc->spc_mutex);
782 		sched_enqueue(l);
783 		sched_resched_lwp(l, true);
784 		/* tci now unlocked */
785 		spc_unlock(ci);
786 		return tci;
787 	}
788 	if (dlock == true) {
789 		KASSERT(tci != NULL);
790 		spc_unlock(tci);
791 	}
792 	spc_unlock(ci);
793 	return NULL;
794 }
795 
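/*
 * Editorial sketch of the double-locking step above: take the two run queue
 * locks in a fixed order (by cpu_info address), or try-lock and back off, so
 * two CPUs migrating LWPs towards each other cannot deadlock.  Assumes "ci"
 * is already locked on entry and returns with both locked; the helper name
 * is hypothetical (cf. spc_dlock() used elsewhere in this file).
 */
static bool
example_dlock(struct cpu_info *ci, struct cpu_info *tci)
{

	if (ci < tci) {
		/* Locking order is fine: just take the second lock. */
		spc_lock(tci);
	} else if (!mutex_tryenter(tci->ci_schedstate.spc_mutex)) {
		/* Out of order: drop ours, take both in order, re-check. */
		spc_unlock(ci);
		spc_lock(tci);
		spc_lock(ci);
		return false;	/* caller must re-validate its state */
	}
	return true;
}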
796 /*
797  * Try to steal an LWP from "tci".
798  */
799 static bool
800 sched_steal(struct cpu_info *ci, struct cpu_info *tci)
801 {
802 	struct schedstate_percpu *spc, *tspc;
803 	lwp_t *l;
804 
805 	spc = &ci->ci_schedstate;
806 	tspc = &tci->ci_schedstate;
807 	if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
808 	    spc->spc_psid == tspc->spc_psid) {
809 		spc_dlock(ci, tci);
810 		l = sched_catchlwp(tci);
811 		spc_unlock(ci);
812 		if (l != NULL) {
813 			return true;
814 		}
815 	}
816 	return false;
817 }
818 
819 /*
820  * Called from each CPU's idle loop.
821  */
822 void
823 sched_idle(void)
824 {
825 	struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
826 	struct schedstate_percpu *spc, *tspc;
827 	struct lwp *l;
828 
829 	ci = curcpu();
830 	spc = &ci->ci_schedstate;
831 	tci = NULL;
832 	mci = NULL;
833 
834 	/*
835 	 * Handle LWP migrations off this CPU to another.  If there is a
836 	 * migration to do then remember the CPU the LWP was sent to, and
837 	 * don't steal the LWP back from that CPU below.
838 	 */
839 	if (spc->spc_migrating != NULL) {
840 		mci = sched_idle_migrate();
841 	}
842 
843 	/* If this CPU is offline, or we have an LWP to run, we're done. */
844 	if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
845 		return;
846 	}
847 
848 	/* Deal with SMT. */
849 	if (ci->ci_nsibling[CPUREL_CORE] > 1) {
850 		/* Try to help our siblings out. */
851 		tci = ci->ci_sibling[CPUREL_CORE];
852 		while (tci != ci) {
853 			if (tci != mci && sched_steal(ci, tci)) {
854 				return;
855 			}
856 			tci = tci->ci_sibling[CPUREL_CORE];
857 		}
858 		/*
859 		 * If not the first SMT in the core, and in the default
860 		 * processor set, the search ends here.
861 		 */
862 		if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
863 		    spc->spc_psid == PS_NONE) {
864 			return;
865 		}
866 	}
867 
868 	/*
869 	 * Find something to run, unless this CPU exceeded the rate limit.
870 	 * Start looking on the current package to maximise L2/L3 cache
871 	 * locality.  Then expand to looking at the rest of the system.
872 	 *
873 	 * XXX Should probably look at 2nd class CPUs first, but they will
874 	 * shed jobs via preempt() anyway.
875 	 */
876 	if (spc->spc_nextskim > getticks()) {
877 		return;
878 	}
879 	spc->spc_nextskim = getticks() + mstohz(skim_interval);
880 
881 	/* In the outer loop scroll through all CPU packages, starting here. */
882 	first = ci->ci_package1st;
883 	outer = first;
884 	do {
885 		/* In the inner loop scroll through all CPUs in package. */
886 		inner = outer;
887 		do {
888 			/* Don't hit the locks unless needed. */
889 			tspc = &inner->ci_schedstate;
890 			if (ci == inner || ci == mci ||
891 			    spc->spc_psid != tspc->spc_psid ||
892 			    atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
893 				continue;
894 			}
895 			spc_dlock(ci, inner);
896 			l = sched_catchlwp(inner);
897 			spc_unlock(ci);
898 			if (l != NULL) {
899 				/* Got it! */
900 				return;
901 			}
902 		} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
903 		    inner != outer);
904 	} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
905 	    outer != first);
906 }
907 
908 /*
909  * Called from mi_switch() when an LWP has been preempted / has yielded.
910  * The LWP is presently in the CPU's run queue.  Here we look for a better
911  * CPU to teleport the LWP to; there may not be one.
912  */
913 void
914 sched_preempted(struct lwp *l)
915 {
916 	const int flags = SPCF_IDLE | SPCF_1STCLASS;
917 	struct schedstate_percpu *tspc;
918 	struct cpu_info *ci, *tci;
919 
920 	ci = l->l_cpu;
921 	tspc = &ci->ci_schedstate;
922 
923 	KASSERT(tspc->spc_count >= 1);
924 
925 	/*
926 	 * Try to select another CPU if:
927 	 *
928 	 * - there is no migration pending already
929 	 * - and this LWP is running on a 2nd class CPU
930 	 * - or this LWP is a child of vfork() that has just done execve()
931 	 */
932 	if (l->l_target_cpu != NULL ||
933 	    ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
934 	    (l->l_pflag & LP_TELEPORT) == 0)) {
935 		return;
936 	}
937 
938 	/*
939 	 * Fast path: if the first SMT in the core is idle, send it back
940 	 * there, because the cache is shared (cheap) and we want all LWPs
941 	 * to be clustered on 1st class CPUs (either running there or on
942 	 * their runqueues).
943 	 */
944 	tci = ci->ci_sibling[CPUREL_CORE];
945 	while (tci != ci) {
946 		tspc = &tci->ci_schedstate;
947 		if ((tspc->spc_flags & flags) == flags &&
948 		    sched_migratable(l, tci)) {
949 		    	l->l_target_cpu = tci;
950 			l->l_pflag &= ~LP_TELEPORT;
951 		    	return;
952 		}
953 		tci = tci->ci_sibling[CPUREL_CORE];
954 	}
955 
956 	if ((l->l_pflag & LP_TELEPORT) != 0) {
957 		/*
958 		 * A child of vfork(): now that the parent is released,
959 		 * scatter far and wide, to match the LSIDL distribution
960 		 * done in sched_takecpu().
961 		 */
962 		l->l_pflag &= ~LP_TELEPORT;
963 		tci = sched_bestcpu(l, sched_nextpkg());
964 		if (tci != ci) {
965 			l->l_target_cpu = tci;
966 		}
967 	} else {
968 		/*
969 		 * Try to find a better CPU to take it, but don't move to
970 		 * another 2nd class CPU, and don't move to a non-idle CPU,
971 		 * because that would prevent SMT being used to maximise
972 		 * throughput.
973 		 *
974 		 * Search in the current CPU package in order to try and
975 		 * keep L2/L3 cache locality, but expand to include the
976 		 * whole system if needed.
977 		 */
978 		tci = sched_bestcpu(l, l->l_cpu);
979 		if (tci != ci &&
980 		    (tci->ci_schedstate.spc_flags & flags) == flags) {
981 			l->l_target_cpu = tci;
982 		}
983 	}
984 }
985 
986 /*
987  * Called during execve() by a child of vfork().  Does two things:
988  *
989  * - If the parent has been awoken and put back on curcpu then give the
990  *   CPU back to the parent.
991  *
992  * - If curlwp is not on a 1st class CPU then find somewhere else to run,
993  *   since it dodged the distribution in sched_takecpu() when first set
994  *   runnable.
995  */
996 void
997 sched_vforkexec(struct lwp *l, bool samecpu)
998 {
999 
1000 	KASSERT(l == curlwp);
1001 	if ((samecpu && ncpu > 1) ||
1002 	    (l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
1003 		l->l_pflag |= LP_TELEPORT;
1004 		preempt();
1005 	}
1006 }
1007 
1008 #else
1009 
1010 /*
1011  * stubs for !MULTIPROCESSOR
1012  */
1013 
1014 struct cpu_info *
1015 sched_takecpu(struct lwp *l)
1016 {
1017 
1018 	return l->l_cpu;
1019 }
1020 
1021 void
1022 sched_idle(void)
1023 {
1024 
1025 }
1026 
1027 void
1028 sched_preempted(struct lwp *l)
1029 {
1030 
1031 }
1032 
1033 void
1034 sched_vforkexec(struct lwp *l, bool samecpu)
1035 {
1036 
1037 	KASSERT(l == curlwp);
1038 }
1039 
1040 #endif	/* MULTIPROCESSOR */
1041 
1042 /*
1043  * Scheduling statistics and balancing.
1044  */
1045 void
1046 sched_lwp_stats(struct lwp *l)
1047 {
1048 	int batch;
1049 
1050 	KASSERT(lwp_locked(l, NULL));
1051 
1052 	/* Update sleep time */
1053 	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
1054 	    l->l_stat == LSSUSPENDED)
1055 		l->l_slptime++;
1056 
1057 	/*
1058 	 * Mark the thread as more CPU-bound if its total run time exceeds its
1059 	 * total sleep time.  Check whether it is CPU-bound for the first time.
1060 	 */
1061 	batch = (l->l_rticksum > l->l_slpticksum);
1062 	if (batch != 0) {
1063 		if ((l->l_flag & LW_BATCH) == 0)
1064 			batch = 0;
1065 		l->l_flag |= LW_BATCH;
1066 	} else
1067 		l->l_flag &= ~LW_BATCH;
1068 
1069 	/* Reset the time sums */
1070 	l->l_slpticksum = 0;
1071 	l->l_rticksum = 0;
1072 
1073 	/* Scheduler-specific hook */
1074 	sched_pstats_hook(l, batch);
1075 #ifdef KDTRACE_HOOKS
1076 	curthread = l;
1077 #endif
1078 }
1079 
1080 /*
1081  * Scheduler mill.
1082  */
1083 struct lwp *
1084 sched_nextlwp(void)
1085 {
1086 	struct cpu_info *ci = curcpu();
1087 	struct schedstate_percpu *spc;
1088 	TAILQ_HEAD(, lwp) *q_head;
1089 	struct lwp *l;
1090 
1091 	/* Update the last run time on switch */
1092 	l = curlwp;
1093 	l->l_rticksum += (getticks() - l->l_rticks);
1094 
1095 	/* Return to idle LWP if there is a migrating thread */
1096 	spc = &ci->ci_schedstate;
1097 	if (__predict_false(spc->spc_migrating != NULL))
1098 		return NULL;
1099 
1100 	/* Return to idle LWP if there is no runnable job */
1101 	if (__predict_false(spc->spc_count == 0))
1102 		return NULL;
1103 
1104 	/* Take the highest priority thread */
1105 	KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
1106 	q_head = sched_getrq(spc, spc->spc_maxpriority);
1107 	l = TAILQ_FIRST(q_head);
1108 	KASSERT(l != NULL);
1109 
1110 	sched_oncpu(l);
1111 	l->l_rticks = getticks();
1112 
1113 	return l;
1114 }
1115 
1116 /*
1117  * sched_curcpu_runnable_p: return whether curcpu() should exit the idle loop.
1118  */
1119 
1120 bool
1121 sched_curcpu_runnable_p(void)
1122 {
1123 	const struct cpu_info *ci;
1124 	const struct schedstate_percpu *spc;
1125 	bool rv;
1126 
1127 	kpreempt_disable();
1128 	ci = curcpu();
1129 	spc = &ci->ci_schedstate;
1130 	rv = (spc->spc_count != 0);
1131 #ifndef __HAVE_FAST_SOFTINTS
1132 	rv |= (ci->ci_data.cpu_softints != 0);
1133 #endif
1134 	kpreempt_enable();
1135 
1136 	return rv;
1137 }
1138 
1139 /*
1140  * Sysctl nodes and initialization.
1141  */
1142 
1143 SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
1144 {
1145 	const struct sysctlnode *node = NULL;
1146 
1147 	sysctl_createv(clog, 0, NULL, &node,
1148 		CTLFLAG_PERMANENT,
1149 		CTLTYPE_NODE, "sched",
1150 		SYSCTL_DESCR("Scheduler options"),
1151 		NULL, 0, NULL, 0,
1152 		CTL_KERN, CTL_CREATE, CTL_EOL);
1153 
1154 	if (node == NULL)
1155 		return;
1156 
1157 	sysctl_createv(clog, 0, &node, NULL,
1158 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1159 		CTLTYPE_INT, "cacheht_time",
1160 		SYSCTL_DESCR("Cache hotness time (in ms)"),
1161 		NULL, 0, &cacheht_time, 0,
1162 		CTL_CREATE, CTL_EOL);
1163 	sysctl_createv(clog, 0, &node, NULL,
1164 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1165 		CTLTYPE_INT, "skim_interval",
1166 		SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
1167 		NULL, 0, &skim_interval, 0,
1168 		CTL_CREATE, CTL_EOL);
1169 	sysctl_createv(clog, 0, &node, NULL,
1170 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1171 		CTLTYPE_INT, "min_catch",
1172 		SYSCTL_DESCR("Minimal count of threads for catching"),
1173 		NULL, 0, &min_catch, 0,
1174 		CTL_CREATE, CTL_EOL);
1175 	sysctl_createv(clog, 0, &node, NULL,
1176 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1177 		CTLTYPE_INT, "timesoftints",
1178 		SYSCTL_DESCR("Track CPU time for soft interrupts"),
1179 		NULL, 0, &softint_timing, 0,
1180 		CTL_CREATE, CTL_EOL);
1181 	sysctl_createv(clog, 0, &node, NULL,
1182 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
1183 		CTLTYPE_INT, "kpreempt_pri",
1184 		SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
1185 		NULL, 0, &sched_kpreempt_pri, 0,
1186 		CTL_CREATE, CTL_EOL);
1187 }
1188 
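/*
 * Editorial sketch: the tunables registered above appear under "kern.sched"
 * and can be read or written from userland, for example with
 * sysctlbyname(3).  The node name below is taken from the sysctl_createv()
 * calls above; the helper name is hypothetical.
 */
#if 0	/* userland example, not kernel code */
#include <sys/sysctl.h>
#include <stdio.h>

int
print_skim_interval(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.sched.skim_interval", &val, &len, NULL, 0) == -1)
		return -1;
	printf("kern.sched.skim_interval = %d ms\n", val);
	return 0;
}
#endif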
1189 /*
1190  * Debugging.
1191  */
1192 
1193 #ifdef DDB
1194 
1195 void
1196 sched_print_runqueue(void (*pr)(const char *, ...))
1197 {
1198 	struct cpu_info *ci, *tci;
1199 	struct schedstate_percpu *spc;
1200 	struct lwp *l;
1201 	struct proc *p;
1202 	CPU_INFO_ITERATOR cii;
1203 
1204 	for (CPU_INFO_FOREACH(cii, ci)) {
1205 		int i;
1206 
1207 		spc = &ci->ci_schedstate;
1208 
1209 		(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
1210 		(*pr)(" pid.lid = %d.%d, r_count = %u, "
1211 		    "maxpri = %d, mlwp = %p\n",
1212 #ifdef MULTIPROCESSOR
1213 		    ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
1214 #else
1215 		    curlwp->l_proc->p_pid, curlwp->l_lid,
1216 #endif
1217 		    spc->spc_count, spc->spc_maxpriority,
1218 		    spc->spc_migrating);
1219 		i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
1220 		do {
1221 			uint32_t q;
1222 			q = spc->spc_bitmap[i];
1223 			(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
1224 		} while (i--);
1225 	}
1226 
1227 	(*pr)("   %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
1228 	    "LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
1229 
1230 	PROCLIST_FOREACH(p, &allproc) {
1231 		(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
1232 		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
1233 			ci = l->l_cpu;
1234 			tci = l->l_target_cpu;
1235 			(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
1236 			    (int)l->l_lid, l->l_priority, lwp_eprio(l),
1237 			    l->l_flag, l->l_stat == LSRUN ? "RQ" :
1238 			    (l->l_stat == LSSLEEP ? "SQ" : "-"),
1239 			    l, ci->ci_index, (tci ? tci->ci_index : -1),
1240 			    (u_int)(getticks() - l->l_rticks));
1241 		}
1242 	}
1243 }
1244 
1245 #endif
1246