/*	$OpenBSD: kern_sched.c,v 1.103 2024/11/24 13:05:14 claudio Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/resourcevar.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of the cpus that are currently idle and the cpus that have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;
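
/*
 * For example, sched_choosecpu() below combines these sets with the
 * cpuset primitives at the end of this file:
 *
 *	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
 *	cpuset_intersection(&set, &set, &sched_all_cpus);
 *
 * which leaves "set" holding the online cpus that are idle and have
 * nothing queued, the preferred targets for a runnable proc.
 */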

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Times we moved a proc to another cpu */
uint64_t sched_nomigrations;	/* Times we kept a proc on the same cpu */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

int sched_smt;

/*
 * A few notes about cpu_switchto, which is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, set curproc, set the p_cpu pointer in the
 * proc and set p_stat to SONPROC. "Atomically" here only means with respect
 * to interrupts; other cpus in the system must not depend on this state
 * being consistent. Therefore no locking is necessary in cpu_switchto other
 * than blocking interrupts during the context switch.
 */
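
/*
 * Illustrative sketch of that contract (not real code; each port's MD
 * cpu_switchto provides the actual implementation):
 *
 *	cpu_switchto(old, new):
 *		block interrupts
 *		if (old != NULL)
 *			save register state into old's pcb
 *		load new's pcb and pmap
 *		curproc = new
 *		new->p_cpu = curcpu()
 *		new->p_stat = SONPROC
 *		unblock interrupts and resume in new's context
 */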

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	clockintr_bind(&spc->spc_itimer, ci, itimer_update, NULL);
	clockintr_bind(&spc->spc_profclock, ci, profclock, NULL);
	clockintr_bind(&spc->spc_roundrobin, ci, roundrobin, NULL);
	clockintr_bind(&spc->spc_statclock, ci, statclock, NULL);

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);
	SIMPLEQ_INIT(&spc->spc_deferred);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);

#ifdef __HAVE_CPU_TOPOLOGY
	if (!sched_smt && ci->ci_smt_id > 0)
		return;
#endif
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_p->ps_comm,
	    sizeof(spc->spc_idleproc->p_p->ps_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK();
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK();

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK();
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK();

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK();
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK();
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	tuagg_add_runtime();

	KERNEL_ASSERT_LOCKED();
	sched_toidle();
}

void
sched_toidle(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *idle;

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	if (_kernel_lock_held())
		__mp_release_all(&kernel_lock);
#endif

	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	SCHED_LOCK();
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;

	uvmexp.swtch++;
	if (curproc != NULL)
		TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
		    idle->p_p->ps_pid);
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
	struct schedstate_percpu *spc;
	int queue = prio >> 2;
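	/*
	 * Note: prio >> 2 folds four adjacent priority levels into one
	 * run queue; assuming the usual 0-127 priority range and
	 * SCHED_NQS == 32 (see sys/sched.h), every priority maps to
	 * exactly one of the 32 queues.
	 */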

	if (ci == NULL)
		ci = sched_choosecpu(p);

	KASSERT(ci != NULL);
	SCHED_ASSERT_LOCKED();
	KASSERT(p->p_wchan == NULL);
	KASSERT(!ISSET(p->p_flag, P_WSLEEP));

	p->p_cpu = ci;
	p->p_stat = SRUN;
	p->p_runpri = prio;

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;
	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
	else if (prio < spc->spc_curpriority)
		need_resched(ci);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_runpri >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;
	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
		KASSERT(p->p_wchan == NULL);
		return (p);
	}
again:
#endif

	if (spc->spc_whichqs) {
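		/*
		 * Note: ffs() returns the lowest set bit, so this picks
		 * the lowest-numbered, i.e. highest priority, non-empty
		 * run queue.
		 */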
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	KASSERT(!ISSET(p->p_flag, P_WSLEEP));
	return (p);
}

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, fall back to all cpus. Of those, pick the one
	 * with the fewest queued procs.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run) {
			choice = ci;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued;
	 * if there are none, fall back to all cpus and pick the cheapest.
	 * (Idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set; if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes on this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yes, yes, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

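/*
 * Worked example with made-up cpu state (the weights above are the real
 * defaults): consider moving a proc with p_usrpri 60 to a non-primary cpu
 * that is not idle, is running at spc_curpriority 50 and has 2 procs
 * queued. The cost computed below is
 *
 *	(60 - 50) * sched_cost_priority	= 10
 *	+ sched_cost_runnable		=  3	(cpu is busy)
 *	+ 2 * sched_cost_runnable	=  6	(2 procs queued)
 *					= 19
 *
 * An idle, non-primary cpu with nothing queued costs 0, and when the
 * target is the proc's own cpu and the proc has not been sleeping, the
 * cost is lowered by log2(resident pages) * sched_cost_resident.
 */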
int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority running on the destination cpu and the
	 * higher the priority of the proc, the more willing we are to move it.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK();
}

void
sched_unpeg_curproc(void)
{
	struct proc *p = curproc;

	atomic_clearbits_int(&p->p_flag, P_CPUPEG);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
		if (!sched_smt && ci->ci_smt_id > 0)
			continue;
#endif
		cpuset_add(&sched_all_cpus, ci);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

struct sched_barrier_state {
	struct cpu_info *ci;
	struct cond cond;
};

void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	sched_unpeg_curproc();
}

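/*
 * Note: sched_barrier() queues a task on systqmp that pegs itself to the
 * target cpu (the primary cpu if ci is NULL) and signals back once it is
 * running there, so by the time cond_wait() returns that cpu has gone
 * through the scheduler at least once since the call.
 */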
void
sched_barrier(struct cpu_info *ci)
{
	struct sched_barrier_state sb;
	struct task task;
	CPU_INFO_ITERATOR cii;

	if (ci == NULL) {
		CPU_INFO_FOREACH(cii, ci) {
			if (CPU_IS_PRIMARY(ci))
				break;
		}
	}
	KASSERT(ci != NULL);

	if (ci == curcpu())
		return;

	sb.ci = ci;
	cond_init(&sb.cond);
	task_set(&task, sched_barrier_task, &sb);

	task_add(systqmp, &task);
	cond_wait(&sb.cond, "sbar");
}

#else

void
sched_barrier(struct cpu_info *ci)
{
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;
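
/*
 * Note on the representation: each cpu occupies a single bit in cs_set[],
 * indexed by CPU_INFO_UNIT(ci); cpu "num" lives in word num / 32, bit
 * num % 32, so for example a cpu with unit number 37 is bit 5 of
 * cs_set[1]. The set operations below are plain bitwise ops over the
 * words that cover ncpus.
 */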

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

int
cpuset_cardinality(struct cpuset *cs)
{
	int cardinality, i, n;

	cardinality = 0;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
			cardinality++;

	return (cardinality);
}

int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}

int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}

#ifdef __HAVE_CPU_TOPOLOGY

#include <sys/sysctl.h>

int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	int err, newsmt;

	newsmt = sched_smt;
	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
	if (err)
		return err;
	if (newsmt == sched_smt)
		return 0;

	sched_smt = newsmt;
	CPU_INFO_FOREACH(cii, ci) {
		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		if (ci->ci_smt_id == 0)
			continue;
		if (sched_smt)
			cpuset_add(&sched_all_cpus, ci);
		else
			cpuset_del(&sched_all_cpus, ci);
	}

	return 0;
}

#endif
903