/*      $NetBSD: scheduler.c,v 1.53 2022/04/09 23:45:14 riastradh Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.53 2022/04/09 23:45:14 riastradh Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 50 (P=8) */

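	/*
	 * Zero-length, cache-line-aligned member: it pads each rumpcpu
	 * out to a full cache line so adjacent virtual CPUs do not
	 * share one (avoids false sharing on the fastpath fields above).
	 */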
	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

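/*
 * rcpu_prevlwp encodes the CPU's state: a plain lwp pointer (or NULL)
 * means the CPU is free and records its previous user for the fastpath,
 * RCPULWP_BUSY means the CPU is currently taken, and RCPULWP_WANTED
 * means someone is waiting for it on rcpu_cv.
 */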
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * the stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default to the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

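/*
 * Pick the next virtual CPU in round-robin order; used when a thread
 * migrates in the scheduling slowpath.
 */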
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	cpu_setmodel("rumpcore (virtual)");

	mi_cpu_init();

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

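/*
 * Set up per-CPU scheduler state: every virtual CPU gets a host spin
 * mutex and condition variable for the slowpath handoff, plus a
 * scheduler state mutex used as the lwp lock while running on that CPU.
 */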
void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

	rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	rumpuser_cv_signal(cv);
	rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

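/*
 * lwp0 is a single context shared by all host threads that do not have
 * a dedicated lwp; lwp0busy()/lwp0rele() serialize access to it under
 * lwp0mtx.
 */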
static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and the rump kernel cpu.
 */
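/*
 * A minimal (hypothetical) caller sketch:
 *
 *	rump_schedule();
 *	... operate inside the rump kernel with a valid curlwp ...
 *	rump_unschedule();
 */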
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
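/*
 * If "interlock" is the target CPU's scheduler mutex, the caller
 * already holds it (the rump_schedlock_cv_wait() path) and it is
 * released here as part of taking the CPU.
 */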
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_onproc = l;
}

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread?
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

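	/*
	 * Run any pending soft interrupts on this virtual CPU before we
	 * give it up, unless the unscheduling lwp is itself a soft
	 * interrupt thread.
	 */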
	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we risk a
	 * race condition where another thread is scheduled onto the
	 * rump kernel CPU before our current thread can
	 * grab the interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_release(); /* XXX what does this pair with? */

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

	sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

	rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

	return false;
}