/*      $NetBSD: scheduler.c,v 1.53 2022/04/09 23:45:14 riastradh Exp $        */

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.53 2022/04/09 23:45:14 riastradh Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>
static struct rumpcpu {
        /* needed in fastpath */
        struct cpu_info *rcpu_ci;
        void *rcpu_prevlwp;

        /* needed in slowpath */
        struct rumpuser_mtx *rcpu_mtx;
        struct rumpuser_cv *rcpu_cv;
        int rcpu_wanted;

        /* offset 20 (P=4) or 36 (P=8) here */

        /*
         * Some stats.  Not really that necessary, but we should
         * have room.  Note that these overflow quite fast, so need
         * to be collected often.
         */
        unsigned int rcpu_fastpath;
        unsigned int rcpu_slowpath;
        unsigned int rcpu_migrated;

        /* offset 32 (P=4) or 48 (P=8) */

        int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];
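
/*
 * The zero-sized rcpu_align member above, combined with
 * __aligned(CACHE_LINE_SIZE), pads each rcpu_storage slot out to a cache
 * line boundary, so the fastpath data of different virtual CPUs does not
 * share cache lines (no false sharing between host threads).
 */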

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

        return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;

#define RCPULWP_BUSY    ((void *)-1)
#define RCPULWP_WANTED  ((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless the
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default to the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif
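
/*
 * With RUMPSCHED_STATS defined, each scheduling decision below bumps the
 * corresponding per-CPU counter; without it the macros expand to nothing,
 * keeping the fastpath free of extra stores.
 */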

struct cpu_info *
cpu_lookup(u_int index)
{

        return rcpu_storage[index].rcpu_ci;
}

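/*
 * Pick a migration target for the scheduling slowpath: a simple
 * round-robin over all virtual CPUs, implemented with an atomically
 * incremented global counter taken modulo ncpu.
 */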
static inline struct rumpcpu *
getnextcpu(void)
{
        unsigned newcpu;

        newcpu = atomic_inc_uint_nv(&nextcpu);
        if (__predict_false(newcpu > UINT_MAX/2))
                atomic_and_uint(&nextcpu, 0);
        newcpu = newcpu % ncpu;

        return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
        int num = *nump;

        if (num > MAXCPUS) {
                aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
                    "available (adjusted)\n", num, MAXCPUS);
                num = MAXCPUS;
        }

        cpu_setmodel("rumpcore (virtual)");

        mi_cpu_init();

        /* attach first cpu for bootstrap */
        rump_cpu_attach(&rump_bootcpu);
        ncpu = 1;
        *nump = num;
}

void
rump_scheduler_init(int numcpu)
{
        struct rumpcpu *rcpu;
        struct cpu_info *ci;
        int i;

        rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
        rumpuser_cv_init(&lwp0cv);
        for (i = 0; i < numcpu; i++) {
                if (i == 0) {
                        ci = &rump_bootcpu;
                } else {
                        ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
                        ci->ci_index = i;
                }

                rcpu = &rcpu_storage[i];
                rcpu->rcpu_ci = ci;
                rcpu->rcpu_wanted = 0;
                rumpuser_cv_init(&rcpu->rcpu_cv);
                rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

                ci->ci_schedstate.spc_mutex =
                    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
                ci->ci_schedstate.spc_flags = SPCF_RUNNING;
        }

        mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

void
rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
{
        struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);

        rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
        rumpuser_cv_signal(cv);
        rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
        struct lwp *l = curlwp;
        struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

        /* mutex will be taken and released in cpu schedule/unschedule */
        rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
        struct lwp *l = curlwp;
        struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

        /* mutex will be taken and released in cpu schedule/unschedule */
        return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
            ts->tv_sec, ts->tv_nsec);
}

static void
lwp0busy(void)
{

        /* busy lwp0 */
        KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
        rumpuser_mutex_enter_nowrap(lwp0mtx);
        while (lwp0isbusy)
                rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
        lwp0isbusy = true;
        rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

        rumpuser_mutex_enter_nowrap(lwp0mtx);
        KASSERT(lwp0isbusy == true);
        lwp0isbusy = false;
        rumpuser_cv_signal(lwp0cv);
        rumpuser_mutex_exit(lwp0mtx);
}
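
/*
 * lwp0 is a single, shared bootstrap context: lwp0busy() claims exclusive
 * use of it, sleeping on lwp0cv until the previous user calls lwp0rele(),
 * which clears the busy flag and wakes a waiter.
 */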

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is a 1:1
 * mapping between the lwp and the rump kernel cpu.
 */
void
rump_schedule()
{
        struct lwp *l;

        /*
         * If there is no dedicated lwp, allocate a temp one and
         * set it to be free'd upon unschedule().  Use lwp0 context
         * for reserving the necessary resources.  Don't optimize
         * for this case -- anyone who cares about performance will
         * start a real thread.
         */
        if (__predict_true((l = curlwp) != NULL)) {
                rump_schedule_cpu(l);
                LWP_CACHE_CREDS(l, l->l_proc);
        } else {
                lwp0busy();

                /* schedule cpu and use lwp0 */
                rump_schedule_cpu(&lwp0);
                rump_lwproc_curlwp_set(&lwp0);

                /* allocate thread, switch to it, and release lwp0 */
                l = rump__lwproc_alloclwp(initproc);
                rump_lwproc_switch(l);
                lwp0rele();

                /*
                 * mark new thread dead-on-unschedule.  this
                 * means that we'll be running with l_refcnt == 0.
                 * relax, it's fine.
                 */
                rump_lwproc_releaselwp();
        }
}
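
#if 0
/*
 * Illustrative sketch, hypothetical caller: a host thread entering the
 * rump kernel brackets its work with rump_schedule() and
 * rump_unschedule(), which gives it a valid curlwp and exclusive use of
 * a virtual CPU for the duration of the call.
 */
static void
example_rump_entry(void)
{

        rump_schedule();
        /* ... run rump kernel code with curlwp valid and a CPU held ... */
        rump_unschedule();
}
#endif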

void
rump_schedule_cpu(struct lwp *l)
{

        rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
        struct rumpcpu *rcpu;
        struct cpu_info *ci;
        void *old;
        bool domigrate;
        bool bound = l->l_pflag & LP_BOUND;

        l->l_stat = LSRUN;

        /*
         * First, try fastpath: if we were the previous user of the
         * CPU, everything is in order cachewise and we can just
         * proceed to use it.
         *
         * If we are a different thread (i.e. CAS fails), we must go
         * through a memory barrier to ensure we get a truthful
         * view of the world.
         */

        KASSERT(l->l_target_cpu != NULL);
        rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
        if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
                if (interlock == rcpu->rcpu_mtx)
                        rumpuser_mutex_exit(rcpu->rcpu_mtx);
                SCHED_FASTPATH(rcpu);
                /* jones, you're the man */
                goto fastlane;
        }

        /*
         * Else, it's the slowpath for us.  First, determine if we
         * can migrate.
         */
        if (ncpu == 1)
                domigrate = false;
        else
                domigrate = true;

        /* Take lock.  This acts as a load barrier too. */
        if (interlock != rcpu->rcpu_mtx)
                rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

        for (;;) {
                SCHED_SLOWPATH(rcpu);
                old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

                /* CPU is free? */
                if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
                        if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
                            RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
                                break;
                        }
                }

                /*
                 * Do we want to migrate once?
                 * This may need a slightly better algorithm, or we
                 * might cache pingpong eternally for non-frequent
                 * threads.
                 */
                if (domigrate && !bound) {
                        domigrate = false;
                        SCHED_MIGRATED(rcpu);
                        rumpuser_mutex_exit(rcpu->rcpu_mtx);
                        rcpu = getnextcpu();
                        rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
                        continue;
                }

                /* Want CPU, wait until it's released and retry */
                rcpu->rcpu_wanted++;
                rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
                rcpu->rcpu_wanted--;
        }
        rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
        ci = rcpu->rcpu_ci;
        l->l_cpu = l->l_target_cpu = ci;
        l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
        l->l_ncsw++;
        l->l_stat = LSONPROC;

        /*
         * No interrupts, so ci_curlwp === cpu_onproc.
         * Okay, we could make an attempt to not set cpu_onproc
         * in the case that an interrupt is scheduled immediately
         * after a user proc, but leave that for later.
         */
        ci->ci_curlwp = ci->ci_onproc = l;
}

void
rump_unschedule()
{
        struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
        int nlock;

        KERNEL_UNLOCK_ALL(l, &nlock);
        KASSERT(nlock == 0);
#endif

        KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
        rump_unschedule_cpu(l);
        l->l_mutex = &unruntime_lock;
        l->l_stat = LSSTOP;

        /*
         * Check special conditions:
         *  1) do we need to free the lwp which just unscheduled?
         *     (locking order: lwp0, cpu)
         *  2) do we want to clear curlwp for the current host thread
         */
        if (__predict_false(l->l_flag & LW_WEXIT)) {
                lwp0busy();

                /* Now that we have lwp0, we can schedule a CPU again */
                rump_schedule_cpu(l);

                /* switch to lwp0.  this frees the old thread */
                KASSERT(l->l_flag & LW_WEXIT);
                rump_lwproc_switch(&lwp0);

                /* release lwp0 */
                rump_unschedule_cpu(&lwp0);
                lwp0.l_mutex = &unruntime_lock;
                lwp0.l_pflag &= ~LP_RUNNING;
                lwp0rele();
                rump_lwproc_curlwp_clear(&lwp0);

        } else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
                rump_lwproc_curlwp_clear(l);
                l->l_flag &= ~LW_RUMP_CLEAR;
        }
}

void
rump_unschedule_cpu(struct lwp *l)
{

        rump_unschedule_cpu_interlock(l, NULL);
}

void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

        if ((l->l_pflag & LP_INTR) == 0)
                rump_softint_run(l->l_cpu);
        rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
        struct rumpcpu *rcpu;
        struct cpu_info *ci;
        void *old;

        ci = l->l_cpu;
        ci->ci_curlwp = ci->ci_onproc = NULL;
        rcpu = cpuinfo_to_rumpcpu(ci);

        KASSERT(rcpu->rcpu_ci == ci);

        /*
         * Make sure all stores are seen before the CPU release.  This
         * is relevant only in the non-fastpath scheduling case, but
         * we don't know here if that's going to happen, so need to
         * expect the worst.
         *
         * If the scheduler interlock was requested by the caller, we
         * need to obtain it before we release the CPU.  Otherwise, we
         * risk a race condition where another thread is scheduled onto
         * the rump kernel CPU before our current thread can grab the
         * interlock.
         */
        if (interlock == rcpu->rcpu_mtx)
                rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
        else
                membar_release(); /* XXX what does this pair with? */

        /* Release the CPU. */
        old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

        /* No waiters?  No problems.  We're outta here. */
        if (old == RCPULWP_BUSY) {
                return;
        }

        KASSERT(old == RCPULWP_WANTED);

        /*
         * Ok, things weren't so snappy.
         *
         * Snailpath: take lock and signal anyone waiting for this CPU.
         */

        if (interlock != rcpu->rcpu_mtx)
                rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
        if (rcpu->rcpu_wanted)
                rumpuser_cv_broadcast(rcpu->rcpu_cv);
        if (interlock != rcpu->rcpu_mtx)
                rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
        struct lwp *l = curlwp;
        int nlocks;

        KERNEL_UNLOCK_ALL(l, &nlocks);
        rump_unschedule_cpu(l);
        rump_schedule_cpu(l);
        KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

        yield();
}

bool
kpreempt(uintptr_t where)
{

        return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

        KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

        KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
        const lwp_t *l = curlwp;

        return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
            (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
        /* XXX: emulate cpu_kpreempt_disabled() */
        return true;
}

void
suspendsched(void)
{

        /*
         * Could wait until everyone is out and block further entries,
         * but skip that for now.
         */
}

void
sched_nice(struct proc *p, int level)
{

        /* nothing to do for now */
}

void
setrunnable(struct lwp *l)
{

        sched_enqueue(l);
}

void
sched_enqueue(struct lwp *l)
{

        rump_thread_allow(l);
}

void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{

}

void
sched_resched_lwp(struct lwp *l, bool unlock)
{

}

void
sched_dequeue(struct lwp *l)
{

        panic("sched_dequeue not implemented");
}

void
preempt_point(void)
{

}

bool
preempt_needed(void)
{

        return false;
}