/*	$NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

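/*
 * Per-virtual-CPU bookkeeping.  The fields consulted on every
 * schedule/unschedule (the fastpath) come first, the lock, condvar and
 * counter used only when the CPU is contested (the slowpath) follow,
 * and the zero-length aligned member pads each array element out to a
 * full cache line so that two virtual CPUs never share one.
 */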
static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 48 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;
kcpuset_t *kcpuset_attached = NULL;
kcpuset_t *kcpuset_running = NULL;
int ncpu, ncpuonline;

kmutex_t cpu_lock;

#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)
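
/*
 * rcpu_prevlwp encodes the state of a virtual CPU: a plain lwp pointer
 * (or NULL) means the CPU is free and records its previous user for the
 * fastpath check, RCPULWP_BUSY means the CPU is currently taken, and
 * RCPULWP_WANTED means it is taken and at least one thread is waiting
 * for it to be released.
 */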

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * stats happen to be on a different cache line (CACHE_LINE_SIZE is
 * really just a coarse estimate), so default to the performant case
 * (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

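/*
 * Pick a migration target by advancing a global counter, i.e. hand out
 * virtual CPUs round-robin to threads that failed to get their
 * previous one.
 */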
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	/* reset the counter well before it can wrap */
	if (__predict_false(newcpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);

	kcpuset_create(&kcpuset_attached, true);
	kcpuset_create(&kcpuset_running, true);

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp context,
 * i.e. ensure that curlwp != NULL.  Also, ensure that there is
 * a 1:1 mapping between the lwp and a rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}
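
/*
 * A rough usage sketch: a host thread that wants to run rump kernel
 * code brackets its excursion with the schedule/unschedule pair, e.g.
 *
 *	rump_schedule();
 *	... call into the rump kernel with a valid curlwp ...
 *	rump_unschedule();
 */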

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_data.cpu_onproc = l;
}

void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

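/*
 * Unschedule from a CPU: unless the lwp is itself an interrupt thread,
 * pending soft interrupts for the CPU are run first, and only then is
 * the CPU released.
 */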
void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_data.cpu_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled onto
	 * the rump kernel CPU before our current thread can grab the
	 * interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
sched_enqueue(struct lwp *l, bool swtch)
{

	if (swtch)
		panic("sched_enqueue with switcheroo");
	rump_thread_allow(l);
}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}