/*      $NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $	*/

/*
 * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.44 2016/02/19 18:38:37 pooka Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/queue.h>
#include <sys/select.h>
#include <sys/systm.h>

#include <rump-sys/kern.h>

#include <rump/rumpuser.h>

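/*
 * Virtual CPU scheduling for the rump kernel: a host thread must hold
 * one of the rump kernel's virtual CPUs (rump_schedule()) to run kernel
 * code and gives it up when done (rump_unschedule()).  Each virtual CPU
 * is described by a cache line aligned struct rumpcpu, where
 * rcpu_prevlwp remembers the previous user of the CPU so that a thread
 * rescheduling onto the same CPU can take the lock-free fastpath.
 */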
static struct rumpcpu {
	/* needed in fastpath */
	struct cpu_info *rcpu_ci;
	void *rcpu_prevlwp;

	/* needed in slowpath */
	struct rumpuser_mtx *rcpu_mtx;
	struct rumpuser_cv *rcpu_cv;
	int rcpu_wanted;

	/* offset 20 (P=4) or 36 (P=8) here */

	/*
	 * Some stats.  Not really that necessary, but we should
	 * have room.  Note that these overflow quite fast, so need
	 * to be collected often.
	 */
	unsigned int rcpu_fastpath;
	unsigned int rcpu_slowpath;
	unsigned int rcpu_migrated;

	/* offset 32 (P=4) or 50 (P=8) */

	int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
} rcpu_storage[MAXCPUS];

static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info *ci)
{

	return &rcpu_storage[cpu_index(ci)];
}

struct cpu_info rump_bootcpu;
kcpuset_t *kcpuset_attached = NULL;
kcpuset_t *kcpuset_running = NULL;
int ncpu, ncpuonline;

kmutex_t cpu_lock;

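/*
 * Sentinel values stored in rcpu_prevlwp in place of an lwp pointer:
 * BUSY means the virtual CPU is currently taken, WANTED additionally
 * means that someone is sleeping on rcpu_cv waiting for it.
 */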
#define RCPULWP_BUSY	((void *)-1)
#define RCPULWP_WANTED	((void *)-2)

static struct rumpuser_mtx *lwp0mtx;
static struct rumpuser_cv *lwp0cv;
static unsigned nextcpu;

kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */

static bool lwp0isbusy = false;

/*
 * Keep some stats.
 *
 * Keeping track of them is not really critical for speed, unless
 * the stats happen to be on a different cache line (CACHE_LINE_SIZE
 * is really just a coarse estimate), so default to the performant
 * case (i.e. no stats).
 */
#ifdef RUMPSCHED_STATS
#define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
#define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
#define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
#else
#define SCHED_FASTPATH(rcpu)
#define SCHED_SLOWPATH(rcpu)
#define SCHED_MIGRATED(rcpu)
#endif

struct cpu_info *
cpu_lookup(u_int index)
{

	return rcpu_storage[index].rcpu_ci;
}

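/*
 * Pick the next virtual CPU in round-robin order.  Used to select a
 * migration target when the slowpath fails to acquire the current CPU.
 */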
static inline struct rumpcpu *
getnextcpu(void)
{
	unsigned newcpu;

	newcpu = atomic_inc_uint_nv(&nextcpu);
	if (__predict_false(ncpu > UINT_MAX/2))
		atomic_and_uint(&nextcpu, 0);
	newcpu = newcpu % ncpu;

	return &rcpu_storage[newcpu];
}

/* this could/should be mi_attach_cpu? */
void
rump_cpus_bootstrap(int *nump)
{
	int num = *nump;

	if (num > MAXCPUS) {
		aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
		    "available (adjusted)\n", num, MAXCPUS);
		num = MAXCPUS;
	}

	mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);

	kcpuset_create(&kcpuset_attached, true);
	kcpuset_create(&kcpuset_running, true);

	/* attach first cpu for bootstrap */
	rump_cpu_attach(&rump_bootcpu);
	ncpu = 1;
	*nump = num;
}

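/*
 * Initialize the scheduler: allocate cpu_info structures for all
 * virtual CPUs beyond the bootstrap one and set up the per-CPU
 * hypercall mutex/condvar pairs used by the scheduling slowpath.
 */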
void
rump_scheduler_init(int numcpu)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	int i;

	rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
	rumpuser_cv_init(&lwp0cv);
	for (i = 0; i < numcpu; i++) {
		if (i == 0) {
			ci = &rump_bootcpu;
		} else {
			ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
			ci->ci_index = i;
		}

		rcpu = &rcpu_storage[i];
		rcpu->rcpu_ci = ci;
		rcpu->rcpu_wanted = 0;
		rumpuser_cv_init(&rcpu->rcpu_cv);
		rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);

		ci->ci_schedstate.spc_mutex =
		    mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
		ci->ci_schedstate.spc_flags = SPCF_RUNNING;
	}

	mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
}

/*
 * condvar ops using scheduler lock as the rumpuser interlock.
 */
void
rump_schedlock_cv_wait(struct rumpuser_cv *cv)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
}

int
rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
{
	struct lwp *l = curlwp;
	struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);

	/* mutex will be taken and released in cpu schedule/unschedule */
	return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
	    ts->tv_sec, ts->tv_nsec);
}

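/*
 * lwp0 is a shared bootstrap context (e.g. for implicit thread creation
 * in rump_schedule()).  Users must serialize access to it with
 * lwp0busy()/lwp0rele().
 */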
static void
lwp0busy(void)
{

	/* busy lwp0 */
	KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
	rumpuser_mutex_enter_nowrap(lwp0mtx);
	while (lwp0isbusy)
		rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
	lwp0isbusy = true;
	rumpuser_mutex_exit(lwp0mtx);
}

static void
lwp0rele(void)
{

	rumpuser_mutex_enter_nowrap(lwp0mtx);
	KASSERT(lwp0isbusy == true);
	lwp0isbusy = false;
	rumpuser_cv_signal(lwp0cv);
	rumpuser_mutex_exit(lwp0mtx);
}

/*
 * rump_schedule: ensure that the calling host thread has a valid lwp
 * context, i.e. ensure that curlwp != NULL.  Also, ensure that there
 * is a 1:1 mapping between the lwp and a rump kernel cpu.
 */
void
rump_schedule()
{
	struct lwp *l;

	/*
	 * If there is no dedicated lwp, allocate a temp one and
	 * set it to be free'd upon unschedule().  Use lwp0 context
	 * for reserving the necessary resources.  Don't optimize
	 * for this case -- anyone who cares about performance will
	 * start a real thread.
	 */
	if (__predict_true((l = curlwp) != NULL)) {
		rump_schedule_cpu(l);
		LWP_CACHE_CREDS(l, l->l_proc);
	} else {
		lwp0busy();

		/* schedule cpu and use lwp0 */
		rump_schedule_cpu(&lwp0);
		rump_lwproc_curlwp_set(&lwp0);

		/* allocate thread, switch to it, and release lwp0 */
		l = rump__lwproc_alloclwp(initproc);
		rump_lwproc_switch(l);
		lwp0rele();

		/*
		 * mark new thread dead-on-unschedule.  this
		 * means that we'll be running with l_refcnt == 0.
		 * relax, it's fine.
		 */
		rump_lwproc_releaselwp();
	}
}

void
rump_schedule_cpu(struct lwp *l)
{

	rump_schedule_cpu_interlock(l, NULL);
}

/*
 * Schedule a CPU.  This optimizes for the case where we schedule
 * the same thread often, and we have nCPU >= nFrequently-Running-Thread
 * (where CPU is virtual rump cpu, not host CPU).
 */
void
rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;
	bool domigrate;
	bool bound = l->l_pflag & LP_BOUND;

	l->l_stat = LSRUN;

	/*
	 * First, try fastpath: if we were the previous user of the
	 * CPU, everything is in order cachewise and we can just
	 * proceed to use it.
	 *
	 * If we are a different thread (i.e. CAS fails), we must go
	 * through a memory barrier to ensure we get a truthful
	 * view of the world.
	 */

	KASSERT(l->l_target_cpu != NULL);
	rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
	if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
		if (interlock == rcpu->rcpu_mtx)
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
		SCHED_FASTPATH(rcpu);
		/* jones, you're the man */
		goto fastlane;
	}

	/*
	 * Else, it's the slowpath for us.  First, determine if we
	 * can migrate.
	 */
	if (ncpu == 1)
		domigrate = false;
	else
		domigrate = true;

	/* Take lock.  This acts as a load barrier too. */
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);

	for (;;) {
		SCHED_SLOWPATH(rcpu);
		old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);

		/* CPU is free? */
		if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
			if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
			    RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
				break;
			}
		}

		/*
		 * Do we want to migrate once?
		 * This may need a slightly better algorithm, or we
		 * might cache pingpong eternally for non-frequent
		 * threads.
		 */
		if (domigrate && !bound) {
			domigrate = false;
			SCHED_MIGRATED(rcpu);
			rumpuser_mutex_exit(rcpu->rcpu_mtx);
			rcpu = getnextcpu();
			rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
			continue;
		}

		/* Want CPU, wait until it's released and retry */
		rcpu->rcpu_wanted++;
		rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
		rcpu->rcpu_wanted--;
	}
	rumpuser_mutex_exit(rcpu->rcpu_mtx);

 fastlane:
	ci = rcpu->rcpu_ci;
	l->l_cpu = l->l_target_cpu = ci;
	l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
	l->l_ncsw++;
	l->l_stat = LSONPROC;

	/*
	 * No interrupts, so ci_curlwp === cpu_onproc.
	 * Okay, we could make an attempt to not set cpu_onproc
	 * in the case that an interrupt is scheduled immediately
	 * after a user proc, but leave that for later.
	 */
	ci->ci_curlwp = ci->ci_data.cpu_onproc = l;
}

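/*
 * rump_unschedule: undo rump_schedule(): release the rump kernel CPU
 * and mark the lwp as no longer running.  Also take care of lwps which
 * are exiting or which asked for curlwp to be cleared on unschedule.
 */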
void
rump_unschedule()
{
	struct lwp *l = curlwp;
#ifdef DIAGNOSTIC
	int nlock;

	KERNEL_UNLOCK_ALL(l, &nlock);
	KASSERT(nlock == 0);
#endif

	KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
	rump_unschedule_cpu(l);
	l->l_mutex = &unruntime_lock;
	l->l_stat = LSSTOP;

	/*
	 * Check special conditions:
	 *  1) do we need to free the lwp which just unscheduled?
	 *     (locking order: lwp0, cpu)
	 *  2) do we want to clear curlwp for the current host thread
	 */
	if (__predict_false(l->l_flag & LW_WEXIT)) {
		lwp0busy();

		/* Now that we have lwp0, we can schedule a CPU again */
		rump_schedule_cpu(l);

		/* switch to lwp0.  this frees the old thread */
		KASSERT(l->l_flag & LW_WEXIT);
		rump_lwproc_switch(&lwp0);

		/* release lwp0 */
		rump_unschedule_cpu(&lwp0);
		lwp0.l_mutex = &unruntime_lock;
		lwp0.l_pflag &= ~LP_RUNNING;
		lwp0rele();
		rump_lwproc_curlwp_clear(&lwp0);

	} else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
		rump_lwproc_curlwp_clear(l);
		l->l_flag &= ~LW_RUMP_CLEAR;
	}
}

void
rump_unschedule_cpu(struct lwp *l)
{

	rump_unschedule_cpu_interlock(l, NULL);
}

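/*
 * Release the rump kernel CPU held by l.  Threads other than soft
 * interrupt threads first run any pending softints for the CPU they
 * are about to give up.
 */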
void
rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
{

	if ((l->l_pflag & LP_INTR) == 0)
		rump_softint_run(l->l_cpu);
	rump_unschedule_cpu1(l, interlock);
}

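/*
 * Hand the CPU back to the free pool and, if someone is waiting for
 * it, wake them up.
 */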
void
rump_unschedule_cpu1(struct lwp *l, void *interlock)
{
	struct rumpcpu *rcpu;
	struct cpu_info *ci;
	void *old;

	ci = l->l_cpu;
	ci->ci_curlwp = ci->ci_data.cpu_onproc = NULL;
	rcpu = cpuinfo_to_rumpcpu(ci);

	KASSERT(rcpu->rcpu_ci == ci);

	/*
	 * Make sure all stores are seen before the CPU release.  This
	 * is relevant only in the non-fastpath scheduling case, but
	 * we don't know here if that's going to happen, so need to
	 * expect the worst.
	 *
	 * If the scheduler interlock was requested by the caller, we
	 * need to obtain it before we release the CPU.  Otherwise, we
	 * risk a race condition where another thread is scheduled onto
	 * the rump kernel CPU before our current thread can grab the
	 * interlock.
	 */
	if (interlock == rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	else
		membar_exit();

	/* Release the CPU. */
	old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);

	/* No waiters?  No problems.  We're outta here. */
	if (old == RCPULWP_BUSY) {
		return;
	}

	KASSERT(old == RCPULWP_WANTED);

	/*
	 * Ok, things weren't so snappy.
	 *
	 * Snailpath: take lock and signal anyone waiting for this CPU.
	 */

	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
	if (rcpu->rcpu_wanted)
		rumpuser_cv_broadcast(rcpu->rcpu_cv);
	if (interlock != rcpu->rcpu_mtx)
		rumpuser_mutex_exit(rcpu->rcpu_mtx);
}

/* Give up and retake CPU (perhaps a different one) */
void
yield()
{
	struct lwp *l = curlwp;
	int nlocks;

	KERNEL_UNLOCK_ALL(l, &nlocks);
	rump_unschedule_cpu(l);
	rump_schedule_cpu(l);
	KERNEL_LOCK(nlocks, l);
}

void
preempt()
{

	yield();
}

bool
kpreempt(uintptr_t where)
{

	return false;
}

/*
 * There is no kernel thread preemption in rump currently.  But call
 * the implementing macros anyway in case they grow some side-effects
 * down the road.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

bool
kpreempt_disabled(void)
{
#if 0
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
#endif
	/* XXX: emulate cpu_kpreempt_disabled() */
	return true;
}

void
suspendsched(void)
{

	/*
	 * Could wait until everyone is out and block further entries,
	 * but skip that for now.
	 */
}

void
sched_nice(struct proc *p, int level)
{

	/* nothing to do for now */
}

void
sched_enqueue(struct lwp *l, bool swtch)
{

	if (swtch)
		panic("sched_enqueue with switcheroo");
	rump_thread_allow(l);
}

void
sched_dequeue(struct lwp *l)
{

	panic("sched_dequeue not implemented");
}