xref: /dragonfly/sys/kern/usched_dummy.c (revision 0ac6bf9d)
1 /*
2  * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/kern/usched_dummy.c,v 1.3 2006/06/10 20:19:38 dillon Exp $
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/queue.h>
42 #include <sys/proc.h>
43 #include <sys/rtprio.h>
44 #include <sys/uio.h>
45 #include <sys/sysctl.h>
46 #include <sys/resourcevar.h>
47 #include <sys/spinlock.h>
48 #include <machine/ipl.h>
49 #include <machine/cpu.h>
50 #include <machine/smp.h>
51 
52 #include <sys/thread2.h>
53 #include <sys/spinlock2.h>
54 
55 #define MAXPRI			128
56 #define PRIBASE_REALTIME	0
57 #define PRIBASE_NORMAL		MAXPRI
58 #define PRIBASE_IDLE		(MAXPRI * 2)
59 #define PRIBASE_THREAD		(MAXPRI * 3)
60 #define PRIBASE_NULL		(MAXPRI * 4)
61 
62 #define lwp_priority	lwp_usdata.bsd4.priority
63 #define lwp_estcpu	lwp_usdata.bsd4.estcpu
64 
65 static void dummy_acquire_curproc(struct lwp *lp);
66 static void dummy_release_curproc(struct lwp *lp);
67 static void dummy_select_curproc(globaldata_t gd);
68 static void dummy_setrunqueue(struct lwp *lp);
69 static void dummy_schedulerclock(struct lwp *lp, sysclock_t period,
70 				sysclock_t cpstamp);
71 static void dummy_recalculate_estcpu(struct lwp *lp);
72 static void dummy_resetpriority(struct lwp *lp);
73 static void dummy_forking(struct lwp *plp, struct lwp *lp);
74 static void dummy_exiting(struct lwp *plp, struct lwp *lp);
75 
76 struct usched usched_dummy = {
77 	{ NULL },
78 	"dummy", "Dummy DragonFly Scheduler",
79 	NULL,			/* default registration */
80 	NULL,			/* default deregistration */
81 	dummy_acquire_curproc,
82 	dummy_release_curproc,
83 	dummy_setrunqueue,
84 	dummy_schedulerclock,
85 	dummy_recalculate_estcpu,
86 	dummy_resetpriority,
87 	dummy_forking,
88 	dummy_exiting,
89 	NULL			/* setcpumask not supported */
90 };
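
/*
 * Illustrative sketch only: the initializer above is a positional dispatch
 * table of scheduler hooks.  The toy structure below, with made-up names,
 * shows the function-pointer pattern used to invoke a policy; it is not the
 * real struct usched layout.
 */
#if 0
struct toy_sched_ops {
	void	(*acquire)(struct lwp *lp);	/* e.g. dummy_acquire_curproc */
	void	(*release)(struct lwp *lp);	/* e.g. dummy_release_curproc */
};

static void
toy_return_to_user(struct toy_sched_ops *ops, struct lwp *lp)
{
	/* callers go through the table, never the policy functions directly */
	ops->acquire(lp);
}
#endif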
91 
92 struct usched_dummy_pcpu {
93 	int	rrcount;
94 	struct thread helper_thread;
95 	struct lwp *uschedcp;
96 };
97 
98 typedef struct usched_dummy_pcpu *dummy_pcpu_t;
99 
100 static struct usched_dummy_pcpu dummy_pcpu[MAXCPU];
101 static cpumask_t dummy_curprocmask = -1;
102 static cpumask_t dummy_rdyprocmask;
103 static struct spinlock dummy_spin;
104 static TAILQ_HEAD(rq, lwp) dummy_runq;
105 static int dummy_runqcount;
106 
107 static int usched_dummy_rrinterval = (ESTCPUFREQ + 9) / 10;
108 SYSCTL_INT(_kern, OID_AUTO, usched_dummy_rrinterval, CTLFLAG_RW,
109         &usched_dummy_rrinterval, 0, "Round robin interval (scheduler ticks)");
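
/*
 * Worked example (assuming ESTCPUFREQ has its usual value of 50 hz): the
 * default round-robin interval is (50 + 9) / 10 = 5 scheduler ticks, i.e.
 * roughly a 1/10 second quantum before dummy_schedulerclock() asks for a
 * user reschedule.  A minimal sketch of the same arithmetic:
 */
#if 0
static __inline int
rrinterval_for(int estcpufreq)
{
	/* 50 hz -> 5 ticks, 100 hz -> 10 ticks (~100ms either way) */
	return ((estcpufreq + 9) / 10);
}
#endif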
110 
111 /*
112  * Initialize the run queues at boot time, clear cpu 0 in curprocmask
113  * to allow dummy scheduling on cpu 0.
114  */
115 static void
116 dummyinit(void *dummy)
117 {
118 	TAILQ_INIT(&dummy_runq);
119 	spin_init(&dummy_spin);
120 	atomic_clear_int(&dummy_curprocmask, 1);
121 }
122 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, dummyinit, NULL)
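
/*
 * Mask convention sketch: bit N of dummy_curprocmask set means cpu N already
 * has a designated current user process (or is closed to user scheduling),
 * which is why dummyinit() clears bit 0 to open up cpu 0.  The helpers below
 * are illustrative only; the file manipulates the masks directly with
 * atomic_set_int()/atomic_clear_int().
 */
#if 0
static __inline int
dummy_cpu_busy(int cpuid)
{
	return ((dummy_curprocmask & (1 << cpuid)) != 0);
}

static __inline void
dummy_open_cpu(int cpuid)
{
	atomic_clear_int(&dummy_curprocmask, 1 << cpuid);
}
#endif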
123 
124 /*
125  * DUMMY_ACQUIRE_CURPROC
126  *
127  * This function is called when the kernel intends to return to userland.
128  * It is responsible for making the thread the current designated userland
129  * thread for this cpu, blocking if necessary.
130  *
131  * We are expected to handle userland reschedule requests here too.
132  *
133  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
134  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
135  * occur, this function is called only under very controlled circumstances.
136  *
137  * MPSAFE
138  */
139 static void
140 dummy_acquire_curproc(struct lwp *lp)
141 {
142 	globaldata_t gd = mycpu;
143 	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
144 	thread_t td = lp->lwp_thread;
145 
146 	/*
147 	 * Possibly select another thread
148 	 */
149 	if (user_resched_wanted())
150 		dummy_select_curproc(gd);
151 
152 	/*
153 	 * If this cpu has no current thread, select ourself
154 	 */
155 	if (dd->uschedcp == NULL && TAILQ_EMPTY(&dummy_runq)) {
156 		atomic_set_int(&dummy_curprocmask, gd->gd_cpumask);
157 		dd->uschedcp = lp;
158 		return;
159 	}
160 
161 	/*
162 	 * If this cpu's current user process thread is not our thread,
163 	 * deschedule ourselves and place us on the run queue, then
164 	 * switch away.
165 	 *
166 	 * We loop until we become the current process.  It's a good idea
167 	 * to run any passive release(s) before we mess with the scheduler
168 	 * so our thread is in the expected state.
169 	 */
170 	KKASSERT(dd->uschedcp != lp);
171 	if (td->td_release)
172 		td->td_release(lp->lwp_thread);
173 	do {
174 		crit_enter();
175 		lwkt_deschedule_self(td);
176 		dummy_setrunqueue(lp);
177 		if ((td->td_flags & TDF_RUNQ) == 0)
178 			++lp->lwp_stats->p_ru.ru_nivcsw;
179 		lwkt_switch();		/* WE MAY MIGRATE TO ANOTHER CPU */
180 		crit_exit();
181 		gd = mycpu;
182 		dd = &dummy_pcpu[gd->gd_cpuid];
183 		KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
184 	} while (dd->uschedcp != lp);
185 }
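
/*
 * Context sketch with hypothetical call sites (the names below are assumed
 * for illustration and are not taken from this file): the kernel is expected
 * to call acquire_curproc() on its way back to userland and release_curproc()
 * when the thread stops competing for the userland slot, e.g. while it runs
 * in the kernel at kernel priority.
 */
#if 0
static void
hypothetical_userret(struct lwp *lp)
{
	/* blocks until this lwp is the cpu's designated user process */
	lp->lwp_proc->p_usched->acquire_curproc(lp);
}
#endif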
186 
187 /*
188  * DUMMY_RELEASE_CURPROC
189  *
190  * This routine detaches the current thread from the userland scheduler,
191  * usually because the thread needs to run in the kernel (at kernel priority)
192  * for a while.
193  *
194  * This routine is also responsible for selecting a new thread to
195  * make the current thread.
196  *
197  * WARNING!  The MP lock may be in an unsynchronized state due to the
198  * way get_mplock() works and the fact that this function may be called
199  * from a passive release during a lwkt_switch().   try_mplock() will deal
200  * with this for us but you should be aware that td_mpcount may not be
201  * usable.
202  *
203  * MPSAFE
204  */
205 static void
206 dummy_release_curproc(struct lwp *lp)
207 {
208 	globaldata_t gd = mycpu;
209 	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
210 
211 	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
212 	if (dd->uschedcp == lp) {
213 		dummy_select_curproc(gd);
214 	}
215 }
216 
217 /*
218  * DUMMY_SELECT_CURPROC
219  *
220  * Select a new current process for this cpu.  This satisfies a user
221  * scheduler reschedule request so clear that too.
222  *
223  * This routine is also responsible for equal-priority round-robining,
224  * typically triggered from dummy_schedulerclock().  In our dummy example
225  * all the 'user' threads are LWKT scheduled all at once and we just
226  * call lwkt_switch().
227  *
228  * MPSAFE
229  */
230 static
231 void
232 dummy_select_curproc(globaldata_t gd)
233 {
234 	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
235 	struct lwp *lp;
236 
237 	clear_user_resched();
238 	spin_lock_wr(&dummy_spin);
239 	if ((lp = TAILQ_FIRST(&dummy_runq)) == NULL) {
240 		dd->uschedcp = NULL;
241 		atomic_clear_int(&dummy_curprocmask, gd->gd_cpumask);
242 		spin_unlock_wr(&dummy_spin);
243 	} else {
244 		--dummy_runqcount;
245 		TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
246 		lp->lwp_proc->p_flag &= ~P_ONRUNQ;
247 		dd->uschedcp = lp;
248 		atomic_set_int(&dummy_curprocmask, gd->gd_cpumask);
249 		spin_unlock_wr(&dummy_spin);
250 #ifdef SMP
251 		lwkt_acquire(lp->lwp_thread);
252 #endif
253 		lwkt_schedule(lp->lwp_thread);
254 	}
255 }
256 
257 /*
258  * DUMMY_SETRUNQUEUE
259  *
260  * This routine is called to schedule a new user process after a fork.
261  * The scheduler module itself might also call this routine to place
262  * the current process on the userland scheduler's run queue prior
263  * to calling dummy_select_curproc().
264  *
265  * The caller may set P_PASSIVE_ACQ in p_flag to indicate that we should
266  * attempt to leave the thread on the current cpu.
267  *
268  * MPSAFE
269  */
270 static void
271 dummy_setrunqueue(struct lwp *lp)
272 {
273 	globaldata_t gd = mycpu;
274 	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
275 	cpumask_t mask;
276 	int cpuid;
277 
278 	if (dd->uschedcp == NULL) {
279 		dd->uschedcp = lp;
280 		atomic_set_int(&dummy_curprocmask, gd->gd_cpumask);
281 		lwkt_schedule(lp->lwp_thread);
282 	} else {
283 		/*
284 		 * Add to our global runq
285 		 */
286 		KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
287 		spin_lock_wr(&dummy_spin);
288 		++dummy_runqcount;
289 		TAILQ_INSERT_TAIL(&dummy_runq, lp, lwp_procq);
290 		lp->lwp_proc->p_flag |= P_ONRUNQ;
291 #ifdef SMP
292 		lwkt_giveaway(lp->lwp_thread);
293 #endif
294 
295 		/* lp = TAILQ_FIRST(&dummy_runq); */
296 
297 		/*
298 		 * Notify the next available cpu.  Note that some cpu
299 		 * affinity could be done here.
300 		 *
301 		 * The rdyprocmask bit records the fact that there
302 		 * is a process on the runq that needs service.  If the
303 		 * helper thread cannot find a home for it, it will forward
304 		 * the request to another available cpu.
305 		 */
306 		mask = ~dummy_curprocmask & dummy_rdyprocmask &
307 		       gd->gd_other_cpus;
308 		if (mask) {
309 			cpuid = bsfl(mask);
310 			atomic_clear_int(&dummy_rdyprocmask, 1 << cpuid);
311 			spin_unlock_wr(&dummy_spin);
312 			lwkt_schedule(&dummy_pcpu[cpuid].helper_thread);
313 		} else {
314 			spin_unlock_wr(&dummy_spin);
315 		}
316 	}
317 }
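
/*
 * Sketch of the cpu-selection step above: the mask intersects "no current
 * user process" (~dummy_curprocmask), "helper thread idle and ready"
 * (dummy_rdyprocmask) and "not this cpu" (gd_other_cpus); bsfl() then yields
 * the index of the lowest set bit, i.e. the lowest-numbered eligible cpu.
 * A portable equivalent, for illustration only:
 */
#if 0
static __inline int
lowest_set_cpu(cpumask_t mask)
{
	int cpuid;

	for (cpuid = 0; cpuid < MAXCPU; ++cpuid) {
		if (mask & (1 << cpuid))
			return (cpuid);
	}
	return (-1);		/* no eligible cpu */
}
#endif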
318 
319 /*
320  * This routine is called from a systimer IPI.  Thus it is called with
321  * a critical section held.  Any spinlocks we get here that are also
322  * obtained in other procedures must be protected by a critical section
323  * in those other procedures to avoid a deadlock.
324  *
325  * The MP lock may or may not be held on entry and cannot be obtained
326  * by this routine (because it is called from a systimer IPI).  Additionally,
327  * because this is equivalent to a FAST interrupt, spinlocks cannot be used
328  * (or at least, you have to check that gd_spin* counts are 0 before you
329  * can).
330  *
331  * This routine is called at ESTCPUFREQ on each cpu independently.
332  *
333  * This routine typically queues a reschedule request, which will cause
334  * the scheduler's dummy_select_curproc() to be called as soon as possible.
335  *
336  * MPSAFE
337  */
338 static
339 void
340 dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
341 {
342 	globaldata_t gd = mycpu;
343 	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
344 
345 	if (++dd->rrcount >= usched_dummy_rrinterval) {
346 		dd->rrcount = 0;
347 		need_user_resched();
348 	}
349 }
350 
351 /*
352  * DUMMY_RECALCULATE_ESTCPU
353  *
354  * Called once a second for any process that is running or has slept
355  * for less than 2 seconds.
356  *
357  * MPSAFE
358  */
359 static
360 void
361 dummy_recalculate_estcpu(struct lwp *lp)
362 {
363 }
364 
365 /*
366  * DUMMY_RESETPRIORITY
367  *
368  * This routine is called after the kernel has potentially modified
369  * the lwp_rtprio structure.  The target process may be running or sleeping
370  * or scheduled but not yet running or owned by another cpu.  Basically,
371  * it can be in virtually any state.
372  *
373  * This routine is called by fork1() for initial setup with the process
374  * off the run queue, and also may be called normally with the process on or
375  * off the run queue.
376  *
377  * MPSAFE
378  */
379 static void
380 dummy_resetpriority(struct lwp *lp)
381 {
382 	/* XXX spinlock usually needed */
383 	/*
384 	 * Set p_priority for general process comparisons
385 	 */
386 	switch(lp->lwp_rtprio.type) {
387 	case RTP_PRIO_REALTIME:
388 		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
389 		return;
390 	case RTP_PRIO_NORMAL:
391 		lp->lwp_priority = PRIBASE_NORMAL + lp->lwp_rtprio.prio;
392 		break;
393 	case RTP_PRIO_IDLE:
394 		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
395 		return;
396 	case RTP_PRIO_THREAD:
397 		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
398 		return;
399 	}
400 	/* XXX spinlock usually needed */
401 }
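
/*
 * Worked example of the priority bands above: lwp_priority values compare as
 * plain integers, lower meaning more urgent, and each rtprio class occupies
 * its own MAXPRI-wide band so classes never interleave.  For instance, a
 * realtime lwp with prio 10 maps to 0 + 10 = 10, a normal lwp with prio 10
 * maps to 128 + 10 = 138, and an idle-class lwp with prio 0 maps to 256.
 */
#if 0
static void
priority_band_examples(void)
{
	KKASSERT(PRIBASE_REALTIME + 10 < PRIBASE_NORMAL);	/* 10 < 128 */
	KKASSERT(PRIBASE_NORMAL + 10 < PRIBASE_IDLE);		/* 138 < 256 */
	KKASSERT(PRIBASE_IDLE < PRIBASE_THREAD);		/* 256 < 384 */
}
#endif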
402 
403 
404 /*
405  * DUMMY_FORKING
406  *
407  * Called from fork1() when a new child process is being created.  Allows
408  * the scheduler to predispose the child process before it gets scheduled.
409  *
410  * MPSAFE
411  */
412 static void
413 dummy_forking(struct lwp *plp, struct lwp *lp)
414 {
415 	lp->lwp_estcpu = plp->lwp_estcpu;
416 #if 0
417 	++plp->lwp_estcpu;
418 #endif
419 }
420 
421 /*
422  * DUMMY_EXITING
423  *
424  * Called when the parent reaps a child.  Typically used to propagate cpu
425  * use by the child back to the parent as part of a batch detection
426  * heuristic.
427  *
428  * NOTE: cpu use is not normally back-propagated to PID 1.
429  *
430  * MPSAFE
431  */
432 static void
433 dummy_exiting(struct lwp *plp, struct lwp *lp)
434 {
435 }
436 
437 /*
438  * SMP systems may need a scheduler helper thread.  This is how one can be
439  * set up.
440  *
441  * We use a neat LWKT scheduling trick to interlock the helper thread.  It
442  * is possible to deschedule an LWKT thread and then do some work before
443  * switching away.  The thread can be rescheduled at any time, even before
444  * we switch away.
445  */
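
/*
 * Interlock pattern sketch (generic names, illustrative only): descheduling
 * ourselves before doing the work arms the interlock.  If a wakeup arrives
 * while we are still scanning, it simply re-schedules this thread, so the
 * trailing lwkt_switch() returns immediately instead of losing the event.
 */
#if 0
static void
hypothetical_helper_loop(void)
{
	for (;;) {
		lwkt_deschedule_self(curthread);  /* arm the interlock */
		advertise_ready();	/* hypothetical: set our rdyprocmask bit */
		service_runq();		/* hypothetical: look for work */
		lwkt_switch();		/* sleeps only if nobody rescheduled us */
	}
}
#endif
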
446 #ifdef SMP
447 
448 static void
449 dummy_sched_thread(void *dummy)
450 {
451     globaldata_t gd;
452     dummy_pcpu_t dd;
453     struct lwp *lp;
454     cpumask_t cpumask;
455     cpumask_t tmpmask;
456     int cpuid;
457     int tmpid;
458 
459     gd = mycpu;
460     cpuid = gd->gd_cpuid;
461     dd = &dummy_pcpu[cpuid];
462     cpumask = 1 << cpuid;
463 
464     /*
465      * Our scheduler helper thread does not need to hold the MP lock
466      */
467     rel_mplock();
468 
469     for (;;) {
470 	lwkt_deschedule_self(gd->gd_curthread);		/* interlock */
471 	atomic_set_int(&dummy_rdyprocmask, cpumask);
472 	spin_lock_wr(&dummy_spin);
473 	if (dd->uschedcp) {
474 		/*
475 		 * We raced another cpu trying to schedule a thread onto us.
476 		 * If the runq isn't empty, hit another free cpu.
477 		 */
478 		tmpmask = ~dummy_curprocmask & dummy_rdyprocmask &
479 		          gd->gd_other_cpus;
480 		if (tmpmask && dummy_runqcount) {
481 			tmpid = bsfl(tmpmask);
482 			KKASSERT(tmpid != cpuid);
483 			atomic_clear_int(&dummy_rdyprocmask, 1 << tmpid);
484 			spin_unlock_wr(&dummy_spin);
485 			lwkt_schedule(&dummy_pcpu[tmpid].helper_thread);
486 		} else {
487 			spin_unlock_wr(&dummy_spin);
488 		}
489 	} else if ((lp = TAILQ_FIRST(&dummy_runq)) != NULL) {
490 		--dummy_runqcount;
491 		TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
492 		lp->lwp_proc->p_flag &= ~P_ONRUNQ;
493 		dd->uschedcp = lp;
494 		atomic_set_int(&dummy_curprocmask, cpumask);
495 		spin_unlock_wr(&dummy_spin);
496 #ifdef SMP
497 		lwkt_acquire(lp->lwp_thread);
498 #endif
499 		lwkt_schedule(lp->lwp_thread);
500 	} else {
501 		spin_unlock_wr(&dummy_spin);
502 	}
503 	lwkt_switch();
504     }
505 }
506 
507 /*
508  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
509  * been cleared by dummyinit() and we should not mess with it further.
510  */
511 static void
512 dummy_sched_thread_cpu_init(void)
513 {
514     int i;
515 
516     if (bootverbose)
517 	printf("start dummy scheduler helpers on cpus:");
518 
519     for (i = 0; i < ncpus; ++i) {
520 	dummy_pcpu_t dd = &dummy_pcpu[i];
521 	cpumask_t mask = 1 << i;
522 
523 	if ((mask & smp_active_mask) == 0)
524 	    continue;
525 
526 	if (bootverbose)
527 	    printf(" %d", i);
528 
529 	lwkt_create(dummy_sched_thread, NULL, NULL, &dd->helper_thread,
530 		    TDF_STOPREQ, i, "dsched %d", i);
531 
532 	/*
533 	 * Allow user scheduling on the target cpu.  cpu #0 has already
534 	 * been enabled in dummyinit().
535 	 */
536 	if (i)
537 	    atomic_clear_int(&dummy_curprocmask, mask);
538 	atomic_set_int(&dummy_rdyprocmask, mask);
539     }
540     if (bootverbose)
541 	printf("\n");
542 }
543 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY,
544 	dummy_sched_thread_cpu_init, NULL)
545 
546 #endif
547 
548