xref: /dragonfly/sys/kern/usched_bsd4.c (revision f3025b16)
1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/kernel.h>
30 #include <sys/lock.h>
31 #include <sys/queue.h>
32 #include <sys/proc.h>
33 #include <sys/rtprio.h>
34 #include <sys/uio.h>
35 #include <sys/sysctl.h>
36 #include <sys/resourcevar.h>
37 #include <sys/spinlock.h>
38 #include <sys/cpu_topology.h>
39 #include <sys/thread2.h>
40 #include <sys/spinlock2.h>
41 #include <sys/mplock2.h>
42 
43 #include <sys/ktr.h>
44 
45 #include <machine/cpu.h>
46 #include <machine/smp.h>
47 
48 /*
49  * Priorities.  Note that with 32 run queues per scheduler each queue
50  * represents four priority levels.
51  */
52 
53 #define MAXPRI			128
54 #define PRIMASK			(MAXPRI - 1)
55 #define PRIBASE_REALTIME	0
56 #define PRIBASE_NORMAL		MAXPRI
57 #define PRIBASE_IDLE		(MAXPRI * 2)
58 #define PRIBASE_THREAD		(MAXPRI * 3)
59 #define PRIBASE_NULL		(MAXPRI * 4)
60 
61 #define NQS	32			/* 32 run queues. */
62 #define PPQ	(MAXPRI / NQS)		/* priorities per queue */
63 #define PPQMASK	(PPQ - 1)
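/*
 * Illustrative mapping derived from the defines above: with MAXPRI 128 and
 * NQS 32, PPQ is 4, so e.g. a masked priority of 37 maps to run queue index
 * 37 / PPQ == 9, and PPQMASK (3) selects the sub-queue bits within a slot.
 */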
64 
65 /*
66  * NICEPPQ	- number of nice units per priority queue
67  *
68  * ESTCPUPPQ	- number of estcpu units per priority queue
69  * ESTCPUMAX	- number of estcpu units
70  */
71 #define NICEPPQ		2
72 #define ESTCPUPPQ	512
73 #define ESTCPUMAX	(ESTCPUPPQ * NQS)
74 #define BATCHMAX	(ESTCPUFREQ * 30)
75 #define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)
76 
77 #define ESTCPULIM(v)	min((v), ESTCPUMAX)
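/*
 * For reference: ESTCPUMAX works out to 512 * 32 == 16384 estcpu units,
 * i.e. roughly one run-queue step (PPQ) per ESTCPUPPQ (512) units of estcpu.
 */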
78 
79 TAILQ_HEAD(rq, lwp);
80 
81 #define lwp_priority	lwp_usdata.bsd4.priority
82 #define lwp_rqindex	lwp_usdata.bsd4.rqindex
83 #define lwp_estcpu	lwp_usdata.bsd4.estcpu
84 #define lwp_batch	lwp_usdata.bsd4.batch
85 #define lwp_rqtype	lwp_usdata.bsd4.rqtype
86 
87 static void bsd4_acquire_curproc(struct lwp *lp);
88 static void bsd4_release_curproc(struct lwp *lp);
89 static void bsd4_select_curproc(globaldata_t gd);
90 static void bsd4_setrunqueue(struct lwp *lp);
91 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
92 				sysclock_t cpstamp);
93 static void bsd4_recalculate_estcpu(struct lwp *lp);
94 static void bsd4_resetpriority(struct lwp *lp);
95 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
96 static void bsd4_exiting(struct lwp *lp, struct proc *);
97 static void bsd4_yield(struct lwp *lp);
98 
99 #ifdef SMP
100 static void need_user_resched_remote(void *dummy);
101 static int batchy_looser_pri_test(struct lwp* lp);
102 static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp);
103 #endif
104 static struct lwp *chooseproc_locked(struct lwp *chklp);
105 static void bsd4_remrunqueue_locked(struct lwp *lp);
106 static void bsd4_setrunqueue_locked(struct lwp *lp);
107 
108 struct usched usched_bsd4 = {
109 	{ NULL },
110 	"bsd4", "Original DragonFly Scheduler",
111 	NULL,			/* default registration */
112 	NULL,			/* default deregistration */
113 	bsd4_acquire_curproc,
114 	bsd4_release_curproc,
115 	bsd4_setrunqueue,
116 	bsd4_schedulerclock,
117 	bsd4_recalculate_estcpu,
118 	bsd4_resetpriority,
119 	bsd4_forking,
120 	bsd4_exiting,
121 	NULL,			/* setcpumask not supported */
122 	bsd4_yield
123 };
124 
125 struct usched_bsd4_pcpu {
126 	struct thread	helper_thread;
127 	short		rrcount;
128 	short		upri;
129 	struct lwp	*uschedcp;
130 	struct lwp	*old_uschedcp;
131 #ifdef SMP
132 	cpu_node_t	*cpunode;
133 #endif
134 };
135 
136 typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;
137 
138 /*
139  * We have NQS (32) run queues per scheduling class.  For the normal
140  * class, there are 128 priorities scaled onto these 32 queues.  New
141  * processes are added to the last entry in each queue, and processes
142  * are selected for running by taking them from the head and maintaining
143  * a simple FIFO arrangement.  Realtime and Idle priority processes have
144  * an explicit 0-31 priority which maps directly onto their class queue
145  * index.  When a queue has something in it, the corresponding bit is
146  * set in the queuebits variable, allowing a single read to determine
147  * the state of all 32 queues and then a ffs() to find the first busy
148  * queue.
149  */
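/*
 * Example of the queuebits lookup described above: if bsd4_queuebits is
 * 0x00000014 then queues 2 and 4 are non-empty and an ffs-style scan
 * (bsfl() in chooseproc_locked()) returns 2, the best busy index.
 */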
150 static struct rq bsd4_queues[NQS];
151 static struct rq bsd4_rtqueues[NQS];
152 static struct rq bsd4_idqueues[NQS];
153 static u_int32_t bsd4_queuebits;
154 static u_int32_t bsd4_rtqueuebits;
155 static u_int32_t bsd4_idqueuebits;
156 static cpumask_t bsd4_curprocmask = -1;	/* currently running a user process */
157 static cpumask_t bsd4_rdyprocmask;	/* ready to accept a user process */
158 static int	 bsd4_runqcount;
159 #ifdef SMP
160 static volatile int bsd4_scancpu;
161 #endif
162 static struct spinlock bsd4_spin;
163 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
164 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
165 static struct sysctl_oid *usched_bsd4_sysctl_tree;
166 
167 /* Debug info exposed through debug.* sysctl */
168 
169 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
170     "Number of run queues");
171 #ifdef INVARIANTS
172 static int usched_nonoptimal;
173 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
174         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
175 static int usched_optimal;
176 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
177         &usched_optimal, 0, "acquire_curproc() was optimal");
178 #endif
179 
180 static int usched_bsd4_debug = -1;
181 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0,
182     "Print debug information for this pid");
183 static int usched_bsd4_pid_debug = -1;
184 SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0,
185     "Print KTR debug information for this pid");
186 
187 #ifdef SMP
188 static int remote_resched_nonaffinity;
189 static int remote_resched_affinity;
190 static int choose_affinity;
191 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
192         &remote_resched_nonaffinity, 0, "Number of remote rescheds");
193 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
194         &remote_resched_affinity, 0, "Number of remote rescheds");
195 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
196         &choose_affinity, 0, "chooseproc() was smart");
197 #endif
198 
199 
200 /* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
201 #ifdef SMP
202 static int usched_bsd4_smt = 0;
203 static int usched_bsd4_cache_coherent = 0;
204 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
205 static int usched_bsd4_queue_checks = 5;
206 static int usched_bsd4_stick_to_level = 0;
207 #endif
208 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
209 static int usched_bsd4_decay = 8;
210 static int usched_bsd4_batch_time = 10;
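/*
 * Note on the defaults above: usched_bsd4_rrinterval is derived from
 * ESTCPUFREQ so the round-robin check in bsd4_schedulerclock() fires about
 * 10 times per second, and usched_bsd4_decay (8) weights estcpu history
 * 8:1 against the newly measured value in bsd4_recalculate_estcpu().
 */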
211 
212 /* KTR debug printings */
213 
214 KTR_INFO_MASTER(usched);
215 
216 #if !defined(KTR_USCHED_BSD4)
217 #define	KTR_USCHED_BSD4	KTR_ALL
218 #endif
219 
220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
221     "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
222     "after release: pid %d, cpuid %d, curr_cpuid %d)",
223     pid_t pid, int cpuid, int curr);
224 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
225     "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
226     "curr_cpuid %d)",
227     pid_t pid, int cpuid, int curr);
228 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
229     "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
230     "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
231     pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
232 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
233     "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
234     "cpuid %d, curr_cpuid %d)",
235     pid_t pid, int cpuid, int curr);
236 
237 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
238     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
239     "cpuid %d, curr_cpuid %d)",
240     pid_t pid, int cpuid, int curr);
241 
242 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
243     "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
244     "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
245     pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);
246 
247 #ifdef SMP
248 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
249     "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
250     "cpuid %d, verify_mask %lu)",
251     pid_t pid, int cpuid, cpumask_t mask);
252 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
253     "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
254     "cpuid %d, verify_mask %lu)",
255     pid_t pid, int cpuid, cpumask_t mask);
256 
257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
258     "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
259     "mask %lu, curr_cpuid %d)",
260     pid_t pid, int cpuid, cpumask_t mask, int curr);
261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
262     "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
263     "cpuid %d, mask %lu, curr_cpuid %d)",
264     pid_t pid, int cpuid, cpumask_t mask, int curr);
265 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
266     "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
267     "cpuid %d, mask %lu, curr_cpuid %d)",
268     pid_t pid, int cpuid, cpumask_t mask, int curr);
269 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
270     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
271     "mask %lu, found_cpuid %d, curr_cpuid %d)",
272     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
273 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
274     "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
275     "try_cpuid %d, curr_cpuid %d)",
276     pid_t pid, int cpuid, int try_cpuid, int curr);
277 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
278     "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
279     "mask %lu, found_cpuid %d, curr_cpuid %d)",
280     pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
281 #endif
282 
283 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
284     "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
285     pid_t pid, int old_cpuid, int curr);
286 #ifdef SMP
287 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
288     "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
289     pid_t pid, int old_cpuid, int curr);
290 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
291     "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
292     "sibling_mask %lu, curr_cpumask %lu)",
293     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
294 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
295     "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
296     "sibling_mask %lu, curr_cpumask: %lu)",
297     pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
298 
299 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
300     "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
301     int id, pid_t pid, int cpuid);
302 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
303     "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
304     int id, pid_t pid, int cpuid);
305 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
306     "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
307     int id, cpumask_t tmpmask);
308 #endif
309 
310 /*
311  * Initialize the run queues at boot time.
312  */
313 static void
314 rqinit(void *dummy)
315 {
316 	int i;
317 
318 	spin_init(&bsd4_spin);
319 	for (i = 0; i < NQS; i++) {
320 		TAILQ_INIT(&bsd4_queues[i]);
321 		TAILQ_INIT(&bsd4_rtqueues[i]);
322 		TAILQ_INIT(&bsd4_idqueues[i]);
323 	}
324 	atomic_clear_cpumask(&bsd4_curprocmask, 1);
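	/*
	 * bsd4_curprocmask starts out with all bits set; clearing bit 0
	 * here leaves the boot cpu marked as not running a user process,
	 * presumably so it can accept the first one (e.g. init) at boot.
	 */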
325 }
326 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)
327 
328 /*
329  * BSD4_ACQUIRE_CURPROC
330  *
331  * This function is called when the kernel intends to return to userland.
332  * It is responsible for making the thread the current designated userland
333  * thread for this cpu, blocking if necessary.
334  *
335  * The kernel has already depressed our LWKT priority so we must not switch
336  * until we have either assigned or disposed of the thread.
337  *
338  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
339  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
340  * occur, this function is called only under very controlled circumstances.
341  *
342  * MPSAFE
343  */
344 static void
345 bsd4_acquire_curproc(struct lwp *lp)
346 {
347 	globaldata_t gd;
348 	bsd4_pcpu_t dd;
349 	thread_t td;
350 #if 0
351 	struct lwp *olp;
352 #endif
353 
354 	/*
355 	 * Make sure we aren't sitting on a tsleep queue.
356 	 */
357 	td = lp->lwp_thread;
358 	crit_enter_quick(td);
359 	if (td->td_flags & TDF_TSLEEPQ)
360 		tsleep_remove(td);
361 	bsd4_recalculate_estcpu(lp);
362 
363 	/*
364 	 * If a reschedule was requested give another thread the
365 	 * driver's seat.
366 	 */
367 	if (user_resched_wanted()) {
368 		clear_user_resched();
369 		bsd4_release_curproc(lp);
370 
371 		KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
372 		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
373 		    lp->lwp_proc->p_pid,
374 		    lp->lwp_thread->td_gd->gd_cpuid,
375 		    mycpu->gd_cpuid);
376 	}
377 
378 	/*
379 	 * Loop until we are the current user thread
380 	 */
381 	gd = mycpu;
382 	dd = &bsd4_pcpu[gd->gd_cpuid];
383 
384 	KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
385 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
386 	    lp->lwp_proc->p_pid,
387 	    lp->lwp_thread->td_gd->gd_cpuid,
388 	    gd->gd_cpuid);
389 
390 	do {
391 		/*
392 		 * Process any pending events and higher priority threads.
393 		 */
394 		lwkt_yield();
395 
396 		/*
397 		 * Become the currently scheduled user thread for this cpu
398 		 * if we can do so trivially.
399 		 *
400 		 * We can steal another thread's current thread designation
401 		 * on this cpu since if we are running that other thread
402 		 * must not be, so we can safely deschedule it.
403 		 */
404 		if (dd->uschedcp == lp) {
405 			/*
406 			 * We are already the current lwp (hot path).
407 			 */
408 			dd->upri = lp->lwp_priority;
409 		} else if (dd->uschedcp == NULL) {
410 			/*
411 			 * We can trivially become the current lwp.
412 			 */
413 			atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
414 			dd->uschedcp = lp;
415 			dd->upri = lp->lwp_priority;
416 		} else if (dd->upri > lp->lwp_priority) {
417 			/*
418 			 * We can steal the current cpu's lwp designation
419 			 * away simply by replacing it.  The other thread
420 			 * will stall when it tries to return to userland.
421 			 */
422 			dd->uschedcp = lp;
423 			dd->upri = lp->lwp_priority;
424 			/*
425 			lwkt_deschedule(olp->lwp_thread);
426 			bsd4_setrunqueue(olp);
427 			*/
428 		} else {
429 			/*
430 			 * We cannot become the current lwp, place the lp
431 			 * on the bsd4 run-queue and deschedule ourselves.
432 			 *
433 			 * When we are reactivated we will have another
434 			 * chance.
435 			 */
436 			lwkt_deschedule(lp->lwp_thread);
437 
438 			bsd4_setrunqueue(lp);
439 
440 			KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
441 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
442 			    lp->lwp_proc->p_pid,
443 			    lp->lwp_thread->td_gd->gd_cpuid,
444 			    dd->uschedcp->lwp_proc->p_pid,
445 			    gd->gd_cpuid);
446 
447 
448 			lwkt_switch();
449 
450 			/*
451 			 * Reload after a switch or setrunqueue/switch possibly
452 			 * moved us to another cpu.
453 			 */
454 			gd = mycpu;
455 			dd = &bsd4_pcpu[gd->gd_cpuid];
456 
457 			KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
458 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
459 			    lp->lwp_proc->p_pid,
460 			    lp->lwp_thread->td_gd->gd_cpuid,
461 			    gd->gd_cpuid);
462 		}
463 	} while (dd->uschedcp != lp);
464 
465 	crit_exit_quick(td);
466 	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
467 }
468 
469 /*
470  * BSD4_RELEASE_CURPROC
471  *
472  * This routine detaches the current thread from the userland scheduler,
473  * usually because the thread needs to run or block in the kernel (at
474  * kernel priority) for a while.
475  *
476  * This routine is also responsible for selecting a new thread to
477  * make the current thread.
478  *
479  * NOTE: This implementation differs from the dummy example in that
480  * bsd4_select_curproc() is able to select the current process, whereas
481  * dummy_select_curproc() is not able to select the current process.
482  * This means we have to NULL out uschedcp.
483  *
484  * Additionally, note that we may already be on a run queue if releasing
485  * via the lwkt_switch() in bsd4_setrunqueue().
486  *
487  * MPSAFE
488  */
489 
490 static void
491 bsd4_release_curproc(struct lwp *lp)
492 {
493 	globaldata_t gd = mycpu;
494 	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
495 
496 	if (dd->uschedcp == lp) {
497 		crit_enter();
498 		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
499 
500 		KTR_COND_LOG(usched_bsd4_release_curproc,
501 		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
502 		    lp->lwp_proc->p_pid,
503 		    lp->lwp_thread->td_gd->gd_cpuid,
504 		    gd->gd_cpuid);
505 
506 		dd->uschedcp = NULL;	/* don't let lp be selected */
507 		dd->upri = PRIBASE_NULL;
508 		atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
509 		dd->old_uschedcp = lp;	/* used only for KTR debug prints */
510 		bsd4_select_curproc(gd);
511 		crit_exit();
512 	}
513 }
514 
515 /*
516  * BSD4_SELECT_CURPROC
517  *
518  * Select a new current process for this cpu and clear any pending user
519  * reschedule request.  The cpu currently has no current process.
520  *
521  * This routine is also responsible for equal-priority round-robining,
522  * typically triggered from bsd4_schedulerclock().  In our dummy example
523  * all the 'user' threads are LWKT scheduled all at once and we just
524  * call lwkt_switch().
525  *
526  * The calling process is not on the queue and cannot be selected.
527  *
528  * MPSAFE
529  */
530 static
531 void
532 bsd4_select_curproc(globaldata_t gd)
533 {
534 	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
535 	struct lwp *nlp;
536 	int cpuid = gd->gd_cpuid;
537 
538 	crit_enter_gd(gd);
539 
540 	spin_lock(&bsd4_spin);
541 #ifdef SMP
542 	if(usched_bsd4_cache_coherent)
543 		nlp = chooseproc_locked_cache_coherent(dd->uschedcp);
544 	else
545 #endif
546 		nlp = chooseproc_locked(dd->uschedcp);
547 
548 	if (nlp) {
549 
550 		KTR_COND_LOG(usched_bsd4_select_curproc,
551 		    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
552 		    nlp->lwp_proc->p_pid,
553 		    nlp->lwp_thread->td_gd->gd_cpuid,
554 		    dd->old_uschedcp->lwp_proc->p_pid,
555 		    dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
556 		    gd->gd_cpuid);
557 
558 		atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
559 		dd->upri = nlp->lwp_priority;
560 		dd->uschedcp = nlp;
561 		spin_unlock(&bsd4_spin);
562 #ifdef SMP
563 		lwkt_acquire(nlp->lwp_thread);
564 #endif
565 		lwkt_schedule(nlp->lwp_thread);
566 	} else {
567 		spin_unlock(&bsd4_spin);
568 	}
569 
570 #if 0
571 	} else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
572 		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
573 		spin_unlock(&bsd4_spin);
574 		lwkt_schedule(&dd->helper_thread);
575 	} else {
576 		spin_unlock(&bsd4_spin);
577 	}
578 #endif
579 	crit_exit_gd(gd);
580 }
581 #ifdef SMP
582 
583 /*
584  * batchy_looser_pri_test() - determine if a process is batchy or not
585  * relative to the other processes running in the system
586  */
587 static int
588 batchy_looser_pri_test(struct lwp* lp)
589 {
590 	cpumask_t mask;
591 	bsd4_pcpu_t other_dd;
592 	int cpu;
593 
594 	/* Current running processes */
595 	mask = bsd4_curprocmask & smp_active_mask
596 	    & usched_global_cpumask;
597 
598 	while(mask) {
599 		cpu = BSFCPUMASK(mask);
600 		other_dd = &bsd4_pcpu[cpu];
601 		if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) {
602 
603 			KTR_COND_LOG(usched_batchy_test_false,
604 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
605 			    lp->lwp_proc->p_pid,
606 			    lp->lwp_thread->td_gd->gd_cpuid,
607 			    (unsigned long)mask);
608 
609 			return 0;
610 		}
611 		mask &= ~CPUMASK(cpu);
612 	}
613 
614 	KTR_COND_LOG(usched_batchy_test_true,
615 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
616 	    lp->lwp_proc->p_pid,
617 	    lp->lwp_thread->td_gd->gd_cpuid,
618 	    (unsigned long)mask);
619 
620 	return 1;
621 }
622 
623 #endif
624 /*
625  *
626  * BSD4_SETRUNQUEUE
627  *
628  * Place the specified lwp on the user scheduler's run queue.  This routine
629  * must be called with the thread descheduled.  The lwp must be runnable.
630  *
631  * The thread may be the current thread as a special case.
632  *
633  * MPSAFE
634  */
635 static void
636 bsd4_setrunqueue(struct lwp *lp)
637 {
638 	globaldata_t gd;
639 	bsd4_pcpu_t dd;
640 #ifdef SMP
641 	int cpuid;
642 	cpumask_t mask;
643 	cpumask_t tmpmask;
644 #endif
645 
646 	/*
647 	 * First validate the process state relative to the current cpu.
648 	 * We don't need the spinlock for this, just a critical section.
649 	 * We are in control of the process.
650 	 */
651 	crit_enter();
652 	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
653 	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
654 	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
655 	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
656 	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
657 
658 	/*
659 	 * Note: gd and dd are relative to the target thread's last cpu,
660 	 * NOT our current cpu.
661 	 */
662 	gd = lp->lwp_thread->td_gd;
663 	dd = &bsd4_pcpu[gd->gd_cpuid];
664 
665 	/*
666 	 * This process is not supposed to be scheduled anywhere or assigned
667 	 * as the current process anywhere.  Assert the condition.
668 	 */
669 	KKASSERT(dd->uschedcp != lp);
670 
671 #ifndef SMP
672 	/*
673 	 * If we are not SMP we do not have a scheduler helper to kick
674 	 * and must directly activate the process if none are scheduled.
675 	 *
676 	 * This is really only an issue when bootstrapping init since
677 	 * the caller in all other cases will be a user process, and
678 	 * even if released (dd->uschedcp == NULL), that process will
679 	 * kickstart the scheduler when it returns to user mode from
680 	 * the kernel.
681 	 */
682 	if (dd->uschedcp == NULL) {
683 		atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
684 		dd->uschedcp = lp;
685 		dd->upri = lp->lwp_priority;
686 		lwkt_schedule(lp->lwp_thread);
687 		crit_exit();
688 		return;
689 	}
690 #endif
691 
692 #ifdef SMP
693 	/*
694 	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
695 	 * operation when the priority is recalculated, so TDF_MIGRATING
696 	 * may already be set.
697 	 */
698 	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
699 		lwkt_giveaway(lp->lwp_thread);
700 #endif
701 
702 	/*
703 	 * We lose control of lp the moment we release the spinlock after
704 	 * having placed lp on the queue.  i.e. another cpu could pick it
705 	 * up and it could exit, or its priority could be further adjusted,
706 	 * or something like that.
707 	 */
708 	spin_lock(&bsd4_spin);
709 	bsd4_setrunqueue_locked(lp);
710 	lp->lwp_setrunqueue_ticks = sched_ticks;
711 
712 #ifdef SMP
713 	/*
714 	 * Kick the scheduler helper on one of the other cpus
715 	 * and request a reschedule if appropriate.
716 	 *
717 	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
718 	 *	 look for cpus without designated lps, then we look for
719 	 *	 cpus with designated lps with a worse priority than our
720 	 *	 process.
721 	 */
722 	++bsd4_scancpu;
723 
724 	if(usched_bsd4_smt) {
725 
726 		/*
727 		 * SMT heuristic - Try to schedule on a free physical core.  If no free
728 		 * physical core is found, then choose the one that has an interactive thread.
729 		 */
730 
731 		int best_cpuid = -1;
732 		int min_prio = MAXPRI * MAXPRI;
733 		int sibling;
734 
735 		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
736 		mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
737 		    smp_active_mask & usched_global_cpumask;
738 
739 		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
740 		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
741 		    lp->lwp_proc->p_pid,
742 		    lp->lwp_thread->td_gd->gd_cpuid,
743 		    (unsigned long)mask,
744 		    mycpu->gd_cpuid);
745 
746 		while (mask) {
747 			tmpmask = ~(CPUMASK(cpuid) - 1);
748 			if (mask & tmpmask)
749 				cpuid = BSFCPUMASK(mask & tmpmask);
750 			else
751 				cpuid = BSFCPUMASK(mask);
752 			gd = globaldata_find(cpuid);
753 			dd = &bsd4_pcpu[cpuid];
754 
755 			if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
756 				if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) {
757 
758 					KTR_COND_LOG(usched_bsd4_setrunqueue_found,
759 					    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
760 					    lp->lwp_proc->p_pid,
761 					    lp->lwp_thread->td_gd->gd_cpuid,
762 					    (unsigned long)mask,
763 					    cpuid,
764 					    mycpu->gd_cpuid);
765 
766 					goto found;
767 				} else {
768 					sibling = BSFCPUMASK(dd->cpunode->parent_node->members &
769 					    ~dd->cpunode->members);
770 					if (min_prio > bsd4_pcpu[sibling].upri) {
771 						min_prio = bsd4_pcpu[sibling].upri;
772 						best_cpuid = cpuid;
773 					}
774 				}
775 			}
776 			mask &= ~CPUMASK(cpuid);
777 		}
778 
779 		if (best_cpuid != -1) {
780 			cpuid = best_cpuid;
781 			gd = globaldata_find(cpuid);
782 			dd = &bsd4_pcpu[cpuid];
783 
784 			KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
785 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
786 			    lp->lwp_proc->p_pid,
787 			    lp->lwp_thread->td_gd->gd_cpuid,
788 			    (unsigned long)mask,
789 			    cpuid,
790 			    mycpu->gd_cpuid);
791 
792 			goto found;
793 		}
794 	} else {
795 		/* Fallback to the original heuristic */
796 		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
797 		mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
798 		       smp_active_mask & usched_global_cpumask;
799 
800 		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
801 		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
802 		    lp->lwp_proc->p_pid,
803 		    lp->lwp_thread->td_gd->gd_cpuid,
804 		    (unsigned long)mask,
805 		    mycpu->gd_cpuid);
806 
807 		while (mask) {
808 			tmpmask = ~(CPUMASK(cpuid) - 1);
809 			if (mask & tmpmask)
810 				cpuid = BSFCPUMASK(mask & tmpmask);
811 			else
812 				cpuid = BSFCPUMASK(mask);
813 			gd = globaldata_find(cpuid);
814 			dd = &bsd4_pcpu[cpuid];
815 
816 			if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) {
817 
818 				KTR_COND_LOG(usched_bsd4_setrunqueue_found,
819 				    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
820 				    lp->lwp_proc->p_pid,
821 				    lp->lwp_thread->td_gd->gd_cpuid,
822 				    (unsigned long)mask,
823 				    cpuid,
824 				    mycpu->gd_cpuid);
825 
826 				goto found;
827 			}
828 			mask &= ~CPUMASK(cpuid);
829 		}
830 	}
831 
832 	/*
833 	 * Then try cpus which might have a currently running lp.
834 	 */
835 	mask = bsd4_curprocmask & bsd4_rdyprocmask &
836 	       lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;
837 
838 	KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
839 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
840 	    lp->lwp_proc->p_pid,
841 	    lp->lwp_thread->td_gd->gd_cpuid,
842 	    (unsigned long)mask,
843 	    mycpu->gd_cpuid);
844 
845 	while (mask) {
846 		tmpmask = ~(CPUMASK(cpuid) - 1);
847 		if (mask & tmpmask)
848 			cpuid = BSFCPUMASK(mask & tmpmask);
849 		else
850 			cpuid = BSFCPUMASK(mask);
851 		gd = globaldata_find(cpuid);
852 		dd = &bsd4_pcpu[cpuid];
853 
854 		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
855 
856 			KTR_COND_LOG(usched_bsd4_setrunqueue_found,
857 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
858 			    lp->lwp_proc->p_pid,
859 			    lp->lwp_thread->td_gd->gd_cpuid,
860 			    (unsigned long)mask,
861 			    cpuid,
862 			    mycpu->gd_cpuid);
863 
864 			goto found;
865 		}
866 		mask &= ~CPUMASK(cpuid);
867 	}
868 
869 	/*
870 	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
871 	 * and round-robin.  Other cpus will pick up as they release their
872 	 * current lwps or become ready.
873 	 *
874 	 * Avoid a degenerate system lockup case if usched_global_cpumask
875 	 * is set to 0 or otherwise does not cover lwp_cpumask.
876 	 *
877 	 * We only kick the target helper thread in this case, we do not
878 	 * set the user resched flag because
879 	 */
880 	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
881 	if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
882 		cpuid = 0;
883 	}
884 	gd = globaldata_find(cpuid);
885 	dd = &bsd4_pcpu[cpuid];
886 
887 	KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
888 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
889 	    lp->lwp_proc->p_pid,
890 	    lp->lwp_thread->td_gd->gd_cpuid,
891 	    cpuid,
892 	    mycpu->gd_cpuid);
893 
894 found:
895 	if (gd == mycpu) {
896 		spin_unlock(&bsd4_spin);
897 		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
898 			if (dd->uschedcp == NULL) {
899 				wakeup_mycpu(&dd->helper_thread);
900 			} else {
901 				need_user_resched();
902 			}
903 		}
904 	} else {
905 		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
906 		spin_unlock(&bsd4_spin);
907 		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
908 			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
909 		else
910 			wakeup(&dd->helper_thread);
911 	}
912 #else
913 	/*
914 	 * Request a reschedule if appropriate.
915 	 */
916 	spin_unlock(&bsd4_spin);
917 	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
918 		need_user_resched();
919 	}
920 #endif
921 	crit_exit();
922 }
923 
924 /*
925  * This routine is called from a systimer IPI.  It MUST be MP-safe and
926  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
927  * each cpu.
928  *
929  * MPSAFE
930  */
931 static
932 void
933 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
934 {
935 	globaldata_t gd = mycpu;
936 	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
937 
938 	/*
939 	 * Do we need to round-robin?  We round-robin 10 times a second.
940 	 * This should only occur for cpu-bound batch processes.
941 	 */
942 	if (++dd->rrcount >= usched_bsd4_rrinterval) {
943 		dd->rrcount = 0;
944 		need_user_resched();
945 	}
946 
947 	/*
948 	 * Adjust estcpu upward using a real time equivalent calculation.
949 	 */
950 	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);
951 
952 	/*
953 	 * Spinlocks also hold a critical section so there should not be
954 	 * any active.
955 	 */
956 	KKASSERT(gd->gd_spinlocks_wr == 0);
957 
958 	bsd4_resetpriority(lp);
959 #if 0
960 	/*
961 	 * If we can't call bsd4_resetpriority() for some reason we must
962 	 * call need_user_resched().
963 	 */
964 	need_user_resched();
965 #endif
966 }
967 
968 /*
969  * Called from acquire and from kern_synch's one-second timer (one of the
970  * callout helper threads) with a critical section held.
971  *
972  * Decay p_estcpu based on the number of ticks we haven't been running
973  * and our p_nice.  As the load increases each process observes a larger
974  * number of idle ticks (because other processes are running in them).
975  * This observation leads to a larger correction which tends to make the
976  * system more 'batchy'.
977  *
978  * Note that no recalculation occurs for a process which sleeps and wakes
979  * up in the same tick.  That is, a system doing thousands of context
980  * switches per second will still only do serious estcpu calculations
981  * ESTCPUFREQ times per second.
982  *
983  * MPSAFE
984  */
985 static
986 void
987 bsd4_recalculate_estcpu(struct lwp *lp)
988 {
989 	globaldata_t gd = mycpu;
990 	sysclock_t cpbase;
991 	sysclock_t ttlticks;
992 	int estcpu;
993 	int decay_factor;
994 
995 	/*
996 	 * We have to subtract periodic to get the last schedclock
997 	 * timeout time, otherwise we would get the upcoming timeout.
998 	 * Keep in mind that a process can migrate between cpus and
999 	 * while the scheduler clock should be very close, boundary
1000 	 * conditions could lead to a small negative delta.
1001 	 */
1002 	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
1003 
1004 	if (lp->lwp_slptime > 1) {
1005 		/*
1006 		 * Too much time has passed, do a coarse correction.
1007 		 */
1008 		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
1009 		bsd4_resetpriority(lp);
1010 		lp->lwp_cpbase = cpbase;
1011 		lp->lwp_cpticks = 0;
1012 		lp->lwp_batch -= ESTCPUFREQ;
1013 		if (lp->lwp_batch < 0)
1014 			lp->lwp_batch = 0;
1015 	} else if (lp->lwp_cpbase != cpbase) {
1016 		/*
1017 		 * Adjust estcpu if we are in a different tick.  Don't waste
1018 		 * time if we are in the same tick.
1019 		 *
1020 		 * First calculate the number of ticks in the measurement
1021 		 * interval.  The ttlticks calculation can wind up 0 due to
1022 		 * a bug in the handling of lwp_slptime  (as yet not found),
1023 		 * so make sure we do not get a divide by 0 panic.
1024 		 */
1025 		ttlticks = (cpbase - lp->lwp_cpbase) /
1026 			   gd->gd_schedclock.periodic;
1027 		if (ttlticks < 0) {
1028 			ttlticks = 0;
1029 			lp->lwp_cpbase = cpbase;
1030 		}
1031 		if (ttlticks == 0)
1032 			return;
1033 		updatepcpu(lp, lp->lwp_cpticks, ttlticks);
1034 
1035 		/*
1036 		 * Calculate the percentage of one cpu used factoring in ncpus
1037 		 * and the load and adjust estcpu.  Handle degenerate cases
1038 		 * by adding 1 to bsd4_runqcount.
1039 		 *
1040 		 * estcpu is scaled by ESTCPUMAX.
1041 		 *
1042 		 * bsd4_runqcount is the excess number of user processes
1043 		 * that cannot be immediately scheduled to cpus.  We want
1044 		 * to count these as running to avoid range compression
1045 		 * in the base calculation (which is the actual percentage
1046 		 * of one cpu used).
1047 		 */
1048 		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
1049 			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);
1050 
1051 		/*
1052 		 * If estcpu is > 50% we become more batch-like
1053 		 * If estcpu is <= 50% we become less batch-like
1054 		 *
1055 		 * It takes 30 cpu seconds to traverse the entire range.
1056 		 */
1057 		if (estcpu > ESTCPUMAX / 2) {
1058 			lp->lwp_batch += ttlticks;
1059 			if (lp->lwp_batch > BATCHMAX)
1060 				lp->lwp_batch = BATCHMAX;
1061 		} else {
1062 			lp->lwp_batch -= ttlticks;
1063 			if (lp->lwp_batch < 0)
1064 				lp->lwp_batch = 0;
1065 		}
1066 
1067 		if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
1068 			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
1069 				lp->lwp_proc->p_pid, lp,
1070 				estcpu, lp->lwp_estcpu,
1071 				lp->lwp_batch,
1072 				lp->lwp_cpticks, ttlticks);
1073 		}
1074 
1075 		/*
1076 		 * Adjust lp->lwp_esetcpu.  The decay factor determines how
1077 		 * quickly lwp_estcpu collapses to its realtime calculation.
1078 		 * A slower collapse gives us a more accurate number but
1079 		 * can cause a cpu hog to eat too much cpu before the
1080 		 * scheduler decides to downgrade it.
1081 		 *
1082 		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
1083 		 *	 and not here, but we must still ensure that a
1084 		 *	 cpu-bound nice -20 process does not completely
1085 		 *	 override a cpu-bound nice +20 process.
1086 		 *
1087 		 * NOTE: We must use ESTCPULIM() here to deal with any
1088 		 *	 overshoot.
1089 		 */
1090 		decay_factor = usched_bsd4_decay;
1091 		if (decay_factor < 1)
1092 			decay_factor = 1;
1093 		if (decay_factor > 1024)
1094 			decay_factor = 1024;
1095 
1096 		lp->lwp_estcpu = ESTCPULIM(
1097 			(lp->lwp_estcpu * decay_factor + estcpu) /
1098 			(decay_factor + 1));
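		/*
		 * Example with the default decay of 8: the new lwp_estcpu is
		 * (8 * old + measured) / 9, so each pass moves estcpu about
		 * 1/9th of the way toward the freshly measured value.
		 */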
1099 
1100 		if (usched_bsd4_debug == lp->lwp_proc->p_pid)
1101 			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
1102 		bsd4_resetpriority(lp);
1103 		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
1104 		lp->lwp_cpticks = 0;
1105 	}
1106 }
1107 
1108 /*
1109  * Compute the priority of a process when running in user mode.
1110  * Arrange to reschedule if the resulting priority is better
1111  * than that of the current process.
1112  *
1113  * This routine may be called with any process.
1114  *
1115  * This routine is called by fork1() for initial setup with the process
1116  * of the run queue, and also may be called normally with the process on or
1117  * off the run queue.
1118  *
1119  * MPSAFE
1120  */
1121 static void
1122 bsd4_resetpriority(struct lwp *lp)
1123 {
1124 	bsd4_pcpu_t dd;
1125 	int newpriority;
1126 	u_short newrqtype;
1127 	int reschedcpu;
1128 	int checkpri;
1129 	int estcpu;
1130 
1131 	/*
1132 	 * Calculate the new priority and queue type
1133 	 */
1134 	crit_enter();
1135 	spin_lock(&bsd4_spin);
1136 
1137 	newrqtype = lp->lwp_rtprio.type;
1138 
1139 	switch(newrqtype) {
1140 	case RTP_PRIO_REALTIME:
1141 	case RTP_PRIO_FIFO:
1142 		newpriority = PRIBASE_REALTIME +
1143 			     (lp->lwp_rtprio.prio & PRIMASK);
1144 		break;
1145 	case RTP_PRIO_NORMAL:
1146 		/*
1147 		 * Detune estcpu based on batchiness.  lwp_batch ranges
1148 		 * from 0 to  BATCHMAX.  Limit estcpu for the sake of
1149 		 * the priority calculation to between 50% and 100%.
1150 		 */
1151 		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
1152 			 (BATCHMAX * 2);
1153 
1154 		/*
1155 		 * p_nice piece		Adds (0-40) * 2		0-80
1156 		 * estcpu		Adds 16384  * 4 / 512   0-128
1157 		 */
1158 		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
1159 		newpriority += estcpu * PPQ / ESTCPUPPQ;
1160 		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
1161 			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
1162 		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
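		/*
		 * Illustrative numbers (with the usual PRIO_MIN of -20): a
		 * fully interactive lwp (lwp_batch == 0) has its estcpu
		 * halved above, so lwp_estcpu == 8192 adds 4096 * 4 / 512
		 * == 32 points to the 40 points a nice-0 process gets,
		 * before the rescale into PRIBASE_NORMAL's 0-127 range.
		 */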
1163 		break;
1164 	case RTP_PRIO_IDLE:
1165 		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1166 		break;
1167 	case RTP_PRIO_THREAD:
1168 		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1169 		break;
1170 	default:
1171 		panic("Bad RTP_PRIO %d", newrqtype);
1172 		/* NOT REACHED */
1173 	}
1174 
1175 	/*
1176 	 * The newpriority incorporates the queue type so do a simple masked
1177 	 * check to determine if the process has moved to another queue.  If
1178 	 * it has, and it is currently on a run queue, then move it.
1179 	 */
1180 	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1181 		lp->lwp_priority = newpriority;
1182 		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1183 			bsd4_remrunqueue_locked(lp);
1184 			lp->lwp_rqtype = newrqtype;
1185 			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1186 			bsd4_setrunqueue_locked(lp);
1187 			checkpri = 1;
1188 		} else {
1189 			lp->lwp_rqtype = newrqtype;
1190 			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1191 			checkpri = 0;
1192 		}
1193 		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
1194 	} else {
1195 		lp->lwp_priority = newpriority;
1196 		reschedcpu = -1;
1197 		checkpri = 1;
1198 	}
1199 
1200 	/*
1201 	 * Determine if we need to reschedule the target cpu.  This only
1202 	 * occurs if the LWP is already on a scheduler queue, which means
1203 	 * that idle cpu notification has already occurred.  At most we
1204 	 * need only issue a need_user_resched() on the appropriate cpu.
1205 	 *
1206 	 * The LWP may be owned by a CPU different from the current one,
1207 	 * in which case dd->uschedcp may be modified without an MP lock
1208 	 * or a spinlock held.  The worst that happens is that the code
1209 	 * below causes a spurious need_user_resched() on the target CPU
1210 	 * and dd->pri to be wrong for a short period of time, both of
1211 	 * which are harmless.
1212 	 *
1213 	 * If checkpri is 0 we are adjusting the priority of the current
1214 	 * process, possibly higher (less desirable), so ignore the upri
1215 	 * check which will fail in that case.
1216 	 */
1217 	if (reschedcpu >= 0) {
1218 		dd = &bsd4_pcpu[reschedcpu];
1219 		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
1220 		    (checkpri == 0 ||
1221 		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
1222 #ifdef SMP
1223 			if (reschedcpu == mycpu->gd_cpuid) {
1224 				spin_unlock(&bsd4_spin);
1225 				need_user_resched();
1226 			} else {
1227 				spin_unlock(&bsd4_spin);
1228 				atomic_clear_cpumask(&bsd4_rdyprocmask,
1229 						     CPUMASK(reschedcpu));
1230 				lwkt_send_ipiq(lp->lwp_thread->td_gd,
1231 					       need_user_resched_remote, NULL);
1232 			}
1233 #else
1234 			spin_unlock(&bsd4_spin);
1235 			need_user_resched();
1236 #endif
1237 		} else {
1238 			spin_unlock(&bsd4_spin);
1239 		}
1240 	} else {
1241 		spin_unlock(&bsd4_spin);
1242 	}
1243 	crit_exit();
1244 }
1245 
1246 /*
1247  * MPSAFE
1248  */
1249 static
1250 void
1251 bsd4_yield(struct lwp *lp)
1252 {
1253 #if 0
1254 	/* FUTURE (or something similar) */
1255 	switch(lp->lwp_rqtype) {
1256 	case RTP_PRIO_NORMAL:
1257 		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
1258 		break;
1259 	default:
1260 		break;
1261 	}
1262 #endif
1263         need_user_resched();
1264 }
1265 
1266 /*
1267  * Called from fork1() when a new child process is being created.
1268  *
1269  * Give the child process an initial estcpu that is more batchy than
1270  * its parent and dock the parent for the fork (but do not
1271  * reschedule the parent).   This comprises the main part of our batch
1272  * detection heuristic for both parallel forking and sequential execs.
1273  *
1274  * XXX lwp should be "spawning" instead of "forking"
1275  *
1276  * MPSAFE
1277  */
1278 static void
1279 bsd4_forking(struct lwp *plp, struct lwp *lp)
1280 {
1281 	/*
1282 	 * Put the child 4 queue slots (out of 32) higher than the parent
1283 	 * (less desirable than the parent).
1284 	 */
1285 	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);
1286 
1287 	/*
1288 	 * The batch status of children always starts out centerline
1289 	 * and will inch-up or inch-down as appropriate.  It takes roughly
1290 	 * ~15 seconds of >50% cpu to hit the limit.
1291 	 */
1292 	lp->lwp_batch = BATCHMAX / 2;
1293 
1294 	/*
1295 	 * Dock the parent a cost for the fork, protecting us from fork
1296 	 * bombs.  If the parent is forking quickly make the child more
1297 	 * batchy.
1298 	 */
1299 	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
1300 }
1301 
1302 /*
1303  * Called when a parent waits for a child.
1304  *
1305  * MPSAFE
1306  */
1307 static void
1308 bsd4_exiting(struct lwp *lp, struct proc *child_proc)
1309 {
1310 }
1311 
1312 /*
1313  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1314  * it selects a user process and returns it.  If chklp is non-NULL and chklp
1315  * has a better or equal priority than the process that would otherwise be
1316  * chosen, NULL is returned.
1317  *
1318  * Until we fix the RUNQ code the chklp test has to be strict or we may
1319  * bounce between processes trying to acquire the current process designation.
1320  *
1321  * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
1322  *	    left intact through the entire routine.
1323  */
1324 static
1325 struct lwp *
1326 chooseproc_locked(struct lwp *chklp)
1327 {
1328 	struct lwp *lp;
1329 	struct rq *q;
1330 	u_int32_t *which, *which2;
1331 	u_int32_t pri;
1332 	u_int32_t rtqbits;
1333 	u_int32_t tsqbits;
1334 	u_int32_t idqbits;
1335 	cpumask_t cpumask;
1336 
1337 	rtqbits = bsd4_rtqueuebits;
1338 	tsqbits = bsd4_queuebits;
1339 	idqbits = bsd4_idqueuebits;
1340 	cpumask = mycpu->gd_cpumask;
1341 
1342 
1343 #ifdef SMP
1344 again:
1345 #endif
1346 	if (rtqbits) {
1347 		pri = bsfl(rtqbits);
1348 		q = &bsd4_rtqueues[pri];
1349 		which = &bsd4_rtqueuebits;
1350 		which2 = &rtqbits;
1351 	} else if (tsqbits) {
1352 		pri = bsfl(tsqbits);
1353 		q = &bsd4_queues[pri];
1354 		which = &bsd4_queuebits;
1355 		which2 = &tsqbits;
1356 	} else if (idqbits) {
1357 		pri = bsfl(idqbits);
1358 		q = &bsd4_idqueues[pri];
1359 		which = &bsd4_idqueuebits;
1360 		which2 = &idqbits;
1361 	} else {
1362 		return NULL;
1363 	}
1364 	lp = TAILQ_FIRST(q);
1365 	KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1366 
1367 #ifdef SMP
1368 	while ((lp->lwp_cpumask & cpumask) == 0) {
1369 		lp = TAILQ_NEXT(lp, lwp_procq);
1370 		if (lp == NULL) {
1371 			*which2 &= ~(1 << pri);
1372 			goto again;
1373 		}
1374 	}
1375 #endif
1376 
1377 	/*
1378 	 * If the passed lwp <chklp> is reasonably close to the selected
1379 	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1380 	 *
1381 	 * Note that we must err on the side of <chklp> to avoid bouncing
1382 	 * between threads in the acquire code.
1383 	 */
1384 	if (chklp) {
1385 		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1386 			return(NULL);
1387 	}
1388 
1389 #ifdef SMP
1390 	/*
1391 	 * If the chosen lwp does not reside on this cpu spend a few
1392 	 * cycles looking for a better candidate at the same priority level.
1393 	 * This is a fallback check, setrunqueue() tries to wakeup the
1394 	 * correct cpu and is our front-line affinity.
1395 	 */
1396 	if (lp->lwp_thread->td_gd != mycpu &&
1397 	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
1398 	) {
1399 		if (chklp->lwp_thread->td_gd == mycpu) {
1400 			++choose_affinity;
1401 			lp = chklp;
1402 		}
1403 	}
1404 #endif
1405 
1406 	KTR_COND_LOG(usched_chooseproc,
1407 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1408 	    lp->lwp_proc->p_pid,
1409 	    lp->lwp_thread->td_gd->gd_cpuid,
1410 	    mycpu->gd_cpuid);
1411 
1412 	TAILQ_REMOVE(q, lp, lwp_procq);
1413 	--bsd4_runqcount;
1414 	if (TAILQ_EMPTY(q))
1415 		*which &= ~(1 << pri);
1416 	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1417 	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1418 	return lp;
1419 }
1420 
1421 #ifdef SMP
1422 /*
1423  * chooseproc() - with a cache coherence heuristic. Try to pull a process that
1424  * has its home on the current CPU> If the process doesn't have its home here
1425  * and is a batchy one (see batcy_looser_pri_test), we can wait for a
1426  * sched_tick, may be its home will become free and pull it in. Anyway,
1427  * we can't wait more than one tick. If that tick expired, we pull in that
1428  * process, no matter what.
1429  */
1430 static
1431 struct lwp *
1432 chooseproc_locked_cache_coherent(struct lwp *chklp)
1433 {
1434 	struct lwp *lp;
1435 	struct rq *q;
1436 	u_int32_t *which, *which2;
1437 	u_int32_t pri;
1438 	u_int32_t checks;
1439 	u_int32_t rtqbits;
1440 	u_int32_t tsqbits;
1441 	u_int32_t idqbits;
1442 	cpumask_t cpumask;
1443 
1444 	struct lwp * min_level_lwp = NULL;
1445 	struct rq *min_q = NULL;
1446 	cpumask_t siblings;
1447 	cpu_node_t* cpunode = NULL;
1448 	u_int32_t min_level = MAXCPU;	/* number of levels < MAXCPU */
1449 	u_int32_t *min_which = NULL;
1450 	u_int32_t min_pri = 0;
1451 	u_int32_t level = 0;
1452 
1453 	rtqbits = bsd4_rtqueuebits;
1454 	tsqbits = bsd4_queuebits;
1455 	idqbits = bsd4_idqueuebits;
1456 	cpumask = mycpu->gd_cpumask;
1457 
1458 	/* Get the mask corresponding to the sysctl-configured level */
1459 	cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
1460 	level = usched_bsd4_stick_to_level;
1461 	while (level) {
1462 		cpunode = cpunode->parent_node;
1463 		level--;
1464 	}
1465 	/* The cpus which can elect a process */
1466 	siblings = cpunode->members;
1467 
1468 again:
1469 	if (rtqbits) {
1470 		pri = bsfl(rtqbits);
1471 		q = &bsd4_rtqueues[pri];
1472 		which = &bsd4_rtqueuebits;
1473 		which2 = &rtqbits;
1474 	} else if (tsqbits) {
1475 		pri = bsfl(tsqbits);
1476 		q = &bsd4_queues[pri];
1477 		which = &bsd4_queuebits;
1478 		which2 = &tsqbits;
1479 	} else if (idqbits) {
1480 		pri = bsfl(idqbits);
1481 		q = &bsd4_idqueues[pri];
1482 		which = &bsd4_idqueuebits;
1483 		which2 = &idqbits;
1484 	} else {
1485 		return NULL;
1486 	}
1487 	lp = TAILQ_FIRST(q);
1488 	KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1489 
1490 	/* Limit the number of checks/queue to a configurable value to
1491 	 * minimize the contention (we are in a locked region).
1492 	 */
1493 	for (checks = 0; checks < usched_bsd4_queue_checks; checks++) {
1494 
1495 		if ((lp->lwp_cpumask & cpumask) == 0 ||
1496 		    ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
1497 		      batchy_looser_pri_test(lp) &&
1498 		      (lp->lwp_setrunqueue_ticks == sched_ticks ||
1499 		       lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) {
1500 
1501 			KTR_COND_LOG(usched_chooseproc_cc_not_good,
1502 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1503 			    lp->lwp_proc->p_pid,
1504 			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
1505 			    (unsigned long)siblings,
1506 			    (unsigned long)cpumask);
1507 
1508 			cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
1509 			level = 0;
1510 			while (cpunode) {
1511 				if (cpunode->members & cpumask) {
1512 					break;
1513 				}
1514 				cpunode = cpunode->parent_node;
1515 				level++;
1516 			}
1517 			if (level < min_level) {
1518 				min_level_lwp = lp;
1519 				min_level = level;
1520 				min_q = q;
1521 				min_which = which;
1522 				min_pri = pri;
1523 			}
1524 
1525 			lp = TAILQ_NEXT(lp, lwp_procq);
1526 			if (lp == NULL) {
1527 				*which2 &= ~(1 << pri);
1528 				goto again;
1529 			}
1530 		} else {
1531 			KTR_COND_LOG(usched_chooseproc_cc_elected,
1532 			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1533 			    lp->lwp_proc->p_pid,
1534 			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
1535 			    (unsigned long)siblings,
1536 			    (unsigned long)cpumask);
1537 
1538 			goto found;
1539 		}
1540 	}
1541 	lp = min_level_lwp;
1542 	q = min_q;
1543 	which = min_which;
1544 	pri = min_pri;
1545 	KASSERT(lp, ("chooseproc: at least the first lp was good"));
1546 
1547 found:
1548 
1549 	/*
1550 	 * If the passed lwp <chklp> is reasonably close to the selected
1551 	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1552 	 *
1553 	 * Note that we must err on the side of <chklp> to avoid bouncing
1554 	 * between threads in the acquire code.
1555 	 */
1556 	if (chklp) {
1557 		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1558 			return(NULL);
1559 	}
1560 
1561 	KTR_COND_LOG(usched_chooseproc_cc,
1562 	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1563 	    lp->lwp_proc->p_pid,
1564 	    lp->lwp_thread->td_gd->gd_cpuid,
1565 	    mycpu->gd_cpuid);
1566 
1567 	TAILQ_REMOVE(q, lp, lwp_procq);
1568 	--bsd4_runqcount;
1569 	if (TAILQ_EMPTY(q))
1570 		*which &= ~(1 << pri);
1571 	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1572 	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1573 	return lp;
1574 }
1575 
1576 
1577 static
1578 void
1579 need_user_resched_remote(void *dummy)
1580 {
1581 	globaldata_t gd = mycpu;
1582 	bsd4_pcpu_t  dd = &bsd4_pcpu[gd->gd_cpuid];
1583 
1584 	need_user_resched();
1585 
1586 	/* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
1587 	wakeup_mycpu(&dd->helper_thread);
1588 }
1589 
1590 #endif
1591 
1592 /*
1593  * bsd4_remrunqueue_locked() removes a given process from the run queue
1594  * that it is on, clearing the queue busy bit if it becomes empty.
1595  *
1596  * Note that the user process scheduler is different from the LWKT scheduler.
1597  * The user process scheduler only manages user processes but it uses LWKT
1598  * underneath, and a user process operating in the kernel will often be
1599  * 'released' from our management.
1600  *
1601  * MPSAFE - bsd4_spin must be held exclusively on call
1602  */
1603 static void
1604 bsd4_remrunqueue_locked(struct lwp *lp)
1605 {
1606 	struct rq *q;
1607 	u_int32_t *which;
1608 	u_int8_t pri;
1609 
1610 	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
1611 	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1612 	--bsd4_runqcount;
1613 	KKASSERT(bsd4_runqcount >= 0);
1614 
1615 	pri = lp->lwp_rqindex;
1616 	switch(lp->lwp_rqtype) {
1617 	case RTP_PRIO_NORMAL:
1618 		q = &bsd4_queues[pri];
1619 		which = &bsd4_queuebits;
1620 		break;
1621 	case RTP_PRIO_REALTIME:
1622 	case RTP_PRIO_FIFO:
1623 		q = &bsd4_rtqueues[pri];
1624 		which = &bsd4_rtqueuebits;
1625 		break;
1626 	case RTP_PRIO_IDLE:
1627 		q = &bsd4_idqueues[pri];
1628 		which = &bsd4_idqueuebits;
1629 		break;
1630 	default:
1631 		panic("remrunqueue: invalid rtprio type");
1632 		/* NOT REACHED */
1633 	}
1634 	TAILQ_REMOVE(q, lp, lwp_procq);
1635 	if (TAILQ_EMPTY(q)) {
1636 		KASSERT((*which & (1 << pri)) != 0,
1637 			("remrunqueue: remove from empty queue"));
1638 		*which &= ~(1 << pri);
1639 	}
1640 }
1641 
1642 /*
1643  * bsd4_setrunqueue_locked()
1644  *
1645  * Add a process whose rqtype and rqindex had previously been calculated
1646  * onto the appropriate run queue.   Determine if the addition requires
1647  * a reschedule on a cpu and return the cpuid or -1.
1648  *
1649  * NOTE: Lower priorities are better priorities.
1650  *
1651  * MPSAFE - bsd4_spin must be held exclusively on call
1652  */
1653 static void
1654 bsd4_setrunqueue_locked(struct lwp *lp)
1655 {
1656 	struct rq *q;
1657 	u_int32_t *which;
1658 	int pri;
1659 
1660 	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
1661 	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1662 	++bsd4_runqcount;
1663 
1664 	pri = lp->lwp_rqindex;
1665 
1666 	switch(lp->lwp_rqtype) {
1667 	case RTP_PRIO_NORMAL:
1668 		q = &bsd4_queues[pri];
1669 		which = &bsd4_queuebits;
1670 		break;
1671 	case RTP_PRIO_REALTIME:
1672 	case RTP_PRIO_FIFO:
1673 		q = &bsd4_rtqueues[pri];
1674 		which = &bsd4_rtqueuebits;
1675 		break;
1676 	case RTP_PRIO_IDLE:
1677 		q = &bsd4_idqueues[pri];
1678 		which = &bsd4_idqueuebits;
1679 		break;
1680 	default:
1681 		panic("remrunqueue: invalid rtprio type");
1682 		/* NOT REACHED */
1683 	}
1684 
1685 	/*
1686 	 * Add to the correct queue and set the appropriate bit.  If no
1687 	 * lower priority (i.e. better) processes are in the queue then
1688 	 * we want a reschedule, calculate the best cpu for the job.
1689 	 *
1690 	 * Always run reschedules on the LWP's original cpu.
1691 	 */
1692 	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
1693 	*which |= 1 << pri;
1694 }
1695 
1696 #ifdef SMP
1697 
1698 /*
1699  * For SMP systems a user scheduler helper thread is created for each
1700  * cpu and is used to allow one cpu to wakeup another for the purposes of
1701  * scheduling userland threads from setrunqueue().
1702  *
1703  * UP systems do not need the helper since there is only one cpu.
1704  *
1705  * We can't use the idle thread for this because we might block.
1706  * Additionally, doing things this way allows us to HLT idle cpus
1707  * on MP systems.
1708  *
1709  * MPSAFE
1710  */
1711 static void
1712 sched_thread(void *dummy)
1713 {
1714     globaldata_t gd;
1715     bsd4_pcpu_t  dd;
1716     bsd4_pcpu_t  tmpdd;
1717     struct lwp *nlp;
1718     cpumask_t mask;
1719     int cpuid;
1720 #ifdef SMP
1721     cpumask_t tmpmask;
1722     int tmpid;
1723 #endif
1724 
1725     gd = mycpu;
1726     cpuid = gd->gd_cpuid;	/* doesn't change */
1727     mask = gd->gd_cpumask;	/* doesn't change */
1728     dd = &bsd4_pcpu[cpuid];
1729 
1730     /*
1731      * Since we are woken up only when no user processes are scheduled
1732      * on a cpu, we can run at an ultra low priority.
1733      */
1734     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
1735 
1736     tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0);
1737 
1738     for (;;) {
1739 //again:
1740 	/*
1741 	 * We use the tsleep_interlock()/tsleep() trick to avoid racing
1742 	 * bsd4_rdyprocmask.  This means we cannot block between arming the
1743 	 * interlock here and the PINTERLOCKED tsleep() at the end of the loop.
1744 	 */
1745 	crit_enter_gd(gd);
1746 	//lwkt_deschedule_self(gd->gd_curthread);
1747 	tsleep_interlock(&dd->helper_thread, 0);
1748 	spin_lock(&bsd4_spin);
1749 	atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1750 
1751 	clear_user_resched();	/* This satisfies the reschedule request */
1752 	dd->rrcount = 0;	/* Reset the round-robin counter */
1753 
1754 	if ((bsd4_curprocmask & mask) == 0) {
1755 		/*
1756 		 * No thread is currently scheduled.
1757 		 */
1758 		KKASSERT(dd->uschedcp == NULL);
1759 		if ((nlp = chooseproc_locked(NULL)) != NULL) {
1760 
1761 			KTR_COND_LOG(usched_sched_thread_no_process,
1762 			    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1763 			    gd->gd_cpuid,
1764 			    nlp->lwp_proc->p_pid,
1765 			    nlp->lwp_thread->td_gd->gd_cpuid);
1766 
1767 			atomic_set_cpumask(&bsd4_curprocmask, mask);
1768 			dd->upri = nlp->lwp_priority;
1769 			dd->uschedcp = nlp;
1770 			spin_unlock(&bsd4_spin);
1771 #ifdef SMP
1772 			lwkt_acquire(nlp->lwp_thread);
1773 #endif
1774 			lwkt_schedule(nlp->lwp_thread);
1775 		} else {
1776 			spin_unlock(&bsd4_spin);
1777 		}
1778 	} else if (bsd4_runqcount) {
1779 		if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {
1780 
1781 			KTR_COND_LOG(usched_sched_thread_process,
1782 			    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
1783 			    gd->gd_cpuid,
1784 			    nlp->lwp_proc->p_pid,
1785 			    nlp->lwp_thread->td_gd->gd_cpuid);
1786 
1787 			dd->upri = nlp->lwp_priority;
1788 			dd->uschedcp = nlp;
1789 			spin_unlock(&bsd4_spin);
1790 #ifdef SMP
1791 			lwkt_acquire(nlp->lwp_thread);
1792 #endif
1793 			lwkt_schedule(nlp->lwp_thread);
1794 		} else {
1795 			/*
1796 			 * CHAINING CONDITION TRAIN
1797 			 *
1798 			 * We could not deal with the scheduler wakeup
1799 			 * request on this cpu, locate a ready scheduler
1800 			 * with no current lp assignment and chain to it.
1801 			 *
1802 			 * This ensures that a wakeup race which fails due
1803 			 * to the priority test does not leave other unscheduled
1804 			 * cpus idle when the runqueue is not empty.
1805 			 */
1806 			tmpmask = ~bsd4_curprocmask &
1807 			    bsd4_rdyprocmask & smp_active_mask;
1808 			if (tmpmask) {
1809 				tmpid = BSFCPUMASK(tmpmask);
1810 				tmpdd = &bsd4_pcpu[tmpid];
1811 				atomic_clear_cpumask(&bsd4_rdyprocmask,
1812 				    CPUMASK(tmpid));
1813 				spin_unlock(&bsd4_spin);
1814 				wakeup(&tmpdd->helper_thread);
1815 			} else {
1816 				spin_unlock(&bsd4_spin);
1817 			}
1818 
1819 			KTR_LOG(usched_sched_thread_no_process_found,
1820 			    gd->gd_cpuid,
1821 			    (unsigned long)tmpmask);
1822 		}
1823 	} else {
1824 		/*
1825 		 * The runq is empty.
1826 		 */
1827 		spin_unlock(&bsd4_spin);
1828 	}
1829 
1830 	/*
1831 	 * We're descheduled unless someone scheduled us.  Switch away.
1832 	 * Exiting the critical section will cause splz() to be called
1833 	 * for us if interrupts and such are pending.
1834 	 */
1835 	crit_exit_gd(gd);
1836 	tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
1837 //	lwkt_switch();
1838     }
1839 }
1840 
1841 /* sysctl stick_to_level parameter */
1842 static int
1843 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
1844 {
1845 	int error, new_val;
1846 
1847 	new_val = usched_bsd4_stick_to_level;
1848 
1849 	error = sysctl_handle_int(oidp, &new_val, 0, req);
1850 	if (error != 0 || req->newptr == NULL)
1851 		return (error);
1852 	if (new_val > cpu_topology_levels_number - 1 ||
1853 	    new_val < 0)
1854 		return (EINVAL);
1855 	usched_bsd4_stick_to_level = new_val;
1856 	return (0);
1857 }
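
/*
 * Usage sketch: the handler above backs the read/write sysctl
 * registered below as kern.usched_bsd4.stick_to_level, e.g.
 *
 *	sysctl kern.usched_bsd4.stick_to_level=1
 *
 * Values outside the range [0, cpu_topology_levels_number - 1] are
 * rejected with EINVAL.
 */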
1858 
1859 /*
1860  * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
1861  * been cleared by rqinit() and we should not mess with it further.
1862  */
1863 static void
1864 sched_thread_cpu_init(void)
1865 {
1866 	int i;
1867 	int cpuid;
1868 	int smt_not_supported = 0;
1869 	int cache_coherent_not_supported = 0;
1870 	if (bootverbose)
1871 		kprintf("Start scheduler helpers on cpus:\n");
1872 
1873 	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
1874 	usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
1875 	    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
1876 	    "usched_bsd4", CTLFLAG_RD, 0, "");
1877 
1878 	for (i = 0; i < ncpus; ++i) {
1879 		bsd4_pcpu_t dd = &bsd4_pcpu[i];
1880 		cpumask_t mask = CPUMASK(i);
1881 
1882 		if ((mask & smp_active_mask) == 0)
1883 		    continue;
1884 
1885 		dd->cpunode = get_cpu_node_by_cpuid(i);
1886 
1887 		if (dd->cpunode == NULL) {
1888 			smt_not_supported = 1;
1889 			cache_coherent_not_supported = 1;
1890 			if (bootverbose)
1891 				kprintf("\tcpu%d - WARNING: No CPU NODE found for cpu\n", i);
1892 
1893 		} else {
1894 
1895 			switch (dd->cpunode->type) {
1896 				case THREAD_LEVEL:
1897 					if (bootverbose)
1898 						kprintf("\tcpu%d - HyperThreading available. "
1899 						    "Core siblings: ", i);
1900 					break;
1901 				case CORE_LEVEL:
1902 					smt_not_supported = 1;
1903 
1904 					if (bootverbose)
1905 						kprintf("\tcpu%d - No HT available, multi-core/physical "
1906 						    "cpu. Physical siblings: ", i);
1907 					break;
1908 				case CHIP_LEVEL:
1909 					smt_not_supported = 1;
1910 
1911 					if (bootverbose)
1912 						kprintf("\tcpu%d - No HT available, single-core/physical cpu. "
1913 						    "Package Siblings: ", i);
1914 					break;
1915 				default:
1916 					/* Let's go for safe defaults here */
1917 					smt_not_supported = 1;
1918 					cache_coherent_not_supported = 1;
1919 					if (bootverbose)
1920 						kprintf("\tcpu%d - Unknown cpunode->type=%u. Siblings: ",
1921 						    i, (unsigned int)dd->cpunode->type);
1922 					break;
1923 			}
1924 
1925 			if (bootverbose) {
1926 				if (dd->cpunode->parent_node != NULL) {
1927 					CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members)
1928 						kprintf("cpu%d ", cpuid);
1929 					kprintf("\n");
1930 				} else {
1931 					kprintf(" no siblings\n");
1932 				}
1933 			}
1934 		}
1935 
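		/*
		 * Spawn the per-cpu helper thread, pinned to cpu i and
		 * named "usched %d"; dd->helper_thread doubles as the
		 * sleep/wakeup identifier used by sched_thread().
		 */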
1936 		lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
1937 		    0, i, "usched %d", i);
1938 
1939 		/*
1940 		 * Allow user scheduling on the target cpu.  cpu #0 has already
1941 		 * been enabled in rqinit().
1942 		 */
1943 		if (i)
1944 		    atomic_clear_cpumask(&bsd4_curprocmask, mask);
1945 		atomic_set_cpumask(&bsd4_rdyprocmask, mask);
1946 		dd->upri = PRIBASE_NULL;
1947 
1948 	}
1949 
1950 	/* usched_bsd4 sysctl configurable parameters */
1951 
1952 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1953 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1954 	    OID_AUTO, "rrinterval", CTLFLAG_RW,
1955 	    &usched_bsd4_rrinterval, 0, "");
1956 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1957 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1958 	    OID_AUTO, "decay", CTLFLAG_RW,
1959 	    &usched_bsd4_decay, 0, "Extra decay when not running");
1960 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1961 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1962 	    OID_AUTO, "batch_time", CTLFLAG_RW,
1963 	    &usched_bsd4_batch_time, 0, "Minimum batch counter value");
1964 
1965 	/* Add enable/disable option for SMT scheduling if supported */
1966 	if (smt_not_supported) {
1967 		usched_bsd4_smt = 0;
1968 		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1969 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1970 		    OID_AUTO, "smt", CTLFLAG_RD,
1971 		    "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
1972 	} else {
1973 		usched_bsd4_smt = 1;
1974 		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1975 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1976 		    OID_AUTO, "smt", CTLFLAG_RW,
1977 		    &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling");
1978 
1979 	}
1980 
1981 	/* Add enable/disable option for cache coherent scheduling if supported */
1982 	if (cache_coherent_not_supported) {
1983 #ifdef SMP
1984 		usched_bsd4_cache_coherent = 0;
1985 		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
1986 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1987 		    OID_AUTO, "cache_coherent", CTLFLAG_RD,
1988 		    "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED");
1989 #endif
1990 	} else {
1991 #ifdef SMP
1992 		usched_bsd4_cache_coherent = 1;
1993 		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
1994 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
1995 		    OID_AUTO, "cache_coherent", CTLFLAG_RW,
1996 		    &usched_bsd4_cache_coherent, 0,
1997 		    "Enable/Disable cache coherent scheduling");
1998 #endif
1999 
2000 		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2001 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2002 		    OID_AUTO, "upri_affinity", CTLFLAG_RW,
2003 		    &usched_bsd4_upri_affinity, 1,
2004 		    "Number of PPQs in user priority check");
2005 
2006 		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2007 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2008 		    OID_AUTO, "queue_checks", CTLFLAG_RW,
2009 		    &usched_bsd4_queue_checks, 5,
2010 		    "Number of LWPs to check from a queue before giving up");
2011 
2012 		SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
2013 		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2014 		    OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW,
2015 		    NULL, sizeof usched_bsd4_stick_to_level,
2016 		    sysctl_usched_bsd4_stick_to_level, "I",
2017 		    "Stick a process to this level. See sysctl "
2018 		    "parameter hw.cpu_topology.level_description");
2019 	}
2020 }
2021 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2022 	sched_thread_cpu_init, NULL)
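
/*
 * SYSINIT() runs sched_thread_cpu_init() during boot at the
 * SI_BOOT2_USCHED stage; the knobs created above all live under the
 * kern.usched_bsd4 sysctl tree (e.g. kern.usched_bsd4.rrinterval,
 * kern.usched_bsd4.smt, kern.usched_bsd4.stick_to_level).
 */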
2023 #else /* No SMP options - just add the configurable parameters to sysctl */
2024 
2025 static void
2026 sched_sysctl_tree_init(void)
2027 {
2028 	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
2029 	usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
2030 	    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2031 	    "usched_bsd4", CTLFLAG_RD, 0, "");
2032 
2033 	/* usched_bsd4 sysctl configurable parameters */
2034 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2035 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2036 	    OID_AUTO, "rrinterval", CTLFLAG_RW,
2037 	    &usched_bsd4_rrinterval, 0, "");
2038 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2039 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2040 	    OID_AUTO, "decay", CTLFLAG_RW,
2041 	    &usched_bsd4_decay, 0, "Extra decay when not running");
2042 	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
2043 	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
2044 	    OID_AUTO, "batch_time", CTLFLAG_RW,
2045 	    &usched_bsd4_batch_time, 0, "Minimum batch counter value");
2046 }
2047 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2048 	sched_sysctl_tree_init, NULL)
2049 #endif
2050 
2051