/*
 * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>,
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * and many others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler, each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_uload_update(struct lwp *lp);
static void bsd4_yield(struct lwp *lp);
static void bsd4_need_user_resched_remote(void *dummy);
static int bsd4_batchy_looser_pri_test(struct lwp* lp);
static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp);
static void bsd4_kick_helper(struct lwp *lp);
static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	bsd4_uload_update,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
	struct lwp	*old_uschedcp;
	cpu_node_t	*cpunode;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
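 *
 * As a minimal sketch of the queue math (a userland model for
 * illustration, not code used by this scheduler): with MAXPRI = 128 and
 * NQS = 32, PPQ is 4, so a normal-class priority p lands in queue
 * (p & PRIMASK) / PPQ, and the lowest-numbered (best) busy queue is found
 * with a find-first-set scan of the queue bitmap:
 *
 *	#include <strings.h>			// ffs()
 *
 *	#define MAXPRI	128
 *	#define PRIMASK	(MAXPRI - 1)
 *	#define NQS	32
 *	#define PPQ	(MAXPRI / NQS)		// 4 priorities per queue
 *
 *	static int
 *	queue_index(int priority)
 *	{
 *		return ((priority & PRIMASK) / PPQ);	// 0..31
 *	}
 *
 *	static int
 *	first_busy_queue(unsigned int queuebits)
 *	{
 *		// ffs() is 1-based and returns 0 when no bit is set
 *		return (queuebits ? ffs(queuebits) - 1 : -1);
 *	}
 *
 * For example, priority 57 maps to queue 14, and if queuebits is 0x4010
 * the first busy (best) queue is 4.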
152 */ 153 static struct rq bsd4_queues[NQS]; 154 static struct rq bsd4_rtqueues[NQS]; 155 static struct rq bsd4_idqueues[NQS]; 156 static u_int32_t bsd4_queuebits; 157 static u_int32_t bsd4_rtqueuebits; 158 static u_int32_t bsd4_idqueuebits; 159 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ 160 static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ 161 static int bsd4_runqcount; 162 static volatile int bsd4_scancpu; 163 static struct spinlock bsd4_spin; 164 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; 165 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx; 166 static struct sysctl_oid *usched_bsd4_sysctl_tree; 167 168 /* Debug info exposed through debug.* sysctl */ 169 170 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, 171 &bsd4_runqcount, 0, 172 "Number of run queues"); 173 174 static int usched_bsd4_debug = -1; 175 SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW, 176 &usched_bsd4_debug, 0, 177 "Print debug information for this pid"); 178 179 static int usched_bsd4_pid_debug = -1; 180 SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW, 181 &usched_bsd4_pid_debug, 0, 182 "Print KTR debug information for this pid"); 183 184 /* Tunning usched_bsd4 - configurable through kern.usched_bsd4.* */ 185 static int usched_bsd4_smt = 0; 186 static int usched_bsd4_cache_coherent = 0; 187 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */ 188 static int usched_bsd4_queue_checks = 5; 189 static int usched_bsd4_stick_to_level = 0; 190 static long usched_bsd4_kicks; 191 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; 192 static int usched_bsd4_decay = 8; 193 static int usched_bsd4_batch_time = 10; 194 195 /* KTR debug printings */ 196 197 KTR_INFO_MASTER_EXTERN(usched); 198 199 #if !defined(KTR_USCHED_BSD4) 200 #define KTR_USCHED_BSD4 KTR_ALL 201 #endif 202 203 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0, 204 "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted " 205 "after release: pid %d, cpuid %d, curr_cpuid %d)", 206 pid_t pid, int cpuid, int curr); 207 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0, 208 "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, " 209 "curr_cpuid %d)", 210 pid_t pid, int cpuid, int curr); 211 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0, 212 "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after " 213 "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)", 214 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid); 215 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0, 216 "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, " 217 "cpuid %d, curr_cpuid %d)", 218 pid_t pid, int cpuid, int curr); 219 220 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0, 221 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 222 "cpuid %d, curr_cpuid %d)", 223 pid_t pid, int cpuid, int curr); 224 225 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0, 226 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 227 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)", 228 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr); 229 230 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0, 231 "USCHED_BSD4(batchy_looser_pri_test false: pid %d, " 232 "cpuid %d, verify_mask %lu)", 233 pid_t pid, int cpuid, cpumask_t mask); 234 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0, 235 "USCHED_BSD4(batchy_looser_pri_test true: pid %d, " 236 "cpuid 
%d, verify_mask %lu)", 237 pid_t pid, int cpuid, cpumask_t mask); 238 239 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, 240 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " 241 "mask %lu, curr_cpuid %d)", 242 pid_t pid, int cpuid, cpumask_t mask, int curr); 243 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, 244 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " 245 "cpuid %d, mask %lu, curr_cpuid %d)", 246 pid_t pid, int cpuid, cpumask_t mask, int curr); 247 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, 248 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " 249 "cpuid %d, mask %lu, curr_cpuid %d)", 250 pid_t pid, int cpuid, cpumask_t mask, int curr); 251 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, 252 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 253 "mask %lu, found_cpuid %d, curr_cpuid %d)", 254 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 255 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, 256 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " 257 "try_cpuid %d, curr_cpuid %d)", 258 pid_t pid, int cpuid, int try_cpuid, int curr); 259 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, 260 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 261 "mask %lu, found_cpuid %d, curr_cpuid %d)", 262 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 263 264 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0, 265 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", 266 pid_t pid, int old_cpuid, int curr); 267 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, 268 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", 269 pid_t pid, int old_cpuid, int curr); 270 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, 271 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " 272 "sibling_mask %lu, curr_cpumask %lu)", 273 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 274 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, 275 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " 276 "sibling_mask %lu, curr_cpumask: %lu)", 277 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 278 279 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, 280 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", 281 int id, pid_t pid, int cpuid); 282 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, 283 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", 284 int id, pid_t pid, int cpuid); 285 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, 286 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", 287 int id, cpumask_t tmpmask); 288 289 /* 290 * Initialize the run queues at boot time. 291 */ 292 static void 293 bsd4_rqinit(void *dummy) 294 { 295 int i; 296 297 spin_init(&bsd4_spin); 298 for (i = 0; i < NQS; i++) { 299 TAILQ_INIT(&bsd4_queues[i]); 300 TAILQ_INIT(&bsd4_rtqueues[i]); 301 TAILQ_INIT(&bsd4_idqueues[i]); 302 } 303 atomic_clear_cpumask(&bsd4_curprocmask, 1); 304 } 305 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL) 306 307 /* 308 * BSD4_ACQUIRE_CURPROC 309 * 310 * This function is called when the kernel intends to return to userland. 311 * It is responsible for making the thread the current designated userland 312 * thread for this cpu, blocking if necessary. 
313 * 314 * The kernel will not depress our LWKT priority until after we return, 315 * in case we have to shove over to another cpu. 316 * 317 * We must determine our thread's disposition before we switch away. This 318 * is very sensitive code. 319 * 320 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE 321 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will 322 * occur, this function is called only under very controlled circumstances. 323 * 324 * MPSAFE 325 */ 326 static void 327 bsd4_acquire_curproc(struct lwp *lp) 328 { 329 globaldata_t gd; 330 bsd4_pcpu_t dd; 331 thread_t td; 332 #if 0 333 struct lwp *olp; 334 #endif 335 336 /* 337 * Make sure we aren't sitting on a tsleep queue. 338 */ 339 td = lp->lwp_thread; 340 crit_enter_quick(td); 341 if (td->td_flags & TDF_TSLEEPQ) 342 tsleep_remove(td); 343 bsd4_recalculate_estcpu(lp); 344 345 /* 346 * If a reschedule was requested give another thread the 347 * driver's seat. 348 */ 349 if (user_resched_wanted()) { 350 clear_user_resched(); 351 bsd4_release_curproc(lp); 352 353 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw, 354 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 355 lp->lwp_proc->p_pid, 356 lp->lwp_thread->td_gd->gd_cpuid, 357 mycpu->gd_cpuid); 358 } 359 360 /* 361 * Loop until we are the current user thread 362 */ 363 gd = mycpu; 364 dd = &bsd4_pcpu[gd->gd_cpuid]; 365 366 KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop, 367 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 368 lp->lwp_proc->p_pid, 369 lp->lwp_thread->td_gd->gd_cpuid, 370 gd->gd_cpuid); 371 372 do { 373 /* 374 * Process any pending events and higher priority threads. 375 */ 376 lwkt_yield(); 377 378 /* 379 * Become the currently scheduled user thread for this cpu 380 * if we can do so trivially. 381 * 382 * We can steal another thread's current thread designation 383 * on this cpu since if we are running that other thread 384 * must not be, so we can safely deschedule it. 385 */ 386 if (dd->uschedcp == lp) { 387 /* 388 * We are already the current lwp (hot path). 389 */ 390 dd->upri = lp->lwp_priority; 391 } else if (dd->uschedcp == NULL) { 392 /* 393 * We can trivially become the current lwp. 394 */ 395 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 396 dd->uschedcp = lp; 397 dd->upri = lp->lwp_priority; 398 } else if (dd->upri > lp->lwp_priority) { 399 /* 400 * We can steal the current cpu's lwp designation 401 * away simply by replacing it. The other thread 402 * will stall when it tries to return to userland. 403 */ 404 dd->uschedcp = lp; 405 dd->upri = lp->lwp_priority; 406 /* 407 lwkt_deschedule(olp->lwp_thread); 408 bsd4_setrunqueue(olp); 409 */ 410 } else { 411 /* 412 * We cannot become the current lwp, place the lp 413 * on the bsd4 run-queue and deschedule ourselves. 414 * 415 * When we are reactivated we will have another 416 * chance. 417 */ 418 lwkt_deschedule(lp->lwp_thread); 419 420 bsd4_setrunqueue(lp); 421 422 KTR_COND_LOG(usched_bsd4_acquire_curproc_not, 423 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 424 lp->lwp_proc->p_pid, 425 lp->lwp_thread->td_gd->gd_cpuid, 426 dd->uschedcp->lwp_proc->p_pid, 427 gd->gd_cpuid); 428 429 430 lwkt_switch(); 431 432 /* 433 * Reload after a switch or setrunqueue/switch possibly 434 * moved us to another cpu. 
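 *
 * A condensed model of the decision made by the loop above (a sketch
 * only; the hypothetical try_become_uschedcp() is for illustration and
 * is not a routine in this scheduler):
 *
 *	// returns 1 if lp now owns this cpu's user slot, 0 if it must queue
 *	static int
 *	try_become_uschedcp(bsd4_pcpu_t dd, struct lwp *lp)
 *	{
 *		if (dd->uschedcp == lp) {		// already current (hot path)
 *			dd->upri = lp->lwp_priority;
 *			return (1);
 *		}
 *		if (dd->uschedcp == NULL) {		// slot is free, take it
 *			dd->uschedcp = lp;
 *			dd->upri = lp->lwp_priority;
 *			return (1);
 *		}
 *		if (dd->upri > lp->lwp_priority) {	// we are better, steal it
 *			dd->uschedcp = lp;
 *			dd->upri = lp->lwp_priority;
 *			return (1);
 *		}
 *		return (0);	// caller queues itself and switches away
 *	}
 *
 * Lower lwp_priority values are better, which is why the steal test is a
 * plain '>' comparison against the incumbent's recorded upri.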
435 */ 436 gd = mycpu; 437 dd = &bsd4_pcpu[gd->gd_cpuid]; 438 439 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch, 440 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 441 lp->lwp_proc->p_pid, 442 lp->lwp_thread->td_gd->gd_cpuid, 443 gd->gd_cpuid); 444 } 445 } while (dd->uschedcp != lp); 446 447 crit_exit_quick(td); 448 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 449 } 450 451 /* 452 * BSD4_RELEASE_CURPROC 453 * 454 * This routine detaches the current thread from the userland scheduler, 455 * usually because the thread needs to run or block in the kernel (at 456 * kernel priority) for a while. 457 * 458 * This routine is also responsible for selecting a new thread to 459 * make the current thread. 460 * 461 * NOTE: This implementation differs from the dummy example in that 462 * bsd4_select_curproc() is able to select the current process, whereas 463 * dummy_select_curproc() is not able to select the current process. 464 * This means we have to NULL out uschedcp. 465 * 466 * Additionally, note that we may already be on a run queue if releasing 467 * via the lwkt_switch() in bsd4_setrunqueue(). 468 * 469 * MPSAFE 470 */ 471 472 static void 473 bsd4_release_curproc(struct lwp *lp) 474 { 475 globaldata_t gd = mycpu; 476 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 477 478 if (dd->uschedcp == lp) { 479 crit_enter(); 480 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 481 482 KTR_COND_LOG(usched_bsd4_release_curproc, 483 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 484 lp->lwp_proc->p_pid, 485 lp->lwp_thread->td_gd->gd_cpuid, 486 gd->gd_cpuid); 487 488 dd->uschedcp = NULL; /* don't let lp be selected */ 489 dd->upri = PRIBASE_NULL; 490 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 491 dd->old_uschedcp = lp; /* used only for KTR debug prints */ 492 bsd4_select_curproc(gd); 493 crit_exit(); 494 } 495 } 496 497 /* 498 * BSD4_SELECT_CURPROC 499 * 500 * Select a new current process for this cpu and clear any pending user 501 * reschedule request. The cpu currently has no current process. 502 * 503 * This routine is also responsible for equal-priority round-robining, 504 * typically triggered from bsd4_schedulerclock(). In our dummy example 505 * all the 'user' threads are LWKT scheduled all at once and we just 506 * call lwkt_switch(). 507 * 508 * The calling process is not on the queue and cannot be selected. 
509 * 510 * MPSAFE 511 */ 512 static 513 void 514 bsd4_select_curproc(globaldata_t gd) 515 { 516 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 517 struct lwp *nlp; 518 int cpuid = gd->gd_cpuid; 519 520 crit_enter_gd(gd); 521 522 spin_lock(&bsd4_spin); 523 if(usched_bsd4_cache_coherent) 524 nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp); 525 else 526 nlp = bsd4_chooseproc_locked(dd->uschedcp); 527 528 if (nlp) { 529 530 KTR_COND_LOG(usched_bsd4_select_curproc, 531 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 532 nlp->lwp_proc->p_pid, 533 nlp->lwp_thread->td_gd->gd_cpuid, 534 dd->old_uschedcp->lwp_proc->p_pid, 535 dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid, 536 gd->gd_cpuid); 537 538 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid)); 539 dd->upri = nlp->lwp_priority; 540 dd->uschedcp = nlp; 541 dd->rrcount = 0; /* reset round robin */ 542 spin_unlock(&bsd4_spin); 543 lwkt_acquire(nlp->lwp_thread); 544 lwkt_schedule(nlp->lwp_thread); 545 } else { 546 spin_unlock(&bsd4_spin); 547 } 548 549 #if 0 550 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) { 551 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 552 spin_unlock(&bsd4_spin); 553 lwkt_schedule(&dd->helper_thread); 554 } else { 555 spin_unlock(&bsd4_spin); 556 } 557 #endif 558 crit_exit_gd(gd); 559 } 560 561 /* 562 * batchy_looser_pri_test() - determine if a process is batchy or not 563 * relative to the other processes running in the system 564 */ 565 static int 566 bsd4_batchy_looser_pri_test(struct lwp* lp) 567 { 568 cpumask_t mask; 569 bsd4_pcpu_t other_dd; 570 int cpu; 571 572 /* Current running processes */ 573 mask = bsd4_curprocmask & smp_active_mask 574 & usched_global_cpumask; 575 576 while(mask) { 577 cpu = BSFCPUMASK(mask); 578 other_dd = &bsd4_pcpu[cpu]; 579 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) { 580 581 KTR_COND_LOG(usched_batchy_test_false, 582 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 583 lp->lwp_proc->p_pid, 584 lp->lwp_thread->td_gd->gd_cpuid, 585 (unsigned long)mask); 586 587 return 0; 588 } 589 mask &= ~CPUMASK(cpu); 590 } 591 592 KTR_COND_LOG(usched_batchy_test_true, 593 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 594 lp->lwp_proc->p_pid, 595 lp->lwp_thread->td_gd->gd_cpuid, 596 (unsigned long)mask); 597 598 return 1; 599 } 600 601 /* 602 * 603 * BSD4_SETRUNQUEUE 604 * 605 * Place the specified lwp on the user scheduler's run queue. This routine 606 * must be called with the thread descheduled. The lwp must be runnable. 607 * 608 * The thread may be the current thread as a special case. 609 * 610 * MPSAFE 611 */ 612 static void 613 bsd4_setrunqueue(struct lwp *lp) 614 { 615 globaldata_t gd; 616 bsd4_pcpu_t dd; 617 int cpuid; 618 cpumask_t mask; 619 cpumask_t tmpmask; 620 621 /* 622 * First validate the process state relative to the current cpu. 623 * We don't need the spinlock for this, just a critical section. 624 * We are in control of the process. 625 */ 626 crit_enter(); 627 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN")); 628 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0, 629 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid, 630 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags)); 631 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); 632 633 /* 634 * Note: gd and dd are relative to the target thread's last cpu, 635 * NOT our current cpu. 
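 *
 * The cpu searches below all use the same bitmask walk.  A minimal
 * userland model of that idiom (assuming a single-word mask here for
 * illustration; the real cpumask_t may be wider):
 *
 *	#include <strings.h>			// ffs()
 *
 *	static void
 *	foreach_cpu(unsigned int mask)
 *	{
 *		while (mask) {
 *			int cpu = ffs(mask) - 1;	// lowest set bit
 *			// ... inspect per-cpu data for 'cpu' here ...
 *			mask &= ~(1U << cpu);		// strip it and continue
 *		}
 *	}
 *
 * BSFCPUMASK() and CPUMASK() play the ffs()/shift roles in the kernel
 * loops that follow.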
636 */ 637 gd = lp->lwp_thread->td_gd; 638 dd = &bsd4_pcpu[gd->gd_cpuid]; 639 640 /* 641 * This process is not supposed to be scheduled anywhere or assigned 642 * as the current process anywhere. Assert the condition. 643 */ 644 KKASSERT(dd->uschedcp != lp); 645 646 /* 647 * XXX fixme. Could be part of a remrunqueue/setrunqueue 648 * operation when the priority is recalculated, so TDF_MIGRATING 649 * may already be set. 650 */ 651 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) 652 lwkt_giveaway(lp->lwp_thread); 653 654 /* 655 * We lose control of lp the moment we release the spinlock after 656 * having placed lp on the queue. i.e. another cpu could pick it 657 * up and it could exit, or its priority could be further adjusted, 658 * or something like that. 659 */ 660 spin_lock(&bsd4_spin); 661 bsd4_setrunqueue_locked(lp); 662 lp->lwp_rebal_ticks = sched_ticks; 663 664 /* 665 * Kick the scheduler helper on one of the other cpu's 666 * and request a reschedule if appropriate. 667 * 668 * NOTE: We check all cpus whos rdyprocmask is set. First we 669 * look for cpus without designated lps, then we look for 670 * cpus with designated lps with a worse priority than our 671 * process. 672 */ 673 ++bsd4_scancpu; 674 675 if (usched_bsd4_smt) { 676 677 /* 678 * SMT heuristic - Try to schedule on a free physical core. 679 * If no physical core found than choose the one that has 680 * an interactive thread. 681 */ 682 683 int best_cpuid = -1; 684 int min_prio = MAXPRI * MAXPRI; 685 int sibling; 686 687 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 688 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 689 smp_active_mask & usched_global_cpumask; 690 691 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt, 692 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 693 lp->lwp_proc->p_pid, 694 lp->lwp_thread->td_gd->gd_cpuid, 695 (unsigned long)mask, 696 mycpu->gd_cpuid); 697 698 while (mask) { 699 tmpmask = ~(CPUMASK(cpuid) - 1); 700 if (mask & tmpmask) 701 cpuid = BSFCPUMASK(mask & tmpmask); 702 else 703 cpuid = BSFCPUMASK(mask); 704 gd = globaldata_find(cpuid); 705 dd = &bsd4_pcpu[cpuid]; 706 707 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 708 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) { 709 710 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 711 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 712 lp->lwp_proc->p_pid, 713 lp->lwp_thread->td_gd->gd_cpuid, 714 (unsigned long)mask, 715 cpuid, 716 mycpu->gd_cpuid); 717 718 goto found; 719 } else { 720 sibling = BSFCPUMASK(dd->cpunode->parent_node->members & 721 ~dd->cpunode->members); 722 if (min_prio > bsd4_pcpu[sibling].upri) { 723 min_prio = bsd4_pcpu[sibling].upri; 724 best_cpuid = cpuid; 725 } 726 } 727 } 728 mask &= ~CPUMASK(cpuid); 729 } 730 731 if (best_cpuid != -1) { 732 cpuid = best_cpuid; 733 gd = globaldata_find(cpuid); 734 dd = &bsd4_pcpu[cpuid]; 735 736 KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid, 737 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 738 lp->lwp_proc->p_pid, 739 lp->lwp_thread->td_gd->gd_cpuid, 740 (unsigned long)mask, 741 cpuid, 742 mycpu->gd_cpuid); 743 744 goto found; 745 } 746 } else { 747 /* Fallback to the original heuristic */ 748 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 749 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 750 smp_active_mask & usched_global_cpumask; 751 752 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt, 753 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 754 lp->lwp_proc->p_pid, 755 lp->lwp_thread->td_gd->gd_cpuid, 756 (unsigned long)mask, 757 
mycpu->gd_cpuid); 758 759 while (mask) { 760 tmpmask = ~(CPUMASK(cpuid) - 1); 761 if (mask & tmpmask) 762 cpuid = BSFCPUMASK(mask & tmpmask); 763 else 764 cpuid = BSFCPUMASK(mask); 765 gd = globaldata_find(cpuid); 766 dd = &bsd4_pcpu[cpuid]; 767 768 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 769 770 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 771 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 772 lp->lwp_proc->p_pid, 773 lp->lwp_thread->td_gd->gd_cpuid, 774 (unsigned long)mask, 775 cpuid, 776 mycpu->gd_cpuid); 777 778 goto found; 779 } 780 mask &= ~CPUMASK(cpuid); 781 } 782 } 783 784 /* 785 * Then cpus which might have a currently running lp 786 */ 787 mask = bsd4_curprocmask & bsd4_rdyprocmask & 788 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask; 789 790 KTR_COND_LOG(usched_bsd4_setrunqueue_rc, 791 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 792 lp->lwp_proc->p_pid, 793 lp->lwp_thread->td_gd->gd_cpuid, 794 (unsigned long)mask, 795 mycpu->gd_cpuid); 796 797 while (mask) { 798 tmpmask = ~(CPUMASK(cpuid) - 1); 799 if (mask & tmpmask) 800 cpuid = BSFCPUMASK(mask & tmpmask); 801 else 802 cpuid = BSFCPUMASK(mask); 803 gd = globaldata_find(cpuid); 804 dd = &bsd4_pcpu[cpuid]; 805 806 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 807 808 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 809 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 810 lp->lwp_proc->p_pid, 811 lp->lwp_thread->td_gd->gd_cpuid, 812 (unsigned long)mask, 813 cpuid, 814 mycpu->gd_cpuid); 815 816 goto found; 817 } 818 mask &= ~CPUMASK(cpuid); 819 } 820 821 /* 822 * If we cannot find a suitable cpu we reload from bsd4_scancpu 823 * and round-robin. Other cpus will pickup as they release their 824 * current lwps or become ready. 825 * 826 * Avoid a degenerate system lockup case if usched_global_cpumask 827 * is set to 0 or otherwise does not cover lwp_cpumask. 828 * 829 * We only kick the target helper thread in this case, we do not 830 * set the user resched flag because 831 */ 832 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 833 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) { 834 cpuid = 0; 835 } 836 gd = globaldata_find(cpuid); 837 dd = &bsd4_pcpu[cpuid]; 838 839 KTR_COND_LOG(usched_bsd4_setrunqueue_not_found, 840 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 841 lp->lwp_proc->p_pid, 842 lp->lwp_thread->td_gd->gd_cpuid, 843 cpuid, 844 mycpu->gd_cpuid); 845 846 found: 847 if (gd == mycpu) { 848 spin_unlock(&bsd4_spin); 849 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 850 if (dd->uschedcp == NULL) { 851 wakeup_mycpu(&dd->helper_thread); 852 } else { 853 need_user_resched(); 854 } 855 } 856 } else { 857 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 858 spin_unlock(&bsd4_spin); 859 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) 860 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL); 861 else 862 wakeup(&dd->helper_thread); 863 } 864 crit_exit(); 865 } 866 867 /* 868 * This routine is called from a systimer IPI. It MUST be MP-safe and 869 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on 870 * each cpu. 871 * 872 * This routine is called on every sched tick. If the currently running 873 * thread belongs to this scheduler it will be called with a non-NULL lp, 874 * otherwise it will be called with a NULL lp. 
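 *
 * A worked example of the per-tick bookkeeping below, assuming for the
 * sake of illustration that ESTCPUFREQ is 50 ticks/second (the actual
 * value is defined elsewhere):
 *
 *	rrinterval = (ESTCPUFREQ + 9) / 10 = 5 ticks
 *		=> a cpu-bound thread is asked to round-robin ~10x/second
 *
 *	per-tick charge = ESTCPUMAX / ESTCPUFREQ + 1 = 16384 / 50 + 1 = 328
 *		=> roughly 50 consecutive ticks (~1 second of solid cpu)
 *		   saturate estcpu at ESTCPUMAX (16384), after which
 *		   ESTCPULIM() simply clamps it.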
875 * 876 * MPSAFE 877 */ 878 static 879 void 880 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) 881 { 882 globaldata_t gd = mycpu; 883 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 884 885 /* 886 * No impl if no lp running. 887 */ 888 if (lp == NULL) 889 return; 890 891 /* 892 * Do we need to round-robin? We round-robin 10 times a second. 893 * This should only occur for cpu-bound batch processes. 894 */ 895 if (++dd->rrcount >= usched_bsd4_rrinterval) { 896 dd->rrcount = 0; 897 need_user_resched(); 898 } 899 900 /* 901 * Adjust estcpu upward using a real time equivalent calculation. 902 */ 903 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1); 904 905 /* 906 * Spinlocks also hold a critical section so there should not be 907 * any active. 908 */ 909 KKASSERT(gd->gd_spinlocks == 0); 910 911 bsd4_resetpriority(lp); 912 } 913 914 /* 915 * Called from acquire and from kern_synch's one-second timer (one of the 916 * callout helper threads) with a critical section held. 917 * 918 * Decay p_estcpu based on the number of ticks we haven't been running 919 * and our p_nice. As the load increases each process observes a larger 920 * number of idle ticks (because other processes are running in them). 921 * This observation leads to a larger correction which tends to make the 922 * system more 'batchy'. 923 * 924 * Note that no recalculation occurs for a process which sleeps and wakes 925 * up in the same tick. That is, a system doing thousands of context 926 * switches per second will still only do serious estcpu calculations 927 * ESTCPUFREQ times per second. 928 * 929 * MPSAFE 930 */ 931 static 932 void 933 bsd4_recalculate_estcpu(struct lwp *lp) 934 { 935 globaldata_t gd = mycpu; 936 sysclock_t cpbase; 937 sysclock_t ttlticks; 938 int estcpu; 939 int decay_factor; 940 941 /* 942 * We have to subtract periodic to get the last schedclock 943 * timeout time, otherwise we would get the upcoming timeout. 944 * Keep in mind that a process can migrate between cpus and 945 * while the scheduler clock should be very close, boundary 946 * conditions could lead to a small negative delta. 947 */ 948 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; 949 950 if (lp->lwp_slptime > 1) { 951 /* 952 * Too much time has passed, do a coarse correction. 953 */ 954 lp->lwp_estcpu = lp->lwp_estcpu >> 1; 955 bsd4_resetpriority(lp); 956 lp->lwp_cpbase = cpbase; 957 lp->lwp_cpticks = 0; 958 lp->lwp_batch -= ESTCPUFREQ; 959 if (lp->lwp_batch < 0) 960 lp->lwp_batch = 0; 961 } else if (lp->lwp_cpbase != cpbase) { 962 /* 963 * Adjust estcpu if we are in a different tick. Don't waste 964 * time if we are in the same tick. 965 * 966 * First calculate the number of ticks in the measurement 967 * interval. The ttlticks calculation can wind up 0 due to 968 * a bug in the handling of lwp_slptime (as yet not found), 969 * so make sure we do not get a divide by 0 panic. 970 */ 971 ttlticks = (cpbase - lp->lwp_cpbase) / 972 gd->gd_schedclock.periodic; 973 if ((ssysclock_t)ttlticks < 0) { 974 ttlticks = 0; 975 lp->lwp_cpbase = cpbase; 976 } 977 if (ttlticks == 0) 978 return; 979 updatepcpu(lp, lp->lwp_cpticks, ttlticks); 980 981 /* 982 * Calculate the percentage of one cpu used factoring in ncpus 983 * and the load and adjust estcpu. Handle degenerate cases 984 * by adding 1 to bsd4_runqcount. 985 * 986 * estcpu is scaled by ESTCPUMAX. 987 * 988 * bsd4_runqcount is the excess number of user processes 989 * that cannot be immediately scheduled to cpus. 
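 *
 * Sketch of the calculation this leads into, with example numbers
 * (hypothetical figures, chosen only to make the arithmetic visible):
 * a thread that ran cpticks = 4 of ttlticks = 8 ticks on ncpus = 2 with
 * bsd4_runqcount = 0 queued gives
 *
 *	estcpu = (4 * 16384) * (0 + 2) / (2 * 8) = 8192	(50% of one cpu)
 *
 * The same run with two excess runnable threads (runqcount = 2) inflates
 * the estimate to 16384, i.e. queue pressure makes everyone look busier.
 * The instantaneous value is then folded into the long-term average by
 * the decay filter further below,
 *
 *	lwp_estcpu = (lwp_estcpu * decay + estcpu) / (decay + 1)
 *
 * so with the default decay of 8 a step change is approached
 * geometrically, reaching about 70% of the new value after roughly ten
 * updates.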
We want 990 * to count these as running to avoid range compression 991 * in the base calculation (which is the actual percentage 992 * of one cpu used). 993 */ 994 estcpu = (lp->lwp_cpticks * ESTCPUMAX) * 995 (bsd4_runqcount + ncpus) / (ncpus * ttlticks); 996 997 /* 998 * If estcpu is > 50% we become more batch-like 999 * If estcpu is <= 50% we become less batch-like 1000 * 1001 * It takes 30 cpu seconds to traverse the entire range. 1002 */ 1003 if (estcpu > ESTCPUMAX / 2) { 1004 lp->lwp_batch += ttlticks; 1005 if (lp->lwp_batch > BATCHMAX) 1006 lp->lwp_batch = BATCHMAX; 1007 } else { 1008 lp->lwp_batch -= ttlticks; 1009 if (lp->lwp_batch < 0) 1010 lp->lwp_batch = 0; 1011 } 1012 1013 if (usched_bsd4_debug == lp->lwp_proc->p_pid) { 1014 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d", 1015 lp->lwp_proc->p_pid, lp, 1016 estcpu, lp->lwp_estcpu, 1017 lp->lwp_batch, 1018 lp->lwp_cpticks, ttlticks); 1019 } 1020 1021 /* 1022 * Adjust lp->lwp_esetcpu. The decay factor determines how 1023 * quickly lwp_estcpu collapses to its realtime calculation. 1024 * A slower collapse gives us a more accurate number but 1025 * can cause a cpu hog to eat too much cpu before the 1026 * scheduler decides to downgrade it. 1027 * 1028 * NOTE: p_nice is accounted for in bsd4_resetpriority(), 1029 * and not here, but we must still ensure that a 1030 * cpu-bound nice -20 process does not completely 1031 * override a cpu-bound nice +20 process. 1032 * 1033 * NOTE: We must use ESTCPULIM() here to deal with any 1034 * overshoot. 1035 */ 1036 decay_factor = usched_bsd4_decay; 1037 if (decay_factor < 1) 1038 decay_factor = 1; 1039 if (decay_factor > 1024) 1040 decay_factor = 1024; 1041 1042 lp->lwp_estcpu = ESTCPULIM( 1043 (lp->lwp_estcpu * decay_factor + estcpu) / 1044 (decay_factor + 1)); 1045 1046 if (usched_bsd4_debug == lp->lwp_proc->p_pid) 1047 kprintf(" finalestcpu %d\n", lp->lwp_estcpu); 1048 bsd4_resetpriority(lp); 1049 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic; 1050 lp->lwp_cpticks = 0; 1051 } 1052 } 1053 1054 /* 1055 * Compute the priority of a process when running in user mode. 1056 * Arrange to reschedule if the resulting priority is better 1057 * than that of the current process. 1058 * 1059 * This routine may be called with any process. 1060 * 1061 * This routine is called by fork1() for initial setup with the process 1062 * of the run queue, and also may be called normally with the process on or 1063 * off the run queue. 1064 * 1065 * MPSAFE 1066 */ 1067 static void 1068 bsd4_resetpriority(struct lwp *lp) 1069 { 1070 bsd4_pcpu_t dd; 1071 int newpriority; 1072 u_short newrqtype; 1073 int reschedcpu; 1074 int checkpri; 1075 int estcpu; 1076 1077 /* 1078 * Calculate the new priority and queue type 1079 */ 1080 crit_enter(); 1081 spin_lock(&bsd4_spin); 1082 1083 newrqtype = lp->lwp_rtprio.type; 1084 1085 switch(newrqtype) { 1086 case RTP_PRIO_REALTIME: 1087 case RTP_PRIO_FIFO: 1088 newpriority = PRIBASE_REALTIME + 1089 (lp->lwp_rtprio.prio & PRIMASK); 1090 break; 1091 case RTP_PRIO_NORMAL: 1092 /* 1093 * Detune estcpu based on batchiness. lwp_batch ranges 1094 * from 0 to BATCHMAX. Limit estcpu for the sake of 1095 * the priority calculation to between 50% and 100%. 
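 *
 * Worked example of the detune and of the priority composition just
 * below (the numbers are illustrative):
 *
 *	fully interactive thread (lwp_batch = 0):
 *		estcpu used = lwp_estcpu * (0 + BATCHMAX) / (2 * BATCHMAX)
 *			    = lwp_estcpu / 2			(50%)
 *	fully batchy thread (lwp_batch = BATCHMAX):
 *		estcpu used = lwp_estcpu			(100%)
 *
 *	with nice 0 and a detuned estcpu of 8192:
 *		nice part   = (0 - PRIO_MIN) * PPQ / NICEPPQ = 20 * 4 / 2 = 40
 *		estcpu part = 8192 * PPQ / ESTCPUPPQ = 8192 * 4 / 512    = 64
 *		raw         = 40 + 64 = 104
 *		scaled      = 104 * MAXPRI / (41 * 4 / 2 + 16384 * 4 / 512)
 *			    = 104 * 128 / 210 = 63
 *		newpriority = PRIBASE_NORMAL + 63  => normal-class queue 15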
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384  * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 *
	 * td_upri has normal sense (higher values are more desirable), so
	 * negate it.
	 */
	lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK);
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->pri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
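	 *
	 * In outline, the decision made below (a sketch; "worse" means
	 * numerically larger, i.e. less desirable, compared at class
	 * granularity with ~PRIMASK):
	 *
	 *	if (target cpu advertises itself in bsd4_rdyprocmask &&
	 *	    (checkpri == 0 || target's upri is worse than lp's)) {
	 *		if (target cpu == this cpu)
	 *			need_user_resched();	// local flag only
	 *		else
	 *			clear its rdyprocmask bit and send it an
	 *			IPI that sets the flag and wakes its
	 *			helper thread;
	 *	}
	 *
	 * Every other path simply releases the spinlock and returns.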
1166 */ 1167 if (reschedcpu >= 0) { 1168 dd = &bsd4_pcpu[reschedcpu]; 1169 if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) && 1170 (checkpri == 0 || 1171 (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) { 1172 if (reschedcpu == mycpu->gd_cpuid) { 1173 spin_unlock(&bsd4_spin); 1174 need_user_resched(); 1175 } else { 1176 spin_unlock(&bsd4_spin); 1177 atomic_clear_cpumask(&bsd4_rdyprocmask, 1178 CPUMASK(reschedcpu)); 1179 lwkt_send_ipiq(lp->lwp_thread->td_gd, 1180 bsd4_need_user_resched_remote, 1181 NULL); 1182 } 1183 } else { 1184 spin_unlock(&bsd4_spin); 1185 } 1186 } else { 1187 spin_unlock(&bsd4_spin); 1188 } 1189 crit_exit(); 1190 } 1191 1192 /* 1193 * MPSAFE 1194 */ 1195 static 1196 void 1197 bsd4_yield(struct lwp *lp) 1198 { 1199 #if 0 1200 /* FUTURE (or something similar) */ 1201 switch(lp->lwp_rqtype) { 1202 case RTP_PRIO_NORMAL: 1203 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR); 1204 break; 1205 default: 1206 break; 1207 } 1208 #endif 1209 need_user_resched(); 1210 } 1211 1212 /* 1213 * Called from fork1() when a new child process is being created. 1214 * 1215 * Give the child process an initial estcpu that is more batch then 1216 * its parent and dock the parent for the fork (but do not 1217 * reschedule the parent). This comprises the main part of our batch 1218 * detection heuristic for both parallel forking and sequential execs. 1219 * 1220 * XXX lwp should be "spawning" instead of "forking" 1221 * 1222 * MPSAFE 1223 */ 1224 static void 1225 bsd4_forking(struct lwp *plp, struct lwp *lp) 1226 { 1227 /* 1228 * Put the child 4 queue slots (out of 32) higher than the parent 1229 * (less desireable than the parent). 1230 */ 1231 lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4); 1232 1233 /* 1234 * The batch status of children always starts out centerline 1235 * and will inch-up or inch-down as appropriate. It takes roughly 1236 * ~15 seconds of >50% cpu to hit the limit. 1237 */ 1238 lp->lwp_batch = BATCHMAX / 2; 1239 1240 /* 1241 * Dock the parent a cost for the fork, protecting us from fork 1242 * bombs. If the parent is forking quickly make the child more 1243 * batchy. 1244 */ 1245 plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16); 1246 } 1247 1248 /* 1249 * Called when a lwp is being removed from this scheduler, typically 1250 * during lwp_exit(). 1251 */ 1252 static void 1253 bsd4_exiting(struct lwp *lp, struct proc *child_proc) 1254 { 1255 } 1256 1257 static void 1258 bsd4_uload_update(struct lwp *lp) 1259 { 1260 } 1261 1262 /* 1263 * chooseproc() is called when a cpu needs a user process to LWKT schedule, 1264 * it selects a user process and returns it. If chklp is non-NULL and chklp 1265 * has a better or equal priority then the process that would otherwise be 1266 * chosen, NULL is returned. 1267 * 1268 * Until we fix the RUNQ code the chklp test has to be strict or we may 1269 * bounce between processes trying to acquire the current process designation. 1270 * 1271 * MPSAFE - must be called with bsd4_spin exclusive held. The spinlock is 1272 * left intact through the entire routine. 
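 *
 * The <chklp> test below implements a small hysteresis, for example:
 *
 *	chklp->lwp_priority = 160, lp->lwp_priority = 158, PPQ = 4:
 *		160 < 158 + 4  -> return NULL, keep the current thread;
 *	chklp->lwp_priority = 164, lp->lwp_priority = 158:
 *		164 >= 162     -> hand the cpu to lp.
 *
 * A queued thread must therefore beat the incumbent by at least a full
 * queue (PPQ) before it is worth displacing it, which keeps two threads
 * of nearly equal priority from ping-ponging the current-process
 * designation between them.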
1273 */ 1274 static 1275 struct lwp * 1276 bsd4_chooseproc_locked(struct lwp *chklp) 1277 { 1278 struct lwp *lp; 1279 struct rq *q; 1280 u_int32_t *which, *which2; 1281 u_int32_t pri; 1282 u_int32_t rtqbits; 1283 u_int32_t tsqbits; 1284 u_int32_t idqbits; 1285 cpumask_t cpumask; 1286 1287 rtqbits = bsd4_rtqueuebits; 1288 tsqbits = bsd4_queuebits; 1289 idqbits = bsd4_idqueuebits; 1290 cpumask = mycpu->gd_cpumask; 1291 1292 1293 again: 1294 if (rtqbits) { 1295 pri = bsfl(rtqbits); 1296 q = &bsd4_rtqueues[pri]; 1297 which = &bsd4_rtqueuebits; 1298 which2 = &rtqbits; 1299 } else if (tsqbits) { 1300 pri = bsfl(tsqbits); 1301 q = &bsd4_queues[pri]; 1302 which = &bsd4_queuebits; 1303 which2 = &tsqbits; 1304 } else if (idqbits) { 1305 pri = bsfl(idqbits); 1306 q = &bsd4_idqueues[pri]; 1307 which = &bsd4_idqueuebits; 1308 which2 = &idqbits; 1309 } else { 1310 return NULL; 1311 } 1312 lp = TAILQ_FIRST(q); 1313 KASSERT(lp, ("chooseproc: no lwp on busy queue")); 1314 1315 while ((lp->lwp_cpumask & cpumask) == 0) { 1316 lp = TAILQ_NEXT(lp, lwp_procq); 1317 if (lp == NULL) { 1318 *which2 &= ~(1 << pri); 1319 goto again; 1320 } 1321 } 1322 1323 /* 1324 * If the passed lwp <chklp> is reasonably close to the selected 1325 * lwp <lp>, return NULL (indicating that <chklp> should be kept). 1326 * 1327 * Note that we must error on the side of <chklp> to avoid bouncing 1328 * between threads in the acquire code. 1329 */ 1330 if (chklp) { 1331 if (chklp->lwp_priority < lp->lwp_priority + PPQ) 1332 return(NULL); 1333 } 1334 1335 /* 1336 * If the chosen lwp does not reside on this cpu spend a few 1337 * cycles looking for a better candidate at the same priority level. 1338 * This is a fallback check, setrunqueue() tries to wakeup the 1339 * correct cpu and is our front-line affinity. 1340 */ 1341 if (lp->lwp_thread->td_gd != mycpu && 1342 (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL 1343 ) { 1344 if (chklp->lwp_thread->td_gd == mycpu) { 1345 lp = chklp; 1346 } 1347 } 1348 1349 KTR_COND_LOG(usched_chooseproc, 1350 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1351 lp->lwp_proc->p_pid, 1352 lp->lwp_thread->td_gd->gd_cpuid, 1353 mycpu->gd_cpuid); 1354 1355 TAILQ_REMOVE(q, lp, lwp_procq); 1356 --bsd4_runqcount; 1357 if (TAILQ_EMPTY(q)) 1358 *which &= ~(1 << pri); 1359 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!")); 1360 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1361 1362 return lp; 1363 } 1364 1365 /* 1366 * chooseproc() - with a cache coherence heuristic. Try to pull a process that 1367 * has its home on the current CPU> If the process doesn't have its home here 1368 * and is a batchy one (see batcy_looser_pri_test), we can wait for a 1369 * sched_tick, may be its home will become free and pull it in. Anyway, 1370 * we can't wait more than one tick. If that tick expired, we pull in that 1371 * process, no matter what. 
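 *
 * Concretely: bsd4_setrunqueue() stamps lwp_rebal_ticks = sched_ticks
 * when it queues the thread, so the test below,
 *
 *	lwp_rebal_ticks == sched_ticks ||
 *	lwp_rebal_ticks == (int)(sched_ticks - 1)
 *
 * is true only during the tick in which the thread was queued and the
 * tick after it.  A batchy thread whose home cpu is elsewhere is skipped
 * only inside that window; once the window expires it is taken by
 * whichever cpu gets here first, no matter what.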
1372 */ 1373 static 1374 struct lwp * 1375 bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp) 1376 { 1377 struct lwp *lp; 1378 struct rq *q; 1379 u_int32_t *which, *which2; 1380 u_int32_t pri; 1381 u_int32_t checks; 1382 u_int32_t rtqbits; 1383 u_int32_t tsqbits; 1384 u_int32_t idqbits; 1385 cpumask_t cpumask; 1386 1387 struct lwp * min_level_lwp = NULL; 1388 struct rq *min_q = NULL; 1389 cpumask_t siblings; 1390 cpu_node_t* cpunode = NULL; 1391 u_int32_t min_level = MAXCPU; /* number of levels < MAXCPU */ 1392 u_int32_t *min_which = NULL; 1393 u_int32_t min_pri = 0; 1394 u_int32_t level = 0; 1395 1396 rtqbits = bsd4_rtqueuebits; 1397 tsqbits = bsd4_queuebits; 1398 idqbits = bsd4_idqueuebits; 1399 cpumask = mycpu->gd_cpumask; 1400 1401 /* Get the mask coresponding to the sysctl configured level */ 1402 cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode; 1403 level = usched_bsd4_stick_to_level; 1404 while (level) { 1405 cpunode = cpunode->parent_node; 1406 level--; 1407 } 1408 /* The cpus which can ellect a process */ 1409 siblings = cpunode->members; 1410 checks = 0; 1411 1412 again: 1413 if (rtqbits) { 1414 pri = bsfl(rtqbits); 1415 q = &bsd4_rtqueues[pri]; 1416 which = &bsd4_rtqueuebits; 1417 which2 = &rtqbits; 1418 } else if (tsqbits) { 1419 pri = bsfl(tsqbits); 1420 q = &bsd4_queues[pri]; 1421 which = &bsd4_queuebits; 1422 which2 = &tsqbits; 1423 } else if (idqbits) { 1424 pri = bsfl(idqbits); 1425 q = &bsd4_idqueues[pri]; 1426 which = &bsd4_idqueuebits; 1427 which2 = &idqbits; 1428 } else { 1429 /* 1430 * No more left and we didn't reach the checks limit. 1431 */ 1432 bsd4_kick_helper(min_level_lwp); 1433 return NULL; 1434 } 1435 lp = TAILQ_FIRST(q); 1436 KASSERT(lp, ("chooseproc: no lwp on busy queue")); 1437 1438 /* 1439 * Limit the number of checks/queue to a configurable value to 1440 * minimize the contention (we are in a locked region 1441 */ 1442 while (checks < usched_bsd4_queue_checks) { 1443 if ((lp->lwp_cpumask & cpumask) == 0 || 1444 ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 && 1445 (lp->lwp_rebal_ticks == sched_ticks || 1446 lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) && 1447 bsd4_batchy_looser_pri_test(lp))) { 1448 1449 KTR_COND_LOG(usched_chooseproc_cc_not_good, 1450 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1451 lp->lwp_proc->p_pid, 1452 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask, 1453 (unsigned long)siblings, 1454 (unsigned long)cpumask); 1455 1456 cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode; 1457 level = 0; 1458 while (cpunode) { 1459 if (cpunode->members & cpumask) 1460 break; 1461 cpunode = cpunode->parent_node; 1462 level++; 1463 } 1464 if (level < min_level || 1465 (level == min_level && min_level_lwp && 1466 lp->lwp_priority < min_level_lwp->lwp_priority)) { 1467 bsd4_kick_helper(min_level_lwp); 1468 min_level_lwp = lp; 1469 min_level = level; 1470 min_q = q; 1471 min_which = which; 1472 min_pri = pri; 1473 } else { 1474 bsd4_kick_helper(lp); 1475 } 1476 lp = TAILQ_NEXT(lp, lwp_procq); 1477 if (lp == NULL) { 1478 *which2 &= ~(1 << pri); 1479 goto again; 1480 } 1481 } else { 1482 KTR_COND_LOG(usched_chooseproc_cc_elected, 1483 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1484 lp->lwp_proc->p_pid, 1485 (unsigned long)lp->lwp_thread->td_gd->gd_cpumask, 1486 (unsigned long)siblings, 1487 (unsigned long)cpumask); 1488 1489 goto found; 1490 } 1491 ++checks; 1492 } 1493 1494 /* 1495 * Checks exhausted, we tried to defer too many threads, so schedule 1496 * the best of the worst. 
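 *
 * "Best of the worst" here means the deferred thread that is
 * topologically closest to this cpu.  The distance is computed by
 * walking the candidate's cpu_node chain upward until the current cpu
 * is covered, e.g. (a sketch, assuming a thread/core/chip topology):
 *
 *	level 0 - same node (already shares the lowest-level cache)
 *	level 1 - sibling hyperthread on the same core
 *	level 2 - another core on the same chip
 *	level 3 - another chip entirely
 *
 * The loop above keeps the candidate with the smallest level (ties go to
 * the better, i.e. lower, lwp_priority) and kicks the helper thread of
 * every candidate it passes over.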
1497 */ 1498 lp = min_level_lwp; 1499 q = min_q; 1500 which = min_which; 1501 pri = min_pri; 1502 KASSERT(lp, ("chooseproc: at least the first lp was good")); 1503 1504 found: 1505 1506 /* 1507 * If the passed lwp <chklp> is reasonably close to the selected 1508 * lwp <lp>, return NULL (indicating that <chklp> should be kept). 1509 * 1510 * Note that we must error on the side of <chklp> to avoid bouncing 1511 * between threads in the acquire code. 1512 */ 1513 if (chklp) { 1514 if (chklp->lwp_priority < lp->lwp_priority + PPQ) { 1515 bsd4_kick_helper(lp); 1516 return(NULL); 1517 } 1518 } 1519 1520 KTR_COND_LOG(usched_chooseproc_cc, 1521 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1522 lp->lwp_proc->p_pid, 1523 lp->lwp_thread->td_gd->gd_cpuid, 1524 mycpu->gd_cpuid); 1525 1526 TAILQ_REMOVE(q, lp, lwp_procq); 1527 --bsd4_runqcount; 1528 if (TAILQ_EMPTY(q)) 1529 *which &= ~(1 << pri); 1530 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!")); 1531 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1532 1533 return lp; 1534 } 1535 1536 /* 1537 * If we aren't willing to schedule a ready process on our cpu, give it's 1538 * target cpu a kick rather than wait for the next tick. 1539 * 1540 * Called with bsd4_spin held. 1541 */ 1542 static 1543 void 1544 bsd4_kick_helper(struct lwp *lp) 1545 { 1546 globaldata_t gd; 1547 bsd4_pcpu_t dd; 1548 1549 if (lp == NULL) 1550 return; 1551 gd = lp->lwp_thread->td_gd; 1552 dd = &bsd4_pcpu[gd->gd_cpuid]; 1553 if ((smp_active_mask & usched_global_cpumask & 1554 bsd4_rdyprocmask & gd->gd_cpumask) == 0) { 1555 return; 1556 } 1557 ++usched_bsd4_kicks; 1558 atomic_clear_cpumask(&bsd4_rdyprocmask, gd->gd_cpumask); 1559 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 1560 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL); 1561 } else { 1562 wakeup(&dd->helper_thread); 1563 } 1564 } 1565 1566 static 1567 void 1568 bsd4_need_user_resched_remote(void *dummy) 1569 { 1570 globaldata_t gd = mycpu; 1571 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 1572 1573 need_user_resched(); 1574 1575 /* Call wakeup_mycpu to avoid sending IPIs to other CPUs */ 1576 wakeup_mycpu(&dd->helper_thread); 1577 } 1578 1579 /* 1580 * bsd4_remrunqueue_locked() removes a given process from the run queue 1581 * that it is on, clearing the queue busy bit if it becomes empty. 1582 * 1583 * Note that user process scheduler is different from the LWKT schedule. 1584 * The user process scheduler only manages user processes but it uses LWKT 1585 * underneath, and a user process operating in the kernel will often be 1586 * 'released' from our management. 
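 *
 * The bookkeeping below is the exact inverse of bsd4_setrunqueue_locked():
 *
 *	remrunqueue_locked:  TAILQ_REMOVE(); --bsd4_runqcount;
 *			     if (TAILQ_EMPTY(q)) *which &= ~(1 << pri);
 *	setrunqueue_locked:  TAILQ_INSERT_TAIL(); ++bsd4_runqcount;
 *			     *which |= 1 << pri;
 *
 * Keeping each queue bit in sync with queue emptiness is what lets the
 * chooseproc routines test a single 32 bit word instead of scanning all
 * 32 queues.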
1587 * 1588 * MPSAFE - bsd4_spin must be held exclusively on call 1589 */ 1590 static void 1591 bsd4_remrunqueue_locked(struct lwp *lp) 1592 { 1593 struct rq *q; 1594 u_int32_t *which; 1595 u_int8_t pri; 1596 1597 KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ); 1598 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1599 --bsd4_runqcount; 1600 KKASSERT(bsd4_runqcount >= 0); 1601 1602 pri = lp->lwp_rqindex; 1603 switch(lp->lwp_rqtype) { 1604 case RTP_PRIO_NORMAL: 1605 q = &bsd4_queues[pri]; 1606 which = &bsd4_queuebits; 1607 break; 1608 case RTP_PRIO_REALTIME: 1609 case RTP_PRIO_FIFO: 1610 q = &bsd4_rtqueues[pri]; 1611 which = &bsd4_rtqueuebits; 1612 break; 1613 case RTP_PRIO_IDLE: 1614 q = &bsd4_idqueues[pri]; 1615 which = &bsd4_idqueuebits; 1616 break; 1617 default: 1618 panic("remrunqueue: invalid rtprio type"); 1619 /* NOT REACHED */ 1620 } 1621 TAILQ_REMOVE(q, lp, lwp_procq); 1622 if (TAILQ_EMPTY(q)) { 1623 KASSERT((*which & (1 << pri)) != 0, 1624 ("remrunqueue: remove from empty queue")); 1625 *which &= ~(1 << pri); 1626 } 1627 } 1628 1629 /* 1630 * bsd4_setrunqueue_locked() 1631 * 1632 * Add a process whos rqtype and rqindex had previously been calculated 1633 * onto the appropriate run queue. Determine if the addition requires 1634 * a reschedule on a cpu and return the cpuid or -1. 1635 * 1636 * NOTE: Lower priorities are better priorities. 1637 * 1638 * MPSAFE - bsd4_spin must be held exclusively on call 1639 */ 1640 static void 1641 bsd4_setrunqueue_locked(struct lwp *lp) 1642 { 1643 struct rq *q; 1644 u_int32_t *which; 1645 int pri; 1646 1647 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 1648 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ); 1649 ++bsd4_runqcount; 1650 1651 pri = lp->lwp_rqindex; 1652 1653 switch(lp->lwp_rqtype) { 1654 case RTP_PRIO_NORMAL: 1655 q = &bsd4_queues[pri]; 1656 which = &bsd4_queuebits; 1657 break; 1658 case RTP_PRIO_REALTIME: 1659 case RTP_PRIO_FIFO: 1660 q = &bsd4_rtqueues[pri]; 1661 which = &bsd4_rtqueuebits; 1662 break; 1663 case RTP_PRIO_IDLE: 1664 q = &bsd4_idqueues[pri]; 1665 which = &bsd4_idqueuebits; 1666 break; 1667 default: 1668 panic("remrunqueue: invalid rtprio type"); 1669 /* NOT REACHED */ 1670 } 1671 1672 /* 1673 * Add to the correct queue and set the appropriate bit. If no 1674 * lower priority (i.e. better) processes are in the queue then 1675 * we want a reschedule, calculate the best cpu for the job. 1676 * 1677 * Always run reschedules on the LWPs original cpu. 1678 */ 1679 TAILQ_INSERT_TAIL(q, lp, lwp_procq); 1680 *which |= 1 << pri; 1681 } 1682 1683 /* 1684 * For SMP systems a user scheduler helper thread is created for each 1685 * cpu and is used to allow one cpu to wakeup another for the purposes of 1686 * scheduling userland threads from setrunqueue(). 1687 * 1688 * UP systems do not need the helper since there is only one cpu. 1689 * 1690 * We can't use the idle thread for this because we might block. 1691 * Additionally, doing things this way allows us to HLT idle cpus 1692 * on MP systems. 1693 * 1694 * MPSAFE 1695 */ 1696 static void 1697 sched_thread(void *dummy) 1698 { 1699 globaldata_t gd; 1700 bsd4_pcpu_t dd; 1701 bsd4_pcpu_t tmpdd; 1702 struct lwp *nlp; 1703 cpumask_t mask; 1704 int cpuid; 1705 cpumask_t tmpmask; 1706 int tmpid; 1707 1708 gd = mycpu; 1709 cpuid = gd->gd_cpuid; /* doesn't change */ 1710 mask = gd->gd_cpumask; /* doesn't change */ 1711 dd = &bsd4_pcpu[cpuid]; 1712 1713 /* 1714 * Since we are woken up only when no user processes are scheduled 1715 * on a cpu, we can run at an ultra low priority. 
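 *
 * The loop below relies on the tsleep interlock pattern to close the
 * race between "advertise ourselves in bsd4_rdyprocmask" and "go to
 * sleep".  In outline:
 *
 *	crit_enter_gd(gd);
 *	tsleep_interlock(&dd->helper_thread, 0);   // arm the wakeup first
 *	spin_lock(&bsd4_spin);
 *	atomic_set_cpumask(&bsd4_rdyprocmask, mask);
 *	...	// look for work; may schedule an lwp and drop the spinlock
 *	crit_exit_gd(gd);
 *	tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0);
 *
 * A wakeup(&dd->helper_thread) issued by another cpu between the
 * interlock and the final tsleep() is not lost; PINTERLOCKED makes that
 * tsleep() return immediately in that case.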
1716 */ 1717 lwkt_setpri_self(TDPRI_USER_SCHEDULER); 1718 1719 tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0); 1720 1721 for (;;) { 1722 /* 1723 * We use the LWKT deschedule-interlock trick to avoid racing 1724 * bsd4_rdyprocmask. This means we cannot block through to the 1725 * manual lwkt_switch() call we make below. 1726 */ 1727 crit_enter_gd(gd); 1728 tsleep_interlock(&dd->helper_thread, 0); 1729 spin_lock(&bsd4_spin); 1730 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1731 1732 clear_user_resched(); /* This satisfied the reschedule request */ 1733 dd->rrcount = 0; /* Reset the round-robin counter */ 1734 1735 if ((bsd4_curprocmask & mask) == 0) { 1736 /* 1737 * No thread is currently scheduled. 1738 */ 1739 KKASSERT(dd->uschedcp == NULL); 1740 if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) { 1741 KTR_COND_LOG(usched_sched_thread_no_process, 1742 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1743 gd->gd_cpuid, 1744 nlp->lwp_proc->p_pid, 1745 nlp->lwp_thread->td_gd->gd_cpuid); 1746 1747 atomic_set_cpumask(&bsd4_curprocmask, mask); 1748 dd->upri = nlp->lwp_priority; 1749 dd->uschedcp = nlp; 1750 dd->rrcount = 0; /* reset round robin */ 1751 spin_unlock(&bsd4_spin); 1752 lwkt_acquire(nlp->lwp_thread); 1753 lwkt_schedule(nlp->lwp_thread); 1754 } else { 1755 spin_unlock(&bsd4_spin); 1756 } 1757 } else if (bsd4_runqcount) { 1758 if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) { 1759 KTR_COND_LOG(usched_sched_thread_process, 1760 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1761 gd->gd_cpuid, 1762 nlp->lwp_proc->p_pid, 1763 nlp->lwp_thread->td_gd->gd_cpuid); 1764 1765 dd->upri = nlp->lwp_priority; 1766 dd->uschedcp = nlp; 1767 dd->rrcount = 0; /* reset round robin */ 1768 spin_unlock(&bsd4_spin); 1769 lwkt_acquire(nlp->lwp_thread); 1770 lwkt_schedule(nlp->lwp_thread); 1771 } else { 1772 /* 1773 * CHAINING CONDITION TRAIN 1774 * 1775 * We could not deal with the scheduler wakeup 1776 * request on this cpu, locate a ready scheduler 1777 * with no current lp assignment and chain to it. 1778 * 1779 * This ensures that a wakeup race which fails due 1780 * to priority test does not leave other unscheduled 1781 * cpus idle when the runqueue is not empty. 1782 */ 1783 tmpmask = ~bsd4_curprocmask & 1784 bsd4_rdyprocmask & smp_active_mask; 1785 if (tmpmask) { 1786 tmpid = BSFCPUMASK(tmpmask); 1787 tmpdd = &bsd4_pcpu[tmpid]; 1788 atomic_clear_cpumask(&bsd4_rdyprocmask, 1789 CPUMASK(tmpid)); 1790 spin_unlock(&bsd4_spin); 1791 wakeup(&tmpdd->helper_thread); 1792 } else { 1793 spin_unlock(&bsd4_spin); 1794 } 1795 1796 KTR_LOG(usched_sched_thread_no_process_found, 1797 gd->gd_cpuid, (unsigned long)tmpmask); 1798 } 1799 } else { 1800 /* 1801 * The runq is empty. 1802 */ 1803 spin_unlock(&bsd4_spin); 1804 } 1805 1806 /* 1807 * We're descheduled unless someone scheduled us. Switch away. 1808 * Exiting the critical section will cause splz() to be called 1809 * for us if interrupts and such are pending. 
1810 */ 1811 crit_exit_gd(gd); 1812 tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0); 1813 } 1814 } 1815 1816 /* sysctl stick_to_level parameter */ 1817 static int 1818 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS) 1819 { 1820 int error, new_val; 1821 1822 new_val = usched_bsd4_stick_to_level; 1823 1824 error = sysctl_handle_int(oidp, &new_val, 0, req); 1825 if (error != 0 || req->newptr == NULL) 1826 return (error); 1827 if (new_val > cpu_topology_levels_number - 1 || new_val < 0) 1828 return (EINVAL); 1829 usched_bsd4_stick_to_level = new_val; 1830 return (0); 1831 } 1832 1833 /* 1834 * Setup our scheduler helpers. Note that curprocmask bit 0 has already 1835 * been cleared by rqinit() and we should not mess with it further. 1836 */ 1837 static void 1838 sched_thread_cpu_init(void) 1839 { 1840 int i; 1841 int cpuid; 1842 int smt_not_supported = 0; 1843 int cache_coherent_not_supported = 0; 1844 1845 if (bootverbose) 1846 kprintf("Start scheduler helpers on cpus:\n"); 1847 1848 sysctl_ctx_init(&usched_bsd4_sysctl_ctx); 1849 usched_bsd4_sysctl_tree = 1850 SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx, 1851 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO, 1852 "usched_bsd4", CTLFLAG_RD, 0, ""); 1853 1854 for (i = 0; i < ncpus; ++i) { 1855 bsd4_pcpu_t dd = &bsd4_pcpu[i]; 1856 cpumask_t mask = CPUMASK(i); 1857 1858 if ((mask & smp_active_mask) == 0) 1859 continue; 1860 1861 dd->cpunode = get_cpu_node_by_cpuid(i); 1862 1863 if (dd->cpunode == NULL) { 1864 smt_not_supported = 1; 1865 cache_coherent_not_supported = 1; 1866 if (bootverbose) 1867 kprintf ("\tcpu%d - WARNING: No CPU NODE " 1868 "found for cpu\n", i); 1869 } else { 1870 switch (dd->cpunode->type) { 1871 case THREAD_LEVEL: 1872 if (bootverbose) 1873 kprintf ("\tcpu%d - HyperThreading " 1874 "available. Core siblings: ", 1875 i); 1876 break; 1877 case CORE_LEVEL: 1878 smt_not_supported = 1; 1879 1880 if (bootverbose) 1881 kprintf ("\tcpu%d - No HT available, " 1882 "multi-core/physical " 1883 "cpu. Physical siblings: ", 1884 i); 1885 break; 1886 case CHIP_LEVEL: 1887 smt_not_supported = 1; 1888 1889 if (bootverbose) 1890 kprintf ("\tcpu%d - No HT available, " 1891 "single-core/physical cpu. " 1892 "Package Siblings: ", 1893 i); 1894 break; 1895 default: 1896 /* Let's go for safe defaults here */ 1897 smt_not_supported = 1; 1898 cache_coherent_not_supported = 1; 1899 if (bootverbose) 1900 kprintf ("\tcpu%d - Unknown cpunode->" 1901 "type=%u. Siblings: ", 1902 i, 1903 (u_int)dd->cpunode->type); 1904 break; 1905 } 1906 1907 if (bootverbose) { 1908 if (dd->cpunode->parent_node != NULL) { 1909 CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members) 1910 kprintf("cpu%d ", cpuid); 1911 kprintf("\n"); 1912 } else { 1913 kprintf(" no siblings\n"); 1914 } 1915 } 1916 } 1917 1918 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, 1919 0, i, "usched %d", i); 1920 1921 /* 1922 * Allow user scheduling on the target cpu. cpu #0 has already 1923 * been enabled in rqinit(). 
1924 */ 1925 if (i) 1926 atomic_clear_cpumask(&bsd4_curprocmask, mask); 1927 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1928 dd->upri = PRIBASE_NULL; 1929 1930 } 1931 1932 /* usched_bsd4 sysctl configurable parameters */ 1933 1934 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1935 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1936 OID_AUTO, "rrinterval", CTLFLAG_RW, 1937 &usched_bsd4_rrinterval, 0, ""); 1938 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1939 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1940 OID_AUTO, "decay", CTLFLAG_RW, 1941 &usched_bsd4_decay, 0, "Extra decay when not running"); 1942 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1943 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1944 OID_AUTO, "batch_time", CTLFLAG_RW, 1945 &usched_bsd4_batch_time, 0, "Min batch counter value"); 1946 SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx, 1947 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1948 OID_AUTO, "kicks", CTLFLAG_RW, 1949 &usched_bsd4_kicks, "Number of kickstarts"); 1950 1951 /* Add enable/disable option for SMT scheduling if supported */ 1952 if (smt_not_supported) { 1953 usched_bsd4_smt = 0; 1954 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, 1955 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1956 OID_AUTO, "smt", CTLFLAG_RD, 1957 "NOT SUPPORTED", 0, "SMT NOT SUPPORTED"); 1958 } else { 1959 usched_bsd4_smt = 1; 1960 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1961 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1962 OID_AUTO, "smt", CTLFLAG_RW, 1963 &usched_bsd4_smt, 0, "Enable SMT scheduling"); 1964 } 1965 1966 /* 1967 * Add enable/disable option for cache coherent scheduling 1968 * if supported 1969 */ 1970 if (cache_coherent_not_supported) { 1971 usched_bsd4_cache_coherent = 0; 1972 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, 1973 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1974 OID_AUTO, "cache_coherent", CTLFLAG_RD, 1975 "NOT SUPPORTED", 0, 1976 "Cache coherence NOT SUPPORTED"); 1977 } else { 1978 usched_bsd4_cache_coherent = 1; 1979 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1980 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1981 OID_AUTO, "cache_coherent", CTLFLAG_RW, 1982 &usched_bsd4_cache_coherent, 0, 1983 "Enable/Disable cache coherent scheduling"); 1984 1985 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1986 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1987 OID_AUTO, "upri_affinity", CTLFLAG_RW, 1988 &usched_bsd4_upri_affinity, 1, 1989 "Number of PPQs in user priority check"); 1990 1991 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1992 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1993 OID_AUTO, "queue_checks", CTLFLAG_RW, 1994 &usched_bsd4_queue_checks, 5, 1995 "LWPs to check from a queue before giving up"); 1996 1997 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx, 1998 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1999 OID_AUTO, "stick_to_level", 2000 CTLTYPE_INT | CTLFLAG_RW, 2001 NULL, sizeof usched_bsd4_stick_to_level, 2002 sysctl_usched_bsd4_stick_to_level, "I", 2003 "Stick a process to this level. See sysctl" 2004 "paremter hw.cpu_topology.level_description"); 2005 } 2006 } 2007 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, 2008 sched_thread_cpu_init, NULL) 2009