/*
 * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>,
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * and many others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)
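/*
 * For example: with MAXPRI 128 and NQS 32, PPQ is 4, so the queue index
 * for a priority is (priority & PRIMASK) / PPQ.  A normal-class
 * lwp_priority of PRIBASE_NORMAL + 57 masks to 57 and lands in queue 14.
 * One queue step likewise corresponds to ESTCPUPPQ (512) estcpu units or
 * NICEPPQ (2) nice units, which is how bsd4_resetpriority() below
 * converts both scales into queue-sized increments.
 */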
TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_uload_update(struct lwp *lp);
static void bsd4_yield(struct lwp *lp);
static void bsd4_need_user_resched_remote(void *dummy);
static int bsd4_batchy_looser_pri_test(struct lwp *lp);
static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp);
static void bsd4_kick_helper(struct lwp *lp);
static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);
static void bsd4_changedcpu(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	bsd4_uload_update,
	NULL,			/* setcpumask not supported */
	bsd4_yield,
	bsd4_changedcpu
};

struct usched_bsd4_pcpu {
	struct thread	*helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
	struct lwp	*old_uschedcp;
	cpu_node_t	*cpunode;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
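/*
 * For example, if only queues 3 and 9 of the normal class are occupied,
 * bsd4_queuebits reads 0x208; bsfl(0x208) returns 3, so the chooser
 * dequeues from bsd4_queues[3], the best busy queue, and clears bit 3
 * again once that queue drains.
 */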
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
/* currently running a user process */
static cpumask_t bsd4_curprocmask = CPUMASK_INITIALIZER_ALLONES;
/* ready to accept a user process */
static cpumask_t bsd4_rdyprocmask;
static int bsd4_runqcount;
static volatile int bsd4_scancpu;
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
static struct sysctl_oid *usched_bsd4_sysctl_tree;

/* Debug info exposed through debug.* sysctl */

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD,
	   &bsd4_runqcount, 0,
	   "Number of run queues");

static int usched_bsd4_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW,
	   &usched_bsd4_debug, 0,
	   "Print debug information for this pid");

static int usched_bsd4_pid_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW,
	   &usched_bsd4_pid_debug, 0,
	   "Print KTR debug information for this pid");

/* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
static int usched_bsd4_smt = 0;
static int usched_bsd4_cache_coherent = 0;
static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
static int usched_bsd4_queue_checks = 5;
static int usched_bsd4_stick_to_level = 0;
static long usched_bsd4_kicks;
static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
static int usched_bsd4_decay = 8;
static int usched_bsd4_batch_time = 10;

/* KTR debug printouts */

KTR_INFO_MASTER_EXTERN(usched);

#if !defined(KTR_USCHED_BSD4)
#define	KTR_USCHED_BSD4	KTR_ALL
#endif

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
    "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted "
    "after release: pid %d, cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
    "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
    "curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
    "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
    "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
    "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
    "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
    "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
    "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
    "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
    "cpuid %d, verify_mask %lu)",
    pid_t pid, int cpuid, unsigned long mask);
KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
"USCHED_BSD4(batchy_looser_pri_test true: pid %d, " 239 "cpuid %d, verify_mask %lu)", 240 pid_t pid, int cpuid, unsigned long mask); 241 242 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, 243 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " 244 "mask %lu, curr_cpuid %d)", 245 pid_t pid, int cpuid, unsigned long mask, int curr); 246 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, 247 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " 248 "cpuid %d, mask %lu, curr_cpuid %d)", 249 pid_t pid, int cpuid, unsigned long mask, int curr); 250 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, 251 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " 252 "cpuid %d, mask %lu, curr_cpuid %d)", 253 pid_t pid, int cpuid, unsigned long mask, int curr); 254 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, 255 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 256 "mask %lu, found_cpuid %d, curr_cpuid %d)", 257 pid_t pid, int cpuid, unsigned long mask, int found_cpuid, int curr); 258 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, 259 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " 260 "try_cpuid %d, curr_cpuid %d)", 261 pid_t pid, int cpuid, int try_cpuid, int curr); 262 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, 263 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 264 "mask %lu, found_cpuid %d, curr_cpuid %d)", 265 pid_t pid, int cpuid, unsigned long mask, int found_cpuid, int curr); 266 267 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_chooseproc, 0, 268 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", 269 pid_t pid, int old_cpuid, int curr); 270 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, 271 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", 272 pid_t pid, int old_cpuid, int curr); 273 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, 274 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " 275 "sibling_mask %lu, curr_cpumask %lu)", 276 pid_t pid, unsigned long old_cpumask, unsigned long sibling_mask, unsigned long curr); 277 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, 278 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " 279 "sibling_mask %lu, curr_cpumask: %lu)", 280 pid_t pid, unsigned long old_cpumask, unsigned long sibling_mask, unsigned long curr); 281 282 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, 283 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", 284 int id, pid_t pid, int cpuid); 285 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, 286 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", 287 int id, pid_t pid, int cpuid); 288 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, 289 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", 290 int id, unsigned long tmpmask); 291 292 /* 293 * Initialize the run queues at boot time. 294 */ 295 static void 296 bsd4_rqinit(void *dummy) 297 { 298 int i; 299 300 spin_init(&bsd4_spin, "bsd4rq"); 301 for (i = 0; i < NQS; i++) { 302 TAILQ_INIT(&bsd4_queues[i]); 303 TAILQ_INIT(&bsd4_rtqueues[i]); 304 TAILQ_INIT(&bsd4_idqueues[i]); 305 } 306 ATOMIC_CPUMASK_NANDBIT(bsd4_curprocmask, 0); 307 } 308 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL); 309 310 /* 311 * BSD4_ACQUIRE_CURPROC 312 * 313 * This function is called when the kernel intends to return to userland. 
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	thread_t td;
#if 0
	struct lwp *olp;
#endif

	/*
	 * Make sure we aren't sitting on a tsleep queue.
	 */
	td = lp->lwp_thread;
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);

		KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    mycpu->gd_cpuid);
	}

	/*
	 * Loop until we are the current user thread
	 */
	gd = mycpu;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    gd->gd_cpuid);

	do {
		/*
		 * Process any pending events and higher priority threads.
		 */
		lwkt_yield();

		/* This lwp is an outcast; force reschedule. */
		if (__predict_false(
		    CPUMASK_TESTBIT(lp->lwp_cpumask, gd->gd_cpuid) == 0)) {
			bsd4_release_curproc(lp);
			goto resched;
		}

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current thread designation
		 * on this cpu since if we are running that other thread
		 * must not be, so we can safely deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			ATOMIC_CPUMASK_ORBIT(bsd4_curprocmask, gd->gd_cpuid);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current cpu's lwp designation
			 * away simply by replacing it.  The other thread
			 * will stall when it tries to return to userland.
			 */
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			/*
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
			*/
		} else {
resched:
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 *
			 * When we are reactivated we will have another
			 * chance.
			 */
			lwkt_deschedule(lp->lwp_thread);

			bsd4_setrunqueue(lp);

			KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    dd->uschedcp->lwp_proc->p_pid,
			    gd->gd_cpuid);

			lwkt_switch();

			/*
			 * Reload after a switch or setrunqueue/switch possibly
			 * moved us to another cpu.
			 */
			gd = mycpu;
			dd = &bsd4_pcpu[gd->gd_cpuid];

			KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    gd->gd_cpuid);
		}
	} while (dd->uschedcp != lp);

	crit_exit_quick(td);
	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}

/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp == lp) {
		crit_enter();
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

		KTR_COND_LOG(usched_bsd4_release_curproc,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    gd->gd_cpuid);

		dd->uschedcp = NULL;	/* don't let lp be selected */
		dd->upri = PRIBASE_NULL;
		ATOMIC_CPUMASK_NANDBIT(bsd4_curprocmask, gd->gd_cpuid);
		dd->old_uschedcp = lp;	/* used only for KTR debug prints */
		bsd4_select_curproc(gd);
		crit_exit();
	}
}
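/*
 * Note on the priority sense used throughout: lower lwp_priority values
 * are better.  Realtime lwps occupy 0-127, normal lwps PRIBASE_NORMAL
 * (128) through 255, idle lwps 256 and up, and PRIBASE_NULL (512) is
 * worse than any schedulable lwp.  Setting dd->upri to PRIBASE_NULL
 * above therefore guarantees that any incoming lwp passes the
 * dd->upri > lp->lwp_priority acquisition test in bsd4_acquire_curproc().
 */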
/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
	if (usched_bsd4_cache_coherent)
		nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp);
	else
		nlp = bsd4_chooseproc_locked(dd->uschedcp);

	if (nlp) {
		KTR_COND_LOG(usched_bsd4_select_curproc,
		    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    nlp->lwp_proc->p_pid,
		    nlp->lwp_thread->td_gd->gd_cpuid,
		    dd->old_uschedcp->lwp_proc->p_pid,
		    dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
		    gd->gd_cpuid);

		ATOMIC_CPUMASK_ORBIT(bsd4_curprocmask, cpuid);
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		dd->rrcount = 0;		/* reset round robin */
		spin_unlock(&bsd4_spin);
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}

#if 0
	} else if (bsd4_runqcount && CPUMASK_TESTBIT(bsd4_rdyprocmask, cpuid)) {
		ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, cpuid);
		spin_unlock(&bsd4_spin);
		lwkt_schedule(dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#endif
	crit_exit_gd(gd);
}

/*
 * batchy_looser_pri_test() - determine if a process is batchy or not
 * relative to the other processes running in the system
 */
static int
bsd4_batchy_looser_pri_test(struct lwp *lp)
{
	cpumask_t mask;
	bsd4_pcpu_t other_dd;
	int cpu;

	/* Current running processes */
	mask = bsd4_curprocmask;
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	while (CPUMASK_TESTNZERO(mask)) {
		cpu = BSFCPUMASK(mask);
		other_dd = &bsd4_pcpu[cpu];
		if (other_dd->upri - lp->lwp_priority >
		    usched_bsd4_upri_affinity * PPQ) {

			KTR_COND_LOG(usched_batchy_test_false,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask));

			return 0;
		}
		CPUMASK_NANDBIT(mask, cpu);
	}

	KTR_COND_LOG(usched_batchy_test_true,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)CPUMASK_LOWMASK(mask));

	return 1;
}
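/*
 * With the default usched_bsd4_upri_affinity of 16, the test above
 * tolerates a gap of 16 * PPQ == 64 priority points, half the 128-point
 * range of a class: lp is reported batchy (1) only when it is not more
 * than 64 points better than every currently running lwp; if it beats
 * some running lwp by more than that it is considered interactive (0).
 */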
/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);

	/*
	 * We lose control of lp the moment we release the spinlock after
	 * having placed lp on the queue.  i.e. another cpu could pick it
	 * up and it could exit, or its priority could be further adjusted,
	 * or something like that.
	 */
	spin_lock(&bsd4_spin);
	bsd4_setrunqueue_locked(lp);
	lp->lwp_rebal_ticks = sched_ticks;

	/*
	 * Kick the scheduler helper on one of the other cpu's
	 * and request a reschedule if appropriate.
	 *
	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
	 *	 look for cpus without designated lps, then we look for
	 *	 cpus with designated lps with a worse priority than our
	 *	 process.
	 */
	++bsd4_scancpu;

	if (usched_bsd4_smt) {
		/*
		 * SMT heuristic - Try to schedule on a free physical core.
		 * If no free physical core is found, choose the one whose
		 * sibling runs an interactive thread.
		 */
		int best_cpuid = -1;
		int min_prio = MAXPRI * MAXPRI;
		int sibling;

		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = bsd4_rdyprocmask;
		CPUMASK_NANDMASK(mask, bsd4_curprocmask);
		CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
		CPUMASK_ANDMASK(mask, smp_active_mask);
		CPUMASK_ANDMASK(mask, usched_global_cpumask);

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)CPUMASK_LOWMASK(mask),
		    mycpu->gd_cpuid);

		while (CPUMASK_TESTNZERO(mask)) {
			CPUMASK_ASSNBMASK(tmpmask, cpuid);
			if (CPUMASK_TESTMASK(tmpmask, mask)) {
				CPUMASK_ANDMASK(tmpmask, mask);
				cpuid = BSFCPUMASK(tmpmask);
			} else {
				cpuid = BSFCPUMASK(mask);
			}
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {
				tmpmask = dd->cpunode->parent_node->members;
				CPUMASK_NANDMASK(tmpmask,
				    dd->cpunode->members);
				CPUMASK_ANDMASK(tmpmask, mask);
				if (CPUMASK_TESTNZERO(tmpmask)) {
					KTR_COND_LOG(usched_bsd4_setrunqueue_found,
					    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
					    lp->lwp_proc->p_pid,
					    lp->lwp_thread->td_gd->gd_cpuid,
					    (unsigned long)CPUMASK_LOWMASK(mask),
					    cpuid,
					    mycpu->gd_cpuid);

					goto found;
				} else {
					tmpmask =
					    dd->cpunode->parent_node->members;
					CPUMASK_NANDMASK(tmpmask,
					    dd->cpunode->members);
					sibling = BSFCPUMASK(tmpmask);
					if (min_prio >
					    bsd4_pcpu[sibling].upri) {
						min_prio =
						    bsd4_pcpu[sibling].upri;
						best_cpuid = cpuid;
					}
				}
			}
			CPUMASK_NANDBIT(mask, cpuid);
		}

		if (best_cpuid != -1) {
			cpuid = best_cpuid;
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask),
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
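		/*
		 * Getting here means no ready cpu passed the priority
		 * test at all (best_cpuid is only recorded for cpus
		 * that did), so fall through to the scan of cpus that
		 * are already running a user lwp below.
		 */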
	} else {
		/* Fallback to the original heuristic */
		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = bsd4_rdyprocmask;
		CPUMASK_NANDMASK(mask, bsd4_curprocmask);
		CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
		CPUMASK_ANDMASK(mask, smp_active_mask);
		CPUMASK_ANDMASK(mask, usched_global_cpumask);

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)CPUMASK_LOWMASK(mask),
		    mycpu->gd_cpuid);

		while (CPUMASK_TESTNZERO(mask)) {
			CPUMASK_ASSNBMASK(tmpmask, cpuid);
			if (CPUMASK_TESTMASK(tmpmask, mask)) {
				CPUMASK_ANDMASK(tmpmask, mask);
				cpuid = BSFCPUMASK(tmpmask);
			} else {
				cpuid = BSFCPUMASK(mask);
			}
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {
				KTR_COND_LOG(usched_bsd4_setrunqueue_found,
				    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    lp->lwp_proc->p_pid,
				    lp->lwp_thread->td_gd->gd_cpuid,
				    (unsigned long)CPUMASK_LOWMASK(mask),
				    cpuid,
				    mycpu->gd_cpuid);

				goto found;
			}
			CPUMASK_NANDBIT(mask, cpuid);
		}
	}

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = bsd4_curprocmask;
	CPUMASK_ANDMASK(mask, bsd4_rdyprocmask);
	CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)CPUMASK_LOWMASK(mask),
	    mycpu->gd_cpuid);

	while (CPUMASK_TESTNZERO(mask)) {
		CPUMASK_ASSNBMASK(tmpmask, cpuid);
		if (CPUMASK_TESTMASK(tmpmask, mask)) {
			CPUMASK_ANDMASK(tmpmask, mask);
			cpuid = BSFCPUMASK(tmpmask);
		} else {
			cpuid = BSFCPUMASK(mask);
		}
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			KTR_COND_LOG(usched_bsd4_setrunqueue_found,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask),
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
		CPUMASK_NANDBIT(mask, cpuid);
	}
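	/*
	 * The ~PPQMASK masking above compares priorities at PPQ (4-point)
	 * queue granularity.  E.g. a cpu running a batch lwp at upri 220
	 * loses to an incoming lwp of priority 130 (220 & ~3 > 130 & ~3),
	 * while two lwps within the same 4-point queue compare equal and
	 * do not trigger a kick.
	 */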
	/*
	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
	 * and round-robin.  Other cpus will pickup as they release their
	 * current lwps or become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case, we do not
	 * set the user resched flag because
	 */
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	if (CPUMASK_TESTBIT(lp->lwp_cpumask, cpuid) == 0)
		cpuid = BSFCPUMASK(lp->lwp_cpumask);
	else if (CPUMASK_TESTBIT(usched_global_cpumask, cpuid) == 0)
		cpuid = 0;
	gd = globaldata_find(cpuid);
	dd = &bsd4_pcpu[cpuid];

	KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    cpuid,
	    mycpu->gd_cpuid);

found:
	if (gd == mycpu) {
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			if (dd->uschedcp == NULL) {
				wakeup_mycpu(dd->helper_thread);
			} else {
				need_user_resched();
			}
		}
	} else {
		ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, cpuid);
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
		else
			wakeup(dd->helper_thread);
	}
	crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * This routine is called on every sched tick.  If the currently running
 * thread belongs to this scheduler it will be called with a non-NULL lp,
 * otherwise it will be called with a NULL lp.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * No impl if no lp running.
	 */
	if (lp == NULL)
		return;

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active.
	 */
	KKASSERT(gd->gd_spinlocks == 0);

	bsd4_resetpriority(lp);
}
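/*
 * The per-tick bump in bsd4_schedulerclock() adds ESTCPUMAX / ESTCPUFREQ
 * + 1 estcpu units per scheduler tick, so a thread that monopolizes the
 * cpu climbs from 0 to the ESTCPUMAX ceiling (16384) in roughly one
 * second's worth of ticks, while bsd4_recalculate_estcpu() below decays
 * mostly idle threads back down.
 */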
/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if ((ssysclock_t)ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}

		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
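		/*
		 * The blend below is an exponential moving average: with
		 * the default decay factor of 8, new = (old * 8 +
		 * instantaneous) / 9.  An lwp at estcpu 0 that suddenly
		 * runs flat out (instantaneous 16384) moves to ~1820
		 * after one interval and crosses half scale (8192) after
		 * roughly six intervals.
		 */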
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));

		if (usched_bsd4_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * of the run queue, and also may be called normally with the process on or
 * off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			      (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384  * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}
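	/*
	 * Worked RTP_PRIO_NORMAL example: a nice 0 lwp with lwp_batch at
	 * BATCHMAX / 2 and lwp_estcpu 8192 detunes to 8192 * 3 / 4 ==
	 * 6144, giving 40 (nice term) + 48 (estcpu term) == 88 raw
	 * points; scaled by MAXPRI / 210 that becomes 53, so lwp_priority
	 * ends up at PRIBASE_NORMAL + 53 == 181 and the queue index
	 * below computes to 53 / PPQ == 13.
	 */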
	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 *
	 * td_upri has normal sense (higher values are more desirable), so
	 * negate it.
	 */
	lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK);
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->pri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if (CPUMASK_TESTBIT(bsd4_rdyprocmask, reschedcpu) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask,
						       reschedcpu);
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       bsd4_need_user_resched_remote,
					       NULL);
			}
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

static
void
bsd4_changedcpu(struct lwp *lp __unused)
{
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

static void
bsd4_uload_update(struct lwp *lp)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.
 * If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
bsd4_chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	while (CPUMASK_TESTMASK(lp->lwp_cpumask, cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			lp = chklp;
		}
	}

	KTR_COND_LOG(usched_bsd4_chooseproc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}

/*
 * chooseproc() - with a cache coherence heuristic.  Try to pull a process
 * that has its home on the current CPU.  If the process doesn't have its
 * home here and is a batchy one (see bsd4_batchy_looser_pri_test), we can
 * wait for a sched_tick; maybe its home will become free and will pull it
 * in then.  In any case we can't wait more than one tick; once that tick
 * has expired, we pull in the process no matter what.
 */
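/*
 * The one-tick grace period is keyed off lwp_rebal_ticks, which
 * bsd4_setrunqueue() stamps with sched_ticks at enqueue time: a
 * non-local batchy lwp is deferred only while lwp_rebal_ticks is still
 * sched_ticks or sched_ticks - 1; once it is older than that the lwp
 * is taken regardless of its home cpu.
 */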
static
struct lwp *
bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t checks;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	struct lwp *min_level_lwp = NULL;
	struct rq *min_q = NULL;
	cpumask_t siblings;
	cpu_node_t *cpunode = NULL;
	u_int32_t min_level = MAXCPU;	/* number of levels < MAXCPU */
	u_int32_t *min_which = NULL;
	u_int32_t min_pri = 0;
	u_int32_t level = 0;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

	/* Get the mask corresponding to the sysctl configured level */
	cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
	level = usched_bsd4_stick_to_level;
	while (level) {
		cpunode = cpunode->parent_node;
		level--;
	}
	/* The cpus which can elect a process */
	siblings = cpunode->members;
	checks = 0;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		/*
		 * No more left and we didn't reach the checks limit.
		 */
		bsd4_kick_helper(min_level_lwp);
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * Limit the number of checks/queue to a configurable value to
	 * minimize the contention (we are in a locked region).
	 */
	while (checks < usched_bsd4_queue_checks) {
		if (CPUMASK_TESTMASK(lp->lwp_cpumask, cpumask) == 0 ||
		    (CPUMASK_TESTMASK(siblings,
				      lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
		     (lp->lwp_rebal_ticks == sched_ticks ||
		      lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) &&
		     bsd4_batchy_looser_pri_test(lp))) {

			KTR_COND_LOG(usched_chooseproc_cc_not_good,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)CPUMASK_LOWMASK(
					lp->lwp_thread->td_gd->gd_cpumask),
			    (unsigned long)CPUMASK_LOWMASK(siblings),
			    (unsigned long)CPUMASK_LOWMASK(cpumask));

			cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
			level = 0;
			while (cpunode) {
				if (CPUMASK_TESTMASK(cpunode->members,
						     cpumask)) {
					break;
				}
				cpunode = cpunode->parent_node;
				level++;
			}
			if (level < min_level ||
			    (level == min_level && min_level_lwp &&
			     lp->lwp_priority < min_level_lwp->lwp_priority)) {
				bsd4_kick_helper(min_level_lwp);
				min_level_lwp = lp;
				min_level = level;
				min_q = q;
				min_which = which;
				min_pri = pri;
			} else {
				bsd4_kick_helper(lp);
			}
			lp = TAILQ_NEXT(lp, lwp_procq);
			if (lp == NULL) {
				*which2 &= ~(1 << pri);
				goto again;
			}
		} else {
			KTR_COND_LOG(usched_chooseproc_cc_elected,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)CPUMASK_LOWMASK(
					lp->lwp_thread->td_gd->gd_cpumask),
			    (unsigned long)CPUMASK_LOWMASK(siblings),
			    (unsigned long)CPUMASK_LOWMASK(cpumask));

			goto found;
		}
		++checks;
	}

	/*
	 * Checks exhausted, we tried to defer too many threads, so schedule
	 * the best of the worst.
	 */
	lp = min_level_lwp;
	q = min_q;
	which = min_which;
	pri = min_pri;
	KASSERT(lp, ("chooseproc: at least the first lp was good"));

found:

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ) {
			bsd4_kick_helper(lp);
			return(NULL);
		}
	}

	KTR_COND_LOG(usched_chooseproc_cc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}

/*
 * If we aren't willing to schedule a ready process on our cpu, give its
 * target cpu a kick rather than wait for the next tick.
 *
 * Called with bsd4_spin held.
 */
static
void
bsd4_kick_helper(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	cpumask_t tmpmask;

	if (lp == NULL)
		return;
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, usched_global_cpumask);
	CPUMASK_ANDMASK(tmpmask, bsd4_rdyprocmask);
	CPUMASK_ANDMASK(tmpmask, gd->gd_cpumask);
	if (CPUMASK_TESTZERO(tmpmask))
		return;

	++usched_bsd4_kicks;
	ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, gd->gd_cpuid);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
	} else {
		wakeup(dd->helper_thread);
	}
}

static
void
bsd4_need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();

	/* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
	wakeup_mycpu(dd->helper_thread);
}

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT
 * scheduler.  The user process scheduler only manages user processes
 * but it uses LWKT underneath, and a user process operating in the
 * kernel will often be 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex had previously been calculated
 * onto the appropriate run queue.  Determine if the addition requires
 * a reschedule on a cpu and return the cpuid or -1.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule, calculate the best cpu for the job.
	 *
	 * Always run reschedules on the LWP's original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
	cpumask_t tmpmask;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;		/* doesn't change */
	mask = gd->gd_cpumask;		/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	tsleep(dd->helper_thread, 0, "sched_thread_sleep", 0);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		tsleep_interlock(dd->helper_thread, 0);
		spin_lock(&bsd4_spin);
		ATOMIC_CPUMASK_ORMASK(bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfies the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if (CPUMASK_TESTMASK(bsd4_curprocmask, mask) == 0) {
			/*
			 * No thread is currently scheduled.
			 */
			KKASSERT(dd->uschedcp == NULL);
			if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) {
				KTR_COND_LOG(usched_sched_thread_no_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				ATOMIC_CPUMASK_ORMASK(bsd4_curprocmask, mask);
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				dd->rrcount = 0;	/* reset round robin */
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				spin_unlock(&bsd4_spin);
			}
		} else if (bsd4_runqcount) {
			if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) {
				KTR_COND_LOG(usched_sched_thread_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				dd->rrcount = 0;	/* reset round robin */
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * CHAINING CONDITION TRAIN
				 *
				 * We could not deal with the scheduler wakeup
				 * request on this cpu, locate a ready scheduler
				 * with no current lp assignment and chain to it.
				 *
				 * This ensures that a wakeup race which fails due
				 * to priority test does not leave other unscheduled
				 * cpus idle when the runqueue is not empty.
				 */
				tmpmask = bsd4_rdyprocmask;
				CPUMASK_NANDMASK(tmpmask, bsd4_curprocmask);
				CPUMASK_ANDMASK(tmpmask, smp_active_mask);
				if (CPUMASK_TESTNZERO(tmpmask)) {
					tmpid = BSFCPUMASK(tmpmask);
					tmpdd = &bsd4_pcpu[tmpid];
					ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask,
							       tmpid);
					spin_unlock(&bsd4_spin);
					wakeup(tmpdd->helper_thread);
				} else {
					spin_unlock(&bsd4_spin);
				}

				KTR_LOG(usched_sched_thread_no_process_found,
				    gd->gd_cpuid,
				    (unsigned long)CPUMASK_LOWMASK(tmpmask));
			}
		} else {
			/*
			 * The runq is empty.
			 */
			spin_unlock(&bsd4_spin);
		}

		/*
		 * We're descheduled unless someone scheduled us.  Switch away.
		 * Exiting the critical section will cause splz() to be called
		 * for us if interrupts and such are pending.
/* sysctl stick_to_level parameter */
static int
sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
{
        int error, new_val;

        new_val = usched_bsd4_stick_to_level;

        error = sysctl_handle_int(oidp, &new_val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
                return (EINVAL);
        usched_bsd4_stick_to_level = new_val;
        return (0);
}
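/*
 * Illustrative sketch (not part of the scheduler): the CPUMASK_* and
 * ATOMIC_CPUMASK_* macros used below behave like bit operations over a
 * bitmap with one bit per cpu.  The standalone userland model under
 * #if 0 uses a single 64-bit word; the DEMO_* names are hypothetical
 * and a real cpumask_t may span multiple words.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_cpumask_t;

#define DEMO_ASSBIT(mask, i)    ((mask) = (demo_cpumask_t)1 << (i))
#define DEMO_ORMASK(dst, src)   ((dst) |= (src))
#define DEMO_NANDMASK(dst, src) ((dst) &= ~(src))
#define DEMO_TESTMASK(a, b)     (((a) & (b)) != 0)

int
main(void)
{
        demo_cpumask_t curproc = ~(demo_cpumask_t)0;    /* all busy */
        demo_cpumask_t rdyproc = 0;
        demo_cpumask_t mask;
        int i;

        for (i = 0; i < 4; ++i) {
                DEMO_ASSBIT(mask, i);           /* mask = just cpu i */
                DEMO_ORMASK(rdyproc, mask);     /* cpu i ready */
                DEMO_NANDMASK(curproc, mask);   /* cpu i has no curproc */
        }
        DEMO_ASSBIT(mask, 2);
        printf("cpu2 ready: %d, rdy=%#llx cur=%#llx\n",
            DEMO_TESTMASK(rdyproc, mask),
            (unsigned long long)rdyproc, (unsigned long long)curproc);
        return (0);
}
#endif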
/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
        int i;
        int smt_not_supported = 0;
        int cache_coherent_not_supported = 0;

        if (bootverbose)
                kprintf("Start usched_bsd4 helpers on cpus:\n");

        sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
        usched_bsd4_sysctl_tree =
                SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
                                SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
                                "usched_bsd4", CTLFLAG_RD, 0, "");

        for (i = 0; i < ncpus; ++i) {
                bsd4_pcpu_t dd = &bsd4_pcpu[i];
                cpumask_t mask;

                CPUMASK_ASSBIT(mask, i);

                if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
                        continue;

                dd->cpunode = get_cpu_node_by_cpuid(i);

                if (dd->cpunode == NULL) {
                        smt_not_supported = 1;
                        cache_coherent_not_supported = 1;
                        if (bootverbose)
                                kprintf(" cpu%d - WARNING: No CPU NODE "
                                        "found for cpu\n", i);
                } else {
                        switch (dd->cpunode->type) {
                        case THREAD_LEVEL:
                                if (bootverbose)
                                        kprintf(" cpu%d - HyperThreading "
                                                "available. Core siblings: ",
                                                i);
                                break;
                        case CORE_LEVEL:
                                smt_not_supported = 1;

                                if (bootverbose)
                                        kprintf(" cpu%d - No HT available, "
                                                "multi-core/physical "
                                                "cpu. Physical siblings: ",
                                                i);
                                break;
                        case CHIP_LEVEL:
                                smt_not_supported = 1;

                                if (bootverbose)
                                        kprintf(" cpu%d - No HT available, "
                                                "single-core/physical cpu. "
                                                "Package siblings: ",
                                                i);
                                break;
                        default:
                                /* Let's go for safe defaults here */
                                smt_not_supported = 1;
                                cache_coherent_not_supported = 1;
                                if (bootverbose)
                                        kprintf(" cpu%d - Unknown cpunode->"
                                                "type=%u. Siblings: ",
                                                i,
                                                (u_int)dd->cpunode->type);
                                break;
                        }

                        if (bootverbose) {
                                if (dd->cpunode->parent_node != NULL) {
                                        kprint_cpuset(&dd->cpunode->
                                                      parent_node->members);
                                        kprintf("\n");
                                } else {
                                        kprintf(" no siblings\n");
                                }
                        }
                }

                lwkt_create(sched_thread, NULL, &dd->helper_thread, NULL,
                            0, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has
                 * already been enabled in rqinit().
                 */
                if (i)
                        ATOMIC_CPUMASK_NANDMASK(bsd4_curprocmask, mask);
                ATOMIC_CPUMASK_ORMASK(bsd4_rdyprocmask, mask);
                dd->upri = PRIBASE_NULL;
        }

        /* usched_bsd4 sysctl configurable parameters */

        SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                       OID_AUTO, "rrinterval", CTLFLAG_RW,
                       &usched_bsd4_rrinterval, 0, "");
        SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                       OID_AUTO, "decay", CTLFLAG_RW,
                       &usched_bsd4_decay, 0, "Extra decay when not running");
        SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                       OID_AUTO, "batch_time", CTLFLAG_RW,
                       &usched_bsd4_batch_time, 0, "Min batch counter value");
        SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx,
                        SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                        OID_AUTO, "kicks", CTLFLAG_RW,
                        &usched_bsd4_kicks, "Number of kickstarts");

        /* Add enable/disable option for SMT scheduling if supported */
        if (smt_not_supported) {
                usched_bsd4_smt = 0;
                SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
                                  SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                                  OID_AUTO, "smt", CTLFLAG_RD,
                                  "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
        } else {
                usched_bsd4_smt = 1;
                SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                               SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                               OID_AUTO, "smt", CTLFLAG_RW,
                               &usched_bsd4_smt, 0, "Enable SMT scheduling");
        }

        /*
         * Add enable/disable option for cache coherent scheduling
         * if supported
         */
        if (cache_coherent_not_supported) {
                usched_bsd4_cache_coherent = 0;
                SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
                                  SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                                  OID_AUTO, "cache_coherent", CTLFLAG_RD,
                                  "NOT SUPPORTED", 0,
                                  "Cache coherence NOT SUPPORTED");
        } else {
                usched_bsd4_cache_coherent = 1;
                SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                               SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                               OID_AUTO, "cache_coherent", CTLFLAG_RW,
                               &usched_bsd4_cache_coherent, 0,
                               "Enable/Disable cache coherent scheduling");

                SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                               SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                               OID_AUTO, "upri_affinity", CTLFLAG_RW,
                               &usched_bsd4_upri_affinity, 1,
                               "Number of PPQs in user priority check");

                SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
                               SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                               OID_AUTO, "queue_checks", CTLFLAG_RW,
                               &usched_bsd4_queue_checks, 5,
                               "LWPs to check from a queue before giving up");

                SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
                                SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
                                OID_AUTO, "stick_to_level",
                                CTLTYPE_INT | CTLFLAG_RW,
                                NULL, sizeof usched_bsd4_stick_to_level,
                                sysctl_usched_bsd4_stick_to_level, "I",
                                "Stick a process to this level.  See sysctl "
                                "parameter hw.cpu_topology.level_description");
        }
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
        sched_thread_cpu_init, NULL);
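/*
 * Illustrative sketch (not part of the scheduler): the knobs registered
 * above appear under kern.usched_bsd4.* and can be read or tuned from
 * userland with sysctlbyname(3).  A standalone example is kept under
 * #if 0; error handling is minimal and the node names assume the
 * registrations above succeeded.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int rrinterval;
        size_t len = sizeof(rrinterval);

        /* Read the current round-robin interval. */
        if (sysctlbyname("kern.usched_bsd4.rrinterval",
            &rrinterval, &len, NULL, 0) == 0)
                printf("rrinterval: %d\n", rrinterval);

        /*
         * Writing a knob works the same way (requires privilege), e.g.:
         * sysctlbyname("kern.usched_bsd4.smt", NULL, NULL,
         *     &newval, sizeof(newval));
         */
        return (0);
}
#endif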