/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ	(MAXPRI / NQS)		/* priorities per queue */
#define PPQMASK	(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_yield(struct lwp *lp);

#ifdef SMP
static void need_user_resched_remote(void *dummy);
static int batchy_looser_pri_test(struct lwp *lp);
static struct lwp *chooseproc_locked_cache_coherent(struct lwp *chklp);
#endif
static struct lwp *chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	NULL,			/* setcpumask not supported */
	bsd4_yield
};

struct usched_bsd4_pcpu {
	struct thread	helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
	struct lwp	*old_uschedcp;
#ifdef SMP
	cpu_node_t	*cpunode;
#endif
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
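/*
 * Illustrative example of the mapping above (a sketch, not part of the
 * scheduler logic): with MAXPRI 128 and NQS 32, PPQ is 4, so a
 * normal-class lwp whose lwp_priority is PRIBASE_NORMAL + 57 maps to
 * queue index (57 & PRIMASK) / PPQ = 14.  While that queue is non-empty
 * bit 14 stays set in bsd4_queuebits, and chooseproc_locked() locates
 * the most urgent non-empty queue with a single bsfl()/ffs().
 */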
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
static cpumask_t bsd4_curprocmask = -1;	/* currently running a user process */
static cpumask_t bsd4_rdyprocmask;	/* ready to accept a user process */
static int	 bsd4_runqcount;
#ifdef SMP
static volatile int bsd4_scancpu;
#endif
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
static struct sysctl_oid *usched_bsd4_sysctl_tree;

/* Debug info exposed through debug.* sysctl */

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, &bsd4_runqcount, 0,
    "Number of runnable lwps on the run queues");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
    &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
    &usched_optimal, 0, "acquire_curproc() was optimal");
#endif

static int usched_bsd4_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_bsd4_debug, 0,
    "Print debug information for this pid");
static int usched_bsd4_pid_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, pid_debug, CTLFLAG_RW, &usched_bsd4_pid_debug, 0,
    "Print KTR debug information for this pid");

#ifdef SMP
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
    &remote_resched_nonaffinity, 0, "Number of remote rescheds without affinity");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
    &remote_resched_affinity, 0, "Number of remote rescheds with affinity");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
    &choose_affinity, 0, "chooseproc() was smart");
#endif


/* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
#ifdef SMP
static int usched_bsd4_smt = 0;
static int usched_bsd4_cache_coherent = 0;
static int usched_bsd4_upri_affinity = 16;	/* 32 queues - half-way */
static int usched_bsd4_queue_checks = 5;
static int usched_bsd4_stick_to_level = 0;
#endif
static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
static int usched_bsd4_decay = 8;
static int usched_bsd4_batch_time = 10;

/* KTR debug printings */

KTR_INFO_MASTER(usched);

#if !defined(KTR_USCHED_BSD4)
#define	KTR_USCHED_BSD4	KTR_ALL
#endif

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
    "USCHED_BSD4(bsd4_acquire_curproc in user_resched_wanted "
    "after release: pid %d, cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
    "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
    "curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
    "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
    "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
    "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
    "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
    "USCHED_BSD4(bsd4_select_curproc before select: pid %d, "
    "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);

#ifdef SMP
KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
    "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
    "cpuid %d, verify_mask %lu)",
    pid_t pid, int cpuid, cpumask_t mask);
KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
    "USCHED_BSD4(batchy_looser_pri_test true: pid %d, "
    "cpuid %d, verify_mask %lu)",
    pid_t pid, int cpuid, cpumask_t mask);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0,
    "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, "
    "mask %lu, curr_cpuid %d)",
    pid_t pid, int cpuid, cpumask_t mask, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0,
    "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, "
    "cpuid %d, mask %lu, curr_cpuid %d)",
    pid_t pid, int cpuid, cpumask_t mask, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0,
    "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, "
    "cpuid %d, mask %lu, curr_cpuid %d)",
    pid_t pid, int cpuid, cpumask_t mask, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0,
    "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, "
    "mask %lu, found_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0,
    "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, "
    "try_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int try_cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0,
    "USCHED_BSD4(bsd4_setrunqueue found best cpu: pid %d, cpuid %d, "
    "mask %lu, found_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr);
#endif

KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0,
    "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int old_cpuid, int curr);
#ifdef SMP
KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0,
    "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int old_cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0,
    "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, "
    "sibling_mask %lu, curr_cpumask %lu)",
    pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);
KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0,
    "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, "
    "sibling_mask %lu, curr_cpumask: %lu)",
    pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr);

KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0,
    "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)",
    int id, pid_t pid, int cpuid);
KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0,
    "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)",
    int id, pid_t pid, int cpuid);
KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0,
    "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)",
    int id, cpumask_t tmpmask);
#endif

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
	int i;

	spin_init(&bsd4_spin);
	for (i = 0; i < NQS; i++) {
		TAILQ_INIT(&bsd4_queues[i]);
		TAILQ_INIT(&bsd4_rtqueues[i]);
		TAILQ_INIT(&bsd4_idqueues[i]);
	}
	atomic_clear_cpumask(&bsd4_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, rqinit, NULL)

/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel has already depressed our LWKT priority so we must not switch
 * until we have either assigned or disposed of the thread.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	thread_t td;
#if 0
	struct lwp *olp;
#endif

	/*
	 * Make sure we aren't sitting on a tsleep queue.
	 */
	td = lp->lwp_thread;
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);

		KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    mycpu->gd_cpuid);
	}

	/*
	 * Loop until we are the current user thread
	 */
	gd = mycpu;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    gd->gd_cpuid);

	do {
		/*
		 * Process any pending events and higher priority threads.
		 */
		lwkt_yield();

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current thread designation
		 * on this cpu since if we are running that other thread
		 * must not be, so we can safely deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current cpu's lwp designation
			 * away simply by replacing it.  The other thread
			 * will stall when it tries to return to userland.
			 */
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			/*
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
			*/
		} else {
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 *
			 * When we are reactivated we will have another
			 * chance.
			 */
			lwkt_deschedule(lp->lwp_thread);

			bsd4_setrunqueue(lp);

			KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    dd->uschedcp->lwp_proc->p_pid,
			    gd->gd_cpuid);

			lwkt_switch();

			/*
			 * Reload after a switch or setrunqueue/switch possibly
			 * moved us to another cpu.
			 */
			gd = mycpu;
			dd = &bsd4_pcpu[gd->gd_cpuid];

			KTR_COND_LOG(usched_bsd4_acquire_curproc_switch,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    gd->gd_cpuid);
		}
	} while (dd->uschedcp != lp);

	crit_exit_quick(td);
	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
}
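/*
 * Worked example of the acquire loop above (illustrative): suppose lwp A
 * (lwp_priority 140) holds dd->uschedcp and lwp B (lwp_priority 132)
 * returns to userland on the same cpu.  Since dd->upri (140) > 132 and
 * lower values are better, B steals the designation in-place.  A then
 * fails the dd->uschedcp == lp test on its next pass, parks itself on
 * the run queue via bsd4_setrunqueue(), and lwkt_switch()es away until
 * a chooseproc_locked() call selects it again.
 */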
/*
 * BSD4_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run or block in the kernel (at
 * kernel priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * NOTE: This implementation differs from the dummy example in that
 * bsd4_select_curproc() is able to select the current process, whereas
 * dummy_select_curproc() is not able to select the current process.
 * This means we have to NULL out uschedcp.
 *
 * Additionally, note that we may already be on a run queue if releasing
 * via the lwkt_switch() in bsd4_setrunqueue().
 *
 * MPSAFE
 */

static void
bsd4_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	if (dd->uschedcp == lp) {
		crit_enter();
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);

		KTR_COND_LOG(usched_bsd4_release_curproc,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    gd->gd_cpuid);

		dd->uschedcp = NULL;	/* don't let lp be selected */
		dd->upri = PRIBASE_NULL;
		atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		dd->old_uschedcp = lp;	/* used only for KTR debug prints */
		bsd4_select_curproc(gd);
		crit_exit();
	}
}

/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
#ifdef SMP
	if (usched_bsd4_cache_coherent)
		nlp = chooseproc_locked_cache_coherent(dd->uschedcp);
	else
#endif
		nlp = chooseproc_locked(dd->uschedcp);

	if (nlp) {

		KTR_COND_LOG(usched_bsd4_select_curproc,
		    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    nlp->lwp_proc->p_pid,
		    nlp->lwp_thread->td_gd->gd_cpuid,
		    dd->old_uschedcp->lwp_proc->p_pid,
		    dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
		    gd->gd_cpuid);

		atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid));
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		spin_unlock(&bsd4_spin);
#ifdef SMP
		lwkt_acquire(nlp->lwp_thread);
#endif
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}

#if 0
	} else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		lwkt_schedule(&dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#endif
	crit_exit_gd(gd);
}
#ifdef SMP

/*
 * batchy_looser_pri_test() - determine if a process is batchy or not
 * relative to the other processes running in the system
 */
static int
batchy_looser_pri_test(struct lwp *lp)
{
	cpumask_t mask;
	bsd4_pcpu_t other_dd;
	int cpu;

	/* Current running processes */
	mask = bsd4_curprocmask & smp_active_mask &
	    usched_global_cpumask;

	while (mask) {
		cpu = BSFCPUMASK(mask);
		other_dd = &bsd4_pcpu[cpu];
		if (other_dd->upri - lp->lwp_priority >
		    usched_bsd4_upri_affinity * PPQ) {

			KTR_COND_LOG(usched_batchy_test_false,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)mask);

			return 0;
		}
		mask &= ~CPUMASK(cpu);
	}

	KTR_COND_LOG(usched_batchy_test_true,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)mask);

	return 1;
}

#endif
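/*
 * Worked example of the batchy test above (illustrative, using the
 * defaults): with usched_bsd4_upri_affinity 16 and PPQ 4 the threshold
 * is 64 priority units.  The test returns 0 (treat lp as interactive)
 * as soon as some running cpu's upri exceeds lp->lwp_priority by more
 * than 64, i.e. lp is at least 16 queues more urgent than what that
 * cpu is currently running; otherwise lp is considered batchy.
 */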
/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
#ifdef SMP
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;
#endif

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

#ifndef SMP
	/*
	 * If we are not SMP we do not have a scheduler helper to kick
	 * and must directly activate the process if none are scheduled.
	 *
	 * This is really only an issue when bootstrapping init since
	 * the caller in all other cases will be a user process, and
	 * even if released (dd->uschedcp == NULL), that process will
	 * kickstart the scheduler when it returns to user mode from
	 * the kernel.
	 */
	if (dd->uschedcp == NULL) {
		atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask);
		dd->uschedcp = lp;
		dd->upri = lp->lwp_priority;
		lwkt_schedule(lp->lwp_thread);
		crit_exit();
		return;
	}
#endif

#ifdef SMP
	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);
#endif

	/*
	 * We lose control of lp the moment we release the spinlock after
	 * having placed lp on the queue.  i.e. another cpu could pick it
	 * up and it could exit, or its priority could be further adjusted,
	 * or something like that.
	 */
	spin_lock(&bsd4_spin);
	bsd4_setrunqueue_locked(lp);
	lp->lwp_setrunqueue_ticks = sched_ticks;

#ifdef SMP
	/*
	 * Kick the scheduler helper on one of the other cpu's
	 * and request a reschedule if appropriate.
	 *
	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
	 *	 look for cpus without designated lps, then we look for
	 *	 cpus with designated lps with a worse priority than our
	 *	 process.
	 */
	++bsd4_scancpu;

	if (usched_bsd4_smt) {

		/*
		 * SMT heuristic - Try to schedule on a free physical core.
		 * If no free physical core is found, choose the one whose
		 * sibling is running the most interactive thread.
		 */

		int best_cpuid = -1;
		int min_prio = MAXPRI * MAXPRI;
		int sibling;

		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
		    smp_active_mask & usched_global_cpumask;

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)mask,
		    mycpu->gd_cpuid);

		while (mask) {
			tmpmask = ~(CPUMASK(cpuid) - 1);
			if (mask & tmpmask)
				cpuid = BSFCPUMASK(mask & tmpmask);
			else
				cpuid = BSFCPUMASK(mask);
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {
				if (dd->cpunode->parent_node->members &
				    ~dd->cpunode->members & mask) {

					KTR_COND_LOG(usched_bsd4_setrunqueue_found,
					    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
					    lp->lwp_proc->p_pid,
					    lp->lwp_thread->td_gd->gd_cpuid,
					    (unsigned long)mask,
					    cpuid,
					    mycpu->gd_cpuid);

					goto found;
				} else {
					sibling = BSFCPUMASK(
					    dd->cpunode->parent_node->members &
					    ~dd->cpunode->members);
					if (min_prio > bsd4_pcpu[sibling].upri) {
						min_prio = bsd4_pcpu[sibling].upri;
						best_cpuid = cpuid;
					}
				}
			}
			mask &= ~CPUMASK(cpuid);
		}

		if (best_cpuid != -1) {
			cpuid = best_cpuid;
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)mask,
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
	} else {
		/* Fallback to the original heuristic */
		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask &
		    smp_active_mask & usched_global_cpumask;

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)mask,
		    mycpu->gd_cpuid);

		while (mask) {
			tmpmask = ~(CPUMASK(cpuid) - 1);
			if (mask & tmpmask)
				cpuid = BSFCPUMASK(mask & tmpmask);
			else
				cpuid = BSFCPUMASK(mask);
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {

				KTR_COND_LOG(usched_bsd4_setrunqueue_found,
				    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    lp->lwp_proc->p_pid,
				    lp->lwp_thread->td_gd->gd_cpuid,
				    (unsigned long)mask,
				    cpuid,
				    mycpu->gd_cpuid);

				goto found;
			}
			mask &= ~CPUMASK(cpuid);
		}
	}

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = bsd4_curprocmask & bsd4_rdyprocmask &
	    lp->lwp_cpumask & smp_active_mask & usched_global_cpumask;

	KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)mask,
	    mycpu->gd_cpuid);

	while (mask) {
		tmpmask = ~(CPUMASK(cpuid) - 1);
		if (mask & tmpmask)
			cpuid = BSFCPUMASK(mask & tmpmask);
		else
			cpuid = BSFCPUMASK(mask);
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {

			KTR_COND_LOG(usched_bsd4_setrunqueue_found,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)mask,
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
		mask &= ~CPUMASK(cpuid);
	}

	/*
	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
	 * and round-robin.  Other cpus will pickup as they release their
	 * current lwps or become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case, we do not
	 * set the user resched flag because we are not displacing any
	 * particular running lwp.
	 */
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) {
		cpuid = 0;
	}
	gd = globaldata_find(cpuid);
	dd = &bsd4_pcpu[cpuid];

	KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    cpuid,
	    mycpu->gd_cpuid);

found:
	if (gd == mycpu) {
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			if (dd->uschedcp == NULL) {
				wakeup_mycpu(&dd->helper_thread);
			} else {
				need_user_resched();
			}
		}
	} else {
		atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid));
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
		else
			wakeup(&dd->helper_thread);
	}
#else
	/*
	 * Request a reschedule if appropriate.
	 */
	spin_unlock(&bsd4_spin);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		need_user_resched();
	}
#endif
	crit_exit();
}
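/*
 * Illustrative example of the circular scan used above: with ncpus 8
 * and a scan rotor pointing at cpuid 5, tmpmask = ~(CPUMASK(5) - 1)
 * keeps only bits 5 and up, so BSFCPUMASK(mask & tmpmask) visits cpus
 * 5-7 first; once those bits are exhausted the plain BSFCPUMASK(mask)
 * wraps the scan back around to cpu 0.  Bumping bsd4_scancpu on every
 * call spreads wakeups across the cpus instead of always favoring the
 * low cpu ids.
 */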
/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active spinlocks.
	 */
	KKASSERT(gd->gd_spinlocks_wr == 0);

	bsd4_resetpriority(lp);
#if 0
	/*
	 * if we can't call bsd4_resetpriority for some reason we must call
	 * need_user_resched().
	 */
	need_user_resched();
#endif
}

/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if (ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}

		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));

		if (usched_bsd4_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}
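/*
 * Worked example of the decay above (illustrative, using the default
 * decay factor of 8): a fully cpu-bound lwp alone on a 1-cpu system
 * measures estcpu = ESTCPUMAX (16384), so starting from lwp_estcpu 0
 * the value steps to (0*8 + 16384)/9 ~= 1820, then ~3438, and so on,
 * asymptotically approaching ESTCPUMAX.  A larger decay factor slows
 * this collapse toward the instantaneous measurement.
 */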
/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on
 * or off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
		    (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384  * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 */
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
#ifdef SMP
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				atomic_clear_cpumask(&bsd4_rdyprocmask,
				    CPUMASK(reschedcpu));
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
				    need_user_resched_remote, NULL);
			}
#else
			spin_unlock(&bsd4_spin);
			need_user_resched();
#endif
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}
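/*
 * Worked example of the RTP_PRIO_NORMAL math above (illustrative): the
 * batch detune maps lwp_estcpu to 50% of itself at lwp_batch 0 and to
 * 100% at BATCHMAX.  For nice 0 and a detuned estcpu of 8192 (half of
 * ESTCPUMAX), with the scale denominator
 * PRIO_RANGE*PPQ/NICEPPQ + ESTCPUMAX*PPQ/ESTCPUPPQ = 82 + 128 = 210:
 *
 *	nice piece:	(0 - PRIO_MIN) * PPQ / NICEPPQ	= 20 * 4 / 2 = 40
 *	estcpu piece:	8192 * PPQ / ESTCPUPPQ		= 64
 *	scaling:	(40 + 64) * MAXPRI / 210	= 63
 *
 * yielding newpriority PRIBASE_NORMAL + 63 and run queue index
 * 63 / PPQ = 15.
 */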
/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent's and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a parent waits for a child.
 *
 * MPSAFE
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}
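/*
 * Worked example of the docking above (illustrative): ESTCPUPPQ * 4 =
 * 2048 estcpu units start the child four of the 32 queues below its
 * parent, while each fork costs the parent only ESTCPUPPQ / 16 = 32
 * units.  A process that forks occasionally is barely penalized, but a
 * fork bomb accumulates estcpu quickly and degrades along with its
 * children.
 */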
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;


#ifdef SMP
again:
#endif
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

#ifdef SMP
	while ((lp->lwp_cpumask & cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}
#endif

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must err on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

#ifdef SMP
	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			++choose_affinity;
			lp = chklp;
		}
	}
#endif

	KTR_COND_LOG(usched_chooseproc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	return lp;
}
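/*
 * Illustrative example of the bitmask selection above: if
 * bsd4_rtqueuebits is 0 and bsd4_queuebits is 0x00000090 (queues 4 and
 * 7 non-empty), bsfl() returns 4 and the head of bsd4_queues[4] is
 * taken.  The bit is only cleared once that queue drains, so lwps in
 * the same queue run in FIFO order.
 */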
#ifdef SMP
/*
 * chooseproc() - with a cache coherence heuristic.  Try to pull a process
 * that has its home on the current CPU.  If the process does not have its
 * home here and is a batchy one (see batchy_looser_pri_test), we can wait
 * a sched_tick: maybe its home cpu will become free and pull it in.  In
 * any case we cannot wait more than one tick; once that tick has expired
 * we pull the process in no matter what.
 */
static
struct lwp *
chooseproc_locked_cache_coherent(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t checks;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	struct lwp *min_level_lwp = NULL;
	struct rq *min_q = NULL;
	cpumask_t siblings;
	cpu_node_t *cpunode = NULL;
	u_int32_t min_level = MAXCPU;	/* number of levels < MAXCPU */
	u_int32_t *min_which = NULL;
	u_int32_t min_pri = 0;
	u_int32_t level = 0;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

	/* Get the mask corresponding to the sysctl configured level */
	cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
	level = usched_bsd4_stick_to_level;
	while (level) {
		cpunode = cpunode->parent_node;
		level--;
	}
	/* The cpus which can elect a process */
	siblings = cpunode->members;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * Limit the number of checks/queue to a configurable value to
	 * minimize the contention (we are in a locked region).
	 */
	for (checks = 0; checks < usched_bsd4_queue_checks; checks++) {

		if ((lp->lwp_cpumask & cpumask) == 0 ||
		    ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
		     batchy_looser_pri_test(lp) &&
		     (lp->lwp_setrunqueue_ticks == sched_ticks ||
		      lp->lwp_setrunqueue_ticks == (int)(sched_ticks - 1)))) {

			KTR_COND_LOG(usched_chooseproc_cc_not_good,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
			    (unsigned long)siblings,
			    (unsigned long)cpumask);

			cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
			level = 0;
			while (cpunode) {
				if (cpunode->members & cpumask) {
					break;
				}
				cpunode = cpunode->parent_node;
				level++;
			}
			if (level < min_level) {
				min_level_lwp = lp;
				min_level = level;
				min_q = q;
				min_which = which;
				min_pri = pri;
			}

			lp = TAILQ_NEXT(lp, lwp_procq);
			if (lp == NULL) {
				*which2 &= ~(1 << pri);
				goto again;
			}
		} else {
			KTR_COND_LOG(usched_chooseproc_cc_elected,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
			    (unsigned long)siblings,
			    (unsigned long)cpumask);

			goto found;
		}
	}
	lp = min_level_lwp;
	q = min_q;
	which = min_which;
	pri = min_pri;
	KASSERT(lp, ("chooseproc: at least the first lp was good"));

found:

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must err on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	KTR_COND_LOG(usched_chooseproc_cc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	return lp;
}
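/*
 * Illustrative sketch of the level walk above: on a dual-package
 * system, an lwp whose home cpu shares our chip reaches a node whose
 * members intersect cpumask after only a hop or two up parent_node
 * (small level), while one homed on the other package intersects only
 * near the topology root (large level).  min_level_lwp therefore
 * remembers the cache-nearest candidate seen during the scan.
 */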

static
void
need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();

	/* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
	wakeup_mycpu(&dd->helper_thread);
}

#endif

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT
 * scheduler.  The user process scheduler only manages user processes
 * but it uses LWKT underneath, and a user process operating in the
 * kernel will often be 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex had previously been calculated
 * onto the appropriate run queue.  Determine if the addition requires
 * a reschedule on a cpu and return the cpuid or -1.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule, calculate the best cpu for the job.
	 *
	 * Always run reschedules on the LWP's original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}
#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
#ifdef SMP
	cpumask_t tmpmask;
	int tmpid;
#endif

	gd = mycpu;
	cpuid = gd->gd_cpuid;	/* doesn't change */
	mask = gd->gd_cpumask;	/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to
		 * the manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		tsleep_interlock(&dd->helper_thread, 0);
		spin_lock(&bsd4_spin);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfies the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if ((bsd4_curprocmask & mask) == 0) {
			/*
			 * No thread is currently scheduled.
			 */
			KKASSERT(dd->uschedcp == NULL);
			if ((nlp = chooseproc_locked(NULL)) != NULL) {

				KTR_COND_LOG(usched_sched_thread_no_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				atomic_set_cpumask(&bsd4_curprocmask, mask);
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
#ifdef SMP
				lwkt_acquire(nlp->lwp_thread);
#endif
				lwkt_schedule(nlp->lwp_thread);
			} else {
				spin_unlock(&bsd4_spin);
			}
		} else if (bsd4_runqcount) {
			if ((nlp = chooseproc_locked(dd->uschedcp)) != NULL) {

				KTR_COND_LOG(usched_sched_thread_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				spin_unlock(&bsd4_spin);
#ifdef SMP
				lwkt_acquire(nlp->lwp_thread);
#endif
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * CHAINING CONDITION TRAIN
				 *
				 * We could not deal with the scheduler wakeup
				 * request on this cpu, locate a ready scheduler
				 * with no current lp assignment and chain to it.
				 *
				 * This ensures that a wakeup race which fails
				 * due to the priority test does not leave other
				 * unscheduled cpus idle when the runqueue is
				 * not empty.
				 */
				tmpmask = ~bsd4_curprocmask &
				    bsd4_rdyprocmask & smp_active_mask;
				if (tmpmask) {
					tmpid = BSFCPUMASK(tmpmask);
					tmpdd = &bsd4_pcpu[tmpid];
					atomic_clear_cpumask(&bsd4_rdyprocmask,
					    CPUMASK(tmpid));
					spin_unlock(&bsd4_spin);
					wakeup(&tmpdd->helper_thread);
				} else {
					spin_unlock(&bsd4_spin);
				}

				KTR_LOG(usched_sched_thread_no_process_found,
				    gd->gd_cpuid,
				    (unsigned long)tmpmask);
			}
		} else {
			/*
			 * The runq is empty.
			 */
			spin_unlock(&bsd4_spin);
		}

		/*
		 * We're descheduled unless someone scheduled us.  Switch away.
		 * Exiting the critical section will cause splz() to be called
		 * for us if interrupts and such are pending.
		 */
		crit_exit_gd(gd);
		tsleep(&dd->helper_thread, PINTERLOCKED, "sched_thread_sleep", 0);
	}
}

/* sysctl stick_to_level parameter */
static int
sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = usched_bsd4_stick_to_level;

	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
		return (EINVAL);
	usched_bsd4_stick_to_level = new_val;
	return (0);
}
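/*
 * Example usage (illustrative): "sysctl kern.usched_bsd4.stick_to_level=1"
 * makes chooseproc_locked_cache_coherent() treat every cpu sharing the
 * level-1 topology node above the current cpu (e.g. the other threads
 * of the same core) as acceptable homes, while the default of 0
 * restricts election to the current cpu's own node.  The valid range
 * is bounded by cpu_topology_levels_number, as checked above.
 */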
/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;
	int cpuid;
	int smt_not_supported = 0;
	int cache_coherent_not_supported = 0;

	if (bootverbose)
		kprintf("Start scheduler helpers on cpus:\n");

	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
	usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
	    "usched_bsd4", CTLFLAG_RD, 0, "");

	for (i = 0; i < ncpus; ++i) {
		bsd4_pcpu_t dd = &bsd4_pcpu[i];
		cpumask_t mask = CPUMASK(i);

		if ((mask & smp_active_mask) == 0)
			continue;

		dd->cpunode = get_cpu_node_by_cpuid(i);

		if (dd->cpunode == NULL) {
			smt_not_supported = 1;
			cache_coherent_not_supported = 1;
			if (bootverbose)
				kprintf("\tcpu%d - WARNING: No CPU NODE "
				    "found for cpu\n", i);
		} else {
			switch (dd->cpunode->type) {
			case THREAD_LEVEL:
				if (bootverbose)
					kprintf("\tcpu%d - HyperThreading "
					    "available. Core siblings: ", i);
				break;
			case CORE_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf("\tcpu%d - No HT available, "
					    "multi-core/physical cpu. "
					    "Physical siblings: ", i);
				break;
			case CHIP_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf("\tcpu%d - No HT available, "
					    "single-core/physical cpu. "
					    "Package Siblings: ", i);
				break;
			default:
				/* Let's go for safe defaults here */
				smt_not_supported = 1;
				cache_coherent_not_supported = 1;
				if (bootverbose)
					kprintf("\tcpu%d - Unknown "
					    "cpunode->type=%u. Siblings: ",
					    i, (unsigned int)dd->cpunode->type);
				break;
			}

			if (bootverbose) {
				if (dd->cpunode->parent_node != NULL) {
					CPUSET_FOREACH(cpuid,
					    dd->cpunode->parent_node->members)
						kprintf("cpu%d ", cpuid);
					kprintf("\n");
				} else {
					kprintf(" no siblings\n");
				}
			}
		}

		lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread,
		    0, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has already
		 * been enabled in rqinit().
		 */
		if (i)
			atomic_clear_cpumask(&bsd4_curprocmask, mask);
		atomic_set_cpumask(&bsd4_rdyprocmask, mask);
		dd->upri = PRIBASE_NULL;

	}

	/* usched_bsd4 sysctl configurable parameters */

	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "rrinterval", CTLFLAG_RW,
	    &usched_bsd4_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "decay", CTLFLAG_RW,
	    &usched_bsd4_decay, 0, "Extra decay when not running");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "batch_time", CTLFLAG_RW,
	    &usched_bsd4_batch_time, 0, "Minimum batch counter value");

	/* Add enable/disable option for SMT scheduling if supported */
	if (smt_not_supported) {
		usched_bsd4_smt = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "smt", CTLFLAG_RD,
		    "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
	} else {
		usched_bsd4_smt = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "smt", CTLFLAG_RW,
		    &usched_bsd4_smt, 0, "Enable/Disable SMT scheduling");
	}

	/* Add enable/disable option for cache coherent scheduling if supported */
	if (cache_coherent_not_supported) {
#ifdef SMP
		usched_bsd4_cache_coherent = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "cache_coherent", CTLFLAG_RD,
		    "NOT SUPPORTED", 0, "Cache coherence NOT SUPPORTED");
#endif
	} else {
#ifdef SMP
		usched_bsd4_cache_coherent = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "cache_coherent", CTLFLAG_RW,
		    &usched_bsd4_cache_coherent, 0,
		    "Enable/Disable cache coherent scheduling");
#endif

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "upri_affinity", CTLFLAG_RW,
		    &usched_bsd4_upri_affinity, 1,
		    "Number of PPQs in user priority check");

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "queue_checks", CTLFLAG_RW,
		    &usched_bsd4_queue_checks, 5,
		    "Number of LWPs to check from a queue before giving up");

		SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
		    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		    OID_AUTO, "stick_to_level", CTLTYPE_INT | CTLFLAG_RW,
		    NULL, sizeof usched_bsd4_stick_to_level,
		    sysctl_usched_bsd4_stick_to_level, "I",
		    "Stick a process to this level.  See the sysctl "
		    "parameter hw.cpu_topology.level_description");
	}
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_thread_cpu_init, NULL)

#else /* No SMP options - just add the configurable parameters to sysctl */

static void
sched_sysctl_tree_init(void)
{
	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
	usched_bsd4_sysctl_tree = SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
	    "usched_bsd4", CTLFLAG_RD, 0, "");

	/* usched_bsd4 sysctl configurable parameters */
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "rrinterval", CTLFLAG_RW,
	    &usched_bsd4_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "decay", CTLFLAG_RW,
	    &usched_bsd4_decay, 0, "Extra decay when not running");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
	    SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
	    OID_AUTO, "batch_time", CTLFLAG_RW,
	    &usched_bsd4_batch_time, 0, "Minimum batch counter value");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_sysctl_tree_init, NULL)
#endif