/*
 * Copyright (c) 2012 The DragonFly Project.  All rights reserved.
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>,
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * and many others.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <sys/cpu_topology.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#include <sys/ktr.h>

#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI			128
#define PRIMASK			(MAXPRI - 1)
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define NQS	32			/* 32 run queues. */
#define PPQ		(MAXPRI / NQS)	/* priorities per queue */
#define PPQMASK		(PPQ - 1)

/*
 * NICEPPQ	- number of nice units per priority queue
 *
 * ESTCPUPPQ	- number of estcpu units per priority queue
 * ESTCPUMAX	- number of estcpu units
 */
#define NICEPPQ		2
#define ESTCPUPPQ	512
#define ESTCPUMAX	(ESTCPUPPQ * NQS)
#define BATCHMAX	(ESTCPUFREQ * 30)
#define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)	min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_rqindex	lwp_usdata.bsd4.rqindex
#define lwp_estcpu	lwp_usdata.bsd4.estcpu
#define lwp_batch	lwp_usdata.bsd4.batch
#define lwp_rqtype	lwp_usdata.bsd4.rqtype

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
				sysclock_t cpstamp);
static void bsd4_recalculate_estcpu(struct lwp *lp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *lp, struct proc *);
static void bsd4_uload_update(struct lwp *lp);
static void bsd4_yield(struct lwp *lp);
static void bsd4_need_user_resched_remote(void *dummy);
static int bsd4_batchy_looser_pri_test(struct lwp *lp);
static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp);
static void bsd4_kick_helper(struct lwp *lp);
static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp);
static void bsd4_remrunqueue_locked(struct lwp *lp);
static void bsd4_setrunqueue_locked(struct lwp *lp);
static void bsd4_changedcpu(struct lwp *lp);

struct usched usched_bsd4 = {
	{ NULL },
	"bsd4", "Original DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	bsd4_acquire_curproc,
	bsd4_release_curproc,
	bsd4_setrunqueue,
	bsd4_schedulerclock,
	bsd4_recalculate_estcpu,
	bsd4_resetpriority,
	bsd4_forking,
	bsd4_exiting,
	bsd4_uload_update,
	NULL,			/* setcpumask not supported */
	bsd4_yield,
	bsd4_changedcpu
};

struct usched_bsd4_pcpu {
	struct thread	*helper_thread;
	short		rrcount;
	short		upri;
	struct lwp	*uschedcp;
	struct lwp	*old_uschedcp;
	cpu_node_t	*cpunode;
};

typedef struct usched_bsd4_pcpu	*bsd4_pcpu_t;

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
 */
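
/*
 * Illustrative sketch (not part of the original code): how a normal-class
 * priority maps to a queue and how the next busy queue is found.  With
 * MAXPRI = 128 and NQS = 32, PPQ is 4, so
 *
 *	rqindex = (priority & PRIMASK) / PPQ;
 *
 * e.g. priority 57 lands on queue 57 / 4 = 14.  Selection is then a
 * single bit scan over the queuebits word; if queues 3 and 10 are busy
 * the word is 0x00000408 and bsfl() returns 3, the best busy queue.
 */
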
static struct rq bsd4_queues[NQS];
static struct rq bsd4_rtqueues[NQS];
static struct rq bsd4_idqueues[NQS];
static u_int32_t bsd4_queuebits;
static u_int32_t bsd4_rtqueuebits;
static u_int32_t bsd4_idqueuebits;
/* currently running a user process */
static cpumask_t bsd4_curprocmask = CPUMASK_INITIALIZER_ALLONES;
/* ready to accept a user process */
static cpumask_t bsd4_rdyprocmask;
static int	 bsd4_runqcount;
static volatile int bsd4_scancpu;
static struct spinlock bsd4_spin;
static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU];
static struct sysctl_ctx_list usched_bsd4_sysctl_ctx;
static struct sysctl_oid *usched_bsd4_sysctl_tree;

/* Debug info exposed through debug.* sysctl */

SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD,
	   &bsd4_runqcount, 0,
	   "Number of queued user processes");

static int usched_bsd4_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW,
	   &usched_bsd4_debug, 0,
	   "Print debug information for this pid");

static int usched_bsd4_pid_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW,
	   &usched_bsd4_pid_debug, 0,
	   "Print KTR debug information for this pid");

/* Tuning usched_bsd4 - configurable through kern.usched_bsd4.* */
static int usched_bsd4_smt = 0;
static int usched_bsd4_cache_coherent = 0;
static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */
static int usched_bsd4_queue_checks = 5;
static int usched_bsd4_stick_to_level = 0;
static long usched_bsd4_kicks;
static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
static int usched_bsd4_decay = 8;
static int usched_bsd4_batch_time = 10;

/* KTR debug printings */

KTR_INFO_MASTER_EXTERN(usched);

#if !defined(KTR_USCHED_BSD4)
#define	KTR_USCHED_BSD4	KTR_ALL
#endif

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0,
    "USCHED_BSD4(bsd4_acquire_curproc in user_resched_wanted "
    "after release: pid %d, cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0,
    "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, "
    "curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0,
    "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after "
    "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid);
KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0,
    "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0,
    "USCHED_BSD4(bsd4_release_curproc before select: pid %d, "
    "cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0,
    "USCHED_BSD4(bsd4_select_curproc after select: pid %d, "
    "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)",
    pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr);

KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0,
    "USCHED_BSD4(batchy_looser_pri_test false: pid %d, "
    "cpuid %d, verify_mask %lu)",
    pid_t pid, int cpuid, unsigned long mask);
KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0,
"USCHED_BSD4(batchy_looser_pri_test true: pid %d, " 240 "cpuid %d, verify_mask %lu)", 241 pid_t pid, int cpuid, unsigned long mask); 242 243 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, 244 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " 245 "mask %lu, curr_cpuid %d)", 246 pid_t pid, int cpuid, unsigned long mask, int curr); 247 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, 248 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " 249 "cpuid %d, mask %lu, curr_cpuid %d)", 250 pid_t pid, int cpuid, unsigned long mask, int curr); 251 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, 252 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " 253 "cpuid %d, mask %lu, curr_cpuid %d)", 254 pid_t pid, int cpuid, unsigned long mask, int curr); 255 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, 256 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 257 "mask %lu, found_cpuid %d, curr_cpuid %d)", 258 pid_t pid, int cpuid, unsigned long mask, int found_cpuid, int curr); 259 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, 260 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " 261 "try_cpuid %d, curr_cpuid %d)", 262 pid_t pid, int cpuid, int try_cpuid, int curr); 263 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, 264 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 265 "mask %lu, found_cpuid %d, curr_cpuid %d)", 266 pid_t pid, int cpuid, unsigned long mask, int found_cpuid, int curr); 267 268 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_chooseproc, 0, 269 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", 270 pid_t pid, int old_cpuid, int curr); 271 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, 272 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", 273 pid_t pid, int old_cpuid, int curr); 274 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, 275 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " 276 "sibling_mask %lu, curr_cpumask %lu)", 277 pid_t pid, unsigned long old_cpumask, unsigned long sibling_mask, unsigned long curr); 278 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, 279 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " 280 "sibling_mask %lu, curr_cpumask: %lu)", 281 pid_t pid, unsigned long old_cpumask, unsigned long sibling_mask, unsigned long curr); 282 283 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, 284 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", 285 int id, pid_t pid, int cpuid); 286 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, 287 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", 288 int id, pid_t pid, int cpuid); 289 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, 290 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", 291 int id, unsigned long tmpmask); 292 293 /* 294 * Initialize the run queues at boot time. 295 */ 296 static void 297 bsd4_rqinit(void *dummy) 298 { 299 int i; 300 301 spin_init(&bsd4_spin, "bsd4rq"); 302 for (i = 0; i < NQS; i++) { 303 TAILQ_INIT(&bsd4_queues[i]); 304 TAILQ_INIT(&bsd4_rtqueues[i]); 305 TAILQ_INIT(&bsd4_idqueues[i]); 306 } 307 ATOMIC_CPUMASK_NANDBIT(bsd4_curprocmask, 0); 308 } 309 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL); 310 311 /* 312 * BSD4_ACQUIRE_CURPROC 313 * 314 * This function is called when the kernel intends to return to userland. 
/*
 * BSD4_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 * occur, this function is called only under very controlled circumstances.
 *
 * MPSAFE
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	thread_t td;
#if 0
	struct lwp *olp;
#endif

	/*
	 * Make sure we aren't sitting on a tsleep queue.
	 */
	td = lp->lwp_thread;
	crit_enter_quick(td);
	if (td->td_flags & TDF_TSLEEPQ)
		tsleep_remove(td);
	bsd4_recalculate_estcpu(lp);

	/*
	 * If a reschedule was requested, give another thread the
	 * driver's seat.
	 */
	if (user_resched_wanted()) {
		clear_user_resched();
		bsd4_release_curproc(lp);

		KTR_COND_LOG(usched_bsd4_acquire_curproc_urw,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    mycpu->gd_cpuid);
	}

	/*
	 * Loop until we are the current user thread.
	 */
	gd = mycpu;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    gd->gd_cpuid);

	do {
		/*
		 * Process any pending events and higher priority threads.
		 */
		lwkt_yield();

		/*
		 * Become the currently scheduled user thread for this cpu
		 * if we can do so trivially.
		 *
		 * We can steal another thread's current thread designation
		 * on this cpu since if we are running that other thread
		 * must not be, so we can safely deschedule it.
		 */
		if (dd->uschedcp == lp) {
			/*
			 * We are already the current lwp (hot path).
			 */
			dd->upri = lp->lwp_priority;
		} else if (dd->uschedcp == NULL) {
			/*
			 * We can trivially become the current lwp.
			 */
			ATOMIC_CPUMASK_ORBIT(bsd4_curprocmask, gd->gd_cpuid);
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
		} else if (dd->upri > lp->lwp_priority) {
			/*
			 * We can steal the current cpu's lwp designation
			 * away simply by replacing it.  The other thread
			 * will stall when it tries to return to userland.
			 */
			dd->uschedcp = lp;
			dd->upri = lp->lwp_priority;
			/*
			lwkt_deschedule(olp->lwp_thread);
			bsd4_setrunqueue(olp);
			*/
		} else {
			/*
			 * We cannot become the current lwp, place the lp
			 * on the bsd4 run-queue and deschedule ourselves.
			 *
			 * When we are reactivated we will have another
			 * chance.
			 */
			lwkt_deschedule(lp->lwp_thread);

			bsd4_setrunqueue(lp);

			KTR_COND_LOG(usched_bsd4_acquire_curproc_not,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    dd->uschedcp->lwp_proc->p_pid,
			    gd->gd_cpuid);

			lwkt_switch();

			/*
			 * Reload after a switch or setrunqueue/switch possibly
			 * moved us to another cpu.
			 */
439 */ 440 gd = mycpu; 441 dd = &bsd4_pcpu[gd->gd_cpuid]; 442 443 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch, 444 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 445 lp->lwp_proc->p_pid, 446 lp->lwp_thread->td_gd->gd_cpuid, 447 gd->gd_cpuid); 448 } 449 } while (dd->uschedcp != lp); 450 451 crit_exit_quick(td); 452 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 453 } 454 455 /* 456 * BSD4_RELEASE_CURPROC 457 * 458 * This routine detaches the current thread from the userland scheduler, 459 * usually because the thread needs to run or block in the kernel (at 460 * kernel priority) for a while. 461 * 462 * This routine is also responsible for selecting a new thread to 463 * make the current thread. 464 * 465 * NOTE: This implementation differs from the dummy example in that 466 * bsd4_select_curproc() is able to select the current process, whereas 467 * dummy_select_curproc() is not able to select the current process. 468 * This means we have to NULL out uschedcp. 469 * 470 * Additionally, note that we may already be on a run queue if releasing 471 * via the lwkt_switch() in bsd4_setrunqueue(). 472 * 473 * MPSAFE 474 */ 475 476 static void 477 bsd4_release_curproc(struct lwp *lp) 478 { 479 globaldata_t gd = mycpu; 480 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 481 482 if (dd->uschedcp == lp) { 483 crit_enter(); 484 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 485 486 KTR_COND_LOG(usched_bsd4_release_curproc, 487 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 488 lp->lwp_proc->p_pid, 489 lp->lwp_thread->td_gd->gd_cpuid, 490 gd->gd_cpuid); 491 492 dd->uschedcp = NULL; /* don't let lp be selected */ 493 dd->upri = PRIBASE_NULL; 494 ATOMIC_CPUMASK_NANDBIT(bsd4_curprocmask, gd->gd_cpuid); 495 dd->old_uschedcp = lp; /* used only for KTR debug prints */ 496 bsd4_select_curproc(gd); 497 crit_exit(); 498 } 499 } 500 501 /* 502 * BSD4_SELECT_CURPROC 503 * 504 * Select a new current process for this cpu and clear any pending user 505 * reschedule request. The cpu currently has no current process. 506 * 507 * This routine is also responsible for equal-priority round-robining, 508 * typically triggered from bsd4_schedulerclock(). In our dummy example 509 * all the 'user' threads are LWKT scheduled all at once and we just 510 * call lwkt_switch(). 511 * 512 * The calling process is not on the queue and cannot be selected. 
/*
 * BSD4_SELECT_CURPROC
 *
 * Select a new current process for this cpu and clear any pending user
 * reschedule request.  The cpu currently has no current process.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from bsd4_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * The calling process is not on the queue and cannot be selected.
 *
 * MPSAFE
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];
	struct lwp *nlp;
	int cpuid = gd->gd_cpuid;

	crit_enter_gd(gd);

	spin_lock(&bsd4_spin);
	if (usched_bsd4_cache_coherent)
		nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp);
	else
		nlp = bsd4_chooseproc_locked(dd->uschedcp);

	if (nlp) {
		KTR_COND_LOG(usched_bsd4_select_curproc,
		    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    nlp->lwp_proc->p_pid,
		    nlp->lwp_thread->td_gd->gd_cpuid,
		    dd->old_uschedcp->lwp_proc->p_pid,
		    dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid,
		    gd->gd_cpuid);

		ATOMIC_CPUMASK_ORBIT(bsd4_curprocmask, cpuid);
		dd->upri = nlp->lwp_priority;
		dd->uschedcp = nlp;
		dd->rrcount = 0;		/* reset round robin */
		spin_unlock(&bsd4_spin);
		lwkt_acquire(nlp->lwp_thread);
		lwkt_schedule(nlp->lwp_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}

#if 0
	} else if (bsd4_runqcount && CPUMASK_TESTBIT(bsd4_rdyprocmask, cpuid)) {
		ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, cpuid);
		spin_unlock(&bsd4_spin);
		lwkt_schedule(dd->helper_thread);
	} else {
		spin_unlock(&bsd4_spin);
	}
#endif
	crit_exit_gd(gd);
}

/*
 * batchy_looser_pri_test() - determine if a process is batchy or not
 * relative to the other processes running in the system
 */
static int
bsd4_batchy_looser_pri_test(struct lwp *lp)
{
	cpumask_t mask;
	bsd4_pcpu_t other_dd;
	int cpu;

	/* Current running processes */
	mask = bsd4_curprocmask;
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	while (CPUMASK_TESTNZERO(mask)) {
		cpu = BSFCPUMASK(mask);
		other_dd = &bsd4_pcpu[cpu];
		if (other_dd->upri - lp->lwp_priority >
		    usched_bsd4_upri_affinity * PPQ) {
			KTR_COND_LOG(usched_batchy_test_false,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask));

			return 0;
		}
		CPUMASK_NANDBIT(mask, cpu);
	}

	KTR_COND_LOG(usched_batchy_test_true,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)CPUMASK_LOWMASK(mask));

	return 1;
}

/*
 * BSD4_SETRUNQUEUE
 *
 * Place the specified lwp on the user scheduler's run queue.  This routine
 * must be called with the thread descheduled.  The lwp must be runnable.
 *
 * The thread may be the current thread as a special case.
 *
 * MPSAFE
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	int cpuid;
	cpumask_t mask;
	cpumask_t tmpmask;

	/*
	 * First validate the process state relative to the current cpu.
	 * We don't need the spinlock for this, just a critical section.
	 * We are in control of the process.
	 */
	crit_enter();
	KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
	    ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
	     lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

	/*
	 * Note: gd and dd are relative to the target thread's last cpu,
	 * NOT our current cpu.
	 */
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * This process is not supposed to be scheduled anywhere or assigned
	 * as the current process anywhere.  Assert the condition.
	 */
	KKASSERT(dd->uschedcp != lp);

	/*
	 * XXX fixme.  Could be part of a remrunqueue/setrunqueue
	 * operation when the priority is recalculated, so TDF_MIGRATING
	 * may already be set.
	 */
	if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
		lwkt_giveaway(lp->lwp_thread);

	/*
	 * We lose control of lp the moment we release the spinlock after
	 * having placed lp on the queue.  i.e. another cpu could pick it
	 * up and it could exit, or its priority could be further adjusted,
	 * or something like that.
	 */
	spin_lock(&bsd4_spin);
	bsd4_setrunqueue_locked(lp);
	lp->lwp_rebal_ticks = sched_ticks;

	/*
	 * Kick the scheduler helper on one of the other cpus
	 * and request a reschedule if appropriate.
	 *
	 * NOTE: We check all cpus whose rdyprocmask is set.  First we
	 *	 look for cpus without designated lps, then we look for
	 *	 cpus with designated lps with a worse priority than our
	 *	 process.
	 */
	++bsd4_scancpu;

	if (usched_bsd4_smt) {
		/*
		 * SMT heuristic - Try to schedule on a free physical core.
		 * If no free physical core is found, choose the one that
		 * has an interactive thread.
		 */
		int best_cpuid = -1;
		int min_prio = MAXPRI * MAXPRI;
		int sibling;

		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = bsd4_rdyprocmask;
		CPUMASK_NANDMASK(mask, bsd4_curprocmask);
		CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
		CPUMASK_ANDMASK(mask, smp_active_mask);
		CPUMASK_ANDMASK(mask, usched_global_cpumask);

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)CPUMASK_LOWMASK(mask),
		    mycpu->gd_cpuid);

		while (CPUMASK_TESTNZERO(mask)) {
			CPUMASK_ASSNBMASK(tmpmask, cpuid);
			if (CPUMASK_TESTMASK(tmpmask, mask)) {
				CPUMASK_ANDMASK(tmpmask, mask);
				cpuid = BSFCPUMASK(tmpmask);
			} else {
				cpuid = BSFCPUMASK(mask);
			}
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {
				tmpmask = dd->cpunode->parent_node->members;
				CPUMASK_NANDMASK(tmpmask,
				    dd->cpunode->members);
				CPUMASK_ANDMASK(tmpmask, mask);
				if (CPUMASK_TESTNZERO(tmpmask)) {
					KTR_COND_LOG(usched_bsd4_setrunqueue_found,
					    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
					    lp->lwp_proc->p_pid,
					    lp->lwp_thread->td_gd->gd_cpuid,
					    (unsigned long)CPUMASK_LOWMASK(mask),
					    cpuid,
					    mycpu->gd_cpuid);

					goto found;
				} else {
					tmpmask =
					    dd->cpunode->parent_node->members;
					CPUMASK_NANDMASK(tmpmask,
					    dd->cpunode->members);
					sibling = BSFCPUMASK(tmpmask);
					if (min_prio >
					    bsd4_pcpu[sibling].upri) {
						min_prio =
						    bsd4_pcpu[sibling].upri;
						best_cpuid = cpuid;
					}
				}
			}
			CPUMASK_NANDBIT(mask, cpuid);
		}

		if (best_cpuid != -1) {
			cpuid = best_cpuid;
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask),
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
	} else {
		/* Fallback to the original heuristic */
		cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
		mask = bsd4_rdyprocmask;
		CPUMASK_NANDMASK(mask, bsd4_curprocmask);
		CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
		CPUMASK_ANDMASK(mask, smp_active_mask);
		CPUMASK_ANDMASK(mask, usched_global_cpumask);

		KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt,
		    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
		    lp->lwp_proc->p_pid,
		    lp->lwp_thread->td_gd->gd_cpuid,
		    (unsigned long)CPUMASK_LOWMASK(mask),
		    mycpu->gd_cpuid);

		while (CPUMASK_TESTNZERO(mask)) {
			CPUMASK_ASSNBMASK(tmpmask, cpuid);
			if (CPUMASK_TESTMASK(tmpmask, mask)) {
				CPUMASK_ANDMASK(tmpmask, mask);
				cpuid = BSFCPUMASK(tmpmask);
			} else {
				cpuid = BSFCPUMASK(mask);
			}
			gd = globaldata_find(cpuid);
			dd = &bsd4_pcpu[cpuid];

			if ((dd->upri & ~PPQMASK) >=
			    (lp->lwp_priority & ~PPQMASK)) {
				KTR_COND_LOG(usched_bsd4_setrunqueue_found,
				    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    lp->lwp_proc->p_pid,
				    lp->lwp_thread->td_gd->gd_cpuid,
				    (unsigned long)CPUMASK_LOWMASK(mask),
				    cpuid,
				    mycpu->gd_cpuid);

				goto found;
			}
			CPUMASK_NANDBIT(mask, cpuid);
		}
	}

	/*
	 * Then cpus which might have a currently running lp
	 */
	mask = bsd4_curprocmask;
	CPUMASK_ANDMASK(mask, bsd4_rdyprocmask);
	CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
	CPUMASK_ANDMASK(mask, smp_active_mask);
	CPUMASK_ANDMASK(mask, usched_global_cpumask);

	KTR_COND_LOG(usched_bsd4_setrunqueue_rc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    (unsigned long)CPUMASK_LOWMASK(mask),
	    mycpu->gd_cpuid);

	while (CPUMASK_TESTNZERO(mask)) {
		CPUMASK_ASSNBMASK(tmpmask, cpuid);
		if (CPUMASK_TESTMASK(tmpmask, mask)) {
			CPUMASK_ANDMASK(tmpmask, mask);
			cpuid = BSFCPUMASK(tmpmask);
		} else {
			cpuid = BSFCPUMASK(mask);
		}
		gd = globaldata_find(cpuid);
		dd = &bsd4_pcpu[cpuid];

		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			KTR_COND_LOG(usched_bsd4_setrunqueue_found,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    lp->lwp_thread->td_gd->gd_cpuid,
			    (unsigned long)CPUMASK_LOWMASK(mask),
			    cpuid,
			    mycpu->gd_cpuid);

			goto found;
		}
		CPUMASK_NANDBIT(mask, cpuid);
	}
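
	/*
	 * Illustrative note on the scan loops above (summary, with an
	 * assumption about the rotor): the starting point comes from the
	 * free-running bsd4_scancpu counter,
	 *
	 *	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	 *
	 * and each loop walks the candidate mask from roughly that point,
	 * wrapping via BSFCPUMASK() once the remainder of the mask is
	 * exhausted, so successive wakeups are spread across cpus instead
	 * of always favoring cpu 0.  E.g. with ncpus = 4 successive calls
	 * start their scans at cpus 0, 1, 2, 3, 0, ...
	 */
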
	/*
	 * If we cannot find a suitable cpu we reload from bsd4_scancpu
	 * and round-robin.  Other cpus will pickup as they release their
	 * current lwps or become ready.
	 *
	 * Avoid a degenerate system lockup case if usched_global_cpumask
	 * is set to 0 or otherwise does not cover lwp_cpumask.
	 *
	 * We only kick the target helper thread in this case, we do not
	 * set the user resched flag.
	 */
	cpuid = (bsd4_scancpu & 0xFFFF) % ncpus;
	if (CPUMASK_TESTBIT(usched_global_cpumask, cpuid) == 0)
		cpuid = 0;
	gd = globaldata_find(cpuid);
	dd = &bsd4_pcpu[cpuid];

	KTR_COND_LOG(usched_bsd4_setrunqueue_not_found,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    cpuid,
	    mycpu->gd_cpuid);

found:
	if (gd == mycpu) {
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
			if (dd->uschedcp == NULL) {
				wakeup_mycpu(dd->helper_thread);
			} else {
				need_user_resched();
			}
		}
	} else {
		ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, cpuid);
		spin_unlock(&bsd4_spin);
		if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
			lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
		else
			wakeup(dd->helper_thread);
	}
	crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 * each cpu.
 *
 * This routine is called on every sched tick.  If the currently running
 * thread belongs to this scheduler it will be called with a non-NULL lp,
 * otherwise it will be called with a NULL lp.
 *
 * MPSAFE
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	/*
	 * Nothing to do if no lp is running.
	 */
	if (lp == NULL)
		return;

	/*
	 * Do we need to round-robin?  We round-robin 10 times a second.
	 * This should only occur for cpu-bound batch processes.
	 */
	if (++dd->rrcount >= usched_bsd4_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}

	/*
	 * Adjust estcpu upward using a real time equivalent calculation.
	 */
	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1);

	/*
	 * Spinlocks also hold a critical section so there should not be
	 * any active.
	 */
	KKASSERT(gd->gd_spinlocks == 0);

	bsd4_resetpriority(lp);
}
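
/*
 * Worked numbers for the clock above (illustrative, assuming the default
 * ESTCPUFREQ): every scheduler tick adds ESTCPUMAX / ESTCPUFREQ + 1 to
 * lwp_estcpu, so a thread monopolizing a cpu gains roughly ESTCPUMAX
 * (16384) estcpu per second of runtime before ESTCPULIM() caps it, and
 * the round-robin preemption fires every usched_bsd4_rrinterval ticks,
 * i.e. about 10 times per second.
 */
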
/*
 * Called from acquire and from kern_synch's one-second timer (one of the
 * callout helper threads) with a critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 *
 * MPSAFE
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	sysclock_t cpbase;
	sysclock_t ttlticks;
	int estcpu;
	int decay_factor;

	/*
	 * We have to subtract periodic to get the last schedclock
	 * timeout time, otherwise we would get the upcoming timeout.
	 * Keep in mind that a process can migrate between cpus and
	 * while the scheduler clock should be very close, boundary
	 * conditions could lead to a small negative delta.
	 */
	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

	if (lp->lwp_slptime > 1) {
		/*
		 * Too much time has passed, do a coarse correction.
		 */
		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
		bsd4_resetpriority(lp);
		lp->lwp_cpbase = cpbase;
		lp->lwp_cpticks = 0;
		lp->lwp_batch -= ESTCPUFREQ;
		if (lp->lwp_batch < 0)
			lp->lwp_batch = 0;
	} else if (lp->lwp_cpbase != cpbase) {
		/*
		 * Adjust estcpu if we are in a different tick.  Don't waste
		 * time if we are in the same tick.
		 *
		 * First calculate the number of ticks in the measurement
		 * interval.  The ttlticks calculation can wind up 0 due to
		 * a bug in the handling of lwp_slptime (as yet not found),
		 * so make sure we do not get a divide by 0 panic.
		 */
		ttlticks = (cpbase - lp->lwp_cpbase) /
			   gd->gd_schedclock.periodic;
		if ((ssysclock_t)ttlticks < 0) {
			ttlticks = 0;
			lp->lwp_cpbase = cpbase;
		}
		if (ttlticks == 0)
			return;
		updatepcpu(lp, lp->lwp_cpticks, ttlticks);

		/*
		 * Calculate the percentage of one cpu used factoring in ncpus
		 * and the load and adjust estcpu.  Handle degenerate cases
		 * by adding 1 to bsd4_runqcount.
		 *
		 * estcpu is scaled by ESTCPUMAX.
		 *
		 * bsd4_runqcount is the excess number of user processes
		 * that cannot be immediately scheduled to cpus.  We want
		 * to count these as running to avoid range compression
		 * in the base calculation (which is the actual percentage
		 * of one cpu used).
		 */
		estcpu = (lp->lwp_cpticks * ESTCPUMAX) *
			 (bsd4_runqcount + ncpus) / (ncpus * ttlticks);

		/*
		 * If estcpu is > 50% we become more batch-like
		 * If estcpu is <= 50% we become less batch-like
		 *
		 * It takes 30 cpu seconds to traverse the entire range.
		 */
		if (estcpu > ESTCPUMAX / 2) {
			lp->lwp_batch += ttlticks;
			if (lp->lwp_batch > BATCHMAX)
				lp->lwp_batch = BATCHMAX;
		} else {
			lp->lwp_batch -= ttlticks;
			if (lp->lwp_batch < 0)
				lp->lwp_batch = 0;
		}

		if (usched_bsd4_debug == lp->lwp_proc->p_pid) {
			kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d",
				lp->lwp_proc->p_pid, lp,
				estcpu, lp->lwp_estcpu,
				lp->lwp_batch,
				lp->lwp_cpticks, ttlticks);
		}
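
		/*
		 * Worked example for the calculation above (illustrative):
		 * with ncpus = 1, an empty excess queue (bsd4_runqcount = 0)
		 * and a thread that consumed every tick of the interval
		 * (lwp_cpticks == ttlticks),
		 *
		 *	estcpu = (ttlticks * ESTCPUMAX) * (0 + 1) /
		 *		 (1 * ttlticks)
		 *	       = ESTCPUMAX
		 *
		 * i.e. 100% of one cpu.  Queued-but-unrunnable processes
		 * inflate the numerator, which prevents the value from
		 * compressing toward zero under load.
		 */
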
		/*
		 * Adjust lp->lwp_estcpu.  The decay factor determines how
		 * quickly lwp_estcpu collapses to its realtime calculation.
		 * A slower collapse gives us a more accurate number but
		 * can cause a cpu hog to eat too much cpu before the
		 * scheduler decides to downgrade it.
		 *
		 * NOTE: p_nice is accounted for in bsd4_resetpriority(),
		 *	 and not here, but we must still ensure that a
		 *	 cpu-bound nice -20 process does not completely
		 *	 override a cpu-bound nice +20 process.
		 *
		 * NOTE: We must use ESTCPULIM() here to deal with any
		 *	 overshoot.
		 */
		decay_factor = usched_bsd4_decay;
		if (decay_factor < 1)
			decay_factor = 1;
		if (decay_factor > 1024)
			decay_factor = 1024;

		lp->lwp_estcpu = ESTCPULIM(
			(lp->lwp_estcpu * decay_factor + estcpu) /
			(decay_factor + 1));

		if (usched_bsd4_debug == lp->lwp_proc->p_pid)
			kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
		bsd4_resetpriority(lp);
		lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
		lp->lwp_cpticks = 0;
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 *
 * This routine may be called with any process.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on
 * or off the run queue.
 *
 * MPSAFE
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
	bsd4_pcpu_t dd;
	int newpriority;
	u_short newrqtype;
	int reschedcpu;
	int checkpri;
	int estcpu;

	/*
	 * Calculate the new priority and queue type
	 */
	crit_enter();
	spin_lock(&bsd4_spin);

	newrqtype = lp->lwp_rtprio.type;

	switch(newrqtype) {
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		newpriority = PRIBASE_REALTIME +
			      (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_NORMAL:
		/*
		 * Detune estcpu based on batchiness.  lwp_batch ranges
		 * from 0 to BATCHMAX.  Limit estcpu for the sake of
		 * the priority calculation to between 50% and 100%.
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384  * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 *
	 * td_upri has normal sense (higher values are more desirable), so
	 * negate it.
	 */
	lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK);
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}
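
	/*
	 * Worked example for the RTP_PRIO_NORMAL formula above
	 * (illustrative): for a nice 0, fully cpu-bound lwp whose detuned
	 * estcpu sits at the ceiling (ESTCPUMAX with lwp_batch = BATCHMAX):
	 *
	 *	nice part	(0 - (-20)) * 4 / 2		= 40
	 *	estcpu part	16384 * 4 / 512			= 128
	 *	scale		128 / (41 * 4 / 2 + 16384 * 4 / 512)
	 *			= 128 / 210
	 *	newpriority	128 + (40 + 128) * 128 / 210	= 128 + 102
	 *
	 * so heavier cpu use and higher nice values both push the lwp
	 * toward higher (worse) priorities and later queues.
	 */
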
	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if (CPUMASK_TESTBIT(bsd4_rdyprocmask, reschedcpu) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask,
						       reschedcpu);
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       bsd4_need_user_resched_remote,
					       NULL);
			}
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

static
void
bsd4_changedcpu(struct lwp *lp __unused)
{
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent's and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

static void
bsd4_uload_update(struct lwp *lp)
{
}
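
/*
 * Worked example for bsd4_forking() above (illustrative): with
 * ESTCPUPPQ = 512 the child starts at
 *
 *	lwp_estcpu = parent + 4 * 512	(4 queues less desirable)
 *
 * and the parent is docked 512 / 16 = 32 estcpu units per fork, so a
 * process that forks ~16 times in quick succession accumulates a full
 * queue's worth of penalty; a fork bomb rapidly demotes itself.
 */
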
/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 *	    left intact through the entire routine.
 */
static
struct lwp *
bsd4_chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	while (CPUMASK_TESTMASK(lp->lwp_cpumask, cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check, setrunqueue() tries to wakeup the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			lp = chklp;
		}
	}

	KTR_COND_LOG(usched_bsd4_chooseproc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}
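
/*
 * Worked example for the <chklp> hysteresis above (illustrative): with
 * PPQ = 4, a current designee at priority 50 is kept unless the best
 * queued lwp is better by at least a full queue:
 *
 *	chklp->lwp_priority < lp->lwp_priority + PPQ
 *	50 < 48 + 4	-> true,  keep chklp (return NULL)
 *	50 < 45 + 4	-> false, switch to lp
 *
 * The one-queue slack prevents two nearly-equal threads from ping-ponging
 * over the current-process designation.
 */
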
/*
 * chooseproc() - with a cache coherence heuristic.  Try to pull a process
 * that has its home on the current CPU.  If the process doesn't have its
 * home here and is a batchy one (see bsd4_batchy_looser_pri_test), we can
 * wait for a sched_tick; maybe its home will become free and we can pull
 * it in.  Anyway, we can't wait more than one tick.  If that tick expired,
 * we pull in that process, no matter what.
 */
static
struct lwp *
bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t checks;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	struct lwp *min_level_lwp = NULL;
	struct rq *min_q = NULL;
	cpumask_t siblings;
	cpu_node_t *cpunode = NULL;
	u_int32_t min_level = MAXCPU;	/* number of levels < MAXCPU */
	u_int32_t *min_which = NULL;
	u_int32_t min_pri = 0;
	u_int32_t level = 0;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

	/* Get the mask corresponding to the sysctl configured level */
	cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
	level = usched_bsd4_stick_to_level;
	while (level) {
		cpunode = cpunode->parent_node;
		level--;
	}
	/* The cpus which can elect a process */
	siblings = cpunode->members;
	checks = 0;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		/*
		 * No more left and we didn't reach the checks limit.
		 */
		bsd4_kick_helper(min_level_lwp);
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * Limit the number of checks/queue to a configurable value to
	 * minimize the contention (we are in a locked region).
	 */
	while (checks < usched_bsd4_queue_checks) {
		if (CPUMASK_TESTMASK(lp->lwp_cpumask, cpumask) == 0 ||
		    (CPUMASK_TESTMASK(siblings,
				      lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
		     (lp->lwp_rebal_ticks == sched_ticks ||
		      lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) &&
		     bsd4_batchy_looser_pri_test(lp))) {

			KTR_COND_LOG(usched_chooseproc_cc_not_good,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)CPUMASK_LOWMASK(
					lp->lwp_thread->td_gd->gd_cpumask),
			    (unsigned long)CPUMASK_LOWMASK(siblings),
			    (unsigned long)CPUMASK_LOWMASK(cpumask));

			cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
			level = 0;
			while (cpunode) {
				if (CPUMASK_TESTMASK(cpunode->members,
						     cpumask)) {
					break;
				}
				cpunode = cpunode->parent_node;
				level++;
			}
			if (level < min_level ||
			    (level == min_level && min_level_lwp &&
			     lp->lwp_priority < min_level_lwp->lwp_priority)) {
				bsd4_kick_helper(min_level_lwp);
				min_level_lwp = lp;
				min_level = level;
				min_q = q;
				min_which = which;
				min_pri = pri;
			} else {
				bsd4_kick_helper(lp);
			}
			lp = TAILQ_NEXT(lp, lwp_procq);
			if (lp == NULL) {
				*which2 &= ~(1 << pri);
				goto again;
			}
		} else {
			KTR_COND_LOG(usched_chooseproc_cc_elected,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)CPUMASK_LOWMASK(
					lp->lwp_thread->td_gd->gd_cpumask),
			    (unsigned long)CPUMASK_LOWMASK(siblings),
			    (unsigned long)CPUMASK_LOWMASK(cpumask));

			goto found;
		}
		++checks;
	}

	/*
	 * Checks exhausted, we tried to defer too many threads, so schedule
	 * the best of the worst.
	 */
	lp = min_level_lwp;
	q = min_q;
	which = min_which;
	pri = min_pri;
	KASSERT(lp, ("chooseproc: at least the first lp was good"));

found:

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ) {
			bsd4_kick_helper(lp);
			return(NULL);
		}
	}

	KTR_COND_LOG(usched_chooseproc_cc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}

/*
 * If we aren't willing to schedule a ready process on our cpu, give its
 * target cpu a kick rather than wait for the next tick.
 *
 * Called with bsd4_spin held.
 */
static
void
bsd4_kick_helper(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	cpumask_t tmpmask;

	if (lp == NULL)
		return;
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];

	tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, usched_global_cpumask);
	CPUMASK_ANDMASK(tmpmask, bsd4_rdyprocmask);
	CPUMASK_ANDMASK(tmpmask, gd->gd_cpumask);
	if (CPUMASK_TESTZERO(tmpmask))
		return;

	++usched_bsd4_kicks;
	ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask, gd->gd_cpuid);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
	} else {
		wakeup(dd->helper_thread);
	}
}

static
void
bsd4_need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();

	/* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
	wakeup_mycpu(dd->helper_thread);
}
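
/*
 * Illustrative sketch of the topology walk used by the cache-coherent
 * chooser above (assumes a single package with two cores, two threads
 * per core): starting at the candidate's cpunode and following
 * parent_node links, the first level whose members intersect our
 * cpumask is the migration "distance":
 *
 *	level 0		same hyperthread pair (THREAD_LEVEL)
 *	level 1		same core (CORE_LEVEL)
 *	level 2		same chip (CHIP_LEVEL)
 *
 * A smaller level means a cheaper migration, so among deferred lwps the
 * one with the smallest level (and then the best priority) is remembered
 * as min_level_lwp and used if the check budget runs out.
 */
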
/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT
 * scheduler.  The user process scheduler only manages user processes but
 * it uses LWKT underneath, and a user process operating in the kernel will
 * often be 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex have previously been calculated
 * onto the appropriate run queue.  Any reschedule decision is left to the
 * callers (bsd4_setrunqueue() and bsd4_resetpriority()).
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}
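
/*
 * Illustrative invariant for the two helpers above (not original code):
 * the per-class queue bitmasks mirror queue occupancy exactly.
 *
 *	enqueue:	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
 *			*which |= 1 << pri;		(mark queue busy)
 *	dequeue:	TAILQ_REMOVE(q, lp, lwp_procq);
 *			if (TAILQ_EMPTY(q))
 *				*which &= ~(1 << pri);	(mark queue idle)
 *
 * e.g. removing the last lwp from queue 14 clears bit 14, so the bsfl()
 * scans in the choosers never land on an empty queue.
 */
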
/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
	cpumask_t tmpmask;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;		/* doesn't change */
	mask = gd->gd_cpumask;		/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
	 */
	lwkt_setpri_self(TDPRI_USER_SCHEDULER);

	tsleep(dd->helper_thread, 0, "sched_thread_sleep", 0);

	for (;;) {
		/*
		 * We use the LWKT deschedule-interlock trick to avoid racing
		 * bsd4_rdyprocmask.  This means we cannot block through to the
		 * manual lwkt_switch() call we make below.
		 */
		crit_enter_gd(gd);
		tsleep_interlock(dd->helper_thread, 0);
		spin_lock(&bsd4_spin);
		ATOMIC_CPUMASK_ORMASK(bsd4_rdyprocmask, mask);

		clear_user_resched();	/* This satisfies the reschedule request */
		dd->rrcount = 0;	/* Reset the round-robin counter */

		if (CPUMASK_TESTMASK(bsd4_curprocmask, mask) == 0) {
			/*
			 * No thread is currently scheduled.
			 */
			KKASSERT(dd->uschedcp == NULL);
			if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) {
				KTR_COND_LOG(usched_sched_thread_no_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				ATOMIC_CPUMASK_ORMASK(bsd4_curprocmask, mask);
				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				dd->rrcount = 0;	/* reset round robin */
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				spin_unlock(&bsd4_spin);
			}
		} else if (bsd4_runqcount) {
			if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) {
				KTR_COND_LOG(usched_sched_thread_process,
				    nlp->lwp_proc->p_pid == usched_bsd4_pid_debug,
				    gd->gd_cpuid,
				    nlp->lwp_proc->p_pid,
				    nlp->lwp_thread->td_gd->gd_cpuid);

				dd->upri = nlp->lwp_priority;
				dd->uschedcp = nlp;
				dd->rrcount = 0;	/* reset round robin */
				spin_unlock(&bsd4_spin);
				lwkt_acquire(nlp->lwp_thread);
				lwkt_schedule(nlp->lwp_thread);
			} else {
				/*
				 * CHAINING CONDITION TRAIN
				 *
				 * We could not deal with the scheduler wakeup
				 * request on this cpu, locate a ready scheduler
				 * with no current lp assignment and chain to it.
				 *
				 * This ensures that a wakeup race which fails
				 * due to the priority test does not leave
				 * other unscheduled cpus idle when the
				 * runqueue is not empty.
				 */
				tmpmask = bsd4_rdyprocmask;
				CPUMASK_NANDMASK(tmpmask, bsd4_curprocmask);
				CPUMASK_ANDMASK(tmpmask, smp_active_mask);
				if (CPUMASK_TESTNZERO(tmpmask)) {
					tmpid = BSFCPUMASK(tmpmask);
					tmpdd = &bsd4_pcpu[tmpid];
					ATOMIC_CPUMASK_NANDBIT(bsd4_rdyprocmask,
							       tmpid);
					spin_unlock(&bsd4_spin);
					wakeup(tmpdd->helper_thread);
				} else {
					spin_unlock(&bsd4_spin);
				}

				KTR_LOG(usched_sched_thread_no_process_found,
				    gd->gd_cpuid,
				    (unsigned long)CPUMASK_LOWMASK(tmpmask));
			}
		} else {
			/*
			 * The runq is empty.
			 */
			spin_unlock(&bsd4_spin);
		}

		/*
		 * We're descheduled unless someone scheduled us.  Switch away.
		 * Exiting the critical section will cause splz() to be called
		 * for us if interrupts and such are pending.
		 */
/* sysctl stick_to_level parameter */
static int
sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = usched_bsd4_stick_to_level;

	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
		return (EINVAL);
	usched_bsd4_stick_to_level = new_val;
	return (0);
}
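/*
 * Illustrative sketch, compiled out: how the handler above is driven
 * from userland.  A read passes newptr == NULL, so the range check is
 * skipped; a write supplies a new int which the handler rejects with
 * EINVAL when it falls outside the detected topology levels.  A minimal
 * sysctlbyname(3) client (userland code, shown here for reference only):
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int new_level = 1;		/* hypothetical target level */
	int cur;
	size_t len = sizeof(cur);

	/* read the current value; the handler sees newptr == NULL */
	if (sysctlbyname("kern.usched_bsd4.stick_to_level",
			 &cur, &len, NULL, 0) == 0)
		printf("stick_to_level: %d\n", cur);

	/* write a new value; out-of-range levels fail with EINVAL */
	if (sysctlbyname("kern.usched_bsd4.stick_to_level",
			 NULL, NULL, &new_level, sizeof(new_level)) != 0)
		perror("sysctlbyname");
	return (0);
}
#endif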
/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
	int i;
	int smt_not_supported = 0;
	int cache_coherent_not_supported = 0;

	if (bootverbose)
		kprintf("Start usched_bsd4 helpers on cpus:\n");

	sysctl_ctx_init(&usched_bsd4_sysctl_ctx);
	usched_bsd4_sysctl_tree =
		SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx,
				SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
				"usched_bsd4", CTLFLAG_RD, 0, "");

	for (i = 0; i < ncpus; ++i) {
		bsd4_pcpu_t dd = &bsd4_pcpu[i];
		cpumask_t mask;

		CPUMASK_ASSBIT(mask, i);

		if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
			continue;

		dd->cpunode = get_cpu_node_by_cpuid(i);

		if (dd->cpunode == NULL) {
			smt_not_supported = 1;
			cache_coherent_not_supported = 1;
			if (bootverbose)
				kprintf(" cpu%d - WARNING: No CPU NODE "
					"found for cpu\n", i);
		} else {
			switch (dd->cpunode->type) {
			case THREAD_LEVEL:
				if (bootverbose)
					kprintf(" cpu%d - HyperThreading "
						"available. Core siblings: ",
						i);
				break;
			case CORE_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf(" cpu%d - No HT available, "
						"multi-core/physical "
						"cpu. Physical siblings: ",
						i);
				break;
			case CHIP_LEVEL:
				smt_not_supported = 1;

				if (bootverbose)
					kprintf(" cpu%d - No HT available, "
						"single-core/physical cpu. "
						"Package Siblings: ",
						i);
				break;
			default:
				/* Let's go for safe defaults here */
				smt_not_supported = 1;
				cache_coherent_not_supported = 1;
				if (bootverbose)
					kprintf(" cpu%d - Unknown cpunode->"
						"type=%u. Siblings: ",
						i,
						(u_int)dd->cpunode->type);
				break;
			}

			if (bootverbose) {
				if (dd->cpunode->parent_node != NULL) {
					kprint_cpuset(&dd->cpunode->
						      parent_node->members);
					kprintf("\n");
				} else {
					kprintf(" no siblings\n");
				}
			}
		}

		lwkt_create(sched_thread, NULL, &dd->helper_thread, NULL,
			    0, i, "usched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in rqinit().
		 */
		if (i)
			ATOMIC_CPUMASK_NANDMASK(bsd4_curprocmask, mask);
		ATOMIC_CPUMASK_ORMASK(bsd4_rdyprocmask, mask);
		dd->upri = PRIBASE_NULL;
	}

	/* usched_bsd4 sysctl configurable parameters */

	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		       OID_AUTO, "rrinterval", CTLFLAG_RW,
		       &usched_bsd4_rrinterval, 0, "");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		       OID_AUTO, "decay", CTLFLAG_RW,
		       &usched_bsd4_decay, 0, "Extra decay when not running");
	SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
		       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
		       OID_AUTO, "batch_time", CTLFLAG_RW,
		       &usched_bsd4_batch_time, 0, "Min batch counter value");
	SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx,
			SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
			OID_AUTO, "kicks", CTLFLAG_RW,
			&usched_bsd4_kicks, "Number of kickstarts");

	/* Add enable/disable option for SMT scheduling if supported */
	if (smt_not_supported) {
		usched_bsd4_smt = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
				  SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
				  OID_AUTO, "smt", CTLFLAG_RD,
				  "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
	} else {
		usched_bsd4_smt = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
			       OID_AUTO, "smt", CTLFLAG_RW,
			       &usched_bsd4_smt, 0, "Enable SMT scheduling");
	}

	/*
	 * Add enable/disable option for cache coherent scheduling
	 * if supported
	 */
	if (cache_coherent_not_supported) {
		usched_bsd4_cache_coherent = 0;
		SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx,
				  SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
				  OID_AUTO, "cache_coherent", CTLFLAG_RD,
				  "NOT SUPPORTED", 0,
				  "Cache coherence NOT SUPPORTED");
	} else {
		usched_bsd4_cache_coherent = 1;
		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
			       OID_AUTO, "cache_coherent", CTLFLAG_RW,
			       &usched_bsd4_cache_coherent, 0,
			       "Enable/Disable cache coherent scheduling");

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
			       OID_AUTO, "upri_affinity", CTLFLAG_RW,
			       &usched_bsd4_upri_affinity, 1,
			       "Number of PPQs in user priority check");

		SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx,
			       SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
			       OID_AUTO, "queue_checks", CTLFLAG_RW,
			       &usched_bsd4_queue_checks, 5,
			       "LWPs to check from a queue before giving up");

		SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx,
				SYSCTL_CHILDREN(usched_bsd4_sysctl_tree),
				OID_AUTO, "stick_to_level",
				CTLTYPE_INT | CTLFLAG_RW,
				NULL, sizeof usched_bsd4_stick_to_level,
				sysctl_usched_bsd4_stick_to_level, "I",
				"Stick a process to this level. See sysctl "
				"parameter hw.cpu_topology.level_description");
	}
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	sched_thread_cpu_init, NULL);
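/*
 * Illustrative sketch, compiled out: a single-word model of the cpumask
 * operations used throughout this file (CPUMASK_ASSBIT, CPUMASK_TESTMASK,
 * ATOMIC_CPUMASK_ORMASK/NANDMASK, BSFCPUMASK).  The real cpumask_t may
 * span multiple words and the ATOMIC_* variants use locked instructions;
 * these ex_* helpers are hypothetical and assume at most 64 cpus.
 */
#if 0
#include <stdint.h>

typedef uint64_t ex_cpumask_t;

#define EX_ASSBIT(mask, i)	((mask) = (ex_cpumask_t)1 << (i))
#define EX_TESTMASK(m1, m2)	(((m1) & (m2)) != 0)
#define EX_ORMASK(m1, m2)	((m1) |= (m2))
#define EX_NANDMASK(m1, m2)	((m1) &= ~(m2))

/*
 * Mirror of the chaining scan in sched_thread(): ready helper cpus that
 * have no current user lwp, restricted to active cpus; returns the
 * lowest such cpu id, or -1 when no candidate exists.
 */
static int
ex_find_chain_cpu(ex_cpumask_t rdy, ex_cpumask_t cur, ex_cpumask_t active)
{
	ex_cpumask_t tmp = rdy & ~cur & active;

	return (tmp != 0) ? __builtin_ctzll(tmp) : -1;
}
#endif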