1 /* 2 * Copyright (c) 2012 The DragonFly Project. All rights reserved. 3 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>. All rights reserved. 4 * 5 * This code is derived from software contributed to The DragonFly Project 6 * by Matthew Dillon <dillon@backplane.com>, 7 * by Mihai Carabas <mihai.carabas@gmail.com> 8 * and many others. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/kernel.h> 35 #include <sys/lock.h> 36 #include <sys/queue.h> 37 #include <sys/proc.h> 38 #include <sys/rtprio.h> 39 #include <sys/uio.h> 40 #include <sys/sysctl.h> 41 #include <sys/resourcevar.h> 42 #include <sys/spinlock.h> 43 #include <sys/cpu_topology.h> 44 #include <sys/thread2.h> 45 #include <sys/spinlock2.h> 46 #include <sys/mplock2.h> 47 48 #include <sys/ktr.h> 49 50 #include <machine/cpu.h> 51 #include <machine/smp.h> 52 53 /* 54 * Priorities. Note that with 32 run queues per scheduler each queue 55 * represents four priority levels. 56 */ 57 58 #define MAXPRI 128 59 #define PRIMASK (MAXPRI - 1) 60 #define PRIBASE_REALTIME 0 61 #define PRIBASE_NORMAL MAXPRI 62 #define PRIBASE_IDLE (MAXPRI * 2) 63 #define PRIBASE_THREAD (MAXPRI * 3) 64 #define PRIBASE_NULL (MAXPRI * 4) 65 66 #define NQS 32 /* 32 run queues. 
*/ 67 #define PPQ (MAXPRI / NQS) /* priorities per queue */ 68 #define PPQMASK (PPQ - 1) 69 70 /* 71 * NICEPPQ - number of nice units per priority queue 72 * 73 * ESTCPUPPQ - number of estcpu units per priority queue 74 * ESTCPUMAX - number of estcpu units 75 */ 76 #define NICEPPQ 2 77 #define ESTCPUPPQ 512 78 #define ESTCPUMAX (ESTCPUPPQ * NQS) 79 #define BATCHMAX (ESTCPUFREQ * 30) 80 #define PRIO_RANGE (PRIO_MAX - PRIO_MIN + 1) 81 82 #define ESTCPULIM(v) min((v), ESTCPUMAX) 83 84 TAILQ_HEAD(rq, lwp); 85 86 #define lwp_priority lwp_usdata.bsd4.priority 87 #define lwp_rqindex lwp_usdata.bsd4.rqindex 88 #define lwp_estcpu lwp_usdata.bsd4.estcpu 89 #define lwp_batch lwp_usdata.bsd4.batch 90 #define lwp_rqtype lwp_usdata.bsd4.rqtype 91 92 static void bsd4_acquire_curproc(struct lwp *lp); 93 static void bsd4_release_curproc(struct lwp *lp); 94 static void bsd4_select_curproc(globaldata_t gd); 95 static void bsd4_setrunqueue(struct lwp *lp); 96 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period, 97 sysclock_t cpstamp); 98 static void bsd4_recalculate_estcpu(struct lwp *lp); 99 static void bsd4_resetpriority(struct lwp *lp); 100 static void bsd4_forking(struct lwp *plp, struct lwp *lp); 101 static void bsd4_exiting(struct lwp *lp, struct proc *); 102 static void bsd4_uload_update(struct lwp *lp); 103 static void bsd4_yield(struct lwp *lp); 104 static void bsd4_need_user_resched_remote(void *dummy); 105 static int bsd4_batchy_looser_pri_test(struct lwp* lp); 106 static struct lwp *bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp); 107 static void bsd4_kick_helper(struct lwp *lp); 108 static struct lwp *bsd4_chooseproc_locked(struct lwp *chklp); 109 static void bsd4_remrunqueue_locked(struct lwp *lp); 110 static void bsd4_setrunqueue_locked(struct lwp *lp); 111 static void bsd4_changedcpu(struct lwp *lp); 112 113 struct usched usched_bsd4 = { 114 { NULL }, 115 "bsd4", "Original DragonFly Scheduler", 116 NULL, /* default registration */ 117 NULL, /* default deregistration */ 118 bsd4_acquire_curproc, 119 bsd4_release_curproc, 120 bsd4_setrunqueue, 121 bsd4_schedulerclock, 122 bsd4_recalculate_estcpu, 123 bsd4_resetpriority, 124 bsd4_forking, 125 bsd4_exiting, 126 bsd4_uload_update, 127 NULL, /* setcpumask not supported */ 128 bsd4_yield, 129 bsd4_changedcpu 130 }; 131 132 struct usched_bsd4_pcpu { 133 struct thread helper_thread; 134 short rrcount; 135 short upri; 136 struct lwp *uschedcp; 137 struct lwp *old_uschedcp; 138 cpu_node_t *cpunode; 139 }; 140 141 typedef struct usched_bsd4_pcpu *bsd4_pcpu_t; 142 143 /* 144 * We have NQS (32) run queues per scheduling class. For the normal 145 * class, there are 128 priorities scaled onto these 32 queues. New 146 * processes are added to the last entry in each queue, and processes 147 * are selected for running by taking them from the head and maintaining 148 * a simple FIFO arrangement. Realtime and Idle priority processes have 149 * and explicit 0-31 priority which maps directly onto their class queue 150 * index. When a queue has something in it, the corresponding bit is 151 * set in the queuebits variable, allowing a single read to determine 152 * the state of all 32 queues and then a ffs() to find the first busy 153 * queue. 
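 *
 * For example, with MAXPRI 128 and NQS 32, PPQ is 4: a normal-class
 * lwp_priority of PRIBASE_NORMAL + 57 maps to run queue index
 * (57 & PRIMASK) / PPQ = 14 and sets bit 14 in bsd4_queuebits, so the
 * chooseproc code only needs a single bsfl()/ffs() on the bitmask to
 * locate the best non-empty queue (illustrative numbers only).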
154 */ 155 static struct rq bsd4_queues[NQS]; 156 static struct rq bsd4_rtqueues[NQS]; 157 static struct rq bsd4_idqueues[NQS]; 158 static u_int32_t bsd4_queuebits; 159 static u_int32_t bsd4_rtqueuebits; 160 static u_int32_t bsd4_idqueuebits; 161 static cpumask_t bsd4_curprocmask = -1; /* currently running a user process */ 162 static cpumask_t bsd4_rdyprocmask; /* ready to accept a user process */ 163 static int bsd4_runqcount; 164 static volatile int bsd4_scancpu; 165 static struct spinlock bsd4_spin; 166 static struct usched_bsd4_pcpu bsd4_pcpu[MAXCPU]; 167 static struct sysctl_ctx_list usched_bsd4_sysctl_ctx; 168 static struct sysctl_oid *usched_bsd4_sysctl_tree; 169 170 /* Debug info exposed through debug.* sysctl */ 171 172 SYSCTL_INT(_debug, OID_AUTO, bsd4_runqcount, CTLFLAG_RD, 173 &bsd4_runqcount, 0, 174 "Number of run queues"); 175 176 static int usched_bsd4_debug = -1; 177 SYSCTL_INT(_debug, OID_AUTO, bsd4_scdebug, CTLFLAG_RW, 178 &usched_bsd4_debug, 0, 179 "Print debug information for this pid"); 180 181 static int usched_bsd4_pid_debug = -1; 182 SYSCTL_INT(_debug, OID_AUTO, bsd4_pid_debug, CTLFLAG_RW, 183 &usched_bsd4_pid_debug, 0, 184 "Print KTR debug information for this pid"); 185 186 /* Tunning usched_bsd4 - configurable through kern.usched_bsd4.* */ 187 static int usched_bsd4_smt = 0; 188 static int usched_bsd4_cache_coherent = 0; 189 static int usched_bsd4_upri_affinity = 16; /* 32 queues - half-way */ 190 static int usched_bsd4_queue_checks = 5; 191 static int usched_bsd4_stick_to_level = 0; 192 static long usched_bsd4_kicks; 193 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10; 194 static int usched_bsd4_decay = 8; 195 static int usched_bsd4_batch_time = 10; 196 197 /* KTR debug printings */ 198 199 KTR_INFO_MASTER_EXTERN(usched); 200 201 #if !defined(KTR_USCHED_BSD4) 202 #define KTR_USCHED_BSD4 KTR_ALL 203 #endif 204 205 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_urw, 0, 206 "USCHED_BSD4(bsd4_acquire_curproc in user_reseched_wanted " 207 "after release: pid %d, cpuid %d, curr_cpuid %d)", 208 pid_t pid, int cpuid, int curr); 209 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_before_loop, 0, 210 "USCHED_BSD4(bsd4_acquire_curproc before loop: pid %d, cpuid %d, " 211 "curr_cpuid %d)", 212 pid_t pid, int cpuid, int curr); 213 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_not, 0, 214 "USCHED_BSD4(bsd4_acquire_curproc couldn't acquire after " 215 "bsd4_setrunqueue: pid %d, cpuid %d, curr_lp pid %d, curr_cpuid %d)", 216 pid_t pid, int cpuid, pid_t curr_pid, int curr_cpuid); 217 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_acquire_curproc_switch, 0, 218 "USCHED_BSD4(bsd4_acquire_curproc after lwkt_switch: pid %d, " 219 "cpuid %d, curr_cpuid %d)", 220 pid_t pid, int cpuid, int curr); 221 222 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_release_curproc, 0, 223 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 224 "cpuid %d, curr_cpuid %d)", 225 pid_t pid, int cpuid, int curr); 226 227 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_select_curproc, 0, 228 "USCHED_BSD4(bsd4_release_curproc before select: pid %d, " 229 "cpuid %d, old_pid %d, old_cpuid %d, curr_cpuid %d)", 230 pid_t pid, int cpuid, pid_t old_pid, int old_cpuid, int curr); 231 232 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_false, 0, 233 "USCHED_BSD4(batchy_looser_pri_test false: pid %d, " 234 "cpuid %d, verify_mask %lu)", 235 pid_t pid, int cpuid, cpumask_t mask); 236 KTR_INFO(KTR_USCHED_BSD4, usched, batchy_test_true, 0, 237 "USCHED_BSD4(batchy_looser_pri_test true: pid %d, " 238 "cpuid 
%d, verify_mask %lu)", 239 pid_t pid, int cpuid, cpumask_t mask); 240 241 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_smt, 0, 242 "USCHED_BSD4(bsd4_setrunqueue free cpus smt: pid %d, cpuid %d, " 243 "mask %lu, curr_cpuid %d)", 244 pid_t pid, int cpuid, cpumask_t mask, int curr); 245 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_fc_non_smt, 0, 246 "USCHED_BSD4(bsd4_setrunqueue free cpus check non_smt: pid %d, " 247 "cpuid %d, mask %lu, curr_cpuid %d)", 248 pid_t pid, int cpuid, cpumask_t mask, int curr); 249 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_rc, 0, 250 "USCHED_BSD4(bsd4_setrunqueue running cpus check: pid %d, " 251 "cpuid %d, mask %lu, curr_cpuid %d)", 252 pid_t pid, int cpuid, cpumask_t mask, int curr); 253 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found, 0, 254 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 255 "mask %lu, found_cpuid %d, curr_cpuid %d)", 256 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 257 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_not_found, 0, 258 "USCHED_BSD4(bsd4_setrunqueue not found cpu: pid %d, cpuid %d, " 259 "try_cpuid %d, curr_cpuid %d)", 260 pid_t pid, int cpuid, int try_cpuid, int curr); 261 KTR_INFO(KTR_USCHED_BSD4, usched, bsd4_setrunqueue_found_best_cpuid, 0, 262 "USCHED_BSD4(bsd4_setrunqueue found cpu: pid %d, cpuid %d, " 263 "mask %lu, found_cpuid %d, curr_cpuid %d)", 264 pid_t pid, int cpuid, cpumask_t mask, int found_cpuid, int curr); 265 266 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc, 0, 267 "USCHED_BSD4(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)", 268 pid_t pid, int old_cpuid, int curr); 269 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc, 0, 270 "USCHED_BSD4(chooseproc_cc: pid %d, old_cpuid %d, curr_cpuid %d)", 271 pid_t pid, int old_cpuid, int curr); 272 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_not_good, 0, 273 "USCHED_BSD4(chooseproc_cc not good: pid %d, old_cpumask %lu, " 274 "sibling_mask %lu, curr_cpumask %lu)", 275 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 276 KTR_INFO(KTR_USCHED_BSD4, usched, chooseproc_cc_elected, 0, 277 "USCHED_BSD4(chooseproc_cc elected: pid %d, old_cpumask %lu, " 278 "sibling_mask %lu, curr_cpumask: %lu)", 279 pid_t pid, cpumask_t old_cpumask, cpumask_t sibling_mask, cpumask_t curr); 280 281 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process, 0, 282 "USCHED_BSD4(sched_thread %d no process scheduled: pid %d, old_cpuid %d)", 283 int id, pid_t pid, int cpuid); 284 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_process, 0, 285 "USCHED_BSD4(sched_thread %d process scheduled: pid %d, old_cpuid %d)", 286 int id, pid_t pid, int cpuid); 287 KTR_INFO(KTR_USCHED_BSD4, usched, sched_thread_no_process_found, 0, 288 "USCHED_BSD4(sched_thread %d no process found; tmpmask %lu)", 289 int id, cpumask_t tmpmask); 290 291 /* 292 * Initialize the run queues at boot time. 293 */ 294 static void 295 bsd4_rqinit(void *dummy) 296 { 297 int i; 298 299 spin_init(&bsd4_spin); 300 for (i = 0; i < NQS; i++) { 301 TAILQ_INIT(&bsd4_queues[i]); 302 TAILQ_INIT(&bsd4_rtqueues[i]); 303 TAILQ_INIT(&bsd4_idqueues[i]); 304 } 305 atomic_clear_cpumask(&bsd4_curprocmask, 1); 306 } 307 SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, bsd4_rqinit, NULL) 308 309 /* 310 * BSD4_ACQUIRE_CURPROC 311 * 312 * This function is called when the kernel intends to return to userland. 313 * It is responsible for making the thread the current designated userland 314 * thread for this cpu, blocking if necessary. 
315 * 316 * The kernel will not depress our LWKT priority until after we return, 317 * in case we have to shove over to another cpu. 318 * 319 * We must determine our thread's disposition before we switch away. This 320 * is very sensitive code. 321 * 322 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE 323 * TO ANOTHER CPU! Because most of the kernel assumes that no migration will 324 * occur, this function is called only under very controlled circumstances. 325 * 326 * MPSAFE 327 */ 328 static void 329 bsd4_acquire_curproc(struct lwp *lp) 330 { 331 globaldata_t gd; 332 bsd4_pcpu_t dd; 333 thread_t td; 334 #if 0 335 struct lwp *olp; 336 #endif 337 338 /* 339 * Make sure we aren't sitting on a tsleep queue. 340 */ 341 td = lp->lwp_thread; 342 crit_enter_quick(td); 343 if (td->td_flags & TDF_TSLEEPQ) 344 tsleep_remove(td); 345 bsd4_recalculate_estcpu(lp); 346 347 /* 348 * If a reschedule was requested give another thread the 349 * driver's seat. 350 */ 351 if (user_resched_wanted()) { 352 clear_user_resched(); 353 bsd4_release_curproc(lp); 354 355 KTR_COND_LOG(usched_bsd4_acquire_curproc_urw, 356 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 357 lp->lwp_proc->p_pid, 358 lp->lwp_thread->td_gd->gd_cpuid, 359 mycpu->gd_cpuid); 360 } 361 362 /* 363 * Loop until we are the current user thread 364 */ 365 gd = mycpu; 366 dd = &bsd4_pcpu[gd->gd_cpuid]; 367 368 KTR_COND_LOG(usched_bsd4_acquire_curproc_before_loop, 369 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 370 lp->lwp_proc->p_pid, 371 lp->lwp_thread->td_gd->gd_cpuid, 372 gd->gd_cpuid); 373 374 do { 375 /* 376 * Process any pending events and higher priority threads. 377 */ 378 lwkt_yield(); 379 380 /* 381 * Become the currently scheduled user thread for this cpu 382 * if we can do so trivially. 383 * 384 * We can steal another thread's current thread designation 385 * on this cpu since if we are running that other thread 386 * must not be, so we can safely deschedule it. 387 */ 388 if (dd->uschedcp == lp) { 389 /* 390 * We are already the current lwp (hot path). 391 */ 392 dd->upri = lp->lwp_priority; 393 } else if (dd->uschedcp == NULL) { 394 /* 395 * We can trivially become the current lwp. 396 */ 397 atomic_set_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 398 dd->uschedcp = lp; 399 dd->upri = lp->lwp_priority; 400 } else if (dd->upri > lp->lwp_priority) { 401 /* 402 * We can steal the current cpu's lwp designation 403 * away simply by replacing it. The other thread 404 * will stall when it tries to return to userland. 405 */ 406 dd->uschedcp = lp; 407 dd->upri = lp->lwp_priority; 408 /* 409 lwkt_deschedule(olp->lwp_thread); 410 bsd4_setrunqueue(olp); 411 */ 412 } else { 413 /* 414 * We cannot become the current lwp, place the lp 415 * on the bsd4 run-queue and deschedule ourselves. 416 * 417 * When we are reactivated we will have another 418 * chance. 419 */ 420 lwkt_deschedule(lp->lwp_thread); 421 422 bsd4_setrunqueue(lp); 423 424 KTR_COND_LOG(usched_bsd4_acquire_curproc_not, 425 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 426 lp->lwp_proc->p_pid, 427 lp->lwp_thread->td_gd->gd_cpuid, 428 dd->uschedcp->lwp_proc->p_pid, 429 gd->gd_cpuid); 430 431 432 lwkt_switch(); 433 434 /* 435 * Reload after a switch or setrunqueue/switch possibly 436 * moved us to another cpu. 
437 */ 438 gd = mycpu; 439 dd = &bsd4_pcpu[gd->gd_cpuid]; 440 441 KTR_COND_LOG(usched_bsd4_acquire_curproc_switch, 442 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 443 lp->lwp_proc->p_pid, 444 lp->lwp_thread->td_gd->gd_cpuid, 445 gd->gd_cpuid); 446 } 447 } while (dd->uschedcp != lp); 448 449 crit_exit_quick(td); 450 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 451 } 452 453 /* 454 * BSD4_RELEASE_CURPROC 455 * 456 * This routine detaches the current thread from the userland scheduler, 457 * usually because the thread needs to run or block in the kernel (at 458 * kernel priority) for a while. 459 * 460 * This routine is also responsible for selecting a new thread to 461 * make the current thread. 462 * 463 * NOTE: This implementation differs from the dummy example in that 464 * bsd4_select_curproc() is able to select the current process, whereas 465 * dummy_select_curproc() is not able to select the current process. 466 * This means we have to NULL out uschedcp. 467 * 468 * Additionally, note that we may already be on a run queue if releasing 469 * via the lwkt_switch() in bsd4_setrunqueue(). 470 * 471 * MPSAFE 472 */ 473 474 static void 475 bsd4_release_curproc(struct lwp *lp) 476 { 477 globaldata_t gd = mycpu; 478 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 479 480 if (dd->uschedcp == lp) { 481 crit_enter(); 482 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0); 483 484 KTR_COND_LOG(usched_bsd4_release_curproc, 485 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 486 lp->lwp_proc->p_pid, 487 lp->lwp_thread->td_gd->gd_cpuid, 488 gd->gd_cpuid); 489 490 dd->uschedcp = NULL; /* don't let lp be selected */ 491 dd->upri = PRIBASE_NULL; 492 atomic_clear_cpumask(&bsd4_curprocmask, gd->gd_cpumask); 493 dd->old_uschedcp = lp; /* used only for KTR debug prints */ 494 bsd4_select_curproc(gd); 495 crit_exit(); 496 } 497 } 498 499 /* 500 * BSD4_SELECT_CURPROC 501 * 502 * Select a new current process for this cpu and clear any pending user 503 * reschedule request. The cpu currently has no current process. 504 * 505 * This routine is also responsible for equal-priority round-robining, 506 * typically triggered from bsd4_schedulerclock(). In our dummy example 507 * all the 'user' threads are LWKT scheduled all at once and we just 508 * call lwkt_switch(). 509 * 510 * The calling process is not on the queue and cannot be selected. 
511 * 512 * MPSAFE 513 */ 514 static 515 void 516 bsd4_select_curproc(globaldata_t gd) 517 { 518 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 519 struct lwp *nlp; 520 int cpuid = gd->gd_cpuid; 521 522 crit_enter_gd(gd); 523 524 spin_lock(&bsd4_spin); 525 if(usched_bsd4_cache_coherent) 526 nlp = bsd4_chooseproc_locked_cache_coherent(dd->uschedcp); 527 else 528 nlp = bsd4_chooseproc_locked(dd->uschedcp); 529 530 if (nlp) { 531 532 KTR_COND_LOG(usched_bsd4_select_curproc, 533 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 534 nlp->lwp_proc->p_pid, 535 nlp->lwp_thread->td_gd->gd_cpuid, 536 dd->old_uschedcp->lwp_proc->p_pid, 537 dd->old_uschedcp->lwp_thread->td_gd->gd_cpuid, 538 gd->gd_cpuid); 539 540 atomic_set_cpumask(&bsd4_curprocmask, CPUMASK(cpuid)); 541 dd->upri = nlp->lwp_priority; 542 dd->uschedcp = nlp; 543 dd->rrcount = 0; /* reset round robin */ 544 spin_unlock(&bsd4_spin); 545 lwkt_acquire(nlp->lwp_thread); 546 lwkt_schedule(nlp->lwp_thread); 547 } else { 548 spin_unlock(&bsd4_spin); 549 } 550 551 #if 0 552 } else if (bsd4_runqcount && (bsd4_rdyprocmask & CPUMASK(cpuid))) { 553 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 554 spin_unlock(&bsd4_spin); 555 lwkt_schedule(&dd->helper_thread); 556 } else { 557 spin_unlock(&bsd4_spin); 558 } 559 #endif 560 crit_exit_gd(gd); 561 } 562 563 /* 564 * batchy_looser_pri_test() - determine if a process is batchy or not 565 * relative to the other processes running in the system 566 */ 567 static int 568 bsd4_batchy_looser_pri_test(struct lwp* lp) 569 { 570 cpumask_t mask; 571 bsd4_pcpu_t other_dd; 572 int cpu; 573 574 /* Current running processes */ 575 mask = bsd4_curprocmask & smp_active_mask 576 & usched_global_cpumask; 577 578 while(mask) { 579 cpu = BSFCPUMASK(mask); 580 other_dd = &bsd4_pcpu[cpu]; 581 if (other_dd->upri - lp->lwp_priority > usched_bsd4_upri_affinity * PPQ) { 582 583 KTR_COND_LOG(usched_batchy_test_false, 584 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 585 lp->lwp_proc->p_pid, 586 lp->lwp_thread->td_gd->gd_cpuid, 587 (unsigned long)mask); 588 589 return 0; 590 } 591 mask &= ~CPUMASK(cpu); 592 } 593 594 KTR_COND_LOG(usched_batchy_test_true, 595 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 596 lp->lwp_proc->p_pid, 597 lp->lwp_thread->td_gd->gd_cpuid, 598 (unsigned long)mask); 599 600 return 1; 601 } 602 603 /* 604 * 605 * BSD4_SETRUNQUEUE 606 * 607 * Place the specified lwp on the user scheduler's run queue. This routine 608 * must be called with the thread descheduled. The lwp must be runnable. 609 * 610 * The thread may be the current thread as a special case. 611 * 612 * MPSAFE 613 */ 614 static void 615 bsd4_setrunqueue(struct lwp *lp) 616 { 617 globaldata_t gd; 618 bsd4_pcpu_t dd; 619 int cpuid; 620 cpumask_t mask; 621 cpumask_t tmpmask; 622 623 /* 624 * First validate the process state relative to the current cpu. 625 * We don't need the spinlock for this, just a critical section. 626 * We are in control of the process. 627 */ 628 crit_enter(); 629 KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN")); 630 KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0, 631 ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid, 632 lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags)); 633 KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0); 634 635 /* 636 * Note: gd and dd are relative to the target thread's last cpu, 637 * NOT our current cpu. 
638 */ 639 gd = lp->lwp_thread->td_gd; 640 dd = &bsd4_pcpu[gd->gd_cpuid]; 641 642 /* 643 * This process is not supposed to be scheduled anywhere or assigned 644 * as the current process anywhere. Assert the condition. 645 */ 646 KKASSERT(dd->uschedcp != lp); 647 648 /* 649 * XXX fixme. Could be part of a remrunqueue/setrunqueue 650 * operation when the priority is recalculated, so TDF_MIGRATING 651 * may already be set. 652 */ 653 if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0) 654 lwkt_giveaway(lp->lwp_thread); 655 656 /* 657 * We lose control of lp the moment we release the spinlock after 658 * having placed lp on the queue. i.e. another cpu could pick it 659 * up and it could exit, or its priority could be further adjusted, 660 * or something like that. 661 */ 662 spin_lock(&bsd4_spin); 663 bsd4_setrunqueue_locked(lp); 664 lp->lwp_rebal_ticks = sched_ticks; 665 666 /* 667 * Kick the scheduler helper on one of the other cpu's 668 * and request a reschedule if appropriate. 669 * 670 * NOTE: We check all cpus whos rdyprocmask is set. First we 671 * look for cpus without designated lps, then we look for 672 * cpus with designated lps with a worse priority than our 673 * process. 674 */ 675 ++bsd4_scancpu; 676 677 if (usched_bsd4_smt) { 678 679 /* 680 * SMT heuristic - Try to schedule on a free physical core. 681 * If no physical core found than choose the one that has 682 * an interactive thread. 683 */ 684 685 int best_cpuid = -1; 686 int min_prio = MAXPRI * MAXPRI; 687 int sibling; 688 689 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 690 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 691 smp_active_mask & usched_global_cpumask; 692 693 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_smt, 694 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 695 lp->lwp_proc->p_pid, 696 lp->lwp_thread->td_gd->gd_cpuid, 697 (unsigned long)mask, 698 mycpu->gd_cpuid); 699 700 while (mask) { 701 tmpmask = ~(CPUMASK(cpuid) - 1); 702 if (mask & tmpmask) 703 cpuid = BSFCPUMASK(mask & tmpmask); 704 else 705 cpuid = BSFCPUMASK(mask); 706 gd = globaldata_find(cpuid); 707 dd = &bsd4_pcpu[cpuid]; 708 709 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 710 if (dd->cpunode->parent_node->members & ~dd->cpunode->members & mask) { 711 712 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 713 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 714 lp->lwp_proc->p_pid, 715 lp->lwp_thread->td_gd->gd_cpuid, 716 (unsigned long)mask, 717 cpuid, 718 mycpu->gd_cpuid); 719 720 goto found; 721 } else { 722 sibling = BSFCPUMASK(dd->cpunode->parent_node->members & 723 ~dd->cpunode->members); 724 if (min_prio > bsd4_pcpu[sibling].upri) { 725 min_prio = bsd4_pcpu[sibling].upri; 726 best_cpuid = cpuid; 727 } 728 } 729 } 730 mask &= ~CPUMASK(cpuid); 731 } 732 733 if (best_cpuid != -1) { 734 cpuid = best_cpuid; 735 gd = globaldata_find(cpuid); 736 dd = &bsd4_pcpu[cpuid]; 737 738 KTR_COND_LOG(usched_bsd4_setrunqueue_found_best_cpuid, 739 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 740 lp->lwp_proc->p_pid, 741 lp->lwp_thread->td_gd->gd_cpuid, 742 (unsigned long)mask, 743 cpuid, 744 mycpu->gd_cpuid); 745 746 goto found; 747 } 748 } else { 749 /* Fallback to the original heuristic */ 750 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 751 mask = ~bsd4_curprocmask & bsd4_rdyprocmask & lp->lwp_cpumask & 752 smp_active_mask & usched_global_cpumask; 753 754 KTR_COND_LOG(usched_bsd4_setrunqueue_fc_non_smt, 755 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 756 lp->lwp_proc->p_pid, 757 lp->lwp_thread->td_gd->gd_cpuid, 758 (unsigned long)mask, 759 
mycpu->gd_cpuid); 760 761 while (mask) { 762 tmpmask = ~(CPUMASK(cpuid) - 1); 763 if (mask & tmpmask) 764 cpuid = BSFCPUMASK(mask & tmpmask); 765 else 766 cpuid = BSFCPUMASK(mask); 767 gd = globaldata_find(cpuid); 768 dd = &bsd4_pcpu[cpuid]; 769 770 if ((dd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK)) { 771 772 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 773 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 774 lp->lwp_proc->p_pid, 775 lp->lwp_thread->td_gd->gd_cpuid, 776 (unsigned long)mask, 777 cpuid, 778 mycpu->gd_cpuid); 779 780 goto found; 781 } 782 mask &= ~CPUMASK(cpuid); 783 } 784 } 785 786 /* 787 * Then cpus which might have a currently running lp 788 */ 789 mask = bsd4_curprocmask & bsd4_rdyprocmask & 790 lp->lwp_cpumask & smp_active_mask & usched_global_cpumask; 791 792 KTR_COND_LOG(usched_bsd4_setrunqueue_rc, 793 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 794 lp->lwp_proc->p_pid, 795 lp->lwp_thread->td_gd->gd_cpuid, 796 (unsigned long)mask, 797 mycpu->gd_cpuid); 798 799 while (mask) { 800 tmpmask = ~(CPUMASK(cpuid) - 1); 801 if (mask & tmpmask) 802 cpuid = BSFCPUMASK(mask & tmpmask); 803 else 804 cpuid = BSFCPUMASK(mask); 805 gd = globaldata_find(cpuid); 806 dd = &bsd4_pcpu[cpuid]; 807 808 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 809 810 KTR_COND_LOG(usched_bsd4_setrunqueue_found, 811 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 812 lp->lwp_proc->p_pid, 813 lp->lwp_thread->td_gd->gd_cpuid, 814 (unsigned long)mask, 815 cpuid, 816 mycpu->gd_cpuid); 817 818 goto found; 819 } 820 mask &= ~CPUMASK(cpuid); 821 } 822 823 /* 824 * If we cannot find a suitable cpu we reload from bsd4_scancpu 825 * and round-robin. Other cpus will pickup as they release their 826 * current lwps or become ready. 827 * 828 * Avoid a degenerate system lockup case if usched_global_cpumask 829 * is set to 0 or otherwise does not cover lwp_cpumask. 830 * 831 * We only kick the target helper thread in this case, we do not 832 * set the user resched flag because 833 */ 834 cpuid = (bsd4_scancpu & 0xFFFF) % ncpus; 835 if ((CPUMASK(cpuid) & usched_global_cpumask) == 0) { 836 cpuid = 0; 837 } 838 gd = globaldata_find(cpuid); 839 dd = &bsd4_pcpu[cpuid]; 840 841 KTR_COND_LOG(usched_bsd4_setrunqueue_not_found, 842 lp->lwp_proc->p_pid == usched_bsd4_pid_debug, 843 lp->lwp_proc->p_pid, 844 lp->lwp_thread->td_gd->gd_cpuid, 845 cpuid, 846 mycpu->gd_cpuid); 847 848 found: 849 if (gd == mycpu) { 850 spin_unlock(&bsd4_spin); 851 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) { 852 if (dd->uschedcp == NULL) { 853 wakeup_mycpu(&dd->helper_thread); 854 } else { 855 need_user_resched(); 856 } 857 } 858 } else { 859 atomic_clear_cpumask(&bsd4_rdyprocmask, CPUMASK(cpuid)); 860 spin_unlock(&bsd4_spin); 861 if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) 862 lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL); 863 else 864 wakeup(&dd->helper_thread); 865 } 866 crit_exit(); 867 } 868 869 /* 870 * This routine is called from a systimer IPI. It MUST be MP-safe and 871 * the BGL IS NOT HELD ON ENTRY. This routine is called at ESTCPUFREQ on 872 * each cpu. 873 * 874 * This routine is called on every sched tick. If the currently running 875 * thread belongs to this scheduler it will be called with a non-NULL lp, 876 * otherwise it will be called with a NULL lp. 
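 *
 * For example (assuming the stock ESTCPUFREQ of 50 ticks/second),
 * usched_bsd4_rrinterval defaults to (ESTCPUFREQ + 9) / 10 = 5 ticks,
 * so a cpu-bound thread is round-robined roughly every 100ms, and each
 * tick bumps lwp_estcpu by ESTCPUMAX / ESTCPUFREQ + 1.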
877 * 878 * MPSAFE 879 */ 880 static 881 void 882 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp) 883 { 884 globaldata_t gd = mycpu; 885 bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid]; 886 887 /* 888 * No impl if no lp running. 889 */ 890 if (lp == NULL) 891 return; 892 893 /* 894 * Do we need to round-robin? We round-robin 10 times a second. 895 * This should only occur for cpu-bound batch processes. 896 */ 897 if (++dd->rrcount >= usched_bsd4_rrinterval) { 898 dd->rrcount = 0; 899 need_user_resched(); 900 } 901 902 /* 903 * Adjust estcpu upward using a real time equivalent calculation. 904 */ 905 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUMAX / ESTCPUFREQ + 1); 906 907 /* 908 * Spinlocks also hold a critical section so there should not be 909 * any active. 910 */ 911 KKASSERT(gd->gd_spinlocks == 0); 912 913 bsd4_resetpriority(lp); 914 } 915 916 /* 917 * Called from acquire and from kern_synch's one-second timer (one of the 918 * callout helper threads) with a critical section held. 919 * 920 * Decay p_estcpu based on the number of ticks we haven't been running 921 * and our p_nice. As the load increases each process observes a larger 922 * number of idle ticks (because other processes are running in them). 923 * This observation leads to a larger correction which tends to make the 924 * system more 'batchy'. 925 * 926 * Note that no recalculation occurs for a process which sleeps and wakes 927 * up in the same tick. That is, a system doing thousands of context 928 * switches per second will still only do serious estcpu calculations 929 * ESTCPUFREQ times per second. 930 * 931 * MPSAFE 932 */ 933 static 934 void 935 bsd4_recalculate_estcpu(struct lwp *lp) 936 { 937 globaldata_t gd = mycpu; 938 sysclock_t cpbase; 939 sysclock_t ttlticks; 940 int estcpu; 941 int decay_factor; 942 943 /* 944 * We have to subtract periodic to get the last schedclock 945 * timeout time, otherwise we would get the upcoming timeout. 946 * Keep in mind that a process can migrate between cpus and 947 * while the scheduler clock should be very close, boundary 948 * conditions could lead to a small negative delta. 949 */ 950 cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; 951 952 if (lp->lwp_slptime > 1) { 953 /* 954 * Too much time has passed, do a coarse correction. 955 */ 956 lp->lwp_estcpu = lp->lwp_estcpu >> 1; 957 bsd4_resetpriority(lp); 958 lp->lwp_cpbase = cpbase; 959 lp->lwp_cpticks = 0; 960 lp->lwp_batch -= ESTCPUFREQ; 961 if (lp->lwp_batch < 0) 962 lp->lwp_batch = 0; 963 } else if (lp->lwp_cpbase != cpbase) { 964 /* 965 * Adjust estcpu if we are in a different tick. Don't waste 966 * time if we are in the same tick. 967 * 968 * First calculate the number of ticks in the measurement 969 * interval. The ttlticks calculation can wind up 0 due to 970 * a bug in the handling of lwp_slptime (as yet not found), 971 * so make sure we do not get a divide by 0 panic. 972 */ 973 ttlticks = (cpbase - lp->lwp_cpbase) / 974 gd->gd_schedclock.periodic; 975 if ((ssysclock_t)ttlticks < 0) { 976 ttlticks = 0; 977 lp->lwp_cpbase = cpbase; 978 } 979 if (ttlticks == 0) 980 return; 981 updatepcpu(lp, lp->lwp_cpticks, ttlticks); 982 983 /* 984 * Calculate the percentage of one cpu used factoring in ncpus 985 * and the load and adjust estcpu. Handle degenerate cases 986 * by adding 1 to bsd4_runqcount. 987 * 988 * estcpu is scaled by ESTCPUMAX. 989 * 990 * bsd4_runqcount is the excess number of user processes 991 * that cannot be immediately scheduled to cpus. 
We want 992 * to count these as running to avoid range compression 993 * in the base calculation (which is the actual percentage 994 * of one cpu used). 995 */ 996 estcpu = (lp->lwp_cpticks * ESTCPUMAX) * 997 (bsd4_runqcount + ncpus) / (ncpus * ttlticks); 998 999 /* 1000 * If estcpu is > 50% we become more batch-like 1001 * If estcpu is <= 50% we become less batch-like 1002 * 1003 * It takes 30 cpu seconds to traverse the entire range. 1004 */ 1005 if (estcpu > ESTCPUMAX / 2) { 1006 lp->lwp_batch += ttlticks; 1007 if (lp->lwp_batch > BATCHMAX) 1008 lp->lwp_batch = BATCHMAX; 1009 } else { 1010 lp->lwp_batch -= ttlticks; 1011 if (lp->lwp_batch < 0) 1012 lp->lwp_batch = 0; 1013 } 1014 1015 if (usched_bsd4_debug == lp->lwp_proc->p_pid) { 1016 kprintf("pid %d lwp %p estcpu %3d %3d bat %d cp %d/%d", 1017 lp->lwp_proc->p_pid, lp, 1018 estcpu, lp->lwp_estcpu, 1019 lp->lwp_batch, 1020 lp->lwp_cpticks, ttlticks); 1021 } 1022 1023 /* 1024 * Adjust lp->lwp_esetcpu. The decay factor determines how 1025 * quickly lwp_estcpu collapses to its realtime calculation. 1026 * A slower collapse gives us a more accurate number but 1027 * can cause a cpu hog to eat too much cpu before the 1028 * scheduler decides to downgrade it. 1029 * 1030 * NOTE: p_nice is accounted for in bsd4_resetpriority(), 1031 * and not here, but we must still ensure that a 1032 * cpu-bound nice -20 process does not completely 1033 * override a cpu-bound nice +20 process. 1034 * 1035 * NOTE: We must use ESTCPULIM() here to deal with any 1036 * overshoot. 1037 */ 1038 decay_factor = usched_bsd4_decay; 1039 if (decay_factor < 1) 1040 decay_factor = 1; 1041 if (decay_factor > 1024) 1042 decay_factor = 1024; 1043 1044 lp->lwp_estcpu = ESTCPULIM( 1045 (lp->lwp_estcpu * decay_factor + estcpu) / 1046 (decay_factor + 1)); 1047 1048 if (usched_bsd4_debug == lp->lwp_proc->p_pid) 1049 kprintf(" finalestcpu %d\n", lp->lwp_estcpu); 1050 bsd4_resetpriority(lp); 1051 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic; 1052 lp->lwp_cpticks = 0; 1053 } 1054 } 1055 1056 /* 1057 * Compute the priority of a process when running in user mode. 1058 * Arrange to reschedule if the resulting priority is better 1059 * than that of the current process. 1060 * 1061 * This routine may be called with any process. 1062 * 1063 * This routine is called by fork1() for initial setup with the process 1064 * of the run queue, and also may be called normally with the process on or 1065 * off the run queue. 1066 * 1067 * MPSAFE 1068 */ 1069 static void 1070 bsd4_resetpriority(struct lwp *lp) 1071 { 1072 bsd4_pcpu_t dd; 1073 int newpriority; 1074 u_short newrqtype; 1075 int reschedcpu; 1076 int checkpri; 1077 int estcpu; 1078 1079 /* 1080 * Calculate the new priority and queue type 1081 */ 1082 crit_enter(); 1083 spin_lock(&bsd4_spin); 1084 1085 newrqtype = lp->lwp_rtprio.type; 1086 1087 switch(newrqtype) { 1088 case RTP_PRIO_REALTIME: 1089 case RTP_PRIO_FIFO: 1090 newpriority = PRIBASE_REALTIME + 1091 (lp->lwp_rtprio.prio & PRIMASK); 1092 break; 1093 case RTP_PRIO_NORMAL: 1094 /* 1095 * Detune estcpu based on batchiness. lwp_batch ranges 1096 * from 0 to BATCHMAX. Limit estcpu for the sake of 1097 * the priority calculation to between 50% and 100%. 
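		 *
		 * Worked example (illustrative): a nice 0 lwp with lwp_batch
		 * at the BATCHMAX/2 centerline and lwp_estcpu at ESTCPUMAX/2
		 * (8192) detunes to estcpu 6144; that contributes
		 * 40 (nice) + 48 (estcpu) = 88, which scales to
		 * 88 * MAXPRI / 210 = 53, giving lwp_priority
		 * PRIBASE_NORMAL + 53 and run queue index 53 / PPQ = 13.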
		 */
		estcpu = lp->lwp_estcpu * (lp->lwp_batch + BATCHMAX) /
			 (BATCHMAX * 2);

		/*
		 * p_nice piece		Adds (0-40) * 2		0-80
		 * estcpu		Adds 16384 * 4 / 512	0-128
		 */
		newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
		newpriority += estcpu * PPQ / ESTCPUPPQ;
		newpriority = newpriority * MAXPRI / (PRIO_RANGE * PPQ /
			      NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
		newpriority = PRIBASE_NORMAL + (newpriority & PRIMASK);
		break;
	case RTP_PRIO_IDLE:
		newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	case RTP_PRIO_THREAD:
		newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
		break;
	default:
		panic("Bad RTP_PRIO %d", newrqtype);
		/* NOT REACHED */
	}

	/*
	 * The newpriority incorporates the queue type so do a simple masked
	 * check to determine if the process has moved to another queue.  If
	 * it has, and it is currently on a run queue, then move it.
	 *
	 * td_upri has normal sense (higher values are more desirable), so
	 * negate it.
	 */
	lp->lwp_thread->td_upri = -(newpriority & ~PPQMASK);
	if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
		lp->lwp_priority = newpriority;
		if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
			bsd4_remrunqueue_locked(lp);
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			bsd4_setrunqueue_locked(lp);
			checkpri = 1;
		} else {
			lp->lwp_rqtype = newrqtype;
			lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
			checkpri = 0;
		}
		reschedcpu = lp->lwp_thread->td_gd->gd_cpuid;
	} else {
		lp->lwp_priority = newpriority;
		reschedcpu = -1;
		checkpri = 1;
	}

	/*
	 * Determine if we need to reschedule the target cpu.  This only
	 * occurs if the LWP is already on a scheduler queue, which means
	 * that idle cpu notification has already occurred.  At most we
	 * need only issue a need_user_resched() on the appropriate cpu.
	 *
	 * The LWP may be owned by a CPU different from the current one,
	 * in which case dd->uschedcp may be modified without an MP lock
	 * or a spinlock held.  The worst that happens is that the code
	 * below causes a spurious need_user_resched() on the target CPU
	 * and dd->upri to be wrong for a short period of time, both of
	 * which are harmless.
	 *
	 * If checkpri is 0 we are adjusting the priority of the current
	 * process, possibly higher (less desirable), so ignore the upri
	 * check which will fail in that case.
	 */
	if (reschedcpu >= 0) {
		dd = &bsd4_pcpu[reschedcpu];
		if ((bsd4_rdyprocmask & CPUMASK(reschedcpu)) &&
		    (checkpri == 0 ||
		     (dd->upri & ~PRIMASK) > (lp->lwp_priority & ~PRIMASK))) {
			if (reschedcpu == mycpu->gd_cpuid) {
				spin_unlock(&bsd4_spin);
				need_user_resched();
			} else {
				spin_unlock(&bsd4_spin);
				atomic_clear_cpumask(&bsd4_rdyprocmask,
						     CPUMASK(reschedcpu));
				lwkt_send_ipiq(lp->lwp_thread->td_gd,
					       bsd4_need_user_resched_remote,
					       NULL);
			}
		} else {
			spin_unlock(&bsd4_spin);
		}
	} else {
		spin_unlock(&bsd4_spin);
	}
	crit_exit();
}

/*
 * MPSAFE
 */
static
void
bsd4_yield(struct lwp *lp)
{
#if 0
	/* FUTURE (or something similar) */
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
		break;
	default:
		break;
	}
#endif
	need_user_resched();
}

static
void
bsd4_changedcpu(struct lwp *lp __unused)
{
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batchy than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * XXX lwp should be "spawning" instead of "forking"
 *
 * MPSAFE
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
	/*
	 * Put the child 4 queue slots (out of 32) higher than the parent
	 * (less desirable than the parent).
	 */
	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ * 4);

	/*
	 * The batch status of children always starts out centerline
	 * and will inch-up or inch-down as appropriate.  It takes roughly
	 * ~15 seconds of >50% cpu to hit the limit.
	 */
	lp->lwp_batch = BATCHMAX / 2;

	/*
	 * Dock the parent a cost for the fork, protecting us from fork
	 * bombs.  If the parent is forking quickly make the child more
	 * batchy.
	 */
	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ / 16);
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
bsd4_exiting(struct lwp *lp, struct proc *child_proc)
{
}

static void
bsd4_uload_update(struct lwp *lp)
{
}

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule;
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 *
 * MPSAFE - must be called with bsd4_spin exclusive held.  The spinlock is
 * left intact through the entire routine.
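 *
 * The test errs on the side of the caller: with PPQ 4, a queued lwp at
 * priority 180 only displaces a chklp whose priority is 184 or worse
 * (lower values are better), so nearly-equal threads do not ping-pong
 * the current process designation.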
 */
static
struct lwp *
bsd4_chooseproc_locked(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	while ((lp->lwp_cpumask & cpumask) == 0) {
		lp = TAILQ_NEXT(lp, lwp_procq);
		if (lp == NULL) {
			*which2 &= ~(1 << pri);
			goto again;
		}
	}

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
			return(NULL);
	}

	/*
	 * If the chosen lwp does not reside on this cpu spend a few
	 * cycles looking for a better candidate at the same priority level.
	 * This is a fallback check; setrunqueue() tries to wake up the
	 * correct cpu and is our front-line affinity.
	 */
	if (lp->lwp_thread->td_gd != mycpu &&
	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
	) {
		if (chklp->lwp_thread->td_gd == mycpu) {
			lp = chklp;
		}
	}

	KTR_COND_LOG(usched_chooseproc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}

/*
 * chooseproc() - with a cache coherence heuristic.  Try to pull a process
 * that has its home on the current CPU.  If the process doesn't have its
 * home here and is a batchy one (see bsd4_batchy_looser_pri_test), we can
 * wait a sched_tick; maybe its home cpu will become free and pull it in.
 * We can't wait more than one tick, though: once that tick has expired we
 * pull the process in no matter what.
 */
static
struct lwp *
bsd4_chooseproc_locked_cache_coherent(struct lwp *chklp)
{
	struct lwp *lp;
	struct rq *q;
	u_int32_t *which, *which2;
	u_int32_t pri;
	u_int32_t checks;
	u_int32_t rtqbits;
	u_int32_t tsqbits;
	u_int32_t idqbits;
	cpumask_t cpumask;

	struct lwp *min_level_lwp = NULL;
	struct rq *min_q = NULL;
	cpumask_t siblings;
	cpu_node_t *cpunode = NULL;
	u_int32_t min_level = MAXCPU;	/* number of levels < MAXCPU */
	u_int32_t *min_which = NULL;
	u_int32_t min_pri = 0;
	u_int32_t level = 0;

	rtqbits = bsd4_rtqueuebits;
	tsqbits = bsd4_queuebits;
	idqbits = bsd4_idqueuebits;
	cpumask = mycpu->gd_cpumask;

	/* Get the mask corresponding to the sysctl configured level */
	cpunode = bsd4_pcpu[mycpu->gd_cpuid].cpunode;
	level = usched_bsd4_stick_to_level;
	while (level) {
		cpunode = cpunode->parent_node;
		level--;
	}
	/* The cpus which can elect a process */
	siblings = cpunode->members;
	checks = 0;

again:
	if (rtqbits) {
		pri = bsfl(rtqbits);
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		which2 = &rtqbits;
	} else if (tsqbits) {
		pri = bsfl(tsqbits);
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		which2 = &tsqbits;
	} else if (idqbits) {
		pri = bsfl(idqbits);
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		which2 = &idqbits;
	} else {
		/*
		 * No more left and we didn't reach the checks limit.
		 */
		bsd4_kick_helper(min_level_lwp);
		return NULL;
	}
	lp = TAILQ_FIRST(q);
	KASSERT(lp, ("chooseproc: no lwp on busy queue"));

	/*
	 * Limit the number of checks/queue to a configurable value to
	 * minimize the contention (we are in a locked region).
	 */
	while (checks < usched_bsd4_queue_checks) {
		if ((lp->lwp_cpumask & cpumask) == 0 ||
		    ((siblings & lp->lwp_thread->td_gd->gd_cpumask) == 0 &&
		      (lp->lwp_rebal_ticks == sched_ticks ||
		       lp->lwp_rebal_ticks == (int)(sched_ticks - 1)) &&
		      bsd4_batchy_looser_pri_test(lp))) {

			KTR_COND_LOG(usched_chooseproc_cc_not_good,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
			    (unsigned long)siblings,
			    (unsigned long)cpumask);

			cpunode = bsd4_pcpu[lp->lwp_thread->td_gd->gd_cpuid].cpunode;
			level = 0;
			while (cpunode) {
				if (cpunode->members & cpumask)
					break;
				cpunode = cpunode->parent_node;
				level++;
			}
			if (level < min_level ||
			    (level == min_level && min_level_lwp &&
			     lp->lwp_priority < min_level_lwp->lwp_priority)) {
				bsd4_kick_helper(min_level_lwp);
				min_level_lwp = lp;
				min_level = level;
				min_q = q;
				min_which = which;
				min_pri = pri;
			} else {
				bsd4_kick_helper(lp);
			}
			lp = TAILQ_NEXT(lp, lwp_procq);
			if (lp == NULL) {
				*which2 &= ~(1 << pri);
				goto again;
			}
		} else {
			KTR_COND_LOG(usched_chooseproc_cc_elected,
			    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
			    lp->lwp_proc->p_pid,
			    (unsigned long)lp->lwp_thread->td_gd->gd_cpumask,
			    (unsigned long)siblings,
			    (unsigned long)cpumask);

			goto found;
		}
		++checks;
	}

	/*
	 * Checks exhausted, we tried to defer too many threads, so schedule
	 * the best of the worst.
	 */
	lp = min_level_lwp;
	q = min_q;
	which = min_which;
	pri = min_pri;
	KASSERT(lp, ("chooseproc: at least the first lp was good"));

found:

	/*
	 * If the passed lwp <chklp> is reasonably close to the selected
	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
	 *
	 * Note that we must error on the side of <chklp> to avoid bouncing
	 * between threads in the acquire code.
	 */
	if (chklp) {
		if (chklp->lwp_priority < lp->lwp_priority + PPQ) {
			bsd4_kick_helper(lp);
			return(NULL);
		}
	}

	KTR_COND_LOG(usched_chooseproc_cc,
	    lp->lwp_proc->p_pid == usched_bsd4_pid_debug,
	    lp->lwp_proc->p_pid,
	    lp->lwp_thread->td_gd->gd_cpuid,
	    mycpu->gd_cpuid);

	TAILQ_REMOVE(q, lp, lwp_procq);
	--bsd4_runqcount;
	if (TAILQ_EMPTY(q))
		*which &= ~(1 << pri);
	KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);

	return lp;
}

/*
 * If we aren't willing to schedule a ready process on our cpu, give its
 * target cpu a kick rather than wait for the next tick.
 *
 * Called with bsd4_spin held.
 */
static
void
bsd4_kick_helper(struct lwp *lp)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;

	if (lp == NULL)
		return;
	gd = lp->lwp_thread->td_gd;
	dd = &bsd4_pcpu[gd->gd_cpuid];
	if ((smp_active_mask & usched_global_cpumask &
	    bsd4_rdyprocmask & gd->gd_cpumask) == 0) {
		return;
	}
	++usched_bsd4_kicks;
	atomic_clear_cpumask(&bsd4_rdyprocmask, gd->gd_cpumask);
	if ((dd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK)) {
		lwkt_send_ipiq(gd, bsd4_need_user_resched_remote, NULL);
	} else {
		wakeup(&dd->helper_thread);
	}
}

static
void
bsd4_need_user_resched_remote(void *dummy)
{
	globaldata_t gd = mycpu;
	bsd4_pcpu_t dd = &bsd4_pcpu[gd->gd_cpuid];

	need_user_resched();

	/* Call wakeup_mycpu to avoid sending IPIs to other CPUs */
	wakeup_mycpu(&dd->helper_thread);
}

/*
 * bsd4_remrunqueue_locked() removes a given process from the run queue
 * that it is on, clearing the queue busy bit if it becomes empty.
 *
 * Note that the user process scheduler is different from the LWKT scheduler.
 * The user process scheduler only manages user processes but it uses LWKT
 * underneath, and a user process operating in the kernel will often be
 * 'released' from our management.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_remrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	u_int8_t pri;

	KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
	atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	--bsd4_runqcount;
	KKASSERT(bsd4_runqcount >= 0);

	pri = lp->lwp_rqindex;
	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("remrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}
	TAILQ_REMOVE(q, lp, lwp_procq);
	if (TAILQ_EMPTY(q)) {
		KASSERT((*which & (1 << pri)) != 0,
			("remrunqueue: remove from empty queue"));
		*which &= ~(1 << pri);
	}
}

/*
 * bsd4_setrunqueue_locked()
 *
 * Add a process whose rqtype and rqindex had previously been calculated
 * onto the appropriate run queue.  Determine if the addition requires
 * a reschedule on a cpu and return the cpuid or -1.
 *
 * NOTE: Lower priorities are better priorities.
 *
 * MPSAFE - bsd4_spin must be held exclusively on call
 */
static void
bsd4_setrunqueue_locked(struct lwp *lp)
{
	struct rq *q;
	u_int32_t *which;
	int pri;

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
	++bsd4_runqcount;

	pri = lp->lwp_rqindex;

	switch(lp->lwp_rqtype) {
	case RTP_PRIO_NORMAL:
		q = &bsd4_queues[pri];
		which = &bsd4_queuebits;
		break;
	case RTP_PRIO_REALTIME:
	case RTP_PRIO_FIFO:
		q = &bsd4_rtqueues[pri];
		which = &bsd4_rtqueuebits;
		break;
	case RTP_PRIO_IDLE:
		q = &bsd4_idqueues[pri];
		which = &bsd4_idqueuebits;
		break;
	default:
		panic("setrunqueue: invalid rtprio type");
		/* NOT REACHED */
	}

	/*
	 * Add to the correct queue and set the appropriate bit.  If no
	 * lower priority (i.e. better) processes are in the queue then
	 * we want a reschedule; calculate the best cpu for the job.
	 *
	 * Always run reschedules on the LWP's original cpu.
	 */
	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
	*which |= 1 << pri;
}

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wake up another for the purposes of
 * scheduling userland threads from setrunqueue().
 *
 * UP systems do not need the helper since there is only one cpu.
 *
 * We can't use the idle thread for this because we might block.
 * Additionally, doing things this way allows us to HLT idle cpus
 * on MP systems.
 *
 * MPSAFE
 */
static void
sched_thread(void *dummy)
{
	globaldata_t gd;
	bsd4_pcpu_t dd;
	bsd4_pcpu_t tmpdd;
	struct lwp *nlp;
	cpumask_t mask;
	int cpuid;
	cpumask_t tmpmask;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;		/* doesn't change */
	mask = gd->gd_cpumask;		/* doesn't change */
	dd = &bsd4_pcpu[cpuid];

	/*
	 * Since we are woken up only when no user processes are scheduled
	 * on a cpu, we can run at an ultra low priority.
1724 */ 1725 lwkt_setpri_self(TDPRI_USER_SCHEDULER); 1726 1727 tsleep(&dd->helper_thread, 0, "sched_thread_sleep", 0); 1728 1729 for (;;) { 1730 /* 1731 * We use the LWKT deschedule-interlock trick to avoid racing 1732 * bsd4_rdyprocmask. This means we cannot block through to the 1733 * manual lwkt_switch() call we make below. 1734 */ 1735 crit_enter_gd(gd); 1736 tsleep_interlock(&dd->helper_thread, 0); 1737 spin_lock(&bsd4_spin); 1738 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1739 1740 clear_user_resched(); /* This satisfied the reschedule request */ 1741 dd->rrcount = 0; /* Reset the round-robin counter */ 1742 1743 if ((bsd4_curprocmask & mask) == 0) { 1744 /* 1745 * No thread is currently scheduled. 1746 */ 1747 KKASSERT(dd->uschedcp == NULL); 1748 if ((nlp = bsd4_chooseproc_locked(NULL)) != NULL) { 1749 KTR_COND_LOG(usched_sched_thread_no_process, 1750 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1751 gd->gd_cpuid, 1752 nlp->lwp_proc->p_pid, 1753 nlp->lwp_thread->td_gd->gd_cpuid); 1754 1755 atomic_set_cpumask(&bsd4_curprocmask, mask); 1756 dd->upri = nlp->lwp_priority; 1757 dd->uschedcp = nlp; 1758 dd->rrcount = 0; /* reset round robin */ 1759 spin_unlock(&bsd4_spin); 1760 lwkt_acquire(nlp->lwp_thread); 1761 lwkt_schedule(nlp->lwp_thread); 1762 } else { 1763 spin_unlock(&bsd4_spin); 1764 } 1765 } else if (bsd4_runqcount) { 1766 if ((nlp = bsd4_chooseproc_locked(dd->uschedcp)) != NULL) { 1767 KTR_COND_LOG(usched_sched_thread_process, 1768 nlp->lwp_proc->p_pid == usched_bsd4_pid_debug, 1769 gd->gd_cpuid, 1770 nlp->lwp_proc->p_pid, 1771 nlp->lwp_thread->td_gd->gd_cpuid); 1772 1773 dd->upri = nlp->lwp_priority; 1774 dd->uschedcp = nlp; 1775 dd->rrcount = 0; /* reset round robin */ 1776 spin_unlock(&bsd4_spin); 1777 lwkt_acquire(nlp->lwp_thread); 1778 lwkt_schedule(nlp->lwp_thread); 1779 } else { 1780 /* 1781 * CHAINING CONDITION TRAIN 1782 * 1783 * We could not deal with the scheduler wakeup 1784 * request on this cpu, locate a ready scheduler 1785 * with no current lp assignment and chain to it. 1786 * 1787 * This ensures that a wakeup race which fails due 1788 * to priority test does not leave other unscheduled 1789 * cpus idle when the runqueue is not empty. 1790 */ 1791 tmpmask = ~bsd4_curprocmask & 1792 bsd4_rdyprocmask & smp_active_mask; 1793 if (tmpmask) { 1794 tmpid = BSFCPUMASK(tmpmask); 1795 tmpdd = &bsd4_pcpu[tmpid]; 1796 atomic_clear_cpumask(&bsd4_rdyprocmask, 1797 CPUMASK(tmpid)); 1798 spin_unlock(&bsd4_spin); 1799 wakeup(&tmpdd->helper_thread); 1800 } else { 1801 spin_unlock(&bsd4_spin); 1802 } 1803 1804 KTR_LOG(usched_sched_thread_no_process_found, 1805 gd->gd_cpuid, (unsigned long)tmpmask); 1806 } 1807 } else { 1808 /* 1809 * The runq is empty. 1810 */ 1811 spin_unlock(&bsd4_spin); 1812 } 1813 1814 /* 1815 * We're descheduled unless someone scheduled us. Switch away. 1816 * Exiting the critical section will cause splz() to be called 1817 * for us if interrupts and such are pending. 
1818 */ 1819 crit_exit_gd(gd); 1820 tsleep(&dd->helper_thread, PINTERLOCKED, "schslp", 0); 1821 } 1822 } 1823 1824 /* sysctl stick_to_level parameter */ 1825 static int 1826 sysctl_usched_bsd4_stick_to_level(SYSCTL_HANDLER_ARGS) 1827 { 1828 int error, new_val; 1829 1830 new_val = usched_bsd4_stick_to_level; 1831 1832 error = sysctl_handle_int(oidp, &new_val, 0, req); 1833 if (error != 0 || req->newptr == NULL) 1834 return (error); 1835 if (new_val > cpu_topology_levels_number - 1 || new_val < 0) 1836 return (EINVAL); 1837 usched_bsd4_stick_to_level = new_val; 1838 return (0); 1839 } 1840 1841 /* 1842 * Setup our scheduler helpers. Note that curprocmask bit 0 has already 1843 * been cleared by rqinit() and we should not mess with it further. 1844 */ 1845 static void 1846 sched_thread_cpu_init(void) 1847 { 1848 int i; 1849 int cpuid; 1850 int smt_not_supported = 0; 1851 int cache_coherent_not_supported = 0; 1852 1853 if (bootverbose) 1854 kprintf("Start scheduler helpers on cpus:\n"); 1855 1856 sysctl_ctx_init(&usched_bsd4_sysctl_ctx); 1857 usched_bsd4_sysctl_tree = 1858 SYSCTL_ADD_NODE(&usched_bsd4_sysctl_ctx, 1859 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO, 1860 "usched_bsd4", CTLFLAG_RD, 0, ""); 1861 1862 for (i = 0; i < ncpus; ++i) { 1863 bsd4_pcpu_t dd = &bsd4_pcpu[i]; 1864 cpumask_t mask = CPUMASK(i); 1865 1866 if ((mask & smp_active_mask) == 0) 1867 continue; 1868 1869 dd->cpunode = get_cpu_node_by_cpuid(i); 1870 1871 if (dd->cpunode == NULL) { 1872 smt_not_supported = 1; 1873 cache_coherent_not_supported = 1; 1874 if (bootverbose) 1875 kprintf ("\tcpu%d - WARNING: No CPU NODE " 1876 "found for cpu\n", i); 1877 } else { 1878 switch (dd->cpunode->type) { 1879 case THREAD_LEVEL: 1880 if (bootverbose) 1881 kprintf ("\tcpu%d - HyperThreading " 1882 "available. Core siblings: ", 1883 i); 1884 break; 1885 case CORE_LEVEL: 1886 smt_not_supported = 1; 1887 1888 if (bootverbose) 1889 kprintf ("\tcpu%d - No HT available, " 1890 "multi-core/physical " 1891 "cpu. Physical siblings: ", 1892 i); 1893 break; 1894 case CHIP_LEVEL: 1895 smt_not_supported = 1; 1896 1897 if (bootverbose) 1898 kprintf ("\tcpu%d - No HT available, " 1899 "single-core/physical cpu. " 1900 "Package Siblings: ", 1901 i); 1902 break; 1903 default: 1904 /* Let's go for safe defaults here */ 1905 smt_not_supported = 1; 1906 cache_coherent_not_supported = 1; 1907 if (bootverbose) 1908 kprintf ("\tcpu%d - Unknown cpunode->" 1909 "type=%u. Siblings: ", 1910 i, 1911 (u_int)dd->cpunode->type); 1912 break; 1913 } 1914 1915 if (bootverbose) { 1916 if (dd->cpunode->parent_node != NULL) { 1917 CPUSET_FOREACH(cpuid, dd->cpunode->parent_node->members) 1918 kprintf("cpu%d ", cpuid); 1919 kprintf("\n"); 1920 } else { 1921 kprintf(" no siblings\n"); 1922 } 1923 } 1924 } 1925 1926 lwkt_create(sched_thread, NULL, NULL, &dd->helper_thread, 1927 0, i, "usched %d", i); 1928 1929 /* 1930 * Allow user scheduling on the target cpu. cpu #0 has already 1931 * been enabled in rqinit(). 
1932 */ 1933 if (i) 1934 atomic_clear_cpumask(&bsd4_curprocmask, mask); 1935 atomic_set_cpumask(&bsd4_rdyprocmask, mask); 1936 dd->upri = PRIBASE_NULL; 1937 1938 } 1939 1940 /* usched_bsd4 sysctl configurable parameters */ 1941 1942 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1943 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1944 OID_AUTO, "rrinterval", CTLFLAG_RW, 1945 &usched_bsd4_rrinterval, 0, ""); 1946 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1947 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1948 OID_AUTO, "decay", CTLFLAG_RW, 1949 &usched_bsd4_decay, 0, "Extra decay when not running"); 1950 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1951 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1952 OID_AUTO, "batch_time", CTLFLAG_RW, 1953 &usched_bsd4_batch_time, 0, "Min batch counter value"); 1954 SYSCTL_ADD_LONG(&usched_bsd4_sysctl_ctx, 1955 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1956 OID_AUTO, "kicks", CTLFLAG_RW, 1957 &usched_bsd4_kicks, "Number of kickstarts"); 1958 1959 /* Add enable/disable option for SMT scheduling if supported */ 1960 if (smt_not_supported) { 1961 usched_bsd4_smt = 0; 1962 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, 1963 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1964 OID_AUTO, "smt", CTLFLAG_RD, 1965 "NOT SUPPORTED", 0, "SMT NOT SUPPORTED"); 1966 } else { 1967 usched_bsd4_smt = 1; 1968 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1969 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1970 OID_AUTO, "smt", CTLFLAG_RW, 1971 &usched_bsd4_smt, 0, "Enable SMT scheduling"); 1972 } 1973 1974 /* 1975 * Add enable/disable option for cache coherent scheduling 1976 * if supported 1977 */ 1978 if (cache_coherent_not_supported) { 1979 usched_bsd4_cache_coherent = 0; 1980 SYSCTL_ADD_STRING(&usched_bsd4_sysctl_ctx, 1981 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1982 OID_AUTO, "cache_coherent", CTLFLAG_RD, 1983 "NOT SUPPORTED", 0, 1984 "Cache coherence NOT SUPPORTED"); 1985 } else { 1986 usched_bsd4_cache_coherent = 1; 1987 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1988 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1989 OID_AUTO, "cache_coherent", CTLFLAG_RW, 1990 &usched_bsd4_cache_coherent, 0, 1991 "Enable/Disable cache coherent scheduling"); 1992 1993 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 1994 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 1995 OID_AUTO, "upri_affinity", CTLFLAG_RW, 1996 &usched_bsd4_upri_affinity, 1, 1997 "Number of PPQs in user priority check"); 1998 1999 SYSCTL_ADD_INT(&usched_bsd4_sysctl_ctx, 2000 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 2001 OID_AUTO, "queue_checks", CTLFLAG_RW, 2002 &usched_bsd4_queue_checks, 5, 2003 "LWPs to check from a queue before giving up"); 2004 2005 SYSCTL_ADD_PROC(&usched_bsd4_sysctl_ctx, 2006 SYSCTL_CHILDREN(usched_bsd4_sysctl_tree), 2007 OID_AUTO, "stick_to_level", 2008 CTLTYPE_INT | CTLFLAG_RW, 2009 NULL, sizeof usched_bsd4_stick_to_level, 2010 sysctl_usched_bsd4_stick_to_level, "I", 2011 "Stick a process to this level. See sysctl" 2012 "paremter hw.cpu_topology.level_description"); 2013 } 2014 } 2015 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND, 2016 sched_thread_cpu_init, NULL) 2017