/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <sys/spinlock.h>
#include <machine/cpu.h>
#include <machine/smp.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <sys/mplock2.h>

#define MAXPRI			128
#define PRIBASE_REALTIME	0
#define PRIBASE_NORMAL		MAXPRI
#define PRIBASE_IDLE		(MAXPRI * 2)
#define PRIBASE_THREAD		(MAXPRI * 3)
#define PRIBASE_NULL		(MAXPRI * 4)

#define lwp_priority	lwp_usdata.bsd4.priority
#define lwp_estcpu	lwp_usdata.bsd4.estcpu

static void dummy_acquire_curproc(struct lwp *lp);
static void dummy_release_curproc(struct lwp *lp);
static void dummy_select_curproc(globaldata_t gd);
static void dummy_setrunqueue(struct lwp *lp);
static void dummy_schedulerclock(struct lwp *lp, sysclock_t period,
			sysclock_t cpstamp);
static void dummy_recalculate_estcpu(struct lwp *lp);
static void dummy_resetpriority(struct lwp *lp);
static void dummy_forking(struct lwp *plp, struct lwp *lp);
static void dummy_exiting(struct lwp *plp, struct proc *child);
static void dummy_uload_update(struct lwp *lp);
static void dummy_yield(struct lwp *lp);

struct usched usched_dummy = {
	{ NULL },
	"dummy", "Dummy DragonFly Scheduler",
	NULL,			/* default registration */
	NULL,			/* default deregistration */
	dummy_acquire_curproc,
	dummy_release_curproc,
	dummy_setrunqueue,
	dummy_schedulerclock,
	dummy_recalculate_estcpu,
	dummy_resetpriority,
	dummy_forking,
	dummy_exiting,
	dummy_uload_update,
	NULL,			/* setcpumask not supported */
	dummy_yield
};
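/*
 * The sketch below (never compiled; note the #if 0, as used elsewhere in
 * this file) illustrates how generic kernel code invokes a userland
 * scheduler through this ops vector rather than calling the dummy_*()
 * functions directly.  The p_usched pointer and the acquire_curproc /
 * release_curproc / setrunqueue member names are assumed to match
 * struct usched in sys/usched.h; the real call sites live in the
 * trap/syscall return path, not here.
 */
#if 0
static void
usched_dispatch_sketch(struct lwp *lp)
{
	struct usched *us = lp->lwp_proc->p_usched;

	us->acquire_curproc(lp);	/* become the designated user lwp */
	/* ... return to userland, run, re-enter the kernel ... */
	us->release_curproc(lp);	/* give up the current-user slot */
	us->setrunqueue(lp);		/* e.g. after fork: make runnable */
}
#endif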
struct usched_dummy_pcpu {
	int	rrcount;
	struct thread helper_thread;
	struct lwp *uschedcp;
};

typedef struct usched_dummy_pcpu *dummy_pcpu_t;

static struct usched_dummy_pcpu dummy_pcpu[MAXCPU];
static cpumask_t dummy_curprocmask = -1;
static cpumask_t dummy_rdyprocmask;
static struct spinlock dummy_spin;
static TAILQ_HEAD(rq, lwp) dummy_runq;
static int dummy_runqcount;

static int usched_dummy_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_dummy_rrinterval, CTLFLAG_RW,
	&usched_dummy_rrinterval, 0, "");

/*
 * Initialize the run queues at boot time and clear cpu 0 in curprocmask
 * to allow dummy scheduling on cpu 0.
 */
static void
dummyinit(void *dummy)
{
	TAILQ_INIT(&dummy_runq);
	spin_init(&dummy_spin);
	atomic_clear_cpumask(&dummy_curprocmask, 1);
}
SYSINIT(runqueue, SI_BOOT2_USCHED, SI_ORDER_FIRST, dummyinit, NULL)

/*
 * DUMMY_ACQUIRE_CURPROC
 *
 * This function is called when the kernel intends to return to userland.
 * It is responsible for making the thread the current designated userland
 * thread for this cpu, blocking if necessary.
 *
 * The kernel will not depress our LWKT priority until after we return,
 * in case we have to shove over to another cpu.
 *
 * We must determine our thread's disposition before we switch away.  This
 * is very sensitive code.
 *
 * We are expected to handle userland reschedule requests here too.
 *
 * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 * TO ANOTHER CPU!  Because most of the kernel assumes that no migration
 * will occur, this function is called only under very controlled
 * circumstances.
 *
 * MPSAFE
 */
static void
dummy_acquire_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	thread_t td = lp->lwp_thread;

	/*
	 * Possibly select another thread
	 */
	if (user_resched_wanted())
		dummy_select_curproc(gd);

	/*
	 * If this cpu has no current thread, select ourself
	 */
	if (dd->uschedcp == lp ||
	    (dd->uschedcp == NULL && TAILQ_EMPTY(&dummy_runq))) {
		atomic_set_cpumask(&dummy_curprocmask, gd->gd_cpumask);
		dd->uschedcp = lp;
		return;
	}

	/*
	 * If this cpu's current user process thread is not our thread,
	 * deschedule ourselves and place us on the run queue, then
	 * switch away.
	 *
	 * We loop until we become the current process.  It's a good idea
	 * to run any passive release(s) before we mess with the scheduler
	 * so our thread is in the expected state.
	 */
	KKASSERT(dd->uschedcp != lp);
	if (td->td_release)
		td->td_release(lp->lwp_thread);
	do {
		crit_enter();
		lwkt_deschedule_self(td);
		dummy_setrunqueue(lp);
		if ((td->td_flags & TDF_RUNQ) == 0)
			++lp->lwp_ru.ru_nivcsw;
		lwkt_switch();		/* WE MAY MIGRATE TO ANOTHER CPU */
		crit_exit();
		gd = mycpu;
		dd = &dummy_pcpu[gd->gd_cpuid];
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	} while (dd->uschedcp != lp);
}
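/*
 * A note on the loop above: the deschedule/enqueue/switch sequence relies
 * on the LWKT interlock described before dummy_sched_thread() below.
 * Because the thread deschedules itself before it is placed on the dummy
 * runq, a lwkt_schedule() issued from another cpu (via
 * dummy_select_curproc() or a helper thread) may arrive even before
 * lwkt_switch() is called; it simply re-schedules the thread, so the
 * wakeup is never lost.
 */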
/*
 * DUMMY_RELEASE_CURPROC
 *
 * This routine detaches the current thread from the userland scheduler,
 * usually because the thread needs to run in the kernel (at kernel
 * priority) for a while.
 *
 * This routine is also responsible for selecting a new thread to
 * make the current thread.
 *
 * MPSAFE
 */
static void
dummy_release_curproc(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
	if (dd->uschedcp == lp) {
		dummy_select_curproc(gd);
	}
}

/*
 * DUMMY_SELECT_CURPROC
 *
 * Select a new current process for this cpu.  This satisfies a user
 * scheduler reschedule request, so clear that too.
 *
 * This routine is also responsible for equal-priority round-robining,
 * typically triggered from dummy_schedulerclock().  In our dummy example
 * all the 'user' threads are LWKT scheduled all at once and we just
 * call lwkt_switch().
 *
 * MPSAFE
 */
static
void
dummy_select_curproc(globaldata_t gd)
{
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	struct lwp *lp;

	clear_user_resched();
	spin_lock(&dummy_spin);
	if ((lp = TAILQ_FIRST(&dummy_runq)) == NULL) {
		dd->uschedcp = NULL;
		atomic_clear_cpumask(&dummy_curprocmask, gd->gd_cpumask);
		spin_unlock(&dummy_spin);
	} else {
		--dummy_runqcount;
		TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
		atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		dd->uschedcp = lp;
		atomic_set_cpumask(&dummy_curprocmask, gd->gd_cpumask);
		spin_unlock(&dummy_spin);
		lwkt_acquire(lp->lwp_thread);
		lwkt_schedule(lp->lwp_thread);
	}
}

/*
 * DUMMY_SETRUNQUEUE
 *
 * This routine is called to schedule a new user process after a fork.
 * The scheduler module itself might also call this routine to place
 * the current process on the userland scheduler's run queue prior
 * to calling dummy_select_curproc().
 *
 * The caller may set LWP_PASSIVE_ACQ in lwp_flags to indicate that we
 * should attempt to leave the thread on the current cpu.
 *
 * MPSAFE
 */
static void
dummy_setrunqueue(struct lwp *lp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];
	cpumask_t mask;
	int cpuid;

	if (dd->uschedcp == NULL) {
		dd->uschedcp = lp;
		atomic_set_cpumask(&dummy_curprocmask, gd->gd_cpumask);
		lwkt_schedule(lp->lwp_thread);
	} else {
		/*
		 * Add to our global runq
		 */
		KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
		spin_lock(&dummy_spin);
		++dummy_runqcount;
		TAILQ_INSERT_TAIL(&dummy_runq, lp, lwp_procq);
		atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
		lwkt_giveaway(lp->lwp_thread);

		/* lp = TAILQ_FIRST(&dummy_runq); */

		/*
		 * Notify the next available cpu.  Some cpu affinity logic
		 * could be applied here.
		 *
		 * The rdyprocmask bit records the fact that there is a
		 * process on the runq that needs service.  If the helper
		 * thread cannot find a home for it, it will forward the
		 * request to another available cpu.
		 */
		mask = ~dummy_curprocmask & dummy_rdyprocmask &
		       gd->gd_other_cpus;
		if (mask) {
			cpuid = BSFCPUMASK(mask);
			atomic_clear_cpumask(&dummy_rdyprocmask,
					     CPUMASK(cpuid));
			spin_unlock(&dummy_spin);
			lwkt_schedule(&dummy_pcpu[cpuid].helper_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
	}
}
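/*
 * Worked example of the wakeup mask computed above, from cpu 0 on a
 * 4-cpu system: if cpus 0 and 1 currently own user threads
 * (dummy_curprocmask = 0x3) and the helper threads on cpus 2 and 3 are
 * ready (dummy_rdyprocmask = 0xc), then with gd_other_cpus = 0xe the
 * result is ~0x3 & 0xc & 0xe = 0xc, and BSFCPUMASK() picks cpu 2, whose
 * helper thread is woken to pull the new lwp off the global runq.
 */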
/*
 * This routine is called from a systimer IPI.  It must NEVER block.
 * If a lwp compatible with this scheduler is the currently running
 * thread this function is called with a non-NULL lp, otherwise it
 * will be called with a NULL lp.
 *
 * This routine is called at ESTCPUFREQ on each cpu independently.
 *
 * This routine typically queues a reschedule request, which will cause
 * the scheduler's BLAH_select_curproc() to be called as soon as possible.
 */
static
void
dummy_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
	globaldata_t gd = mycpu;
	dummy_pcpu_t dd = &dummy_pcpu[gd->gd_cpuid];

	if (lp == NULL)
		return;

	if (++dd->rrcount >= usched_dummy_rrinterval) {
		dd->rrcount = 0;
		need_user_resched();
	}
}

/*
 * DUMMY_RECALCULATE_ESTCPU
 *
 * Called once a second for any process that is running or has slept
 * for less than 2 seconds.
 *
 * MPSAFE
 */
static
void
dummy_recalculate_estcpu(struct lwp *lp)
{
}

/*
 * MPSAFE
 */
static
void
dummy_yield(struct lwp *lp)
{
	need_user_resched();
}

/*
 * DUMMY_RESETPRIORITY
 *
 * This routine is called after the kernel has potentially modified
 * the lwp_rtprio structure.  The target process may be running, sleeping,
 * scheduled but not yet running, or owned by another cpu.  Basically,
 * it can be in virtually any state.
 *
 * This routine is called by fork1() for initial setup with the process
 * off the run queue, and also may be called normally with the process on
 * or off the run queue.
 *
 * MPSAFE
 */
static void
dummy_resetpriority(struct lwp *lp)
{
	/* XXX spinlock usually needed */
	/*
	 * Set p_priority for general process comparisons
	 */
	switch(lp->lwp_rtprio.type) {
	case RTP_PRIO_REALTIME:
		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		lp->lwp_priority = PRIBASE_NORMAL + lp->lwp_rtprio.prio;
		break;
	case RTP_PRIO_IDLE:
		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
		return;
	}

	/*
	 * td_upri has normal sense (higher numbers are more desirable),
	 * so negate it.
	 */
	lp->lwp_thread->td_upri = -lp->lwp_priority;
	/* XXX spinlock usually needed */
}
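/*
 * Worked example of the bands above: an RTP_PRIO_NORMAL lwp with
 * rtprio 15 gets lwp_priority = PRIBASE_NORMAL + 15 = 143 and
 * td_upri = -143, while any RTP_PRIO_REALTIME lwp lands in the 0..127
 * band and therefore always compares as more urgent (lower
 * lwp_priority) than any normal (128..255) or idle (256..383) lwp.
 */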
/*
 * DUMMY_FORKING
 *
 * Called from fork1() when a new child process is being created.  Allows
 * the scheduler to predispose the child process before it gets scheduled.
 *
 * MPSAFE
 */
static void
dummy_forking(struct lwp *plp, struct lwp *lp)
{
	lp->lwp_estcpu = plp->lwp_estcpu;
#if 0
	++plp->lwp_estcpu;
#endif
}

/*
 * Called when a lwp is being removed from this scheduler, typically
 * during lwp_exit().
 */
static void
dummy_exiting(struct lwp *plp, struct proc *child)
{
}

static void
dummy_uload_update(struct lwp *lp)
{
}

/*
 * SMP systems may need a scheduler helper thread.  This is how one can
 * be set up.
 *
 * We use a neat LWKT scheduling trick to interlock the helper thread.  It
 * is possible to deschedule an LWKT thread and then do some work before
 * switching away.  The thread can be rescheduled at any time, even before
 * we switch away.
 *
 * MPSAFE
 */
static void
dummy_sched_thread(void *dummy)
{
	globaldata_t gd;
	dummy_pcpu_t dd;
	struct lwp *lp;
	cpumask_t cpumask;
	cpumask_t tmpmask;
	int cpuid;
	int tmpid;

	gd = mycpu;
	cpuid = gd->gd_cpuid;
	dd = &dummy_pcpu[cpuid];
	cpumask = CPUMASK(cpuid);

	for (;;) {
		lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
		atomic_set_cpumask(&dummy_rdyprocmask, cpumask);
		spin_lock(&dummy_spin);
		if (dd->uschedcp) {
			/*
			 * We raced another cpu trying to schedule a thread
			 * onto us.  If the runq isn't empty, hit another
			 * free cpu.
			 */
			tmpmask = ~dummy_curprocmask & dummy_rdyprocmask &
				  gd->gd_other_cpus;
			if (tmpmask && dummy_runqcount) {
				tmpid = BSFCPUMASK(tmpmask);
				KKASSERT(tmpid != cpuid);
				atomic_clear_cpumask(&dummy_rdyprocmask,
						     CPUMASK(tmpid));
				spin_unlock(&dummy_spin);
				lwkt_schedule(&dummy_pcpu[tmpid].helper_thread);
			} else {
				spin_unlock(&dummy_spin);
			}
		} else if ((lp = TAILQ_FIRST(&dummy_runq)) != NULL) {
			--dummy_runqcount;
			TAILQ_REMOVE(&dummy_runq, lp, lwp_procq);
			atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
			dd->uschedcp = lp;
			atomic_set_cpumask(&dummy_curprocmask, cpumask);
			spin_unlock(&dummy_spin);
			lwkt_acquire(lp->lwp_thread);
			lwkt_schedule(lp->lwp_thread);
		} else {
			spin_unlock(&dummy_spin);
		}
		lwkt_switch();
	}
}

/*
 * Set up our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by dummyinit() and we should not mess with it further.
 */
static void
dummy_sched_thread_cpu_init(void)
{
	int i;

	if (bootverbose)
		kprintf("start dummy scheduler helpers on cpus:");

	for (i = 0; i < ncpus; ++i) {
		dummy_pcpu_t dd = &dummy_pcpu[i];
		cpumask_t mask = CPUMASK(i);

		if ((mask & smp_active_mask) == 0)
			continue;

		if (bootverbose)
			kprintf(" %d", i);

		lwkt_create(dummy_sched_thread, NULL, NULL, &dd->helper_thread,
			    TDF_NOSTART, i, "dsched %d", i);

		/*
		 * Allow user scheduling on the target cpu.  cpu #0 has
		 * already been enabled in dummyinit().
		 */
		if (i)
			atomic_clear_cpumask(&dummy_curprocmask, mask);
		atomic_set_cpumask(&dummy_rdyprocmask, mask);
	}
	if (bootverbose)
		kprintf("\n");
}
SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
	dummy_sched_thread_cpu_init, NULL)
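/*
 * Usage note: the round-robin quantum can be tuned at runtime via the
 * sysctl declared above, e.g.
 *
 *	sysctl kern.usched_dummy_rrinterval=10
 *
 * The default of (ESTCPUFREQ + 9) / 10 ticks, with dummy_schedulerclock()
 * running at ESTCPUFREQ on each cpu, works out to a user reschedule
 * roughly ten times per second.  Actually selecting this scheduler in
 * place of the default is handled outside this file (see kern_usched.c);
 * the exact mechanism is not part of this template.
 */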