/*	$OpenBSD: kern_sched.c,v 1.36 2015/03/14 03:38:50 jsg Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/mutex.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choose which cpu should run which process, we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process,
 * including the pcb and pmap, and to set curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC.  Atomic here means with respect to interrupts
 * only; other cpus in the system must not depend on this state being
 * consistent.  Therefore no locking is necessary in cpu_switchto other than
 * blocking interrupts during the context switch.
 */

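/*
 * Illustrative call pattern (editor's sketch, not literal code from this
 * file): the MI scheduler picks the next proc while holding the scheduler
 * lock and hands both procs to the MD switch routine, roughly
 *
 *	nextproc = sched_chooseproc();
 *	cpu_switchto(p, nextproc);	(p may be NULL when exiting)
 *
 * which is the shape of mi_switch() in kern/sched_bsd.c and of
 * sched_exit() below, where cpu_switchto(NULL, idle) is called directly.
 */
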
/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_SIGHAND|FORK_IDLE, NULL, 0, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_comm, sizeof(spc->spc_idleproc->p_comm),
	    "idle%d", num);

	num++;
}

void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;
	int s;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK(s);
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK(s);

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!curcpu_is_idle()) {
			struct proc *dead;

			SCHED_LOCK(s);
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK(s);

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK(s);
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK(s);
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another one in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct timespec ts;
	struct proc *idle;
	int s;

	nanouptime(&ts);
	timespecsub(&ts, &spc->spc_runtime, &ts);
	timespecadd(&p->p_rtime, &ts, &p->p_rtime);

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	/* This process no longer needs to hold the kernel lock. */
	KERNEL_UNLOCK();

	SCHED_LOCK(s);
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1 << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_priority >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1 << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

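/*
 * Worked example (editor's note): with SCHED_NQS == 32 run queues, as
 * defined in <sys/sched.h>, and priorities in the range 0-127, a proc at
 * p_priority 50 maps to queue 50 >> 2 == 12, so setrunqueue() sets bit 12
 * in spc_whichqs.  Since a numerically lower priority means a more urgent
 * proc, ffs(spc_whichqs) - 1 in sched_chooseproc() below picks the most
 * urgent non-empty queue.
 */
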
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					p->p_cpu = sched_choosecpu(p);
					KASSERT(p->p_cpu != curcpu());
					setrunqueue(p);
				}
			}
		}
		p = spc->spc_idleproc;
		KASSERT(p);
		KASSERT(p->p_wchan == NULL);
		p->p_stat = SRUN;
		return (p);
	}
#endif

again:
	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		KASSERT(p->p_stat == SRUN);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL) {
			int s;
			/*
			 * We get here if someone decides to switch during
			 * boot before forking kthreads, bleh.
			 * This is kind of like a stupid idle loop.
			 */
#ifdef MULTIPROCESSOR
			__mp_unlock(&sched_lock);
#endif
			spl0();
			delay(10);
			SCHED_LOCK(s);
			goto again;
		}
		KASSERT(p);
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	return (p);
}

struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	fixpt_t load, best_load = ~0;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus instead.  Among the
	 * candidates, prefer the one with the fewest queued procs, then
	 * the one with the lowest load average.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		load = ci->ci_schedstate.spc_ldavg;
		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run ||
		    (run == best_run && load < best_load)) {
			choice = ci;
			best_load = load;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, pick the cheapest of all cpus.
	 * (A cpu can be both idle and have something queued when it is
	 * handling an interrupt at this moment and hasn't had time to
	 * leave idle yet.)
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	spc = &best->p_cpu->ci_schedstate;
	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

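/*
 * Editor's note: sched_steal_proc() is called from sched_chooseproc()
 * above when the local run queues are empty.  The cost of each candidate
 * is computed with the stealing cpu as the destination
 * (sched_proc_to_cpu_cost(self, p)), and pegged procs (P_CPUPEG) are
 * never stolen.
 */
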
#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. Returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */

int sched_cost_load = 1;
int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * The lower the priority of the destination and the higher the
	 * priority of the proc, the more willing we are to move it there.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_priority - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Higher load on the destination means we don't want to go there.
	 */
	cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

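/*
 * Worked example (editor's note, using the default weights above): for a
 * proc at p_priority 50 and a destination cpu that is not idle, has
 * nothing queued, is running a proc at spc_curpriority 40 and has a load
 * average of 2.0, the cost is (50 - 40) * 1 + 3 for not being idle, plus
 * 2 for the load, i.e. 15.  If the proc last ran on that cpu, has not
 * slept since and has roughly 8000 resident pages, log2() of that is 12,
 * which lowers the cost to 3 and biases the choice towards the cpu where
 * its footprint presumably still lives.
 */
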
/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(p);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

#ifdef MULTIPROCESSOR

void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_add(&sched_all_cpus, ci);
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;
		struct sleep_state sls;

		if (CPU_IS_PRIMARY(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(&sls, spc, PZERO, "schedstate");
			sleep_finish(&sls,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1 << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1 << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

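/*
 * Editor's note: a cpuset is a bitmap indexed by CPU_INFO_UNIT(ci);
 * cpu n lives in word n / 32, bit n % 32, which is also how
 * cpuset_first() maps a set bit back to its cpu_info through
 * cpuset_infos[].  Note the argument order of cpuset_complement():
 * it stores b & ~a, so cpuset_complement(&set, &sched_queued_cpus,
 * &sched_idle_cpus) yields the idle cpus that have nothing queued.
 */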