/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1   /* DEBUGGING */
#define LOOPRECOVER     /* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1    2       /* initial recovery */
#define LOOPRECOVER_TIMEOUT2    1       /* repeated recoveries */

#define MAX_INVAL_PAGES         128

struct pmap_inval_info {
        vm_offset_t     va;
        pt_entry_t      *ptep;
        pt_entry_t      opte;
        pt_entry_t      npte;
        enum { INVDONE, INVSTORE, INVCMPSET } mode;
        int             success;
        vm_pindex_t     npgs;
        cpumask_t       done;
        cpumask_t       mask;
#ifdef LOOPRECOVER
        cpumask_t       sigmask;
        int             failed;
        tsc_uclock_t    tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t invinfo[MAXCPU];
extern cpumask_t smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t smp_in_mask;
#endif
extern cpumask_t smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;   /* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
            &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
            &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
            &pmap_inval_force_nonopt, 0, "");

static void
pmap_inval_init(pmap_t pmap)
{
        cpulock_t olock;
        cpulock_t nlock;

        crit_enter_id("inval");

        if (pmap != kernel_pmap) {
                for (;;) {
                        olock = pmap->pm_active_lock;
                        cpu_ccfence();
                        nlock = olock | CPULOCK_EXCL;
                        if (olock != nlock &&
                            atomic_cmpset_int(&pmap->pm_active_lock,
                                              olock, nlock)) {
                                break;
                        }
                        lwkt_process_ipiq();
                        cpu_pause();
                }
                atomic_add_64(&pmap->pm_invgen, 1);
        }
}

static void
pmap_inval_done(pmap_t pmap)
{
        if (pmap != kernel_pmap) {
                atomic_add_64(&pmap->pm_invgen, 1);
                atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
        }
        crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
        tsc_uclock_t tsc;

        tsc = rdtsc();
        if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
                info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
                return 1;
        }
        return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
        int p;
        int cpu = mycpu->gd_cpuid;

        /*
         * Don't kprintf() anything if the pmap inval watchdog gets hit.
         * DRM can cause an occasional watchdog hit (at least with a 1/16
         * second watchdog), and attempting to kprintf to the KVM frame buffer
         * from Xinvltlb, which ignores critical sections, can implode the
         * system.
         */
        if (pmap_inval_watchdog_print == 0)
                return;

        cpu_lfence();
#ifdef LOOPRECOVER
        atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
        kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
                "s=%08jx "
#endif
#ifdef LOOPMASK_IN
                "in=%08jx "
#endif
#ifdef LOOPRECOVER
                "smurf=%08jx\n"
#endif
                , msg, cpu, info->mode,
                info->mask.ary[0],
                info->done.ary[0]
#ifdef LOOPRECOVER
                , info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
                , smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
                , smp_smurf_mask.ary[0]
#endif
                );
        kprintf("mdglob ");
        for (p = 0; p < ncpus; ++p)
                kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
        kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)      _checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
        cpumask_t tmp;

        tmp = info->mask;
        CPUMASK_ANDMASK(tmp, info->sigmask);
        if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
                kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
                        file, line, info->sigmask.ary[0], info->mask.ary[0]);
        }
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
               pt_entry_t *ptep, pt_entry_t npte)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        pt_entry_t opte = 0;
        int cpu = gd->gd_cpuid;
        cpumask_t tmpmask;
        unsigned long rflags;

        /*
         * Initialize invalidation for pmap and enter critical section.
         */
        if (pmap == NULL)
                pmap = kernel_pmap;

        /*
         * Shortcut single-cpu case if possible.
         */
        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
            pmap_inval_force_nonopt == 0) {
                /*
                 * Convert to invltlb if there are too many pages to
                 * invlpg on.
                 */
                if (pmap->pm_flags & PMAP_MULTI)
                        pmap_inval_init(pmap);
                if (npgs == 1) {
                        if (ptep)
                                opte = atomic_swap_long(ptep, npte);
                        if (va == (vm_offset_t)-1)
                                cpu_invltlb();
                        else
                                cpu_invlpg((void *)va);
                } else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
                        if (ptep) {
                                while (npgs) {
                                        opte = atomic_swap_long(ptep, npte);
                                        ++ptep;
                                        --npgs;
                                }
                        }
                        cpu_invltlb();
                } else {
                        while (npgs) {
                                if (ptep) {
                                        opte = atomic_swap_long(ptep, npte);
                                        ++ptep;
                                }
                                cpu_invlpg((void *)va);
                                va += PAGE_SIZE;
                                --npgs;
                        }
                }
                if (pmap->pm_flags & PMAP_MULTI)
                        pmap_inval_done(pmap);

                /*
                 * Knock-on NVMM flush.
                 *
                 * NOTE: pmap_enter() also calls this function and requires
                 *       the old PTE be returned, so we can't place this TLB
                 *       callback at the beginning and return 0.
                 */
                if (__predict_false(pmap->pm_tlb_flush != NULL)) {
                        KKASSERT(pmap->pm_data != NULL);
                        pmap->pm_tlb_flush(pmap);
                }

                return opte;
        }

        /*
         * We need a critical section to prevent getting preempted while
         * we setup our command.  A preemption might execute its own
         * pmap_inval*() command and create confusion below.
         *
         * tsc_target is our watchdog timeout that will attempt to recover
         * from a lost IPI (see LOOPRECOVER_TIMEOUT1 above).
         */
        pmap_inval_init(pmap);
        info = &invinfo[cpu];

        /*
         * We must wait for other cpus which may still be finishing up a
         * prior operation that we requested.
         *
         * We do not have to disable interrupts here.  An Xinvltlb can occur
         * at any time (even within a critical section), but it will not
         * act on our command until we set our done bits.
         */
        while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
                if (loopwdog(info)) {
                        info->failed = 1;
                        loopdebug("A", info);
                        /* XXX recover from possible bug */
                        CPUMASK_ASSZERO(info->done);
                }
#endif
                cpu_pause();
        }
        KKASSERT(info->mode == INVDONE);
        cpu_mfence();

        /*
         * Must set our cpu in the invalidation scan mask before
         * any possibility of [partial] execution (remember, XINVLTLB
         * can interrupt a critical section).
         */
        ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

        info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
        info->va = va;
        info->npgs = npgs;
        info->ptep = ptep;
        info->npte = npte;
        info->opte = 0;
#ifdef LOOPRECOVER
        info->failed = 0;
#endif
        info->mode = INVSTORE;

        tmpmask = pmap->pm_active;      /* volatile (bits may be cleared) */
        if (pmap_inval_force_allcpus)
                tmpmask = smp_active_mask;
        cpu_ccfence();
        CPUMASK_ANDMASK(tmpmask, smp_active_mask);

        /*
         * If ptep is NULL the operation can be semi-synchronous, which means
         * we can improve performance by flagging and removing idle cpus
         * (see the idleinvlclr function in mp_machdep.c).
         *
         * Kernel page table operations are typically semi-synchronous.
         */
        if (ptep == NULL)
                smp_smurf_idleinvlclr(&tmpmask);
        CPUMASK_ORBIT(tmpmask, cpu);
        info->mask = tmpmask;

        /*
         * The command may start executing the moment 'done' is initialized,
         * so disable interrupts on the current cpu to prevent the 'done'
         * field from changing (other cpus can't clear done bits until the
         * originating cpu clears its mask bit, but other cpus CAN start
         * clearing their mask bits).
         */
#ifdef LOOPRECOVER
        info->sigmask = tmpmask;
        CHECKSIGMASK(info);
#endif
        cpu_sfence();
        rflags = read_rflags();
        cpu_disable_intr();

        ATOMIC_CPUMASK_COPY(info->done, tmpmask);
        /* execution can begin here on other cpus due to races */

        /*
         * Pass our copy of the done bits (so they don't change out from
         * under us) to generate the Xinvltlb interrupt on the targets.
         *
         * smp_invlpg() issues the command, synchronizes with other cpus,
         * and executes the command on our cpu.  Upon return other cpus
         * may still be in the process of exiting their synchronization.
         */
        smp_invlpg(&tmpmask);
        opte = info->opte;
        KKASSERT(info->mode == INVDONE);

        /*
         * Target cpus will be in their loop exiting concurrently with our
         * cleanup.  They will not lose the bitmask they obtained before so
         * we can safely clear this bit.
         */
        ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
        write_rflags(rflags);
        pmap_inval_done(pmap);

        /* Knock-on NVMM flush. */
        if (__predict_false(pmap->pm_tlb_flush != NULL)) {
                KKASSERT(pmap->pm_data != NULL);
                pmap->pm_tlb_flush(pmap);
        }

        return opte;
}
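
/*
 * Illustrative usage sketch for pmap_inval_smp() (hypothetical caller, not
 * part of this file; the variable names are placeholders).  It stores npte
 * into *ptep and invalidates va on every cpu the pmap is active on,
 * returning the previous pte so the caller can inspect it:
 *
 *      pt_entry_t opte;
 *
 *      opte = pmap_inval_smp(pmap, va, 1, ptep, 0);
 *      // opte now holds the prior contents of *ptep
 *
 * Passing va == (vm_offset_t)-1, or more than MAX_INVAL_PAGES pages, causes
 * the targets to fall back to a full cpu_invltlb() instead of per-page
 * invlpg (see pmap_inval_intr() below).
 */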

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
                      pt_entry_t opte, pt_entry_t npte)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        int success;
        int cpu = gd->gd_cpuid;
        cpumask_t tmpmask;
        unsigned long rflags;

        /*
         * Initialize invalidation for pmap and enter critical section.
         */
        if (pmap == NULL)
                pmap = kernel_pmap;

        /*
         * Shortcut single-cpu case if possible.
         */
        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
            pmap_inval_force_nonopt == 0) {
                if (pmap->pm_flags & PMAP_MULTI)
                        pmap_inval_init(pmap);
                if (atomic_cmpset_long(ptep, opte, npte)) {
                        if (va == (vm_offset_t)-1)
                                cpu_invltlb();
                        else
                                cpu_invlpg((void *)va);
                        if (pmap->pm_flags & PMAP_MULTI)
                                pmap_inval_done(pmap);
                        return 1;
                } else {
                        if (pmap->pm_flags & PMAP_MULTI)
                                pmap_inval_done(pmap);
                        return 0;
                }
        }

        /*
         * We need a critical section to prevent getting preempted while
         * we setup our command.  A preemption might execute its own
         * pmap_inval*() command and create confusion below.
         */
        pmap_inval_init(pmap);
        info = &invinfo[cpu];

        /*
         * We must wait for other cpus which may still be finishing
         * up a prior operation.
         */
        while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
                if (loopwdog(info)) {
                        info->failed = 1;
                        loopdebug("B", info);
                        /* XXX recover from possible bug */
                        CPUMASK_ASSZERO(info->done);
                }
#endif
                cpu_pause();
        }
        KKASSERT(info->mode == INVDONE);
        cpu_mfence();

        /*
         * Must set our cpu in the invalidation scan mask before
         * any possibility of [partial] execution (remember, XINVLTLB
         * can interrupt a critical section).
         */
        ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

        info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
        info->va = va;
        info->npgs = 1;                 /* unused */
        info->ptep = ptep;
        info->npte = npte;
        info->opte = opte;
#ifdef LOOPRECOVER
        info->failed = 0;
#endif
        info->mode = INVCMPSET;
        info->success = 0;

        tmpmask = pmap->pm_active;      /* volatile */
        if (pmap_inval_force_allcpus)
                tmpmask = smp_active_mask;
        cpu_ccfence();
        CPUMASK_ANDMASK(tmpmask, smp_active_mask);
        CPUMASK_ORBIT(tmpmask, cpu);
        info->mask = tmpmask;

        /*
         * The command may start executing the moment 'done' is initialized,
         * so disable interrupts on the current cpu to prevent the 'done'
         * field from changing (other cpus can't clear done bits until the
         * originating cpu clears its mask bit).
         */
#ifdef LOOPRECOVER
        info->sigmask = tmpmask;
        CHECKSIGMASK(info);
#endif
        cpu_sfence();
        rflags = read_rflags();
        cpu_disable_intr();

        ATOMIC_CPUMASK_COPY(info->done, tmpmask);

        /*
         * Pass our copy of the done bits (so they don't change out from
         * under us) to generate the Xinvltlb interrupt on the targets.
         */
        smp_invlpg(&tmpmask);
        success = info->success;
        KKASSERT(info->mode == INVDONE);

        ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
        write_rflags(rflags);
        pmap_inval_done(pmap);

        return success;
}
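
/*
 * Illustrative retry pattern for pmap_inval_smp_cmpset() (hypothetical
 * caller, shown only to clarify the return value; set_bits/clear_bits are
 * placeholders, not symbols defined in this file):
 *
 *      for (;;) {
 *              opte = *ptep;
 *              cpu_ccfence();
 *              npte = (opte & ~clear_bits) | set_bits;
 *              if (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte))
 *                      break;
 *              // another cpu changed *ptep; recompute npte and retry
 *      }
 */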

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
        bulk->pmap = pmap;
        bulk->va_beg = 0;
        bulk->va_end = 0;
        bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
                pt_entry_t *ptep, pt_entry_t npte)
{
        pt_entry_t pte;

        /*
         * Degenerate case: the operation is localized, we don't care
         * (e.g. because we are jacking the entire page table), or the
         * pmap is not in use by anyone.  No invalidations are done on
         * any cpu.
         */
        if (bulk == NULL) {
                pte = atomic_swap_long(ptep, npte);
                return pte;
        }

        /*
         * If it isn't the kernel pmap we execute the operation synchronously
         * on all cpus belonging to the pmap, which avoids concurrency bugs in
         * the hw related to changing pte's out from under threads.
         *
         * Eventually I would like to implement streaming pmap invalidation
         * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
         * threaded programs.
         */
        if (bulk->pmap != kernel_pmap) {
                pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
                return pte;
        }

        /*
         * This is the kernel_pmap.  All unmap operations presume that there
         * are no other cpus accessing the addresses in question.  Implement
         * the bulking algorithm: collect the required information and
         * synchronize once at the end.
         */
        pte = atomic_swap_long(ptep, npte);
        if (va == (vm_offset_t)-1) {
                bulk->va_beg = va;
        } else if (bulk->va_beg == bulk->va_end) {
                bulk->va_beg = va;
                bulk->va_end = va + PAGE_SIZE;
        } else if (va == bulk->va_end) {
                bulk->va_end = va + PAGE_SIZE;
        } else {
                bulk->va_beg = (vm_offset_t)-1;
                bulk->va_end = 0;
#if 0
                pmap_inval_bulk_flush(bulk);
                bulk->count = 1;
                if (va == (vm_offset_t)-1) {
                        bulk->va_beg = va;
                        bulk->va_end = 0;
                } else {
                        bulk->va_beg = va;
                        bulk->va_end = va + PAGE_SIZE;
                }
#endif
        }
        ++bulk->count;

        return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
        if (bulk == NULL)
                return;
        if (bulk->va_beg != bulk->va_end) {
                if (bulk->va_beg == (vm_offset_t)-1) {
                        pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
                } else {
                        vm_pindex_t n;

                        n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
                        pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
                }
        }
        bulk->va_beg = 0;
        bulk->va_end = 0;
        bulk->count = 0;
}
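
/*
 * Illustrative bulk usage (hypothetical caller; sva/eva/ptep are
 * placeholders).  A kernel_pmap unmap typically wraps a run of pte
 * removals so only one IPI round is needed at the end:
 *
 *      pmap_inval_bulk_t bulk;
 *
 *      pmap_inval_bulk_init(&bulk, kernel_pmap);
 *      for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *              pmap_inval_bulk(&bulk, va, ptep, 0);
 *      pmap_inval_bulk_flush(&bulk);
 *
 * Non-contiguous va's collapse the accumulated range to a full
 * invltlb-style flush, per the else case in pmap_inval_bulk() above.
 */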

/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        int loopme = 0;
        int cpu;
        cpumask_t cpumask;

        /*
         * Check all cpus for invalidations we may need to service.
         */
        cpu_ccfence();
        cpu = gd->gd_cpuid;
        cpumask = *cpumaskp;

        while (CPUMASK_TESTNZERO(cpumask)) {
                int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
                KKASSERT(n >= 0 && n < MAXCPU);
#endif

                CPUMASK_NANDBIT(cpumask, n);
                info = &invinfo[n];

                /*
                 * Check our cpu (cpu) for work in the target cpu's info (n).
                 *
                 * if (n == cpu) - check our cpu for a master operation
                 * if (n != cpu) - check other cpus for a slave operation
                 *
                 * Due to interrupts/races we can catch a new operation
                 * in an older interrupt on other cpus.
                 *
                 * A fence is needed once we detect the (not) done bit.
                 */
                if (!CPUMASK_TESTBIT(info->done, cpu))
                        continue;
                cpu_lfence();
#ifdef LOOPRECOVER
                if (toolong) {
                        kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
                                cpu, n, info->done.ary[0], info->mask.ary[0],
                                info->mode);
                }
#endif

                /*
                 * info->mask and info->done always contain the originating
                 * cpu until the originator is done.  Targets may still be
                 * present in info->done after the originator is done (they
                 * will be finishing up their loops).
                 *
                 * Clear info->mask bits on other cpus to indicate that they
                 * have quiesced (entered the loop).  Once the other mask bits
                 * are clear we can execute the operation on the originator,
                 * then clear the mask and done bits on the originator.  The
                 * targets will then finish up their side and clear their
                 * done bits.
                 *
                 * The command is considered 100% done when all done bits have
                 * been cleared.
                 */
                if (n != cpu) {
                        /*
                         * Command state machine for 'other' cpus.
                         */
                        if (CPUMASK_TESTBIT(info->mask, cpu)) {
                                /*
                                 * Other cpus indicate to the originator that
                                 * they are quiesced.
                                 */
                                ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                loopme = 1;
                        } else if (info->ptep &&
                                   CPUMASK_TESTBIT(info->mask, n)) {
                                /*
                                 * Other cpu must wait for the originator (n)
                                 * to complete its command if ptep is not NULL.
                                 */
                                loopme = 1;
                        } else {
                                /*
                                 * Other cpu detects that the originator has
                                 * completed its command, or there was no
                                 * command.
                                 *
                                 * Now that the page table entry has changed,
                                 * we can follow up with our own invalidation.
                                 */
                                vm_offset_t va = info->va;
                                vm_pindex_t npgs;

                                if (va == (vm_offset_t)-1 ||
                                    info->npgs > MAX_INVAL_PAGES) {
                                        cpu_invltlb();
                                } else {
                                        for (npgs = info->npgs; npgs; --npgs) {
                                                cpu_invlpg((void *)va);
                                                va += PAGE_SIZE;
                                        }
                                }
                                ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
                                /* info invalid now */
                                /* loopme left alone */
                        }
                } else if (CPUMASK_TESTBIT(info->mask, cpu)) {
                        /*
                         * Originator is waiting for other cpus
                         */
                        if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
                                /*
                                 * Originator waits for other cpus to enter
                                 * their loop (aka quiesce).
                                 *
                                 * If this bugs out the IPI may have been lost,
                                 * try to reissue by resetting our own
                                 * reentrancy bit and clearing the smurf mask
                                 * for the cpus that did not respond, then
                                 * reissuing the IPI.
                                 */
                                loopme = 1;
#ifdef LOOPRECOVER
                                if (loopwdog(info)) {
                                        info->failed = 1;
                                        loopdebug("C", info);
                                        /* XXX recover from possible bug */
                                        cpu_disable_intr();
                                        ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
                                                                info->mask);
                                        smp_invlpg(&smp_active_mask);

                                        /*
                                         * Force outer-loop retest of Xinvltlb
                                         * requests (see mp_machdep.c).
                                         */
                                        cpu_enable_intr();
                                }
#endif
                        } else {
                                /*
                                 * Originator executes operation and clears
                                 * mask to allow other cpus to finish.
                                 */
                                KKASSERT(info->mode != INVDONE);
                                if (info->mode == INVSTORE) {
                                        if (info->ptep)
                                                info->opte = atomic_swap_long(info->ptep, info->npte);
                                        CHECKSIGMASK(info);
                                        ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                        CHECKSIGMASK(info);
                                } else {
                                        if (atomic_cmpset_long(info->ptep,
                                                    info->opte, info->npte)) {
                                                info->success = 1;
                                        } else {
                                                info->success = 0;
                                        }
                                        CHECKSIGMASK(info);
                                        ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                        CHECKSIGMASK(info);
                                }
                                loopme = 1;
                        }
                } else {
                        /*
                         * Originator does not have to wait for the other
                         * cpus to finish.  It clears its done bit.  A new
                         * command will not be initiated by the originator
                         * until the other cpus have cleared their done bits
                         * (asynchronously).
                         */
                        vm_offset_t va = info->va;
                        vm_pindex_t npgs;

                        if (va == (vm_offset_t)-1 ||
                            info->npgs > MAX_INVAL_PAGES) {
                                cpu_invltlb();
                        } else {
                                for (npgs = info->npgs; npgs; --npgs) {
                                        cpu_invlpg((void *)va);
                                        va += PAGE_SIZE;
                                }
                        }

                        /* leave loopme alone */
                        /* other cpus may still be finishing up */
                        /* can't race originator since that's us */
                        info->mode = INVDONE;
                        ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
                }
        }
        return loopme;
}
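
/*
 * Summary of the handshake implemented above, consolidated from the
 * comments in pmap_inval_smp() and pmap_inval_intr() (descriptive only,
 * not normative):
 *
 *      1. The originator fills in invinfo[cpu], sets mask and done to the
 *         target set (including itself), and IPIs the targets.
 *      2. Each target clears its bit in info->mask to signal that it has
 *         quiesced, then (if ptep != NULL) waits for the originator to run.
 *      3. Once only the originator's mask bit remains, the originator
 *         performs the pte swap or cmpset and clears its mask bit.
 *      4. Targets then perform their local invlpg/invltlb and clear their
 *         done bits; the originator invalidates locally, sets INVDONE, and
 *         clears its own done bit.  The command is fully complete when
 *         info->done reaches zero.
 */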