/*
 * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code. Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap. In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests. Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER		/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons. First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
        vm_offset_t	va;
        pt_entry_t	*ptep;
        pt_entry_t	opte;
        pt_entry_t	npte;
        enum { INVDONE, INVSTORE, INVCMPSET } mode;
        int		success;
        vm_pindex_t	npgs;
        cpumask_t	done;
        cpumask_t	mask;
#ifdef LOOPRECOVER
        cpumask_t	sigmask;
        int		failed;
        tsc_uclock_t	tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t invinfo[MAXCPU];
extern cpumask_t smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t smp_in_mask;
#endif
extern cpumask_t smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
            &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
            &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
            &pmap_inval_force_nonopt, 0, "");
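
/*
 * Illustrative note (not from the original sources): the knobs above are
 * ordinary run-time sysctls, so e.g. the watchdog printout can be enabled
 * from userland without rebuilding the kernel:
 *
 *	sysctl machdep.pmap_inval_watchdog_print=1
 *
 * The force_allcpus / force_nonopt knobs defeat the optimized single-cpu
 * paths below and are presumably only useful when debugging this code.
 */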

static void
pmap_inval_init(pmap_t pmap)
{
        cpulock_t olock;
        cpulock_t nlock;

        crit_enter_id("inval");

        if (pmap != &kernel_pmap) {
                for (;;) {
                        olock = pmap->pm_active_lock;
                        cpu_ccfence();
                        nlock = olock | CPULOCK_EXCL;
                        if (olock != nlock &&
                            atomic_cmpset_int(&pmap->pm_active_lock,
                                              olock, nlock)) {
                                break;
                        }
                        lwkt_process_ipiq();
                        cpu_pause();
                }
                atomic_add_acq_long(&pmap->pm_invgen, 1);
        }
}

static void
pmap_inval_done(pmap_t pmap)
{
        if (pmap != &kernel_pmap) {
                atomic_add_acq_long(&pmap->pm_invgen, 1);
                atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
        }
        crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
        tsc_uclock_t tsc;

        tsc = rdtsc();
        if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
                info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
                return 1;
        }
        return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
        int p;
        int cpu = mycpu->gd_cpuid;

        /*
         * Don't kprintf() anything if the pmap inval watchdog gets hit.
         * DRM can cause an occasional watchdog hit (at least with a 1/16
         * second watchdog), and attempting to kprintf to the KVM frame buffer
         * from Xinvltlb, which ignores critical sections, can implode the
         * system.
         */
        if (pmap_inval_watchdog_print == 0)
                return;

        cpu_lfence();
#ifdef LOOPRECOVER
        atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
        kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
                "s=%08jx "
#endif
#ifdef LOOPMASK_IN
                "in=%08jx "
#endif
#ifdef LOOPRECOVER
                "smurf=%08jx\n"
#endif
                , msg, cpu, info->mode,
                info->mask.ary[0],
                info->done.ary[0]
#ifdef LOOPRECOVER
                , info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
                , smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
                , smp_smurf_mask.ary[0]
#endif
                );
        kprintf("mdglob ");
        for (p = 0; p < ncpus; ++p)
                kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
        kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
        cpumask_t tmp;

        tmp = info->mask;
        CPUMASK_ANDMASK(tmp, info->sigmask);
        if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
                kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
                        file, line, info->sigmask.ary[0], info->mask.ary[0]);
        }
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg(). The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
               pt_entry_t *ptep, pt_entry_t npte)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        pt_entry_t opte = 0;
        int cpu = gd->gd_cpuid;
        cpumask_t tmpmask;
        unsigned long rflags;

        /*
         * Initialize invalidation for the pmap; this enters a critical
         * section for us.
         */
        if (pmap == NULL)
                pmap = &kernel_pmap;
        pmap_inval_init(pmap);

        /*
         * Shortcut single-cpu case if possible.
         */
        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
            pmap_inval_force_nonopt == 0) {
                /*
                 * Convert to invltlb if there are too many pages to
                 * invlpg on.
                 */
                if (npgs == 1) {
                        if (ptep)
                                opte = atomic_swap_long(ptep, npte);
                        if (va == (vm_offset_t)-1)
                                cpu_invltlb();
                        else
                                cpu_invlpg((void *)va);
                } else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
                        if (ptep) {
                                while (npgs) {
                                        opte = atomic_swap_long(ptep, npte);
                                        ++ptep;
                                        --npgs;
                                }
                        }
                        cpu_invltlb();
                } else {
                        while (npgs) {
                                if (ptep) {
                                        opte = atomic_swap_long(ptep, npte);
                                        ++ptep;
                                }
                                cpu_invlpg((void *)va);
                                va += PAGE_SIZE;
                                --npgs;
                        }
                }
                pmap_inval_done(pmap);

                return opte;
        }

        /*
         * We need a critical section to prevent getting preempted while
         * we set up our command. A preemption might execute its own
         * pmap_inval*() command and create confusion below.
         *
         * tsc_target is our watchdog timeout that will attempt to recover
         * from a lost IPI, set LOOPRECOVER_TIMEOUT1 (2 seconds) ahead.
         */
        info = &invinfo[cpu];

        /*
         * We must wait for other cpus which may still be finishing up a
         * prior operation that we requested.
         *
         * We do not have to disable interrupts here. An Xinvltlb can occur
         * at any time (even within a critical section), but it will not
         * act on our command until we set our done bits.
         */
        while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
                if (loopwdog(info)) {
                        info->failed = 1;
                        loopdebug("A", info);
                        /* XXX recover from possible bug */
                        CPUMASK_ASSZERO(info->done);
                }
#endif
                cpu_pause();
        }
        KKASSERT(info->mode == INVDONE);
        cpu_mfence();

        /*
         * Must set our cpu in the invalidation scan mask before
         * any possibility of [partial] execution (remember, XINVLTLB
         * can interrupt a critical section).
         */
        ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

        info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
        info->va = va;
        info->npgs = npgs;
        info->ptep = ptep;
        info->npte = npte;
        info->opte = 0;
#ifdef LOOPRECOVER
        info->failed = 0;
#endif
        info->mode = INVSTORE;

        tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
        if (pmap_inval_force_allcpus)
                tmpmask = smp_active_mask;
        cpu_ccfence();
        CPUMASK_ANDMASK(tmpmask, smp_active_mask);

        /*
         * If ptep is NULL the operation can be semi-synchronous, which means
         * we can improve performance by flagging and removing idle cpus
         * (see the idleinvlclr function in mp_machdep.c).
         *
         * Typically kernel page table operation is semi-synchronous.
         */
        if (ptep == NULL)
                smp_smurf_idleinvlclr(&tmpmask);
        CPUMASK_ORBIT(tmpmask, cpu);
        info->mask = tmpmask;

        /*
         * The command may start executing the moment 'done' is initialized,
         * so disable interrupts on the current cpu to prevent the 'done'
         * field from changing (other cpus can't clear done bits until the
         * originating cpu clears its mask bit, but other cpus CAN start
         * clearing their mask bits).
         */
#ifdef LOOPRECOVER
        info->sigmask = tmpmask;
        CHECKSIGMASK(info);
#endif
        cpu_sfence();
        rflags = read_rflags();
        cpu_disable_intr();

        ATOMIC_CPUMASK_COPY(info->done, tmpmask);
        /* execution can begin here on other cpus due to races */

        /*
         * Pass our copy of the done bits (so they don't change out from
         * under us) to generate the Xinvltlb interrupt on the targets.
         */
        smp_invlpg(&tmpmask);
        opte = info->opte;
        KKASSERT(info->mode == INVDONE);

        /*
         * Target cpus will be in their loop exiting concurrently with our
         * cleanup. They will not lose the bitmask they obtained before so
         * we can safely clear this bit.
         */
        ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
        write_rflags(rflags);
        pmap_inval_done(pmap);

        return opte;
}
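
/*
 * Illustrative calls (a sketch, not taken from any particular caller;
 * 'pmap', 'va', 'ptep' and 'npte' stand for values set up by the caller
 * as described above):
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 *		Fully synchronous: stores npte into *ptep, invalidates va
 *		on every cpu the pmap is active on, and returns the old pte.
 *
 *	pmap_inval_smp(pmap, (vm_offset_t)-1, 1, NULL, 0);
 *		Semi-synchronous: no pte is modified, the TLBs are simply
 *		flushed (invltlb) on the pmap's active cpus.
 */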

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
                      pt_entry_t opte, pt_entry_t npte)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        int success;
        int cpu = gd->gd_cpuid;
        cpumask_t tmpmask;
        unsigned long rflags;

        /*
         * Initialize invalidation for the pmap and enter a critical section.
         */
        if (pmap == NULL)
                pmap = &kernel_pmap;
        pmap_inval_init(pmap);

        /*
         * Shortcut single-cpu case if possible.
         */
        if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
            pmap_inval_force_nonopt == 0) {
                if (atomic_cmpset_long(ptep, opte, npte)) {
                        if (va == (vm_offset_t)-1)
                                cpu_invltlb();
                        else
                                cpu_invlpg((void *)va);
                        pmap_inval_done(pmap);
                        return 1;
                } else {
                        pmap_inval_done(pmap);
                        return 0;
                }
        }

        /*
         * We need a critical section to prevent getting preempted while
         * we set up our command. A preemption might execute its own
         * pmap_inval*() command and create confusion below.
         */
        info = &invinfo[cpu];

        /*
         * We must wait for other cpus which may still be finishing
         * up a prior operation.
         */
        while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
                if (loopwdog(info)) {
                        info->failed = 1;
                        loopdebug("B", info);
                        /* XXX recover from possible bug */
                        CPUMASK_ASSZERO(info->done);
                }
#endif
                cpu_pause();
        }
        KKASSERT(info->mode == INVDONE);
        cpu_mfence();

        /*
         * Must set our cpu in the invalidation scan mask before
         * any possibility of [partial] execution (remember, XINVLTLB
         * can interrupt a critical section).
         */
        ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

        info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
        info->va = va;
        info->npgs = 1;			/* unused */
        info->ptep = ptep;
        info->npte = npte;
        info->opte = opte;
#ifdef LOOPRECOVER
        info->failed = 0;
#endif
        info->mode = INVCMPSET;
        info->success = 0;

        tmpmask = pmap->pm_active;	/* volatile */
        if (pmap_inval_force_allcpus)
                tmpmask = smp_active_mask;
        cpu_ccfence();
        CPUMASK_ANDMASK(tmpmask, smp_active_mask);
        CPUMASK_ORBIT(tmpmask, cpu);
        info->mask = tmpmask;

        /*
         * The command may start executing the moment 'done' is initialized,
         * so disable interrupts on the current cpu to prevent the 'done'
         * field from changing (other cpus can't clear done bits until the
         * originating cpu clears its mask bit).
         */
#ifdef LOOPRECOVER
        info->sigmask = tmpmask;
        CHECKSIGMASK(info);
#endif
        cpu_sfence();
        rflags = read_rflags();
        cpu_disable_intr();

        ATOMIC_CPUMASK_COPY(info->done, tmpmask);

        /*
         * Pass our copy of the done bits (so they don't change out from
         * under us) to generate the Xinvltlb interrupt on the targets.
         */
        smp_invlpg(&tmpmask);
        success = info->success;
        KKASSERT(info->mode == INVDONE);

        ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
        write_rflags(rflags);
        pmap_inval_done(pmap);

        return success;
}
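
/*
 * Bulk invalidation API. The expected call pattern below is a sketch of
 * the semantics implemented by the three functions that follow; it is not
 * copied from an actual caller:
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, pmap);
 *	for (each pte being replaced)
 *		opte = pmap_inval_bulk(&bulk, va, ptep, npte);
 *	pmap_inval_bulk_flush(&bulk);
 *
 * For user pmaps each pmap_inval_bulk() call invalidates immediately.
 * For the kernel_pmap the ptes are swapped immediately but the TLB
 * invalidation is deferred to the flush, which issues one ranged
 * invalidation (or a full invltlb if the accumulated range was not
 * contiguous).
 */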

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
        bulk->pmap = pmap;
        bulk->va_beg = 0;
        bulk->va_end = 0;
        bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
                pt_entry_t *ptep, pt_entry_t npte)
{
        pt_entry_t pte;

        /*
         * Degenerate case, localized or we don't care (e.g. because we
         * are jacking the entire page table) or the pmap is not in-use
         * by anyone. No invalidations are done on any cpu.
         */
        if (bulk == NULL) {
                pte = atomic_swap_long(ptep, npte);
                return pte;
        }

        /*
         * If it isn't the kernel pmap we execute the operation synchronously
         * on all cpus belonging to the pmap, which avoids concurrency bugs in
         * the hw related to changing pte's out from under threads.
         *
         * Eventually I would like to implement streaming pmap invalidation
         * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
         * threaded programs.
         */
        if (bulk->pmap != &kernel_pmap) {
                pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
                return pte;
        }

        /*
         * This is the kernel_pmap. All unmap operations presume that there
         * are no other cpus accessing the addresses in question. Implement
         * the bulking algorithm: collect the required information and
         * synchronize once at the end.
         */
        pte = atomic_swap_long(ptep, npte);
        if (va == (vm_offset_t)-1) {
                bulk->va_beg = va;
        } else if (bulk->va_beg == bulk->va_end) {
                bulk->va_beg = va;
                bulk->va_end = va + PAGE_SIZE;
        } else if (va == bulk->va_end) {
                bulk->va_end = va + PAGE_SIZE;
        } else {
                bulk->va_beg = (vm_offset_t)-1;
                bulk->va_end = 0;
#if 0
                pmap_inval_bulk_flush(bulk);
                bulk->count = 1;
                if (va == (vm_offset_t)-1) {
                        bulk->va_beg = va;
                        bulk->va_end = 0;
                } else {
                        bulk->va_beg = va;
                        bulk->va_end = va + PAGE_SIZE;
                }
#endif
        }
        ++bulk->count;

        return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
        if (bulk == NULL)
                return;
        if (bulk->va_beg != bulk->va_end) {
                if (bulk->va_beg == (vm_offset_t)-1) {
                        pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
                } else {
                        vm_pindex_t n;

                        n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
                        pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
                }
        }
        bulk->va_beg = 0;
        bulk->va_end = 0;
        bulk->count = 0;
}
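
/*
 * Summary of the Xinvltlb handshake serviced by pmap_inval_intr() below
 * (a condensed restatement of the comments inside the function):
 *
 *	originator: publishes the command in invinfo[] and sets the
 *		    mask/done bits, then IPIs the targets
 *	targets:    clear their bit in info->mask (quiesce)
 *	originator: performs the pte operation, clears its mask bit
 *	targets:    invalidate their TLBs, clear their done bits
 *	originator: invalidates its own TLB, sets mode = INVDONE and
 *		    clears its done bit
 *
 * The command is fully retired once all done bits are clear.
 */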

/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
        globaldata_t gd = mycpu;
        pmap_inval_info_t *info;
        int loopme = 0;
        int cpu;
        cpumask_t cpumask;

        /*
         * Check all cpus for invalidations we may need to service.
         */
        cpu_ccfence();
        cpu = gd->gd_cpuid;
        cpumask = *cpumaskp;

        while (CPUMASK_TESTNZERO(cpumask)) {
                int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
                KKASSERT(n >= 0 && n < MAXCPU);
#endif

                CPUMASK_NANDBIT(cpumask, n);
                info = &invinfo[n];

                /*
                 * Check the target cpu's info (n) for work to do on this
                 * cpu (cpu):
                 *
                 * if (n == cpu) - check our cpu for a master operation
                 * if (n != cpu) - check other cpus for a slave operation
                 *
                 * Due to interrupts/races we can catch a new operation
                 * in an older interrupt in other cpus.
                 *
                 * A fence is needed once we detect the (not) done bit.
                 */
                if (!CPUMASK_TESTBIT(info->done, cpu))
                        continue;
                cpu_lfence();
#ifdef LOOPRECOVER
                if (toolong) {
                        kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
                                cpu, n, info->done.ary[0], info->mask.ary[0],
                                info->mode);
                }
#endif

                /*
                 * info->mask and info->done always contain the originating
                 * cpu until the originator is done. Targets may still be
                 * present in info->done after the originator is done (they
                 * will be finishing up their loops).
                 *
                 * Clear info->mask bits on other cpus to indicate that they
                 * have quiesced (entered the loop). Once the other mask bits
                 * are clear we can execute the operation on the original,
                 * then clear the mask and done bits on the originator. The
                 * targets will then finish up their side and clear their
                 * done bits.
                 *
                 * The command is considered 100% done when all done bits have
                 * been cleared.
                 */
                if (n != cpu) {
                        /*
                         * Command state machine for 'other' cpus.
                         */
                        if (CPUMASK_TESTBIT(info->mask, cpu)) {
                                /*
                                 * Other cpus indicate to originator that they
                                 * are quiesced.
                                 */
                                ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                loopme = 1;
                        } else if (info->ptep &&
                                   CPUMASK_TESTBIT(info->mask, n)) {
                                /*
                                 * Other cpu must wait for the originator (n)
                                 * to complete its command if ptep is not NULL.
                                 */
                                loopme = 1;
                        } else {
                                /*
                                 * Other cpu detects that the originator has
                                 * completed its command, or there was no
                                 * command.
                                 *
                                 * Now that the page table entry has changed,
                                 * we can follow up with our own invalidation.
                                 */
                                vm_offset_t va = info->va;
                                vm_pindex_t npgs;

                                if (va == (vm_offset_t)-1 ||
                                    info->npgs > MAX_INVAL_PAGES) {
                                        cpu_invltlb();
                                } else {
                                        for (npgs = info->npgs; npgs; --npgs) {
                                                cpu_invlpg((void *)va);
                                                va += PAGE_SIZE;
                                        }
                                }
                                ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
                                /* info invalid now */
                                /* loopme left alone */
                        }
                } else if (CPUMASK_TESTBIT(info->mask, cpu)) {
                        /*
                         * Originator is waiting for other cpus.
                         */
                        if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
                                /*
                                 * Originator waits for other cpus to enter
                                 * their loop (aka quiesce).
                                 *
                                 * If this bugs out the IPI may have been
                                 * lost; try to recover by resetting our own
                                 * reentrancy bit, clearing the smurf mask
                                 * for the cpus that did not respond, and
                                 * reissuing the IPI.
                                 */
                                loopme = 1;
#ifdef LOOPRECOVER
                                if (loopwdog(info)) {
                                        info->failed = 1;
                                        loopdebug("C", info);
                                        /* XXX recover from possible bug */
                                        cpu_disable_intr();
                                        ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
                                                                info->mask);
                                        smp_invlpg(&smp_active_mask);

                                        /*
                                         * Force outer-loop retest of Xinvltlb
                                         * requests (see mp_machdep.c).
                                         */
                                        cpu_enable_intr();
                                }
#endif
                        } else {
                                /*
                                 * Originator executes operation and clears
                                 * mask to allow other cpus to finish.
                                 */
                                KKASSERT(info->mode != INVDONE);
                                if (info->mode == INVSTORE) {
                                        if (info->ptep)
                                                info->opte = atomic_swap_long(info->ptep, info->npte);
                                        CHECKSIGMASK(info);
                                        ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                        CHECKSIGMASK(info);
                                } else {
                                        if (atomic_cmpset_long(info->ptep,
                                                               info->opte, info->npte)) {
                                                info->success = 1;
                                        } else {
                                                info->success = 0;
                                        }
                                        CHECKSIGMASK(info);
                                        ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
                                        CHECKSIGMASK(info);
                                }
                                loopme = 1;
                        }
                } else {
                        /*
                         * Originator does not have to wait for the other
                         * cpus to finish. It clears its done bit. A new
                         * command will not be initiated by the originator
                         * until the other cpus have cleared their done bits
                         * (asynchronously).
                         */
                        vm_offset_t va = info->va;
                        vm_pindex_t npgs;

                        if (va == (vm_offset_t)-1 ||
                            info->npgs > MAX_INVAL_PAGES) {
                                cpu_invltlb();
                        } else {
                                for (npgs = info->npgs; npgs; --npgs) {
                                        cpu_invlpg((void *)va);
                                        va += PAGE_SIZE;
                                }
                        }

                        /* leave loopme alone */
                        /* other cpus may still be finishing up */
                        /* can't race originator since that's us */
                        info->mode = INVDONE;
                        ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
                }
        }
        return loopme;
}