/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER			/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

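/*
 * Per-cpu invalidation command block.  An originating cpu publishes the
 * operation it wants performed (va, npgs, ptep, npte, and for the cmpset
 * variant opte) in its own invinfo[] entry, with 'mode' selecting the
 * operation.  The 'mask' and 'done' cpumasks implement the handshake
 * with the target cpus; see pmap_inval_intr() below for the state
 * machine that consumes these fields.
 */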
struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	vm_pindex_t	npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPRECOVER
	cpumask_t	sigmask;
	int		failed;
	tsc_uclock_t	tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");

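/*
 * pmap_inval_init() and pmap_inval_done() bracket an invalidation
 * sequence.  For anything other than the kernel_pmap the init code
 * acquires CPULOCK_EXCL on pm_active_lock, processing pending IPIs
 * while it spins, and bumps pm_invgen; the done code bumps pm_invgen
 * again and releases the lock.  A critical section ('inval') is held
 * for the duration.
 */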
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_add_acq_long(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
	tsc_uclock_t tsc;

	tsc = rdtsc();
	if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * Don't kprintf() anything if the pmap inval watchdog gets hit.
	 * DRM can cause an occasional watchdog hit (at least with a 1/16
	 * second watchdog), and attempting to kprintf to the KVM frame buffer
	 * from Xinvltlb, which ignores critical sections, can implode the
	 * system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	cpu_lfence();
#ifdef LOOPRECOVER
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPRECOVER
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPRECOVER
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
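/*
 * Illustrative usage only (sketch, not taken from this file):
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 *		- replace one pte and return the old contents, fully
 *		  synchronized against the cpus in pmap->pm_active.
 *
 *	pmap_inval_smp(pmap, va, npgs, NULL, 0);
 *		- invalidation-only, semi-synchronous pass over npgs pages
 *		  (this is the form pmap_inval_bulk_flush() below uses).
 */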
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI (see LOOPRECOVER_TIMEOUT1).
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
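/*
 * Illustrative retry pattern (sketch only, not taken from this file):
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *		npte = opte | ...;	(compute the new pte from opte)
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */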
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 1;
		} else {
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}

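/*
 * Bulk invalidation API.  A typical caller batches a run of kernel_pmap
 * pte replacements and synchronizes once at the end, roughly (sketch
 * only, not taken from this file):
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	for (each page) {
 *		opte = pmap_inval_bulk(&bulk, va, ptep, npte);
 *		...
 *	}
 *	pmap_inval_bulk_flush(&bulk);
 *
 * For user pmaps pmap_inval_bulk() falls through to a synchronous
 * pmap_inval_smp() on each call instead of batching.
 */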
void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			vm_pindex_t n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
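/*
 * A non-zero return value asks the caller to poll again: some
 * invalidation operation that this cpu originated or is participating
 * in has not completed yet.
 */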
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Check our cpu (cpu) for work in the target cpu's info (n).
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt in other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to originator that they
				 * are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been lost,
				 * try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					cpu_disable_intr();
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}