/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER		/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	vm_pindex_t	npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPRECOVER
	cpumask_t	sigmask;
	int		failed;
	tsc_uclock_t	tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t invinfo[MAXCPU];
extern cpumask_t smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t smp_in_mask;
#endif
extern cpumask_t smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");

static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_64(&pmap->pm_invgen, 1);
	}
}

static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != kernel_pmap) {
		atomic_add_64(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
	tsc_uclock_t tsc;

	tsc = rdtsc();
	if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}
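
/*
 * Note on loopwdog(): the expiry test is written as a signed comparison
 * of the unsigned difference (tsc_target - tsc) so that it stays correct
 * even if the TSC value wraps.  With hypothetical values tsc_target = 100
 * and tsc = 150, the unsigned subtraction underflows to 2^64 - 50, which
 * reads as -50 when cast to tsc_sclock_t, correctly signalling that the
 * deadline has passed.
 */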

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * Don't kprintf() anything if the pmap inval watchdog gets hit.
	 * DRM can cause an occasional watchdog hit (at least with a 1/16
	 * second watchdog), and attempting to kprintf to the KVM frame buffer
	 * from Xinvltlb, which ignores critical sections, can implode the
	 * system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	cpu_lfence();
#ifdef LOOPRECOVER
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPRECOVER
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPRECOVER
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 * This will enter a critical section for us.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we setup our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI.  Set to LOOPRECOVER_TIMEOUT1 seconds.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.
	 * An Xinvltlb can occur at any time (even within a critical
	 * section), but it will not act on our command until we set
	 * our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operation is semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 *
	 * smp_invlpg() issues the command, synchronizes with other cpus,
	 * and executes the command on our cpu.  Upon return other cpus
	 * may still be in the process of exiting their synchronization.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 1;
		} else {
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we setup our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
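
/*
 * Hypothetical usage sketch for pmap_inval_smp_cmpset() (the modify()
 * step below is an illustrative placeholder, not taken from this file):
 * re-read the pte and retry until the compare-and-set succeeds, so a
 * change that raced in between the read and the update is not lost.
 *
 *	pt_entry_t opte, npte;
 *
 *	do {
 *		opte = *ptep;
 *		npte = modify(opte);	(hypothetical pte modification)
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */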

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in-use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			vm_pindex_t n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
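
/*
 * Hypothetical usage sketch for the bulk API (the sva/eva range and the
 * surrounding loop are illustrative placeholders, not taken from this
 * file): batch a run of kernel pte removals and issue one synchronizing
 * invalidation at the end instead of one per page.
 *
 *	pmap_inval_bulk_t bulk;
 *	pt_entry_t opte;
 *	vm_offset_t va;
 *
 *	pmap_inval_bulk_init(&bulk, kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *		opte = pmap_inval_bulk(&bulk, va, ptep, 0);
 *	pmap_inval_bulk_flush(&bulk);
 */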

/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Check our cpu (cpu) for work in the target cpu's info (n).
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt in other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pm_inval_intr: WARNING, taking too long "
				"cpus=%d->%d done=%08jx mask=%08jx "
				"mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to originator that they
				 * are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been lost,
				 * try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					cpu_disable_intr();
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}