/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER		/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	vm_pindex_t	npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPRECOVER
	cpumask_t	sigmask;
	int		failed;
	tsc_uclock_t	tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");

static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_64(&pmap->pm_invgen, 1);
	}
}

static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != kernel_pmap) {
		atomic_add_64(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
	tsc_uclock_t tsc;

	tsc = rdtsc();
	if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * Don't kprintf() anything if the pmap inval watchdog gets hit.
	 * DRM can cause an occasional watchdog hit (at least with a 1/16
	 * second watchdog), and attempting to kprintf to the KVM frame buffer
	 * from Xinvltlb, which ignores critical sections, can implode the
	 * system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	cpu_lfence();
#ifdef LOOPRECOVER
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPRECOVER
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPRECOVER
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * is done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation executes semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for the pmap and enter a critical
	 * section; pmap_inval_init() enters the critical section for us.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI.  Set to LOOPRECOVER_TIMEOUT1 seconds.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically, kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 *
	 * smp_invlpg() issues the command, synchronizes with other cpus,
	 * and executes the command on our cpu.  Upon return other cpus
	 * may still be in the process of exiting their synchronization.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}
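
/*
 * Usage sketch (hypothetical caller, not from this file), assuming the
 * caller already has a pte pointer and has computed the new pte value:
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 *
 * This is fully synchronous: npte is stored into *ptep, the old pte is
 * returned, and every cpu in pmap->pm_active has invalidated va before
 * the call returns.  A TLB-only, semi-synchronous shootdown of a
 * multi-page range passes a NULL ptep:
 *
 *	pmap_inval_smp(pmap, va, npgs, NULL, 0);
 *
 * Passing va == (vm_offset_t)-1 flushes the whole TLB (invltlb) instead
 * of invalidating page by page.
 */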

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for the pmap and enter a critical section.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 1;
		} else {
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
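
/*
 * Usage sketch (hypothetical caller, not from this file): atomically
 * clear a bit in a pte only if the pte has not changed since it was
 * read, retrying on failure.  'ptep', 'va' and 'some_bit' are assumed
 * to come from the caller; 'some_bit' stands in for whatever pte bit
 * the caller wants to clear.
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *		npte = opte & ~some_bit;
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */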

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			vm_pindex_t n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
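
/*
 * Usage sketch (hypothetical caller, not from this file) of the bulk API
 * for kernel_pmap unmaps: pte stores happen immediately while the TLB
 * shootdown is deferred and issued once by the flush.  'sva', 'eva' and
 * 'ptep' are assumed to come from the caller's page table walk.
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *		pmap_inval_bulk(&bulk, va, ptep, 0);
 *	pmap_inval_bulk_flush(&bulk);
 *
 * A contiguous run is invalidated page by page (or with a full invltlb
 * when it exceeds MAX_INVAL_PAGES); a non-contiguous mix degenerates to
 * a single full invltlb at flush time.
 */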

/*
 * Called from Xinvltlb with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Check out cpu (cpu) for work in the target cpu info (n).
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt in other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pm_inval_intr: WARNING, taking too long "
				"cpus=%d->%d done=%08jx mask=%08jx "
				"mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the originator,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
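		/*
		 * Illustrative sequence (a sketch, not a trace) for two
		 * cpus, originator 0 and target 1, assuming no lost IPIs:
		 *
		 *	originator setup:	mask=0x3, done=0x3
		 *	cpu 1 quiesces:		clears mask bit 1 -> mask=0x1
		 *	cpu 0 runs the pte op:	clears mask bit 0 -> mask=0x0
		 *	cpu 0 invalidates locally, sets mode=INVDONE,
		 *				clears done bit 0 -> done=0x2
		 *	cpu 1 invalidates locally:
		 *				clears done bit 1 -> done=0x0
		 */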
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to originator that they
				 * are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out, the IPI may have been
				 * lost; try to recover by resetting our own
				 * reentrancy bit, clearing the smurf mask
				 * for the cpus that did not respond, and
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					cpu_disable_intr();
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}