/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER			/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons: first, the situation
 * is not supposed to happen at all (but does), and second, VMs can be
 * very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */
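
/*
 * Both timeouts are converted to TSC deadlines at the point of use, e.g.
 * tsc_target = rdtsc() + tsc_frequency * LOOPRECOVER_TIMEOUT1.
 */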

#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
	vm_offset_t	va;	/* virtual address, -1 for full invltlb */
	pt_entry_t	*ptep;	/* pte(s) being replaced, or NULL */
	pt_entry_t	opte;	/* old pte (result / cmpset compare value) */
	pt_entry_t	npte;	/* new pte to store */
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;	/* cmpset result */
	int		npgs;		/* number of pages to invlpg */
	cpumask_t	done;	/* cpus which have not finished the command */
	cpumask_t	mask;	/* cpus which have not yet quiesced */
#ifdef LOOPRECOVER
	cpumask_t	sigmask;	/* debugging: mask at command issue */
	int		failed;		/* debugging: watchdog tripped */
	int64_t		tsc_target;	/* watchdog deadline (TSC) */
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
static long pmap_inval_bulk_count;
static int pmap_inval_watchdog_print;	/* must always default off */

SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
	    &pmap_inval_bulk_count, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");

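/*
 * Begin an invalidation sequence for (pmap) and enter a critical section.
 * For non-kernel pmaps this spins, processing the IPI queue, until it can
 * acquire CPULOCK_EXCL on pm_active_lock, then bumps pm_invgen.  A matching
 * pmap_inval_done() releases the lock and exits the critical section.
 */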
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

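/*
 * Finish an invalidation sequence, bumping pm_invgen a second time and
 * releasing CPULOCK_EXCL for non-kernel pmaps, then exit the critical
 * section entered by pmap_inval_init().
 */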
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_add_acq_long(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
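/*
 * loopwdog() returns non-zero once the TSC passes info->tsc_target and
 * re-arms the deadline with LOOPRECOVER_TIMEOUT2 so repeated recovery
 * attempts are rate-limited.  loopdebug() dumps the state of a stuck
 * invalidation unless pmap_inval_watchdog_print is left off.
 */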
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
	int64_t tsc;

	tsc = rdtsc();
	if (info->tsc_target - tsc < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * Don't kprintf() anything if the pmap inval watchdog gets hit.
	 * DRM can cause an occasional watchdog hit (at least with a 1/16
	 * second watchdog), and attempting to kprintf to the KVM frame buffer
	 * from Xinvltlb, which ignores critical sections, can implode the
	 * system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	cpu_lfence();
#ifdef LOOPRECOVER
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPRECOVER
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPRECOVER
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The
 * operation is executed fully synchronously: npte is stored into *ptep
 * and the previous contents (opte) are returned.
 *
 * If ptep is NULL the operation executes semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs > MAX_INVAL_PAGES) {
			npgs = 0;
			va = (vm_offset_t)-1;
		}

		/*
		 * Invalidate the specified pages, handle invltlb if requested.
		 */
		while (npgs) {
			--npgs;
			if (ptep) {
				opte = atomic_swap_long(ptep, npte);
				++ptep;
			}
			if (va == (vm_offset_t)-1)
				break;
			cpu_invlpg((void *)va);
			va += PAGE_SIZE;
		}
		if (va == (vm_offset_t)-1)
			cpu_invltlb();
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI.  Set to LOOPRECOVER_TIMEOUT1 seconds for now.
	 */
	info = &invinfo[cpu];
	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Kernel page table operations are typically semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to keep the 'done' field
	 * from changing out from under us (other cpus can't clear done bits
	 * until the originating cpu clears its mask bit, but other cpus CAN
	 * start clearing their mask bits).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be exiting their loops concurrently with our
	 * cleanup.  They will not lose the bitmask they already obtained,
	 * so we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}
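
/*
 * Example usage (a sketch only, not lifted from an actual caller):
 * replace a single pte and synchronously invalidate the old mapping on
 * every cpu the pmap is active on, recovering the old pte bits:
 *
 *	pt_entry_t opte;
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 */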

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		} else {
			pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];
	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to keep the 'done' field
	 * from changing out from under us (other cpus can't clear done bits
	 * until the originating cpu clears its mask bit).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
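
/*
 * Example usage (a sketch only, not lifted from an actual caller): the
 * compare-and-set form is normally used in a retry loop, re-reading the
 * pte whenever another cpu got there first:
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *		npte = (new pte computed from opte);
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */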
533 
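/*
 * Bulk invalidation API.  Kernel pmap operations which modify many ptes
 * accumulate the affected virtual range and synchronize once at the end
 * rather than issuing one smp invalidation per pte.  Rough usage
 * (a sketch only):
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	(for each pte:)
 *		opte = pmap_inval_bulk(&bulk, va, ptep, npte);
 *	pmap_inval_bulk_flush(&bulk);
 *
 * For non-kernel pmaps pmap_inval_bulk() falls through to a fully
 * synchronous pmap_inval_smp() call per pte.
 */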
void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case: the operation is localized, we don't care
	 * (e.g. because we are jacking the entire page table), or the
	 * pmap is not in use by anyone.  No invalidations are done on
	 * any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->count > 0)
		pmap_inval_bulk_count += (bulk->count - 1);
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			long n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

/*
 * Called with a critical section held and interrupts enabled, from the
 * Xinvltlb/IPI processing loop (see mp_machdep.c).  Scans the invinfo
 * for each cpu whose bit is set in *cpumaskp and performs this cpu's
 * part of any pending command, returning non-zero if the caller should
 * loop and call us again.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A load fence is needed once we
		 * detect that our done bit is set.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the originator,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * The other cpu indicates to the originator
				 * that it has quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				int npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this wedges the IPI may have been lost,
				 * so try to recover by resetting our own
				 * reentrancy bit, clearing the smurf mask
				 * for the cpus that did not respond, and
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					mdcpu->gd_xinvaltlb = 2;
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							      info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			int npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}