/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

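/*
 * LOOPMASK is a debugging watchdog: when a cpu has spun roughly
 * (LOOPMASK + 1) iterations waiting on an invalidation command it calls
 * loopdebug() to dump the relevant masks and attempts a crude recovery.
 */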
#if 1	/* DEBUGGING */
#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
#endif

#define MAX_INVAL_PAGES		128

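/*
 * Per-cpu invalidation command block.  The originating cpu fills in the
 * target va/ptep/npte, the set of cpus to interrupt (mask), and the set
 * of cpus which must acknowledge completion (done).  Target cpus clear
 * their bit in mask to indicate quiescence and clear their bit in done
 * when they have finished their local invalidation.
 */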
struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	int		npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPMASK
	cpumask_t	sigmask;
	int		failed;
	int		xloops;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPMASK
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
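
/*
 * Number of pte operations which the bulk API folded into a single
 * deferred invalidation (each flush adds its count minus one), exported
 * via the machdep.pmap_inval_bulk_count sysctl.
 */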
static long pmap_inval_bulk_count;

SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
	    &pmap_inval_bulk_count, 0, "");

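/*
 * Begin an invalidation sequence for the pmap: enter a critical section
 * and, for user pmaps, acquire the exclusive CPULOCK on pm_active_lock
 * (processing any pending IPIs while we spin) and bump pm_invgen.
 */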
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

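/*
 * Finish an invalidation sequence: release the exclusive CPULOCK and
 * bump pm_invgen again for user pmaps, then exit the critical section.
 */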
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
	crit_exit_id("inval");
}

#ifdef LOOPMASK

/*
 * Debugging helper - dump the state of a stuck invalidation command:
 * the mode, the various cpu masks, and each cpu's gd_xinvaltlb value.
 */
static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	cpu_lfence();
#ifdef LOOPMASK
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("%s %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPMASK
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPMASK
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPMASK
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPMASK
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * API function - invalidate the specified va across all cpus associated
 * with the pmap, replacing *ptep with npte and returning the previous
 * contents (opte).  If va == (vm_offset_t)-1 we issue an invltlb()
 * instead of an invlpg().  The operation is performed fully
 * synchronously.
 *
 * If ptep is NULL the operation executes semi-synchronously.
 * ptep must be NULL if npgs > 1.
 *
 * This is a holy mess.
 */
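/*
 * Typical use (illustrative sketch only, not a verbatim call site):
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, 0);
 *
 * atomically zeroes the pte at va on every cpu using the pmap and
 * returns the previous contents so the caller can inspect the
 * accessed/modified bits.
 */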
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs > MAX_INVAL_PAGES) {
			npgs = 0;
			va = (vm_offset_t)-1;
		}

		/*
		 * Invalidate the specified pages, handle invltlb if requested.
		 */
		while (npgs) {
			--npgs;
			if (ptep) {
				opte = atomic_swap_long(ptep, npte);
				++ptep;
			}
			if (va == (vm_offset_t)-1)
				break;
			cpu_invlpg((void *)va);
			va += PAGE_SIZE;
		}
		if (va == (vm_offset_t)-1)
			cpu_invltlb();
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPMASK
		int loops;

		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			info->failed = 1;
			loopdebug("orig_waitA", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPMASK
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing out from under us (other cpus can't clear
	 * done bits until the originating cpu clears its mask bit, but
	 * other cpus CAN start clearing their mask bits).
	 */
#ifdef LOOPMASK
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
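/*
 * Illustrative sketch of a caller retry loop (not a verbatim call site;
 * 'newbits' is a hypothetical pte bit mask):
 *
 *	do {
 *		opte = *ptep;
 *		npte = opte | newbits;
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */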
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		} else {
			pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPMASK
		int loops;

		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			info->failed = 1;
			loopdebug("orig_waitB", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPMASK
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing out from under us (other cpus can't clear
	 * done bits until the originating cpu clears its mask bit).
	 */
#ifdef LOOPMASK
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}

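/*
 * Initialize a bulk invalidation structure for the given pmap.  The
 * caller accumulates pte changes with pmap_inval_bulk() and issues the
 * deferred invalidation with pmap_inval_bulk_flush().
 */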
void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

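/*
 * Replace the pte at (va) with npte, deferring the TLB invalidation
 * when possible (kernel_pmap only).  Returns the previous pte contents.
 * A NULL bulk pointer means no invalidation is performed at all.
 */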
pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

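/*
 * Issue the invalidation deferred by pmap_inval_bulk().  A contiguous
 * run of pages is invalidated as a range; anything else (or a run
 * collapsed to va_beg == -1) falls back to a full invltlb on the
 * affected cpus.  Also updates the bulk-count statistic and resets the
 * structure for reuse.
 */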
void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->count > 0)
		pmap_inval_bulk_count += (bulk->count - 1);
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			long n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

/*
 * Called with a critical section held and interrupts enabled while
 * servicing invalidation requests.  Scans all cpus for commands we need
 * to act on and returns non-zero if the caller should continue polling.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;
#ifdef LOOPMASK
	int loops;
#endif

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPMASK
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A fence is needed once we detect
		 * the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPMASK
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the originator,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to the originator that
				 * they have quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				int npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 */
				loopme = 1;
#ifdef LOOPMASK
				loops = ++info->xloops;
				if ((loops & LOOPMASK) == 0) {
					info->failed = 1;
					loopdebug("orig_waitC", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							      info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			int npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}
#ifdef LOOPMASK
			info->xloops = 0;
#endif
			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}