/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#if 1	/* DEBUGGING */
#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
#endif

#define MAX_INVAL_PAGES		128

/*
 * Per-cpu invalidation command/status block, cache-aligned to avoid
 * false sharing between cpus.
 */
struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	int		npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPMASK
	cpumask_t	sigmask;
	int		failed;
	int		xloops;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPMASK
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
static long pmap_inval_bulk_count;

SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
	    &pmap_inval_bulk_count, 0, "");

/*
 * Begin an invalidation sequence for the pmap.  Enter a critical section
 * and, for user pmaps, acquire the exclusive pm_active_lock and advance
 * the pmap's invalidation generation (pm_invgen).
 */
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

/*
 * Finish the invalidation sequence started by pmap_inval_init().  Release
 * the exclusive lock, advance pm_invgen again, and exit the critical
 * section.
 */
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
	crit_exit_id("inval");
}
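
/*
 * Usage sketch (illustrative only): the two functions above bracket an
 * invalidation sequence, e.g.
 *
 *	pmap_inval_init(pmap);
 *	...replace ptes and issue the necessary invalidations...
 *	pmap_inval_done(pmap);
 */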

/*
 * Debugging helper - report the state of a stuck invalidation loop
 * (mode, masks, and each cpu's gd_xinvaltlb state) so the stall can
 * be diagnosed.
 */
static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	cpu_lfence();
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
	kprintf("%s %d mode=%d m=%08jx d=%08jx s=%08jx "
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
		"smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0],
		info->sigmask.ary[0],
#ifdef LOOPMASK_IN
		smp_in_mask.ary[0],
#endif
		smp_smurf_mask.ary[0]);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and
 * returning opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs > MAX_INVAL_PAGES) {
			npgs = 0;
			va = (vm_offset_t)-1;
		}

		/*
		 * Invalidate the specified pages, handle invltlb if requested.
		 */
		while (npgs) {
			--npgs;
			if (ptep) {
				opte = atomic_swap_long(ptep, npte);
				++ptep;
			}
			if (va == (vm_offset_t)-1)
				break;
			cpu_invlpg((void *)va);
			va += PAGE_SIZE;
		}
		if (va == (vm_offset_t)-1)
			cpu_invltlb();
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPMASK
		int loops;

		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			info->failed = 1;
			loopdebug("orig_waitA", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPMASK
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
#ifdef LOOPMASK
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be exiting their loops concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before,
	 * so we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}
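
/*
 * Usage sketch (illustrative only, not compiled code).  The pmap, va,
 * npgs, and ptep values are assumed to come from the caller's context.
 *
 *	pt_entry_t opte;
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, 0);
 *		- atomically replaces *ptep with 0, invalidates va on all
 *		  cpus the pmap is active on, and returns the previous
 *		  pte contents.
 *
 *	pmap_inval_smp(pmap, va, npgs, NULL, 0);
 *		- invalidates npgs pages starting at va without modifying
 *		  any ptes (semi-synchronous; ptep must be NULL when
 *		  npgs > 1).
 */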

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		} else {
			pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPMASK
		int loops;

		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			info->failed = 1;
			loopdebug("orig_waitB", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPMASK
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
#ifdef LOOPMASK
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
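
/*
 * Usage sketch (illustrative only, not compiled code): a typical
 * compare-and-set retry loop.  The pmap, va, and ptep values and the
 * new_bit modification are assumptions standing in for the caller's
 * context.
 *
 *	pt_entry_t opte, npte;
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *		npte = opte | new_bit;
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */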

/*
 * Bulk invalidation API.  pmap_inval_bulk_init() prepares a descriptor,
 * pmap_inval_bulk() replaces individual ptes (batching invalidations for
 * the kernel_pmap), and pmap_inval_bulk_flush() issues any deferred
 * invalidation.
 */
void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case: localized, or we don't care (e.g. because we
	 * are jacking the entire page table), or the pmap is not in use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hardware related to changing ptes out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information here and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->count > 0)
		pmap_inval_bulk_count += (bulk->count - 1);
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			long n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
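
/*
 * Usage sketch (illustrative only, not compiled code): tearing down a
 * contiguous run of kernel mappings with a single deferred invalidation
 * at the end.  sva, eva, and the pte_for() lookup are assumptions
 * standing in for however the caller obtains each pte pointer.
 *
 *	pmap_inval_bulk_t bulk;
 *	vm_offset_t va;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE)
 *		pmap_inval_bulk(&bulk, va, pte_for(va), 0);
 *	pmap_inval_bulk_flush(&bulk);
 */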

/*
 * Called with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;
#ifdef LOOPMASK
	int loops;
#endif

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPMASK
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A fence is needed once we detect
		 * the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPMASK
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the originator,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * This cpu indicates to the originator that
				 * it has quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * This cpu must wait for the originator (n)
				 * to complete its command if ptep is not
				 * NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * This cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				int npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 */
				loopme = 1;
#ifdef LOOPMASK
				loops = ++info->xloops;
				if ((loops & LOOPMASK) == 0) {
					info->failed = 1;
					loopdebug("orig_waitC", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes the operation and
				 * clears its mask bit to allow the other
				 * cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							      info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			int npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}
#ifdef LOOPMASK
			info->xloops = 0;
#endif
			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}