/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#if 1	/* DEBUGGING */
#define LOOPRECOVER			/* enable watchdog */
#endif

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, the situation
 * is not supposed to happen at all (but it does); second, VMs can be
 * very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */
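
/*
 * Illustrative note: the timeouts above are multiplied by tsc_frequency
 * to form the tsc_target deadlines checked by loopwdog().  For example,
 * with a (hypothetical) 2 GHz TSC, LOOPRECOVER_TIMEOUT1 corresponds to
 * roughly 4 billion TSC ticks before the first recovery attempt.
 */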

#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	vm_pindex_t	npgs;
	cpumask_t	done;
	cpumask_t	mask;
#ifdef LOOPRECOVER
	cpumask_t	sigmask;
	int		failed;
	tsc_uclock_t	tsc_target;
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;
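
/*
 * Rough summary of the per-cpu handshake implemented below (see the
 * comments in pmap_inval_intr() for the authoritative description):
 *
 *	mask - originating cpu plus all target cpus.  Targets clear their
 *	       bit to indicate that they have quiesced; the originator
 *	       executes the operation once only its own bit remains set,
 *	       then clears it.
 *	done - set for all participating cpus when the command is posted.
 *	       Each cpu clears its bit after issuing its local
 *	       invalidation.  The command is 100% done when all done bits
 *	       have been cleared.
 *	mode - INVSTORE for pmap_inval_smp(), INVCMPSET for
 *	       pmap_inval_smp_cmpset(), INVDONE when the slot is idle.
 */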

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPRECOVER
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;
#endif
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");
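
/*
 * The three knobs above are runtime-tunable sysctls, e.g. (illustrative):
 *
 *	sysctl machdep.pmap_inval_watchdog_print=1  # report watchdog hits
 *	sysctl machdep.pmap_inval_force_allcpus=1   # IPI all active cpus
 *	sysctl machdep.pmap_inval_force_nonopt=1    # disable 1-cpu shortcut
 */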

static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			lwkt_process_ipiq();
			cpu_pause();
		}
		atomic_add_64(&pmap->pm_invgen, 1);
	}
}

static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != kernel_pmap) {
		atomic_add_64(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}
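
/*
 * Note: pmap_inval_init() and pmap_inval_done() bracket every
 * invalidation operation below.  For user pmaps they acquire/release
 * CPULOCK_EXCL on pm_active_lock and bump pm_invgen on both sides;
 * kernel_pmap operations only take the critical section.
 */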

#ifdef LOOPRECOVER

/*
 * Debugging and lost IPI recovery code.
 */
static
__inline
int
loopwdog(struct pmap_inval_info *info)
{
	tsc_uclock_t tsc;

	tsc = rdtsc();
	if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * By default, don't kprintf() anything if the pmap inval watchdog
	 * gets hit.  DRM can cause an occasional watchdog hit (at least
	 * with a 1/16 second watchdog), and attempting to kprintf to the
	 * KVM frame buffer from Xinvltlb, which ignores critical sections,
	 * can implode the system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	cpu_lfence();
#ifdef LOOPRECOVER
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
#endif
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
#ifdef LOOPRECOVER
		"s=%08jx "
#endif
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
#ifdef LOOPRECOVER
		"smurf=%08jx\n"
#endif
		, msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0]
#ifdef LOOPRECOVER
		, info->sigmask.ary[0]
#endif
#ifdef LOOPMASK_IN
		, smp_in_mask.ary[0]
#endif
#ifdef LOOPRECOVER
		, smp_smurf_mask.ary[0]
#endif
		);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

#endif

#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * is done fully synchronously, storing npte into *ptep and returning the
 * old pte (opte).
 *
 * If ptep is NULL the operation executes semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_done(pmap);

		/*
		 * Knock-on NVMM flush.
		 *
		 * NOTE: pmap_enter() also calls this function and requires
		 *       the old PTE to be returned, so this TLB callback
		 *       can't be placed at the beginning and simply return 0.
		 */
		if (__predict_false(pmap->pm_tlb_flush != NULL)) {
			KKASSERT(pmap->pm_data != NULL);
			pmap->pm_tlb_flush(pmap);
		}

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI (see LOOPRECOVER_TIMEOUT1).
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear their done bits until
	 * the originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 *
	 * smp_invlpg() issues the command, synchronizes with other cpus,
	 * and executes the command on our cpu.  Upon return other cpus
	 * may still be in the process of exiting their synchronization.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	/* Knock-on NVMM flush. */
	if (__predict_false(pmap->pm_tlb_flush != NULL)) {
		KKASSERT(pmap->pm_data != NULL);
		pmap->pm_tlb_flush(pmap);
	}

	return opte;
}
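
/*
 * Usage sketches for pmap_inval_smp(), assuming a hypothetical caller
 * with local variables pmap, va, ptep, npte and opte already set up
 * (illustrative only, not compiled):
 */
#if 0
	/* Fully synchronous: store npte, invalidate va, return the old pte */
	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);

	/* Semi-synchronous: invalidate a 16-page range, no pte store */
	pmap_inval_smp(pmap, va, 16, NULL, 0);

	/* Full TLB invalidation on all cpus the pmap is active on */
	pmap_inval_smp(pmap, (vm_offset_t)-1, 1, NULL, 0);
#endif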

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = kernel_pmap;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (pmap->pm_flags & PMAP_MULTI)
			pmap_inval_init(pmap);
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 1;
		} else {
			if (pmap->pm_flags & PMAP_MULTI)
				pmap_inval_done(pmap);
			return 0;
		}
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	pmap_inval_init(pmap);
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);
	cpu_mfence();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
#ifdef LOOPRECOVER
	info->failed = 0;
#endif
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * The command may start executing the moment 'done' is initialized,
	 * so disable interrupts on the current cpu to prevent the 'done'
	 * field from changing (other cpus can't clear their done bits until
	 * the originating cpu clears its mask bit).
	 */
#ifdef LOOPRECOVER
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
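
/*
 * Typical compare-and-set retry pattern (illustrative sketch only, not
 * compiled; opte, npte and clear_bits are hypothetical locals of the
 * caller):
 */
#if 0
	do {
		opte = *ptep;			/* snapshot current pte */
		cpu_ccfence();
		npte = opte & ~clear_bits;	/* compute desired pte */
	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
#endif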

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}

pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case: localized, or we don't care (e.g. because we
	 * are jacking the entire page table), or the pmap is not in-use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hardware related to changing PTEs out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;

	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			vm_pindex_t n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
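
/*
 * Bulk API usage sketch (illustrative only, not compiled; sva, eva and
 * ptep are hypothetical locals of a kernel_pmap unmap loop): initialize
 * once, feed every pte being changed, then flush once at the end.
 */
#if 0
	pmap_inval_bulk_t bulk;

	pmap_inval_bulk_init(&bulk, kernel_pmap);
	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
		pmap_inval_bulk(&bulk, va, ptep, 0);
	pmap_inval_bulk_flush(&bulk);
#endif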

/*
 * Called from Xinvltlb with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Check our cpu (cpu) for work in the target cpu's info (n).
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt on other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear the originator can execute the operation, then
		 * clear its own mask and done bits.  The targets will then
		 * finish up their side and clear their done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to originator that they
				 * are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus.
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been
				 * lost; try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					cpu_disable_intr();
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							      info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}