1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  * Copyright (c) 2019 Joyent, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 /*
32  * This file and its contents are supplied under the terms of the
33  * Common Development and Distribution License ("CDDL"), version 1.0.
34  * You may only use this file in accordance with the terms of version
35  * 1.0 of the CDDL.
36  *
37  * A full copy of the text of the CDDL should have accompanied this
38  * source.  A copy of the CDDL is also available via the Internet at
39  * http://www.illumos.org/license/CDDL.
40  *
41  * Copyright 2014 Pluribus Networks Inc.
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2023 Oxide Computer Company
44  */
45 
46 #include <sys/cdefs.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <sys/param.h>
50 #include <sys/kernel.h>
51 #include <sys/kmem.h>
52 #include <sys/mutex.h>
53 #include <sys/systm.h>
54 #include <sys/cpuset.h>
55 
56 #include <x86/specialreg.h>
57 #include <x86/apicreg.h>
58 
59 #include <machine/clock.h>
60 
61 #include <machine/vmm.h>
62 #include <sys/vmm_kernel.h>
63 
64 #include "vmm_lapic.h"
65 #include "vmm_stat.h"
66 
67 #include "vlapic.h"
68 #include "vlapic_priv.h"
69 #include "vioapic.h"
70 
71 
72 /*
73  * The 4 high bits of a given interrupt vector represent its priority.  The same
74  * is true for the contents of the TPR when it is used to calculate the ultimate
75  * PPR of an APIC - the 4 high bits hold the priority.
76  */
77 #define	PRIO(x)			((x) & 0xf0)
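
/*
 * Example (illustrative): vector 0x61 belongs to priority class
 * PRIO(0x61) == 0x60, so it is deliverable while the PPR is 0x5f
 * (class 0x50) but not while the PPR is 0x6f (class 0x60).
 */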
78 
79 #define	VLAPIC_VERSION		(0x14)
80 
81 /*
82  * The 'vlapic->timer_lock' is used to provide mutual exclusion between the
83  * vlapic_callout_handler() and vcpu accesses to:
84  * - timer_cur_freq, timer_period, timer_fire_when
85  * - timer LVT register
86  */
87 #define	VLAPIC_TIMER_LOCK(vlapic)	mutex_enter(&((vlapic)->timer_lock))
88 #define	VLAPIC_TIMER_UNLOCK(vlapic)	mutex_exit(&((vlapic)->timer_lock))
89 #define	VLAPIC_TIMER_LOCKED(vlapic)	MUTEX_HELD(&((vlapic)->timer_lock))
90 
91 /*
92  * APIC timer frequency:
93  * - arbitrary but chosen to be in the ballpark of contemporary hardware.
94  * - power-of-two to avoid loss of precision when calculating times
95  */
96 #define	VLAPIC_BUS_FREQ		(128 * 1024 * 1024)
97 
98 #define	APICBASE_ADDR_MASK	0xfffffffffffff000UL
99 
100 #define	APIC_VALID_MASK_ESR	(APIC_ESR_SEND_CS_ERROR | \
101 		APIC_ESR_RECEIVE_CS_ERROR | APIC_ESR_SEND_ACCEPT | \
102 		APIC_ESR_RECEIVE_ACCEPT | APIC_ESR_SEND_ILLEGAL_VECTOR | \
103 		APIC_ESR_RECEIVE_ILLEGAL_VECTOR | APIC_ESR_ILLEGAL_REGISTER)
104 
105 static void vlapic_set_error(struct vlapic *, uint32_t, bool);
106 static void vlapic_callout_handler(void *arg);
107 
108 static __inline bool
109 vlapic_x2mode(const struct vlapic *vlapic)
110 {
111 	return ((vlapic->msr_apicbase & APICBASE_X2APIC) != 0);
112 }
113 
114 static __inline bool
115 vlapic_hw_disabled(const struct vlapic *vlapic)
116 {
117 	return ((vlapic->msr_apicbase & APICBASE_ENABLED) == 0);
118 }
119 
120 static __inline bool
121 vlapic_sw_disabled(const struct vlapic *vlapic)
122 {
123 	const struct LAPIC *lapic = vlapic->apic_page;
124 
125 	return ((lapic->svr & APIC_SVR_ENABLE) == 0);
126 }
127 
128 static __inline bool
129 vlapic_enabled(const struct vlapic *vlapic)
130 {
131 	return (!vlapic_hw_disabled(vlapic) && !vlapic_sw_disabled(vlapic));
132 }
133 
134 static __inline uint32_t
135 vlapic_get_id(const struct vlapic *vlapic)
136 {
137 
138 	if (vlapic_x2mode(vlapic))
139 		return (vlapic->vcpuid);
140 	else
141 		return (vlapic->vcpuid << 24);
142 }
143 
144 static uint32_t
145 x2apic_ldr(const struct vlapic *vlapic)
146 {
147 	int apicid;
148 	uint32_t ldr;
149 
150 	apicid = vlapic_get_id(vlapic);
151 	ldr = 1 << (apicid & 0xf);
152 	ldr |= (apicid & 0xffff0) << 12;
153 	return (ldr);
154 }
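
/*
 * Example (illustrative): an x2APIC ID of 0x25 yields cluster 0x2 in
 * LDR[31:16] and member bit (1 << 5) in LDR[15:0], for an LDR of 0x20020.
 */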
155 
156 void
157 vlapic_dfr_write_handler(struct vlapic *vlapic)
158 {
159 	struct LAPIC *lapic;
160 
161 	lapic = vlapic->apic_page;
162 	if (vlapic_x2mode(vlapic)) {
163 		/* Ignore write to DFR in x2APIC mode */
164 		lapic->dfr = 0;
165 		return;
166 	}
167 
168 	lapic->dfr &= APIC_DFR_MODEL_MASK;
169 	lapic->dfr |= APIC_DFR_RESERVED;
170 }
171 
172 void
173 vlapic_ldr_write_handler(struct vlapic *vlapic)
174 {
175 	struct LAPIC *lapic;
176 
177 	lapic = vlapic->apic_page;
178 
179 	/* LDR is read-only in x2apic mode */
180 	if (vlapic_x2mode(vlapic)) {
181 		/* Ignore write to LDR in x2APIC mode */
182 		lapic->ldr = x2apic_ldr(vlapic);
183 	} else {
184 		lapic->ldr &= ~APIC_LDR_RESERVED;
185 	}
186 }
187 
188 void
189 vlapic_id_write_handler(struct vlapic *vlapic)
190 {
191 	struct LAPIC *lapic;
192 
193 	/*
194 	 * We don't allow the ID register to be modified, so reset it back to
195 	 * its default value.
196 	 */
197 	lapic = vlapic->apic_page;
198 	lapic->id = vlapic_get_id(vlapic);
199 }
200 
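/*
 * Per the standard x86 DCR encoding, the divide value is held in bits 0, 1,
 * and 3 (bit 2 is reserved), hence the 0xB mask below: e.g. APIC_TDCR_2
 * (0x0) selects divide-by-2, while APIC_TDCR_1 (0xB) selects divide-by-1.
 */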
201 static int
202 vlapic_timer_divisor(uint32_t dcr)
203 {
204 	switch (dcr & 0xB) {
205 	case APIC_TDCR_1:
206 		return (1);
207 	case APIC_TDCR_2:
208 		return (2);
209 	case APIC_TDCR_4:
210 		return (4);
211 	case APIC_TDCR_8:
212 		return (8);
213 	case APIC_TDCR_16:
214 		return (16);
215 	case APIC_TDCR_32:
216 		return (32);
217 	case APIC_TDCR_64:
218 		return (64);
219 	case APIC_TDCR_128:
220 		return (128);
221 	default:
222 		panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
223 	}
224 }
225 
226 #if 0
227 static inline void
228 vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
229 {
230 	printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
231 	    *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
232 	    *lvt & APIC_LVTT_M);
233 }
234 #endif
235 
236 static uint32_t
237 vlapic_get_ccr(struct vlapic *vlapic)
238 {
239 	struct LAPIC *lapic;
240 	uint32_t ccr;
241 
242 	ccr = 0;
243 	lapic = vlapic->apic_page;
244 
245 	VLAPIC_TIMER_LOCK(vlapic);
246 	if (callout_active(&vlapic->callout)) {
247 		/*
248 		 * If the timer is scheduled to expire in the future then
249 		 * compute the value of 'ccr' based on the remaining time.
250 		 */
251 
252 		const hrtime_t now = gethrtime();
253 		if (vlapic->timer_fire_when > now) {
254 			ccr += hrt_freq_count(vlapic->timer_fire_when - now,
255 			    vlapic->timer_cur_freq);
256 		}
257 	}
258 
259 	/*
260 	 * Clamp CCR value to that programmed in ICR - its theoretical maximum.
261 	 * Normal operation should never result in this being necessary.  Only
262 	 * strange circumstances due to state importation as part of instance
263 	 * save/restore or live-migration require such wariness.
264 	 */
265 	if (ccr > lapic->icr_timer) {
266 		ccr = lapic->icr_timer;
267 		vlapic->stats.vs_clamp_ccr++;
268 	}
269 	VLAPIC_TIMER_UNLOCK(vlapic);
270 	return (ccr);
271 }
272 
273 static void
274 vlapic_update_divider(struct vlapic *vlapic)
275 {
276 	struct LAPIC *lapic = vlapic->apic_page;
277 
278 	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));
279 
280 	vlapic->timer_cur_freq =
281 	    VLAPIC_BUS_FREQ / vlapic_timer_divisor(lapic->dcr_timer);
282 	vlapic->timer_period =
283 	    hrt_freq_interval(vlapic->timer_cur_freq, lapic->icr_timer);
284 }
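
#if 0
/*
 * Illustrative sketch (not compiled): 'vlapic_divider_example' is a
 * hypothetical helper showing how DCR and ICR translate into the emulated
 * timer's frequency and period.  With VLAPIC_BUS_FREQ at 128MiHz, a DCR
 * selecting divide-by-2, and an ICR count of 0x100000 (1048576 ticks):
 *
 *	timer_cur_freq = 134217728 / 2 = 67108864 Hz
 *	timer_period   = hrt_freq_interval(67108864, 1048576) ~= 15.6 ms
 */
static void
vlapic_divider_example(struct vlapic *vlapic)
{
	struct LAPIC *lapic = vlapic->apic_page;

	VLAPIC_TIMER_LOCK(vlapic);
	lapic->dcr_timer = APIC_TDCR_2;
	lapic->icr_timer = 0x100000;
	vlapic_update_divider(vlapic);
	ASSERT3U(vlapic->timer_cur_freq, ==, VLAPIC_BUS_FREQ / 2);
	VLAPIC_TIMER_UNLOCK(vlapic);
}
#endif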
285 
286 void
287 vlapic_dcr_write_handler(struct vlapic *vlapic)
288 {
289 	/*
290 	 * Update the timer frequency and the timer period.
291 	 *
292 	 * XXX changes to the frequency divider will not take effect until
293 	 * the timer is reloaded.
294 	 */
295 	VLAPIC_TIMER_LOCK(vlapic);
296 	vlapic_update_divider(vlapic);
297 	VLAPIC_TIMER_UNLOCK(vlapic);
298 }
299 
300 void
301 vlapic_esr_write_handler(struct vlapic *vlapic)
302 {
303 	struct LAPIC *lapic;
304 
305 	lapic = vlapic->apic_page;
306 	lapic->esr = vlapic->esr_pending;
307 	vlapic->esr_pending = 0;
308 }
309 
310 vcpu_notify_t
311 vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
312 {
313 	struct LAPIC *lapic;
314 	uint32_t *irrptr, *tmrptr, mask, tmr;
315 	int idx;
316 
317 	KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
318 
319 	lapic = vlapic->apic_page;
320 	if (!(lapic->svr & APIC_SVR_ENABLE)) {
321 		/* ignore interrupt on software-disabled APIC */
322 		return (VCPU_NOTIFY_NONE);
323 	}
324 
325 	if (vector < 16) {
326 		vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR,
327 		    false);
328 
329 		/*
330 		 * If the error LVT is configured to interrupt the vCPU, it will
331 		 * have delivered a notification through that mechanism.
332 		 */
333 		return (VCPU_NOTIFY_NONE);
334 	}
335 
336 	if (vlapic->ops.set_intr_ready) {
337 		return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
338 	}
339 
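	/*
	 * The IRR/TMR/ISR words in 'struct LAPIC' are laid out as eight
	 * 32-bit registers, each aligned on a 16-byte boundary, hence the
	 * stride of four uint32_t entries per 32 vectors below.
	 */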
340 	idx = (vector / 32) * 4;
341 	mask = 1 << (vector % 32);
342 	tmrptr = &lapic->tmr0;
343 	irrptr = &lapic->irr0;
344 
345 	/*
346 	 * Update TMR for requested vector, if necessary.
347 	 * This must be done prior to asserting the bit in IRR so that the
348 	 * proper TMR state is always visible before the to-be-queued interrupt
349 	 * can be injected.
350 	 */
351 	tmr = atomic_load_acq_32(&tmrptr[idx]);
352 	if ((tmr & mask) != (level ? mask : 0)) {
353 		if (level) {
354 			atomic_set_int(&tmrptr[idx], mask);
355 		} else {
356 			atomic_clear_int(&tmrptr[idx], mask);
357 		}
358 	}
359 
360 	/* Now set the bit in IRR */
361 	atomic_set_int(&irrptr[idx], mask);
362 
363 	return (VCPU_NOTIFY_EXIT);
364 }
365 
366 static __inline uint32_t *
367 vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
368 {
369 	struct LAPIC	*lapic = vlapic->apic_page;
370 	int		i;
371 
372 	switch (offset) {
373 	case APIC_OFFSET_CMCI_LVT:
374 		return (&lapic->lvt_cmci);
375 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
376 		i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
377 		return ((&lapic->lvt_timer) + i);
378 	default:
378 		panic("vlapic_get_lvtptr: invalid LVT");
380 	}
381 }
382 
383 static __inline int
384 lvt_off_to_idx(uint32_t offset)
385 {
386 	int index;
387 
388 	switch (offset) {
389 	case APIC_OFFSET_CMCI_LVT:
390 		index = APIC_LVT_CMCI;
391 		break;
392 	case APIC_OFFSET_TIMER_LVT:
393 		index = APIC_LVT_TIMER;
394 		break;
395 	case APIC_OFFSET_THERM_LVT:
396 		index = APIC_LVT_THERMAL;
397 		break;
398 	case APIC_OFFSET_PERF_LVT:
399 		index = APIC_LVT_PMC;
400 		break;
401 	case APIC_OFFSET_LINT0_LVT:
402 		index = APIC_LVT_LINT0;
403 		break;
404 	case APIC_OFFSET_LINT1_LVT:
405 		index = APIC_LVT_LINT1;
406 		break;
407 	case APIC_OFFSET_ERROR_LVT:
408 		index = APIC_LVT_ERROR;
409 		break;
410 	default:
411 		index = -1;
412 		break;
413 	}
414 	KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
415 	    "invalid lvt index %d for offset %x", index, offset));
416 
417 	return (index);
418 }
419 
420 static __inline uint32_t
421 vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
422 {
423 	int idx;
424 	uint32_t val;
425 
426 	idx = lvt_off_to_idx(offset);
427 	val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
428 	return (val);
429 }
430 
431 void
432 vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
433 {
434 	uint32_t *lvtptr, mask, val;
435 	struct LAPIC *lapic;
436 	int idx;
437 
438 	lapic = vlapic->apic_page;
439 	lvtptr = vlapic_get_lvtptr(vlapic, offset);
440 	val = *lvtptr;
441 	idx = lvt_off_to_idx(offset);
442 
443 	if (!(lapic->svr & APIC_SVR_ENABLE))
444 		val |= APIC_LVT_M;
445 	mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
446 	switch (offset) {
447 	case APIC_OFFSET_TIMER_LVT:
448 		mask |= APIC_LVTT_TM;
449 		break;
450 	case APIC_OFFSET_ERROR_LVT:
451 		break;
452 	case APIC_OFFSET_LINT0_LVT:
453 	case APIC_OFFSET_LINT1_LVT:
454 		mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
455 		/* FALLTHROUGH */
456 	default:
457 		mask |= APIC_LVT_DM;
458 		break;
459 	}
460 	val &= mask;
461 	*lvtptr = val;
462 	atomic_store_rel_32(&vlapic->lvt_last[idx], val);
463 }
464 
465 static void
466 vlapic_refresh_lvts(struct vlapic *vlapic)
467 {
468 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
469 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
470 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
471 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
472 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
473 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
474 	vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
475 }
476 
477 static void
478 vlapic_mask_lvts(struct vlapic *vlapic)
479 {
480 	struct LAPIC *lapic = vlapic->apic_page;
481 
482 	lapic->lvt_cmci |= APIC_LVT_M;
483 	lapic->lvt_timer |= APIC_LVT_M;
484 	lapic->lvt_thermal |= APIC_LVT_M;
485 	lapic->lvt_pcint |= APIC_LVT_M;
486 	lapic->lvt_lint0 |= APIC_LVT_M;
487 	lapic->lvt_lint1 |= APIC_LVT_M;
488 	lapic->lvt_error |= APIC_LVT_M;
489 	vlapic_refresh_lvts(vlapic);
490 }
491 
492 static int
493 vlapic_fire_lvt(struct vlapic *vlapic, uint_t lvt)
494 {
495 	uint32_t mode, reg, vec;
496 	vcpu_notify_t notify;
497 
498 	reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]);
499 
500 	if (reg & APIC_LVT_M)
501 		return (0);
502 	vec = reg & APIC_LVT_VECTOR;
503 	mode = reg & APIC_LVT_DM;
504 
505 	switch (mode) {
506 	case APIC_LVT_DM_FIXED:
507 		if (vec < 16) {
508 			vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR,
509 			    lvt == APIC_LVT_ERROR);
510 			return (0);
511 		}
512 		notify = vlapic_set_intr_ready(vlapic, vec, false);
513 		vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify);
514 		break;
515 	case APIC_LVT_DM_NMI:
516 		(void) vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
517 		break;
518 	case APIC_LVT_DM_EXTINT:
519 		(void) vm_inject_extint(vlapic->vm, vlapic->vcpuid);
520 		break;
521 	default:
522 		/* Other modes ignored */
523 		return (0);
524 	}
525 	return (1);
526 }
527 
528 static uint_t
529 vlapic_active_isr(struct vlapic *vlapic)
530 {
531 	int i;
532 	uint32_t *isrp;
533 
534 	isrp = &vlapic->apic_page->isr7;
535 
536 	for (i = 7; i >= 0; i--, isrp -= 4) {
537 		uint32_t reg = *isrp;
538 
539 		if (reg != 0) {
540 			uint_t vec = (i * 32) + bsrl(reg);
541 
542 			if (vec < 16) {
543 				/*
544 				 * Truncate the illegal low vectors to a value of
545 				 * 0, indicating that no active ISR was found.
546 				 */
547 				return (0);
548 			}
549 			return (vec);
550 		}
551 	}
552 
553 	return (0);
554 }
555 
556 /*
557  * After events which might arbitrarily change the value of PPR, such as a TPR
558  * write or an EOI, calculate that new PPR value and store it in the APIC page.
559  */
560 static void
561 vlapic_update_ppr(struct vlapic *vlapic)
562 {
563 	int isrvec, tpr, ppr;
564 
565 	isrvec = vlapic_active_isr(vlapic);
566 	tpr = vlapic->apic_page->tpr;
567 
568 	/*
569 	 * Algorithm adopted from section "Interrupt, Task and Processor
570 	 * Priority" in Intel Architecture Manual Vol 3a.
571 	 */
572 	if (PRIO(tpr) >= PRIO(isrvec)) {
573 		ppr = tpr;
574 	} else {
575 		ppr = PRIO(isrvec);
576 	}
577 
578 	vlapic->apic_page->ppr = ppr;
579 }
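
/*
 * Worked example (illustrative): with TPR == 0x42 and vector 0x61 in
 * service, PRIO(tpr) == 0x40 is below PRIO(isrvec) == 0x60, so the PPR
 * becomes 0x60.  Raising the TPR to 0x65 would instead yield PPR == 0x65.
 */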
580 
581 /*
582  * When a vector is asserted in ISR as in-service, the PPR must be raised to the
583  * priority of that vector, as the vCPU would have been at a lower priority in
584  * order for the vector to be accepted.
585  */
586 static void
587 vlapic_raise_ppr(struct vlapic *vlapic, int vec)
588 {
589 	struct LAPIC *lapic = vlapic->apic_page;
590 	int ppr;
591 
592 	ppr = PRIO(vec);
593 
594 	lapic->ppr = ppr;
595 }
596 
597 void
598 vlapic_sync_tpr(struct vlapic *vlapic)
599 {
600 	vlapic_update_ppr(vlapic);
601 }
602 
603 static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt");
604 
605 static void
606 vlapic_process_eoi(struct vlapic *vlapic)
607 {
608 	struct LAPIC	*lapic = vlapic->apic_page;
609 	uint32_t	*isrptr, *tmrptr;
610 	int		i;
611 	uint_t		idx, bitpos, vector;
612 
613 	isrptr = &lapic->isr0;
614 	tmrptr = &lapic->tmr0;
615 
616 	for (i = 7; i >= 0; i--) {
617 		idx = i * 4;
618 		if (isrptr[idx] != 0) {
619 			bitpos = bsrl(isrptr[idx]);
620 			vector = i * 32 + bitpos;
621 
622 			isrptr[idx] &= ~(1 << bitpos);
623 			vlapic_update_ppr(vlapic);
624 			if ((tmrptr[idx] & (1 << bitpos)) != 0) {
625 				vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
626 				    vector);
627 			}
628 			return;
629 		}
630 	}
631 	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1);
632 }
633 
634 static __inline int
635 vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
636 {
637 
638 	return (lvt & mask);
639 }
640 
641 static __inline int
642 vlapic_periodic_timer(struct vlapic *vlapic)
643 {
644 	uint32_t lvt;
645 
646 	lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
647 
648 	return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
649 }
650 
651 static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");
652 
653 static void
654 vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error)
655 {
656 
657 	vlapic->esr_pending |= mask;
658 
659 	/*
660 	 * Avoid infinite recursion if the error LVT itself is configured with
661 	 * an illegal vector.
662 	 */
663 	if (lvt_error)
664 		return;
665 
666 	if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) {
667 		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
668 	}
669 }
670 
671 static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
672 
673 static void
674 vlapic_fire_timer(struct vlapic *vlapic)
675 {
676 	ASSERT(VLAPIC_TIMER_LOCKED(vlapic));
677 
678 	if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) {
679 		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
680 	}
681 }
682 
683 static VMM_STAT(VLAPIC_INTR_CMC,
684 	"corrected machine check interrupts generated by vlapic");
685 
686 void
687 vlapic_fire_cmci(struct vlapic *vlapic)
688 {
689 
690 	if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) {
691 		vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
692 	}
693 }
694 
695 static VMM_STAT_ARRAY(LVTS_TRIGGERED, VLAPIC_MAXLVT_INDEX + 1,
696 	"lvts triggered");
697 
698 int
699 vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
700 {
701 	if (!vlapic_enabled(vlapic)) {
702 		/*
703 		 * When the local APIC is global/hardware disabled,
704 		 * LINT[1:0] pins are configured as INTR and NMI pins,
705 		 * respectively.
706 		 */
707 		switch (vector) {
708 			case APIC_LVT_LINT0:
709 				(void) vm_inject_extint(vlapic->vm,
710 				    vlapic->vcpuid);
711 				break;
712 			case APIC_LVT_LINT1:
713 				(void) vm_inject_nmi(vlapic->vm,
714 				    vlapic->vcpuid);
715 				break;
716 			default:
717 				break;
718 		}
719 		return (0);
720 	}
721 
722 	switch (vector) {
723 	case APIC_LVT_LINT0:
724 	case APIC_LVT_LINT1:
725 	case APIC_LVT_TIMER:
726 	case APIC_LVT_ERROR:
727 	case APIC_LVT_PMC:
728 	case APIC_LVT_THERMAL:
729 	case APIC_LVT_CMCI:
730 		if (vlapic_fire_lvt(vlapic, vector)) {
731 			vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
732 			    LVTS_TRIGGERED, vector, 1);
733 		}
734 		break;
735 	default:
736 		return (EINVAL);
737 	}
738 	return (0);
739 }
740 
741 static void
742 vlapic_callout_reset(struct vlapic *vlapic)
743 {
744 	callout_reset_hrtime(&vlapic->callout, vlapic->timer_fire_when,
745 	    vlapic_callout_handler, vlapic, C_ABSOLUTE);
746 }
747 
748 static void
749 vlapic_callout_handler(void *arg)
750 {
751 	struct vlapic *vlapic = arg;
752 
753 	VLAPIC_TIMER_LOCK(vlapic);
754 	if (callout_pending(&vlapic->callout))	/* callout was reset */
755 		goto done;
756 
757 	if (!callout_active(&vlapic->callout))	/* callout was stopped */
758 		goto done;
759 
760 	callout_deactivate(&vlapic->callout);
761 
762 	vlapic_fire_timer(vlapic);
763 
764 	if (vlapic_periodic_timer(vlapic)) {
765 		/*
766 		 * Compute the delta between when the timer was supposed to
767 		 * fire and the present time.  We can depend on the fact that
768 		 * cyclics (which underlie these callouts) will never be called
769 		 * early.
770 		 */
771 		const hrtime_t now = gethrtime();
772 		const hrtime_t delta = now - vlapic->timer_fire_when;
773 		if (delta >= vlapic->timer_period) {
774 			/*
775 			 * If we are so behind that we have missed an entire
776 			 * timer period, reset the time base rather than
777 			 * attempting to catch up.
778 			 */
779 			vlapic->timer_fire_when = now + vlapic->timer_period;
780 		} else {
781 			vlapic->timer_fire_when += vlapic->timer_period;
782 		}
783 		vlapic_callout_reset(vlapic);
784 	} else {
785 		/*
786 		 * Clear the target time so that logic can distinguish a timer
787 		 * which has fired (where the value is zero) from one
788 		 * which is held pending due to the instance being paused (where
789 		 * the value is non-zero, but the callout is not pending).
790 		 */
791 		vlapic->timer_fire_when = 0;
792 	}
793 done:
794 	VLAPIC_TIMER_UNLOCK(vlapic);
795 }
796 
797 void
798 vlapic_icrtmr_write_handler(struct vlapic *vlapic)
799 {
800 	struct LAPIC *lapic = vlapic->apic_page;
801 
802 	VLAPIC_TIMER_LOCK(vlapic);
803 	vlapic->timer_period = hrt_freq_interval(vlapic->timer_cur_freq,
804 	    lapic->icr_timer);
805 	if (vlapic->timer_period != 0) {
806 		vlapic->timer_fire_when = gethrtime() + vlapic->timer_period;
807 		vlapic_callout_reset(vlapic);
808 	} else {
809 		vlapic->timer_fire_when = 0;
810 		callout_stop(&vlapic->callout);
811 	}
812 	VLAPIC_TIMER_UNLOCK(vlapic);
813 }
814 
815 /*
816  * This function populates 'dmask' with the set of vcpus that match the
817  * addressing specified by the (dest, phys, lowprio) tuple.
818  *
819  * 'x2apic_dest' specifies whether 'dest' is interpreted as an x2APIC
820  * (32-bit) or an xAPIC (8-bit) destination field.
821  */
822 void
823 vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
824     bool lowprio, bool x2apic_dest)
825 {
826 	struct vlapic *vlapic;
827 	uint32_t dfr, ldr, ldest, cluster;
828 	uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
829 	cpuset_t amask;
830 	int vcpuid;
831 
832 	if ((x2apic_dest && dest == 0xffffffff) ||
833 	    (!x2apic_dest && dest == 0xff)) {
834 		/*
835 		 * Broadcast in both logical and physical modes.
836 		 */
837 		*dmask = vm_active_cpus(vm);
838 		return;
839 	}
840 
841 	if (phys) {
842 		/*
843 		 * Physical mode: destination is APIC ID.
844 		 */
845 		CPU_ZERO(dmask);
846 		vcpuid = vm_apicid2vcpuid(vm, dest);
847 		amask = vm_active_cpus(vm);
848 		if (vcpuid < vm_get_maxcpus(vm) && CPU_ISSET(vcpuid, &amask))
849 			CPU_SET(vcpuid, dmask);
850 	} else {
851 		/*
852 		 * In the "Flat Model" the MDA is interpreted as an 8-bit wide
853 		 * bitmask. This model is only available in the xAPIC mode.
854 		 */
855 		mda_flat_ldest = dest & 0xff;
856 
857 		/*
858 		 * In the "Cluster Model" the MDA is used to identify a
859 		 * specific cluster and a set of APICs in that cluster.
860 		 */
861 		if (x2apic_dest) {
862 			mda_cluster_id = dest >> 16;
863 			mda_cluster_ldest = dest & 0xffff;
864 		} else {
865 			mda_cluster_id = (dest >> 4) & 0xf;
866 			mda_cluster_ldest = dest & 0xf;
867 		}
868 
869 		/*
870 		 * Logical mode: match each APIC that has a bit set
871 		 * in its LDR that matches a bit in the ldest.
872 		 */
873 		CPU_ZERO(dmask);
874 		amask = vm_active_cpus(vm);
875 		while ((vcpuid = CPU_FFS(&amask)) != 0) {
876 			vcpuid--;
877 			CPU_CLR(vcpuid, &amask);
878 
879 			vlapic = vm_lapic(vm, vcpuid);
880 			dfr = vlapic->apic_page->dfr;
881 			ldr = vlapic->apic_page->ldr;
882 
883 			if ((dfr & APIC_DFR_MODEL_MASK) ==
884 			    APIC_DFR_MODEL_FLAT) {
885 				ldest = ldr >> 24;
886 				mda_ldest = mda_flat_ldest;
887 			} else if ((dfr & APIC_DFR_MODEL_MASK) ==
888 			    APIC_DFR_MODEL_CLUSTER) {
889 				if (vlapic_x2mode(vlapic)) {
890 					cluster = ldr >> 16;
891 					ldest = ldr & 0xffff;
892 				} else {
893 					cluster = ldr >> 28;
894 					ldest = (ldr >> 24) & 0xf;
895 				}
896 				if (cluster != mda_cluster_id)
897 					continue;
898 				mda_ldest = mda_cluster_ldest;
899 			} else {
900 				/*
901 				 * Guest has configured a bad logical
902 				 * model for this vcpu - skip it.
903 				 */
904 				continue;
905 			}
906 
907 			if ((mda_ldest & ldest) != 0) {
908 				CPU_SET(vcpuid, dmask);
909 				if (lowprio)
910 					break;
911 			}
912 		}
913 	}
914 }
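
#if 0
/*
 * Illustrative sketch (not compiled): 'vlapic_calcdest_example' is a
 * hypothetical helper showing how an xAPIC logical-cluster destination
 * resolves.  An MDA of 0x23 names cluster 2 with member bitmask 0x3; any
 * active vCPU whose DFR selects the cluster model, whose LDR[31:28] is 2,
 * and whose LDR[27:24] intersects 0x3 lands in 'dmask'.
 */
static void
vlapic_calcdest_example(struct vm *vm)
{
	cpuset_t dmask;

	/* dest=0x23, logical (phys=false), no lowprio, xAPIC format */
	vlapic_calcdest(vm, &dmask, 0x23, false, false, false);
}
#endif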
915 
916 static VMM_STAT(VLAPIC_IPI_SEND, "ipis sent from vcpu");
917 static VMM_STAT(VLAPIC_IPI_RECV, "ipis received by vcpu");
918 
919 static void
920 vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
921 {
922 	struct LAPIC *lapic = vlapic->apic_page;
923 
924 	if (lapic->tpr != val) {
925 		lapic->tpr = val;
926 		vlapic_update_ppr(vlapic);
927 	}
928 }
929 
930 void
931 vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
932 {
933 	uint8_t tpr;
934 
935 	if (val & ~0xf) {
936 		vm_inject_gp(vlapic->vm, vlapic->vcpuid);
937 		return;
938 	}
939 
940 	tpr = val << 4;
941 	vlapic_set_tpr(vlapic, tpr);
942 }
943 
944 uint64_t
945 vlapic_get_cr8(const struct vlapic *vlapic)
946 {
947 	const struct LAPIC *lapic = vlapic->apic_page;
948 
949 	return (lapic->tpr >> 4);
950 }
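
/*
 * Example (illustrative): a guest write of 0x3 to %cr8 maps to a TPR of
 * 0x30, and reading %cr8 back recovers 0x3 from TPR bits 7:4.
 */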
951 
952 static bool
953 vlapic_is_icr_valid(uint64_t icrval)
954 {
955 	uint32_t mode = icrval & APIC_DELMODE_MASK;
956 	uint32_t level = icrval & APIC_LEVEL_MASK;
957 	uint32_t trigger = icrval & APIC_TRIGMOD_MASK;
958 	uint32_t shorthand = icrval & APIC_DEST_MASK;
959 
960 	switch (mode) {
961 	case APIC_DELMODE_FIXED:
962 		if (trigger == APIC_TRIGMOD_EDGE)
963 			return (true);
964 		/*
965 		 * AMD allows a level assert IPI and Intel converts a level
966 		 * assert IPI into an edge IPI.
967 		 */
968 		if (trigger == APIC_TRIGMOD_LEVEL && level == APIC_LEVEL_ASSERT)
969 			return (true);
970 		break;
971 	case APIC_DELMODE_LOWPRIO:
972 	case APIC_DELMODE_SMI:
973 	case APIC_DELMODE_NMI:
974 	case APIC_DELMODE_INIT:
975 		if (trigger == APIC_TRIGMOD_EDGE &&
976 		    (shorthand == APIC_DEST_DESTFLD ||
977 		    shorthand == APIC_DEST_ALLESELF)) {
978 			return (true);
979 		}
980 		/*
981 		 * AMD allows a level assert IPI and Intel converts a level
982 		 * assert IPI into an edge IPI.
983 		 */
984 		if (trigger == APIC_TRIGMOD_LEVEL &&
985 		    level == APIC_LEVEL_ASSERT &&
986 		    (shorthand == APIC_DEST_DESTFLD ||
987 		    shorthand == APIC_DEST_ALLESELF)) {
988 			return (true);
989 		}
990 		/*
991 		 * A level triggered deassert INIT is defined in the Intel
992 		 * Multiprocessor Specification and the Intel Software Developer
993 		 * Manual. Due to the MPS it's required to send a level assert
994 		 * INIT to a cpu and then a level deassert INIT. Some operating
995 		 * systems e.g. FreeBSD or Linux use that algorithm. According
996 		 * to the SDM a level deassert INIT is only supported by Pentium
997 		 * and P6 processors. It's always sent to all cpus regardless of
998 		 * the destination or shorthand field. It resets the arbitration
999 		 * id register. This register is not software accessible and
1000 		 * only required for the APIC bus arbitration. So, the level
1001 		 * deassert INIT doesn't need any emulation and we should ignore
1002 		 * it. The SDM also defines that newer processors don't support
1003 		 * the level deassert INIT and it's not valid any more. As it's
1004 		 * defined for older systems, it can't be invalid per se.
1005 		 * Otherwise, backward compatibility would be broken. However,
1006 		 * when returning false here, it'll be ignored which is the
1007 		 * desired behaviour.
1008 		 */
1009 		if (mode == APIC_DELMODE_INIT &&
1010 		    trigger == APIC_TRIGMOD_LEVEL &&
1011 		    level == APIC_LEVEL_DEASSERT) {
1012 			return (false);
1013 		}
1014 		break;
1015 	case APIC_DELMODE_STARTUP:
1016 		if (shorthand == APIC_DEST_DESTFLD ||
1017 		    shorthand == APIC_DEST_ALLESELF) {
1018 			return (true);
1019 		}
1020 		break;
1021 	case APIC_DELMODE_RR:
1022 		/* Only available on AMD! */
1023 		if (trigger == APIC_TRIGMOD_EDGE &&
1024 		    shorthand == APIC_DEST_DESTFLD) {
1025 			return (true);
1026 		}
1027 		break;
1028 	case APIC_DELMODE_RESV:
1029 		return (false);
1030 	default:
1031 		panic("vlapic_is_icr_valid: invalid mode 0x%08x", mode);
1032 	}
1033 
1034 	return (false);
1035 }
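
/*
 * Examples (illustrative): a fixed, edge-triggered IPI using the
 * all-excluding-self shorthand (APIC_DEST_ALLESELF | APIC_DELMODE_FIXED |
 * vector) is accepted, while a level-deassert INIT (APIC_DELMODE_INIT |
 * APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT) is rejected and thus ignored
 * by the caller.
 */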
1036 
1037 void
1038 vlapic_icrlo_write_handler(struct vlapic *vlapic)
1039 {
1040 	int i;
1041 	cpuset_t dmask;
1042 	uint64_t icrval;
1043 	uint32_t dest, vec, mode, dsh;
1044 	struct LAPIC *lapic;
1045 
1046 	lapic = vlapic->apic_page;
1047 	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
1048 	icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
1049 
1050 	/*
1051 	 * Ignore invalid combinations of the icr.
1052 	 */
1053 	if (!vlapic_is_icr_valid(icrval))
1054 		return;
1055 
1056 	if (vlapic_x2mode(vlapic))
1057 		dest = icrval >> 32;
1058 	else
1059 		dest = icrval >> (32 + 24);
1060 	vec = icrval & APIC_VECTOR_MASK;
1061 	mode = icrval & APIC_DELMODE_MASK;
1062 	dsh = icrval & APIC_DEST_MASK;
1063 
1064 	if (mode == APIC_DELMODE_FIXED && vec < 16) {
1065 		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
1066 		return;
1067 	}
1068 
1069 	if (mode == APIC_DELMODE_INIT &&
1070 	    (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
1071 		/* No work required to deassert INIT */
1072 		return;
1073 	}
1074 
1075 	switch (dsh) {
1076 	case APIC_DEST_DESTFLD:
1077 		vlapic_calcdest(vlapic->vm, &dmask, dest,
1078 		    (icrval & APIC_DESTMODE_LOG) == 0, false,
1079 		    vlapic_x2mode(vlapic));
1080 		break;
1081 	case APIC_DEST_SELF:
1082 		CPU_SETOF(vlapic->vcpuid, &dmask);
1083 		break;
1084 	case APIC_DEST_ALLISELF:
1085 		dmask = vm_active_cpus(vlapic->vm);
1086 		break;
1087 	case APIC_DEST_ALLESELF:
1088 		dmask = vm_active_cpus(vlapic->vm);
1089 		CPU_CLR(vlapic->vcpuid, &dmask);
1090 		break;
1091 	default:
1092 		/*
1093 		 * All possible delivery shorthands are covered above.
1094 		 * We should never end up here.
1095 		 */
1096 		panic("unknown delivery shorthand: %x", dsh);
1097 	}
1098 
1099 	while ((i = CPU_FFS(&dmask)) != 0) {
1100 		i--;
1101 		CPU_CLR(i, &dmask);
1102 		switch (mode) {
1103 		case APIC_DELMODE_FIXED:
1104 			(void) lapic_intr_edge(vlapic->vm, i, vec);
1105 			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
1106 			    VLAPIC_IPI_SEND, 1);
1107 			vmm_stat_incr(vlapic->vm, i,
1108 			    VLAPIC_IPI_RECV, 1);
1109 			break;
1110 		case APIC_DELMODE_NMI:
1111 			(void) vm_inject_nmi(vlapic->vm, i);
1112 			break;
1113 		case APIC_DELMODE_INIT:
1114 			(void) vm_inject_init(vlapic->vm, i);
1115 			break;
1116 		case APIC_DELMODE_STARTUP:
1117 			(void) vm_inject_sipi(vlapic->vm, i, vec);
1118 			break;
1119 		case APIC_DELMODE_LOWPRIO:
1120 		case APIC_DELMODE_SMI:
1121 		default:
1122 			/* Unhandled IPI modes (for now) */
1123 			break;
1124 		}
1125 	}
1126 }
1127 
1128 void
1129 vlapic_self_ipi_handler(struct vlapic *vlapic, uint32_t val)
1130 {
1131 	const int vec = val & 0xff;
1132 
1133 	/* self-IPI is only exposed via x2APIC */
1134 	ASSERT(vlapic_x2mode(vlapic));
1135 
1136 	(void) lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
1137 	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_SEND, 1);
1138 	vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_IPI_RECV, 1);
1139 }
1140 
1141 int
1142 vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
1143 {
1144 	struct LAPIC	*lapic = vlapic->apic_page;
1145 	int		 idx, i, bitpos, vector;
1146 	uint32_t	*irrptr, val;
1147 
1148 	if (vlapic->ops.sync_state) {
1149 		(*vlapic->ops.sync_state)(vlapic);
1150 	}
1151 
1152 	irrptr = &lapic->irr0;
1153 
1154 	for (i = 7; i >= 0; i--) {
1155 		idx = i * 4;
1156 		val = atomic_load_acq_int(&irrptr[idx]);
1157 		bitpos = fls(val);
1158 		if (bitpos != 0) {
1159 			vector = i * 32 + (bitpos - 1);
1160 			if (PRIO(vector) > PRIO(lapic->ppr)) {
1161 				if (vecptr != NULL)
1162 					*vecptr = vector;
1163 				return (1);
1164 			} else
1165 				break;
1166 		}
1167 	}
1168 	return (0);
1169 }
1170 
1171 void
1172 vlapic_intr_accepted(struct vlapic *vlapic, int vector)
1173 {
1174 	struct LAPIC	*lapic = vlapic->apic_page;
1175 	uint32_t	*irrptr, *isrptr;
1176 	int		idx;
1177 
1178 	KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector));
1179 
1180 	if (vlapic->ops.intr_accepted)
1181 		return ((*vlapic->ops.intr_accepted)(vlapic, vector));
1182 
1183 	/*
1184 	 * Clear the ready bit for the vector being accepted in IRR
1185 	 * and set the vector as in-service in ISR.
1186 	 */
1187 	idx = (vector / 32) * 4;
1188 
1189 	irrptr = &lapic->irr0;
1190 	atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
1191 
1192 	isrptr = &lapic->isr0;
1193 	isrptr[idx] |= 1 << (vector % 32);
1194 
1195 	/*
1196 	 * The only way a fresh vector could be accepted into ISR is if it was
1197 	 * of a higher priority than the current PPR.  With that vector now
1198 	 * in-service, the PPR must be raised.
1199 	 */
1200 	vlapic_raise_ppr(vlapic, vector);
1201 }
1202 
1203 void
1204 vlapic_svr_write_handler(struct vlapic *vlapic)
1205 {
1206 	struct LAPIC *lapic;
1207 	uint32_t old, new, changed;
1208 
1209 	lapic = vlapic->apic_page;
1210 
1211 	new = lapic->svr;
1212 	old = vlapic->svr_last;
1213 	vlapic->svr_last = new;
1214 
1215 	changed = old ^ new;
1216 	if ((changed & APIC_SVR_ENABLE) != 0) {
1217 		if ((new & APIC_SVR_ENABLE) == 0) {
1218 			/*
1219 			 * The apic is now disabled so stop the apic timer
1220 			 * and mask all the LVT entries.
1221 			 */
1222 			VLAPIC_TIMER_LOCK(vlapic);
1223 			callout_stop(&vlapic->callout);
1224 			VLAPIC_TIMER_UNLOCK(vlapic);
1225 			vlapic_mask_lvts(vlapic);
1226 		} else {
1227 			/*
1228 			 * The apic is now enabled so restart the apic timer
1229 			 * if it is configured in periodic mode.
1230 			 */
1231 			if (vlapic_periodic_timer(vlapic))
1232 				vlapic_icrtmr_write_handler(vlapic);
1233 		}
1234 	}
1235 }
1236 
1237 static bool
1238 vlapic_read(struct vlapic *vlapic, uint16_t offset, uint32_t *outp)
1239 {
1240 	struct LAPIC *lapic = vlapic->apic_page;
1241 	uint32_t *reg;
1242 	int i;
1243 
1244 	ASSERT3U(offset & 0x3, ==, 0);
1245 	ASSERT3U(offset, <, PAGESIZE);
1246 	ASSERT3P(outp, !=, NULL);
1247 
1248 	uint32_t data = 0;
1249 	switch (offset) {
1250 	case APIC_OFFSET_ID:
1251 		data = lapic->id;
1252 		break;
1253 	case APIC_OFFSET_VER:
1254 		data = lapic->version;
1255 		break;
1256 	case APIC_OFFSET_TPR:
1257 		data = lapic->tpr;
1258 		break;
1259 	case APIC_OFFSET_APR:
1260 		data = lapic->apr;
1261 		break;
1262 	case APIC_OFFSET_PPR:
1263 		data = lapic->ppr;
1264 		break;
1265 	case APIC_OFFSET_LDR:
1266 		data = lapic->ldr;
1267 		break;
1268 	case APIC_OFFSET_DFR:
1269 		data = lapic->dfr;
1270 		break;
1271 	case APIC_OFFSET_SVR:
1272 		data = lapic->svr;
1273 		break;
1274 	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1275 		i = (offset - APIC_OFFSET_ISR0) >> 2;
1276 		reg = &lapic->isr0;
1277 		data = *(reg + i);
1278 		break;
1279 	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1280 		i = (offset - APIC_OFFSET_TMR0) >> 2;
1281 		reg = &lapic->tmr0;
1282 		data = *(reg + i);
1283 		break;
1284 	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1285 		i = (offset - APIC_OFFSET_IRR0) >> 2;
1286 		reg = &lapic->irr0;
1287 		data = atomic_load_acq_int(reg + i);
1288 		break;
1289 	case APIC_OFFSET_ESR:
1290 		data = lapic->esr;
1291 		break;
1292 	case APIC_OFFSET_ICR_LOW:
1293 		data = lapic->icr_lo;
1294 		break;
1295 	case APIC_OFFSET_ICR_HI:
1296 		data = lapic->icr_hi;
1297 		break;
1298 	case APIC_OFFSET_CMCI_LVT:
1299 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1300 		data = vlapic_get_lvt(vlapic, offset);
1301 #ifdef INVARIANTS
1302 		reg = vlapic_get_lvtptr(vlapic, offset);
1303 		ASSERT3U(data, ==, *reg);
1304 #endif
1305 		break;
1306 	case APIC_OFFSET_TIMER_ICR:
1307 		data = lapic->icr_timer;
1308 		break;
1309 	case APIC_OFFSET_TIMER_CCR:
1310 		data = vlapic_get_ccr(vlapic);
1311 		break;
1312 	case APIC_OFFSET_TIMER_DCR:
1313 		data = lapic->dcr_timer;
1314 		break;
1315 	case APIC_OFFSET_RRR:
1316 		data = 0;
1317 		break;
1318 
1319 	case APIC_OFFSET_SELF_IPI:
1320 	case APIC_OFFSET_EOI:
1321 		/* Write-only register */
1322 		*outp = 0;
1323 		return (false);
1324 
1325 	default:
1326 		/* Invalid register */
1327 		*outp = 0;
1328 		return (false);
1329 	}
1330 
1331 	*outp = data;
1332 	return (true);
1333 }
1334 
1335 static bool
1336 vlapic_write(struct vlapic *vlapic, uint16_t offset, uint32_t data)
1337 {
1338 	struct LAPIC	*lapic = vlapic->apic_page;
1339 	uint32_t	*regptr;
1340 
1341 	ASSERT3U(offset & 0xf, ==, 0);
1342 	ASSERT3U(offset, <, PAGESIZE);
1343 
1344 	switch (offset) {
1345 	case APIC_OFFSET_ID:
1346 		lapic->id = data;
1347 		vlapic_id_write_handler(vlapic);
1348 		break;
1349 	case APIC_OFFSET_TPR:
1350 		vlapic_set_tpr(vlapic, data & 0xff);
1351 		break;
1352 	case APIC_OFFSET_EOI:
1353 		vlapic_process_eoi(vlapic);
1354 		break;
1355 	case APIC_OFFSET_LDR:
1356 		lapic->ldr = data;
1357 		vlapic_ldr_write_handler(vlapic);
1358 		break;
1359 	case APIC_OFFSET_DFR:
1360 		lapic->dfr = data;
1361 		vlapic_dfr_write_handler(vlapic);
1362 		break;
1363 	case APIC_OFFSET_SVR:
1364 		lapic->svr = data;
1365 		vlapic_svr_write_handler(vlapic);
1366 		break;
1367 	case APIC_OFFSET_ICR_LOW:
1368 		lapic->icr_lo = data;
1369 		vlapic_icrlo_write_handler(vlapic);
1370 		break;
1371 	case APIC_OFFSET_ICR_HI:
1372 		lapic->icr_hi = data;
1373 		break;
1374 	case APIC_OFFSET_CMCI_LVT:
1375 	case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
1376 		regptr = vlapic_get_lvtptr(vlapic, offset);
1377 		*regptr = data;
1378 		vlapic_lvt_write_handler(vlapic, offset);
1379 		break;
1380 	case APIC_OFFSET_TIMER_ICR:
1381 		lapic->icr_timer = data;
1382 		vlapic_icrtmr_write_handler(vlapic);
1383 		break;
1384 
1385 	case APIC_OFFSET_TIMER_DCR:
1386 		lapic->dcr_timer = data;
1387 		vlapic_dcr_write_handler(vlapic);
1388 		break;
1389 
1390 	case APIC_OFFSET_ESR:
1391 		vlapic_esr_write_handler(vlapic);
1392 		break;
1393 
1394 	case APIC_OFFSET_SELF_IPI:
1395 		if (vlapic_x2mode(vlapic))
1396 			vlapic_self_ipi_handler(vlapic, data);
1397 		break;
1398 
1399 	case APIC_OFFSET_VER:
1400 	case APIC_OFFSET_APR:
1401 	case APIC_OFFSET_PPR:
1402 	case APIC_OFFSET_RRR:
1403 	case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
1404 	case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
1405 	case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
1406 	case APIC_OFFSET_TIMER_CCR:
1407 		/* Read-only register */
1408 		return (false);
1409 
1410 	default:
1411 		/* Invalid register */
1412 		return (false);
1413 	}
1414 
1415 	return (true);
1416 }
1417 
1418 void
1419 vlapic_reset(struct vlapic *vlapic)
1420 {
1421 	struct LAPIC *lapic = vlapic->apic_page;
1422 	uint32_t *isrptr, *tmrptr, *irrptr;
1423 
1424 	/* Reset any timer-related state first */
1425 	VLAPIC_TIMER_LOCK(vlapic);
1426 	callout_stop(&vlapic->callout);
1427 	lapic->icr_timer = 0;
1428 	lapic->ccr_timer = 0;
1429 	lapic->dcr_timer = 0;
1430 	vlapic_update_divider(vlapic);
1431 	VLAPIC_TIMER_UNLOCK(vlapic);
1432 
1433 	/*
1434 	 * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so
1435 	 * it is not leftover after the reset.  This is performed after the APIC
1436 	 * timer has been stopped, in case it happened to fire just prior to
1437 	 * being deactivated.
1438 	 */
1439 	if (vlapic->ops.sync_state) {
1440 		(*vlapic->ops.sync_state)(vlapic);
1441 	}
1442 
1443 	vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
1444 	if (vlapic->vcpuid == 0)
1445 		vlapic->msr_apicbase |= APICBASE_BSP;
1446 
1447 	lapic->id = vlapic_get_id(vlapic);
1448 	lapic->version = VLAPIC_VERSION;
1449 	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
1450 
1451 	lapic->tpr = 0;
1452 	lapic->apr = 0;
1453 	lapic->ppr = 0;
1454 
1455 	lapic->eoi = 0;
1456 	lapic->ldr = 0;
1457 	lapic->dfr = 0xffffffff;
1458 	lapic->svr = APIC_SVR_VECTOR;
1459 	vlapic->svr_last = lapic->svr;
1460 
1461 	isrptr = &lapic->isr0;
1462 	tmrptr = &lapic->tmr0;
1463 	irrptr = &lapic->irr0;
1464 	for (uint_t i = 0; i < 8; i++) {
1465 		atomic_store_rel_int(&isrptr[i * 4], 0);
1466 		atomic_store_rel_int(&tmrptr[i * 4], 0);
1467 		atomic_store_rel_int(&irrptr[i * 4], 0);
1468 	}
1469 
1470 	lapic->esr = 0;
1471 	vlapic->esr_pending = 0;
1472 	lapic->icr_lo = 0;
1473 	lapic->icr_hi = 0;
1474 
1475 	lapic->lvt_cmci = 0;
1476 	lapic->lvt_timer = 0;
1477 	lapic->lvt_thermal = 0;
1478 	lapic->lvt_pcint = 0;
1479 	lapic->lvt_lint0 = 0;
1480 	lapic->lvt_lint1 = 0;
1481 	lapic->lvt_error = 0;
1482 	vlapic_mask_lvts(vlapic);
1483 }
1484 
1485 void
1486 vlapic_init(struct vlapic *vlapic)
1487 {
1488 	KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
1489 	KASSERT(vlapic->vcpuid >= 0 &&
1490 	    vlapic->vcpuid < vm_get_maxcpus(vlapic->vm),
1491 	    ("vlapic_init: vcpuid is not initialized"));
1492 	KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
1493 	    "initialized"));
1494 
1495 	/*
1496 	 * If the vlapic is configured in x2apic mode then it may be accessed
1497 	 * from a critical section via the MSR emulation code.
1498 	 *
1499 	 * Note: the FreeBSD original used a spin-type mutex here for that
1500 	 * reason; this port uses an adaptive mutex instead.
1501 	 */
1502 	mutex_init(&vlapic->timer_lock, NULL, MUTEX_ADAPTIVE, NULL);
1503 	callout_init(&vlapic->callout, 1);
1504 
1505 	vlapic_reset(vlapic);
1506 }
1507 
1508 void
1509 vlapic_cleanup(struct vlapic *vlapic)
1510 {
1511 	callout_drain(&vlapic->callout);
1512 	mutex_destroy(&vlapic->timer_lock);
1513 }
1514 
1515 int
1516 vlapic_mmio_read(struct vlapic *vlapic, uint64_t gpa, uint64_t *valp,
1517     uint_t size)
1518 {
1519 	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
1520 	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);
1521 
1522 	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
1523 	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
1524 		*valp = UINT64_MAX;
1525 		return (0);
1526 	}
1527 
1528 	const uint16_t off = gpa - DEFAULT_APIC_BASE;
1529 	uint32_t raw = 0;
1530 	(void) vlapic_read(vlapic, off & ~0xf, &raw);
1531 
1532 	/* Shift and mask reads which are small and/or unaligned */
1533 	const uint8_t align = off & 0xf;
1534 	if (align < 4) {
1535 		*valp = (uint64_t)raw << (align * 8);
1536 	} else {
1537 		*valp = 0;
1538 	}
1539 
1540 	return (0);
1541 }
1542 
1543 int
1544 vlapic_mmio_write(struct vlapic *vlapic, uint64_t gpa, uint64_t val,
1545     uint_t size)
1546 {
1547 	ASSERT3U(gpa, >=, DEFAULT_APIC_BASE);
1548 	ASSERT3U(gpa, <, DEFAULT_APIC_BASE + PAGE_SIZE);
1549 
1550 	/* Ignore MMIO accesses when in x2APIC mode or hardware disabled */
1551 	if (vlapic_x2mode(vlapic) || vlapic_hw_disabled(vlapic)) {
1552 		return (0);
1553 	}
1554 
1555 	const uint16_t off = gpa - DEFAULT_APIC_BASE;
1556 	/* Ignore writes which are not 32 bits wide or not 16-byte aligned */
1557 	if ((off & 0xf) != 0 || size != 4) {
1558 		return (0);
1559 	}
1560 
1561 	(void) vlapic_write(vlapic, off, (uint32_t)val);
1562 	return (0);
1563 }
1564 
1565 /* Should attempts to change the APIC base address be rejected with a #GP?  */
1566 int vlapic_gp_on_addr_change = 1;
1567 
1568 static vm_msr_result_t
1569 vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val)
1570 {
1571 	const uint64_t diff = vlapic->msr_apicbase ^ val;
1572 
1573 	/*
1574 	 * Until the LAPIC emulation for switching between xAPIC and x2APIC
1575 	 * modes is more polished, it will remain off-limits from being altered
1576 	 * by the guest.
1577 	 */
1578 	const uint64_t reserved_bits = APICBASE_RESERVED | APICBASE_X2APIC |
1579 	    APICBASE_BSP;
1580 	if ((diff & reserved_bits) != 0) {
1581 		return (VMR_GP);
1582 	}
1583 
1584 	/* We do not presently allow the LAPIC access address to be modified. */
1585 	if ((diff & APICBASE_ADDR_MASK) != 0) {
1586 		/*
1587 		 * Explicitly rebuffing such requests with a #GP is the most
1588 		 * straightforward way to handle the situation, but certain
1589 		 * consumers (such as the KVM unit tests) may balk at the
1590 		 * otherwise unexpected exception.
1591 		 */
1592 		if (vlapic_gp_on_addr_change) {
1593 			return (VMR_GP);
1594 		}
1595 
1596 		/* If silence is required, just ignore the address change. */
1597 		val = (val & ~APICBASE_ADDR_MASK) | DEFAULT_APIC_BASE;
1598 	}
1599 
1600 	vlapic->msr_apicbase = val;
1601 	return (VMR_OK);
1602 }
1603 
1604 static __inline uint16_t
1605 vlapic_msr_to_regoff(uint32_t msr)
1606 {
1607 	ASSERT3U(msr, >=, MSR_APIC_000);
1608 	ASSERT3U(msr, <, (MSR_APIC_000 + 0x100));
1609 
1610 	return ((msr - MSR_APIC_000) << 4);
1611 }
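
/*
 * Example (illustrative): MSR 0x808 (the x2APIC TPR) maps to
 * (0x808 - MSR_APIC_000) << 4 == 0x80 == APIC_OFFSET_TPR, while MSR 0x830
 * (the x2APIC ICR) maps to 0x300 == APIC_OFFSET_ICR_LOW.
 */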
1612 
1613 bool
1614 vlapic_owned_msr(uint32_t msr)
1615 {
1616 	if (msr == MSR_APICBASE) {
1617 		return (true);
1618 	}
1619 	if (msr >= MSR_APIC_000 &&
1620 	    msr < (MSR_APIC_000 + 0x100)) {
1621 		return (true);
1622 	}
1623 	return (false);
1624 }
1625 
1626 vm_msr_result_t
1627 vlapic_rdmsr(struct vlapic *vlapic, uint32_t msr, uint64_t *valp)
1628 {
1629 	ASSERT(vlapic_owned_msr(msr));
1630 	ASSERT3P(valp, !=, NULL);
1631 
1632 	if (msr == MSR_APICBASE) {
1633 		*valp = vlapic->msr_apicbase;
1634 		return (VMR_OK);
1635 	}
1636 
1637 	/* #GP for x2APIC MSR accesses in xAPIC mode */
1638 	if (!vlapic_x2mode(vlapic)) {
1639 		return (VMR_GP);
1640 	}
1641 
1642 	uint64_t out = 0;
1643 	const uint16_t reg = vlapic_msr_to_regoff(msr);
1644 	switch (reg) {
1645 	case APIC_OFFSET_ICR_LOW: {
1646 		/* Read from ICR register gets entire (64-bit) value */
1647 		uint32_t low = 0, high = 0;
1648 		bool valid;
1649 
1650 		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_HI, &high);
1651 		VERIFY(valid);
1652 		valid = vlapic_read(vlapic, APIC_OFFSET_ICR_LOW, &low);
1653 		VERIFY(valid);
1654 
1655 		*valp = ((uint64_t)high << 32) | low;
1656 		return (VMR_OK);
1657 		}
1658 	case APIC_OFFSET_ICR_HI:
1659 		/* Already covered by ICR_LOW */
1660 		return (VMR_GP);
1661 	default:
1662 		break;
1663 	}
1664 	if (!vlapic_read(vlapic, reg, (uint32_t *)&out)) {
1665 		return (VMR_GP);
1666 	}
1667 	*valp = out;
1668 	return (VMR_OK);
1669 }
1670 
1671 vm_msr_result_t
1672 vlapic_wrmsr(struct vlapic *vlapic, uint32_t msr, uint64_t val)
1673 {
1674 	ASSERT(vlapic_owned_msr(msr));
1675 
1676 	if (msr == MSR_APICBASE) {
1677 		return (vlapic_set_apicbase(vlapic, val));
1678 	}
1679 
1680 	/* #GP for x2APIC MSR accesses in xAPIC mode */
1681 	if (!vlapic_x2mode(vlapic)) {
1682 		return (VMR_GP);
1683 	}
1684 
1685 	const uint16_t reg = vlapic_msr_to_regoff(msr);
1686 	switch (reg) {
1687 	case APIC_OFFSET_ICR_LOW: {
1688 		/* Write to ICR register sets entire (64-bit) value */
1689 		bool valid;
1690 
1691 		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_HI, val >> 32);
1692 		VERIFY(valid);
1693 		valid = vlapic_write(vlapic, APIC_OFFSET_ICR_LOW, val);
1694 		VERIFY(valid);
1695 		return (VMR_OK);
1696 		}
1697 	case APIC_OFFSET_ICR_HI:
1698 		/* Already covered by ICR_LOW */
1699 		return (VMR_GP);
1700 	case APIC_OFFSET_ESR:
1701 		/* Only 0 may be written from x2APIC mode */
1702 		if (val != 0) {
1703 			return (VMR_GP);
1704 		}
1705 		break;
1706 	default:
1707 		break;
1708 	}
1709 	if (!vlapic_write(vlapic, reg, val)) {
1710 		return (VMR_GP);
1711 	}
1712 	return (VMR_OK);
1713 }
1714 
1715 void
1716 vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1717 {
1718 	struct vlapic *vlapic;
1719 	struct LAPIC *lapic;
1720 
1721 	vlapic = vm_lapic(vm, vcpuid);
1722 
1723 	if (state == X2APIC_DISABLED)
1724 		vlapic->msr_apicbase &= ~APICBASE_X2APIC;
1725 	else
1726 		vlapic->msr_apicbase |= APICBASE_X2APIC;
1727 
1728 	/*
1729 	 * Reset the local APIC registers whose values are mode-dependent.
1730 	 *
1731 	 * XXX this works because the APIC mode can be changed only at vcpu
1732 	 * initialization time.
1733 	 */
1734 	lapic = vlapic->apic_page;
1735 	lapic->id = vlapic_get_id(vlapic);
1736 	if (vlapic_x2mode(vlapic)) {
1737 		lapic->ldr = x2apic_ldr(vlapic);
1738 		lapic->dfr = 0;
1739 	} else {
1740 		lapic->ldr = 0;
1741 		lapic->dfr = 0xffffffff;
1742 	}
1743 
1744 	if (state == X2APIC_ENABLED) {
1745 		if (vlapic->ops.enable_x2apic_mode)
1746 			(*vlapic->ops.enable_x2apic_mode)(vlapic);
1747 	}
1748 }
1749 
1750 void
1751 vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
1752     int delmode, int vec)
1753 {
1754 	bool lowprio;
1755 	int vcpuid;
1756 	cpuset_t dmask;
1757 
1758 	if (delmode != IOART_DELFIXED &&
1759 	    delmode != IOART_DELLOPRI &&
1760 	    delmode != IOART_DELEXINT) {
1761 		/* Invalid delivery mode */
1762 		return;
1763 	}
1764 	lowprio = (delmode == IOART_DELLOPRI);
1765 
1766 	/*
1767 	 * We don't provide any virtual interrupt redirection hardware so
1768 	 * all interrupts originating from the ioapic or MSI specify the
1769 	 * 'dest' in the legacy xAPIC format.
1770 	 */
1771 	vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);
1772 
1773 	while ((vcpuid = CPU_FFS(&dmask)) != 0) {
1774 		vcpuid--;
1775 		CPU_CLR(vcpuid, &dmask);
1776 		if (delmode == IOART_DELEXINT) {
1777 			(void) vm_inject_extint(vm, vcpuid);
1778 		} else {
1779 			(void) lapic_set_intr(vm, vcpuid, vec, level);
1780 		}
1781 	}
1782 }
1783 
1784 void
1785 vlapic_post_intr(struct vlapic *vlapic, int hostcpu)
1786 {
1787 	/*
1788 	 * Post an interrupt to the vcpu currently running on 'hostcpu'.
1789 	 *
1790 	 * This is done by leveraging features like Posted Interrupts (Intel)
1791 	 * or the Doorbell MSR (AMD AVIC) that avoid a VM exit.
1792 	 *
1793 	 * If neither of these features is available then fall back to
1794 	 * sending an IPI to 'hostcpu'.
1795 	 */
1796 	if (vlapic->ops.post_intr)
1797 		(*vlapic->ops.post_intr)(vlapic, hostcpu);
1798 	else
1799 		poke_cpu(hostcpu);
1800 }
1801 
1802 void
1803 vlapic_localize_resources(struct vlapic *vlapic)
1804 {
1805 	vmm_glue_callout_localize(&vlapic->callout);
1806 }
1807 
1808 void
1809 vlapic_pause(struct vlapic *vlapic)
1810 {
1811 	VLAPIC_TIMER_LOCK(vlapic);
1812 	callout_stop(&vlapic->callout);
1813 	VLAPIC_TIMER_UNLOCK(vlapic);
1814 
1815 }
1816 
1817 void
1818 vlapic_resume(struct vlapic *vlapic)
1819 {
1820 	VLAPIC_TIMER_LOCK(vlapic);
1821 	if (vlapic->timer_fire_when != 0) {
1822 		vlapic_callout_reset(vlapic);
1823 	}
1824 	VLAPIC_TIMER_UNLOCK(vlapic);
1825 }
1826 
1827 static int
1828 vlapic_data_read(void *datap, const vmm_data_req_t *req)
1829 {
1830 	VERIFY3U(req->vdr_class, ==, VDC_LAPIC);
1831 	VERIFY3U(req->vdr_version, ==, 1);
1832 	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_lapic_v1));
1833 
1834 	struct vlapic *vlapic = datap;
1835 	struct vdi_lapic_v1 *out = req->vdr_data;
1836 
1837 	VLAPIC_TIMER_LOCK(vlapic);
1838 
1839 	if (vlapic->ops.sync_state) {
1840 		(*vlapic->ops.sync_state)(vlapic);
1841 	}
1842 
1843 	out->vl_msr_apicbase = vlapic->msr_apicbase;
1844 	out->vl_esr_pending = vlapic->esr_pending;
1845 	if (vlapic->timer_fire_when != 0) {
1846 		out->vl_timer_target =
1847 		    vm_normalize_hrtime(vlapic->vm, vlapic->timer_fire_when);
1848 	} else {
1849 		out->vl_timer_target = 0;
1850 	}
1851 
1852 	const struct LAPIC *lapic = vlapic->apic_page;
1853 	struct vdi_lapic_page_v1 *out_page = &out->vl_lapic;
1854 
1855 	/*
1856 	 * While this might appear, at first glance, to be missing some fields,
1857 	 * they are intentionally omitted:
1858 	 * - PPR: its contents are always generated at runtime
1859 	 * - EOI: write-only, and contents are ignored after handling
1860 	 * - RRD: (aka RRR) read-only and always 0
1861 	 * - CCR: calculated from underlying timer data
1862 	 */
1863 	out_page->vlp_id = lapic->id;
1864 	out_page->vlp_version = lapic->version;
1865 	out_page->vlp_tpr = lapic->tpr;
1866 	out_page->vlp_apr = lapic->apr;
1867 	out_page->vlp_ldr = lapic->ldr;
1868 	out_page->vlp_dfr = lapic->dfr;
1869 	out_page->vlp_svr = lapic->svr;
1870 	out_page->vlp_esr = lapic->esr;
1871 	out_page->vlp_icr = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
1872 	out_page->vlp_icr_timer = lapic->icr_timer;
1873 	out_page->vlp_dcr_timer = lapic->dcr_timer;
1874 
1875 	out_page->vlp_lvt_cmci = lapic->lvt_cmci;
1876 	out_page->vlp_lvt_timer = lapic->lvt_timer;
1877 	out_page->vlp_lvt_thermal = lapic->lvt_thermal;
1878 	out_page->vlp_lvt_pcint = lapic->lvt_pcint;
1879 	out_page->vlp_lvt_lint0 = lapic->lvt_lint0;
1880 	out_page->vlp_lvt_lint1 = lapic->lvt_lint1;
1881 	out_page->vlp_lvt_error = lapic->lvt_error;
1882 
1883 	const uint32_t *isrptr = &lapic->isr0;
1884 	const uint32_t *tmrptr = &lapic->tmr0;
1885 	const uint32_t *irrptr = &lapic->irr0;
1886 	for (uint_t i = 0; i < 8; i++) {
1887 		out_page->vlp_isr[i] = isrptr[i * 4];
1888 		out_page->vlp_tmr[i] = tmrptr[i * 4];
1889 		out_page->vlp_irr[i] = irrptr[i * 4];
1890 	}
1891 	VLAPIC_TIMER_UNLOCK(vlapic);
1892 
1893 	return (0);
1894 }
1895 
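/*
 * Count the set bits in an 8-bit value using Kernighan's method: each
 * iteration of 'val &= (val - 1)' clears the lowest set bit, so the loop
 * runs once per set bit (e.g. popc8(0xa2) loops three times).
 */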
1896 static uint8_t
1897 popc8(uint8_t val)
1898 {
1899 	uint8_t cnt;
1900 
1901 	for (cnt = 0; val != 0; val &= (val - 1)) {
1902 		cnt++;
1903 	}
1904 	return (cnt);
1905 }
1906 
1907 /*
1908  * Descriptions for the various failures which can occur when validating
1909  * to-be-written vlapic state.
1910  */
1911 enum vlapic_validation_error {
1912 	VVE_OK,
1913 	VVE_BAD_ID,
1914 	VVE_BAD_VERSION,
1915 	VVE_BAD_MSR_BASE,
1916 	VVE_BAD_ESR,
1917 	VVE_BAD_TPR,
1918 	VVE_LOW_VECTOR,
1919 	VVE_ISR_PRIORITY,
1920 };
1921 
1922 static enum vlapic_validation_error
1923 vlapic_data_validate(const struct vlapic *vlapic, const vmm_data_req_t *req)
1924 {
1925 	ASSERT(req->vdr_version == 1 &&
1926 	    req->vdr_len >= sizeof (struct vdi_lapic_v1));
1927 	const struct vdi_lapic_v1 *src = req->vdr_data;
1928 
1929 	if ((src->vl_esr_pending & ~APIC_VALID_MASK_ESR) != 0 ||
1930 	    (src->vl_lapic.vlp_esr & ~APIC_VALID_MASK_ESR) != 0) {
1931 		return (VVE_BAD_ESR);
1932 	}
1933 
1934 	/* Use the same restrictions as the wrmsr accessor for now */
1935 	const uint64_t apicbase_reserved = APICBASE_RESERVED | APICBASE_X2APIC |
1936 	    APICBASE_BSP;
1937 	const uint64_t diff = src->vl_msr_apicbase ^ vlapic->msr_apicbase;
1938 	if ((diff & apicbase_reserved) != 0) {
1939 		return (VVE_BAD_MSR_BASE);
1940 	}
1941 
1942 	const struct vdi_lapic_page_v1 *page = &src->vl_lapic;
1943 	/*
1944 	 * Demand that ID match for now.  This can be further updated when some
1945 	 * of the x2apic handling is improved.
1946 	 */
1947 	if (page->vlp_id != vlapic_get_id(vlapic)) {
1948 		return (VVE_BAD_ID);
1949 	}
1950 
1951 	if (page->vlp_version != vlapic->apic_page->version) {
1952 		return (VVE_BAD_VERSION);
1953 	}
1954 
1955 	if (page->vlp_tpr > 0xff) {
1956 		return (VVE_BAD_TPR);
1957 	}
1958 
1959 	/* Vectors 0-15 are not expected to be handled by the lapic */
1960 	if ((page->vlp_isr[0] & 0xffff) != 0 ||
1961 	    (page->vlp_irr[0] & 0xffff) != 0 ||
1962 	    (page->vlp_tmr[0] & 0xffff) != 0) {
1963 		return (VVE_LOW_VECTOR);
1964 	}
1965 
1966 	/* Only one interrupt should be in-service for each priority level */
1967 	for (uint_t i = 0; i < 8; i++) {
1968 		if (popc8((uint8_t)page->vlp_isr[i]) > 1 ||
1969 		    popc8((uint8_t)(page->vlp_isr[i] >> 8)) > 1 ||
1970 		    popc8((uint8_t)(page->vlp_isr[i] >> 16)) > 1 ||
1971 		    popc8((uint8_t)(page->vlp_isr[i] >> 24)) > 1) {
1972 			return (VVE_ISR_PRIORITY);
1973 		}
1974 	}
1975 
1976 	return (VVE_OK);
1977 }
1978 
1979 static int
1980 vlapic_data_write(void *datap, const vmm_data_req_t *req)
1981 {
1982 	VERIFY3U(req->vdr_class, ==, VDC_LAPIC);
1983 	VERIFY3U(req->vdr_version, ==, 1);
1984 	VERIFY3U(req->vdr_len, >=, sizeof (struct vdi_lapic_v1));
1985 
1986 	struct vlapic *vlapic = datap;
1987 	if (vlapic_data_validate(vlapic, req) != VVE_OK) {
1988 		return (EINVAL);
1989 	}
1990 	const struct vdi_lapic_v1 *src = req->vdr_data;
1991 	const struct vdi_lapic_page_v1 *page = &src->vl_lapic;
1992 	struct LAPIC *lapic = vlapic->apic_page;
1993 
1994 	VLAPIC_TIMER_LOCK(vlapic);
1995 
1996 	/* Already ensured by vlapic_data_validate() */
1997 	VERIFY3U(page->vlp_version, ==, lapic->version);
1998 
1999 	vlapic->msr_apicbase = src->vl_msr_apicbase;
2000 	vlapic->esr_pending = src->vl_esr_pending;
2001 
2002 	lapic->tpr = page->vlp_tpr;
2003 	lapic->apr = page->vlp_apr;
2004 	lapic->ldr = page->vlp_ldr;
2005 	lapic->dfr = page->vlp_dfr;
2006 	lapic->svr = page->vlp_svr;
2007 	lapic->esr = page->vlp_esr;
2008 	lapic->icr_lo = (uint32_t)page->vlp_icr;
2009 	lapic->icr_hi = (uint32_t)(page->vlp_icr >> 32);
2010 
2011 	lapic->icr_timer = page->vlp_icr_timer;
2012 	lapic->dcr_timer = page->vlp_dcr_timer;
2013 	vlapic_update_divider(vlapic);
2014 
2015 	/* cleanse LDR/DFR */
2016 	vlapic_ldr_write_handler(vlapic);
2017 	vlapic_dfr_write_handler(vlapic);
2018 
2019 	lapic->lvt_cmci = page->vlp_lvt_cmci;
2020 	lapic->lvt_timer = page->vlp_lvt_timer;
2021 	lapic->lvt_thermal = page->vlp_lvt_thermal;
2022 	lapic->lvt_pcint = page->vlp_lvt_pcint;
2023 	lapic->lvt_lint0 = page->vlp_lvt_lint0;
2024 	lapic->lvt_lint1 = page->vlp_lvt_lint1;
2025 	lapic->lvt_error = page->vlp_lvt_error;
2026 	/* cleanse LVTs */
2027 	vlapic_refresh_lvts(vlapic);
2028 
2029 	uint32_t *isrptr = &lapic->isr0;
2030 	uint32_t *tmrptr = &lapic->tmr0;
2031 	uint32_t *irrptr = &lapic->irr0;
2032 	for (uint_t i = 0; i < 8; i++) {
2033 		isrptr[i * 4] = page->vlp_isr[i];
2034 		tmrptr[i * 4] = page->vlp_tmr[i];
2035 		irrptr[i * 4] = page->vlp_irr[i];
2036 	}
2037 
2038 	if (src->vl_timer_target != 0) {
2039 		vlapic->timer_fire_when =
2040 		    vm_denormalize_hrtime(vlapic->vm, src->vl_timer_target);
2041 
2042 		/*
2043 		 * Check to see if timer expiration would result in computed CCR
2044 		 * values in excess of what is configured in ICR/DCR.
2045 		 */
2046 		const hrtime_t now = gethrtime();
2047 		if (vlapic->timer_fire_when > now) {
2048 			const uint32_t ccr = hrt_freq_count(
2049 			    vlapic->timer_fire_when - now,
2050 			    vlapic->timer_cur_freq);
2051 
2052 			/*
2053 			 * Until we have a richer event/logging system
2054 			 * available, just note such an overage as a stat.
2055 			 */
2056 			if (ccr > lapic->icr_timer) {
2057 				vlapic->stats.vs_import_timer_overage++;
2058 			}
2059 		}
2060 
2061 		if (!vm_is_paused(vlapic->vm)) {
2062 			vlapic_callout_reset(vlapic);
2063 		}
2064 	} else {
2065 		vlapic->timer_fire_when = 0;
2066 	}
2067 
2068 	if (vlapic->ops.sync_state) {
2069 		(*vlapic->ops.sync_state)(vlapic);
2070 	}
2071 	VLAPIC_TIMER_UNLOCK(vlapic);
2072 
2073 	return (0);
2074 }
2075 
2076 static const vmm_data_version_entry_t lapic_v1 = {
2077 	.vdve_class = VDC_LAPIC,
2078 	.vdve_version = 1,
2079 	.vdve_len_expect = sizeof (struct vdi_lapic_v1),
2080 	.vdve_readf = vlapic_data_read,
2081 	.vdve_writef = vlapic_data_write,
2082 };
2083 VMM_DATA_VERSION(lapic_v1);
2084