1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 /*
32  * This file and its contents are supplied under the terms of the
33  * Common Development and Distribution License ("CDDL"), version 1.0.
34  * You may only use this file in accordance with the terms of version
35  * 1.0 of the CDDL.
36  *
37  * A full copy of the text of the CDDL should have accompanied this
38  * source.  A copy of the CDDL is also available via the Internet at
39  * http://www.illumos.org/license/CDDL.
40  *
41  * Copyright 2015 Pluribus Networks Inc.
42  * Copyright 2018 Joyent, Inc.
43  * Copyright 2021 Oxide Computer Company
44  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
45  */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #include <sys/param.h>
51 #include <sys/pcpu.h>
52 #include <sys/systm.h>
53 #include <sys/proc.h>
54 
55 #include <machine/vmparam.h>
56 #include <machine/vmm.h>
57 #include <sys/vmm_kernel.h>
58 #include <sys/vmm_vm.h>
59 
60 #include <sys/vmm_instruction_emul.h>
61 #include <x86/psl.h>
62 #include <x86/specialreg.h>
63 
64 #include "vmm_ioport.h"
65 
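/*
 * Flags tracking the progress of in-kernel instruction emulation: from
 * initial setup, through instruction fetch and decode, to any MMIO or
 * in/out requests which may need to be satisfied (potentially by userspace)
 * before the emulation is complete.
 */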
66 enum vie_status {
67 	VIES_INIT		= (1U << 0),
68 	VIES_MMIO		= (1U << 1),
69 	VIES_INOUT		= (1U << 2),
70 	VIES_OTHER		= (1U << 3),
71 	VIES_INST_FETCH		= (1U << 4),
72 	VIES_INST_DECODE	= (1U << 5),
73 	VIES_PENDING_MMIO	= (1U << 6),
74 	VIES_PENDING_INOUT	= (1U << 7),
75 	VIES_REPEAT		= (1U << 8),
76 	VIES_USER_FALLBACK	= (1U << 9),
77 	VIES_COMPLETE		= (1U << 10),
78 };
79 
80 /* State of request to perform emulated access (inout or MMIO) */
81 enum vie_req {
82 	VR_NONE,
83 	VR_PENDING,
84 	VR_DONE,
85 };
86 
87 struct vie_mmio {
88 	uint64_t		data;
89 	uint64_t		gpa;
90 	uint8_t			bytes;
91 	enum vie_req		state;
92 };
93 
94 struct vie_op {
95 	uint8_t		op_byte;	/* actual opcode byte */
96 	uint8_t		op_type;	/* type of operation (e.g. MOV) */
97 	uint16_t	op_flags;
98 };
99 
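/* An x86 instruction may be at most 15 bytes in length. */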
100 #define	VIE_INST_SIZE	15
101 struct vie {
102 	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
103 	uint8_t		num_valid;		/* size of the instruction */
104 	uint8_t		num_processed;
105 
106 	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
107 	uint8_t		rex_w:1,		/* REX prefix */
108 			rex_r:1,
109 			rex_x:1,
110 			rex_b:1,
111 			rex_present:1,
112 			repz_present:1,		/* REP/REPE/REPZ prefix */
113 			repnz_present:1,	/* REPNE/REPNZ prefix */
114 			opsize_override:1,	/* Operand size override */
115 			addrsize_override:1,	/* Address size override */
116 			segment_override:1;	/* Segment override */
117 
118 	uint8_t		mod:2,			/* ModRM byte */
119 			reg:4,
120 			rm:4;
121 
122 	uint8_t		ss:2,			/* SIB byte */
123 			vex_present:1,		/* VEX prefixed */
124 			vex_l:1,		/* L bit */
125 			index:4,		/* SIB byte */
126 			base:4;			/* SIB byte */
127 
128 	uint8_t		disp_bytes;
129 	uint8_t		imm_bytes;
130 
131 	uint8_t		scale;
132 
133 	uint8_t		vex_reg:4,	/* vvvv: first source reg specifier */
134 			vex_pp:2,	/* pp */
135 			_sparebits:2;
136 
137 	uint8_t		_sparebytes[2];
138 
139 	int		base_register;		/* VM_REG_GUEST_xyz */
140 	int		index_register;		/* VM_REG_GUEST_xyz */
141 	int		segment_register;	/* VM_REG_GUEST_xyz */
142 
143 	int64_t		displacement;		/* optional addr displacement */
144 	int64_t		immediate;		/* optional immediate operand */
145 
146 	struct vie_op	op;			/* opcode description */
147 
148 	enum vie_status	status;
149 
150 	struct vm_guest_paging paging;		/* guest paging state */
151 
152 	uint64_t	mmio_gpa;		/* faulting GPA */
153 	struct vie_mmio	mmio_req_read;
154 	struct vie_mmio	mmio_req_write;
155 
156 	struct vm_inout	inout;			/* active in/out op */
157 	enum vie_req	inout_req_state;
158 	uint32_t	inout_req_val;		/* value from userspace */
159 };
160 
161 
162 /* struct vie_op.op_type */
163 enum {
164 	VIE_OP_TYPE_NONE = 0,
165 	VIE_OP_TYPE_MOV,
166 	VIE_OP_TYPE_MOVSX,
167 	VIE_OP_TYPE_MOVZX,
168 	VIE_OP_TYPE_MOV_CR,
169 	VIE_OP_TYPE_AND,
170 	VIE_OP_TYPE_OR,
171 	VIE_OP_TYPE_SUB,
172 	VIE_OP_TYPE_TWO_BYTE,
173 	VIE_OP_TYPE_PUSH,
174 	VIE_OP_TYPE_CMP,
175 	VIE_OP_TYPE_POP,
176 	VIE_OP_TYPE_MOVS,
177 	VIE_OP_TYPE_GROUP1,
178 	VIE_OP_TYPE_STOS,
179 	VIE_OP_TYPE_BITTEST,
180 	VIE_OP_TYPE_TWOB_GRP15,
181 	VIE_OP_TYPE_ADD,
182 	VIE_OP_TYPE_TEST,
183 	VIE_OP_TYPE_BEXTR,
184 	VIE_OP_TYPE_CLTS,
185 	VIE_OP_TYPE_LAST
186 };
187 
188 /* struct vie_op.op_flags */
189 #define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
190 #define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
191 #define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
192 #define	VIE_OP_F_NO_MODRM	(1 << 3)
193 #define	VIE_OP_F_NO_GLA_VERIFICATION	(1 << 4)
194 #define	VIE_OP_F_REG_REG	(1 << 5)  /* special-case for mov-cr */
195 
196 static const struct vie_op three_byte_opcodes_0f38[256] = {
197 	[0xF7] = {
198 		.op_byte = 0xF7,
199 		.op_type = VIE_OP_TYPE_BEXTR,
200 	},
201 };
202 
203 static const struct vie_op two_byte_opcodes[256] = {
204 	[0x06] = {
205 		.op_byte = 0x06,
206 		.op_type = VIE_OP_TYPE_CLTS,
207 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
208 	},
209 	[0x20] = {
210 		.op_byte = 0x20,
211 		.op_type = VIE_OP_TYPE_MOV_CR,
212 		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
213 	},
214 	[0x22] = {
215 		.op_byte = 0x22,
216 		.op_type = VIE_OP_TYPE_MOV_CR,
217 		.op_flags = VIE_OP_F_REG_REG | VIE_OP_F_NO_GLA_VERIFICATION
218 	},
219 	[0xAE] = {
220 		.op_byte = 0xAE,
221 		.op_type = VIE_OP_TYPE_TWOB_GRP15,
222 	},
223 	[0xB6] = {
224 		.op_byte = 0xB6,
225 		.op_type = VIE_OP_TYPE_MOVZX,
226 	},
227 	[0xB7] = {
228 		.op_byte = 0xB7,
229 		.op_type = VIE_OP_TYPE_MOVZX,
230 	},
231 	[0xBA] = {
232 		.op_byte = 0xBA,
233 		.op_type = VIE_OP_TYPE_BITTEST,
234 		.op_flags = VIE_OP_F_IMM8,
235 	},
236 	[0xBE] = {
237 		.op_byte = 0xBE,
238 		.op_type = VIE_OP_TYPE_MOVSX,
239 	},
240 };
241 
242 static const struct vie_op one_byte_opcodes[256] = {
243 	[0x03] = {
244 		.op_byte = 0x03,
245 		.op_type = VIE_OP_TYPE_ADD,
246 	},
247 	[0x0F] = {
248 		.op_byte = 0x0F,
249 		.op_type = VIE_OP_TYPE_TWO_BYTE
250 	},
251 	[0x0B] = {
252 		.op_byte = 0x0B,
253 		.op_type = VIE_OP_TYPE_OR,
254 	},
255 	[0x2B] = {
256 		.op_byte = 0x2B,
257 		.op_type = VIE_OP_TYPE_SUB,
258 	},
259 	[0x39] = {
260 		.op_byte = 0x39,
261 		.op_type = VIE_OP_TYPE_CMP,
262 	},
263 	[0x3B] = {
264 		.op_byte = 0x3B,
265 		.op_type = VIE_OP_TYPE_CMP,
266 	},
267 	[0x88] = {
268 		.op_byte = 0x88,
269 		.op_type = VIE_OP_TYPE_MOV,
270 	},
271 	[0x89] = {
272 		.op_byte = 0x89,
273 		.op_type = VIE_OP_TYPE_MOV,
274 	},
275 	[0x8A] = {
276 		.op_byte = 0x8A,
277 		.op_type = VIE_OP_TYPE_MOV,
278 	},
279 	[0x8B] = {
280 		.op_byte = 0x8B,
281 		.op_type = VIE_OP_TYPE_MOV,
282 	},
283 	[0xA1] = {
284 		.op_byte = 0xA1,
285 		.op_type = VIE_OP_TYPE_MOV,
286 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
287 	},
288 	[0xA3] = {
289 		.op_byte = 0xA3,
290 		.op_type = VIE_OP_TYPE_MOV,
291 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
292 	},
293 	[0xA4] = {
294 		.op_byte = 0xA4,
295 		.op_type = VIE_OP_TYPE_MOVS,
296 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
297 	},
298 	[0xA5] = {
299 		.op_byte = 0xA5,
300 		.op_type = VIE_OP_TYPE_MOVS,
301 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
302 	},
303 	[0xAA] = {
304 		.op_byte = 0xAA,
305 		.op_type = VIE_OP_TYPE_STOS,
306 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
307 	},
308 	[0xAB] = {
309 		.op_byte = 0xAB,
310 		.op_type = VIE_OP_TYPE_STOS,
311 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
312 	},
313 	[0xC6] = {
314 		/* XXX Group 11 extended opcode - not just MOV */
315 		.op_byte = 0xC6,
316 		.op_type = VIE_OP_TYPE_MOV,
317 		.op_flags = VIE_OP_F_IMM8,
318 	},
319 	[0xC7] = {
320 		.op_byte = 0xC7,
321 		.op_type = VIE_OP_TYPE_MOV,
322 		.op_flags = VIE_OP_F_IMM,
323 	},
324 	[0x23] = {
325 		.op_byte = 0x23,
326 		.op_type = VIE_OP_TYPE_AND,
327 	},
328 	[0x80] = {
329 		/* Group 1 extended opcode */
330 		.op_byte = 0x80,
331 		.op_type = VIE_OP_TYPE_GROUP1,
332 		.op_flags = VIE_OP_F_IMM8,
333 	},
334 	[0x81] = {
335 		/* Group 1 extended opcode */
336 		.op_byte = 0x81,
337 		.op_type = VIE_OP_TYPE_GROUP1,
338 		.op_flags = VIE_OP_F_IMM,
339 	},
340 	[0x83] = {
341 		/* Group 1 extended opcode */
342 		.op_byte = 0x83,
343 		.op_type = VIE_OP_TYPE_GROUP1,
344 		.op_flags = VIE_OP_F_IMM8,
345 	},
346 	[0x8F] = {
347 		/* XXX Group 1A extended opcode - not just POP */
348 		.op_byte = 0x8F,
349 		.op_type = VIE_OP_TYPE_POP,
350 	},
351 	[0xF6] = {
352 		/* XXX Group 3 extended opcode - not just TEST */
353 		.op_byte = 0xF6,
354 		.op_type = VIE_OP_TYPE_TEST,
355 		.op_flags = VIE_OP_F_IMM8,
356 	},
357 	[0xF7] = {
358 		/* XXX Group 3 extended opcode - not just TEST */
359 		.op_byte = 0xF7,
360 		.op_type = VIE_OP_TYPE_TEST,
361 		.op_flags = VIE_OP_F_IMM,
362 	},
363 	[0xFF] = {
364 		/* XXX Group 5 extended opcode - not just PUSH */
365 		.op_byte = 0xFF,
366 		.op_type = VIE_OP_TYPE_PUSH,
367 	}
368 };
369 
370 /* struct vie.mod */
371 #define	VIE_MOD_INDIRECT		0
372 #define	VIE_MOD_INDIRECT_DISP8		1
373 #define	VIE_MOD_INDIRECT_DISP32		2
374 #define	VIE_MOD_DIRECT			3
375 
376 /* struct vie.rm */
377 #define	VIE_RM_SIB			4
378 #define	VIE_RM_DISP32			5
379 
380 #define	GB				(1024 * 1024 * 1024)
381 
382 
383 /*
384  * Paging defines, previously pulled in from machine/pmap.h
385  */
386 #define	PG_V	(1 << 0) /* Present */
387 #define	PG_RW	(1 << 1) /* Read/Write */
388 #define	PG_U	(1 << 2) /* User/Supervisor */
389 #define	PG_A	(1 << 5) /* Accessed */
390 #define	PG_M	(1 << 6) /* Dirty */
391 #define	PG_PS	(1 << 7) /* Largepage */
392 
393 /*
 * Paging exception defines, previously pulled in from machine/pmap.h
395  */
396 #define	PGEX_P		(1 << 0) /* Non-present/Protection */
397 #define	PGEX_W		(1 << 1) /* Read/Write */
398 #define	PGEX_U		(1 << 2) /* User/Supervisor */
399 #define	PGEX_RSV	(1 << 3) /* (Non-)Reserved */
400 #define	PGEX_I		(1 << 4) /* Instruction */
401 
402 
403 static enum vm_reg_name gpr_map[16] = {
404 	VM_REG_GUEST_RAX,
405 	VM_REG_GUEST_RCX,
406 	VM_REG_GUEST_RDX,
407 	VM_REG_GUEST_RBX,
408 	VM_REG_GUEST_RSP,
409 	VM_REG_GUEST_RBP,
410 	VM_REG_GUEST_RSI,
411 	VM_REG_GUEST_RDI,
412 	VM_REG_GUEST_R8,
413 	VM_REG_GUEST_R9,
414 	VM_REG_GUEST_R10,
415 	VM_REG_GUEST_R11,
416 	VM_REG_GUEST_R12,
417 	VM_REG_GUEST_R13,
418 	VM_REG_GUEST_R14,
419 	VM_REG_GUEST_R15
420 };
421 
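/*
 * Only %cr0 and %cr2-%cr4 are emulated here.  The remaining entries map to
 * VM_REG_LAST so that a MOV-CR access to an unsupported control register
 * fails its register lookup and results in a #UD being injected.
 */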
422 static enum vm_reg_name cr_map[16] = {
423 	VM_REG_GUEST_CR0,
424 	VM_REG_LAST,
425 	VM_REG_GUEST_CR2,
426 	VM_REG_GUEST_CR3,
427 	VM_REG_GUEST_CR4,
428 	VM_REG_LAST,
429 	VM_REG_LAST,
430 	VM_REG_LAST,
431 	VM_REG_LAST,
432 	VM_REG_LAST,
433 	VM_REG_LAST,
434 	VM_REG_LAST,
435 	VM_REG_LAST,
436 	VM_REG_LAST,
437 	VM_REG_LAST,
438 	VM_REG_LAST
439 };
440 
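/* Operand masks, indexed by operand size in bytes. */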
441 static uint64_t size2mask[] = {
442 	[1] = 0xff,
443 	[2] = 0xffff,
444 	[4] = 0xffffffff,
445 	[8] = 0xffffffffffffffff,
446 };
447 
448 
449 static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid,
450     uint64_t gpa, uint64_t *rval, int bytes);
451 static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid,
452     uint64_t gpa, uint64_t wval, int bytes);
453 static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
454     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
455     int prot, uint64_t *gla);
456 static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
457 static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf,
458     uint64_t gla);
459 static uint64_t vie_size2mask(int size);
460 
461 struct vie *
vie_alloc(void)
463 {
464 	return (kmem_zalloc(sizeof (struct vie), KM_SLEEP));
465 }
466 
467 void
468 vie_free(struct vie *vie)
469 {
470 	kmem_free(vie, sizeof (struct vie));
471 }
472 
473 enum vm_reg_name
474 vie_regnum_map(uint8_t regnum)
475 {
476 	VERIFY3U(regnum, <, 16);
477 	return (gpr_map[regnum]);
478 }
479 
480 static void
481 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
482 {
483 	*lhbr = 0;
484 	*reg = gpr_map[vie->reg];
485 
486 	/*
487 	 * 64-bit mode imposes limitations on accessing legacy high byte
488 	 * registers (lhbr).
489 	 *
490 	 * The legacy high-byte registers cannot be addressed if the REX
491 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
492 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
493 	 *
494 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
495 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
496 	 * %ah, %ch, %dh and %bh respectively.
497 	 */
498 	if (!vie->rex_present) {
499 		if (vie->reg & 0x4) {
500 			*lhbr = 1;
501 			*reg = gpr_map[vie->reg & 0x3];
502 		}
503 	}
504 }
505 
506 static int
507 vie_read_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t *rval)
508 {
509 	uint64_t val;
510 	int error, lhbr;
511 	enum vm_reg_name reg;
512 
513 	vie_calc_bytereg(vie, &reg, &lhbr);
514 	error = vm_get_register(vm, vcpuid, reg, &val);
515 
516 	/*
	 * To obtain the value of a legacy high byte register, shift the
518 	 * base register right by 8 bits (%ah = %rax >> 8).
519 	 */
520 	if (lhbr)
521 		*rval = val >> 8;
522 	else
523 		*rval = val;
524 	return (error);
525 }
526 
527 static int
528 vie_write_bytereg(struct vie *vie, struct vm *vm, int vcpuid, uint8_t byte)
529 {
530 	uint64_t origval, val, mask;
531 	int error, lhbr;
532 	enum vm_reg_name reg;
533 
534 	vie_calc_bytereg(vie, &reg, &lhbr);
535 	error = vm_get_register(vm, vcpuid, reg, &origval);
536 	if (error == 0) {
537 		val = byte;
538 		mask = 0xff;
539 		if (lhbr) {
540 			/*
541 			 * Shift left by 8 to store 'byte' in a legacy high
542 			 * byte register.
543 			 */
544 			val <<= 8;
545 			mask <<= 8;
546 		}
547 		val |= origval & ~mask;
548 		error = vm_set_register(vm, vcpuid, reg, val);
549 	}
550 	return (error);
551 }
552 
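/*
 * Write 'val' to 'reg' using x86 register-write conventions: 1- and 2-byte
 * writes leave the upper bytes of the destination untouched, while 4-byte
 * writes zero-extend into the full 64-bit register.
 */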
553 static int
554 vie_update_register(struct vm *vm, int vcpuid, enum vm_reg_name reg,
555     uint64_t val, int size)
556 {
557 	int error;
558 	uint64_t origval;
559 
560 	switch (size) {
561 	case 1:
562 	case 2:
563 		error = vm_get_register(vm, vcpuid, reg, &origval);
564 		if (error)
565 			return (error);
566 		val &= size2mask[size];
567 		val |= origval & ~size2mask[size];
568 		break;
569 	case 4:
570 		val &= 0xffffffffUL;
571 		break;
572 	case 8:
573 		break;
574 	default:
575 		return (EINVAL);
576 	}
577 
578 	error = vm_set_register(vm, vcpuid, reg, val);
579 	return (error);
580 }
581 
582 static int
583 vie_repeat(struct vie *vie)
584 {
585 	vie->status |= VIES_REPEAT;
586 
587 	/*
588 	 * Clear out any cached operation values so the repeated instruction can
589 	 * begin without using that stale state.  Other state, such as the
	 * decoding results, is kept around since it will not vary between
591 	 * iterations of a rep-prefixed instruction.
592 	 */
593 	if ((vie->status & VIES_MMIO) != 0) {
594 		vie->mmio_req_read.state = VR_NONE;
595 		vie->mmio_req_write.state = VR_NONE;
596 	} else if ((vie->status & VIES_INOUT) != 0) {
597 		vie->inout_req_state = VR_NONE;
598 	} else {
599 		panic("unexpected emulation state");
600 	}
601 
602 	return (EAGAIN);
603 }
604 
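/* Arithmetic status flags in %rflags: CF, PF, AF, ZF, SF, and OF. */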
605 #define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
606 
607 /*
608  * Return the status flags that would result from doing (x - y).
609  */
610 /* BEGIN CSTYLED */
611 #define	GETCC(sz)							\
612 static ulong_t								\
613 getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
614 {									\
615 	ulong_t rflags;							\
616 									\
617 	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
618 	    "=r" (rflags), "+r" (x) : "m" (y));				\
619 	return (rflags);						\
620 } struct __hack
621 /* END CSTYLED */
622 
623 GETCC(8);
624 GETCC(16);
625 GETCC(32);
626 GETCC(64);
627 
628 static ulong_t
629 getcc(int opsize, uint64_t x, uint64_t y)
630 {
631 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
632 	    ("getcc: invalid operand size %d", opsize));
633 
634 	if (opsize == 1)
635 		return (getcc8(x, y));
636 	else if (opsize == 2)
637 		return (getcc16(x, y));
638 	else if (opsize == 4)
639 		return (getcc32(x, y));
640 	else
641 		return (getcc64(x, y));
642 }
643 
/*
 * Macro creation of functions getaddflags{8,16,32,64}: return the status
 * flags that would result from doing (x + y).
 */
647 /* BEGIN CSTYLED */
648 #define	GETADDFLAGS(sz)							\
649 static ulong_t								\
650 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)				\
651 {									\
652 	ulong_t rflags;							\
653 									\
654 	__asm __volatile("add %2,%1; pushfq; popq %0" :			\
655 	    "=r" (rflags), "+r" (x) : "m" (y));				\
656 	return (rflags);						\
657 } struct __hack
658 /* END CSTYLED */
659 
660 GETADDFLAGS(8);
661 GETADDFLAGS(16);
662 GETADDFLAGS(32);
663 GETADDFLAGS(64);
664 
665 static ulong_t
666 getaddflags(int opsize, uint64_t x, uint64_t y)
667 {
668 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
669 	    ("getaddflags: invalid operand size %d", opsize));
670 
671 	if (opsize == 1)
672 		return (getaddflags8(x, y));
673 	else if (opsize == 2)
674 		return (getaddflags16(x, y));
675 	else if (opsize == 4)
676 		return (getaddflags32(x, y));
677 	else
678 		return (getaddflags64(x, y));
679 }
680 
681 /*
682  * Return the status flags that would result from doing (x & y).
683  */
684 /* BEGIN CSTYLED */
685 #define	GETANDFLAGS(sz)							\
686 static ulong_t								\
687 getandflags##sz(uint##sz##_t x, uint##sz##_t y)				\
688 {									\
689 	ulong_t rflags;							\
690 									\
691 	__asm __volatile("and %2,%1; pushfq; popq %0" :			\
692 	    "=r" (rflags), "+r" (x) : "m" (y));				\
693 	return (rflags);						\
694 } struct __hack
695 /* END CSTYLED */
696 
697 GETANDFLAGS(8);
698 GETANDFLAGS(16);
699 GETANDFLAGS(32);
700 GETANDFLAGS(64);
701 
702 static ulong_t
703 getandflags(int opsize, uint64_t x, uint64_t y)
704 {
705 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
706 	    ("getandflags: invalid operand size %d", opsize));
707 
708 	if (opsize == 1)
709 		return (getandflags8(x, y));
710 	else if (opsize == 2)
711 		return (getandflags16(x, y));
712 	else if (opsize == 4)
713 		return (getandflags32(x, y));
714 	else
715 		return (getandflags64(x, y));
716 }
717 
718 static int
719 vie_emulate_mov_cr(struct vie *vie, struct vm *vm, int vcpuid)
720 {
721 	uint64_t val;
722 	int err;
723 	enum vm_reg_name gpr = gpr_map[vie->rm];
724 	enum vm_reg_name cr = cr_map[vie->reg];
725 
726 	uint_t size = 4;
727 	if (vie->paging.cpu_mode == CPU_MODE_64BIT) {
728 		size = 8;
729 	}
730 
731 	switch (vie->op.op_byte) {
732 	case 0x20:
733 		/*
734 		 * MOV control register (ModRM:reg) to reg (ModRM:r/m)
735 		 * 20/r:	mov r32, CR0-CR7
736 		 * 20/r:	mov r64, CR0-CR7
737 		 * REX.R + 20/0:	mov r64, CR8
738 		 */
739 		if (vie->paging.cpl != 0) {
740 			vm_inject_gp(vm, vcpuid);
741 			vie->num_processed = 0;
742 			return (0);
743 		}
744 		err = vm_get_register(vm, vcpuid, cr, &val);
745 		if (err != 0) {
746 			/* #UD for access to non-existent CRs */
747 			vm_inject_ud(vm, vcpuid);
748 			vie->num_processed = 0;
749 			return (0);
750 		}
751 		err = vie_update_register(vm, vcpuid, gpr, val, size);
752 		break;
753 	case 0x22: {
754 		/*
755 		 * MOV reg (ModRM:r/m) to control register (ModRM:reg)
756 		 * 22/r:	mov CR0-CR7, r32
757 		 * 22/r:	mov CR0-CR7, r64
758 		 * REX.R + 22/0:	mov CR8, r64
759 		 */
760 		uint64_t old, diff;
761 
762 		if (vie->paging.cpl != 0) {
763 			vm_inject_gp(vm, vcpuid);
764 			vie->num_processed = 0;
765 			return (0);
766 		}
767 		err = vm_get_register(vm, vcpuid, cr, &old);
768 		if (err != 0) {
769 			/* #UD for access to non-existent CRs */
770 			vm_inject_ud(vm, vcpuid);
771 			vie->num_processed = 0;
772 			return (0);
773 		}
774 		err = vm_get_register(vm, vcpuid, gpr, &val);
775 		VERIFY0(err);
776 		val &= size2mask[size];
777 		diff = old ^ val;
778 
779 		switch (cr) {
780 		case VM_REG_GUEST_CR0:
781 			if ((diff & CR0_PG) != 0) {
782 				uint64_t efer;
783 
784 				err = vm_get_register(vm, vcpuid,
785 				    VM_REG_GUEST_EFER, &efer);
786 				VERIFY0(err);
787 
788 				/* Keep the long-mode state in EFER in sync */
789 				if ((val & CR0_PG) != 0 &&
790 				    (efer & EFER_LME) != 0) {
791 					efer |= EFER_LMA;
792 				}
793 				if ((val & CR0_PG) == 0 &&
794 				    (efer & EFER_LME) != 0) {
795 					efer &= ~EFER_LMA;
796 				}
797 
798 				err = vm_set_register(vm, vcpuid,
799 				    VM_REG_GUEST_EFER, efer);
800 				VERIFY0(err);
801 			}
802 			/* TODO: enforce more of the #GP checks */
803 			err = vm_set_register(vm, vcpuid, cr, val);
804 			VERIFY0(err);
805 			break;
806 		case VM_REG_GUEST_CR2:
807 		case VM_REG_GUEST_CR3:
808 		case VM_REG_GUEST_CR4:
809 			/* TODO: enforce more of the #GP checks */
810 			err = vm_set_register(vm, vcpuid, cr, val);
811 			break;
812 		default:
813 			/* The cr_map mapping should prevent this */
814 			panic("invalid cr %d", cr);
815 		}
816 		break;
817 	}
818 	default:
819 		return (EINVAL);
820 	}
821 	return (err);
822 }
823 
824 static int
825 vie_emulate_mov(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
826 {
827 	int error, size;
828 	enum vm_reg_name reg;
829 	uint8_t byte;
830 	uint64_t val;
831 
832 	size = vie->opsize;
833 	error = EINVAL;
834 
835 	switch (vie->op.op_byte) {
836 	case 0x88:
837 		/*
838 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
839 		 * 88/r:	mov r/m8, r8
840 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
841 		 */
842 		size = 1;	/* override for byte operation */
843 		error = vie_read_bytereg(vie, vm, vcpuid, &byte);
844 		if (error == 0) {
845 			error = vie_mmio_write(vie, vm, vcpuid, gpa, byte,
846 			    size);
847 		}
848 		break;
849 	case 0x89:
850 		/*
851 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
852 		 * 89/r:	mov r/m16, r16
853 		 * 89/r:	mov r/m32, r32
854 		 * REX.W + 89/r	mov r/m64, r64
855 		 */
856 		reg = gpr_map[vie->reg];
857 		error = vm_get_register(vm, vcpuid, reg, &val);
858 		if (error == 0) {
859 			val &= size2mask[size];
860 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
861 		}
862 		break;
863 	case 0x8A:
864 		/*
865 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
866 		 * 8A/r:	mov r8, r/m8
867 		 * REX + 8A/r:	mov r8, r/m8
868 		 */
869 		size = 1;	/* override for byte operation */
870 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
871 		if (error == 0)
872 			error = vie_write_bytereg(vie, vm, vcpuid, val);
873 		break;
874 	case 0x8B:
875 		/*
876 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
877 		 * 8B/r:	mov r16, r/m16
878 		 * 8B/r:	mov r32, r/m32
879 		 * REX.W 8B/r:	mov r64, r/m64
880 		 */
881 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
882 		if (error == 0) {
883 			reg = gpr_map[vie->reg];
884 			error = vie_update_register(vm, vcpuid, reg, val, size);
885 		}
886 		break;
887 	case 0xA1:
888 		/*
889 		 * MOV from seg:moffset to AX/EAX/RAX
890 		 * A1:		mov AX, moffs16
891 		 * A1:		mov EAX, moffs32
892 		 * REX.W + A1:	mov RAX, moffs64
893 		 */
894 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
895 		if (error == 0) {
896 			reg = VM_REG_GUEST_RAX;
897 			error = vie_update_register(vm, vcpuid, reg, val, size);
898 		}
899 		break;
900 	case 0xA3:
901 		/*
902 		 * MOV from AX/EAX/RAX to seg:moffset
903 		 * A3:		mov moffs16, AX
904 		 * A3:		mov moffs32, EAX
905 		 * REX.W + A3:	mov moffs64, RAX
906 		 */
907 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
908 		if (error == 0) {
909 			val &= size2mask[size];
910 			error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
911 		}
912 		break;
913 	case 0xC6:
914 		/*
915 		 * MOV from imm8 to mem (ModRM:r/m)
916 		 * C6/0		mov r/m8, imm8
917 		 * REX + C6/0	mov r/m8, imm8
918 		 */
919 		size = 1;	/* override for byte operation */
920 		val = vie->immediate;
921 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
922 		break;
923 	case 0xC7:
924 		/*
925 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
926 		 * C7/0		mov r/m16, imm16
927 		 * C7/0		mov r/m32, imm32
928 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
929 		 */
930 		val = vie->immediate & size2mask[size];
931 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
932 		break;
933 	default:
934 		break;
935 	}
936 
937 	return (error);
938 }
939 
940 static int
941 vie_emulate_movx(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
942 {
943 	int error, size;
944 	enum vm_reg_name reg;
945 	uint64_t val;
946 
947 	size = vie->opsize;
948 	error = EINVAL;
949 
950 	switch (vie->op.op_byte) {
951 	case 0xB6:
952 		/*
953 		 * MOV and zero extend byte from mem (ModRM:r/m) to
954 		 * reg (ModRM:reg).
955 		 *
956 		 * 0F B6/r		movzx r16, r/m8
957 		 * 0F B6/r		movzx r32, r/m8
958 		 * REX.W + 0F B6/r	movzx r64, r/m8
959 		 */
960 
961 		/* get the first operand */
962 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
963 		if (error)
964 			break;
965 
966 		/* get the second operand */
967 		reg = gpr_map[vie->reg];
968 
969 		/* zero-extend byte */
970 		val = (uint8_t)val;
971 
972 		/* write the result */
973 		error = vie_update_register(vm, vcpuid, reg, val, size);
974 		break;
975 	case 0xB7:
976 		/*
977 		 * MOV and zero extend word from mem (ModRM:r/m) to
978 		 * reg (ModRM:reg).
979 		 *
980 		 * 0F B7/r		movzx r32, r/m16
981 		 * REX.W + 0F B7/r	movzx r64, r/m16
982 		 */
983 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2);
984 		if (error)
985 			return (error);
986 
987 		reg = gpr_map[vie->reg];
988 
989 		/* zero-extend word */
990 		val = (uint16_t)val;
991 
992 		error = vie_update_register(vm, vcpuid, reg, val, size);
993 		break;
994 	case 0xBE:
995 		/*
996 		 * MOV and sign extend byte from mem (ModRM:r/m) to
997 		 * reg (ModRM:reg).
998 		 *
999 		 * 0F BE/r		movsx r16, r/m8
1000 		 * 0F BE/r		movsx r32, r/m8
1001 		 * REX.W + 0F BE/r	movsx r64, r/m8
1002 		 */
1003 
1004 		/* get the first operand */
1005 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1);
1006 		if (error)
1007 			break;
1008 
1009 		/* get the second operand */
1010 		reg = gpr_map[vie->reg];
1011 
1012 		/* sign extend byte */
1013 		val = (int8_t)val;
1014 
1015 		/* write the result */
1016 		error = vie_update_register(vm, vcpuid, reg, val, size);
1017 		break;
1018 	default:
1019 		break;
1020 	}
1021 	return (error);
1022 }
1023 
1024 /*
1025  * Helper function to calculate and validate a linear address.
1026  */
1027 static int
1028 vie_get_gla(struct vie *vie, struct vm *vm, int vcpuid, int opsize,
1029     int addrsize, int prot, enum vm_reg_name seg, enum vm_reg_name gpr,
1030     uint64_t *gla)
1031 {
1032 	struct seg_desc desc;
1033 	uint64_t cr0, val, rflags;
1034 	int error;
1035 	struct vm_guest_paging *paging;
1036 
1037 	paging = &vie->paging;
1038 
1039 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1040 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1041 
1042 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1043 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1044 
1045 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
1046 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
1047 	    __func__, error, seg));
1048 
1049 	error = vm_get_register(vm, vcpuid, gpr, &val);
1050 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
1051 	    error, gpr));
1052 
1053 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
1054 	    addrsize, prot, gla)) {
1055 		if (seg == VM_REG_GUEST_SS)
1056 			vm_inject_ss(vm, vcpuid, 0);
1057 		else
1058 			vm_inject_gp(vm, vcpuid);
1059 		return (-1);
1060 	}
1061 
1062 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
1063 		if (seg == VM_REG_GUEST_SS)
1064 			vm_inject_ss(vm, vcpuid, 0);
1065 		else
1066 			vm_inject_gp(vm, vcpuid);
1067 		return (-1);
1068 	}
1069 
1070 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
1071 		vm_inject_ac(vm, vcpuid, 0);
1072 		return (-1);
1073 	}
1074 
1075 	return (0);
1076 }
1077 
1078 static int
1079 vie_emulate_movs(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1080 {
1081 	struct vm_copyinfo copyinfo[2];
1082 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
1083 	uint64_t rcx, rdi, rsi, rflags;
1084 	int error, fault, opsize, seg, repeat;
1085 	struct vm_guest_paging *paging;
1086 
1087 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
1088 	val = 0;
1089 	error = 0;
1090 	paging = &vie->paging;
1091 
1092 	/*
1093 	 * XXX although the MOVS instruction is only supposed to be used with
	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
1095 	 *
1096 	 * Empirically the "repnz" prefix has identical behavior to "rep"
1097 	 * and the zero flag does not make a difference.
1098 	 */
1099 	repeat = vie->repz_present | vie->repnz_present;
1100 
1101 	if (repeat) {
1102 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1103 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1104 
1105 		/*
1106 		 * The count register is %rcx, %ecx or %cx depending on the
1107 		 * address size of the instruction.
1108 		 */
1109 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
1110 			error = 0;
1111 			goto done;
1112 		}
1113 	}
1114 
1115 	/*
1116 	 *	Source		Destination	Comments
1117 	 *	--------------------------------------------
1118 	 * (1)  memory		memory		n/a
1119 	 * (2)  memory		mmio		emulated
1120 	 * (3)  mmio		memory		emulated
1121 	 * (4)  mmio		mmio		emulated
1122 	 *
1123 	 * At this point we don't have sufficient information to distinguish
1124 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
1125 	 * out because it will succeed only when operating on regular memory.
1126 	 *
1127 	 * XXX the emulation doesn't properly handle the case where 'gpa'
1128 	 * is straddling the boundary between the normal memory and MMIO.
1129 	 */
1130 
1131 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
1132 	if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg,
1133 	    VM_REG_GUEST_RSI, &srcaddr) != 0) {
1134 		goto done;
1135 	}
1136 
1137 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
1138 	    copyinfo, nitems(copyinfo), &fault);
1139 	if (error == 0) {
1140 		if (fault)
1141 			goto done;	/* Resume guest to handle fault */
1142 
1143 		/*
1144 		 * case (2): read from system memory and write to mmio.
1145 		 */
1146 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
1147 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1148 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1149 		if (error)
1150 			goto done;
1151 	} else {
1152 		/*
1153 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
1154 		 * if 'srcaddr' is in the mmio space.
1155 		 */
1156 
1157 		if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize,
1158 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI,
1159 		    &dstaddr) != 0) {
1160 			goto done;
1161 		}
1162 
1163 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
1164 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
1165 		if (error == 0) {
1166 			if (fault)
1167 				goto done;    /* Resume guest to handle fault */
1168 
1169 			/*
1170 			 * case (3): read from MMIO and write to system memory.
1171 			 *
1172 			 * A MMIO read can have side-effects so we
1173 			 * commit to it only after vm_copy_setup() is
1174 			 * successful. If a page-fault needs to be
1175 			 * injected into the guest then it will happen
1176 			 * before the MMIO read is attempted.
1177 			 */
1178 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &val,
1179 			    opsize);
1180 
1181 			if (error == 0) {
1182 				vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
1183 			}
1184 			/*
1185 			 * Regardless of whether the MMIO read was successful or
1186 			 * not, the copy resources must be cleaned up.
1187 			 */
1188 			vm_copy_teardown(vm, vcpuid, copyinfo,
1189 			    nitems(copyinfo));
1190 			if (error != 0) {
1191 				goto done;
1192 			}
1193 		} else {
1194 			/*
1195 			 * Case (4): read from and write to mmio.
1196 			 *
1197 			 * Commit to the MMIO read/write (with potential
1198 			 * side-effects) only after we are sure that the
1199 			 * instruction is not going to be restarted due
1200 			 * to address translation faults.
1201 			 */
1202 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
1203 			    PROT_READ, &srcgpa, &fault);
1204 			if (error || fault)
1205 				goto done;
1206 
1207 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
1208 			    PROT_WRITE, &dstgpa, &fault);
1209 			if (error || fault)
1210 				goto done;
1211 
1212 			error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val,
1213 			    opsize);
1214 			if (error)
1215 				goto done;
1216 
1217 			error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val,
1218 			    opsize);
1219 			if (error)
1220 				goto done;
1221 		}
1222 	}
1223 
1224 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
1225 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
1226 
1227 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1228 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1229 
1230 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1231 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1232 
1233 	if (rflags & PSL_D) {
1234 		rsi -= opsize;
1235 		rdi -= opsize;
1236 	} else {
1237 		rsi += opsize;
1238 		rdi += opsize;
1239 	}
1240 
1241 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
1242 	    vie->addrsize);
1243 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
1244 
1245 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1246 	    vie->addrsize);
1247 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1248 
1249 	if (repeat) {
1250 		rcx = rcx - 1;
1251 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1252 		    rcx, vie->addrsize);
1253 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1254 
1255 		/*
1256 		 * Repeat the instruction if the count register is not zero.
1257 		 */
1258 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1259 			return (vie_repeat(vie));
1260 	}
1261 done:
1262 	return (error);
1263 }
1264 
1265 static int
1266 vie_emulate_stos(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1267 {
1268 	int error, opsize, repeat;
1269 	uint64_t val;
1270 	uint64_t rcx, rdi, rflags;
1271 
1272 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
1273 	repeat = vie->repz_present | vie->repnz_present;
1274 
1275 	if (repeat) {
1276 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
1277 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
1278 
1279 		/*
1280 		 * The count register is %rcx, %ecx or %cx depending on the
1281 		 * address size of the instruction.
1282 		 */
1283 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
1284 			return (0);
1285 	}
1286 
1287 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
1288 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
1289 
1290 	error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize);
1291 	if (error)
1292 		return (error);
1293 
1294 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
1295 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
1296 
1297 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1298 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1299 
1300 	if (rflags & PSL_D)
1301 		rdi -= opsize;
1302 	else
1303 		rdi += opsize;
1304 
1305 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
1306 	    vie->addrsize);
1307 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
1308 
1309 	if (repeat) {
1310 		rcx = rcx - 1;
1311 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
1312 		    rcx, vie->addrsize);
1313 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
1314 
1315 		/*
1316 		 * Repeat the instruction if the count register is not zero.
1317 		 */
1318 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
1319 			return (vie_repeat(vie));
1320 	}
1321 
1322 	return (0);
1323 }
1324 
1325 static int
1326 vie_emulate_and(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1327 {
1328 	int error, size;
1329 	enum vm_reg_name reg;
1330 	uint64_t result, rflags, rflags2, val1, val2;
1331 
1332 	size = vie->opsize;
1333 	error = EINVAL;
1334 
1335 	switch (vie->op.op_byte) {
1336 	case 0x23:
1337 		/*
1338 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
1339 		 * result in reg.
1340 		 *
1341 		 * 23/r		and r16, r/m16
1342 		 * 23/r		and r32, r/m32
1343 		 * REX.W + 23/r	and r64, r/m64
1344 		 */
1345 
1346 		/* get the first operand */
1347 		reg = gpr_map[vie->reg];
1348 		error = vm_get_register(vm, vcpuid, reg, &val1);
1349 		if (error)
1350 			break;
1351 
1352 		/* get the second operand */
1353 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1354 		if (error)
1355 			break;
1356 
1357 		/* perform the operation and write the result */
1358 		result = val1 & val2;
1359 		error = vie_update_register(vm, vcpuid, reg, result, size);
1360 		break;
1361 	case 0x81:
1362 	case 0x83:
1363 		/*
1364 		 * AND mem (ModRM:r/m) with immediate and store the
1365 		 * result in mem.
1366 		 *
1367 		 * 81 /4		and r/m16, imm16
1368 		 * 81 /4		and r/m32, imm32
1369 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
1370 		 *
1371 		 * 83 /4		and r/m16, imm8 sign-extended to 16
1372 		 * 83 /4		and r/m32, imm8 sign-extended to 32
1373 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
1374 		 */
1375 
1376 		/* get the first operand */
1377 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1378 		if (error)
1379 			break;
1380 
1381 		/*
1382 		 * perform the operation with the pre-fetched immediate
1383 		 * operand and write the result
1384 		 */
1385 		result = val1 & vie->immediate;
1386 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1387 		break;
1388 	default:
1389 		break;
1390 	}
1391 	if (error)
1392 		return (error);
1393 
1394 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1395 	if (error)
1396 		return (error);
1397 
1398 	/*
1399 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1400 	 * to the result; AF is undefined.
1401 	 *
1402 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1403 	 */
1404 	rflags2 = getcc(size, result, 0);
1405 	rflags &= ~RFLAGS_STATUS_BITS;
1406 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1407 
1408 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1409 	return (error);
1410 }
1411 
1412 static int
1413 vie_emulate_or(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1414 {
1415 	int error, size;
1416 	enum vm_reg_name reg;
1417 	uint64_t result, rflags, rflags2, val1, val2;
1418 
1419 	size = vie->opsize;
1420 	error = EINVAL;
1421 
1422 	switch (vie->op.op_byte) {
1423 	case 0x0B:
1424 		/*
1425 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1426 		 * result in reg.
1427 		 *
1428 		 * 0b/r		or r16, r/m16
1429 		 * 0b/r		or r32, r/m32
1430 		 * REX.W + 0b/r	or r64, r/m64
1431 		 */
1432 
1433 		/* get the first operand */
1434 		reg = gpr_map[vie->reg];
1435 		error = vm_get_register(vm, vcpuid, reg, &val1);
1436 		if (error)
1437 			break;
1438 
1439 		/* get the second operand */
1440 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1441 		if (error)
1442 			break;
1443 
1444 		/* perform the operation and write the result */
1445 		result = val1 | val2;
1446 		error = vie_update_register(vm, vcpuid, reg, result, size);
1447 		break;
1448 	case 0x81:
1449 	case 0x83:
1450 		/*
1451 		 * OR mem (ModRM:r/m) with immediate and store the
1452 		 * result in mem.
1453 		 *
1454 		 * 81 /1		or r/m16, imm16
1455 		 * 81 /1		or r/m32, imm32
1456 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1457 		 *
1458 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1459 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1460 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1461 		 */
1462 
1463 		/* get the first operand */
1464 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size);
1465 		if (error)
1466 			break;
1467 
1468 		/*
1469 		 * perform the operation with the pre-fetched immediate
1470 		 * operand and write the result
1471 		 */
1472 		result = val1 | vie->immediate;
1473 		error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size);
1474 		break;
1475 	default:
1476 		break;
1477 	}
1478 	if (error)
1479 		return (error);
1480 
1481 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1482 	if (error)
1483 		return (error);
1484 
1485 	/*
1486 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1487 	 * to the result; AF is undefined.
1488 	 *
1489 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1490 	 */
1491 	rflags2 = getcc(size, result, 0);
1492 	rflags &= ~RFLAGS_STATUS_BITS;
1493 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1494 
1495 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1496 	return (error);
1497 }
1498 
1499 static int
1500 vie_emulate_cmp(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1501 {
1502 	int error, size;
1503 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1504 	enum vm_reg_name reg;
1505 
1506 	size = vie->opsize;
1507 	switch (vie->op.op_byte) {
1508 	case 0x39:
1509 	case 0x3B:
1510 		/*
1511 		 * 39/r		CMP r/m16, r16
1512 		 * 39/r		CMP r/m32, r32
1513 		 * REX.W 39/r	CMP r/m64, r64
1514 		 *
1515 		 * 3B/r		CMP r16, r/m16
1516 		 * 3B/r		CMP r32, r/m32
1517 		 * REX.W + 3B/r	CMP r64, r/m64
1518 		 *
1519 		 * Compare the first operand with the second operand and
1520 		 * set status flags in EFLAGS register. The comparison is
1521 		 * performed by subtracting the second operand from the first
1522 		 * operand and then setting the status flags.
1523 		 */
1524 
1525 		/* Get the register operand */
1526 		reg = gpr_map[vie->reg];
1527 		error = vm_get_register(vm, vcpuid, reg, &regop);
1528 		if (error)
1529 			return (error);
1530 
1531 		/* Get the memory operand */
1532 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size);
1533 		if (error)
1534 			return (error);
1535 
1536 		if (vie->op.op_byte == 0x3B) {
1537 			op1 = regop;
1538 			op2 = memop;
1539 		} else {
1540 			op1 = memop;
1541 			op2 = regop;
1542 		}
1543 		rflags2 = getcc(size, op1, op2);
1544 		break;
1545 	case 0x80:
1546 	case 0x81:
1547 	case 0x83:
1548 		/*
1549 		 * 80 /7		cmp r/m8, imm8
1550 		 * REX + 80 /7		cmp r/m8, imm8
1551 		 *
1552 		 * 81 /7		cmp r/m16, imm16
1553 		 * 81 /7		cmp r/m32, imm32
1554 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1555 		 *
1556 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1557 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1558 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1559 		 *
1560 		 * Compare mem (ModRM:r/m) with immediate and set
1561 		 * status flags according to the results.  The
1562 		 * comparison is performed by subtracting the
1563 		 * immediate from the first operand and then setting
1564 		 * the status flags.
1565 		 *
1566 		 */
1567 		if (vie->op.op_byte == 0x80)
1568 			size = 1;
1569 
1570 		/* get the first operand */
1571 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1572 		if (error)
1573 			return (error);
1574 
1575 		rflags2 = getcc(size, op1, vie->immediate);
1576 		break;
1577 	default:
1578 		return (EINVAL);
1579 	}
1580 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1581 	if (error)
1582 		return (error);
1583 	rflags &= ~RFLAGS_STATUS_BITS;
1584 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1585 
1586 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1587 	return (error);
1588 }
1589 
1590 static int
1591 vie_emulate_test(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1592 {
1593 	int error, size;
1594 	uint64_t op1, rflags, rflags2;
1595 
1596 	size = vie->opsize;
1597 	error = EINVAL;
1598 
1599 	switch (vie->op.op_byte) {
1600 	case 0xF6:
1601 		/*
1602 		 * F6 /0		test r/m8, imm8
1603 		 *
1604 		 * Test mem (ModRM:r/m) with immediate and set status
1605 		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
1608 		 */
1609 		if ((vie->reg & 7) != 0)
1610 			return (EINVAL);
1611 
1612 		size = 1;	/* override for byte operation */
1613 
1614 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1615 		if (error)
1616 			return (error);
1617 
1618 		rflags2 = getandflags(size, op1, vie->immediate);
1619 		break;
1620 	case 0xF7:
1621 		/*
1622 		 * F7 /0		test r/m16, imm16
1623 		 * F7 /0		test r/m32, imm32
1624 		 * REX.W + F7 /0	test r/m64, imm32 sign-extended to 64
1625 		 *
1626 		 * Test mem (ModRM:r/m) with immediate and set status
1627 		 * flags according to the results.  The comparison is
		 * performed by ANDing the immediate with the first
		 * operand and then setting the status flags.
1630 		 */
1631 		if ((vie->reg & 7) != 0)
1632 			return (EINVAL);
1633 
1634 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size);
1635 		if (error)
1636 			return (error);
1637 
1638 		rflags2 = getandflags(size, op1, vie->immediate);
1639 		break;
1640 	default:
1641 		return (EINVAL);
1642 	}
1643 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1644 	if (error)
1645 		return (error);
1646 
1647 	/*
1648 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1649 	 * to the result; AF is undefined.
1650 	 */
1651 	rflags &= ~RFLAGS_STATUS_BITS;
1652 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1653 
1654 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1655 	return (error);
1656 }
1657 
1658 static int
1659 vie_emulate_bextr(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1660 {
1661 	uint64_t src1, src2, dst, rflags;
1662 	unsigned start, len;
1663 	int error, size;
1664 	struct vm_guest_paging *paging;
1665 
1666 	size = vie->opsize;
1667 	error = EINVAL;
1668 	paging = &vie->paging;
1669 
1670 	/*
1671 	 * VEX.LZ.0F38.W0 F7 /r		BEXTR r32a, r/m32, r32b
1672 	 * VEX.LZ.0F38.W1 F7 /r		BEXTR r64a, r/m64, r64b
1673 	 *
1674 	 * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
1675 	 * Vex.vvvv.
1676 	 *
1677 	 * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
1678 	 */
1679 	if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
1680 		size = 4;
1681 
1682 	/*
1683 	 * Extracts contiguous bits from the first /source/ operand (second
1684 	 * operand) using an index and length specified in the second /source/
1685 	 * operand (third operand).
1686 	 */
1687 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size);
1688 	if (error)
1689 		return (error);
1690 	error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2);
1691 	if (error)
1692 		return (error);
1693 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1694 	if (error)
1695 		return (error);
1696 
1697 	start = (src2 & 0xff);
1698 	len = (src2 & 0xff00) >> 8;
1699 
1700 	/* If no bits are extracted, the destination register is cleared. */
1701 	dst = 0;
1702 
1703 	/* If START exceeds the operand size, no bits are extracted. */
1704 	if (start > size * 8)
1705 		goto done;
1706 	/* Length is bounded by both the destination size and start offset. */
1707 	if (start + len > size * 8)
1708 		len = (size * 8) - start;
1709 	if (len == 0)
1710 		goto done;
1711 
1712 	if (start > 0)
1713 		src1 = (src1 >> start);
1714 	if (len < 64)
1715 		src1 = src1 & ((1ull << len) - 1);
1716 	dst = src1;
1717 
1718 done:
1719 	error = vie_update_register(vm, vcpuid, gpr_map[vie->reg], dst, size);
1720 	if (error)
1721 		return (error);
1722 
1723 	/*
1724 	 * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
1725 	 * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
1726 	 */
1727 	rflags &= ~RFLAGS_STATUS_BITS;
1728 	if (dst == 0)
1729 		rflags |= PSL_Z;
1730 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags,
1731 	    8);
1732 	return (error);
1733 }
1734 
1735 static int
1736 vie_emulate_add(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1737 {
1738 	int error, size;
1739 	uint64_t nval, rflags, rflags2, val1, val2;
1740 	enum vm_reg_name reg;
1741 
1742 	size = vie->opsize;
1743 	error = EINVAL;
1744 
1745 	switch (vie->op.op_byte) {
1746 	case 0x03:
1747 		/*
1748 		 * ADD r/m to r and store the result in r
1749 		 *
1750 		 * 03/r			ADD r16, r/m16
1751 		 * 03/r			ADD r32, r/m32
1752 		 * REX.W + 03/r		ADD r64, r/m64
1753 		 */
1754 
1755 		/* get the first operand */
1756 		reg = gpr_map[vie->reg];
1757 		error = vm_get_register(vm, vcpuid, reg, &val1);
1758 		if (error)
1759 			break;
1760 
1761 		/* get the second operand */
1762 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1763 		if (error)
1764 			break;
1765 
1766 		/* perform the operation and write the result */
1767 		nval = val1 + val2;
1768 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1769 		break;
1770 	default:
1771 		break;
1772 	}
1773 
1774 	if (!error) {
1775 		rflags2 = getaddflags(size, val1, val2);
1776 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1777 		    &rflags);
1778 		if (error)
1779 			return (error);
1780 
1781 		rflags &= ~RFLAGS_STATUS_BITS;
1782 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1783 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1784 		    rflags, 8);
1785 	}
1786 
1787 	return (error);
1788 }
1789 
1790 static int
1791 vie_emulate_sub(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1792 {
1793 	int error, size;
1794 	uint64_t nval, rflags, rflags2, val1, val2;
1795 	enum vm_reg_name reg;
1796 
1797 	size = vie->opsize;
1798 	error = EINVAL;
1799 
1800 	switch (vie->op.op_byte) {
1801 	case 0x2B:
1802 		/*
1803 		 * SUB r/m from r and store the result in r
1804 		 *
1805 		 * 2B/r		SUB r16, r/m16
1806 		 * 2B/r		SUB r32, r/m32
1807 		 * REX.W + 2B/r	SUB r64, r/m64
1808 		 */
1809 
1810 		/* get the first operand */
1811 		reg = gpr_map[vie->reg];
1812 		error = vm_get_register(vm, vcpuid, reg, &val1);
1813 		if (error)
1814 			break;
1815 
1816 		/* get the second operand */
1817 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size);
1818 		if (error)
1819 			break;
1820 
1821 		/* perform the operation and write the result */
1822 		nval = val1 - val2;
1823 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1824 		break;
1825 	default:
1826 		break;
1827 	}
1828 
1829 	if (!error) {
1830 		rflags2 = getcc(size, val1, val2);
1831 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1832 		    &rflags);
1833 		if (error)
1834 			return (error);
1835 
1836 		rflags &= ~RFLAGS_STATUS_BITS;
1837 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1838 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1839 		    rflags, 8);
1840 	}
1841 
1842 	return (error);
1843 }
1844 
1845 static int
1846 vie_emulate_stack_op(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1847 {
1848 	struct vm_copyinfo copyinfo[2];
1849 	struct seg_desc ss_desc;
1850 	uint64_t cr0, rflags, rsp, stack_gla, val;
1851 	int error, fault, size, stackaddrsize, pushop;
1852 	struct vm_guest_paging *paging;
1853 
1854 	val = 0;
1855 	size = vie->opsize;
1856 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1857 	paging = &vie->paging;
1858 
1859 	/*
	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1861 	 */
1862 	if (paging->cpu_mode == CPU_MODE_REAL) {
1863 		stackaddrsize = 2;
1864 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1865 		/*
1866 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1867 		 * - Stack pointer size is always 64-bits.
1868 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1869 		 * - 16-bit PUSH/POP is supported by using the operand size
1870 		 *   override prefix (66H).
1871 		 */
1872 		stackaddrsize = 8;
1873 		size = vie->opsize_override ? 2 : 8;
1874 	} else {
1875 		/*
1876 		 * In protected or compatibility mode the 'B' flag in the
1877 		 * stack-segment descriptor determines the size of the
1878 		 * stack pointer.
1879 		 */
1880 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1881 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1882 		    __func__, error));
1883 		if (SEG_DESC_DEF32(ss_desc.access))
1884 			stackaddrsize = 4;
1885 		else
1886 			stackaddrsize = 2;
1887 	}
1888 
1889 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1890 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1891 
1892 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1893 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1894 
1895 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1896 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1897 	if (pushop) {
1898 		rsp -= size;
1899 	}
1900 
1901 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1902 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1903 	    &stack_gla)) {
1904 		vm_inject_ss(vm, vcpuid, 0);
1905 		return (0);
1906 	}
1907 
1908 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1909 		vm_inject_ss(vm, vcpuid, 0);
1910 		return (0);
1911 	}
1912 
1913 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1914 		vm_inject_ac(vm, vcpuid, 0);
1915 		return (0);
1916 	}
1917 
1918 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1919 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1920 	    &fault);
1921 	if (error || fault)
1922 		return (error);
1923 
1924 	if (pushop) {
1925 		error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size);
1926 		if (error == 0)
1927 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1928 	} else {
1929 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1930 		error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size);
1931 		rsp += size;
1932 	}
1933 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1934 
1935 	if (error == 0) {
1936 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1937 		    stackaddrsize);
1938 		KASSERT(error == 0, ("error %d updating rsp", error));
1939 	}
1940 	return (error);
1941 }
1942 
1943 static int
1944 vie_emulate_push(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1945 {
1946 	int error;
1947 
1948 	/*
1949 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1950 	 *
1951 	 * PUSH is part of the group 5 extended opcodes and is identified
1952 	 * by ModRM:reg = b110.
1953 	 */
1954 	if ((vie->reg & 7) != 6)
1955 		return (EINVAL);
1956 
1957 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
1958 	return (error);
1959 }
1960 
1961 static int
1962 vie_emulate_pop(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1963 {
1964 	int error;
1965 
1966 	/*
1967 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1968 	 *
1969 	 * POP is part of the group 1A extended opcodes and is identified
1970 	 * by ModRM:reg = b000.
1971 	 */
1972 	if ((vie->reg & 7) != 0)
1973 		return (EINVAL);
1974 
1975 	error = vie_emulate_stack_op(vie, vm, vcpuid, gpa);
1976 	return (error);
1977 }
1978 
1979 static int
1980 vie_emulate_group1(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
1981 {
1982 	int error;
1983 
1984 	switch (vie->reg & 7) {
1985 	case 0x1:	/* OR */
1986 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
1987 		break;
1988 	case 0x4:	/* AND */
1989 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
1990 		break;
1991 	case 0x7:	/* CMP */
1992 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
1993 		break;
1994 	default:
1995 		error = EINVAL;
1996 		break;
1997 	}
1998 
1999 	return (error);
2000 }
2001 
2002 static int
2003 vie_emulate_bittest(struct vie *vie, struct vm *vm, int vcpuid, uint64_t gpa)
2004 {
2005 	uint64_t val, rflags;
2006 	int error, bitmask, bitoff;
2007 
2008 	/*
2009 	 * 0F BA is a Group 8 extended opcode.
2010 	 *
2011 	 * Currently we only emulate the 'Bit Test' instruction which is
2012 	 * identified by a ModR/M:reg encoding of 100b.
2013 	 */
2014 	if ((vie->reg & 7) != 4)
2015 		return (EINVAL);
2016 
2017 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
2018 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
2019 
2020 	error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize);
2021 	if (error)
2022 		return (error);
2023 
2024 	/*
2025 	 * Intel SDM, Vol 2, Table 3-2:
2026 	 * "Range of Bit Positions Specified by Bit Offset Operands"
2027 	 */
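	/*
	 * For example, with a 4-byte operand, 'bt $35, mem' masks the
	 * immediate down to 35 & 31 = 3 and tests bit 3.
	 */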
2028 	bitmask = vie->opsize * 8 - 1;
2029 	bitoff = vie->immediate & bitmask;
2030 
2031 	/* Copy the bit into the Carry flag in %rflags */
2032 	if (val & (1UL << bitoff))
2033 		rflags |= PSL_C;
2034 	else
2035 		rflags &= ~PSL_C;
2036 
2037 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
2038 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
2039 
2040 	return (0);
2041 }
2042 
2043 static int
2044 vie_emulate_twob_group15(struct vie *vie, struct vm *vm, int vcpuid,
2045     uint64_t gpa)
2046 {
2047 	int error;
2048 	uint64_t buf;
2049 
2050 	switch (vie->reg & 7) {
2051 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
2052 		if (vie->mod == 0x3) {
2053 			/*
2054 			 * SFENCE.  Ignore it, VM exit provides enough
2055 			 * barriers on its own.
2056 			 */
2057 			error = 0;
2058 		} else {
2059 			/*
2060 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
2061 			 * rights.
2062 			 */
2063 			error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1);
2064 		}
2065 		break;
2066 	default:
2067 		error = EINVAL;
2068 		break;
2069 	}
2070 
2071 	return (error);
2072 }
2073 
2074 static int
2075 vie_emulate_clts(struct vie *vie, struct vm *vm, int vcpuid)
2076 {
2077 	uint64_t val;
2078 	int error __maybe_unused;
2079 
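	/* CLTS is privileged: inject #GP(0) unless executing at CPL 0 */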
2080 	if (vie->paging.cpl != 0) {
2081 		vm_inject_gp(vm, vcpuid);
2082 		vie->num_processed = 0;
2083 		return (0);
2084 	}
2085 
2086 	error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &val);
2087 	ASSERT(error == 0);
2088 
2089 	/* Clear %cr0.TS */
2090 	val &= ~CR0_TS;
2091 
2092 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, val);
2093 	ASSERT(error == 0);
2094 
2095 	return (0);
2096 }
2097 
2098 static int
2099 vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2100     uint64_t *rval, int bytes)
2101 {
2102 	int err;
2103 
2104 	if (vie->mmio_req_read.state == VR_DONE) {
2105 		ASSERT(vie->mmio_req_read.bytes == bytes);
2106 		ASSERT(vie->mmio_req_read.gpa == gpa);
2107 
2108 		*rval = vie->mmio_req_read.data;
2109 		return (0);
2110 	}
2111 
2112 	err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes);
2113 	if (err == 0) {
2114 		/*
2115 		 * A successful read from an in-kernel-emulated device may come
2116 		 * with side effects, so stash the result in case it's used for
2117 		 * an instruction which subsequently needs to issue an MMIO
2118 		 * write to userspace.
2119 		 */
2120 		ASSERT(vie->mmio_req_read.state == VR_NONE);
2121 
2122 		vie->mmio_req_read.bytes = bytes;
2123 		vie->mmio_req_read.gpa = gpa;
2124 		vie->mmio_req_read.data = *rval;
2125 		vie->mmio_req_read.state = VR_DONE;
2126 
2127 	} else if (err == ESRCH) {
2128 		/* Hope that userspace emulation can fulfill this read */
2129 		vie->mmio_req_read.bytes = bytes;
2130 		vie->mmio_req_read.gpa = gpa;
2131 		vie->mmio_req_read.state = VR_PENDING;
2132 		vie->status |= VIES_PENDING_MMIO;
2133 	} else if (err < 0) {
2134 		/*
2135 		 * The MMIO read failed in such a way that fallback to handling
2136 		 * in userspace is required.
2137 		 */
2138 		vie->status |= VIES_USER_FALLBACK;
2139 	}
2140 	return (err);
2141 }
2142 
2143 static int
2144 vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa,
2145     uint64_t wval, int bytes)
2146 {
2147 	int err;
2148 
2149 	if (vie->mmio_req_write.state == VR_DONE) {
2150 		ASSERT(vie->mmio_req_write.bytes == bytes);
2151 		ASSERT(vie->mmio_req_write.gpa == gpa);
2152 
2153 		return (0);
2154 	}
2155 
2156 	err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes);
2157 	if (err == 0) {
2158 		/*
2159 		 * A successful write to an in-kernel-emulated device probably
2160 		 * results in side effects, so stash the fact that such a write
2161 		 * succeeded in case the operation requires other work.
2162 		 */
2163 		vie->mmio_req_write.bytes = bytes;
2164 		vie->mmio_req_write.gpa = gpa;
2165 		vie->mmio_req_write.data = wval;
2166 		vie->mmio_req_write.state = VR_DONE;
2167 	} else if (err == ESRCH) {
2168 		/* Hope that userspace emulation can fulfill this write */
2169 		vie->mmio_req_write.bytes = bytes;
2170 		vie->mmio_req_write.gpa = gpa;
2171 		vie->mmio_req_write.data = wval;
2172 		vie->mmio_req_write.state = VR_PENDING;
2173 		vie->status |= VIES_PENDING_MMIO;
2174 	} else if (err < 0) {
2175 		/*
2176 		 * The MMIO write failed in such a way that fallback to handling
2177 		 * in userspace is required.
2178 		 */
2179 		vie->status |= VIES_USER_FALLBACK;
2180 	}
2181 	return (err);
2182 }
2183 
2184 int
2185 vie_emulate_mmio(struct vie *vie, struct vm *vm, int vcpuid)
2186 {
2187 	int error;
2188 	uint64_t gpa;
2189 
2190 	if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) !=
2191 	    (VIES_INST_DECODE | VIES_MMIO)) {
2192 		return (EINVAL);
2193 	}
2194 
2195 	gpa = vie->mmio_gpa;
2196 
2197 	switch (vie->op.op_type) {
2198 	case VIE_OP_TYPE_GROUP1:
2199 		error = vie_emulate_group1(vie, vm, vcpuid, gpa);
2200 		break;
2201 	case VIE_OP_TYPE_POP:
2202 		error = vie_emulate_pop(vie, vm, vcpuid, gpa);
2203 		break;
2204 	case VIE_OP_TYPE_PUSH:
2205 		error = vie_emulate_push(vie, vm, vcpuid, gpa);
2206 		break;
2207 	case VIE_OP_TYPE_CMP:
2208 		error = vie_emulate_cmp(vie, vm, vcpuid, gpa);
2209 		break;
2210 	case VIE_OP_TYPE_MOV:
2211 		error = vie_emulate_mov(vie, vm, vcpuid, gpa);
2212 		break;
2213 	case VIE_OP_TYPE_MOVSX:
2214 	case VIE_OP_TYPE_MOVZX:
2215 		error = vie_emulate_movx(vie, vm, vcpuid, gpa);
2216 		break;
2217 	case VIE_OP_TYPE_MOVS:
2218 		error = vie_emulate_movs(vie, vm, vcpuid, gpa);
2219 		break;
2220 	case VIE_OP_TYPE_STOS:
2221 		error = vie_emulate_stos(vie, vm, vcpuid, gpa);
2222 		break;
2223 	case VIE_OP_TYPE_AND:
2224 		error = vie_emulate_and(vie, vm, vcpuid, gpa);
2225 		break;
2226 	case VIE_OP_TYPE_OR:
2227 		error = vie_emulate_or(vie, vm, vcpuid, gpa);
2228 		break;
2229 	case VIE_OP_TYPE_SUB:
2230 		error = vie_emulate_sub(vie, vm, vcpuid, gpa);
2231 		break;
2232 	case VIE_OP_TYPE_BITTEST:
2233 		error = vie_emulate_bittest(vie, vm, vcpuid, gpa);
2234 		break;
2235 	case VIE_OP_TYPE_TWOB_GRP15:
2236 		error = vie_emulate_twob_group15(vie, vm, vcpuid, gpa);
2237 		break;
2238 	case VIE_OP_TYPE_ADD:
2239 		error = vie_emulate_add(vie, vm, vcpuid, gpa);
2240 		break;
2241 	case VIE_OP_TYPE_TEST:
2242 		error = vie_emulate_test(vie, vm, vcpuid, gpa);
2243 		break;
2244 	case VIE_OP_TYPE_BEXTR:
2245 		error = vie_emulate_bextr(vie, vm, vcpuid, gpa);
2246 		break;
2247 	default:
2248 		error = EINVAL;
2249 		break;
2250 	}
2251 
2252 	if (error == ESRCH) {
2253 		/* Return to userspace with the mmio request */
2254 		return (-1);
2255 	}
2256 
2257 	return (error);
2258 }
2259 
2260 static int
2261 vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid,
2262     uint32_t *eax)
2263 {
2264 	uint32_t mask, val;
2265 	bool in;
2266 	int err;
2267 
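	/* vie_size2mask() yields e.g. 0xffff for a 2-byte access, isolating %ax */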
2268 	mask = vie_size2mask(vie->inout.bytes);
2269 	in = (vie->inout.flags & INOUT_IN) != 0;
2270 
2271 	if (!in) {
2272 		val = *eax & mask;
2273 	}
2274 
2275 	if (vie->inout_req_state != VR_DONE) {
2276 		err = vm_ioport_access(vm, vcpuid, in, vie->inout.port,
2277 		    vie->inout.bytes, &val);
2278 		val &= mask;
2279 	} else {
2280 		/*
2281 		 * This port access was already handled in userspace and the
2282 		 * result has been injected back to be consumed now.
2283 		 */
2284 		val = vie->inout_req_val & mask;
2285 		vie->inout_req_state = VR_NONE;
2286 		err = 0;
2287 	}
2288 
2289 	if (err == ESRCH) {
2290 		vie->status |= VIES_PENDING_INOUT;
2291 		vie->inout_req_state = VR_PENDING;
2292 		return (err);
2293 	} else if (err != 0) {
2294 		return (err);
2295 	}
2296 
2297 	if (in) {
2298 		*eax = (*eax & ~mask) | val;
2299 	}
2300 	return (0);
2301 }
2302 
2303 static enum vm_reg_name
2304 vie_inout_segname(const struct vie *vie)
2305 {
2306 	uint8_t segidx = vie->inout.segment;
2307 	const enum vm_reg_name segmap[] = {
2308 		VM_REG_GUEST_ES,
2309 		VM_REG_GUEST_CS,
2310 		VM_REG_GUEST_SS,
2311 		VM_REG_GUEST_DS,
2312 		VM_REG_GUEST_FS,
2313 		VM_REG_GUEST_GS,
2314 	};
2315 	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
2316 
2317 	if (segidx >= maxidx) {
2318 		panic("unexpected segment index %u", segidx);
2319 	}
2320 	return (segmap[segidx]);
2321 }
2322 
2323 static int
2324 vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
2325 {
2326 	uint8_t bytes, addrsize;
2327 	uint64_t index, count = 0, gla, rflags;
2328 	int prot, err, fault;
2329 	bool in, repeat;
2330 	enum vm_reg_name seg_reg, idx_reg;
2331 	struct vm_copyinfo copyinfo[2];
2332 
2333 	in = (vie->inout.flags & INOUT_IN) != 0;
2334 	bytes = vie->inout.bytes;
2335 	addrsize = vie->inout.addrsize;
2336 	prot = in ? PROT_WRITE : PROT_READ;
2337 
2338 	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
2339 	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
2340 
2341 	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
2342 	seg_reg = vie_inout_segname(vie);
2343 	err = vm_get_register(vm, vcpuid, idx_reg, &index);
2344 	ASSERT(err == 0);
2345 	index = index & vie_size2mask(addrsize);
2346 
2347 	repeat = (vie->inout.flags & INOUT_REP) != 0;
2348 
2349 	/* Count register */
2350 	if (repeat) {
2351 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
2352 		count &= vie_size2mask(addrsize);
2353 
2354 		if (count == 0) {
2355 			/*
2356 			 * If we were asked to emulate a REP INS/OUTS when the
2357 			 * count register is zero, no further work is required.
2358 			 */
2359 			return (0);
2360 		}
2361 	} else {
2362 		count = 1;
2363 	}
2364 
2365 	gla = 0;
2366 	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
2367 	    idx_reg, &gla) != 0) {
2368 		/* vie_get_gla() already injected the appropriate fault */
2369 		return (0);
2370 	}
2371 
2372 	/*
2373 	 * The INS/OUTS emulation currently assumes that the memory target resides
2374 	 * within the guest system memory, rather than a device MMIO region.  If
2375 	 * such a case becomes a necessity, that additional handling could be
2376 	 * put in place.
2377 	 */
2378 	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
2379 	    copyinfo, nitems(copyinfo), &fault);
2380 
2381 	if (err) {
2382 		/* Unrecoverable error */
2383 		return (err);
2384 	} else if (fault) {
2385 		/* Resume guest to handle fault */
2386 		return (0);
2387 	}
2388 
2389 	if (!in) {
2390 		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
2391 	}
2392 
2393 	err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2394 
2395 	if (err == 0 && in) {
2396 		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
2397 	}
2398 
2399 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
2400 
2401 	if (err == 0) {
2402 		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
2403 		    &rflags);
2404 		ASSERT(err == 0);
2405 
2406 		/* Update index */
2407 		if (rflags & PSL_D) {
2408 			index -= bytes;
2409 		} else {
2410 			index += bytes;
2411 		}
2412 
2413 		/* Update index register */
2414 		err = vie_update_register(vm, vcpuid, idx_reg, index, addrsize);
2415 		ASSERT(err == 0);
2416 
2417 		/*
2418 		 * Update count register only if the instruction had a repeat
2419 		 * prefix.
2420 		 */
2421 		if ((vie->inout.flags & INOUT_REP) != 0) {
2422 			count--;
2423 			err = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
2424 			    count, addrsize);
2425 			ASSERT(err == 0);
2426 
2427 			if (count != 0) {
2428 				return (vie_repeat(vie));
2429 			}
2430 		}
2431 	}
2432 
2433 	return (err);
2434 }
2435 
2436 int
2437 vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
2438 {
2439 	int err = 0;
2440 
2441 	if ((vie->status & VIES_INOUT) == 0) {
2442 		return (EINVAL);
2443 	}
2444 
2445 	if ((vie->inout.flags & INOUT_STR) == 0) {
2446 		/*
2447 		 * For now, using the 'rep' prefixes with plain (non-string)
2448 		 * in/out is not supported.
2449 		 */
2450 		if ((vie->inout.flags & INOUT_REP) != 0) {
2451 			return (EINVAL);
2452 		}
2453 
2454 		err = vie_emulate_inout_port(vie, vm, vcpuid, &vie->inout.eax);
2455 		if (err == 0 && (vie->inout.flags & INOUT_IN) != 0) {
2456 			/*
2457 			 * Now that the inX access has succeeded, the result
2458 			 * needs to be stored in the guest %rax.
2459 			 */
2460 			err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
2461 			    vie->inout.eax);
2462 			VERIFY0(err);
2463 		}
2464 	} else {
2465 		vie->status &= ~VIES_REPEAT;
2466 		err = vie_emulate_inout_str(vie, vm, vcpuid);
2467 
2468 	}
2469 	if (err < 0) {
2470 		/*
2471 		 * Access to an I/O port failed in such a way that fallback to
2472 		 * handling in userspace is required.
2473 		 */
2474 		vie->status |= VIES_USER_FALLBACK;
2475 	} else if (err == ESRCH) {
2476 		ASSERT(vie->status & VIES_PENDING_INOUT);
2477 		/* Return to userspace with the in/out request */
2478 		err = -1;
2479 	}
2480 
2481 	return (err);
2482 }
2483 
2484 int
2485 vie_emulate_other(struct vie *vie, struct vm *vm, int vcpuid)
2486 {
2487 	int error;
2488 
2489 	if ((vie->status & (VIES_INST_DECODE | VIES_OTHER)) !=
2490 	    (VIES_INST_DECODE | VIES_OTHER)) {
2491 		return (EINVAL);
2492 	}
2493 
2494 	switch (vie->op.op_type) {
2495 	case VIE_OP_TYPE_CLTS:
2496 		error = vie_emulate_clts(vie, vm, vcpuid);
2497 		break;
2498 	case VIE_OP_TYPE_MOV_CR:
2499 		error = vie_emulate_mov_cr(vie, vm, vcpuid);
2500 		break;
2501 	default:
2502 		error = EINVAL;
2503 		break;
2504 	}
2505 
2506 	return (error);
2507 }
2508 
2509 void
2510 vie_reset(struct vie *vie)
2511 {
2512 	vie->status = 0;
2513 	vie->num_processed = vie->num_valid = 0;
2514 }
2515 
2516 void
2517 vie_advance_pc(struct vie *vie, uint64_t *nextrip)
2518 {
2519 	VERIFY((vie->status & VIES_REPEAT) == 0);
2520 
2521 	*nextrip += vie->num_processed;
2522 	vie_reset(vie);
2523 }
2524 
2525 void
2526 vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
2527 {
2528 	if (vie->status & VIES_USER_FALLBACK) {
2529 		/*
2530 		 * Despite the fact that the instruction was successfully
2531 		 * decoded, some aspect of the emulation failed in such a way
2532 		 * that it is left up to userspace to complete the operation.
2533 		 */
2534 		vie_fallback_exitinfo(vie, vme);
2535 	} else if (vie->status & VIES_MMIO) {
2536 		vme->exitcode = VM_EXITCODE_MMIO;
2537 		if (vie->mmio_req_read.state == VR_PENDING) {
2538 			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
2539 			vme->u.mmio.data = 0;
2540 			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
2541 			vme->u.mmio.read = 1;
2542 		} else if (vie->mmio_req_write.state == VR_PENDING) {
2543 			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
2544 			vme->u.mmio.data = vie->mmio_req_write.data &
2545 			    vie_size2mask(vie->mmio_req_write.bytes);
2546 			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
2547 			vme->u.mmio.read = 0;
2548 		} else {
2549 			panic("bad pending MMIO state");
2550 		}
2551 	} else if (vie->status & VIES_INOUT) {
2552 		vme->exitcode = VM_EXITCODE_INOUT;
2553 		vme->u.inout.port = vie->inout.port;
2554 		vme->u.inout.bytes = vie->inout.bytes;
2555 		if ((vie->inout.flags & INOUT_IN) != 0) {
2556 			vme->u.inout.flags = INOUT_IN;
2557 			vme->u.inout.eax = 0;
2558 		} else {
2559 			vme->u.inout.flags = 0;
2560 			vme->u.inout.eax = vie->inout.eax &
2561 			    vie_size2mask(vie->inout.bytes);
2562 		}
2563 	} else {
2564 		panic("no pending operation");
2565 	}
2566 }
2567 
2568 /*
2569  * In the case of a decoding or verification failure, bailing out to userspace
2570  * to do the instruction emulation is our only option for now.
2571  */
2572 void
2573 vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
2574 {
2575 	if ((vie->status & VIES_INST_FETCH) == 0) {
2576 		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
2577 	} else {
2578 		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
2579 
2580 		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
2581 		vme->u.inst_emul.num_valid = vie->num_valid;
2582 	}
2583 	vme->exitcode = VM_EXITCODE_INST_EMUL;
2584 }
2585 
2586 void
2587 vie_cs_info(const struct vie *vie, struct vm *vm, int vcpuid, uint64_t *cs_base,
2588     int *cs_d)
2589 {
2590 	struct seg_desc cs_desc;
2591 	int error __maybe_unused;
2592 
2593 	error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs_desc);
2594 	ASSERT(error == 0);
2595 
2596 	/* Initialization required for the paging info to be populated */
2597 	VERIFY(vie->status & VIES_INIT);
2598 	switch (vie->paging.cpu_mode) {
2599 	case CPU_MODE_REAL:
2600 		*cs_base = cs_desc.base;
2601 		*cs_d = 0;
2602 		break;
2603 	case CPU_MODE_PROTECTED:
2604 	case CPU_MODE_COMPATIBILITY:
2605 		*cs_base = cs_desc.base;
2606 		*cs_d = SEG_DESC_DEF32(cs_desc.access) ? 1 : 0;
2607 		break;
2608 	default:
2609 		*cs_base = 0;
2610 		*cs_d = 0;
2611 		break;
2612 	}
2613 }
2614 
2615 bool
2616 vie_pending(const struct vie *vie)
2617 {
2618 	/*
2619 	 * These VIE status bits indicate conditions which must be addressed
2620 	 * through either device IO fulfillment (with corresponding
2621 	 * vie_fulfill_*()) or complete userspace emulation (followed by a
2622 	 * vie_reset()).
2623 	 */
2624 	const enum vie_status of_interest =
2625 	    VIES_PENDING_MMIO | VIES_PENDING_INOUT | VIES_USER_FALLBACK;
2626 
2627 	return ((vie->status & of_interest) != 0);
2628 }
2629 
2630 bool
2631 vie_needs_fetch(const struct vie *vie)
2632 {
2633 	if (vie->status & VIES_INST_FETCH) {
2634 		ASSERT(vie->num_valid != 0);
2635 		return (false);
2636 	}
2637 	return (true);
2638 }
2639 
2640 static int
2641 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
2642 {
2643 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2644 	    ("%s: invalid size %d", __func__, size));
2645 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
2646 
2647 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
2648 		return (0);
2649 
2650 	return ((gla & (size - 1)) ? 1 : 0);
2651 }
2652 
2653 static int
2654 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
2655 {
2656 	uint64_t mask;
2657 
2658 	if (cpu_mode != CPU_MODE_64BIT)
2659 		return (0);
2660 
2661 	/*
2662 	 * The value of bit 47 in the 'gla' should be replicated in the
2663 	 * most significant 16 bits.
2664 	 */
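	/*
	 * For example, 0x00007fffffffffff and 0xffff800000000000 are
	 * canonical, while 0x0000800000000000 is not.
	 */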
2665 	mask = ~((1UL << 48) - 1);
2666 	if (gla & (1UL << 47))
2667 		return ((gla & mask) != mask);
2668 	else
2669 		return ((gla & mask) != 0);
2670 }
2671 
2672 static uint64_t
2673 vie_size2mask(int size)
2674 {
2675 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
2676 	    ("vie_size2mask: invalid size %d", size));
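	/* size2mask[] maps sizes 1/2/4/8 to 0xff, 0xffff, 0xffffffff and ~0 */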
2677 	return (size2mask[size]);
2678 }
2679 
2680 static int
2681 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
2682     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
2683     int prot, uint64_t *gla)
2684 {
2685 	uint64_t firstoff, low_limit, high_limit, segbase;
2686 	int glasize, type;
2687 
2688 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
2689 	    ("%s: invalid segment %d", __func__, seg));
2690 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
2691 	    ("%s: invalid operand size %d", __func__, length));
2692 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
2693 	    ("%s: invalid prot %x", __func__, prot));
2694 
2695 	firstoff = offset;
2696 	if (cpu_mode == CPU_MODE_64BIT) {
2697 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
2698 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
2699 		glasize = 8;
2700 	} else {
2701 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
2702 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
2703 		glasize = 4;
2704 		/*
2705 		 * If the segment selector is loaded with a NULL selector
2706 		 * then the descriptor is unusable and attempting to use
2707 		 * it results in a #GP(0).
2708 		 */
2709 		if (SEG_DESC_UNUSABLE(desc->access))
2710 			return (-1);
2711 
2712 		/*
2713 		 * The processor generates a #NP exception when a segment
2714 		 * register is loaded with a selector that points to a
2715 		 * descriptor that is not present. If this was the case then
2716 		 * it would have been checked before the VM-exit.
2717 		 */
2718 		KASSERT(SEG_DESC_PRESENT(desc->access),
2719 		    ("segment %d not present: %x", seg, desc->access));
2720 
2721 		/*
2722 		 * The descriptor type must indicate a code/data segment.
2723 		 */
2724 		type = SEG_DESC_TYPE(desc->access);
2725 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
2726 		    "descriptor type %x", seg, type));
2727 
2728 		if (prot & PROT_READ) {
2729 			/* #GP on a read access to an execute-only code segment */
2730 			if ((type & 0xA) == 0x8)
2731 				return (-1);
2732 		}
2733 
2734 		if (prot & PROT_WRITE) {
2735 			/*
2736 			 * #GP on a write access to a code segment or a
2737 			 * read-only data segment.
2738 			 */
2739 			if (type & 0x8)			/* code segment */
2740 				return (-1);
2741 
2742 			if ((type & 0xA) == 0)		/* read-only data seg */
2743 				return (-1);
2744 		}
2745 
2746 		/*
2747 		 * 'desc->limit' is fully expanded taking granularity into
2748 		 * account.
2749 		 */
2750 		if ((type & 0xC) == 0x4) {
2751 			/* expand-down data segment */
2752 			low_limit = desc->limit + 1;
2753 			high_limit = SEG_DESC_DEF32(desc->access) ?
2754 			    0xffffffff : 0xffff;
2755 		} else {
2756 			/* code segment or expand-up data segment */
2757 			low_limit = 0;
2758 			high_limit = desc->limit;
2759 		}
2760 
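		/* Verify that every byte of the access lies within the limit */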
2761 		while (length > 0) {
2762 			offset &= vie_size2mask(addrsize);
2763 			if (offset < low_limit || offset > high_limit)
2764 				return (-1);
2765 			offset++;
2766 			length--;
2767 		}
2768 	}
2769 
2770 	/*
2771 	 * In 64-bit mode all segments except %fs and %gs have a segment
2772 	 * base address of 0.
2773 	 */
2774 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2775 	    seg != VM_REG_GUEST_GS) {
2776 		segbase = 0;
2777 	} else {
2778 		segbase = desc->base;
2779 	}
2780 
2781 	/*
2782 	 * Truncate 'firstoff' to the effective address size before adding
2783 	 * it to the segment base.
2784 	 */
2785 	firstoff &= vie_size2mask(addrsize);
2786 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
2787 	return (0);
2788 }
2789 
2790 void
2791 vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
2792     const struct vm_guest_paging *paging, uint64_t gpa)
2793 {
2794 	KASSERT(inst_length <= VIE_INST_SIZE,
2795 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
2796 
2797 	bzero(vie, sizeof (struct vie));
2798 
2799 	vie->base_register = VM_REG_LAST;
2800 	vie->index_register = VM_REG_LAST;
2801 	vie->segment_register = VM_REG_LAST;
2802 	vie->status = VIES_INIT | VIES_MMIO;
2803 
2804 	if (inst_length != 0) {
2805 		bcopy(inst_bytes, vie->inst, inst_length);
2806 		vie->num_valid = inst_length;
2807 		vie->status |= VIES_INST_FETCH;
2808 	}
2809 
2810 	vie->paging = *paging;
2811 	vie->mmio_gpa = gpa;
2812 }
2813 
2814 void
2815 vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
2816     const struct vm_guest_paging *paging)
2817 {
2818 	bzero(vie, sizeof (struct vie));
2819 
2820 	vie->status = VIES_INIT | VIES_INOUT;
2821 
2822 	vie->inout = *inout;
2823 	vie->paging = *paging;
2824 
2825 	/*
2826 	 * Since the VMX/SVM hardware assists have already decoded the nature
2827 	 * of the in/out instruction, let the status reflect that.
2828 	 */
2829 	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
2830 	vie->num_processed = inst_len;
2831 }
2832 
2833 void
2834 vie_init_other(struct vie *vie, const struct vm_guest_paging *paging)
2835 {
2836 	bzero(vie, sizeof (struct vie));
2837 
2838 	vie->base_register = VM_REG_LAST;
2839 	vie->index_register = VM_REG_LAST;
2840 	vie->segment_register = VM_REG_LAST;
2841 	vie->status = VIES_INIT | VIES_OTHER;
2842 
2843 	vie->paging = *paging;
2844 }
2845 
2846 int
2847 vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
2848 {
2849 	struct vie_mmio *pending;
2850 
2851 	if ((vie->status & VIES_MMIO) == 0 ||
2852 	    (vie->status & VIES_PENDING_MMIO) == 0) {
2853 		return (EINVAL);
2854 	}
2855 
2856 	if (result->read) {
2857 		pending = &vie->mmio_req_read;
2858 	} else {
2859 		pending = &vie->mmio_req_write;
2860 	}
2861 
2862 	if (pending->state != VR_PENDING ||
2863 	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
2864 		return (EINVAL);
2865 	}
2866 
2867 	if (result->read) {
2868 		pending->data = result->data & vie_size2mask(pending->bytes);
2869 	}
2870 	pending->state = VR_DONE;
2871 	vie->status &= ~VIES_PENDING_MMIO;
2872 
2873 	return (0);
2874 }
2875 
2876 int
2877 vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
2878 {
2879 	if ((vie->status & VIES_INOUT) == 0 ||
2880 	    (vie->status & VIES_PENDING_INOUT) == 0) {
2881 		return (EINVAL);
2882 	}
2883 	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
2884 	    vie->inout.bytes != result->bytes ||
2885 	    vie->inout.port != result->port) {
2886 		return (EINVAL);
2887 	}
2888 
2889 	if (result->flags & INOUT_IN) {
2890 		vie->inout_req_val = result->eax &
2891 		    vie_size2mask(vie->inout.bytes);
2892 	}
2893 	vie->inout_req_state = VR_DONE;
2894 	vie->status &= ~(VIES_PENDING_INOUT);
2895 
2896 	return (0);
2897 }
2898 
2899 uint64_t
2900 vie_mmio_gpa(const struct vie *vie)
2901 {
2902 	return (vie->mmio_gpa);
2903 }
2904 
2905 static int
2906 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
2907 {
2908 	int error_code = 0;
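	/* Compose the #PF error code from the P, W/R, U/S, RSVD and I/D bits */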
2909 
2910 	if (pte & PG_V)
2911 		error_code |= PGEX_P;
2912 	if (prot & PROT_WRITE)
2913 		error_code |= PGEX_W;
2914 	if (usermode)
2915 		error_code |= PGEX_U;
2916 	if (rsvd)
2917 		error_code |= PGEX_RSV;
2918 	if (prot & PROT_EXEC)
2919 		error_code |= PGEX_I;
2920 
2921 	return (error_code);
2922 }
2923 
2924 static void
2925 ptp_release(vm_page_t **vmp)
2926 {
2927 	if (*vmp != NULL) {
2928 		(void) vmp_release(*vmp);
2929 		*vmp = NULL;
2930 	}
2931 }
2932 
2933 static void *
2934 ptp_hold(struct vm *vm, int vcpu, uintptr_t gpa, size_t len, vm_page_t **vmp)
2935 {
2936 	vm_client_t *vmc = vm_get_vmclient(vm, vcpu);
2937 	const uintptr_t hold_gpa = gpa & PAGEMASK;
2938 
2939 	/* Hold must not cross a page boundary */
2940 	VERIFY3U(gpa + len, <=, hold_gpa + PAGESIZE);
2941 
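	/* Release any previously held page before taking the new hold */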
2942 	if (*vmp != NULL) {
2943 		(void) vmp_release(*vmp);
2944 	}
2945 
2946 	*vmp = vmc_hold(vmc, hold_gpa, PROT_READ | PROT_WRITE);
2947 	if (*vmp == NULL) {
2948 		return (NULL);
2949 	}
2950 
2951 	return ((caddr_t)vmp_get_writable(*vmp) + (gpa - hold_gpa));
2952 }
2953 
2954 static int
2955 _vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2956     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
2957 {
2958 	int nlevels, pfcode;
2959 	int ptpshift = 0, ptpindex = 0;
2960 	uint64_t ptpphys;
2961 	uint64_t *ptpbase = NULL, pte = 0, pgsize = 0;
2962 	vm_page_t *cookie = NULL;
2963 	const bool usermode = paging->cpl == 3;
2964 	const bool writable = (prot & PROT_WRITE) != 0;
2965 
2966 	*guest_fault = 0;
2967 restart:
2968 	ptpphys = paging->cr3;		/* root of the page tables */
2969 	ptp_release(&cookie);
2970 
2971 	if (vie_canonical_check(paging->cpu_mode, gla)) {
2972 		/*
2973 		 * XXX assuming a non-stack reference; otherwise a stack
2974 		 * fault should be generated.
2975 		 */
2976 		if (!check_only)
2977 			vm_inject_gp(vm, vcpuid);
2978 		*guest_fault = 1;
2979 		return (0);
2980 	}
2981 
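	/* With paging disabled, the linear address is the physical address */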
2982 	if (paging->paging_mode == PAGING_MODE_FLAT) {
2983 		*gpa = gla;
2984 		return (0);
2985 	}
2986 
2987 	if (paging->paging_mode == PAGING_MODE_32) {
2988 		uint32_t *ptpbase32, pte32;
2989 
2990 		nlevels = 2;
2991 		while (--nlevels >= 0) {
2992 			/* Zero out the lower 12 bits. */
2993 			ptpphys &= ~0xfff;
2994 
2995 			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
2996 			    &cookie);
2997 
2998 			if (ptpbase32 == NULL) {
2999 				return (EFAULT);
3000 			}
3001 
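			/*
			 * ptpshift is 22 at the page-directory level (4MB
			 * pages via PSE) and 12 at the page-table level
			 * (4KB pages).
			 */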
3002 			ptpshift = PAGE_SHIFT + nlevels * 10;
3003 			ptpindex = (gla >> ptpshift) & 0x3FF;
3004 			pgsize = 1UL << ptpshift;
3005 
3006 			pte32 = ptpbase32[ptpindex];
3007 
3008 			if ((pte32 & PG_V) == 0 ||
3009 			    (usermode && (pte32 & PG_U) == 0) ||
3010 			    (writable && (pte32 & PG_RW) == 0)) {
3011 				if (!check_only) {
3012 					pfcode = pf_error_code(usermode, prot,
3013 					    0, pte32);
3014 					vm_inject_pf(vm, vcpuid, pfcode, gla);
3015 				}
3016 
3017 				ptp_release(&cookie);
3018 				*guest_fault = 1;
3019 				return (0);
3020 			}
3021 
3022 			/*
3023 			 * Emulate the x86 MMU's management of the accessed
3024 			 * and dirty flags. While the accessed flag is set
3025 			 * at every level of the page table, the dirty flag
3026 			 * is only set at the last level providing the guest
3027 			 * physical address.
3028 			 */
3029 			if (!check_only && (pte32 & PG_A) == 0) {
3030 				if (atomic_cmpset_32(&ptpbase32[ptpindex],
3031 				    pte32, pte32 | PG_A) == 0) {
3032 					goto restart;
3033 				}
3034 			}
3035 
3036 			/* XXX must be ignored if CR4.PSE=0 */
3037 			if (nlevels > 0 && (pte32 & PG_PS) != 0)
3038 				break;
3039 
3040 			ptpphys = pte32;
3041 		}
3042 
3043 		/* Set the dirty bit in the page table entry if necessary */
3044 		if (!check_only && writable && (pte32 & PG_M) == 0) {
3045 			if (atomic_cmpset_32(&ptpbase32[ptpindex],
3046 			    pte32, pte32 | PG_M) == 0) {
3047 				goto restart;
3048 			}
3049 		}
3050 
3051 		/* Zero out the lower 'ptpshift' bits */
3052 		pte32 >>= ptpshift; pte32 <<= ptpshift;
3053 		*gpa = pte32 | (gla & (pgsize - 1));
3054 		ptp_release(&cookie);
3055 		return (0);
3056 	}
3057 
3058 	if (paging->paging_mode == PAGING_MODE_PAE) {
3059 		/* Zero out the lower 5 bits and the upper 32 bits */
3060 		ptpphys &= 0xffffffe0UL;
3061 
3062 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof (*ptpbase) * 4,
3063 		    &cookie);
3064 		if (ptpbase == NULL) {
3065 			return (EFAULT);
3066 		}
3067 
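		/* The 4-entry PAE PDPT is indexed by bits 31:30 of the GLA */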
3068 		ptpindex = (gla >> 30) & 0x3;
3069 
3070 		pte = ptpbase[ptpindex];
3071 
3072 		if ((pte & PG_V) == 0) {
3073 			if (!check_only) {
3074 				pfcode = pf_error_code(usermode, prot, 0, pte);
3075 				vm_inject_pf(vm, vcpuid, pfcode, gla);
3076 			}
3077 
3078 			ptp_release(&cookie);
3079 			*guest_fault = 1;
3080 			return (0);
3081 		}
3082 
3083 		ptpphys = pte;
3084 
3085 		nlevels = 2;
3086 	} else {
3087 		nlevels = 4;
3088 	}
3089 
3090 	while (--nlevels >= 0) {
3091 		/* Zero out the lower 12 bits and the upper 12 bits */
3092 		ptpphys &= 0x000ffffffffff000UL;
3093 
3094 		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
3095 		if (ptpbase == NULL) {
3096 			return (EFAULT);
3097 		}
3098 
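		/*
		 * ptpshift is 39, 30, 21 and 12 at the PML4E, PDPTE, PDE
		 * and PTE levels respectively.
		 */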
3099 		ptpshift = PAGE_SHIFT + nlevels * 9;
3100 		ptpindex = (gla >> ptpshift) & 0x1FF;
3101 		pgsize = 1UL << ptpshift;
3102 
3103 		pte = ptpbase[ptpindex];
3104 
3105 		if ((pte & PG_V) == 0 ||
3106 		    (usermode && (pte & PG_U) == 0) ||
3107 		    (writable && (pte & PG_RW) == 0)) {
3108 			if (!check_only) {
3109 				pfcode = pf_error_code(usermode, prot, 0, pte);
3110 				vm_inject_pf(vm, vcpuid, pfcode, gla);
3111 			}
3112 
3113 			ptp_release(&cookie);
3114 			*guest_fault = 1;
3115 			return (0);
3116 		}
3117 
3118 		/* Set the accessed bit in the page table entry */
3119 		if (!check_only && (pte & PG_A) == 0) {
3120 			if (atomic_cmpset_64(&ptpbase[ptpindex],
3121 			    pte, pte | PG_A) == 0) {
3122 				goto restart;
3123 			}
3124 		}
3125 
3126 		if (nlevels > 0 && (pte & PG_PS) != 0) {
3127 			if (pgsize > 1 * GB) {
3128 				if (!check_only) {
3129 					pfcode = pf_error_code(usermode, prot,
3130 					    1, pte);
3131 					vm_inject_pf(vm, vcpuid, pfcode, gla);
3132 				}
3133 
3134 				ptp_release(&cookie);
3135 				*guest_fault = 1;
3136 				return (0);
3137 			}
3138 			break;
3139 		}
3140 
3141 		ptpphys = pte;
3142 	}
3143 
3144 	/* Set the dirty bit in the page table entry if necessary */
3145 	if (!check_only && writable && (pte & PG_M) == 0) {
3146 		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
3147 			goto restart;
3148 	}
3149 	ptp_release(&cookie);
3150 
3151 	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
3152 	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
3153 	*gpa = pte | (gla & (pgsize - 1));
3154 	return (0);
3155 }
3156 
3157 int
3158 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3159     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
3160 {
3161 
3162 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
3163 	    false));
3164 }
3165 
3166 int
3167 vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
3168     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
3169 {
3170 
3171 	return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
3172 	    true));
3173 }
3174 
3175 int
3176 vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
3177     int *faultptr)
3178 {
3179 	struct vm_copyinfo copyinfo[2];
3180 	int error, prot;
3181 
3182 	if ((vie->status & VIES_INIT) == 0) {
3183 		return (EINVAL);
3184 	}
3185 
3186 	prot = PROT_READ | PROT_EXEC;
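	/*
	 * Copy in up to VIE_INST_SIZE (15) bytes, the architectural maximum
	 * length of an x86 instruction, starting at the guest %rip.
	 */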
3187 	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
3188 	    prot, copyinfo, nitems(copyinfo), faultptr);
3189 	if (error || *faultptr)
3190 		return (error);
3191 
3192 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
3193 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
3194 	vie->num_valid = VIE_INST_SIZE;
3195 	vie->status |= VIES_INST_FETCH;
3196 	return (0);
3197 }
3198 
3199 static int
3200 vie_peek(struct vie *vie, uint8_t *x)
3201 {
3202 
3203 	if (vie->num_processed < vie->num_valid) {
3204 		*x = vie->inst[vie->num_processed];
3205 		return (0);
3206 	} else
3207 		return (-1);
3208 }
3209 
3210 static void
3211 vie_advance(struct vie *vie)
3212 {
3213 
3214 	vie->num_processed++;
3215 }
3216 
3217 static bool
3218 segment_override(uint8_t x, int *seg)
3219 {
3220 
3221 	switch (x) {
3222 	case 0x2E:
3223 		*seg = VM_REG_GUEST_CS;
3224 		break;
3225 	case 0x36:
3226 		*seg = VM_REG_GUEST_SS;
3227 		break;
3228 	case 0x3E:
3229 		*seg = VM_REG_GUEST_DS;
3230 		break;
3231 	case 0x26:
3232 		*seg = VM_REG_GUEST_ES;
3233 		break;
3234 	case 0x64:
3235 		*seg = VM_REG_GUEST_FS;
3236 		break;
3237 	case 0x65:
3238 		*seg = VM_REG_GUEST_GS;
3239 		break;
3240 	default:
3241 		return (false);
3242 	}
3243 	return (true);
3244 }
3245 
3246 static int
3247 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
3248 {
3249 	uint8_t x;
3250 
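	/* Consume any legacy prefixes: operand/address size, REP, segment */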
3251 	while (1) {
3252 		if (vie_peek(vie, &x))
3253 			return (-1);
3254 
3255 		if (x == 0x66)
3256 			vie->opsize_override = 1;
3257 		else if (x == 0x67)
3258 			vie->addrsize_override = 1;
3259 		else if (x == 0xF3)
3260 			vie->repz_present = 1;
3261 		else if (x == 0xF2)
3262 			vie->repnz_present = 1;
3263 		else if (segment_override(x, &vie->segment_register))
3264 			vie->segment_override = 1;
3265 		else
3266 			break;
3267 
3268 		vie_advance(vie);
3269 	}
3270 
3271 	/*
3272 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
3273 	 * - Only one REX prefix is allowed per instruction.
3274 	 * - The REX prefix must immediately precede the opcode byte or the
3275 	 *   escape opcode byte.
3276 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
3277 	 *   the mandatory prefix must come before the REX prefix.
3278 	 */
3279 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
3280 		vie->rex_present = 1;
3281 		vie->rex_w = x & 0x8 ? 1 : 0;
3282 		vie->rex_r = x & 0x4 ? 1 : 0;
3283 		vie->rex_x = x & 0x2 ? 1 : 0;
3284 		vie->rex_b = x & 0x1 ? 1 : 0;
3285 		vie_advance(vie);
3286 	}
3287 
3288 	/*
3289 	 * § 2.3.5, "The VEX Prefix", SDM Vol 2.
3290 	 */
3291 	if ((cpu_mode == CPU_MODE_64BIT ||
3292 	    cpu_mode == CPU_MODE_COMPATIBILITY) && x == 0xC4) {
3293 		const struct vie_op *optab;
3294 
3295 		/* 3-byte VEX prefix. */
3296 		vie->vex_present = 1;
3297 
3298 		vie_advance(vie);
3299 		if (vie_peek(vie, &x))
3300 			return (-1);
3301 
3302 		/*
3303 		 * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
3304 		 * relative to REX encoding.
3305 		 */
3306 		vie->rex_r = x & 0x80 ? 0 : 1;
3307 		vie->rex_x = x & 0x40 ? 0 : 1;
3308 		vie->rex_b = x & 0x20 ? 0 : 1;
3309 
3310 		switch (x & 0x1F) {
3311 		case 0x2:
3312 			/* 0F 38. */
3313 			optab = three_byte_opcodes_0f38;
3314 			break;
3315 		case 0x1:
3316 			/* 0F class - nothing handled here yet. */
3317 			/* FALLTHROUGH */
3318 		case 0x3:
3319 			/* 0F 3A class - nothing handled here yet. */
3320 			/* FALLTHROUGH */
3321 		default:
3322 			/* Reserved (#UD). */
3323 			return (-1);
3324 		}
3325 
3326 		vie_advance(vie);
3327 		if (vie_peek(vie, &x))
3328 			return (-1);
3329 
3330 		/* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
3331 		vie->rex_w = x & 0x80 ? 1 : 0;
3332 
3333 		vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
3334 		vie->vex_l = !!(x & 0x4);
3335 		vie->vex_pp = (x & 0x3);
3336 
3337 		/* PP: 1=66 2=F3 3=F2 prefixes. */
3338 		switch (vie->vex_pp) {
3339 		case 0x1:
3340 			vie->opsize_override = 1;
3341 			break;
3342 		case 0x2:
3343 			vie->repz_present = 1;
3344 			break;
3345 		case 0x3:
3346 			vie->repnz_present = 1;
3347 			break;
3348 		}
3349 
3350 		vie_advance(vie);
3351 
3352 		/* Opcode byte; the escape prefix is implied by VEX.mmmmm. */
3353 		if (vie_peek(vie, &x))
3354 			return (-1);
3355 
3356 		vie->op = optab[x];
3357 		if (vie->op.op_type == VIE_OP_TYPE_NONE)
3358 			return (-1);
3359 
3360 		vie_advance(vie);
3361 	}
3362 
3363 	/*
3364 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
3365 	 */
3366 	if (cpu_mode == CPU_MODE_64BIT) {
3367 		/*
3368 		 * Default address size is 64-bits and default operand size
3369 		 * is 32-bits.
3370 		 */
3371 		vie->addrsize = vie->addrsize_override ? 4 : 8;
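		/* REX.W takes precedence over a 66H operand-size override. */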
3372 		if (vie->rex_w)
3373 			vie->opsize = 8;
3374 		else if (vie->opsize_override)
3375 			vie->opsize = 2;
3376 		else
3377 			vie->opsize = 4;
3378 	} else if (cs_d) {
3379 		/* Default address and operand sizes are 32-bits */
3380 		vie->addrsize = vie->addrsize_override ? 2 : 4;
3381 		vie->opsize = vie->opsize_override ? 2 : 4;
3382 	} else {
3383 		/* Default address and operand sizes are 16-bits */
3384 		vie->addrsize = vie->addrsize_override ? 4 : 2;
3385 		vie->opsize = vie->opsize_override ? 4 : 2;
3386 	}
3387 	return (0);
3388 }
3389 
3390 static int
3391 decode_two_byte_opcode(struct vie *vie)
3392 {
3393 	uint8_t x;
3394 
3395 	if (vie_peek(vie, &x))
3396 		return (-1);
3397 
3398 	vie->op = two_byte_opcodes[x];
3399 
3400 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3401 		return (-1);
3402 
3403 	vie_advance(vie);
3404 	return (0);
3405 }
3406 
3407 static int
3408 decode_opcode(struct vie *vie)
3409 {
3410 	uint8_t x;
3411 
3412 	if (vie_peek(vie, &x))
3413 		return (-1);
3414 
3415 	/* Already did this via VEX prefix. */
3416 	if (vie->op.op_type != VIE_OP_TYPE_NONE)
3417 		return (0);
3418 
3419 	vie->op = one_byte_opcodes[x];
3420 
3421 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
3422 		return (-1);
3423 
3424 	vie_advance(vie);
3425 
3426 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
3427 		return (decode_two_byte_opcode(vie));
3428 
3429 	return (0);
3430 }
3431 
3432 static int
3433 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
3434 {
3435 	uint8_t x;
3436 	/*
3437 	 * Handling mov-to/from-cr is special since it does not issue
3438 	 * mmio/pio requests and can be done in real mode.  We must bypass some
3439 	 * of the other existing decoding restrictions for it.
3440 	 */
3441 	const bool is_movcr = ((vie->op.op_flags & VIE_OP_F_REG_REG) != 0);
3442 
3443 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
3444 		return (0);
3445 
3446 	if (cpu_mode == CPU_MODE_REAL && !is_movcr)
3447 		return (-1);
3448 
3449 	if (vie_peek(vie, &x))
3450 		return (-1);
3451 
3452 	vie->mod = (x >> 6) & 0x3;
3453 	vie->rm =  (x >> 0) & 0x7;
3454 	vie->reg = (x >> 3) & 0x7;
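	/* e.g. a ModRM byte of 0x88 decodes to mod=2, reg=1, rm=0 */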
3455 
3456 	/*
3457 	 * A direct addressing mode makes no sense in the context of an EPT
3458 	 * fault. There has to be a memory access involved to cause the
3459 	 * EPT fault.
3460 	 */
3461 	if (vie->mod == VIE_MOD_DIRECT && !is_movcr)
3462 		return (-1);
3463 
3464 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
3465 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
3466 		/*
3467 		 * Table 2-5: Special Cases of REX Encodings
3468 		 *
3469 		 * mod=0, r/m=5 is used in the compatibility mode to
3470 		 * indicate a disp32 without a base register.
3471 		 *
3472 		 * mod!=3, r/m=4 is used in the compatibility mode to
3473 		 * indicate that the SIB byte is present.
3474 		 *
3475 		 * The 'b' bit in the REX prefix is a don't-care in
3476 		 * this case.
3477 		 */
3478 	} else {
3479 		vie->rm |= (vie->rex_b << 3);
3480 	}
3481 
3482 	vie->reg |= (vie->rex_r << 3);
3483 
3484 	/* SIB */
3485 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
3486 		goto done;
3487 
3488 	vie->base_register = gpr_map[vie->rm];
3489 
3490 	switch (vie->mod) {
3491 	case VIE_MOD_INDIRECT_DISP8:
3492 		vie->disp_bytes = 1;
3493 		break;
3494 	case VIE_MOD_INDIRECT_DISP32:
3495 		vie->disp_bytes = 4;
3496 		break;
3497 	case VIE_MOD_INDIRECT:
3498 		if (vie->rm == VIE_RM_DISP32) {
3499 			vie->disp_bytes = 4;
3500 			/*
3501 			 * Table 2-7. RIP-Relative Addressing
3502 			 *
3503 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
3504 			 * whereas in compatibility mode it just implies disp32.
3505 			 */
3506 
3507 			if (cpu_mode == CPU_MODE_64BIT)
3508 				vie->base_register = VM_REG_GUEST_RIP;
3509 			else
3510 				vie->base_register = VM_REG_LAST;
3511 		}
3512 		break;
3513 	}
3514 
3515 done:
3516 	vie_advance(vie);
3517 
3518 	return (0);
3519 }
3520 
3521 static int
3522 decode_sib(struct vie *vie)
3523 {
3524 	uint8_t x;
3525 
3526 	/* Proceed only if SIB byte is present */
3527 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
3528 		return (0);
3529 
3530 	if (vie_peek(vie, &x))
3531 		return (-1);
3532 
3533 	/* De-construct the SIB byte */
3534 	vie->ss = (x >> 6) & 0x3;
3535 	vie->index = (x >> 3) & 0x7;
3536 	vie->base = (x >> 0) & 0x7;
3537 
3538 	/* Apply the REX prefix modifiers */
3539 	vie->index |= vie->rex_x << 3;
3540 	vie->base |= vie->rex_b << 3;
3541 
3542 	switch (vie->mod) {
3543 	case VIE_MOD_INDIRECT_DISP8:
3544 		vie->disp_bytes = 1;
3545 		break;
3546 	case VIE_MOD_INDIRECT_DISP32:
3547 		vie->disp_bytes = 4;
3548 		break;
3549 	}
3550 
3551 	if (vie->mod == VIE_MOD_INDIRECT &&
3552 	    (vie->base == 5 || vie->base == 13)) {
3553 		/*
3554 		 * Special case: the base register is unused when mod = 0
3555 		 * and base = %rbp or %r13.
3556 		 *
3557 		 * Documented in:
3558 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3559 		 * Table 2-5: Special Cases of REX Encodings
3560 		 */
3561 		vie->disp_bytes = 4;
3562 	} else {
3563 		vie->base_register = gpr_map[vie->base];
3564 	}
3565 
3566 	/*
3567 	 * All encodings of 'index' are valid except for %rsp (4).
3568 	 *
3569 	 * Documented in:
3570 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
3571 	 * Table 2-5: Special Cases of REX Encodings
3572 	 */
3573 	if (vie->index != 4)
3574 		vie->index_register = gpr_map[vie->index];
3575 
3576 	/* 'scale' makes sense only in the context of an index register */
3577 	if (vie->index_register < VM_REG_LAST)
3578 		vie->scale = 1 << vie->ss;
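	/* SIB.ss values 0-3 select scale factors of 1, 2, 4 and 8 */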
3579 
3580 	vie_advance(vie);
3581 
3582 	return (0);
3583 }
3584 
3585 static int
3586 decode_displacement(struct vie *vie)
3587 {
3588 	int n, i;
3589 	uint8_t x;
3590 
3591 	union {
3592 		char	buf[4];
3593 		int8_t	signed8;
3594 		int32_t	signed32;
3595 	} u;
3596 
3597 	if ((n = vie->disp_bytes) == 0)
3598 		return (0);
3599 
3600 	if (n != 1 && n != 4)
3601 		panic("decode_displacement: invalid disp_bytes %d", n);
3602 
3603 	for (i = 0; i < n; i++) {
3604 		if (vie_peek(vie, &x))
3605 			return (-1);
3606 
3607 		u.buf[i] = x;
3608 		vie_advance(vie);
3609 	}
3610 
3611 	if (n == 1)
3612 		vie->displacement = u.signed8;		/* sign-extended */
3613 	else
3614 		vie->displacement = u.signed32;		/* sign-extended */
3615 
3616 	return (0);
3617 }
3618 
3619 static int
3620 decode_immediate(struct vie *vie)
3621 {
3622 	int i, n;
3623 	uint8_t x;
3624 	union {
3625 		char	buf[4];
3626 		int8_t	signed8;
3627 		int16_t	signed16;
3628 		int32_t	signed32;
3629 	} u;
3630 
3631 	/* Figure out immediate operand size (if any) */
3632 	if (vie->op.op_flags & VIE_OP_F_IMM) {
3633 		/*
3634 		 * Section 2.2.1.5 "Immediates", Intel SDM:
3635 		 * In 64-bit mode the typical size of immediate operands
3636 		 * remains 32-bits. When the operand size is 64-bits, the
3637 		 * processor sign-extends all immediates to 64-bits prior
3638 		 * to their use.
3639 		 */
3640 		if (vie->opsize == 4 || vie->opsize == 8)
3641 			vie->imm_bytes = 4;
3642 		else
3643 			vie->imm_bytes = 2;
3644 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
3645 		vie->imm_bytes = 1;
3646 	}
3647 
3648 	if ((n = vie->imm_bytes) == 0)
3649 		return (0);
3650 
3651 	KASSERT(n == 1 || n == 2 || n == 4,
3652 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
3653 
3654 	for (i = 0; i < n; i++) {
3655 		if (vie_peek(vie, &x))
3656 			return (-1);
3657 
3658 		u.buf[i] = x;
3659 		vie_advance(vie);
3660 	}
3661 
3662 	/* sign-extend the immediate value before use */
3663 	if (n == 1)
3664 		vie->immediate = u.signed8;
3665 	else if (n == 2)
3666 		vie->immediate = u.signed16;
3667 	else
3668 		vie->immediate = u.signed32;
3669 
3670 	return (0);
3671 }
3672 
3673 static int
3674 decode_moffset(struct vie *vie)
3675 {
3676 	int i, n;
3677 	uint8_t x;
3678 	union {
3679 		char	buf[8];
3680 		uint64_t u64;
3681 	} u;
3682 
3683 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
3684 		return (0);
3685 
3686 	/*
3687 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
3688 	 * The memory offset size follows the address-size of the instruction.
3689 	 */
3690 	n = vie->addrsize;
3691 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
3692 
3693 	u.u64 = 0;
3694 	for (i = 0; i < n; i++) {
3695 		if (vie_peek(vie, &x))
3696 			return (-1);
3697 
3698 		u.buf[i] = x;
3699 		vie_advance(vie);
3700 	}
3701 	vie->displacement = u.u64;
3702 	return (0);
3703 }
3704 
3705 /*
3706  * Verify that the 'guest linear address' provided as collateral of the nested
3707  * page table fault matches our instruction decoding.
3708  */
3709 int
3710 vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
3711 {
3712 	int error;
3713 	uint64_t base, segbase, idx, gla2;
3714 	enum vm_reg_name seg;
3715 	struct seg_desc desc;
3716 
3717 	ASSERT((vie->status & VIES_INST_DECODE) != 0);
3718 
3719 	/*
3720 	 * If there was no valid GLA context with the exit, or the decoded
3721 	 * instruction acts on more than one address, there is nothing to verify.
3722 	 */
3723 	if (gla == VIE_INVALID_GLA ||
3724 	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
3725 		return (0);
3726 	}
3727 
3728 	base = 0;
3729 	if (vie->base_register != VM_REG_LAST) {
3730 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
3731 		if (error) {
3732 			printf("verify_gla: error %d getting base reg %d\n",
3733 			    error, vie->base_register);
3734 			return (-1);
3735 		}
3736 
3737 		/*
3738 		 * RIP-relative addressing starts from the following
3739 		 * instruction
3740 		 */
3741 		if (vie->base_register == VM_REG_GUEST_RIP)
3742 			base += vie->num_processed;
3743 	}
3744 
3745 	idx = 0;
3746 	if (vie->index_register != VM_REG_LAST) {
3747 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
3748 		if (error) {
3749 			printf("verify_gla: error %d getting index reg %d\n",
3750 			    error, vie->index_register);
3751 			return (-1);
3752 		}
3753 	}
3754 
3755 	/*
3756 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
3757 	 *
3758 	 * In 64-bit mode, segmentation is generally (but not
3759 	 * completely) disabled.  The exceptions are the FS and GS
3760 	 * segments.
3761 	 *
3762 	 * In legacy IA-32 mode, when the ESP or EBP register is used
3763 	 * as the base, the SS segment is the default segment.  For
3764 	 * other data references, except when relative to stack or
3765 	 * other data references, except those relative to the stack or
3766 	 * a string destination, the DS segment is the default.  These
3767 	 */
3768 	if (vie->segment_override) {
3769 		seg = vie->segment_register;
3770 	} else if (vie->base_register == VM_REG_GUEST_RSP ||
3771 	    vie->base_register == VM_REG_GUEST_RBP) {
3772 		seg = VM_REG_GUEST_SS;
3773 	} else {
3774 		seg = VM_REG_GUEST_DS;
3775 	}
3776 	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
3777 	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
3778 		segbase = 0;
3779 	} else {
3780 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
3781 		if (error) {
3782 			printf("verify_gla: error %d getting segment"
3783 			    " descriptor %d", error, vie->segment_register);
3784 			return (-1);
3785 		}
3786 		segbase = desc.base;
3787 	}
3788 
3789 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
3790 	gla2 &= size2mask[vie->addrsize];
3791 	if (gla != gla2) {
3792 		printf("verify_gla mismatch: segbase(0x%0lx)"
3793 		    "base(0x%0lx), scale(%d), index(0x%0lx), "
3794 		    "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
3795 		    segbase, base, vie->scale, idx, vie->displacement,
3796 		    gla, gla2);
3797 		return (-1);
3798 	}
3799 
3800 	return (0);
3801 }
3802 
3803 int
3804 vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
3805 {
3806 	enum vm_cpu_mode cpu_mode;
3807 
3808 	if ((vie->status & VIES_INST_FETCH) == 0) {
3809 		return (EINVAL);
3810 	}
3811 
3812 	cpu_mode = vie->paging.cpu_mode;
3813 
3814 	if (decode_prefixes(vie, cpu_mode, cs_d))
3815 		return (-1);
3816 
3817 	if (decode_opcode(vie))
3818 		return (-1);
3819 
3820 	if (decode_modrm(vie, cpu_mode))
3821 		return (-1);
3822 
3823 	if (decode_sib(vie))
3824 		return (-1);
3825 
3826 	if (decode_displacement(vie))
3827 		return (-1);
3828 
3829 	if (decode_immediate(vie))
3830 		return (-1);
3831 
3832 	if (decode_moffset(vie))
3833 		return (-1);
3834 
3835 	vie->status |= VIES_INST_DECODE;
3836 
3837 	return (0);
3838 }
3839