1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <assert.h>
54 #include <vmmapi.h>
55 #define	KASSERT(exp,msg)	assert((exp))
56 #endif	/* _KERNEL */
57 
58 #include <machine/vmm_instruction_emul.h>
59 #include <x86/psl.h>
60 #include <x86/specialreg.h>
61 
/*
 * Instruction classes stored in 'struct vie_op.op_type'.
 *
 * The values are spelled out explicitly.  VIE_OP_TYPE_NONE is 0, so a
 * zero-initialized 'struct vie_op' has no class, and VIE_OP_TYPE_LAST is
 * one greater than the last real class.
 */
enum {
	VIE_OP_TYPE_NONE	= 0,
	VIE_OP_TYPE_MOV		= 1,
	VIE_OP_TYPE_MOVSX	= 2,
	VIE_OP_TYPE_MOVZX	= 3,
	VIE_OP_TYPE_AND		= 4,
	VIE_OP_TYPE_OR		= 5,
	VIE_OP_TYPE_SUB		= 6,
	VIE_OP_TYPE_TWO_BYTE	= 7,
	VIE_OP_TYPE_PUSH	= 8,
	VIE_OP_TYPE_CMP		= 9,
	VIE_OP_TYPE_POP		= 10,
	VIE_OP_TYPE_MOVS	= 11,
	VIE_OP_TYPE_GROUP1	= 12,
	VIE_OP_TYPE_STOS	= 13,
	VIE_OP_TYPE_BITTEST	= 14,
	VIE_OP_TYPE_TWOB_GRP15	= 15,
	VIE_OP_TYPE_LAST	= 16
};
82 
/* Bit flags that may be OR-ed together in 'struct vie_op.op_flags' */
#define	VIE_OP_F_IMM		0x01	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		0x02	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	0x04	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	0x08
#define	VIE_OP_F_NO_GLA_VERIFICATION 0x10
89 
90 static const struct vie_op two_byte_opcodes[256] = {
91 	[0xAE] = {
92 		  .op_byte = 0xAE,
93 		  .op_type = VIE_OP_TYPE_TWOB_GRP15,
94 	},
95 	[0xB6] = {
96 		.op_byte = 0xB6,
97 		.op_type = VIE_OP_TYPE_MOVZX,
98 	},
99 	[0xB7] = {
100 		.op_byte = 0xB7,
101 		.op_type = VIE_OP_TYPE_MOVZX,
102 	},
103 	[0xBA] = {
104 		.op_byte = 0xBA,
105 		.op_type = VIE_OP_TYPE_BITTEST,
106 		.op_flags = VIE_OP_F_IMM8,
107 	},
108 	[0xBE] = {
109 		.op_byte = 0xBE,
110 		.op_type = VIE_OP_TYPE_MOVSX,
111 	},
112 };
113 
114 static const struct vie_op one_byte_opcodes[256] = {
115 	[0x0F] = {
116 		.op_byte = 0x0F,
117 		.op_type = VIE_OP_TYPE_TWO_BYTE
118 	},
119 	[0x0B] = {
120 		.op_byte = 0x0B,
121 		.op_type = VIE_OP_TYPE_OR,
122 	},
123 	[0x2B] = {
124 		.op_byte = 0x2B,
125 		.op_type = VIE_OP_TYPE_SUB,
126 	},
127 	[0x39] = {
128 		.op_byte = 0x39,
129 		.op_type = VIE_OP_TYPE_CMP,
130 	},
131 	[0x3B] = {
132 		.op_byte = 0x3B,
133 		.op_type = VIE_OP_TYPE_CMP,
134 	},
135 	[0x88] = {
136 		.op_byte = 0x88,
137 		.op_type = VIE_OP_TYPE_MOV,
138 	},
139 	[0x89] = {
140 		.op_byte = 0x89,
141 		.op_type = VIE_OP_TYPE_MOV,
142 	},
143 	[0x8A] = {
144 		.op_byte = 0x8A,
145 		.op_type = VIE_OP_TYPE_MOV,
146 	},
147 	[0x8B] = {
148 		.op_byte = 0x8B,
149 		.op_type = VIE_OP_TYPE_MOV,
150 	},
151 	[0xA1] = {
152 		.op_byte = 0xA1,
153 		.op_type = VIE_OP_TYPE_MOV,
154 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
155 	},
156 	[0xA3] = {
157 		.op_byte = 0xA3,
158 		.op_type = VIE_OP_TYPE_MOV,
159 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
160 	},
161 	[0xA4] = {
162 		.op_byte = 0xA4,
163 		.op_type = VIE_OP_TYPE_MOVS,
164 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
165 	},
166 	[0xA5] = {
167 		.op_byte = 0xA5,
168 		.op_type = VIE_OP_TYPE_MOVS,
169 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
170 	},
171 	[0xAA] = {
172 		.op_byte = 0xAA,
173 		.op_type = VIE_OP_TYPE_STOS,
174 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
175 	},
176 	[0xAB] = {
177 		.op_byte = 0xAB,
178 		.op_type = VIE_OP_TYPE_STOS,
179 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
180 	},
181 	[0xC6] = {
182 		/* XXX Group 11 extended opcode - not just MOV */
183 		.op_byte = 0xC6,
184 		.op_type = VIE_OP_TYPE_MOV,
185 		.op_flags = VIE_OP_F_IMM8,
186 	},
187 	[0xC7] = {
188 		.op_byte = 0xC7,
189 		.op_type = VIE_OP_TYPE_MOV,
190 		.op_flags = VIE_OP_F_IMM,
191 	},
192 	[0x23] = {
193 		.op_byte = 0x23,
194 		.op_type = VIE_OP_TYPE_AND,
195 	},
196 	[0x80] = {
197 		/* Group 1 extended opcode */
198 		.op_byte = 0x80,
199 		.op_type = VIE_OP_TYPE_GROUP1,
200 		.op_flags = VIE_OP_F_IMM8,
201 	},
202 	[0x81] = {
203 		/* Group 1 extended opcode */
204 		.op_byte = 0x81,
205 		.op_type = VIE_OP_TYPE_GROUP1,
206 		.op_flags = VIE_OP_F_IMM,
207 	},
208 	[0x83] = {
209 		/* Group 1 extended opcode */
210 		.op_byte = 0x83,
211 		.op_type = VIE_OP_TYPE_GROUP1,
212 		.op_flags = VIE_OP_F_IMM8,
213 	},
214 	[0x8F] = {
215 		/* XXX Group 1A extended opcode - not just POP */
216 		.op_byte = 0x8F,
217 		.op_type = VIE_OP_TYPE_POP,
218 	},
219 	[0xFF] = {
220 		/* XXX Group 5 extended opcode - not just PUSH */
221 		.op_byte = 0xFF,
222 		.op_type = VIE_OP_TYPE_PUSH,
223 	}
224 };
225 
/* Values of 'struct vie.mod' */
#define	VIE_MOD_INDIRECT		0x0
#define	VIE_MOD_INDIRECT_DISP8		0x1
#define	VIE_MOD_INDIRECT_DISP32		0x2
#define	VIE_MOD_DIRECT			0x3

/* Values of 'struct vie.rm' */
#define	VIE_RM_SIB			0x4
#define	VIE_RM_DISP32			0x5

/* One gibibyte (2^30) */
#define	GB				(1 << 30)
237 
238 static enum vm_reg_name gpr_map[16] = {
239 	VM_REG_GUEST_RAX,
240 	VM_REG_GUEST_RCX,
241 	VM_REG_GUEST_RDX,
242 	VM_REG_GUEST_RBX,
243 	VM_REG_GUEST_RSP,
244 	VM_REG_GUEST_RBP,
245 	VM_REG_GUEST_RSI,
246 	VM_REG_GUEST_RDI,
247 	VM_REG_GUEST_R8,
248 	VM_REG_GUEST_R9,
249 	VM_REG_GUEST_R10,
250 	VM_REG_GUEST_R11,
251 	VM_REG_GUEST_R12,
252 	VM_REG_GUEST_R13,
253 	VM_REG_GUEST_R14,
254 	VM_REG_GUEST_R15
255 };
256 
/*
 * Operand size in bytes -> mask covering that many low-order bytes.
 * Only sizes 1, 2, 4 and 8 have entries; the remaining slots are zero.
 */
static uint64_t size2mask[] = {
	[1] = 0xFFULL,
	[2] = 0xFFFFULL,
	[4] = 0xFFFFFFFFULL,
	[8] = 0xFFFFFFFFFFFFFFFFULL,
};
263 
264 static int
265 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
266 {
267 	int error;
268 
269 	error = vm_get_register(vm, vcpuid, reg, rval);
270 
271 	return (error);
272 }
273 
274 static void
275 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
276 {
277 	*lhbr = 0;
278 	*reg = gpr_map[vie->reg];
279 
280 	/*
281 	 * 64-bit mode imposes limitations on accessing legacy high byte
282 	 * registers (lhbr).
283 	 *
284 	 * The legacy high-byte registers cannot be addressed if the REX
285 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
286 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
287 	 *
288 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
289 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
290 	 * %ah, %ch, %dh and %bh respectively.
291 	 */
292 	if (!vie->rex_present) {
293 		if (vie->reg & 0x4) {
294 			*lhbr = 1;
295 			*reg = gpr_map[vie->reg & 0x3];
296 		}
297 	}
298 }
299 
300 static int
301 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
302 {
303 	uint64_t val;
304 	int error, lhbr;
305 	enum vm_reg_name reg;
306 
307 	vie_calc_bytereg(vie, &reg, &lhbr);
308 	error = vm_get_register(vm, vcpuid, reg, &val);
309 
310 	/*
311 	 * To obtain the value of a legacy high byte register shift the
312 	 * base register right by 8 bits (%ah = %rax >> 8).
313 	 */
314 	if (lhbr)
315 		*rval = val >> 8;
316 	else
317 		*rval = val;
318 	return (error);
319 }
320 
321 static int
322 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
323 {
324 	uint64_t origval, val, mask;
325 	int error, lhbr;
326 	enum vm_reg_name reg;
327 
328 	vie_calc_bytereg(vie, &reg, &lhbr);
329 	error = vm_get_register(vm, vcpuid, reg, &origval);
330 	if (error == 0) {
331 		val = byte;
332 		mask = 0xff;
333 		if (lhbr) {
334 			/*
335 			 * Shift left by 8 to store 'byte' in a legacy high
336 			 * byte register.
337 			 */
338 			val <<= 8;
339 			mask <<= 8;
340 		}
341 		val |= origval & ~mask;
342 		error = vm_set_register(vm, vcpuid, reg, val);
343 	}
344 	return (error);
345 }
346 
347 int
348 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
349 		    uint64_t val, int size)
350 {
351 	int error;
352 	uint64_t origval;
353 
354 	switch (size) {
355 	case 1:
356 	case 2:
357 		error = vie_read_register(vm, vcpuid, reg, &origval);
358 		if (error)
359 			return (error);
360 		val &= size2mask[size];
361 		val |= origval & ~size2mask[size];
362 		break;
363 	case 4:
364 		val &= 0xffffffffUL;
365 		break;
366 	case 8:
367 		break;
368 	default:
369 		return (EINVAL);
370 	}
371 
372 	error = vm_set_register(vm, vcpuid, reg, val);
373 	return (error);
374 }
375 
/*
 * The set of %rflags bits that the emulation routines in this file replace
 * when they recompute condition codes: they clear all of these in the guest
 * value and then OR in the freshly computed ones.
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) expands to the definition of getcc<sz>(), which takes two
 * uint<sz>_t operands (so wider arguments are truncated to 'sz' bits by
 * the call itself), executes a host "sub" of y from x and then captures
 * the host flags register with "pushfq; popq".
 *
 * The value returned is the complete flags register as pushed by the host,
 * not just the status bits; callers mask out the bits they want.
 *
 * The trailing 'struct __hack' is a harmless declaration whose only job is
 * to absorb the semicolon written after each GETCC(...) invocation below.
 *
 * NOTE(review): the asm pushes onto the stack without telling the compiler;
 * presumably this relies on the build not using a red zone here - confirm
 * for the userspace build.
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getcc8(), getcc16(), getcc32() and getcc64(). */
GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
396 
397 static u_long
398 getcc(int opsize, uint64_t x, uint64_t y)
399 {
400 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
401 	    ("getcc: invalid operand size %d", opsize));
402 
403 	if (opsize == 1)
404 		return (getcc8(x, y));
405 	else if (opsize == 2)
406 		return (getcc16(x, y));
407 	else if (opsize == 4)
408 		return (getcc32(x, y));
409 	else
410 		return (getcc64(x, y));
411 }
412 
413 static int
414 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
415 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
416 {
417 	int error, size;
418 	enum vm_reg_name reg;
419 	uint8_t byte;
420 	uint64_t val;
421 
422 	size = vie->opsize;
423 	error = EINVAL;
424 
425 	switch (vie->op.op_byte) {
426 	case 0x88:
427 		/*
428 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
429 		 * 88/r:	mov r/m8, r8
430 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
431 		 */
432 		size = 1;	/* override for byte operation */
433 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
434 		if (error == 0)
435 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
436 		break;
437 	case 0x89:
438 		/*
439 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
440 		 * 89/r:	mov r/m16, r16
441 		 * 89/r:	mov r/m32, r32
442 		 * REX.W + 89/r	mov r/m64, r64
443 		 */
444 		reg = gpr_map[vie->reg];
445 		error = vie_read_register(vm, vcpuid, reg, &val);
446 		if (error == 0) {
447 			val &= size2mask[size];
448 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
449 		}
450 		break;
451 	case 0x8A:
452 		/*
453 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
454 		 * 8A/r:	mov r8, r/m8
455 		 * REX + 8A/r:	mov r8, r/m8
456 		 */
457 		size = 1;	/* override for byte operation */
458 		error = memread(vm, vcpuid, gpa, &val, size, arg);
459 		if (error == 0)
460 			error = vie_write_bytereg(vm, vcpuid, vie, val);
461 		break;
462 	case 0x8B:
463 		/*
464 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
465 		 * 8B/r:	mov r16, r/m16
466 		 * 8B/r:	mov r32, r/m32
467 		 * REX.W 8B/r:	mov r64, r/m64
468 		 */
469 		error = memread(vm, vcpuid, gpa, &val, size, arg);
470 		if (error == 0) {
471 			reg = gpr_map[vie->reg];
472 			error = vie_update_register(vm, vcpuid, reg, val, size);
473 		}
474 		break;
475 	case 0xA1:
476 		/*
477 		 * MOV from seg:moffset to AX/EAX/RAX
478 		 * A1:		mov AX, moffs16
479 		 * A1:		mov EAX, moffs32
480 		 * REX.W + A1:	mov RAX, moffs64
481 		 */
482 		error = memread(vm, vcpuid, gpa, &val, size, arg);
483 		if (error == 0) {
484 			reg = VM_REG_GUEST_RAX;
485 			error = vie_update_register(vm, vcpuid, reg, val, size);
486 		}
487 		break;
488 	case 0xA3:
489 		/*
490 		 * MOV from AX/EAX/RAX to seg:moffset
491 		 * A3:		mov moffs16, AX
492 		 * A3:		mov moffs32, EAX
493 		 * REX.W + A3:	mov moffs64, RAX
494 		 */
495 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
496 		if (error == 0) {
497 			val &= size2mask[size];
498 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
499 		}
500 		break;
501 	case 0xC6:
502 		/*
503 		 * MOV from imm8 to mem (ModRM:r/m)
504 		 * C6/0		mov r/m8, imm8
505 		 * REX + C6/0	mov r/m8, imm8
506 		 */
507 		size = 1;	/* override for byte operation */
508 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
509 		break;
510 	case 0xC7:
511 		/*
512 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
513 		 * C7/0		mov r/m16, imm16
514 		 * C7/0		mov r/m32, imm32
515 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
516 		 */
517 		val = vie->immediate & size2mask[size];
518 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
519 		break;
520 	default:
521 		break;
522 	}
523 
524 	return (error);
525 }
526 
527 static int
528 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
529 	     mem_region_read_t memread, mem_region_write_t memwrite,
530 	     void *arg)
531 {
532 	int error, size;
533 	enum vm_reg_name reg;
534 	uint64_t val;
535 
536 	size = vie->opsize;
537 	error = EINVAL;
538 
539 	switch (vie->op.op_byte) {
540 	case 0xB6:
541 		/*
542 		 * MOV and zero extend byte from mem (ModRM:r/m) to
543 		 * reg (ModRM:reg).
544 		 *
545 		 * 0F B6/r		movzx r16, r/m8
546 		 * 0F B6/r		movzx r32, r/m8
547 		 * REX.W + 0F B6/r	movzx r64, r/m8
548 		 */
549 
550 		/* get the first operand */
551 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
552 		if (error)
553 			break;
554 
555 		/* get the second operand */
556 		reg = gpr_map[vie->reg];
557 
558 		/* zero-extend byte */
559 		val = (uint8_t)val;
560 
561 		/* write the result */
562 		error = vie_update_register(vm, vcpuid, reg, val, size);
563 		break;
564 	case 0xB7:
565 		/*
566 		 * MOV and zero extend word from mem (ModRM:r/m) to
567 		 * reg (ModRM:reg).
568 		 *
569 		 * 0F B7/r		movzx r32, r/m16
570 		 * REX.W + 0F B7/r	movzx r64, r/m16
571 		 */
572 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
573 		if (error)
574 			return (error);
575 
576 		reg = gpr_map[vie->reg];
577 
578 		/* zero-extend word */
579 		val = (uint16_t)val;
580 
581 		error = vie_update_register(vm, vcpuid, reg, val, size);
582 		break;
583 	case 0xBE:
584 		/*
585 		 * MOV and sign extend byte from mem (ModRM:r/m) to
586 		 * reg (ModRM:reg).
587 		 *
588 		 * 0F BE/r		movsx r16, r/m8
589 		 * 0F BE/r		movsx r32, r/m8
590 		 * REX.W + 0F BE/r	movsx r64, r/m8
591 		 */
592 
593 		/* get the first operand */
594 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
595 		if (error)
596 			break;
597 
598 		/* get the second operand */
599 		reg = gpr_map[vie->reg];
600 
601 		/* sign extend byte */
602 		val = (int8_t)val;
603 
604 		/* write the result */
605 		error = vie_update_register(vm, vcpuid, reg, val, size);
606 		break;
607 	default:
608 		break;
609 	}
610 	return (error);
611 }
612 
613 /*
614  * Helper function to calculate and validate a linear address.
615  */
616 static int
617 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
618     int opsize, int addrsize, int prot, enum vm_reg_name seg,
619     enum vm_reg_name gpr, uint64_t *gla, int *fault)
620 {
621 	struct seg_desc desc;
622 	uint64_t cr0, val, rflags;
623 	int error;
624 
625 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
626 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
627 
628 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
629 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
630 
631 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
632 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
633 	    __func__, error, seg));
634 
635 	error = vie_read_register(vm, vcpuid, gpr, &val);
636 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
637 	    error, gpr));
638 
639 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
640 	    addrsize, prot, gla)) {
641 		if (seg == VM_REG_GUEST_SS)
642 			vm_inject_ss(vm, vcpuid, 0);
643 		else
644 			vm_inject_gp(vm, vcpuid);
645 		goto guest_fault;
646 	}
647 
648 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
649 		if (seg == VM_REG_GUEST_SS)
650 			vm_inject_ss(vm, vcpuid, 0);
651 		else
652 			vm_inject_gp(vm, vcpuid);
653 		goto guest_fault;
654 	}
655 
656 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
657 		vm_inject_ac(vm, vcpuid, 0);
658 		goto guest_fault;
659 	}
660 
661 	*fault = 0;
662 	return (0);
663 
664 guest_fault:
665 	*fault = 1;
666 	return (0);
667 }
668 
669 static int
670 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
671     struct vm_guest_paging *paging, mem_region_read_t memread,
672     mem_region_write_t memwrite, void *arg)
673 {
674 #ifdef _KERNEL
675 	struct vm_copyinfo copyinfo[2];
676 #else
677 	struct iovec copyinfo[2];
678 #endif
679 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
680 	uint64_t rcx, rdi, rsi, rflags;
681 	int error, fault, opsize, seg, repeat;
682 
683 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
684 	val = 0;
685 	error = 0;
686 
687 	/*
688 	 * XXX although the MOVS instruction is only supposed to be used with
689 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
690 	 *
691 	 * Empirically the "repnz" prefix has identical behavior to "rep"
692 	 * and the zero flag does not make a difference.
693 	 */
694 	repeat = vie->repz_present | vie->repnz_present;
695 
696 	if (repeat) {
697 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
698 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
699 
700 		/*
701 		 * The count register is %rcx, %ecx or %cx depending on the
702 		 * address size of the instruction.
703 		 */
704 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
705 			error = 0;
706 			goto done;
707 		}
708 	}
709 
710 	/*
711 	 *	Source		Destination	Comments
712 	 *	--------------------------------------------
713 	 * (1)  memory		memory		n/a
714 	 * (2)  memory		mmio		emulated
715 	 * (3)  mmio		memory		emulated
716 	 * (4)  mmio		mmio		emulated
717 	 *
718 	 * At this point we don't have sufficient information to distinguish
719 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
720 	 * out because it will succeed only when operating on regular memory.
721 	 *
722 	 * XXX the emulation doesn't properly handle the case where 'gpa'
723 	 * is straddling the boundary between the normal memory and MMIO.
724 	 */
725 
726 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
727 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
728 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
729 	if (error || fault)
730 		goto done;
731 
732 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
733 	    copyinfo, nitems(copyinfo), &fault);
734 	if (error == 0) {
735 		if (fault)
736 			goto done;	/* Resume guest to handle fault */
737 
738 		/*
739 		 * case (2): read from system memory and write to mmio.
740 		 */
741 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
742 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
743 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
744 		if (error)
745 			goto done;
746 	} else {
747 		/*
748 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
749 		 * if 'srcaddr' is in the mmio space.
750 		 */
751 
752 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
753 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
754 		    &fault);
755 		if (error || fault)
756 			goto done;
757 
758 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
759 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
760 		if (error == 0) {
761 			if (fault)
762 				goto done;    /* Resume guest to handle fault */
763 
764 			/*
765 			 * case (3): read from MMIO and write to system memory.
766 			 *
767 			 * A MMIO read can have side-effects so we
768 			 * commit to it only after vm_copy_setup() is
769 			 * successful. If a page-fault needs to be
770 			 * injected into the guest then it will happen
771 			 * before the MMIO read is attempted.
772 			 */
773 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
774 			if (error)
775 				goto done;
776 
777 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
778 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
779 		} else {
780 			/*
781 			 * Case (4): read from and write to mmio.
782 			 *
783 			 * Commit to the MMIO read/write (with potential
784 			 * side-effects) only after we are sure that the
785 			 * instruction is not going to be restarted due
786 			 * to address translation faults.
787 			 */
788 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
789 			    PROT_READ, &srcgpa, &fault);
790 			if (error || fault)
791 				goto done;
792 
793 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
794 			   PROT_WRITE, &dstgpa, &fault);
795 			if (error || fault)
796 				goto done;
797 
798 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
799 			if (error)
800 				goto done;
801 
802 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
803 			if (error)
804 				goto done;
805 		}
806 	}
807 
808 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
809 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
810 
811 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
812 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
813 
814 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
815 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
816 
817 	if (rflags & PSL_D) {
818 		rsi -= opsize;
819 		rdi -= opsize;
820 	} else {
821 		rsi += opsize;
822 		rdi += opsize;
823 	}
824 
825 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
826 	    vie->addrsize);
827 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
828 
829 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
830 	    vie->addrsize);
831 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
832 
833 	if (repeat) {
834 		rcx = rcx - 1;
835 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
836 		    rcx, vie->addrsize);
837 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
838 
839 		/*
840 		 * Repeat the instruction if the count register is not zero.
841 		 */
842 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
843 			vm_restart_instruction(vm, vcpuid);
844 	}
845 done:
846 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
847 	    __func__, error));
848 	return (error);
849 }
850 
851 static int
852 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
853     struct vm_guest_paging *paging, mem_region_read_t memread,
854     mem_region_write_t memwrite, void *arg)
855 {
856 	int error, opsize, repeat;
857 	uint64_t val;
858 	uint64_t rcx, rdi, rflags;
859 
860 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
861 	repeat = vie->repz_present | vie->repnz_present;
862 
863 	if (repeat) {
864 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
865 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
866 
867 		/*
868 		 * The count register is %rcx, %ecx or %cx depending on the
869 		 * address size of the instruction.
870 		 */
871 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
872 			return (0);
873 	}
874 
875 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
876 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
877 
878 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
879 	if (error)
880 		return (error);
881 
882 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
883 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
884 
885 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
886 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
887 
888 	if (rflags & PSL_D)
889 		rdi -= opsize;
890 	else
891 		rdi += opsize;
892 
893 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
894 	    vie->addrsize);
895 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
896 
897 	if (repeat) {
898 		rcx = rcx - 1;
899 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
900 		    rcx, vie->addrsize);
901 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
902 
903 		/*
904 		 * Repeat the instruction if the count register is not zero.
905 		 */
906 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
907 			vm_restart_instruction(vm, vcpuid);
908 	}
909 
910 	return (0);
911 }
912 
913 static int
914 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
915 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
916 {
917 	int error, size;
918 	enum vm_reg_name reg;
919 	uint64_t result, rflags, rflags2, val1, val2;
920 
921 	size = vie->opsize;
922 	error = EINVAL;
923 
924 	switch (vie->op.op_byte) {
925 	case 0x23:
926 		/*
927 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
928 		 * result in reg.
929 		 *
930 		 * 23/r		and r16, r/m16
931 		 * 23/r		and r32, r/m32
932 		 * REX.W + 23/r	and r64, r/m64
933 		 */
934 
935 		/* get the first operand */
936 		reg = gpr_map[vie->reg];
937 		error = vie_read_register(vm, vcpuid, reg, &val1);
938 		if (error)
939 			break;
940 
941 		/* get the second operand */
942 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
943 		if (error)
944 			break;
945 
946 		/* perform the operation and write the result */
947 		result = val1 & val2;
948 		error = vie_update_register(vm, vcpuid, reg, result, size);
949 		break;
950 	case 0x81:
951 	case 0x83:
952 		/*
953 		 * AND mem (ModRM:r/m) with immediate and store the
954 		 * result in mem.
955 		 *
956 		 * 81 /4		and r/m16, imm16
957 		 * 81 /4		and r/m32, imm32
958 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
959 		 *
960 		 * 83 /4		and r/m16, imm8 sign-extended to 16
961 		 * 83 /4		and r/m32, imm8 sign-extended to 32
962 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
963 		 */
964 
965 		/* get the first operand */
966                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
967                 if (error)
968 			break;
969 
970                 /*
971 		 * perform the operation with the pre-fetched immediate
972 		 * operand and write the result
973 		 */
974                 result = val1 & vie->immediate;
975                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
976 		break;
977 	default:
978 		break;
979 	}
980 	if (error)
981 		return (error);
982 
983 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
984 	if (error)
985 		return (error);
986 
987 	/*
988 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
989 	 * to the result; AF is undefined.
990 	 *
991 	 * The updated status flags are obtained by subtracting 0 from 'result'.
992 	 */
993 	rflags2 = getcc(size, result, 0);
994 	rflags &= ~RFLAGS_STATUS_BITS;
995 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
996 
997 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
998 	return (error);
999 }
1000 
1001 static int
1002 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1003 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1004 {
1005 	int error, size;
1006 	enum vm_reg_name reg;
1007 	uint64_t result, rflags, rflags2, val1, val2;
1008 
1009 	size = vie->opsize;
1010 	error = EINVAL;
1011 
1012 	switch (vie->op.op_byte) {
1013 	case 0x0B:
1014 		/*
1015 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1016 		 * result in reg.
1017 		 *
1018 		 * 0b/r         or r16, r/m16
1019 		 * 0b/r         or r32, r/m32
1020 		 * REX.W + 0b/r or r64, r/m64
1021 		 */
1022 
1023 		/* get the first operand */
1024 		reg = gpr_map[vie->reg];
1025 		error = vie_read_register(vm, vcpuid, reg, &val1);
1026 		if (error)
1027 			break;
1028 
1029 		/* get the second operand */
1030 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1031 		if (error)
1032 			break;
1033 
1034 		/* perform the operation and write the result */
1035 		result = val1 | val2;
1036 		error = vie_update_register(vm, vcpuid, reg, result, size);
1037 		break;
1038 	case 0x81:
1039 	case 0x83:
1040 		/*
1041 		 * OR mem (ModRM:r/m) with immediate and store the
1042 		 * result in mem.
1043 		 *
1044 		 * 81 /1		or r/m16, imm16
1045 		 * 81 /1		or r/m32, imm32
1046 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1047 		 *
1048 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1049 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1050 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1051 		 */
1052 
1053 		/* get the first operand */
1054                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1055                 if (error)
1056 			break;
1057 
1058                 /*
1059 		 * perform the operation with the pre-fetched immediate
1060 		 * operand and write the result
1061 		 */
1062                 result = val1 | vie->immediate;
1063                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1064 		break;
1065 	default:
1066 		break;
1067 	}
1068 	if (error)
1069 		return (error);
1070 
1071 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1072 	if (error)
1073 		return (error);
1074 
1075 	/*
1076 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1077 	 * to the result; AF is undefined.
1078 	 *
1079 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1080 	 */
1081 	rflags2 = getcc(size, result, 0);
1082 	rflags &= ~RFLAGS_STATUS_BITS;
1083 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1084 
1085 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1086 	return (error);
1087 }
1088 
1089 static int
1090 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1091 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1092 {
1093 	int error, size;
1094 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1095 	enum vm_reg_name reg;
1096 
1097 	size = vie->opsize;
1098 	switch (vie->op.op_byte) {
1099 	case 0x39:
1100 	case 0x3B:
1101 		/*
1102 		 * 39/r		CMP r/m16, r16
1103 		 * 39/r		CMP r/m32, r32
1104 		 * REX.W 39/r	CMP r/m64, r64
1105 		 *
1106 		 * 3B/r		CMP r16, r/m16
1107 		 * 3B/r		CMP r32, r/m32
1108 		 * REX.W + 3B/r	CMP r64, r/m64
1109 		 *
1110 		 * Compare the first operand with the second operand and
1111 		 * set status flags in EFLAGS register. The comparison is
1112 		 * performed by subtracting the second operand from the first
1113 		 * operand and then setting the status flags.
1114 		 */
1115 
1116 		/* Get the register operand */
1117 		reg = gpr_map[vie->reg];
1118 		error = vie_read_register(vm, vcpuid, reg, &regop);
1119 		if (error)
1120 			return (error);
1121 
1122 		/* Get the memory operand */
1123 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1124 		if (error)
1125 			return (error);
1126 
1127 		if (vie->op.op_byte == 0x3B) {
1128 			op1 = regop;
1129 			op2 = memop;
1130 		} else {
1131 			op1 = memop;
1132 			op2 = regop;
1133 		}
1134 		rflags2 = getcc(size, op1, op2);
1135 		break;
1136 	case 0x80:
1137 	case 0x81:
1138 	case 0x83:
1139 		/*
1140 		 * 80 /7		cmp r/m8, imm8
1141 		 * REX + 80 /7		cmp r/m8, imm8
1142 		 *
1143 		 * 81 /7		cmp r/m16, imm16
1144 		 * 81 /7		cmp r/m32, imm32
1145 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1146 		 *
1147 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1148 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1149 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1150 		 *
1151 		 * Compare mem (ModRM:r/m) with immediate and set
1152 		 * status flags according to the results.  The
1153 		 * comparison is performed by subtracting the
1154 		 * immediate from the first operand and then setting
1155 		 * the status flags.
1156 		 *
1157 		 */
1158 		if (vie->op.op_byte == 0x80)
1159 			size = 1;
1160 
1161 		/* get the first operand */
1162                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
1163 		if (error)
1164 			return (error);
1165 
1166 		rflags2 = getcc(size, op1, vie->immediate);
1167 		break;
1168 	default:
1169 		return (EINVAL);
1170 	}
1171 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1172 	if (error)
1173 		return (error);
1174 	rflags &= ~RFLAGS_STATUS_BITS;
1175 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1176 
1177 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1178 	return (error);
1179 }
1180 
1181 static int
1182 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1183 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1184 {
1185 	int error, size;
1186 	uint64_t nval, rflags, rflags2, val1, val2;
1187 	enum vm_reg_name reg;
1188 
1189 	size = vie->opsize;
1190 	error = EINVAL;
1191 
1192 	switch (vie->op.op_byte) {
1193 	case 0x2B:
1194 		/*
1195 		 * SUB r/m from r and store the result in r
1196 		 *
1197 		 * 2B/r            SUB r16, r/m16
1198 		 * 2B/r            SUB r32, r/m32
1199 		 * REX.W + 2B/r    SUB r64, r/m64
1200 		 */
1201 
1202 		/* get the first operand */
1203 		reg = gpr_map[vie->reg];
1204 		error = vie_read_register(vm, vcpuid, reg, &val1);
1205 		if (error)
1206 			break;
1207 
1208 		/* get the second operand */
1209 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1210 		if (error)
1211 			break;
1212 
1213 		/* perform the operation and write the result */
1214 		nval = val1 - val2;
1215 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1216 		break;
1217 	default:
1218 		break;
1219 	}
1220 
1221 	if (!error) {
1222 		rflags2 = getcc(size, val1, val2);
1223 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1224 		    &rflags);
1225 		if (error)
1226 			return (error);
1227 
1228 		rflags &= ~RFLAGS_STATUS_BITS;
1229 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1230 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1231 		    rflags, 8);
1232 	}
1233 
1234 	return (error);
1235 }
1236 
1237 static int
1238 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1239     struct vm_guest_paging *paging, mem_region_read_t memread,
1240     mem_region_write_t memwrite, void *arg)
1241 {
1242 #ifdef _KERNEL
1243 	struct vm_copyinfo copyinfo[2];
1244 #else
1245 	struct iovec copyinfo[2];
1246 #endif
1247 	struct seg_desc ss_desc;
1248 	uint64_t cr0, rflags, rsp, stack_gla, val;
1249 	int error, fault, size, stackaddrsize, pushop;
1250 
1251 	val = 0;
1252 	size = vie->opsize;
1253 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1254 
1255 	/*
1256 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1257 	 */
1258 	if (paging->cpu_mode == CPU_MODE_REAL) {
1259 		stackaddrsize = 2;
1260 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1261 		/*
1262 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1263 		 * - Stack pointer size is always 64-bits.
1264 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1265 		 * - 16-bit PUSH/POP is supported by using the operand size
1266 		 *   override prefix (66H).
1267 		 */
1268 		stackaddrsize = 8;
1269 		size = vie->opsize_override ? 2 : 8;
1270 	} else {
1271 		/*
1272 		 * In protected or compatibility mode the 'B' flag in the
1273 		 * stack-segment descriptor determines the size of the
1274 		 * stack pointer.
1275 		 */
1276 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1277 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1278 		    __func__, error));
1279 		if (SEG_DESC_DEF32(ss_desc.access))
1280 			stackaddrsize = 4;
1281 		else
1282 			stackaddrsize = 2;
1283 	}
1284 
1285 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1286 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1287 
1288 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1289 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1290 
1291 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1292 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1293 	if (pushop) {
1294 		rsp -= size;
1295 	}
1296 
1297 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1298 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1299 	    &stack_gla)) {
1300 		vm_inject_ss(vm, vcpuid, 0);
1301 		return (0);
1302 	}
1303 
1304 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1305 		vm_inject_ss(vm, vcpuid, 0);
1306 		return (0);
1307 	}
1308 
1309 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1310 		vm_inject_ac(vm, vcpuid, 0);
1311 		return (0);
1312 	}
1313 
1314 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1315 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1316 	    &fault);
1317 	if (error || fault)
1318 		return (error);
1319 
1320 	if (pushop) {
1321 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1322 		if (error == 0)
1323 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1324 	} else {
1325 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1326 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1327 		rsp += size;
1328 	}
1329 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1330 
1331 	if (error == 0) {
1332 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1333 		    stackaddrsize);
1334 		KASSERT(error == 0, ("error %d updating rsp", error));
1335 	}
1336 	return (error);
1337 }
1338 
1339 static int
1340 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1341     struct vm_guest_paging *paging, mem_region_read_t memread,
1342     mem_region_write_t memwrite, void *arg)
1343 {
1344 	int error;
1345 
1346 	/*
1347 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1348 	 *
1349 	 * PUSH is part of the group 5 extended opcodes and is identified
1350 	 * by ModRM:reg = b110.
1351 	 */
1352 	if ((vie->reg & 7) != 6)
1353 		return (EINVAL);
1354 
1355 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1356 	    memwrite, arg);
1357 	return (error);
1358 }
1359 
1360 static int
1361 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1362     struct vm_guest_paging *paging, mem_region_read_t memread,
1363     mem_region_write_t memwrite, void *arg)
1364 {
1365 	int error;
1366 
1367 	/*
1368 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1369 	 *
1370 	 * POP is part of the group 1A extended opcodes and is identified
1371 	 * by ModRM:reg = b000.
1372 	 */
1373 	if ((vie->reg & 7) != 0)
1374 		return (EINVAL);
1375 
1376 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1377 	    memwrite, arg);
1378 	return (error);
1379 }
1380 
1381 static int
1382 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1383     struct vm_guest_paging *paging, mem_region_read_t memread,
1384     mem_region_write_t memwrite, void *memarg)
1385 {
1386 	int error;
1387 
1388 	switch (vie->reg & 7) {
1389 	case 0x1:	/* OR */
1390 		error = emulate_or(vm, vcpuid, gpa, vie,
1391 		    memread, memwrite, memarg);
1392 		break;
1393 	case 0x4:	/* AND */
1394 		error = emulate_and(vm, vcpuid, gpa, vie,
1395 		    memread, memwrite, memarg);
1396 		break;
1397 	case 0x7:	/* CMP */
1398 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1399 		    memread, memwrite, memarg);
1400 		break;
1401 	default:
1402 		error = EINVAL;
1403 		break;
1404 	}
1405 
1406 	return (error);
1407 }
1408 
1409 static int
1410 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1411     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1412 {
1413 	uint64_t val, rflags;
1414 	int error, bitmask, bitoff;
1415 
1416 	/*
1417 	 * 0F BA is a Group 8 extended opcode.
1418 	 *
1419 	 * Currently we only emulate the 'Bit Test' instruction which is
1420 	 * identified by a ModR/M:reg encoding of 100b.
1421 	 */
1422 	if ((vie->reg & 7) != 4)
1423 		return (EINVAL);
1424 
1425 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1426 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1427 
1428 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1429 	if (error)
1430 		return (error);
1431 
1432 	/*
1433 	 * Intel SDM, Vol 2, Table 3-2:
1434 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1435 	 */
1436 	bitmask = vie->opsize * 8 - 1;
1437 	bitoff = vie->immediate & bitmask;
1438 
1439 	/* Copy the bit into the Carry flag in %rflags */
1440 	if (val & (1UL << bitoff))
1441 		rflags |= PSL_C;
1442 	else
1443 		rflags &= ~PSL_C;
1444 
1445 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1446 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1447 
1448 	return (0);
1449 }
1450 
1451 static int
1452 emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1453     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1454 {
1455 	int error;
1456 	uint64_t buf;
1457 
1458 	switch (vie->reg & 7) {
1459 	case 0x7:	/* CLFLUSH, CLFLUSHOPT, and SFENCE */
1460 		if (vie->mod == 0x3) {
1461 			/*
1462 			 * SFENCE.  Ignore it, VM exit provides enough
1463 			 * barriers on its own.
1464 			 */
1465 			error = 0;
1466 		} else {
1467 			/*
1468 			 * CLFLUSH, CLFLUSHOPT.  Only check for access
1469 			 * rights.
1470 			 */
1471 			error = memread(vm, vcpuid, gpa, &buf, 1, memarg);
1472 		}
1473 		break;
1474 	default:
1475 		error = EINVAL;
1476 		break;
1477 	}
1478 
1479 	return (error);
1480 }
1481 
1482 int
1483 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1484     struct vm_guest_paging *paging, mem_region_read_t memread,
1485     mem_region_write_t memwrite, void *memarg)
1486 {
1487 	int error;
1488 
1489 	if (!vie->decoded)
1490 		return (EINVAL);
1491 
1492 	switch (vie->op.op_type) {
1493 	case VIE_OP_TYPE_GROUP1:
1494 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1495 		    memwrite, memarg);
1496 		break;
1497 	case VIE_OP_TYPE_POP:
1498 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1499 		    memwrite, memarg);
1500 		break;
1501 	case VIE_OP_TYPE_PUSH:
1502 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1503 		    memwrite, memarg);
1504 		break;
1505 	case VIE_OP_TYPE_CMP:
1506 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1507 				    memread, memwrite, memarg);
1508 		break;
1509 	case VIE_OP_TYPE_MOV:
1510 		error = emulate_mov(vm, vcpuid, gpa, vie,
1511 				    memread, memwrite, memarg);
1512 		break;
1513 	case VIE_OP_TYPE_MOVSX:
1514 	case VIE_OP_TYPE_MOVZX:
1515 		error = emulate_movx(vm, vcpuid, gpa, vie,
1516 				     memread, memwrite, memarg);
1517 		break;
1518 	case VIE_OP_TYPE_MOVS:
1519 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1520 		    memwrite, memarg);
1521 		break;
1522 	case VIE_OP_TYPE_STOS:
1523 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1524 		    memwrite, memarg);
1525 		break;
1526 	case VIE_OP_TYPE_AND:
1527 		error = emulate_and(vm, vcpuid, gpa, vie,
1528 				    memread, memwrite, memarg);
1529 		break;
1530 	case VIE_OP_TYPE_OR:
1531 		error = emulate_or(vm, vcpuid, gpa, vie,
1532 				    memread, memwrite, memarg);
1533 		break;
1534 	case VIE_OP_TYPE_SUB:
1535 		error = emulate_sub(vm, vcpuid, gpa, vie,
1536 				    memread, memwrite, memarg);
1537 		break;
1538 	case VIE_OP_TYPE_BITTEST:
1539 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1540 		    memread, memwrite, memarg);
1541 		break;
1542 	case VIE_OP_TYPE_TWOB_GRP15:
1543 		error = emulate_twob_group15(vm, vcpuid, gpa, vie,
1544 		    memread, memwrite, memarg);
1545 		break;
1546 	default:
1547 		error = EINVAL;
1548 		break;
1549 	}
1550 
1551 	return (error);
1552 }
1553 
1554 int
1555 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1556 {
1557 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1558 	    ("%s: invalid size %d", __func__, size));
1559 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1560 
1561 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1562 		return (0);
1563 
1564 	return ((gla & (size - 1)) ? 1 : 0);
1565 }
1566 
1567 int
1568 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1569 {
1570 	uint64_t mask;
1571 
1572 	if (cpu_mode != CPU_MODE_64BIT)
1573 		return (0);
1574 
1575 	/*
1576 	 * The value of the bit 47 in the 'gla' should be replicated in the
1577 	 * most significant 16 bits.
1578 	 */
1579 	mask = ~((1UL << 48) - 1);
1580 	if (gla & (1UL << 47))
1581 		return ((gla & mask) != mask);
1582 	else
1583 		return ((gla & mask) != 0);
1584 }
1585 
1586 uint64_t
1587 vie_size2mask(int size)
1588 {
1589 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1590 	    ("vie_size2mask: invalid size %d", size));
1591 	return (size2mask[size]);
1592 }
1593 
1594 int
1595 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1596     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1597     int prot, uint64_t *gla)
1598 {
1599 	uint64_t firstoff, low_limit, high_limit, segbase;
1600 	int glasize, type;
1601 
1602 	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1603 	    ("%s: invalid segment %d", __func__, seg));
1604 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1605 	    ("%s: invalid operand size %d", __func__, length));
1606 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1607 	    ("%s: invalid prot %#x", __func__, prot));
1608 
1609 	firstoff = offset;
1610 	if (cpu_mode == CPU_MODE_64BIT) {
1611 		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1612 		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1613 		glasize = 8;
1614 	} else {
1615 		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1616 		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1617 		glasize = 4;
1618 		/*
1619 		 * If the segment selector is loaded with a NULL selector
1620 		 * then the descriptor is unusable and attempting to use
1621 		 * it results in a #GP(0).
1622 		 */
1623 		if (SEG_DESC_UNUSABLE(desc->access))
1624 			return (-1);
1625 
1626 		/*
1627 		 * The processor generates a #NP exception when a segment
1628 		 * register is loaded with a selector that points to a
1629 		 * descriptor that is not present. If this was the case then
1630 		 * it would have been checked before the VM-exit.
1631 		 */
1632 		KASSERT(SEG_DESC_PRESENT(desc->access),
1633 		    ("segment %d not present: %#x", seg, desc->access));
1634 
1635 		/*
1636 		 * The descriptor type must indicate a code/data segment.
1637 		 */
1638 		type = SEG_DESC_TYPE(desc->access);
1639 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1640 		    "descriptor type %#x", seg, type));
1641 
1642 		if (prot & PROT_READ) {
1643 			/* #GP on a read access to a exec-only code segment */
1644 			if ((type & 0xA) == 0x8)
1645 				return (-1);
1646 		}
1647 
1648 		if (prot & PROT_WRITE) {
1649 			/*
1650 			 * #GP on a write access to a code segment or a
1651 			 * read-only data segment.
1652 			 */
1653 			if (type & 0x8)			/* code segment */
1654 				return (-1);
1655 
1656 			if ((type & 0xA) == 0)		/* read-only data seg */
1657 				return (-1);
1658 		}
1659 
1660 		/*
1661 		 * 'desc->limit' is fully expanded taking granularity into
1662 		 * account.
1663 		 */
1664 		if ((type & 0xC) == 0x4) {
1665 			/* expand-down data segment */
1666 			low_limit = desc->limit + 1;
1667 			high_limit = SEG_DESC_DEF32(desc->access) ?
1668 			    0xffffffff : 0xffff;
1669 		} else {
1670 			/* code segment or expand-up data segment */
1671 			low_limit = 0;
1672 			high_limit = desc->limit;
1673 		}
1674 
1675 		while (length > 0) {
1676 			offset &= vie_size2mask(addrsize);
1677 			if (offset < low_limit || offset > high_limit)
1678 				return (-1);
1679 			offset++;
1680 			length--;
1681 		}
1682 	}
1683 
1684 	/*
1685 	 * In 64-bit mode all segments except %fs and %gs have a segment
1686 	 * base address of 0.
1687 	 */
1688 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1689 	    seg != VM_REG_GUEST_GS) {
1690 		segbase = 0;
1691 	} else {
1692 		segbase = desc->base;
1693 	}
1694 
1695 	/*
1696 	 * Truncate 'firstoff' to the effective address size before adding
1697 	 * it to the segment base.
1698 	 */
1699 	firstoff &= vie_size2mask(addrsize);
1700 	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1701 	return (0);
1702 }
1703 
1704 #ifdef _KERNEL
1705 void
1706 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1707 {
1708 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1709 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1710 
1711 	bzero(vie, sizeof(struct vie));
1712 
1713 	vie->base_register = VM_REG_LAST;
1714 	vie->index_register = VM_REG_LAST;
1715 	vie->segment_register = VM_REG_LAST;
1716 
1717 	if (inst_length) {
1718 		bcopy(inst_bytes, vie->inst, inst_length);
1719 		vie->num_valid = inst_length;
1720 	}
1721 }
1722 
1723 static int
1724 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1725 {
1726 	int error_code = 0;
1727 
1728 	if (pte & PG_V)
1729 		error_code |= PGEX_P;
1730 	if (prot & VM_PROT_WRITE)
1731 		error_code |= PGEX_W;
1732 	if (usermode)
1733 		error_code |= PGEX_U;
1734 	if (rsvd)
1735 		error_code |= PGEX_RSV;
1736 	if (prot & VM_PROT_EXECUTE)
1737 		error_code |= PGEX_I;
1738 
1739 	return (error_code);
1740 }
1741 
/*
 * Release the hold recorded in '*cookie', if there is one, and clear the
 * cookie so that a repeated call is a no-op.
 */
static void
ptp_release(void **cookie)
{

	if (*cookie == NULL)
		return;

	vm_gpa_release(*cookie);
	*cookie = NULL;
}
1750 
1751 static void *
1752 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1753 {
1754 	void *ptr;
1755 
1756 	ptp_release(cookie);
1757 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1758 	return (ptr);
1759 }
1760 
/*
 * Walk the guest page tables to translate the guest linear address 'gla'
 * into a guest physical address, stored in '*gpa'.
 *
 * 'prot' describes the access being translated; VM_PROT_WRITE enables the
 * write-permission check and the dirty-bit update.  When 'check_only' is
 * true the walk neither injects exceptions into the guest nor modifies the
 * accessed/dirty bits of the page table entries.
 *
 * Return values:
 *   0 with *guest_fault == 0	translation succeeded, '*gpa' is valid
 *   0 with *guest_fault == 1	the translation faulted (the exception was
 *				injected unless 'check_only' is set)
 *   EFAULT			a page table page could not be held
 */
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
	/*
	 * The walk restarts from the root whenever an atomic update of a
	 * page table entry fails, i.e. the entry no longer holds the value
	 * that was read. Every pass after the first yields first.
	 */
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	/* Without paging the linear address is the physical address. */
	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	/* 32-bit paging: two levels of 1024 four-byte entries each. */
	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			/*
			 * Fault if the entry is not valid or does not grant
			 * the user/write permission the access needs.
			 */
			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	/*
	 * PAE paging first indexes a four-entry table with bits 31:30 of
	 * 'gla' and then shares the generic loop below with two remaining
	 * levels. Any other paging mode reaching this point walks four levels.
	 */
	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		/* Only the valid bit is checked at this level. */
		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	/* Each level here holds 512 eight-byte entries (9 index bits). */
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		/*
		 * Fault if the entry is not valid or does not grant the
		 * user/write permission the access needs.
		 */
		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		/*
		 * A large-page entry ends the walk early. A page size above
		 * 1GB is reported as a fault with the 'rsvd' argument of
		 * pf_error_code() set.
		 */
		if (nlevels > 0 && (pte & PG_PS) != 0) {
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
	/* Common exit: drop any page table page still held. */
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
	/* A page table page could not be held. */
error:
	retval = EFAULT;
	goto done;
	/* The translation faulted; 'retval' stays 0. */
fault:
	*guest_fault = 1;
	goto done;
}
1957 
/*
 * Translate 'gla' to a guest physical address by calling _vm_gla2gpa() with
 * 'check_only' set to false.  Arguments and return value are those of
 * _vm_gla2gpa().
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	int rc;

	rc = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    false);
	return (rc);
}
1966 
/*
 * Translate 'gla' to a guest physical address by calling _vm_gla2gpa() with
 * 'check_only' set to true.  Arguments and return value are those of
 * _vm_gla2gpa().
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	int rc;

	rc = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    true);
	return (rc);
}
1975 
1976 int
1977 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1978     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
1979 {
1980 	struct vm_copyinfo copyinfo[2];
1981 	int error, prot;
1982 
1983 	if (inst_length > VIE_INST_SIZE)
1984 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1985 
1986 	prot = PROT_READ | PROT_EXEC;
1987 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1988 	    copyinfo, nitems(copyinfo), faultptr);
1989 	if (error || *faultptr)
1990 		return (error);
1991 
1992 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1993 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1994 	vie->num_valid = inst_length;
1995 	return (0);
1996 }
1997 
1998 static int
1999 vie_peek(struct vie *vie, uint8_t *x)
2000 {
2001 
2002 	if (vie->num_processed < vie->num_valid) {
2003 		*x = vie->inst[vie->num_processed];
2004 		return (0);
2005 	} else
2006 		return (-1);
2007 }
2008 
2009 static void
2010 vie_advance(struct vie *vie)
2011 {
2012 
2013 	vie->num_processed++;
2014 }
2015 
2016 static bool
2017 segment_override(uint8_t x, int *seg)
2018 {
2019 
2020 	switch (x) {
2021 	case 0x2E:
2022 		*seg = VM_REG_GUEST_CS;
2023 		break;
2024 	case 0x36:
2025 		*seg = VM_REG_GUEST_SS;
2026 		break;
2027 	case 0x3E:
2028 		*seg = VM_REG_GUEST_DS;
2029 		break;
2030 	case 0x26:
2031 		*seg = VM_REG_GUEST_ES;
2032 		break;
2033 	case 0x64:
2034 		*seg = VM_REG_GUEST_FS;
2035 		break;
2036 	case 0x65:
2037 		*seg = VM_REG_GUEST_GS;
2038 		break;
2039 	default:
2040 		return (false);
2041 	}
2042 	return (true);
2043 }
2044 
2045 static int
2046 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2047 {
2048 	uint8_t x;
2049 
2050 	while (1) {
2051 		if (vie_peek(vie, &x))
2052 			return (-1);
2053 
2054 		if (x == 0x66)
2055 			vie->opsize_override = 1;
2056 		else if (x == 0x67)
2057 			vie->addrsize_override = 1;
2058 		else if (x == 0xF3)
2059 			vie->repz_present = 1;
2060 		else if (x == 0xF2)
2061 			vie->repnz_present = 1;
2062 		else if (segment_override(x, &vie->segment_register))
2063 			vie->segment_override = 1;
2064 		else
2065 			break;
2066 
2067 		vie_advance(vie);
2068 	}
2069 
2070 	/*
2071 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2072 	 * - Only one REX prefix is allowed per instruction.
2073 	 * - The REX prefix must immediately precede the opcode byte or the
2074 	 *   escape opcode byte.
2075 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2076 	 *   the mandatory prefix must come before the REX prefix.
2077 	 */
2078 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2079 		vie->rex_present = 1;
2080 		vie->rex_w = x & 0x8 ? 1 : 0;
2081 		vie->rex_r = x & 0x4 ? 1 : 0;
2082 		vie->rex_x = x & 0x2 ? 1 : 0;
2083 		vie->rex_b = x & 0x1 ? 1 : 0;
2084 		vie_advance(vie);
2085 	}
2086 
2087 	/*
2088 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2089 	 */
2090 	if (cpu_mode == CPU_MODE_64BIT) {
2091 		/*
2092 		 * Default address size is 64-bits and default operand size
2093 		 * is 32-bits.
2094 		 */
2095 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2096 		if (vie->rex_w)
2097 			vie->opsize = 8;
2098 		else if (vie->opsize_override)
2099 			vie->opsize = 2;
2100 		else
2101 			vie->opsize = 4;
2102 	} else if (cs_d) {
2103 		/* Default address and operand sizes are 32-bits */
2104 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2105 		vie->opsize = vie->opsize_override ? 2 : 4;
2106 	} else {
2107 		/* Default address and operand sizes are 16-bits */
2108 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2109 		vie->opsize = vie->opsize_override ? 4 : 2;
2110 	}
2111 	return (0);
2112 }
2113 
2114 static int
2115 decode_two_byte_opcode(struct vie *vie)
2116 {
2117 	uint8_t x;
2118 
2119 	if (vie_peek(vie, &x))
2120 		return (-1);
2121 
2122 	vie->op = two_byte_opcodes[x];
2123 
2124 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2125 		return (-1);
2126 
2127 	vie_advance(vie);
2128 	return (0);
2129 }
2130 
2131 static int
2132 decode_opcode(struct vie *vie)
2133 {
2134 	uint8_t x;
2135 
2136 	if (vie_peek(vie, &x))
2137 		return (-1);
2138 
2139 	vie->op = one_byte_opcodes[x];
2140 
2141 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2142 		return (-1);
2143 
2144 	vie_advance(vie);
2145 
2146 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2147 		return (decode_two_byte_opcode(vie));
2148 
2149 	return (0);
2150 }
2151 
2152 static int
2153 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2154 {
2155 	uint8_t x;
2156 
2157 	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2158 		return (0);
2159 
2160 	if (cpu_mode == CPU_MODE_REAL)
2161 		return (-1);
2162 
2163 	if (vie_peek(vie, &x))
2164 		return (-1);
2165 
2166 	vie->mod = (x >> 6) & 0x3;
2167 	vie->rm =  (x >> 0) & 0x7;
2168 	vie->reg = (x >> 3) & 0x7;
2169 
2170 	/*
2171 	 * A direct addressing mode makes no sense in the context of an EPT
2172 	 * fault. There has to be a memory access involved to cause the
2173 	 * EPT fault.
2174 	 */
2175 	if (vie->mod == VIE_MOD_DIRECT)
2176 		return (-1);
2177 
2178 	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2179 	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2180 		/*
2181 		 * Table 2-5: Special Cases of REX Encodings
2182 		 *
2183 		 * mod=0, r/m=5 is used in the compatibility mode to
2184 		 * indicate a disp32 without a base register.
2185 		 *
2186 		 * mod!=3, r/m=4 is used in the compatibility mode to
2187 		 * indicate that the SIB byte is present.
2188 		 *
2189 		 * The 'b' bit in the REX prefix is don't care in
2190 		 * this case.
2191 		 */
2192 	} else {
2193 		vie->rm |= (vie->rex_b << 3);
2194 	}
2195 
2196 	vie->reg |= (vie->rex_r << 3);
2197 
2198 	/* SIB */
2199 	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2200 		goto done;
2201 
2202 	vie->base_register = gpr_map[vie->rm];
2203 
2204 	switch (vie->mod) {
2205 	case VIE_MOD_INDIRECT_DISP8:
2206 		vie->disp_bytes = 1;
2207 		break;
2208 	case VIE_MOD_INDIRECT_DISP32:
2209 		vie->disp_bytes = 4;
2210 		break;
2211 	case VIE_MOD_INDIRECT:
2212 		if (vie->rm == VIE_RM_DISP32) {
2213 			vie->disp_bytes = 4;
2214 			/*
2215 			 * Table 2-7. RIP-Relative Addressing
2216 			 *
2217 			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2218 			 * whereas in compatibility mode it just implies disp32.
2219 			 */
2220 
2221 			if (cpu_mode == CPU_MODE_64BIT)
2222 				vie->base_register = VM_REG_GUEST_RIP;
2223 			else
2224 				vie->base_register = VM_REG_LAST;
2225 		}
2226 		break;
2227 	}
2228 
2229 done:
2230 	vie_advance(vie);
2231 
2232 	return (0);
2233 }
2234 
2235 static int
2236 decode_sib(struct vie *vie)
2237 {
2238 	uint8_t x;
2239 
2240 	/* Proceed only if SIB byte is present */
2241 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2242 		return (0);
2243 
2244 	if (vie_peek(vie, &x))
2245 		return (-1);
2246 
2247 	/* De-construct the SIB byte */
2248 	vie->ss = (x >> 6) & 0x3;
2249 	vie->index = (x >> 3) & 0x7;
2250 	vie->base = (x >> 0) & 0x7;
2251 
2252 	/* Apply the REX prefix modifiers */
2253 	vie->index |= vie->rex_x << 3;
2254 	vie->base |= vie->rex_b << 3;
2255 
2256 	switch (vie->mod) {
2257 	case VIE_MOD_INDIRECT_DISP8:
2258 		vie->disp_bytes = 1;
2259 		break;
2260 	case VIE_MOD_INDIRECT_DISP32:
2261 		vie->disp_bytes = 4;
2262 		break;
2263 	}
2264 
2265 	if (vie->mod == VIE_MOD_INDIRECT &&
2266 	    (vie->base == 5 || vie->base == 13)) {
2267 		/*
2268 		 * Special case when base register is unused if mod = 0
2269 		 * and base = %rbp or %r13.
2270 		 *
2271 		 * Documented in:
2272 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2273 		 * Table 2-5: Special Cases of REX Encodings
2274 		 */
2275 		vie->disp_bytes = 4;
2276 	} else {
2277 		vie->base_register = gpr_map[vie->base];
2278 	}
2279 
2280 	/*
2281 	 * All encodings of 'index' are valid except for %rsp (4).
2282 	 *
2283 	 * Documented in:
2284 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2285 	 * Table 2-5: Special Cases of REX Encodings
2286 	 */
2287 	if (vie->index != 4)
2288 		vie->index_register = gpr_map[vie->index];
2289 
2290 	/* 'scale' makes sense only in the context of an index register */
2291 	if (vie->index_register < VM_REG_LAST)
2292 		vie->scale = 1 << vie->ss;
2293 
2294 	vie_advance(vie);
2295 
2296 	return (0);
2297 }
2298 
2299 static int
2300 decode_displacement(struct vie *vie)
2301 {
2302 	int n, i;
2303 	uint8_t x;
2304 
2305 	union {
2306 		char	buf[4];
2307 		int8_t	signed8;
2308 		int32_t	signed32;
2309 	} u;
2310 
2311 	if ((n = vie->disp_bytes) == 0)
2312 		return (0);
2313 
2314 	if (n != 1 && n != 4)
2315 		panic("decode_displacement: invalid disp_bytes %d", n);
2316 
2317 	for (i = 0; i < n; i++) {
2318 		if (vie_peek(vie, &x))
2319 			return (-1);
2320 
2321 		u.buf[i] = x;
2322 		vie_advance(vie);
2323 	}
2324 
2325 	if (n == 1)
2326 		vie->displacement = u.signed8;		/* sign-extended */
2327 	else
2328 		vie->displacement = u.signed32;		/* sign-extended */
2329 
2330 	return (0);
2331 }
2332 
2333 static int
2334 decode_immediate(struct vie *vie)
2335 {
2336 	int i, n;
2337 	uint8_t x;
2338 	union {
2339 		char	buf[4];
2340 		int8_t	signed8;
2341 		int16_t	signed16;
2342 		int32_t	signed32;
2343 	} u;
2344 
2345 	/* Figure out immediate operand size (if any) */
2346 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2347 		/*
2348 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2349 		 * In 64-bit mode the typical size of immediate operands
2350 		 * remains 32-bits. When the operand size if 64-bits, the
2351 		 * processor sign-extends all immediates to 64-bits prior
2352 		 * to their use.
2353 		 */
2354 		if (vie->opsize == 4 || vie->opsize == 8)
2355 			vie->imm_bytes = 4;
2356 		else
2357 			vie->imm_bytes = 2;
2358 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2359 		vie->imm_bytes = 1;
2360 	}
2361 
2362 	if ((n = vie->imm_bytes) == 0)
2363 		return (0);
2364 
2365 	KASSERT(n == 1 || n == 2 || n == 4,
2366 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2367 
2368 	for (i = 0; i < n; i++) {
2369 		if (vie_peek(vie, &x))
2370 			return (-1);
2371 
2372 		u.buf[i] = x;
2373 		vie_advance(vie);
2374 	}
2375 
2376 	/* sign-extend the immediate value before use */
2377 	if (n == 1)
2378 		vie->immediate = u.signed8;
2379 	else if (n == 2)
2380 		vie->immediate = u.signed16;
2381 	else
2382 		vie->immediate = u.signed32;
2383 
2384 	return (0);
2385 }
2386 
2387 static int
2388 decode_moffset(struct vie *vie)
2389 {
2390 	int i, n;
2391 	uint8_t x;
2392 	union {
2393 		char	buf[8];
2394 		uint64_t u64;
2395 	} u;
2396 
2397 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2398 		return (0);
2399 
2400 	/*
2401 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2402 	 * The memory offset size follows the address-size of the instruction.
2403 	 */
2404 	n = vie->addrsize;
2405 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2406 
2407 	u.u64 = 0;
2408 	for (i = 0; i < n; i++) {
2409 		if (vie_peek(vie, &x))
2410 			return (-1);
2411 
2412 		u.buf[i] = x;
2413 		vie_advance(vie);
2414 	}
2415 	vie->displacement = u.u64;
2416 	return (0);
2417 }
2418 
2419 /*
2420  * Verify that the 'guest linear address' provided as collateral of the nested
2421  * page table fault matches with our instruction decoding.
2422  */
2423 static int
2424 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2425     enum vm_cpu_mode cpu_mode)
2426 {
2427 	int error;
2428 	uint64_t base, segbase, idx, gla2;
2429 	enum vm_reg_name seg;
2430 	struct seg_desc desc;
2431 
2432 	/* Skip 'gla' verification */
2433 	if (gla == VIE_INVALID_GLA)
2434 		return (0);
2435 
2436 	base = 0;
2437 	if (vie->base_register != VM_REG_LAST) {
2438 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2439 		if (error) {
2440 			printf("verify_gla: error %d getting base reg %d\n",
2441 				error, vie->base_register);
2442 			return (-1);
2443 		}
2444 
2445 		/*
2446 		 * RIP-relative addressing starts from the following
2447 		 * instruction
2448 		 */
2449 		if (vie->base_register == VM_REG_GUEST_RIP)
2450 			base += vie->num_processed;
2451 	}
2452 
2453 	idx = 0;
2454 	if (vie->index_register != VM_REG_LAST) {
2455 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2456 		if (error) {
2457 			printf("verify_gla: error %d getting index reg %d\n",
2458 				error, vie->index_register);
2459 			return (-1);
2460 		}
2461 	}
2462 
2463 	/*
2464 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2465 	 *
2466 	 * In 64-bit mode, segmentation is generally (but not
2467 	 * completely) disabled.  The exceptions are the FS and GS
2468 	 * segments.
2469 	 *
2470 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2471 	 * as the base, the SS segment is the default segment.  For
2472 	 * other data references, except when relative to stack or
2473 	 * string destination the DS segment is the default.  These
2474 	 * can be overridden to allow other segments to be accessed.
2475 	 */
2476 	if (vie->segment_override)
2477 		seg = vie->segment_register;
2478 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2479 	    vie->base_register == VM_REG_GUEST_RBP)
2480 		seg = VM_REG_GUEST_SS;
2481 	else
2482 		seg = VM_REG_GUEST_DS;
2483 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2484 	    seg != VM_REG_GUEST_GS) {
2485 		segbase = 0;
2486 	} else {
2487 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2488 		if (error) {
2489 			printf("verify_gla: error %d getting segment"
2490 			       " descriptor %d", error,
2491 			       vie->segment_register);
2492 			return (-1);
2493 		}
2494 		segbase = desc.base;
2495 	}
2496 
2497 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2498 	gla2 &= size2mask[vie->addrsize];
2499 	if (gla != gla2) {
2500 		printf("verify_gla mismatch: segbase(0x%0lx)"
2501 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2502 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2503 		       segbase, base, vie->scale, idx, vie->displacement,
2504 		       gla, gla2);
2505 		return (-1);
2506 	}
2507 
2508 	return (0);
2509 }
2510 
2511 int
2512 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2513 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2514 {
2515 
2516 	if (decode_prefixes(vie, cpu_mode, cs_d))
2517 		return (-1);
2518 
2519 	if (decode_opcode(vie))
2520 		return (-1);
2521 
2522 	if (decode_modrm(vie, cpu_mode))
2523 		return (-1);
2524 
2525 	if (decode_sib(vie))
2526 		return (-1);
2527 
2528 	if (decode_displacement(vie))
2529 		return (-1);
2530 
2531 	if (decode_immediate(vie))
2532 		return (-1);
2533 
2534 	if (decode_moffset(vie))
2535 		return (-1);
2536 
2537 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2538 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2539 			return (-1);
2540 	}
2541 
2542 	vie->decoded = 1;	/* success */
2543 
2544 	return (0);
2545 }
2546 #endif	/* _KERNEL */
2547