1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 Sandvine, Inc.
5  * Copyright (c) 2012 NetApp, Inc.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #ifdef _KERNEL
36 #include <sys/param.h>
37 #include <sys/pcpu.h>
38 #include <sys/systm.h>
39 #include <sys/proc.h>
40 
41 #include <vm/vm.h>
42 #include <vm/pmap.h>
43 
44 #include <machine/vmparam.h>
45 #include <machine/vmm.h>
46 #else	/* !_KERNEL */
47 #include <sys/types.h>
48 #include <sys/errno.h>
49 #include <sys/_iovec.h>
50 
51 #include <machine/vmm.h>
52 
53 #include <assert.h>
54 #include <vmmapi.h>
55 #define	KASSERT(exp,msg)	assert((exp))
56 #endif	/* _KERNEL */
57 
58 #include <machine/vmm_instruction_emul.h>
59 #include <x86/psl.h>
60 #include <x86/specialreg.h>
61 
/*
 * Values for struct vie_op.op_type.
 *
 * Each entry in the opcode tables below carries one of these to select the
 * emulation routine for the decoded instruction.  VIE_OP_TYPE_NONE is zero,
 * so opcode-table slots without an explicit initializer have this type.
 */
enum {
	VIE_OP_TYPE_NONE = 0,		/* no table entry for this opcode */

	/* data movement */
	VIE_OP_TYPE_MOV,
	VIE_OP_TYPE_MOVSX,
	VIE_OP_TYPE_MOVZX,

	/* logic and arithmetic */
	VIE_OP_TYPE_AND,
	VIE_OP_TYPE_OR,
	VIE_OP_TYPE_SUB,

	/* 0x0F escape: the real opcode is in two_byte_opcodes[] */
	VIE_OP_TYPE_TWO_BYTE,

	VIE_OP_TYPE_PUSH,
	VIE_OP_TYPE_CMP,
	VIE_OP_TYPE_POP,
	VIE_OP_TYPE_MOVS,

	/* opcodes 0x80, 0x81 and 0x83 */
	VIE_OP_TYPE_GROUP1,

	VIE_OP_TYPE_STOS,
	VIE_OP_TYPE_BITTEST,

	/* number of op types; must remain the final enumerator */
	VIE_OP_TYPE_LAST
};
81 
/*
 * Bit flags for struct vie_op.op_flags.  They may be OR-ed together, as
 * the opcode tables below do.
 */
#define	VIE_OP_F_IMM		0x01	/* 16/32-bit immediate operand */
#define	VIE_OP_F_IMM8		0x02	/* 8-bit immediate operand */
#define	VIE_OP_F_MOFFSET	0x04	/* 16/32/64-bit immediate moffset */
#define	VIE_OP_F_NO_MODRM	0x08	/* presumably: no ModRM byte follows */
#define	VIE_OP_F_NO_GLA_VERIFICATION 0x10 /* presumably: skip the GLA check */
88 
89 static const struct vie_op two_byte_opcodes[256] = {
90 	[0xB6] = {
91 		.op_byte = 0xB6,
92 		.op_type = VIE_OP_TYPE_MOVZX,
93 	},
94 	[0xB7] = {
95 		.op_byte = 0xB7,
96 		.op_type = VIE_OP_TYPE_MOVZX,
97 	},
98 	[0xBA] = {
99 		.op_byte = 0xBA,
100 		.op_type = VIE_OP_TYPE_BITTEST,
101 		.op_flags = VIE_OP_F_IMM8,
102 	},
103 	[0xBE] = {
104 		.op_byte = 0xBE,
105 		.op_type = VIE_OP_TYPE_MOVSX,
106 	},
107 };
108 
109 static const struct vie_op one_byte_opcodes[256] = {
110 	[0x0F] = {
111 		.op_byte = 0x0F,
112 		.op_type = VIE_OP_TYPE_TWO_BYTE
113 	},
114 	[0x0B] = {
115 		.op_byte = 0x0B,
116 		.op_type = VIE_OP_TYPE_OR,
117 	},
118 	[0x2B] = {
119 		.op_byte = 0x2B,
120 		.op_type = VIE_OP_TYPE_SUB,
121 	},
122 	[0x39] = {
123 		.op_byte = 0x39,
124 		.op_type = VIE_OP_TYPE_CMP,
125 	},
126 	[0x3B] = {
127 		.op_byte = 0x3B,
128 		.op_type = VIE_OP_TYPE_CMP,
129 	},
130 	[0x88] = {
131 		.op_byte = 0x88,
132 		.op_type = VIE_OP_TYPE_MOV,
133 	},
134 	[0x89] = {
135 		.op_byte = 0x89,
136 		.op_type = VIE_OP_TYPE_MOV,
137 	},
138 	[0x8A] = {
139 		.op_byte = 0x8A,
140 		.op_type = VIE_OP_TYPE_MOV,
141 	},
142 	[0x8B] = {
143 		.op_byte = 0x8B,
144 		.op_type = VIE_OP_TYPE_MOV,
145 	},
146 	[0xA1] = {
147 		.op_byte = 0xA1,
148 		.op_type = VIE_OP_TYPE_MOV,
149 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
150 	},
151 	[0xA3] = {
152 		.op_byte = 0xA3,
153 		.op_type = VIE_OP_TYPE_MOV,
154 		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
155 	},
156 	[0xA4] = {
157 		.op_byte = 0xA4,
158 		.op_type = VIE_OP_TYPE_MOVS,
159 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
160 	},
161 	[0xA5] = {
162 		.op_byte = 0xA5,
163 		.op_type = VIE_OP_TYPE_MOVS,
164 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
165 	},
166 	[0xAA] = {
167 		.op_byte = 0xAA,
168 		.op_type = VIE_OP_TYPE_STOS,
169 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
170 	},
171 	[0xAB] = {
172 		.op_byte = 0xAB,
173 		.op_type = VIE_OP_TYPE_STOS,
174 		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
175 	},
176 	[0xC6] = {
177 		/* XXX Group 11 extended opcode - not just MOV */
178 		.op_byte = 0xC6,
179 		.op_type = VIE_OP_TYPE_MOV,
180 		.op_flags = VIE_OP_F_IMM8,
181 	},
182 	[0xC7] = {
183 		.op_byte = 0xC7,
184 		.op_type = VIE_OP_TYPE_MOV,
185 		.op_flags = VIE_OP_F_IMM,
186 	},
187 	[0x23] = {
188 		.op_byte = 0x23,
189 		.op_type = VIE_OP_TYPE_AND,
190 	},
191 	[0x80] = {
192 		/* Group 1 extended opcode */
193 		.op_byte = 0x80,
194 		.op_type = VIE_OP_TYPE_GROUP1,
195 		.op_flags = VIE_OP_F_IMM8,
196 	},
197 	[0x81] = {
198 		/* Group 1 extended opcode */
199 		.op_byte = 0x81,
200 		.op_type = VIE_OP_TYPE_GROUP1,
201 		.op_flags = VIE_OP_F_IMM,
202 	},
203 	[0x83] = {
204 		/* Group 1 extended opcode */
205 		.op_byte = 0x83,
206 		.op_type = VIE_OP_TYPE_GROUP1,
207 		.op_flags = VIE_OP_F_IMM8,
208 	},
209 	[0x8F] = {
210 		/* XXX Group 1A extended opcode - not just POP */
211 		.op_byte = 0x8F,
212 		.op_type = VIE_OP_TYPE_POP,
213 	},
214 	[0xFF] = {
215 		/* XXX Group 5 extended opcode - not just PUSH */
216 		.op_byte = 0xFF,
217 		.op_type = VIE_OP_TYPE_PUSH,
218 	}
219 };
220 
/* Values of struct vie.mod */
#define	VIE_MOD_INDIRECT		0x0
#define	VIE_MOD_INDIRECT_DISP8		0x1
#define	VIE_MOD_INDIRECT_DISP32		0x2
#define	VIE_MOD_DIRECT			0x3

/* Special values of struct vie.rm */
#define	VIE_RM_SIB			0x4
#define	VIE_RM_DISP32			0x5

/* One gibibyte, in bytes (2^30) */
#define	GB				(1 << 30)
232 
233 static enum vm_reg_name gpr_map[16] = {
234 	VM_REG_GUEST_RAX,
235 	VM_REG_GUEST_RCX,
236 	VM_REG_GUEST_RDX,
237 	VM_REG_GUEST_RBX,
238 	VM_REG_GUEST_RSP,
239 	VM_REG_GUEST_RBP,
240 	VM_REG_GUEST_RSI,
241 	VM_REG_GUEST_RDI,
242 	VM_REG_GUEST_R8,
243 	VM_REG_GUEST_R9,
244 	VM_REG_GUEST_R10,
245 	VM_REG_GUEST_R11,
246 	VM_REG_GUEST_R12,
247 	VM_REG_GUEST_R13,
248 	VM_REG_GUEST_R14,
249 	VM_REG_GUEST_R15
250 };
251 
/*
 * Maps an operand size in bytes (1, 2, 4 or 8) to a mask covering exactly
 * that many low-order bytes.  Indices that are not a valid operand size
 * hold 0.
 *
 * The table is only ever read, so it is declared const.
 */
static const uint64_t size2mask[] = {
	[1] = 0xff,
	[2] = 0xffff,
	[4] = 0xffffffff,
	[8] = 0xffffffffffffffff,
};
258 
259 static int
260 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
261 {
262 	int error;
263 
264 	error = vm_get_register(vm, vcpuid, reg, rval);
265 
266 	return (error);
267 }
268 
269 static void
270 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
271 {
272 	*lhbr = 0;
273 	*reg = gpr_map[vie->reg];
274 
275 	/*
276 	 * 64-bit mode imposes limitations on accessing legacy high byte
277 	 * registers (lhbr).
278 	 *
279 	 * The legacy high-byte registers cannot be addressed if the REX
280 	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
281 	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
282 	 *
283 	 * If the REX prefix is not present then the values 4, 5, 6 and 7
284 	 * of the 'ModRM:reg' field address the legacy high-byte registers,
285 	 * %ah, %ch, %dh and %bh respectively.
286 	 */
287 	if (!vie->rex_present) {
288 		if (vie->reg & 0x4) {
289 			*lhbr = 1;
290 			*reg = gpr_map[vie->reg & 0x3];
291 		}
292 	}
293 }
294 
295 static int
296 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
297 {
298 	uint64_t val;
299 	int error, lhbr;
300 	enum vm_reg_name reg;
301 
302 	vie_calc_bytereg(vie, &reg, &lhbr);
303 	error = vm_get_register(vm, vcpuid, reg, &val);
304 
305 	/*
306 	 * To obtain the value of a legacy high byte register shift the
307 	 * base register right by 8 bits (%ah = %rax >> 8).
308 	 */
309 	if (lhbr)
310 		*rval = val >> 8;
311 	else
312 		*rval = val;
313 	return (error);
314 }
315 
316 static int
317 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
318 {
319 	uint64_t origval, val, mask;
320 	int error, lhbr;
321 	enum vm_reg_name reg;
322 
323 	vie_calc_bytereg(vie, &reg, &lhbr);
324 	error = vm_get_register(vm, vcpuid, reg, &origval);
325 	if (error == 0) {
326 		val = byte;
327 		mask = 0xff;
328 		if (lhbr) {
329 			/*
330 			 * Shift left by 8 to store 'byte' in a legacy high
331 			 * byte register.
332 			 */
333 			val <<= 8;
334 			mask <<= 8;
335 		}
336 		val |= origval & ~mask;
337 		error = vm_set_register(vm, vcpuid, reg, val);
338 	}
339 	return (error);
340 }
341 
342 int
343 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
344 		    uint64_t val, int size)
345 {
346 	int error;
347 	uint64_t origval;
348 
349 	switch (size) {
350 	case 1:
351 	case 2:
352 		error = vie_read_register(vm, vcpuid, reg, &origval);
353 		if (error)
354 			return (error);
355 		val &= size2mask[size];
356 		val |= origval & ~size2mask[size];
357 		break;
358 	case 4:
359 		val &= 0xffffffffUL;
360 		break;
361 	case 8:
362 		break;
363 	default:
364 		return (EINVAL);
365 	}
366 
367 	error = vm_set_register(vm, vcpuid, reg, val);
368 	return (error);
369 }
370 
/*
 * The status flags in %rflags that the emulated arithmetic and logic
 * instructions in this file update.
 */
#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)

/*
 * Return the status flags that would result from doing (x - y).
 *
 * GETCC(sz) defines getcc<sz>(), which takes two 'sz'-bit unsigned
 * operands, executes a host 'sub' of that width on them and then copies
 * the host flags register into the return value via pushfq/popq.  The
 * whole flags register is returned; callers mask out the bits they want.
 *
 * The trailing 'struct __hack' lets each GETCC() invocation be written
 * with a terminating semicolon.
 */
#define	GETCC(sz)							\
static u_long								\
getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
{									\
	u_long rflags;							\
									\
	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
	    "=r" (rflags), "+r" (x) : "m" (y));				\
	return (rflags);						\
} struct __hack

/* Instantiate getcc8(), getcc16(), getcc32() and getcc64(). */
GETCC(8);
GETCC(16);
GETCC(32);
GETCC(64);
391 
392 static u_long
393 getcc(int opsize, uint64_t x, uint64_t y)
394 {
395 	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
396 	    ("getcc: invalid operand size %d", opsize));
397 
398 	if (opsize == 1)
399 		return (getcc8(x, y));
400 	else if (opsize == 2)
401 		return (getcc16(x, y));
402 	else if (opsize == 4)
403 		return (getcc32(x, y));
404 	else
405 		return (getcc64(x, y));
406 }
407 
408 static int
409 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
410 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
411 {
412 	int error, size;
413 	enum vm_reg_name reg;
414 	uint8_t byte;
415 	uint64_t val;
416 
417 	size = vie->opsize;
418 	error = EINVAL;
419 
420 	switch (vie->op.op_byte) {
421 	case 0x88:
422 		/*
423 		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
424 		 * 88/r:	mov r/m8, r8
425 		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
426 		 */
427 		size = 1;	/* override for byte operation */
428 		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
429 		if (error == 0)
430 			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
431 		break;
432 	case 0x89:
433 		/*
434 		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
435 		 * 89/r:	mov r/m16, r16
436 		 * 89/r:	mov r/m32, r32
437 		 * REX.W + 89/r	mov r/m64, r64
438 		 */
439 		reg = gpr_map[vie->reg];
440 		error = vie_read_register(vm, vcpuid, reg, &val);
441 		if (error == 0) {
442 			val &= size2mask[size];
443 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
444 		}
445 		break;
446 	case 0x8A:
447 		/*
448 		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
449 		 * 8A/r:	mov r8, r/m8
450 		 * REX + 8A/r:	mov r8, r/m8
451 		 */
452 		size = 1;	/* override for byte operation */
453 		error = memread(vm, vcpuid, gpa, &val, size, arg);
454 		if (error == 0)
455 			error = vie_write_bytereg(vm, vcpuid, vie, val);
456 		break;
457 	case 0x8B:
458 		/*
459 		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
460 		 * 8B/r:	mov r16, r/m16
461 		 * 8B/r:	mov r32, r/m32
462 		 * REX.W 8B/r:	mov r64, r/m64
463 		 */
464 		error = memread(vm, vcpuid, gpa, &val, size, arg);
465 		if (error == 0) {
466 			reg = gpr_map[vie->reg];
467 			error = vie_update_register(vm, vcpuid, reg, val, size);
468 		}
469 		break;
470 	case 0xA1:
471 		/*
472 		 * MOV from seg:moffset to AX/EAX/RAX
473 		 * A1:		mov AX, moffs16
474 		 * A1:		mov EAX, moffs32
475 		 * REX.W + A1:	mov RAX, moffs64
476 		 */
477 		error = memread(vm, vcpuid, gpa, &val, size, arg);
478 		if (error == 0) {
479 			reg = VM_REG_GUEST_RAX;
480 			error = vie_update_register(vm, vcpuid, reg, val, size);
481 		}
482 		break;
483 	case 0xA3:
484 		/*
485 		 * MOV from AX/EAX/RAX to seg:moffset
486 		 * A3:		mov moffs16, AX
487 		 * A3:		mov moffs32, EAX
488 		 * REX.W + A3:	mov moffs64, RAX
489 		 */
490 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
491 		if (error == 0) {
492 			val &= size2mask[size];
493 			error = memwrite(vm, vcpuid, gpa, val, size, arg);
494 		}
495 		break;
496 	case 0xC6:
497 		/*
498 		 * MOV from imm8 to mem (ModRM:r/m)
499 		 * C6/0		mov r/m8, imm8
500 		 * REX + C6/0	mov r/m8, imm8
501 		 */
502 		size = 1;	/* override for byte operation */
503 		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
504 		break;
505 	case 0xC7:
506 		/*
507 		 * MOV from imm16/imm32 to mem (ModRM:r/m)
508 		 * C7/0		mov r/m16, imm16
509 		 * C7/0		mov r/m32, imm32
510 		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
511 		 */
512 		val = vie->immediate & size2mask[size];
513 		error = memwrite(vm, vcpuid, gpa, val, size, arg);
514 		break;
515 	default:
516 		break;
517 	}
518 
519 	return (error);
520 }
521 
522 static int
523 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
524 	     mem_region_read_t memread, mem_region_write_t memwrite,
525 	     void *arg)
526 {
527 	int error, size;
528 	enum vm_reg_name reg;
529 	uint64_t val;
530 
531 	size = vie->opsize;
532 	error = EINVAL;
533 
534 	switch (vie->op.op_byte) {
535 	case 0xB6:
536 		/*
537 		 * MOV and zero extend byte from mem (ModRM:r/m) to
538 		 * reg (ModRM:reg).
539 		 *
540 		 * 0F B6/r		movzx r16, r/m8
541 		 * 0F B6/r		movzx r32, r/m8
542 		 * REX.W + 0F B6/r	movzx r64, r/m8
543 		 */
544 
545 		/* get the first operand */
546 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
547 		if (error)
548 			break;
549 
550 		/* get the second operand */
551 		reg = gpr_map[vie->reg];
552 
553 		/* zero-extend byte */
554 		val = (uint8_t)val;
555 
556 		/* write the result */
557 		error = vie_update_register(vm, vcpuid, reg, val, size);
558 		break;
559 	case 0xB7:
560 		/*
561 		 * MOV and zero extend word from mem (ModRM:r/m) to
562 		 * reg (ModRM:reg).
563 		 *
564 		 * 0F B7/r		movzx r32, r/m16
565 		 * REX.W + 0F B7/r	movzx r64, r/m16
566 		 */
567 		error = memread(vm, vcpuid, gpa, &val, 2, arg);
568 		if (error)
569 			return (error);
570 
571 		reg = gpr_map[vie->reg];
572 
573 		/* zero-extend word */
574 		val = (uint16_t)val;
575 
576 		error = vie_update_register(vm, vcpuid, reg, val, size);
577 		break;
578 	case 0xBE:
579 		/*
580 		 * MOV and sign extend byte from mem (ModRM:r/m) to
581 		 * reg (ModRM:reg).
582 		 *
583 		 * 0F BE/r		movsx r16, r/m8
584 		 * 0F BE/r		movsx r32, r/m8
585 		 * REX.W + 0F BE/r	movsx r64, r/m8
586 		 */
587 
588 		/* get the first operand */
589 		error = memread(vm, vcpuid, gpa, &val, 1, arg);
590 		if (error)
591 			break;
592 
593 		/* get the second operand */
594 		reg = gpr_map[vie->reg];
595 
596 		/* sign extend byte */
597 		val = (int8_t)val;
598 
599 		/* write the result */
600 		error = vie_update_register(vm, vcpuid, reg, val, size);
601 		break;
602 	default:
603 		break;
604 	}
605 	return (error);
606 }
607 
608 /*
609  * Helper function to calculate and validate a linear address.
610  */
611 static int
612 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
613     int opsize, int addrsize, int prot, enum vm_reg_name seg,
614     enum vm_reg_name gpr, uint64_t *gla, int *fault)
615 {
616 	struct seg_desc desc;
617 	uint64_t cr0, val, rflags;
618 	int error;
619 
620 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
621 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
622 
623 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
624 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
625 
626 	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
627 	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
628 	    __func__, error, seg));
629 
630 	error = vie_read_register(vm, vcpuid, gpr, &val);
631 	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
632 	    error, gpr));
633 
634 	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
635 	    addrsize, prot, gla)) {
636 		if (seg == VM_REG_GUEST_SS)
637 			vm_inject_ss(vm, vcpuid, 0);
638 		else
639 			vm_inject_gp(vm, vcpuid);
640 		goto guest_fault;
641 	}
642 
643 	if (vie_canonical_check(paging->cpu_mode, *gla)) {
644 		if (seg == VM_REG_GUEST_SS)
645 			vm_inject_ss(vm, vcpuid, 0);
646 		else
647 			vm_inject_gp(vm, vcpuid);
648 		goto guest_fault;
649 	}
650 
651 	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
652 		vm_inject_ac(vm, vcpuid, 0);
653 		goto guest_fault;
654 	}
655 
656 	*fault = 0;
657 	return (0);
658 
659 guest_fault:
660 	*fault = 1;
661 	return (0);
662 }
663 
664 static int
665 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
666     struct vm_guest_paging *paging, mem_region_read_t memread,
667     mem_region_write_t memwrite, void *arg)
668 {
669 #ifdef _KERNEL
670 	struct vm_copyinfo copyinfo[2];
671 #else
672 	struct iovec copyinfo[2];
673 #endif
674 	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
675 	uint64_t rcx, rdi, rsi, rflags;
676 	int error, fault, opsize, seg, repeat;
677 
678 	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
679 	val = 0;
680 	error = 0;
681 
682 	/*
683 	 * XXX although the MOVS instruction is only supposed to be used with
684 	 * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
685 	 *
686 	 * Empirically the "repnz" prefix has identical behavior to "rep"
687 	 * and the zero flag does not make a difference.
688 	 */
689 	repeat = vie->repz_present | vie->repnz_present;
690 
691 	if (repeat) {
692 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
693 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
694 
695 		/*
696 		 * The count register is %rcx, %ecx or %cx depending on the
697 		 * address size of the instruction.
698 		 */
699 		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
700 			error = 0;
701 			goto done;
702 		}
703 	}
704 
705 	/*
706 	 *	Source		Destination	Comments
707 	 *	--------------------------------------------
708 	 * (1)  memory		memory		n/a
709 	 * (2)  memory		mmio		emulated
710 	 * (3)  mmio		memory		emulated
711 	 * (4)  mmio		mmio		emulated
712 	 *
713 	 * At this point we don't have sufficient information to distinguish
714 	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
715 	 * out because it will succeed only when operating on regular memory.
716 	 *
717 	 * XXX the emulation doesn't properly handle the case where 'gpa'
718 	 * is straddling the boundary between the normal memory and MMIO.
719 	 */
720 
721 	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
722 	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
723 	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
724 	if (error || fault)
725 		goto done;
726 
727 	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
728 	    copyinfo, nitems(copyinfo), &fault);
729 	if (error == 0) {
730 		if (fault)
731 			goto done;	/* Resume guest to handle fault */
732 
733 		/*
734 		 * case (2): read from system memory and write to mmio.
735 		 */
736 		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
737 		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
738 		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
739 		if (error)
740 			goto done;
741 	} else {
742 		/*
743 		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
744 		 * if 'srcaddr' is in the mmio space.
745 		 */
746 
747 		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
748 		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
749 		    &fault);
750 		if (error || fault)
751 			goto done;
752 
753 		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
754 		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
755 		if (error == 0) {
756 			if (fault)
757 				goto done;    /* Resume guest to handle fault */
758 
759 			/*
760 			 * case (3): read from MMIO and write to system memory.
761 			 *
762 			 * A MMIO read can have side-effects so we
763 			 * commit to it only after vm_copy_setup() is
764 			 * successful. If a page-fault needs to be
765 			 * injected into the guest then it will happen
766 			 * before the MMIO read is attempted.
767 			 */
768 			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
769 			if (error)
770 				goto done;
771 
772 			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
773 			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
774 		} else {
775 			/*
776 			 * Case (4): read from and write to mmio.
777 			 *
778 			 * Commit to the MMIO read/write (with potential
779 			 * side-effects) only after we are sure that the
780 			 * instruction is not going to be restarted due
781 			 * to address translation faults.
782 			 */
783 			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
784 			    PROT_READ, &srcgpa, &fault);
785 			if (error || fault)
786 				goto done;
787 
788 			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
789 			   PROT_WRITE, &dstgpa, &fault);
790 			if (error || fault)
791 				goto done;
792 
793 			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
794 			if (error)
795 				goto done;
796 
797 			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
798 			if (error)
799 				goto done;
800 		}
801 	}
802 
803 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
804 	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
805 
806 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
807 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
808 
809 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
810 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
811 
812 	if (rflags & PSL_D) {
813 		rsi -= opsize;
814 		rdi -= opsize;
815 	} else {
816 		rsi += opsize;
817 		rdi += opsize;
818 	}
819 
820 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
821 	    vie->addrsize);
822 	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
823 
824 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
825 	    vie->addrsize);
826 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
827 
828 	if (repeat) {
829 		rcx = rcx - 1;
830 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
831 		    rcx, vie->addrsize);
832 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
833 
834 		/*
835 		 * Repeat the instruction if the count register is not zero.
836 		 */
837 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
838 			vm_restart_instruction(vm, vcpuid);
839 	}
840 done:
841 	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
842 	    __func__, error));
843 	return (error);
844 }
845 
846 static int
847 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
848     struct vm_guest_paging *paging, mem_region_read_t memread,
849     mem_region_write_t memwrite, void *arg)
850 {
851 	int error, opsize, repeat;
852 	uint64_t val;
853 	uint64_t rcx, rdi, rflags;
854 
855 	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
856 	repeat = vie->repz_present | vie->repnz_present;
857 
858 	if (repeat) {
859 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
860 		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
861 
862 		/*
863 		 * The count register is %rcx, %ecx or %cx depending on the
864 		 * address size of the instruction.
865 		 */
866 		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
867 			return (0);
868 	}
869 
870 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
871 	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
872 
873 	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
874 	if (error)
875 		return (error);
876 
877 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
878 	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
879 
880 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
881 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
882 
883 	if (rflags & PSL_D)
884 		rdi -= opsize;
885 	else
886 		rdi += opsize;
887 
888 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
889 	    vie->addrsize);
890 	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
891 
892 	if (repeat) {
893 		rcx = rcx - 1;
894 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
895 		    rcx, vie->addrsize);
896 		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
897 
898 		/*
899 		 * Repeat the instruction if the count register is not zero.
900 		 */
901 		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
902 			vm_restart_instruction(vm, vcpuid);
903 	}
904 
905 	return (0);
906 }
907 
908 static int
909 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
910 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
911 {
912 	int error, size;
913 	enum vm_reg_name reg;
914 	uint64_t result, rflags, rflags2, val1, val2;
915 
916 	size = vie->opsize;
917 	error = EINVAL;
918 
919 	switch (vie->op.op_byte) {
920 	case 0x23:
921 		/*
922 		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
923 		 * result in reg.
924 		 *
925 		 * 23/r		and r16, r/m16
926 		 * 23/r		and r32, r/m32
927 		 * REX.W + 23/r	and r64, r/m64
928 		 */
929 
930 		/* get the first operand */
931 		reg = gpr_map[vie->reg];
932 		error = vie_read_register(vm, vcpuid, reg, &val1);
933 		if (error)
934 			break;
935 
936 		/* get the second operand */
937 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
938 		if (error)
939 			break;
940 
941 		/* perform the operation and write the result */
942 		result = val1 & val2;
943 		error = vie_update_register(vm, vcpuid, reg, result, size);
944 		break;
945 	case 0x81:
946 	case 0x83:
947 		/*
948 		 * AND mem (ModRM:r/m) with immediate and store the
949 		 * result in mem.
950 		 *
951 		 * 81 /4		and r/m16, imm16
952 		 * 81 /4		and r/m32, imm32
953 		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
954 		 *
955 		 * 83 /4		and r/m16, imm8 sign-extended to 16
956 		 * 83 /4		and r/m32, imm8 sign-extended to 32
957 		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
958 		 */
959 
960 		/* get the first operand */
961                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
962                 if (error)
963 			break;
964 
965                 /*
966 		 * perform the operation with the pre-fetched immediate
967 		 * operand and write the result
968 		 */
969                 result = val1 & vie->immediate;
970                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
971 		break;
972 	default:
973 		break;
974 	}
975 	if (error)
976 		return (error);
977 
978 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
979 	if (error)
980 		return (error);
981 
982 	/*
983 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
984 	 * to the result; AF is undefined.
985 	 *
986 	 * The updated status flags are obtained by subtracting 0 from 'result'.
987 	 */
988 	rflags2 = getcc(size, result, 0);
989 	rflags &= ~RFLAGS_STATUS_BITS;
990 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
991 
992 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
993 	return (error);
994 }
995 
996 static int
997 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
998 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
999 {
1000 	int error, size;
1001 	enum vm_reg_name reg;
1002 	uint64_t result, rflags, rflags2, val1, val2;
1003 
1004 	size = vie->opsize;
1005 	error = EINVAL;
1006 
1007 	switch (vie->op.op_byte) {
1008 	case 0x0B:
1009 		/*
1010 		 * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
1011 		 * result in reg.
1012 		 *
1013 		 * 0b/r         or r16, r/m16
1014 		 * 0b/r         or r32, r/m32
1015 		 * REX.W + 0b/r or r64, r/m64
1016 		 */
1017 
1018 		/* get the first operand */
1019 		reg = gpr_map[vie->reg];
1020 		error = vie_read_register(vm, vcpuid, reg, &val1);
1021 		if (error)
1022 			break;
1023 
1024 		/* get the second operand */
1025 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1026 		if (error)
1027 			break;
1028 
1029 		/* perform the operation and write the result */
1030 		result = val1 | val2;
1031 		error = vie_update_register(vm, vcpuid, reg, result, size);
1032 		break;
1033 	case 0x81:
1034 	case 0x83:
1035 		/*
1036 		 * OR mem (ModRM:r/m) with immediate and store the
1037 		 * result in mem.
1038 		 *
1039 		 * 81 /1		or r/m16, imm16
1040 		 * 81 /1		or r/m32, imm32
1041 		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1042 		 *
1043 		 * 83 /1		or r/m16, imm8 sign-extended to 16
1044 		 * 83 /1		or r/m32, imm8 sign-extended to 32
1045 		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1046 		 */
1047 
1048 		/* get the first operand */
1049                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
1050                 if (error)
1051 			break;
1052 
1053                 /*
1054 		 * perform the operation with the pre-fetched immediate
1055 		 * operand and write the result
1056 		 */
1057                 result = val1 | vie->immediate;
1058                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
1059 		break;
1060 	default:
1061 		break;
1062 	}
1063 	if (error)
1064 		return (error);
1065 
1066 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1067 	if (error)
1068 		return (error);
1069 
1070 	/*
1071 	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1072 	 * to the result; AF is undefined.
1073 	 *
1074 	 * The updated status flags are obtained by subtracting 0 from 'result'.
1075 	 */
1076 	rflags2 = getcc(size, result, 0);
1077 	rflags &= ~RFLAGS_STATUS_BITS;
1078 	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1079 
1080 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1081 	return (error);
1082 }
1083 
1084 static int
1085 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1086 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1087 {
1088 	int error, size;
1089 	uint64_t regop, memop, op1, op2, rflags, rflags2;
1090 	enum vm_reg_name reg;
1091 
1092 	size = vie->opsize;
1093 	switch (vie->op.op_byte) {
1094 	case 0x39:
1095 	case 0x3B:
1096 		/*
1097 		 * 39/r		CMP r/m16, r16
1098 		 * 39/r		CMP r/m32, r32
1099 		 * REX.W 39/r	CMP r/m64, r64
1100 		 *
1101 		 * 3B/r		CMP r16, r/m16
1102 		 * 3B/r		CMP r32, r/m32
1103 		 * REX.W + 3B/r	CMP r64, r/m64
1104 		 *
1105 		 * Compare the first operand with the second operand and
1106 		 * set status flags in EFLAGS register. The comparison is
1107 		 * performed by subtracting the second operand from the first
1108 		 * operand and then setting the status flags.
1109 		 */
1110 
1111 		/* Get the register operand */
1112 		reg = gpr_map[vie->reg];
1113 		error = vie_read_register(vm, vcpuid, reg, &regop);
1114 		if (error)
1115 			return (error);
1116 
1117 		/* Get the memory operand */
1118 		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1119 		if (error)
1120 			return (error);
1121 
1122 		if (vie->op.op_byte == 0x3B) {
1123 			op1 = regop;
1124 			op2 = memop;
1125 		} else {
1126 			op1 = memop;
1127 			op2 = regop;
1128 		}
1129 		rflags2 = getcc(size, op1, op2);
1130 		break;
1131 	case 0x80:
1132 	case 0x81:
1133 	case 0x83:
1134 		/*
1135 		 * 80 /7		cmp r/m8, imm8
1136 		 * REX + 80 /7		cmp r/m8, imm8
1137 		 *
1138 		 * 81 /7		cmp r/m16, imm16
1139 		 * 81 /7		cmp r/m32, imm32
1140 		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1141 		 *
1142 		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1143 		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1144 		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1145 		 *
1146 		 * Compare mem (ModRM:r/m) with immediate and set
1147 		 * status flags according to the results.  The
1148 		 * comparison is performed by subtracting the
1149 		 * immediate from the first operand and then setting
1150 		 * the status flags.
1151 		 *
1152 		 */
1153 		if (vie->op.op_byte == 0x80)
1154 			size = 1;
1155 
1156 		/* get the first operand */
1157                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
1158 		if (error)
1159 			return (error);
1160 
1161 		rflags2 = getcc(size, op1, vie->immediate);
1162 		break;
1163 	default:
1164 		return (EINVAL);
1165 	}
1166 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1167 	if (error)
1168 		return (error);
1169 	rflags &= ~RFLAGS_STATUS_BITS;
1170 	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1171 
1172 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1173 	return (error);
1174 }
1175 
1176 static int
1177 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1178 	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1179 {
1180 	int error, size;
1181 	uint64_t nval, rflags, rflags2, val1, val2;
1182 	enum vm_reg_name reg;
1183 
1184 	size = vie->opsize;
1185 	error = EINVAL;
1186 
1187 	switch (vie->op.op_byte) {
1188 	case 0x2B:
1189 		/*
1190 		 * SUB r/m from r and store the result in r
1191 		 *
1192 		 * 2B/r            SUB r16, r/m16
1193 		 * 2B/r            SUB r32, r/m32
1194 		 * REX.W + 2B/r    SUB r64, r/m64
1195 		 */
1196 
1197 		/* get the first operand */
1198 		reg = gpr_map[vie->reg];
1199 		error = vie_read_register(vm, vcpuid, reg, &val1);
1200 		if (error)
1201 			break;
1202 
1203 		/* get the second operand */
1204 		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1205 		if (error)
1206 			break;
1207 
1208 		/* perform the operation and write the result */
1209 		nval = val1 - val2;
1210 		error = vie_update_register(vm, vcpuid, reg, nval, size);
1211 		break;
1212 	default:
1213 		break;
1214 	}
1215 
1216 	if (!error) {
1217 		rflags2 = getcc(size, val1, val2);
1218 		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1219 		    &rflags);
1220 		if (error)
1221 			return (error);
1222 
1223 		rflags &= ~RFLAGS_STATUS_BITS;
1224 		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1225 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1226 		    rflags, 8);
1227 	}
1228 
1229 	return (error);
1230 }
1231 
1232 static int
1233 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1234     struct vm_guest_paging *paging, mem_region_read_t memread,
1235     mem_region_write_t memwrite, void *arg)
1236 {
1237 #ifdef _KERNEL
1238 	struct vm_copyinfo copyinfo[2];
1239 #else
1240 	struct iovec copyinfo[2];
1241 #endif
1242 	struct seg_desc ss_desc;
1243 	uint64_t cr0, rflags, rsp, stack_gla, val;
1244 	int error, fault, size, stackaddrsize, pushop;
1245 
1246 	val = 0;
1247 	size = vie->opsize;
1248 	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1249 
1250 	/*
1251 	 * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
1252 	 */
1253 	if (paging->cpu_mode == CPU_MODE_REAL) {
1254 		stackaddrsize = 2;
1255 	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1256 		/*
1257 		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1258 		 * - Stack pointer size is always 64-bits.
1259 		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1260 		 * - 16-bit PUSH/POP is supported by using the operand size
1261 		 *   override prefix (66H).
1262 		 */
1263 		stackaddrsize = 8;
1264 		size = vie->opsize_override ? 2 : 8;
1265 	} else {
1266 		/*
1267 		 * In protected or compatibility mode the 'B' flag in the
1268 		 * stack-segment descriptor determines the size of the
1269 		 * stack pointer.
1270 		 */
1271 		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1272 		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1273 		    __func__, error));
1274 		if (SEG_DESC_DEF32(ss_desc.access))
1275 			stackaddrsize = 4;
1276 		else
1277 			stackaddrsize = 2;
1278 	}
1279 
1280 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1281 	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1282 
1283 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1284 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1285 
1286 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1287 	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1288 	if (pushop) {
1289 		rsp -= size;
1290 	}
1291 
1292 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1293 	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1294 	    &stack_gla)) {
1295 		vm_inject_ss(vm, vcpuid, 0);
1296 		return (0);
1297 	}
1298 
1299 	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1300 		vm_inject_ss(vm, vcpuid, 0);
1301 		return (0);
1302 	}
1303 
1304 	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1305 		vm_inject_ac(vm, vcpuid, 0);
1306 		return (0);
1307 	}
1308 
1309 	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1310 	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1311 	    &fault);
1312 	if (error || fault)
1313 		return (error);
1314 
1315 	if (pushop) {
1316 		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1317 		if (error == 0)
1318 			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1319 	} else {
1320 		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1321 		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1322 		rsp += size;
1323 	}
1324 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1325 
1326 	if (error == 0) {
1327 		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1328 		    stackaddrsize);
1329 		KASSERT(error == 0, ("error %d updating rsp", error));
1330 	}
1331 	return (error);
1332 }
1333 
1334 static int
1335 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1336     struct vm_guest_paging *paging, mem_region_read_t memread,
1337     mem_region_write_t memwrite, void *arg)
1338 {
1339 	int error;
1340 
1341 	/*
1342 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1343 	 *
1344 	 * PUSH is part of the group 5 extended opcodes and is identified
1345 	 * by ModRM:reg = b110.
1346 	 */
1347 	if ((vie->reg & 7) != 6)
1348 		return (EINVAL);
1349 
1350 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1351 	    memwrite, arg);
1352 	return (error);
1353 }
1354 
1355 static int
1356 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1357     struct vm_guest_paging *paging, mem_region_read_t memread,
1358     mem_region_write_t memwrite, void *arg)
1359 {
1360 	int error;
1361 
1362 	/*
1363 	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1364 	 *
1365 	 * POP is part of the group 1A extended opcodes and is identified
1366 	 * by ModRM:reg = b000.
1367 	 */
1368 	if ((vie->reg & 7) != 0)
1369 		return (EINVAL);
1370 
1371 	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1372 	    memwrite, arg);
1373 	return (error);
1374 }
1375 
1376 static int
1377 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1378     struct vm_guest_paging *paging, mem_region_read_t memread,
1379     mem_region_write_t memwrite, void *memarg)
1380 {
1381 	int error;
1382 
1383 	switch (vie->reg & 7) {
1384 	case 0x1:	/* OR */
1385 		error = emulate_or(vm, vcpuid, gpa, vie,
1386 		    memread, memwrite, memarg);
1387 		break;
1388 	case 0x4:	/* AND */
1389 		error = emulate_and(vm, vcpuid, gpa, vie,
1390 		    memread, memwrite, memarg);
1391 		break;
1392 	case 0x7:	/* CMP */
1393 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1394 		    memread, memwrite, memarg);
1395 		break;
1396 	default:
1397 		error = EINVAL;
1398 		break;
1399 	}
1400 
1401 	return (error);
1402 }
1403 
1404 static int
1405 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1406     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1407 {
1408 	uint64_t val, rflags;
1409 	int error, bitmask, bitoff;
1410 
1411 	/*
1412 	 * 0F BA is a Group 8 extended opcode.
1413 	 *
1414 	 * Currently we only emulate the 'Bit Test' instruction which is
1415 	 * identified by a ModR/M:reg encoding of 100b.
1416 	 */
1417 	if ((vie->reg & 7) != 4)
1418 		return (EINVAL);
1419 
1420 	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1421 	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1422 
1423 	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1424 	if (error)
1425 		return (error);
1426 
1427 	/*
1428 	 * Intel SDM, Vol 2, Table 3-2:
1429 	 * "Range of Bit Positions Specified by Bit Offset Operands"
1430 	 */
1431 	bitmask = vie->opsize * 8 - 1;
1432 	bitoff = vie->immediate & bitmask;
1433 
1434 	/* Copy the bit into the Carry flag in %rflags */
1435 	if (val & (1UL << bitoff))
1436 		rflags |= PSL_C;
1437 	else
1438 		rflags &= ~PSL_C;
1439 
1440 	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1441 	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1442 
1443 	return (0);
1444 }
1445 
1446 int
1447 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1448     struct vm_guest_paging *paging, mem_region_read_t memread,
1449     mem_region_write_t memwrite, void *memarg)
1450 {
1451 	int error;
1452 
1453 	if (!vie->decoded)
1454 		return (EINVAL);
1455 
1456 	switch (vie->op.op_type) {
1457 	case VIE_OP_TYPE_GROUP1:
1458 		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1459 		    memwrite, memarg);
1460 		break;
1461 	case VIE_OP_TYPE_POP:
1462 		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1463 		    memwrite, memarg);
1464 		break;
1465 	case VIE_OP_TYPE_PUSH:
1466 		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1467 		    memwrite, memarg);
1468 		break;
1469 	case VIE_OP_TYPE_CMP:
1470 		error = emulate_cmp(vm, vcpuid, gpa, vie,
1471 				    memread, memwrite, memarg);
1472 		break;
1473 	case VIE_OP_TYPE_MOV:
1474 		error = emulate_mov(vm, vcpuid, gpa, vie,
1475 				    memread, memwrite, memarg);
1476 		break;
1477 	case VIE_OP_TYPE_MOVSX:
1478 	case VIE_OP_TYPE_MOVZX:
1479 		error = emulate_movx(vm, vcpuid, gpa, vie,
1480 				     memread, memwrite, memarg);
1481 		break;
1482 	case VIE_OP_TYPE_MOVS:
1483 		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1484 		    memwrite, memarg);
1485 		break;
1486 	case VIE_OP_TYPE_STOS:
1487 		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1488 		    memwrite, memarg);
1489 		break;
1490 	case VIE_OP_TYPE_AND:
1491 		error = emulate_and(vm, vcpuid, gpa, vie,
1492 				    memread, memwrite, memarg);
1493 		break;
1494 	case VIE_OP_TYPE_OR:
1495 		error = emulate_or(vm, vcpuid, gpa, vie,
1496 				    memread, memwrite, memarg);
1497 		break;
1498 	case VIE_OP_TYPE_SUB:
1499 		error = emulate_sub(vm, vcpuid, gpa, vie,
1500 				    memread, memwrite, memarg);
1501 		break;
1502 	case VIE_OP_TYPE_BITTEST:
1503 		error = emulate_bittest(vm, vcpuid, gpa, vie,
1504 		    memread, memwrite, memarg);
1505 		break;
1506 	default:
1507 		error = EINVAL;
1508 		break;
1509 	}
1510 
1511 	return (error);
1512 }
1513 
1514 int
1515 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1516 {
1517 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1518 	    ("%s: invalid size %d", __func__, size));
1519 	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1520 
1521 	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1522 		return (0);
1523 
1524 	return ((gla & (size - 1)) ? 1 : 0);
1525 }
1526 
1527 int
1528 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1529 {
1530 	uint64_t mask;
1531 
1532 	if (cpu_mode != CPU_MODE_64BIT)
1533 		return (0);
1534 
1535 	/*
1536 	 * The value of the bit 47 in the 'gla' should be replicated in the
1537 	 * most significant 16 bits.
1538 	 */
1539 	mask = ~((1UL << 48) - 1);
1540 	if (gla & (1UL << 47))
1541 		return ((gla & mask) != mask);
1542 	else
1543 		return ((gla & mask) != 0);
1544 }
1545 
1546 uint64_t
1547 vie_size2mask(int size)
1548 {
1549 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1550 	    ("vie_size2mask: invalid size %d", size));
1551 	return (size2mask[size]);
1552 }
1553 
/*
 * Compute the guest linear address of an access of 'length' bytes at
 * 'offset' within segment 'seg', whose descriptor is 'desc'.
 *
 * 'addrsize' is the effective address size in bytes and 'prot' is a
 * combination of PROT_READ and PROT_WRITE describing the access.
 *
 * In modes other than 64-bit the descriptor is checked first: an unusable
 * descriptor, an access that the segment type does not permit, or any byte
 * of the access falling outside the segment limits makes the function
 * return -1 without storing to '*gla'.  In 64-bit mode none of these checks
 * are made and 'desc' is only consulted for the base of %fs and %gs.
 *
 * Returns 0 on success with the linear address stored in '*gla'.
 */
int
vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
    int prot, uint64_t *gla)
{
	uint64_t firstoff, low_limit, high_limit, segbase;
	int glasize, type;

	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
	    ("%s: invalid segment %d", __func__, seg));
	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
	    ("%s: invalid operand size %d", __func__, length));
	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
	    ("%s: invalid prot %#x", __func__, prot));

	/* Remember the starting offset; 'offset' is advanced by the limit check. */
	firstoff = offset;
	if (cpu_mode == CPU_MODE_64BIT) {
		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
		glasize = 8;
	} else {
		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
		glasize = 4;
		/*
		 * If the segment selector is loaded with a NULL selector
		 * then the descriptor is unusable and attempting to use
		 * it results in a #GP(0).
		 */
		if (SEG_DESC_UNUSABLE(desc->access))
			return (-1);

		/*
		 * The processor generates a #NP exception when a segment
		 * register is loaded with a selector that points to a
		 * descriptor that is not present. If this was the case then
		 * it would have been checked before the VM-exit.
		 */
		KASSERT(SEG_DESC_PRESENT(desc->access),
		    ("segment %d not present: %#x", seg, desc->access));

		/*
		 * The descriptor type must indicate a code/data segment.
		 */
		type = SEG_DESC_TYPE(desc->access);
		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
		    "descriptor type %#x", seg, type));

		if (prot & PROT_READ) {
			/* #GP on a read access to a exec-only code segment */
			if ((type & 0xA) == 0x8)
				return (-1);
		}

		if (prot & PROT_WRITE) {
			/*
			 * #GP on a write access to a code segment or a
			 * read-only data segment.
			 */
			if (type & 0x8)			/* code segment */
				return (-1);

			if ((type & 0xA) == 0)		/* read-only data seg */
				return (-1);
		}

		/*
		 * 'desc->limit' is fully expanded taking granularity into
		 * account.
		 */
		if ((type & 0xC) == 0x4) {
			/* expand-down data segment */
			low_limit = desc->limit + 1;
			high_limit = SEG_DESC_DEF32(desc->access) ?
			    0xffffffff : 0xffff;
		} else {
			/* code segment or expand-up data segment */
			low_limit = 0;
			high_limit = desc->limit;
		}

		/*
		 * Check every byte of the access individually; the offset is
		 * truncated to the address size on each step so that an
		 * access wrapping around the address space is checked at the
		 * wrapped offsets.
		 */
		while (length > 0) {
			offset &= vie_size2mask(addrsize);
			if (offset < low_limit || offset > high_limit)
				return (-1);
			offset++;
			length--;
		}
	}

	/*
	 * In 64-bit mode all segments except %fs and %gs have a segment
	 * base address of 0.
	 */
	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
	    seg != VM_REG_GUEST_GS) {
		segbase = 0;
	} else {
		segbase = desc->base;
	}

	/*
	 * Truncate 'firstoff' to the effective address size before adding
	 * it to the segment base.
	 */
	firstoff &= vie_size2mask(addrsize);
	/* The sum is truncated to the linear address width of the mode. */
	*gla = (segbase + firstoff) & vie_size2mask(glasize);
	return (0);
}
1663 
1664 #ifdef _KERNEL
1665 void
1666 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1667 {
1668 	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1669 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1670 
1671 	bzero(vie, sizeof(struct vie));
1672 
1673 	vie->base_register = VM_REG_LAST;
1674 	vie->index_register = VM_REG_LAST;
1675 	vie->segment_register = VM_REG_LAST;
1676 
1677 	if (inst_length) {
1678 		bcopy(inst_bytes, vie->inst, inst_length);
1679 		vie->num_valid = inst_length;
1680 	}
1681 }
1682 
1683 static int
1684 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1685 {
1686 	int error_code = 0;
1687 
1688 	if (pte & PG_V)
1689 		error_code |= PGEX_P;
1690 	if (prot & VM_PROT_WRITE)
1691 		error_code |= PGEX_W;
1692 	if (usermode)
1693 		error_code |= PGEX_U;
1694 	if (rsvd)
1695 		error_code |= PGEX_RSV;
1696 	if (prot & VM_PROT_EXECUTE)
1697 		error_code |= PGEX_I;
1698 
1699 	return (error_code);
1700 }
1701 
/*
 * Release the hold recorded in '*cookie', if there is one, and clear the
 * cookie so that a repeated call is a no-op.
 */
static void
ptp_release(void **cookie)
{
	void *held;

	held = *cookie;
	if (held == NULL)
		return;

	vm_gpa_release(held);
	*cookie = NULL;
}
1710 
1711 static void *
1712 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
1713 {
1714 	void *ptr;
1715 
1716 	ptp_release(cookie);
1717 	ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
1718 	return (ptr);
1719 }
1720 
/*
 * Translate guest linear address 'gla' to a guest physical address by
 * walking the guest's page tables as described by 'paging'.
 *
 * 'prot' describes the access being made.  When 'check_only' is true the
 * walk has no guest-visible side effects: no exception is injected and the
 * accessed/dirty bits are left alone.
 *
 * Return values:
 *   0 with '*guest_fault' == 0	translation succeeded, result in '*gpa'
 *   0 with '*guest_fault' == 1	translation faulted; unless 'check_only'
 *				is set an exception was injected
 *   EFAULT			a page table page could not be held
 */
static int
_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
{
	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
	u_int retries;
	uint64_t *ptpbase, ptpphys, pte, pgsize;
	uint32_t *ptpbase32, pte32;
	void *cookie;

	*guest_fault = 0;

	usermode = (paging->cpl == 3 ? 1 : 0);
	writable = prot & VM_PROT_WRITE;
	cookie = NULL;
	retval = 0;
	retries = 0;
	/*
	 * The walk starts over from the root whenever an atomic update of
	 * an accessed or dirty bit fails.
	 */
restart:
	ptpphys = paging->cr3;		/* root of the page tables */
	ptp_release(&cookie);
	if (retries++ > 0)
		maybe_yield();

	if (vie_canonical_check(paging->cpu_mode, gla)) {
		/*
		 * XXX assuming a non-stack reference otherwise a stack fault
		 * should be generated.
		 */
		if (!check_only)
			vm_inject_gp(vm, vcpuid);
		goto fault;
	}

	/* Without paging the linear address is the physical address. */
	if (paging->paging_mode == PAGING_MODE_FLAT) {
		*gpa = gla;
		goto done;
	}

	/* Two-level walk over 32-bit entries. */
	if (paging->paging_mode == PAGING_MODE_32) {
		nlevels = 2;
		while (--nlevels >= 0) {
			/* Zero out the lower 12 bits. */
			ptpphys &= ~0xfff;

			ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
			    &cookie);

			if (ptpbase32 == NULL)
				goto error;

			ptpshift = PAGE_SHIFT + nlevels * 10;
			ptpindex = (gla >> ptpshift) & 0x3FF;
			pgsize = 1UL << ptpshift;

			pte32 = ptpbase32[ptpindex];

			if ((pte32 & PG_V) == 0 ||
			    (usermode && (pte32 & PG_U) == 0) ||
			    (writable && (pte32 & PG_RW) == 0)) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 0,
					    pte32);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}

			/*
			 * Emulate the x86 MMU's management of the accessed
			 * and dirty flags. While the accessed flag is set
			 * at every level of the page table, the dirty flag
			 * is only set at the last level providing the guest
			 * physical address.
			 */
			if (!check_only && (pte32 & PG_A) == 0) {
				if (atomic_cmpset_32(&ptpbase32[ptpindex],
				    pte32, pte32 | PG_A) == 0) {
					goto restart;
				}
			}

			/* XXX must be ignored if CR4.PSE=0 */
			if (nlevels > 0 && (pte32 & PG_PS) != 0)
				break;

			ptpphys = pte32;
		}

		/* Set the dirty bit in the page table entry if necessary */
		if (!check_only && writable && (pte32 & PG_M) == 0) {
			if (atomic_cmpset_32(&ptpbase32[ptpindex],
			    pte32, pte32 | PG_M) == 0) {
				goto restart;
			}
		}

		/* Zero out the lower 'ptpshift' bits */
		pte32 >>= ptpshift; pte32 <<= ptpshift;
		*gpa = pte32 | (gla & (pgsize - 1));
		goto done;
	}

	/*
	 * PAE first consults a four-entry table selected by bits 31:30 of
	 * the address and then shares the 64-bit entry walk below with the
	 * remaining paging mode, using two levels instead of four.
	 */
	if (paging->paging_mode == PAGING_MODE_PAE) {
		/* Zero out the lower 5 bits and the upper 32 bits */
		ptpphys &= 0xffffffe0UL;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
		    &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpindex = (gla >> 30) & 0x3;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		ptpphys = pte;

		nlevels = 2;
	} else
		nlevels = 4;
	while (--nlevels >= 0) {
		/* Zero out the lower 12 bits and the upper 12 bits */
		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;

		ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
		if (ptpbase == NULL)
			goto error;

		ptpshift = PAGE_SHIFT + nlevels * 9;
		ptpindex = (gla >> ptpshift) & 0x1FF;
		pgsize = 1UL << ptpshift;

		pte = ptpbase[ptpindex];

		if ((pte & PG_V) == 0 ||
		    (usermode && (pte & PG_U) == 0) ||
		    (writable && (pte & PG_RW) == 0)) {
			if (!check_only) {
				pfcode = pf_error_code(usermode, prot, 0, pte);
				vm_inject_pf(vm, vcpuid, pfcode, gla);
			}
			goto fault;
		}

		/* Set the accessed bit in the page table entry */
		if (!check_only && (pte & PG_A) == 0) {
			if (atomic_cmpset_64(&ptpbase[ptpindex],
			    pte, pte | PG_A) == 0) {
				goto restart;
			}
		}

		if (nlevels > 0 && (pte & PG_PS) != 0) {
			/*
			 * A large page above 1GB is reported as a page
			 * fault with the reserved-bit flag set.
			 */
			if (pgsize > 1 * GB) {
				if (!check_only) {
					pfcode = pf_error_code(usermode, prot, 1,
					    pte);
					vm_inject_pf(vm, vcpuid, pfcode, gla);
				}
				goto fault;
			}
			break;
		}

		ptpphys = pte;
	}

	/* Set the dirty bit in the page table entry if necessary */
	if (!check_only && writable && (pte & PG_M) == 0) {
		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
			goto restart;
	}

	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
	*gpa = pte | (gla & (pgsize - 1));
	/* Common exit: drop any page table page that is still held. */
done:
	ptp_release(&cookie);
	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
	    __func__, retval));
	return (retval);
	/* A page table page could not be held. */
error:
	retval = EFAULT;
	goto done;
	/* The translation faulted; 'retval' stays 0. */
fault:
	*guest_fault = 1;
	goto done;
}
1917 
/*
 * Translate 'gla' to a guest physical address by calling _vm_gla2gpa()
 * with 'check_only' set to false.
 */
int
vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	int rc;

	rc = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    false);
	return (rc);
}
1926 
/*
 * Translate 'gla' to a guest physical address by calling _vm_gla2gpa()
 * with 'check_only' set to true.
 */
int
vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
{
	int rc;

	rc = _vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault,
	    true);
	return (rc);
}
1935 
1936 int
1937 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1938     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
1939 {
1940 	struct vm_copyinfo copyinfo[2];
1941 	int error, prot;
1942 
1943 	if (inst_length > VIE_INST_SIZE)
1944 		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1945 
1946 	prot = PROT_READ | PROT_EXEC;
1947 	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1948 	    copyinfo, nitems(copyinfo), faultptr);
1949 	if (error || *faultptr)
1950 		return (error);
1951 
1952 	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1953 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1954 	vie->num_valid = inst_length;
1955 	return (0);
1956 }
1957 
1958 static int
1959 vie_peek(struct vie *vie, uint8_t *x)
1960 {
1961 
1962 	if (vie->num_processed < vie->num_valid) {
1963 		*x = vie->inst[vie->num_processed];
1964 		return (0);
1965 	} else
1966 		return (-1);
1967 }
1968 
1969 static void
1970 vie_advance(struct vie *vie)
1971 {
1972 
1973 	vie->num_processed++;
1974 }
1975 
1976 static bool
1977 segment_override(uint8_t x, int *seg)
1978 {
1979 
1980 	switch (x) {
1981 	case 0x2E:
1982 		*seg = VM_REG_GUEST_CS;
1983 		break;
1984 	case 0x36:
1985 		*seg = VM_REG_GUEST_SS;
1986 		break;
1987 	case 0x3E:
1988 		*seg = VM_REG_GUEST_DS;
1989 		break;
1990 	case 0x26:
1991 		*seg = VM_REG_GUEST_ES;
1992 		break;
1993 	case 0x64:
1994 		*seg = VM_REG_GUEST_FS;
1995 		break;
1996 	case 0x65:
1997 		*seg = VM_REG_GUEST_GS;
1998 		break;
1999 	default:
2000 		return (false);
2001 	}
2002 	return (true);
2003 }
2004 
2005 static int
2006 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
2007 {
2008 	uint8_t x;
2009 
2010 	while (1) {
2011 		if (vie_peek(vie, &x))
2012 			return (-1);
2013 
2014 		if (x == 0x66)
2015 			vie->opsize_override = 1;
2016 		else if (x == 0x67)
2017 			vie->addrsize_override = 1;
2018 		else if (x == 0xF3)
2019 			vie->repz_present = 1;
2020 		else if (x == 0xF2)
2021 			vie->repnz_present = 1;
2022 		else if (segment_override(x, &vie->segment_register))
2023 			vie->segment_override = 1;
2024 		else
2025 			break;
2026 
2027 		vie_advance(vie);
2028 	}
2029 
2030 	/*
2031 	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
2032 	 * - Only one REX prefix is allowed per instruction.
2033 	 * - The REX prefix must immediately precede the opcode byte or the
2034 	 *   escape opcode byte.
2035 	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
2036 	 *   the mandatory prefix must come before the REX prefix.
2037 	 */
2038 	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
2039 		vie->rex_present = 1;
2040 		vie->rex_w = x & 0x8 ? 1 : 0;
2041 		vie->rex_r = x & 0x4 ? 1 : 0;
2042 		vie->rex_x = x & 0x2 ? 1 : 0;
2043 		vie->rex_b = x & 0x1 ? 1 : 0;
2044 		vie_advance(vie);
2045 	}
2046 
2047 	/*
2048 	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
2049 	 */
2050 	if (cpu_mode == CPU_MODE_64BIT) {
2051 		/*
2052 		 * Default address size is 64-bits and default operand size
2053 		 * is 32-bits.
2054 		 */
2055 		vie->addrsize = vie->addrsize_override ? 4 : 8;
2056 		if (vie->rex_w)
2057 			vie->opsize = 8;
2058 		else if (vie->opsize_override)
2059 			vie->opsize = 2;
2060 		else
2061 			vie->opsize = 4;
2062 	} else if (cs_d) {
2063 		/* Default address and operand sizes are 32-bits */
2064 		vie->addrsize = vie->addrsize_override ? 2 : 4;
2065 		vie->opsize = vie->opsize_override ? 2 : 4;
2066 	} else {
2067 		/* Default address and operand sizes are 16-bits */
2068 		vie->addrsize = vie->addrsize_override ? 4 : 2;
2069 		vie->opsize = vie->opsize_override ? 4 : 2;
2070 	}
2071 	return (0);
2072 }
2073 
2074 static int
2075 decode_two_byte_opcode(struct vie *vie)
2076 {
2077 	uint8_t x;
2078 
2079 	if (vie_peek(vie, &x))
2080 		return (-1);
2081 
2082 	vie->op = two_byte_opcodes[x];
2083 
2084 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2085 		return (-1);
2086 
2087 	vie_advance(vie);
2088 	return (0);
2089 }
2090 
2091 static int
2092 decode_opcode(struct vie *vie)
2093 {
2094 	uint8_t x;
2095 
2096 	if (vie_peek(vie, &x))
2097 		return (-1);
2098 
2099 	vie->op = one_byte_opcodes[x];
2100 
2101 	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2102 		return (-1);
2103 
2104 	vie_advance(vie);
2105 
2106 	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2107 		return (decode_two_byte_opcode(vie));
2108 
2109 	return (0);
2110 }
2111 
/*
 * Decode the ModR/M byte.
 *
 * Fills in 'mod', 'rm' and 'reg' (extended by the REX.B and REX.R bits
 * where applicable) and, unless a SIB byte follows, the base register and
 * the number of displacement bytes.
 *
 * Returns 0 on success, including when the opcode has no ModR/M byte at
 * all.  Returns -1 in real mode, when no byte is available, or when the
 * byte selects direct (register) addressing.
 */
static int
decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
{
	uint8_t x;

	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
		return (0);

	if (cpu_mode == CPU_MODE_REAL)
		return (-1);

	if (vie_peek(vie, &x))
		return (-1);

	vie->mod = (x >> 6) & 0x3;
	vie->rm =  (x >> 0) & 0x7;
	vie->reg = (x >> 3) & 0x7;

	/*
	 * A direct addressing mode makes no sense in the context of an EPT
	 * fault. There has to be a memory access involved to cause the
	 * EPT fault.
	 */
	if (vie->mod == VIE_MOD_DIRECT)
		return (-1);

	/*
	 * Direct addressing was rejected above, so the 'mod' tests against
	 * VIE_MOD_DIRECT below always hold.
	 */
	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
		/*
		 * Table 2-5: Special Cases of REX Encodings
		 *
		 * mod=0, r/m=5 is used in the compatibility mode to
		 * indicate a disp32 without a base register.
		 *
		 * mod!=3, r/m=4 is used in the compatibility mode to
		 * indicate that the SIB byte is present.
		 *
		 * The 'b' bit in the REX prefix is don't care in
		 * this case.
		 */
	} else {
		vie->rm |= (vie->rex_b << 3);
	}

	vie->reg |= (vie->rex_r << 3);

	/* SIB */
	/* The base register is then determined by decode_sib(). */
	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
		goto done;

	vie->base_register = gpr_map[vie->rm];

	switch (vie->mod) {
	case VIE_MOD_INDIRECT_DISP8:
		vie->disp_bytes = 1;
		break;
	case VIE_MOD_INDIRECT_DISP32:
		vie->disp_bytes = 4;
		break;
	case VIE_MOD_INDIRECT:
		if (vie->rm == VIE_RM_DISP32) {
			vie->disp_bytes = 4;
			/*
			 * Table 2-7. RIP-Relative Addressing
			 *
			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
			 * whereas in compatibility mode it just implies disp32.
			 */

			if (cpu_mode == CPU_MODE_64BIT)
				vie->base_register = VM_REG_GUEST_RIP;
			else
				vie->base_register = VM_REG_LAST;
		}
		break;
	}

	/* Consume the ModR/M byte. */
done:
	vie_advance(vie);

	return (0);
}
2194 
2195 static int
2196 decode_sib(struct vie *vie)
2197 {
2198 	uint8_t x;
2199 
2200 	/* Proceed only if SIB byte is present */
2201 	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2202 		return (0);
2203 
2204 	if (vie_peek(vie, &x))
2205 		return (-1);
2206 
2207 	/* De-construct the SIB byte */
2208 	vie->ss = (x >> 6) & 0x3;
2209 	vie->index = (x >> 3) & 0x7;
2210 	vie->base = (x >> 0) & 0x7;
2211 
2212 	/* Apply the REX prefix modifiers */
2213 	vie->index |= vie->rex_x << 3;
2214 	vie->base |= vie->rex_b << 3;
2215 
2216 	switch (vie->mod) {
2217 	case VIE_MOD_INDIRECT_DISP8:
2218 		vie->disp_bytes = 1;
2219 		break;
2220 	case VIE_MOD_INDIRECT_DISP32:
2221 		vie->disp_bytes = 4;
2222 		break;
2223 	}
2224 
2225 	if (vie->mod == VIE_MOD_INDIRECT &&
2226 	    (vie->base == 5 || vie->base == 13)) {
2227 		/*
2228 		 * Special case when base register is unused if mod = 0
2229 		 * and base = %rbp or %r13.
2230 		 *
2231 		 * Documented in:
2232 		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2233 		 * Table 2-5: Special Cases of REX Encodings
2234 		 */
2235 		vie->disp_bytes = 4;
2236 	} else {
2237 		vie->base_register = gpr_map[vie->base];
2238 	}
2239 
2240 	/*
2241 	 * All encodings of 'index' are valid except for %rsp (4).
2242 	 *
2243 	 * Documented in:
2244 	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2245 	 * Table 2-5: Special Cases of REX Encodings
2246 	 */
2247 	if (vie->index != 4)
2248 		vie->index_register = gpr_map[vie->index];
2249 
2250 	/* 'scale' makes sense only in the context of an index register */
2251 	if (vie->index_register < VM_REG_LAST)
2252 		vie->scale = 1 << vie->ss;
2253 
2254 	vie_advance(vie);
2255 
2256 	return (0);
2257 }
2258 
2259 static int
2260 decode_displacement(struct vie *vie)
2261 {
2262 	int n, i;
2263 	uint8_t x;
2264 
2265 	union {
2266 		char	buf[4];
2267 		int8_t	signed8;
2268 		int32_t	signed32;
2269 	} u;
2270 
2271 	if ((n = vie->disp_bytes) == 0)
2272 		return (0);
2273 
2274 	if (n != 1 && n != 4)
2275 		panic("decode_displacement: invalid disp_bytes %d", n);
2276 
2277 	for (i = 0; i < n; i++) {
2278 		if (vie_peek(vie, &x))
2279 			return (-1);
2280 
2281 		u.buf[i] = x;
2282 		vie_advance(vie);
2283 	}
2284 
2285 	if (n == 1)
2286 		vie->displacement = u.signed8;		/* sign-extended */
2287 	else
2288 		vie->displacement = u.signed32;		/* sign-extended */
2289 
2290 	return (0);
2291 }
2292 
2293 static int
2294 decode_immediate(struct vie *vie)
2295 {
2296 	int i, n;
2297 	uint8_t x;
2298 	union {
2299 		char	buf[4];
2300 		int8_t	signed8;
2301 		int16_t	signed16;
2302 		int32_t	signed32;
2303 	} u;
2304 
2305 	/* Figure out immediate operand size (if any) */
2306 	if (vie->op.op_flags & VIE_OP_F_IMM) {
2307 		/*
2308 		 * Section 2.2.1.5 "Immediates", Intel SDM:
2309 		 * In 64-bit mode the typical size of immediate operands
2310 		 * remains 32-bits. When the operand size if 64-bits, the
2311 		 * processor sign-extends all immediates to 64-bits prior
2312 		 * to their use.
2313 		 */
2314 		if (vie->opsize == 4 || vie->opsize == 8)
2315 			vie->imm_bytes = 4;
2316 		else
2317 			vie->imm_bytes = 2;
2318 	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2319 		vie->imm_bytes = 1;
2320 	}
2321 
2322 	if ((n = vie->imm_bytes) == 0)
2323 		return (0);
2324 
2325 	KASSERT(n == 1 || n == 2 || n == 4,
2326 	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2327 
2328 	for (i = 0; i < n; i++) {
2329 		if (vie_peek(vie, &x))
2330 			return (-1);
2331 
2332 		u.buf[i] = x;
2333 		vie_advance(vie);
2334 	}
2335 
2336 	/* sign-extend the immediate value before use */
2337 	if (n == 1)
2338 		vie->immediate = u.signed8;
2339 	else if (n == 2)
2340 		vie->immediate = u.signed16;
2341 	else
2342 		vie->immediate = u.signed32;
2343 
2344 	return (0);
2345 }
2346 
2347 static int
2348 decode_moffset(struct vie *vie)
2349 {
2350 	int i, n;
2351 	uint8_t x;
2352 	union {
2353 		char	buf[8];
2354 		uint64_t u64;
2355 	} u;
2356 
2357 	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2358 		return (0);
2359 
2360 	/*
2361 	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2362 	 * The memory offset size follows the address-size of the instruction.
2363 	 */
2364 	n = vie->addrsize;
2365 	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2366 
2367 	u.u64 = 0;
2368 	for (i = 0; i < n; i++) {
2369 		if (vie_peek(vie, &x))
2370 			return (-1);
2371 
2372 		u.buf[i] = x;
2373 		vie_advance(vie);
2374 	}
2375 	vie->displacement = u.u64;
2376 	return (0);
2377 }
2378 
2379 /*
2380  * Verify that the 'guest linear address' provided as collateral of the nested
2381  * page table fault matches with our instruction decoding.
2382  */
2383 static int
2384 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
2385     enum vm_cpu_mode cpu_mode)
2386 {
2387 	int error;
2388 	uint64_t base, segbase, idx, gla2;
2389 	enum vm_reg_name seg;
2390 	struct seg_desc desc;
2391 
2392 	/* Skip 'gla' verification */
2393 	if (gla == VIE_INVALID_GLA)
2394 		return (0);
2395 
2396 	base = 0;
2397 	if (vie->base_register != VM_REG_LAST) {
2398 		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2399 		if (error) {
2400 			printf("verify_gla: error %d getting base reg %d\n",
2401 				error, vie->base_register);
2402 			return (-1);
2403 		}
2404 
2405 		/*
2406 		 * RIP-relative addressing starts from the following
2407 		 * instruction
2408 		 */
2409 		if (vie->base_register == VM_REG_GUEST_RIP)
2410 			base += vie->num_processed;
2411 	}
2412 
2413 	idx = 0;
2414 	if (vie->index_register != VM_REG_LAST) {
2415 		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2416 		if (error) {
2417 			printf("verify_gla: error %d getting index reg %d\n",
2418 				error, vie->index_register);
2419 			return (-1);
2420 		}
2421 	}
2422 
2423 	/*
2424 	 * From "Specifying a Segment Selector", Intel SDM, Vol 1
2425 	 *
2426 	 * In 64-bit mode, segmentation is generally (but not
2427 	 * completely) disabled.  The exceptions are the FS and GS
2428 	 * segments.
2429 	 *
2430 	 * In legacy IA-32 mode, when the ESP or EBP register is used
2431 	 * as the base, the SS segment is the default segment.  For
2432 	 * other data references, except when relative to stack or
2433 	 * string destination the DS segment is the default.  These
2434 	 * can be overridden to allow other segments to be accessed.
2435 	 */
2436 	if (vie->segment_override)
2437 		seg = vie->segment_register;
2438 	else if (vie->base_register == VM_REG_GUEST_RSP ||
2439 	    vie->base_register == VM_REG_GUEST_RBP)
2440 		seg = VM_REG_GUEST_SS;
2441 	else
2442 		seg = VM_REG_GUEST_DS;
2443 	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
2444 	    seg != VM_REG_GUEST_GS) {
2445 		segbase = 0;
2446 	} else {
2447 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
2448 		if (error) {
2449 			printf("verify_gla: error %d getting segment"
2450 			       " descriptor %d", error,
2451 			       vie->segment_register);
2452 			return (-1);
2453 		}
2454 		segbase = desc.base;
2455 	}
2456 
2457 	gla2 = segbase + base + vie->scale * idx + vie->displacement;
2458 	gla2 &= size2mask[vie->addrsize];
2459 	if (gla != gla2) {
2460 		printf("verify_gla mismatch: segbase(0x%0lx)"
2461 		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2462 		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2463 		       segbase, base, vie->scale, idx, vie->displacement,
2464 		       gla, gla2);
2465 		return (-1);
2466 	}
2467 
2468 	return (0);
2469 }
2470 
2471 int
2472 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2473 		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2474 {
2475 
2476 	if (decode_prefixes(vie, cpu_mode, cs_d))
2477 		return (-1);
2478 
2479 	if (decode_opcode(vie))
2480 		return (-1);
2481 
2482 	if (decode_modrm(vie, cpu_mode))
2483 		return (-1);
2484 
2485 	if (decode_sib(vie))
2486 		return (-1);
2487 
2488 	if (decode_displacement(vie))
2489 		return (-1);
2490 
2491 	if (decode_immediate(vie))
2492 		return (-1);
2493 
2494 	if (decode_moffset(vie))
2495 		return (-1);
2496 
2497 	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2498 		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
2499 			return (-1);
2500 	}
2501 
2502 	vie->decoded = 1;	/* success */
2503 
2504 	return (0);
2505 }
2506 #endif	/* _KERNEL */
2507