xref: /dragonfly/lib/libnvmm/libnvmm_x86.c (revision 634ba020)
1 /*	$NetBSD: libnvmm_x86.c,v 1.31.4.1 2019/11/10 12:58:30 martin Exp $	*/
2 
3 /*
4  * Copyright (c) 2018-2019 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 #include <unistd.h>
38 #include <fcntl.h>
39 #include <errno.h>
40 #include <sys/ioctl.h>
41 #include <sys/mman.h>
42 #include <machine/vmparam.h>
43 #include <machine/pte.h>
44 #include <machine/psl.h>
45 
46 #define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
47 #define __cacheline_aligned __attribute__((__aligned__(64)))
48 
49 #include <x86/specialreg.h>

#include <stdbool.h>
#include <inttypes.h>	/* PRIx64 */

#include "nvmm.h"	/* libnvmm API: nvmm_machine, nvmm_vcpu, ... */
50 
51 /* -------------------------------------------------------------------------- */
52 
53 /*
54  * Undocumented debugging function. Helpful.
55  */
56 int
57 nvmm_vcpu_dump(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
58 {
59 	struct nvmm_x64_state *state = vcpu->state;
60 	uint16_t *attr;
61 	size_t i;
62 	int ret;
63 
64 	const char *segnames[] = {
65 		"ES", "CS", "SS", "DS", "FS", "GS", "GDT", "IDT", "LDT", "TR"
66 	};
67 
68 	ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_ALL);
69 	if (ret == -1)
70 		return -1;
71 
72 	printf("+ VCPU id=%d\n", (int)vcpu->cpuid);
73 	printf("| -> RAX=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RAX]);
74 	printf("| -> RCX=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RCX]);
75 	printf("| -> RDX=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RDX]);
76 	printf("| -> RBX=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RBX]);
77 	printf("| -> RSP=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RSP]);
78 	printf("| -> RBP=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RBP]);
79 	printf("| -> RSI=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RSI]);
80 	printf("| -> RDI=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RDI]);
81 	printf("| -> RIP=%"PRIx64"\n", state->gprs[NVMM_X64_GPR_RIP]);
82 	printf("| -> RFLAGS=%p\n", (void *)state->gprs[NVMM_X64_GPR_RFLAGS]);
83 	for (i = 0; i < NVMM_X64_NSEG; i++) {
84 		attr = (uint16_t *)&state->segs[i].attrib;
85 		printf("| -> %s: sel=0x%x base=%"PRIx64", limit=%x, "
86 		    "attrib=%x [type=%d,l=%d,def=%d]\n",
87 		    segnames[i],
88 		    state->segs[i].selector,
89 		    state->segs[i].base,
90 		    state->segs[i].limit,
91 		    *attr,
92 		    state->segs[i].attrib.type,
93 		    state->segs[i].attrib.l,
94 		    state->segs[i].attrib.def);
95 	}
96 	printf("| -> MSR_EFER=%"PRIx64"\n", state->msrs[NVMM_X64_MSR_EFER]);
97 	printf("| -> CR0=%"PRIx64"\n", state->crs[NVMM_X64_CR_CR0]);
98 	printf("| -> CR3=%"PRIx64"\n", state->crs[NVMM_X64_CR_CR3]);
99 	printf("| -> CR4=%"PRIx64"\n", state->crs[NVMM_X64_CR_CR4]);
100 	printf("| -> CR8=%"PRIx64"\n", state->crs[NVMM_X64_CR_CR8]);
101 
102 	return 0;
103 }
104 
105 /* -------------------------------------------------------------------------- */
106 
107 #define PTE32_L1_SHIFT	12
108 #define PTE32_L2_SHIFT	22
109 
110 #define PTE32_L2_MASK	0xffc00000
111 #define PTE32_L1_MASK	0x003ff000
112 
113 #define PTE32_L2_FRAME	(PTE32_L2_MASK)
114 #define PTE32_L1_FRAME	(PTE32_L2_FRAME|PTE32_L1_MASK)
115 
116 #define pte32_l1idx(va)	(((va) & PTE32_L1_MASK) >> PTE32_L1_SHIFT)
117 #define pte32_l2idx(va)	(((va) & PTE32_L2_MASK) >> PTE32_L2_SHIFT)
118 
119 #define CR3_FRAME_32BIT	__BITS(31, 12)
120 
121 typedef uint32_t pte_32bit_t;
122 
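/*
 * For illustration: with the macros above, a 32bit guest virtual address
 * splits into [31:22] = L2 (PDE) index, [21:12] = L1 (PTE) index and
 * [11:0] = page offset. For example, gva = 0xC0385ABC gives:
 *
 *	pte32_l2idx(gva) == 0x300
 *	pte32_l1idx(gva) == 0x385
 *	gva & PAGE_MASK  == 0xABC	(re-added by x86_gva_to_gpa())
 *
 * A 4MB PTE_PS mapping at L2 (valid only when CR4.PSE is set) skips the
 * L1 level entirely.
 */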
123 static int
124 x86_gva_to_gpa_32bit(struct nvmm_machine *mach, uint64_t cr3,
125     gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
126 {
127 	gpaddr_t L2gpa, L1gpa;
128 	uintptr_t L2hva, L1hva;
129 	pte_32bit_t *pdir, pte;
130 	nvmm_prot_t pageprot;
131 
132 	/* We begin with an RWXU access. */
133 	*prot = NVMM_PROT_ALL;
134 
135 	/* Parse L2. */
136 	L2gpa = (cr3 & CR3_FRAME_32BIT);
137 	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva, &pageprot) == -1)
138 		return -1;
139 	pdir = (pte_32bit_t *)L2hva;
140 	pte = pdir[pte32_l2idx(gva)];
141 	if ((pte & PTE_P) == 0)
142 		return -1;
143 	if ((pte & PTE_U) == 0)
144 		*prot &= ~NVMM_PROT_USER;
145 	if ((pte & PTE_W) == 0)
146 		*prot &= ~NVMM_PROT_WRITE;
147 	if ((pte & PTE_PS) && !has_pse)
148 		return -1;
149 	if (pte & PTE_PS) {
150 		*gpa = (pte & PTE32_L2_FRAME);
151 		*gpa = *gpa + (gva & PTE32_L1_MASK);
152 		return 0;
153 	}
154 
155 	/* Parse L1. */
156 	L1gpa = (pte & PTE_FRAME);
157 	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva, &pageprot) == -1)
158 		return -1;
159 	pdir = (pte_32bit_t *)L1hva;
160 	pte = pdir[pte32_l1idx(gva)];
161 	if ((pte & PTE_P) == 0)
162 		return -1;
163 	if ((pte & PTE_U) == 0)
164 		*prot &= ~NVMM_PROT_USER;
165 	if ((pte & PTE_W) == 0)
166 		*prot &= ~NVMM_PROT_WRITE;
167 	if (pte & PTE_PS)
168 		return -1;
169 
170 	*gpa = (pte & PTE_FRAME);
171 	return 0;
172 }
173 
174 /* -------------------------------------------------------------------------- */
175 
176 #define	PTE32_PAE_L1_SHIFT	12
177 #define	PTE32_PAE_L2_SHIFT	21
178 #define	PTE32_PAE_L3_SHIFT	30
179 
180 #define	PTE32_PAE_L3_MASK	0xc0000000
181 #define	PTE32_PAE_L2_MASK	0x3fe00000
182 #define	PTE32_PAE_L1_MASK	0x001ff000
183 
184 #define	PTE32_PAE_L3_FRAME	(PTE32_PAE_L3_MASK)
185 #define	PTE32_PAE_L2_FRAME	(PTE32_PAE_L3_FRAME|PTE32_PAE_L2_MASK)
186 #define	PTE32_PAE_L1_FRAME	(PTE32_PAE_L2_FRAME|PTE32_PAE_L1_MASK)
187 
188 #define pte32_pae_l1idx(va)	(((va) & PTE32_PAE_L1_MASK) >> PTE32_PAE_L1_SHIFT)
189 #define pte32_pae_l2idx(va)	(((va) & PTE32_PAE_L2_MASK) >> PTE32_PAE_L2_SHIFT)
190 #define pte32_pae_l3idx(va)	(((va) & PTE32_PAE_L3_MASK) >> PTE32_PAE_L3_SHIFT)
191 
192 #define CR3_FRAME_32BIT_PAE	__BITS(31, 5)
193 
194 typedef uint64_t pte_32bit_pae_t;
195 
196 static int
197 x86_gva_to_gpa_32bit_pae(struct nvmm_machine *mach, uint64_t cr3,
198     gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
199 {
200 	gpaddr_t L3gpa, L2gpa, L1gpa;
201 	uintptr_t L3hva, L2hva, L1hva;
202 	pte_32bit_pae_t *pdir, pte;
203 	nvmm_prot_t pageprot;
204 
205 	/* We begin with an RWXU access. */
206 	*prot = NVMM_PROT_ALL;
207 
208 	/* Parse L3. */
209 	L3gpa = (cr3 & CR3_FRAME_32BIT_PAE);
210 	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva, &pageprot) == -1)
211 		return -1;
212 	pdir = (pte_32bit_pae_t *)L3hva;
213 	pte = pdir[pte32_pae_l3idx(gva)];
214 	if ((pte & PTE_P) == 0)
215 		return -1;
216 	if (pte & PTE_NX)
217 		*prot &= ~NVMM_PROT_EXEC;
218 	if (pte & PTE_PS)
219 		return -1;
220 
221 	/* Parse L2. */
222 	L2gpa = (pte & PTE_FRAME);
223 	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva, &pageprot) == -1)
224 		return -1;
225 	pdir = (pte_32bit_pae_t *)L2hva;
226 	pte = pdir[pte32_pae_l2idx(gva)];
227 	if ((pte & PTE_P) == 0)
228 		return -1;
229 	if ((pte & PTE_U) == 0)
230 		*prot &= ~NVMM_PROT_USER;
231 	if ((pte & PTE_W) == 0)
232 		*prot &= ~NVMM_PROT_WRITE;
233 	if (pte & PTE_NX)
234 		*prot &= ~NVMM_PROT_EXEC;
235 	if (pte & PTE_PS) {
236 		*gpa = (pte & PTE32_PAE_L2_FRAME);
237 		*gpa = *gpa + (gva & PTE32_PAE_L1_MASK);
238 		return 0;
239 	}
240 
241 	/* Parse L1. */
242 	L1gpa = (pte & PTE_FRAME);
243 	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva, &pageprot) == -1)
244 		return -1;
245 	pdir = (pte_32bit_pae_t *)L1hva;
246 	pte = pdir[pte32_pae_l1idx(gva)];
247 	if ((pte & PTE_P) == 0)
248 		return -1;
249 	if ((pte & PTE_U) == 0)
250 		*prot &= ~NVMM_PROT_USER;
251 	if ((pte & PTE_W) == 0)
252 		*prot &= ~NVMM_PROT_WRITE;
253 	if (pte & PTE_NX)
254 		*prot &= ~NVMM_PROT_EXEC;
255 	if (pte & PTE_PS)
256 		return -1;
257 
258 	*gpa = (pte & PTE_FRAME);
259 	return 0;
260 }
261 
262 /* -------------------------------------------------------------------------- */
263 
264 #define PTE64_L1_SHIFT	12
265 #define PTE64_L2_SHIFT	21
266 #define PTE64_L3_SHIFT	30
267 #define PTE64_L4_SHIFT	39
268 
269 #define PTE64_L4_MASK	0x0000ff8000000000
270 #define PTE64_L3_MASK	0x0000007fc0000000
271 #define PTE64_L2_MASK	0x000000003fe00000
272 #define PTE64_L1_MASK	0x00000000001ff000
273 
274 #define PTE64_L4_FRAME	PTE64_L4_MASK
275 #define PTE64_L3_FRAME	(PTE64_L4_FRAME|PTE64_L3_MASK)
276 #define PTE64_L2_FRAME	(PTE64_L3_FRAME|PTE64_L2_MASK)
277 #define PTE64_L1_FRAME	(PTE64_L2_FRAME|PTE64_L1_MASK)
278 
279 #define pte64_l1idx(va)	(((va) & PTE64_L1_MASK) >> PTE64_L1_SHIFT)
280 #define pte64_l2idx(va)	(((va) & PTE64_L2_MASK) >> PTE64_L2_SHIFT)
281 #define pte64_l3idx(va)	(((va) & PTE64_L3_MASK) >> PTE64_L3_SHIFT)
282 #define pte64_l4idx(va)	(((va) & PTE64_L4_MASK) >> PTE64_L4_SHIFT)
283 
284 #define CR3_FRAME_64BIT	__BITS(51, 12)
285 
286 typedef uint64_t pte_64bit_t;
287 
288 static inline bool
289 x86_gva_64bit_canonical(gvaddr_t gva)
290 {
291 	/* Bits 63:47 must have the same value. */
292 #define SIGN_EXTEND	0xffff800000000000ULL
293 	return (gva & SIGN_EXTEND) == 0 || (gva & SIGN_EXTEND) == SIGN_EXTEND;
294 }
295 
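/*
 * Example of the 4-level decomposition done below (illustrative values):
 * the canonical address gva = 0xFFFF800000123456 gives
 *
 *	pte64_l4idx(gva) == 0x100	(PML4 slot 256, upper half)
 *	pte64_l3idx(gva) == 0
 *	pte64_l2idx(gva) == 0
 *	pte64_l1idx(gva) == 0x123
 *	gva & PAGE_MASK  == 0x456
 *
 * An address such as 0x0000800000000000 is rejected by
 * x86_gva_64bit_canonical(): bit 47 is set but bits 63:48 are not.
 */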
296 static int
297 x86_gva_to_gpa_64bit(struct nvmm_machine *mach, uint64_t cr3,
298     gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
299 {
300 	gpaddr_t L4gpa, L3gpa, L2gpa, L1gpa;
301 	uintptr_t L4hva, L3hva, L2hva, L1hva;
302 	pte_64bit_t *pdir, pte;
303 	nvmm_prot_t pageprot;
304 
305 	/* We begin with an RWXU access. */
306 	*prot = NVMM_PROT_ALL;
307 
308 	if (!x86_gva_64bit_canonical(gva))
309 		return -1;
310 
311 	/* Parse L4. */
312 	L4gpa = (cr3 & CR3_FRAME_64BIT);
313 	if (nvmm_gpa_to_hva(mach, L4gpa, &L4hva, &pageprot) == -1)
314 		return -1;
315 	pdir = (pte_64bit_t *)L4hva;
316 	pte = pdir[pte64_l4idx(gva)];
317 	if ((pte & PTE_P) == 0)
318 		return -1;
319 	if ((pte & PTE_U) == 0)
320 		*prot &= ~NVMM_PROT_USER;
321 	if ((pte & PTE_W) == 0)
322 		*prot &= ~NVMM_PROT_WRITE;
323 	if (pte & PTE_NX)
324 		*prot &= ~NVMM_PROT_EXEC;
325 	if (pte & PTE_PS)
326 		return -1;
327 
328 	/* Parse L3. */
329 	L3gpa = (pte & PTE_FRAME);
330 	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva, &pageprot) == -1)
331 		return -1;
332 	pdir = (pte_64bit_t *)L3hva;
333 	pte = pdir[pte64_l3idx(gva)];
334 	if ((pte & PTE_P) == 0)
335 		return -1;
336 	if ((pte & PTE_U) == 0)
337 		*prot &= ~NVMM_PROT_USER;
338 	if ((pte & PTE_W) == 0)
339 		*prot &= ~NVMM_PROT_WRITE;
340 	if (pte & PTE_NX)
341 		*prot &= ~NVMM_PROT_EXEC;
342 	if (pte & PTE_PS) {
343 		*gpa = (pte & PTE64_L3_FRAME);
344 		*gpa = *gpa + (gva & (PTE64_L2_MASK|PTE64_L1_MASK));
345 		return 0;
346 	}
347 
348 	/* Parse L2. */
349 	L2gpa = (pte & PTE_FRAME);
350 	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva, &pageprot) == -1)
351 		return -1;
352 	pdir = (pte_64bit_t *)L2hva;
353 	pte = pdir[pte64_l2idx(gva)];
354 	if ((pte & PTE_P) == 0)
355 		return -1;
356 	if ((pte & PTE_U) == 0)
357 		*prot &= ~NVMM_PROT_USER;
358 	if ((pte & PTE_W) == 0)
359 		*prot &= ~NVMM_PROT_WRITE;
360 	if (pte & PTE_NX)
361 		*prot &= ~NVMM_PROT_EXEC;
362 	if (pte & PTE_PS) {
363 		*gpa = (pte & PTE64_L2_FRAME);
364 		*gpa = *gpa + (gva & PTE64_L1_MASK);
365 		return 0;
366 	}
367 
368 	/* Parse L1. */
369 	L1gpa = (pte & PTE_FRAME);
370 	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva, &pageprot) == -1)
371 		return -1;
372 	pdir = (pte_64bit_t *)L1hva;
373 	pte = pdir[pte64_l1idx(gva)];
374 	if ((pte & PTE_P) == 0)
375 		return -1;
376 	if ((pte & PTE_U) == 0)
377 		*prot &= ~NVMM_PROT_USER;
378 	if ((pte & PTE_W) == 0)
379 		*prot &= ~NVMM_PROT_WRITE;
380 	if (pte & PTE_NX)
381 		*prot &= ~NVMM_PROT_EXEC;
382 	if (pte & PTE_PS)
383 		return -1;
384 
385 	*gpa = (pte & PTE_FRAME);
386 	return 0;
387 }
388 
389 static inline int
390 x86_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_x64_state *state,
391     gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
392 {
393 	bool is_pae, is_lng, has_pse;
394 	uint64_t cr3;
395 	size_t off;
396 	int ret;
397 
398 	if ((state->crs[NVMM_X64_CR_CR0] & CR0_PG) == 0) {
399 		/* No paging. */
400 		*prot = NVMM_PROT_ALL;
401 		*gpa = gva;
402 		return 0;
403 	}
404 
405 	off = (gva & PAGE_MASK);
406 	gva &= ~PAGE_MASK;
407 
408 	is_pae = (state->crs[NVMM_X64_CR_CR4] & CR4_PAE) != 0;
409 	is_lng = (state->msrs[NVMM_X64_MSR_EFER] & EFER_LMA) != 0;
410 	has_pse = (state->crs[NVMM_X64_CR_CR4] & CR4_PSE) != 0;
411 	cr3 = state->crs[NVMM_X64_CR_CR3];
412 
413 	if (is_pae && is_lng) {
414 		/* 64bit */
415 		ret = x86_gva_to_gpa_64bit(mach, cr3, gva, gpa, prot);
416 	} else if (is_pae && !is_lng) {
417 		/* 32bit PAE */
418 		ret = x86_gva_to_gpa_32bit_pae(mach, cr3, gva, gpa, prot);
419 	} else if (!is_pae && !is_lng) {
420 		/* 32bit */
421 		ret = x86_gva_to_gpa_32bit(mach, cr3, gva, gpa, has_pse, prot);
422 	} else {
423 		ret = -1;
424 	}
425 
426 	if (ret == -1) {
427 		errno = EFAULT;
428 	}
429 
430 	*gpa = *gpa + off;
431 
432 	return ret;
433 }
434 
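/*
 * Translate a GVA to a GPA using the current paging mode of the VCPU.
 * Typical (simplified) use from a VMM, 'gva' standing for any guest
 * virtual address of interest:
 *
 *	gpaddr_t gpa;
 *	nvmm_prot_t prot;
 *
 *	if (nvmm_gva_to_gpa(mach, vcpu, gva, &gpa, &prot) == -1)
 *		warn("gva->gpa translation failed");
 *	else if ((prot & NVMM_PROT_EXEC) == 0)
 *		warnx("page is not executable");
 */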
435 int
436 nvmm_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
437     gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
438 {
439 	struct nvmm_x64_state *state = vcpu->state;
440 	int ret;
441 
442 	ret = nvmm_vcpu_getstate(mach, vcpu,
443 	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
444 	if (ret == -1)
445 		return -1;
446 
447 	return x86_gva_to_gpa(mach, state, gva, gpa, prot);
448 }
449 
450 /* -------------------------------------------------------------------------- */
451 
452 #define DISASSEMBLER_BUG()	\
453 	do {			\
454 		errno = EINVAL;	\
455 		return -1;	\
456 	} while (0)
457 
458 static inline bool
459 is_long_mode(struct nvmm_x64_state *state)
460 {
461 	return (state->msrs[NVMM_X64_MSR_EFER] & EFER_LMA) != 0;
462 }
463 
464 static inline bool
465 is_64bit(struct nvmm_x64_state *state)
466 {
467 	return (state->segs[NVMM_X64_SEG_CS].attrib.l != 0);
468 }
469 
470 static inline bool
471 is_32bit(struct nvmm_x64_state *state)
472 {
473 	return (state->segs[NVMM_X64_SEG_CS].attrib.l == 0) &&
474 	    (state->segs[NVMM_X64_SEG_CS].attrib.def == 1);
475 }
476 
477 static inline bool
478 is_16bit(struct nvmm_x64_state *state)
479 {
480 	return (state->segs[NVMM_X64_SEG_CS].attrib.l == 0) &&
481 	    (state->segs[NVMM_X64_SEG_CS].attrib.def == 0);
482 }
483 
484 static int
485 segment_check(struct nvmm_x64_state_seg *seg, gvaddr_t gva, size_t size)
486 {
487 	uint64_t limit;
488 
489 	/*
490 	 * This is incomplete: expand-down segments, among other things, are
491 	 * not checked; really, that's tiring.
492 	 */
493 	if (__predict_false(!seg->attrib.p)) {
494 		goto error;
495 	}
496 
497 	limit = (uint64_t)seg->limit + 1;
498 	if (__predict_true(seg->attrib.g)) {
499 		limit *= PAGE_SIZE;
500 	}
501 
502 	if (__predict_false(gva + size > limit)) {
503 		goto error;
504 	}
505 
506 	return 0;
507 
508 error:
509 	errno = EFAULT;
510 	return -1;
511 }
512 
513 static inline void
514 segment_apply(struct nvmm_x64_state_seg *seg, gvaddr_t *gva)
515 {
516 	*gva += seg->base;
517 }
518 
519 static inline uint64_t
520 size_to_mask(size_t size)
521 {
522 	switch (size) {
523 	case 1:
524 		return 0x00000000000000FF;
525 	case 2:
526 		return 0x000000000000FFFF;
527 	case 4:
528 		return 0x00000000FFFFFFFF;
529 	case 8:
530 	default:
531 		return 0xFFFFFFFFFFFFFFFF;
532 	}
533 }
534 
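/*
 * REP iteration count, taken from rCX truncated to the address size of
 * the instruction. For example, with a 16bit address size and
 * RCX=0x12340005, rep_get_cnt() returns 5 and rep_set_cnt() rewrites
 * only the low word.
 */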
535 static uint64_t
536 rep_get_cnt(struct nvmm_x64_state *state, size_t adsize)
537 {
538 	uint64_t mask, cnt;
539 
540 	mask = size_to_mask(adsize);
541 	cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
542 
543 	return cnt;
544 }
545 
546 static void
547 rep_set_cnt(struct nvmm_x64_state *state, size_t adsize, uint64_t cnt)
548 {
549 	uint64_t mask;
550 
551 	/* XXX: should we zero-extend? */
552 	mask = size_to_mask(adsize);
553 	state->gprs[NVMM_X64_GPR_RCX] &= ~mask;
554 	state->gprs[NVMM_X64_GPR_RCX] |= cnt;
555 }
556 
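/*
 * Guest memory accessors. An access that crosses a page boundary is
 * split, since each 4KB page can have a different GPA and protection:
 * for example (illustrative numbers) an 8-byte read at gva=0x7FFC gives
 * size=4 remain=4, the first half coming from the page at 0x7000 and
 * the recursion fetching the rest from 0x8000. Pages with no HVA
 * mapping are treated as MMIO and forwarded to the vcpu->cbs.mem
 * callback.
 */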
557 static int
558 read_guest_memory(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
559     gvaddr_t gva, uint8_t *data, size_t size)
560 {
561 	struct nvmm_x64_state *state = vcpu->state;
562 	struct nvmm_mem mem;
563 	nvmm_prot_t prot;
564 	gpaddr_t gpa;
565 	uintptr_t hva;
566 	bool is_mmio;
567 	int ret, remain;
568 
569 	ret = x86_gva_to_gpa(mach, state, gva, &gpa, &prot);
570 	if (__predict_false(ret == -1)) {
571 		return -1;
572 	}
573 	if (__predict_false(!(prot & NVMM_PROT_READ))) {
574 		errno = EFAULT;
575 		return -1;
576 	}
577 
578 	if ((gva & PAGE_MASK) + size > PAGE_SIZE) {
579 		remain = ((gva & PAGE_MASK) + size - PAGE_SIZE);
580 	} else {
581 		remain = 0;
582 	}
583 	size -= remain;
584 
585 	ret = nvmm_gpa_to_hva(mach, gpa, &hva, &prot);
586 	is_mmio = (ret == -1);
587 
588 	if (is_mmio) {
589 		mem.mach = mach;
590 		mem.vcpu = vcpu;
591 		mem.data = data;
592 		mem.gpa = gpa;
593 		mem.write = false;
594 		mem.size = size;
595 		(*vcpu->cbs.mem)(&mem);
596 	} else {
597 		if (__predict_false(!(prot & NVMM_PROT_READ))) {
598 			errno = EFAULT;
599 			return -1;
600 		}
601 		memcpy(data, (uint8_t *)hva, size);
602 	}
603 
604 	if (remain > 0) {
605 		ret = read_guest_memory(mach, vcpu, gva + size,
606 		    data + size, remain);
607 	} else {
608 		ret = 0;
609 	}
610 
611 	return ret;
612 }
613 
614 static int
615 write_guest_memory(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
616     gvaddr_t gva, uint8_t *data, size_t size)
617 {
618 	struct nvmm_x64_state *state = vcpu->state;
619 	struct nvmm_mem mem;
620 	nvmm_prot_t prot;
621 	gpaddr_t gpa;
622 	uintptr_t hva;
623 	bool is_mmio;
624 	int ret, remain;
625 
626 	ret = x86_gva_to_gpa(mach, state, gva, &gpa, &prot);
627 	if (__predict_false(ret == -1)) {
628 		return -1;
629 	}
630 	if (__predict_false(!(prot & NVMM_PROT_WRITE))) {
631 		errno = EFAULT;
632 		return -1;
633 	}
634 
635 	if ((gva & PAGE_MASK) + size > PAGE_SIZE) {
636 		remain = ((gva & PAGE_MASK) + size - PAGE_SIZE);
637 	} else {
638 		remain = 0;
639 	}
640 	size -= remain;
641 
642 	ret = nvmm_gpa_to_hva(mach, gpa, &hva, &prot);
643 	is_mmio = (ret == -1);
644 
645 	if (is_mmio) {
646 		mem.mach = mach;
647 		mem.vcpu = vcpu;
648 		mem.data = data;
649 		mem.gpa = gpa;
650 		mem.write = true;
651 		mem.size = size;
652 		(*vcpu->cbs.mem)(&mem);
653 	} else {
654 		if (__predict_false(!(prot & NVMM_PROT_WRITE))) {
655 			errno = EFAULT;
656 			return -1;
657 		}
658 		memcpy((uint8_t *)hva, data, size);
659 	}
660 
661 	if (remain > 0) {
662 		ret = write_guest_memory(mach, vcpu, gva + size,
663 		    data + size, remain);
664 	} else {
665 		ret = 0;
666 	}
667 
668 	return ret;
669 }
670 
671 /* -------------------------------------------------------------------------- */
672 
673 static int fetch_segment(struct nvmm_machine *, struct nvmm_vcpu *);
674 
675 #define NVMM_IO_BATCH_SIZE	32
676 
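/*
 * Batch several iterations of a REP INS/OUTS into one assist, to limit
 * the number of VM exits. For example (illustrative), "rep outsw" with
 * RCX=100 and io->size=2 gives iosize=32 and iocnt=16: 16 port accesses
 * are emitted here, the caller advances RSI by 32 and decrements RCX by
 * 16, and the guest re-executes the instruction until RCX reaches zero.
 * Backward (PSL_D) copies are not batched.
 */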
677 static int
678 assist_io_batch(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
679     struct nvmm_io *io, gvaddr_t gva, uint64_t cnt)
680 {
681 	uint8_t iobuf[NVMM_IO_BATCH_SIZE];
682 	size_t i, iosize, iocnt;
683 	int ret;
684 
685 	cnt = MIN(cnt, NVMM_IO_BATCH_SIZE);
686 	iosize = MIN(io->size * cnt, NVMM_IO_BATCH_SIZE);
687 	iocnt = iosize / io->size;
688 
689 	io->data = iobuf;
690 
691 	if (!io->in) {
692 		ret = read_guest_memory(mach, vcpu, gva, iobuf, iosize);
693 		if (ret == -1)
694 			return -1;
695 	}
696 
697 	for (i = 0; i < iocnt; i++) {
698 		(*vcpu->cbs.io)(io);
699 		io->data += io->size;
700 	}
701 
702 	if (io->in) {
703 		ret = write_guest_memory(mach, vcpu, gva, iobuf, iosize);
704 		if (ret == -1)
705 			return -1;
706 	}
707 
708 	return iocnt;
709 }
710 
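/*
 * Emulate the I/O instruction that caused the current VM exit. A VMM run
 * loop would typically invoke it as follows (simplified sketch):
 *
 *	nvmm_vcpu_run(mach, vcpu);
 *	switch (vcpu->exit->reason) {
 *	case NVMM_VCPU_EXIT_IO:
 *		if (nvmm_assist_io(mach, vcpu) == -1)
 *			err(EXIT_FAILURE, "nvmm_assist_io");
 *		break;
 *	...
 *	}
 *
 * The actual port access is performed by the vcpu->cbs.io callback
 * registered by the VMM.
 */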
711 int
712 nvmm_assist_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
713 {
714 	struct nvmm_x64_state *state = vcpu->state;
715 	struct nvmm_vcpu_exit *exit = vcpu->exit;
716 	struct nvmm_io io;
717 	uint64_t cnt = 0; /* GCC */
718 	uint8_t iobuf[8];
719 	int iocnt = 1;
720 	gvaddr_t gva = 0; /* GCC */
721 	int reg = 0; /* GCC */
722 	int ret, seg;
723 	bool psld = false;
724 
725 	if (__predict_false(exit->reason != NVMM_VCPU_EXIT_IO)) {
726 		errno = EINVAL;
727 		return -1;
728 	}
729 
730 	io.mach = mach;
731 	io.vcpu = vcpu;
732 	io.port = exit->u.io.port;
733 	io.in = exit->u.io.in;
734 	io.size = exit->u.io.operand_size;
735 	io.data = iobuf;
736 
737 	ret = nvmm_vcpu_getstate(mach, vcpu,
738 	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
739 	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
740 	if (ret == -1)
741 		return -1;
742 
743 	if (exit->u.io.rep) {
744 		cnt = rep_get_cnt(state, exit->u.io.address_size);
745 		if (__predict_false(cnt == 0)) {
746 			state->gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
747 			goto out;
748 		}
749 	}
750 
751 	if (__predict_false(state->gprs[NVMM_X64_GPR_RFLAGS] & PSL_D)) {
752 		psld = true;
753 	}
754 
755 	/*
756 	 * Determine GVA.
757 	 */
758 	if (exit->u.io.str) {
759 		if (io.in) {
760 			reg = NVMM_X64_GPR_RDI;
761 		} else {
762 			reg = NVMM_X64_GPR_RSI;
763 		}
764 
765 		gva = state->gprs[reg];
766 		gva &= size_to_mask(exit->u.io.address_size);
767 
768 		if (exit->u.io.seg != -1) {
769 			seg = exit->u.io.seg;
770 		} else {
771 			if (io.in) {
772 				seg = NVMM_X64_SEG_ES;
773 			} else {
774 				seg = fetch_segment(mach, vcpu);
775 				if (seg == -1)
776 					return -1;
777 			}
778 		}
779 
780 		if (__predict_true(is_long_mode(state))) {
781 			if (seg == NVMM_X64_SEG_GS || seg == NVMM_X64_SEG_FS) {
782 				segment_apply(&state->segs[seg], &gva);
783 			}
784 		} else {
785 			ret = segment_check(&state->segs[seg], gva, io.size);
786 			if (ret == -1)
787 				return -1;
788 			segment_apply(&state->segs[seg], &gva);
789 		}
790 
791 		if (exit->u.io.rep && !psld) {
792 			iocnt = assist_io_batch(mach, vcpu, &io, gva, cnt);
793 			if (iocnt == -1)
794 				return -1;
795 			goto done;
796 		}
797 	}
798 
799 	if (!io.in) {
800 		if (!exit->u.io.str) {
801 			memcpy(io.data, &state->gprs[NVMM_X64_GPR_RAX], io.size);
802 		} else {
803 			ret = read_guest_memory(mach, vcpu, gva, io.data,
804 			    io.size);
805 			if (ret == -1)
806 				return -1;
807 		}
808 	}
809 
810 	(*vcpu->cbs.io)(&io);
811 
812 	if (io.in) {
813 		if (!exit->u.io.str) {
814 			memcpy(&state->gprs[NVMM_X64_GPR_RAX], io.data, io.size);
815 			if (io.size == 4) {
816 				/* Zero-extend to 64 bits. */
817 				state->gprs[NVMM_X64_GPR_RAX] &= size_to_mask(4);
818 			}
819 		} else {
820 			ret = write_guest_memory(mach, vcpu, gva, io.data,
821 			    io.size);
822 			if (ret == -1)
823 				return -1;
824 		}
825 	}
826 
827 done:
828 	if (exit->u.io.str) {
829 		if (__predict_false(psld)) {
830 			state->gprs[reg] -= iocnt * io.size;
831 		} else {
832 			state->gprs[reg] += iocnt * io.size;
833 		}
834 	}
835 
836 	if (exit->u.io.rep) {
837 		cnt -= iocnt;
838 		rep_set_cnt(state, exit->u.io.address_size, cnt);
839 		if (cnt == 0) {
840 			state->gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
841 		}
842 	} else {
843 		state->gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
844 	}
845 
846 out:
847 	ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
848 	if (ret == -1)
849 		return -1;
850 
851 	return 0;
852 }
853 
854 /* -------------------------------------------------------------------------- */
855 
856 struct x86_emul {
857 	bool readreg;
858 	bool backprop;
859 	bool notouch;
860 	void (*func)(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
861 };
862 
863 static void x86_func_or(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
864 static void x86_func_and(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
865 static void x86_func_xchg(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
866 static void x86_func_sub(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
867 static void x86_func_xor(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
868 static void x86_func_cmp(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
869 static void x86_func_test(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
870 static void x86_func_mov(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
871 static void x86_func_stos(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
872 static void x86_func_lods(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
873 static void x86_func_movs(struct nvmm_vcpu *, struct nvmm_mem *, uint64_t *);
874 
875 static const struct x86_emul x86_emul_or = {
876 	.readreg = true,
877 	.func = x86_func_or
878 };
879 
880 static const struct x86_emul x86_emul_and = {
881 	.readreg = true,
882 	.func = x86_func_and
883 };
884 
885 static const struct x86_emul x86_emul_xchg = {
886 	.readreg = true,
887 	.backprop = true,
888 	.func = x86_func_xchg
889 };
890 
891 static const struct x86_emul x86_emul_sub = {
892 	.readreg = true,
893 	.func = x86_func_sub
894 };
895 
896 static const struct x86_emul x86_emul_xor = {
897 	.readreg = true,
898 	.func = x86_func_xor
899 };
900 
901 static const struct x86_emul x86_emul_cmp = {
902 	.notouch = true,
903 	.func = x86_func_cmp
904 };
905 
906 static const struct x86_emul x86_emul_test = {
907 	.notouch = true,
908 	.func = x86_func_test
909 };
910 
911 static const struct x86_emul x86_emul_mov = {
912 	.func = x86_func_mov
913 };
914 
915 static const struct x86_emul x86_emul_stos = {
916 	.func = x86_func_stos
917 };
918 
919 static const struct x86_emul x86_emul_lods = {
920 	.func = x86_func_lods
921 };
922 
923 static const struct x86_emul x86_emul_movs = {
924 	.func = x86_func_movs
925 };
926 
927 /* Legacy prefixes. */
928 #define LEG_LOCK	0xF0
929 #define LEG_REPN	0xF2
930 #define LEG_REP		0xF3
931 #define LEG_OVR_CS	0x2E
932 #define LEG_OVR_SS	0x36
933 #define LEG_OVR_DS	0x3E
934 #define LEG_OVR_ES	0x26
935 #define LEG_OVR_FS	0x64
936 #define LEG_OVR_GS	0x65
937 #define LEG_OPR_OVR	0x66
938 #define LEG_ADR_OVR	0x67
939 
940 struct x86_legpref {
941 	bool opr_ovr:1;
942 	bool adr_ovr:1;
943 	bool rep:1;
944 	bool repn:1;
945 	int8_t seg;
946 };
947 
948 struct x86_rexpref {
949 	bool b:1;
950 	bool x:1;
951 	bool r:1;
952 	bool w:1;
953 	bool present:1;
954 };
955 
956 struct x86_reg {
957 	int num;	/* NVMM GPR state index */
958 	uint64_t mask;
959 };
960 
961 struct x86_dualreg {
962 	int reg1;
963 	int reg2;
964 };
965 
966 enum x86_disp_type {
967 	DISP_NONE,
968 	DISP_0,
969 	DISP_1,
970 	DISP_2,
971 	DISP_4
972 };
973 
974 struct x86_disp {
975 	enum x86_disp_type type;
976 	uint64_t data; /* 4 bytes, but can be sign-extended */
977 };
978 
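/*
 * Decoded ModRM byte: mod in bits 7:6, reg in bits 5:3, rm in bits 2:0.
 * For example the sequence 89 50 08 ("mov [rax+0x8], edx" in 64bit mode)
 * decodes to mod=0b01 (disp8 follows), reg=0b010 (EDX), rm=0b000 (RAX).
 */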
979 struct x86_regmodrm {
980 	uint8_t mod:2;
981 	uint8_t reg:3;
982 	uint8_t rm:3;
983 };
984 
985 struct x86_immediate {
986 	uint64_t data;
987 };
988 
989 struct x86_sib {
990 	uint8_t scale;
991 	const struct x86_reg *idx;
992 	const struct x86_reg *bas;
993 };
994 
995 enum x86_store_type {
996 	STORE_NONE,
997 	STORE_REG,
998 	STORE_DUALREG,
999 	STORE_IMM,
1000 	STORE_SIB,
1001 	STORE_DMO
1002 };
1003 
1004 struct x86_store {
1005 	enum x86_store_type type;
1006 	union {
1007 		const struct x86_reg *reg;
1008 		struct x86_dualreg dualreg;
1009 		struct x86_immediate imm;
1010 		struct x86_sib sib;
1011 		uint64_t dmo;
1012 	} u;
1013 	struct x86_disp disp;
1014 	int hardseg;
1015 };
1016 
1017 struct x86_instr {
1018 	uint8_t len;
1019 	struct x86_legpref legpref;
1020 	struct x86_rexpref rexpref;
1021 	struct x86_regmodrm regmodrm;
1022 	uint8_t operand_size;
1023 	uint8_t address_size;
1024 	uint64_t zeroextend_mask;
1025 
1026 	const struct x86_opcode *opcode;
1027 	const struct x86_emul *emul;
1028 
1029 	struct x86_store src;
1030 	struct x86_store dst;
1031 	struct x86_store *strm;
1032 };
1033 
1034 struct x86_decode_fsm {
1035 	/* vcpu */
1036 	bool is64bit;
1037 	bool is32bit;
1038 	bool is16bit;
1039 
1040 	/* fsm */
1041 	int (*fn)(struct x86_decode_fsm *, struct x86_instr *);
1042 	uint8_t *buf;
1043 	uint8_t *end;
1044 };
1045 
1046 struct x86_opcode {
1047 	bool valid:1;
1048 	bool regmodrm:1;
1049 	bool regtorm:1;
1050 	bool dmo:1;
1051 	bool todmo:1;
1052 	bool movs:1;
1053 	bool stos:1;
1054 	bool lods:1;
1055 	bool szoverride:1;
1056 	bool group1:1;
1057 	bool group3:1;
1058 	bool group11:1;
1059 	bool immediate:1;
1060 	uint8_t defsize;
1061 	uint8_t flags;
1062 	const struct x86_emul *emul;
1063 };
1064 
1065 struct x86_group_entry {
1066 	const struct x86_emul *emul;
1067 };
1068 
1069 #define OPSIZE_BYTE 0x01
1070 #define OPSIZE_WORD 0x02 /* 2 bytes */
1071 #define OPSIZE_DOUB 0x04 /* 4 bytes */
1072 #define OPSIZE_QUAD 0x08 /* 8 bytes */
1073 
1074 #define FLAG_imm8	0x01
1075 #define FLAG_immz	0x02
1076 #define FLAG_ze		0x04
1077 
1078 static const struct x86_group_entry group1[8] __cacheline_aligned = {
1079 	[1] = { .emul = &x86_emul_or },
1080 	[4] = { .emul = &x86_emul_and },
1081 	[6] = { .emul = &x86_emul_xor },
1082 	[7] = { .emul = &x86_emul_cmp }
1083 };
1084 
1085 static const struct x86_group_entry group3[8] __cacheline_aligned = {
1086 	[0] = { .emul = &x86_emul_test },
1087 	[1] = { .emul = &x86_emul_test }
1088 };
1089 
1090 static const struct x86_group_entry group11[8] __cacheline_aligned = {
1091 	[0] = { .emul = &x86_emul_mov }
1092 };
1093 
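/*
 * Opcode tables, indexed by opcode byte. The per-entry comments use the
 * Intel SDM operand notation: E = ModRM r/m operand, G = ModRM reg
 * operand, I = immediate, O = direct memory offset, X/Y = DS:rSI/ES:rDI
 * string operands; b/w/v/z give the operand width. Only the handful of
 * instructions the memory assist needs to emulate are filled in.
 */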
1094 static const struct x86_opcode primary_opcode_table[256] __cacheline_aligned = {
1095 	/*
1096 	 * Group1
1097 	 */
1098 	[0x80] = {
1099 		/* Eb, Ib */
1100 		.valid = true,
1101 		.regmodrm = true,
1102 		.regtorm = true,
1103 		.szoverride = false,
1104 		.defsize = OPSIZE_BYTE,
1105 		.group1 = true,
1106 		.immediate = true,
1107 		.emul = NULL /* group1 */
1108 	},
1109 	[0x81] = {
1110 		/* Ev, Iz */
1111 		.valid = true,
1112 		.regmodrm = true,
1113 		.regtorm = true,
1114 		.szoverride = true,
1115 		.defsize = -1,
1116 		.group1 = true,
1117 		.immediate = true,
1118 		.flags = FLAG_immz,
1119 		.emul = NULL /* group1 */
1120 	},
1121 	[0x83] = {
1122 		/* Ev, Ib */
1123 		.valid = true,
1124 		.regmodrm = true,
1125 		.regtorm = true,
1126 		.szoverride = true,
1127 		.defsize = -1,
1128 		.group1 = true,
1129 		.immediate = true,
1130 		.flags = FLAG_imm8,
1131 		.emul = NULL /* group1 */
1132 	},
1133 
1134 	/*
1135 	 * Group3
1136 	 */
1137 	[0xF6] = {
1138 		/* Eb, Ib */
1139 		.valid = true,
1140 		.regmodrm = true,
1141 		.regtorm = true,
1142 		.szoverride = false,
1143 		.defsize = OPSIZE_BYTE,
1144 		.group3 = true,
1145 		.immediate = true,
1146 		.emul = NULL /* group3 */
1147 	},
1148 	[0xF7] = {
1149 		/* Ev, Iz */
1150 		.valid = true,
1151 		.regmodrm = true,
1152 		.regtorm = true,
1153 		.szoverride = true,
1154 		.defsize = -1,
1155 		.group3 = true,
1156 		.immediate = true,
1157 		.flags = FLAG_immz,
1158 		.emul = NULL /* group3 */
1159 	},
1160 
1161 	/*
1162 	 * Group11
1163 	 */
1164 	[0xC6] = {
1165 		/* Eb, Ib */
1166 		.valid = true,
1167 		.regmodrm = true,
1168 		.regtorm = true,
1169 		.szoverride = false,
1170 		.defsize = OPSIZE_BYTE,
1171 		.group11 = true,
1172 		.immediate = true,
1173 		.emul = NULL /* group11 */
1174 	},
1175 	[0xC7] = {
1176 		/* Ev, Iz */
1177 		.valid = true,
1178 		.regmodrm = true,
1179 		.regtorm = true,
1180 		.szoverride = true,
1181 		.defsize = -1,
1182 		.group11 = true,
1183 		.immediate = true,
1184 		.flags = FLAG_immz,
1185 		.emul = NULL /* group11 */
1186 	},
1187 
1188 	/*
1189 	 * OR
1190 	 */
1191 	[0x08] = {
1192 		/* Eb, Gb */
1193 		.valid = true,
1194 		.regmodrm = true,
1195 		.regtorm = true,
1196 		.szoverride = false,
1197 		.defsize = OPSIZE_BYTE,
1198 		.emul = &x86_emul_or
1199 	},
1200 	[0x09] = {
1201 		/* Ev, Gv */
1202 		.valid = true,
1203 		.regmodrm = true,
1204 		.regtorm = true,
1205 		.szoverride = true,
1206 		.defsize = -1,
1207 		.emul = &x86_emul_or
1208 	},
1209 	[0x0A] = {
1210 		/* Gb, Eb */
1211 		.valid = true,
1212 		.regmodrm = true,
1213 		.regtorm = false,
1214 		.szoverride = false,
1215 		.defsize = OPSIZE_BYTE,
1216 		.emul = &x86_emul_or
1217 	},
1218 	[0x0B] = {
1219 		/* Gv, Ev */
1220 		.valid = true,
1221 		.regmodrm = true,
1222 		.regtorm = false,
1223 		.szoverride = true,
1224 		.defsize = -1,
1225 		.emul = &x86_emul_or
1226 	},
1227 
1228 	/*
1229 	 * AND
1230 	 */
1231 	[0x20] = {
1232 		/* Eb, Gb */
1233 		.valid = true,
1234 		.regmodrm = true,
1235 		.regtorm = true,
1236 		.szoverride = false,
1237 		.defsize = OPSIZE_BYTE,
1238 		.emul = &x86_emul_and
1239 	},
1240 	[0x21] = {
1241 		/* Ev, Gv */
1242 		.valid = true,
1243 		.regmodrm = true,
1244 		.regtorm = true,
1245 		.szoverride = true,
1246 		.defsize = -1,
1247 		.emul = &x86_emul_and
1248 	},
1249 	[0x22] = {
1250 		/* Gb, Eb */
1251 		.valid = true,
1252 		.regmodrm = true,
1253 		.regtorm = false,
1254 		.szoverride = false,
1255 		.defsize = OPSIZE_BYTE,
1256 		.emul = &x86_emul_and
1257 	},
1258 	[0x23] = {
1259 		/* Gv, Ev */
1260 		.valid = true,
1261 		.regmodrm = true,
1262 		.regtorm = false,
1263 		.szoverride = true,
1264 		.defsize = -1,
1265 		.emul = &x86_emul_and
1266 	},
1267 
1268 	/*
1269 	 * SUB
1270 	 */
1271 	[0x28] = {
1272 		/* Eb, Gb */
1273 		.valid = true,
1274 		.regmodrm = true,
1275 		.regtorm = true,
1276 		.szoverride = false,
1277 		.defsize = OPSIZE_BYTE,
1278 		.emul = &x86_emul_sub
1279 	},
1280 	[0x29] = {
1281 		/* Ev, Gv */
1282 		.valid = true,
1283 		.regmodrm = true,
1284 		.regtorm = true,
1285 		.szoverride = true,
1286 		.defsize = -1,
1287 		.emul = &x86_emul_sub
1288 	},
1289 	[0x2A] = {
1290 		/* Gb, Eb */
1291 		.valid = true,
1292 		.regmodrm = true,
1293 		.regtorm = false,
1294 		.szoverride = false,
1295 		.defsize = OPSIZE_BYTE,
1296 		.emul = &x86_emul_sub
1297 	},
1298 	[0x2B] = {
1299 		/* Gv, Ev */
1300 		.valid = true,
1301 		.regmodrm = true,
1302 		.regtorm = false,
1303 		.szoverride = true,
1304 		.defsize = -1,
1305 		.emul = &x86_emul_sub
1306 	},
1307 
1308 	/*
1309 	 * XOR
1310 	 */
1311 	[0x30] = {
1312 		/* Eb, Gb */
1313 		.valid = true,
1314 		.regmodrm = true,
1315 		.regtorm = true,
1316 		.szoverride = false,
1317 		.defsize = OPSIZE_BYTE,
1318 		.emul = &x86_emul_xor
1319 	},
1320 	[0x31] = {
1321 		/* Ev, Gv */
1322 		.valid = true,
1323 		.regmodrm = true,
1324 		.regtorm = true,
1325 		.szoverride = true,
1326 		.defsize = -1,
1327 		.emul = &x86_emul_xor
1328 	},
1329 	[0x32] = {
1330 		/* Gb, Eb */
1331 		.valid = true,
1332 		.regmodrm = true,
1333 		.regtorm = false,
1334 		.szoverride = false,
1335 		.defsize = OPSIZE_BYTE,
1336 		.emul = &x86_emul_xor
1337 	},
1338 	[0x33] = {
1339 		/* Gv, Ev */
1340 		.valid = true,
1341 		.regmodrm = true,
1342 		.regtorm = false,
1343 		.szoverride = true,
1344 		.defsize = -1,
1345 		.emul = &x86_emul_xor
1346 	},
1347 
1348 	/*
1349 	 * XCHG
1350 	 */
1351 	[0x86] = {
1352 		/* Eb, Gb */
1353 		.valid = true,
1354 		.regmodrm = true,
1355 		.regtorm = true,
1356 		.szoverride = false,
1357 		.defsize = OPSIZE_BYTE,
1358 		.emul = &x86_emul_xchg
1359 	},
1360 	[0x87] = {
1361 		/* Ev, Gv */
1362 		.valid = true,
1363 		.regmodrm = true,
1364 		.regtorm = true,
1365 		.szoverride = true,
1366 		.defsize = -1,
1367 		.emul = &x86_emul_xchg
1368 	},
1369 
1370 	/*
1371 	 * MOV
1372 	 */
1373 	[0x88] = {
1374 		/* Eb, Gb */
1375 		.valid = true,
1376 		.regmodrm = true,
1377 		.regtorm = true,
1378 		.szoverride = false,
1379 		.defsize = OPSIZE_BYTE,
1380 		.emul = &x86_emul_mov
1381 	},
1382 	[0x89] = {
1383 		/* Ev, Gv */
1384 		.valid = true,
1385 		.regmodrm = true,
1386 		.regtorm = true,
1387 		.szoverride = true,
1388 		.defsize = -1,
1389 		.emul = &x86_emul_mov
1390 	},
1391 	[0x8A] = {
1392 		/* Gb, Eb */
1393 		.valid = true,
1394 		.regmodrm = true,
1395 		.regtorm = false,
1396 		.szoverride = false,
1397 		.defsize = OPSIZE_BYTE,
1398 		.emul = &x86_emul_mov
1399 	},
1400 	[0x8B] = {
1401 		/* Gv, Ev */
1402 		.valid = true,
1403 		.regmodrm = true,
1404 		.regtorm = false,
1405 		.szoverride = true,
1406 		.defsize = -1,
1407 		.emul = &x86_emul_mov
1408 	},
1409 	[0xA0] = {
1410 		/* AL, Ob */
1411 		.valid = true,
1412 		.dmo = true,
1413 		.todmo = false,
1414 		.szoverride = false,
1415 		.defsize = OPSIZE_BYTE,
1416 		.emul = &x86_emul_mov
1417 	},
1418 	[0xA1] = {
1419 		/* rAX, Ov */
1420 		.valid = true,
1421 		.dmo = true,
1422 		.todmo = false,
1423 		.szoverride = true,
1424 		.defsize = -1,
1425 		.emul = &x86_emul_mov
1426 	},
1427 	[0xA2] = {
1428 		/* Ob, AL */
1429 		.valid = true,
1430 		.dmo = true,
1431 		.todmo = true,
1432 		.szoverride = false,
1433 		.defsize = OPSIZE_BYTE,
1434 		.emul = &x86_emul_mov
1435 	},
1436 	[0xA3] = {
1437 		/* Ov, rAX */
1438 		.valid = true,
1439 		.dmo = true,
1440 		.todmo = true,
1441 		.szoverride = true,
1442 		.defsize = -1,
1443 		.emul = &x86_emul_mov
1444 	},
1445 
1446 	/*
1447 	 * MOVS
1448 	 */
1449 	[0xA4] = {
1450 		/* Yb, Xb */
1451 		.valid = true,
1452 		.movs = true,
1453 		.szoverride = false,
1454 		.defsize = OPSIZE_BYTE,
1455 		.emul = &x86_emul_movs
1456 	},
1457 	[0xA5] = {
1458 		/* Yv, Xv */
1459 		.valid = true,
1460 		.movs = true,
1461 		.szoverride = true,
1462 		.defsize = -1,
1463 		.emul = &x86_emul_movs
1464 	},
1465 
1466 	/*
1467 	 * STOS
1468 	 */
1469 	[0xAA] = {
1470 		/* Yb, AL */
1471 		.valid = true,
1472 		.stos = true,
1473 		.szoverride = false,
1474 		.defsize = OPSIZE_BYTE,
1475 		.emul = &x86_emul_stos
1476 	},
1477 	[0xAB] = {
1478 		/* Yv, rAX */
1479 		.valid = true,
1480 		.stos = true,
1481 		.szoverride = true,
1482 		.defsize = -1,
1483 		.emul = &x86_emul_stos
1484 	},
1485 
1486 	/*
1487 	 * LODS
1488 	 */
1489 	[0xAC] = {
1490 		/* AL, Xb */
1491 		.valid = true,
1492 		.lods = true,
1493 		.szoverride = false,
1494 		.defsize = OPSIZE_BYTE,
1495 		.emul = &x86_emul_lods
1496 	},
1497 	[0xAD] = {
1498 		/* rAX, Xv */
1499 		.valid = true,
1500 		.lods = true,
1501 		.szoverride = true,
1502 		.defsize = -1,
1503 		.emul = &x86_emul_lods
1504 	},
1505 };
1506 
1507 static const struct x86_opcode secondary_opcode_table[256] __cacheline_aligned = {
1508 	/*
1509 	 * MOVZX
1510 	 */
1511 	[0xB6] = {
1512 		/* Gv, Eb */
1513 		.valid = true,
1514 		.regmodrm = true,
1515 		.regtorm = false,
1516 		.szoverride = true,
1517 		.defsize = OPSIZE_BYTE,
1518 		.flags = FLAG_ze,
1519 		.emul = &x86_emul_mov
1520 	},
1521 	[0xB7] = {
1522 		/* Gv, Ew */
1523 		.valid = true,
1524 		.regmodrm = true,
1525 		.regtorm = false,
1526 		.szoverride = true,
1527 		.defsize = OPSIZE_WORD,
1528 		.flags = FLAG_ze,
1529 		.emul = &x86_emul_mov
1530 	},
1531 };
1532 
1533 static const struct x86_reg gpr_map__rip = { NVMM_X64_GPR_RIP, 0xFFFFFFFFFFFFFFFF };
1534 
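/*
 * Registers whose meaning depends on the REX prefix: with a byte operand,
 * encodings 4-7 select AH/CH/DH/BH without REX but SPL/BPL/SIL/DIL with
 * one. See resolve_special_register().
 */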
1535 /* [REX-present][enc][opsize] */
1536 static const struct x86_reg gpr_map__special[2][4][8] __cacheline_aligned = {
1537 	[false] = {
1538 		/* No REX prefix. */
1539 		[0b00] = {
1540 			[0] = { NVMM_X64_GPR_RAX, 0x000000000000FF00 }, /* AH */
1541 			[1] = { NVMM_X64_GPR_RSP, 0x000000000000FFFF }, /* SP */
1542 			[2] = { -1, 0 },
1543 			[3] = { NVMM_X64_GPR_RSP, 0x00000000FFFFFFFF }, /* ESP */
1544 			[4] = { -1, 0 },
1545 			[5] = { -1, 0 },
1546 			[6] = { -1, 0 },
1547 			[7] = { -1, 0 },
1548 		},
1549 		[0b01] = {
1550 			[0] = { NVMM_X64_GPR_RCX, 0x000000000000FF00 }, /* CH */
1551 			[1] = { NVMM_X64_GPR_RBP, 0x000000000000FFFF }, /* BP */
1552 			[2] = { -1, 0 },
1553 			[3] = { NVMM_X64_GPR_RBP, 0x00000000FFFFFFFF },	/* EBP */
1554 			[4] = { -1, 0 },
1555 			[5] = { -1, 0 },
1556 			[6] = { -1, 0 },
1557 			[7] = { -1, 0 },
1558 		},
1559 		[0b10] = {
1560 			[0] = { NVMM_X64_GPR_RDX, 0x000000000000FF00 }, /* DH */
1561 			[1] = { NVMM_X64_GPR_RSI, 0x000000000000FFFF }, /* SI */
1562 			[2] = { -1, 0 },
1563 			[3] = { NVMM_X64_GPR_RSI, 0x00000000FFFFFFFF }, /* ESI */
1564 			[4] = { -1, 0 },
1565 			[5] = { -1, 0 },
1566 			[6] = { -1, 0 },
1567 			[7] = { -1, 0 },
1568 		},
1569 		[0b11] = {
1570 			[0] = { NVMM_X64_GPR_RBX, 0x000000000000FF00 }, /* BH */
1571 			[1] = { NVMM_X64_GPR_RDI, 0x000000000000FFFF }, /* DI */
1572 			[2] = { -1, 0 },
1573 			[3] = { NVMM_X64_GPR_RDI, 0x00000000FFFFFFFF }, /* EDI */
1574 			[4] = { -1, 0 },
1575 			[5] = { -1, 0 },
1576 			[6] = { -1, 0 },
1577 			[7] = { -1, 0 },
1578 		}
1579 	},
1580 	[true] = {
1581 		/* Has REX prefix. */
1582 		[0b00] = {
1583 			[0] = { NVMM_X64_GPR_RSP, 0x00000000000000FF }, /* SPL */
1584 			[1] = { NVMM_X64_GPR_RSP, 0x000000000000FFFF }, /* SP */
1585 			[2] = { -1, 0 },
1586 			[3] = { NVMM_X64_GPR_RSP, 0x00000000FFFFFFFF }, /* ESP */
1587 			[4] = { -1, 0 },
1588 			[5] = { -1, 0 },
1589 			[6] = { -1, 0 },
1590 			[7] = { NVMM_X64_GPR_RSP, 0xFFFFFFFFFFFFFFFF }, /* RSP */
1591 		},
1592 		[0b01] = {
1593 			[0] = { NVMM_X64_GPR_RBP, 0x00000000000000FF }, /* BPL */
1594 			[1] = { NVMM_X64_GPR_RBP, 0x000000000000FFFF }, /* BP */
1595 			[2] = { -1, 0 },
1596 			[3] = { NVMM_X64_GPR_RBP, 0x00000000FFFFFFFF }, /* EBP */
1597 			[4] = { -1, 0 },
1598 			[5] = { -1, 0 },
1599 			[6] = { -1, 0 },
1600 			[7] = { NVMM_X64_GPR_RBP, 0xFFFFFFFFFFFFFFFF }, /* RBP */
1601 		},
1602 		[0b10] = {
1603 			[0] = { NVMM_X64_GPR_RSI, 0x00000000000000FF }, /* SIL */
1604 			[1] = { NVMM_X64_GPR_RSI, 0x000000000000FFFF }, /* SI */
1605 			[2] = { -1, 0 },
1606 			[3] = { NVMM_X64_GPR_RSI, 0x00000000FFFFFFFF }, /* ESI */
1607 			[4] = { -1, 0 },
1608 			[5] = { -1, 0 },
1609 			[6] = { -1, 0 },
1610 			[7] = { NVMM_X64_GPR_RSI, 0xFFFFFFFFFFFFFFFF }, /* RSI */
1611 		},
1612 		[0b11] = {
1613 			[0] = { NVMM_X64_GPR_RDI, 0x00000000000000FF }, /* DIL */
1614 			[1] = { NVMM_X64_GPR_RDI, 0x000000000000FFFF }, /* DI */
1615 			[2] = { -1, 0 },
1616 			[3] = { NVMM_X64_GPR_RDI, 0x00000000FFFFFFFF }, /* EDI */
1617 			[4] = { -1, 0 },
1618 			[5] = { -1, 0 },
1619 			[6] = { -1, 0 },
1620 			[7] = { NVMM_X64_GPR_RDI, 0xFFFFFFFFFFFFFFFF }, /* RDI */
1621 		}
1622 	}
1623 };
1624 
1625 /* [depends][enc][size] */
1626 static const struct x86_reg gpr_map[2][8][8] __cacheline_aligned = {
1627 	[false] = {
1628 		/* Not extended. */
1629 		[0b000] = {
1630 			[0] = { NVMM_X64_GPR_RAX, 0x00000000000000FF }, /* AL */
1631 			[1] = { NVMM_X64_GPR_RAX, 0x000000000000FFFF }, /* AX */
1632 			[2] = { -1, 0 },
1633 			[3] = { NVMM_X64_GPR_RAX, 0x00000000FFFFFFFF }, /* EAX */
1634 			[4] = { -1, 0 },
1635 			[5] = { -1, 0 },
1636 			[6] = { -1, 0 },
1637 			[7] = { NVMM_X64_GPR_RAX, 0xFFFFFFFFFFFFFFFF }, /* RAX */
1638 		},
1639 		[0b001] = {
1640 			[0] = { NVMM_X64_GPR_RCX, 0x00000000000000FF }, /* CL */
1641 			[1] = { NVMM_X64_GPR_RCX, 0x000000000000FFFF }, /* CX */
1642 			[2] = { -1, 0 },
1643 			[3] = { NVMM_X64_GPR_RCX, 0x00000000FFFFFFFF }, /* ECX */
1644 			[4] = { -1, 0 },
1645 			[5] = { -1, 0 },
1646 			[6] = { -1, 0 },
1647 			[7] = { NVMM_X64_GPR_RCX, 0xFFFFFFFFFFFFFFFF }, /* RCX */
1648 		},
1649 		[0b010] = {
1650 			[0] = { NVMM_X64_GPR_RDX, 0x00000000000000FF }, /* DL */
1651 			[1] = { NVMM_X64_GPR_RDX, 0x000000000000FFFF }, /* DX */
1652 			[2] = { -1, 0 },
1653 			[3] = { NVMM_X64_GPR_RDX, 0x00000000FFFFFFFF }, /* EDX */
1654 			[4] = { -1, 0 },
1655 			[5] = { -1, 0 },
1656 			[6] = { -1, 0 },
1657 			[7] = { NVMM_X64_GPR_RDX, 0xFFFFFFFFFFFFFFFF }, /* RDX */
1658 		},
1659 		[0b011] = {
1660 			[0] = { NVMM_X64_GPR_RBX, 0x00000000000000FF }, /* BL */
1661 			[1] = { NVMM_X64_GPR_RBX, 0x000000000000FFFF }, /* BX */
1662 			[2] = { -1, 0 },
1663 			[3] = { NVMM_X64_GPR_RBX, 0x00000000FFFFFFFF }, /* EBX */
1664 			[4] = { -1, 0 },
1665 			[5] = { -1, 0 },
1666 			[6] = { -1, 0 },
1667 			[7] = { NVMM_X64_GPR_RBX, 0xFFFFFFFFFFFFFFFF }, /* RBX */
1668 		},
1669 		[0b100] = {
1670 			[0] = { -1, 0 }, /* SPECIAL */
1671 			[1] = { -1, 0 }, /* SPECIAL */
1672 			[2] = { -1, 0 },
1673 			[3] = { -1, 0 }, /* SPECIAL */
1674 			[4] = { -1, 0 },
1675 			[5] = { -1, 0 },
1676 			[6] = { -1, 0 },
1677 			[7] = { -1, 0 }, /* SPECIAL */
1678 		},
1679 		[0b101] = {
1680 			[0] = { -1, 0 }, /* SPECIAL */
1681 			[1] = { -1, 0 }, /* SPECIAL */
1682 			[2] = { -1, 0 },
1683 			[3] = { -1, 0 }, /* SPECIAL */
1684 			[4] = { -1, 0 },
1685 			[5] = { -1, 0 },
1686 			[6] = { -1, 0 },
1687 			[7] = { -1, 0 }, /* SPECIAL */
1688 		},
1689 		[0b110] = {
1690 			[0] = { -1, 0 }, /* SPECIAL */
1691 			[1] = { -1, 0 }, /* SPECIAL */
1692 			[2] = { -1, 0 },
1693 			[3] = { -1, 0 }, /* SPECIAL */
1694 			[4] = { -1, 0 },
1695 			[5] = { -1, 0 },
1696 			[6] = { -1, 0 },
1697 			[7] = { -1, 0 }, /* SPECIAL */
1698 		},
1699 		[0b111] = {
1700 			[0] = { -1, 0 }, /* SPECIAL */
1701 			[1] = { -1, 0 }, /* SPECIAL */
1702 			[2] = { -1, 0 },
1703 			[3] = { -1, 0 }, /* SPECIAL */
1704 			[4] = { -1, 0 },
1705 			[5] = { -1, 0 },
1706 			[6] = { -1, 0 },
1707 			[7] = { -1, 0 }, /* SPECIAL */
1708 		},
1709 	},
1710 	[true] = {
1711 		/* Extended. */
1712 		[0b000] = {
1713 			[0] = { NVMM_X64_GPR_R8, 0x00000000000000FF }, /* R8B */
1714 			[1] = { NVMM_X64_GPR_R8, 0x000000000000FFFF }, /* R8W */
1715 			[2] = { -1, 0 },
1716 			[3] = { NVMM_X64_GPR_R8, 0x00000000FFFFFFFF }, /* R8D */
1717 			[4] = { -1, 0 },
1718 			[5] = { -1, 0 },
1719 			[6] = { -1, 0 },
1720 			[7] = { NVMM_X64_GPR_R8, 0xFFFFFFFFFFFFFFFF }, /* R8 */
1721 		},
1722 		[0b001] = {
1723 			[0] = { NVMM_X64_GPR_R9, 0x00000000000000FF }, /* R9B */
1724 			[1] = { NVMM_X64_GPR_R9, 0x000000000000FFFF }, /* R9W */
1725 			[2] = { -1, 0 },
1726 			[3] = { NVMM_X64_GPR_R9, 0x00000000FFFFFFFF }, /* R9D */
1727 			[4] = { -1, 0 },
1728 			[5] = { -1, 0 },
1729 			[6] = { -1, 0 },
1730 			[7] = { NVMM_X64_GPR_R9, 0xFFFFFFFFFFFFFFFF }, /* R9 */
1731 		},
1732 		[0b010] = {
1733 			[0] = { NVMM_X64_GPR_R10, 0x00000000000000FF }, /* R10B */
1734 			[1] = { NVMM_X64_GPR_R10, 0x000000000000FFFF }, /* R10W */
1735 			[2] = { -1, 0 },
1736 			[3] = { NVMM_X64_GPR_R10, 0x00000000FFFFFFFF }, /* R10D */
1737 			[4] = { -1, 0 },
1738 			[5] = { -1, 0 },
1739 			[6] = { -1, 0 },
1740 			[7] = { NVMM_X64_GPR_R10, 0xFFFFFFFFFFFFFFFF }, /* R10 */
1741 		},
1742 		[0b011] = {
1743 			[0] = { NVMM_X64_GPR_R11, 0x00000000000000FF }, /* R11B */
1744 			[1] = { NVMM_X64_GPR_R11, 0x000000000000FFFF }, /* R11W */
1745 			[2] = { -1, 0 },
1746 			[3] = { NVMM_X64_GPR_R11, 0x00000000FFFFFFFF }, /* R11D */
1747 			[4] = { -1, 0 },
1748 			[5] = { -1, 0 },
1749 			[6] = { -1, 0 },
1750 			[7] = { NVMM_X64_GPR_R11, 0xFFFFFFFFFFFFFFFF }, /* R11 */
1751 		},
1752 		[0b100] = {
1753 			[0] = { NVMM_X64_GPR_R12, 0x00000000000000FF }, /* R12B */
1754 			[1] = { NVMM_X64_GPR_R12, 0x000000000000FFFF }, /* R12W */
1755 			[2] = { -1, 0 },
1756 			[3] = { NVMM_X64_GPR_R12, 0x00000000FFFFFFFF }, /* R12D */
1757 			[4] = { -1, 0 },
1758 			[5] = { -1, 0 },
1759 			[6] = { -1, 0 },
1760 			[7] = { NVMM_X64_GPR_R12, 0xFFFFFFFFFFFFFFFF }, /* R12 */
1761 		},
1762 		[0b101] = {
1763 			[0] = { NVMM_X64_GPR_R13, 0x00000000000000FF }, /* R13B */
1764 			[1] = { NVMM_X64_GPR_R13, 0x000000000000FFFF }, /* R13W */
1765 			[2] = { -1, 0 },
1766 			[3] = { NVMM_X64_GPR_R13, 0x00000000FFFFFFFF }, /* R13D */
1767 			[4] = { -1, 0 },
1768 			[5] = { -1, 0 },
1769 			[6] = { -1, 0 },
1770 			[7] = { NVMM_X64_GPR_R13, 0xFFFFFFFFFFFFFFFF }, /* R13 */
1771 		},
1772 		[0b110] = {
1773 			[0] = { NVMM_X64_GPR_R14, 0x00000000000000FF }, /* R14B */
1774 			[1] = { NVMM_X64_GPR_R14, 0x000000000000FFFF }, /* R14W */
1775 			[2] = { -1, 0 },
1776 			[3] = { NVMM_X64_GPR_R14, 0x00000000FFFFFFFF }, /* R14D */
1777 			[4] = { -1, 0 },
1778 			[5] = { -1, 0 },
1779 			[6] = { -1, 0 },
1780 			[7] = { NVMM_X64_GPR_R14, 0xFFFFFFFFFFFFFFFF }, /* R14 */
1781 		},
1782 		[0b111] = {
1783 			[0] = { NVMM_X64_GPR_R15, 0x00000000000000FF }, /* R15B */
1784 			[1] = { NVMM_X64_GPR_R15, 0x000000000000FFFF }, /* R15W */
1785 			[2] = { -1, 0 },
1786 			[3] = { NVMM_X64_GPR_R15, 0x00000000FFFFFFFF }, /* R15D */
1787 			[4] = { -1, 0 },
1788 			[5] = { -1, 0 },
1789 			[6] = { -1, 0 },
1790 			[7] = { NVMM_X64_GPR_R15, 0xFFFFFFFFFFFFFFFF }, /* R15 */
1791 		},
1792 	}
1793 };
1794 
1795 /* [enc] */
1796 static const int gpr_dual_reg1_rm[8] __cacheline_aligned = {
1797 	[0b000] = NVMM_X64_GPR_RBX, /* BX (+SI) */
1798 	[0b001] = NVMM_X64_GPR_RBX, /* BX (+DI) */
1799 	[0b010] = NVMM_X64_GPR_RBP, /* BP (+SI) */
1800 	[0b011] = NVMM_X64_GPR_RBP, /* BP (+DI) */
1801 	[0b100] = NVMM_X64_GPR_RSI, /* SI */
1802 	[0b101] = NVMM_X64_GPR_RDI, /* DI */
1803 	[0b110] = NVMM_X64_GPR_RBP, /* BP */
1804 	[0b111] = NVMM_X64_GPR_RBX, /* BX */
1805 };
1806 
1807 static int
1808 node_overflow(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1809 {
1810 	fsm->fn = NULL;
1811 	return -1;
1812 }
1813 
1814 static int
1815 fsm_read(struct x86_decode_fsm *fsm, uint8_t *bytes, size_t n)
1816 {
1817 	if (fsm->buf + n > fsm->end) {
1818 		return -1;
1819 	}
1820 	memcpy(bytes, fsm->buf, n);
1821 	return 0;
1822 }
1823 
1824 static inline void
1825 fsm_advance(struct x86_decode_fsm *fsm, size_t n,
1826     int (*fn)(struct x86_decode_fsm *, struct x86_instr *))
1827 {
1828 	fsm->buf += n;
1829 	if (fsm->buf > fsm->end) {
1830 		fsm->fn = node_overflow;
1831 	} else {
1832 		fsm->fn = fn;
1833 	}
1834 }
1835 
1836 static const struct x86_reg *
1837 resolve_special_register(struct x86_instr *instr, uint8_t enc, size_t regsize)
1838 {
1839 	enc &= 0b11;
1840 	if (regsize == 8) {
1841 		/* Can be a 64bit register even without a REX prefix. */
1842 		return &gpr_map__special[1][enc][regsize-1];
1843 	}
1844 	return &gpr_map__special[instr->rexpref.present][enc][regsize-1];
1845 }
1846 
1847 /*
1848  * Special node, for MOVS. Fake two displacements of zero on the source and
1849  * destination registers.
1850  */
1851 static int
1852 node_movs(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1853 {
1854 	size_t adrsize;
1855 
1856 	adrsize = instr->address_size;
1857 
1858 	/* DS:RSI */
1859 	instr->src.type = STORE_REG;
1860 	instr->src.u.reg = &gpr_map__special[1][2][adrsize-1];
1861 	instr->src.disp.type = DISP_0;
1862 
1863 	/* ES:RDI, force ES */
1864 	instr->dst.type = STORE_REG;
1865 	instr->dst.u.reg = &gpr_map__special[1][3][adrsize-1];
1866 	instr->dst.disp.type = DISP_0;
1867 	instr->dst.hardseg = NVMM_X64_SEG_ES;
1868 
1869 	fsm_advance(fsm, 0, NULL);
1870 
1871 	return 0;
1872 }
1873 
1874 /*
1875  * Special node, for STOS and LODS. Fake a displacement of zero on the
1876  * string register: the destination for STOS, the source for LODS.
1877  */
1878 static int
1879 node_stlo(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1880 {
1881 	const struct x86_opcode *opcode = instr->opcode;
1882 	struct x86_store *stlo, *streg;
1883 	size_t adrsize, regsize;
1884 
1885 	adrsize = instr->address_size;
1886 	regsize = instr->operand_size;
1887 
1888 	if (opcode->stos) {
1889 		streg = &instr->src;
1890 		stlo = &instr->dst;
1891 	} else {
1892 		streg = &instr->dst;
1893 		stlo = &instr->src;
1894 	}
1895 
1896 	streg->type = STORE_REG;
1897 	streg->u.reg = &gpr_map[0][0][regsize-1]; /* ?AX */
1898 
1899 	stlo->type = STORE_REG;
1900 	if (opcode->stos) {
1901 		/* ES:RDI, force ES */
1902 		stlo->u.reg = &gpr_map__special[1][3][adrsize-1];
1903 		stlo->hardseg = NVMM_X64_SEG_ES;
1904 	} else {
1905 		/* DS:RSI */
1906 		stlo->u.reg = &gpr_map__special[1][2][adrsize-1];
1907 	}
1908 	stlo->disp.type = DISP_0;
1909 
1910 	fsm_advance(fsm, 0, NULL);
1911 
1912 	return 0;
1913 }
1914 
1915 static int
1916 node_dmo(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1917 {
1918 	const struct x86_opcode *opcode = instr->opcode;
1919 	struct x86_store *stdmo, *streg;
1920 	size_t adrsize, regsize;
1921 
1922 	adrsize = instr->address_size;
1923 	regsize = instr->operand_size;
1924 
1925 	if (opcode->todmo) {
1926 		streg = &instr->src;
1927 		stdmo = &instr->dst;
1928 	} else {
1929 		streg = &instr->dst;
1930 		stdmo = &instr->src;
1931 	}
1932 
1933 	streg->type = STORE_REG;
1934 	streg->u.reg = &gpr_map[0][0][regsize-1]; /* ?AX */
1935 
1936 	stdmo->type = STORE_DMO;
1937 	if (fsm_read(fsm, (uint8_t *)&stdmo->u.dmo, adrsize) == -1) {
1938 		return -1;
1939 	}
1940 	fsm_advance(fsm, adrsize, NULL);
1941 
1942 	return 0;
1943 }
1944 
1945 static inline uint64_t
1946 sign_extend(uint64_t val, int size)
1947 {
1948 	if (size == 1) {
1949 		if (val & __BIT(7))
1950 			val |= 0xFFFFFFFFFFFFFF00;
1951 	} else if (size == 2) {
1952 		if (val & __BIT(15))
1953 			val |= 0xFFFFFFFFFFFF0000;
1954 	} else if (size == 4) {
1955 		if (val & __BIT(31))
1956 			val |= 0xFFFFFFFF00000000;
1957 	}
1958 	return val;
1959 }
1960 
1961 static int
1962 node_immediate(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1963 {
1964 	const struct x86_opcode *opcode = instr->opcode;
1965 	struct x86_store *store;
1966 	uint8_t immsize;
1967 	size_t sesize = 0;
1968 
1969 	/* The immediate is the source */
1970 	store = &instr->src;
1971 	immsize = instr->operand_size;
1972 
1973 	if (opcode->flags & FLAG_imm8) {
1974 		sesize = immsize;
1975 		immsize = 1;
1976 	} else if ((opcode->flags & FLAG_immz) && (immsize == 8)) {
1977 		sesize = immsize;
1978 		immsize = 4;
1979 	}
1980 
1981 	store->type = STORE_IMM;
1982 	if (fsm_read(fsm, (uint8_t *)&store->u.imm.data, immsize) == -1) {
1983 		return -1;
1984 	}
1985 	fsm_advance(fsm, immsize, NULL);
1986 
1987 	if (sesize != 0) {
1988 		store->u.imm.data = sign_extend(store->u.imm.data, sesize);
1989 	}
1990 
1991 	return 0;
1992 }
1993 
1994 static int
1995 node_disp(struct x86_decode_fsm *fsm, struct x86_instr *instr)
1996 {
1997 	const struct x86_opcode *opcode = instr->opcode;
1998 	uint64_t data = 0;
1999 	size_t n;
2000 
2001 	if (instr->strm->disp.type == DISP_1) {
2002 		n = 1;
2003 	} else if (instr->strm->disp.type == DISP_2) {
2004 		n = 2;
2005 	} else if (instr->strm->disp.type == DISP_4) {
2006 		n = 4;
2007 	} else {
2008 		DISASSEMBLER_BUG();
2009 	}
2010 
2011 	if (fsm_read(fsm, (uint8_t *)&data, n) == -1) {
2012 		return -1;
2013 	}
2014 
2015 	if (__predict_true(fsm->is64bit)) {
2016 		data = sign_extend(data, n);
2017 	}
2018 
2019 	instr->strm->disp.data = data;
2020 
2021 	if (opcode->immediate) {
2022 		fsm_advance(fsm, n, node_immediate);
2023 	} else {
2024 		fsm_advance(fsm, n, NULL);
2025 	}
2026 
2027 	return 0;
2028 }
2029 
2030 /*
2031  * Special node to handle 16bit addressing encoding, which can reference two
2032  * registers at once.
2033  */
2034 static int
2035 node_dual(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2036 {
2037 	int reg1, reg2;
2038 
2039 	reg1 = gpr_dual_reg1_rm[instr->regmodrm.rm];
2040 
2041 	if (instr->regmodrm.rm == 0b000 ||
2042 	    instr->regmodrm.rm == 0b010) {
2043 		reg2 = NVMM_X64_GPR_RSI;
2044 	} else if (instr->regmodrm.rm == 0b001 ||
2045 	    instr->regmodrm.rm == 0b011) {
2046 		reg2 = NVMM_X64_GPR_RDI;
2047 	} else {
2048 		DISASSEMBLER_BUG();
2049 	}
2050 
2051 	instr->strm->type = STORE_DUALREG;
2052 	instr->strm->u.dualreg.reg1 = reg1;
2053 	instr->strm->u.dualreg.reg2 = reg2;
2054 
2055 	if (instr->strm->disp.type == DISP_NONE) {
2056 		DISASSEMBLER_BUG();
2057 	} else if (instr->strm->disp.type == DISP_0) {
2058 		/* Indirect register addressing mode */
2059 		if (instr->opcode->immediate) {
2060 			fsm_advance(fsm, 1, node_immediate);
2061 		} else {
2062 			fsm_advance(fsm, 1, NULL);
2063 		}
2064 	} else {
2065 		fsm_advance(fsm, 1, node_disp);
2066 	}
2067 
2068 	return 0;
2069 }
2070 
2071 static const struct x86_reg *
2072 get_register_idx(struct x86_instr *instr, uint8_t index)
2073 {
2074 	uint8_t enc = index;
2075 	const struct x86_reg *reg;
2076 	size_t regsize;
2077 
2078 	regsize = instr->address_size;
2079 	reg = &gpr_map[instr->rexpref.x][enc][regsize-1];
2080 
2081 	if (reg->num == -1) {
2082 		reg = resolve_special_register(instr, enc, regsize);
2083 	}
2084 
2085 	return reg;
2086 }
2087 
2088 static const struct x86_reg *
2089 get_register_bas(struct x86_instr *instr, uint8_t base)
2090 {
2091 	uint8_t enc = base;
2092 	const struct x86_reg *reg;
2093 	size_t regsize;
2094 
2095 	regsize = instr->address_size;
2096 	reg = &gpr_map[instr->rexpref.b][enc][regsize-1];
2097 	if (reg->num == -1) {
2098 		reg = resolve_special_register(instr, enc, regsize);
2099 	}
2100 
2101 	return reg;
2102 }
2103 
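/*
 * Decode a SIB byte: scale in bits 7:6, index in bits 5:3, base in bits
 * 2:0. For example "89 4C 24 08" ("mov [rsp+0x8], ecx") has ModRM=0x4C
 * (mod=0b01, rm=0b100 -> SIB follows) and SIB=0x24: scale=1, no index
 * (0b100 without REX.X), base=RSP, followed by disp8=0x08.
 */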
2104 static int
2105 node_sib(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2106 {
2107 	const struct x86_opcode *opcode;
2108 	uint8_t scale, index, base;
2109 	bool noindex, nobase;
2110 	uint8_t byte;
2111 
2112 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2113 		return -1;
2114 	}
2115 
2116 	scale = ((byte & 0b11000000) >> 6);
2117 	index = ((byte & 0b00111000) >> 3);
2118 	base  = ((byte & 0b00000111) >> 0);
2119 
2120 	opcode = instr->opcode;
2121 
2122 	noindex = false;
2123 	nobase = false;
2124 
2125 	if (index == 0b100 && !instr->rexpref.x) {
2126 		/* Special case: the index is null */
2127 		noindex = true;
2128 	}
2129 
2130 	if (instr->regmodrm.mod == 0b00 && base == 0b101) {
2131 		/* Special case: the base is null + disp32 */
2132 		instr->strm->disp.type = DISP_4;
2133 		nobase = true;
2134 	}
2135 
2136 	instr->strm->type = STORE_SIB;
2137 	instr->strm->u.sib.scale = (1 << scale);
2138 	if (!noindex)
2139 		instr->strm->u.sib.idx = get_register_idx(instr, index);
2140 	if (!nobase)
2141 		instr->strm->u.sib.bas = get_register_bas(instr, base);
2142 
2143 	/* May have a displacement, or an immediate */
2144 	if (instr->strm->disp.type == DISP_1 ||
2145 	    instr->strm->disp.type == DISP_2 ||
2146 	    instr->strm->disp.type == DISP_4) {
2147 		fsm_advance(fsm, 1, node_disp);
2148 	} else if (opcode->immediate) {
2149 		fsm_advance(fsm, 1, node_immediate);
2150 	} else {
2151 		fsm_advance(fsm, 1, NULL);
2152 	}
2153 
2154 	return 0;
2155 }
2156 
2157 static const struct x86_reg *
2158 get_register_reg(struct x86_instr *instr, const struct x86_opcode *opcode)
2159 {
2160 	uint8_t enc = instr->regmodrm.reg;
2161 	const struct x86_reg *reg;
2162 	size_t regsize;
2163 
2164 	regsize = instr->operand_size;
2165 
2166 	reg = &gpr_map[instr->rexpref.r][enc][regsize-1];
2167 	if (reg->num == -1) {
2168 		reg = resolve_special_register(instr, enc, regsize);
2169 	}
2170 
2171 	return reg;
2172 }
2173 
2174 static const struct x86_reg *
2175 get_register_rm(struct x86_instr *instr, const struct x86_opcode *opcode)
2176 {
2177 	uint8_t enc = instr->regmodrm.rm;
2178 	const struct x86_reg *reg;
2179 	size_t regsize;
2180 
2181 	if (instr->strm->disp.type == DISP_NONE) {
2182 		regsize = instr->operand_size;
2183 	} else {
2184 		/* Indirect access; the size is that of the address. */
2185 		regsize = instr->address_size;
2186 	}
2187 
2188 	reg = &gpr_map[instr->rexpref.b][enc][regsize-1];
2189 	if (reg->num == -1) {
2190 		reg = resolve_special_register(instr, enc, regsize);
2191 	}
2192 
2193 	return reg;
2194 }
2195 
2196 static inline bool
2197 has_sib(struct x86_instr *instr)
2198 {
2199 	return (instr->address_size != 2 && /* no SIB in 16bit addressing */
2200 	    instr->regmodrm.mod != 0b11 &&
2201 	    instr->regmodrm.rm == 0b100);
2202 }
2203 
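/*
 * RIP-relative addressing exists only in 64bit mode: mod=0b00 with rm=0b101
 * means [RIP + disp32]. For instance, "mov 0x1000(%rip),%eax" encodes as
 * 8B 05 00 10 00 00 (illustrative, disp32 in little-endian).
 */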
2204 static inline bool
2205 is_rip_relative(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2206 {
2207 	return (fsm->is64bit && /* RIP-relative only in 64bit mode */
2208 	    instr->regmodrm.mod == 0b00 &&
2209 	    instr->regmodrm.rm == 0b101);
2210 }
2211 
2212 static inline bool
2213 is_disp32_only(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2214 {
2215 	return (!fsm->is64bit && /* no disp32-only in 64bit mode */
2216 	    instr->address_size != 2 && /* no disp32-only in 16bit addressing */
2217 	    instr->regmodrm.mod == 0b00 &&
2218 	    instr->regmodrm.rm == 0b101);
2219 }
2220 
2221 static inline bool
2222 is_disp16_only(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2223 {
2224 	return (instr->address_size == 2 && /* disp16-only only in 16bit addr */
2225 	    instr->regmodrm.mod == 0b00 &&
2226 	    instr->regmodrm.rm == 0b110);
2227 }
2228 
2229 static inline bool
2230 is_dual(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2231 {
2232 	return (instr->address_size == 2 &&
2233 	    instr->regmodrm.mod != 0b11 &&
2234 	    instr->regmodrm.rm <= 0b011);
2235 }
2236 
2237 static enum x86_disp_type
2238 get_disp_type(struct x86_instr *instr)
2239 {
2240 	switch (instr->regmodrm.mod) {
2241 	case 0b00:	/* indirect */
2242 		return DISP_0;
2243 	case 0b01:	/* indirect+1 */
2244 		return DISP_1;
2245 	case 0b10:	/* indirect+{2,4} */
2246 		if (__predict_false(instr->address_size == 2)) {
2247 			return DISP_2;
2248 		}
2249 		return DISP_4;
2250 	case 0b11:	/* direct */
2251 	default:	/* llvm */
2252 		return DISP_NONE;
2253 	}
2254 	__unreachable();
2255 }
2256 
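/*
 * ModRM node: the byte is laid out as [mod:2][reg:3][rm:3]. As an example,
 * 0xC3 decodes to mod=0b11, reg=0b000, rm=0b011, i.e. a direct access to the
 * RBX/EBX/BX/BL register depending on the operand size.
 */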
2257 static int
2258 node_regmodrm(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2259 {
2260 	struct x86_store *strg, *strm;
2261 	const struct x86_opcode *opcode;
2262 	const struct x86_reg *reg;
2263 	uint8_t byte;
2264 
2265 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2266 		return -1;
2267 	}
2268 
2269 	opcode = instr->opcode;
2270 
2271 	instr->regmodrm.rm  = ((byte & 0b00000111) >> 0);
2272 	instr->regmodrm.reg = ((byte & 0b00111000) >> 3);
2273 	instr->regmodrm.mod = ((byte & 0b11000000) >> 6);
2274 
2275 	if (opcode->regtorm) {
2276 		strg = &instr->src;
2277 		strm = &instr->dst;
2278 	} else { /* RM to REG */
2279 		strm = &instr->src;
2280 		strg = &instr->dst;
2281 	}
2282 
2283 	/* Save for later use. */
2284 	instr->strm = strm;
2285 
2286 	/*
2287 	 * Special cases: Groups. The REG field of REGMODRM is the index in
2288 	 * the group. op1 gets overwritten in the Immediate node, if any.
2289 	 */
2290 	if (opcode->group1) {
2291 		if (group1[instr->regmodrm.reg].emul == NULL) {
2292 			return -1;
2293 		}
2294 		instr->emul = group1[instr->regmodrm.reg].emul;
2295 	} else if (opcode->group3) {
2296 		if (group3[instr->regmodrm.reg].emul == NULL) {
2297 			return -1;
2298 		}
2299 		instr->emul = group3[instr->regmodrm.reg].emul;
2300 	} else if (opcode->group11) {
2301 		if (group11[instr->regmodrm.reg].emul == NULL) {
2302 			return -1;
2303 		}
2304 		instr->emul = group11[instr->regmodrm.reg].emul;
2305 	}
2306 
2307 	if (!opcode->immediate) {
2308 		reg = get_register_reg(instr, opcode);
2309 		if (reg == NULL) {
2310 			return -1;
2311 		}
2312 		strg->type = STORE_REG;
2313 		strg->u.reg = reg;
2314 	}
2315 
2316 	/* The displacement applies to RM. */
2317 	strm->disp.type = get_disp_type(instr);
2318 
2319 	if (has_sib(instr)) {
2320 		/* Overwrites RM */
2321 		fsm_advance(fsm, 1, node_sib);
2322 		return 0;
2323 	}
2324 
2325 	if (is_rip_relative(fsm, instr)) {
2326 		/* Overwrites RM */
2327 		strm->type = STORE_REG;
2328 		strm->u.reg = &gpr_map__rip;
2329 		strm->disp.type = DISP_4;
2330 		fsm_advance(fsm, 1, node_disp);
2331 		return 0;
2332 	}
2333 
2334 	if (is_disp32_only(fsm, instr)) {
2335 		/* Overwrites RM */
2336 		strm->type = STORE_REG;
2337 		strm->u.reg = NULL;
2338 		strm->disp.type = DISP_4;
2339 		fsm_advance(fsm, 1, node_disp);
2340 		return 0;
2341 	}
2342 
2343 	if (__predict_false(is_disp16_only(fsm, instr))) {
2344 		/* Overwrites RM */
2345 		strm->type = STORE_REG;
2346 		strm->u.reg = NULL;
2347 		strm->disp.type = DISP_2;
2348 		fsm_advance(fsm, 1, node_disp);
2349 		return 0;
2350 	}
2351 
2352 	if (__predict_false(is_dual(fsm, instr))) {
2353 		/* Overwrites RM */
2354 		fsm_advance(fsm, 0, node_dual);
2355 		return 0;
2356 	}
2357 
2358 	reg = get_register_rm(instr, opcode);
2359 	if (reg == NULL) {
2360 		return -1;
2361 	}
2362 	strm->type = STORE_REG;
2363 	strm->u.reg = reg;
2364 
2365 	if (strm->disp.type == DISP_NONE) {
2366 		/* Direct register addressing mode */
2367 		if (opcode->immediate) {
2368 			fsm_advance(fsm, 1, node_immediate);
2369 		} else {
2370 			fsm_advance(fsm, 1, NULL);
2371 		}
2372 	} else if (strm->disp.type == DISP_0) {
2373 		/* Indirect register addressing mode */
2374 		if (opcode->immediate) {
2375 			fsm_advance(fsm, 1, node_immediate);
2376 		} else {
2377 			fsm_advance(fsm, 1, NULL);
2378 		}
2379 	} else {
2380 		fsm_advance(fsm, 1, node_disp);
2381 	}
2382 
2383 	return 0;
2384 }
2385 
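/*
 * Operand size selection, following the usual x86 rules: opcodes without a
 * size override use their default size, REX.W forces 8 bytes, and otherwise
 * the size is 4 bytes (2 in 16bit mode), flipped by the operand-size
 * override prefix.
 */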
2386 static size_t
2387 get_operand_size(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2388 {
2389 	const struct x86_opcode *opcode = instr->opcode;
2390 	int opsize;
2391 
2392 	/* Get the opsize */
2393 	if (!opcode->szoverride) {
2394 		opsize = opcode->defsize;
2395 	} else if (instr->rexpref.present && instr->rexpref.w) {
2396 		opsize = 8;
2397 	} else {
2398 		if (!fsm->is16bit) {
2399 			if (instr->legpref.opr_ovr) {
2400 				opsize = 2;
2401 			} else {
2402 				opsize = 4;
2403 			}
2404 		} else { /* 16bit */
2405 			if (instr->legpref.opr_ovr) {
2406 				opsize = 4;
2407 			} else {
2408 				opsize = 2;
2409 			}
2410 		}
2411 	}
2412 
2413 	return opsize;
2414 }
2415 
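/*
 * Address size selection: 8 bytes in 64bit mode, 4 in 32bit mode, 2 in 16bit
 * mode; the address-size override prefix selects the alternate size (4, 2
 * and 4 bytes respectively).
 */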
2416 static size_t
2417 get_address_size(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2418 {
2419 	if (fsm->is64bit) {
2420 		if (__predict_false(instr->legpref.adr_ovr)) {
2421 			return 4;
2422 		}
2423 		return 8;
2424 	}
2425 
2426 	if (fsm->is32bit) {
2427 		if (__predict_false(instr->legpref.adr_ovr)) {
2428 			return 2;
2429 		}
2430 		return 4;
2431 	}
2432 
2433 	/* 16bit. */
2434 	if (__predict_false(instr->legpref.adr_ovr)) {
2435 		return 4;
2436 	}
2437 	return 2;
2438 }
2439 
2440 static int
2441 node_primary_opcode(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2442 {
2443 	const struct x86_opcode *opcode;
2444 	uint8_t byte;
2445 
2446 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2447 		return -1;
2448 	}
2449 
2450 	opcode = &primary_opcode_table[byte];
2451 	if (__predict_false(!opcode->valid)) {
2452 		return -1;
2453 	}
2454 
2455 	instr->opcode = opcode;
2456 	instr->emul = opcode->emul;
2457 	instr->operand_size = get_operand_size(fsm, instr);
2458 	instr->address_size = get_address_size(fsm, instr);
2459 
2460 	if (fsm->is64bit && (instr->operand_size == 4)) {
2461 		/* Zero-extend to 64 bits. */
2462 		instr->zeroextend_mask = ~size_to_mask(4);
2463 	}
2464 
2465 	if (opcode->regmodrm) {
2466 		fsm_advance(fsm, 1, node_regmodrm);
2467 	} else if (opcode->dmo) {
2468 		/* Direct-Memory Offsets */
2469 		fsm_advance(fsm, 1, node_dmo);
2470 	} else if (opcode->stos || opcode->lods) {
2471 		fsm_advance(fsm, 1, node_stlo);
2472 	} else if (opcode->movs) {
2473 		fsm_advance(fsm, 1, node_movs);
2474 	} else {
2475 		return -1;
2476 	}
2477 
2478 	return 0;
2479 }
2480 
2481 static int
2482 node_secondary_opcode(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2483 {
2484 	const struct x86_opcode *opcode;
2485 	uint8_t byte;
2486 
2487 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2488 		return -1;
2489 	}
2490 
2491 	opcode = &secondary_opcode_table[byte];
2492 	if (__predict_false(!opcode->valid)) {
2493 		return -1;
2494 	}
2495 
2496 	instr->opcode = opcode;
2497 	instr->emul = opcode->emul;
2498 	instr->operand_size = get_operand_size(fsm, instr);
2499 	instr->address_size = get_address_size(fsm, instr);
2500 
2501 	if (fsm->is64bit && (instr->operand_size == 4)) {
2502 		/* Zero-extend to 64 bits. */
2503 		instr->zeroextend_mask = ~size_to_mask(4);
2504 	}
2505 
2506 	if (opcode->flags & FLAG_ze) {
2507 		/*
2508 		 * Compute the mask for zero-extend. Update the operand size,
2509 		 * since we move fewer bytes.
2510 		 */
2511 		instr->zeroextend_mask |= size_to_mask(instr->operand_size);
2512 		instr->zeroextend_mask &= ~size_to_mask(opcode->defsize);
2513 		instr->operand_size = opcode->defsize;
2514 	}
2515 
2516 	if (opcode->regmodrm) {
2517 		fsm_advance(fsm, 1, node_regmodrm);
2518 	} else {
2519 		return -1;
2520 	}
2521 
2522 	return 0;
2523 }
2524 
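/*
 * First non-prefix byte: 0x0F escapes to the secondary opcode table, VEX
 * encodings are rejected, and everything else goes through the primary
 * opcode table.
 */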
2525 static int
2526 node_main(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2527 {
2528 	uint8_t byte;
2529 
2530 #define ESCAPE	0x0F
2531 #define VEX_1	0xC5
2532 #define VEX_2	0xC4
2533 #define XOP	0x8F
2534 
2535 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2536 		return -1;
2537 	}
2538 
2539 	/*
2540 	 * We don't take XOP. It is AMD-specific, and it was removed shortly
2541 	 * after being introduced.
2542 	 */
2543 	if (byte == ESCAPE) {
2544 		fsm_advance(fsm, 1, node_secondary_opcode);
2545 	} else if (!instr->rexpref.present) {
2546 		if (byte == VEX_1) {
2547 			return -1;
2548 		} else if (byte == VEX_2) {
2549 			return -1;
2550 		} else {
2551 			fsm->fn = node_primary_opcode;
2552 		}
2553 	} else {
2554 		fsm->fn = node_primary_opcode;
2555 	}
2556 
2557 	return 0;
2558 }
2559 
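/*
 * REX prefix node, 64bit mode only: bytes 0x40-0x4F carry the W/R/X/B bits
 * in their low nibble. For example, 0x48 sets only REX.W, the usual prefix
 * for a 64bit operand size.
 */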
2560 static int
2561 node_rex_prefix(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2562 {
2563 	struct x86_rexpref *rexpref = &instr->rexpref;
2564 	uint8_t byte;
2565 	size_t n = 0;
2566 
2567 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2568 		return -1;
2569 	}
2570 
2571 	if (byte >= 0x40 && byte <= 0x4F) {
2572 		if (__predict_false(!fsm->is64bit)) {
2573 			return -1;
2574 		}
2575 		rexpref->b = ((byte & 0x1) != 0);
2576 		rexpref->x = ((byte & 0x2) != 0);
2577 		rexpref->r = ((byte & 0x4) != 0);
2578 		rexpref->w = ((byte & 0x8) != 0);
2579 		rexpref->present = true;
2580 		n = 1;
2581 	}
2582 
2583 	fsm_advance(fsm, n, node_main);
2584 	return 0;
2585 }
2586 
2587 static int
2588 node_legacy_prefix(struct x86_decode_fsm *fsm, struct x86_instr *instr)
2589 {
2590 	uint8_t byte;
2591 
2592 	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
2593 		return -1;
2594 	}
2595 
2596 	if (byte == LEG_OPR_OVR) {
2597 		instr->legpref.opr_ovr = 1;
2598 	} else if (byte == LEG_OVR_DS) {
2599 		instr->legpref.seg = NVMM_X64_SEG_DS;
2600 	} else if (byte == LEG_OVR_ES) {
2601 		instr->legpref.seg = NVMM_X64_SEG_ES;
2602 	} else if (byte == LEG_REP) {
2603 		instr->legpref.rep = 1;
2604 	} else if (byte == LEG_OVR_GS) {
2605 		instr->legpref.seg = NVMM_X64_SEG_GS;
2606 	} else if (byte == LEG_OVR_FS) {
2607 		instr->legpref.seg = NVMM_X64_SEG_FS;
2608 	} else if (byte == LEG_ADR_OVR) {
2609 		instr->legpref.adr_ovr = 1;
2610 	} else if (byte == LEG_OVR_CS) {
2611 		instr->legpref.seg = NVMM_X64_SEG_CS;
2612 	} else if (byte == LEG_OVR_SS) {
2613 		instr->legpref.seg = NVMM_X64_SEG_SS;
2614 	} else if (byte == LEG_REPN) {
2615 		instr->legpref.repn = 1;
2616 	} else if (byte == LEG_LOCK) {
2617 		/* ignore */
2618 	} else {
2619 		/* not a legacy prefix */
2620 		fsm_advance(fsm, 0, node_rex_prefix);
2621 		return 0;
2622 	}
2623 
2624 	fsm_advance(fsm, 1, node_legacy_prefix);
2625 	return 0;
2626 }
2627 
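/*
 * Decode the instruction bytes by walking the FSM: legacy prefixes first,
 * then the optional REX prefix, then the primary or 0x0F-escaped secondary
 * opcode, and finally ModRM/SIB/displacement/immediate as needed.
 */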
2628 static int
2629 x86_decode(uint8_t *inst_bytes, size_t inst_len, struct x86_instr *instr,
2630     struct nvmm_x64_state *state)
2631 {
2632 	struct x86_decode_fsm fsm;
2633 	int ret;
2634 
2635 	memset(instr, 0, sizeof(*instr));
2636 	instr->legpref.seg = -1;
2637 	instr->src.hardseg = -1;
2638 	instr->dst.hardseg = -1;
2639 
2640 	fsm.is64bit = is_64bit(state);
2641 	fsm.is32bit = is_32bit(state);
2642 	fsm.is16bit = is_16bit(state);
2643 
2644 	fsm.fn = node_legacy_prefix;
2645 	fsm.buf = inst_bytes;
2646 	fsm.end = inst_bytes + inst_len;
2647 
2648 	while (fsm.fn != NULL) {
2649 		ret = (*fsm.fn)(&fsm, instr);
2650 		if (ret == -1)
2651 			return -1;
2652 	}
2653 
2654 	instr->len = fsm.buf - inst_bytes;
2655 
2656 	return 0;
2657 }
2658 
2659 /* -------------------------------------------------------------------------- */
2660 
2661 #define EXEC_INSTR(sz, instr)						\
2662 static uint##sz##_t							\
2663 exec_##instr##sz(uint##sz##_t op1, uint##sz##_t op2, uint64_t *rflags)	\
2664 {									\
2665 	uint##sz##_t res;						\
2666 	__asm __volatile (						\
2667 		#instr"	%2, %3;"					\
2668 		"mov	%3, %1;"					\
2669 		"pushfq;"						\
2670 		"popq	%0"						\
2671 	    : "=r" (*rflags), "=r" (res)				\
2672 	    : "r" (op1), "r" (op2));					\
2673 	return res;							\
2674 }
2675 
2676 #define EXEC_DISPATCHER(instr)						\
2677 static uint64_t								\
2678 exec_##instr(uint64_t op1, uint64_t op2, uint64_t *rflags, size_t opsize) \
2679 {									\
2680 	switch (opsize) {						\
2681 	case 1:								\
2682 		return exec_##instr##8(op1, op2, rflags);		\
2683 	case 2:								\
2684 		return exec_##instr##16(op1, op2, rflags);		\
2685 	case 4:								\
2686 		return exec_##instr##32(op1, op2, rflags);		\
2687 	default:							\
2688 		return exec_##instr##64(op1, op2, rflags);		\
2689 	}								\
2690 }
2691 
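/*
 * Each EXEC_INSTR expansion runs the corresponding native instruction on the
 * host and captures the resulting RFLAGS with pushfq/popq; EXEC_DISPATCHER
 * then selects the 8/16/32/64bit variant based on the operand size.
 */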
2692 /* SUB: ret = op1 - op2 */
2693 #define PSL_SUB_MASK	(PSL_V|PSL_C|PSL_Z|PSL_N|PSL_PF|PSL_AF)
2694 EXEC_INSTR(8, sub)
2695 EXEC_INSTR(16, sub)
2696 EXEC_INSTR(32, sub)
2697 EXEC_INSTR(64, sub)
2698 EXEC_DISPATCHER(sub)
2699 
2700 /* OR:  ret = op1 | op2 */
2701 #define PSL_OR_MASK	(PSL_V|PSL_C|PSL_Z|PSL_N|PSL_PF)
2702 EXEC_INSTR(8, or)
2703 EXEC_INSTR(16, or)
2704 EXEC_INSTR(32, or)
2705 EXEC_INSTR(64, or)
2706 EXEC_DISPATCHER(or)
2707 
2708 /* AND: ret = op1 & op2 */
2709 #define PSL_AND_MASK	(PSL_V|PSL_C|PSL_Z|PSL_N|PSL_PF)
2710 EXEC_INSTR(8, and)
2711 EXEC_INSTR(16, and)
2712 EXEC_INSTR(32, and)
2713 EXEC_INSTR(64, and)
2714 EXEC_DISPATCHER(and)
2715 
2716 /* XOR: ret = op1 ^ op2 */
2717 #define PSL_XOR_MASK	(PSL_V|PSL_C|PSL_Z|PSL_N|PSL_PF)
2718 EXEC_INSTR(8, xor)
2719 EXEC_INSTR(16, xor)
2720 EXEC_INSTR(32, xor)
2721 EXEC_INSTR(64, xor)
2722 EXEC_DISPATCHER(xor)
2723 
2724 /* -------------------------------------------------------------------------- */
2725 
2726 /*
2727  * Emulation functions. We don't care about the order of the operands, except
2728  * for SUB, CMP and TEST. For these ones we look at mem->write to determine who
2729  * for SUB, CMP and TEST. For those, we look at mem->write to determine which
2730  * operand is op1 and which is op2.
2731 
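/*
 * For the arithmetic handlers below, mem->data arrives pre-filled with one
 * operand; the other operand is fetched through the *vcpu->cbs.mem callback,
 * the operation is executed on the host, the result is either written back
 * to guest memory or returned in mem->data, and only the affected RFLAGS
 * bits are updated.
 */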
2732 static void
2733 x86_func_or(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2734 {
2735 	uint64_t *retval = (uint64_t *)mem->data;
2736 	const bool write = mem->write;
2737 	uint64_t *op1, op2, fl, ret;
2738 
2739 	op1 = (uint64_t *)mem->data;
2740 	op2 = 0;
2741 
2742 	/* Fetch the value to be OR'ed (op2). */
2743 	mem->data = (uint8_t *)&op2;
2744 	mem->write = false;
2745 	(*vcpu->cbs.mem)(mem);
2746 
2747 	/* Perform the OR. */
2748 	ret = exec_or(*op1, op2, &fl, mem->size);
2749 
2750 	if (write) {
2751 		/* Write back the result. */
2752 		mem->data = (uint8_t *)&ret;
2753 		mem->write = true;
2754 		(*vcpu->cbs.mem)(mem);
2755 	} else {
2756 		/* Return data to the caller. */
2757 		*retval = ret;
2758 	}
2759 
2760 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_OR_MASK;
2761 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_OR_MASK);
2762 }
2763 
2764 static void
2765 x86_func_and(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2766 {
2767 	uint64_t *retval = (uint64_t *)mem->data;
2768 	const bool write = mem->write;
2769 	uint64_t *op1, op2, fl, ret;
2770 
2771 	op1 = (uint64_t *)mem->data;
2772 	op2 = 0;
2773 
2774 	/* Fetch the value to be AND'ed (op2). */
2775 	mem->data = (uint8_t *)&op2;
2776 	mem->write = false;
2777 	(*vcpu->cbs.mem)(mem);
2778 
2779 	/* Perform the AND. */
2780 	ret = exec_and(*op1, op2, &fl, mem->size);
2781 
2782 	if (write) {
2783 		/* Write back the result. */
2784 		mem->data = (uint8_t *)&ret;
2785 		mem->write = true;
2786 		(*vcpu->cbs.mem)(mem);
2787 	} else {
2788 		/* Return data to the caller. */
2789 		*retval = ret;
2790 	}
2791 
2792 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_AND_MASK;
2793 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_AND_MASK);
2794 }
2795 
2796 static void
2797 x86_func_xchg(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2798 {
2799 	uint64_t *op1, op2;
2800 
2801 	op1 = (uint64_t *)mem->data;
2802 	op2 = 0;
2803 
2804 	/* Fetch op2. */
2805 	mem->data = (uint8_t *)&op2;
2806 	mem->write = false;
2807 	(*vcpu->cbs.mem)(mem);
2808 
2809 	/* Write op1 in op2. */
2810 	mem->data = (uint8_t *)op1;
2811 	mem->write = true;
2812 	(*vcpu->cbs.mem)(mem);
2813 
2814 	/* Write op2 in op1. */
2815 	*op1 = op2;
2816 }
2817 
2818 static void
2819 x86_func_sub(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2820 {
2821 	uint64_t *retval = (uint64_t *)mem->data;
2822 	const bool write = mem->write;
2823 	uint64_t *op1, *op2, fl, ret;
2824 	uint64_t tmp;
2825 	bool memop1;
2826 
2827 	memop1 = !mem->write;
2828 	op1 = memop1 ? &tmp : (uint64_t *)mem->data;
2829 	op2 = memop1 ? (uint64_t *)mem->data : &tmp;
2830 
2831 	/* Fetch the value to be SUB'ed (op1 or op2). */
2832 	mem->data = (uint8_t *)&tmp;
2833 	mem->write = false;
2834 	(*vcpu->cbs.mem)(mem);
2835 
2836 	/* Perform the SUB. */
2837 	ret = exec_sub(*op1, *op2, &fl, mem->size);
2838 
2839 	if (write) {
2840 		/* Write back the result. */
2841 		mem->data = (uint8_t *)&ret;
2842 		mem->write = true;
2843 		(*vcpu->cbs.mem)(mem);
2844 	} else {
2845 		/* Return data to the caller. */
2846 		*retval = ret;
2847 	}
2848 
2849 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_SUB_MASK;
2850 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_SUB_MASK);
2851 }
2852 
2853 static void
2854 x86_func_xor(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2855 {
2856 	uint64_t *retval = (uint64_t *)mem->data;
2857 	const bool write = mem->write;
2858 	uint64_t *op1, op2, fl, ret;
2859 
2860 	op1 = (uint64_t *)mem->data;
2861 	op2 = 0;
2862 
2863 	/* Fetch the value to be XOR'ed (op2). */
2864 	mem->data = (uint8_t *)&op2;
2865 	mem->write = false;
2866 	(*vcpu->cbs.mem)(mem);
2867 
2868 	/* Perform the XOR. */
2869 	ret = exec_xor(*op1, op2, &fl, mem->size);
2870 
2871 	if (write) {
2872 		/* Write back the result. */
2873 		mem->data = (uint8_t *)&ret;
2874 		mem->write = true;
2875 		(*vcpu->cbs.mem)(mem);
2876 	} else {
2877 		/* Return data to the caller. */
2878 		*retval = ret;
2879 	}
2880 
2881 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_XOR_MASK;
2882 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_XOR_MASK);
2883 }
2884 
2885 static void
2886 x86_func_cmp(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2887 {
2888 	uint64_t *op1, *op2, fl;
2889 	uint64_t tmp;
2890 	bool memop1;
2891 
2892 	memop1 = !mem->write;
2893 	op1 = memop1 ? &tmp : (uint64_t *)mem->data;
2894 	op2 = memop1 ? (uint64_t *)mem->data : &tmp;
2895 
2896 	/* Fetch the value to be CMP'ed (op1 or op2). */
2897 	mem->data = (uint8_t *)&tmp;
2898 	mem->write = false;
2899 	(*vcpu->cbs.mem)(mem);
2900 
2901 	/* Perform the CMP. */
2902 	exec_sub(*op1, *op2, &fl, mem->size);
2903 
2904 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_SUB_MASK;
2905 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_SUB_MASK);
2906 }
2907 
2908 static void
2909 x86_func_test(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2910 {
2911 	uint64_t *op1, *op2, fl;
2912 	uint64_t tmp;
2913 	bool memop1;
2914 
2915 	memop1 = !mem->write;
2916 	op1 = memop1 ? &tmp : (uint64_t *)mem->data;
2917 	op2 = memop1 ? (uint64_t *)mem->data : &tmp;
2918 
2919 	/* Fetch the value to be TEST'ed (op1 or op2). */
2920 	mem->data = (uint8_t *)&tmp;
2921 	mem->write = false;
2922 	(*vcpu->cbs.mem)(mem);
2923 
2924 	/* Perform the TEST. */
2925 	exec_and(*op1, *op2, &fl, mem->size);
2926 
2927 	gprs[NVMM_X64_GPR_RFLAGS] &= ~PSL_AND_MASK;
2928 	gprs[NVMM_X64_GPR_RFLAGS] |= (fl & PSL_AND_MASK);
2929 }
2930 
2931 static void
2932 x86_func_mov(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2933 {
2934 	/*
2935 	 * Nothing special, just move without emulation.
2936 	 */
2937 	(*vcpu->cbs.mem)(mem);
2938 }
2939 
2940 static void
2941 x86_func_stos(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2942 {
2943 	/*
2944 	 * Just move, and update RDI.
2945 	 */
2946 	(*vcpu->cbs.mem)(mem);
2947 
2948 	if (gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
2949 		gprs[NVMM_X64_GPR_RDI] -= mem->size;
2950 	} else {
2951 		gprs[NVMM_X64_GPR_RDI] += mem->size;
2952 	}
2953 }
2954 
2955 static void
2956 x86_func_lods(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2957 {
2958 	/*
2959 	 * Just move, and update RSI.
2960 	 */
2961 	(*vcpu->cbs.mem)(mem);
2962 
2963 	if (gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
2964 		gprs[NVMM_X64_GPR_RSI] -= mem->size;
2965 	} else {
2966 		gprs[NVMM_X64_GPR_RSI] += mem->size;
2967 	}
2968 }
2969 
2970 static void
2971 x86_func_movs(struct nvmm_vcpu *vcpu, struct nvmm_mem *mem, uint64_t *gprs)
2972 {
2973 	/*
2974 	 * Special instruction: double memory operand. Don't call the cb,
2975 	 * because the memory copy has already been performed earlier.
2976 	 */
2977 
2978 	if (gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
2979 		gprs[NVMM_X64_GPR_RSI] -= mem->size;
2980 		gprs[NVMM_X64_GPR_RDI] -= mem->size;
2981 	} else {
2982 		gprs[NVMM_X64_GPR_RSI] += mem->size;
2983 		gprs[NVMM_X64_GPR_RDI] += mem->size;
2984 	}
2985 }
2986 
2987 /* -------------------------------------------------------------------------- */
2988 
2989 static inline uint64_t
2990 gpr_read_address(struct x86_instr *instr, struct nvmm_x64_state *state, int gpr)
2991 {
2992 	uint64_t val;
2993 
2994 	val = state->gprs[gpr];
2995 	val &= size_to_mask(instr->address_size);
2996 
2997 	return val;
2998 }
2999 
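/*
 * Compute the guest virtual address referenced by a store: base register,
 * SIB base+index*scale, dual 16bit registers or DMO offset, plus the
 * displacement, with the segment base applied (FS/GS only in long mode).
 * Illustrative example: "mov %eax,0x8(%rbx,%rcx,4)" yields
 * gva = RBX + RCX*4 + 0x8.
 */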
3000 static int
3001 store_to_gva(struct nvmm_x64_state *state, struct x86_instr *instr,
3002     struct x86_store *store, gvaddr_t *gvap, size_t size)
3003 {
3004 	struct x86_sib *sib;
3005 	gvaddr_t gva = 0;
3006 	uint64_t reg;
3007 	int ret, seg;
3008 
3009 	if (store->type == STORE_SIB) {
3010 		sib = &store->u.sib;
3011 		if (sib->bas != NULL)
3012 			gva += gpr_read_address(instr, state, sib->bas->num);
3013 		if (sib->idx != NULL) {
3014 			reg = gpr_read_address(instr, state, sib->idx->num);
3015 			gva += sib->scale * reg;
3016 		}
3017 	} else if (store->type == STORE_REG) {
3018 		if (store->u.reg == NULL) {
3019 			/* The base is null. Happens with disp32-only and
3020 			 * disp16-only. */
3021 		} else {
3022 			gva = gpr_read_address(instr, state, store->u.reg->num);
3023 		}
3024 	} else if (store->type == STORE_DUALREG) {
3025 		gva = gpr_read_address(instr, state, store->u.dualreg.reg1) +
3026 		    gpr_read_address(instr, state, store->u.dualreg.reg2);
3027 	} else {
3028 		gva = store->u.dmo;
3029 	}
3030 
3031 	if (store->disp.type != DISP_NONE) {
3032 		gva += store->disp.data;
3033 	}
3034 
3035 	if (store->hardseg != -1) {
3036 		seg = store->hardseg;
3037 	} else {
3038 		if (__predict_false(instr->legpref.seg != -1)) {
3039 			seg = instr->legpref.seg;
3040 		} else {
3041 			seg = NVMM_X64_SEG_DS;
3042 		}
3043 	}
3044 
3045 	if (__predict_true(is_long_mode(state))) {
3046 		if (seg == NVMM_X64_SEG_GS || seg == NVMM_X64_SEG_FS) {
3047 			segment_apply(&state->segs[seg], &gva);
3048 		}
3049 	} else {
3050 		ret = segment_check(&state->segs[seg], gva, size);
3051 		if (ret == -1)
3052 			return -1;
3053 		segment_apply(&state->segs[seg], &gva);
3054 	}
3055 
3056 	*gvap = gva;
3057 	return 0;
3058 }
3059 
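/*
 * Scan up to five instruction bytes for a segment-override prefix and return
 * the segment to use; defaults to DS when no override is present.
 */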
3060 static int
3061 fetch_segment(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
3062 {
3063 	struct nvmm_x64_state *state = vcpu->state;
3064 	uint8_t inst_bytes[5], byte;
3065 	size_t i, fetchsize;
3066 	gvaddr_t gva;
3067 	int ret, seg;
3068 
3069 	fetchsize = sizeof(inst_bytes);
3070 
3071 	gva = state->gprs[NVMM_X64_GPR_RIP];
3072 	if (__predict_false(!is_long_mode(state))) {
3073 		ret = segment_check(&state->segs[NVMM_X64_SEG_CS], gva,
3074 		    fetchsize);
3075 		if (ret == -1)
3076 			return -1;
3077 		segment_apply(&state->segs[NVMM_X64_SEG_CS], &gva);
3078 	}
3079 
3080 	ret = read_guest_memory(mach, vcpu, gva, inst_bytes, fetchsize);
3081 	if (ret == -1)
3082 		return -1;
3083 
3084 	seg = NVMM_X64_SEG_DS;
3085 	for (i = 0; i < fetchsize; i++) {
3086 		byte = inst_bytes[i];
3087 
3088 		if (byte == LEG_OVR_DS) {
3089 			seg = NVMM_X64_SEG_DS;
3090 		} else if (byte == LEG_OVR_ES) {
3091 			seg = NVMM_X64_SEG_ES;
3092 		} else if (byte == LEG_OVR_GS) {
3093 			seg = NVMM_X64_SEG_GS;
3094 		} else if (byte == LEG_OVR_FS) {
3095 			seg = NVMM_X64_SEG_FS;
3096 		} else if (byte == LEG_OVR_CS) {
3097 			seg = NVMM_X64_SEG_CS;
3098 		} else if (byte == LEG_OVR_SS) {
3099 			seg = NVMM_X64_SEG_SS;
3100 		} else if (byte == LEG_OPR_OVR) {
3101 			/* nothing */
3102 		} else if (byte == LEG_ADR_OVR) {
3103 			/* nothing */
3104 		} else if (byte == LEG_REP) {
3105 			/* nothing */
3106 		} else if (byte == LEG_REPN) {
3107 			/* nothing */
3108 		} else if (byte == LEG_LOCK) {
3109 			/* nothing */
3110 		} else {
3111 			return seg;
3112 		}
3113 	}
3114 
3115 	return seg;
3116 }
3117 
3118 static int
3119 fetch_instruction(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
3120     struct nvmm_vcpu_exit *exit)
3121 {
3122 	struct nvmm_x64_state *state = vcpu->state;
3123 	size_t fetchsize;
3124 	gvaddr_t gva;
3125 	int ret;
3126 
3127 	fetchsize = sizeof(exit->u.mem.inst_bytes);
3128 
3129 	gva = state->gprs[NVMM_X64_GPR_RIP];
3130 	if (__predict_false(!is_long_mode(state))) {
3131 		ret = segment_check(&state->segs[NVMM_X64_SEG_CS], gva,
3132 		    fetchsize);
3133 		if (ret == -1)
3134 			return -1;
3135 		segment_apply(&state->segs[NVMM_X64_SEG_CS], &gva);
3136 	}
3137 
3138 	ret = read_guest_memory(mach, vcpu, gva, exit->u.mem.inst_bytes,
3139 	    fetchsize);
3140 	if (ret == -1)
3141 		return -1;
3142 
3143 	exit->u.mem.inst_len = fetchsize;
3144 
3145 	return 0;
3146 }
3147 
3148 static int
3149 assist_mem_double(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
3150     struct x86_instr *instr)
3151 {
3152 	struct nvmm_x64_state *state = vcpu->state;
3153 	struct nvmm_mem mem;
3154 	uint8_t data[8];
3155 	gvaddr_t gva;
3156 	size_t size;
3157 	int ret;
3158 
3159 	size = instr->operand_size;
3160 
3161 	/* Source. */
3162 	ret = store_to_gva(state, instr, &instr->src, &gva, size);
3163 	if (ret == -1)
3164 		return -1;
3165 	ret = read_guest_memory(mach, vcpu, gva, data, size);
3166 	if (ret == -1)
3167 		return -1;
3168 
3169 	/* Destination. */
3170 	ret = store_to_gva(state, instr, &instr->dst, &gva, size);
3171 	if (ret == -1)
3172 		return -1;
3173 	ret = write_guest_memory(mach, vcpu, gva, data, size);
3174 	if (ret == -1)
3175 		return -1;
3176 
3177 	mem.size = size;
3178 	(*instr->emul->func)(vcpu, &mem, state->gprs);
3179 
3180 	return 0;
3181 }
3182 
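/*
 * Single memory operand: build the nvmm_mem descriptor, determine whether
 * the guest instruction reads or writes memory, pre-load membuf with the
 * register or immediate source when needed, run the emulation function, and
 * install the result into the destination register when applicable.
 */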
3183 static int
3184 assist_mem_single(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu,
3185     struct x86_instr *instr)
3186 {
3187 	struct nvmm_x64_state *state = vcpu->state;
3188 	struct nvmm_vcpu_exit *exit = vcpu->exit;
3189 	struct nvmm_mem mem;
3190 	uint8_t membuf[8];
3191 	uint64_t val;
3192 
3193 	memset(membuf, 0, sizeof(membuf));
3194 
3195 	mem.mach = mach;
3196 	mem.vcpu = vcpu;
3197 	mem.gpa = exit->u.mem.gpa;
3198 	mem.size = instr->operand_size;
3199 	mem.data = membuf;
3200 
3201 	/* Determine the direction. */
3202 	switch (instr->src.type) {
3203 	case STORE_REG:
3204 		if (instr->src.disp.type != DISP_NONE) {
3205 			/* Indirect access. */
3206 			mem.write = false;
3207 		} else {
3208 			/* Direct access. */
3209 			mem.write = true;
3210 		}
3211 		break;
3212 	case STORE_DUALREG:
3213 		if (instr->src.disp.type == DISP_NONE) {
3214 			DISASSEMBLER_BUG();
3215 		}
3216 		mem.write = false;
3217 		break;
3218 	case STORE_IMM:
3219 		mem.write = true;
3220 		break;
3221 	case STORE_SIB:
3222 		mem.write = false;
3223 		break;
3224 	case STORE_DMO:
3225 		mem.write = false;
3226 		break;
3227 	default:
3228 		DISASSEMBLER_BUG();
3229 	}
3230 
3231 	if (mem.write) {
3232 		switch (instr->src.type) {
3233 		case STORE_REG:
3234 			/* The instruction was "reg -> mem". Fetch the register
3235 			 * in membuf. */
3236 			if (__predict_false(instr->src.disp.type != DISP_NONE)) {
3237 				DISASSEMBLER_BUG();
3238 			}
3239 			val = state->gprs[instr->src.u.reg->num];
3240 			val = __SHIFTOUT(val, instr->src.u.reg->mask);
3241 			memcpy(mem.data, &val, mem.size);
3242 			break;
3243 		case STORE_IMM:
3244 			/* The instruction was "imm -> mem". Fetch the immediate
3245 			 * in membuf. */
3246 			memcpy(mem.data, &instr->src.u.imm.data, mem.size);
3247 			break;
3248 		default:
3249 			DISASSEMBLER_BUG();
3250 		}
3251 	} else if (instr->emul->readreg) {
3252 		/* The instruction was "mem -> reg", but the value of the
3253 		 * register matters for the emul func. Fetch it in membuf. */
3254 		if (__predict_false(instr->dst.type != STORE_REG)) {
3255 			DISASSEMBLER_BUG();
3256 		}
3257 		if (__predict_false(instr->dst.disp.type != DISP_NONE)) {
3258 			DISASSEMBLER_BUG();
3259 		}
3260 		val = state->gprs[instr->dst.u.reg->num];
3261 		val = __SHIFTOUT(val, instr->dst.u.reg->mask);
3262 		memcpy(mem.data, &val, mem.size);
3263 	}
3264 
3265 	(*instr->emul->func)(vcpu, &mem, state->gprs);
3266 
3267 	if (instr->emul->notouch) {
3268 		/* We're done. */
3269 		return 0;
3270 	}
3271 
3272 	if (!mem.write) {
3273 		/* The instruction was "mem -> reg". The emul func has filled
3274 		 * membuf with the memory content. Install membuf in the
3275 		 * register. */
3276 		if (__predict_false(instr->dst.type != STORE_REG)) {
3277 			DISASSEMBLER_BUG();
3278 		}
3279 		if (__predict_false(instr->dst.disp.type != DISP_NONE)) {
3280 			DISASSEMBLER_BUG();
3281 		}
3282 		memcpy(&val, membuf, sizeof(uint64_t));
3283 		val = __SHIFTIN(val, instr->dst.u.reg->mask);
3284 		state->gprs[instr->dst.u.reg->num] &= ~instr->dst.u.reg->mask;
3285 		state->gprs[instr->dst.u.reg->num] |= val;
3286 		state->gprs[instr->dst.u.reg->num] &= ~instr->zeroextend_mask;
3287 	} else if (instr->emul->backprop) {
3288 		/* The instruction was "reg -> mem", but the memory must be
3289 		 * back-propagated to the register. Install membuf in the
3290 		 * register. */
3291 		if (__predict_false(instr->src.type != STORE_REG)) {
3292 			DISASSEMBLER_BUG();
3293 		}
3294 		if (__predict_false(instr->src.disp.type != DISP_NONE)) {
3295 			DISASSEMBLER_BUG();
3296 		}
3297 		memcpy(&val, membuf, sizeof(uint64_t));
3298 		val = __SHIFTIN(val, instr->src.u.reg->mask);
3299 		state->gprs[instr->src.u.reg->num] &= ~instr->src.u.reg->mask;
3300 		state->gprs[instr->src.u.reg->num] |= val;
3301 		state->gprs[instr->src.u.reg->num] &= ~instr->zeroextend_mask;
3302 	}
3303 
3304 	return 0;
3305 }
3306 
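/*
 * Entry point for memory-exit (MMIO) assistance: fetch the guest state,
 * fetch and decode the faulting instruction if needed, emulate it through
 * the registered memory callback, handle REP/REPN counters, advance RIP and
 * write the GPRs back.
 */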
3307 int
3308 nvmm_assist_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
3309 {
3310 	struct nvmm_x64_state *state = vcpu->state;
3311 	struct nvmm_vcpu_exit *exit = vcpu->exit;
3312 	struct x86_instr instr;
3313 	uint64_t cnt = 0; /* GCC */
3314 	int ret;
3315 
3316 	if (__predict_false(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
3317 		errno = EINVAL;
3318 		return -1;
3319 	}
3320 
3321 	ret = nvmm_vcpu_getstate(mach, vcpu,
3322 	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
3323 	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
3324 	if (ret == -1)
3325 		return -1;
3326 
3327 	if (exit->u.mem.inst_len == 0) {
3328 		/*
3329 		 * The instruction was not fetched from the kernel. Fetch
3330 		 * it ourselves.
3331 		 */
3332 		ret = fetch_instruction(mach, vcpu, exit);
3333 		if (ret == -1)
3334 			return -1;
3335 	}
3336 
3337 	ret = x86_decode(exit->u.mem.inst_bytes, exit->u.mem.inst_len,
3338 	    &instr, state);
3339 	if (ret == -1) {
3340 		errno = ENODEV;
3341 		return -1;
3342 	}
3343 
3344 	if (instr.legpref.rep || instr.legpref.repn) {
3345 		cnt = rep_get_cnt(state, instr.address_size);
3346 		if (__predict_false(cnt == 0)) {
3347 			state->gprs[NVMM_X64_GPR_RIP] += instr.len;
3348 			goto out;
3349 		}
3350 	}
3351 
3352 	if (instr.opcode->movs) {
3353 		ret = assist_mem_double(mach, vcpu, &instr);
3354 	} else {
3355 		ret = assist_mem_single(mach, vcpu, &instr);
3356 	}
3357 	if (ret == -1) {
3358 		errno = ENODEV;
3359 		return -1;
3360 	}
3361 
3362 	if (instr.legpref.rep || instr.legpref.repn) {
3363 		cnt -= 1;
3364 		rep_set_cnt(state, instr.address_size, cnt);
3365 		if (cnt == 0) {
3366 			state->gprs[NVMM_X64_GPR_RIP] += instr.len;
3367 		} else if (__predict_false(instr.legpref.repn)) {
3368 			if (state->gprs[NVMM_X64_GPR_RFLAGS] & PSL_Z) {
3369 				state->gprs[NVMM_X64_GPR_RIP] += instr.len;
3370 			}
3371 		}
3372 	} else {
3373 		state->gprs[NVMM_X64_GPR_RIP] += instr.len;
3374 	}
3375 
3376 out:
3377 	ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
3378 	if (ret == -1)
3379 		return -1;
3380 
3381 	return 0;
3382 }
3383