/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 1997 Jonathan Lemon
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>

#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/psl.h>
#include <machine/specialreg.h>
#include <machine/sysarch.h>

extern int vm86pa;
extern struct pcb *vm86pcb;

static struct mtx vm86_lock;

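/*
 * Entry and return trampolines for vm86 BIOS calls, implemented in
 * assembly; they switch onto the dedicated vm86 PCB/TSS and back.
 */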
extern int vm86_bioscall(struct vm86frame *);
extern void vm86_biosret(struct vm86frame *);

void vm86_prepcall(struct vm86frame *);

struct system_map {
	int		type;
	vm_offset_t	start;
	vm_offset_t	end;
};

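/*
 * Opcode bytes of the instructions handled by the vm86 emulator below,
 * plus the prefix bytes that may precede them.  PUSH_MASK strips VM,
 * RF and the real IF from the flags image pushed on the vm86 stack;
 * POP_MASK prevents a popped flags image from directly modifying VIP,
 * VIF, VM, RF or IOPL.
 */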
#define	HLT	0xf4
#define	CLI	0xfa
#define	STI	0xfb
#define	PUSHF	0x9c
#define	POPF	0x9d
#define	INTn	0xcd
#define	IRET	0xcf
#define	CALLm	0xff
#define OPERAND_SIZE_PREFIX	0x66
#define ADDRESS_SIZE_PREFIX	0x67
#define PUSH_MASK	~(PSL_VM | PSL_RF | PSL_I)
#define POP_MASK	~(PSL_VIP | PSL_VIF | PSL_VM | PSL_RF | PSL_IOPL)

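/*
 * Accessors for vm86 memory.  A kernel-initiated vm86 BIOS call runs
 * inside a critical section, where the fault-catching fuword()/suword()
 * primitives must not be used; the addresses involved there are
 * low-memory addresses known to be mapped, so they are accessed
 * directly in that case.  The fault-safe primitives are used on the
 * emulation path for user vm86 processes.
 */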
static int
vm86_suword16(volatile void *base, int word)
{

	if (curthread->td_critnest != 0) {
		*(volatile uint16_t *)base = word;
		return (0);
	}
	return (suword16(base, word));
}

static int
vm86_suword(volatile void *base, long word)
{

	if (curthread->td_critnest != 0) {
		*(volatile long *)base = word;
		return (0);
	}
	return (suword(base, word));
}

static int
vm86_fubyte(volatile const void *base)
{

	if (curthread->td_critnest != 0)
		return (*(volatile const u_char *)base);
	return (fubyte(base));
}

static int
vm86_fuword16(volatile const void *base)
{

	if (curthread->td_critnest != 0)
		return (*(volatile const uint16_t *)base);
	return (fuword16(base));
}

static long
vm86_fuword(volatile const void *base)
{

	if (curthread->td_critnest != 0)
		return (*(volatile const long *)base);
	return (fuword(base));
}

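/*
 * Real-mode addressing helpers: a segment:offset pair refers to linear
 * address (sel << 4) + off, and an interrupt vector is stored as a
 * 32-bit word with the segment in the high 16 bits and the offset in
 * the low 16 bits.
 */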
static __inline caddr_t
MAKE_ADDR(u_short sel, u_short off)
{
	return ((caddr_t)((sel << 4) + off));
}

static __inline void
GET_VEC(u_int vec, u_short *sel, u_short *off)
{
	*sel = vec >> 16;
	*off = vec & 0xffff;
}

static __inline u_int
MAKE_VEC(u_short sel, u_short off)
{
	return ((sel << 16) | off);
}

static __inline void
PUSH(u_short x, struct vm86frame *vmf)
{
	vmf->vmf_sp -= 2;
	vm86_suword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
}

static __inline void
PUSHL(u_int x, struct vm86frame *vmf)
{
	vmf->vmf_sp -= 4;
	vm86_suword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp), x);
}

static __inline u_short
POP(struct vm86frame *vmf)
{
	u_short x = vm86_fuword16(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));

	vmf->vmf_sp += 2;
	return (x);
}

static __inline u_int
POPL(struct vm86frame *vmf)
{
	u_int x = vm86_fuword(MAKE_ADDR(vmf->vmf_ss, vmf->vmf_sp));

	vmf->vmf_sp += 4;
	return (x);
}

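/*
 * Emulate the interrupt-flag-related instructions for a process running
 * in vm86 mode, keeping the virtual interrupt state in the hardware
 * VIF/VIP bits when VME assistance is available and in vm86_eflags
 * otherwise.  Returns 0 (or SIGTRAP when single-stepping) once the
 * instruction has been handled and execution may resume, or another
 * signal number for the caller to deliver.
 */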
int
vm86_emulate(struct vm86frame *vmf)
{
	struct vm86_kernel *vm86;
	caddr_t addr;
	u_char i_byte;
	u_int temp_flags;
	int inc_ip = 1;
	int retcode = 0;

	/*
	 * pcb_ext contains the address of the extension area, or zero if
	 * the extension is not present.  (This check should not be needed,
	 * as we can't enter vm86 mode until we set up an extension area)
	 */
	if (curpcb->pcb_ext == 0)
		return (SIGBUS);
	vm86 = &curpcb->pcb_ext->ext_vm86;

	if (vmf->vmf_eflags & PSL_T)
		retcode = SIGTRAP;

	addr = MAKE_ADDR(vmf->vmf_cs, vmf->vmf_ip);
	i_byte = vm86_fubyte(addr);
	if (i_byte == ADDRESS_SIZE_PREFIX) {
		i_byte = vm86_fubyte(++addr);
		inc_ip++;
	}

	if (vm86->vm86_has_vme) {
		switch (i_byte) {
		case OPERAND_SIZE_PREFIX:
			i_byte = vm86_fubyte(++addr);
			inc_ip++;
			switch (i_byte) {
			case PUSHF:
				if (vmf->vmf_eflags & PSL_VIF)
					PUSHL((vmf->vmf_eflags & PUSH_MASK)
					    | PSL_IOPL | PSL_I, vmf);
				else
					PUSHL((vmf->vmf_eflags & PUSH_MASK)
					    | PSL_IOPL, vmf);
				vmf->vmf_ip += inc_ip;
				return (retcode);

			case POPF:
				temp_flags = POPL(vmf) & POP_MASK;
				vmf->vmf_eflags = (vmf->vmf_eflags & ~POP_MASK)
				    | temp_flags | PSL_VM | PSL_I;
				vmf->vmf_ip += inc_ip;
				if (temp_flags & PSL_I) {
					vmf->vmf_eflags |= PSL_VIF;
					if (vmf->vmf_eflags & PSL_VIP)
						break;
				} else {
					vmf->vmf_eflags &= ~PSL_VIF;
				}
				return (retcode);
			}
			break;

		/* VME faults here if VIP is set, but does not set VIF. */
		case STI:
			vmf->vmf_eflags |= PSL_VIF;
			vmf->vmf_ip += inc_ip;
			if ((vmf->vmf_eflags & PSL_VIP) == 0) {
				uprintf("fatal sti\n");
				return (SIGKILL);
			}
			break;

		/* VME if no redirection support */
		case INTn:
			break;

		/* VME if trying to set PSL_T, or PSL_I when VIP is set */
		case POPF:
			temp_flags = POP(vmf) & POP_MASK;
			vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK)
			    | temp_flags | PSL_VM | PSL_I;
			vmf->vmf_ip += inc_ip;
			if (temp_flags & PSL_I) {
				vmf->vmf_eflags |= PSL_VIF;
				if (vmf->vmf_eflags & PSL_VIP)
					break;
			} else {
				vmf->vmf_eflags &= ~PSL_VIF;
			}
			return (retcode);

		/* VME if trying to set PSL_T, or PSL_I when VIP is set */
		case IRET:
			vmf->vmf_ip = POP(vmf);
			vmf->vmf_cs = POP(vmf);
			temp_flags = POP(vmf) & POP_MASK;
			vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK)
			    | temp_flags | PSL_VM | PSL_I;
			if (temp_flags & PSL_I) {
				vmf->vmf_eflags |= PSL_VIF;
				if (vmf->vmf_eflags & PSL_VIP)
					break;
			} else {
				vmf->vmf_eflags &= ~PSL_VIF;
			}
			return (retcode);
		}
		return (SIGBUS);
	}

	switch (i_byte) {
	case OPERAND_SIZE_PREFIX:
		i_byte = vm86_fubyte(++addr);
		inc_ip++;
		switch (i_byte) {
		case PUSHF:
			if (vm86->vm86_eflags & PSL_VIF)
				PUSHL((vmf->vmf_flags & PUSH_MASK)
				    | PSL_IOPL | PSL_I, vmf);
			else
				PUSHL((vmf->vmf_flags & PUSH_MASK)
				    | PSL_IOPL, vmf);
			vmf->vmf_ip += inc_ip;
			return (retcode);

		case POPF:
			temp_flags = POPL(vmf) & POP_MASK;
			vmf->vmf_eflags = (vmf->vmf_eflags & ~POP_MASK)
			    | temp_flags | PSL_VM | PSL_I;
			vmf->vmf_ip += inc_ip;
			if (temp_flags & PSL_I) {
				vm86->vm86_eflags |= PSL_VIF;
				if (vm86->vm86_eflags & PSL_VIP)
					break;
			} else {
				vm86->vm86_eflags &= ~PSL_VIF;
			}
			return (retcode);
		}
		return (SIGBUS);

	case CLI:
		vm86->vm86_eflags &= ~PSL_VIF;
		vmf->vmf_ip += inc_ip;
		return (retcode);

	case STI:
		/* if there is a pending interrupt, go to the emulator */
		vm86->vm86_eflags |= PSL_VIF;
		vmf->vmf_ip += inc_ip;
		if (vm86->vm86_eflags & PSL_VIP)
			break;
		return (retcode);

	case PUSHF:
		if (vm86->vm86_eflags & PSL_VIF)
			PUSH((vmf->vmf_flags & PUSH_MASK)
			    | PSL_IOPL | PSL_I, vmf);
		else
			PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf);
		vmf->vmf_ip += inc_ip;
		return (retcode);

	case INTn:
		i_byte = vm86_fubyte(addr + 1);
		if ((vm86->vm86_intmap[i_byte >> 3] & (1 << (i_byte & 7))) != 0)
			break;
		if (vm86->vm86_eflags & PSL_VIF)
			PUSH((vmf->vmf_flags & PUSH_MASK)
			    | PSL_IOPL | PSL_I, vmf);
		else
			PUSH((vmf->vmf_flags & PUSH_MASK) | PSL_IOPL, vmf);
		PUSH(vmf->vmf_cs, vmf);
		PUSH(vmf->vmf_ip + inc_ip + 1, vmf);	/* increment IP */
		GET_VEC(vm86_fuword((caddr_t)(i_byte * 4)),
		     &vmf->vmf_cs, &vmf->vmf_ip);
		vmf->vmf_flags &= ~PSL_T;
		vm86->vm86_eflags &= ~PSL_VIF;
		return (retcode);

	case IRET:
		vmf->vmf_ip = POP(vmf);
		vmf->vmf_cs = POP(vmf);
		temp_flags = POP(vmf) & POP_MASK;
		vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK)
		    | temp_flags | PSL_VM | PSL_I;
		if (temp_flags & PSL_I) {
			vm86->vm86_eflags |= PSL_VIF;
			if (vm86->vm86_eflags & PSL_VIP)
				break;
		} else {
			vm86->vm86_eflags &= ~PSL_VIF;
		}
		return (retcode);

	case POPF:
		temp_flags = POP(vmf) & POP_MASK;
		vmf->vmf_flags = (vmf->vmf_flags & ~POP_MASK)
		    | temp_flags | PSL_VM | PSL_I;
		vmf->vmf_ip += inc_ip;
		if (temp_flags & PSL_I) {
			vm86->vm86_eflags |= PSL_VIF;
			if (vm86->vm86_eflags & PSL_VIP)
				break;
		} else {
			vm86->vm86_eflags &= ~PSL_VIF;
		}
		return (retcode);
	}
	return (SIGBUS);
}

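/*
 * Static layout of the vm86 region.  The embedded page table maps the
 * low 1MB plus the 64KB high-memory area (272 4KB pages); the PAE and
 * non-PAE layouts differ only in the width of a page-table entry.  The
 * whole layout must fit within the three pages reserved in locore,
 * which the assertions below check.
 */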
#define PGTABLE_SIZE	((1024 + 64) * 1024 / PAGE_SIZE)
#define INTMAP_SIZE	32
#define IOMAP_SIZE	ctob(IOPAGES)
#define TSS_SIZE \
	(sizeof(struct pcb_ext) - sizeof(struct segment_descriptor) + \
	 INTMAP_SIZE + IOMAP_SIZE + 1)

struct vm86_layout_pae {
	uint64_t	vml_pgtbl[PGTABLE_SIZE];
	struct	pcb vml_pcb;
	struct	pcb_ext vml_ext;
	char	vml_intmap[INTMAP_SIZE];
	char	vml_iomap[IOMAP_SIZE];
	char	vml_iomap_trailer;
};

struct vm86_layout_nopae {
	uint32_t	vml_pgtbl[PGTABLE_SIZE];
	struct	pcb vml_pcb;
	struct	pcb_ext vml_ext;
	char	vml_intmap[INTMAP_SIZE];
	char	vml_iomap[IOMAP_SIZE];
	char	vml_iomap_trailer;
};

_Static_assert(sizeof(struct vm86_layout_pae) <= ctob(3),
    "struct vm86_layout_pae exceeds space allocated in locore.s");
_Static_assert(sizeof(struct vm86_layout_nopae) <= ctob(3),
    "struct vm86_layout_nopae exceeds space allocated in locore.s");

static void
vm86_initialize_pae(void)
{
	int i;
	u_int *addr;
	struct vm86_layout_pae *vml;
	struct pcb *pcb;
	struct pcb_ext *ext;
	struct soft_segment_descriptor ssd = {
		0,			/* segment base address (overwritten) */
		0,			/* length (overwritten) */
		SDT_SYS386TSS,		/* segment type */
		0,			/* priority level */
		1,			/* descriptor present */
		0, 0,
		0,			/* default 16 size */
		0			/* granularity */
	};

	/*
	 * Below is the memory layout that we use for the vm86 region.
	 *
	 * +--------+
	 * |        |
	 * |        |
	 * | page 0 |
	 * |        | +--------+
	 * |        | | stack  |
	 * +--------+ +--------+ <--------- vm86paddr
	 * |        | |Page Tbl| 1M + 64K = 272 entries = 1088 bytes
	 * |        | +--------+
	 * |        | |  PCB   | size: ~240 bytes
	 * | page 1 | |PCB Ext | size: ~140 bytes (includes TSS)
	 * |        | +--------+
	 * |        | |int map |
	 * |        | +--------+
	 * +--------+ |        |
	 * | page 2 | |  I/O   |
	 * +--------+ | bitmap |
	 * | page 3 | |        |
	 * |        | +--------+
	 * +--------+
	 */

	/*
	 * A rudimentary PCB must be installed, in order to get to the
	 * PCB extension area.  We use the PCB area as a scratchpad for
	 * data storage, the layout of which is shown below.
	 *
	 * pcb_esi	= new PTD entry 0
	 * pcb_ebp	= pointer to frame on vm86 stack
	 * pcb_esp	=    stack frame pointer at time of switch
	 * pcb_ebx	= va of vm86 page table
	 * pcb_eip	=    argument pointer to initial call
	 * pcb_vm86[0]	=    saved TSS descriptor, word 0
	 * pcb_vm86[1]	=    saved TSS descriptor, word 1
	 */
#define new_ptd		pcb_esi
#define vm86_frame	pcb_ebp
#define pgtable_va	pcb_ebx

	vml = (struct vm86_layout_pae *)vm86paddr;
	pcb = &vml->vml_pcb;
	ext = &vml->vml_ext;

	mtx_init(&vm86_lock, "vm86 lock", NULL, MTX_DEF);

	bzero(pcb, sizeof(struct pcb));
	pcb->new_ptd = vm86pa | PG_V | PG_RW | PG_U;
	pcb->vm86_frame = vm86paddr - sizeof(struct vm86frame);
	pcb->pgtable_va = vm86paddr;
	pcb->pcb_flags = PCB_VM86CALL;
	pcb->pcb_ext = ext;

	bzero(ext, sizeof(struct pcb_ext));
	ext->ext_tss.tss_esp0 = vm86paddr;
	ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	ext->ext_tss.tss_ioopt =
		((u_int)vml->vml_iomap - (u_int)&ext->ext_tss) << 16;
	ext->ext_iomap = vml->vml_iomap;
	ext->ext_vm86.vm86_intmap = vml->vml_intmap;

	if (cpu_feature & CPUID_VME)
		ext->ext_vm86.vm86_has_vme = (rcr4() & CR4_VME ? 1 : 0);

	addr = (u_int *)ext->ext_vm86.vm86_intmap;
	for (i = 0; i < (INTMAP_SIZE + IOMAP_SIZE) / sizeof(u_int); i++)
		*addr++ = 0;
	vml->vml_iomap_trailer = 0xff;

	ssd.ssd_base = (u_int)&ext->ext_tss;
	ssd.ssd_limit = TSS_SIZE - 1;
	ssdtosd(&ssd, &ext->ext_tssd);

	vm86pcb = pcb;

#if 0
        /*
         * use whatever is leftover of the vm86 page layout as a
         * message buffer so we can capture early output.
         */
        msgbufinit((vm_offset_t)vm86paddr + sizeof(struct vm86_layout),
            ctob(3) - sizeof(struct vm86_layout));
#endif
}

static void
vm86_initialize_nopae(void)
{
	int i;
	u_int *addr;
	struct vm86_layout_nopae *vml;
	struct pcb *pcb;
	struct pcb_ext *ext;
	struct soft_segment_descriptor ssd = {
		0,			/* segment base address (overwritten) */
		0,			/* length (overwritten) */
		SDT_SYS386TSS,		/* segment type */
		0,			/* priority level */
		1,			/* descriptor present */
		0, 0,
		0,			/* default 16 size */
		0			/* granularity */
	};

	vml = (struct vm86_layout_nopae *)vm86paddr;
	pcb = &vml->vml_pcb;
	ext = &vml->vml_ext;

	mtx_init(&vm86_lock, "vm86 lock", NULL, MTX_DEF);

	bzero(pcb, sizeof(struct pcb));
	pcb->new_ptd = vm86pa | PG_V | PG_RW | PG_U;
	pcb->vm86_frame = vm86paddr - sizeof(struct vm86frame);
	pcb->pgtable_va = vm86paddr;
	pcb->pcb_flags = PCB_VM86CALL;
	pcb->pcb_ext = ext;

	bzero(ext, sizeof(struct pcb_ext));
	ext->ext_tss.tss_esp0 = vm86paddr;
	ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
	ext->ext_tss.tss_ioopt =
		((u_int)vml->vml_iomap - (u_int)&ext->ext_tss) << 16;
	ext->ext_iomap = vml->vml_iomap;
	ext->ext_vm86.vm86_intmap = vml->vml_intmap;

	if (cpu_feature & CPUID_VME)
		ext->ext_vm86.vm86_has_vme = (rcr4() & CR4_VME ? 1 : 0);

	addr = (u_int *)ext->ext_vm86.vm86_intmap;
	for (i = 0; i < (INTMAP_SIZE + IOMAP_SIZE) / sizeof(u_int); i++)
		*addr++ = 0;
	vml->vml_iomap_trailer = 0xff;

	ssd.ssd_base = (u_int)&ext->ext_tss;
	ssd.ssd_limit = TSS_SIZE - 1;
	ssdtosd(&ssd, &ext->ext_tssd);

	vm86pcb = pcb;

#if 0
        /*
         * use whatever is leftover of the vm86 page layout as a
         * message buffer so we can capture early output.
         */
        msgbufinit((vm_offset_t)vm86paddr + sizeof(struct vm86_layout),
            ctob(3) - sizeof(struct vm86_layout));
#endif
}

void
vm86_initialize(void)
{

	if (pae_mode)
		vm86_initialize_pae();
	else
		vm86_initialize_nopae();
}

vm_offset_t
vm86_getpage(struct vm86context *vmc, int pagenum)
{
	int i;

	for (i = 0; i < vmc->npages; i++)
		if (vmc->pmap[i].pte_num == pagenum)
			return (vmc->pmap[i].kva);
	return (0);
}

vm_offset_t
vm86_addpage(struct vm86context *vmc, int pagenum, vm_offset_t kva)
{
	int i, flags = 0;

	for (i = 0; i < vmc->npages; i++)
		if (vmc->pmap[i].pte_num == pagenum)
			goto overlap;

	if (vmc->npages == VM86_PMAPSIZE)
		goto full;			/* XXX grow map? */

	if (kva == 0) {
		kva = (vm_offset_t)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
		flags = VMAP_MALLOC;
	}

	i = vmc->npages++;
	vmc->pmap[i].flags = flags;
	vmc->pmap[i].kva = kva;
	vmc->pmap[i].pte_num = pagenum;
	return (kva);
overlap:
	panic("vm86_addpage: overlap");
full:
	panic("vm86_addpage: not enough room");
}

/*
 * called from vm86_bioscall, while in vm86 address space, to finalize setup.
 */
void
vm86_prepcall(struct vm86frame *vmf)
{
	struct vm86_kernel *vm86;
	uint32_t *stack;
	uint8_t *code;

	code = (void *)0xa00;
	stack = (void *)(0x1000 - 2);	/* keep aligned */
	if ((vmf->vmf_trapno & PAGE_MASK) <= 0xff) {
		/* interrupt call requested */
		code[0] = INTn;
		code[1] = vmf->vmf_trapno & 0xff;
		code[2] = HLT;
		vmf->vmf_ip = (uintptr_t)code;
		vmf->vmf_cs = 0;
	} else {
		code[0] = HLT;
		stack--;
		stack[0] = MAKE_VEC(0, (uintptr_t)code);
	}
	vmf->vmf_sp = (uintptr_t)stack;
	vmf->vmf_ss = 0;
	vmf->kernel_fs = vmf->kernel_es = vmf->kernel_ds = 0;
	vmf->vmf_eflags = PSL_VIF | PSL_VM | PSL_USER;

	vm86 = &curpcb->pcb_ext->ext_vm86;
	if (!vm86->vm86_has_vme)
		vm86->vm86_eflags = vmf->vmf_eflags;  /* save VIF, VIP */
}

/*
 * vm86 trap handler; determines whether routine succeeded or not.
 * Called while in vm86 space, returns to calling process.
 */
void
vm86_trap(struct vm86frame *vmf)
{
	void (*p)(struct vm86frame *);
	caddr_t addr;

	/* "should not happen" */
	if ((vmf->vmf_eflags & PSL_VM) == 0)
		panic("vm86_trap called, but not in vm86 mode");

	addr = MAKE_ADDR(vmf->vmf_cs, vmf->vmf_ip);
	if (*(u_char *)addr == HLT)
		vmf->vmf_trapno = vmf->vmf_eflags & PSL_C;
	else
		vmf->vmf_trapno = vmf->vmf_trapno << 16;

	p = (void (*)(struct vm86frame *))((uintptr_t)vm86_biosret +
	    setidt_disp);
	p(vmf);
}

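/*
 * Execute a real-mode BIOS interrupt with the register values supplied
 * in *vmf; results are returned in the same frame.  The call is
 * serialized by vm86_lock and runs inside a critical section.  A
 * minimal, illustrative sketch of a call that needs no data buffer
 * (assuming the vmf_ax-style register accessors from <machine/vm86.h>):
 *
 *	struct vm86frame vmf;
 *
 *	bzero(&vmf, sizeof(vmf));
 *	vmf.vmf_ax = ...;		(input registers for the service)
 *	if (vm86_intcall(0x15, &vmf) == 0)
 *		... examine vmf.vmf_ax, vmf.vmf_eflags, etc. ...
 */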
int
vm86_intcall(int intnum, struct vm86frame *vmf)
{
	int (*p)(struct vm86frame *);
	int retval;

	if (intnum < 0 || intnum > 0xff)
		return (EINVAL);

	vmf->vmf_trapno = intnum;
	p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall +
	    setidt_disp);
	mtx_lock(&vm86_lock);
	critical_enter();
	retval = p(vmf);
	critical_exit();
	mtx_unlock(&vm86_lock);
	return (retval);
}

/*
 * struct vm86context contains the page table to use when making
 * vm86 calls.  If intnum is a valid interrupt number (0-255), then
 * the "interrupt trampoline" will be used, otherwise we use the
 * caller's cs:ip routine.
 */
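/*
 * Illustrative sketch of a data call (hypothetical values; the vmf_*
 * register accessors come from <machine/vm86.h>): map a scratch page
 * into the vm86 address space, hand its real-mode address to the BIOS
 * and issue the interrupt.
 *
 *	struct vm86context vmc;
 *	struct vm86frame vmf;
 *	vm_offset_t buf;
 *
 *	bzero(&vmc, sizeof(vmc));
 *	bzero(&vmf, sizeof(vmf));
 *	buf = vm86_addpage(&vmc, 1, 0);		(page 1 = real-mode 0x1000)
 *	vm86_getptr(&vmc, buf, &vmf.vmf_es, &vmf.vmf_di);
 *	vmf.vmf_ax = ...;			(BIOS function number)
 *	if (vm86_datacall(0x10, &vmf, &vmc) == 0)
 *		... the BIOS output is now available at buf ...
 */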
int
vm86_datacall(int intnum, struct vm86frame *vmf, struct vm86context *vmc)
{
	uint64_t *pte_pae;
	uint32_t *pte_nopae;
	int (*p)(struct vm86frame *);
	vm_paddr_t page;
	int i, entry, retval;

	mtx_lock(&vm86_lock);
	if (pae_mode) {
		pte_pae = (uint64_t *)vm86paddr;
		for (i = 0; i < vmc->npages; i++) {
			page = vtophys(vmc->pmap[i].kva & PG_FRAME_PAE);
			entry = vmc->pmap[i].pte_num;
			vmc->pmap[i].old_pte = pte_pae[entry];
			pte_pae[entry] = page | PG_V | PG_RW | PG_U;
			pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
		}
	} else {
		pte_nopae = (uint32_t *)vm86paddr;
		for (i = 0; i < vmc->npages; i++) {
			page = vtophys(vmc->pmap[i].kva & PG_FRAME_NOPAE);
			entry = vmc->pmap[i].pte_num;
			vmc->pmap[i].old_pte = pte_nopae[entry];
			pte_nopae[entry] = page | PG_V | PG_RW | PG_U;
			pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
		}
	}

	vmf->vmf_trapno = intnum;
	p = (int (*)(struct vm86frame *))((uintptr_t)vm86_bioscall +
	    setidt_disp);
	critical_enter();
	retval = p(vmf);
	critical_exit();

	if (pae_mode) {
		for (i = 0; i < vmc->npages; i++) {
			entry = vmc->pmap[i].pte_num;
			pte_pae[entry] = vmc->pmap[i].old_pte;
			pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
		}
	} else {
		for (i = 0; i < vmc->npages; i++) {
			entry = vmc->pmap[i].pte_num;
			pte_nopae[entry] = vmc->pmap[i].old_pte;
			pmap_invalidate_page(kernel_pmap, vmc->pmap[i].kva);
		}
	}
	mtx_unlock(&vm86_lock);

	return (retval);
}

vm_offset_t
vm86_getaddr(struct vm86context *vmc, u_short sel, u_short off)
{
	int i, page;
	vm_offset_t addr;

	addr = (vm_offset_t)MAKE_ADDR(sel, off);
	page = addr >> PAGE_SHIFT;
	for (i = 0; i < vmc->npages; i++)
		if (page == vmc->pmap[i].pte_num)
			return (vmc->pmap[i].kva + (addr & PAGE_MASK));
	return (0);
}

int
vm86_getptr(struct vm86context *vmc, vm_offset_t kva, u_short *sel,
     u_short *off)
{
	int i;

	for (i = 0; i < vmc->npages; i++)
		if (kva >= vmc->pmap[i].kva &&
		    kva < vmc->pmap[i].kva + PAGE_SIZE) {
			*off = kva - vmc->pmap[i].kva;
			*sel = vmc->pmap[i].pte_num << 8;
			return (1);
		}
	return (0);
}

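/*
 * Backend for the i386_vm86() sysarch(2) interface: initializes the
 * per-process vm86 state (VM86_INIT), reports whether VME is enabled
 * (VM86_GET_VME), and lets suitably privileged processes issue BIOS
 * interrupts directly (VM86_INTCALL).
 */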
int
vm86_sysarch(struct thread *td, char *args)
{
	int error = 0;
	struct i386_vm86_args ua;
	struct vm86_kernel *vm86;

	if ((error = copyin(args, &ua, sizeof(struct i386_vm86_args))) != 0)
		return (error);

	if (td->td_pcb->pcb_ext == 0)
		if ((error = i386_extend_pcb(td)) != 0)
			return (error);
	vm86 = &td->td_pcb->pcb_ext->ext_vm86;

	switch (ua.sub_op) {
	case VM86_INIT: {
		struct vm86_init_args sa;

		if ((error = copyin(ua.sub_args, &sa, sizeof(sa))) != 0)
			return (error);
		if (cpu_feature & CPUID_VME)
			vm86->vm86_has_vme = (rcr4() & CR4_VME ? 1 : 0);
		else
			vm86->vm86_has_vme = 0;
		vm86->vm86_inited = 1;
		vm86->vm86_debug = sa.debug;
		bcopy(&sa.int_map, vm86->vm86_intmap, 32);
		}
		break;

#if 0
	case VM86_SET_VME: {
		struct vm86_vme_args sa;

		if ((cpu_feature & CPUID_VME) == 0)
			return (ENODEV);

		if (error = copyin(ua.sub_args, &sa, sizeof(sa)))
			return (error);
		if (sa.state)
			load_cr4(rcr4() | CR4_VME);
		else
			load_cr4(rcr4() & ~CR4_VME);
		}
		break;
#endif

	case VM86_GET_VME: {
		struct vm86_vme_args sa;

		sa.state = (rcr4() & CR4_VME ? 1 : 0);
		error = copyout(&sa, ua.sub_args, sizeof(sa));
		}
		break;

	case VM86_INTCALL: {
		struct vm86_intcall_args sa;

		if ((error = priv_check(td, PRIV_VM86_INTCALL)))
			return (error);
		if ((error = copyin(ua.sub_args, &sa, sizeof(sa))))
			return (error);
		if ((error = vm86_intcall(sa.intnum, &sa.vmf)))
			return (error);
		error = copyout(&sa, ua.sub_args, sizeof(sa));
		}
		break;

	default:
		error = EINVAL;
	}
	return (error);
}