1 /*	$NetBSD: linux_machdep.c,v 1.84 2002/12/06 03:37:19 junyoung Exp $	*/
2 
3 /*-
4  * Copyright (c) 1995, 2000 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Frank van der Linden.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the NetBSD
21  *	Foundation, Inc. and its contributors.
22  * 4. Neither the name of The NetBSD Foundation nor the names of its
23  *    contributors may be used to endorse or promote products derived
24  *    from this software without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: linux_machdep.c,v 1.84 2002/12/06 03:37:19 junyoung Exp $");
41 
42 #if defined(_KERNEL_OPT)
43 #include "opt_vm86.h"
44 #include "opt_user_ldt.h"
45 #endif
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/signalvar.h>
50 #include <sys/kernel.h>
51 #include <sys/proc.h>
52 #include <sys/user.h>
53 #include <sys/buf.h>
54 #include <sys/reboot.h>
55 #include <sys/conf.h>
56 #include <sys/exec.h>
57 #include <sys/file.h>
58 #include <sys/callout.h>
59 #include <sys/malloc.h>
60 #include <sys/mbuf.h>
61 #include <sys/msgbuf.h>
62 #include <sys/mount.h>
63 #include <sys/vnode.h>
64 #include <sys/device.h>
65 #include <sys/syscallargs.h>
66 #include <sys/filedesc.h>
67 #include <sys/exec_elf.h>
68 #include <sys/disklabel.h>
69 #include <sys/ioctl.h>
70 #include <miscfs/specfs/specdev.h>
71 
72 #include <compat/linux/common/linux_types.h>
73 #include <compat/linux/common/linux_signal.h>
74 #include <compat/linux/common/linux_util.h>
75 #include <compat/linux/common/linux_ioctl.h>
76 #include <compat/linux/common/linux_hdio.h>
77 #include <compat/linux/common/linux_exec.h>
78 #include <compat/linux/common/linux_machdep.h>
79 
80 #include <compat/linux/linux_syscallargs.h>
81 
82 #include <machine/cpu.h>
83 #include <machine/cpufunc.h>
84 #include <machine/psl.h>
85 #include <machine/reg.h>
86 #include <machine/segments.h>
87 #include <machine/specialreg.h>
88 #include <machine/sysarch.h>
89 #include <machine/vm86.h>
90 #include <machine/vmparam.h>
91 
92 /*
93  * To see whether wscons is configured (for virtual console ioctl calls).
94  */
95 #if defined(_KERNEL_OPT)
96 #include "wsdisplay.h"
97 #endif
98 #if (NWSDISPLAY > 0)
99 #include <dev/wscons/wsconsio.h>
100 #include <dev/wscons/wsdisplay_usl_io.h>
101 #if defined(_KERNEL_OPT)
102 #include "opt_xserver.h"
103 #endif
104 #endif
105 
#ifdef USER_LDT
#include <machine/cpu.h>
/*
 * Helpers behind linux_sys_modify_ldt().  Both are defined later in this
 * file and are only declared and built when USER_LDT is defined.
 */
int linux_read_ldt __P((struct proc *, struct linux_sys_modify_ldt_args *,
    register_t *));
int linux_write_ldt __P((struct proc *, struct linux_sys_modify_ldt_args *,
    register_t *));
#endif
113 
/*
 * Debug tracing.  With DEBUG_LINUX defined, DPRINTF((fmt, ...)) expands to
 * "uprintf (fmt, ...)"; otherwise it expands to nothing.  The argument list
 * must therefore be wrapped in its own pair of parentheses.
 */
#ifdef DEBUG_LINUX
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif
119 
/* Look up BIOS geometry information for an open block device (see below). */
static struct biosdisk_info *fd2biosinfo __P((struct proc *, struct file *));
/* Disk list consulted by fd2biosinfo(); defined outside this file. */
extern struct disklist *i386_alldisks;
/* Fill a Linux sigcontext from a trapframe and a signal mask. */
static void linux_savecontext __P((struct proc *, struct trapframe *,
    sigset_t *, struct linux_sigcontext *));
/* Signal delivery back ends selected by linux_sendsig(). */
static void linux_rt_sendsig __P((int, sigset_t *, u_long));
static void linux_old_sendsig __P((int, sigset_t *, u_long));

/*
 * Start addresses of the two signal trampolines; linux_rt_sendsig() uses
 * the distance between them to locate the rt trampoline.
 */
extern char linux_sigcode[], linux_rt_sigcode[];
128 /*
129  * Deal with some i386-specific things in the Linux emulation code.
130  */
131 
132 void
133 linux_setregs(p, epp, stack)
134 	struct proc *p;
135 	struct exec_package *epp;
136 	u_long stack;
137 {
138 	struct pcb *pcb = &p->p_addr->u_pcb;
139 	struct trapframe *tf;
140 
141 #if NNPX > 0
142 	/* If we were using the FPU, forget about it. */
143 	if (npxproc == p)
144 		npxdrop();
145 #endif
146 
147 #ifdef USER_LDT
148 	pmap_ldt_cleanup(p);
149 #endif
150 
151 	p->p_md.md_flags &= ~MDP_USEDFPU;
152 
153 	if (i386_use_fxsave) {
154 		pcb->pcb_savefpu.sv_xmm.sv_env.en_cw = __Linux_NPXCW__;
155 		pcb->pcb_savefpu.sv_xmm.sv_env.en_mxcsr = __INITIAL_MXCSR__;
156 	} else
157 		pcb->pcb_savefpu.sv_87.sv_env.en_cw = __Linux_NPXCW__;
158 
159 	tf = p->p_md.md_regs;
160 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
161 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
162 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
163 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
164 	tf->tf_edi = 0;
165 	tf->tf_esi = 0;
166 	tf->tf_ebp = 0;
167 	tf->tf_ebx = (int)p->p_psstr;
168 	tf->tf_edx = 0;
169 	tf->tf_ecx = 0;
170 	tf->tf_eax = 0;
171 	tf->tf_eip = epp->ep_entry;
172 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
173 	tf->tf_eflags = PSL_USERSET;
174 	tf->tf_esp = stack;
175 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
176 }
177 
178 /*
179  * Send an interrupt to process.
180  *
181  * Stack is set up to allow sigcode stored
182  * in u. to call routine, followed by kcall
183  * to sigreturn routine below.  After sigreturn
184  * resets the signal mask, the stack, and the
185  * frame pointer, it returns to the user
186  * specified pc, psl.
187  */
188 
189 void
190 linux_sendsig(sig, mask, code)
191 	int sig;
192 	sigset_t *mask;
193 	u_long code;
194 {
195 	if (SIGACTION(curproc, sig).sa_flags & SA_SIGINFO)
196 		linux_rt_sendsig(sig, mask, code);
197 	else
198 		linux_old_sendsig(sig, mask, code);
199 }
200 
201 
202 static void
203 linux_savecontext(p, tf, mask, sc)
204 	struct proc *p;
205 	struct trapframe *tf;
206 	sigset_t *mask;
207 	struct linux_sigcontext *sc;
208 {
209 	/* Save register context. */
210 #ifdef VM86
211 	if (tf->tf_eflags & PSL_VM) {
212 		sc->sc_gs = tf->tf_vm86_gs;
213 		sc->sc_fs = tf->tf_vm86_fs;
214 		sc->sc_es = tf->tf_vm86_es;
215 		sc->sc_ds = tf->tf_vm86_ds;
216 		sc->sc_eflags = get_vflags(p);
217 	} else
218 #endif
219 	{
220 		sc->sc_gs = tf->tf_gs;
221 		sc->sc_fs = tf->tf_fs;
222 		sc->sc_es = tf->tf_es;
223 		sc->sc_ds = tf->tf_ds;
224 		sc->sc_eflags = tf->tf_eflags;
225 	}
226 	sc->sc_edi = tf->tf_edi;
227 	sc->sc_esi = tf->tf_esi;
228 	sc->sc_esp = tf->tf_esp;
229 	sc->sc_ebp = tf->tf_ebp;
230 	sc->sc_ebx = tf->tf_ebx;
231 	sc->sc_edx = tf->tf_edx;
232 	sc->sc_ecx = tf->tf_ecx;
233 	sc->sc_eax = tf->tf_eax;
234 	sc->sc_eip = tf->tf_eip;
235 	sc->sc_cs = tf->tf_cs;
236 	sc->sc_esp_at_signal = tf->tf_esp;
237 	sc->sc_ss = tf->tf_ss;
238 	sc->sc_err = tf->tf_err;
239 	sc->sc_trapno = tf->tf_trapno;
240 	sc->sc_cr2 = p->p_addr->u_pcb.pcb_cr2;
241 	sc->sc_387 = NULL;
242 
243 	/* Save signal stack. */
244 	/* Linux doesn't save the onstack flag in sigframe */
245 
246 	/* Save signal mask. */
247 	native_to_linux_old_sigset(&sc->sc_mask, mask);
248 }
249 
250 static void
251 linux_rt_sendsig(sig, mask, code)
252 	int sig;
253 	sigset_t *mask;
254 	u_long code;
255 {
256 	struct proc *p = curproc;
257 	struct trapframe *tf;
258 	struct linux_rt_sigframe *fp, frame;
259 	int onstack;
260 	sig_t catcher = SIGACTION(p, sig).sa_handler;
261 	struct sigaltstack *sas = &p->p_sigctx.ps_sigstk;
262 
263 	tf = p->p_md.md_regs;
264 
265 	/* Do we need to jump onto the signal stack? */
266 	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
267 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
268 
269 
270 	/* Allocate space for the signal handler context. */
271 	if (onstack)
272 		fp = (struct linux_rt_sigframe *)((caddr_t)sas->ss_sp +
273 		    sas->ss_size);
274 	else
275 		fp = (struct linux_rt_sigframe *)tf->tf_esp;
276 	fp--;
277 
278 	DPRINTF(("rt: onstack = %d, fp = %p sig = %d eip = 0x%x\n", onstack, fp,
279 	    sig, tf->tf_eip));
280 
281 	/* Build stack frame for signal trampoline. */
282 	frame.sf_handler = catcher;
283 	frame.sf_sig = native_to_linux_signo[sig];
284 	frame.sf_sip = &fp->sf_si;
285 	frame.sf_scp = &fp->sf_sc;
286 
287 	/*
288 	 * XXX: zero siginfo out until we provide more info.
289 	 */
290 	(void)memset(&frame.sf_si, 0, sizeof(frame.sf_si));
291 
292 	/* Save register context. */
293 	linux_savecontext(p, tf, mask, &frame.sf_sc);
294 
295 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
296 		/*
297 		 * Process has trashed its stack; give it an illegal
298 		 * instruction to halt it in its tracks.
299 		 */
300 		sigexit(p, SIGILL);
301 		/* NOTREACHED */
302 	}
303 
304 	/*
305 	 * Build context to run handler in.
306 	 */
307 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
308 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
309 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
310 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
311 	tf->tf_eip = ((int)p->p_sigctx.ps_sigcode) +
312 	    (linux_rt_sigcode - linux_sigcode);
313 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
314 	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
315 	tf->tf_esp = (int)fp;
316 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
317 
318 	/* Remember that we're now on the signal stack. */
319 	if (onstack)
320 		sas->ss_flags |= SS_ONSTACK;
321 }
322 
323 static void
324 linux_old_sendsig(sig, mask, code)
325 	int sig;
326 	sigset_t *mask;
327 	u_long code;
328 {
329 	struct proc *p = curproc;
330 	struct trapframe *tf;
331 	struct linux_sigframe *fp, frame;
332 	int onstack;
333 	sig_t catcher = SIGACTION(p, sig).sa_handler;
334 	struct sigaltstack *sas = &p->p_sigctx.ps_sigstk;
335 
336 	tf = p->p_md.md_regs;
337 
338 	/* Do we need to jump onto the signal stack? */
339 	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
340 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
341 
342 	/* Allocate space for the signal handler context. */
343 	if (onstack)
344 		fp = (struct linux_sigframe *) ((caddr_t)sas->ss_sp +
345 		    sas->ss_size);
346 	else
347 		fp = (struct linux_sigframe *)tf->tf_esp;
348 	fp--;
349 
350 	DPRINTF(("old: onstack = %d, fp = %p sig = %d eip = 0x%x\n",
351 	    onstack, fp, sig, tf->tf_eip));
352 
353 	/* Build stack frame for signal trampoline. */
354 	frame.sf_handler = catcher;
355 	frame.sf_sig = native_to_linux_signo[sig];
356 
357 	linux_savecontext(p, tf, mask, &frame.sf_sc);
358 
359 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
360 		/*
361 		 * Process has trashed its stack; give it an illegal
362 		 * instruction to halt it in its tracks.
363 		 */
364 		sigexit(p, SIGILL);
365 		/* NOTREACHED */
366 	}
367 
368 	/*
369 	 * Build context to run handler in.
370 	 */
371 	tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
372 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
373 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
374 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
375 	tf->tf_eip = (int)p->p_sigctx.ps_sigcode;
376 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
377 	tf->tf_eflags &= ~(PSL_T|PSL_VM|PSL_AC);
378 	tf->tf_esp = (int)fp;
379 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
380 
381 	/* Remember that we're now on the signal stack. */
382 	if (onstack)
383 		sas->ss_flags |= SS_ONSTACK;
384 }
385 
386 /*
387  * System call to cleanup state after a signal
388  * has been taken.  Reset signal mask and
389  * stack state from context left by sendsig (above).
390  * Return to previous pc and psl as specified by
391  * context left by sendsig. Check carefully to
392  * make sure that the user has not modified the
393  * psl to gain improper privileges or to cause
394  * a machine fault.
395  */
/*
 * Linux rt_sigreturn(2).  Not implemented: every call fails with ENOSYS
 * and none of the arguments is examined.
 */
int
linux_sys_rt_sigreturn(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{

	/* XXX XAX write me */
	return ENOSYS;
}
405 
/*
 * Linux sigreturn(2) for the old (non-rt) signal frame.
 *
 * Copies the Linux sigcontext named by the syscall argument in from user
 * space and restores the trapframe, the alternate-stack state and the
 * signal mask from it.
 *
 * Returns EFAULT if the context cannot be read, EINVAL if the saved flags
 * or code segment fail the checks below, and EJUSTRETURN on success.
 */
int
linux_sys_sigreturn(p, v, retval)
	struct proc *p;
	void *v;
	register_t *retval;
{
	struct linux_sys_sigreturn_args /* {
		syscallarg(struct linux_sigcontext *) scp;
	} */ *uap = v;
	struct linux_sigcontext *scp, context;
	struct trapframe *tf;
	sigset_t mask;
	ssize_t ss_gap;
	struct sigaltstack *sas = &p->p_sigctx.ps_sigstk;

	/*
	 * The trampoline code hands us the context.
	 * It is unsafe to keep track of it ourselves, in the event that a
	 * program jumps out of a signal handler.
	 */
	scp = SCARG(uap, scp);
	if (copyin((caddr_t)scp, &context, sizeof(*scp)) != 0)
		return EFAULT;

	/* Restore register context. */
	tf = p->p_md.md_regs;

	DPRINTF(("sigreturn enter esp=%x eip=%x\n", tf->tf_esp, tf->tf_eip));
#ifdef VM86
	if (context.sc_eflags & PSL_VM) {
		void syscall_vm86 __P((struct trapframe));

		/*
		 * Returning into vm86 mode: the segment values go into the
		 * vm86 slots of the trapframe, the flags are installed via
		 * set_vflags(), and syscall_vm86 becomes the process's
		 * syscall handler.
		 */
		tf->tf_vm86_gs = context.sc_gs;
		tf->tf_vm86_fs = context.sc_fs;
		tf->tf_vm86_es = context.sc_es;
		tf->tf_vm86_ds = context.sc_ds;
		set_vflags(p, context.sc_eflags);
		p->p_md.md_syscall = syscall_vm86;
	} else
#endif
	{
		/*
		 * Check for security violations.  If we're returning to
		 * protected mode, the CPU will validate the segment registers
		 * automatically and generate a trap on violations.  We handle
		 * the trap, rather than doing all of the checking here.
		 */
		if (((context.sc_eflags ^ tf->tf_eflags) & PSL_USERSTATIC) != 0 ||
		    !USERMODE(context.sc_cs, context.sc_eflags))
			return EINVAL;

		tf->tf_gs = context.sc_gs;
		tf->tf_fs = context.sc_fs;
		tf->tf_es = context.sc_es;
		tf->tf_ds = context.sc_ds;
#ifdef VM86
		/*
		 * The old flags still say vm86 but the restored ones do not:
		 * run the emulation's syscall_intern hook.  This test must
		 * stay ahead of the tf_eflags assignment below.
		 * NOTE(review): presumably this reinstalls the regular
		 * syscall handler replaced by syscall_vm86 -- confirm.
		 */
		if (tf->tf_eflags & PSL_VM)
			(*p->p_emul->e_syscall_intern)(p);
#endif
		tf->tf_eflags = context.sc_eflags;
	}
	tf->tf_edi = context.sc_edi;
	tf->tf_esi = context.sc_esi;
	tf->tf_ebp = context.sc_ebp;
	tf->tf_ebx = context.sc_ebx;
	tf->tf_edx = context.sc_edx;
	tf->tf_ecx = context.sc_ecx;
	tf->tf_eax = context.sc_eax;
	tf->tf_eip = context.sc_eip;
	tf->tf_cs = context.sc_cs;
	/* The stack pointer comes from sc_esp_at_signal, not sc_esp. */
	tf->tf_esp = context.sc_esp_at_signal;
	tf->tf_ss = context.sc_ss;

	/* Restore signal stack. */
	/*
	 * Linux really does it this way; it doesn't have space in sigframe
	 * to save the onstack flag.
	 */
	/*
	 * The flag is therefore recomputed: it is set exactly when the
	 * restored stack pointer lies inside [ss_sp, ss_sp + ss_size).
	 */
	ss_gap = (ssize_t)
	    ((caddr_t) context.sc_esp_at_signal - (caddr_t) sas->ss_sp);
	if (ss_gap >= 0 && ss_gap < sas->ss_size)
		sas->ss_flags |= SS_ONSTACK;
	else
		sas->ss_flags &= ~SS_ONSTACK;

	/* Restore signal mask. */
	linux_old_to_native_sigset(&mask, &context.sc_mask);
	(void) sigprocmask1(p, SIG_SETMASK, &mask, 0);
	DPRINTF(("sigreturn exit esp=%x eip=%x\n", tf->tf_esp, tf->tf_eip));
	return EJUSTRETURN;
}
497 
498 #ifdef USER_LDT
499 
500 int
501 linux_read_ldt(p, uap, retval)
502 	struct proc *p;
503 	struct linux_sys_modify_ldt_args /* {
504 		syscallarg(int) func;
505 		syscallarg(void *) ptr;
506 		syscallarg(size_t) bytecount;
507 	} */ *uap;
508 	register_t *retval;
509 {
510 	struct i386_get_ldt_args gl;
511 	int error;
512 	caddr_t sg;
513 	char *parms;
514 
515 	DPRINTF(("linux_read_ldt!"));
516 	sg = stackgap_init(p, 0);
517 
518 	gl.start = 0;
519 	gl.desc = SCARG(uap, ptr);
520 	gl.num = SCARG(uap, bytecount) / sizeof(union descriptor);
521 
522 	parms = stackgap_alloc(p, &sg, sizeof(gl));
523 
524 	if ((error = copyout(&gl, parms, sizeof(gl))) != 0)
525 		return (error);
526 
527 	if ((error = i386_get_ldt(p, parms, retval)) != 0)
528 		return (error);
529 
530 	*retval *= sizeof(union descriptor);
531 	return (0);
532 }
533 
/*
 * Argument block for the modify_ldt(2) write functions.  It is copied in
 * from user space by linux_write_ldt(), which requires the caller's byte
 * count to equal sizeof(struct linux_ldt_info).  The per-field notes
 * describe how linux_write_ldt() uses each member.
 */
struct linux_ldt_info {
	u_int entry_number;		/* LDT slot to set; must be < 8192 */
	u_long base_addr;		/* split into sd_lobase/sd_hibase */
	u_int limit;			/* split into sd_lolimit/sd_hilimit */
	u_int seg_32bit:1;		/* copied to sd_def32 */
	u_int contents:2;		/* shifted into sd_type */
	u_int read_exec_only:1;		/* inverted into sd_type */
	u_int limit_in_pages:1;		/* copied to sd_gran */
	u_int seg_not_present:1;	/* inverted into sd_p */
	u_int useable:1;		/* copied to sd_xx unless oldmode */
};
545 
546 int
547 linux_write_ldt(p, uap, retval)
548 	struct proc *p;
549 	struct linux_sys_modify_ldt_args /* {
550 		syscallarg(int) func;
551 		syscallarg(void *) ptr;
552 		syscallarg(size_t) bytecount;
553 	} */ *uap;
554 	register_t *retval;
555 {
556 	struct linux_ldt_info ldt_info;
557 	struct segment_descriptor sd;
558 	struct i386_set_ldt_args sl;
559 	int error;
560 	caddr_t sg;
561 	char *parms;
562 	int oldmode = (int)retval[0];
563 
564 	DPRINTF(("linux_write_ldt %d\n", oldmode));
565 	if (SCARG(uap, bytecount) != sizeof(ldt_info))
566 		return (EINVAL);
567 	if ((error = copyin(SCARG(uap, ptr), &ldt_info, sizeof(ldt_info))) != 0)
568 		return error;
569 	if (ldt_info.entry_number >= 8192)
570 		return (EINVAL);
571 	if (ldt_info.contents == 3) {
572 		if (oldmode)
573 			return (EINVAL);
574 		if (ldt_info.seg_not_present)
575 			return (EINVAL);
576 	}
577 
578 	if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
579 	    (oldmode || (ldt_info.contents == 0 &&
580 	    ldt_info.read_exec_only == 1 && ldt_info.seg_32bit == 0 &&
581 	    ldt_info.limit_in_pages == 0 && ldt_info.seg_not_present == 1 &&
582 	    ldt_info.useable == 0))) {
583 		/* this means you should zero the ldt */
584 		(void)memset(&sd, 0, sizeof(sd));
585 	} else {
586 		sd.sd_lobase = ldt_info.base_addr & 0xffffff;
587 		sd.sd_hibase = (ldt_info.base_addr >> 24) & 0xff;
588 		sd.sd_lolimit = ldt_info.limit & 0xffff;
589 		sd.sd_hilimit = (ldt_info.limit >> 16) & 0xf;
590 		sd.sd_type = 16 | (ldt_info.contents << 2) |
591 		    (!ldt_info.read_exec_only << 1);
592 		sd.sd_dpl = SEL_UPL;
593 		sd.sd_p = !ldt_info.seg_not_present;
594 		sd.sd_def32 = ldt_info.seg_32bit;
595 		sd.sd_gran = ldt_info.limit_in_pages;
596 		if (!oldmode)
597 			sd.sd_xx = ldt_info.useable;
598 		else
599 			sd.sd_xx = 0;
600 	}
601 	sg = stackgap_init(p, 0);
602 	sl.start = ldt_info.entry_number;
603 	sl.desc = stackgap_alloc(p, &sg, sizeof(sd));
604 	sl.num = 1;
605 
606 	DPRINTF(("linux_write_ldt: idx=%d, base=0x%lx, limit=0x%x\n",
607 	    ldt_info.entry_number, ldt_info.base_addr, ldt_info.limit));
608 
609 	parms = stackgap_alloc(p, &sg, sizeof(sl));
610 
611 	if ((error = copyout(&sd, sl.desc, sizeof(sd))) != 0)
612 		return (error);
613 	if ((error = copyout(&sl, parms, sizeof(sl))) != 0)
614 		return (error);
615 
616 	if ((error = i386_set_ldt(p, parms, retval)) != 0)
617 		return (error);
618 
619 	*retval = 0;
620 	return (0);
621 }
622 
623 #endif /* USER_LDT */
624 
625 int
626 linux_sys_modify_ldt(p, v, retval)
627 	struct proc *p;
628 	void *v;
629 	register_t *retval;
630 {
631 	struct linux_sys_modify_ldt_args /* {
632 		syscallarg(int) func;
633 		syscallarg(void *) ptr;
634 		syscallarg(size_t) bytecount;
635 	} */ *uap = v;
636 
637 	switch (SCARG(uap, func)) {
638 #ifdef USER_LDT
639 	case 0:
640 		return linux_read_ldt(p, uap, retval);
641 	case 1:
642 		retval[0] = 1;
643 		return linux_write_ldt(p, uap, retval);
644 	case 2:
645 #ifdef notyet
646 		return (linux_read_default_ldt(p, uap, retval);
647 #else
648 		return (ENOSYS);
649 #endif
650 	case 0x11:
651 		retval[0] = 0;
652 		return linux_write_ldt(p, uap, retval);
653 #endif /* USER_LDT */
654 
655 	default:
656 		return (ENOSYS);
657 	}
658 }
659 
/*
 * XXX Hack to keep svgalib happy: report an opened wscons VT under the
 * Linux console major number.
 *
 * When `raw' is non-zero, wsdisplay support is configured, and `dev' is
 * served by the wsdisplay character device switch, the result is
 * makedev(LINUX_CONS_MAJOR, minor(dev) + 1).  In every other case `dev'
 * is returned untouched.
 *
 * Should probably do it 'wrong the right way' and use a mapping array
 * for all major device numbers, and map linux_mknod too.
 */
dev_t
linux_fakedev(dev, raw)
	dev_t dev;
	int raw;
{
#if (NWSDISPLAY > 0)
	extern const struct cdevsw wsdisplay_cdevsw;

	if (raw && cdevsw_lookup(dev) == &wsdisplay_cdevsw)
		return makedev(LINUX_CONS_MAJOR, (minor(dev) + 1));
#endif

	return dev;
}
681 
682 #if (NWSDISPLAY > 0)
683 /*
684  * That's not complete, but enough to get an X server running.
685  */
/*
 * Fixed key tables handed out by the LINUX_KDGKBENT ioctl in
 * linux_machdepioctl().  Each table has NR_KEYS entries; the ioctl
 * rejects any index >= NR_KEYS and returns the selected entry verbatim
 * as the key value.
 * NOTE(review): the entries look like Linux keymap codes (type in the
 * high byte, value in the low byte) -- confirm before editing any.
 */
#define NR_KEYS 128
static const u_short plain_map[NR_KEYS] = {
	0x0200,	0x001b,	0x0031,	0x0032,	0x0033,	0x0034,	0x0035,	0x0036,
	0x0037,	0x0038,	0x0039,	0x0030,	0x002d,	0x003d,	0x007f,	0x0009,
	0x0b71,	0x0b77,	0x0b65,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
	0x0b6f,	0x0b70,	0x005b,	0x005d,	0x0201,	0x0702,	0x0b61,	0x0b73,
	0x0b64,	0x0b66,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x003b,
	0x0027,	0x0060,	0x0700,	0x005c,	0x0b7a,	0x0b78,	0x0b63,	0x0b76,
	0x0b62,	0x0b6e,	0x0b6d,	0x002c,	0x002e,	0x002f,	0x0700,	0x030c,
	0x0703,	0x0020,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0209,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003c,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
}, shift_map[NR_KEYS] = {
	0x0200,	0x001b,	0x0021,	0x0040,	0x0023,	0x0024,	0x0025,	0x005e,
	0x0026,	0x002a,	0x0028,	0x0029,	0x005f,	0x002b,	0x007f,	0x0009,
	0x0b51,	0x0b57,	0x0b45,	0x0b52,	0x0b54,	0x0b59,	0x0b55,	0x0b49,
	0x0b4f,	0x0b50,	0x007b,	0x007d,	0x0201,	0x0702,	0x0b41,	0x0b53,
	0x0b44,	0x0b46,	0x0b47,	0x0b48,	0x0b4a,	0x0b4b,	0x0b4c,	0x003a,
	0x0022,	0x007e,	0x0700,	0x007c,	0x0b5a,	0x0b58,	0x0b43,	0x0b56,
	0x0b42,	0x0b4e,	0x0b4d,	0x003c,	0x003e,	0x003f,	0x0700,	0x030c,
	0x0703,	0x0020,	0x0207,	0x010a,	0x010b,	0x010c,	0x010d,	0x010e,
	0x010f,	0x0110,	0x0111,	0x0112,	0x0113,	0x0213,	0x0203,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003e,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
	0x020b,	0x0601,	0x0602,	0x0117,	0x0600,	0x020a,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
}, altgr_map[NR_KEYS] = {
	0x0200,	0x0200,	0x0200,	0x0040,	0x0200,	0x0024,	0x0200,	0x0200,
	0x007b,	0x005b,	0x005d,	0x007d,	0x005c,	0x0200,	0x0200,	0x0200,
	0x0b71,	0x0b77,	0x0918,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
	0x0b6f,	0x0b70,	0x0200,	0x007e,	0x0201,	0x0702,	0x0914,	0x0b73,
	0x0917,	0x0919,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x0200,
	0x0200,	0x0200,	0x0700,	0x0200,	0x0b7a,	0x0b78,	0x0916,	0x0b76,
	0x0915,	0x0b6e,	0x0b6d,	0x0200,	0x0200,	0x0200,	0x0700,	0x030c,
	0x0703,	0x0200,	0x0207,	0x050c,	0x050d,	0x050e,	0x050f,	0x0510,
	0x0511,	0x0512,	0x0513,	0x0514,	0x0515,	0x0208,	0x0202,	0x0911,
	0x0912,	0x0913,	0x030b,	0x090e,	0x090f,	0x0910,	0x030a,	0x090b,
	0x090c,	0x090d,	0x090a,	0x0310,	0x0206,	0x0200,	0x007c,	0x0516,
	0x0517,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
}, ctrl_map[NR_KEYS] = {
	0x0200,	0x0200,	0x0200,	0x0000,	0x001b,	0x001c,	0x001d,	0x001e,
	0x001f,	0x007f,	0x0200,	0x0200,	0x001f,	0x0200,	0x0008,	0x0200,
	0x0011,	0x0017,	0x0005,	0x0012,	0x0014,	0x0019,	0x0015,	0x0009,
	0x000f,	0x0010,	0x001b,	0x001d,	0x0201,	0x0702,	0x0001,	0x0013,
	0x0004,	0x0006,	0x0007,	0x0008,	0x000a,	0x000b,	0x000c,	0x0200,
	0x0007,	0x0000,	0x0700,	0x001c,	0x001a,	0x0018,	0x0003,	0x0016,
	0x0002,	0x000e,	0x000d,	0x0200,	0x020e,	0x007f,	0x0700,	0x030c,
	0x0703,	0x0000,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0204,	0x0307,
	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x0200,	0x010a,
	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
};
756 
/*
 * Table of key maps, indexed by the kb_table field of a LINUX_KDGKBENT
 * request: 0 = plain, 1 = shift, 2 and 3 = altgr, 4 = ctrl.
 * NOTE(review): slot 3 reuses altgr_map, presumably as a stand-in for a
 * shift+altgr map that does not exist here -- confirm.
 */
const u_short * const linux_keytabs[] = {
	plain_map, shift_map, altgr_map, altgr_map, ctrl_map
};
760 #endif
761 
762 static struct biosdisk_info *
763 fd2biosinfo(p, fp)
764 	struct proc *p;
765 	struct file *fp;
766 {
767 	struct vnode *vp;
768 	const char *blkname;
769 	char diskname[16];
770 	int i;
771 	struct nativedisk_info *nip;
772 	struct disklist *dl = i386_alldisks;
773 
774 	if (fp->f_type != DTYPE_VNODE)
775 		return NULL;
776 	vp = (struct vnode *)fp->f_data;
777 
778 	if (vp->v_type != VBLK)
779 		return NULL;
780 
781 	blkname = devsw_blk2name(major(vp->v_rdev));
782 	snprintf(diskname, sizeof diskname, "%s%u", blkname,
783 	    DISKUNIT(vp->v_rdev));
784 
785 	for (i = 0; i < dl->dl_nnativedisks; i++) {
786 		nip = &dl->dl_nativedisks[i];
787 		if (strcmp(diskname, nip->ni_devname))
788 			continue;
789 		if (nip->ni_nmatches != 0)
790 			return &dl->dl_biosdisks[nip->ni_biosmatches[0]];
791 	}
792 
793 	return NULL;
794 }
795 
796 
797 /*
798  * We come here in a last attempt to satisfy a Linux ioctl() call
799  */
800 int
801 linux_machdepioctl(p, v, retval)
802 	struct proc *p;
803 	void *v;
804 	register_t *retval;
805 {
806 	struct linux_sys_ioctl_args /* {
807 		syscallarg(int) fd;
808 		syscallarg(u_long) com;
809 		syscallarg(caddr_t) data;
810 	} */ *uap = v;
811 	struct sys_ioctl_args bia;
812 	u_long com;
813 	int error, error1;
814 #if (NWSDISPLAY > 0)
815 	struct vt_mode lvt;
816 	caddr_t bvtp, sg;
817 	struct kbentry kbe;
818 #endif
819 	struct linux_hd_geometry hdg;
820 	struct linux_hd_big_geometry hdg_big;
821 	struct biosdisk_info *bip;
822 	struct filedesc *fdp;
823 	struct file *fp;
824 	int fd;
825 	struct disklabel label, *labp;
826 	struct partinfo partp;
827 	int (*ioctlf) __P((struct file *, u_long, caddr_t, struct proc *));
828 	u_long start, biostotal, realtotal;
829 	u_char heads, sectors;
830 	u_int cylinders;
831 	struct ioctl_pt pt;
832 
833 	fd = SCARG(uap, fd);
834 	SCARG(&bia, fd) = fd;
835 	SCARG(&bia, data) = SCARG(uap, data);
836 	com = SCARG(uap, com);
837 
838 	fdp = p->p_fd;
839 
840 	if ((fp = fd_getfile(fdp, fd)) == NULL)
841 		return (EBADF);
842 
843 	switch (com) {
844 #if (NWSDISPLAY > 0)
845 	case LINUX_KDGKBMODE:
846 		com = KDGKBMODE;
847 		break;
848 	case LINUX_KDSKBMODE:
849 		com = KDSKBMODE;
850 		if ((unsigned)SCARG(uap, data) == LINUX_K_MEDIUMRAW)
851 			SCARG(&bia, data) = (caddr_t)K_RAW;
852 		break;
853 	case LINUX_KIOCSOUND:
854 		SCARG(&bia, data) =
855 		    (caddr_t)(((unsigned long)SCARG(&bia, data)) & 0xffff);
856 		/* fall through */
857 	case LINUX_KDMKTONE:
858 		com = KDMKTONE;
859 		break;
860 	case LINUX_KDSETMODE:
861 		com = KDSETMODE;
862 		break;
863 	case LINUX_KDGETMODE:
864 		/* KD_* values are equal to the wscons numbers */
865 		com = WSDISPLAYIO_GMODE;
866 		break;
867 	case LINUX_KDENABIO:
868 		com = KDENABIO;
869 		break;
870 	case LINUX_KDDISABIO:
871 		com = KDDISABIO;
872 		break;
873 	case LINUX_KDGETLED:
874 		com = KDGETLED;
875 		break;
876 	case LINUX_KDSETLED:
877 		com = KDSETLED;
878 		break;
879 	case LINUX_VT_OPENQRY:
880 		com = VT_OPENQRY;
881 		break;
882 	case LINUX_VT_GETMODE:
883 		SCARG(&bia, com) = VT_GETMODE;
884 		if ((error = sys_ioctl(p, &bia, retval)))
885 			return error;
886 		if ((error = copyin(SCARG(uap, data), (caddr_t)&lvt,
887 		    sizeof (struct vt_mode))))
888 			return error;
889 		lvt.relsig = native_to_linux_signo[lvt.relsig];
890 		lvt.acqsig = native_to_linux_signo[lvt.acqsig];
891 		lvt.frsig = native_to_linux_signo[lvt.frsig];
892 		return copyout((caddr_t)&lvt, SCARG(uap, data),
893 		    sizeof (struct vt_mode));
894 	case LINUX_VT_SETMODE:
895 		com = VT_SETMODE;
896 		if ((error = copyin(SCARG(uap, data), (caddr_t)&lvt,
897 		    sizeof (struct vt_mode))))
898 			return error;
899 		lvt.relsig = linux_to_native_signo[lvt.relsig];
900 		lvt.acqsig = linux_to_native_signo[lvt.acqsig];
901 		lvt.frsig = linux_to_native_signo[lvt.frsig];
902 		sg = stackgap_init(p, 0);
903 		bvtp = stackgap_alloc(p, &sg, sizeof (struct vt_mode));
904 		if ((error = copyout(&lvt, bvtp, sizeof (struct vt_mode))))
905 			return error;
906 		SCARG(&bia, data) = bvtp;
907 		break;
908 	case LINUX_VT_DISALLOCATE:
909 		/* XXX should use WSDISPLAYIO_DELSCREEN */
910 		return 0;
911 	case LINUX_VT_RELDISP:
912 		com = VT_RELDISP;
913 		break;
914 	case LINUX_VT_ACTIVATE:
915 		com = VT_ACTIVATE;
916 		break;
917 	case LINUX_VT_WAITACTIVE:
918 		com = VT_WAITACTIVE;
919 		break;
920 	case LINUX_VT_GETSTATE:
921 		com = VT_GETSTATE;
922 		break;
923 	case LINUX_KDGKBTYPE:
924 		/* This is what Linux does. */
925 		return (subyte(SCARG(uap, data), KB_101));
926 	case LINUX_KDGKBENT:
927 		/*
928 		 * The Linux KDGKBENT ioctl is different from the
929 		 * SYSV original. So we handle it in machdep code.
930 		 * XXX We should use keyboard mapping information
931 		 * from wsdisplay, but this would be expensive.
932 		 */
933 		if ((error = copyin(SCARG(uap, data), &kbe,
934 				    sizeof(struct kbentry))))
935 			return (error);
936 		if (kbe.kb_table >= sizeof(linux_keytabs) / sizeof(u_short *)
937 		    || kbe.kb_index >= NR_KEYS)
938 			return (EINVAL);
939 		kbe.kb_value = linux_keytabs[kbe.kb_table][kbe.kb_index];
940 		return (copyout(&kbe, SCARG(uap, data),
941 				sizeof(struct kbentry)));
942 #endif
943 	case LINUX_HDIO_GETGEO:
944 	case LINUX_HDIO_GETGEO_BIG:
945 		/*
946 		 * Try to mimic Linux behaviour: return the BIOS geometry
947 		 * if possible (extending its # of cylinders if it's beyond
948 		 * the 1023 limit), fall back to the MI geometry (i.e.
949 		 * the real geometry) if not found, by returning an
950 		 * error. See common/linux_hdio.c
951 		 */
952 		FILE_USE(fp);
953 		bip = fd2biosinfo(p, fp);
954 		ioctlf = fp->f_ops->fo_ioctl;
955 		error = ioctlf(fp, DIOCGDEFLABEL, (caddr_t)&label, p);
956 		error1 = ioctlf(fp, DIOCGPART, (caddr_t)&partp, p);
957 		FILE_UNUSE(fp, p);
958 		if (error != 0 && error1 != 0)
959 			return error1;
960 		labp = error != 0 ? &label : partp.disklab;
961 		start = error1 != 0 ? partp.part->p_offset : 0;
962 		if (bip != NULL && bip->bi_head != 0 && bip->bi_sec != 0
963 		    && bip->bi_cyl != 0) {
964 			heads = bip->bi_head;
965 			sectors = bip->bi_sec;
966 			cylinders = bip->bi_cyl;
967 			biostotal = heads * sectors * cylinders;
968 			realtotal = labp->d_ntracks * labp->d_nsectors *
969 			    labp->d_ncylinders;
970 			if (realtotal > biostotal)
971 				cylinders = realtotal / (heads * sectors);
972 		} else {
973 			heads = labp->d_ntracks;
974 			cylinders = labp->d_ncylinders;
975 			sectors = labp->d_nsectors;
976 		}
977 		if (com == LINUX_HDIO_GETGEO) {
978 			hdg.start = start;
979 			hdg.heads = heads;
980 			hdg.cylinders = cylinders;
981 			hdg.sectors = sectors;
982 			return copyout(&hdg, SCARG(uap, data), sizeof hdg);
983 		} else {
984 			hdg_big.start = start;
985 			hdg_big.heads = heads;
986 			hdg_big.cylinders = cylinders;
987 			hdg_big.sectors = sectors;
988 			return copyout(&hdg_big, SCARG(uap, data),
989 			    sizeof hdg_big);
990 		}
991 
992 	default:
993 		/*
994 		 * Unknown to us. If it's on a device, just pass it through
995 		 * using PTIOCLINUX, the device itself might be able to
996 		 * make some sense of it.
997 		 * XXX hack: if the function returns EJUSTRETURN,
998 		 * it has stuffed a sysctl return value in pt.data.
999 		 */
1000 		FILE_USE(fp);
1001 		ioctlf = fp->f_ops->fo_ioctl;
1002 		pt.com = SCARG(uap, com);
1003 		pt.data = SCARG(uap, data);
1004 		error = ioctlf(fp, PTIOCLINUX, (caddr_t)&pt, p);
1005 		FILE_UNUSE(fp, p);
1006 		if (error == EJUSTRETURN) {
1007 			retval[0] = (register_t)pt.data;
1008 			error = 0;
1009 		}
1010 
1011 		if (error == ENOTTY)
1012 			DPRINTF(("linux_machdepioctl: invalid ioctl %08lx\n",
1013 			    com));
1014 		return error;
1015 	}
1016 	SCARG(&bia, com) = com;
1017 	return sys_ioctl(p, &bia, retval);
1018 }
1019 
1020 /*
1021  * Set I/O permissions for a process. Just set the maximum level
1022  * right away (ignoring the argument), otherwise we would have
1023  * to rely on I/O permission maps, which are not implemented.
1024  */
1025 int
1026 linux_sys_iopl(p, v, retval)
1027 	struct proc *p;
1028 	void *v;
1029 	register_t *retval;
1030 {
1031 #if 0
1032 	struct linux_sys_iopl_args /* {
1033 		syscallarg(int) level;
1034 	} */ *uap = v;
1035 #endif
1036 	struct trapframe *fp = p->p_md.md_regs;
1037 
1038 	if (suser(p->p_ucred, &p->p_acflag) != 0)
1039 		return EPERM;
1040 	fp->tf_eflags |= PSL_IOPL;
1041 	*retval = 0;
1042 	return 0;
1043 }
1044 
1045 /*
1046  * See above. If a root process tries to set access to an I/O port,
1047  * just let it have the whole range.
1048  */
1049 int
1050 linux_sys_ioperm(p, v, retval)
1051 	struct proc *p;
1052 	void *v;
1053 	register_t *retval;
1054 {
1055 	struct linux_sys_ioperm_args /* {
1056 		syscallarg(unsigned int) lo;
1057 		syscallarg(unsigned int) hi;
1058 		syscallarg(int) val;
1059 	} */ *uap = v;
1060 	struct trapframe *fp = p->p_md.md_regs;
1061 
1062 	if (suser(p->p_ucred, &p->p_acflag) != 0)
1063 		return EPERM;
1064 	if (SCARG(uap, val))
1065 		fp->tf_eflags |= PSL_IOPL;
1066 	*retval = 0;
1067 	return 0;
1068 }
1069