/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
 * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
 * $DragonFly: src/sys/kern/sys_generic.c,v 1.47 2008/01/10 22:30:27 nth Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/kern_syscall.h>
#include <sys/malloc.h>
#include <sys/mapped_ioctl.h>
#include <sys/poll.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/buf.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <sys/file2.h>

#include <machine/limits.h>

static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");

static int 	doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
			struct timeval *tv, int *res);
static int	pollscan (struct proc *, struct pollfd *, u_int, int *);
static int	selscan (struct proc *, fd_mask **, fd_mask **,
			int, int *);
static int	dofileread(int, struct file *, struct uio *, int, int *);
static int	dofilewrite(int, struct file *, struct uio *, int, int *);

/*
 * Read system call.
 *
 * MPSAFE
 */
int
sys_read(struct read_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);
	return(error);
}

/*
 * Positioned read (pread) system call.
 *
 * MPSAFE
 */
int
sys_extpread(struct extpread_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);
	return(error);
}

/*
 * Scatter read system call.
 *
 * MPSAFE
 */
int
sys_readv(struct readv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Scatter positioned read system call.
 *
 * MPSAFE
 */
int
sys_extpreadv(struct extpreadv_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_preadv(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FREAD);
	if (fp == NULL)
		return (EBADF);
	if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else if (auio->uio_resid < 0) {
		error = EINVAL;
	} else {
		error = dofileread(fd, fp, auio, flags, res);
	}
	fdrop(fp);
	return(error);
}

/*
 * Common code for readv and preadv that reads data in
 * from a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofileread(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	error = fo_read(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(p, fd, UIO_READ, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Write system call
 *
 * MPSAFE
 */
int
sys_write(struct write_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = -1;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	return(error);
}

/*
 * Pwrite system call
 *
 * MPSAFE
 */
int
sys_extpwrite(struct extpwrite_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov;
	int error;
	int flags;

	aiov.iov_base = (void *)(uintptr_t)uap->buf;
	aiov.iov_len = uap->nbyte;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = uap->offset;
	auio.uio_resid = uap->nbyte;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	if (auio.uio_resid < 0)
		error = EINVAL;
	else
		error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	return(error);
}

/*
 * MPSAFE
 */
int
sys_writev(struct writev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = -1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return (error);
}


/*
 * Gather positioned write system call
 *
 * MPSAFE
 */
int
sys_extpwritev(struct extpwritev_args *uap)
{
	struct thread *td = curthread;
	struct uio auio;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error;
	int flags;

	error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
			     &auio.uio_resid);
	if (error)
		return (error);
	auio.uio_iov = iov;
	auio.uio_iovcnt = uap->iovcnt;
	auio.uio_offset = uap->offset;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;

	flags = uap->flags & O_FMASK;
	if (uap->offset != (off_t)-1)
		flags |= O_FOFFSET;

	error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_result);

	iovec_free(&iov, aiov);
	return(error);
}

/*
 * MPSAFE
 */
int
kern_pwritev(int fd, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp;
	int error;

	KKASSERT(p);

	fp = holdfp(p->p_fd, fd, FWRITE);
	if (fp == NULL)
		return (EBADF);
	else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
		error = ESPIPE;
	} else {
		error = dofilewrite(fd, fp, auio, flags, res);
	}

	fdrop(fp);
	return (error);
}

/*
 * Common code for writev and pwritev that writes data to
 * a file using the passed in uio, offset, and flags.
 *
 * MPALMOSTSAFE - ktrace needs help
 */
static int
dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, int *res)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct proc *p = td->td_proc;
	int error;
	int len;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
	struct uio ktruio;
#endif

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec and uio
	 */
	if (KTRPOINT(td, KTR_GENIO))  {
		int iovlen = auio->uio_iovcnt * sizeof(struct iovec);

		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
		ktruio = *auio;
	}
#endif
	len = auio->uio_resid;
	if (fp->f_type == DTYPE_VNODE)
		bwillwrite();
	error = fo_write(fp, auio, fp->f_cred, flags);
	if (error) {
		if (auio->uio_resid != len && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* Socket layer is responsible for issuing SIGPIPE. */
		if (error == EPIPE) {
			get_mplock();
			lwpsignal(p, lp, SIGPIPE);
			rel_mplock();
		}
	}
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0) {
			ktruio.uio_iov = ktriov;
			ktruio.uio_resid = len - auio->uio_resid;
			get_mplock();
			ktrgenio(p, fd, UIO_WRITE, &ktruio, error);
			rel_mplock();
		}
		FREE(ktriov, M_TEMP);
	}
#endif
	if (error == 0)
		*res = len - auio->uio_resid;

	return(error);
}

/*
 * Ioctl system call
 */
/* ARGSUSED */
int
sys_ioctl(struct ioctl_args *uap)
{
	return(mapped_ioctl(uap->fd, uap->com, uap->data, NULL));
}

struct ioctl_map_entry {
	const char *subsys;
	struct ioctl_map_range *cmd_ranges;
	LIST_ENTRY(ioctl_map_entry) entries;
};

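/*
 * A map entry's cmd_ranges points at an array of ioctl_map_range
 * structures terminated by an all-zero sentinel (start, maptocmd,
 * wrapfunc and mapfunc all zero/NULL), which is exactly what the scan
 * loop in mapped_ioctl() below tests for.  A hypothetical table,
 * sketched using only the members this file touches (foo_mapfunc is a
 * placeholder, not a real symbol):
 *
 *	static struct ioctl_map_range foo_ranges[] = {
 *		{ .start = 0x6500, .end = 0x65ff,
 *		  .maptocmd = 0x9500, .maptoend = 0x95ff,
 *		  .mapfunc = foo_mapfunc },
 *		{ 0 }
 *	};
 */
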
539  * The true heart of all ioctl syscall handlers (native, emulation).
540  * If map != NULL, it will be searched for a matching entry for com,
541  * and appropriate conversions/conversion functions will be utilized.
542  */
543 int
544 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map)
545 {
546 	struct thread *td = curthread;
547 	struct proc *p = td->td_proc;
548 	struct ucred *cred;
549 	struct file *fp;
550 	struct ioctl_map_range *iomc = NULL;
551 	int error;
552 	u_int size;
553 	u_long ocom = com;
554 	caddr_t data, memp;
555 	int tmp;
556 #define STK_PARAMS	128
557 	union {
558 	    char stkbuf[STK_PARAMS];
559 	    long align;
560 	} ubuf;
561 
562 	KKASSERT(p);
563 	cred = p->p_ucred;
564 
565 	fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
566 	if (fp == NULL)
567 		return(EBADF);
568 
569 	if (map != NULL) {	/* obey translation map */
570 		u_long maskcmd;
571 		struct ioctl_map_entry *e;
572 
573 		maskcmd = com & map->mask;
574 
575 		LIST_FOREACH(e, &map->mapping, entries) {
576 			for (iomc = e->cmd_ranges; iomc->start != 0 ||
577 			     iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
578 			     iomc->mapfunc != NULL;
579 			     iomc++) {
580 				if (maskcmd >= iomc->start &&
581 				    maskcmd <= iomc->end)
582 					break;
583 			}
584 
585 			/* Did we find a match? */
586 			if (iomc->start != 0 || iomc->maptocmd != 0 ||
587 			    iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
588 				break;
589 		}
590 
591 		if (iomc == NULL ||
592 		    (iomc->start == 0 && iomc->maptocmd == 0
593 		     && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
594 			kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
595 			       map->sys, fd, maskcmd,
596 			       (int)((maskcmd >> 8) & 0xff),
597 			       (int)(maskcmd & 0xff));
598 			error = EINVAL;
599 			goto done;
600 		}

		/*
		 * If it's a non-ranged one-to-one mapping, maptocmd should be
		 * correct.  If it's a ranged one-to-one mapping, we pass the
		 * original value of com.  For a range mapped to a different
		 * range, we always need a mapping function to translate the
		 * ioctl to our native ioctl.  E.g. 6500-65ff <-> 9500-95ff
		 */
		if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
			com = iomc->maptocmd;
		} else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
			if (iomc->mapfunc != NULL)
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->start, iomc->end,
						    com, com);
		} else {
			if (iomc->mapfunc != NULL) {
				com = iomc->mapfunc(iomc->start, iomc->end,
						    iomc->maptocmd, iomc->maptoend,
						    com, ocom);
			} else {
				kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
				       map->sys, fd, maskcmd,
				       (int)((maskcmd >> 8) & 0xff),
				       (int)(maskcmd & 0xff));
				error = EINVAL;
				goto done;
			}
		}
	}

	switch (com) {
	case FIONCLEX:
		error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	case FIOCLEX:
		error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
		goto done;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
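	/*
	 * For reference: in the BSD command encoding (sys/ioccom.h) the
	 * low 16 bits hold the group and command number, the next 13
	 * bits hold the parameter length that IOCPARM_LEN() extracts,
	 * and the top bits carry the IOC_VOID/IOC_OUT/IOC_IN direction
	 * flags tested below.  FIONBIO, for example, is defined as
	 * _IOW('f', 126, int), so it arrives here with a size of
	 * sizeof(int) and IOC_IN set.
	 */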
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto done;
	}

	memp = NULL;
	if (size > sizeof (ubuf.stkbuf)) {
		memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else {
		data = ubuf.stkbuf;
	}
	if ((com & IOC_IN) != 0) {
		if (size != 0) {
			error = copyin(uspc_data, data, (u_int)size);
			if (error) {
				if (memp != NULL)
					kfree(memp, M_IOCTLOPS);
				goto done;
			}
		} else {
			*(caddr_t *)data = uspc_data;
		}
	} else if ((com & IOC_OUT) != 0 && size) {
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		bzero(data, size);
	} else if ((com & IOC_VOID) != 0) {
		*(caddr_t *)data = uspc_data;
	}

	switch (com) {
	case FIONBIO:
		if ((tmp = *(int *)data))
			fp->f_flag |= FNONBLOCK;
		else
			fp->f_flag &= ~FNONBLOCK;
		error = 0;
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data))
			fp->f_flag |= FASYNC;
		else
			fp->f_flag &= ~FASYNC;
		error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
		break;

	default:
		/*
		 * If there is an override function, call it instead of
		 * routing the call directly.
		 */
		if (map != NULL && iomc->wrapfunc != NULL)
			error = iomc->wrapfunc(fp, com, ocom, data, cred);
		else
			error = fo_ioctl(fp, com, data, cred);
		/*
		 * Copy any data to user, size was
		 * already set and checked above.
		 */
		if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
			error = copyout(data, uspc_data, (u_int)size);
		break;
	}
	if (memp != NULL)
		kfree(memp, M_IOCTLOPS);
done:
	fdrop(fp);
	return(error);
}

int
mapped_ioctl_register_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
		 he->subsys != NULL && *he->subsys != '\0');

	ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP, M_WAITOK);

	ne->subsys = he->subsys;
	ne->cmd_ranges = he->cmd_ranges;

	LIST_INSERT_HEAD(&he->map->mapping, ne, entries);

	return(0);
}
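
/*
 * As a hypothetical sketch, an emulation subsystem would pair a range
 * table like the foo_ranges example above with a handler and register
 * it at load time.  Only the map, subsys and cmd_ranges members are
 * assumed here, since they are the ones this code touches; foo_emul_map
 * is a placeholder:
 *
 *	static struct ioctl_map_handler foo_handler = {
 *		.map = &foo_emul_map,
 *		.subsys = "foo",
 *		.cmd_ranges = foo_ranges,
 *	};
 *
 *	mapped_ioctl_register_handler(&foo_handler);
 */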

int
mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
{
	struct ioctl_map_entry *ne;

	KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);

	LIST_FOREACH(ne, &he->map->mapping, entries) {
		if (ne->cmd_ranges != he->cmd_ranges)
			continue;
		LIST_REMOVE(ne, entries);
		kfree(ne, M_IOCTLMAP);
		return(0);
	}
	return(EINVAL);
}

static int	nselcoll;	/* Select collisions since boot */
int	selwait;
SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");

/*
 * Select system call.
 */
int
sys_select(struct select_args *uap)
{
	struct timeval ktv;
	struct timeval *ktvp;
	int error;

	/*
	 * Get timeout if any.
	 */
	if (uap->tv != NULL) {
		error = copyin(uap->tv, &ktv, sizeof (ktv));
		if (error)
			return (error);
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Do real work.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			&uap->sysmsg_result);

	return (error);
}


/*
 * Pselect system call.
 */
int
sys_pselect(struct pselect_args *uap)
{
	struct thread *td = curthread;
	struct lwp *lp = td->td_lwp;
	struct timespec kts;
	struct timeval ktv;
	struct timeval *ktvp;
	sigset_t sigmask;
	int error;

	/*
	 * Get timeout if any and convert it.
	 * Round up during conversion to avoid timeout going off early.
	 */
	if (uap->ts != NULL) {
		error = copyin(uap->ts, &kts, sizeof (kts));
		if (error)
			return (error);
		ktv.tv_sec = kts.tv_sec;
		ktv.tv_usec = (kts.tv_nsec + 999) / 1000;
		error = itimerfix(&ktv);
		if (error)
			return (error);
		ktvp = &ktv;
	} else {
		ktvp = NULL;
	}

	/*
	 * Install temporary signal mask if any provided.
	 */
	if (uap->sigmask != NULL) {
		error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
		if (error)
			return (error);
		lp->lwp_oldsigmask = lp->lwp_sigmask;
		SIG_CANTMASK(sigmask);
		lp->lwp_sigmask = sigmask;
	}

	/*
	 * Do real job.
	 */
	error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktvp,
			&uap->sysmsg_result);

	if (uap->sigmask != NULL) {
		/* doselect() is responsible for turning ERESTART into EINTR */
		KKASSERT(error != ERESTART);
		if (error == EINTR) {
			/*
			 * We can't restore the previous signal mask now
			 * because it could block the signal that interrupted
			 * us.  So make a note to restore it after executing
			 * the handler.
			 */
			lp->lwp_flag |= LWP_OLDMASK;
		} else {
			/*
			 * No handler to run. Restore previous mask immediately.
			 */
			lp->lwp_sigmask = lp->lwp_oldsigmask;
		}
	}

	return (error);
}

/*
 * Common code for sys_select() and sys_pselect().
 *
 * in, out and ex are userland pointers.  tv must point to a validated
 * kernel-side timeout value, or be NULL for an infinite timeout.  res
 * must point to the syscall return value.
 */
static int
doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex, struct timeval *tv,
		int *res)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	/*
	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
	 * infds with the new FD_SETSIZE of 1024, and more than enough for
	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
	 * of 256.
	 */
	fd_mask s_selbits[howmany(2048, NFDBITS)];
	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
	struct timeval atv, rtv, ttv;
	int ncoll, error, timo;
	u_int nbufbytes, ncpbytes, nfdbits;

	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles)
		nd = p->p_fd->fd_nfiles;   /* forgiving; slightly wrong */

	/*
	 * Allocate just enough bits for the non-null fd_sets.  Use the
	 * preallocated auto buffer if possible.
	 */
	nfdbits = roundup(nd, NFDBITS);
	ncpbytes = nfdbits / NBBY;
	nbufbytes = 0;
	if (in != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ou != NULL)
		nbufbytes += 2 * ncpbytes;
	if (ex != NULL)
		nbufbytes += 2 * ncpbytes;
	if (nbufbytes <= sizeof s_selbits)
		selbits = &s_selbits[0];
	else
		selbits = kmalloc(nbufbytes, M_SELECT, M_WAITOK);

	/*
	 * Assign pointers into the bit buffers and fetch the input bits.
	 * Put the output buffers together so that they can be bzeroed
	 * together.
	 */
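	/*
	 * The resulting layout when all three sets are given (each
	 * region ncpbytes long) is
	 *
	 *	selbits: | ob[0] | ob[1] | ob[2] | ib[0] | ib[1] | ib[2] |
	 *
	 * with the output halves packed at the front so a single
	 * bzero() of nbufbytes/2 clears them all, and each ibits[x]
	 * sitting nbufbytes/2 bytes past its obits[x] twin.
	 */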
	sbp = selbits;
#define	getbits(name, x) \
	do {								\
		if (name == NULL)					\
			ibits[x] = NULL;				\
		else {							\
			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
			obits[x] = sbp;					\
			sbp += ncpbytes / sizeof *sbp;			\
			error = copyin(name, ibits[x], ncpbytes);	\
			if (error != 0)					\
				goto done;				\
		}							\
	} while (0)
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits
	if (nbufbytes != 0)
		bzero(selbits, nbufbytes / 2);

	if (tv != NULL) {
		atv = *tv;
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = selscan(p, ibits, obits, nd, res);
	if (error || *res)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;

	error = tsleep((caddr_t)&selwait, PCATCH, "select", timo);

	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
#define	putbits(name, x) \
	if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
	}
	if (selbits != &s_selbits[0])
		kfree(selbits, M_SELECT);
	return (error);
}

static int
selscan(struct proc *p, fd_mask **ibits, fd_mask **obits, int nfd, int *res)
{
	int msk, i, fd;
	fd_mask bits;
	struct file *fp;
	int n = 0;
	/* Note: backend also returns POLLHUP/POLLERR if appropriate. */
	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };

	for (msk = 0; msk < 3; msk++) {
		if (ibits[msk] == NULL)
			continue;
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = ibits[msk][i/NFDBITS];
			/* ffs(int mask) not portable, fd_mask is long */
			for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
				if (!(bits & 1))
					continue;
				fp = holdfp(p->p_fd, fd, -1);
				if (fp == NULL)
					return (EBADF);
				if (fo_poll(fp, flag[msk], fp->f_cred)) {
					obits[msk][(fd)/NFDBITS] |=
					    ((fd_mask)1 << ((fd) % NFDBITS));
					n++;
				}
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct poll_args *uap)
{
	struct pollfd *bits;
	struct pollfd smallbits[32];
	struct timeval atv, rtv, ttv;
	int ncoll, error = 0, timo;
	u_int nfds;
	size_t ni;
	struct lwp *lp = curthread->td_lwp;
	struct proc *p = curproc;

	nfds = uap->nfds;
	/*
	 * This is kinda bogus.  We have fd limits, but that is not
	 * really related to the size of the pollfd array.  Make sure
	 * we let the process use at least FD_SETSIZE entries and at
	 * least enough for the current limits.  We want to be reasonably
	 * safe, but not overly restrictive.
	 */
	if (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && nfds > FD_SETSIZE)
		return (EINVAL);
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallbits))
		bits = kmalloc(ni, M_TEMP, M_WAITOK);
	else
		bits = smallbits;
	error = copyin(uap->fds, bits, ni);
	if (error)
		goto done;
	if (uap->timeout != INFTIM) {
		atv.tv_sec = uap->timeout / 1000;
		atv.tv_usec = (uap->timeout % 1000) * 1000;
		if (itimerfix(&atv)) {
			error = EINVAL;
			goto done;
		}
		getmicrouptime(&rtv);
		timevaladd(&atv, &rtv);
	} else {
		atv.tv_sec = 0;
		atv.tv_usec = 0;
	}
	timo = 0;
retry:
	ncoll = nselcoll;
	lp->lwp_flag |= LWP_SELECT;
	error = pollscan(p, bits, nfds, &uap->sysmsg_result);
	if (error || uap->sysmsg_result)
		goto done;
	if (atv.tv_sec || atv.tv_usec) {
		getmicrouptime(&rtv);
		if (timevalcmp(&rtv, &atv, >=))
			goto done;
		ttv = atv;
		timevalsub(&ttv, &rtv);
		timo = ttv.tv_sec > 24 * 60 * 60 ?
		    24 * 60 * 60 * hz : tvtohz_high(&ttv);
	}
	crit_enter();
	if ((lp->lwp_flag & LWP_SELECT) == 0 || nselcoll != ncoll) {
		crit_exit();
		goto retry;
	}
	lp->lwp_flag &= ~LWP_SELECT;
	error = tsleep((caddr_t)&selwait, PCATCH, "poll", timo);
	crit_exit();
	if (error == 0)
		goto retry;
done:
	lp->lwp_flag &= ~LWP_SELECT;
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0) {
		error = copyout(bits, uap->fds, ni);
		if (error)
			goto out;
	}
out:
	if (ni > sizeof(smallbits))
		kfree(bits, M_TEMP);
	return (error);
}

static int
pollscan(struct proc *p, struct pollfd *fds, u_int nfd, int *res)
{
	int i;
	struct file *fp;
	int n = 0;

	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd >= p->p_fd->fd_nfiles) {
			fds->revents = POLLNVAL;
			n++;
		} else if (fds->fd < 0) {
			fds->revents = 0;
		} else {
			fp = holdfp(p->p_fd, fds->fd, -1);
			if (fp == NULL) {
				fds->revents = POLLNVAL;
				n++;
			} else {
				/*
				 * Note: backend also returns POLLHUP and
				 * POLLERR if appropriate.
				 */
				fds->revents = fo_poll(fp, fds->events,
							fp->f_cred);
				if (fds->revents != 0)
					n++;
				fdrop(fp);
			}
		}
	}
	*res = n;
	return (0);
}

/*
 * OpenBSD poll system call.
 * XXX this isn't quite a true representation; OpenBSD uses select ops.
 */
int
sys_openbsd_poll(struct openbsd_poll_args *uap)
{
	return (sys_poll((struct poll_args *)uap));
}

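/*
 * Generic poll backend for devices that are always ready: report
 * whichever read/write events the caller asked for, and nothing else.
 */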
/*ARGSUSED*/
int
seltrue(cdev_t dev, int events)
{
	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}

/*
 * Record a select request.  A global wait must be used since a process/thread
 * might go away after recording its request.
 */
void
selrecord(struct thread *selector, struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (selector->td_lwp == NULL)
		panic("selrecord: thread needs a process");

	if (sip->si_pid == selector->td_proc->p_pid &&
	    sip->si_tid == selector->td_lwp->lwp_tid)
		return;
	if (sip->si_pid && (p = pfind(sip->si_pid)))
		lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp != NULL && lp->lwp_wchan == (caddr_t)&selwait) {
		sip->si_flags |= SI_COLL;
	} else {
		sip->si_pid = selector->td_proc->p_pid;
		sip->si_tid = selector->td_lwp->lwp_tid;
	}
}
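
/*
 * The usual pairing, sketched as a hypothetical driver poll handler
 * (foo_has_data(), struct foo_sc and sc_rsel are placeholders, not
 * real symbols): the handler calls selrecord() when no event is
 * pending, and the producing side calls selwakeup() on the same
 * selinfo once one arrives.
 *
 *	static int
 *	foopoll(cdev_t dev, int events)
 *	{
 *		struct foo_sc *sc = dev->si_drv1;
 *		int revents = 0;
 *
 *		if (foo_has_data(sc))
 *			revents = events & (POLLIN | POLLRDNORM);
 *		else
 *			selrecord(curthread, &sc->sc_rsel);
 *		return (revents);
 *	}
 */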

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	struct proc *p;
	struct lwp *lp = NULL;

	if (sip->si_pid == 0)
		return;
	if (sip->si_flags & SI_COLL) {
		nselcoll++;
		sip->si_flags &= ~SI_COLL;
		wakeup((caddr_t)&selwait);	/* YYY fixable */
	}
	p = pfind(sip->si_pid);
	sip->si_pid = 0;
	if (p == NULL)
		return;
	lp = lwp_rb_tree_RB_LOOKUP(&p->p_lwp_tree, sip->si_tid);
	if (lp == NULL)
		return;

	crit_enter();
	if (lp->lwp_wchan == (caddr_t)&selwait) {
		/*
		 * Flag the process to break the tsleep when
		 * setrunnable is called, but only call setrunnable
		 * here if the process is not in a stopped state.
		 */
		lp->lwp_flag |= LWP_BREAKTSLEEP;
		if (p->p_stat != SSTOP)
			setrunnable(lp);
	} else if (lp->lwp_flag & LWP_SELECT) {
		lp->lwp_flag &= ~LWP_SELECT;
	}
	crit_exit();
}