xref: /dragonfly/sys/kern/uipc_syscalls.c (revision 1bf4b486)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. All advertising materials mentioning features or use of this software
17  *    must display the following acknowledgement:
18  *	This product includes software developed by the University of
19  *	California, Berkeley and its contributors.
20  * 4. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
37  * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $
38  * $DragonFly: src/sys/kern/uipc_syscalls.c,v 1.57 2005/07/15 17:54:47 eirikn Exp $
39  */
40 
41 #include "opt_ktrace.h"
42 #include "opt_sctp.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kernel.h>
47 #include <sys/sysproto.h>
48 #include <sys/malloc.h>
49 #include <sys/filedesc.h>
50 #include <sys/event.h>
51 #include <sys/proc.h>
52 #include <sys/fcntl.h>
53 #include <sys/file.h>
54 #include <sys/filio.h>
55 #include <sys/kern_syscall.h>
56 #include <sys/mbuf.h>
57 #include <sys/protosw.h>
58 #include <sys/sfbuf.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/socketops.h>
62 #include <sys/uio.h>
63 #include <sys/vnode.h>
64 #include <sys/lock.h>
65 #include <sys/mount.h>
66 #ifdef KTRACE
67 #include <sys/ktrace.h>
68 #endif
69 #include <vm/vm.h>
70 #include <vm/vm_object.h>
71 #include <vm/vm_page.h>
72 #include <vm/vm_pageout.h>
73 #include <vm/vm_kern.h>
74 #include <vm/vm_extern.h>
75 #include <sys/file2.h>
76 #include <sys/signalvar.h>
77 
78 #include <sys/thread2.h>
79 #include <sys/msgport2.h>
80 
81 #ifdef SCTP
82 #include <netinet/sctp_peeloff.h>
83 #endif /* SCTP */
84 
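/*
 * Reference-counted wrapper around an sf_buf.  sendfile() hands file pages
 * to the socket as external mbuf storage; sf_buf_mref()/sf_buf_mfree()
 * below bump and drop mref_count so the underlying sf_buf and wired page
 * are only released once the last mbuf referencing them has been freed.
 */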
85 struct sfbuf_mref {
86 	struct sf_buf	*sf;
87 	int		mref_count;
88 };
89 
90 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile sfbuf ref structures");
91 
92 /*
93  * System call interface to the socket abstraction.
94  */
95 
96 extern	struct fileops socketops;
97 
98 /*
99  * socket_args(int domain, int type, int protocol)
100  */
101 int
102 kern_socket(int domain, int type, int protocol, int *res)
103 {
104 	struct thread *td = curthread;
105 	struct proc *p = td->td_proc;
106 	struct filedesc *fdp;
107 	struct socket *so;
108 	struct file *fp;
109 	int fd, error;
110 
111 	KKASSERT(p);
112 	fdp = p->p_fd;
113 
114 	error = falloc(p, &fp, &fd);
115 	if (error)
116 		return (error);
117 	error = socreate(domain, &so, type, protocol, td);
118 	if (error) {
119 		if (fdp->fd_files[fd].fp == fp) {
120 			funsetfd(fdp, fd);
121 			fdrop(fp, td);
122 		}
123 	} else {
124 		fp->f_data = (caddr_t)so;
125 		fp->f_flag = FREAD|FWRITE;
126 		fp->f_ops = &socketops;
127 		fp->f_type = DTYPE_SOCKET;
128 		*res = fd;
129 	}
130 	fdrop(fp, td);
131 	return (error);
132 }
133 
134 int
135 socket(struct socket_args *uap)
136 {
137 	int error;
138 
139 	error = kern_socket(uap->domain, uap->type, uap->protocol,
140 	    &uap->sysmsg_result);
141 
142 	return (error);
143 }
144 
145 int
146 kern_bind(int s, struct sockaddr *sa)
147 {
148 	struct thread *td = curthread;
149 	struct proc *p = td->td_proc;
150 	struct file *fp;
151 	int error;
152 
153 	KKASSERT(p);
154 	error = holdsock(p->p_fd, s, &fp);
155 	if (error)
156 		return (error);
157 	error = sobind((struct socket *)fp->f_data, sa, td);
158 	fdrop(fp, td);
159 	return (error);
160 }
161 
162 /*
163  * bind_args(int s, caddr_t name, int namelen)
164  */
165 int
166 bind(struct bind_args *uap)
167 {
168 	struct sockaddr *sa;
169 	int error;
170 
171 	error = getsockaddr(&sa, uap->name, uap->namelen);
172 	if (error)
173 		return (error);
174 	error = kern_bind(uap->s, sa);
175 	FREE(sa, M_SONAME);
176 
177 	return (error);
178 }
179 
180 int
181 kern_listen(int s, int backlog)
182 {
183 	struct thread *td = curthread;
184 	struct proc *p = td->td_proc;
185 	struct file *fp;
186 	int error;
187 
188 	KKASSERT(p);
189 	error = holdsock(p->p_fd, s, &fp);
190 	if (error)
191 		return (error);
192 	error = solisten((struct socket *)fp->f_data, backlog, td);
193 	fdrop(fp, td);
194 	return(error);
195 }
196 
197 /*
198  * listen_args(int s, int backlog)
199  */
200 int
201 listen(struct listen_args *uap)
202 {
203 	int error;
204 
205 	error = kern_listen(uap->s, uap->backlog);
206 	return (error);
207 }
208 
209 /*
210  * Accept predicate: returns TRUE when an accept can complete; the accepted socket is handed back in msg->nm_so.
211  */
212 static boolean_t
213 soaccept_predicate(struct netmsg *msg0)
214 {
215 	struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0;
216 	struct socket *head = msg->nm_so;
217 
218 	if (head->so_error != 0) {
219 		msg->nm_lmsg.ms_error = head->so_error;
220 		return (TRUE);
221 	}
222 	if (!TAILQ_EMPTY(&head->so_comp)) {
223 		/* Abuse nm_so field as copy in/copy out parameter. XXX JH */
224 		msg->nm_so = TAILQ_FIRST(&head->so_comp);
225 		TAILQ_REMOVE(&head->so_comp, msg->nm_so, so_list);
226 		head->so_qlen--;
227 
228 		msg->nm_lmsg.ms_error = 0;
229 		return (TRUE);
230 	}
231 	if (head->so_state & SS_CANTRCVMORE) {
232 		msg->nm_lmsg.ms_error = ECONNABORTED;
233 		return (TRUE);
234 	}
235 	if (head->so_state & SS_NBIO) {
236 		msg->nm_lmsg.ms_error = EWOULDBLOCK;
237 		return (TRUE);
238 	}
239 
240 	return (FALSE);
241 }
242 
243 /*
244  * The second argument to kern_accept() is a handle to a struct sockaddr.
245  * This allows kern_accept() to return a pointer to an allocated struct
246  * sockaddr which must be freed later with FREE().  The caller must
247  * initialize *name to NULL.
248  */
249 int
250 kern_accept(int s, struct sockaddr **name, int *namelen, int *res)
251 {
252 	struct thread *td = curthread;
253 	struct proc *p = td->td_proc;
254 	struct filedesc *fdp = p->p_fd;
255 	struct file *lfp = NULL;
256 	struct file *nfp = NULL;
257 	struct sockaddr *sa;
258 	struct socket *head, *so;
259 	struct netmsg_so_notify msg;
260 	lwkt_port_t port;
261 	int fd;
262 	u_int fflag;		/* type must match fp->f_flag */
263 	int error, tmp;
264 
265 	if (name && namelen && *namelen < 0)
266 		return (EINVAL);
267 
268 	error = holdsock(fdp, s, &lfp);
269 	if (error)
270 		return (error);
271 
272 	error = falloc(p, &nfp, &fd);
273 	if (error) {		/* Probably ran out of file descriptors. */
274 		*res = -1;
275 		fdrop(lfp, td);
276 		return (error);
277 	}
278 	*res = fd;
279 
280 	head = (struct socket *)lfp->f_data;
281 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
282 		error = EINVAL;
283 		goto done;
284 	}
285 
286 	/* optimize for uniprocessor case later XXX JH */
287 	port = head->so_proto->pr_mport(head, NULL, PRU_PRED);
288 	lwkt_initmsg(&msg.nm_lmsg, &curthread->td_msgport,
289 		     MSGF_PCATCH | MSGF_ABORTABLE,
290 		     lwkt_cmd_func(netmsg_so_notify),
291 		     lwkt_cmd_func(netmsg_so_notify_abort));
292 	msg.nm_predicate = soaccept_predicate;
293 	msg.nm_so = head;
294 	msg.nm_etype = NM_REVENT;
295 	error = lwkt_domsg(port, &msg.nm_lmsg);
296 	if (error)
297 		goto done;
298 
299 	/*
300 	 * At this point we have the connection that's ready to be accepted.
301 	 */
302 	so = msg.nm_so;
303 
304 	fflag = lfp->f_flag;
305 
306 	/* connection has been removed from the listen queue */
307 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
308 
309 	so->so_state &= ~SS_COMP;
310 	so->so_head = NULL;
311 	if (head->so_sigio != NULL)
312 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
313 
314 	nfp->f_data = (caddr_t)so;
315 	nfp->f_flag = fflag;
316 	nfp->f_ops = &socketops;
317 	nfp->f_type = DTYPE_SOCKET;
318 	/* Sync socket nonblocking/async state with file flags */
319 	tmp = fflag & FNONBLOCK;
320 	(void) fo_ioctl(nfp, FIONBIO, (caddr_t)&tmp, td);
321 	tmp = fflag & FASYNC;
322 	(void) fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, td);
323 
324 	sa = NULL;
325 	error = soaccept(so, &sa);
326 
327 	/*
328 	 * Set the returned name and namelen as applicable.  Set the returned
329 	 * namelen to 0 for older code which might ignore the return value
330 	 * from accept.
331 	 */
332 	if (error == 0) {
333 		if (sa && name && namelen) {
334 			if (*namelen > sa->sa_len)
335 				*namelen = sa->sa_len;
336 			*name = sa;
337 		} else {
338 			if (sa)
339 				FREE(sa, M_SONAME);
340 		}
341 	}
342 
343 done:
344 	/*
345 	 * close the new descriptor, assuming someone hasn't ripped it
346 	 * out from under us.  Note that *res is normally ignored if an
347 	 * error is returned but a syscall message will still have access
348 	 * to the result code.
349 	 */
350 	if (error) {
351 		*res = -1;
352 		if (fdp->fd_files[fd].fp == nfp) {
353 			funsetfd(fdp, fd);
354 			fdrop(nfp, td);
355 		}
356 	}
357 
358 	/*
359 	 * Release explicitly held references before returning.
360 	 */
361 	if (nfp)
362 		fdrop(nfp, td);
363 	fdrop(lfp, td);
364 	return (error);
365 }
366 
367 /*
368  * accept_args(int s, caddr_t name, int *anamelen)
369  */
370 int
371 accept(struct accept_args *uap)
372 {
373 	struct sockaddr *sa = NULL;
374 	int sa_len;
375 	int error;
376 
377 	if (uap->name) {
378 		error = copyin(uap->anamelen, &sa_len, sizeof(sa_len));
379 		if (error)
380 			return (error);
381 
382 		error = kern_accept(uap->s, &sa, &sa_len, &uap->sysmsg_result);
383 
384 		if (error == 0)
385 			error = copyout(sa, uap->name, sa_len);
386 		if (error == 0) {
387 			error = copyout(&sa_len, uap->anamelen,
388 			    sizeof(*uap->anamelen));
389 		}
390 		if (sa)
391 			FREE(sa, M_SONAME);
392 	} else {
393 		error = kern_accept(uap->s, NULL, 0, &uap->sysmsg_result);
394 	}
395 	return (error);
396 }
397 
398 /*
399  * Returns TRUE if predicate satisfied.
400  */
401 static boolean_t
402 soconnected_predicate(struct netmsg *msg0)
403 {
404 	struct netmsg_so_notify *msg = (struct netmsg_so_notify *)msg0;
405 	struct socket *so = msg->nm_so;
406 
407 	/* check predicate */
408 	if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) {
409 		msg->nm_lmsg.ms_error = so->so_error;
410 		return (TRUE);
411 	}
412 
413 	return (FALSE);
414 }
415 
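/*
 * Connect socket descriptor s to the address sa.  For a non-blocking
 * socket an in-progress connect returns EALREADY/EINPROGRESS; otherwise
 * we wait for the connection to complete (or fail) via a netmsg predicate.
 */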
416 int
417 kern_connect(int s, struct sockaddr *sa)
418 {
419 	struct thread *td = curthread;
420 	struct proc *p = td->td_proc;
421 	struct file *fp;
422 	struct socket *so;
423 	int error;
424 
425 	error = holdsock(p->p_fd, s, &fp);
426 	if (error)
427 		return (error);
428 	so = (struct socket *)fp->f_data;
429 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
430 		error = EALREADY;
431 		goto done;
432 	}
433 	error = soconnect(so, sa, td);
434 	if (error)
435 		goto bad;
436 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
437 		error = EINPROGRESS;
438 		goto done;
439 	}
440 	if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
441 		struct netmsg_so_notify msg;
442 		lwkt_port_t port;
443 
444 		port = so->so_proto->pr_mport(so, sa, PRU_PRED);
445 		lwkt_initmsg(&msg.nm_lmsg,
446 			    &curthread->td_msgport,
447 			    MSGF_PCATCH | MSGF_ABORTABLE,
448 			    lwkt_cmd_func(netmsg_so_notify),
449 			    lwkt_cmd_func(netmsg_so_notify_abort));
450 		msg.nm_predicate = soconnected_predicate;
451 		msg.nm_so = so;
452 		msg.nm_etype = NM_REVENT;
453 		error = lwkt_domsg(port, &msg.nm_lmsg);
454 	}
455 	if (error == 0) {
456 		error = so->so_error;
457 		so->so_error = 0;
458 	}
459 bad:
460 	so->so_state &= ~SS_ISCONNECTING;
461 	if (error == ERESTART)
462 		error = EINTR;
463 done:
464 	fdrop(fp, td);
465 	return (error);
466 }
467 
468 /*
469  * connect_args(int s, caddr_t name, int namelen)
470  */
471 int
472 connect(struct connect_args *uap)
473 {
474 	struct sockaddr *sa;
475 	int error;
476 
477 	error = getsockaddr(&sa, uap->name, uap->namelen);
478 	if (error)
479 		return (error);
480 	error = kern_connect(uap->s, sa);
481 	FREE(sa, M_SONAME);
482 
483 	return (error);
484 }
485 
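/*
 * Create a pair of connected sockets and install them as file descriptors
 * sv[0] and sv[1].  On failure any partially constructed descriptors and
 * sockets are torn down before returning.
 */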
486 int
487 kern_socketpair(int domain, int type, int protocol, int *sv)
488 {
489 	struct thread *td = curthread;
490 	struct proc *p = td->td_proc;
491 	struct filedesc *fdp;
492 	struct file *fp1, *fp2;
493 	struct socket *so1, *so2;
494 	int fd, error;
495 
496 	KKASSERT(p);
497 	fdp = p->p_fd;
498 	error = socreate(domain, &so1, type, protocol, td);
499 	if (error)
500 		return (error);
501 	error = socreate(domain, &so2, type, protocol, td);
502 	if (error)
503 		goto free1;
504 	error = falloc(p, &fp1, &fd);
505 	if (error)
506 		goto free2;
507 	sv[0] = fd;
508 	fp1->f_data = (caddr_t)so1;
509 	error = falloc(p, &fp2, &fd);
510 	if (error)
511 		goto free3;
512 	fp2->f_data = (caddr_t)so2;
513 	sv[1] = fd;
514 	error = soconnect2(so1, so2);
515 	if (error)
516 		goto free4;
517 	if (type == SOCK_DGRAM) {
518 		/*
519 		 * Datagram socket connection is asymmetric.
520 		 */
521 		error = soconnect2(so2, so1);
522 		if (error)
523 			goto free4;
524 	}
525 	fp1->f_flag = fp2->f_flag = FREAD|FWRITE;
526 	fp1->f_ops = fp2->f_ops = &socketops;
527 	fp1->f_type = fp2->f_type = DTYPE_SOCKET;
528 	fdrop(fp1, td);
529 	fdrop(fp2, td);
530 	return (error);
531 free4:
532 	if (fdp->fd_files[sv[1]].fp == fp2) {
533 		funsetfd(fdp, sv[1]);
534 		fdrop(fp2, td);
535 	}
536 	fdrop(fp2, td);
537 free3:
538 	if (fdp->fd_files[sv[0]].fp == fp1) {
539 		funsetfd(fdp, sv[0]);
540 		fdrop(fp1, td);
541 	}
542 	fdrop(fp1, td);
543 free2:
544 	(void)soclose(so2);
545 free1:
546 	(void)soclose(so1);
547 	return (error);
548 }
549 
550 /*
551  * socketpair(int domain, int type, int protocol, int *rsv)
552  */
553 int
554 socketpair(struct socketpair_args *uap)
555 {
556 	int error, sockv[2];
557 
558 	error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv);
559 
560 	if (error == 0)
561 		error = copyout(sockv, uap->rsv, sizeof(sockv));
562 	return (error);
563 }
564 
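/*
 * Common backend for sendto() and sendmsg().  Transmit the data described
 * by auio on socket s, optionally to address sa and with control data
 * 'control' (which is consumed by the protocol).  On success *res is set
 * to the number of bytes sent.
 */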
565 int
566 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio,
567     struct mbuf *control, int flags, int *res)
568 {
569 	struct thread *td = curthread;
570 	struct proc *p = td->td_proc;
571 	struct file *fp;
572 	int len, error;
573 	struct socket *so;
574 #ifdef KTRACE
575 	struct iovec *ktriov = NULL;
576 	struct uio ktruio;
577 #endif
578 
579 	error = holdsock(p->p_fd, s, &fp);
580 	if (error)
581 		return (error);
582 	if (auio->uio_resid < 0) {
583 		error = EINVAL;
584 		goto done;
585 	}
586 #ifdef KTRACE
587 	if (KTRPOINT(td, KTR_GENIO)) {
588 		int iovlen = auio->uio_iovcnt * sizeof (struct iovec);
589 
590 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
591 		bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
592 		ktruio = *auio;
593 	}
594 #endif
595 	len = auio->uio_resid;
596 	so = (struct socket *)fp->f_data;
597 	error = so_pru_sosend(so, sa, auio, NULL, control, flags, td);
598 	if (error) {
599 		if (auio->uio_resid != len && (error == ERESTART ||
600 		    error == EINTR || error == EWOULDBLOCK))
601 			error = 0;
602 		if (error == EPIPE)
603 			psignal(p, SIGPIPE);
604 	}
605 #ifdef KTRACE
606 	if (ktriov != NULL) {
607 		if (error == 0) {
608 			ktruio.uio_iov = ktriov;
609 			ktruio.uio_resid = len - auio->uio_resid;
610 			ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
611 		}
612 		FREE(ktriov, M_TEMP);
613 	}
614 #endif
615 	if (error == 0)
616 		*res  = len - auio->uio_resid;
617 done:
618 	fdrop(fp, td);
619 	return (error);
620 }
621 
622 /*
623  * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen)
624  */
625 int
626 sendto(struct sendto_args *uap)
627 {
628 	struct thread *td = curthread;
629 	struct uio auio;
630 	struct iovec aiov;
631 	struct sockaddr *sa = NULL;
632 	int error;
633 
634 	if (uap->to) {
635 		error = getsockaddr(&sa, uap->to, uap->tolen);
636 		if (error)
637 			return (error);
638 	}
639 	aiov.iov_base = uap->buf;
640 	aiov.iov_len = uap->len;
641 	auio.uio_iov = &aiov;
642 	auio.uio_iovcnt = 1;
643 	auio.uio_offset = 0;
644 	auio.uio_resid = uap->len;
645 	auio.uio_segflg = UIO_USERSPACE;
646 	auio.uio_rw = UIO_WRITE;
647 	auio.uio_td = td;
648 
649 	error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags,
650 	    &uap->sysmsg_result);
651 
652 	if (sa)
653 		FREE(sa, M_SONAME);
654 	return (error);
655 }
656 
657 /*
658  * sendmsg_args(int s, caddr_t msg, int flags)
659  */
660 int
661 sendmsg(struct sendmsg_args *uap)
662 {
663 	struct thread *td = curthread;
664 	struct msghdr msg;
665 	struct uio auio;
666 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
667 	struct sockaddr *sa = NULL;
668 	struct mbuf *control = NULL;
669 	int error;
670 
671 	error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg));
672 	if (error)
673 		return (error);
674 
675 	/*
676 	 * Conditionally copyin msg.msg_name.
677 	 */
678 	if (msg.msg_name) {
679 		error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen);
680 		if (error)
681 			return (error);
682 	}
683 
684 	/*
685 	 * Populate auio.
686 	 */
687 	error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen,
688 	    &auio.uio_resid);
689 	if (error)
690 		goto cleanup;
691 	auio.uio_iov = iov;
692 	auio.uio_iovcnt = msg.msg_iovlen;
693 	auio.uio_offset = 0;
694 	auio.uio_segflg = UIO_USERSPACE;
695 	auio.uio_rw = UIO_WRITE;
696 	auio.uio_td = td;
697 
698 	/*
699 	 * Conditionally copyin msg.msg_control.
700 	 */
701 	if (msg.msg_control) {
702 		if (msg.msg_controllen < sizeof(struct cmsghdr) ||
703 		    msg.msg_controllen > MLEN) {
704 			error = EINVAL;
705 			goto cleanup;
706 		}
707 		control = m_get(MB_WAIT, MT_CONTROL);
708 		if (control == NULL) {
709 			error = ENOBUFS;
710 			goto cleanup;
711 		}
712 		control->m_len = msg.msg_controllen;
713 		error = copyin(msg.msg_control, mtod(control, caddr_t),
714 		    msg.msg_controllen);
715 		if (error) {
716 			m_free(control);
717 			goto cleanup;
718 		}
719 	}
720 
721 	error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags,
722 	    &uap->sysmsg_result);
723 
724 cleanup:
725 	if (sa)
726 		FREE(sa, M_SONAME);
727 	iovec_free(&iov, aiov);
728 	return (error);
729 }
730 
731 /*
732  * kern_recvmsg() takes a handle to sa and control.  If the handle is non-
733  * null, it returns a dynamically allocated struct sockaddr and an mbuf.
734  * Don't forget to FREE() and m_freem() these if they are returned.
735  */
736 int
737 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio,
738     struct mbuf **control, int *flags, int *res)
739 {
740 	struct thread *td = curthread;
741 	struct proc *p = td->td_proc;
742 	struct file *fp;
743 	int len, error;
744 	struct socket *so;
745 #ifdef KTRACE
746 	struct iovec *ktriov = NULL;
747 	struct uio ktruio;
748 #endif
749 
750 	error = holdsock(p->p_fd, s, &fp);
751 	if (error)
752 		return (error);
753 	if (auio->uio_resid < 0) {
754 		error = EINVAL;
755 		goto done;
756 	}
757 #ifdef KTRACE
758 	if (KTRPOINT(td, KTR_GENIO)) {
759 		int iovlen = auio->uio_iovcnt * sizeof (struct iovec);
760 
761 		MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
762 		bcopy(auio->uio_iov, ktriov, iovlen);
763 		ktruio = *auio;
764 	}
765 #endif
766 	len = auio->uio_resid;
767 	so = (struct socket *)fp->f_data;
768 	error = so_pru_soreceive(so, sa, auio, NULL, control, flags);
769 	if (error) {
770 		if (auio->uio_resid != len && (error == ERESTART ||
771 		    error == EINTR || error == EWOULDBLOCK))
772 			error = 0;
773 	}
774 #ifdef KTRACE
775 	if (ktriov != NULL) {
776 		if (error == 0) {
777 			ktruio.uio_iov = ktriov;
778 			ktruio.uio_resid = len - auio->uio_resid;
779 			ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
780 		}
781 		FREE(ktriov, M_TEMP);
782 	}
783 #endif
784 	if (error == 0)
785 		*res = len - auio->uio_resid;
786 done:
787 	fdrop(fp, td);
788 	return (error);
789 }
790 
791 /*
792  * recvfrom_args(int s, caddr_t buf, size_t len, int flags,
793  *			caddr_t from, int *fromlenaddr)
794  */
795 int
796 recvfrom(struct recvfrom_args *uap)
797 {
798 	struct thread *td = curthread;
799 	struct uio auio;
800 	struct iovec aiov;
801 	struct sockaddr *sa = NULL;
802 	int error, fromlen;
803 
804 	if (uap->from && uap->fromlenaddr) {
805 		error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen));
806 		if (error)
807 			return (error);
808 		if (fromlen < 0)
809 			return (EINVAL);
810 	} else {
811 		fromlen = 0;
812 	}
813 	aiov.iov_base = uap->buf;
814 	aiov.iov_len = uap->len;
815 	auio.uio_iov = &aiov;
816 	auio.uio_iovcnt = 1;
817 	auio.uio_offset = 0;
818 	auio.uio_resid = uap->len;
819 	auio.uio_segflg = UIO_USERSPACE;
820 	auio.uio_rw = UIO_READ;
821 	auio.uio_td = td;
822 
823 	error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL,
824 	    &uap->flags, &uap->sysmsg_result);
825 
826 	if (error == 0 && uap->from) {
827 		/* note: sa may still be NULL */
828 		if (sa) {
829 			fromlen = MIN(fromlen, sa->sa_len);
830 			error = copyout(sa, uap->from, fromlen);
831 		} else {
832 			fromlen = 0;
833 		}
834 		if (error == 0) {
835 			error = copyout(&fromlen, uap->fromlenaddr,
836 					sizeof(fromlen));
837 		}
838 	}
839 	if (sa)
840 		FREE(sa, M_SONAME);
841 
842 	return (error);
843 }
844 
845 /*
846  * recvmsg_args(int s, struct msghdr *msg, int flags)
847  */
848 int
849 recvmsg(struct recvmsg_args *uap)
850 {
851 	struct thread *td = curthread;
852 	struct msghdr msg;
853 	struct uio auio;
854 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
855 	struct mbuf *m, *control = NULL;
856 	struct sockaddr *sa = NULL;
857 	caddr_t ctlbuf;
858 	socklen_t *ufromlenp, *ucontrollenp;
859 	int error, fromlen, controllen, len, flags, *uflagsp;
860 
861 	/*
862 	 * This copyin handles everything except the iovec.
863 	 */
864 	error = copyin(uap->msg, &msg, sizeof(msg));
865 	if (error)
866 		return (error);
867 
868 	if (msg.msg_name && msg.msg_namelen < 0)
869 		return (EINVAL);
870 	if (msg.msg_control && msg.msg_controllen < 0)
871 		return (EINVAL);
872 
873 	ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr,
874 	    msg_namelen));
875 	ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr,
876 	    msg_controllen));
877 	uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr,
878 	    msg_flags));
879 
880 	/*
881 	 * Populate auio.
882 	 */
883 	error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen,
884 	    &auio.uio_resid);
885 	if (error)
886 		return (error);
887 	auio.uio_iov = iov;
888 	auio.uio_iovcnt = msg.msg_iovlen;
889 	auio.uio_offset = 0;
890 	auio.uio_segflg = UIO_USERSPACE;
891 	auio.uio_rw = UIO_READ;
892 	auio.uio_td = td;
893 
894 	flags = uap->flags;
895 
896 	error = kern_recvmsg(uap->s, msg.msg_name ? &sa : NULL, &auio,
897 	    msg.msg_control ? &control : NULL, &flags, &uap->sysmsg_result);
898 
899 	/*
900 	 * Conditionally copyout the name and populate the namelen field.
901 	 */
902 	if (error == 0 && msg.msg_name) {
903 		fromlen = MIN(msg.msg_namelen, sa->sa_len);
904 		error = copyout(sa, msg.msg_name, fromlen);
905 		if (error == 0)
906 			error = copyout(&fromlen, ufromlenp,
907 			    sizeof(*ufromlenp));
908 	}
909 
910 	/*
911 	 * Copyout msg.msg_control and msg.msg_controllen.
912 	 */
913 	if (error == 0 && msg.msg_control) {
914 		len = msg.msg_controllen;
915 		m = control;
916 		ctlbuf = (caddr_t)msg.msg_control;
917 
918 		while(m && len > 0) {
919 			unsigned int tocopy;
920 
921 			if (len >= m->m_len) {
922 				tocopy = m->m_len;
923 			} else {
924 				msg.msg_flags |= MSG_CTRUNC;
925 				tocopy = len;
926 			}
927 
928 			error = copyout(mtod(m, caddr_t), ctlbuf, tocopy);
929 			if (error)
930 				goto cleanup;
931 
932 			ctlbuf += tocopy;
933 			len -= tocopy;
934 			m = m->m_next;
935 		}
936 		controllen = ctlbuf - (caddr_t)msg.msg_control;
937 		error = copyout(&controllen, ucontrollenp,
938 		    sizeof(*ucontrollenp));
939 	}
940 
941 	if (error == 0)
942 		error = copyout(&flags, uflagsp, sizeof(*uflagsp));
943 
944 cleanup:
945 	if (sa)
946 		FREE(sa, M_SONAME);
947 	iovec_free(&iov, aiov);
948 	if (control)
949 		m_freem(control);
950 	return (error);
951 }
952 
953 /*
954  * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an
955  * in-kernel pointer instead of a userland pointer.  This allows us
956  * to manipulate socket options in the emulation code.
957  */
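/*
 * Illustrative sketch (not part of the original source): setting a socket
 * option from kernel code on an existing descriptor s.  With sopt_td set
 * to NULL, sopt_val is interpreted as kernel memory and no copyin occurs:
 *
 *	struct sockopt sopt;
 *	int on = 1, error;
 *
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_REUSEADDR;
 *	sopt.sopt_val = &on;
 *	sopt.sopt_valsize = sizeof(on);
 *	sopt.sopt_td = NULL;
 *	error = kern_setsockopt(s, &sopt);
 */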
958 int
959 kern_setsockopt(int s, struct sockopt *sopt)
960 {
961 	struct thread *td = curthread;
962 	struct proc *p = td->td_proc;
963 	struct file *fp;
964 	int error;
965 
966 	if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0)
967 		return (EFAULT);
968 	if (sopt->sopt_valsize < 0)
969 		return (EINVAL);
970 
971 	error = holdsock(p->p_fd, s, &fp);
972 	if (error)
973 		return (error);
974 
975 	error = sosetopt((struct socket *)fp->f_data, sopt);
976 	fdrop(fp, td);
977 	return (error);
978 }
979 
980 /*
981  * setsockopt_args(int s, int level, int name, caddr_t val, int valsize)
982  */
983 int
984 setsockopt(struct setsockopt_args *uap)
985 {
986 	struct thread *td = curthread;
987 	struct sockopt sopt;
988 	int error;
989 
990 	sopt.sopt_level = uap->level;
991 	sopt.sopt_name = uap->name;
992 	sopt.sopt_val = uap->val;
993 	sopt.sopt_valsize = uap->valsize;
994 	sopt.sopt_td = td;
995 
996 	error = kern_setsockopt(uap->s, &sopt);
997 	return(error);
998 }
999 
1000 /*
1001  * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an
1002  * in-kernel pointer instead of a userland pointer.  This allows us
1003  * to manipulate socket options in the emulation code.
1004  */
1005 int
1006 kern_getsockopt(int s, struct sockopt *sopt)
1007 {
1008 	struct thread *td = curthread;
1009 	struct proc *p = td->td_proc;
1010 	struct file *fp;
1011 	int error;
1012 
1013 	if (sopt->sopt_val == 0 && sopt->sopt_valsize != 0)
1014 		return (EFAULT);
1015 	if (sopt->sopt_valsize < 0)
1016 		return (EINVAL);
1017 
1018 	error = holdsock(p->p_fd, s, &fp);
1019 	if (error)
1020 		return (error);
1021 
1022 	error = sogetopt((struct socket *)fp->f_data, sopt);
1023 	fdrop(fp, td);
1024 	return (error);
1025 }
1026 
1027 /*
1028  * getsockopt_args(int s, int level, int name, caddr_t val, int *avalsize)
1029  */
1030 int
1031 getsockopt(struct getsockopt_args *uap)
1032 {
1033 	struct thread *td = curthread;
1034 	struct	sockopt sopt;
1035 	int	error, valsize;
1036 
1037 	if (uap->val) {
1038 		error = copyin(uap->avalsize, &valsize, sizeof(valsize));
1039 		if (error)
1040 			return (error);
1041 		if (valsize < 0)
1042 			return (EINVAL);
1043 	} else {
1044 		valsize = 0;
1045 	}
1046 
1047 	sopt.sopt_level = uap->level;
1048 	sopt.sopt_name = uap->name;
1049 	sopt.sopt_val = uap->val;
1050 	sopt.sopt_valsize = valsize;
1051 	sopt.sopt_td = td;
1052 
1053 	error = kern_getsockopt(uap->s, &sopt);
1054 	if (error == 0) {
1055 		valsize = sopt.sopt_valsize;
1056 		error = copyout(&valsize, uap->avalsize, sizeof(valsize));
1057 	}
1058 	return (error);
1059 }
1060 
1061 /*
1062  * The second argument to kern_getsockname() is a handle to a struct sockaddr.
1063  * This allows kern_getsockname() to return a pointer to an allocated struct
1064  * sockaddr which must be freed later with FREE().  The caller must
1065  * initialize *name to NULL.
1066  */
1067 int
1068 kern_getsockname(int s, struct sockaddr **name, int *namelen)
1069 {
1070 	struct thread *td = curthread;
1071 	struct proc *p = td->td_proc;
1072 	struct file *fp;
1073 	struct socket *so;
1074 	struct sockaddr *sa = NULL;
1075 	int error;
1076 
1077 	error = holdsock(p->p_fd, s, &fp);
1078 	if (error)
1079 		return (error);
1080 	if (*namelen < 0) {
1081 		fdrop(fp, td);
1082 		return (EINVAL);
1083 	}
1084 	so = (struct socket *)fp->f_data;
1085 	error = so_pru_sockaddr(so, &sa);
1086 	if (error == 0) {
1087 		if (sa == 0) {
1088 			*namelen = 0;
1089 		} else {
1090 			*namelen = MIN(*namelen, sa->sa_len);
1091 			*name = sa;
1092 		}
1093 	}
1094 
1095 	fdrop(fp, td);
1096 	return (error);
1097 }
1098 
1099 /*
1100  * getsockname_args(int fdes, caddr_t asa, int *alen)
1101  *
1102  * Get socket name.
1103  */
1104 int
1105 getsockname(struct getsockname_args *uap)
1106 {
1107 	struct sockaddr *sa = NULL;
1108 	int error, sa_len;
1109 
1110 	error = copyin(uap->alen, &sa_len, sizeof(sa_len));
1111 	if (error)
1112 		return (error);
1113 
1114 	error = kern_getsockname(uap->fdes, &sa, &sa_len);
1115 
1116 	if (error == 0)
1117 		error = copyout(sa, uap->asa, sa_len);
1118 	if (error == 0)
1119 		error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
1120 	if (sa)
1121 		FREE(sa, M_SONAME);
1122 	return (error);
1123 }
1124 
1125 /*
1126  * The second argument to kern_getpeername() is a handle to a struct sockaddr.
1127  * This allows kern_getpeername() to return a pointer to an allocated struct
1128  * sockaddr which must be freed later with FREE().  The caller must
1129  * initialize *name to NULL.
1130  */
1131 int
1132 kern_getpeername(int s, struct sockaddr **name, int *namelen)
1133 {
1134 	struct thread *td = curthread;
1135 	struct proc *p = td->td_proc;
1136 	struct file *fp;
1137 	struct socket *so;
1138 	struct sockaddr *sa = NULL;
1139 	int error;
1140 
1141 	error = holdsock(p->p_fd, s, &fp);
1142 	if (error)
1143 		return (error);
1144 	if (*namelen < 0) {
1145 		fdrop(fp, td);
1146 		return (EINVAL);
1147 	}
1148 	so = (struct socket *)fp->f_data;
1149 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1150 		fdrop(fp, td);
1151 		return (ENOTCONN);
1152 	}
1153 	error = so_pru_peeraddr(so, &sa);
1154 	if (error == 0) {
1155 		if (sa == 0) {
1156 			*namelen = 0;
1157 		} else {
1158 			*namelen = MIN(*namelen, sa->sa_len);
1159 			*name = sa;
1160 		}
1161 	}
1162 
1163 	fdrop(fp, td);
1164 	return (error);
1165 }
1166 
1167 /*
1168  * getpeername_args(int fdes, caddr_t asa, int *alen)
1169  *
1170  * Get name of peer for connected socket.
1171  */
1172 int
1173 getpeername(struct getpeername_args *uap)
1174 {
1175 	struct sockaddr *sa = NULL;
1176 	int error, sa_len;
1177 
1178 	error = copyin(uap->alen, &sa_len, sizeof(sa_len));
1179 	if (error)
1180 		return (error);
1181 
1182 	error = kern_getpeername(uap->fdes, &sa, &sa_len);
1183 
1184 	if (error == 0)
1185 		error = copyout(sa, uap->asa, sa_len);
1186 	if (error == 0)
1187 		error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
1188 	if (sa)
1189 		FREE(sa, M_SONAME);
1190 	return (error);
1191 }
1192 
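/*
 * Copy a sockaddr of length len in from the userland address uaddr into a
 * freshly allocated M_SONAME buffer and return it via *namp.  The caller
 * is responsible for FREE()ing the result.  Lengths larger than
 * SOCK_MAXADDRLEN or too small to hold a sockaddr header are rejected.
 */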
1193 int
1194 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len)
1195 {
1196 	struct sockaddr *sa;
1197 	int error;
1198 
1199 	*namp = NULL;
1200 	if (len > SOCK_MAXADDRLEN)
1201 		return ENAMETOOLONG;
1202 	if (len < offsetof(struct sockaddr, sa_data[0]))
1203 		return EDOM;
1204 	MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1205 	error = copyin(uaddr, sa, len);
1206 	if (error) {
1207 		FREE(sa, M_SONAME);
1208 	} else {
1209 #if BYTE_ORDER != BIG_ENDIAN
1210 		/*
1211 		 * The bind(), connect(), and sendto() syscalls were not
1212 		 * versioned for COMPAT_43.  Thus, this check must stay.
1213 		 */
1214 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1215 			sa->sa_family = sa->sa_len;
1216 #endif
1217 		sa->sa_len = len;
1218 		*namp = sa;
1219 	}
1220 	return error;
1221 }
1222 
1223 /*
1224  * holdsock() - load the struct file pointer associated
1225  * with a socket into *fpp.  If an error occurs, non-zero
1226  * will be returned and *fpp will be set to NULL.
1227  */
1228 int
1229 holdsock(struct filedesc *fdp, int fdes, struct file **fpp)
1230 {
1231 	struct file *fp;
1232 	int error = 0;
1233 
1234 	*fpp = NULL;
1235 	if ((unsigned)fdes >= fdp->fd_nfiles)
1236 		return EBADF;
1237 	if ((fp = fdp->fd_files[fdes].fp) == NULL)
1238 		return EBADF;
1239 	if (fp->f_type != DTYPE_SOCKET)
1240 		return ENOTSOCK;
1241 	fhold(fp);
1242 	*fpp = fp;
1243 	return (error);
1244 }
1245 
1246 /*
1247  * Detach a mapped page and release resources back to the system.
1248  * We must release our wiring and if the object is ripped out
1249  * from under the vm_page we become responsible for freeing the
1250  * page.
1251  *
1252  * XXX HACK XXX TEMPORARY UNTIL WE IMPLEMENT EXT MBUF REFERENCE COUNTING
1253  */
1254 static void
1255 sf_buf_mref(void *arg)
1256 {
1257 	struct sfbuf_mref *sfm = arg;
1258 
1259 	++sfm->mref_count;
1260 }
1261 
1262 static void
1263 sf_buf_mfree(void *arg)
1264 {
1265 	struct sfbuf_mref *sfm = arg;
1266 	vm_page_t m;
1267 
1268 	KKASSERT(sfm->mref_count > 0);
1269 	if (--sfm->mref_count == 0) {
1270 		m = sf_buf_page(sfm->sf);
1271 		sf_buf_free(sfm->sf);
1272 		crit_enter();
1273 		vm_page_unwire(m, 0);
1274 		if (m->wire_count == 0 && m->object == NULL)
1275 			vm_page_try_to_free(m);
1276 		crit_exit();
1277 		free(sfm, M_SENDFILE);
1278 	}
1279 }
1280 
1281 /*
1282  * sendfile(2).
1283  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1284  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1285  *
1286  * Send a file specified by 'fd' and starting at 'offset' to a socket
1287  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1288  * nbytes == 0. Optionally add a header and/or trailer to the socket
1289  * output. If specified, write the total number of bytes sent into *sbytes.
1290  *
1291  * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused
1292  * the headers to count against the remaining bytes to be sent from
1293  * the file descriptor.  We may wish to implement a compatibility syscall
1294  * in the future.
1295  */
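/*
 * Illustrative userland sketch (not part of the original source): send an
 * entire open file 'filefd' over a connected stream socket 'sockfd' and
 * retrieve the number of bytes actually sent.
 *
 *	off_t sbytes;
 *
 *	if (sendfile(filefd, sockfd, 0, 0, NULL, &sbytes, 0) < 0)
 *		err(1, "sendfile");
 *	printf("sent %jd bytes\n", (intmax_t)sbytes);
 */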
1296 int
1297 sendfile(struct sendfile_args *uap)
1298 {
1299 	struct thread *td = curthread;
1300 	struct proc *p = td->td_proc;
1301 	struct file *fp;
1302 	struct filedesc *fdp;
1303 	struct vnode *vp = NULL;
1304 	struct sf_hdtr hdtr;
1305 	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
1306 	struct uio auio;
1307 	struct mbuf *mheader = NULL;
1308 	off_t hdtr_size = 0, sbytes = 0;
1309 	int error, hbytes = 0, tbytes;
1310 
1311 	KKASSERT(p);
1312 	fdp = p->p_fd;
1313 
1314 	/*
1315 	 * Do argument checking. Must be a regular file in, stream
1316 	 * type and connected socket out, non-negative offset.
1317 	 */
1318 	fp = holdfp(fdp, uap->fd, FREAD);
1319 	if (fp == NULL) {
1320 		return (EBADF);
1321 	}
1322 	if (fp->f_type != DTYPE_VNODE) {
1323 		fdrop(fp, td);
1324 		return (EINVAL);
1325 	}
1326 	vp = (struct vnode *)fp->f_data;
1327 	vref(vp);
1328 	fdrop(fp, td);
1329 
1330 	/*
1331 	 * If specified, get the pointer to the sf_hdtr struct for
1332 	 * any headers/trailers.
1333 	 */
1334 	if (uap->hdtr) {
1335 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1336 		if (error)
1337 			goto done;
1338 		/*
1339 		 * Send any headers.
1340 		 */
1341 		if (hdtr.headers) {
1342 			error = iovec_copyin(hdtr.headers, &iov, aiov,
1343 			    hdtr.hdr_cnt, &hbytes);
1344 			if (error)
1345 				goto done;
1346 			auio.uio_iov = iov;
1347 			auio.uio_iovcnt = hdtr.hdr_cnt;
1348 			auio.uio_offset = 0;
1349 			auio.uio_segflg = UIO_USERSPACE;
1350 			auio.uio_rw = UIO_WRITE;
1351 			auio.uio_td = td;
1352 			auio.uio_resid = hbytes;
1353 
1354 			mheader = m_uiomove(&auio);
1355 
1356 			iovec_free(&iov, aiov);
1357 			if (mheader == NULL)
1358 				goto done;
1359 		}
1360 	}
1361 
1362 	error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader,
1363 	    &sbytes, uap->flags);
1364 	if (error)
1365 		goto done;
1366 
1367 	/*
1368 	 * Send trailers. Wimp out and use writev(2).
1369 	 */
1370 	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1371 		error = iovec_copyin(hdtr.trailers, &iov, aiov,
1372 		    hdtr.trl_cnt, &auio.uio_resid);
1373 		if (error)
1374 			goto done;
1375 		auio.uio_iov = iov;
1376 		auio.uio_iovcnt = hdtr.trl_cnt;
1377 		auio.uio_offset = 0;
1378 		auio.uio_segflg = UIO_USERSPACE;
1379 		auio.uio_rw = UIO_WRITE;
1380 		auio.uio_td = td;
1381 
1382 		error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes);
1383 
1384 		iovec_free(&iov, aiov);
1385 		if (error)
1386 			goto done;
1387 		hdtr_size += tbytes;	/* trailer bytes successfully sent */
1388 	}
1389 
1390 done:
1391 	if (uap->sbytes != NULL) {
1392 		sbytes += hdtr_size;
1393 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
1394 	}
1395 	if (vp)
1396 		vrele(vp);
1397 	return (error);
1398 }
1399 
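/*
 * Backend for sendfile(2): map pages of vp starting at 'offset' into
 * sf_bufs and queue them on the stream socket referenced by descriptor
 * sfd, stopping after 'nbytes' bytes (or at EOF if nbytes is 0).  An
 * optional mheader mbuf chain is prepended to the first chunk of file
 * data and is always consumed (or freed on error).  *sbytes is set to
 * the number of bytes queued, including any header bytes.
 */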
1400 int
1401 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes,
1402     struct mbuf *mheader, off_t *sbytes, int flags)
1403 {
1404 	struct thread *td = curthread;
1405 	struct proc *p = td->td_proc;
1406 	struct vm_object *obj;
1407 	struct socket *so;
1408 	struct file *fp;
1409 	struct mbuf *m;
1410 	struct sf_buf *sf;
1411 	struct sfbuf_mref *sfm;
1412 	struct vm_page *pg;
1413 	off_t off, xfsize;
1414 	off_t hbytes = 0;
1415 	int error = 0;
1416 
1417 	if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) {
1418 		error = EINVAL;
1419 		goto done0;
1420 	}
1421 	error = holdsock(p->p_fd, sfd, &fp);
1422 	if (error)
1423 		goto done0;
1424 	so = (struct socket *)fp->f_data;
1425 	if (so->so_type != SOCK_STREAM) {
1426 		error = EINVAL;
1427 		goto done;
1428 	}
1429 	if ((so->so_state & SS_ISCONNECTED) == 0) {
1430 		error = ENOTCONN;
1431 		goto done;
1432 	}
1433 	if (offset < 0) {
1434 		error = EINVAL;
1435 		goto done;
1436 	}
1437 
1438 	*sbytes = 0;
1439 	/*
1440 	 * Protect against multiple writers to the socket.
1441 	 */
1442 	(void) sblock(&so->so_snd, M_WAITOK);
1443 
1444 	/*
1445 	 * Loop through the pages in the file, starting with the requested
1446 	 * offset. Get a file page (do I/O if necessary), map the file page
1447 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1448 	 * it on the socket.
1449 	 */
1450 	for (off = offset; ; off += xfsize, *sbytes += xfsize + hbytes) {
1451 		vm_pindex_t pindex;
1452 		vm_offset_t pgoff;
1453 
1454 		pindex = OFF_TO_IDX(off);
1455 retry_lookup:
1456 		/*
1457 		 * Calculate the amount to transfer. Not to exceed a page,
1458 		 * the EOF, or the passed in nbytes.
1459 		 */
1460 		xfsize = obj->un_pager.vnp.vnp_size - off;
1461 		if (xfsize > PAGE_SIZE)
1462 			xfsize = PAGE_SIZE;
1463 		pgoff = (vm_offset_t)(off & PAGE_MASK);
1464 		if (PAGE_SIZE - pgoff < xfsize)
1465 			xfsize = PAGE_SIZE - pgoff;
1466 		if (nbytes && xfsize > (nbytes - *sbytes))
1467 			xfsize = nbytes - *sbytes;
1468 		if (xfsize <= 0)
1469 			break;
1470 		/*
1471 		 * Optimize the non-blocking case by looking at the socket space
1472 		 * before going to the extra work of constituting the sf_buf.
1473 		 */
1474 		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1475 			if (so->so_state & SS_CANTSENDMORE)
1476 				error = EPIPE;
1477 			else
1478 				error = EAGAIN;
1479 			sbunlock(&so->so_snd);
1480 			goto done;
1481 		}
1482 		/*
1483 		 * Attempt to look up the page.
1484 		 *
1485 		 *	Allocate if not found, wait and loop if busy, then
1486 		 *	wire the page.  critical section protection is
1487 		 * 	required to maintain the object association (an
1488 		 *	interrupt can free the page) through to the
1489 		 *	vm_page_wire() call.
1490 		 */
1491 		crit_enter();
1492 		pg = vm_page_lookup(obj, pindex);
1493 		if (pg == NULL) {
1494 			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1495 			if (pg == NULL) {
1496 				vm_wait();
1497 				crit_exit();
1498 				goto retry_lookup;
1499 			}
1500 			vm_page_wakeup(pg);
1501 		} else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
1502 			crit_exit();
1503 			goto retry_lookup;
1504 		}
1505 		vm_page_wire(pg);
1506 		crit_exit();
1507 
1508 		/*
1509 		 * If page is not valid for what we need, initiate I/O
1510 		 */
1511 
1512 		if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
1513 			struct uio auio;
1514 			struct iovec aiov;
1515 			int bsize;
1516 
1517 			/*
1518 			 * Ensure that our page is still around when the I/O
1519 			 * completes.
1520 			 */
1521 			vm_page_io_start(pg);
1522 
1523 			/*
1524 			 * Get the page from backing store.
1525 			 */
1526 			bsize = vp->v_mount->mnt_stat.f_iosize;
1527 			auio.uio_iov = &aiov;
1528 			auio.uio_iovcnt = 1;
1529 			aiov.iov_base = 0;
1530 			aiov.iov_len = MAXBSIZE;
1531 			auio.uio_resid = MAXBSIZE;
1532 			auio.uio_offset = trunc_page(off);
1533 			auio.uio_segflg = UIO_NOCOPY;
1534 			auio.uio_rw = UIO_READ;
1535 			auio.uio_td = td;
1536 			vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
1537 			error = VOP_READ(vp, &auio,
1538 				    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1539 				    p->p_ucred);
1540 			VOP_UNLOCK(vp, 0, td);
1541 			vm_page_flag_clear(pg, PG_ZERO);
1542 			vm_page_io_finish(pg);
1543 			if (error) {
1544 				crit_enter();
1545 				vm_page_unwire(pg, 0);
1546 				vm_page_try_to_free(pg);
1547 				crit_exit();
1548 				sbunlock(&so->so_snd);
1549 				goto done;
1550 			}
1551 		}
1552 
1553 
1554 		/*
1555 		 * Get a sendfile buf. We usually wait as long as necessary,
1556 		 * but this wait can be interrupted.
1557 		 */
1558 		if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) {
1559 			crit_enter();
1560 			vm_page_unwire(pg, 0);
1561 			vm_page_try_to_free(pg);
1562 			crit_exit();
1563 			sbunlock(&so->so_snd);
1564 			error = EINTR;
1565 			goto done;
1566 		}
1567 
1568 		/*
1569 		 * Get an mbuf header and set it up as having external storage.
1570 		 */
1571 		MGETHDR(m, MB_WAIT, MT_DATA);
1572 		if (m == NULL) {
1573 			error = ENOBUFS;
1574 			sf_buf_free(sf);
1575 			sbunlock(&so->so_snd);
1576 			goto done;
1577 		}
1578 
1579 		/*
1580 		 * sfm is a temporary hack, use a per-cpu cache for this.
1581 		 */
1582 		sfm = malloc(sizeof(struct sfbuf_mref), M_SENDFILE, M_WAITOK);
1583 		sfm->sf = sf;
1584 		sfm->mref_count = 1;
1585 
1586 		m->m_ext.ext_free = sf_buf_mfree;
1587 		m->m_ext.ext_ref = sf_buf_mref;
1588 		m->m_ext.ext_arg = sfm;
1589 		m->m_ext.ext_buf = (void *)sf->kva;
1590 		m->m_ext.ext_size = PAGE_SIZE;
1591 		m->m_data = (char *) sf->kva + pgoff;
1592 		m->m_flags |= M_EXT;
1593 		m->m_pkthdr.len = m->m_len = xfsize;
1594 		KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0);
1595 
1596 		if (mheader != NULL) {
1597 			hbytes = mheader->m_pkthdr.len;
1598 			mheader->m_pkthdr.len += m->m_pkthdr.len;
1599 			m_cat(mheader, m);
1600 			m = mheader;
1601 			mheader = NULL;
1602 		} else
1603 			hbytes = 0;
1604 
1605 		/*
1606 		 * Add the buffer to the socket buffer chain.
1607 		 */
1608 		crit_enter();
1609 retry_space:
1610 		/*
1611 		 * Make sure that the socket is still able to take more data.
1612 		 * CANTSENDMORE being true usually means that the connection
1613 		 * was closed. so_error is true when an error was sensed after
1614 		 * a previous send.
1615 		 * The state is checked after the page mapping and buffer
1616 		 * allocation above since those operations may block and make
1617 		 * any socket checks stale. From this point forward, nothing
1618 		 * blocks before the pru_send (or more accurately, any blocking
1619 		 * results in a loop back to here to re-check).
1620 		 */
1621 		if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1622 			if (so->so_state & SS_CANTSENDMORE) {
1623 				error = EPIPE;
1624 			} else {
1625 				error = so->so_error;
1626 				so->so_error = 0;
1627 			}
1628 			m_freem(m);
1629 			sbunlock(&so->so_snd);
1630 			crit_exit();
1631 			goto done;
1632 		}
1633 		/*
1634 		 * Wait for socket space to become available. We do this just
1635 		 * after checking the connection state above in order to avoid
1636 		 * a race condition with sbwait().
1637 		 */
1638 		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1639 			if (so->so_state & SS_NBIO) {
1640 				m_freem(m);
1641 				sbunlock(&so->so_snd);
1642 				crit_exit();
1643 				error = EAGAIN;
1644 				goto done;
1645 			}
1646 			error = sbwait(&so->so_snd);
1647 			/*
1648 			 * An error from sbwait usually indicates that we've
1649 			 * been interrupted by a signal. If we've sent anything
1650 			 * then return bytes sent, otherwise return the error.
1651 			 */
1652 			if (error) {
1653 				m_freem(m);
1654 				sbunlock(&so->so_snd);
1655 				crit_exit();
1656 				goto done;
1657 			}
1658 			goto retry_space;
1659 		}
1660 		error = so_pru_send(so, 0, m, NULL, NULL, td);
1661 		crit_exit();
1662 		if (error) {
1663 			sbunlock(&so->so_snd);
1664 			goto done;
1665 		}
1666 	}
1667 	if (mheader != NULL) {
1668 		*sbytes += mheader->m_pkthdr.len;
1669 		error = so_pru_send(so, 0, mheader, NULL, NULL, td);
1670 		mheader = NULL;
1671 	}
1672 	sbunlock(&so->so_snd);
1673 
1674 done:
1675 	fdrop(fp, td);
1676 done0:
1677 	if (mheader != NULL)
1678 		m_freem(mheader);
1679 	return (error);
1680 }
1681 
1682 int
1683 sctp_peeloff(struct sctp_peeloff_args *uap)
1684 {
1685 #ifdef SCTP
1686 	struct thread *td = curthread;
1687 	struct proc *p = td->td_proc;
1688 	struct filedesc *fdp = p->p_fd;
1689 	struct file *lfp = NULL;
1690 	struct file *nfp = NULL;
1691 	int error;
1692 	struct socket *head, *so;
1693 	caddr_t assoc_id;
1694 	int fd;
1695 	short fflag;		/* type must match fp->f_flag */
1696 
1697 	assoc_id = uap->name;
1698 	error = holdsock(fdp, uap->sd, &lfp);
1699 	if (error) {
1700 		return (error);
1701 	}
1702 	crit_enter();
1703 	head = (struct socket *)lfp->f_data;
1704 	error = sctp_can_peel_off(head, assoc_id);
1705 	if (error) {
1706 		crit_exit();
1707 		goto done;
1708 	}
1709 	/*
1710 	 * At this point we know we have an association to peel off;
1711 	 * we proceed to set up the fd.  This may block,
1712 	 * but that is ok.
1713 	 */
1714 
1715 	fflag = lfp->f_flag;
1716 	error = falloc(p, &nfp, &fd);
1717 	if (error) {
1718 		/*
1719 		 * Probably ran out of file descriptors. Put the
1720 		 * unaccepted connection back onto the queue and
1721 		 * do another wakeup so some other process might
1722 		 * have a chance at it.
1723 		 */
1724 		crit_exit();
1725 		goto done;
1726 	}
1727 	fhold(nfp);
1728 	uap->sysmsg_result = fd;
1729 
1730 	so = sctp_get_peeloff(head, assoc_id, &error);
1731 	if (so == NULL) {
1732 		/*
1733 		 * Either someone else peeled it off OR
1734 		 * we can't get a socket.
1735 		 */
1736 		goto noconnection;
1737 	}
1738 	so->so_state &= ~SS_COMP;
1739 	so->so_state &= ~SS_NOFDREF;
1740 	so->so_head = NULL;
1741 	if (head->so_sigio != NULL)
1742 		fsetown(fgetown(head->so_sigio), &so->so_sigio);
1743 
1744 	nfp->f_data = (caddr_t)so;
1745 	nfp->f_flag = fflag;
1746 	nfp->f_ops = &socketops;
1747 	nfp->f_type = DTYPE_SOCKET;
1748 
1749 noconnection:
1750 	/*
1751 	 * close the new descriptor, assuming someone hasn't ripped it
1752 	 * out from under us.
1753 	 */
1754 	if (error) {
1755 		if (fdp->fd_files[fd].fp == nfp) {
1756 			funsetfd(fdp, fd);
1757 			fdrop(nfp, td);
1758 		}
1759 	}
1760 	crit_exit();
1761 	/*
1762 	 * Release explicitly held references before returning.
1763 	 */
1764 done:
1765 	if (nfp != NULL)
1766 		fdrop(nfp, td);
1767 	fdrop(lfp, td);
1768 	return (error);
1769 #else /* SCTP */
1770 	return(EOPNOTSUPP);
1771 #endif /* SCTP */
1772 }
1773