1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
5  *      The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
7  * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
34  */
35 
36 /*
37  *
38  * Copyright (c) 2010 Isilon Systems, Inc.
39  * Copyright (c) 2010 iX Systems, Inc.
40  * Copyright (c) 2010 Panasas, Inc.
41  * All rights reserved.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  * 1. Redistributions of source code must retain the above copyright
47  *    notice unmodified, this list of conditions, and the following
48  *    disclaimer.
49  * 2. Redistributions in binary form must reproduce the above copyright
50  *    notice, this list of conditions and the following disclaimer in the
51  *    documentation and/or other materials provided with the distribution.
52  *
53  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
54  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
55  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
56  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
57  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
58  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
62  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63  *
64  */
65 #include <sys/cdefs.h>
66 __FBSDID("$FreeBSD$");
67 
68 #include <sys/param.h>
69 #include <sys/kernel.h>
70 #include <sys/malloc.h>
71 
72 #include "sdp.h"
73 
74 #include <net/if.h>
75 #include <net/route.h>
76 #include <net/vnet.h>
77 #include <sys/sysctl.h>
78 
79 uma_zone_t	sdp_zone;
80 struct rwlock	sdp_lock;
81 LIST_HEAD(, sdp_sock) sdp_list;
82 
83 struct workqueue_struct *rx_comp_wq;
84 
85 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
86 #define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
87 #define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
88 #define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
89 #define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
90 #define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
91 #define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
92 #define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
93 
94 MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");
95 
96 static void sdp_stop_keepalive_timer(struct socket *so);
97 
98 /*
99  * SDP protocol interface to socket abstraction.
100  */
101 /*
102  * sdp_sendspace and sdp_recvspace are the default send and receive window
103  * sizes, respectively.
104  */
105 u_long	sdp_sendspace = 1024*32;
106 u_long	sdp_recvspace = 1024*64;
107 
108 static int sdp_count;
109 
110 /*
111  * Disable async. CMA events for sockets which are being torn down.
112  */
113 static void
114 sdp_destroy_cma(struct sdp_sock *ssk)
115 {
116 
117 	if (ssk->id == NULL)
118 		return;
119 	rdma_destroy_id(ssk->id);
120 	ssk->id = NULL;
121 }
122 
123 static int
124 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
125 {
126 	struct sockaddr_in *sin;
127 	struct sockaddr_in null;
128 	int error;
129 
130 	SDP_WLOCK_ASSERT(ssk);
131 
132 	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
133 		return (EINVAL);
134 	/* rdma_bind_addr handles bind races.  */
135 	SDP_WUNLOCK(ssk);
136 	if (ssk->id == NULL)
137 		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
138 	if (ssk->id == NULL) {
139 		SDP_WLOCK(ssk);
140 		return (ENOMEM);
141 	}
142 	if (nam == NULL) {
143 		null.sin_family = AF_INET;
144 		null.sin_len = sizeof(null);
145 		null.sin_addr.s_addr = INADDR_ANY;
146 		null.sin_port = 0;
147 		bzero(&null.sin_zero, sizeof(null.sin_zero));
148 		nam = (struct sockaddr *)&null;
149 	}
150 	error = -rdma_bind_addr(ssk->id, nam);
151 	SDP_WLOCK(ssk);
152 	if (error == 0) {
153 		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
154 		ssk->laddr = sin->sin_addr.s_addr;
155 		ssk->lport = sin->sin_port;
156 	} else
157 		sdp_destroy_cma(ssk);
158 	return (error);
159 }
160 
161 static void
162 sdp_pcbfree(struct sdp_sock *ssk)
163 {
164 
165 	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
166 	KASSERT((ssk->flags & SDP_DESTROY) == 0,
167 	    ("ssk %p already destroyed", ssk));
168 
169 	sdp_dbg(ssk->socket, "Freeing pcb");
170 	SDP_WLOCK_ASSERT(ssk);
171 	ssk->flags |= SDP_DESTROY;
172 	SDP_WUNLOCK(ssk);
173 	SDP_LIST_WLOCK();
174 	sdp_count--;
175 	LIST_REMOVE(ssk, list);
176 	SDP_LIST_WUNLOCK();
177 	crfree(ssk->cred);
178 	ssk->qp_active = 0;
179 	if (ssk->qp) {
180 		ib_destroy_qp(ssk->qp);
181 		ssk->qp = NULL;
182 	}
183 	sdp_tx_ring_destroy(ssk);
184 	sdp_rx_ring_destroy(ssk);
185 	sdp_destroy_cma(ssk);
186 	rw_destroy(&ssk->rx_ring.destroyed_lock);
187 	rw_destroy(&ssk->lock);
188 	uma_zfree(sdp_zone, ssk);
189 }
190 
191 /*
192  * Common routines to return a socket address.
193  */
194 static struct sockaddr *
195 sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
196 {
197 	struct sockaddr_in *sin;
198 
199 	sin = malloc(sizeof *sin, M_SONAME,
200 		M_WAITOK | M_ZERO);
201 	sin->sin_family = AF_INET;
202 	sin->sin_len = sizeof(*sin);
203 	sin->sin_addr = *addr_p;
204 	sin->sin_port = port;
205 
206 	return (struct sockaddr *)sin;
207 }
208 
209 static int
210 sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
211 {
212 	struct sdp_sock *ssk;
213 	struct in_addr addr;
214 	in_port_t port;
215 
216 	ssk = sdp_sk(so);
217 	SDP_RLOCK(ssk);
218 	port = ssk->lport;
219 	addr.s_addr = ssk->laddr;
220 	SDP_RUNLOCK(ssk);
221 
222 	*nam = sdp_sockaddr(port, &addr);
223 	return 0;
224 }
225 
226 static int
227 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
228 {
229 	struct sdp_sock *ssk;
230 	struct in_addr addr;
231 	in_port_t port;
232 
233 	ssk = sdp_sk(so);
234 	SDP_RLOCK(ssk);
235 	port = ssk->fport;
236 	addr.s_addr = ssk->faddr;
237 	SDP_RUNLOCK(ssk);
238 
239 	*nam = sdp_sockaddr(port, &addr);
240 	return 0;
241 }
242 
243 static void
244 sdp_pcbnotifyall(struct in_addr faddr, int errno,
245     struct sdp_sock *(*notify)(struct sdp_sock *, int))
246 {
247 	struct sdp_sock *ssk, *ssk_temp;
248 
249 	SDP_LIST_WLOCK();
250 	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
251 		SDP_WLOCK(ssk);
252 		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
253 			SDP_WUNLOCK(ssk);
254 			continue;
255 		}
256 		if ((ssk->flags & SDP_DESTROY) == 0)
257 			if ((*notify)(ssk, errno))
258 				SDP_WUNLOCK(ssk);
259 	}
260 	SDP_LIST_WUNLOCK();
261 }
262 
#if 0
/*
 * Call func(ssk, arg) on every socket in the global list, holding the
 * list read lock for the walk and each socket's write lock around the
 * call.  Currently compiled out.
 */
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *cur;

	SDP_LIST_RLOCK();
	LIST_FOREACH(cur, &sdp_list, list) {
		SDP_WLOCK(cur);
		(*func)(cur, arg);
		SDP_WUNLOCK(cur);
	}
	SDP_LIST_RUNLOCK();
}
#endif
278 
279 static void
280 sdp_output_reset(struct sdp_sock *ssk)
281 {
282 	struct rdma_cm_id *id;
283 
284 	SDP_WLOCK_ASSERT(ssk);
285 	if (ssk->id) {
286 		id = ssk->id;
287 		ssk->qp_active = 0;
288 		SDP_WUNLOCK(ssk);
289 		rdma_disconnect(id);
290 		SDP_WLOCK(ssk);
291 	}
292 	ssk->state = TCPS_CLOSED;
293 }
294 
295 /*
296  * Attempt to close a SDP socket, marking it as dropped, and freeing
297  * the socket if we hold the only reference.
298  */
299 static struct sdp_sock *
300 sdp_closed(struct sdp_sock *ssk)
301 {
302 	struct socket *so;
303 
304 	SDP_WLOCK_ASSERT(ssk);
305 
306 	ssk->flags |= SDP_DROPPED;
307 	so = ssk->socket;
308 	soisdisconnected(so);
309 	if (ssk->flags & SDP_SOCKREF) {
310 		KASSERT(so->so_state & SS_PROTOREF,
311 		    ("sdp_closed: !SS_PROTOREF"));
312 		ssk->flags &= ~SDP_SOCKREF;
313 		SDP_WUNLOCK(ssk);
314 		SOCK_LOCK(so);
315 		so->so_state &= ~SS_PROTOREF;
316 		sofree(so);
317 		return (NULL);
318 	}
319 	return (ssk);
320 }
321 
322 /*
323  * Perform timer based shutdowns which can not operate in
324  * callout context.
325  */
326 static void
327 sdp_shutdown_task(void *data, int pending)
328 {
329 	struct sdp_sock *ssk;
330 
331 	ssk = data;
332 	SDP_WLOCK(ssk);
333 	/*
334 	 * I don't think this can race with another call to pcbfree()
335 	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
336 	 */
337 	if (ssk->flags & SDP_DESTROY)
338 		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
339 		    ssk);
340 	if (ssk->flags & SDP_DISCON)
341 		sdp_output_reset(ssk);
342 	/* We have to clear this so sdp_detach() will call pcbfree(). */
343 	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
344 	if ((ssk->flags & SDP_DROPPED) == 0 &&
345 	    sdp_closed(ssk) == NULL)
346 		return;
347 	if (ssk->socket == NULL) {
348 		sdp_pcbfree(ssk);
349 		return;
350 	}
351 	SDP_WUNLOCK(ssk);
352 }
353 
354 /*
355  * 2msl has expired, schedule the shutdown task.
356  */
357 static void
358 sdp_2msl_timeout(void *data)
359 {
360 	struct sdp_sock *ssk;
361 
362 	ssk = data;
363 	/* Callout canceled. */
364         if (!callout_active(&ssk->keep2msl))
365 		goto out;
366         callout_deactivate(&ssk->keep2msl);
367 	/* Should be impossible, defensive programming. */
368 	if ((ssk->flags & SDP_TIMEWAIT) == 0)
369 		goto out;
370 	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
371 out:
372 	SDP_WUNLOCK(ssk);
373 	return;
374 }
375 
376 /*
377  * Schedule the 2msl wait timer.
378  */
379 static void
380 sdp_2msl_wait(struct sdp_sock *ssk)
381 {
382 
383 	SDP_WLOCK_ASSERT(ssk);
384 	ssk->flags |= SDP_TIMEWAIT;
385 	ssk->state = TCPS_TIME_WAIT;
386 	soisdisconnected(ssk->socket);
387 	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
388 }
389 
390 /*
391  * Timed out waiting for the final fin/ack from rdma_disconnect().
392  */
393 static void
394 sdp_dreq_timeout(void *data)
395 {
396 	struct sdp_sock *ssk;
397 
398 	ssk = data;
399 	/* Callout canceled. */
400         if (!callout_active(&ssk->keep2msl))
401 		goto out;
402 	/* Callout rescheduled, probably as a different timer. */
403 	if (callout_pending(&ssk->keep2msl))
404 		goto out;
405         callout_deactivate(&ssk->keep2msl);
406 	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
407 		goto out;
408 	if ((ssk->flags & SDP_DREQWAIT) == 0)
409 		goto out;
410 	ssk->flags &= ~SDP_DREQWAIT;
411 	ssk->flags |= SDP_DISCON;
412 	sdp_2msl_wait(ssk);
413 	ssk->qp_active = 0;
414 out:
415 	SDP_WUNLOCK(ssk);
416 }
417 
418 /*
419  * Received the final fin/ack.  Cancel the 2msl.
420  */
421 void
422 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
423 {
424 	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
425 	ssk->flags &= ~SDP_DREQWAIT;
426 	sdp_2msl_wait(ssk);
427 }
428 
429 static int
430 sdp_init_sock(struct socket *sk)
431 {
432 	struct sdp_sock *ssk = sdp_sk(sk);
433 
434 	sdp_dbg(sk, "%s\n", __func__);
435 
436 	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
437 	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
438 #ifdef SDP_ZCOPY
439 	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
440 	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
441 	ssk->tx_ring.rdma_inflight = NULL;
442 #endif
443 	atomic_set(&ssk->mseq_ack, 0);
444 	sdp_rx_ring_init(ssk);
445 	ssk->tx_ring.buffer = NULL;
446 
447 	return 0;
448 }
449 
450 /*
451  * Allocate an sdp_sock for the socket and reserve socket buffer space.
452  */
453 static int
454 sdp_attach(struct socket *so, int proto, struct thread *td)
455 {
456 	struct sdp_sock *ssk;
457 	int error;
458 
459 	ssk = sdp_sk(so);
460 	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
461 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
462 		error = soreserve(so, sdp_sendspace, sdp_recvspace);
463 		if (error)
464 			return (error);
465 	}
466 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
467 	so->so_snd.sb_flags |= SB_AUTOSIZE;
468 	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
469 	if (ssk == NULL)
470 		return (ENOBUFS);
471 	rw_init(&ssk->lock, "sdpsock");
472 	ssk->socket = so;
473 	ssk->cred = crhold(so->so_cred);
474 	so->so_pcb = (caddr_t)ssk;
475 	sdp_init_sock(so);
476 	ssk->flags = 0;
477 	ssk->qp_active = 0;
478 	ssk->state = TCPS_CLOSED;
479 	mbufq_init(&ssk->rxctlq, INT_MAX);
480 	SDP_LIST_WLOCK();
481 	LIST_INSERT_HEAD(&sdp_list, ssk, list);
482 	sdp_count++;
483 	SDP_LIST_WUNLOCK();
484 	if ((so->so_options & SO_LINGER) && so->so_linger == 0)
485 		so->so_linger = TCP_LINGERTIME;
486 
487 	return (0);
488 }
489 
490 /*
491  * Detach SDP from the socket, potentially leaving it around for the
492  * timewait to expire.
493  */
494 static void
495 sdp_detach(struct socket *so)
496 {
497 	struct sdp_sock *ssk;
498 
499 	ssk = sdp_sk(so);
500 	SDP_WLOCK(ssk);
501 	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
502 	ssk->socket->so_pcb = NULL;
503 	ssk->socket = NULL;
504 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
505 		SDP_WUNLOCK(ssk);
506 	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
507 		sdp_pcbfree(ssk);
508 	else
509 		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
510 }
511 
512 /*
513  * Allocate a local address for the socket.
514  */
515 static int
516 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
517 {
518 	int error = 0;
519 	struct sdp_sock *ssk;
520 	struct sockaddr_in *sin;
521 
522 	sin = (struct sockaddr_in *)nam;
523 	if (nam->sa_len != sizeof (*sin))
524 		return (EINVAL);
525 	if (sin->sin_family != AF_INET)
526 		return (EINVAL);
527 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
528 		return (EAFNOSUPPORT);
529 
530 	ssk = sdp_sk(so);
531 	SDP_WLOCK(ssk);
532 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
533 		error = EINVAL;
534 		goto out;
535 	}
536 	error = sdp_pcbbind(ssk, nam, td->td_ucred);
537 out:
538 	SDP_WUNLOCK(ssk);
539 
540 	return (error);
541 }
542 
543 /*
544  * Prepare to accept connections.
545  */
546 static int
547 sdp_listen(struct socket *so, int backlog, struct thread *td)
548 {
549 	int error = 0;
550 	struct sdp_sock *ssk;
551 
552 	ssk = sdp_sk(so);
553 	SDP_WLOCK(ssk);
554 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
555 		error = EINVAL;
556 		goto out;
557 	}
558 	if (error == 0 && ssk->lport == 0)
559 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
560 	SOCK_LOCK(so);
561 	if (error == 0)
562 		error = solisten_proto_check(so);
563 	if (error == 0) {
564 		solisten_proto(so, backlog);
565 		ssk->state = TCPS_LISTEN;
566 	}
567 	SOCK_UNLOCK(so);
568 
569 out:
570 	SDP_WUNLOCK(ssk);
571 	if (error == 0)
572 		error = -rdma_listen(ssk->id, backlog);
573 	return (error);
574 }
575 
576 /*
577  * Initiate a SDP connection to nam.
578  */
579 static int
580 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
581 {
582 	struct sockaddr_in src;
583 	struct socket *so;
584 	int error;
585 
586 	so = ssk->socket;
587 
588 	SDP_WLOCK_ASSERT(ssk);
589 	if (ssk->lport == 0) {
590 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
591 		if (error)
592 			return error;
593 	}
594 	src.sin_family = AF_INET;
595 	src.sin_len = sizeof(src);
596 	bzero(&src.sin_zero, sizeof(src.sin_zero));
597 	src.sin_port = ssk->lport;
598 	src.sin_addr.s_addr = ssk->laddr;
599 	soisconnecting(so);
600 	SDP_WUNLOCK(ssk);
601 	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
602 	    SDP_RESOLVE_TIMEOUT);
603 	SDP_WLOCK(ssk);
604 	if (error == 0)
605 		ssk->state = TCPS_SYN_SENT;
606 
607 	return 0;
608 }
609 
610 /*
611  * Initiate SDP connection.
612  */
613 static int
614 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
615 {
616 	int error = 0;
617 	struct sdp_sock *ssk;
618 	struct sockaddr_in *sin;
619 
620 	sin = (struct sockaddr_in *)nam;
621 	if (nam->sa_len != sizeof (*sin))
622 		return (EINVAL);
623 	if (sin->sin_family != AF_INET)
624 		return (EINVAL);
625 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
626 		return (EAFNOSUPPORT);
627 	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
628 		return (error);
629 	ssk = sdp_sk(so);
630 	SDP_WLOCK(ssk);
631 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
632 		error = EINVAL;
633 	else
634 		error = sdp_start_connect(ssk, nam, td);
635 	SDP_WUNLOCK(ssk);
636 	return (error);
637 }
638 
639 /*
640  * Drop a SDP socket, reporting
641  * the specified error.  If connection is synchronized,
642  * then send a RST to peer.
643  */
644 static struct sdp_sock *
645 sdp_drop(struct sdp_sock *ssk, int errno)
646 {
647 	struct socket *so;
648 
649 	SDP_WLOCK_ASSERT(ssk);
650 	so = ssk->socket;
651 	if (TCPS_HAVERCVDSYN(ssk->state))
652 		sdp_output_reset(ssk);
653 	if (errno == ETIMEDOUT && ssk->softerror)
654 		errno = ssk->softerror;
655 	so->so_error = errno;
656 	return (sdp_closed(ssk));
657 }
658 
659 /*
660  * User issued close, and wish to trail through shutdown states:
661  * if never received SYN, just forget it.  If got a SYN from peer,
662  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
663  * If already got a FIN from peer, then almost done; go to LAST_ACK
664  * state.  In all other cases, have already sent FIN to peer (e.g.
665  * after PRU_SHUTDOWN), and just have to play tedious game waiting
666  * for peer to send FIN or not respond to keep-alives, etc.
667  * We can let the user exit from the close as soon as the FIN is acked.
668  */
669 static void
670 sdp_usrclosed(struct sdp_sock *ssk)
671 {
672 
673 	SDP_WLOCK_ASSERT(ssk);
674 
675 	switch (ssk->state) {
676 	case TCPS_LISTEN:
677 		ssk->state = TCPS_CLOSED;
678 		SDP_WUNLOCK(ssk);
679 		sdp_destroy_cma(ssk);
680 		SDP_WLOCK(ssk);
681 		/* FALLTHROUGH */
682 	case TCPS_CLOSED:
683 		ssk = sdp_closed(ssk);
684 		/*
685 		 * sdp_closed() should never return NULL here as the socket is
686 		 * still open.
687 		 */
688 		KASSERT(ssk != NULL,
689 		    ("sdp_usrclosed: sdp_closed() returned NULL"));
690 		break;
691 
692 	case TCPS_SYN_SENT:
693 		/* FALLTHROUGH */
694 	case TCPS_SYN_RECEIVED:
695 		ssk->flags |= SDP_NEEDFIN;
696 		break;
697 
698 	case TCPS_ESTABLISHED:
699 		ssk->flags |= SDP_NEEDFIN;
700 		ssk->state = TCPS_FIN_WAIT_1;
701 		break;
702 
703 	case TCPS_CLOSE_WAIT:
704 		ssk->state = TCPS_LAST_ACK;
705 		break;
706 	}
707 	if (ssk->state >= TCPS_FIN_WAIT_2) {
708 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
709 		if (ssk->state == TCPS_FIN_WAIT_2)
710 			sdp_2msl_wait(ssk);
711 		else
712 			soisdisconnected(ssk->socket);
713 	}
714 }
715 
716 static void
717 sdp_output_disconnect(struct sdp_sock *ssk)
718 {
719 
720 	SDP_WLOCK_ASSERT(ssk);
721 	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
722 	    sdp_dreq_timeout, ssk);
723 	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
724 	sdp_post_sends(ssk, M_NOWAIT);
725 }
726 
727 /*
728  * Initiate or continue a disconnect.
729  * If embryonic state, just send reset (once).
730  * If in ``let data drain'' option and linger null, just drop.
731  * Otherwise (hard), mark socket disconnecting and drop
732  * current input data; switch states based on user close, and
733  * send segment to peer (with FIN).
734  */
735 static void
736 sdp_start_disconnect(struct sdp_sock *ssk)
737 {
738 	struct socket *so;
739 	int unread;
740 
741 	so = ssk->socket;
742 	SDP_WLOCK_ASSERT(ssk);
743 	sdp_stop_keepalive_timer(so);
744 	/*
745 	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
746 	 * socket is still open.
747 	 */
748 	if (ssk->state < TCPS_ESTABLISHED) {
749 		ssk = sdp_closed(ssk);
750 		KASSERT(ssk != NULL,
751 		    ("sdp_start_disconnect: sdp_close() returned NULL"));
752 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
753 		ssk = sdp_drop(ssk, 0);
754 		KASSERT(ssk != NULL,
755 		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
756 	} else {
757 		soisdisconnecting(so);
758 		unread = sbused(&so->so_rcv);
759 		sbflush(&so->so_rcv);
760 		sdp_usrclosed(ssk);
761 		if (!(ssk->flags & SDP_DROPPED)) {
762 			if (unread)
763 				sdp_output_reset(ssk);
764 			else
765 				sdp_output_disconnect(ssk);
766 		}
767 	}
768 }
769 
770 /*
771  * User initiated disconnect.
772  */
773 static int
774 sdp_disconnect(struct socket *so)
775 {
776 	struct sdp_sock *ssk;
777 	int error = 0;
778 
779 	ssk = sdp_sk(so);
780 	SDP_WLOCK(ssk);
781 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
782 		error = ECONNRESET;
783 		goto out;
784 	}
785 	sdp_start_disconnect(ssk);
786 out:
787 	SDP_WUNLOCK(ssk);
788 	return (error);
789 }
790 
791 /*
792  * Accept a connection.  Essentially all the work is done at higher levels;
793  * just return the address of the peer, storing through addr.
794  *
795  *
796  * XXX This is broken XXX
797  *
798  * The rationale for acquiring the sdp lock here is somewhat complicated,
799  * and is described in detail in the commit log entry for r175612.  Acquiring
800  * it delays an accept(2) racing with sonewconn(), which inserts the socket
801  * before the address/port fields are initialized.  A better fix would
802  * prevent the socket from being placed in the listen queue until all fields
803  * are fully initialized.
804  */
805 static int
806 sdp_accept(struct socket *so, struct sockaddr **nam)
807 {
808 	struct sdp_sock *ssk = NULL;
809 	struct in_addr addr;
810 	in_port_t port;
811 	int error;
812 
813 	if (so->so_state & SS_ISDISCONNECTED)
814 		return (ECONNABORTED);
815 
816 	port = 0;
817 	addr.s_addr = 0;
818 	error = 0;
819 	ssk = sdp_sk(so);
820 	SDP_WLOCK(ssk);
821 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
822 		error = ECONNABORTED;
823 		goto out;
824 	}
825 	port = ssk->fport;
826 	addr.s_addr = ssk->faddr;
827 out:
828 	SDP_WUNLOCK(ssk);
829 	if (error == 0)
830 		*nam = sdp_sockaddr(port, &addr);
831 	return error;
832 }
833 
834 /*
835  * Mark the connection as being incapable of further output.
836  */
837 static int
838 sdp_shutdown(struct socket *so)
839 {
840 	int error = 0;
841 	struct sdp_sock *ssk;
842 
843 	ssk = sdp_sk(so);
844 	SDP_WLOCK(ssk);
845 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
846 		error = ECONNRESET;
847 		goto out;
848 	}
849 	socantsendmore(so);
850 	sdp_usrclosed(ssk);
851 	if (!(ssk->flags & SDP_DROPPED))
852 		sdp_output_disconnect(ssk);
853 
854 out:
855 	SDP_WUNLOCK(ssk);
856 
857 	return (error);
858 }
859 
860 static void
861 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
862 {
863 	struct mbuf *n;
864 	int ncnt;
865 
866 	SOCKBUF_LOCK_ASSERT(sb);
867 	SBLASTRECORDCHK(sb);
868 	KASSERT(mb->m_flags & M_PKTHDR,
869 		("sdp_append: %p Missing packet header.\n", mb));
870 	n = sb->sb_lastrecord;
871 	/*
872 	 * If the queue is empty just set all pointers and proceed.
873 	 */
874 	if (n == NULL) {
875 		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
876 		for (; mb; mb = mb->m_next) {
877 	                sb->sb_mbtail = mb;
878 			sballoc(sb, mb);
879 		}
880 		return;
881 	}
882 	/*
883 	 * Count the number of mbufs in the current tail.
884 	 */
885 	for (ncnt = 0; n->m_next; n = n->m_next)
886 		ncnt++;
887 	n = sb->sb_lastrecord;
888 	/*
889 	 * If the two chains can fit in a single sdp packet and
890 	 * the last record has not been sent yet (WRITABLE) coalesce
891 	 * them.  The lastrecord remains the same but we must strip the
892 	 * packet header and then let sbcompress do the hard part.
893 	 */
894 	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
895 	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
896 	    ssk->xmit_size_goal) {
897 		m_adj(mb, SDP_HEAD_SIZE);
898 		n->m_pkthdr.len += mb->m_pkthdr.len;
899 		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
900 		m_demote(mb, 1, 0);
901 		sbcompress(sb, mb, sb->sb_mbtail);
902 		return;
903 	}
904 	/*
905 	 * Not compressible, just append to the end and adjust counters.
906 	 */
907 	sb->sb_lastrecord->m_flags |= M_PUSH;
908 	sb->sb_lastrecord->m_nextpkt = mb;
909 	sb->sb_lastrecord = mb;
910 	if (sb->sb_sndptr == NULL)
911 		sb->sb_sndptr = mb;
912 	for (; mb; mb = mb->m_next) {
913 		sb->sb_mbtail = mb;
914 		sballoc(sb, mb);
915 	}
916 }
917 
918 /*
919  * Do a send by putting data in output queue and updating urgent
920  * marker if URG set.  Possibly send more data.  Unlike the other
921  * pru_*() routines, the mbuf chains are our responsibility.  We
922  * must either enqueue them or free them.  The other pru_* routines
923  * generally are caller-frees.
924  *
925  * This comes from sendfile, normal sends will come from sdp_sosend().
926  */
927 static int
928 sdp_send(struct socket *so, int flags, struct mbuf *m,
929     struct sockaddr *nam, struct mbuf *control, struct thread *td)
930 {
931 	struct sdp_sock *ssk;
932 	struct mbuf *n;
933 	int error;
934 	int cnt;
935 
936 	error = 0;
937 	ssk = sdp_sk(so);
938 	KASSERT(m->m_flags & M_PKTHDR,
939 	    ("sdp_send: %p no packet header", m));
940 	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
941 	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
942 	for (n = m, cnt = 0; n->m_next; n = n->m_next)
943 		cnt++;
944 	if (cnt > SDP_MAX_SEND_SGES) {
945 		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
946 		if (n == NULL) {
947 			m_freem(m);
948 			return (EMSGSIZE);
949 		}
950 		m = n;
951 		for (cnt = 0; n->m_next; n = n->m_next)
952 			cnt++;
953 	}
954 	SDP_WLOCK(ssk);
955 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
956 		if (control)
957 			m_freem(control);
958 		if (m)
959 			m_freem(m);
960 		error = ECONNRESET;
961 		goto out;
962 	}
963 	if (control) {
964 		/* SDP doesn't support control messages. */
965 		if (control->m_len) {
966 			m_freem(control);
967 			if (m)
968 				m_freem(m);
969 			error = EINVAL;
970 			goto out;
971 		}
972 		m_freem(control);	/* empty control, just free it */
973 	}
974 	if (!(flags & PRUS_OOB)) {
975 		SOCKBUF_LOCK(&so->so_snd);
976 		sdp_append(ssk, &so->so_snd, m, cnt);
977 		SOCKBUF_UNLOCK(&so->so_snd);
978 		if (nam && ssk->state < TCPS_SYN_SENT) {
979 			/*
980 			 * Do implied connect if not yet connected.
981 			 */
982 			error = sdp_start_connect(ssk, nam, td);
983 			if (error)
984 				goto out;
985 		}
986 		if (flags & PRUS_EOF) {
987 			/*
988 			 * Close the send side of the connection after
989 			 * the data is sent.
990 			 */
991 			socantsendmore(so);
992 			sdp_usrclosed(ssk);
993 			if (!(ssk->flags & SDP_DROPPED))
994 				sdp_output_disconnect(ssk);
995 		} else if (!(ssk->flags & SDP_DROPPED) &&
996 		    !(flags & PRUS_MORETOCOME))
997 			sdp_post_sends(ssk, M_NOWAIT);
998 		SDP_WUNLOCK(ssk);
999 		return (0);
1000 	} else {
1001 		SOCKBUF_LOCK(&so->so_snd);
1002 		if (sbspace(&so->so_snd) < -512) {
1003 			SOCKBUF_UNLOCK(&so->so_snd);
1004 			m_freem(m);
1005 			error = ENOBUFS;
1006 			goto out;
1007 		}
1008 		/*
1009 		 * According to RFC961 (Assigned Protocols),
1010 		 * the urgent pointer points to the last octet
1011 		 * of urgent data.  We continue, however,
1012 		 * to consider it to indicate the first octet
1013 		 * of data past the urgent section.
1014 		 * Otherwise, snd_up should be one lower.
1015 		 */
1016 		m->m_flags |= M_URG | M_PUSH;
1017 		sdp_append(ssk, &so->so_snd, m, cnt);
1018 		SOCKBUF_UNLOCK(&so->so_snd);
1019 		if (nam && ssk->state < TCPS_SYN_SENT) {
1020 			/*
1021 			 * Do implied connect if not yet connected.
1022 			 */
1023 			error = sdp_start_connect(ssk, nam, td);
1024 			if (error)
1025 				goto out;
1026 		}
1027 		sdp_post_sends(ssk, M_NOWAIT);
1028 		SDP_WUNLOCK(ssk);
1029 		return (0);
1030 	}
1031 out:
1032 	SDP_WUNLOCK(ssk);
1033 	return (error);
1034 }
1035 
/* Wait argument for sblock(): 0 when MSG_DONTWAIT is set, else SBL_WAIT. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) != 0 ? 0 : SBL_WAIT)
1037 
1038 /*
1039  * Send on a socket.  If send must go all at once and message is larger than
1040  * send buffering, then hard error.  Lock against other senders.  If must go
1041  * all at once and not enough room now, then inform user that this would
1042  * block and do nothing.  Otherwise, if nonblocking, send as much as
1043  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
1044  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
1045  * in mbuf chain must be small enough to send all at once.
1046  *
1047  * Returns nonzero on error, timeout or signal; callers must check for short
1048  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
1049  * on return.
1050  */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;	/* send-buffer room; bytes still to send */
	int atomic;		/* nonzero when the caller supplied "top" */
	int error;
	int copy;		/* per-call byte cap handed to m_uiotombuf() */

	/* The total to send comes from the uio, or else from the chain. */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/* A caller-built mbuf chain must go out in a single send. */
	atomic = top != NULL;
	/*
	 * Control data is not accepted: a non-empty control mbuf fails the
	 * call with EINVAL, an empty one is simply discarded.
	 */
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	/* Per-thread resource accounting: one more message sent. */
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	/* Serialize against other senders; released at "release". */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	/*
	 * Outer loop: one pass per "wait for room, then send" cycle.  The
	 * socket state is re-validated under the sockbuf lock every pass.
	 */
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		/* Report a pending socket error once, then clear it. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		/* Out-of-band sends are granted 1024 bytes of extra room. */
		if (flags & MSG_OOB)
			space += 1024;
		/* An atomic send must fit in one SDP payload. */
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		/*
		 * Not enough room: either fail right away for non-blocking
		 * callers or wait on the send buffer and start over.
		 */
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		/*
		 * Inner loop: hand data to sdp_send() one chunk at a time
		 * for as long as data and estimated space both remain.
		 */
		do {
			if (uio == NULL) {
				/* Caller's chain is sent whole, in one go. */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				/* Charge the bytes just copied in. */
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * Set EOF on the last send if the user specified
			 * MSG_EOF.
			 */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/*
			 * The chain is not freed here after sdp_send(),
			 * whatever it returned (presumably sdp_send()
			 * consumes it -- confirm against sdp_send()).
			 */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	/* Free a chain that was never handed to sdp_send(). */
	if (top != NULL)
		m_freem(top);
	return (error);
}
1191 
1192 /*
1193  * The part of soreceive() that implements reading non-inline out-of-band
1194  * data from a socket.  For more complete comments, see soreceive(), from
1195  * which this code originated.
1196  *
1197  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1198  * unable to return an mbuf chain to the caller.
1199  */
1200 static int
1201 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1202 {
1203 	struct protosw *pr = so->so_proto;
1204 	struct mbuf *m;
1205 	int error;
1206 
1207 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1208 
1209 	m = m_get(M_WAITOK, MT_DATA);
1210 	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1211 	if (error)
1212 		goto bad;
1213 	do {
1214 		error = uiomove(mtod(m, void *),
1215 		    (int) min(uio->uio_resid, m->m_len), uio);
1216 		m = m_free(m);
1217 	} while (uio->uio_resid && error == 0 && m);
1218 bad:
1219 	if (m != NULL)
1220 		m_freem(m);
1221 	return (error);
1222 }
1223 
1224 /*
1225  * Optimized version of soreceive() for stream (TCP) sockets.
1226  */
1227 static int
1228 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
1229     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1230 {
1231 	int len = 0, error = 0, flags, oresid;
1232 	struct sockbuf *sb;
1233 	struct mbuf *m, *n = NULL;
1234 	struct sdp_sock *ssk;
1235 
1236 	/* We only do stream sockets. */
1237 	if (so->so_type != SOCK_STREAM)
1238 		return (EINVAL);
1239 	if (psa != NULL)
1240 		*psa = NULL;
1241 	if (controlp != NULL)
1242 		return (EINVAL);
1243 	if (flagsp != NULL)
1244 		flags = *flagsp &~ MSG_EOR;
1245 	else
1246 		flags = 0;
1247 	if (flags & MSG_OOB)
1248 		return (soreceive_rcvoob(so, uio, flags));
1249 	if (mp0 != NULL)
1250 		*mp0 = NULL;
1251 
1252 	sb = &so->so_rcv;
1253 	ssk = sdp_sk(so);
1254 
1255 	/* Prevent other readers from entering the socket. */
1256 	error = sblock(sb, SBLOCKWAIT(flags));
1257 	if (error)
1258 		goto out;
1259 	SOCKBUF_LOCK(sb);
1260 
1261 	/* Easy one, no space to copyout anything. */
1262 	if (uio->uio_resid == 0) {
1263 		error = EINVAL;
1264 		goto out;
1265 	}
1266 	oresid = uio->uio_resid;
1267 
1268 	/* We will never ever get anything unless we are connected. */
1269 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1270 		/* When disconnecting there may be still some data left. */
1271 		if (sbavail(sb))
1272 			goto deliver;
1273 		if (!(so->so_state & SS_ISDISCONNECTED))
1274 			error = ENOTCONN;
1275 		goto out;
1276 	}
1277 
1278 	/* Socket buffer is empty and we shall not block. */
1279 	if (sbavail(sb) == 0 &&
1280 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1281 		error = EAGAIN;
1282 		goto out;
1283 	}
1284 
1285 restart:
1286 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1287 
1288 	/* Abort if socket has reported problems. */
1289 	if (so->so_error) {
1290 		if (sbavail(sb))
1291 			goto deliver;
1292 		if (oresid > uio->uio_resid)
1293 			goto out;
1294 		error = so->so_error;
1295 		if (!(flags & MSG_PEEK))
1296 			so->so_error = 0;
1297 		goto out;
1298 	}
1299 
1300 	/* Door is closed.  Deliver what is left, if any. */
1301 	if (sb->sb_state & SBS_CANTRCVMORE) {
1302 		if (sbavail(sb))
1303 			goto deliver;
1304 		else
1305 			goto out;
1306 	}
1307 
1308 	/* Socket buffer got some data that we shall deliver now. */
1309 	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
1310 	    ((so->so_state & SS_NBIO) ||
1311 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1312 	     sbavail(sb) >= sb->sb_lowat ||
1313 	     sbavail(sb) >= uio->uio_resid ||
1314 	     sbavail(sb) >= sb->sb_hiwat) ) {
1315 		goto deliver;
1316 	}
1317 
1318 	/* On MSG_WAITALL we must wait until all data or error arrives. */
1319 	if ((flags & MSG_WAITALL) &&
1320 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
1321 		goto deliver;
1322 
1323 	/*
1324 	 * Wait and block until (more) data comes in.
1325 	 * NB: Drops the sockbuf lock during wait.
1326 	 */
1327 	error = sbwait(sb);
1328 	if (error)
1329 		goto out;
1330 	goto restart;
1331 
1332 deliver:
1333 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1334 	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
1335 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1336 
1337 	/* Statistics. */
1338 	if (uio->uio_td)
1339 		uio->uio_td->td_ru.ru_msgrcv++;
1340 
1341 	/* Fill uio until full or current end of socket buffer is reached. */
1342 	len = min(uio->uio_resid, sbavail(sb));
1343 	if (mp0 != NULL) {
1344 		/* Dequeue as many mbufs as possible. */
1345 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1346 			for (*mp0 = m = sb->sb_mb;
1347 			     m != NULL && m->m_len <= len;
1348 			     m = m->m_next) {
1349 				len -= m->m_len;
1350 				uio->uio_resid -= m->m_len;
1351 				sbfree(sb, m);
1352 				n = m;
1353 			}
1354 			sb->sb_mb = m;
1355 			if (sb->sb_mb == NULL)
1356 				SB_EMPTY_FIXUP(sb);
1357 			n->m_next = NULL;
1358 		}
1359 		/* Copy the remainder. */
1360 		if (len > 0) {
1361 			KASSERT(sb->sb_mb != NULL,
1362 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1363 
1364 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1365 			if (m == NULL)
1366 				len = 0;	/* Don't flush data from sockbuf. */
1367 			else
1368 				uio->uio_resid -= m->m_len;
1369 			if (*mp0 != NULL)
1370 				n->m_next = m;
1371 			else
1372 				*mp0 = m;
1373 			if (*mp0 == NULL) {
1374 				error = ENOBUFS;
1375 				goto out;
1376 			}
1377 		}
1378 	} else {
1379 		/* NB: Must unlock socket buffer as uiomove may sleep. */
1380 		SOCKBUF_UNLOCK(sb);
1381 		error = m_mbuftouio(uio, sb->sb_mb, len);
1382 		SOCKBUF_LOCK(sb);
1383 		if (error)
1384 			goto out;
1385 	}
1386 	SBLASTRECORDCHK(sb);
1387 	SBLASTMBUFCHK(sb);
1388 
1389 	/*
1390 	 * Remove the delivered data from the socket buffer unless we
1391 	 * were only peeking.
1392 	 */
1393 	if (!(flags & MSG_PEEK)) {
1394 		if (len > 0)
1395 			sbdrop_locked(sb, len);
1396 
1397 		/* Notify protocol that we drained some data. */
1398 		SOCKBUF_UNLOCK(sb);
1399 		SDP_WLOCK(ssk);
1400 		sdp_do_posts(ssk);
1401 		SDP_WUNLOCK(ssk);
1402 		SOCKBUF_LOCK(sb);
1403 	}
1404 
1405 	/*
1406 	 * For MSG_WAITALL we may have to loop again and wait for
1407 	 * more data to come in.
1408 	 */
1409 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1410 		goto restart;
1411 out:
1412 	SOCKBUF_LOCK_ASSERT(sb);
1413 	SBLASTRECORDCHK(sb);
1414 	SBLASTMBUFCHK(sb);
1415 	SOCKBUF_UNLOCK(sb);
1416 	sbunlock(sb);
1417 	return (error);
1418 }
1419 
1420 /*
1421  * Abort is used to teardown a connection typically while sitting in
1422  * the accept queue.
1423  */
1424 void
1425 sdp_abort(struct socket *so)
1426 {
1427 	struct sdp_sock *ssk;
1428 
1429 	ssk = sdp_sk(so);
1430 	SDP_WLOCK(ssk);
1431 	/*
1432 	 * If we have not yet dropped, do it now.
1433 	 */
1434 	if (!(ssk->flags & SDP_TIMEWAIT) &&
1435 	    !(ssk->flags & SDP_DROPPED))
1436 		sdp_drop(ssk, ECONNABORTED);
1437 	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
1438 	    ssk, ssk->flags));
1439 	SDP_WUNLOCK(ssk);
1440 }
1441 
1442 /*
1443  * Close a SDP socket and initiate a friendly disconnect.
1444  */
1445 static void
1446 sdp_close(struct socket *so)
1447 {
1448 	struct sdp_sock *ssk;
1449 
1450 	ssk = sdp_sk(so);
1451 	SDP_WLOCK(ssk);
1452 	/*
1453 	 * If we have not yet dropped, do it now.
1454 	 */
1455 	if (!(ssk->flags & SDP_TIMEWAIT) &&
1456 	    !(ssk->flags & SDP_DROPPED))
1457 		sdp_start_disconnect(ssk);
1458 
1459 	/*
1460 	 * If we've still not dropped let the socket layer know we're
1461 	 * holding on to the socket and pcb for a while.
1462 	 */
1463 	if (!(ssk->flags & SDP_DROPPED)) {
1464 		SOCK_LOCK(so);
1465 		so->so_state |= SS_PROTOREF;
1466 		SOCK_UNLOCK(so);
1467 		ssk->flags |= SDP_SOCKREF;
1468 	}
1469 	SDP_WUNLOCK(ssk);
1470 }
1471 
1472 /*
1473  * User requests out-of-band data.
1474  */
1475 static int
1476 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
1477 {
1478 	int error = 0;
1479 	struct sdp_sock *ssk;
1480 
1481 	ssk = sdp_sk(so);
1482 	SDP_WLOCK(ssk);
1483 	if (!rx_ring_trylock(&ssk->rx_ring)) {
1484 		SDP_WUNLOCK(ssk);
1485 		return (ECONNRESET);
1486 	}
1487 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1488 		error = ECONNRESET;
1489 		goto out;
1490 	}
1491 	if ((so->so_oobmark == 0 &&
1492 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1493 	    so->so_options & SO_OOBINLINE ||
1494 	    ssk->oobflags & SDP_HADOOB) {
1495 		error = EINVAL;
1496 		goto out;
1497 	}
1498 	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
1499 		error = EWOULDBLOCK;
1500 		goto out;
1501 	}
1502 	m->m_len = 1;
1503 	*mtod(m, caddr_t) = ssk->iobc;
1504 	if ((flags & MSG_PEEK) == 0)
1505 		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
1506 out:
1507 	rx_ring_unlock(&ssk->rx_ring);
1508 	SDP_WUNLOCK(ssk);
1509 	return (error);
1510 }
1511 
1512 void
1513 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
1514 {
1515 	struct mbuf *m;
1516 	struct socket *so;
1517 
1518 	so = ssk->socket;
1519 	if (so == NULL)
1520 		return;
1521 
1522 	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
1523 	sohasoutofband(so);
1524 	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
1525 	if (!(so->so_options & SO_OOBINLINE)) {
1526 		for (m = mb; m->m_next != NULL; m = m->m_next);
1527 		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
1528 		ssk->oobflags |= SDP_HAVEOOB;
1529 		m->m_len--;
1530 		mb->m_pkthdr.len--;
1531 	}
1532 }
1533 
1534 /*
1535  * Notify a sdp socket of an asynchronous error.
1536  *
1537  * Do not wake up user since there currently is no mechanism for
1538  * reporting soft errors (yet - a kqueue filter may be added).
1539  */
1540 struct sdp_sock *
1541 sdp_notify(struct sdp_sock *ssk, int error)
1542 {
1543 
1544 	SDP_WLOCK_ASSERT(ssk);
1545 
1546 	if ((ssk->flags & SDP_TIMEWAIT) ||
1547 	    (ssk->flags & SDP_DROPPED))
1548 		return (ssk);
1549 
1550 	/*
1551 	 * Ignore some errors if we are hooked up.
1552 	 */
1553 	if (ssk->state == TCPS_ESTABLISHED &&
1554 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
1555 	     error == EHOSTDOWN))
1556 		return (ssk);
1557 	ssk->softerror = error;
1558 	return sdp_drop(ssk, error);
1559 }
1560 
1561 static void
1562 sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
1563 {
1564 	struct in_addr faddr;
1565 
1566 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
1567 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
1568 		return;
1569 
1570 	sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify);
1571 }
1572 
/*
 * pru_control handler (see sdp_usrreqs).  No control requests are
 * implemented: every call fails with EOPNOTSUPP and all arguments are
 * ignored.
 */
static int
sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
    struct thread *td)
{
	return (EOPNOTSUPP);
}
1579 
1580 static void
1581 sdp_keepalive_timeout(void *data)
1582 {
1583 	struct sdp_sock *ssk;
1584 
1585 	ssk = data;
1586 	/* Callout canceled. */
1587         if (!callout_active(&ssk->keep2msl))
1588                 return;
1589 	/* Callout rescheduled as a different kind of timer. */
1590 	if (callout_pending(&ssk->keep2msl))
1591 		goto out;
1592         callout_deactivate(&ssk->keep2msl);
1593 	if (ssk->flags & SDP_DROPPED ||
1594 	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
1595 		goto out;
1596 	sdp_post_keepalive(ssk);
1597 	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1598 	    sdp_keepalive_timeout, ssk);
1599 out:
1600 	SDP_WUNLOCK(ssk);
1601 }
1602 
1603 
1604 void
1605 sdp_start_keepalive_timer(struct socket *so)
1606 {
1607 	struct sdp_sock *ssk;
1608 
1609 	ssk = sdp_sk(so);
1610 	if (!callout_pending(&ssk->keep2msl))
1611                 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
1612                     sdp_keepalive_timeout, ssk);
1613 }
1614 
1615 static void
1616 sdp_stop_keepalive_timer(struct socket *so)
1617 {
1618 	struct sdp_sock *ssk;
1619 
1620 	ssk = sdp_sk(so);
1621 	callout_stop(&ssk->keep2msl);
1622 }
1623 
1624 /*
1625  * sdp_ctloutput() must drop the inpcb lock before performing copyin on
1626  * socket option arguments.  When it re-acquires the lock after the copy, it
1627  * has to revalidate that the connection is still valid for the socket
1628  * option.
1629  */
1630 #define SDP_WLOCK_RECHECK(inp) do {					\
1631 	SDP_WLOCK(ssk);							\
1632 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
1633 		SDP_WUNLOCK(ssk);					\
1634 		return (ECONNRESET);					\
1635 	}								\
1636 } while(0)
1637 
1638 static int
1639 sdp_ctloutput(struct socket *so, struct sockopt *sopt)
1640 {
1641 	int	error, opt, optval;
1642 	struct sdp_sock *ssk;
1643 
1644 	error = 0;
1645 	ssk = sdp_sk(so);
1646 	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
1647 		SDP_WLOCK(ssk);
1648 		if (so->so_options & SO_KEEPALIVE)
1649 			sdp_start_keepalive_timer(so);
1650 		else
1651 			sdp_stop_keepalive_timer(so);
1652 		SDP_WUNLOCK(ssk);
1653 	}
1654 	if (sopt->sopt_level != IPPROTO_TCP)
1655 		return (error);
1656 
1657 	SDP_WLOCK(ssk);
1658 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
1659 		SDP_WUNLOCK(ssk);
1660 		return (ECONNRESET);
1661 	}
1662 
1663 	switch (sopt->sopt_dir) {
1664 	case SOPT_SET:
1665 		switch (sopt->sopt_name) {
1666 		case TCP_NODELAY:
1667 			SDP_WUNLOCK(ssk);
1668 			error = sooptcopyin(sopt, &optval, sizeof optval,
1669 			    sizeof optval);
1670 			if (error)
1671 				return (error);
1672 
1673 			SDP_WLOCK_RECHECK(ssk);
1674 			opt = SDP_NODELAY;
1675 			if (optval)
1676 				ssk->flags |= opt;
1677 			else
1678 				ssk->flags &= ~opt;
1679 			sdp_do_posts(ssk);
1680 			SDP_WUNLOCK(ssk);
1681 			break;
1682 
1683 		default:
1684 			SDP_WUNLOCK(ssk);
1685 			error = ENOPROTOOPT;
1686 			break;
1687 		}
1688 		break;
1689 
1690 	case SOPT_GET:
1691 		switch (sopt->sopt_name) {
1692 		case TCP_NODELAY:
1693 			optval = ssk->flags & SDP_NODELAY;
1694 			SDP_WUNLOCK(ssk);
1695 			error = sooptcopyout(sopt, &optval, sizeof optval);
1696 			break;
1697 		default:
1698 			SDP_WUNLOCK(ssk);
1699 			error = ENOPROTOOPT;
1700 			break;
1701 		}
1702 		break;
1703 	}
1704 	return (error);
1705 }
1706 #undef SDP_WLOCK_RECHECK
1707 
1708 int sdp_mod_count = 0;
1709 int sdp_mod_usec = 0;
1710 
1711 void
1712 sdp_set_default_moderation(struct sdp_sock *ssk)
1713 {
1714 	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
1715 		return;
1716 	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
1717 }
1718 
1719 static void
1720 sdp_dev_add(struct ib_device *device)
1721 {
1722 	struct ib_fmr_pool_param param;
1723 	struct sdp_device *sdp_dev;
1724 
1725 	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
1726 	sdp_dev->pd = ib_alloc_pd(device, 0);
1727 	if (IS_ERR(sdp_dev->pd))
1728 		goto out_pd;
1729 	memset(&param, 0, sizeof param);
1730 	param.max_pages_per_fmr = SDP_FMR_SIZE;
1731 	param.page_shift = PAGE_SHIFT;
1732 	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
1733 	param.pool_size = SDP_FMR_POOL_SIZE;
1734 	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
1735 	param.cache = 1;
1736 	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
1737 	if (IS_ERR(sdp_dev->fmr_pool))
1738 		goto out_fmr;
1739 	ib_set_client_data(device, &sdp_client, sdp_dev);
1740 	return;
1741 
1742 out_fmr:
1743 	ib_dealloc_pd(sdp_dev->pd);
1744 out_pd:
1745 	free(sdp_dev, M_SDP);
1746 }
1747 
1748 static void
1749 sdp_dev_rem(struct ib_device *device, void *client_data)
1750 {
1751 	struct sdp_device *sdp_dev;
1752 	struct sdp_sock *ssk;
1753 
1754 	SDP_LIST_WLOCK();
1755 	LIST_FOREACH(ssk, &sdp_list, list) {
1756 		if (ssk->ib_device != device)
1757 			continue;
1758 		SDP_WLOCK(ssk);
1759 		if ((ssk->flags & SDP_DESTROY) == 0)
1760 			ssk = sdp_notify(ssk, ECONNRESET);
1761 		if (ssk)
1762 			SDP_WUNLOCK(ssk);
1763 	}
1764 	SDP_LIST_WUNLOCK();
1765 	/*
1766 	 * XXX Do I need to wait between these two?
1767 	 */
1768 	sdp_dev = ib_get_client_data(device, &sdp_client);
1769 	if (!sdp_dev)
1770 		return;
1771 	ib_flush_fmr_pool(sdp_dev->fmr_pool);
1772 	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
1773 	ib_dealloc_pd(sdp_dev->pd);
1774 	free(sdp_dev, M_SDP);
1775 }
1776 
1777 struct ib_client sdp_client =
1778     { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
1779 
1780 
1781 static int
1782 sdp_pcblist(SYSCTL_HANDLER_ARGS)
1783 {
1784 	int error, n, i;
1785 	struct sdp_sock *ssk;
1786 	struct xinpgen xig;
1787 
1788 	/*
1789 	 * The process of preparing the TCB list is too time-consuming and
1790 	 * resource-intensive to repeat twice on every request.
1791 	 */
1792 	if (req->oldptr == NULL) {
1793 		n = sdp_count;
1794 		n += imax(n / 8, 10);
1795 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
1796 		return (0);
1797 	}
1798 
1799 	if (req->newptr != NULL)
1800 		return (EPERM);
1801 
1802 	/*
1803 	 * OK, now we're committed to doing something.
1804 	 */
1805 	SDP_LIST_RLOCK();
1806 	n = sdp_count;
1807 	SDP_LIST_RUNLOCK();
1808 
1809 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
1810 		+ n * sizeof(struct xtcpcb));
1811 	if (error != 0)
1812 		return (error);
1813 
1814 	bzero(&xig, sizeof(xig));
1815 	xig.xig_len = sizeof xig;
1816 	xig.xig_count = n;
1817 	xig.xig_gen = 0;
1818 	xig.xig_sogen = so_gencnt;
1819 	error = SYSCTL_OUT(req, &xig, sizeof xig);
1820 	if (error)
1821 		return (error);
1822 
1823 	SDP_LIST_RLOCK();
1824 	for (ssk = LIST_FIRST(&sdp_list), i = 0;
1825 	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
1826 		struct xtcpcb xt;
1827 
1828 		SDP_RLOCK(ssk);
1829 		if (ssk->flags & SDP_TIMEWAIT) {
1830 			if (ssk->cred != NULL)
1831 				error = cr_cansee(req->td->td_ucred,
1832 				    ssk->cred);
1833 			else
1834 				error = EINVAL;	/* Skip this inp. */
1835 		} else if (ssk->socket)
1836 			error = cr_canseesocket(req->td->td_ucred,
1837 			    ssk->socket);
1838 		else
1839 			error = EINVAL;
1840 		if (error) {
1841 			error = 0;
1842 			goto next;
1843 		}
1844 
1845 		bzero(&xt, sizeof(xt));
1846 		xt.xt_len = sizeof xt;
1847 		xt.xt_inp.inp_gencnt = 0;
1848 		xt.xt_inp.inp_vflag = INP_IPV4;
1849 		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
1850 		xt.xt_inp.inp_lport = ssk->lport;
1851 		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
1852 		xt.xt_inp.inp_fport = ssk->fport;
1853 		xt.t_state = ssk->state;
1854 		if (ssk->socket != NULL)
1855 			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
1856 		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
1857 		SDP_RUNLOCK(ssk);
1858 		error = SYSCTL_OUT(req, &xt, sizeof xt);
1859 		if (error)
1860 			break;
1861 		i++;
1862 		continue;
1863 next:
1864 		SDP_RUNLOCK(ssk);
1865 	}
1866 	if (!error) {
1867 		/*
1868 		 * Give the user an updated idea of our state.
1869 		 * If the generation differs from what we told
1870 		 * her before, she knows that something happened
1871 		 * while we were processing this request, and it
1872 		 * might be necessary to retry.
1873 		 */
1874 		xig.xig_gen = 0;
1875 		xig.xig_sogen = so_gencnt;
1876 		xig.xig_count = sdp_count;
1877 		error = SYSCTL_OUT(req, &xig, sizeof xig);
1878 	}
1879 	SDP_LIST_RUNLOCK();
1880 	return (error);
1881 }
1882 
/*
 * Sysctl subtree "sdp" under _net_inet.  The OID number is given as -1
 * (presumably equivalent to OID_AUTO -- confirm).
 */
static SYSCTL_NODE(_net_inet, -1,  sdp,    CTLFLAG_RW, 0,  "SDP");

/* Read-only "pcblist" node, served by sdp_pcblist(). */
SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");
1888 
/*
 * maxsockets_change event handler (registered in sdp_init()): re-apply
 * the current maxsockets value as the limit of the sdp_sock zone.
 * "tag" is unused.
 */
static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}
1895 
/*
 * Domain initialization hook (installed as sdpdomain.dom_init): set up
 * the global socket list, the sdp_sock allocation zone, the receive
 * completion workqueue, and register SDP as an InfiniBand client.
 */
static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	/* Zone for struct sdp_sock, capped at maxsockets entries. */
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	/* Keep the zone limit in step with later maxsockets changes. */
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
		EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	/* From here on sdp_dev_add()/sdp_dev_rem() may be invoked. */
	ib_register_client(&sdp_client);
}
1909 
/* Forward declaration: sdpsw[] below refers to the domain defined later. */
extern struct domain sdpdomain;

/*
 * Socket-layer entry points for SDP sockets.  Both sdpsw[] entries share
 * this table.  Sending and receiving use the SDP-specific sdp_sosend()
 * and sdp_sorecv() routines.
 */
struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};
1931 
/*
 * Protocol switch for the SDP domain.  The two SOCK_STREAM entries are
 * identical except for pr_protocol (IPPROTO_IP and IPPROTO_TCP), so a
 * socket created with either protocol number gets the same handlers.
 */
struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};
1952 
/*
 * The AF_INET_SDP domain.  sdp_init() runs as its init hook, and
 * dom_protoswNPROTOSW points one element past the end of sdpsw[].
 */
struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

/* Register the domain defined above. */
DOMAIN_SET(sdp);
1962 
/*
 * Debug verbosity settings; their consumers are not visible in this part
 * of the file.  General debugging defaults to 1, data-path debugging to 0.
 */
int sdp_debug_level = 1;
int sdp_data_debug_level = 0;
1965