/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which supports much but not all of what we want to achieve, with
 * the result that this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter
 * only once they are no longer in use by lwIP as well.  A bit
 * counterintuitively, we deliberately use a small lwIP per-PCB TCP send
 * buffer limit (TCP_SND_BUF) in the lwIP configuration (lwipopts.h) in order
 * to more easily trigger conditions where we cannot enqueue data (or the
 * final FIN) right away.  This way, we get to test the internal logic of this
 * module a lot more easily.  The small lwIP send queue size should not have
 * any impact on performance, as our own per-socket send queues can be much
 * larger and we enqueue more data on the lwIP PCB as soon as we can in all
 * cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP,
 * but since those may be many buffers with small amounts of data each, we
 * perform fairly aggressive merging of consecutive buffers.  The intended
 * result is that we waste no more than 50% of memory within the receive
 * queue.  Merging requires memory copies, which makes it expensive, but we do
 * not configure lwIP with enough buffers to make running out of buffers a
 * non-issue, so this trade-off is necessary.  Practical experience and
 * measurements of the merge policy will have to show whether and how the
 * current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent or
 * acknowledged.  We can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events.  For these
 * reasons we use a few internal TCPF_ flags to perform partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better alternative.
 * One example of this is the check to see if our FIN was acknowledged, for
 * SO_LINGER support.  In terms of maintenance, our hope is that if lwIP's API
 * changes later, we can change our code to imitate whatever lwIP's BSD API
 * implementation does at that point.
 */

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor macros.  Make sure we do not attempt to use the NetBSD one
 * where it matters.  We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want.  The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues.  Therefore, we must make sure the receive buffer is
 * larger than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768) /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */

/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from
 * TCP sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants.  In case it does not,
 * we can only reactively kill off TCP sockets and/or free enqueued ethernet
 * packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)
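
/*
 * For illustration only (the actual pool maximum is configured elsewhere):
 * with a pool maximum of, say, 1024 buffers, at most 768 of them may hold
 * TCP send queue data at any given time.
 */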

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];
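
/*
 * A sketch of the send queue administration (illustrative only; see
 * tcpsock_pcb_enqueue() and tcpsock_event_sent() for the exact rules).
 * Bytes before (ts_head, ts_head_off) have been acknowledged and freed;
 * bytes from there up to (ts_unsent, ts_unsent_off) have been enqueued on
 * the lwIP PCB but not yet acknowledged; the remaining bytes have not yet
 * been handed to lwIP at all.
 *
 *   ts_head             ts_unsent                    ts_tail
 *      |                    |                           |
 *      v                    v                           v
 *   [pbuf] ---next---> [pbuf] ---next---> ... ---> [pbuf]
 *      ^ts_head_off       ^ts_unsent_off
 */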

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl) \
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp) \
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl) \
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl) \
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;
	tcpsock_recvbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'.  The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket.  The socket has not yet been accepted by
 * userland, so add it to the queue of connections pending for the listening
 * socket.  On success, return OK.  On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue of
	 * the listening socket--in this order, because the same next pointer
	 * is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The returned
 * buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free the
 * next buffer(s) in the chain as well.  This may be called for pbufs other
 * than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP will
 * no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed to
 * be freed, free it now.  The socket is ready to be freed if it was either on
 * a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue.  This is safe to do right now,
	 * because the PCB has been aborted (or was already gone).  We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it.  Otherwise, free
	 * the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need not (and must
		 * not) attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE.  That flag may be set
	 * if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}

/*
 * Abort the lwIP PCB for the given socket, using tcp_abort().  If the PCB is
 * connected, this will cause the connection to be reset.  The PCB, which must
 * have still been present before the call, will be gone after the call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}

/*
 * Close the lwIP PCB for the given socket, using tcp_close().  If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
 * FALSE if they are not.  Upon calling this function, the socket's PCB must
 * still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we do
	 *    not risk corruption in case there are still unsent or unack'ed
	 *    data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
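	/*
	 * In terms of the administration kept by this module: condition 1
	 * corresponds to an empty send queue (ts_len == 0), while conditions
	 * 2 and 3 correspond to the TCPF_SENT_FIN and TCPF_RCVD_FIN flags,
	 * respectively.
	 */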
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close() rules.
 * This implies that its send queue is already empty.  Gracefully close the
 * PCB.  In addition, if the socket is being closed gracefully, meaning we
 * suspended an earlier tcpsock_close() call (and as such already emptied the
 * receive queue as well), then tell libsockevent that the close is finished,
 * freeing the socket.  Return TRUE if the socket has indeed been freed this
 * way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB.  Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera.  We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the given
 * TCP socket.  Return TRUE if anything at all could be newly enqueued on the
 * lwIP PCB, even if less than desired.  In that case, the caller should try to
 * send whatever was enqueued, and if applicable, check if the socket may now
 * be closed (due to the FIN being enqueued).  In particular, in any situation
 * where the socket may be in the process of being closed, the caller must use
 * tcpsock_may_close() if TRUE is returned.  Return FALSE if nothing new could
 * be enqueued, in which case no send attempt needs to be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there is
		 * nothing more to send right now, because the tail buffer may
		 * be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones.  In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN.  The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process from
			 * us, making tracking sockets especially for SO_LINGER
			 * even harder.  For now, we simply effectively undo
			 * the patch by clearing TF_CLOSEPEND if tcp_shutdown()
			 * returns ERR_MEM.  This will not be sustainable in
			 * the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}

/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB.  On success, return OK.  On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB.  If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return.  If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller.  Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone.  As such, callers that
	 * set 'raise_error' to FALSE must know for sure that the socket was
	 * not being closed, for example because the caller is processing a
	 * (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}

/*
 * Callback from lwIP.  The given number of data bytes have been acknowledged
 * as received by the remote end.  Dequeue and free data from the TCP socket's
 * send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the socket's
	 * PCB.  Otherwise, we may also be able to send more now, so try to
	 * resume sending.  Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either.  If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}

/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send additional
 * data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue.  If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data.  The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any time
		 * is represented as 'left'.  The number of bytes that we still
		 * allow to be stored in the receive queue is represented as
		 * 'delta'.  We must make sure that 'left' does not ever exceed
		 * 'delta' while acknowledging as many bytes as possible under
		 * that rule.
		 */
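		/*
		 * A worked example, with numbers chosen for illustration
		 * only: say TCP_WND is 8192 and tr_unacked is 4096, so lwIP
		 * may still hand us left = 4096 bytes.  With rcvbuf = 16384
		 * and tr_len = 10240, we can accept delta = 6144 more bytes.
		 * We may thus acknowledge ack = 6144 - 4096 = 2048 bytes,
		 * which leaves tr_unacked at 2048 and makes 'left' equal to
		 * 'delta' again.
		 */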
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}

/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue of
 * a TCP socket, freeing up one of the two buffers as a result.  The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'.  The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer.  The second buffer may be followed by additional buffers
 * with even more new data.  Return TRUE if buffers have been merged, in which
 * case the pointer at 'pnext' may have changed, and no assumptions should be
 * made about whether 'ptail' and 'pbuf' still exist in any form.  Return FALSE
 * if no merging was necessary or if no new buffer could be allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were given
	 * by the lower layers, so we cannot merge two buffers without first
	 * allocating a third.  Once we have done that, though, we can easily
	 * merge more into that new buffer.  For now we use the following
	 * policies:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less than
	 *    half the size of a full buffer, try to allocate a new buffer and
	 *    copy both lwIP-provided buffers into that new buffer, freeing up
	 *    the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full buffers
	 * on the receive queue, but this policy should both keep actual memory
	 * usage to no more than twice the receive queue length and prevent
	 * excessive copying.  The policy deliberately performs more aggressive
	 * merging into a buffer that we allocated ourselves.
	 */
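	/*
	 * An illustration with example numbers, assuming a 512-byte
	 * MEMPOOL_BUFSIZE: two lwIP-provided buffers holding 100 and 150
	 * bytes fall under case #1 and are copied into a single new 512-byte
	 * buffer, which then holds 250 bytes.  If the next buffer holds 200
	 * bytes, case #2 applies, as those 200 bytes fit within the 262
	 * bytes still free in the new buffer.
	 */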
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/*
		 * Case #1.
		 */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/*
		 * Case #2.
		 */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Callback from lwIP.  New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would.  If lwIP ever changes
	 * in this regard, we will likely have to change this code accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close the
		 * socket anyway, as described in tcpsock_close().  However, if
		 * there is still unacknowledged outgoing data or we did not
		 * even manage to send our FIN yet, hold off closing the socket
		 * for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could track
	 * the amount of data up to and including the last-pushed segment using
	 * a "tr_push_len" field or so.  Deciding when to deliver "un-pushed"
	 * data after all is a bit trickier though.  As far as I can tell, the
	 * BSDs do not implement anything like that.  Windows does, and this
	 * results in interaction problems with even more lightweight TCP/IP
	 * stacks that do not send the TCP PSH flag.  Currently, there is no
	 * obvious benefit for us to support delaying data delivery like that.
	 * In addition, testing its implementation reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new total
	 * of buffers owned by us must not exceed the size of the memory pool.
	 * Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
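	/*
	 * For example (illustrative only): with three buffers A->B->C on the
	 * receive queue, tr_pre_tailp points to B's 'next' field; with just
	 * a single buffer A on the queue, it points to the 'tr_head' field.
	 */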
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also get
	 * a pointer to the pointer to the new penultimate tail buffer.  Due to
	 * merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}

/*
 * Callback from lwIP.  The PCB corresponding to the socket identified by 'arg'
 * has been closed by lwIP, with the reason specified in 'err': either the
 * connection has been aborted locally (ERR_ABRT), it has been reset by the
 * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket.  As a result it may be freed, in which case we
	 * must not touch it anymore.  No need to return ERR_ABRT from here, as
	 * the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus putting
		 * the socket in LAST_ACK state, and we receive that last
		 * acknowledgment.  There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST.  Convert it to a
		 * regular error and set it on the socket.  Doing so will also
		 * raise the appropriate events.
		 */
		/*
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called once every five seconds on connected sockets, and twice per
 * second on closing sockets.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any send
	 * requests now, both for enqueuing TCP data with lwIP and for user
	 * requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */

				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will be
			 * gone, and the socket object may be gone as well.  Do
			 * not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to take
		 * in more data from a user process now, even if we did not
		 * manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we call
		 * tcp_output() every once in a while when there is still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN got
	 * acknowledged.  If so, finish closing the socket.  Unfortunately, we
	 * can perform this check by polling only.  TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}

/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}

/*
 * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection.  There is nothing we can do with that
	 * information.  Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != ERR_OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept the
		 * connection.  Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}

/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that mode
	 * now.  That involves switching PCBs as lwIP attempts to save memory
	 * by replacing the original PCB with a smaller one.  If the socket was
	 * already in listening mode, simply update its backlog value--this has
	 * no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be replaced,
		 * because if we do not, once the PCB is reused (which does not
		 * clear the argument), we might get weird events.  Do this
		 * before the tcp_listen() call, because we should no longer
		 * access the old PCB afterwards (even if we can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}

/*
 * Callback from lwIP.  A socket connection attempt has succeeded.  Note that
 * failed socket events will trigger the tcpsock_event_err() callback instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}

/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level flags
	 * to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return.  We
	 * copy NetBSD's.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is CLOSED.
	 * Some of the other states require distinct error codes, though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;
	case SYN_SENT:
		return EALREADY;
	case LISTEN:
		assert(0); /* we just checked.. */
	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting.  If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us.  We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB with
	 * a newly set local and remote IP address anyway.  We should be
	 * careful not to rely on the addresses being as they were before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}
1609 
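/*
 * A minimal sketch of the raw-API wiring performed above (hypothetical
 * callback names, not compiled here): tcp_connect() registers only the
 * connected callback, so the other callbacks are attached separately, and
 * connection failures surface through the error callback instead.
 *
 *	tcp_arg(pcb, tcp);
 *	if (tcp_connect(pcb, &dst_addr, dst_port, on_connected) == ERR_OK) {
 *		tcp_recv(pcb, on_recv);
 *		tcp_sent(pcb, on_sent);
 *		tcp_err(pcb, on_err);	// called on RST/timeout; PCB is gone
 *		tcp_poll(pcb, on_poll, interval);	// coarse-timer ticks
 *	}
 */
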
1610 /*
1611  * Test whether any new connections are pending on a listening TCP socket.
1612  */
1613 static int
1614 tcpsock_test_accept(struct sock * sock)
1615 {
1616 	struct tcpsock *tcp = (struct tcpsock *)sock;
1617 
1618 	/* Is this socket in listening mode at all? */
1619 	if (!tcpsock_is_listening(tcp))
1620 		return EINVAL;
1621 
1622 	/* Are there any connections to accept right now? */
1623 	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
1624 		return OK;
1625 
1626 	/* If the socket has been shut down, we return ECONNABORTED. */
1627 	if (tcp->tcp_pcb == NULL)
1628 		return ECONNABORTED;
1629 
1630 	/* Otherwise, wait for a new connection first. */
1631 	return SUSPEND;
1632 }
1633 
1634 /*
1635  * Accept a connection on a listening TCP socket, creating a new TCP socket.
1636  */
1637 static sockid_t
1638 tcpsock_accept(struct sock * sock, struct sockaddr * addr,
1639 	socklen_t * addr_len, endpoint_t user_endpt __unused,
1640 	struct sock ** newsockp)
1641 {
1642 	struct tcpsock *listener = (struct tcpsock *)sock;
1643 	struct tcpsock *tcp;
1644 	int r;
1645 
1646 	if ((r = tcpsock_test_accept(sock)) != OK)
1647 		return r;
1648 	/* Below, we must not assume that the listener has a PCB. */
1649 
1650 	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
1651 	assert(tcp->tcp_listener == listener);
1652 	assert(tcp->tcp_pcb != NULL);
1653 
1654 	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
1655 	tcp->tcp_listener = NULL;
1656 
1657 	tcp_backlog_accepted(tcp->tcp_pcb);
1658 
1659 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
1660 	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
1661 
1662 	/*
1663 	 * Set 'newsockp' to NULL so that libsockevent knows we already cloned
1664 	 * the socket, and it must not be reinitialized anymore.
1665 	 */
1666 	*newsockp = NULL;
1667 	return tcpsock_get_id(tcp);
1668 }
1669 
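/*
 * For clarity, a condensed sketch of the accept-queue lifecycle, combining
 * the dequeue side above with the enqueue side in the accept event handling
 * elsewhere in this module (illustrative only, not the actual code):
 *
 *	// on a new incoming connection 'tcp' for 'listener':
 *	tcp->tcp_listener = listener;
 *	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
 *
 *	// on accept(2): detach and let lwIP refill the backlog slot
 *	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
 *	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
 *	tcp->tcp_listener = NULL;
 *	tcp_backlog_accepted(tcp->tcp_pcb);
 */
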
1670 /*
1671  * Perform preliminary checks on a send request.
1672  */
1673 static int
1674 tcpsock_pre_send(struct sock * sock, size_t len __unused,
1675 	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
1676 	socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
1677 {
1678 
1679 	/*
1680 	 * Reject calls with unknown flags.  Since libsockevent strips out the
1681 	 * flags it handles itself here, we only have to test for ones we can
1682 	 * not handle.  Currently, there are no send flags that we support.
1683 	 */
1684 	if (flags != 0)
1685 		return EOPNOTSUPP;
1686 
1687 	return OK;
1688 }
1689 
1690 /*
1691  * Test whether the given number of data bytes can be sent on a TCP socket.
1692  */
1693 static int
1694 tcpsock_test_send(struct sock * sock, size_t min)
1695 {
1696 	struct tcpsock *tcp = (struct tcpsock *)sock;
1697 	size_t sndbuf;
1698 
1699 	if (tcp->tcp_pcb == NULL)
1700 		return EPIPE;
1701 
1702 	switch (tcp->tcp_pcb->state) {
1703 	case CLOSED:			/* new */
1704 	case LISTEN:			/* listening */
1705 		return ENOTCONN;
1706 	case SYN_SENT:			/* connecting */
1707 	case SYN_RCVD:			/* simultaneous open, maybe someday? */
1708 		return SUSPEND;
1709 	case ESTABLISHED:		/* connected */
1710 	case CLOSE_WAIT:		/* closed remotely */
1711 		break;
1712 	default:			/* shut down locally */
1713 		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
1714 		return EPIPE;
1715 	}
1716 
1717 	sndbuf = tcpsock_get_sndbuf(tcp);
1718 	if (min > sndbuf)
1719 		min = sndbuf;
1720 
1721 	if (tcp->tcp_snd.ts_len + min > sndbuf)
1722 		return SUSPEND;
1723 	else
1724 		return OK;
1725 }
1726 
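/*
 * A worked example of the watermark check above, with hypothetical numbers:
 * for sndbuf = 32768 and ts_len = 30000, a call with min = 4096 suspends
 * (30000 + 4096 > 32768), while min = 2048 proceeds.  Equivalently, after
 * min has been clamped to sndbuf, the call proceeds iff min <= sndbuf -
 * ts_len.
 */
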
1727 /*
1728  * Send data on a TCP socket.
1729  */
1730 static int
1731 tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
1732 	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
1733 	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
1734 	const struct sockaddr * addr __unused, socklen_t addr_len __unused,
1735 	endpoint_t user_endpt __unused, int flags __unused, size_t min)
1736 {
1737 	struct tcpsock *tcp = (struct tcpsock *)sock;
1738 	struct pbuf *ptail, *pfirst, *pnext, *plast;
1739 	size_t off, tail_off, chunk, left, sndbuf;
1740 	int r;
1741 
1742 	if ((r = tcpsock_test_send(sock, min)) != OK)
1743 		return r;
1744 
1745 	if (len == 0)
1746 		return OK;	/* nothing to do */
1747 
1748 	sndbuf = tcpsock_get_sndbuf(tcp);
1749 	if (min > sndbuf)
1750 		min = sndbuf;
1751 	assert(min > 0);
1752 
1753 	assert(sndbuf > tcp->tcp_snd.ts_len);
1754 	left = sndbuf - tcp->tcp_snd.ts_len;
1755 	if (left > len)
1756 		left = len;
1757 
1758 	/*
1759 	 * First see if we can fit any more data in the current tail buffer.
1760 	 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
1761 	 * length of the tail buffer, while optimistically extending it to
1762 	 * include the new data.  If not, we set them to NULL/0.
1763 	 */
1764 	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
1765 	    ptail->len < ptail->tot_len) {
1766 		assert(ptail->len > 0);
1767 		tail_off = (size_t)ptail->len;
1768 
1769 		/*
1770 		 * Optimistically extend the head buffer to include whatever
1771 		 * fits in it.  This is needed for util_copy_data().
1772 		 */
1773 		assert(ptail->tot_len > ptail->len);
1774 		off = (size_t)ptail->tot_len - (size_t)ptail->len;
1775 		if (off > left)
1776 			off = left;
1777 		ptail->len += off;
1778 	} else {
1779 		ptail = NULL;
1780 		tail_off = 0;
1781 		off = 0;
1782 	}
1783 
1784 	/*
1785 	 * Then, if there is more to send, allocate new buffers as needed.  If
1786 	 * we run out of memory, work with whatever we did manage to grab.
1787 	 */
1788 	pfirst = NULL;
1789 	plast = NULL;
1790 	while (off < left) {
1791 		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
1792 		    (pnext = tcpsock_alloc_buf()) == NULL) {
1793 			/*
1794 			 * Chances are that we will end up suspending this send
1795 			 * request because of being out of buffers.  We try to
1796 			 * resume such requests from the polling function.
1797 			 */
1798 			tcpsock_set_flag(tcp, TCPF_OOM);
1799 
1800 			break;
1801 		}
1802 
1803 		tcpsock_sendbufs++;
1804 
1805 		if (pfirst == NULL)
1806 			pfirst = pnext;
1807 		else
1808 			plast->next = pnext;
1809 		plast = pnext;
1810 
1811 		chunk = (size_t)pnext->tot_len;
1812 		if (chunk > left - off)
1813 			chunk = left - off;
1814 		pnext->len = chunk;
1815 		off += chunk;
1816 	}
1817 
1818 	/*
1819 	 * Copy in the data and continue, unless we did not manage to find
1820 	 * enough space to even meet the low send watermark, in which case we
1821 	 * undo any allocation and suspend the call until later.
1822 	 */
1823 	if (off >= min) {
1824 		/*
1825 		 * Optimistically attach the new buffers to the tail, also for
1826 		 * util_copy_data().  We undo all this if the copy fails.
1827 		 */
1828 		if (ptail != NULL) {
1829 			ptail->next = pfirst;
1830 
1831 			pnext = ptail;
1832 		} else
1833 			pnext = pfirst;
1834 
1835 		assert(pnext != NULL);
1836 
1837 		r = util_copy_data(data, off, *offp, pnext, tail_off,
1838 		    TRUE /*copy_in*/);
1839 	} else
1840 		r = SUSPEND;
1841 
1842 	if (r != OK) {
1843 		/* Undo the modifications made so far. */
1844 		while (pfirst != NULL) {
1845 			pnext = pfirst->next;
1846 
1847 			assert(tcpsock_sendbufs > 0);
1848 			tcpsock_sendbufs--;
1849 
1850 			tcpsock_free_buf(pfirst);
1851 
1852 			pfirst = pnext;
1853 		}
1854 
1855 		if (ptail != NULL) {
1856 			ptail->next = NULL;
1857 
1858 			ptail->len = tail_off;
1859 		}
1860 
1861 		return r;
1862 	}
1863 
1864 	/* Attach the new buffers, if any, to the buffer tail. */
1865 	if (pfirst != NULL) {
1866 		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
1867 			assert(ptail->len == ptail->tot_len);
1868 
1869 			/*
1870 			 * Due to our earlier optimistic modifications, this
1871 			 * may or may not be redundant.
1872 			 */
1873 			ptail->next = pfirst;
1874 		}
1875 
1876 		assert(plast != NULL);
1877 		tcp->tcp_snd.ts_tail = plast;
1878 
1879 		if (tcp->tcp_snd.ts_head == NULL) {
1880 			tcp->tcp_snd.ts_head = pfirst;
1881 			assert(tcp->tcp_snd.ts_head_off == 0);
1882 		}
1883 		if (tcp->tcp_snd.ts_unsent == NULL) {
1884 			tcp->tcp_snd.ts_unsent = pfirst;
1885 			assert(tcp->tcp_snd.ts_unsent_off == 0);
1886 		}
1887 	}
1888 
1889 	tcp->tcp_snd.ts_len += off;
1890 
1891 	/*
1892 	 * See if we can send any of the data we just enqueued.  The socket is
1893 	 * still open as we are still processing a call from userland on it;
1894 	 * this saves us from having to deal with the case where the following
1895 	 * calls end up freeing the socket object.
1896 	 */
1897 	if (tcpsock_pcb_enqueue(tcp) &&
1898 	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
1899 		/*
1900 		 * That did not go well.  Return the error immediately if we
1901 		 * had not made any progress earlier.  Otherwise, return our
1902 		 * partial progress and leave the error to be picked up later.
1903 		 */
1904 		if (*offp > 0) {
1905 			sockevent_set_error(tcpsock_get_sock(tcp), r);
1906 
1907 			return OK;
1908 		} else
1909 			return r;
1910 	}
1911 
1912 	*offp += off;
1913 	return (off < len) ? SUSPEND : OK;
1914 }
1915 
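/*
 * The send path above depends on temporarily extending the tail buffer; a
 * sketch of that invariant with hypothetical numbers (tot_len = 512,
 * len = 300, so 212 bytes of the tail are still free):
 *
 *	tail_off = ptail->len;		// 300: offset where new data lands
 *	ptail->len += off;		// extend by off = min(212, left)
 *	// ...util_copy_data() copies into the extended area...
 *	ptail->len = tail_off;		// undo: only on copy failure
 */
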
1916 /*
1917  * Perform preliminary checks on a receive request.
1918  */
1919 static int
1920 tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
1921 	int flags)
1922 {
1923 
1924 	/*
1925 	 * Reject calls with unknown flags.  Since libsockevent strips out the
1926 	 * flags it handles itself here, we only have to test for ones we can
1927 	 * not handle.
1928 	 */
1929 	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
1930 		return EOPNOTSUPP;
1931 
1932 	return OK;
1933 }
1934 
1935 /*
1936  * Return TRUE if receive calls may wait for more data to come in on the
1937  * connection, or FALSE if we already know that that is not going to happen.
1938  */
1939 static int
1940 tcpsock_may_wait(struct tcpsock * tcp)
1941 {
1942 
1943 	return (tcp->tcp_pcb != NULL &&
1944 	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
1945 }
1946 
1947 /*
1948  * Test whether data can be received on a TCP socket, and if so, how many bytes
1949  * of data.
1950  */
1951 static int
1952 tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
1953 {
1954 	struct tcpsock *tcp = (struct tcpsock *)sock;
1955 	int may_wait;
1956 
1957 	/* If there is no connection now and never was one, refuse the call. */
1958 	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
1959 	    tcp->tcp_pcb->state == LISTEN))
1960 		return ENOTCONN;
1961 
1962 	/*
1963 	 * If we are certain that no more data will come in later, ignore the
1964 	 * low receive watermark.  Otherwise, bound it to the size of the
1965 	 * receive buffer, or receive calls may block forever.
1966 	 */
1967 	if (!(may_wait = tcpsock_may_wait(tcp)))
1968 		min = 1;
1969 	else if (min > tcpsock_get_rcvbuf(tcp))
1970 		min = tcpsock_get_rcvbuf(tcp);
1971 
1972 	if (tcp->tcp_rcv.tr_len >= min) {
1973 		if (size != NULL)
1974 			*size = tcp->tcp_rcv.tr_len;
1975 
1976 		return OK;
1977 	}
1978 
1979 	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
1980 }
1981 
1982 /*
1983  * Receive data on a TCP socket.
1984  */
1985 static int
1986 tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
1987 	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
1988 	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
1989 	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
1990 	endpoint_t user_endpt __unused, int flags, size_t min,
1991 	int * rflags __unused)
1992 {
1993 	struct tcpsock *tcp = (struct tcpsock *)sock;
1994 	struct pbuf *ptail;
1995 	size_t off, left;
1996 	int r;
1997 
1998 	/* See if we can receive at all, and if so, how much at most. */
1999 	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
2000 		return r;
2001 
2002 	if (len == 0)
2003 		return OK;	/* nothing to do */
2004 
2005 	off = tcp->tcp_rcv.tr_len;
2006 	if (off > len)
2007 		off = len;
2008 
2009 	assert(tcp->tcp_rcv.tr_head != NULL);
2010 	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);
2011 
2012 	/* Copy out the data to the caller. */
2013 	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
2014 	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
2015 		return r;
2016 
2017 	/* Unless peeking, remove the data from the receive queue. */
2018 	if (!(flags & MSG_PEEK)) {
2019 		left = off;
2020 
2021 		/* Dequeue and free as many entire buffers as possible. */
2022 		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
2023 		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
2024 			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;
2025 
2026 			tcp->tcp_rcv.tr_head = ptail->next;
2027 			tcp->tcp_rcv.tr_head_off = 0;
2028 
2029 			if (tcp->tcp_rcv.tr_head == NULL)
2030 				tcp->tcp_rcv.tr_pre_tailp = NULL;
2031 			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
2032 				tcp->tcp_rcv.tr_pre_tailp =
2033 				    &tcp->tcp_rcv.tr_head;
2034 
2035 			assert(tcpsock_recvbufs > 0);
2036 			tcpsock_recvbufs--;
2037 
2038 			tcpsock_free_buf(ptail);
2039 		}
2040 
2041 		/*
2042 		 * If only part of the (new) head buffer is consumed, adjust
2043 		 * the saved offset into that buffer.
2044 		 */
2045 		if (left > 0) {
2046 			assert(tcp->tcp_rcv.tr_head != NULL);
2047 			assert((size_t)tcp->tcp_rcv.tr_head->len -
2048 			    tcp->tcp_rcv.tr_head_off > left);
2049 
2050 			tcp->tcp_rcv.tr_head_off += left;
2051 		}
2052 
2053 		tcp->tcp_rcv.tr_len -= off;
2054 
2055 		if (tcp->tcp_rcv.tr_head != NULL) {
2056 			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
2057 			assert(tcp->tcp_rcv.tr_len > 0);
2058 		} else {
2059 			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
2060 			assert(tcp->tcp_rcv.tr_len == 0);
2061 		}
2062 
2063 		/*
2064 		 * The receive buffer has shrunk, so there may now be space to
2065 		 * receive more data.
2066 		 */
2067 		if (tcp->tcp_pcb != NULL)
2068 			tcpsock_ack_recv(tcp);
2069 	} else
2070 		flags &= ~MSG_WAITALL; /* for the check below */
2071 
2072 	/* Advance the current copy position, and see if we are done. */
2073 	*offp += off;
2074 	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
2075 		return SUSPEND;
2076 	else
2077 		return OK;
2078 }
2079 
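/*
 * A worked example of the dequeuing loop above, with hypothetical numbers:
 * given a head pbuf of len 200 with tr_head_off 150, a second pbuf of len
 * 100, and off = 120 bytes consumed, the loop frees the head pbuf (consuming
 * its remaining 50 bytes, leaving left = 70) and then stops, after which
 * tr_head_off is set to 70 within the second pbuf, now the new head.
 */
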
2080 /*
2081  * Update the set of flag-type socket options on a TCP socket.
2082  */
2083 static void
2084 tcpsock_setsockmask(struct sock * sock, unsigned int mask)
2085 {
2086 	struct tcpsock *tcp = (struct tcpsock *)sock;
2087 
2088 	if (tcp->tcp_pcb == NULL)
2089 		return;
2090 
2091 	if (mask & SO_REUSEADDR)
2092 		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
2093 	else
2094 		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);
2095 
2096 	if (mask & SO_KEEPALIVE)
2097 		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
2098 	else
2099 		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
2100 }
2101 
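/*
 * For reference, these flag options arrive through the standard socket API;
 * a hedged userland example (nothing here is specific to this module):
 *
 *	int on = 1;
 *
 *	(void)setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	// libsockevent updates its flag mask and calls tcpsock_setsockmask(),
 *	// which mirrors the flag onto the PCB as SOF_KEEPALIVE.
 */
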
2102 /*
2103  * Prepare a helper structure for IP-level option processing.
2104  */
2105 static void
2106 tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
2107 {
2108 
2109 	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
2110 	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
2111 	ipopts->tos = &tcp->tcp_pcb->tos;
2112 	ipopts->ttl = &tcp->tcp_pcb->ttl;
2113 	ipopts->sndmin = TCP_SNDBUF_MIN;
2114 	ipopts->sndmax = TCP_SNDBUF_MAX;
2115 	ipopts->rcvmin = TCP_RCVBUF_MIN;
2116 	ipopts->rcvmax = TCP_RCVBUF_MAX;
2117 }
2118 
2119 /*
2120  * Set socket options on a TCP socket.
2121  */
2122 static int
2123 tcpsock_setsockopt(struct sock * sock, int level, int name,
2124 	const struct sockdriver_data * data, socklen_t len)
2125 {
2126 	struct tcpsock *tcp = (struct tcpsock *)sock;
2127 	struct ipopts ipopts;
2128 	uint32_t uval;
2129 	int r, val;
2130 
2131 	if (tcp->tcp_pcb == NULL)
2132 		return ECONNRESET;
2133 
2134 	/* Handle TCP-level options. */
2135 	switch (level) {
2136 	case IPPROTO_IPV6:
2137 		switch (name) {
2138 		case IPV6_RECVTCLASS:
2139 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2140 			    len)) != OK)
2141 				return r;
2142 
2143 			/*
2144 			 * This option is not supported for TCP sockets; it
2145 			 * would not even make sense.  However, named(8)
2146 			 * insists on trying to set it anyway.  We accept the
2147 			 * request but ignore the value, not even returning
2148 			 * what was set through getsockopt(2).
2149 			 */
2150 			return OK;
2151 
2152 		case IPV6_FAITH:
2153 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2154 			    len)) != OK)
2155 				return r;
2156 
2157 			/*
2158 			 * This option is not supported at all, but to save
2159 			 * ourselves from having to remember the current state
2160 			 * for getsockopt(2), we also refuse to enable it.
2161 			 */
2162 			if (val != 0)
2163 				return EINVAL;
2164 
2165 			return OK;
2166 		}
2167 
2168 		break;
2169 
2170 	case IPPROTO_TCP:
2171 		switch (name) {
2172 		case TCP_NODELAY:
2173 			/*
2174 			 * lwIP's listening TCP PCBs do not have this field.
2175 			 * If this ever becomes an issue, we can create our own
2176 			 * shadow flag and do the inheritance ourselves.
2177 			 */
2178 			if (tcp->tcp_pcb->state == LISTEN)
2179 				return EINVAL;
2180 
2181 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2182 			    len)) != OK)
2183 				return r;
2184 
2185 			if (val)
2186 				tcp_nagle_disable(tcp->tcp_pcb);
2187 			else
2188 				tcp_nagle_enable(tcp->tcp_pcb);
2189 
2190 			return OK;
2191 
2192 		case TCP_KEEPIDLE:
2193 		case TCP_KEEPINTVL:
2194 			/*
2195 			 * lwIP's listening TCP PCBs do not have these fields.
2196 			 */
2197 			if (tcp->tcp_pcb->state == LISTEN)
2198 				return EINVAL;
2199 
2200 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2201 			    len)) != OK)
2202 				return r;
2203 
2204 			if (val == 0)
2205 				return EINVAL;
2206 
2207 			/*
2208 			 * The given value is unsigned, but lwIP stores the
2209 			 * value in milliseconds in a uint32_t field, so we
2210 			 * have to limit large values to whatever fits in the
2211 			 * field anyway.
2212 			 */
2213 			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
2214 				uval = UINT32_MAX;
2215 			else
2216 				uval = (uint32_t)val * 1000;
2217 
2218 			if (name == TCP_KEEPIDLE)
2219 				tcp->tcp_pcb->keep_idle = uval;
2220 			else
2221 				tcp->tcp_pcb->keep_intvl = uval;
2222 
2223 			return OK;
2224 
2225 		case TCP_KEEPCNT:
2226 			/* lwIP's listening TCP PCBs do not have this field. */
2227 			if (tcp->tcp_pcb->state == LISTEN)
2228 				return EINVAL;
2229 
2230 			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
2231 			    len)) != OK)
2232 				return r;
2233 
2234 			if (val == 0)
2235 				return EINVAL;
2236 
2237 			tcp->tcp_pcb->keep_cnt = (uint32_t)val;
2238 
2239 			return OK;
2240 		}
2241 
2242 		return EOPNOTSUPP;
2243 	}
2244 
2245 	/* Handle all other options at the IP level. */
2246 	tcpsock_get_ipopts(tcp, &ipopts);
2247 
2248 	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2249 	    len, &ipopts);
2250 }
2251 
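/*
 * A worked example of the keepalive conversion above: TCP_KEEPIDLE = 75
 * (seconds) is stored as keep_idle = 75000 (milliseconds), while any value
 * above UINT32_MAX / 1000 (4294967), as well as any negative value, is
 * clamped to UINT32_MAX milliseconds.
 */
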
2252 /*
2253  * Retrieve socket options on a TCP socket.
2254  */
2255 static int
2256 tcpsock_getsockopt(struct sock * sock, int level, int name,
2257 	const struct sockdriver_data * data, socklen_t * len)
2258 {
2259 	struct tcpsock *tcp = (struct tcpsock *)sock;
2260 	struct ipopts ipopts;
2261 	int val;
2262 
2263 	if (tcp->tcp_pcb == NULL)
2264 		return ECONNRESET;
2265 
2266 	/* Handle TCP-level options. */
2267 	switch (level) {
2268 	case IPPROTO_IPV6:
2269 		switch (name) {
2270 		case IPV6_RECVTCLASS:
2271 		case IPV6_FAITH:
2272 			val = 0;
2273 
2274 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2275 			    len);
2276 		}
2277 
2278 		break;
2279 
2280 	case IPPROTO_TCP:
2281 		switch (name) {
2282 		case TCP_NODELAY:
2283 			/* lwIP's listening TCP PCBs do not have this field. */
2284 			if (tcp->tcp_pcb->state == LISTEN)
2285 				return EINVAL;
2286 
2287 			val = tcp_nagle_disabled(tcp->tcp_pcb);
2288 
2289 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2290 			    len);
2291 
2292 		case TCP_MAXSEG:
2293 			/* lwIP's listening TCP PCBs do not have this field. */
2294 			if (tcp->tcp_pcb->state == LISTEN)
2295 				return EINVAL;
2296 
2297 			/* This option is read-only at this time. */
2298 			val = tcp->tcp_pcb->mss;
2299 
2300 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2301 			    len);
2302 
2303 		case TCP_KEEPIDLE:
2304 			/* lwIP's listening TCP PCBs do not have this field. */
2305 			if (tcp->tcp_pcb->state == LISTEN)
2306 				return EINVAL;
2307 
2308 			val = (int)(tcp->tcp_pcb->keep_idle / 1000);
2309 
2310 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2311 			    len);
2312 
2313 		case TCP_KEEPINTVL:
2314 			/* lwIP's listening TCP PCBs do not have this field. */
2315 			if (tcp->tcp_pcb->state == LISTEN)
2316 				return EINVAL;
2317 
2318 			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);
2319 
2320 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2321 			    len);
2322 
2323 		case TCP_KEEPCNT:
2324 			/* lwIP's listening TCP PCBs do not have this field. */
2325 			if (tcp->tcp_pcb->state == LISTEN)
2326 				return EINVAL;
2327 
2328 			val = (int)tcp->tcp_pcb->keep_cnt;
2329 
2330 			return sockdriver_copyout_opt(data, &val, sizeof(val),
2331 			    len);
2332 		}
2333 
2334 		return EOPNOTSUPP;
2335 	}
2336 
2337 	/* Handle all other options at the IP level. */
2338 	tcpsock_get_ipopts(tcp, &ipopts);
2339 
2340 	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
2341 	    len, &ipopts);
2342 }
2343 
2344 /*
2345  * Retrieve the local socket address of a TCP socket.
2346  */
2347 static int
2348 tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
2349 	socklen_t * addr_len)
2350 {
2351 	struct tcpsock *tcp = (struct tcpsock *)sock;
2352 
2353 	if (tcp->tcp_pcb == NULL)
2354 		return EINVAL;
2355 
2356 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2357 	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);
2358 
2359 	return OK;
2360 }
2361 
2362 /*
2363  * Retrieve the remote socket address of a TCP socket.
2364  */
2365 static int
2366 tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
2367 	socklen_t * addr_len)
2368 {
2369 	struct tcpsock *tcp = (struct tcpsock *)sock;
2370 
2371 	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
2372 	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
2373 		return ENOTCONN;
2374 
2375 	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
2376 	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);
2377 
2378 	return OK;
2379 }
2380 
2381 /*
2382  * Perform a TCP half-close on a TCP socket.  This operation may not complete
2383  * immediately due to memory conditions, in which case it will be completed at
2384  * a later time.
2385  */
2386 static void
2387 tcpsock_send_fin(struct tcpsock * tcp)
2388 {
2389 
2390 	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);
2391 
2392 	/*
2393 	 * Attempt to send the FIN.  If a fatal error occurs as a result, raise
2394 	 * it as an asynchronous error, because this function's callers cannot
2395 	 * do much with it.  That happens to match the way these functions are
2396 	 * used elsewhere.  In any case, as a result, the PCB may be closed.
2397 	 * However, we are never called from a situation where the socket is
2398 	 * being closed here, so the socket object will not be freed either.
2399 	 */
2400 	if (tcpsock_pcb_enqueue(tcp)) {
2401 		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));
2402 
2403 		if (tcpsock_may_close(tcp))
2404 			tcpsock_finish_close(tcp);
2405 		else
2406 			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
2407 	}
2408 }
2409 
2410 /*
2411  * Shut down a TCP socket for reading and/or writing.
2412  */
2413 static int
2414 tcpsock_shutdown(struct sock * sock, unsigned int mask)
2415 {
2416 	struct tcpsock *tcp = (struct tcpsock *)sock;
2417 
2418 	/*
2419 	 * If the PCB is gone, we want to allow shutdowns for reading but not
2420 	 * writing: shutting down for writing affects the PCB, shutting down
2421 	 * for reading does not.  Also, if the PCB is in CLOSED state, we would
2422 	 * not know how to deal with subsequent operations after a shutdown for
2423 	 * writing, so forbid such calls altogether.
2424 	 */
2425 	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
2426 	    (mask & SFL_SHUT_WR))
2427 		return ENOTCONN;
2428 
2429 	/*
2430 	 * Handle listening sockets as a special case.  Shutting down a
2431 	 * listening socket frees its PCB.  Sockets pending on the accept queue
2432 	 * may still be accepted, but after that, accept(2) will start
2433 	 * returning ECONNABORTED.  This feature allows multi-process server
2434 	 * applications to shut down gracefully, supposedly..
2435 	 */
2436 	if (tcpsock_is_listening(tcp)) {
2437 		if (tcp->tcp_pcb != NULL)
2438 			tcpsock_pcb_close(tcp);
2439 
2440 		return OK;
2441 	}
2442 
2443 	/*
2444 	 * We control shutdown-for-reading locally, and intentionally do not tell
2445 	 * lwIP about it: if we do that and also shut down for writing, the PCB
2446 	 * may disappear (now or eventually), which is not what we want.
2447 	 * Instead, we only tell lwIP to shut down for reading once we actually
2448 	 * want to get rid of the PCB, using tcp_close().  In the meantime, if
2449 	 * the socket is shut down for reading by the user, we simply discard
2450 	 * received data as fast as we can--one out of a number of possible
2451 	 * design choices there, and (reportedly) the one used by the BSDs.
2452 	 */
2453 	if (mask & SFL_SHUT_RD)
2454 		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);
2455 
2456 	/*
2457 	 * Shutting down for writing a connecting socket simply closes its PCB.
2458 	 * Closing a PCB in SYN_SENT state simply deallocates it, so this can
2459 	 * not fail.  On the other hand, for connected sockets we want to send
2460 	 * a FIN, which may fail due to memory shortage, in which case we have
2461 	 * to try again later..
2462 	 */
2463 	if (mask & SFL_SHUT_WR) {
2464 		if (tcp->tcp_pcb->state == SYN_SENT)
2465 			tcpsock_pcb_close(tcp);
2466 		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
2467 			tcpsock_send_fin(tcp);
2468 	}
2469 
2470 	return OK;
2471 }
2472 
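/*
 * The resulting shutdown(2) semantics, from the caller's point of view
 * (standard calls, hypothetical descriptor):
 *
 *	shutdown(fd, SHUT_WR);		// queue a FIN; receiving still works
 *	shutdown(fd, SHUT_RD);		// silently discard incoming data
 *	shutdown(fd, SHUT_RDWR);	// both of the above
 */
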
2473 /*
2474  * Close a TCP socket.  Complete the operation immediately if possible, or
2475  * otherwise initiate the closing process and complete it later, notifying
2476  * libsockevent about that as well.  Depending on linger settings, this
2477  * function may be called twice on the same socket: the first time with the
2478  * 'force' flag cleared, and the second time with the 'force' flag set.
2479  */
2480 static int
2481 tcpsock_close(struct sock * sock, int force)
2482 {
2483 	struct tcpsock *tcp = (struct tcpsock *)sock;
2484 	struct tcpsock *queued;
2485 	size_t rlen;
2486 
2487 	assert(tcp->tcp_listener == NULL);
2488 
2489 	/*
2490 	 * If this is a listening socket, abort and clean up any and all
2491 	 * connections on its listener queue.  Note that the listening socket
2492 	 * may or may not have a PCB at this point.
2493 	 */
2494 	if (tcpsock_is_listening(tcp)) {
2495 		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
2496 			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);
2497 
2498 			tcpsock_pcb_abort(queued);
2499 
2500 			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
2501 		}
2502 	}
2503 
2504 	/*
2505 	 * Clear the receive queue, and make sure that we no longer add new
2506 	 * data to it.  The latter is relevant only for the case that we end up
2507 	 * returning SUSPEND below.  Remember whether there were bytes left,
2508 	 * because we should reset the connection if there were.
2509 	 */
2510 	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);
2511 
2512 	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);
2513 
2514 	/*
2515 	 * If the socket is connected, perform a graceful shutdown, unless 1)
2516 	 * we are asked to force-close the socket, or 2) if the local side has
2517 	 * not consumed all data, as per RFC 1122 Sec.4.2.2.13.  Normally lwIP
2518 	 * would take care of the second point, but we may have data in our
2519 	 * receive buffer of which lwIP is not aware.
2520 	 *
2521 	 * Implementing proper linger support is somewhat difficult with lwIP.
2522 	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
2523 	 * the other side in all cases:
2524 	 *
2525 	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not
2526 	 *   trigger any event and once in the TIME_WAIT state, the poll event
2527 	 *   no longer triggers either;
2528 	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
2529 	 *   TIME_WAIT will trigger a receive event, but it is not clear
2530 	 *   whether we can reliably check that our FIN was ACK'ed from there.
2531 	 *
2532 	 * That means we have to compromise.  Instead of the proper approach,
2533 	 * we complete our side of the close operation whenever:
2534 	 *
2535 	 * 1. all of our data was acknowledged, AND,
2536 	 * 2. our FIN was sent, AND,
2537 	 * 3a. our FIN was acknowledged, OR,
2538 	 * 3b. we received a FIN from the other side.
2539 	 *
2540 	 * With the addition of the rule 3b, we do not run into the above
2541 	 * reliability problems, but we may return from SO_LINGER-blocked close
2542 	 * calls too early and thus give callers a false impression of success.
2543 	 * TODO: if lwIP ever gets improved on this point, the code in this
2544 	 * module should be rewritten to make use of the improvements.
2545 	 *
2546 	 * The set of rules is basically the same as for closing the PCB early
2547 	 * as per tcpsock_may_close(), except with the check for our FIN being
2548 	 * acknowledged.  Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
2549 	 * (reentered) CLOSED TCP states guarantee that there are no
2550 	 * unacknowledged data segments anymore, so we may have to wait for
2551 	 * reaching any one of these before we can actually finish closing the
2552 	 * socket with tcp_close().
2553 	 *
2554 	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
2555 	 * so we have to use polling and direct access to lwIP's PCB fields
2556 	 * instead, just like lwIP's BSD API does.  There is no other way.
2557 	 * Also, we may not even be able to send the FIN right away, in which
2558 	 * case we must defer that until later.
2559 	 */
2560 	if (tcp->tcp_pcb != NULL) {
2561 		switch (tcp->tcp_pcb->state) {
2562 		case CLOSE_WAIT:
2563 		case CLOSING:
2564 		case LAST_ACK:
2565 			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
2566 
2567 			/* FALLTHROUGH */
2568 		case SYN_RCVD:
2569 		case ESTABLISHED:
2570 		case FIN_WAIT_1:
2571 			/* First check if we should abort the connection. */
2572 			if (force || rlen > 0)
2573 				break;
2574 
2575 			/*
2576 			 * If we have not sent a FIN yet, try sending it now;
2577 			 * if all other conditions are met for closing the
2578 			 * socket, successful FIN transmission will complete
2579 			 * the close.  Otherwise, perform the close check
2580 			 * explicitly.
2581 			 */
2582 			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
2583 				tcpsock_send_fin(tcp);
2584 			else if (tcpsock_may_close(tcp))
2585 				tcpsock_pcb_close(tcp);
2586 
2587 			/*
2588 			 * If at this point the PCB is gone, we managed to
2589 			 * close the connection immediately, and the socket has
2590 			 * already been cleaned up by now.  This may occur if
2591 			 * there is no unacknowledged data and we already
2592 			 * received a FIN earlier on.
2593 			 */
2594 			if (tcp->tcp_pcb == NULL)
2595 				return OK;
2596 
2597 			/*
2598 			 * Complete the close operation at a later time.
2599 			 * Adjust the polling interval, so that we can detect
2600 			 * completion of the close as quickly as possible.
2601 			 */
2602 			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
2603 			    TCP_POLL_CLOSE_INTERVAL);
2604 
2605 			return SUSPEND;
2606 
2607 		default:
2608 			/*
2609 			 * The connection is either not yet established, or
2610 			 * already in a state where we can close it right now.
2611 			 */
2612 			tcpsock_pcb_close(tcp);
2613 		}
2614 	}
2615 
2616 	/*
2617 	 * Abort the connection if the PCB is still around, and clean up the
2618 	 * socket.  We cannot let tcpsock_cleanup() free the socket object yet,
2619 	 * because we are still in the callback from libsockevent, and the
2620 	 * latter cannot handle the socket object being freed from here.
2621 	 */
2622 	if (tcp->tcp_pcb != NULL)
2623 		tcpsock_pcb_abort(tcp);
2624 
2625 	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);
2626 
2627 	return OK;
2628 }
2629 
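/*
 * Condensed, the close completion rules described above amount to the check
 * sketched below.  This is an illustrative restatement rather than actual
 * code: the helper names are hypothetical (only TCPF_RCVD_FIN is real), and
 * the FIN-acknowledged test requires peeking at lwIP PCB fields, as
 * tcpsock_may_close() does.
 *
 *	may_finish_close =
 *	    tcp->tcp_snd.ts_len == 0 &&			// (1) data ACK'ed
 *	    fin_was_sent(tcp) &&			// (2) FIN sent
 *	    (fin_was_acked(tcp) ||			// (3a) FIN ACK'ed, or
 *	     (tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));	// (3b) FIN received
 */
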
2630 /*
2631  * Free up a closed TCP socket.
2632  */
2633 static void
2634 tcpsock_free(struct sock * sock)
2635 {
2636 	struct tcpsock *tcp = (struct tcpsock *)sock;
2637 
2638 	assert(tcp->tcp_pcb == NULL);
2639 	assert(tcp->tcp_snd.ts_len == 0);
2640 	assert(tcp->tcp_snd.ts_head == NULL);
2641 	assert(tcp->tcp_rcv.tr_len == 0);
2642 	assert(tcp->tcp_rcv.tr_head == NULL);
2643 
2644 	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
2645 }
2646 
2647 /* This table maps TCP states from lwIP numbers to NetBSD numbers. */
2648 static const struct {
2649 	int tsm_tstate;
2650 	int tsm_sostate;
2651 } tcpsock_statemap[] = {
2652 	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
2653 	[LISTEN]	= { TCPS_LISTEN,	0			},
2654 	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
2655 	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
2656 	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
2657 	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
2658 	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
2659 	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
2660 	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
2661 	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
2662 	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
2663 };
2664 
2665 /*
2666  * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
2667  * PCB identified by the given pointer.
2668  */
2669 static void
2670 tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
2671 {
2672 	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
2673 	struct tcpsock *tcp;
2674 
2675 	/*
2676 	 * Not all TCP PCBs have an associated tcpsock structure.  We are
2677 	 * careful to clear the callback argument for PCBs on any of the TCP
2678 	 * lists, so we can use that callback argument to determine whether
2679 	 * there is an associated tcpsock structure, although with one
2680 	 * exception: PCBs for incoming connections that have not yet been
2681 	 * fully established (i.e., in SYN_RCVD state).  These will have the
2682 	 * callback argument of the listening socket (which itself may already
2683 	 * have been deallocated at this point) but should not be considered as
2684 	 * associated with the listening socket's tcpsock structure.
2685 	 */
2686 	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
2687 		tcp = (struct tcpsock *)pcb->callback_arg;
2688 		assert(tcp >= tcp_array &&
2689 		    tcp < &tcp_array[__arraycount(tcp_array)]);
2690 
2691 		/* TODO: change this so that sockstat(1) may work one day. */
2692 		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
2693 	} else {
2694 		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
2695 		tcp = NULL;
2696 
2697 		ki->ki_sostate = SS_NOFDREF;
2698 	}
2699 
2700 	ki->ki_type = SOCK_STREAM;
2701 
2702 	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
2703 		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
2704 		/* TODO: this needs work, but does anything rely on it? */
2705 		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
2706 	}
2707 
2708 	/* Careful with the LISTEN state here (see below). */
2709 	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
2710 	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);
2711 
2712 	/*
2713 	 * The PCBs for listening sockets are actually smaller.  Thus, for
2714 	 * listening sockets, do not attempt to access any of the fields beyond
2715 	 * those provided in the smaller structure.
2716 	 */
2717 	if (pcb->state == LISTEN) {
2718 		assert(tcp != NULL);
2719 		ki->ki_refs =
2720 		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
2721 	} else {
2722 		if (tcp_nagle_disabled(pcb))
2723 			ki->ki_tflags |= NETBSD_TF_NODELAY;
2724 
2725 		if (tcp != NULL) {
2726 			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
2727 			ki->ki_sndq = tcp->tcp_snd.ts_len;
2728 
2729 			if (tcp->tcp_listener != NULL)
2730 				ki->ki_nextref = (uint64_t)(uintptr_t)
2731 				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
2732 		}
2733 	}
2734 }
2735 
2736 /*
2737  * Given either NULL or a previously returned TCP PCB pointer, return the first
2738  * or next TCP PCB pointer, or NULL if there are no more.  The current
2739  * implementation supports only one concurrent iteration at once.
2740  * implementation supports only one iteration at a time.
2741 static const void *
2742 tcpsock_enum(const void * last)
2743 {
2744 	static struct {
2745 		unsigned int i;
2746 		const struct tcp_pcb *pcb;
2747 	} iter;
2748 
2749 	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
2750 		return (const void *)iter.pcb;
2751 
2752 	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
2753 	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
2754 		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
2755 			return (const void *)iter.pcb;
2756 	}
2757 
2758 	return NULL;
2759 }
2760 
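/*
 * A sketch of how this iterator is consumed; this is essentially what
 * util_pcblist() does on our behalf ('ki' handling abbreviated):
 *
 *	const void *ptr;
 *	struct kinfo_pcb ki;
 *
 *	for (ptr = tcpsock_enum(NULL); ptr != NULL; ptr = tcpsock_enum(ptr))
 *		tcpsock_get_info(&ki, ptr);	// fill one sysctl entry
 */
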
2761 /*
2762  * Obtain the list of TCP protocol control blocks, for sysctl(7).
2763  */
2764 static ssize_t
2765 tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
2766 	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
2767 {
2768 
2769 	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
2770 }
2771 
2772 static const struct sockevent_ops tcpsock_ops = {
2773 	.sop_bind		= tcpsock_bind,
2774 	.sop_listen		= tcpsock_listen,
2775 	.sop_connect		= tcpsock_connect,
2776 	.sop_accept		= tcpsock_accept,
2777 	.sop_test_accept	= tcpsock_test_accept,
2778 	.sop_pre_send		= tcpsock_pre_send,
2779 	.sop_send		= tcpsock_send,
2780 	.sop_test_send		= tcpsock_test_send,
2781 	.sop_pre_recv		= tcpsock_pre_recv,
2782 	.sop_recv		= tcpsock_recv,
2783 	.sop_test_recv		= tcpsock_test_recv,
2784 	.sop_ioctl		= ifconf_ioctl,
2785 	.sop_setsockmask	= tcpsock_setsockmask,
2786 	.sop_setsockopt		= tcpsock_setsockopt,
2787 	.sop_getsockopt		= tcpsock_getsockopt,
2788 	.sop_getsockname	= tcpsock_getsockname,
2789 	.sop_getpeername	= tcpsock_getpeername,
2790 	.sop_shutdown		= tcpsock_shutdown,
2791 	.sop_close		= tcpsock_close,
2792 	.sop_free		= tcpsock_free
2793 };
2794