/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which largely but not entirely suits what we want to achieve; as a
 * result, this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both use lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter only
 * once they are no longer in use by lwIP as well.  A bit counterintuitively,
 * we deliberately use a smaller lwIP per-PCB TCP send buffer limit
 * (TCP_SND_BUF) in the lwIP configuration (lwipopts.h) in order to more
 * easily trigger conditions where we cannot enqueue data (or the final FIN)
 * right away.  This way, we get to test the internal logic of this module a
 * lot more easily.  The small lwIP send queue size should not have any impact
 * on performance, as our own per-socket send queues can be much larger and we
 * enqueue more data on the lwIP PCB as soon as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP, but
 * since those may be many buffers with small amounts of data each, we perform
 * fairly aggressive merging of consecutive buffers.  The intended result is
 * that we waste no more than 50% of memory within the receive queue.  Merging
 * requires memory copies, which makes it expensive, but we do not configure
 * lwIP with enough buffers to make running out of buffers a non-issue, so this
 * trade-off is necessary.  Practical experience and measurements of the merge
 * policy will have to show whether and how the current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent or
 * acknowledged.  We can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events.  For these
 * reasons we use a few internal TCPF_ flags to perform partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better alternative.
 * One example of this is the check to see if our FIN was acknowledged, for
 * SO_LINGER support.  In terms of maintenance, our hope is that if lwIP's API
 * changes later, we can change our code to imitate whatever lwIP's BSD API
 * implementation does at that point.
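 *
 * As a rough sketch of how the pieces below fit together, the send path for
 * user data runs through these functions, each defined later in this file:
 *
 *	tcpsock_send()		copy user data into pbufs on the send queue
 *	tcpsock_pcb_enqueue()	hand queued data (and the FIN) to lwIP using
 *				tcp_write()/tcp_shutdown()
 *	tcpsock_pcb_send()	ask lwIP to transmit, using tcp_output()
 *	tcpsock_event_sent()	free send queue buffers once acknowledged
 *
 * The receive path is driven by tcpsock_event_recv(), which appends and
 * merges incoming buffers, and tcpsock_recv(), which copies data out to
 * userland and acknowledges it through tcpsock_ack_recv().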
 */

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor variables.  Make sure we do not attempt to use the NetBSD one
 * where it matters.  We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h"	/* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want.  The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues.  Therefore, we must make sure the receive buffer is at
 * least as large as the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768)   /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072)  /* maximum TCP recv buffer size */
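
/*
 * The invariant above--the receive buffer must never be smaller than the TCP
 * window--could also be expressed as a compile-time check.  A minimal sketch,
 * not currently part of this file, would be:
 *
 *	typedef char tcpsock_rcvbuf_check[(TCP_RCVBUF_MIN >= TCP_WND) ? 1 : -1];
 *
 * With TCP_RCVBUF_MIN defined as TCP_WND right above, such a check holds
 * trivially; it would only catch future changes to these definitions.
 */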

/*
 * The total number of buffers that may be in use for TCP socket send queues.
 * The goal is to allow at least some progress to be made on receiving from TCP
 * sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants.  For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)

/* Polling intervals, in 500-millisecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl)	\
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp)	\
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl)	\
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl)	\
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
	RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
	RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");
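
/*
 * Once registered in tcpsock_init() below, these nodes appear to userland
 * under the net.inet.tcp and net.inet6.tcp6 sysctl trees.  For example,
 * reading the default send buffer size could look like this--a sketch using
 * the standard sysctlbyname(3) interface, not code from this service:
 *
 *	int val;
 *	size_t len = sizeof(val);
 *
 *	if (sysctlbyname("net.inet.tcp.sendspace", &val, &len, NULL, 0) == 0)
 *		printf("default TCP send buffer: %d\n", val);
 *
 * The value reported is TCP_SNDBUF_DEF as defined earlier in this file.
 */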

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'.  The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket.  The socket has not yet been accepted by
 * userland, so add it to the queue of connections pending for the listening
 * socket.  On success, return OK.  On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
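	 * (Most of that inheritance happens through the ipsock_clone() call
	 * right below; the per-socket send and receive queues, on the other
	 * hand, always start out empty.)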
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue of
	 * the listening socket--in this order, because the same next pointer
	 * is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The returned
 * buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free the
 * next buffer(s) in the chain as well.  This may be called for pbufs other
 * than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP will
 * no longer access any of the data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed to
 * be freed, free it now.  The socket is ready to be freed if it was either on
 * a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
434 */ 435 static int 436 tcpsock_cleanup(struct tcpsock * tcp, int may_free) 437 { 438 int destroy; 439 440 assert(tcp->tcp_pcb == NULL); 441 442 /* 443 * Free any data on the send queue. This is safe to do right now, 444 * because the PCB has been aborted (or was already gone). We must be 445 * very careful about clearing the send queue in all other situations. 446 */ 447 tcpsock_clear_send(tcp); 448 449 /* 450 * If this was a socket pending acceptance, remove it from the 451 * corresponding listener socket's queue, and free it. Otherwise, free 452 * the socket only if it suspended a graceful close operation. 453 */ 454 if (tcp->tcp_listener != NULL) { 455 TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp, 456 tcp_queue.tq_next); 457 tcp->tcp_listener = NULL; 458 459 /* 460 * The listener socket's backlog count should be adjusted by 461 * lwIP whenever the PCB is freed up, so we need (and must) not 462 * attempt to do that here. 463 */ 464 465 destroy = TRUE; 466 } else 467 destroy = sockevent_is_closing(tcpsock_get_sock(tcp)); 468 469 /* 470 * Do not free the socket if 'may_free' is FALSE. That flag may be set 471 * if we are currently in the second tcpsock_close() call on the 472 * socket, in which case sockevent_is_closing() is TRUE but we must 473 * still not free the socket now: doing so would derail libsockevent. 474 */ 475 if (destroy && may_free) { 476 (void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/); 477 478 sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); 479 } 480 481 return destroy; 482 } 483 484 /* 485 * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is 486 * connected, this will cause the connection to be reset. The PCB, which must 487 * have still been present before the call, will be gone after the call. 488 */ 489 static void 490 tcpsock_pcb_abort(struct tcpsock * tcp) 491 { 492 493 assert(tcp->tcp_pcb != NULL); 494 assert(!tcpsock_is_listening(tcp)); 495 496 tcp_recv(tcp->tcp_pcb, NULL); 497 tcp_sent(tcp->tcp_pcb, NULL); 498 tcp_err(tcp->tcp_pcb, NULL); 499 tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); 500 501 tcp_arg(tcp->tcp_pcb, NULL); 502 503 tcp_abort(tcp->tcp_pcb); 504 505 tcp->tcp_pcb = NULL; 506 } 507 508 /* 509 * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is 510 * connected, its graceful close will be finished by lwIP in the background. 511 * The PCB, which must have still been present before the call, will be gone 512 * after the call. 513 */ 514 static void 515 tcpsock_pcb_close(struct tcpsock * tcp) 516 { 517 err_t err; 518 519 assert(tcp->tcp_pcb != NULL); 520 assert(tcp->tcp_snd.ts_len == 0); 521 522 if (!tcpsock_is_listening(tcp)) { 523 tcp_recv(tcp->tcp_pcb, NULL); 524 tcp_sent(tcp->tcp_pcb, NULL); 525 tcp_err(tcp->tcp_pcb, NULL); 526 tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); 527 } 528 529 tcp_arg(tcp->tcp_pcb, NULL); 530 531 if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK) 532 panic("unexpected TCP close failure: %d", err); 533 534 tcp->tcp_pcb = NULL; 535 } 536 537 /* 538 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or 539 * FALSE if they are not. Upon calling this function, the socket's PCB must 540 * still be around. 541 */ 542 static int 543 tcpsock_may_close(struct tcpsock * tcp) 544 { 545 546 assert(tcp->tcp_pcb != NULL); 547 548 /* 549 * Regular closing of the PCB requires three conditions to be met: 550 * 551 * 1. 
all our data has been transmitted AND acknowledged, so that we do 552 * not risk corruption in case there are still unsent or unack'ed 553 * data buffers that may otherwise be recycled too soon; 554 * 2. we have sent our FIN to the peer; and, 555 * 3. we have received a FIN from the peer. 556 */ 557 return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) == 558 (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0); 559 } 560 561 /* 562 * The given socket is ready to be closed as per the tcpsock_may_close() rules. 563 * This implies that its send queue is already empty. Gracefully close the 564 * PCB. In addition, if the socket is being closed gracefully, meaning we 565 * suspended an earlier tcpsock_close() call (and as such already emptied the 566 * receive queue as well), then tell libsockevent that the close is finished, 567 * freeing the socket. Return TRUE if the socket has indeed been freed this 568 * way, or FALSE if the socket is still around. 569 */ 570 static int 571 tcpsock_finish_close(struct tcpsock * tcp) 572 { 573 574 assert(tcp->tcp_snd.ts_len == 0); 575 assert(tcp->tcp_listener == NULL); 576 577 /* 578 * If we get here, we have already shut down the sending side of the 579 * PCB. Technically, we are interested only in shutting down the 580 * receiving side of the PCB here, so that lwIP may decide to recycle 581 * the socket later etcetera. We call tcp_close() because we do not 582 * want to rely on tcp_shutdown(RX) doing the exact same thing. 583 * However, we do rely on the fact that the PCB is not immediately 584 * destroyed by the tcp_close() call: otherwise we may have to return 585 * ERR_ABRT if this function is called from a lwIP-generated event. 586 */ 587 tcpsock_pcb_close(tcp); 588 589 /* 590 * If we suspended an earlier tcpsock_close() call, we have to tell 591 * libsockevent that the close operation is now complete. 592 */ 593 if (sockevent_is_closing(tcpsock_get_sock(tcp))) { 594 assert(tcp->tcp_rcv.tr_len == 0); 595 596 sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); 597 598 return TRUE; 599 } else 600 return FALSE; 601 } 602 603 /* 604 * Attempt to start or resume enqueuing data and/or a FIN to send on the given 605 * TCP socket. Return TRUE if anything at all could be newly enqueued on the 606 * lwIP PCB, even if less than desired. In that case, the caller should try to 607 * send whatever was enqueued, and if applicable, check if the socket may now 608 * be closed (due to the FIN being enqueued). In particular, in any situation 609 * where the socket may be in the process of being closed, the caller must use 610 * tcpsock_may_close() if TRUE is returned. Return FALSE if nothing new could 611 * be enqueued, in which case no send attempt need to be made either. 612 */ 613 static int 614 tcpsock_pcb_enqueue(struct tcpsock * tcp) 615 { 616 struct pbuf *punsent; 617 size_t space, chunk; 618 unsigned int flags; 619 err_t err; 620 int enqueued; 621 622 assert(tcp->tcp_pcb != NULL); 623 624 if (tcpsock_get_flags(tcp) & TCPF_FULL) 625 return FALSE; 626 627 /* 628 * Attempt to enqueue more unsent data, if any, on the PCB's send 629 * queue. 630 */ 631 enqueued = FALSE; 632 633 while (tcp->tcp_snd.ts_unsent != NULL) { 634 if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0) 635 break; 636 637 /* 638 * We may maintain a non-NULL unsent pointer even when there is 639 * nothing more to send right now, because the tail buffer may 640 * be filled up further later on. 
641 */ 642 punsent = tcp->tcp_snd.ts_unsent; 643 644 assert(punsent->len >= tcp->tcp_snd.ts_unsent_off); 645 646 chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off; 647 if (chunk == 0) 648 break; 649 650 if (chunk > space) 651 chunk = space; 652 653 /* Try to enqueue more data for sending. */ 654 if (chunk < punsent->len || punsent->next != NULL) 655 flags = TCP_WRITE_FLAG_MORE; 656 else 657 flags = 0; 658 659 err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload + 660 tcp->tcp_snd.ts_unsent_off, chunk, flags); 661 662 /* 663 * Since tcp_write() enqueues data only, it should only return 664 * out-of-memory errors; no fatal ones. In any case, stop. 665 */ 666 if (err != ERR_OK) { 667 assert(err == ERR_MEM); 668 669 break; 670 } 671 672 /* We have successfully enqueued data. */ 673 enqueued = TRUE; 674 675 tcp->tcp_snd.ts_unsent_off += chunk; 676 677 if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) { 678 assert(tcp->tcp_snd.ts_unsent_off < punsent->len || 679 punsent->next == NULL); 680 681 break; 682 } 683 684 tcp->tcp_snd.ts_unsent = punsent->next; 685 tcp->tcp_snd.ts_unsent_off = 0; 686 } 687 688 /* 689 * If all pending data has been enqueued for sending, and we should 690 * shut down the sending end of the socket, try that now. 691 */ 692 if ((tcp->tcp_snd.ts_unsent == NULL || 693 tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) && 694 tcpsock_is_shutdown(tcp, SFL_SHUT_WR) && 695 !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) { 696 err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/); 697 698 if (err == ERR_OK) { 699 /* 700 * We have successfully enqueued a FIN. The caller is 701 * now responsible for checking whether the PCB and 702 * possibly even the socket object can now be freed. 703 */ 704 tcpsock_set_flag(tcp, TCPF_SENT_FIN); 705 706 enqueued = TRUE; 707 } else { 708 assert(err == ERR_MEM); 709 710 /* 711 * FIXME: the resolution for lwIP bug #47485 has taken 712 * away even more control over the closing process from 713 * us, making tracking sockets especially for SO_LINGER 714 * even harder. For now, we simply effectively undo 715 * the patch by clearing TF_CLOSEPEND if tcp_shutdown() 716 * returns ERR_MEM. This will not be sustainable in 717 * the long term, though. 718 */ 719 tcp->tcp_pcb->flags &= ~TF_CLOSEPEND; 720 721 tcpsock_set_flag(tcp, TCPF_FULL); 722 } 723 } 724 725 return enqueued; 726 } 727 728 /* 729 * Request lwIP to start sending any enqueued data and/or FIN on the TCP 730 * socket's lwIP PCB. On success, return OK. On failure, return a negative 731 * error code, after cleaning up the socket, freeing the PCB. If the socket 732 * was already being closed, also free the socket object in that case; the 733 * caller must then not touch the socket object anymore upon return. If the 734 * socket object is not freed, and if 'raise_error' is TRUE, raise the error 735 * on the socket object. 736 */ 737 static int 738 tcpsock_pcb_send(struct tcpsock * tcp, int raise_error) 739 { 740 err_t err; 741 int r; 742 743 assert(tcp->tcp_pcb != NULL); 744 745 /* 746 * If we have enqueued something, ask lwIP to send TCP packets now. 747 * This may result in a fatal error, in which case we clean up the 748 * socket and return the error to the caller. Since cleaning up the 749 * socket may free the socket object, and the caller cannot tell 750 * whether that will happen or has happened, also possibly raise the 751 * error on the socket object if it is not gone. 
As such, callers that 752 * set 'raise_error' to FALSE must know for sure that the socket was 753 * not being closed, for example because the caller is processing a 754 * (send) call from userland. 755 */ 756 err = tcp_output(tcp->tcp_pcb); 757 758 if (err != ERR_OK && err != ERR_MEM) { 759 tcpsock_pcb_abort(tcp); 760 761 r = util_convert_err(err); 762 763 if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { 764 if (raise_error) 765 sockevent_set_error(tcpsock_get_sock(tcp), r); 766 } 767 /* Otherwise, do not touch the socket object anymore! */ 768 769 return r; 770 } else 771 return OK; 772 } 773 774 /* 775 * Callback from lwIP. The given number of data bytes have been acknowledged 776 * as received by the remote end. Dequeue and free data from the TCP socket's 777 * send queue as appropriate. 778 */ 779 static err_t 780 tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len) 781 { 782 struct tcpsock *tcp = (struct tcpsock *)arg; 783 struct pbuf *phead; 784 size_t left; 785 786 assert(tcp != NULL); 787 assert(pcb == tcp->tcp_pcb); 788 assert(len > 0); 789 790 assert(tcp->tcp_snd.ts_len >= len); 791 assert(tcp->tcp_snd.ts_head != NULL); 792 793 left = len; 794 795 /* 796 * First see if we can free up whole buffers. Check against the head 797 * buffer's 'len' rather than 'tot_len', or we may end up leaving an 798 * empty buffer on the chain. 799 */ 800 while ((phead = tcp->tcp_snd.ts_head) != NULL && 801 left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) { 802 left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off; 803 804 tcp->tcp_snd.ts_head = phead->next; 805 tcp->tcp_snd.ts_head_off = 0; 806 807 if (phead == tcp->tcp_snd.ts_unsent) { 808 assert(tcp->tcp_snd.ts_unsent_off == phead->len); 809 810 tcp->tcp_snd.ts_unsent = phead->next; 811 tcp->tcp_snd.ts_unsent_off = 0; 812 } 813 814 assert(tcpsock_sendbufs > 0); 815 tcpsock_sendbufs--; 816 817 tcpsock_free_buf(phead); 818 } 819 820 /* 821 * The rest of the given length is for less than the current head 822 * buffer. 823 */ 824 if (left > 0) { 825 assert(tcp->tcp_snd.ts_head != NULL); 826 assert((size_t)tcp->tcp_snd.ts_head->len - 827 tcp->tcp_snd.ts_head_off > left); 828 829 tcp->tcp_snd.ts_head_off += left; 830 } 831 832 tcp->tcp_snd.ts_len -= (size_t)len; 833 834 if (tcp->tcp_snd.ts_head == NULL) { 835 assert(tcp->tcp_snd.ts_len == 0); 836 assert(tcp->tcp_snd.ts_unsent == NULL); 837 tcp->tcp_snd.ts_tail = NULL; 838 } else 839 assert(tcp->tcp_snd.ts_len > 0); 840 841 /* 842 * If we emptied the send queue, and we already managed to send a FIN 843 * earlier, we may now have met all requirements to close the socket's 844 * PCB. Otherwise, we may also be able to send more now, so try to 845 * resume sending. Since we are invoked from the "sent" event, 846 * tcp_output() will not actually process anything, and so we do not 847 * call it either. If we did, we would have to deal with errors here. 848 */ 849 if (tcpsock_may_close(tcp)) { 850 if (tcpsock_finish_close(tcp)) 851 return ERR_OK; 852 } else { 853 tcpsock_clear_flag(tcp, TCPF_FULL); 854 855 /* 856 * If we now manage to enqueue a FIN, we may be ready to close 857 * the PCB after all. 858 */ 859 if (tcpsock_pcb_enqueue(tcp)) { 860 if (tcpsock_may_close(tcp) && 861 tcpsock_finish_close(tcp)) 862 return ERR_OK; 863 } 864 } 865 866 /* The user may also be able to send more now. 
*/ 867 sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); 868 869 return ERR_OK; 870 } 871 872 /* 873 * Check whether any (additional) data previously received on a TCP socket 874 * should be acknowledged, possibly allowing the remote end to send additional 875 * data as a result. 876 */ 877 static void 878 tcpsock_ack_recv(struct tcpsock * tcp) 879 { 880 size_t rcvbuf, left, delta, ack; 881 882 assert(tcp->tcp_pcb != NULL); 883 884 /* 885 * We must make sure that at all times, we can still add an entire 886 * window's worth of data to the receive queue. If the amount of free 887 * space drops below that threshold, we stop acknowledging received 888 * data. The user may change the receive buffer size at all times; we 889 * update the window size lazily as appropriate. 890 */ 891 rcvbuf = tcpsock_get_rcvbuf(tcp); 892 893 if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) { 894 /* 895 * The number of bytes that lwIP can still give us at any time 896 * is represented as 'left'. The number of bytes that we still 897 * allow to be stored in the receive queue is represented as 898 * 'delta'. We must make sure that 'left' does not ever exceed 899 * 'delta' while acknowledging as many bytes as possible under 900 * that rule. 901 */ 902 left = TCP_WND - tcp->tcp_rcv.tr_unacked; 903 delta = rcvbuf - tcp->tcp_rcv.tr_len; 904 905 if (left < delta) { 906 ack = delta - left; 907 908 if (ack > tcp->tcp_rcv.tr_unacked) 909 ack = tcp->tcp_rcv.tr_unacked; 910 911 tcp_recved(tcp->tcp_pcb, ack); 912 913 tcp->tcp_rcv.tr_unacked -= ack; 914 915 assert(tcp->tcp_rcv.tr_len + TCP_WND - 916 tcp->tcp_rcv.tr_unacked <= rcvbuf); 917 } 918 } 919 } 920 921 /* 922 * Attempt to merge two consecutive underfilled buffers in the receive queue of 923 * a TCP socket, freeing up one of the two buffers as a result. The first 924 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at 925 * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to 926 * the first buffer. The second buffer may be followed by additional buffers 927 * with even more new data. Return TRUE if buffers have been merged, in which 928 * case the pointer at 'pnext' may have changed, and no assumptions should be 929 * made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE 930 * if no merging was necessary or if no new buffer could be allocated. 931 */ 932 static int 933 tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf) 934 { 935 struct pbuf *pnew; 936 937 assert(*pnext == ptail); 938 assert(ptail->next == pbuf); 939 940 /* 941 * Unfortunately, we cannot figure out what kind of pbuf we were given 942 * by the lower layers, so we cannot merge two buffers without first 943 * allocating a third. Once we have done that, though, we can easily 944 * merge more into that new buffer. For now we use the following 945 * policies: 946 * 947 * 1. if two consecutive lwIP-provided buffers are both used less than 948 * half the size of a full buffer, try to allocate a new buffer and 949 * copy both lwIP-provided buffers into that new buffer, freeing up 950 * the pair afterwards; 951 * 2. if the tail buffer on the chain is allocated by us and not yet 952 * full, and the next buffer's contents can be added to the tail 953 * buffer in their entirety, do just that. 
954 * 955 * Obviously there is a trade-off between the performance overhead of 956 * copying and the resource overhead of keeping less-than-full buffers 957 * on the receive queue, but this policy should both keep actual memory 958 * usage to no more than twice the receive queue length and prevent 959 * excessive copying. The policy deliberately performs more aggressive 960 * merging into a buffer that we allocated ourselves. 961 */ 962 if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 && 963 pbuf->len <= MEMPOOL_BUFSIZE / 2) { 964 /* 965 * Case #1. 966 */ 967 assert(ptail->tot_len == ptail->len); 968 assert(pbuf->tot_len == pbuf->len); 969 970 pnew = tcpsock_alloc_buf(); 971 if (pnew == NULL) 972 return FALSE; 973 974 memcpy(pnew->payload, ptail->payload, ptail->len); 975 memcpy((char *)pnew->payload + ptail->len, pbuf->payload, 976 pbuf->len); 977 pnew->len = ptail->len + pbuf->len; 978 assert(pnew->len <= pnew->tot_len); 979 980 pnew->next = pbuf->next; 981 /* For now, we need not inherit any flags from either pbuf. */ 982 983 *pnext = pnew; 984 985 /* One allocated, two about to be deallocated. */ 986 assert(tcpsock_recvbufs > 0); 987 tcpsock_recvbufs--; 988 989 tcpsock_free_buf(ptail); 990 tcpsock_free_buf(pbuf); 991 992 return TRUE; 993 } else if (ptail->tot_len - ptail->len >= pbuf->len) { 994 /* 995 * Case #2. 996 */ 997 memcpy((char *)ptail->payload + ptail->len, pbuf->payload, 998 pbuf->len); 999 1000 ptail->len += pbuf->len; 1001 1002 ptail->next = pbuf->next; 1003 1004 assert(tcpsock_recvbufs > 0); 1005 tcpsock_recvbufs--; 1006 1007 tcpsock_free_buf(pbuf); 1008 1009 return TRUE; 1010 } else 1011 return FALSE; 1012 } 1013 1014 /* 1015 * Callback from lwIP. New data or flags have been received on a TCP socket. 1016 */ 1017 static err_t 1018 tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused, 1019 struct pbuf * pbuf, err_t err) 1020 { 1021 struct tcpsock *tcp = (struct tcpsock *)arg; 1022 struct pbuf *ptail, **pprevp; 1023 size_t len; 1024 1025 assert(tcp != NULL); 1026 assert(pcb == tcp->tcp_pcb); 1027 1028 /* 1029 * lwIP should never provide anything other than ERR_OK in 'err', and 1030 * it is not clear what we should do if it would. If lwIP ever changes 1031 * in this regard, we will likely have to change this code accordingly. 1032 */ 1033 if (err != ERR_OK) 1034 panic("TCP receive event with error: %d", err); 1035 1036 /* If the given buffer is NULL, we have received a FIN. */ 1037 if (pbuf == NULL) { 1038 tcpsock_set_flag(tcp, TCPF_RCVD_FIN); 1039 1040 /* Userland may now receive EOF. */ 1041 if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) 1042 sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); 1043 1044 /* 1045 * If we were in the process of closing the socket, and we 1046 * receive a FIN before our FIN got acknowledged, we close the 1047 * socket anyway, as described in tcpsock_close(). However, if 1048 * there is still unacknowledged outgoing data or we did not 1049 * even manage to send our FIN yet, hold off closing the socket 1050 * for now. 1051 */ 1052 if (tcpsock_may_close(tcp)) 1053 (void)tcpsock_finish_close(tcp); 1054 1055 return ERR_OK; 1056 } 1057 1058 /* 1059 * If the socket is being closed, receiving new data should cause a 1060 * reset. 1061 */ 1062 if (sockevent_is_closing(tcpsock_get_sock(tcp))) { 1063 tcpsock_pcb_abort(tcp); 1064 1065 (void)tcpsock_cleanup(tcp, TRUE /*may_free*/); 1066 /* Do not touch the socket object anymore! 
		 */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could track
	 * the amount of data up to and including the last-pushed segment using
	 * a "tr_push_len" field or so.  Deciding when to deliver "un-pushed"
	 * data after all is a bit trickier though.  As far as I can tell, the
	 * BSDs do not implement anything like that.  Windows does, and this
	 * results in interaction problems with even more lightweight TCP/IP
	 * stacks that do not send the TCP PSH flag.  Currently, there is no
	 * obvious benefit for us to support delaying data delivery like that.
	 * In addition, testing its implementation reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new total
	 * of buffers owned by us must not exceed the size of the memory pool.
	 * Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also get
	 * a pointer to the pointer to the new penultimate tail buffer.  Due to
	 * merging, the chain may already be empty by now, though.
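	 *
	 * As an illustration of that overloading: for a buffer that we
	 * allocated ourselves in tcpsock_try_merge(), 'tot_len' stays at the
	 * full pool buffer size while 'len' reflects the bytes actually
	 * stored, leaving room for later merges; for an lwIP-provided buffer
	 * we set 'tot_len' equal to 'len', meaning it has no free space.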
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}

/*
 * Callback from lwIP.  The PCB corresponding to the socket identified by 'arg'
 * has been closed by lwIP, with the reason specified in 'err': either the
 * connection has been aborted locally (ERR_ABRT), it has been reset by the
 * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket.  As a result it may be freed, in which case we
	 * must not touch it anymore.  No need to return ERR_ABRT from here, as
	 * the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus putting
		 * the socket in LAST_ACK state, and we receive that last
		 * acknowledgment.  There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT or ERR_RST.  Convert it to a
		 * regular error and set it on the socket.  Doing so will also
		 * raise the appropriate events.
		 */
		/*
		 * Unfortunately, lwIP does not throw accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT: r = ETIMEDOUT;	 break;
			case ERR_RST:  r = ECONNREFUSED; break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called once every five seconds on connected sockets, and twice per second
 * on closing sockets.
1255 */ 1256 static err_t 1257 tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused) 1258 { 1259 struct tcpsock *tcp = (struct tcpsock *)arg; 1260 err_t err; 1261 int r; 1262 1263 assert(tcp != NULL); 1264 assert(pcb == tcp->tcp_pcb); 1265 1266 /* 1267 * If we ended up running out of buffers earlier, try resuming any send 1268 * requests now, both for enqueuing TCP data with lwIP and for user 1269 * requests. 1270 */ 1271 if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) { 1272 tcpsock_clear_flag(tcp, TCPF_FULL); 1273 tcpsock_clear_flag(tcp, TCPF_OOM); 1274 1275 /* See if we can enqueue more data with lwIP. */ 1276 if (tcpsock_pcb_enqueue(tcp)) { 1277 /* In some cases, we can now close the PCB. */ 1278 if (tcpsock_may_close(tcp)) { 1279 (void)tcpsock_finish_close(tcp); 1280 /* 1281 * The PCB is definitely gone here, and the 1282 * entire socket object may be gone now too. 1283 * Do not touch either anymore! 1284 */ 1285 1286 return ERR_OK; 1287 } 1288 1289 /* 1290 * If actually sending the data fails, the PCB will be 1291 * gone, and the socket object may be gone as well. Do 1292 * not touch either anymore in that case! 1293 */ 1294 if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK) 1295 return ERR_ABRT; 1296 } 1297 1298 /* 1299 * If we ran out of buffers earlier, it may be possible to take 1300 * in more data from a user process now, even if we did not 1301 * manage to enqueue any more pending data with lwIP. 1302 */ 1303 sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); 1304 1305 assert(tcp->tcp_pcb != NULL); 1306 } else if (tcp->tcp_snd.ts_unsent != NULL && 1307 tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) { 1308 /* 1309 * If the send buffer is full, we will no longer call 1310 * tcp_output(), which means we may also miss out on fatal 1311 * errors that would otherwise kill the connection (e.g., no 1312 * route). As a result, the connection may erroneously 1313 * continue to exist for a long time. To avoid this, we call 1314 * tcp_output() every once in a while when there are still 1315 * unsent data. 1316 */ 1317 err = tcp_output(tcp->tcp_pcb); 1318 1319 if (err != ERR_OK && err != ERR_MEM) { 1320 tcpsock_pcb_abort(tcp); 1321 1322 if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { 1323 r = util_convert_err(err); 1324 1325 sockevent_set_error(tcpsock_get_sock(tcp), r); 1326 } 1327 /* Otherwise do not touch the socket object anymore! */ 1328 1329 return ERR_ABRT; 1330 } 1331 } 1332 1333 /* 1334 * If we are closing the socket, and we sent a FIN, see if the FIN got 1335 * acknowledged. If so, finish closing the socket. Unfortunately, we 1336 * can perform this check by polling only. TODO: change lwIP.. 1337 */ 1338 if (sockevent_is_closing(tcpsock_get_sock(tcp)) && 1339 (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) && 1340 tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) { 1341 assert(tcp->tcp_snd.ts_len == 0); 1342 1343 tcpsock_finish_close(tcp); 1344 } 1345 1346 return ERR_OK; 1347 } 1348 1349 /* 1350 * Bind a TCP socket to a local address. 
1351 */ 1352 static int 1353 tcpsock_bind(struct sock * sock, const struct sockaddr * addr, 1354 socklen_t addr_len, endpoint_t user_endpt) 1355 { 1356 struct tcpsock *tcp = (struct tcpsock *)sock; 1357 ip_addr_t ipaddr; 1358 uint16_t port; 1359 err_t err; 1360 int r; 1361 1362 if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED) 1363 return EINVAL; 1364 1365 if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len, 1366 user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port, 1367 FALSE /*allow_mcast*/, &ipaddr, &port)) != OK) 1368 return r; 1369 1370 err = tcp_bind(tcp->tcp_pcb, &ipaddr, port); 1371 1372 return util_convert_err(err); 1373 } 1374 1375 /* 1376 * Callback from lwIP. A new connection 'pcb' has arrived on the listening 1377 * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that 1378 * lwIP could not accept the connection itself. 1379 */ 1380 static err_t 1381 tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err) 1382 { 1383 struct tcpsock *tcp = (struct tcpsock *)arg; 1384 1385 assert(tcp != NULL); 1386 assert(tcpsock_is_listening(tcp)); 1387 1388 /* 1389 * If the given PCB is NULL, then lwIP ran out of memory allocating a 1390 * PCB for the new connection. There is nothing we can do with that 1391 * information. Also check 'err' just to make sure. 1392 */ 1393 if (pcb == NULL || err != OK) 1394 return ERR_OK; 1395 1396 /* 1397 * The TCP socket is the listening socket, but the PCB is for the 1398 * incoming connection. 1399 */ 1400 if (tcpsock_clone(tcp, pcb) != OK) { 1401 /* 1402 * We could not allocate the resources necessary to accept the 1403 * connection. Abort it immediately. 1404 */ 1405 tcp_abort(pcb); 1406 1407 return ERR_ABRT; 1408 } 1409 1410 /* 1411 * The connection has not yet been accepted, and thus should still be 1412 * considered on the listen queue. 1413 */ 1414 tcp_backlog_delayed(pcb); 1415 1416 /* Set the callback functions. */ 1417 tcp_recv(pcb, tcpsock_event_recv); 1418 tcp_sent(pcb, tcpsock_event_sent); 1419 tcp_err(pcb, tcpsock_event_err); 1420 tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); 1421 1422 sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT); 1423 1424 return ERR_OK; 1425 } 1426 1427 /* 1428 * Put a TCP socket in listening mode. 1429 */ 1430 static int 1431 tcpsock_listen(struct sock * sock, int backlog) 1432 { 1433 struct tcpsock *tcp = (struct tcpsock *)sock; 1434 struct tcp_pcb *pcb; 1435 err_t err; 1436 1437 /* The maximum backlog value must not exceed its field size. */ 1438 assert(SOMAXCONN <= UINT8_MAX); 1439 1440 /* 1441 * Allow only CLOSED sockets to enter listening mode. If the socket 1442 * was already in listening mode, allow its backlog value to be 1443 * updated, even if it was shut down already (making this a no-op). 1444 */ 1445 if (!tcpsock_is_listening(tcp) && 1446 (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)) 1447 return EINVAL; 1448 1449 /* 1450 * If the socket was not already in listening mode, put it in that mode 1451 * now. That involves switching PCBs as lwIP attempts to save memory 1452 * by replacing the original PCB with a smaller one. If the socket was 1453 * already in listening mode, simply update its backlog value--this has 1454 * no effect on the sockets already in the backlog. 1455 */ 1456 if (!tcpsock_is_listening(tcp)) { 1457 assert(tcp->tcp_pcb != NULL); 1458 1459 /* 1460 * If the socket has not been bound to a port yet, do that 1461 * first. 
This does mean that the listen call may fail with 1462 * side effects, but that is acceptable in this case. 1463 */ 1464 if (tcp->tcp_pcb->local_port == 0) { 1465 err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip, 1466 0 /*port*/); 1467 1468 if (err != ERR_OK) 1469 return util_convert_err(err); 1470 } 1471 1472 /* 1473 * Clear the argument on the PCB that is about to be replaced, 1474 * because if we do not, once the PCB is reused (which does not 1475 * clear the argument), we might get weird events. Do this 1476 * before the tcp_listen() call, because we should no longer 1477 * access the old PCB afterwards (even if we can). 1478 */ 1479 tcp_arg(tcp->tcp_pcb, NULL); 1480 1481 pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog, 1482 &err); 1483 1484 if (pcb == NULL) { 1485 tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */ 1486 1487 return util_convert_err(err); 1488 } 1489 1490 tcp_arg(pcb, tcp); 1491 tcp->tcp_pcb = pcb; 1492 1493 tcp_accept(pcb, tcpsock_event_accept); 1494 1495 /* Initialize the queue head for sockets pending acceptance. */ 1496 TAILQ_INIT(&tcp->tcp_queue.tq_head); 1497 } else if (tcp->tcp_pcb != NULL) 1498 tcp_backlog_set(tcp->tcp_pcb, backlog); 1499 1500 return OK; 1501 } 1502 1503 /* 1504 * Callback from lwIP. A socket connection attempt has succeeded. Note that 1505 * failed socket events will trigger the tcpsock_event_err() callback instead. 1506 */ 1507 static err_t 1508 tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err) 1509 { 1510 struct tcpsock *tcp = (struct tcpsock *)arg; 1511 1512 assert(tcp != NULL); 1513 assert(pcb == tcp->tcp_pcb); 1514 assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING); 1515 1516 /* 1517 * If lwIP ever changes so that this callback is called for connect 1518 * failures as well, then we need to change the code here accordingly. 1519 */ 1520 if (err != ERR_OK) 1521 panic("TCP connected event with error: %d", err); 1522 1523 tcpsock_clear_flag(tcp, TCPF_CONNECTING); 1524 1525 sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND); 1526 1527 return ERR_OK; 1528 } 1529 1530 /* 1531 * Connect a TCP socket to a remote address. 1532 */ 1533 static int 1534 tcpsock_connect(struct sock * sock, const struct sockaddr * addr, 1535 socklen_t addr_len, endpoint_t user_endpt) 1536 { 1537 struct tcpsock *tcp = (struct tcpsock *)sock; 1538 ip_addr_t dst_addr; 1539 uint16_t dst_port; 1540 err_t err; 1541 int r; 1542 1543 /* 1544 * Listening sockets may not have a PCB, so we use higher-level flags 1545 * to throw the correct error code for those instead. 1546 */ 1547 if (tcpsock_is_listening(tcp)) 1548 return EOPNOTSUPP; 1549 1550 /* 1551 * If there is no longer any PCB, we obviously cannot perform the 1552 * connection, but POSIX is not clear on which error to return. We 1553 * copy NetBSD's. 1554 */ 1555 if (tcp->tcp_pcb == NULL) 1556 return EINVAL; 1557 1558 /* 1559 * The only state from which a connection can be initiated, is CLOSED. 1560 * Some of the other states require distinct error codes, though. 1561 */ 1562 switch (tcp->tcp_pcb->state) { 1563 case CLOSED: 1564 break; 1565 case SYN_SENT: 1566 return EALREADY; 1567 case LISTEN: 1568 assert(0); /* we just checked.. */ 1569 default: 1570 return EISCONN; 1571 } 1572 1573 /* 1574 * Get the destination address, and attempt to start connecting. If 1575 * the socket was not bound before, or it was bound to a port only, 1576 * then lwIP will select a source address for us. 
We cannot do this 1577 * ourselves even if we wanted to: it is impossible to re-bind a TCP 1578 * PCB in the case it was previously bound to a port only. 1579 */ 1580 if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len, 1581 &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK) 1582 return r; 1583 1584 err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port, 1585 tcpsock_event_connected); 1586 1587 /* 1588 * Note that various tcp_connect() error cases will leave the PCB with 1589 * a newly set local and remote IP address anyway. We should be 1590 * careful not to rely on the addresses being as they were before. 1591 */ 1592 if (err != ERR_OK) 1593 return util_convert_err(err); 1594 1595 /* Set the other callback functions. */ 1596 tcp_recv(tcp->tcp_pcb, tcpsock_event_recv); 1597 tcp_sent(tcp->tcp_pcb, tcpsock_event_sent); 1598 tcp_err(tcp->tcp_pcb, tcpsock_event_err); 1599 tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); 1600 1601 /* 1602 * Set a flag so that we can correct lwIP's error codes in case the 1603 * connection fails. 1604 */ 1605 tcpsock_set_flag(tcp, TCPF_CONNECTING); 1606 1607 return SUSPEND; 1608 } 1609 1610 /* 1611 * Test whether any new connections are pending on a listening TCP socket. 1612 */ 1613 static int 1614 tcpsock_test_accept(struct sock * sock) 1615 { 1616 struct tcpsock *tcp = (struct tcpsock *)sock; 1617 1618 /* Is this socket in listening mode at all? */ 1619 if (!tcpsock_is_listening(tcp)) 1620 return EINVAL; 1621 1622 /* Are there any connections to accept right now? */ 1623 if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) 1624 return OK; 1625 1626 /* If the socket has been shut down, we return ECONNABORTED. */ 1627 if (tcp->tcp_pcb == NULL) 1628 return ECONNABORTED; 1629 1630 /* Otherwise, wait for a new connection first. */ 1631 return SUSPEND; 1632 } 1633 1634 /* 1635 * Accept a connection on a listening TCP socket, creating a new TCP socket. 1636 */ 1637 static sockid_t 1638 tcpsock_accept(struct sock * sock, struct sockaddr * addr, 1639 socklen_t * addr_len, endpoint_t user_endpt __unused, 1640 struct sock ** newsockp) 1641 { 1642 struct tcpsock *listener = (struct tcpsock *)sock; 1643 struct tcpsock *tcp; 1644 int r; 1645 1646 if ((r = tcpsock_test_accept(sock)) != OK) 1647 return r; 1648 /* Below, we must not assume that the listener has a PCB. */ 1649 1650 tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head); 1651 assert(tcp->tcp_listener == listener); 1652 assert(tcp->tcp_pcb != NULL); 1653 1654 TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next); 1655 tcp->tcp_listener = NULL; 1656 1657 tcp_backlog_accepted(tcp->tcp_pcb); 1658 1659 ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, 1660 &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); 1661 1662 /* 1663 * Set 'newsockp' to NULL so that libsockevent knows we already cloned 1664 * the socket, and it must not be reinitialized anymore. 1665 */ 1666 *newsockp = NULL; 1667 return tcpsock_get_id(tcp); 1668 } 1669 1670 /* 1671 * Perform preliminary checks on a send request. 1672 */ 1673 static int 1674 tcpsock_pre_send(struct sock * sock, size_t len __unused, 1675 socklen_t ctl_len __unused, const struct sockaddr * addr __unused, 1676 socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) 1677 { 1678 1679 /* 1680 * Reject calls with unknown flags. Since libsockevent strips out the 1681 * flags it handles itself here, we only have to test for ones we can 1682 * not handle. Currently, there are no send flags that we support. 
1683 */ 1684 if (flags != 0) 1685 return EOPNOTSUPP; 1686 1687 return OK; 1688 } 1689 1690 /* 1691 * Test whether the given number of data bytes can be sent on a TCP socket. 1692 */ 1693 static int 1694 tcpsock_test_send(struct sock * sock, size_t min) 1695 { 1696 struct tcpsock *tcp = (struct tcpsock *)sock; 1697 size_t sndbuf; 1698 1699 if (tcp->tcp_pcb == NULL) 1700 return EPIPE; 1701 1702 switch (tcp->tcp_pcb->state) { 1703 case CLOSED: /* new */ 1704 case LISTEN: /* listening */ 1705 return ENOTCONN; 1706 case SYN_SENT: /* connecting */ 1707 case SYN_RCVD: /* simultaneous open, maybe someday? */ 1708 return SUSPEND; 1709 case ESTABLISHED: /* connected */ 1710 case CLOSE_WAIT: /* closed remotely */ 1711 break; 1712 default: /* shut down locally */ 1713 assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); 1714 return EPIPE; 1715 } 1716 1717 sndbuf = tcpsock_get_sndbuf(tcp); 1718 if (min > sndbuf) 1719 min = sndbuf; 1720 1721 if (tcp->tcp_snd.ts_len + min > sndbuf) 1722 return SUSPEND; 1723 else 1724 return OK; 1725 } 1726 1727 /* 1728 * Send data on a TCP socket. 1729 */ 1730 static int 1731 tcpsock_send(struct sock * sock, const struct sockdriver_data * data, 1732 size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, 1733 socklen_t ctl_len __unused, socklen_t * ctl_off __unused, 1734 const struct sockaddr * addr __unused, socklen_t addr_len __unused, 1735 endpoint_t user_endpt __unused, int flags __unused, size_t min) 1736 { 1737 struct tcpsock *tcp = (struct tcpsock *)sock; 1738 struct pbuf *ptail, *pfirst, *pnext, *plast; 1739 size_t off, tail_off, chunk, left, sndbuf; 1740 int r; 1741 1742 if ((r = tcpsock_test_send(sock, min)) != OK) 1743 return r; 1744 1745 if (len == 0) 1746 return OK; /* nothing to do */ 1747 1748 sndbuf = tcpsock_get_sndbuf(tcp); 1749 if (min > sndbuf) 1750 min = sndbuf; 1751 assert(min > 0); 1752 1753 assert(sndbuf > tcp->tcp_snd.ts_len); 1754 left = sndbuf - tcp->tcp_snd.ts_len; 1755 if (left > len) 1756 left = len; 1757 1758 /* 1759 * First see if we can fit any more data in the current tail buffer. 1760 * If so, we set 'ptail' to point to it and 'tail_off' to the previous 1761 * length of the tail buffer, while optimistically extending it to 1762 * include the new data. If not, we set them to NULL/0. 1763 */ 1764 if ((ptail = tcp->tcp_snd.ts_tail) != NULL && 1765 ptail->len < ptail->tot_len) { 1766 assert(ptail->len > 0); 1767 tail_off = (size_t)ptail->len; 1768 1769 /* 1770 * Optimistically extend the head buffer to include whatever 1771 * fits in it. This is needed for util_copy_data(). 1772 */ 1773 assert(ptail->tot_len > ptail->len); 1774 off = (size_t)ptail->tot_len - (size_t)ptail->len; 1775 if (off > left) 1776 off = left; 1777 ptail->len += off; 1778 } else { 1779 ptail = NULL; 1780 tail_off = 0; 1781 off = 0; 1782 } 1783 1784 /* 1785 * Then, if there is more to send, allocate new buffers as needed. If 1786 * we run out of memory, work with whatever we did manage to grab. 1787 */ 1788 pfirst = NULL; 1789 plast = NULL; 1790 while (off < left) { 1791 if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS || 1792 (pnext = tcpsock_alloc_buf()) == NULL) { 1793 /* 1794 * Chances are that we will end up suspending this send 1795 * request because of being out of buffers. We try to 1796 * resume such requests from the polling function. 
1797 */ 1798 tcpsock_set_flag(tcp, TCPF_OOM); 1799 1800 break; 1801 } 1802 1803 tcpsock_sendbufs++; 1804 1805 if (pfirst == NULL) 1806 pfirst = pnext; 1807 else 1808 plast->next = pnext; 1809 plast = pnext; 1810 1811 chunk = (size_t)pnext->tot_len; 1812 if (chunk > left - off) 1813 chunk = left - off; 1814 pnext->len = chunk; 1815 off += chunk; 1816 } 1817 1818 /* 1819 * Copy in the data and continue, unless we did not manage to find 1820 * enough space to even meet the low send watermark, in which case we 1821 * undo any allocation and suspend the call until later. 1822 */ 1823 if (off >= min) { 1824 /* 1825 * Optimistically attach the new buffers to the tail, also for 1826 * util_copy_data(). We undo all this if the copy fails. 1827 */ 1828 if (ptail != NULL) { 1829 ptail->next = pfirst; 1830 1831 pnext = ptail; 1832 } else 1833 pnext = pfirst; 1834 1835 assert(pnext != NULL); 1836 1837 r = util_copy_data(data, off, *offp, pnext, tail_off, 1838 TRUE /*copy_in*/); 1839 } else 1840 r = SUSPEND; 1841 1842 if (r != OK) { 1843 /* Undo the modifications made so far. */ 1844 while (pfirst != NULL) { 1845 pnext = pfirst->next; 1846 1847 assert(tcpsock_sendbufs > 0); 1848 tcpsock_sendbufs--; 1849 1850 tcpsock_free_buf(pfirst); 1851 1852 pfirst = pnext; 1853 } 1854 1855 if (ptail != NULL) { 1856 ptail->next = NULL; 1857 1858 ptail->len = tail_off; 1859 } 1860 1861 return r; 1862 } 1863 1864 /* Attach the new buffers, if any, to the buffer tail. */ 1865 if (pfirst != NULL) { 1866 if ((ptail = tcp->tcp_snd.ts_tail) != NULL) { 1867 assert(ptail->len == ptail->tot_len); 1868 1869 /* 1870 * Due to our earlier optimistic modifications, this 1871 * may or may not be redundant. 1872 */ 1873 ptail->next = pfirst; 1874 } 1875 1876 assert(plast != NULL); 1877 tcp->tcp_snd.ts_tail = plast; 1878 1879 if (tcp->tcp_snd.ts_head == NULL) { 1880 tcp->tcp_snd.ts_head = pfirst; 1881 assert(tcp->tcp_snd.ts_head_off == 0); 1882 } 1883 if (tcp->tcp_snd.ts_unsent == NULL) { 1884 tcp->tcp_snd.ts_unsent = pfirst; 1885 assert(tcp->tcp_snd.ts_unsent_off == 0); 1886 } 1887 } 1888 1889 tcp->tcp_snd.ts_len += off; 1890 1891 /* 1892 * See if we can send any of the data we just enqueued. The socket is 1893 * still open as we are still processing a call from userland on it; 1894 * this saves us from having to deal with the cases that the following 1895 * calls end up freeing the socket object. 1896 */ 1897 if (tcpsock_pcb_enqueue(tcp) && 1898 (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) { 1899 /* 1900 * That did not go well. Return the error immediately if we 1901 * had not made any progress earlier. Otherwise, return our 1902 * partial progress and leave the error to be picked up later. 1903 */ 1904 if (*offp > 0) { 1905 sockevent_set_error(tcpsock_get_sock(tcp), r); 1906 1907 return OK; 1908 } else 1909 return r; 1910 } 1911 1912 *offp += off; 1913 return (off < len) ? SUSPEND : OK; 1914 } 1915 1916 /* 1917 * Perform preliminary checks on a receive request. 1918 */ 1919 static int 1920 tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, 1921 int flags) 1922 { 1923 1924 /* 1925 * Reject calls with unknown flags. Since libsockevent strips out the 1926 * flags it handles itself here, we only have to test for ones we can 1927 * not handle. 
1928 */ 1929 if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) 1930 return EOPNOTSUPP; 1931 1932 return OK; 1933 } 1934 1935 /* 1936 * Return TRUE if receive calls may wait for more data to come in on the 1937 * connection, or FALSE if we already know that that is not going to happen. 1938 */ 1939 static int 1940 tcpsock_may_wait(struct tcpsock * tcp) 1941 { 1942 1943 return (tcp->tcp_pcb != NULL && 1944 !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN)); 1945 } 1946 1947 /* 1948 * Test whether data can be received on a TCP socket, and if so, how many bytes 1949 * of data. 1950 */ 1951 static int 1952 tcpsock_test_recv(struct sock * sock, size_t min, size_t * size) 1953 { 1954 struct tcpsock *tcp = (struct tcpsock *)sock; 1955 int may_wait; 1956 1957 /* If there is and never was a connection, refuse the call at all. */ 1958 if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED || 1959 tcp->tcp_pcb->state == LISTEN)) 1960 return ENOTCONN; 1961 1962 /* 1963 * If we are certain that no more data will come in later, ignore the 1964 * low receive watermark. Otherwise, bound it to the size of the 1965 * receive buffer, or receive calls may block forever. 1966 */ 1967 if (!(may_wait = tcpsock_may_wait(tcp))) 1968 min = 1; 1969 else if (min > tcpsock_get_rcvbuf(tcp)) 1970 min = tcpsock_get_rcvbuf(tcp); 1971 1972 if (tcp->tcp_rcv.tr_len >= min) { 1973 if (size != NULL) 1974 *size = tcp->tcp_rcv.tr_len; 1975 1976 return OK; 1977 } 1978 1979 return (may_wait) ? SUSPEND : SOCKEVENT_EOF; 1980 } 1981 1982 /* 1983 * Receive data on a TCP socket. 1984 */ 1985 static int 1986 tcpsock_recv(struct sock * sock, const struct sockdriver_data * data, 1987 size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, 1988 socklen_t ctl_len __unused, socklen_t * ctl_off __unused, 1989 struct sockaddr * addr __unused, socklen_t * addr_len __unused, 1990 endpoint_t user_endpt __unused, int flags, size_t min, 1991 int * rflags __unused) 1992 { 1993 struct tcpsock *tcp = (struct tcpsock *)sock; 1994 struct pbuf *ptail; 1995 size_t off, left; 1996 int r; 1997 1998 /* See if we can receive at all, and if so, how much at most. */ 1999 if ((r = tcpsock_test_recv(sock, min, NULL)) != OK) 2000 return r; 2001 2002 if (len == 0) 2003 return OK; /* nothing to do */ 2004 2005 off = tcp->tcp_rcv.tr_len; 2006 if (off > len) 2007 off = len; 2008 2009 assert(tcp->tcp_rcv.tr_head != NULL); 2010 assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len); 2011 2012 /* Copy out the data to the caller. */ 2013 if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head, 2014 tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK) 2015 return r; 2016 2017 /* Unless peeking, remove the data from the receive queue. */ 2018 if (!(flags & MSG_PEEK)) { 2019 left = off; 2020 2021 /* Dequeue and free as many entire buffers as possible. */ 2022 while ((ptail = tcp->tcp_rcv.tr_head) != NULL && 2023 left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) { 2024 left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off; 2025 2026 tcp->tcp_rcv.tr_head = ptail->next; 2027 tcp->tcp_rcv.tr_head_off = 0; 2028 2029 if (tcp->tcp_rcv.tr_head == NULL) 2030 tcp->tcp_rcv.tr_pre_tailp = NULL; 2031 else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next) 2032 tcp->tcp_rcv.tr_pre_tailp = 2033 &tcp->tcp_rcv.tr_head; 2034 2035 assert(tcpsock_recvbufs > 0); 2036 tcpsock_recvbufs--; 2037 2038 tcpsock_free_buf(ptail); 2039 } 2040 2041 /* 2042 * If only part of the (new) head buffer is consumed, adjust 2043 * the saved offset into that buffer. 
2044 */ 2045 if (left > 0) { 2046 assert(tcp->tcp_rcv.tr_head != NULL); 2047 assert((size_t)tcp->tcp_rcv.tr_head->len - 2048 tcp->tcp_rcv.tr_head_off > left); 2049 2050 tcp->tcp_rcv.tr_head_off += left; 2051 } 2052 2053 tcp->tcp_rcv.tr_len -= off; 2054 2055 if (tcp->tcp_rcv.tr_head != NULL) { 2056 assert(tcp->tcp_rcv.tr_pre_tailp != NULL); 2057 assert(tcp->tcp_rcv.tr_len > 0); 2058 } else { 2059 assert(tcp->tcp_rcv.tr_pre_tailp == NULL); 2060 assert(tcp->tcp_rcv.tr_len == 0); 2061 } 2062 2063 /* 2064 * The receive buffer has shrunk, so there may now be space to 2065 * receive more data. 2066 */ 2067 if (tcp->tcp_pcb != NULL) 2068 tcpsock_ack_recv(tcp); 2069 } else 2070 flags &= ~MSG_WAITALL; /* for the check below */ 2071 2072 /* Advance the current copy position, and see if we are done. */ 2073 *offp += off; 2074 if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp)) 2075 return SUSPEND; 2076 else 2077 return OK; 2078 } 2079 2080 /* 2081 * Update the set of flag-type socket options on a TCP socket. 2082 */ 2083 static void 2084 tcpsock_setsockmask(struct sock * sock, unsigned int mask) 2085 { 2086 struct tcpsock *tcp = (struct tcpsock *)sock; 2087 2088 if (tcp->tcp_pcb == NULL) 2089 return; 2090 2091 if (mask & SO_REUSEADDR) 2092 ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR); 2093 else 2094 ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR); 2095 2096 if (mask & SO_KEEPALIVE) 2097 ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE); 2098 else 2099 ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE); 2100 } 2101 2102 /* 2103 * Prepare a helper structure for IP-level option processing. 2104 */ 2105 static void 2106 tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts) 2107 { 2108 2109 ipopts->local_ip = &tcp->tcp_pcb->local_ip; 2110 ipopts->remote_ip = &tcp->tcp_pcb->remote_ip; 2111 ipopts->tos = &tcp->tcp_pcb->tos; 2112 ipopts->ttl = &tcp->tcp_pcb->ttl; 2113 ipopts->sndmin = TCP_SNDBUF_MIN; 2114 ipopts->sndmax = TCP_SNDBUF_MAX; 2115 ipopts->rcvmin = TCP_RCVBUF_MIN; 2116 ipopts->rcvmax = TCP_RCVBUF_MAX; 2117 } 2118 2119 /* 2120 * Set socket options on a TCP socket. 2121 */ 2122 static int 2123 tcpsock_setsockopt(struct sock * sock, int level, int name, 2124 const struct sockdriver_data * data, socklen_t len) 2125 { 2126 struct tcpsock *tcp = (struct tcpsock *)sock; 2127 struct ipopts ipopts; 2128 uint32_t uval; 2129 int r, val; 2130 2131 if (tcp->tcp_pcb == NULL) 2132 return ECONNRESET; 2133 2134 /* Handle TCP-level options. */ 2135 switch (level) { 2136 case IPPROTO_IPV6: 2137 switch (name) { 2138 case IPV6_RECVTCLASS: 2139 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 2140 len)) != OK) 2141 return r; 2142 2143 /* 2144 * This option is not supported for TCP sockets; it 2145 * would not even make sense. However, named(8) 2146 * insists on trying to set it anyway. We accept the 2147 * request but ignore the value, not even returning 2148 * what was set through getsockopt(2). 2149 */ 2150 return OK; 2151 2152 case IPV6_FAITH: 2153 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 2154 len)) != OK) 2155 return r; 2156 2157 /* 2158 * This option is not supported at all, but to save 2159 * ourselves from having to remember the current state 2160 * for getsockopt(2), we also refuse to enable it. 2161 */ 2162 if (val != 0) 2163 return EINVAL; 2164 2165 return OK; 2166 } 2167 2168 break; 2169 2170 case IPPROTO_TCP: 2171 switch (name) { 2172 case TCP_NODELAY: 2173 /* 2174 * lwIP's listening TCP PCBs do not have this field. 
2175 * If this ever becomes an issue, we can create our own 2176 * shadow flag and do the inheritance ourselves. 2177 */ 2178 if (tcp->tcp_pcb->state == LISTEN) 2179 return EINVAL; 2180 2181 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 2182 len)) != OK) 2183 return r; 2184 2185 if (val) 2186 tcp_nagle_disable(tcp->tcp_pcb); 2187 else 2188 tcp_nagle_enable(tcp->tcp_pcb); 2189 2190 return OK; 2191 2192 case TCP_KEEPIDLE: 2193 case TCP_KEEPINTVL: 2194 /* 2195 * lwIP's listening TCP PCBs do not have these fields. 2196 */ 2197 if (tcp->tcp_pcb->state == LISTEN) 2198 return EINVAL; 2199 2200 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 2201 len)) != OK) 2202 return r; 2203 2204 if (val == 0) 2205 return EINVAL; 2206 2207 /* 2208 * The given value is unsigned, but lwIP stores the 2209 * value in milliseconds in a uint32_t field, so we 2210 * have to limit large values to whatever fits in the 2211 * field anyway. 2212 */ 2213 if (val < 0 || (uint32_t)val > UINT32_MAX / 1000) 2214 uval = UINT32_MAX; 2215 else 2216 uval = (uint32_t)val * 1000; 2217 2218 if (name == TCP_KEEPIDLE) 2219 tcp->tcp_pcb->keep_idle = uval; 2220 else 2221 tcp->tcp_pcb->keep_intvl = uval; 2222 2223 return OK; 2224 2225 case TCP_KEEPCNT: 2226 /* lwIP's listening TCP PCBs do not have this field. */ 2227 if (tcp->tcp_pcb->state == LISTEN) 2228 return EINVAL; 2229 2230 if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), 2231 len)) != OK) 2232 return r; 2233 2234 if (val == 0) 2235 return EINVAL; 2236 2237 tcp->tcp_pcb->keep_cnt = (uint32_t)val; 2238 2239 return OK; 2240 } 2241 2242 return EOPNOTSUPP; 2243 } 2244 2245 /* Handle all other options at the IP level. */ 2246 tcpsock_get_ipopts(tcp, &ipopts); 2247 2248 return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data, 2249 len, &ipopts); 2250 } 2251 2252 /* 2253 * Retrieve socket options on a TCP socket. 2254 */ 2255 static int 2256 tcpsock_getsockopt(struct sock * sock, int level, int name, 2257 const struct sockdriver_data * data, socklen_t * len) 2258 { 2259 struct tcpsock *tcp = (struct tcpsock *)sock; 2260 struct ipopts ipopts; 2261 int val; 2262 2263 if (tcp->tcp_pcb == NULL) 2264 return ECONNRESET; 2265 2266 /* Handle TCP-level options. */ 2267 switch (level) { 2268 case IPPROTO_IPV6: 2269 switch (name) { 2270 case IPV6_RECVTCLASS: 2271 case IPV6_FAITH: 2272 val = 0; 2273 2274 return sockdriver_copyout_opt(data, &val, sizeof(val), 2275 len); 2276 } 2277 2278 break; 2279 2280 case IPPROTO_TCP: 2281 switch (name) { 2282 case TCP_NODELAY: 2283 /* lwIP's listening TCP PCBs do not have this field. */ 2284 if (tcp->tcp_pcb->state == LISTEN) 2285 return EINVAL; 2286 2287 val = tcp_nagle_disabled(tcp->tcp_pcb); 2288 2289 return sockdriver_copyout_opt(data, &val, sizeof(val), 2290 len); 2291 2292 case TCP_MAXSEG: 2293 /* lwIP's listening TCP PCBs do not have this field. */ 2294 if (tcp->tcp_pcb->state == LISTEN) 2295 return EINVAL; 2296 2297 /* This option is read-only at this time. */ 2298 val = tcp->tcp_pcb->mss; 2299 2300 return sockdriver_copyout_opt(data, &val, sizeof(val), 2301 len); 2302 2303 case TCP_KEEPIDLE: 2304 /* lwIP's listening TCP PCBs do not have this field. */ 2305 if (tcp->tcp_pcb->state == LISTEN) 2306 return EINVAL; 2307 2308 val = (int)(tcp->tcp_pcb->keep_idle / 1000); 2309 2310 return sockdriver_copyout_opt(data, &val, sizeof(val), 2311 len); 2312 2313 case TCP_KEEPINTVL: 2314 /* lwIP's listening TCP PCBs do not have this field. 
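 *
 * Purely as an illustration of how the three keepalive options handled in
 * this file are typically used (not part of the module's code): an
 * application sets them in seconds, and tcpsock_setsockopt() above converts
 * the idle and interval times to lwIP's millisecond fields. 'fd' is assumed
 * to be a TCP socket that also has SO_KEEPALIVE enabled:
 *
 *	int idle = 75, intvl = 10, cnt = 5;
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));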
*/ 2315 if (tcp->tcp_pcb->state == LISTEN) 2316 return EINVAL; 2317 2318 val = (int)(tcp->tcp_pcb->keep_intvl / 1000); 2319 2320 return sockdriver_copyout_opt(data, &val, sizeof(val), 2321 len); 2322 2323 case TCP_KEEPCNT: 2324 /* lwIP's listening TCP PCBs do not have this field. */ 2325 if (tcp->tcp_pcb->state == LISTEN) 2326 return EINVAL; 2327 2328 val = (int)tcp->tcp_pcb->keep_cnt; 2329 2330 return sockdriver_copyout_opt(data, &val, sizeof(val), 2331 len); 2332 } 2333 2334 return EOPNOTSUPP; 2335 } 2336 2337 /* Handle all other options at the IP level. */ 2338 tcpsock_get_ipopts(tcp, &ipopts); 2339 2340 return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data, 2341 len, &ipopts); 2342 } 2343 2344 /* 2345 * Retrieve the local socket address of a TCP socket. 2346 */ 2347 static int 2348 tcpsock_getsockname(struct sock * sock, struct sockaddr * addr, 2349 socklen_t * addr_len) 2350 { 2351 struct tcpsock *tcp = (struct tcpsock *)sock; 2352 2353 if (tcp->tcp_pcb == NULL) 2354 return EINVAL; 2355 2356 ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, 2357 &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port); 2358 2359 return OK; 2360 } 2361 2362 /* 2363 * Retrieve the remote socket address of a TCP socket. 2364 */ 2365 static int 2366 tcpsock_getpeername(struct sock * sock, struct sockaddr * addr, 2367 socklen_t * addr_len) 2368 { 2369 struct tcpsock *tcp = (struct tcpsock *)sock; 2370 2371 if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED || 2372 tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT) 2373 return ENOTCONN; 2374 2375 ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, 2376 &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); 2377 2378 return OK; 2379 } 2380 2381 /* 2382 * Perform a TCP half-close on a TCP socket. This operation may not complete 2383 * immediately due to memory conditions, in which case it will be completed at 2384 * a later time. 2385 */ 2386 static void 2387 tcpsock_send_fin(struct tcpsock * tcp) 2388 { 2389 2390 sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR); 2391 2392 /* 2393 * Attempt to send the FIN. If a fatal error occurs as a result, raise 2394 * it as an asynchronous error, because this function's callers cannot 2395 * do much with it. That happens to match the way these functions are 2396 * used elsewhere. In any case, as a result, the PCB may be closed. 2397 * However, we are never called from a situation where the socket is 2398 * being closed here, so the socket object will not be freed either. 2399 */ 2400 if (tcpsock_pcb_enqueue(tcp)) { 2401 assert(!sockevent_is_closing(tcpsock_get_sock(tcp))); 2402 2403 if (tcpsock_may_close(tcp)) 2404 tcpsock_finish_close(tcp); 2405 else 2406 (void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/); 2407 } 2408 } 2409 2410 /* 2411 * Shut down a TCP socket for reading and/or writing. 2412 */ 2413 static int 2414 tcpsock_shutdown(struct sock * sock, unsigned int mask) 2415 { 2416 struct tcpsock *tcp = (struct tcpsock *)sock; 2417 2418 /* 2419 * If the PCB is gone, we want to allow shutdowns for reading but not 2420 * writing: shutting down for writing affects the PCB, shutting down 2421 * for reading does not. Also, if the PCB is in CLOSED state, we would 2422 * not know how to deal with subsequent operations after a shutdown for 2423 * writing, so forbid such calls altogether. 2424 */ 2425 if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) && 2426 (mask & SFL_SHUT_WR)) 2427 return ENOTCONN; 2428 2429 /* 2430 * Handle listening sockets as a special case. 
Shutting down a 2431 * listening socket frees its PCB. Sockets pending on the accept queue 2432 * may still be accepted, but after that, accept(2) will start 2433 * returning ECONNABORTED. This feature allows multi-process server 2434 * applications to shut down gracefully, supposedly. 2435 */ 2436 if (tcpsock_is_listening(tcp)) { 2437 if (tcp->tcp_pcb != NULL) 2438 tcpsock_pcb_close(tcp); 2439 2440 return OK; 2441 } 2442 2443 /* 2444 * We control shutdown-for-reading locally, and intentionally do not tell 2445 * lwIP about it: if we do that and also shut down for writing, the PCB 2446 * may disappear (now or eventually), which is not what we want. 2447 * Instead, we only tell lwIP to shut down for reading once we actually 2448 * want to get rid of the PCB, using tcp_close(). In the meantime, if 2449 * the socket is shut down for reading by the user, we simply discard 2450 * received data as fast as we can--one out of a number of possible 2451 * design choices there, and (reportedly) the one used by the BSDs. 2452 */ 2453 if (mask & SFL_SHUT_RD) 2454 (void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/); 2455 2456 /* 2457 * Shutting down for writing a connecting socket simply closes its PCB. 2458 * Closing a PCB in SYN_SENT state simply deallocates it, so this can 2459 * not fail. On the other hand, for connected sockets we want to send 2460 * a FIN, which may fail due to memory shortage, in which case we have 2461 * to try again later. 2462 */ 2463 if (mask & SFL_SHUT_WR) { 2464 if (tcp->tcp_pcb->state == SYN_SENT) 2465 tcpsock_pcb_close(tcp); 2466 else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) 2467 tcpsock_send_fin(tcp); 2468 } 2469 2470 return OK; 2471 } 2472 2473 /* 2474 * Close a TCP socket. Complete the operation immediately if possible, or 2475 * otherwise initiate the closing process and complete it later, notifying 2476 * libsockevent about that as well. Depending on linger settings, this 2477 * function may be called twice on the same socket: the first time with the 2478 * 'force' flag cleared, and the second time with the 'force' flag set. 2479 */ 2480 static int 2481 tcpsock_close(struct sock * sock, int force) 2482 { 2483 struct tcpsock *tcp = (struct tcpsock *)sock; 2484 struct tcpsock *queued; 2485 size_t rlen; 2486 2487 assert(tcp->tcp_listener == NULL); 2488 2489 /* 2490 * If this was a listening socket, abort and clean up any and all 2491 * connections on its listener queue. Note that the listening socket 2492 * may or may not have a PCB at this point. 2493 */ 2494 if (tcpsock_is_listening(tcp)) { 2495 while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) { 2496 queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head); 2497 2498 tcpsock_pcb_abort(queued); 2499 2500 (void)tcpsock_cleanup(queued, TRUE /*may_free*/); 2501 } 2502 } 2503 2504 /* 2505 * Clear the receive queue, and make sure that we no longer add new 2506 * data to it. The latter is relevant only for the case that we end up 2507 * returning SUSPEND below. Remember whether there were bytes left, 2508 * because we should reset the connection if there were. 2509 */ 2510 rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/); 2511 2512 sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD); 2513 2514 /* 2515 * If the socket is connected, perform a graceful shutdown, unless 1) 2516 * we are asked to force-close the socket, or 2) the local side has 2517 * not consumed all data, as per RFC 1122 Sec.4.2.2.13.
Normally lwIP 2518 * would take care of the second point, but we may have data in our 2519 * receive buffer of which lwIP is not aware. 2520 * 2521 * Implementing proper linger support is somewhat difficult with lwIP. 2522 * In particular, we cannot reliably wait for our FIN to be ACK'ed by 2523 * the other side in all cases: 2524 * 2525 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not 2526 * trigger any event and once in the TIME_WAIT state, the poll event 2527 * no longer triggers either; 2528 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to 2529 * TIME_WAIT will trigger a receive event, but it is not clear 2530 * whether we can reliably check that our FIN was ACK'ed from there. 2531 * 2532 * That means we have to compromise. Instead of the proper approach, 2533 * we complete our side of the close operation whenever: 2534 * 2535 * 1. all of our data was acknowledged, AND, 2536 * 2. our FIN was sent, AND, 2537 * 3a. our FIN was acknowledged, OR, 2538 * 3b. we received a FIN from the other side. 2539 * 2540 * With the addition of rule 3b, we do not run into the above 2541 * reliability problems, but we may return from SO_LINGER-blocked close 2542 * calls too early and thus give callers a false impression of success. 2543 * TODO: if lwIP ever gets improved on this point, the code in this 2544 * module should be rewritten to make use of the improvements. 2545 * 2546 * The set of rules is basically the same as for closing the PCB early 2547 * as per tcpsock_may_close(), except with the check for our FIN being 2548 * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and 2549 * (reentered) CLOSED TCP states guarantee that there are no 2550 * unacknowledged data segments anymore, so we may have to wait for 2551 * reaching any one of these before we can actually finish closing the 2552 * socket with tcp_close(). 2553 * 2554 * In addition, lwIP does not tell us when our FIN gets acknowledged, 2555 * so we have to use polling and direct access to lwIP's PCB fields 2556 * instead, just like lwIP's BSD API does. There is no other way. 2557 * Also, we may not even be able to send the FIN right away, in which 2558 * case we must defer that until later. 2559 */ 2560 if (tcp->tcp_pcb != NULL) { 2561 switch (tcp->tcp_pcb->state) { 2562 case CLOSE_WAIT: 2563 case CLOSING: 2564 case LAST_ACK: 2565 assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN); 2566 2567 /* FALLTHROUGH */ 2568 case SYN_RCVD: 2569 case ESTABLISHED: 2570 case FIN_WAIT_1: 2571 /* First check if we should abort the connection. */ 2572 if (force || rlen > 0) 2573 break; 2574 2575 /* 2576 * If we have not sent a FIN yet, try sending it now; 2577 * if all other conditions are met for closing the 2578 * socket, successful FIN transmission will complete 2579 * the close. Otherwise, perform the close check 2580 * explicitly. 2581 */ 2582 if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) 2583 tcpsock_send_fin(tcp); 2584 else if (tcpsock_may_close(tcp)) 2585 tcpsock_pcb_close(tcp); 2586 2587 /* 2588 * If at this point the PCB is gone, we managed to 2589 * close the connection immediately, and the socket has 2590 * already been cleaned up by now. This may occur if 2591 * there is no unacknowledged data and we already 2592 * received a FIN earlier on. 2593 */ 2594 if (tcp->tcp_pcb == NULL) 2595 return OK; 2596 2597 /* 2598 * Complete the close operation at a later time. 2599 * Adjust the polling interval, so that we can detect 2600 * completion of the close as quickly as possible.
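 *
 * From a user process' point of view (illustration only, not part of the
 * module's code): a caller that wants close(2) to block until this
 * deferred completion, rather than have it finish in the background,
 * enables lingering first on the connected socket 'fd':
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 10 };	(ten seconds)
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *	close(fd);	(returns once the rules above are met, or on timeout)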
2601 */ 2602 tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, 2603 TCP_POLL_CLOSE_INTERVAL); 2604 2605 return SUSPEND; 2606 2607 default: 2608 /* 2609 * The connection is either not yet established, or 2610 * already in a state where we can close it right now. 2611 */ 2612 tcpsock_pcb_close(tcp); 2613 } 2614 } 2615 2616 /* 2617 * Abort the connection is the PCB is still around, and clean up the 2618 * socket. We cannot let tcpsock_cleanup() free the socket object yet, 2619 * because we are still in the callback from libsockevent, and the 2620 * latter cannot handle the socket object being freed from here. 2621 */ 2622 if (tcp->tcp_pcb != NULL) 2623 tcpsock_pcb_abort(tcp); 2624 2625 (void)tcpsock_cleanup(tcp, FALSE /*may_free*/); 2626 2627 return OK; 2628 } 2629 2630 /* 2631 * Free up a closed TCP socket. 2632 */ 2633 static void 2634 tcpsock_free(struct sock * sock) 2635 { 2636 struct tcpsock *tcp = (struct tcpsock *)sock; 2637 2638 assert(tcp->tcp_pcb == NULL); 2639 assert(tcp->tcp_snd.ts_len == 0); 2640 assert(tcp->tcp_snd.ts_head == NULL); 2641 assert(tcp->tcp_rcv.tr_len == 0); 2642 assert(tcp->tcp_rcv.tr_head == NULL); 2643 2644 TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next); 2645 } 2646 2647 /* This table maps TCP states from lwIP numbers to NetBSD numbers. */ 2648 static const struct { 2649 int tsm_tstate; 2650 int tsm_sostate; 2651 } tcpsock_statemap[] = { 2652 [CLOSED] = { TCPS_CLOSED, SS_ISDISCONNECTED }, 2653 [LISTEN] = { TCPS_LISTEN, 0 }, 2654 [SYN_SENT] = { TCPS_SYN_SENT, SS_ISCONNECTING }, 2655 [SYN_RCVD] = { TCPS_SYN_RECEIVED, SS_ISCONNECTING }, 2656 [ESTABLISHED] = { TCPS_ESTABLISHED, SS_ISCONNECTED }, 2657 [FIN_WAIT_1] = { TCPS_FIN_WAIT_1, SS_ISDISCONNECTING }, 2658 [FIN_WAIT_2] = { TCPS_FIN_WAIT_2, SS_ISDISCONNECTING }, 2659 [CLOSE_WAIT] = { TCPS_CLOSE_WAIT, SS_ISCONNECTED }, 2660 [CLOSING] = { TCPS_CLOSING, SS_ISDISCONNECTING }, 2661 [LAST_ACK] = { TCPS_LAST_ACK, SS_ISDISCONNECTING }, 2662 [TIME_WAIT] = { TCPS_TIME_WAIT, SS_ISDISCONNECTED }, 2663 }; 2664 2665 /* 2666 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP 2667 * PCB identified by the given pointer. 2668 */ 2669 static void 2670 tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr) 2671 { 2672 const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr; 2673 struct tcpsock *tcp; 2674 2675 /* 2676 * Not all TCP PCBs have an associated tcpsock structure. We are 2677 * careful enough clearing the callback argument for PCBs on any of the 2678 * TCP lists that we can use that callback argument to determine 2679 * whether there is an associated tcpsock structure, although with one 2680 * exception: PCBs for incoming connections that have not yet been 2681 * fully established (i.e., in SYN_RCVD state). These will have the 2682 * callback argument of the listening socket (which itself may already 2683 * have been deallocated at this point) but should not be considered as 2684 * associated with the listening socket's tcpsock structure. 2685 */ 2686 if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) { 2687 tcp = (struct tcpsock *)pcb->callback_arg; 2688 assert(tcp >= tcp_array && 2689 tcp < &tcp_array[__arraycount(tcp_array)]); 2690 2691 /* TODO: change this so that sockstat(1) may work one day. */ 2692 ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp); 2693 } else { 2694 /* No tcpsock. Could also be in TIME_WAIT state etc. 
*/ 2695 tcp = NULL; 2696 2697 ki->ki_sostate = SS_NOFDREF; 2698 } 2699 2700 ki->ki_type = SOCK_STREAM; 2701 2702 if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) { 2703 ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate; 2704 /* TODO: this needs work, but does anything rely on it? */ 2705 ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate; 2706 } 2707 2708 /* Careful with the LISTEN state here (see below). */ 2709 ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, 2710 &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0); 2711 2712 /* 2713 * The PCBs for listening sockets are actually smaller. Thus, for 2714 * listening sockets, do not attempt to access any of the fields beyond 2715 * those provided in the smaller structure. 2716 */ 2717 if (pcb->state == LISTEN) { 2718 assert(tcp != NULL); 2719 ki->ki_refs = 2720 (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head); 2721 } else { 2722 if (tcp_nagle_disabled(pcb)) 2723 ki->ki_tflags |= NETBSD_TF_NODELAY; 2724 2725 if (tcp != NULL) { 2726 ki->ki_rcvq = tcp->tcp_rcv.tr_len; 2727 ki->ki_sndq = tcp->tcp_snd.ts_len; 2728 2729 if (tcp->tcp_listener != NULL) 2730 ki->ki_nextref = (uint64_t)(uintptr_t) 2731 TAILQ_NEXT(tcp, tcp_queue.tq_next); 2732 } 2733 } 2734 } 2735 2736 /* 2737 * Given either NULL or a previously returned TCP PCB pointer, return the first 2738 * or next TCP PCB pointer, or NULL if there are no more. The current 2739 * implementation supports only one concurrent iteration at once. 2740 */ 2741 static const void * 2742 tcpsock_enum(const void * last) 2743 { 2744 static struct { 2745 unsigned int i; 2746 const struct tcp_pcb *pcb; 2747 } iter; 2748 2749 if (last != NULL && (iter.pcb = iter.pcb->next) != NULL) 2750 return (const void *)iter.pcb; 2751 2752 for (iter.i = (last != NULL) ? iter.i + 1 : 0; 2753 iter.i < __arraycount(tcp_pcb_lists); iter.i++) { 2754 if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL) 2755 return (const void *)iter.pcb; 2756 } 2757 2758 return NULL; 2759 } 2760 2761 /* 2762 * Obtain the list of TCP protocol control blocks, for sysctl(7). 2763 */ 2764 static ssize_t 2765 tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused, 2766 struct rmib_oldp * oldp, struct rmib_newp * newp __unused) 2767 { 2768 2769 return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info); 2770 } 2771 2772 static const struct sockevent_ops tcpsock_ops = { 2773 .sop_bind = tcpsock_bind, 2774 .sop_listen = tcpsock_listen, 2775 .sop_connect = tcpsock_connect, 2776 .sop_accept = tcpsock_accept, 2777 .sop_test_accept = tcpsock_test_accept, 2778 .sop_pre_send = tcpsock_pre_send, 2779 .sop_send = tcpsock_send, 2780 .sop_test_send = tcpsock_test_send, 2781 .sop_pre_recv = tcpsock_pre_recv, 2782 .sop_recv = tcpsock_recv, 2783 .sop_test_recv = tcpsock_test_recv, 2784 .sop_ioctl = ifconf_ioctl, 2785 .sop_setsockmask = tcpsock_setsockmask, 2786 .sop_setsockopt = tcpsock_setsockopt, 2787 .sop_getsockopt = tcpsock_getsockopt, 2788 .sop_getsockname = tcpsock_getsockname, 2789 .sop_getpeername = tcpsock_getpeername, 2790 .sop_shutdown = tcpsock_shutdown, 2791 .sop_close = tcpsock_close, 2792 .sop_free = tcpsock_free 2793 }; 2794
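/*
 * For illustration only: the sockevent_ops table above is what connects the
 * handlers in this file to the regular socket calls made by user processes
 * (socket creation itself is set up outside this table). A typical client
 * exercises them roughly in this order, using standard calls with error
 * handling omitted; the names in parentheses are the handlers dispatched
 * for each call:
 *
 *	connect(fd, ...);		(sop_connect)
 *	send(fd, req, reqlen, 0);	(sop_pre_send, sop_send)
 *	recv(fd, rsp, sizeof(rsp), 0);	(sop_pre_recv, sop_recv)
 *	shutdown(fd, SHUT_WR);		(sop_shutdown)
 *	close(fd);			(sop_close, and eventually sop_free)
 */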