1 #include <string.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <assert.h>
5 #include <errno.h>
6 #include <byteswap.h>
7 #include <ipxe/timer.h>
8 #include <ipxe/iobuf.h>
9 #include <ipxe/malloc.h>
10 #include <ipxe/init.h>
11 #include <ipxe/retry.h>
12 #include <ipxe/refcnt.h>
13 #include <ipxe/pending.h>
14 #include <ipxe/xfer.h>
15 #include <ipxe/open.h>
16 #include <ipxe/uri.h>
17 #include <ipxe/netdevice.h>
18 #include <ipxe/profile.h>
19 #include <ipxe/process.h>
20 #include <ipxe/job.h>
21 #include <ipxe/tcpip.h>
22 #include <ipxe/tcp.h>
23
24 /** @file
25 *
26 * TCP protocol
27 *
28 */
29
30 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
31
32 /** A TCP connection */
33 struct tcp_connection {
34 /** Reference counter */
35 struct refcnt refcnt;
36 /** List of TCP connections */
37 struct list_head list;
38
39 /** Flags */
40 unsigned int flags;
41
42 /** Data transfer interface */
43 struct interface xfer;
44
45 /** Remote socket address */
46 struct sockaddr_tcpip peer;
47 /** Local port */
48 unsigned int local_port;
49 /** Maximum segment size */
50 size_t mss;
51
52 /** Current TCP state */
53 unsigned int tcp_state;
54 /** Previous TCP state
55 *
56 * Maintained only for debug messages
57 */
58 unsigned int prev_tcp_state;
59 /** Current sequence number
60 *
61 * Equivalent to SND.UNA in RFC 793 terminology.
62 */
63 uint32_t snd_seq;
64 /** Unacknowledged sequence count
65 *
66 * Equivalent to (SND.NXT-SND.UNA) in RFC 793 terminology.
67 */
68 uint32_t snd_sent;
69 /** Send window
70 *
71 * Equivalent to SND.WND in RFC 793 terminology
72 */
73 uint32_t snd_win;
74 /** Current acknowledgement number
75 *
76 * Equivalent to RCV.NXT in RFC 793 terminology.
77 */
78 uint32_t rcv_ack;
79 /** Receive window
80 *
81 * Equivalent to RCV.WND in RFC 793 terminology.
82 */
83 uint32_t rcv_win;
84 /** Received timestamp value
85 *
86 * Updated when a packet is received; copied to ts_recent when
87 * the window is advanced.
88 */
89 uint32_t ts_val;
90 /** Most recent received timestamp that advanced the window
91 *
92 * Equivalent to TS.Recent in RFC 1323 terminology.
93 */
94 uint32_t ts_recent;
95 /** Send window scale
96 *
97 * Equivalent to Snd.Wind.Scale in RFC 1323 terminology
98 */
99 uint8_t snd_win_scale;
100 /** Receive window scale
101 *
102 * Equivalent to Rcv.Wind.Scale in RFC 1323 terminology
103 */
104 uint8_t rcv_win_scale;
105
106 /** Selective acknowledgement list (in host-endian order) */
107 struct tcp_sack_block sack[TCP_SACK_MAX];
108
109 /** Transmit queue */
110 struct list_head tx_queue;
111 /** Receive queue */
112 struct list_head rx_queue;
113 /** Transmission process */
114 struct process process;
115 /** Retransmission timer */
116 struct retry_timer timer;
117 /** Keepalive timer */
118 struct retry_timer keepalive;
119 /** Shutdown (TIME_WAIT) timer */
120 struct retry_timer wait;
121
122 /** Pending operations for SYN and FIN */
123 struct pending_operation pending_flags;
124 /** Pending operations for transmit queue */
125 struct pending_operation pending_data;
126 };
127
/**
 * TCP flags
 *
 * Bit flags stored in the @c flags member of a TCP connection.
 */
enum tcp_flags {
	/** TCP data transfer interface has been closed */
	TCP_XFER_CLOSED = ( 1 << 0 ),
	/** TCP timestamps are enabled */
	TCP_TS_ENABLED = ( 1 << 1 ),
	/** TCP acknowledgement is pending */
	TCP_ACK_PENDING = ( 1 << 2 ),
	/** TCP selective acknowledgement is enabled */
	TCP_SACK_ENABLED = ( 1 << 3 ),
};
139
/**
 * TCP internal header
 *
 * Packets held on the receive queue carry this header in place of
 * the original TCP header.
 */
struct tcp_rx_queued_header {
	/** SEQ value, in host-endian order
	 *
	 * This is the SEQ value at the time the packet is enqueued,
	 * and so excludes the SYN, if present.
	 */
	uint32_t seq;
	/** Next SEQ value, in host-endian order */
	uint32_t nxt;
	/** Flags
	 *
	 * Only FIN is meaningful here; every other flag has already
	 * been processed by the time the packet is enqueued.
	 */
	uint8_t flags;
	/** Reserved */
	uint8_t reserved[3];
};
164
165 /**
166 * List of registered TCP connections
167 */
168 static LIST_HEAD ( tcp_conns );
169
170 /** Transmit profiler */
171 static struct profiler tcp_tx_profiler __profiler = { .name = "tcp.tx" };
172
173 /** Receive profiler */
174 static struct profiler tcp_rx_profiler __profiler = { .name = "tcp.rx" };
175
176 /** Data transfer profiler */
177 static struct profiler tcp_xfer_profiler __profiler = { .name = "tcp.xfer" };
178
179 /* Forward declarations */
180 static struct process_descriptor tcp_process_desc;
181 static struct interface_descriptor tcp_xfer_desc;
182 static void tcp_expired ( struct retry_timer *timer, int over );
183 static void tcp_keepalive_expired ( struct retry_timer *timer, int over );
184 static void tcp_wait_expired ( struct retry_timer *timer, int over );
185 static struct tcp_connection * tcp_demux ( unsigned int local_port );
186 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
187 uint32_t win );
188
189 /**
190 * Name TCP state
191 *
192 * @v state TCP state
193 * @ret name Name of TCP state
194 */
195 static inline __attribute__ (( always_inline )) const char *
tcp_state(int state)196 tcp_state ( int state ) {
197 switch ( state ) {
198 case TCP_CLOSED: return "CLOSED";
199 case TCP_LISTEN: return "LISTEN";
200 case TCP_SYN_SENT: return "SYN_SENT";
201 case TCP_SYN_RCVD: return "SYN_RCVD";
202 case TCP_ESTABLISHED: return "ESTABLISHED";
203 case TCP_FIN_WAIT_1: return "FIN_WAIT_1";
204 case TCP_FIN_WAIT_2: return "FIN_WAIT_2";
205 case TCP_CLOSING_OR_LAST_ACK: return "CLOSING/LAST_ACK";
206 case TCP_TIME_WAIT: return "TIME_WAIT";
207 case TCP_CLOSE_WAIT: return "CLOSE_WAIT";
208 default: return "INVALID";
209 }
210 }
211
212 /**
213 * Dump TCP state transition
214 *
215 * @v tcp TCP connection
216 */
217 static inline __attribute__ (( always_inline )) void
tcp_dump_state(struct tcp_connection * tcp)218 tcp_dump_state ( struct tcp_connection *tcp ) {
219
220 if ( tcp->tcp_state != tcp->prev_tcp_state ) {
221 DBGC ( tcp, "TCP %p transitioned from %s to %s\n", tcp,
222 tcp_state ( tcp->prev_tcp_state ),
223 tcp_state ( tcp->tcp_state ) );
224 }
225 tcp->prev_tcp_state = tcp->tcp_state;
226 }
227
228 /**
229 * Dump TCP flags
230 *
231 * @v flags TCP flags
232 */
233 static inline __attribute__ (( always_inline )) void
tcp_dump_flags(struct tcp_connection * tcp,unsigned int flags)234 tcp_dump_flags ( struct tcp_connection *tcp, unsigned int flags ) {
235 if ( flags & TCP_RST )
236 DBGC2 ( tcp, " RST" );
237 if ( flags & TCP_SYN )
238 DBGC2 ( tcp, " SYN" );
239 if ( flags & TCP_PSH )
240 DBGC2 ( tcp, " PSH" );
241 if ( flags & TCP_FIN )
242 DBGC2 ( tcp, " FIN" );
243 if ( flags & TCP_ACK )
244 DBGC2 ( tcp, " ACK" );
245 }
246
247 /***************************************************************************
248 *
249 * Open and close
250 *
251 ***************************************************************************
252 */
253
/**
 * Check if local TCP port is available
 *
 * @v port		Local port number
 * @ret port		Local port number, or negative error
 *
 * A port is unavailable (-EADDRINUSE) if any registered TCP
 * connection is already using it as its local port.
 */
static int tcp_port_available ( int port ) {

	if ( tcp_demux ( port ) != NULL )
		return -EADDRINUSE;

	return port;
}
264
265 /**
266 * Open a TCP connection
267 *
268 * @v xfer Data transfer interface
269 * @v peer Peer socket address
270 * @v local Local socket address, or NULL
271 * @ret rc Return status code
272 */
tcp_open(struct interface * xfer,struct sockaddr * peer,struct sockaddr * local)273 static int tcp_open ( struct interface *xfer, struct sockaddr *peer,
274 struct sockaddr *local ) {
275 struct sockaddr_tcpip *st_peer = ( struct sockaddr_tcpip * ) peer;
276 struct sockaddr_tcpip *st_local = ( struct sockaddr_tcpip * ) local;
277 struct tcp_connection *tcp;
278 size_t mtu;
279 int port;
280 int rc;
281
282 /* Allocate and initialise structure */
283 tcp = zalloc ( sizeof ( *tcp ) );
284 if ( ! tcp )
285 return -ENOMEM;
286 DBGC ( tcp, "TCP %p allocated\n", tcp );
287 ref_init ( &tcp->refcnt, NULL );
288 intf_init ( &tcp->xfer, &tcp_xfer_desc, &tcp->refcnt );
289 process_init_stopped ( &tcp->process, &tcp_process_desc, &tcp->refcnt );
290 timer_init ( &tcp->timer, tcp_expired, &tcp->refcnt );
291 timer_init ( &tcp->keepalive, tcp_keepalive_expired, &tcp->refcnt );
292 timer_init ( &tcp->wait, tcp_wait_expired, &tcp->refcnt );
293 tcp->prev_tcp_state = TCP_CLOSED;
294 tcp->tcp_state = TCP_STATE_SENT ( TCP_SYN );
295 tcp_dump_state ( tcp );
296 tcp->snd_seq = random();
297 INIT_LIST_HEAD ( &tcp->tx_queue );
298 INIT_LIST_HEAD ( &tcp->rx_queue );
299 memcpy ( &tcp->peer, st_peer, sizeof ( tcp->peer ) );
300
301 /* Calculate MSS */
302 mtu = tcpip_mtu ( &tcp->peer );
303 if ( ! mtu ) {
304 DBGC ( tcp, "TCP %p has no route to %s\n",
305 tcp, sock_ntoa ( peer ) );
306 rc = -ENETUNREACH;
307 goto err;
308 }
309 tcp->mss = ( mtu - sizeof ( struct tcp_header ) );
310
311 /* Bind to local port */
312 port = tcpip_bind ( st_local, tcp_port_available );
313 if ( port < 0 ) {
314 rc = port;
315 DBGC ( tcp, "TCP %p could not bind: %s\n",
316 tcp, strerror ( rc ) );
317 goto err;
318 }
319 tcp->local_port = port;
320 DBGC ( tcp, "TCP %p bound to port %d\n", tcp, tcp->local_port );
321
322 /* Start timer to initiate SYN */
323 start_timer_nodelay ( &tcp->timer );
324
325 /* Add a pending operation for the SYN */
326 pending_get ( &tcp->pending_flags );
327
328 /* Attach parent interface, transfer reference to connection
329 * list and return
330 */
331 intf_plug_plug ( &tcp->xfer, xfer );
332 list_add ( &tcp->list, &tcp_conns );
333 return 0;
334
335 err:
336 ref_put ( &tcp->refcnt );
337 return rc;
338 }
339
340 /**
341 * Close TCP connection
342 *
343 * @v tcp TCP connection
344 * @v rc Reason for close
345 *
346 * Closes the data transfer interface. If the TCP state machine is in
347 * a suitable state, the connection will be deleted.
348 */
tcp_close(struct tcp_connection * tcp,int rc)349 static void tcp_close ( struct tcp_connection *tcp, int rc ) {
350 struct io_buffer *iobuf;
351 struct io_buffer *tmp;
352
353 /* Close data transfer interface */
354 intf_shutdown ( &tcp->xfer, rc );
355 tcp->flags |= TCP_XFER_CLOSED;
356
357 /* If we are in CLOSED, or have otherwise not yet received a
358 * SYN (i.e. we are in LISTEN or SYN_SENT), just delete the
359 * connection.
360 */
361 if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
362
363 /* Transition to CLOSED for the sake of debugging messages */
364 tcp->tcp_state = TCP_CLOSED;
365 tcp_dump_state ( tcp );
366
367 /* Free any unprocessed I/O buffers */
368 list_for_each_entry_safe ( iobuf, tmp, &tcp->rx_queue, list ) {
369 list_del ( &iobuf->list );
370 free_iob ( iobuf );
371 }
372
373 /* Free any unsent I/O buffers */
374 list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
375 list_del ( &iobuf->list );
376 free_iob ( iobuf );
377 pending_put ( &tcp->pending_data );
378 }
379 assert ( ! is_pending ( &tcp->pending_data ) );
380
381 /* Remove pending operations for SYN and FIN, if applicable */
382 pending_put ( &tcp->pending_flags );
383 pending_put ( &tcp->pending_flags );
384
385 /* Remove from list and drop reference */
386 process_del ( &tcp->process );
387 stop_timer ( &tcp->timer );
388 stop_timer ( &tcp->keepalive );
389 stop_timer ( &tcp->wait );
390 list_del ( &tcp->list );
391 ref_put ( &tcp->refcnt );
392 DBGC ( tcp, "TCP %p connection deleted\n", tcp );
393 return;
394 }
395
396 /* If we have not had our SYN acknowledged (i.e. we are in
397 * SYN_RCVD), pretend that it has been acknowledged so that we
398 * can send a FIN without breaking things.
399 */
400 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
401 tcp_rx_ack ( tcp, ( tcp->snd_seq + 1 ), 0 );
402
403 /* Stop keepalive timer */
404 stop_timer ( &tcp->keepalive );
405
406 /* If we have no data remaining to send, start sending FIN */
407 if ( list_empty ( &tcp->tx_queue ) &&
408 ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
409
410 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
411 tcp_dump_state ( tcp );
412 process_add ( &tcp->process );
413
414 /* Add a pending operation for the FIN */
415 pending_get ( &tcp->pending_flags );
416 }
417 }
418
419 /***************************************************************************
420 *
421 * Transmit data path
422 *
423 ***************************************************************************
424 */
425
426 /**
427 * Calculate transmission window
428 *
429 * @v tcp TCP connection
430 * @ret len Maximum length that can be sent in a single packet
431 */
tcp_xmit_win(struct tcp_connection * tcp)432 static size_t tcp_xmit_win ( struct tcp_connection *tcp ) {
433 size_t len;
434
435 /* Not ready if we're not in a suitable connection state */
436 if ( ! TCP_CAN_SEND_DATA ( tcp->tcp_state ) )
437 return 0;
438
439 /* Length is the minimum of the receiver's window and the path MTU */
440 len = tcp->snd_win;
441 if ( len > TCP_PATH_MTU )
442 len = TCP_PATH_MTU;
443
444 return len;
445 }
446
447 /**
448 * Check data-transfer flow control window
449 *
450 * @v tcp TCP connection
451 * @ret len Length of window
452 */
tcp_xfer_window(struct tcp_connection * tcp)453 static size_t tcp_xfer_window ( struct tcp_connection *tcp ) {
454
455 /* Not ready if data queue is non-empty. This imposes a limit
456 * of only one unACKed packet in the TX queue at any time; we
457 * do this to conserve memory usage.
458 */
459 if ( ! list_empty ( &tcp->tx_queue ) )
460 return 0;
461
462 /* Return TCP window length */
463 return tcp_xmit_win ( tcp );
464 }
465
466 /**
467 * Find selective acknowledgement block
468 *
469 * @v tcp TCP connection
470 * @v seq SEQ value in SACK block (in host-endian order)
471 * @v sack SACK block to fill in (in host-endian order)
472 * @ret len Length of SACK block
473 */
tcp_sack_block(struct tcp_connection * tcp,uint32_t seq,struct tcp_sack_block * sack)474 static uint32_t tcp_sack_block ( struct tcp_connection *tcp, uint32_t seq,
475 struct tcp_sack_block *sack ) {
476 struct io_buffer *iobuf;
477 struct tcp_rx_queued_header *tcpqhdr;
478 uint32_t left = tcp->rcv_ack;
479 uint32_t right = left;
480
481 /* Find highest block which does not start after SEQ */
482 list_for_each_entry ( iobuf, &tcp->rx_queue, list ) {
483 tcpqhdr = iobuf->data;
484 if ( tcp_cmp ( tcpqhdr->seq, right ) > 0 ) {
485 if ( tcp_cmp ( tcpqhdr->seq, seq ) > 0 )
486 break;
487 left = tcpqhdr->seq;
488 }
489 if ( tcp_cmp ( tcpqhdr->nxt, right ) > 0 )
490 right = tcpqhdr->nxt;
491 }
492
493 /* Fail if this block does not contain SEQ */
494 if ( tcp_cmp ( right, seq ) < 0 )
495 return 0;
496
497 /* Populate SACK block */
498 sack->left = left;
499 sack->right = right;
500 return ( right - left );
501 }
502
503 /**
504 * Update TCP selective acknowledgement list
505 *
506 * @v tcp TCP connection
507 * @v seq SEQ value in first SACK block (in host-endian order)
508 * @ret count Number of SACK blocks
509 */
tcp_sack(struct tcp_connection * tcp,uint32_t seq)510 static unsigned int tcp_sack ( struct tcp_connection *tcp, uint32_t seq ) {
511 struct tcp_sack_block sack[TCP_SACK_MAX];
512 unsigned int old = 0;
513 unsigned int new = 0;
514 unsigned int i;
515 uint32_t len;
516
517 /* Populate first new SACK block */
518 len = tcp_sack_block ( tcp, seq, &sack[0] );
519 if ( len )
520 new++;
521
522 /* Populate remaining new SACK blocks based on old SACK blocks */
523 for ( old = 0 ; old < TCP_SACK_MAX ; old++ ) {
524
525 /* Stop if we run out of space in the new list */
526 if ( new == TCP_SACK_MAX )
527 break;
528
529 /* Skip empty old SACK blocks */
530 if ( tcp->sack[old].left == tcp->sack[old].right )
531 continue;
532
533 /* Populate new SACK block */
534 len = tcp_sack_block ( tcp, tcp->sack[old].left, &sack[new] );
535 if ( len == 0 )
536 continue;
537
538 /* Eliminate duplicates */
539 for ( i = 0 ; i < new ; i++ ) {
540 if ( sack[i].left == sack[new].left ) {
541 new--;
542 break;
543 }
544 }
545 new++;
546 }
547
548 /* Update SACK list */
549 memset ( tcp->sack, 0, sizeof ( tcp->sack ) );
550 memcpy ( tcp->sack, sack, ( new * sizeof ( tcp->sack[0] ) ) );
551 return new;
552 }
553
554 /**
555 * Process TCP transmit queue
556 *
557 * @v tcp TCP connection
558 * @v max_len Maximum length to process
559 * @v dest I/O buffer to fill with data, or NULL
560 * @v remove Remove data from queue
561 * @ret len Length of data processed
562 *
563 * This processes at most @c max_len bytes from the TCP connection's
564 * transmit queue. Data will be copied into the @c dest I/O buffer
565 * (if provided) and, if @c remove is true, removed from the transmit
566 * queue.
567 */
tcp_process_tx_queue(struct tcp_connection * tcp,size_t max_len,struct io_buffer * dest,int remove)568 static size_t tcp_process_tx_queue ( struct tcp_connection *tcp, size_t max_len,
569 struct io_buffer *dest, int remove ) {
570 struct io_buffer *iobuf;
571 struct io_buffer *tmp;
572 size_t frag_len;
573 size_t len = 0;
574
575 list_for_each_entry_safe ( iobuf, tmp, &tcp->tx_queue, list ) {
576 frag_len = iob_len ( iobuf );
577 if ( frag_len > max_len )
578 frag_len = max_len;
579 if ( dest ) {
580 memcpy ( iob_put ( dest, frag_len ), iobuf->data,
581 frag_len );
582 }
583 if ( remove ) {
584 iob_pull ( iobuf, frag_len );
585 if ( ! iob_len ( iobuf ) ) {
586 list_del ( &iobuf->list );
587 free_iob ( iobuf );
588 pending_put ( &tcp->pending_data );
589 }
590 }
591 len += frag_len;
592 max_len -= frag_len;
593 }
594 return len;
595 }
596
597 /**
598 * Transmit any outstanding data (with selective acknowledgement)
599 *
600 * @v tcp TCP connection
601 * @v sack_seq SEQ for first selective acknowledgement (if any)
602 *
603 * Transmits any outstanding data on the connection.
604 *
605 * Note that even if an error is returned, the retransmission timer
606 * will have been started if necessary, and so the stack will
607 * eventually attempt to retransmit the failed packet.
608 */
tcp_xmit_sack(struct tcp_connection * tcp,uint32_t sack_seq)609 static void tcp_xmit_sack ( struct tcp_connection *tcp, uint32_t sack_seq ) {
610 struct io_buffer *iobuf;
611 struct tcp_header *tcphdr;
612 struct tcp_mss_option *mssopt;
613 struct tcp_window_scale_padded_option *wsopt;
614 struct tcp_timestamp_padded_option *tsopt;
615 struct tcp_sack_permitted_padded_option *spopt;
616 struct tcp_sack_padded_option *sackopt;
617 struct tcp_sack_block *sack;
618 void *payload;
619 unsigned int flags;
620 unsigned int sack_count;
621 unsigned int i;
622 size_t len = 0;
623 size_t sack_len;
624 uint32_t seq_len;
625 uint32_t max_rcv_win;
626 uint32_t max_representable_win;
627 int rc;
628
629 /* Start profiling */
630 profile_start ( &tcp_tx_profiler );
631
632 /* If retransmission timer is already running, do nothing */
633 if ( timer_running ( &tcp->timer ) )
634 return;
635
636 /* Calculate both the actual (payload) and sequence space
637 * lengths that we wish to transmit.
638 */
639 if ( TCP_CAN_SEND_DATA ( tcp->tcp_state ) ) {
640 len = tcp_process_tx_queue ( tcp, tcp_xmit_win ( tcp ),
641 NULL, 0 );
642 }
643 seq_len = len;
644 flags = TCP_FLAGS_SENDING ( tcp->tcp_state );
645 if ( flags & ( TCP_SYN | TCP_FIN ) ) {
646 /* SYN or FIN consume one byte, and we can never send both */
647 assert ( ! ( ( flags & TCP_SYN ) && ( flags & TCP_FIN ) ) );
648 seq_len++;
649 }
650 tcp->snd_sent = seq_len;
651
652 /* If we have nothing to transmit, stop now */
653 if ( ( seq_len == 0 ) && ! ( tcp->flags & TCP_ACK_PENDING ) )
654 return;
655
656 /* If we are transmitting anything that requires
657 * acknowledgement (i.e. consumes sequence space), start the
658 * retransmission timer. Do this before attempting to
659 * allocate the I/O buffer, in case allocation itself fails.
660 */
661 if ( seq_len )
662 start_timer ( &tcp->timer );
663
664 /* Allocate I/O buffer */
665 iobuf = alloc_iob ( len + TCP_MAX_HEADER_LEN );
666 if ( ! iobuf ) {
667 DBGC ( tcp, "TCP %p could not allocate iobuf for %08x..%08x "
668 "%08x\n", tcp, tcp->snd_seq, ( tcp->snd_seq + seq_len ),
669 tcp->rcv_ack );
670 return;
671 }
672 iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
673
674 /* Fill data payload from transmit queue */
675 tcp_process_tx_queue ( tcp, len, iobuf, 0 );
676
677 /* Expand receive window if possible */
678 max_rcv_win = xfer_window ( &tcp->xfer );
679 if ( max_rcv_win > TCP_MAX_WINDOW_SIZE )
680 max_rcv_win = TCP_MAX_WINDOW_SIZE;
681 max_representable_win = ( 0xffff << tcp->rcv_win_scale );
682 if ( max_rcv_win > max_representable_win )
683 max_rcv_win = max_representable_win;
684 max_rcv_win &= ~0x03; /* Keep everything dword-aligned */
685 if ( tcp->rcv_win < max_rcv_win )
686 tcp->rcv_win = max_rcv_win;
687
688 /* Fill up the TCP header */
689 payload = iobuf->data;
690 if ( flags & TCP_SYN ) {
691 mssopt = iob_push ( iobuf, sizeof ( *mssopt ) );
692 mssopt->kind = TCP_OPTION_MSS;
693 mssopt->length = sizeof ( *mssopt );
694 mssopt->mss = htons ( tcp->mss );
695 wsopt = iob_push ( iobuf, sizeof ( *wsopt ) );
696 wsopt->nop = TCP_OPTION_NOP;
697 wsopt->wsopt.kind = TCP_OPTION_WS;
698 wsopt->wsopt.length = sizeof ( wsopt->wsopt );
699 wsopt->wsopt.scale = TCP_RX_WINDOW_SCALE;
700 spopt = iob_push ( iobuf, sizeof ( *spopt ) );
701 memset ( spopt->nop, TCP_OPTION_NOP, sizeof ( spopt->nop ) );
702 spopt->spopt.kind = TCP_OPTION_SACK_PERMITTED;
703 spopt->spopt.length = sizeof ( spopt->spopt );
704 }
705 if ( ( flags & TCP_SYN ) || ( tcp->flags & TCP_TS_ENABLED ) ) {
706 tsopt = iob_push ( iobuf, sizeof ( *tsopt ) );
707 memset ( tsopt->nop, TCP_OPTION_NOP, sizeof ( tsopt->nop ) );
708 tsopt->tsopt.kind = TCP_OPTION_TS;
709 tsopt->tsopt.length = sizeof ( tsopt->tsopt );
710 tsopt->tsopt.tsval = htonl ( currticks() );
711 tsopt->tsopt.tsecr = htonl ( tcp->ts_recent );
712 }
713 if ( ( tcp->flags & TCP_SACK_ENABLED ) &&
714 ( ! list_empty ( &tcp->rx_queue ) ) &&
715 ( ( sack_count = tcp_sack ( tcp, sack_seq ) ) != 0 ) ) {
716 sack_len = ( sack_count * sizeof ( *sack ) );
717 sackopt = iob_push ( iobuf, ( sizeof ( *sackopt ) + sack_len ));
718 memset ( sackopt->nop, TCP_OPTION_NOP, sizeof ( sackopt->nop ));
719 sackopt->sackopt.kind = TCP_OPTION_SACK;
720 sackopt->sackopt.length =
721 ( sizeof ( sackopt->sackopt ) + sack_len );
722 sack = ( ( ( void * ) sackopt ) + sizeof ( *sackopt ) );
723 for ( i = 0 ; i < sack_count ; i++, sack++ ) {
724 sack->left = htonl ( tcp->sack[i].left );
725 sack->right = htonl ( tcp->sack[i].right );
726 }
727 }
728 if ( len != 0 )
729 flags |= TCP_PSH;
730 tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
731 memset ( tcphdr, 0, sizeof ( *tcphdr ) );
732 tcphdr->src = htons ( tcp->local_port );
733 tcphdr->dest = tcp->peer.st_port;
734 tcphdr->seq = htonl ( tcp->snd_seq );
735 tcphdr->ack = htonl ( tcp->rcv_ack );
736 tcphdr->hlen = ( ( payload - iobuf->data ) << 2 );
737 tcphdr->flags = flags;
738 tcphdr->win = htons ( tcp->rcv_win >> tcp->rcv_win_scale );
739 tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
740
741 /* Dump header */
742 DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x %08x %4zd",
743 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
744 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) + seq_len ),
745 ntohl ( tcphdr->ack ), len );
746 tcp_dump_flags ( tcp, tcphdr->flags );
747 DBGC2 ( tcp, "\n" );
748
749 /* Transmit packet */
750 if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, &tcp->peer, NULL,
751 &tcphdr->csum ) ) != 0 ) {
752 DBGC ( tcp, "TCP %p could not transmit %08x..%08x %08x: %s\n",
753 tcp, tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ),
754 tcp->rcv_ack, strerror ( rc ) );
755 return;
756 }
757
758 /* Clear ACK-pending flag */
759 tcp->flags &= ~TCP_ACK_PENDING;
760
761 profile_stop ( &tcp_tx_profiler );
762 }
763
764 /**
765 * Transmit any outstanding data
766 *
767 * @v tcp TCP connection
768 */
tcp_xmit(struct tcp_connection * tcp)769 static void tcp_xmit ( struct tcp_connection *tcp ) {
770
771 /* Transmit without an explicit first SACK */
772 tcp_xmit_sack ( tcp, tcp->rcv_ack );
773 }
774
775 /** TCP process descriptor */
776 static struct process_descriptor tcp_process_desc =
777 PROC_DESC_ONCE ( struct tcp_connection, process, tcp_xmit );
778
779 /**
780 * Retransmission timer expired
781 *
782 * @v timer Retransmission timer
783 * @v over Failure indicator
784 */
tcp_expired(struct retry_timer * timer,int over)785 static void tcp_expired ( struct retry_timer *timer, int over ) {
786 struct tcp_connection *tcp =
787 container_of ( timer, struct tcp_connection, timer );
788
789 DBGC ( tcp, "TCP %p timer %s in %s for %08x..%08x %08x\n", tcp,
790 ( over ? "expired" : "fired" ), tcp_state ( tcp->tcp_state ),
791 tcp->snd_seq, ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
792
793 assert ( ( tcp->tcp_state == TCP_SYN_SENT ) ||
794 ( tcp->tcp_state == TCP_SYN_RCVD ) ||
795 ( tcp->tcp_state == TCP_ESTABLISHED ) ||
796 ( tcp->tcp_state == TCP_FIN_WAIT_1 ) ||
797 ( tcp->tcp_state == TCP_CLOSE_WAIT ) ||
798 ( tcp->tcp_state == TCP_CLOSING_OR_LAST_ACK ) );
799
800 if ( over ) {
801 /* If we have finally timed out and given up,
802 * terminate the connection
803 */
804 tcp->tcp_state = TCP_CLOSED;
805 tcp_dump_state ( tcp );
806 tcp_close ( tcp, -ETIMEDOUT );
807 } else {
808 /* Otherwise, retransmit the packet */
809 tcp_xmit ( tcp );
810 }
811 }
812
813 /**
814 * Keepalive timer expired
815 *
816 * @v timer Keepalive timer
817 * @v over Failure indicator
818 */
tcp_keepalive_expired(struct retry_timer * timer,int over __unused)819 static void tcp_keepalive_expired ( struct retry_timer *timer,
820 int over __unused ) {
821 struct tcp_connection *tcp =
822 container_of ( timer, struct tcp_connection, keepalive );
823
824 DBGC ( tcp, "TCP %p sending keepalive\n", tcp );
825
826 /* Reset keepalive timer */
827 start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
828
829 /* Send keepalive. We do this only to preserve or restore
830 * state in intermediate devices (e.g. firewall NAT tables);
831 * we don't actually care about eliciting a response to verify
832 * that the peer is still alive. We therefore send just a
833 * pure ACK, to keep our transmit path simple.
834 */
835 tcp->flags |= TCP_ACK_PENDING;
836 tcp_xmit ( tcp );
837 }
838
839 /**
840 * Shutdown timer expired
841 *
842 * @v timer Shutdown timer
843 * @v over Failure indicator
844 */
tcp_wait_expired(struct retry_timer * timer,int over __unused)845 static void tcp_wait_expired ( struct retry_timer *timer, int over __unused ) {
846 struct tcp_connection *tcp =
847 container_of ( timer, struct tcp_connection, wait );
848
849 assert ( tcp->tcp_state == TCP_TIME_WAIT );
850
851 DBGC ( tcp, "TCP %p wait complete in %s for %08x..%08x %08x\n", tcp,
852 tcp_state ( tcp->tcp_state ), tcp->snd_seq,
853 ( tcp->snd_seq + tcp->snd_sent ), tcp->rcv_ack );
854
855 tcp->tcp_state = TCP_CLOSED;
856 tcp_dump_state ( tcp );
857 tcp_close ( tcp, 0 );
858 }
859
860 /**
861 * Send RST response to incoming packet
862 *
863 * @v in_tcphdr TCP header of incoming packet
864 * @ret rc Return status code
865 */
tcp_xmit_reset(struct tcp_connection * tcp,struct sockaddr_tcpip * st_dest,struct tcp_header * in_tcphdr)866 static int tcp_xmit_reset ( struct tcp_connection *tcp,
867 struct sockaddr_tcpip *st_dest,
868 struct tcp_header *in_tcphdr ) {
869 struct io_buffer *iobuf;
870 struct tcp_header *tcphdr;
871 int rc;
872
873 /* Allocate space for dataless TX buffer */
874 iobuf = alloc_iob ( TCP_MAX_HEADER_LEN );
875 if ( ! iobuf ) {
876 DBGC ( tcp, "TCP %p could not allocate iobuf for RST "
877 "%08x..%08x %08x\n", tcp, ntohl ( in_tcphdr->ack ),
878 ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ) );
879 return -ENOMEM;
880 }
881 iob_reserve ( iobuf, TCP_MAX_HEADER_LEN );
882
883 /* Construct RST response */
884 tcphdr = iob_push ( iobuf, sizeof ( *tcphdr ) );
885 memset ( tcphdr, 0, sizeof ( *tcphdr ) );
886 tcphdr->src = in_tcphdr->dest;
887 tcphdr->dest = in_tcphdr->src;
888 tcphdr->seq = in_tcphdr->ack;
889 tcphdr->ack = in_tcphdr->seq;
890 tcphdr->hlen = ( ( sizeof ( *tcphdr ) / 4 ) << 4 );
891 tcphdr->flags = ( TCP_RST | TCP_ACK );
892 tcphdr->win = htons ( 0 );
893 tcphdr->csum = tcpip_chksum ( iobuf->data, iob_len ( iobuf ) );
894
895 /* Dump header */
896 DBGC2 ( tcp, "TCP %p TX %d->%d %08x..%08x %08x %4d",
897 tcp, ntohs ( tcphdr->src ), ntohs ( tcphdr->dest ),
898 ntohl ( tcphdr->seq ), ( ntohl ( tcphdr->seq ) ),
899 ntohl ( tcphdr->ack ), 0 );
900 tcp_dump_flags ( tcp, tcphdr->flags );
901 DBGC2 ( tcp, "\n" );
902
903 /* Transmit packet */
904 if ( ( rc = tcpip_tx ( iobuf, &tcp_protocol, NULL, st_dest,
905 NULL, &tcphdr->csum ) ) != 0 ) {
906 DBGC ( tcp, "TCP %p could not transmit RST %08x..%08x %08x: "
907 "%s\n", tcp, ntohl ( in_tcphdr->ack ),
908 ntohl ( in_tcphdr->ack ), ntohl ( in_tcphdr->seq ),
909 strerror ( rc ) );
910 return rc;
911 }
912
913 return 0;
914 }
915
916 /***************************************************************************
917 *
918 * Receive data path
919 *
920 ***************************************************************************
921 */
922
923 /**
924 * Identify TCP connection by local port number
925 *
926 * @v local_port Local port
927 * @ret tcp TCP connection, or NULL
928 */
tcp_demux(unsigned int local_port)929 static struct tcp_connection * tcp_demux ( unsigned int local_port ) {
930 struct tcp_connection *tcp;
931
932 list_for_each_entry ( tcp, &tcp_conns, list ) {
933 if ( tcp->local_port == local_port )
934 return tcp;
935 }
936 return NULL;
937 }
938
939 /**
940 * Parse TCP received options
941 *
942 * @v tcp TCP connection (may be NULL)
943 * @v tcphdr TCP header
944 * @v hlen TCP header length
945 * @v options Options structure to fill in
946 * @ret rc Return status code
947 */
tcp_rx_opts(struct tcp_connection * tcp,const struct tcp_header * tcphdr,size_t hlen,struct tcp_options * options)948 static int tcp_rx_opts ( struct tcp_connection *tcp,
949 const struct tcp_header *tcphdr, size_t hlen,
950 struct tcp_options *options ) {
951 const void *data = ( ( ( void * ) tcphdr ) + sizeof ( *tcphdr ) );
952 const void *end = ( ( ( void * ) tcphdr ) + hlen );
953 const struct tcp_option *option;
954 unsigned int kind;
955 size_t remaining;
956 size_t min;
957
958 /* Sanity check */
959 assert ( hlen >= sizeof ( *tcphdr ) );
960
961 /* Parse options */
962 memset ( options, 0, sizeof ( *options ) );
963 while ( ( remaining = ( end - data ) ) ) {
964
965 /* Extract option code */
966 option = data;
967 kind = option->kind;
968
969 /* Handle single-byte options */
970 if ( kind == TCP_OPTION_END )
971 break;
972 if ( kind == TCP_OPTION_NOP ) {
973 data++;
974 continue;
975 }
976
977 /* Handle multi-byte options */
978 min = sizeof ( *option );
979 switch ( kind ) {
980 case TCP_OPTION_MSS:
981 /* Ignore received MSS */
982 break;
983 case TCP_OPTION_WS:
984 options->wsopt = data;
985 min = sizeof ( *options->wsopt );
986 break;
987 case TCP_OPTION_SACK_PERMITTED:
988 options->spopt = data;
989 min = sizeof ( *options->spopt );
990 break;
991 case TCP_OPTION_SACK:
992 /* Ignore received SACKs */
993 break;
994 case TCP_OPTION_TS:
995 options->tsopt = data;
996 min = sizeof ( *options->tsopt );
997 break;
998 default:
999 DBGC ( tcp, "TCP %p received unknown option %d\n",
1000 tcp, kind );
1001 break;
1002 }
1003 if ( remaining < min ) {
1004 DBGC ( tcp, "TCP %p received truncated option %d\n",
1005 tcp, kind );
1006 return -EINVAL;
1007 }
1008 if ( option->length < min ) {
1009 DBGC ( tcp, "TCP %p received underlength option %d\n",
1010 tcp, kind );
1011 return -EINVAL;
1012 }
1013 if ( option->length > remaining ) {
1014 DBGC ( tcp, "TCP %p received overlength option %d\n",
1015 tcp, kind );
1016 return -EINVAL;
1017 }
1018 data += option->length;
1019 }
1020
1021 return 0;
1022 }
1023
1024 /**
1025 * Consume received sequence space
1026 *
1027 * @v tcp TCP connection
1028 * @v seq_len Sequence space length to consume
1029 */
tcp_rx_seq(struct tcp_connection * tcp,uint32_t seq_len)1030 static void tcp_rx_seq ( struct tcp_connection *tcp, uint32_t seq_len ) {
1031 unsigned int sack;
1032
1033 /* Sanity check */
1034 assert ( seq_len > 0 );
1035
1036 /* Update acknowledgement number */
1037 tcp->rcv_ack += seq_len;
1038
1039 /* Update window */
1040 if ( tcp->rcv_win > seq_len ) {
1041 tcp->rcv_win -= seq_len;
1042 } else {
1043 tcp->rcv_win = 0;
1044 }
1045
1046 /* Update timestamp */
1047 tcp->ts_recent = tcp->ts_val;
1048
1049 /* Update SACK list */
1050 for ( sack = 0 ; sack < TCP_SACK_MAX ; sack++ ) {
1051 if ( tcp->sack[sack].left == tcp->sack[sack].right )
1052 continue;
1053 if ( tcp_cmp ( tcp->sack[sack].left, tcp->rcv_ack ) < 0 )
1054 tcp->sack[sack].left = tcp->rcv_ack;
1055 if ( tcp_cmp ( tcp->sack[sack].right, tcp->rcv_ack ) < 0 )
1056 tcp->sack[sack].right = tcp->rcv_ack;
1057 }
1058
1059 /* Mark ACK as pending */
1060 tcp->flags |= TCP_ACK_PENDING;
1061 }
1062
1063 /**
1064 * Handle TCP received SYN
1065 *
1066 * @v tcp TCP connection
1067 * @v seq SEQ value (in host-endian order)
1068 * @v options TCP options
1069 * @ret rc Return status code
1070 */
tcp_rx_syn(struct tcp_connection * tcp,uint32_t seq,struct tcp_options * options)1071 static int tcp_rx_syn ( struct tcp_connection *tcp, uint32_t seq,
1072 struct tcp_options *options ) {
1073
1074 /* Synchronise sequence numbers on first SYN */
1075 if ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) {
1076 tcp->rcv_ack = seq;
1077 if ( options->tsopt )
1078 tcp->flags |= TCP_TS_ENABLED;
1079 if ( options->spopt )
1080 tcp->flags |= TCP_SACK_ENABLED;
1081 if ( options->wsopt ) {
1082 tcp->snd_win_scale = options->wsopt->scale;
1083 tcp->rcv_win_scale = TCP_RX_WINDOW_SCALE;
1084 }
1085 DBGC ( tcp, "TCP %p using %stimestamps, %sSACK, TX window "
1086 "x%d, RX window x%d\n", tcp,
1087 ( ( tcp->flags & TCP_TS_ENABLED ) ? "" : "no " ),
1088 ( ( tcp->flags & TCP_SACK_ENABLED ) ? "" : "no " ),
1089 ( 1 << tcp->snd_win_scale ),
1090 ( 1 << tcp->rcv_win_scale ) );
1091 }
1092
1093 /* Ignore duplicate SYN */
1094 if ( seq != tcp->rcv_ack )
1095 return 0;
1096
1097 /* Acknowledge SYN */
1098 tcp_rx_seq ( tcp, 1 );
1099
1100 /* Mark SYN as received and start sending ACKs with each packet */
1101 tcp->tcp_state |= ( TCP_STATE_SENT ( TCP_ACK ) |
1102 TCP_STATE_RCVD ( TCP_SYN ) );
1103
1104 return 0;
1105 }
1106
1107 /**
1108 * Handle TCP received ACK
1109 *
1110 * @v tcp TCP connection
1111 * @v ack ACK value (in host-endian order)
1112 * @v win WIN value (in host-endian order)
1113 * @ret rc Return status code
1114 */
tcp_rx_ack(struct tcp_connection * tcp,uint32_t ack,uint32_t win)1115 static int tcp_rx_ack ( struct tcp_connection *tcp, uint32_t ack,
1116 uint32_t win ) {
1117 uint32_t ack_len = ( ack - tcp->snd_seq );
1118 size_t len;
1119 unsigned int acked_flags;
1120
1121 /* Check for out-of-range or old duplicate ACKs */
1122 if ( ack_len > tcp->snd_sent ) {
1123 DBGC ( tcp, "TCP %p received ACK for %08x..%08x, "
1124 "sent only %08x..%08x\n", tcp, tcp->snd_seq,
1125 ( tcp->snd_seq + ack_len ), tcp->snd_seq,
1126 ( tcp->snd_seq + tcp->snd_sent ) );
1127
1128 if ( TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
1129 /* Just ignore what might be old duplicate ACKs */
1130 return 0;
1131 } else {
1132 /* Send RST if an out-of-range ACK is received
1133 * on a not-yet-established connection, as per
1134 * RFC 793.
1135 */
1136 return -EINVAL;
1137 }
1138 }
1139
1140 /* Update window size */
1141 tcp->snd_win = win;
1142
1143 /* Hold off (or start) the keepalive timer, if applicable */
1144 if ( ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) )
1145 start_timer_fixed ( &tcp->keepalive, TCP_KEEPALIVE_DELAY );
1146
1147 /* Ignore ACKs that don't actually acknowledge any new data.
1148 * (In particular, do not stop the retransmission timer; this
1149 * avoids creating a sorceror's apprentice syndrome when a
1150 * duplicate ACK is received and we still have data in our
1151 * transmit queue.)
1152 */
1153 if ( ack_len == 0 )
1154 return 0;
1155
1156 /* Stop the retransmission timer */
1157 stop_timer ( &tcp->timer );
1158
1159 /* Determine acknowledged flags and data length */
1160 len = ack_len;
1161 acked_flags = ( TCP_FLAGS_SENDING ( tcp->tcp_state ) &
1162 ( TCP_SYN | TCP_FIN ) );
1163 if ( acked_flags ) {
1164 len--;
1165 pending_put ( &tcp->pending_flags );
1166 }
1167
1168 /* Update SEQ and sent counters */
1169 tcp->snd_seq = ack;
1170 tcp->snd_sent = 0;
1171
1172 /* Remove any acknowledged data from transmit queue */
1173 tcp_process_tx_queue ( tcp, len, NULL, 1 );
1174
1175 /* Mark SYN/FIN as acknowledged if applicable. */
1176 if ( acked_flags )
1177 tcp->tcp_state |= TCP_STATE_ACKED ( acked_flags );
1178
1179 /* Start sending FIN if we've had all possible data ACKed */
1180 if ( list_empty ( &tcp->tx_queue ) &&
1181 ( tcp->flags & TCP_XFER_CLOSED ) &&
1182 ! ( tcp->tcp_state & TCP_STATE_SENT ( TCP_FIN ) ) ) {
1183 tcp->tcp_state |= TCP_STATE_SENT ( TCP_FIN );
1184 pending_get ( &tcp->pending_flags );
1185 }
1186
1187 return 0;
1188 }
1189
1190 /**
1191 * Handle TCP received data
1192 *
1193 * @v tcp TCP connection
1194 * @v seq SEQ value (in host-endian order)
1195 * @v iobuf I/O buffer
1196 * @ret rc Return status code
1197 *
1198 * This function takes ownership of the I/O buffer.
1199 */
tcp_rx_data(struct tcp_connection * tcp,uint32_t seq,struct io_buffer * iobuf)1200 static int tcp_rx_data ( struct tcp_connection *tcp, uint32_t seq,
1201 struct io_buffer *iobuf ) {
1202 uint32_t already_rcvd;
1203 uint32_t len;
1204 int rc;
1205
1206 /* Ignore duplicate or out-of-order data */
1207 already_rcvd = ( tcp->rcv_ack - seq );
1208 len = iob_len ( iobuf );
1209 if ( already_rcvd >= len ) {
1210 free_iob ( iobuf );
1211 return 0;
1212 }
1213 iob_pull ( iobuf, already_rcvd );
1214 len -= already_rcvd;
1215
1216 /* Acknowledge new data */
1217 tcp_rx_seq ( tcp, len );
1218
1219 /* Deliver data to application */
1220 profile_start ( &tcp_xfer_profiler );
1221 if ( ( rc = xfer_deliver_iob ( &tcp->xfer, iobuf ) ) != 0 ) {
1222 DBGC ( tcp, "TCP %p could not deliver %08x..%08x: %s\n",
1223 tcp, seq, ( seq + len ), strerror ( rc ) );
1224 return rc;
1225 }
1226 profile_stop ( &tcp_xfer_profiler );
1227
1228 return 0;
1229 }
1230
1231 /**
1232 * Handle TCP received FIN
1233 *
1234 * @v tcp TCP connection
1235 * @v seq SEQ value (in host-endian order)
1236 * @ret rc Return status code
1237 */
tcp_rx_fin(struct tcp_connection * tcp,uint32_t seq)1238 static int tcp_rx_fin ( struct tcp_connection *tcp, uint32_t seq ) {
1239
1240 /* Ignore duplicate or out-of-order FIN */
1241 if ( seq != tcp->rcv_ack )
1242 return 0;
1243
1244 /* Acknowledge FIN */
1245 tcp_rx_seq ( tcp, 1 );
1246
1247 /* Mark FIN as received */
1248 tcp->tcp_state |= TCP_STATE_RCVD ( TCP_FIN );
1249
1250 /* Close connection */
1251 tcp_close ( tcp, 0 );
1252
1253 return 0;
1254 }
1255
1256 /**
1257 * Handle TCP received RST
1258 *
1259 * @v tcp TCP connection
1260 * @v seq SEQ value (in host-endian order)
1261 * @ret rc Return status code
1262 */
tcp_rx_rst(struct tcp_connection * tcp,uint32_t seq)1263 static int tcp_rx_rst ( struct tcp_connection *tcp, uint32_t seq ) {
1264
1265 /* Accept RST only if it falls within the window. If we have
1266 * not yet received a SYN, then we have no window to test
1267 * against, so fall back to checking that our SYN has been
1268 * ACKed.
1269 */
1270 if ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) {
1271 if ( ! tcp_in_window ( seq, tcp->rcv_ack, tcp->rcv_win ) )
1272 return 0;
1273 } else {
1274 if ( ! ( tcp->tcp_state & TCP_STATE_ACKED ( TCP_SYN ) ) )
1275 return 0;
1276 }
1277
1278 /* Abort connection */
1279 tcp->tcp_state = TCP_CLOSED;
1280 tcp_dump_state ( tcp );
1281 tcp_close ( tcp, -ECONNRESET );
1282
1283 DBGC ( tcp, "TCP %p connection reset by peer\n", tcp );
1284 return -ECONNRESET;
1285 }
1286
1287 /**
1288 * Enqueue received TCP packet
1289 *
1290 * @v tcp TCP connection
1291 * @v seq SEQ value (in host-endian order)
1292 * @v flags TCP flags
1293 * @v iobuf I/O buffer
1294 */
tcp_rx_enqueue(struct tcp_connection * tcp,uint32_t seq,uint8_t flags,struct io_buffer * iobuf)1295 static void tcp_rx_enqueue ( struct tcp_connection *tcp, uint32_t seq,
1296 uint8_t flags, struct io_buffer *iobuf ) {
1297 struct tcp_rx_queued_header *tcpqhdr;
1298 struct io_buffer *queued;
1299 size_t len;
1300 uint32_t seq_len;
1301 uint32_t nxt;
1302
1303 /* Calculate remaining flags and sequence length. Note that
1304 * SYN, if present, has already been processed by this point.
1305 */
1306 flags &= TCP_FIN;
1307 len = iob_len ( iobuf );
1308 seq_len = ( len + ( flags ? 1 : 0 ) );
1309 nxt = ( seq + seq_len );
1310
1311 /* Discard immediately (to save memory) if:
1312 *
1313 * a) we have not yet received a SYN (and so have no defined
1314 * receive window), or
1315 * b) the packet lies entirely outside the receive window, or
1316 * c) there is no further content to process.
1317 */
1318 if ( ( ! ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) ) ||
1319 ( tcp_cmp ( seq, tcp->rcv_ack + tcp->rcv_win ) >= 0 ) ||
1320 ( tcp_cmp ( nxt, tcp->rcv_ack ) < 0 ) ||
1321 ( seq_len == 0 ) ) {
1322 free_iob ( iobuf );
1323 return;
1324 }
1325
1326 /* Add internal header */
1327 tcpqhdr = iob_push ( iobuf, sizeof ( *tcpqhdr ) );
1328 tcpqhdr->seq = seq;
1329 tcpqhdr->nxt = nxt;
1330 tcpqhdr->flags = flags;
1331
1332 /* Add to RX queue */
1333 list_for_each_entry ( queued, &tcp->rx_queue, list ) {
1334 tcpqhdr = queued->data;
1335 if ( tcp_cmp ( seq, tcpqhdr->seq ) < 0 )
1336 break;
1337 }
1338 list_add_tail ( &iobuf->list, &queued->list );
1339 }
1340
1341 /**
1342 * Process receive queue
1343 *
1344 * @v tcp TCP connection
1345 */
tcp_process_rx_queue(struct tcp_connection * tcp)1346 static void tcp_process_rx_queue ( struct tcp_connection *tcp ) {
1347 struct io_buffer *iobuf;
1348 struct tcp_rx_queued_header *tcpqhdr;
1349 uint32_t seq;
1350 unsigned int flags;
1351 size_t len;
1352
1353 /* Process all applicable received buffers. Note that we
1354 * cannot use list_for_each_entry() to iterate over the RX
1355 * queue, since tcp_discard() may remove packets from the RX
1356 * queue while we are processing.
1357 */
1358 while ( ( iobuf = list_first_entry ( &tcp->rx_queue, struct io_buffer,
1359 list ) ) ) {
1360
1361 /* Stop processing when we hit the first gap */
1362 tcpqhdr = iobuf->data;
1363 if ( tcp_cmp ( tcpqhdr->seq, tcp->rcv_ack ) > 0 )
1364 break;
1365
1366 /* Strip internal header and remove from RX queue */
1367 list_del ( &iobuf->list );
1368 seq = tcpqhdr->seq;
1369 flags = tcpqhdr->flags;
1370 iob_pull ( iobuf, sizeof ( *tcpqhdr ) );
1371 len = iob_len ( iobuf );
1372
1373 /* Handle new data, if any */
1374 tcp_rx_data ( tcp, seq, iob_disown ( iobuf ) );
1375 seq += len;
1376
1377 /* Handle FIN, if present */
1378 if ( flags & TCP_FIN ) {
1379 tcp_rx_fin ( tcp, seq );
1380 seq++;
1381 }
1382 }
1383 }
1384
/**
 * Process received packet
 *
 * @v iobuf		I/O buffer
 * @v netdev		Network device (unused)
 * @v st_src		Partially-filled source address
 * @v st_dest		Partially-filled destination address (unused)
 * @v pshdr_csum	Pseudo-header checksum
 * @ret rc		Return status code
 *
 * The I/O buffer is consumed on every path: it is freed on the
 * discard path, and otherwise handed to tcp_rx_enqueue().
 *
 * Returns zero once the packet has been accepted for processing.
 * Returns -EINVAL for a malformed packet (too short, bad header
 * length, bad checksum), -ENOTCONN if no matching connection exists,
 * or the error reported by tcp_rx_opts(), tcp_rx_ack() or
 * tcp_rx_rst().
 *
 * Processing order matters: the ACK is handled first, then SYN, then
 * RST, and only then is any payload/FIN enqueued and the receive
 * queue processed.
 *
 * NOTE(review): the discard path returns without a matching
 * profile_stop() for tcp_rx_profiler - confirm this is intentional.
 */
static int tcp_rx ( struct io_buffer *iobuf,
		    struct net_device *netdev __unused,
		    struct sockaddr_tcpip *st_src,
		    struct sockaddr_tcpip *st_dest __unused,
		    uint16_t pshdr_csum ) {
	struct tcp_header *tcphdr = iobuf->data;
	struct tcp_connection *tcp;
	struct tcp_options options;
	size_t hlen;
	uint16_t csum;
	uint32_t seq;
	uint32_t ack;
	uint16_t raw_win;
	uint32_t win;
	unsigned int flags;
	size_t len;
	uint32_t seq_len;
	size_t old_xfer_window;
	int rc;

	/* Start profiling */
	profile_start ( &tcp_rx_profiler );

	/* Sanity check packet: it must hold at least a basic header */
	if ( iob_len ( iobuf ) < sizeof ( *tcphdr ) ) {
		DBG ( "TCP packet too short at %zd bytes (min %zd bytes)\n",
		      iob_len ( iobuf ), sizeof ( *tcphdr ) );
		rc = -EINVAL;
		goto discard;
	}
	/* Extract the header length: the masked field is shifted down
	 * (division by 16) and converted from 4-byte units to bytes.
	 */
	hlen = ( ( tcphdr->hlen & TCP_MASK_HLEN ) / 16 ) * 4;
	if ( hlen < sizeof ( *tcphdr ) ) {
		DBG ( "TCP header too short at %zd bytes (min %zd bytes)\n",
		      hlen, sizeof ( *tcphdr ) );
		rc = -EINVAL;
		goto discard;
	}
	if ( hlen > iob_len ( iobuf ) ) {
		DBG ( "TCP header too long at %zd bytes (max %zd bytes)\n",
		      hlen, iob_len ( iobuf ) );
		rc = -EINVAL;
		goto discard;
	}
	/* Checksum covers the whole packet, including the checksum
	 * field itself, so a valid packet yields zero.
	 */
	csum = tcpip_continue_chksum ( pshdr_csum, iobuf->data,
				       iob_len ( iobuf ) );
	if ( csum != 0 ) {
		DBG ( "TCP checksum incorrect (is %04x including checksum "
		      "field, should be 0000)\n", csum );
		rc = -EINVAL;
		goto discard;
	}

	/* Parse parameters from header and strip header.  Note that
	 * tcp may be NULL at this point if no connection matches the
	 * destination port; this is checked only after the header has
	 * been dumped below.
	 */
	tcp = tcp_demux ( ntohs ( tcphdr->dest ) );
	seq = ntohl ( tcphdr->seq );
	ack = ntohl ( tcphdr->ack );
	raw_win = ntohs ( tcphdr->win );
	flags = tcphdr->flags;
	if ( ( rc = tcp_rx_opts ( tcp, tcphdr, hlen, &options ) ) != 0 )
		goto discard;
	if ( tcp && options.tsopt )
		tcp->ts_val = ntohl ( options.tsopt->tsval );
	iob_pull ( iobuf, hlen );
	len = iob_len ( iobuf );
	/* SYN and FIN each occupy one unit of sequence space in
	 * addition to the payload.
	 */
	seq_len = ( len + ( ( flags & TCP_SYN ) ? 1 : 0 ) +
		    ( ( flags & TCP_FIN ) ? 1 : 0 ) );

	/* Dump header */
	DBGC2 ( tcp, "TCP %p RX %d<-%d           %08x %08x..%08x %4zd",
		tcp, ntohs ( tcphdr->dest ), ntohs ( tcphdr->src ),
		ntohl ( tcphdr->ack ), ntohl ( tcphdr->seq ),
		( ntohl ( tcphdr->seq ) + seq_len ), len );
	tcp_dump_flags ( tcp, tcphdr->flags );
	DBGC2 ( tcp, "\n" );

	/* If no connection was found, silently drop packet */
	if ( ! tcp ) {
		rc = -ENOTCONN;
		goto discard;
	}

	/* Record old data-transfer window, so that a change caused by
	 * this packet can be detected at the end.
	 */
	old_xfer_window = tcp_xfer_window ( tcp );

	/* Handle ACK, if present */
	if ( flags & TCP_ACK ) {
		/* NOTE(review): raw_win is promoted to int before the
		 * shift, so this relies on snd_win_scale being small
		 * enough that the shift cannot overflow - confirm
		 * that the stored scale is bounded.
		 */
		win = ( raw_win << tcp->snd_win_scale );
		if ( ( rc = tcp_rx_ack ( tcp, ack, win ) ) != 0 ) {
			/* An unacceptable ACK is answered with a RST */
			tcp_xmit_reset ( tcp, st_src, tcphdr );
			goto discard;
		}
	}

	/* Force an ACK if this packet is out of order */
	if ( ( tcp->tcp_state & TCP_STATE_RCVD ( TCP_SYN ) ) &&
	     ( seq != tcp->rcv_ack ) ) {
		tcp->flags |= TCP_ACK_PENDING;
	}

	/* Handle SYN, if present.  The SYN occupies one unit of
	 * sequence space, so anything following it starts at seq+1.
	 */
	if ( flags & TCP_SYN ) {
		tcp_rx_syn ( tcp, seq, &options );
		seq++;
	}

	/* Handle RST, if present */
	if ( flags & TCP_RST ) {
		if ( ( rc = tcp_rx_rst ( tcp, seq ) ) != 0 )
			goto discard;
	}

	/* Enqueue received data (buffer ownership is transferred) */
	tcp_rx_enqueue ( tcp, seq, flags, iob_disown ( iobuf ) );

	/* Process receive queue */
	tcp_process_rx_queue ( tcp );

	/* Dump out any state change as a result of the received packet */
	tcp_dump_state ( tcp );

	/* Schedule transmission of ACK (and any pending data).  If we
	 * have received any out-of-order packets (i.e. if the receive
	 * queue remains non-empty after processing) then send the ACK
	 * immediately in order to trigger Fast Retransmission.
	 */
	if ( list_empty ( &tcp->rx_queue ) ) {
		process_add ( &tcp->process );
	} else {
		tcp_xmit_sack ( tcp, seq );
	}

	/* If this packet was the last we expect to receive, set up
	 * timer to expire and cause the connection to be freed.
	 */
	if ( TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) {
		stop_timer ( &tcp->wait );
		start_timer_fixed ( &tcp->wait, ( 2 * TCP_MSL ) );
	}

	/* Notify application if window has changed */
	if ( tcp_xfer_window ( tcp ) != old_xfer_window )
		xfer_window_changed ( &tcp->xfer );

	profile_stop ( &tcp_rx_profiler );
	return 0;

 discard:
	/* Free received packet */
	free_iob ( iobuf );
	return rc;
}
1546
/** TCP protocol
 *
 * Associates the IP protocol number IP_TCP with the tcp_rx() receive
 * handler.  The __tcpip_protocol attribute presumably registers this
 * structure in the table of TCP/IP transport protocols - TODO confirm.
 */
struct tcpip_protocol tcp_protocol __tcpip_protocol = {
	.name = "TCP",
	.rx = tcp_rx,
	.tcpip_proto = IP_TCP,
};
1553
1554 /**
1555 * Discard some cached TCP data
1556 *
1557 * @ret discarded Number of cached items discarded
1558 */
tcp_discard(void)1559 static unsigned int tcp_discard ( void ) {
1560 struct tcp_connection *tcp;
1561 struct io_buffer *iobuf;
1562 unsigned int discarded = 0;
1563
1564 /* Try to drop one queued RX packet from each connection */
1565 list_for_each_entry ( tcp, &tcp_conns, list ) {
1566 list_for_each_entry_reverse ( iobuf, &tcp->rx_queue, list ) {
1567
1568 /* Remove packet from queue */
1569 list_del ( &iobuf->list );
1570 free_iob ( iobuf );
1571
1572 /* Report discard */
1573 discarded++;
1574 break;
1575 }
1576 }
1577
1578 return discarded;
1579 }
1580
/** TCP cache discarder
 *
 * Offers tcp_discard() as a way to reclaim memory by dropping queued
 * received packets.  Declared with CACHE_NORMAL, presumably the
 * discarder's cost/priority class - TODO confirm.
 */
struct cache_discarder tcp_discarder __cache_discarder ( CACHE_NORMAL ) = {
	.discard = tcp_discard,
};
1585
1586 /**
1587 * Find first TCP connection that has not yet been closed
1588 *
1589 * @ret tcp First unclosed connection, or NULL
1590 */
tcp_first_unclosed(void)1591 static struct tcp_connection * tcp_first_unclosed ( void ) {
1592 struct tcp_connection *tcp;
1593
1594 /* Find first connection which has not yet been closed */
1595 list_for_each_entry ( tcp, &tcp_conns, list ) {
1596 if ( ! ( tcp->flags & TCP_XFER_CLOSED ) )
1597 return tcp;
1598 }
1599 return NULL;
1600 }
1601
1602 /**
1603 * Find first TCP connection that has not yet finished all operations
1604 *
1605 * @ret tcp First unfinished connection, or NULL
1606 */
tcp_first_unfinished(void)1607 static struct tcp_connection * tcp_first_unfinished ( void ) {
1608 struct tcp_connection *tcp;
1609
1610 /* Find first connection which has not yet closed gracefully,
1611 * or which still has a pending transmission (e.g. to ACK the
1612 * received FIN).
1613 */
1614 list_for_each_entry ( tcp, &tcp_conns, list ) {
1615 if ( ( ! TCP_CLOSED_GRACEFULLY ( tcp->tcp_state ) ) ||
1616 process_running ( &tcp->process ) ) {
1617 return tcp;
1618 }
1619 }
1620 return NULL;
1621 }
1622
1623 /**
1624 * Shut down all TCP connections
1625 *
1626 */
tcp_shutdown(int booting __unused)1627 static void tcp_shutdown ( int booting __unused ) {
1628 struct tcp_connection *tcp;
1629 unsigned long start;
1630 unsigned long elapsed;
1631
1632 /* Initiate a graceful close of all connections, allowing for
1633 * the fact that the connection list may change as we do so.
1634 */
1635 while ( ( tcp = tcp_first_unclosed() ) ) {
1636 DBGC ( tcp, "TCP %p closing for shutdown\n", tcp );
1637 tcp_close ( tcp, -ECANCELED );
1638 }
1639
1640 /* Wait for all connections to finish closing gracefully */
1641 start = currticks();
1642 while ( ( tcp = tcp_first_unfinished() ) &&
1643 ( ( elapsed = ( currticks() - start ) ) < TCP_FINISH_TIMEOUT )){
1644 step();
1645 }
1646
1647 /* Forcibly close any remaining connections */
1648 while ( ( tcp = list_first_entry ( &tcp_conns, struct tcp_connection,
1649 list ) ) != NULL ) {
1650 tcp->tcp_state = TCP_CLOSED;
1651 tcp_dump_state ( tcp );
1652 tcp_close ( tcp, -ECANCELED );
1653 }
1654 }
1655
/** TCP shutdown function
 *
 * Provides only a shutdown handler, tcp_shutdown(); no startup handler
 * is set.  Declared with STARTUP_LATE, presumably controlling its
 * ordering relative to other startup/shutdown functions - TODO
 * confirm.
 */
struct startup_fn tcp_startup_fn __startup_fn ( STARTUP_LATE ) = {
	.name = "tcp",
	.shutdown = tcp_shutdown,
};
1661
1662 /***************************************************************************
1663 *
1664 * Data transfer interface
1665 *
1666 ***************************************************************************
1667 */
1668
/**
 * Close interface
 *
 * @v tcp		TCP connection
 * @v rc		Reason for close
 *
 * Handler for intf_close on the data transfer interface: closes the
 * connection and then attempts an immediate transmission.
 */
static void tcp_xfer_close ( struct tcp_connection *tcp, int rc ) {

	/* Close data transfer interface */
	tcp_close ( tcp, rc );

	/* Transmit FIN, if possible.  This must follow the close, so
	 * that the transmission reflects the updated connection state.
	 */
	tcp_xmit ( tcp );
}
1683
1684 /**
1685 * Deliver datagram as I/O buffer
1686 *
1687 * @v tcp TCP connection
1688 * @v iobuf Datagram I/O buffer
1689 * @v meta Data transfer metadata
1690 * @ret rc Return status code
1691 */
tcp_xfer_deliver(struct tcp_connection * tcp,struct io_buffer * iobuf,struct xfer_metadata * meta __unused)1692 static int tcp_xfer_deliver ( struct tcp_connection *tcp,
1693 struct io_buffer *iobuf,
1694 struct xfer_metadata *meta __unused ) {
1695
1696 /* Enqueue packet */
1697 list_add_tail ( &iobuf->list, &tcp->tx_queue );
1698
1699 /* Each enqueued packet is a pending operation */
1700 pending_get ( &tcp->pending_data );
1701
1702 /* Transmit data, if possible */
1703 tcp_xmit ( tcp );
1704
1705 return 0;
1706 }
1707
1708 /**
1709 * Report job progress
1710 *
1711 * @v tcp TCP connection
1712 * @v progress Progress report to fill in
1713 * @ret ongoing_rc Ongoing job status code (if known)
1714 */
tcp_progress(struct tcp_connection * tcp,struct job_progress * progress)1715 static int tcp_progress ( struct tcp_connection *tcp,
1716 struct job_progress *progress ) {
1717
1718 /* Report connection in progress if applicable */
1719 if ( ! TCP_HAS_BEEN_ESTABLISHED ( tcp->tcp_state ) ) {
1720 snprintf ( progress->message, sizeof ( progress->message ),
1721 "connecting" );
1722 }
1723
1724 return 0;
1725 }
1726
/** TCP data transfer interface operations
 *
 * Maps each supported interface operation onto its TCP handler:
 * data delivery, flow-control window query, job progress reporting,
 * and interface close.
 */
static struct interface_operation tcp_xfer_operations[] = {
	INTF_OP ( xfer_deliver, struct tcp_connection *, tcp_xfer_deliver ),
	INTF_OP ( xfer_window, struct tcp_connection *, tcp_xfer_window ),
	INTF_OP ( job_progress, struct tcp_connection *, tcp_progress ),
	INTF_OP ( intf_close, struct tcp_connection *, tcp_xfer_close ),
};
1734
/** TCP data transfer interface descriptor
 *
 * Binds the operations table to the "xfer" interface member of
 * struct tcp_connection.
 */
static struct interface_descriptor tcp_xfer_desc =
	INTF_DESC ( struct tcp_connection, xfer, tcp_xfer_operations );
1738
1739 /***************************************************************************
1740 *
1741 * Openers
1742 *
1743 ***************************************************************************
1744 */
1745
/** TCP socket opener
 *
 * Offers tcp_open() as the opener for sockets with TCP_SOCK_STREAM
 * semantics.
 */
struct socket_opener tcp_socket_opener __socket_opener = {
	.semantics = TCP_SOCK_STREAM,
	.open = tcp_open,
};
1751
/** Linkage hack
 *
 * A global holding the value TCP_SOCK_STREAM.  Presumably this exists
 * so that code referring to the symbol forces this object file to be
 * linked in - TODO confirm against the socket header.
 */
int tcp_sock_stream = TCP_SOCK_STREAM;
1754
1755 /**
1756 * Open TCP URI
1757 *
1758 * @v xfer Data transfer interface
1759 * @v uri URI
1760 * @ret rc Return status code
1761 */
tcp_open_uri(struct interface * xfer,struct uri * uri)1762 static int tcp_open_uri ( struct interface *xfer, struct uri *uri ) {
1763 struct sockaddr_tcpip peer;
1764
1765 /* Sanity check */
1766 if ( ! uri->host )
1767 return -EINVAL;
1768
1769 memset ( &peer, 0, sizeof ( peer ) );
1770 peer.st_port = htons ( uri_port ( uri, 0 ) );
1771 return xfer_open_named_socket ( xfer, SOCK_STREAM,
1772 ( struct sockaddr * ) &peer,
1773 uri->host, NULL );
1774 }
1775
/** TCP URI opener
 *
 * Offers tcp_open_uri() as the opener for URIs using the "tcp"
 * scheme.
 */
struct uri_opener tcp_uri_opener __uri_opener = {
	.scheme = "tcp",
	.open = tcp_open_uri,
};
1781
1782