1 #ifndef _IPXE_TCP_H
2 #define _IPXE_TCP_H
3 
4 /** @file
5  *
6  * TCP protocol
7  *
8  * This file defines the iPXE TCP API.
9  *
10  */
11 
12 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
13 
14 #include <ipxe/tcpip.h>
15 
/**
 * TCP segment header
 *
 * Fixed-size leading portion of a TCP header (20 bytes with natural
 * alignment, no padding).
 */
struct tcp_header {
	/** Source port */
	uint16_t src;
	/** Destination port */
	uint16_t dest;
	/** Sequence number */
	uint32_t seq;
	/** Acknowledgement number */
	uint32_t ack;
	/** Header length (upper 4 bits), reserved (lower 4 bits) */
	uint8_t hlen;
	/** Flags (TCP_xxx bit values) */
	uint8_t flags;
	/** Advertised window */
	uint16_t win;
	/** Checksum */
	uint16_t csum;
	/** Urgent pointer */
	uint16_t urg;
};
30 
31 /** @defgroup tcpopts TCP options
32  * @{
33  */
34 
/** TCP option kind: end of options list */
#define TCP_OPTION_END	0

/** TCP option kind: no-operation, used as a padding byte */
#define TCP_OPTION_NOP	1
40 
/** Generic TCP option
 *
 * Two-byte prefix (kind, then length) shared by the option
 * structures defined in this file.
 */
struct tcp_option {
	/** Option kind (a TCP_OPTION_xxx code) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
} __attribute__ (( packed ));
46 
/** TCP maximum segment size (MSS) option */
struct tcp_mss_option {
	/** Option kind (TCP_OPTION_MSS) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
	/** Maximum segment size */
	uint16_t mss;
} __attribute__ (( packed ));

/** TCP option kind: maximum segment size */
#define TCP_OPTION_MSS	2
56 
/** TCP window scale option */
struct tcp_window_scale_option {
	/** Option kind (TCP_OPTION_WS) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
	/** Window scale (shift count) */
	uint8_t scale;
} __attribute__ (( packed ));

/** Padded TCP window scale option (used for sending)
 *
 * A single leading NOP byte brings the three-byte option up to four
 * bytes.
 */
struct tcp_window_scale_padded_option {
	/** Padding byte (TCP_OPTION_NOP) */
	uint8_t nop;
	/** Window scale option */
	struct tcp_window_scale_option wsopt;
} __attribute__ (( packed ));

/** Code for the TCP window scale option */
#define TCP_OPTION_WS 3
72 
/** Advertised TCP window scale
 *
 * A scale factor of 2**9 gives a maximum window of 32MB, enough for
 * Gigabit-speed transfers at a 200ms RTT.  The smallest window that
 * can then be advertised is 512 bytes, which is still less than a
 * single packet.
 */
#define TCP_RX_WINDOW_SCALE	9
81 
/** TCP selective acknowledgement (SACK) permitted option */
struct tcp_sack_permitted_option {
	/** Option kind (TCP_OPTION_SACK_PERMITTED) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
} __attribute__ (( packed ));

/** Padded TCP SACK permitted option (used for sending)
 *
 * Two leading NOP bytes bring the two-byte option up to four bytes.
 */
struct tcp_sack_permitted_padded_option {
	/** Padding bytes (TCP_OPTION_NOP) */
	uint8_t nop[2];
	/** SACK permitted option */
	struct tcp_sack_permitted_option spopt;
} __attribute__ (( packed ));

/** TCP option kind: selective acknowledgement permitted */
#define TCP_OPTION_SACK_PERMITTED	4
96 
/** TCP selective acknowledgement (SACK) option header */
struct tcp_sack_option {
	/** Option kind (TCP_OPTION_SACK) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
} __attribute__ (( packed ));

/** TCP selective acknowledgement block */
struct tcp_sack_block {
	/** Left edge of block */
	uint32_t left;
	/** Right edge of block */
	uint32_t right;
} __attribute__ (( packed ));

/** Maximum number of selective acknowledgement blocks
 *
 * Limited to three so that room remains for the TCP timestamp
 * option.
 */
#define TCP_SACK_MAX	3

/** Padded TCP SACK option (used for sending)
 *
 * Two leading NOP bytes bring the two-byte option header up to four
 * bytes.
 */
struct tcp_sack_padded_option {
	/** Padding bytes (TCP_OPTION_NOP) */
	uint8_t nop[2];
	/** SACK option header */
	struct tcp_sack_option sackopt;
} __attribute__ (( packed ));

/** TCP option kind: selective acknowledgement */
#define TCP_OPTION_SACK	5
123 
/** TCP timestamp option */
struct tcp_timestamp_option {
	/** Option kind (TCP_OPTION_TS) */
	uint8_t kind;
	/** Option length */
	uint8_t length;
	/** Timestamp value */
	uint32_t tsval;
	/** Timestamp echo reply */
	uint32_t tsecr;
} __attribute__ (( packed ));

/** Padded TCP timestamp option (used for sending)
 *
 * Two leading NOP bytes bring the ten-byte option up to twelve
 * bytes.
 */
struct tcp_timestamp_padded_option {
	/** Padding bytes (TCP_OPTION_NOP) */
	uint8_t nop[2];
	/** Timestamp option */
	struct tcp_timestamp_option tsopt;
} __attribute__ (( packed ));

/** TCP option kind: timestamp */
#define TCP_OPTION_TS	8
140 
/** Parsed TCP options
 *
 * Holds one pointer per recognised option; the "if present" wording
 * on each member suggests a pointer is NULL when the option was
 * absent (confirm against the parser).
 */
struct tcp_options {
	/** Window scale option, if present */
	const struct tcp_window_scale_option *wsopt;

	/** SACK permitted option, if present */
	const struct tcp_sack_permitted_option *spopt;

	/** Timestamp option, if present */
	const struct tcp_timestamp_option *tsopt;
};
150 
151 /** @} */
152 
/*
 * TCP flags
 *
 * Bit masks for the flags byte of the TCP header, listed from the
 * most significant bit to the least significant bit.
 */
#define TCP_CWR		0x80	/* Congestion window reduced */
#define TCP_ECE		0x40	/* ECN echo */
#define TCP_URG		0x20	/* Urgent pointer is significant */
#define TCP_ACK		0x10	/* Acknowledgement number is significant */
#define TCP_PSH		0x08	/* Push */
#define TCP_RST		0x04	/* Reset the connection */
#define TCP_SYN		0x02	/* Synchronise sequence numbers */
#define TCP_FIN		0x01	/* No more data from sender */
164 
165 /**
166 * @defgroup tcpstates TCP states
167 *
168 * The TCP state is defined by a combination of the flags that have
169 * been sent to the peer, the flags that have been acknowledged by the
170 * peer, and the flags that have been received from the peer.
171 *
172 * @{
173 */
174 
/** TCP flags that have been sent in outgoing packets
 *
 * Stored in bits 0-7 of the state value.
 */
#define TCP_STATE_SENT(flags)	( (flags) << 0 )
#define TCP_FLAGS_SENT(state)	( ( (state) >> 0 ) & 0xff )

/** TCP flags that have been acknowledged by the peer
 *
 * Stored in bits 8-15 of the state value.  Only SYN and FIN are
 * meaningful here.
 */
#define TCP_STATE_ACKED(flags)	( (flags) << 8 )
#define TCP_FLAGS_ACKED(state)	( ( (state) >> 8 ) & 0xff )

/** TCP flags that have been received from the peer
 *
 * Stored in bits 16-23 of the state value.  Only SYN and FIN are
 * meaningful here; once SYN has been received, we should always be
 * sending ACK.
 */
#define TCP_STATE_RCVD(flags)	( (flags) << 16 )
#define TCP_FLAGS_RCVD(state)	( ( (state) >> 16 ) & 0xff )

/** TCP flags that are currently being sent in outgoing packets
 *
 * These are the flags that have been sent but not yet acknowledged.
 */
#define TCP_FLAGS_SENDING(state)					\
	( TCP_FLAGS_SENT ( state ) & ~TCP_FLAGS_ACKED ( state ) )
197 
/** CLOSED
 *
 * The connection has not yet been used for anything.
 */
#define TCP_CLOSED	TCP_RST

/** LISTEN
 *
 * Not currently used as a state, since listening connections are
 * not supported.  Given a unique value to avoid compiler warnings.
 */
#define TCP_LISTEN	0

/** SYN_SENT
 *
 * SYN has been sent; nothing has yet been received or acknowledged.
 */
#define TCP_SYN_SENT							\
	( TCP_STATE_SENT ( TCP_SYN ) )

/** SYN_RCVD
 *
 * SYN has been sent but not acknowledged; SYN has been received.
 */
#define TCP_SYN_RCVD							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK ) |			\
	  TCP_STATE_RCVD ( TCP_SYN ) )

/** ESTABLISHED
 *
 * SYN has been sent and acknowledged; SYN has been received.
 */
#define TCP_ESTABLISHED							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK ) |			\
	  TCP_STATE_ACKED ( TCP_SYN ) |					\
	  TCP_STATE_RCVD ( TCP_SYN ) )

/** FIN_WAIT_1
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent but not acknowledged, and FIN has not been received.
 *
 * RFC 793 shows that we can enter FIN_WAIT_1 without having had SYN
 * acknowledged, i.e. if the application closes the connection after
 * sending and receiving SYN, but before having had SYN acknowledged.
 * However, we have to *pretend* that SYN has been acknowledged
 * anyway, otherwise we end up sending SYN and FIN in the same
 * sequence number slot.  Therefore, when we transition from SYN_RCVD
 * to FIN_WAIT_1, we have to remember to set TCP_STATE_ACKED(TCP_SYN)
 * and increment our sequence number.
 */
#define TCP_FIN_WAIT_1							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK | TCP_FIN ) |		\
	  TCP_STATE_ACKED ( TCP_SYN ) |					\
	  TCP_STATE_RCVD ( TCP_SYN ) )

/** FIN_WAIT_2
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent and acknowledged, and FIN has not been received.
 */
#define TCP_FIN_WAIT_2							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK | TCP_FIN ) |		\
	  TCP_STATE_ACKED ( TCP_SYN | TCP_FIN ) |			\
	  TCP_STATE_RCVD ( TCP_SYN ) )

/** CLOSING / LAST_ACK
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent but not acknowledged, and FIN has been received.
 *
 * This state actually encompasses both CLOSING and LAST_ACK; they are
 * identical with the definition of state that we use.  I don't
 * *believe* that they need to be distinguished.
 */
#define TCP_CLOSING_OR_LAST_ACK						\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK | TCP_FIN ) |		\
	  TCP_STATE_ACKED ( TCP_SYN ) |					\
	  TCP_STATE_RCVD ( TCP_SYN | TCP_FIN ) )

/** TIME_WAIT
 *
 * SYN has been sent and acknowledged, SYN has been received, FIN has
 * been sent and acknowledged, and FIN has been received.
 */
#define TCP_TIME_WAIT							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK | TCP_FIN ) |		\
	  TCP_STATE_ACKED ( TCP_SYN | TCP_FIN ) |			\
	  TCP_STATE_RCVD ( TCP_SYN | TCP_FIN ) )

/** CLOSE_WAIT
 *
 * SYN has been sent and acknowledged, SYN has been received, and FIN
 * has been received.
 */
#define TCP_CLOSE_WAIT							\
	( TCP_STATE_SENT ( TCP_SYN | TCP_ACK ) |			\
	  TCP_STATE_ACKED ( TCP_SYN ) |					\
	  TCP_STATE_RCVD ( TCP_SYN | TCP_FIN ) )
290 
/** Can send data in current state
 *
 * Data may be sent if and only if our SYN has been acknowledged and
 * we have not yet sent our FIN.
 */
#define TCP_CAN_SEND_DATA(state)					\
	( ( (state) &							\
	    ( TCP_STATE_ACKED ( TCP_SYN ) |				\
	      TCP_STATE_SENT ( TCP_FIN ) ) ) ==				\
	  TCP_STATE_ACKED ( TCP_SYN ) )

/** Have ever been fully established
 *
 * The connection has been fully established once we have both
 * received a SYN and had our own SYN acknowledged.
 */
#define TCP_HAS_BEEN_ESTABLISHED(state)					\
	( ( (state) &							\
	    ( TCP_STATE_ACKED ( TCP_SYN ) |				\
	      TCP_STATE_RCVD ( TCP_SYN ) ) ) ==				\
	  ( TCP_STATE_ACKED ( TCP_SYN ) | TCP_STATE_RCVD ( TCP_SYN ) ) )

/** Have closed gracefully
 *
 * The connection has closed gracefully once we have both received a
 * FIN and had our own FIN acknowledged.
 */
#define TCP_CLOSED_GRACEFULLY(state)					\
	( ( (state) &							\
	    ( TCP_STATE_ACKED ( TCP_FIN ) |				\
	      TCP_STATE_RCVD ( TCP_FIN ) ) ) ==				\
	  ( TCP_STATE_ACKED ( TCP_FIN ) | TCP_STATE_RCVD ( TCP_FIN ) ) )
320 
321 /** @} */
322 
/** Mask selecting the header length bits of the TCP header's
 * @c hlen byte (the upper four bits)
 */
#define TCP_MASK_HLEN	0xf0

/** Smallest port number on which a TCP connection can listen */
#define TCP_MIN_PORT	1
328 
/**
 * Maximum advertised TCP window size
 *
 * The maximum bandwidth achievable on any link is limited by
 *
 *    max_bandwidth * round_trip_time = tcp_window
 *
 * Rough expectations for achievable bandwidth over various links:
 *
 *    a) Gigabit LAN: expected bandwidth 125MB/s, typical RTT 0.5ms,
 *       minimum required window 64kB
 *
 *    b) Home Internet connection: expected bandwidth 10MB/s, typical
 *       RTT 25ms, minimum required window 256kB
 *
 *    c) WAN: expected bandwidth 2MB/s, typical RTT 100ms, minimum
 *       required window 200kB
 *
 * The largest possible TCP window is 1GB (using the maximum window
 * scale of 2**14).  It is nevertheless advisable to keep the window
 * as small as possible without limiting bandwidth, because when a
 * packet is lost the window size is the maximum amount of data that
 * will need to be retransmitted.
 *
 * We therefore choose a maximum window size of 256kB.
 */
#define TCP_MAX_WINDOW_SIZE						\
	( 256 * 1024 )
357 
/**
 * Path MTU
 *
 * IPv6 requires every data link layer to support a datagram size of
 * 1280 bytes.  We use this as our maximum transmitted datagram size,
 * on the assumption that any practical link layer we encounter will
 * allow it.  This is very conservative in practice, but the cost of
 * being conservative is insignificant because the amount of data we
 * transmit (rather than receive) is negligible.
 *
 * Within these 1280 bytes we leave room for an IPv6 header, a TCP
 * header, and a (padded) TCP timestamp option, so the resulting
 * value is 1208 bytes.
 */
#define TCP_PATH_MTU							\
	( 1280								\
	  - 40 /* IPv6 */						\
	  - 20 /* TCP */						\
	  - 12 /* TCP timestamp */ )
374 
/** TCP maximum segment lifetime
 *
 * Two minutes, as per RFC 793, expressed in timer ticks.
 */
#define TCP_MSL								\
	( 2 * 60 * TICKS_PER_SEC )

/**
 * TCP keepalive period
 *
 * Once an established connection has been inactive for this long
 * (15 seconds, expressed in timer ticks), we send a keepalive ACK.
 */
#define TCP_KEEPALIVE_DELAY						\
	( 15 * TICKS_PER_SEC )
388 
/**
 * TCP maximum header length
 *
 * Sum of the maximum link-layer and network-layer header length, the
 * fixed TCP header, and the MSS, padded window scale and padded
 * timestamp options.  SACK options are not part of this sum.
 */
#define TCP_MAX_HEADER_LEN						\
	( MAX_LL_NET_HEADER_LEN +					\
	  sizeof ( struct tcp_header ) +				\
	  sizeof ( struct tcp_mss_option ) +				\
	  sizeof ( struct tcp_window_scale_padded_option ) +		\
	  sizeof ( struct tcp_timestamp_padded_option ) )
399 
/**
 * Compare TCP sequence numbers
 *
 * @v seq1		Sequence number 1
 * @v seq2		Sequence number 2
 * @ret diff		Sequence difference
 *
 * Analogous to memcmp(), returns an integer less than, equal to, or
 * greater than zero if @c seq1 is found, respectively, to be before,
 * equal to, or after @c seq2.
 *
 * The subtraction is performed in unsigned 32-bit arithmetic (and so
 * wraps modulo 2**32) before being reinterpreted as a signed value.
 * Sequence numbers on either side of the wraparound point therefore
 * compare correctly, provided that they are less than 2**31 apart.
 */
static inline __attribute__ (( always_inline )) int32_t
tcp_cmp ( uint32_t seq1, uint32_t seq2 ) {
	return ( ( int32_t ) ( seq1 - seq2 ) );
}
415 
/**
 * Check if TCP sequence number lies within window
 *
 * @v seq		Sequence number
 * @v start		Start of window
 * @v len		Length of window
 * @ret in_window	Sequence number is within window
 *
 * The window covers the @c len sequence numbers beginning at
 * @c start.  The offset of @c seq from @c start is calculated in
 * unsigned 32-bit arithmetic, so a window that spans the sequence
 * number wraparound point is handled correctly and a sequence number
 * before @c start is treated as lying outside the window.  A
 * zero-length window contains no sequence numbers.
 */
static inline int tcp_in_window ( uint32_t seq, uint32_t start,
				  uint32_t len ) {
	return ( ( seq - start ) < len );
}
428 
/** TCP finish wait time
 *
 * One second, expressed in timer ticks.  Kept short so that a slowly
 * responding server cannot substantially delay a call to shutdown().
 */
#define TCP_FINISH_TIMEOUT						\
	( 1 * TICKS_PER_SEC )
435 
436 extern struct tcpip_protocol tcp_protocol __tcpip_protocol;
437 
438 #endif /* _IPXE_TCP_H */
439