xref: /freebsd/sys/netinet/khelp/h_ertt.c (revision 9eb0e832)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009-2010
5  * 	Swinburne University of Technology, Melbourne, Australia
6  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7  * Copyright (c) 2010-2011 The FreeBSD Foundation
8  * All rights reserved.
9  *
10  * This software was developed at the Centre for Advanced Internet
11  * Architectures, Swinburne University of Technology, by David Hayes, made
12  * possible in part by a grant from the Cisco University Research Program Fund
13  * at Community Foundation Silicon Valley.
14  *
15  * Portions of this software were developed at the Centre for Advanced
16  * Internet Architectures, Swinburne University of Technology, Melbourne,
17  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include <sys/param.h>
45 #include <sys/kernel.h>
46 #include <sys/mbuf.h>
47 #include <sys/module.h>
48 #include <sys/hhook.h>
49 #include <sys/khelp.h>
50 #include <sys/module_khelp.h>
51 #include <sys/socket.h>
52 #include <sys/sockopt.h>
53 
54 #include <net/vnet.h>
55 
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/tcp_seq.h>
59 #include <netinet/tcp_var.h>
60 
61 #include <netinet/khelp/h_ertt.h>
62 
63 #include <vm/uma.h>
64 
/*
 * UMA zone backing every struct txseginfo allocation in this file. It is
 * created in ertt_mod_init() and destroyed in ertt_mod_destroy().
 */
uma_zone_t txseginfo_zone;
66 
/*
 * Smoothing factor for the delayed ack guess: the dlyack_rx counter is only
 * incremented while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Max number of time stamp errors allowed in a session. Once
 * timestamp_errors reaches this value, time stamps are no longer used to
 * match a txsi with an acknowledgement.
 */
#define	MAX_TS_ERR	10
72 
/*
 * Forward declarations for the file-local entry points: the two helper hook
 * callbacks listed in ertt_hooks[], the module init/destroy handlers listed
 * in ertt_helper, and the constructor/destructor pair handed to
 * KHELP_DECLARE_MOD_UMA() at the bottom of the file.
 */
static int	ertt_packet_measurement_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);
static int	ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);

static int	ertt_mod_init(void);
static int	ertt_mod_destroy(void);

static int	ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void	ertt_uma_dtor(void *mem, int size, void *arg);
81 
82 /*
83  * Contains information about the sent segment for comparison with the
84  * corresponding ack.
85  */
86 struct txseginfo {
87 	/* Segment length. */
88 	uint32_t	len;
89 	/* Segment sequence number. */
90 	tcp_seq		seq;
91 	/* Time stamp indicating when the packet was sent. */
92 	uint32_t	tx_ts;
93 	/* Last received receiver ts (if the TCP option is used). */
94 	uint32_t	rx_ts;
95 	uint32_t	flags;
96 	TAILQ_ENTRY (txseginfo) txsegi_lnk;
97 };
98 
/*
 * Flags for struct txseginfo (stored in txseginfo.flags).
 *
 * TXSI_TSO is set when the segment was sent with TSO, and
 * TXSI_RTT_MEASURE_START marks the segment that opens a marked-packet RTT
 * measurement. TXSI_RX_MEASURE_END is defined here but not referenced by the
 * code visible in this file.
 */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */
103 
104 struct helper ertt_helper = {
105 	.mod_init = ertt_mod_init,
106 	.mod_destroy = ertt_mod_destroy,
107 	.h_flags = HELPER_NEEDS_OSD,
108 	.h_classes = HELPER_CLASS_TCP
109 };
110 
111 /* Define the helper hook info required by ERTT. */
112 struct hookinfo ertt_hooks[] = {
113 	{
114 		.hook_type = HHOOK_TYPE_TCP,
115 		.hook_id = HHOOK_TCP_EST_IN,
116 		.hook_udata = NULL,
117 		.hook_func = &ertt_packet_measurement_hook
118 	},
119 	{
120 		.hook_type = HHOOK_TYPE_TCP,
121 		.hook_id = HHOOK_TCP_EST_OUT,
122 		.hook_udata = NULL,
123 		.hook_func = &ertt_add_tx_segment_info_hook
124 	}
125 };
126 
/*
 * mflag values handed to marked_packet_rtt(), describing how the txsi being
 * examined relates to the acknowledgement being processed.
 */
#define	MULTI_ACK		0x01 /* Ack covers more than this txsi. */
#define	OLD_TXSI		0x02 /* Time stamps say this txsi is old. */
#define	CORRECT_ACK		0x04 /* Ack is for this txsi. */
#define	FORCED_MEASUREMENT	0x08 /* Measure from the saved state now. */
132 
133 /*
134  * This fuction measures the RTT of a particular segment/ack pair, or the next
135  * closest if this will yield an inaccurate result due to delayed acking or
136  * other issues.
137  */
138 static void inline
139 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
140     uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
141     int mflag)
142 {
143 
144 	/*
145 	 * If we can't measure this one properly due to delayed acking adjust
146 	 * byte counters and flag to measure next txsi. Note that since the
147 	 * marked packet's transmitted bytes are measured we need to subtract the
148 	 * transmitted bytes. Then pretend the next txsi was marked.
149 	 */
150 	if (mflag & (MULTI_ACK|OLD_TXSI)) {
151 		*pmeasurenext = txsi->tx_ts;
152 		*pmeasurenext_len = txsi->len;
153 		*prtt_bytes_adjust += *pmeasurenext_len;
154 	} else {
155 		if (mflag & FORCED_MEASUREMENT) {
156 			e_t->markedpkt_rtt = tcp_ts_getticks() -
157 			    *pmeasurenext + 1;
158 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
159 			    *pmeasurenext_len - *prtt_bytes_adjust;
160 		} else {
161 			e_t->markedpkt_rtt = tcp_ts_getticks() -
162 			    txsi->tx_ts + 1;
163 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
164 			    *prtt_bytes_adjust;
165 		}
166 		e_t->marked_snd_cwnd = tp->snd_cwnd;
167 
168 		/*
169 		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
170 		 * add_tx_segment_info that a new measurement should be started.
171 		 */
172 		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
173 		/*
174 		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
175 		 * algorithm that a new marked RTT measurement has has been made
176 		 * and is available for use.
177 		 */
178 		e_t->flags |= ERTT_NEW_MEASUREMENT;
179 
180 		if (tp->t_flags & TF_TSO) {
181 			/* Temporarily disable TSO to aid a new measurement. */
182 			tp->t_flags &= ~TF_TSO;
183 			/* Keep track that we've disabled it. */
184 			e_t->flags |= ERTT_TSO_DISABLED;
185 		}
186 	}
187 }
188 
189 /*
190  * Ertt_packet_measurements uses a small amount of state kept on each packet
191  * sent to match incoming acknowledgements. This enables more accurate and
192  * secure round trip time measurements. The resulting measurement is used for
193  * congestion control algorithms which require a more accurate time.
194  * Ertt_packet_measurements is called via the helper hook in tcp_input.c
195  */
196 static int
197 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
198     void *ctx_data, void *hdata, struct osd *hosd)
199 {
200 	struct ertt *e_t;
201 	struct tcpcb *tp;
202 	struct tcphdr *th;
203 	struct tcpopt *to;
204 	struct tcp_hhook_data *thdp;
205 	struct txseginfo *txsi;
206 	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
207 	uint32_t measurenext, rts;
208 	tcp_seq ack;
209 
210 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
211 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
212 
213 	e_t = (struct ertt *)hdata;
214 	thdp = ctx_data;
215 	tp = thdp->tp;
216 	th = thdp->th;
217 	to = thdp->to;
218 	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
219 	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
220 	acked = th->th_ack - tp->snd_una;
221 
222 	INP_WLOCK_ASSERT(tptoinpcb(tp));
223 
224 	/* Packet has provided new acknowledgements. */
225 	if (acked > 0 || new_sacked_bytes) {
226 		if (acked == 0 && new_sacked_bytes) {
227 			/* Use last sacked data. */
228 			ack = tp->sackhint.last_sack_ack;
229 		} else
230 			ack = th->th_ack;
231 
232 		txsi = TAILQ_FIRST(&e_t->txsegi_q);
233 		while (txsi != NULL) {
234 			rts = 0;
235 
236 			/* Acknowledgement is acking more than this txsi. */
237 			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
238 				if (txsi->flags & TXSI_RTT_MEASURE_START ||
239 				    measurenext) {
240 					marked_packet_rtt(txsi, e_t, tp,
241 					    &measurenext, &measurenext_len,
242 					    &rtt_bytes_adjust, MULTI_ACK);
243 				}
244 				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
245 				uma_zfree(txseginfo_zone, txsi);
246 				txsi = TAILQ_FIRST(&e_t->txsegi_q);
247 				continue;
248 			}
249 
250 			/*
251 			 * Guess if delayed acks are being used by the receiver.
252 			 *
253 			 * XXXDH: A simple heuristic that could be improved
254 			 */
255 			if (!new_sacked_bytes) {
256 				if (acked > tp->t_maxseg) {
257 					e_t->dlyack_rx +=
258 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
259 					    1 : 0;
260 					multiack = 1;
261 				} else if (acked > txsi->len) {
262 					multiack = 1;
263 					e_t->dlyack_rx +=
264 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
265 					    1 : 0;
266 				} else if (acked == tp->t_maxseg ||
267 					   acked == txsi->len) {
268 					e_t->dlyack_rx -=
269 					    (e_t->dlyack_rx > 0) ? 1 : 0;
270 				}
271 				/* Otherwise leave dlyack_rx the way it was. */
272 			}
273 
274 			/*
275 			 * Time stamps are only to help match the txsi with the
276 			 * received acknowledgements.
277 			 */
278 			if (e_t->timestamp_errors < MAX_TS_ERR &&
279 			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
280 				/*
281 				 * Note: All packets sent with the offload will
282 				 * have the same time stamp. If we are sending
283 				 * on a fast interface and the t_maxseg is much
284 				 * smaller than one tick, this will be fine. The
285 				 * time stamp would be the same whether we were
286 				 * using tso or not. However, if the interface
287 				 * is slow, this will cause problems with the
288 				 * calculations. If the interface is slow, there
289 				 * is not reason to be using tso, and it should
290 				 * be turned off.
291 				 */
292 				/*
293 				 * If there are too many time stamp errors, time
294 				 * stamps won't be trusted
295 				 */
296 				rts = to->to_tsecr;
297 				/* Before this packet. */
298 				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
299 					/* When delayed acking is used, the
300 					 * reflected time stamp is of the first
301 					 * packet and thus may be before
302 					 * txsi->tx_ts.
303 					 */
304 					break;
305 				if (TSTMP_GT(rts, txsi->tx_ts)) {
306 					/*
307 					 * If reflected time stamp is later than
308 					 * tx_tsi, then this txsi is old.
309 					 */
310 					if (txsi->flags & TXSI_RTT_MEASURE_START
311 					    || measurenext) {
312 						marked_packet_rtt(txsi, e_t, tp,
313 						    &measurenext, &measurenext_len,
314 						    &rtt_bytes_adjust, OLD_TXSI);
315 					}
316 					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
317 					    txsegi_lnk);
318 					uma_zfree(txseginfo_zone, txsi);
319 					txsi = TAILQ_FIRST(&e_t->txsegi_q);
320 					continue;
321 				}
322 				if (rts == txsi->tx_ts &&
323 				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
324 					/*
325 					 * Segment received before sent!
326 					 * Something is wrong with the received
327 					 * timestamps so increment errors. If
328 					 * this keeps up we will ignore
329 					 * timestamps.
330 					 */
331 					e_t->timestamp_errors++;
332 				}
333 			}
334 			/*
335 			 * Acknowledging a sequence number before this txsi.
336 			 * If it is an old txsi that may have had the same seq
337 			 * numbers, it should have been removed if time stamps
338 			 * are being used.
339 			 */
340 			if (SEQ_LEQ(ack, txsi->seq))
341 				break; /* Before first packet in txsi. */
342 
343 			/*
344 			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
345 			 * past this point.
346 			 *
347 			 * If delayed acks are being used, an acknowledgement
348 			 * for a single segment will have been delayed by the
349 			 * receiver and will yield an inaccurate measurement. In
350 			 * this case, we only make the measurement if more than
351 			 * one segment is being acknowledged or sack is
352 			 * currently being used.
353 			 */
354 			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
355 				/* Make an accurate new measurement. */
356 				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
357 
358 				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
359 					e_t->minrtt = e_t->rtt;
360 
361 				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
362 					e_t->maxrtt = e_t->rtt;
363 			}
364 
365 			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
366 				marked_packet_rtt(txsi, e_t, tp,
367 				    &measurenext, &measurenext_len,
368 				    &rtt_bytes_adjust, CORRECT_ACK);
369 
370 			if (txsi->flags & TXSI_TSO) {
371 				if (txsi->len > acked) {
372 					txsi->len -= acked;
373 					/*
374 					 * This presumes ack for first bytes in
375 					 * txsi, this may not be true but it
376 					 * shouldn't cause problems for the
377 					 * timing.
378 					 *
379 					 * We remeasure RTT even though we only
380 					 * have a single txsi. The rationale
381 					 * behind this is that it is better to
382 					 * have a slightly inaccurate
383 					 * measurement than no additional
384 					 * measurement for the rest of the bulk
385 					 * transfer. Since TSO is only used on
386 					 * high speed interface cards, so the
387 					 * packets should be transmitted at line
388 					 * rate back to back with little
389 					 * difference in transmission times (in
390 					 * ticks).
391 					 */
392 					txsi->seq += acked;
393 					/*
394 					 * Reset txsi measure flag so we don't
395 					 * use it for another RTT measurement.
396 					 */
397 					txsi->flags &= ~TXSI_RTT_MEASURE_START;
398 					/*
399 					 * There is still more data to be acked
400 					 * from tso bulk transmission, so we
401 					 * won't remove it from the TAILQ yet.
402 					 */
403 					break;
404 				}
405 				txsi->len = 0;
406 			}
407 
408 			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
409 			uma_zfree(txseginfo_zone, txsi);
410 			break;
411 		}
412 
413 		if (measurenext) {
414 			/*
415 			 * We need to do a RTT measurement. It won't be the best
416 			 * if we do it here.
417 			 */
418 			marked_packet_rtt(txsi, e_t, tp,
419 			    &measurenext, &measurenext_len,
420 			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
421 		}
422 	}
423 
424 	return (0);
425 }
426 
427 /*
428  * Add information about a transmitted segment to a list.
429  * This is called via the helper hook in tcp_output.c
430  */
431 static int
432 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
433     void *ctx_data, void *hdata, struct osd *hosd)
434 {
435 	struct ertt *e_t;
436 	struct tcpcb *tp;
437 	struct tcphdr *th;
438 	struct tcpopt *to;
439 	struct tcp_hhook_data *thdp;
440 	struct txseginfo *txsi;
441 	uint32_t len;
442 	int tso;
443 
444 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
445 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
446 
447 	e_t = (struct ertt *)hdata;
448 	thdp = ctx_data;
449 	tp = thdp->tp;
450 	th = thdp->th;
451 	to = thdp->to;
452 	len = thdp->len;
453 	tso = thdp->tso;
454 
455 	INP_WLOCK_ASSERT(tptoinpcb(tp));
456 
457 	if (len > 0) {
458 		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
459 		if (txsi != NULL) {
460 			/* Construct txsi setting the necessary flags. */
461 			txsi->flags = 0; /* Needs to be initialised. */
462 			txsi->seq = ntohl(th->th_seq);
463 			txsi->len = len;
464 			if (tso)
465 				txsi->flags |= TXSI_TSO;
466 			else if (e_t->flags & ERTT_TSO_DISABLED) {
467 				tp->t_flags |= TF_TSO;
468 				e_t->flags &= ~ERTT_TSO_DISABLED;
469 			}
470 
471 			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
472 				e_t->bytes_tx_in_rtt += len;
473 			} else {
474 				txsi->flags |= TXSI_RTT_MEASURE_START;
475 				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
476 				e_t->bytes_tx_in_rtt = len;
477 			}
478 
479 			if (((tp->t_flags & TF_NOOPT) == 0) &&
480 			    (to->to_flags & TOF_TS)) {
481 				txsi->tx_ts = ntohl(to->to_tsval) -
482 				    tp->ts_offset;
483 				txsi->rx_ts = ntohl(to->to_tsecr);
484 			} else {
485 				txsi->tx_ts = tcp_ts_getticks();
486 				txsi->rx_ts = 0; /* No received time stamp. */
487 			}
488 			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
489 		}
490 	}
491 
492 	return (0);
493 }
494 
495 static int
496 ertt_mod_init(void)
497 {
498 
499 	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
500 	    NULL, NULL, NULL, NULL, 0, 0);
501 
502 	return (0);
503 }
504 
505 static int
506 ertt_mod_destroy(void)
507 {
508 
509 	uma_zdestroy(txseginfo_zone);
510 
511 	return (0);
512 }
513 
514 static int
515 ertt_uma_ctor(void *mem, int size, void *arg, int flags)
516 {
517 	struct ertt *e_t;
518 
519 	e_t = mem;
520 
521 	TAILQ_INIT(&e_t->txsegi_q);
522 	e_t->timestamp_errors = 0;
523 	e_t->minrtt = 0;
524 	e_t->maxrtt = 0;
525 	e_t->rtt = 0;
526 	e_t->flags = 0;
527 	e_t->dlyack_rx = 0;
528 	e_t->bytes_tx_in_rtt = 0;
529 	e_t->markedpkt_rtt = 0;
530 
531 	return (0);
532 }
533 
534 static void
535 ertt_uma_dtor(void *mem, int size, void *arg)
536 {
537 	struct ertt *e_t;
538 	struct txseginfo *n_txsi, *txsi;
539 
540 	e_t = mem;
541 	txsi = TAILQ_FIRST(&e_t->txsegi_q);
542 	while (txsi != NULL) {
543 		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
544 		uma_zfree(txseginfo_zone, txsi);
545 		txsi = n_txsi;
546 	}
547 }
548 
/*
 * Declare the ertt khelp module. This ties together the ertt_helper
 * descriptor, the ertt_hooks table, and the ertt_uma_ctor()/ertt_uma_dtor()
 * pair for objects of sizeof(struct ertt). The literal 1 is presumably the
 * module version -- confirm against the KHELP_DECLARE_MOD_UMA definition.
 */
KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
    ertt_uma_ctor, ertt_uma_dtor);
551