xref: /freebsd/sys/netinet/khelp/h_ertt.c (revision aa0a1e58)
1 /*-
2  * Copyright (c) 2009-2010
3  * 	Swinburne University of Technology, Melbourne, Australia
4  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
5  * Copyright (c) 2010-2011 The FreeBSD Foundation
6  * All rights reserved.
7  *
8  * This software was developed at the Centre for Advanced Internet
9  * Architectures, Swinburne University, by David Hayes, made possible in part by
10  * a grant from the Cisco University Research Program Fund at Community
11  * Foundation Silicon Valley.
12  *
13  * Portions of this software were developed at the Centre for Advanced
14  * Internet Architectures, Swinburne University of Technology, Melbourne,
15  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  */
38 
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41 
42 #include <sys/param.h>
43 #include <sys/kernel.h>
44 #include <sys/mbuf.h>
45 #include <sys/module.h>
46 #include <sys/hhook.h>
47 #include <sys/khelp.h>
48 #include <sys/module_khelp.h>
49 #include <sys/socket.h>
50 #include <sys/sockopt.h>
51 
52 #include <net/vnet.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/tcp_seq.h>
57 #include <netinet/tcp_var.h>
58 
59 #include <netinet/khelp/h_ertt.h>
60 
61 #include <vm/uma.h>
62 
63 uma_zone_t txseginfo_zone;
64 
65 /* Smoothing factor for delayed ack guess. */
66 #define	DLYACK_SMOOTH	5
67 
68 /* Max number of time stamp errors allowed in a session. */
69 #define	MAX_TS_ERR	10
70 
71 static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
72     void *udata, void *ctx_data, void *hdata, struct osd *hosd);
73 static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
74     void *udata, void *ctx_data, void *hdata, struct osd *hosd);
75 static int ertt_mod_init(void);
76 static int ertt_mod_destroy(void);
77 static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
78 static void ertt_uma_dtor(void *mem, int size, void *arg);
79 
80 /*
81  * Contains information about the sent segment for comparison with the
82  * corresponding ack.
83  */
84 struct txseginfo {
85 	/* Segment length. */
86 	long		len;
87 	/* Segment sequence number. */
88 	tcp_seq		seq;
89 	/* Time stamp indicating when the packet was sent. */
90 	uint32_t	tx_ts;
91 	/* Last received receiver ts (if the TCP option is used). */
92 	uint32_t	rx_ts;
93 	uint32_t	flags;
94 	TAILQ_ENTRY (txseginfo) txsegi_lnk;
95 };
96 
/*
 * Bits stored in the flags member of struct txseginfo.
 */
#define	TXSI_TSO		0x01	/* Entry was sent using TSO. */
#define	TXSI_RTT_MEASURE_START	0x02	/* Entry starts a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04	/* Measure the rx rate until this txsi. */
101 
102 struct helper ertt_helper = {
103 	.mod_init = ertt_mod_init,
104 	.mod_destroy = ertt_mod_destroy,
105 	.h_flags = HELPER_NEEDS_OSD,
106 	.h_classes = HELPER_CLASS_TCP
107 };
108 
109 /* Define the helper hook info required by ERTT. */
110 struct hookinfo ertt_hooks[] = {
111 	{
112 		.hook_type = HHOOK_TYPE_TCP,
113 		.hook_id = HHOOK_TCP_EST_IN,
114 		.hook_udata = NULL,
115 		.hook_func = &ertt_packet_measurement_hook
116 	},
117 	{
118 		.hook_type = HHOOK_TYPE_TCP,
119 		.hook_id = HHOOK_TCP_EST_OUT,
120 		.hook_udata = NULL,
121 		.hook_func = &ertt_add_tx_segment_info_hook
122 	}
123 };
124 
/*
 * Values for the mflag argument of marked_packet_rtt(), telling it how the
 * txsi it is handed relates to the acknowledgement being processed.
 */
#define	MULTI_ACK		0x01	/* More than this txsi is acked. */
#define	OLD_TXSI		0x02	/* TXSI is old according to timestamps. */
#define	CORRECT_ACK		0x04	/* Acks this TXSI. */
#define	FORCED_MEASUREMENT	0x08	/* Force an RTT measurement. */
130 
131 /*
132  * This fuction measures the RTT of a particular segment/ack pair, or the next
133  * closest if this will yield an inaccurate result due to delayed acking or
134  * other issues.
135  */
136 static void inline
137 marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
138     uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
139     int mflag)
140 {
141 
142 	/*
143 	 * If we can't measure this one properly due to delayed acking adjust
144 	 * byte counters and flag to measure next txsi. Note that since the
145 	 * marked packet's transmitted bytes are measured we need to subtract the
146 	 * transmitted bytes. Then pretend the next txsi was marked.
147 	 */
148 	if (mflag & (MULTI_ACK|OLD_TXSI)) {
149 		*pmeasurenext = txsi->tx_ts;
150 		*pmeasurenext_len = txsi->len;
151 		*prtt_bytes_adjust += *pmeasurenext_len;
152 	} else {
153 		if (mflag & FORCED_MEASUREMENT) {
154 			e_t->markedpkt_rtt = ticks - *pmeasurenext + 1;
155 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156 			    *pmeasurenext_len - *prtt_bytes_adjust;
157 		} else {
158 			e_t->markedpkt_rtt = ticks - txsi->tx_ts + 1;
159 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
160 			    *prtt_bytes_adjust;
161 		}
162 		e_t->marked_snd_cwnd = tp->snd_cwnd;
163 
164 		/*
165 		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
166 		 * add_tx_segment_info that a new measurement should be started.
167 		 */
168 		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
169 		/*
170 		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
171 		 * algorithm that a new marked RTT measurement has has been made
172 		 * and is available for use.
173 		 */
174 		e_t->flags |= ERTT_NEW_MEASUREMENT;
175 
176 		if (tp->t_flags & TF_TSO) {
177 			/* Temporarily disable TSO to aid a new measurment. */
178 			tp->t_flags &= ~TF_TSO;
179 			/* Keep track that we've disabled it. */
180 			e_t->flags |= ERTT_TSO_DISABLED;
181 		}
182 	}
183 }
184 
185 /*
186  * Ertt_packet_measurements uses a small amount of state kept on each packet
187  * sent to match incoming acknowledgements. This enables more accurate and
188  * secure round trip time measurements. The resulting measurement is used for
189  * congestion control algorithms which require a more accurate time.
190  * Ertt_packet_measurements is called via the helper hook in tcp_input.c
191  */
192 static int
193 ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
194     void *ctx_data, void *hdata, struct osd *hosd)
195 {
196 	struct ertt *e_t;
197 	struct tcpcb *tp;
198 	struct tcphdr *th;
199 	struct tcpopt *to;
200 	struct tcp_hhook_data *thdp;
201 	struct txseginfo *txsi;
202 	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
203 	uint32_t measurenext, rts;
204 	tcp_seq ack;
205 
206 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
207 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
208 
209 	e_t = (struct ertt *)hdata;
210 	thdp = ctx_data;
211 	tp = thdp->tp;
212 	th = thdp->th;
213 	to = thdp->to;
214 	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
215 	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
216 	acked = th->th_ack - tp->snd_una;
217 
218 	INP_WLOCK_ASSERT(tp->t_inpcb);
219 
220 	/* Packet has provided new acknowledgements. */
221 	if (acked > 0 || new_sacked_bytes) {
222 		if (acked == 0 && new_sacked_bytes) {
223 			/* Use last sacked data. */
224 			ack = tp->sackhint.last_sack_ack;
225 		} else
226 			ack = th->th_ack;
227 
228 		txsi = TAILQ_FIRST(&e_t->txsegi_q);
229 		while (txsi != NULL) {
230 			rts = 0;
231 
232 			/* Acknowledgement is acking more than this txsi. */
233 			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
234 				if (txsi->flags & TXSI_RTT_MEASURE_START ||
235 				    measurenext) {
236 					marked_packet_rtt(txsi, e_t, tp,
237 					    &measurenext, &measurenext_len,
238 					    &rtt_bytes_adjust, MULTI_ACK);
239 				}
240 				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
241 				uma_zfree(txseginfo_zone, txsi);
242 				txsi = TAILQ_FIRST(&e_t->txsegi_q);
243 				continue;
244 			}
245 
246 			/*
247 			 * Guess if delayed acks are being used by the receiver.
248 			 *
249 			 * XXXDH: A simple heuristic that could be improved
250 			 */
251 			if (!new_sacked_bytes) {
252 				if (acked > tp->t_maxseg) {
253 					e_t->dlyack_rx +=
254 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
255 					    1 : 0;
256 					multiack = 1;
257 				} else if (acked > txsi->len) {
258 					multiack = 1;
259 					e_t->dlyack_rx +=
260 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
261 					    1 : 0;
262 				} else if (acked == tp->t_maxseg ||
263 					   acked == txsi->len) {
264 					e_t->dlyack_rx -=
265 					    (e_t->dlyack_rx > 0) ? 1 : 0;
266 				}
267 				/* Otherwise leave dlyack_rx the way it was. */
268 			}
269 
270 			/*
271 			 * Time stamps are only to help match the txsi with the
272 			 * received acknowledgements.
273 			 */
274 			if (e_t->timestamp_errors < MAX_TS_ERR &&
275 			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
276 				/*
277 				 * Note: All packets sent with the offload will
278 				 * have the same time stamp. If we are sending
279 				 * on a fast interface and the t_maxseg is much
280 				 * smaller than one tick, this will be fine. The
281 				 * time stamp would be the same whether we were
282 				 * using tso or not. However, if the interface
283 				 * is slow, this will cause problems with the
284 				 * calculations. If the interface is slow, there
285 				 * is not reason to be using tso, and it should
286 				 * be turned off.
287 				 */
288 				/*
289 				 * If there are too many time stamp errors, time
290 				 * stamps won't be trusted
291 				 */
292 				rts = to->to_tsecr;
293 				/* Before this packet. */
294 				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
295 					/* When delayed acking is used, the
296 					 * reflected time stamp is of the first
297 					 * packet and thus may be before
298 					 * txsi->tx_ts.
299 					 */
300 					break;
301 				if (TSTMP_GT(rts, txsi->tx_ts)) {
302 					/*
303 					 * If reflected time stamp is later than
304 					 * tx_tsi, then this txsi is old.
305 					 */
306 					if (txsi->flags & TXSI_RTT_MEASURE_START
307 					    || measurenext) {
308 						marked_packet_rtt(txsi, e_t, tp,
309 						    &measurenext, &measurenext_len,
310 						    &rtt_bytes_adjust, OLD_TXSI);
311 					}
312 					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
313 					    txsegi_lnk);
314 					uma_zfree(txseginfo_zone, txsi);
315 					txsi = TAILQ_FIRST(&e_t->txsegi_q);
316 					continue;
317 				}
318 				if (rts == txsi->tx_ts &&
319 				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
320 					/*
321 					 * Segment received before sent!
322 					 * Something is wrong with the received
323 					 * timestamps so increment errors. If
324 					 * this keeps up we will ignore
325 					 * timestamps.
326 					 */
327 					e_t->timestamp_errors++;
328 				}
329 			}
330 			/*
331 			 * Acknowledging a sequence number before this txsi.
332 			 * If it is an old txsi that may have had the same seq
333 			 * numbers, it should have been removed if time stamps
334 			 * are being used.
335 			 */
336 			if (SEQ_LEQ(ack, txsi->seq))
337 				break; /* Before first packet in txsi. */
338 
339 			/*
340 			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
341 			 * past this point.
342 			 *
343 			 * If delayed acks are being used, an acknowledgement
344 			 * for a single segment will have been delayed by the
345 			 * receiver and will yield an inaccurate measurement. In
346 			 * this case, we only make the measurement if more than
347 			 * one segment is being acknowledged or sack is
348 			 * currently being used.
349 			 */
350 			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
351 				/* Make an accurate new measurement. */
352 				e_t->rtt = ticks - txsi->tx_ts + 1;
353 
354 				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
355 					e_t->minrtt = e_t->rtt;
356 
357 				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
358 					e_t->maxrtt = e_t->rtt;
359 			}
360 
361 			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
362 				marked_packet_rtt(txsi, e_t, tp,
363 				    &measurenext, &measurenext_len,
364 				    &rtt_bytes_adjust, CORRECT_ACK);
365 
366 			if (txsi->flags & TXSI_TSO) {
367 				txsi->len -= acked;
368 				if (txsi->len > 0) {
369 					/*
370 					 * This presumes ack for first bytes in
371 					 * txsi, this may not be true but it
372 					 * shouldn't cause problems for the
373 					 * timing.
374 					 *
375 					 * We remeasure RTT even though we only
376 					 * have a single txsi. The rationale
377 					 * behind this is that it is better to
378 					 * have a slightly inaccurate
379 					 * measurement than no additional
380 					 * measurement for the rest of the bulk
381 					 * transfer. Since TSO is only used on
382 					 * high speed interface cards, so the
383 					 * packets should be transmitted at line
384 					 * rate back to back with little
385 					 * difference in transmission times (in
386 					 * ticks).
387 					 */
388 					txsi->seq += acked;
389 					/*
390 					 * Reset txsi measure flag so we don't
391 					 * use it for another RTT measurement.
392 					 */
393 					txsi->flags &= ~TXSI_RTT_MEASURE_START;
394 					/*
395 					 * There is still more data to be acked
396 					 * from tso bulk transmission, so we
397 					 * won't remove it from the TAILQ yet.
398 					 */
399 					break;
400 				}
401 			}
402 
403 			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
404 			uma_zfree(txseginfo_zone, txsi);
405 			break;
406 		}
407 
408 		if (measurenext) {
409 			/*
410 			 * We need to do a RTT measurement. It won't be the best
411 			 * if we do it here.
412 			 */
413 			marked_packet_rtt(txsi, e_t, tp,
414 			    &measurenext, &measurenext_len,
415 			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
416 		}
417 	}
418 
419 	return (0);
420 }
421 
422 /*
423  * Add information about a transmitted segment to a list.
424  * This is called via the helper hook in tcp_output.c
425  */
426 static int
427 ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
428     void *ctx_data, void *hdata, struct osd *hosd)
429 {
430 	struct ertt *e_t;
431 	struct tcpcb *tp;
432 	struct tcphdr *th;
433 	struct tcpopt *to;
434 	struct tcp_hhook_data *thdp;
435 	struct txseginfo *txsi;
436 	long len;
437 	int tso;
438 
439 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
440 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
441 
442 	e_t = (struct ertt *)hdata;
443 	thdp = ctx_data;
444 	tp = thdp->tp;
445 	th = thdp->th;
446 	to = thdp->to;
447 	len = thdp->len;
448 	tso = thdp->tso;
449 
450 	INP_WLOCK_ASSERT(tp->t_inpcb);
451 
452 	if (len > 0) {
453 		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
454 		if (txsi != NULL) {
455 			/* Construct txsi setting the necessary flags. */
456 			txsi->flags = 0; /* Needs to be initialised. */
457 			txsi->seq = ntohl(th->th_seq);
458 			txsi->len = len;
459 			if (tso)
460 				txsi->flags |= TXSI_TSO;
461 			else if (e_t->flags & ERTT_TSO_DISABLED) {
462 				tp->t_flags |= TF_TSO;
463 				e_t->flags &= ~ERTT_TSO_DISABLED;
464 			}
465 
466 			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
467 				e_t->bytes_tx_in_rtt += len;
468 			} else {
469 				txsi->flags |= TXSI_RTT_MEASURE_START;
470 				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
471 				e_t->bytes_tx_in_rtt = len;
472 			}
473 
474 			if (((tp->t_flags & TF_NOOPT) == 0) &&
475 			    (to->to_flags & TOF_TS)) {
476 				txsi->tx_ts = ntohl(to->to_tsval) -
477 				    tp->ts_offset;
478 				txsi->rx_ts = ntohl(to->to_tsecr);
479 			} else {
480 				txsi->tx_ts = ticks;
481 				txsi->rx_ts = 0; /* No received time stamp. */
482 			}
483 			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
484 		}
485 	}
486 
487 	return (0);
488 }
489 
490 static int
491 ertt_mod_init(void)
492 {
493 
494 	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
495 	    NULL, NULL, NULL, NULL, 0, 0);
496 
497 	return (0);
498 }
499 
500 static int
501 ertt_mod_destroy(void)
502 {
503 
504 	uma_zdestroy(txseginfo_zone);
505 
506 	return (0);
507 }
508 
509 static int
510 ertt_uma_ctor(void *mem, int size, void *arg, int flags)
511 {
512 	struct ertt *e_t;
513 
514 	e_t = mem;
515 
516 	TAILQ_INIT(&e_t->txsegi_q);
517 	e_t->timestamp_errors = 0;
518 	e_t->minrtt = 0;
519 	e_t->maxrtt = 0;
520 	e_t->rtt = 0;
521 	e_t->flags = 0;
522 	e_t->dlyack_rx = 0;
523 	e_t->bytes_tx_in_rtt = 0;
524 	e_t->markedpkt_rtt = 0;
525 
526 	return (0);
527 }
528 
529 static void
530 ertt_uma_dtor(void *mem, int size, void *arg)
531 {
532 	struct ertt *e_t;
533 	struct txseginfo *n_txsi, *txsi;
534 
535 	e_t = mem;
536 	txsi = TAILQ_FIRST(&e_t->txsegi_q);
537 	while (txsi != NULL) {
538 		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
539 		uma_zfree(txseginfo_zone, txsi);
540 		txsi = n_txsi;
541 	}
542 }
543 
544 KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
545     ertt_uma_ctor, ertt_uma_dtor);
546