xref: /freebsd/sys/netinet/khelp/h_ertt.c (revision 3ac12506)
1050570efSLawrence Stewart /*-
2050570efSLawrence Stewart  * Copyright (c) 2009-2010
3050570efSLawrence Stewart  * 	Swinburne University of Technology, Melbourne, Australia
4050570efSLawrence Stewart  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
5050570efSLawrence Stewart  * Copyright (c) 2010-2011 The FreeBSD Foundation
6050570efSLawrence Stewart  * All rights reserved.
7050570efSLawrence Stewart  *
8050570efSLawrence Stewart  * This software was developed at the Centre for Advanced Internet
9891b8ed4SLawrence Stewart  * Architectures, Swinburne University of Technology, by David Hayes, made
10891b8ed4SLawrence Stewart  * possible in part by a grant from the Cisco University Research Program Fund
11891b8ed4SLawrence Stewart  * at Community Foundation Silicon Valley.
12050570efSLawrence Stewart  *
13050570efSLawrence Stewart  * Portions of this software were developed at the Centre for Advanced
14050570efSLawrence Stewart  * Internet Architectures, Swinburne University of Technology, Melbourne,
15050570efSLawrence Stewart  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
16050570efSLawrence Stewart  *
17050570efSLawrence Stewart  * Redistribution and use in source and binary forms, with or without
18050570efSLawrence Stewart  * modification, are permitted provided that the following conditions
19050570efSLawrence Stewart  * are met:
20050570efSLawrence Stewart  * 1. Redistributions of source code must retain the above copyright
21050570efSLawrence Stewart  *    notice, this list of conditions and the following disclaimer.
22050570efSLawrence Stewart  * 2. Redistributions in binary form must reproduce the above copyright
23050570efSLawrence Stewart  *    notice, this list of conditions and the following disclaimer in the
24050570efSLawrence Stewart  *    documentation and/or other materials provided with the distribution.
25050570efSLawrence Stewart  *
26050570efSLawrence Stewart  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
27050570efSLawrence Stewart  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28050570efSLawrence Stewart  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29050570efSLawrence Stewart  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
30050570efSLawrence Stewart  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31050570efSLawrence Stewart  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32050570efSLawrence Stewart  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33050570efSLawrence Stewart  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34050570efSLawrence Stewart  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35050570efSLawrence Stewart  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36050570efSLawrence Stewart  * SUCH DAMAGE.
37050570efSLawrence Stewart  */
38050570efSLawrence Stewart 
39050570efSLawrence Stewart #include <sys/cdefs.h>
40050570efSLawrence Stewart __FBSDID("$FreeBSD$");
41050570efSLawrence Stewart 
42050570efSLawrence Stewart #include <sys/param.h>
43050570efSLawrence Stewart #include <sys/kernel.h>
44050570efSLawrence Stewart #include <sys/mbuf.h>
45050570efSLawrence Stewart #include <sys/module.h>
46050570efSLawrence Stewart #include <sys/hhook.h>
47050570efSLawrence Stewart #include <sys/khelp.h>
48050570efSLawrence Stewart #include <sys/module_khelp.h>
49050570efSLawrence Stewart #include <sys/socket.h>
50050570efSLawrence Stewart #include <sys/sockopt.h>
51050570efSLawrence Stewart 
52050570efSLawrence Stewart #include <net/vnet.h>
53050570efSLawrence Stewart 
54050570efSLawrence Stewart #include <netinet/in.h>
55050570efSLawrence Stewart #include <netinet/in_pcb.h>
56050570efSLawrence Stewart #include <netinet/tcp_seq.h>
57050570efSLawrence Stewart #include <netinet/tcp_var.h>
58050570efSLawrence Stewart 
59050570efSLawrence Stewart #include <netinet/khelp/h_ertt.h>
60050570efSLawrence Stewart 
61050570efSLawrence Stewart #include <vm/uma.h>
62050570efSLawrence Stewart 
63050570efSLawrence Stewart uma_zone_t txseginfo_zone;
64050570efSLawrence Stewart 
/*
 * Ceiling for the delayed-ack guess counter (dlyack_rx); the counter is only
 * incremented while it is below this value.
 */
#define	DLYACK_SMOOTH	5

/*
 * Number of time stamp errors tolerated in a session; once reached, time
 * stamps are no longer used to match acks to transmitted segments.
 */
#define	MAX_TS_ERR	10
70050570efSLawrence Stewart 
/* Helper hook entry points (see ertt_hooks). */
static int	ertt_packet_measurement_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);
static int	ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
		    void *udata, void *ctx_data, void *hdata,
		    struct osd *hosd);

/* Module init/destroy callbacks (see ertt_helper). */
static int	ertt_mod_init(void);
static int	ertt_mod_destroy(void);

/* Constructor/destructor handed to KHELP_DECLARE_MOD_UMA for struct ertt. */
static int	ertt_uma_ctor(void *mem, int size, void *arg, int flags);
static void	ertt_uma_dtor(void *mem, int size, void *arg);
79050570efSLawrence Stewart 
80050570efSLawrence Stewart /*
81050570efSLawrence Stewart  * Contains information about the sent segment for comparison with the
82050570efSLawrence Stewart  * corresponding ack.
83050570efSLawrence Stewart  */
84050570efSLawrence Stewart struct txseginfo {
85050570efSLawrence Stewart 	/* Segment length. */
863ac12506SJonathan T. Looney 	uint32_t	len;
87050570efSLawrence Stewart 	/* Segment sequence number. */
88050570efSLawrence Stewart 	tcp_seq		seq;
89050570efSLawrence Stewart 	/* Time stamp indicating when the packet was sent. */
90050570efSLawrence Stewart 	uint32_t	tx_ts;
91050570efSLawrence Stewart 	/* Last received receiver ts (if the TCP option is used). */
92050570efSLawrence Stewart 	uint32_t	rx_ts;
93050570efSLawrence Stewart 	uint32_t	flags;
94050570efSLawrence Stewart 	TAILQ_ENTRY (txseginfo) txsegi_lnk;
95050570efSLawrence Stewart };
96050570efSLawrence Stewart 
/*
 * Bits kept in txseginfo.flags:
 *   TXSI_TSO               - TSO was used for this entry.
 *   TXSI_RTT_MEASURE_START - this entry starts a per RTT measurement.
 *   TXSI_RX_MEASURE_END    - measure the rx rate until this txsi.
 */
#define	TXSI_TSO		0x01
#define	TXSI_RTT_MEASURE_START	0x02
#define	TXSI_RX_MEASURE_END	0x04
101050570efSLawrence Stewart 
102050570efSLawrence Stewart struct helper ertt_helper = {
103050570efSLawrence Stewart 	.mod_init = ertt_mod_init,
104050570efSLawrence Stewart 	.mod_destroy = ertt_mod_destroy,
105050570efSLawrence Stewart 	.h_flags = HELPER_NEEDS_OSD,
106050570efSLawrence Stewart 	.h_classes = HELPER_CLASS_TCP
107050570efSLawrence Stewart };
108050570efSLawrence Stewart 
109050570efSLawrence Stewart /* Define the helper hook info required by ERTT. */
110050570efSLawrence Stewart struct hookinfo ertt_hooks[] = {
111050570efSLawrence Stewart 	{
112050570efSLawrence Stewart 		.hook_type = HHOOK_TYPE_TCP,
113050570efSLawrence Stewart 		.hook_id = HHOOK_TCP_EST_IN,
114050570efSLawrence Stewart 		.hook_udata = NULL,
115050570efSLawrence Stewart 		.hook_func = &ertt_packet_measurement_hook
116050570efSLawrence Stewart 	},
117050570efSLawrence Stewart 	{
118050570efSLawrence Stewart 		.hook_type = HHOOK_TYPE_TCP,
119050570efSLawrence Stewart 		.hook_id = HHOOK_TCP_EST_OUT,
120050570efSLawrence Stewart 		.hook_udata = NULL,
121050570efSLawrence Stewart 		.hook_func = &ertt_add_tx_segment_info_hook
122050570efSLawrence Stewart 	}
123050570efSLawrence Stewart };
124050570efSLawrence Stewart 
/*
 * Values for the mflag argument of marked_packet_rtt(), describing how the
 * txsi being processed relates to the incoming ack:
 *   MULTI_ACK          - more than this txsi is acked.
 *   OLD_TXSI           - txsi is old according to timestamps.
 *   CORRECT_ACK        - the ack covers this txsi.
 *   FORCED_MEASUREMENT - force an RTT measurement.
 */
#define	MULTI_ACK		0x01
#define	OLD_TXSI		0x02
#define	CORRECT_ACK		0x04
#define	FORCED_MEASUREMENT	0x08
130050570efSLawrence Stewart 
131050570efSLawrence Stewart /*
132050570efSLawrence Stewart  * This fuction measures the RTT of a particular segment/ack pair, or the next
133050570efSLawrence Stewart  * closest if this will yield an inaccurate result due to delayed acking or
134050570efSLawrence Stewart  * other issues.
135050570efSLawrence Stewart  */
136050570efSLawrence Stewart static void inline
137050570efSLawrence Stewart marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
138050570efSLawrence Stewart     uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
139050570efSLawrence Stewart     int mflag)
140050570efSLawrence Stewart {
141050570efSLawrence Stewart 
142050570efSLawrence Stewart 	/*
143050570efSLawrence Stewart 	 * If we can't measure this one properly due to delayed acking adjust
144050570efSLawrence Stewart 	 * byte counters and flag to measure next txsi. Note that since the
145050570efSLawrence Stewart 	 * marked packet's transmitted bytes are measured we need to subtract the
146050570efSLawrence Stewart 	 * transmitted bytes. Then pretend the next txsi was marked.
147050570efSLawrence Stewart 	 */
148050570efSLawrence Stewart 	if (mflag & (MULTI_ACK|OLD_TXSI)) {
149050570efSLawrence Stewart 		*pmeasurenext = txsi->tx_ts;
150050570efSLawrence Stewart 		*pmeasurenext_len = txsi->len;
151050570efSLawrence Stewart 		*prtt_bytes_adjust += *pmeasurenext_len;
152050570efSLawrence Stewart 	} else {
153050570efSLawrence Stewart 		if (mflag & FORCED_MEASUREMENT) {
154ee24d3b8SLawrence Stewart 			e_t->markedpkt_rtt = tcp_ts_getticks() -
155ee24d3b8SLawrence Stewart 			    *pmeasurenext + 1;
156050570efSLawrence Stewart 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
157050570efSLawrence Stewart 			    *pmeasurenext_len - *prtt_bytes_adjust;
158050570efSLawrence Stewart 		} else {
159ee24d3b8SLawrence Stewart 			e_t->markedpkt_rtt = tcp_ts_getticks() -
160ee24d3b8SLawrence Stewart 			    txsi->tx_ts + 1;
161050570efSLawrence Stewart 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
162050570efSLawrence Stewart 			    *prtt_bytes_adjust;
163050570efSLawrence Stewart 		}
164050570efSLawrence Stewart 		e_t->marked_snd_cwnd = tp->snd_cwnd;
165050570efSLawrence Stewart 
166050570efSLawrence Stewart 		/*
167050570efSLawrence Stewart 		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
168050570efSLawrence Stewart 		 * add_tx_segment_info that a new measurement should be started.
169050570efSLawrence Stewart 		 */
170050570efSLawrence Stewart 		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
171050570efSLawrence Stewart 		/*
172050570efSLawrence Stewart 		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
173050570efSLawrence Stewart 		 * algorithm that a new marked RTT measurement has has been made
174050570efSLawrence Stewart 		 * and is available for use.
175050570efSLawrence Stewart 		 */
176050570efSLawrence Stewart 		e_t->flags |= ERTT_NEW_MEASUREMENT;
177050570efSLawrence Stewart 
178050570efSLawrence Stewart 		if (tp->t_flags & TF_TSO) {
179050570efSLawrence Stewart 			/* Temporarily disable TSO to aid a new measurment. */
180050570efSLawrence Stewart 			tp->t_flags &= ~TF_TSO;
181050570efSLawrence Stewart 			/* Keep track that we've disabled it. */
182050570efSLawrence Stewart 			e_t->flags |= ERTT_TSO_DISABLED;
183050570efSLawrence Stewart 		}
184050570efSLawrence Stewart 	}
185050570efSLawrence Stewart }
186050570efSLawrence Stewart 
187050570efSLawrence Stewart /*
188050570efSLawrence Stewart  * Ertt_packet_measurements uses a small amount of state kept on each packet
189050570efSLawrence Stewart  * sent to match incoming acknowledgements. This enables more accurate and
190050570efSLawrence Stewart  * secure round trip time measurements. The resulting measurement is used for
191050570efSLawrence Stewart  * congestion control algorithms which require a more accurate time.
192050570efSLawrence Stewart  * Ertt_packet_measurements is called via the helper hook in tcp_input.c
193050570efSLawrence Stewart  */
194050570efSLawrence Stewart static int
195050570efSLawrence Stewart ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
196050570efSLawrence Stewart     void *ctx_data, void *hdata, struct osd *hosd)
197050570efSLawrence Stewart {
198050570efSLawrence Stewart 	struct ertt *e_t;
199050570efSLawrence Stewart 	struct tcpcb *tp;
200050570efSLawrence Stewart 	struct tcphdr *th;
201050570efSLawrence Stewart 	struct tcpopt *to;
202050570efSLawrence Stewart 	struct tcp_hhook_data *thdp;
203050570efSLawrence Stewart 	struct txseginfo *txsi;
204050570efSLawrence Stewart 	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
205050570efSLawrence Stewart 	uint32_t measurenext, rts;
206050570efSLawrence Stewart 	tcp_seq ack;
207050570efSLawrence Stewart 
208050570efSLawrence Stewart 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
209050570efSLawrence Stewart 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
210050570efSLawrence Stewart 
211050570efSLawrence Stewart 	e_t = (struct ertt *)hdata;
212050570efSLawrence Stewart 	thdp = ctx_data;
213050570efSLawrence Stewart 	tp = thdp->tp;
214050570efSLawrence Stewart 	th = thdp->th;
215050570efSLawrence Stewart 	to = thdp->to;
216050570efSLawrence Stewart 	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
217050570efSLawrence Stewart 	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
218050570efSLawrence Stewart 	acked = th->th_ack - tp->snd_una;
219050570efSLawrence Stewart 
220050570efSLawrence Stewart 	INP_WLOCK_ASSERT(tp->t_inpcb);
221050570efSLawrence Stewart 
222050570efSLawrence Stewart 	/* Packet has provided new acknowledgements. */
223050570efSLawrence Stewart 	if (acked > 0 || new_sacked_bytes) {
224050570efSLawrence Stewart 		if (acked == 0 && new_sacked_bytes) {
225050570efSLawrence Stewart 			/* Use last sacked data. */
226050570efSLawrence Stewart 			ack = tp->sackhint.last_sack_ack;
227050570efSLawrence Stewart 		} else
228050570efSLawrence Stewart 			ack = th->th_ack;
229050570efSLawrence Stewart 
230050570efSLawrence Stewart 		txsi = TAILQ_FIRST(&e_t->txsegi_q);
231050570efSLawrence Stewart 		while (txsi != NULL) {
232050570efSLawrence Stewart 			rts = 0;
233050570efSLawrence Stewart 
234050570efSLawrence Stewart 			/* Acknowledgement is acking more than this txsi. */
235050570efSLawrence Stewart 			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
236050570efSLawrence Stewart 				if (txsi->flags & TXSI_RTT_MEASURE_START ||
237050570efSLawrence Stewart 				    measurenext) {
238050570efSLawrence Stewart 					marked_packet_rtt(txsi, e_t, tp,
239050570efSLawrence Stewart 					    &measurenext, &measurenext_len,
240050570efSLawrence Stewart 					    &rtt_bytes_adjust, MULTI_ACK);
241050570efSLawrence Stewart 				}
242050570efSLawrence Stewart 				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
243050570efSLawrence Stewart 				uma_zfree(txseginfo_zone, txsi);
244050570efSLawrence Stewart 				txsi = TAILQ_FIRST(&e_t->txsegi_q);
245050570efSLawrence Stewart 				continue;
246050570efSLawrence Stewart 			}
247050570efSLawrence Stewart 
248050570efSLawrence Stewart 			/*
249050570efSLawrence Stewart 			 * Guess if delayed acks are being used by the receiver.
250050570efSLawrence Stewart 			 *
251050570efSLawrence Stewart 			 * XXXDH: A simple heuristic that could be improved
252050570efSLawrence Stewart 			 */
253050570efSLawrence Stewart 			if (!new_sacked_bytes) {
254050570efSLawrence Stewart 				if (acked > tp->t_maxseg) {
255050570efSLawrence Stewart 					e_t->dlyack_rx +=
256050570efSLawrence Stewart 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
257050570efSLawrence Stewart 					    1 : 0;
258050570efSLawrence Stewart 					multiack = 1;
259050570efSLawrence Stewart 				} else if (acked > txsi->len) {
260050570efSLawrence Stewart 					multiack = 1;
261050570efSLawrence Stewart 					e_t->dlyack_rx +=
262050570efSLawrence Stewart 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
263050570efSLawrence Stewart 					    1 : 0;
264050570efSLawrence Stewart 				} else if (acked == tp->t_maxseg ||
265050570efSLawrence Stewart 					   acked == txsi->len) {
266050570efSLawrence Stewart 					e_t->dlyack_rx -=
267050570efSLawrence Stewart 					    (e_t->dlyack_rx > 0) ? 1 : 0;
268050570efSLawrence Stewart 				}
269050570efSLawrence Stewart 				/* Otherwise leave dlyack_rx the way it was. */
270050570efSLawrence Stewart 			}
271050570efSLawrence Stewart 
272050570efSLawrence Stewart 			/*
273050570efSLawrence Stewart 			 * Time stamps are only to help match the txsi with the
274050570efSLawrence Stewart 			 * received acknowledgements.
275050570efSLawrence Stewart 			 */
276050570efSLawrence Stewart 			if (e_t->timestamp_errors < MAX_TS_ERR &&
277050570efSLawrence Stewart 			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
278050570efSLawrence Stewart 				/*
279050570efSLawrence Stewart 				 * Note: All packets sent with the offload will
280050570efSLawrence Stewart 				 * have the same time stamp. If we are sending
281050570efSLawrence Stewart 				 * on a fast interface and the t_maxseg is much
282050570efSLawrence Stewart 				 * smaller than one tick, this will be fine. The
283050570efSLawrence Stewart 				 * time stamp would be the same whether we were
284050570efSLawrence Stewart 				 * using tso or not. However, if the interface
285050570efSLawrence Stewart 				 * is slow, this will cause problems with the
286050570efSLawrence Stewart 				 * calculations. If the interface is slow, there
287050570efSLawrence Stewart 				 * is not reason to be using tso, and it should
288050570efSLawrence Stewart 				 * be turned off.
289050570efSLawrence Stewart 				 */
290050570efSLawrence Stewart 				/*
291050570efSLawrence Stewart 				 * If there are too many time stamp errors, time
292050570efSLawrence Stewart 				 * stamps won't be trusted
293050570efSLawrence Stewart 				 */
294050570efSLawrence Stewart 				rts = to->to_tsecr;
295050570efSLawrence Stewart 				/* Before this packet. */
296050570efSLawrence Stewart 				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
297050570efSLawrence Stewart 					/* When delayed acking is used, the
298050570efSLawrence Stewart 					 * reflected time stamp is of the first
299050570efSLawrence Stewart 					 * packet and thus may be before
300050570efSLawrence Stewart 					 * txsi->tx_ts.
301050570efSLawrence Stewart 					 */
302050570efSLawrence Stewart 					break;
303050570efSLawrence Stewart 				if (TSTMP_GT(rts, txsi->tx_ts)) {
304050570efSLawrence Stewart 					/*
305050570efSLawrence Stewart 					 * If reflected time stamp is later than
306050570efSLawrence Stewart 					 * tx_tsi, then this txsi is old.
307050570efSLawrence Stewart 					 */
308050570efSLawrence Stewart 					if (txsi->flags & TXSI_RTT_MEASURE_START
309050570efSLawrence Stewart 					    || measurenext) {
310050570efSLawrence Stewart 						marked_packet_rtt(txsi, e_t, tp,
311050570efSLawrence Stewart 						    &measurenext, &measurenext_len,
312050570efSLawrence Stewart 						    &rtt_bytes_adjust, OLD_TXSI);
313050570efSLawrence Stewart 					}
314050570efSLawrence Stewart 					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
315050570efSLawrence Stewart 					    txsegi_lnk);
316050570efSLawrence Stewart 					uma_zfree(txseginfo_zone, txsi);
317050570efSLawrence Stewart 					txsi = TAILQ_FIRST(&e_t->txsegi_q);
318050570efSLawrence Stewart 					continue;
319050570efSLawrence Stewart 				}
320050570efSLawrence Stewart 				if (rts == txsi->tx_ts &&
321050570efSLawrence Stewart 				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
322050570efSLawrence Stewart 					/*
323050570efSLawrence Stewart 					 * Segment received before sent!
324050570efSLawrence Stewart 					 * Something is wrong with the received
325050570efSLawrence Stewart 					 * timestamps so increment errors. If
326050570efSLawrence Stewart 					 * this keeps up we will ignore
327050570efSLawrence Stewart 					 * timestamps.
328050570efSLawrence Stewart 					 */
329050570efSLawrence Stewart 					e_t->timestamp_errors++;
330050570efSLawrence Stewart 				}
331050570efSLawrence Stewart 			}
332050570efSLawrence Stewart 			/*
333050570efSLawrence Stewart 			 * Acknowledging a sequence number before this txsi.
334050570efSLawrence Stewart 			 * If it is an old txsi that may have had the same seq
335050570efSLawrence Stewart 			 * numbers, it should have been removed if time stamps
336050570efSLawrence Stewart 			 * are being used.
337050570efSLawrence Stewart 			 */
338050570efSLawrence Stewart 			if (SEQ_LEQ(ack, txsi->seq))
339050570efSLawrence Stewart 				break; /* Before first packet in txsi. */
340050570efSLawrence Stewart 
341050570efSLawrence Stewart 			/*
342050570efSLawrence Stewart 			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
343050570efSLawrence Stewart 			 * past this point.
344050570efSLawrence Stewart 			 *
345050570efSLawrence Stewart 			 * If delayed acks are being used, an acknowledgement
346050570efSLawrence Stewart 			 * for a single segment will have been delayed by the
347050570efSLawrence Stewart 			 * receiver and will yield an inaccurate measurement. In
348050570efSLawrence Stewart 			 * this case, we only make the measurement if more than
349050570efSLawrence Stewart 			 * one segment is being acknowledged or sack is
350050570efSLawrence Stewart 			 * currently being used.
351050570efSLawrence Stewart 			 */
352050570efSLawrence Stewart 			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
353050570efSLawrence Stewart 				/* Make an accurate new measurement. */
354ee24d3b8SLawrence Stewart 				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
355050570efSLawrence Stewart 
356050570efSLawrence Stewart 				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
357050570efSLawrence Stewart 					e_t->minrtt = e_t->rtt;
358050570efSLawrence Stewart 
359050570efSLawrence Stewart 				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
360050570efSLawrence Stewart 					e_t->maxrtt = e_t->rtt;
361050570efSLawrence Stewart 			}
362050570efSLawrence Stewart 
363050570efSLawrence Stewart 			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
364050570efSLawrence Stewart 				marked_packet_rtt(txsi, e_t, tp,
365050570efSLawrence Stewart 				    &measurenext, &measurenext_len,
366050570efSLawrence Stewart 				    &rtt_bytes_adjust, CORRECT_ACK);
367050570efSLawrence Stewart 
368050570efSLawrence Stewart 			if (txsi->flags & TXSI_TSO) {
3693ac12506SJonathan T. Looney 				if (txsi->len > acked) {
370050570efSLawrence Stewart 					txsi->len -= acked;
371050570efSLawrence Stewart 					/*
372050570efSLawrence Stewart 					 * This presumes ack for first bytes in
373050570efSLawrence Stewart 					 * txsi, this may not be true but it
374050570efSLawrence Stewart 					 * shouldn't cause problems for the
375050570efSLawrence Stewart 					 * timing.
376050570efSLawrence Stewart 					 *
377050570efSLawrence Stewart 					 * We remeasure RTT even though we only
378050570efSLawrence Stewart 					 * have a single txsi. The rationale
379050570efSLawrence Stewart 					 * behind this is that it is better to
380050570efSLawrence Stewart 					 * have a slightly inaccurate
381050570efSLawrence Stewart 					 * measurement than no additional
382050570efSLawrence Stewart 					 * measurement for the rest of the bulk
383050570efSLawrence Stewart 					 * transfer. Since TSO is only used on
384050570efSLawrence Stewart 					 * high speed interface cards, so the
385050570efSLawrence Stewart 					 * packets should be transmitted at line
386050570efSLawrence Stewart 					 * rate back to back with little
387050570efSLawrence Stewart 					 * difference in transmission times (in
388050570efSLawrence Stewart 					 * ticks).
389050570efSLawrence Stewart 					 */
390050570efSLawrence Stewart 					txsi->seq += acked;
391050570efSLawrence Stewart 					/*
392050570efSLawrence Stewart 					 * Reset txsi measure flag so we don't
393050570efSLawrence Stewart 					 * use it for another RTT measurement.
394050570efSLawrence Stewart 					 */
395050570efSLawrence Stewart 					txsi->flags &= ~TXSI_RTT_MEASURE_START;
396050570efSLawrence Stewart 					/*
397050570efSLawrence Stewart 					 * There is still more data to be acked
398050570efSLawrence Stewart 					 * from tso bulk transmission, so we
399050570efSLawrence Stewart 					 * won't remove it from the TAILQ yet.
400050570efSLawrence Stewart 					 */
401050570efSLawrence Stewart 					break;
402050570efSLawrence Stewart 				}
4033ac12506SJonathan T. Looney 				txsi->len = 0;
404050570efSLawrence Stewart 			}
405050570efSLawrence Stewart 
406050570efSLawrence Stewart 			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
407050570efSLawrence Stewart 			uma_zfree(txseginfo_zone, txsi);
408050570efSLawrence Stewart 			break;
409050570efSLawrence Stewart 		}
410050570efSLawrence Stewart 
411050570efSLawrence Stewart 		if (measurenext) {
412050570efSLawrence Stewart 			/*
413050570efSLawrence Stewart 			 * We need to do a RTT measurement. It won't be the best
414050570efSLawrence Stewart 			 * if we do it here.
415050570efSLawrence Stewart 			 */
416050570efSLawrence Stewart 			marked_packet_rtt(txsi, e_t, tp,
417050570efSLawrence Stewart 			    &measurenext, &measurenext_len,
418050570efSLawrence Stewart 			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
419050570efSLawrence Stewart 		}
420050570efSLawrence Stewart 	}
421050570efSLawrence Stewart 
422050570efSLawrence Stewart 	return (0);
423050570efSLawrence Stewart }
424050570efSLawrence Stewart 
425050570efSLawrence Stewart /*
426050570efSLawrence Stewart  * Add information about a transmitted segment to a list.
427050570efSLawrence Stewart  * This is called via the helper hook in tcp_output.c
428050570efSLawrence Stewart  */
429050570efSLawrence Stewart static int
430050570efSLawrence Stewart ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
431050570efSLawrence Stewart     void *ctx_data, void *hdata, struct osd *hosd)
432050570efSLawrence Stewart {
433050570efSLawrence Stewart 	struct ertt *e_t;
434050570efSLawrence Stewart 	struct tcpcb *tp;
435050570efSLawrence Stewart 	struct tcphdr *th;
436050570efSLawrence Stewart 	struct tcpopt *to;
437050570efSLawrence Stewart 	struct tcp_hhook_data *thdp;
438050570efSLawrence Stewart 	struct txseginfo *txsi;
4393ac12506SJonathan T. Looney 	uint32_t len;
440050570efSLawrence Stewart 	int tso;
441050570efSLawrence Stewart 
442050570efSLawrence Stewart 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
443050570efSLawrence Stewart 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
444050570efSLawrence Stewart 
445050570efSLawrence Stewart 	e_t = (struct ertt *)hdata;
446050570efSLawrence Stewart 	thdp = ctx_data;
447050570efSLawrence Stewart 	tp = thdp->tp;
448050570efSLawrence Stewart 	th = thdp->th;
449050570efSLawrence Stewart 	to = thdp->to;
450050570efSLawrence Stewart 	len = thdp->len;
451050570efSLawrence Stewart 	tso = thdp->tso;
452050570efSLawrence Stewart 
453050570efSLawrence Stewart 	INP_WLOCK_ASSERT(tp->t_inpcb);
454050570efSLawrence Stewart 
455050570efSLawrence Stewart 	if (len > 0) {
456050570efSLawrence Stewart 		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
457050570efSLawrence Stewart 		if (txsi != NULL) {
458050570efSLawrence Stewart 			/* Construct txsi setting the necessary flags. */
459050570efSLawrence Stewart 			txsi->flags = 0; /* Needs to be initialised. */
460050570efSLawrence Stewart 			txsi->seq = ntohl(th->th_seq);
461050570efSLawrence Stewart 			txsi->len = len;
462050570efSLawrence Stewart 			if (tso)
463050570efSLawrence Stewart 				txsi->flags |= TXSI_TSO;
464050570efSLawrence Stewart 			else if (e_t->flags & ERTT_TSO_DISABLED) {
465050570efSLawrence Stewart 				tp->t_flags |= TF_TSO;
466050570efSLawrence Stewart 				e_t->flags &= ~ERTT_TSO_DISABLED;
467050570efSLawrence Stewart 			}
468050570efSLawrence Stewart 
469050570efSLawrence Stewart 			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
470050570efSLawrence Stewart 				e_t->bytes_tx_in_rtt += len;
471050570efSLawrence Stewart 			} else {
472050570efSLawrence Stewart 				txsi->flags |= TXSI_RTT_MEASURE_START;
473050570efSLawrence Stewart 				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
474050570efSLawrence Stewart 				e_t->bytes_tx_in_rtt = len;
475050570efSLawrence Stewart 			}
476050570efSLawrence Stewart 
477050570efSLawrence Stewart 			if (((tp->t_flags & TF_NOOPT) == 0) &&
478050570efSLawrence Stewart 			    (to->to_flags & TOF_TS)) {
479050570efSLawrence Stewart 				txsi->tx_ts = ntohl(to->to_tsval) -
480050570efSLawrence Stewart 				    tp->ts_offset;
481050570efSLawrence Stewart 				txsi->rx_ts = ntohl(to->to_tsecr);
482050570efSLawrence Stewart 			} else {
483ee24d3b8SLawrence Stewart 				txsi->tx_ts = tcp_ts_getticks();
484050570efSLawrence Stewart 				txsi->rx_ts = 0; /* No received time stamp. */
485050570efSLawrence Stewart 			}
486050570efSLawrence Stewart 			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
487050570efSLawrence Stewart 		}
488050570efSLawrence Stewart 	}
489050570efSLawrence Stewart 
490050570efSLawrence Stewart 	return (0);
491050570efSLawrence Stewart }
492050570efSLawrence Stewart 
493050570efSLawrence Stewart static int
494050570efSLawrence Stewart ertt_mod_init(void)
495050570efSLawrence Stewart {
496050570efSLawrence Stewart 
497050570efSLawrence Stewart 	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
498050570efSLawrence Stewart 	    NULL, NULL, NULL, NULL, 0, 0);
499050570efSLawrence Stewart 
500050570efSLawrence Stewart 	return (0);
501050570efSLawrence Stewart }
502050570efSLawrence Stewart 
503050570efSLawrence Stewart static int
504050570efSLawrence Stewart ertt_mod_destroy(void)
505050570efSLawrence Stewart {
506050570efSLawrence Stewart 
507050570efSLawrence Stewart 	uma_zdestroy(txseginfo_zone);
508050570efSLawrence Stewart 
509050570efSLawrence Stewart 	return (0);
510050570efSLawrence Stewart }
511050570efSLawrence Stewart 
512050570efSLawrence Stewart static int
513050570efSLawrence Stewart ertt_uma_ctor(void *mem, int size, void *arg, int flags)
514050570efSLawrence Stewart {
515050570efSLawrence Stewart 	struct ertt *e_t;
516050570efSLawrence Stewart 
517050570efSLawrence Stewart 	e_t = mem;
518050570efSLawrence Stewart 
519050570efSLawrence Stewart 	TAILQ_INIT(&e_t->txsegi_q);
520050570efSLawrence Stewart 	e_t->timestamp_errors = 0;
521050570efSLawrence Stewart 	e_t->minrtt = 0;
522050570efSLawrence Stewart 	e_t->maxrtt = 0;
523050570efSLawrence Stewart 	e_t->rtt = 0;
524050570efSLawrence Stewart 	e_t->flags = 0;
525050570efSLawrence Stewart 	e_t->dlyack_rx = 0;
526050570efSLawrence Stewart 	e_t->bytes_tx_in_rtt = 0;
527050570efSLawrence Stewart 	e_t->markedpkt_rtt = 0;
528050570efSLawrence Stewart 
529050570efSLawrence Stewart 	return (0);
530050570efSLawrence Stewart }
531050570efSLawrence Stewart 
532050570efSLawrence Stewart static void
533050570efSLawrence Stewart ertt_uma_dtor(void *mem, int size, void *arg)
534050570efSLawrence Stewart {
535050570efSLawrence Stewart 	struct ertt *e_t;
536050570efSLawrence Stewart 	struct txseginfo *n_txsi, *txsi;
537050570efSLawrence Stewart 
538050570efSLawrence Stewart 	e_t = mem;
539050570efSLawrence Stewart 	txsi = TAILQ_FIRST(&e_t->txsegi_q);
540050570efSLawrence Stewart 	while (txsi != NULL) {
541050570efSLawrence Stewart 		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
542050570efSLawrence Stewart 		uma_zfree(txseginfo_zone, txsi);
543050570efSLawrence Stewart 		txsi = n_txsi;
544050570efSLawrence Stewart 	}
545050570efSLawrence Stewart }
546050570efSLawrence Stewart 
547050570efSLawrence Stewart KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
548050570efSLawrence Stewart     ertt_uma_ctor, ertt_uma_dtor);
549