xref: /freebsd/sys/netinet/khelp/h_ertt.c (revision fdafd315)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2009-2010
 * 	Swinburne University of Technology, Melbourne, Australia
 * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010-2011 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by David Hayes, made
 * possible in part by a grant from the Cisco University Research Program Fund
 * at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
40050570efSLawrence Stewart 
41050570efSLawrence Stewart #include <sys/param.h>
42050570efSLawrence Stewart #include <sys/kernel.h>
43050570efSLawrence Stewart #include <sys/mbuf.h>
44050570efSLawrence Stewart #include <sys/module.h>
45050570efSLawrence Stewart #include <sys/hhook.h>
46050570efSLawrence Stewart #include <sys/khelp.h>
47050570efSLawrence Stewart #include <sys/module_khelp.h>
48050570efSLawrence Stewart #include <sys/socket.h>
49050570efSLawrence Stewart #include <sys/sockopt.h>
50050570efSLawrence Stewart 
51050570efSLawrence Stewart #include <net/vnet.h>
52050570efSLawrence Stewart 
53050570efSLawrence Stewart #include <netinet/in.h>
54050570efSLawrence Stewart #include <netinet/in_pcb.h>
55050570efSLawrence Stewart #include <netinet/tcp_seq.h>
56050570efSLawrence Stewart #include <netinet/tcp_var.h>
57050570efSLawrence Stewart 
58050570efSLawrence Stewart #include <netinet/khelp/h_ertt.h>
59050570efSLawrence Stewart 
60050570efSLawrence Stewart #include <vm/uma.h>
61050570efSLawrence Stewart 
62050570efSLawrence Stewart uma_zone_t txseginfo_zone;
63050570efSLawrence Stewart 
64050570efSLawrence Stewart /* Smoothing factor for delayed ack guess. */
65050570efSLawrence Stewart #define	DLYACK_SMOOTH	5
66050570efSLawrence Stewart 
67050570efSLawrence Stewart /* Max number of time stamp errors allowed in a session. */
68050570efSLawrence Stewart #define	MAX_TS_ERR	10
69050570efSLawrence Stewart 
70050570efSLawrence Stewart static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
71050570efSLawrence Stewart     void *udata, void *ctx_data, void *hdata, struct osd *hosd);
72050570efSLawrence Stewart static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
73050570efSLawrence Stewart     void *udata, void *ctx_data, void *hdata, struct osd *hosd);
74050570efSLawrence Stewart static int ertt_mod_init(void);
75050570efSLawrence Stewart static int ertt_mod_destroy(void);
76050570efSLawrence Stewart static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
77050570efSLawrence Stewart static void ertt_uma_dtor(void *mem, int size, void *arg);
78050570efSLawrence Stewart 
79050570efSLawrence Stewart /*
80050570efSLawrence Stewart  * Contains information about the sent segment for comparison with the
81050570efSLawrence Stewart  * corresponding ack.
82050570efSLawrence Stewart  */
83050570efSLawrence Stewart struct txseginfo {
84050570efSLawrence Stewart 	/* Segment length. */
853ac12506SJonathan T. Looney 	uint32_t	len;
86050570efSLawrence Stewart 	/* Segment sequence number. */
87050570efSLawrence Stewart 	tcp_seq		seq;
88050570efSLawrence Stewart 	/* Time stamp indicating when the packet was sent. */
89050570efSLawrence Stewart 	uint32_t	tx_ts;
90050570efSLawrence Stewart 	/* Last received receiver ts (if the TCP option is used). */
91050570efSLawrence Stewart 	uint32_t	rx_ts;
92050570efSLawrence Stewart 	uint32_t	flags;
93050570efSLawrence Stewart 	TAILQ_ENTRY (txseginfo) txsegi_lnk;
94050570efSLawrence Stewart };
95050570efSLawrence Stewart 
/* Flags for struct txseginfo (bit values, may be OR'ed together). */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */
100050570efSLawrence Stewart 
101050570efSLawrence Stewart struct helper ertt_helper = {
102050570efSLawrence Stewart 	.mod_init = ertt_mod_init,
103050570efSLawrence Stewart 	.mod_destroy = ertt_mod_destroy,
104050570efSLawrence Stewart 	.h_flags = HELPER_NEEDS_OSD,
105050570efSLawrence Stewart 	.h_classes = HELPER_CLASS_TCP
106050570efSLawrence Stewart };
107050570efSLawrence Stewart 
108050570efSLawrence Stewart /* Define the helper hook info required by ERTT. */
109050570efSLawrence Stewart struct hookinfo ertt_hooks[] = {
110050570efSLawrence Stewart 	{
111050570efSLawrence Stewart 		.hook_type = HHOOK_TYPE_TCP,
112050570efSLawrence Stewart 		.hook_id = HHOOK_TCP_EST_IN,
113050570efSLawrence Stewart 		.hook_udata = NULL,
114050570efSLawrence Stewart 		.hook_func = &ertt_packet_measurement_hook
115050570efSLawrence Stewart 	},
116050570efSLawrence Stewart 	{
117050570efSLawrence Stewart 		.hook_type = HHOOK_TYPE_TCP,
118050570efSLawrence Stewart 		.hook_id = HHOOK_TCP_EST_OUT,
119050570efSLawrence Stewart 		.hook_udata = NULL,
120050570efSLawrence Stewart 		.hook_func = &ertt_add_tx_segment_info_hook
121050570efSLawrence Stewart 	}
122050570efSLawrence Stewart };
123050570efSLawrence Stewart 
/* Flags to indicate how marked_packet_rtt should handle this txsi. */
#define	MULTI_ACK		0x01 /* More than this txsi is acked. */
#define	OLD_TXSI		0x02 /* TXSI is old according to timestamps. */
#define	CORRECT_ACK		0x04 /* Acks this TXSI. */
#define	FORCED_MEASUREMENT	0x08 /* Force an RTT measurement. */
129050570efSLawrence Stewart 
130050570efSLawrence Stewart /*
131050570efSLawrence Stewart  * This fuction measures the RTT of a particular segment/ack pair, or the next
132050570efSLawrence Stewart  * closest if this will yield an inaccurate result due to delayed acking or
133050570efSLawrence Stewart  * other issues.
134050570efSLawrence Stewart  */
135050570efSLawrence Stewart static void inline
marked_packet_rtt(struct txseginfo * txsi,struct ertt * e_t,struct tcpcb * tp,uint32_t * pmeasurenext,int * pmeasurenext_len,int * prtt_bytes_adjust,int mflag)136050570efSLawrence Stewart marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
137050570efSLawrence Stewart     uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
138050570efSLawrence Stewart     int mflag)
139050570efSLawrence Stewart {
140050570efSLawrence Stewart 
141050570efSLawrence Stewart 	/*
142050570efSLawrence Stewart 	 * If we can't measure this one properly due to delayed acking adjust
143050570efSLawrence Stewart 	 * byte counters and flag to measure next txsi. Note that since the
144050570efSLawrence Stewart 	 * marked packet's transmitted bytes are measured we need to subtract the
145050570efSLawrence Stewart 	 * transmitted bytes. Then pretend the next txsi was marked.
146050570efSLawrence Stewart 	 */
147050570efSLawrence Stewart 	if (mflag & (MULTI_ACK|OLD_TXSI)) {
148050570efSLawrence Stewart 		*pmeasurenext = txsi->tx_ts;
149050570efSLawrence Stewart 		*pmeasurenext_len = txsi->len;
150050570efSLawrence Stewart 		*prtt_bytes_adjust += *pmeasurenext_len;
151050570efSLawrence Stewart 	} else {
152050570efSLawrence Stewart 		if (mflag & FORCED_MEASUREMENT) {
153ee24d3b8SLawrence Stewart 			e_t->markedpkt_rtt = tcp_ts_getticks() -
154ee24d3b8SLawrence Stewart 			    *pmeasurenext + 1;
155050570efSLawrence Stewart 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156050570efSLawrence Stewart 			    *pmeasurenext_len - *prtt_bytes_adjust;
157050570efSLawrence Stewart 		} else {
158ee24d3b8SLawrence Stewart 			e_t->markedpkt_rtt = tcp_ts_getticks() -
159ee24d3b8SLawrence Stewart 			    txsi->tx_ts + 1;
160050570efSLawrence Stewart 			e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
161050570efSLawrence Stewart 			    *prtt_bytes_adjust;
162050570efSLawrence Stewart 		}
163050570efSLawrence Stewart 		e_t->marked_snd_cwnd = tp->snd_cwnd;
164050570efSLawrence Stewart 
165050570efSLawrence Stewart 		/*
166050570efSLawrence Stewart 		 * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
167050570efSLawrence Stewart 		 * add_tx_segment_info that a new measurement should be started.
168050570efSLawrence Stewart 		 */
169050570efSLawrence Stewart 		e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
170050570efSLawrence Stewart 		/*
171050570efSLawrence Stewart 		 * Set ERTT_NEW_MEASUREMENT to tell the congestion control
172050570efSLawrence Stewart 		 * algorithm that a new marked RTT measurement has has been made
173050570efSLawrence Stewart 		 * and is available for use.
174050570efSLawrence Stewart 		 */
175050570efSLawrence Stewart 		e_t->flags |= ERTT_NEW_MEASUREMENT;
176050570efSLawrence Stewart 
177050570efSLawrence Stewart 		if (tp->t_flags & TF_TSO) {
1788d30ef92SGordon Bergling 			/* Temporarily disable TSO to aid a new measurement. */
179050570efSLawrence Stewart 			tp->t_flags &= ~TF_TSO;
180050570efSLawrence Stewart 			/* Keep track that we've disabled it. */
181050570efSLawrence Stewart 			e_t->flags |= ERTT_TSO_DISABLED;
182050570efSLawrence Stewart 		}
183050570efSLawrence Stewart 	}
184050570efSLawrence Stewart }
185050570efSLawrence Stewart 
186050570efSLawrence Stewart /*
187050570efSLawrence Stewart  * Ertt_packet_measurements uses a small amount of state kept on each packet
188050570efSLawrence Stewart  * sent to match incoming acknowledgements. This enables more accurate and
189050570efSLawrence Stewart  * secure round trip time measurements. The resulting measurement is used for
190050570efSLawrence Stewart  * congestion control algorithms which require a more accurate time.
191050570efSLawrence Stewart  * Ertt_packet_measurements is called via the helper hook in tcp_input.c
192050570efSLawrence Stewart  */
193050570efSLawrence Stewart static int
ertt_packet_measurement_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)194050570efSLawrence Stewart ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
195050570efSLawrence Stewart     void *ctx_data, void *hdata, struct osd *hosd)
196050570efSLawrence Stewart {
197050570efSLawrence Stewart 	struct ertt *e_t;
198050570efSLawrence Stewart 	struct tcpcb *tp;
199050570efSLawrence Stewart 	struct tcphdr *th;
200050570efSLawrence Stewart 	struct tcpopt *to;
201050570efSLawrence Stewart 	struct tcp_hhook_data *thdp;
202050570efSLawrence Stewart 	struct txseginfo *txsi;
203050570efSLawrence Stewart 	int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
204050570efSLawrence Stewart 	uint32_t measurenext, rts;
205050570efSLawrence Stewart 	tcp_seq ack;
206050570efSLawrence Stewart 
207050570efSLawrence Stewart 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
208050570efSLawrence Stewart 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
209050570efSLawrence Stewart 
210050570efSLawrence Stewart 	e_t = (struct ertt *)hdata;
211050570efSLawrence Stewart 	thdp = ctx_data;
212050570efSLawrence Stewart 	tp = thdp->tp;
213050570efSLawrence Stewart 	th = thdp->th;
214050570efSLawrence Stewart 	to = thdp->to;
215050570efSLawrence Stewart 	new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
216050570efSLawrence Stewart 	measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
217050570efSLawrence Stewart 	acked = th->th_ack - tp->snd_una;
218050570efSLawrence Stewart 
2199eb0e832SGleb Smirnoff 	INP_WLOCK_ASSERT(tptoinpcb(tp));
220050570efSLawrence Stewart 
221050570efSLawrence Stewart 	/* Packet has provided new acknowledgements. */
222050570efSLawrence Stewart 	if (acked > 0 || new_sacked_bytes) {
223050570efSLawrence Stewart 		if (acked == 0 && new_sacked_bytes) {
224050570efSLawrence Stewart 			/* Use last sacked data. */
225050570efSLawrence Stewart 			ack = tp->sackhint.last_sack_ack;
226050570efSLawrence Stewart 		} else
227050570efSLawrence Stewart 			ack = th->th_ack;
228050570efSLawrence Stewart 
229050570efSLawrence Stewart 		txsi = TAILQ_FIRST(&e_t->txsegi_q);
230050570efSLawrence Stewart 		while (txsi != NULL) {
231050570efSLawrence Stewart 			rts = 0;
232050570efSLawrence Stewart 
233050570efSLawrence Stewart 			/* Acknowledgement is acking more than this txsi. */
234050570efSLawrence Stewart 			if (SEQ_GT(ack, txsi->seq + txsi->len)) {
235050570efSLawrence Stewart 				if (txsi->flags & TXSI_RTT_MEASURE_START ||
236050570efSLawrence Stewart 				    measurenext) {
237050570efSLawrence Stewart 					marked_packet_rtt(txsi, e_t, tp,
238050570efSLawrence Stewart 					    &measurenext, &measurenext_len,
239050570efSLawrence Stewart 					    &rtt_bytes_adjust, MULTI_ACK);
240050570efSLawrence Stewart 				}
241050570efSLawrence Stewart 				TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
242050570efSLawrence Stewart 				uma_zfree(txseginfo_zone, txsi);
243050570efSLawrence Stewart 				txsi = TAILQ_FIRST(&e_t->txsegi_q);
244050570efSLawrence Stewart 				continue;
245050570efSLawrence Stewart 			}
246050570efSLawrence Stewart 
247050570efSLawrence Stewart 			/*
248050570efSLawrence Stewart 			 * Guess if delayed acks are being used by the receiver.
249050570efSLawrence Stewart 			 *
250050570efSLawrence Stewart 			 * XXXDH: A simple heuristic that could be improved
251050570efSLawrence Stewart 			 */
252050570efSLawrence Stewart 			if (!new_sacked_bytes) {
253050570efSLawrence Stewart 				if (acked > tp->t_maxseg) {
254050570efSLawrence Stewart 					e_t->dlyack_rx +=
255050570efSLawrence Stewart 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
256050570efSLawrence Stewart 					    1 : 0;
257050570efSLawrence Stewart 					multiack = 1;
258050570efSLawrence Stewart 				} else if (acked > txsi->len) {
259050570efSLawrence Stewart 					multiack = 1;
260050570efSLawrence Stewart 					e_t->dlyack_rx +=
261050570efSLawrence Stewart 					    (e_t->dlyack_rx < DLYACK_SMOOTH) ?
262050570efSLawrence Stewart 					    1 : 0;
263050570efSLawrence Stewart 				} else if (acked == tp->t_maxseg ||
264050570efSLawrence Stewart 					   acked == txsi->len) {
265050570efSLawrence Stewart 					e_t->dlyack_rx -=
266050570efSLawrence Stewart 					    (e_t->dlyack_rx > 0) ? 1 : 0;
267050570efSLawrence Stewart 				}
268050570efSLawrence Stewart 				/* Otherwise leave dlyack_rx the way it was. */
269050570efSLawrence Stewart 			}
270050570efSLawrence Stewart 
271050570efSLawrence Stewart 			/*
272050570efSLawrence Stewart 			 * Time stamps are only to help match the txsi with the
273050570efSLawrence Stewart 			 * received acknowledgements.
274050570efSLawrence Stewart 			 */
275050570efSLawrence Stewart 			if (e_t->timestamp_errors < MAX_TS_ERR &&
276050570efSLawrence Stewart 			    (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
277050570efSLawrence Stewart 				/*
278050570efSLawrence Stewart 				 * Note: All packets sent with the offload will
279050570efSLawrence Stewart 				 * have the same time stamp. If we are sending
280050570efSLawrence Stewart 				 * on a fast interface and the t_maxseg is much
281050570efSLawrence Stewart 				 * smaller than one tick, this will be fine. The
282050570efSLawrence Stewart 				 * time stamp would be the same whether we were
283050570efSLawrence Stewart 				 * using tso or not. However, if the interface
284050570efSLawrence Stewart 				 * is slow, this will cause problems with the
285050570efSLawrence Stewart 				 * calculations. If the interface is slow, there
286050570efSLawrence Stewart 				 * is not reason to be using tso, and it should
287050570efSLawrence Stewart 				 * be turned off.
288050570efSLawrence Stewart 				 */
289050570efSLawrence Stewart 				/*
290050570efSLawrence Stewart 				 * If there are too many time stamp errors, time
291050570efSLawrence Stewart 				 * stamps won't be trusted
292050570efSLawrence Stewart 				 */
293050570efSLawrence Stewart 				rts = to->to_tsecr;
294050570efSLawrence Stewart 				/* Before this packet. */
295050570efSLawrence Stewart 				if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
296050570efSLawrence Stewart 					/* When delayed acking is used, the
297050570efSLawrence Stewart 					 * reflected time stamp is of the first
298050570efSLawrence Stewart 					 * packet and thus may be before
299050570efSLawrence Stewart 					 * txsi->tx_ts.
300050570efSLawrence Stewart 					 */
301050570efSLawrence Stewart 					break;
302050570efSLawrence Stewart 				if (TSTMP_GT(rts, txsi->tx_ts)) {
303050570efSLawrence Stewart 					/*
304050570efSLawrence Stewart 					 * If reflected time stamp is later than
305050570efSLawrence Stewart 					 * tx_tsi, then this txsi is old.
306050570efSLawrence Stewart 					 */
307050570efSLawrence Stewart 					if (txsi->flags & TXSI_RTT_MEASURE_START
308050570efSLawrence Stewart 					    || measurenext) {
309050570efSLawrence Stewart 						marked_packet_rtt(txsi, e_t, tp,
310050570efSLawrence Stewart 						    &measurenext, &measurenext_len,
311050570efSLawrence Stewart 						    &rtt_bytes_adjust, OLD_TXSI);
312050570efSLawrence Stewart 					}
313050570efSLawrence Stewart 					TAILQ_REMOVE(&e_t->txsegi_q, txsi,
314050570efSLawrence Stewart 					    txsegi_lnk);
315050570efSLawrence Stewart 					uma_zfree(txseginfo_zone, txsi);
316050570efSLawrence Stewart 					txsi = TAILQ_FIRST(&e_t->txsegi_q);
317050570efSLawrence Stewart 					continue;
318050570efSLawrence Stewart 				}
319050570efSLawrence Stewart 				if (rts == txsi->tx_ts &&
320050570efSLawrence Stewart 				    TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
321050570efSLawrence Stewart 					/*
322050570efSLawrence Stewart 					 * Segment received before sent!
323050570efSLawrence Stewart 					 * Something is wrong with the received
324050570efSLawrence Stewart 					 * timestamps so increment errors. If
325050570efSLawrence Stewart 					 * this keeps up we will ignore
326050570efSLawrence Stewart 					 * timestamps.
327050570efSLawrence Stewart 					 */
328050570efSLawrence Stewart 					e_t->timestamp_errors++;
329050570efSLawrence Stewart 				}
330050570efSLawrence Stewart 			}
331050570efSLawrence Stewart 			/*
332050570efSLawrence Stewart 			 * Acknowledging a sequence number before this txsi.
333050570efSLawrence Stewart 			 * If it is an old txsi that may have had the same seq
334050570efSLawrence Stewart 			 * numbers, it should have been removed if time stamps
335050570efSLawrence Stewart 			 * are being used.
336050570efSLawrence Stewart 			 */
337050570efSLawrence Stewart 			if (SEQ_LEQ(ack, txsi->seq))
338050570efSLawrence Stewart 				break; /* Before first packet in txsi. */
339050570efSLawrence Stewart 
340050570efSLawrence Stewart 			/*
341050570efSLawrence Stewart 			 * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
342050570efSLawrence Stewart 			 * past this point.
343050570efSLawrence Stewart 			 *
344050570efSLawrence Stewart 			 * If delayed acks are being used, an acknowledgement
345050570efSLawrence Stewart 			 * for a single segment will have been delayed by the
346050570efSLawrence Stewart 			 * receiver and will yield an inaccurate measurement. In
347050570efSLawrence Stewart 			 * this case, we only make the measurement if more than
348050570efSLawrence Stewart 			 * one segment is being acknowledged or sack is
349050570efSLawrence Stewart 			 * currently being used.
350050570efSLawrence Stewart 			 */
351050570efSLawrence Stewart 			if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
352050570efSLawrence Stewart 				/* Make an accurate new measurement. */
353ee24d3b8SLawrence Stewart 				e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
354050570efSLawrence Stewart 
355050570efSLawrence Stewart 				if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
356050570efSLawrence Stewart 					e_t->minrtt = e_t->rtt;
357050570efSLawrence Stewart 
358050570efSLawrence Stewart 				if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
359050570efSLawrence Stewart 					e_t->maxrtt = e_t->rtt;
360050570efSLawrence Stewart 			}
361050570efSLawrence Stewart 
362050570efSLawrence Stewart 			if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
363050570efSLawrence Stewart 				marked_packet_rtt(txsi, e_t, tp,
364050570efSLawrence Stewart 				    &measurenext, &measurenext_len,
365050570efSLawrence Stewart 				    &rtt_bytes_adjust, CORRECT_ACK);
366050570efSLawrence Stewart 
367050570efSLawrence Stewart 			if (txsi->flags & TXSI_TSO) {
3683ac12506SJonathan T. Looney 				if (txsi->len > acked) {
369050570efSLawrence Stewart 					txsi->len -= acked;
370050570efSLawrence Stewart 					/*
371050570efSLawrence Stewart 					 * This presumes ack for first bytes in
372050570efSLawrence Stewart 					 * txsi, this may not be true but it
373050570efSLawrence Stewart 					 * shouldn't cause problems for the
374050570efSLawrence Stewart 					 * timing.
375050570efSLawrence Stewart 					 *
376050570efSLawrence Stewart 					 * We remeasure RTT even though we only
377050570efSLawrence Stewart 					 * have a single txsi. The rationale
378050570efSLawrence Stewart 					 * behind this is that it is better to
379050570efSLawrence Stewart 					 * have a slightly inaccurate
380050570efSLawrence Stewart 					 * measurement than no additional
381050570efSLawrence Stewart 					 * measurement for the rest of the bulk
382050570efSLawrence Stewart 					 * transfer. Since TSO is only used on
383050570efSLawrence Stewart 					 * high speed interface cards, so the
384050570efSLawrence Stewart 					 * packets should be transmitted at line
385050570efSLawrence Stewart 					 * rate back to back with little
386050570efSLawrence Stewart 					 * difference in transmission times (in
387050570efSLawrence Stewart 					 * ticks).
388050570efSLawrence Stewart 					 */
389050570efSLawrence Stewart 					txsi->seq += acked;
390050570efSLawrence Stewart 					/*
391050570efSLawrence Stewart 					 * Reset txsi measure flag so we don't
392050570efSLawrence Stewart 					 * use it for another RTT measurement.
393050570efSLawrence Stewart 					 */
394050570efSLawrence Stewart 					txsi->flags &= ~TXSI_RTT_MEASURE_START;
395050570efSLawrence Stewart 					/*
396050570efSLawrence Stewart 					 * There is still more data to be acked
397050570efSLawrence Stewart 					 * from tso bulk transmission, so we
398050570efSLawrence Stewart 					 * won't remove it from the TAILQ yet.
399050570efSLawrence Stewart 					 */
400050570efSLawrence Stewart 					break;
401050570efSLawrence Stewart 				}
4023ac12506SJonathan T. Looney 				txsi->len = 0;
403050570efSLawrence Stewart 			}
404050570efSLawrence Stewart 
405050570efSLawrence Stewart 			TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406050570efSLawrence Stewart 			uma_zfree(txseginfo_zone, txsi);
407050570efSLawrence Stewart 			break;
408050570efSLawrence Stewart 		}
409050570efSLawrence Stewart 
410050570efSLawrence Stewart 		if (measurenext) {
411050570efSLawrence Stewart 			/*
412050570efSLawrence Stewart 			 * We need to do a RTT measurement. It won't be the best
413050570efSLawrence Stewart 			 * if we do it here.
414050570efSLawrence Stewart 			 */
415050570efSLawrence Stewart 			marked_packet_rtt(txsi, e_t, tp,
416050570efSLawrence Stewart 			    &measurenext, &measurenext_len,
417050570efSLawrence Stewart 			    &rtt_bytes_adjust, FORCED_MEASUREMENT);
418050570efSLawrence Stewart 		}
419050570efSLawrence Stewart 	}
420050570efSLawrence Stewart 
421050570efSLawrence Stewart 	return (0);
422050570efSLawrence Stewart }
423050570efSLawrence Stewart 
424050570efSLawrence Stewart /*
425050570efSLawrence Stewart  * Add information about a transmitted segment to a list.
426050570efSLawrence Stewart  * This is called via the helper hook in tcp_output.c
427050570efSLawrence Stewart  */
428050570efSLawrence Stewart static int
ertt_add_tx_segment_info_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)429050570efSLawrence Stewart ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430050570efSLawrence Stewart     void *ctx_data, void *hdata, struct osd *hosd)
431050570efSLawrence Stewart {
432050570efSLawrence Stewart 	struct ertt *e_t;
433050570efSLawrence Stewart 	struct tcpcb *tp;
434050570efSLawrence Stewart 	struct tcphdr *th;
435050570efSLawrence Stewart 	struct tcpopt *to;
436050570efSLawrence Stewart 	struct tcp_hhook_data *thdp;
437050570efSLawrence Stewart 	struct txseginfo *txsi;
4383ac12506SJonathan T. Looney 	uint32_t len;
439050570efSLawrence Stewart 	int tso;
440050570efSLawrence Stewart 
441050570efSLawrence Stewart 	KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442050570efSLawrence Stewart 	KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443050570efSLawrence Stewart 
444050570efSLawrence Stewart 	e_t = (struct ertt *)hdata;
445050570efSLawrence Stewart 	thdp = ctx_data;
446050570efSLawrence Stewart 	tp = thdp->tp;
447050570efSLawrence Stewart 	th = thdp->th;
448050570efSLawrence Stewart 	to = thdp->to;
449050570efSLawrence Stewart 	len = thdp->len;
450050570efSLawrence Stewart 	tso = thdp->tso;
451050570efSLawrence Stewart 
4529eb0e832SGleb Smirnoff 	INP_WLOCK_ASSERT(tptoinpcb(tp));
453050570efSLawrence Stewart 
454050570efSLawrence Stewart 	if (len > 0) {
455050570efSLawrence Stewart 		txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456050570efSLawrence Stewart 		if (txsi != NULL) {
457050570efSLawrence Stewart 			/* Construct txsi setting the necessary flags. */
458050570efSLawrence Stewart 			txsi->flags = 0; /* Needs to be initialised. */
459050570efSLawrence Stewart 			txsi->seq = ntohl(th->th_seq);
460050570efSLawrence Stewart 			txsi->len = len;
461050570efSLawrence Stewart 			if (tso)
462050570efSLawrence Stewart 				txsi->flags |= TXSI_TSO;
463050570efSLawrence Stewart 			else if (e_t->flags & ERTT_TSO_DISABLED) {
464050570efSLawrence Stewart 				tp->t_flags |= TF_TSO;
465050570efSLawrence Stewart 				e_t->flags &= ~ERTT_TSO_DISABLED;
466050570efSLawrence Stewart 			}
467050570efSLawrence Stewart 
468050570efSLawrence Stewart 			if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469050570efSLawrence Stewart 				e_t->bytes_tx_in_rtt += len;
470050570efSLawrence Stewart 			} else {
471050570efSLawrence Stewart 				txsi->flags |= TXSI_RTT_MEASURE_START;
472050570efSLawrence Stewart 				e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473050570efSLawrence Stewart 				e_t->bytes_tx_in_rtt = len;
474050570efSLawrence Stewart 			}
475050570efSLawrence Stewart 
476050570efSLawrence Stewart 			if (((tp->t_flags & TF_NOOPT) == 0) &&
477050570efSLawrence Stewart 			    (to->to_flags & TOF_TS)) {
478050570efSLawrence Stewart 				txsi->tx_ts = ntohl(to->to_tsval) -
479050570efSLawrence Stewart 				    tp->ts_offset;
480050570efSLawrence Stewart 				txsi->rx_ts = ntohl(to->to_tsecr);
481050570efSLawrence Stewart 			} else {
482ee24d3b8SLawrence Stewart 				txsi->tx_ts = tcp_ts_getticks();
483050570efSLawrence Stewart 				txsi->rx_ts = 0; /* No received time stamp. */
484050570efSLawrence Stewart 			}
485050570efSLawrence Stewart 			TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486050570efSLawrence Stewart 		}
487050570efSLawrence Stewart 	}
488050570efSLawrence Stewart 
489050570efSLawrence Stewart 	return (0);
490050570efSLawrence Stewart }
491050570efSLawrence Stewart 
492050570efSLawrence Stewart static int
ertt_mod_init(void)493050570efSLawrence Stewart ertt_mod_init(void)
494050570efSLawrence Stewart {
495050570efSLawrence Stewart 
496050570efSLawrence Stewart 	txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497050570efSLawrence Stewart 	    NULL, NULL, NULL, NULL, 0, 0);
498050570efSLawrence Stewart 
499050570efSLawrence Stewart 	return (0);
500050570efSLawrence Stewart }
501050570efSLawrence Stewart 
502050570efSLawrence Stewart static int
ertt_mod_destroy(void)503050570efSLawrence Stewart ertt_mod_destroy(void)
504050570efSLawrence Stewart {
505050570efSLawrence Stewart 
506050570efSLawrence Stewart 	uma_zdestroy(txseginfo_zone);
507050570efSLawrence Stewart 
508050570efSLawrence Stewart 	return (0);
509050570efSLawrence Stewart }
510050570efSLawrence Stewart 
511050570efSLawrence Stewart static int
ertt_uma_ctor(void * mem,int size,void * arg,int flags)512050570efSLawrence Stewart ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513050570efSLawrence Stewart {
514050570efSLawrence Stewart 	struct ertt *e_t;
515050570efSLawrence Stewart 
516050570efSLawrence Stewart 	e_t = mem;
517050570efSLawrence Stewart 
518050570efSLawrence Stewart 	TAILQ_INIT(&e_t->txsegi_q);
519050570efSLawrence Stewart 	e_t->timestamp_errors = 0;
520050570efSLawrence Stewart 	e_t->minrtt = 0;
521050570efSLawrence Stewart 	e_t->maxrtt = 0;
522050570efSLawrence Stewart 	e_t->rtt = 0;
523050570efSLawrence Stewart 	e_t->flags = 0;
524050570efSLawrence Stewart 	e_t->dlyack_rx = 0;
525050570efSLawrence Stewart 	e_t->bytes_tx_in_rtt = 0;
526050570efSLawrence Stewart 	e_t->markedpkt_rtt = 0;
527050570efSLawrence Stewart 
528050570efSLawrence Stewart 	return (0);
529050570efSLawrence Stewart }
530050570efSLawrence Stewart 
531050570efSLawrence Stewart static void
ertt_uma_dtor(void * mem,int size,void * arg)532050570efSLawrence Stewart ertt_uma_dtor(void *mem, int size, void *arg)
533050570efSLawrence Stewart {
534050570efSLawrence Stewart 	struct ertt *e_t;
535050570efSLawrence Stewart 	struct txseginfo *n_txsi, *txsi;
536050570efSLawrence Stewart 
537050570efSLawrence Stewart 	e_t = mem;
538050570efSLawrence Stewart 	txsi = TAILQ_FIRST(&e_t->txsegi_q);
539050570efSLawrence Stewart 	while (txsi != NULL) {
540050570efSLawrence Stewart 		n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541050570efSLawrence Stewart 		uma_zfree(txseginfo_zone, txsi);
542050570efSLawrence Stewart 		txsi = n_txsi;
543050570efSLawrence Stewart 	}
544050570efSLawrence Stewart }
545050570efSLawrence Stewart 
546050570efSLawrence Stewart KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547050570efSLawrence Stewart     ertt_uma_ctor, ertt_uma_dtor);
548