1050570efSLawrence Stewart /*-
24d846d26SWarner Losh * SPDX-License-Identifier: BSD-2-Clause
3fe267a55SPedro F. Giffuni *
4050570efSLawrence Stewart * Copyright (c) 2009-2010
5050570efSLawrence Stewart * Swinburne University of Technology, Melbourne, Australia
6050570efSLawrence Stewart * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
7050570efSLawrence Stewart * Copyright (c) 2010-2011 The FreeBSD Foundation
8050570efSLawrence Stewart * All rights reserved.
9050570efSLawrence Stewart *
10050570efSLawrence Stewart * This software was developed at the Centre for Advanced Internet
11891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by David Hayes, made
12891b8ed4SLawrence Stewart * possible in part by a grant from the Cisco University Research Program Fund
13891b8ed4SLawrence Stewart * at Community Foundation Silicon Valley.
14050570efSLawrence Stewart *
15050570efSLawrence Stewart * Portions of this software were developed at the Centre for Advanced
16050570efSLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne,
17050570efSLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
18050570efSLawrence Stewart *
19050570efSLawrence Stewart * Redistribution and use in source and binary forms, with or without
20050570efSLawrence Stewart * modification, are permitted provided that the following conditions
21050570efSLawrence Stewart * are met:
22050570efSLawrence Stewart * 1. Redistributions of source code must retain the above copyright
23050570efSLawrence Stewart * notice, this list of conditions and the following disclaimer.
24050570efSLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright
25050570efSLawrence Stewart * notice, this list of conditions and the following disclaimer in the
26050570efSLawrence Stewart * documentation and/or other materials provided with the distribution.
27050570efSLawrence Stewart *
28050570efSLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
29050570efSLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30050570efSLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31050570efSLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
32050570efSLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33050570efSLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34050570efSLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35050570efSLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36050570efSLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37050570efSLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38050570efSLawrence Stewart * SUCH DAMAGE.
39050570efSLawrence Stewart */
40050570efSLawrence Stewart
41050570efSLawrence Stewart #include <sys/param.h>
42050570efSLawrence Stewart #include <sys/kernel.h>
43050570efSLawrence Stewart #include <sys/mbuf.h>
44050570efSLawrence Stewart #include <sys/module.h>
45050570efSLawrence Stewart #include <sys/hhook.h>
46050570efSLawrence Stewart #include <sys/khelp.h>
47050570efSLawrence Stewart #include <sys/module_khelp.h>
48050570efSLawrence Stewart #include <sys/socket.h>
49050570efSLawrence Stewart #include <sys/sockopt.h>
50050570efSLawrence Stewart
51050570efSLawrence Stewart #include <net/vnet.h>
52050570efSLawrence Stewart
53050570efSLawrence Stewart #include <netinet/in.h>
54050570efSLawrence Stewart #include <netinet/in_pcb.h>
55050570efSLawrence Stewart #include <netinet/tcp_seq.h>
56050570efSLawrence Stewart #include <netinet/tcp_var.h>
57050570efSLawrence Stewart
58050570efSLawrence Stewart #include <netinet/khelp/h_ertt.h>
59050570efSLawrence Stewart
60050570efSLawrence Stewart #include <vm/uma.h>
61050570efSLawrence Stewart
62050570efSLawrence Stewart uma_zone_t txseginfo_zone;
63050570efSLawrence Stewart
64050570efSLawrence Stewart /* Smoothing factor for delayed ack guess. */
65050570efSLawrence Stewart #define DLYACK_SMOOTH 5
66050570efSLawrence Stewart
67050570efSLawrence Stewart /* Max number of time stamp errors allowed in a session. */
68050570efSLawrence Stewart #define MAX_TS_ERR 10
69050570efSLawrence Stewart
70050570efSLawrence Stewart static int ertt_packet_measurement_hook(int hhook_type, int hhook_id,
71050570efSLawrence Stewart void *udata, void *ctx_data, void *hdata, struct osd *hosd);
72050570efSLawrence Stewart static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id,
73050570efSLawrence Stewart void *udata, void *ctx_data, void *hdata, struct osd *hosd);
74050570efSLawrence Stewart static int ertt_mod_init(void);
75050570efSLawrence Stewart static int ertt_mod_destroy(void);
76050570efSLawrence Stewart static int ertt_uma_ctor(void *mem, int size, void *arg, int flags);
77050570efSLawrence Stewart static void ertt_uma_dtor(void *mem, int size, void *arg);
78050570efSLawrence Stewart
79050570efSLawrence Stewart /*
80050570efSLawrence Stewart * Contains information about the sent segment for comparison with the
81050570efSLawrence Stewart * corresponding ack.
82050570efSLawrence Stewart */
83050570efSLawrence Stewart struct txseginfo {
84050570efSLawrence Stewart /* Segment length. */
853ac12506SJonathan T. Looney uint32_t len;
86050570efSLawrence Stewart /* Segment sequence number. */
87050570efSLawrence Stewart tcp_seq seq;
88050570efSLawrence Stewart /* Time stamp indicating when the packet was sent. */
89050570efSLawrence Stewart uint32_t tx_ts;
90050570efSLawrence Stewart /* Last received receiver ts (if the TCP option is used). */
91050570efSLawrence Stewart uint32_t rx_ts;
92050570efSLawrence Stewart uint32_t flags;
93050570efSLawrence Stewart TAILQ_ENTRY (txseginfo) txsegi_lnk;
94050570efSLawrence Stewart };
95050570efSLawrence Stewart
/* Flags for struct txseginfo. */
#define	TXSI_TSO		0x01 /* TSO was used for this entry. */
#define	TXSI_RTT_MEASURE_START	0x02 /* Start a per RTT measurement. */
#define	TXSI_RX_MEASURE_END	0x04 /* Measure the rx rate until this txsi. */
100050570efSLawrence Stewart
101050570efSLawrence Stewart struct helper ertt_helper = {
102050570efSLawrence Stewart .mod_init = ertt_mod_init,
103050570efSLawrence Stewart .mod_destroy = ertt_mod_destroy,
104050570efSLawrence Stewart .h_flags = HELPER_NEEDS_OSD,
105050570efSLawrence Stewart .h_classes = HELPER_CLASS_TCP
106050570efSLawrence Stewart };
107050570efSLawrence Stewart
108050570efSLawrence Stewart /* Define the helper hook info required by ERTT. */
109050570efSLawrence Stewart struct hookinfo ertt_hooks[] = {
110050570efSLawrence Stewart {
111050570efSLawrence Stewart .hook_type = HHOOK_TYPE_TCP,
112050570efSLawrence Stewart .hook_id = HHOOK_TCP_EST_IN,
113050570efSLawrence Stewart .hook_udata = NULL,
114050570efSLawrence Stewart .hook_func = &ertt_packet_measurement_hook
115050570efSLawrence Stewart },
116050570efSLawrence Stewart {
117050570efSLawrence Stewart .hook_type = HHOOK_TYPE_TCP,
118050570efSLawrence Stewart .hook_id = HHOOK_TCP_EST_OUT,
119050570efSLawrence Stewart .hook_udata = NULL,
120050570efSLawrence Stewart .hook_func = &ertt_add_tx_segment_info_hook
121050570efSLawrence Stewart }
122050570efSLawrence Stewart };
123050570efSLawrence Stewart
/* Flags to indicate how marked_packet_rtt should handle this txsi. */
#define	MULTI_ACK		0x01 /* More than this txsi is acked. */
#define	OLD_TXSI		0x02 /* TXSI is old according to timestamps. */
#define	CORRECT_ACK		0x04 /* Acks this TXSI. */
#define	FORCED_MEASUREMENT	0x08 /* Force an RTT measurement. */
129050570efSLawrence Stewart
130050570efSLawrence Stewart /*
131050570efSLawrence Stewart * This fuction measures the RTT of a particular segment/ack pair, or the next
132050570efSLawrence Stewart * closest if this will yield an inaccurate result due to delayed acking or
133050570efSLawrence Stewart * other issues.
134050570efSLawrence Stewart */
135050570efSLawrence Stewart static void inline
marked_packet_rtt(struct txseginfo * txsi,struct ertt * e_t,struct tcpcb * tp,uint32_t * pmeasurenext,int * pmeasurenext_len,int * prtt_bytes_adjust,int mflag)136050570efSLawrence Stewart marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp,
137050570efSLawrence Stewart uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust,
138050570efSLawrence Stewart int mflag)
139050570efSLawrence Stewart {
140050570efSLawrence Stewart
141050570efSLawrence Stewart /*
142050570efSLawrence Stewart * If we can't measure this one properly due to delayed acking adjust
143050570efSLawrence Stewart * byte counters and flag to measure next txsi. Note that since the
144050570efSLawrence Stewart * marked packet's transmitted bytes are measured we need to subtract the
145050570efSLawrence Stewart * transmitted bytes. Then pretend the next txsi was marked.
146050570efSLawrence Stewart */
147050570efSLawrence Stewart if (mflag & (MULTI_ACK|OLD_TXSI)) {
148050570efSLawrence Stewart *pmeasurenext = txsi->tx_ts;
149050570efSLawrence Stewart *pmeasurenext_len = txsi->len;
150050570efSLawrence Stewart *prtt_bytes_adjust += *pmeasurenext_len;
151050570efSLawrence Stewart } else {
152050570efSLawrence Stewart if (mflag & FORCED_MEASUREMENT) {
153ee24d3b8SLawrence Stewart e_t->markedpkt_rtt = tcp_ts_getticks() -
154ee24d3b8SLawrence Stewart *pmeasurenext + 1;
155050570efSLawrence Stewart e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
156050570efSLawrence Stewart *pmeasurenext_len - *prtt_bytes_adjust;
157050570efSLawrence Stewart } else {
158ee24d3b8SLawrence Stewart e_t->markedpkt_rtt = tcp_ts_getticks() -
159ee24d3b8SLawrence Stewart txsi->tx_ts + 1;
160050570efSLawrence Stewart e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
161050570efSLawrence Stewart *prtt_bytes_adjust;
162050570efSLawrence Stewart }
163050570efSLawrence Stewart e_t->marked_snd_cwnd = tp->snd_cwnd;
164050570efSLawrence Stewart
165050570efSLawrence Stewart /*
166050570efSLawrence Stewart * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to
167050570efSLawrence Stewart * add_tx_segment_info that a new measurement should be started.
168050570efSLawrence Stewart */
169050570efSLawrence Stewart e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS;
170050570efSLawrence Stewart /*
171050570efSLawrence Stewart * Set ERTT_NEW_MEASUREMENT to tell the congestion control
172050570efSLawrence Stewart * algorithm that a new marked RTT measurement has has been made
173050570efSLawrence Stewart * and is available for use.
174050570efSLawrence Stewart */
175050570efSLawrence Stewart e_t->flags |= ERTT_NEW_MEASUREMENT;
176050570efSLawrence Stewart
177050570efSLawrence Stewart if (tp->t_flags & TF_TSO) {
1788d30ef92SGordon Bergling /* Temporarily disable TSO to aid a new measurement. */
179050570efSLawrence Stewart tp->t_flags &= ~TF_TSO;
180050570efSLawrence Stewart /* Keep track that we've disabled it. */
181050570efSLawrence Stewart e_t->flags |= ERTT_TSO_DISABLED;
182050570efSLawrence Stewart }
183050570efSLawrence Stewart }
184050570efSLawrence Stewart }
185050570efSLawrence Stewart
186050570efSLawrence Stewart /*
187050570efSLawrence Stewart * Ertt_packet_measurements uses a small amount of state kept on each packet
188050570efSLawrence Stewart * sent to match incoming acknowledgements. This enables more accurate and
189050570efSLawrence Stewart * secure round trip time measurements. The resulting measurement is used for
190050570efSLawrence Stewart * congestion control algorithms which require a more accurate time.
191050570efSLawrence Stewart * Ertt_packet_measurements is called via the helper hook in tcp_input.c
192050570efSLawrence Stewart */
193050570efSLawrence Stewart static int
ertt_packet_measurement_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)194050570efSLawrence Stewart ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata,
195050570efSLawrence Stewart void *ctx_data, void *hdata, struct osd *hosd)
196050570efSLawrence Stewart {
197050570efSLawrence Stewart struct ertt *e_t;
198050570efSLawrence Stewart struct tcpcb *tp;
199050570efSLawrence Stewart struct tcphdr *th;
200050570efSLawrence Stewart struct tcpopt *to;
201050570efSLawrence Stewart struct tcp_hhook_data *thdp;
202050570efSLawrence Stewart struct txseginfo *txsi;
203050570efSLawrence Stewart int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust;
204050570efSLawrence Stewart uint32_t measurenext, rts;
205050570efSLawrence Stewart tcp_seq ack;
206050570efSLawrence Stewart
207050570efSLawrence Stewart KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
208050570efSLawrence Stewart KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
209050570efSLawrence Stewart
210050570efSLawrence Stewart e_t = (struct ertt *)hdata;
211050570efSLawrence Stewart thdp = ctx_data;
212050570efSLawrence Stewart tp = thdp->tp;
213050570efSLawrence Stewart th = thdp->th;
214050570efSLawrence Stewart to = thdp->to;
215050570efSLawrence Stewart new_sacked_bytes = (tp->sackhint.last_sack_ack != 0);
216050570efSLawrence Stewart measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0;
217050570efSLawrence Stewart acked = th->th_ack - tp->snd_una;
218050570efSLawrence Stewart
2199eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(tptoinpcb(tp));
220050570efSLawrence Stewart
221050570efSLawrence Stewart /* Packet has provided new acknowledgements. */
222050570efSLawrence Stewart if (acked > 0 || new_sacked_bytes) {
223050570efSLawrence Stewart if (acked == 0 && new_sacked_bytes) {
224050570efSLawrence Stewart /* Use last sacked data. */
225050570efSLawrence Stewart ack = tp->sackhint.last_sack_ack;
226050570efSLawrence Stewart } else
227050570efSLawrence Stewart ack = th->th_ack;
228050570efSLawrence Stewart
229050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q);
230050570efSLawrence Stewart while (txsi != NULL) {
231050570efSLawrence Stewart rts = 0;
232050570efSLawrence Stewart
233050570efSLawrence Stewart /* Acknowledgement is acking more than this txsi. */
234050570efSLawrence Stewart if (SEQ_GT(ack, txsi->seq + txsi->len)) {
235050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START ||
236050570efSLawrence Stewart measurenext) {
237050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp,
238050570efSLawrence Stewart &measurenext, &measurenext_len,
239050570efSLawrence Stewart &rtt_bytes_adjust, MULTI_ACK);
240050570efSLawrence Stewart }
241050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
242050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi);
243050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q);
244050570efSLawrence Stewart continue;
245050570efSLawrence Stewart }
246050570efSLawrence Stewart
247050570efSLawrence Stewart /*
248050570efSLawrence Stewart * Guess if delayed acks are being used by the receiver.
249050570efSLawrence Stewart *
250050570efSLawrence Stewart * XXXDH: A simple heuristic that could be improved
251050570efSLawrence Stewart */
252050570efSLawrence Stewart if (!new_sacked_bytes) {
253050570efSLawrence Stewart if (acked > tp->t_maxseg) {
254050570efSLawrence Stewart e_t->dlyack_rx +=
255050570efSLawrence Stewart (e_t->dlyack_rx < DLYACK_SMOOTH) ?
256050570efSLawrence Stewart 1 : 0;
257050570efSLawrence Stewart multiack = 1;
258050570efSLawrence Stewart } else if (acked > txsi->len) {
259050570efSLawrence Stewart multiack = 1;
260050570efSLawrence Stewart e_t->dlyack_rx +=
261050570efSLawrence Stewart (e_t->dlyack_rx < DLYACK_SMOOTH) ?
262050570efSLawrence Stewart 1 : 0;
263050570efSLawrence Stewart } else if (acked == tp->t_maxseg ||
264050570efSLawrence Stewart acked == txsi->len) {
265050570efSLawrence Stewart e_t->dlyack_rx -=
266050570efSLawrence Stewart (e_t->dlyack_rx > 0) ? 1 : 0;
267050570efSLawrence Stewart }
268050570efSLawrence Stewart /* Otherwise leave dlyack_rx the way it was. */
269050570efSLawrence Stewart }
270050570efSLawrence Stewart
271050570efSLawrence Stewart /*
272050570efSLawrence Stewart * Time stamps are only to help match the txsi with the
273050570efSLawrence Stewart * received acknowledgements.
274050570efSLawrence Stewart */
275050570efSLawrence Stewart if (e_t->timestamp_errors < MAX_TS_ERR &&
276050570efSLawrence Stewart (to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
277050570efSLawrence Stewart /*
278050570efSLawrence Stewart * Note: All packets sent with the offload will
279050570efSLawrence Stewart * have the same time stamp. If we are sending
280050570efSLawrence Stewart * on a fast interface and the t_maxseg is much
281050570efSLawrence Stewart * smaller than one tick, this will be fine. The
282050570efSLawrence Stewart * time stamp would be the same whether we were
283050570efSLawrence Stewart * using tso or not. However, if the interface
284050570efSLawrence Stewart * is slow, this will cause problems with the
285050570efSLawrence Stewart * calculations. If the interface is slow, there
286050570efSLawrence Stewart * is not reason to be using tso, and it should
287050570efSLawrence Stewart * be turned off.
288050570efSLawrence Stewart */
289050570efSLawrence Stewart /*
290050570efSLawrence Stewart * If there are too many time stamp errors, time
291050570efSLawrence Stewart * stamps won't be trusted
292050570efSLawrence Stewart */
293050570efSLawrence Stewart rts = to->to_tsecr;
294050570efSLawrence Stewart /* Before this packet. */
295050570efSLawrence Stewart if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts))
296050570efSLawrence Stewart /* When delayed acking is used, the
297050570efSLawrence Stewart * reflected time stamp is of the first
298050570efSLawrence Stewart * packet and thus may be before
299050570efSLawrence Stewart * txsi->tx_ts.
300050570efSLawrence Stewart */
301050570efSLawrence Stewart break;
302050570efSLawrence Stewart if (TSTMP_GT(rts, txsi->tx_ts)) {
303050570efSLawrence Stewart /*
304050570efSLawrence Stewart * If reflected time stamp is later than
305050570efSLawrence Stewart * tx_tsi, then this txsi is old.
306050570efSLawrence Stewart */
307050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START
308050570efSLawrence Stewart || measurenext) {
309050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp,
310050570efSLawrence Stewart &measurenext, &measurenext_len,
311050570efSLawrence Stewart &rtt_bytes_adjust, OLD_TXSI);
312050570efSLawrence Stewart }
313050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi,
314050570efSLawrence Stewart txsegi_lnk);
315050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi);
316050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q);
317050570efSLawrence Stewart continue;
318050570efSLawrence Stewart }
319050570efSLawrence Stewart if (rts == txsi->tx_ts &&
320050570efSLawrence Stewart TSTMP_LT(to->to_tsval, txsi->rx_ts)) {
321050570efSLawrence Stewart /*
322050570efSLawrence Stewart * Segment received before sent!
323050570efSLawrence Stewart * Something is wrong with the received
324050570efSLawrence Stewart * timestamps so increment errors. If
325050570efSLawrence Stewart * this keeps up we will ignore
326050570efSLawrence Stewart * timestamps.
327050570efSLawrence Stewart */
328050570efSLawrence Stewart e_t->timestamp_errors++;
329050570efSLawrence Stewart }
330050570efSLawrence Stewart }
331050570efSLawrence Stewart /*
332050570efSLawrence Stewart * Acknowledging a sequence number before this txsi.
333050570efSLawrence Stewart * If it is an old txsi that may have had the same seq
334050570efSLawrence Stewart * numbers, it should have been removed if time stamps
335050570efSLawrence Stewart * are being used.
336050570efSLawrence Stewart */
337050570efSLawrence Stewart if (SEQ_LEQ(ack, txsi->seq))
338050570efSLawrence Stewart break; /* Before first packet in txsi. */
339050570efSLawrence Stewart
340050570efSLawrence Stewart /*
341050570efSLawrence Stewart * Only ack > txsi->seq and ack <= txsi->seq+txsi->len
342050570efSLawrence Stewart * past this point.
343050570efSLawrence Stewart *
344050570efSLawrence Stewart * If delayed acks are being used, an acknowledgement
345050570efSLawrence Stewart * for a single segment will have been delayed by the
346050570efSLawrence Stewart * receiver and will yield an inaccurate measurement. In
347050570efSLawrence Stewart * this case, we only make the measurement if more than
348050570efSLawrence Stewart * one segment is being acknowledged or sack is
349050570efSLawrence Stewart * currently being used.
350050570efSLawrence Stewart */
351050570efSLawrence Stewart if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
352050570efSLawrence Stewart /* Make an accurate new measurement. */
353ee24d3b8SLawrence Stewart e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
354050570efSLawrence Stewart
355050570efSLawrence Stewart if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
356050570efSLawrence Stewart e_t->minrtt = e_t->rtt;
357050570efSLawrence Stewart
358050570efSLawrence Stewart if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0)
359050570efSLawrence Stewart e_t->maxrtt = e_t->rtt;
360050570efSLawrence Stewart }
361050570efSLawrence Stewart
362050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext)
363050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp,
364050570efSLawrence Stewart &measurenext, &measurenext_len,
365050570efSLawrence Stewart &rtt_bytes_adjust, CORRECT_ACK);
366050570efSLawrence Stewart
367050570efSLawrence Stewart if (txsi->flags & TXSI_TSO) {
3683ac12506SJonathan T. Looney if (txsi->len > acked) {
369050570efSLawrence Stewart txsi->len -= acked;
370050570efSLawrence Stewart /*
371050570efSLawrence Stewart * This presumes ack for first bytes in
372050570efSLawrence Stewart * txsi, this may not be true but it
373050570efSLawrence Stewart * shouldn't cause problems for the
374050570efSLawrence Stewart * timing.
375050570efSLawrence Stewart *
376050570efSLawrence Stewart * We remeasure RTT even though we only
377050570efSLawrence Stewart * have a single txsi. The rationale
378050570efSLawrence Stewart * behind this is that it is better to
379050570efSLawrence Stewart * have a slightly inaccurate
380050570efSLawrence Stewart * measurement than no additional
381050570efSLawrence Stewart * measurement for the rest of the bulk
382050570efSLawrence Stewart * transfer. Since TSO is only used on
383050570efSLawrence Stewart * high speed interface cards, so the
384050570efSLawrence Stewart * packets should be transmitted at line
385050570efSLawrence Stewart * rate back to back with little
386050570efSLawrence Stewart * difference in transmission times (in
387050570efSLawrence Stewart * ticks).
388050570efSLawrence Stewart */
389050570efSLawrence Stewart txsi->seq += acked;
390050570efSLawrence Stewart /*
391050570efSLawrence Stewart * Reset txsi measure flag so we don't
392050570efSLawrence Stewart * use it for another RTT measurement.
393050570efSLawrence Stewart */
394050570efSLawrence Stewart txsi->flags &= ~TXSI_RTT_MEASURE_START;
395050570efSLawrence Stewart /*
396050570efSLawrence Stewart * There is still more data to be acked
397050570efSLawrence Stewart * from tso bulk transmission, so we
398050570efSLawrence Stewart * won't remove it from the TAILQ yet.
399050570efSLawrence Stewart */
400050570efSLawrence Stewart break;
401050570efSLawrence Stewart }
4023ac12506SJonathan T. Looney txsi->len = 0;
403050570efSLawrence Stewart }
404050570efSLawrence Stewart
405050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk);
406050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi);
407050570efSLawrence Stewart break;
408050570efSLawrence Stewart }
409050570efSLawrence Stewart
410050570efSLawrence Stewart if (measurenext) {
411050570efSLawrence Stewart /*
412050570efSLawrence Stewart * We need to do a RTT measurement. It won't be the best
413050570efSLawrence Stewart * if we do it here.
414050570efSLawrence Stewart */
415050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp,
416050570efSLawrence Stewart &measurenext, &measurenext_len,
417050570efSLawrence Stewart &rtt_bytes_adjust, FORCED_MEASUREMENT);
418050570efSLawrence Stewart }
419050570efSLawrence Stewart }
420050570efSLawrence Stewart
421050570efSLawrence Stewart return (0);
422050570efSLawrence Stewart }
423050570efSLawrence Stewart
424050570efSLawrence Stewart /*
425050570efSLawrence Stewart * Add information about a transmitted segment to a list.
426050570efSLawrence Stewart * This is called via the helper hook in tcp_output.c
427050570efSLawrence Stewart */
428050570efSLawrence Stewart static int
ertt_add_tx_segment_info_hook(int hhook_type,int hhook_id,void * udata,void * ctx_data,void * hdata,struct osd * hosd)429050570efSLawrence Stewart ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata,
430050570efSLawrence Stewart void *ctx_data, void *hdata, struct osd *hosd)
431050570efSLawrence Stewart {
432050570efSLawrence Stewart struct ertt *e_t;
433050570efSLawrence Stewart struct tcpcb *tp;
434050570efSLawrence Stewart struct tcphdr *th;
435050570efSLawrence Stewart struct tcpopt *to;
436050570efSLawrence Stewart struct tcp_hhook_data *thdp;
437050570efSLawrence Stewart struct txseginfo *txsi;
4383ac12506SJonathan T. Looney uint32_t len;
439050570efSLawrence Stewart int tso;
440050570efSLawrence Stewart
441050570efSLawrence Stewart KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__));
442050570efSLawrence Stewart KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__));
443050570efSLawrence Stewart
444050570efSLawrence Stewart e_t = (struct ertt *)hdata;
445050570efSLawrence Stewart thdp = ctx_data;
446050570efSLawrence Stewart tp = thdp->tp;
447050570efSLawrence Stewart th = thdp->th;
448050570efSLawrence Stewart to = thdp->to;
449050570efSLawrence Stewart len = thdp->len;
450050570efSLawrence Stewart tso = thdp->tso;
451050570efSLawrence Stewart
4529eb0e832SGleb Smirnoff INP_WLOCK_ASSERT(tptoinpcb(tp));
453050570efSLawrence Stewart
454050570efSLawrence Stewart if (len > 0) {
455050570efSLawrence Stewart txsi = uma_zalloc(txseginfo_zone, M_NOWAIT);
456050570efSLawrence Stewart if (txsi != NULL) {
457050570efSLawrence Stewart /* Construct txsi setting the necessary flags. */
458050570efSLawrence Stewart txsi->flags = 0; /* Needs to be initialised. */
459050570efSLawrence Stewart txsi->seq = ntohl(th->th_seq);
460050570efSLawrence Stewart txsi->len = len;
461050570efSLawrence Stewart if (tso)
462050570efSLawrence Stewart txsi->flags |= TXSI_TSO;
463050570efSLawrence Stewart else if (e_t->flags & ERTT_TSO_DISABLED) {
464050570efSLawrence Stewart tp->t_flags |= TF_TSO;
465050570efSLawrence Stewart e_t->flags &= ~ERTT_TSO_DISABLED;
466050570efSLawrence Stewart }
467050570efSLawrence Stewart
468050570efSLawrence Stewart if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) {
469050570efSLawrence Stewart e_t->bytes_tx_in_rtt += len;
470050570efSLawrence Stewart } else {
471050570efSLawrence Stewart txsi->flags |= TXSI_RTT_MEASURE_START;
472050570efSLawrence Stewart e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS;
473050570efSLawrence Stewart e_t->bytes_tx_in_rtt = len;
474050570efSLawrence Stewart }
475050570efSLawrence Stewart
476050570efSLawrence Stewart if (((tp->t_flags & TF_NOOPT) == 0) &&
477050570efSLawrence Stewart (to->to_flags & TOF_TS)) {
478050570efSLawrence Stewart txsi->tx_ts = ntohl(to->to_tsval) -
479050570efSLawrence Stewart tp->ts_offset;
480050570efSLawrence Stewart txsi->rx_ts = ntohl(to->to_tsecr);
481050570efSLawrence Stewart } else {
482ee24d3b8SLawrence Stewart txsi->tx_ts = tcp_ts_getticks();
483050570efSLawrence Stewart txsi->rx_ts = 0; /* No received time stamp. */
484050570efSLawrence Stewart }
485050570efSLawrence Stewart TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
486050570efSLawrence Stewart }
487050570efSLawrence Stewart }
488050570efSLawrence Stewart
489050570efSLawrence Stewart return (0);
490050570efSLawrence Stewart }
491050570efSLawrence Stewart
492050570efSLawrence Stewart static int
ertt_mod_init(void)493050570efSLawrence Stewart ertt_mod_init(void)
494050570efSLawrence Stewart {
495050570efSLawrence Stewart
496050570efSLawrence Stewart txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo),
497050570efSLawrence Stewart NULL, NULL, NULL, NULL, 0, 0);
498050570efSLawrence Stewart
499050570efSLawrence Stewart return (0);
500050570efSLawrence Stewart }
501050570efSLawrence Stewart
502050570efSLawrence Stewart static int
ertt_mod_destroy(void)503050570efSLawrence Stewart ertt_mod_destroy(void)
504050570efSLawrence Stewart {
505050570efSLawrence Stewart
506050570efSLawrence Stewart uma_zdestroy(txseginfo_zone);
507050570efSLawrence Stewart
508050570efSLawrence Stewart return (0);
509050570efSLawrence Stewart }
510050570efSLawrence Stewart
511050570efSLawrence Stewart static int
ertt_uma_ctor(void * mem,int size,void * arg,int flags)512050570efSLawrence Stewart ertt_uma_ctor(void *mem, int size, void *arg, int flags)
513050570efSLawrence Stewart {
514050570efSLawrence Stewart struct ertt *e_t;
515050570efSLawrence Stewart
516050570efSLawrence Stewart e_t = mem;
517050570efSLawrence Stewart
518050570efSLawrence Stewart TAILQ_INIT(&e_t->txsegi_q);
519050570efSLawrence Stewart e_t->timestamp_errors = 0;
520050570efSLawrence Stewart e_t->minrtt = 0;
521050570efSLawrence Stewart e_t->maxrtt = 0;
522050570efSLawrence Stewart e_t->rtt = 0;
523050570efSLawrence Stewart e_t->flags = 0;
524050570efSLawrence Stewart e_t->dlyack_rx = 0;
525050570efSLawrence Stewart e_t->bytes_tx_in_rtt = 0;
526050570efSLawrence Stewart e_t->markedpkt_rtt = 0;
527050570efSLawrence Stewart
528050570efSLawrence Stewart return (0);
529050570efSLawrence Stewart }
530050570efSLawrence Stewart
531050570efSLawrence Stewart static void
ertt_uma_dtor(void * mem,int size,void * arg)532050570efSLawrence Stewart ertt_uma_dtor(void *mem, int size, void *arg)
533050570efSLawrence Stewart {
534050570efSLawrence Stewart struct ertt *e_t;
535050570efSLawrence Stewart struct txseginfo *n_txsi, *txsi;
536050570efSLawrence Stewart
537050570efSLawrence Stewart e_t = mem;
538050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q);
539050570efSLawrence Stewart while (txsi != NULL) {
540050570efSLawrence Stewart n_txsi = TAILQ_NEXT(txsi, txsegi_lnk);
541050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi);
542050570efSLawrence Stewart txsi = n_txsi;
543050570efSLawrence Stewart }
544050570efSLawrence Stewart }
545050570efSLawrence Stewart
546050570efSLawrence Stewart KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt),
547050570efSLawrence Stewart ertt_uma_ctor, ertt_uma_dtor);
548