1050570efSLawrence Stewart /*- 2050570efSLawrence Stewart * Copyright (c) 2009-2010 3050570efSLawrence Stewart * Swinburne University of Technology, Melbourne, Australia 4050570efSLawrence Stewart * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org> 5050570efSLawrence Stewart * Copyright (c) 2010-2011 The FreeBSD Foundation 6050570efSLawrence Stewart * All rights reserved. 7050570efSLawrence Stewart * 8050570efSLawrence Stewart * This software was developed at the Centre for Advanced Internet 9891b8ed4SLawrence Stewart * Architectures, Swinburne University of Technology, by David Hayes, made 10891b8ed4SLawrence Stewart * possible in part by a grant from the Cisco University Research Program Fund 11891b8ed4SLawrence Stewart * at Community Foundation Silicon Valley. 12050570efSLawrence Stewart * 13050570efSLawrence Stewart * Portions of this software were developed at the Centre for Advanced 14050570efSLawrence Stewart * Internet Architectures, Swinburne University of Technology, Melbourne, 15050570efSLawrence Stewart * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 16050570efSLawrence Stewart * 17050570efSLawrence Stewart * Redistribution and use in source and binary forms, with or without 18050570efSLawrence Stewart * modification, are permitted provided that the following conditions 19050570efSLawrence Stewart * are met: 20050570efSLawrence Stewart * 1. Redistributions of source code must retain the above copyright 21050570efSLawrence Stewart * notice, this list of conditions and the following disclaimer. 22050570efSLawrence Stewart * 2. Redistributions in binary form must reproduce the above copyright 23050570efSLawrence Stewart * notice, this list of conditions and the following disclaimer in the 24050570efSLawrence Stewart * documentation and/or other materials provided with the distribution. 25050570efSLawrence Stewart * 26050570efSLawrence Stewart * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 27050570efSLawrence Stewart * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28050570efSLawrence Stewart * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29050570efSLawrence Stewart * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 30050570efSLawrence Stewart * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31050570efSLawrence Stewart * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32050570efSLawrence Stewart * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33050570efSLawrence Stewart * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34050570efSLawrence Stewart * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35050570efSLawrence Stewart * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36050570efSLawrence Stewart * SUCH DAMAGE. 37050570efSLawrence Stewart */ 38050570efSLawrence Stewart 39050570efSLawrence Stewart #include <sys/cdefs.h> 40050570efSLawrence Stewart __FBSDID("$FreeBSD$"); 41050570efSLawrence Stewart 42050570efSLawrence Stewart #include <sys/param.h> 43050570efSLawrence Stewart #include <sys/kernel.h> 44050570efSLawrence Stewart #include <sys/mbuf.h> 45050570efSLawrence Stewart #include <sys/module.h> 46050570efSLawrence Stewart #include <sys/hhook.h> 47050570efSLawrence Stewart #include <sys/khelp.h> 48050570efSLawrence Stewart #include <sys/module_khelp.h> 49050570efSLawrence Stewart #include <sys/socket.h> 50050570efSLawrence Stewart #include <sys/sockopt.h> 51050570efSLawrence Stewart 52050570efSLawrence Stewart #include <net/vnet.h> 53050570efSLawrence Stewart 54050570efSLawrence Stewart #include <netinet/in.h> 55050570efSLawrence Stewart #include <netinet/in_pcb.h> 56050570efSLawrence Stewart #include <netinet/tcp_seq.h> 57050570efSLawrence Stewart #include <netinet/tcp_var.h> 58050570efSLawrence Stewart 59050570efSLawrence Stewart #include <netinet/khelp/h_ertt.h> 60050570efSLawrence Stewart 61050570efSLawrence Stewart #include <vm/uma.h> 62050570efSLawrence Stewart 63050570efSLawrence Stewart uma_zone_t txseginfo_zone; 64050570efSLawrence Stewart 65050570efSLawrence Stewart /* Smoothing factor for delayed ack guess. */ 66050570efSLawrence Stewart #define DLYACK_SMOOTH 5 67050570efSLawrence Stewart 68050570efSLawrence Stewart /* Max number of time stamp errors allowed in a session. */ 69050570efSLawrence Stewart #define MAX_TS_ERR 10 70050570efSLawrence Stewart 71050570efSLawrence Stewart static int ertt_packet_measurement_hook(int hhook_type, int hhook_id, 72050570efSLawrence Stewart void *udata, void *ctx_data, void *hdata, struct osd *hosd); 73050570efSLawrence Stewart static int ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, 74050570efSLawrence Stewart void *udata, void *ctx_data, void *hdata, struct osd *hosd); 75050570efSLawrence Stewart static int ertt_mod_init(void); 76050570efSLawrence Stewart static int ertt_mod_destroy(void); 77050570efSLawrence Stewart static int ertt_uma_ctor(void *mem, int size, void *arg, int flags); 78050570efSLawrence Stewart static void ertt_uma_dtor(void *mem, int size, void *arg); 79050570efSLawrence Stewart 80050570efSLawrence Stewart /* 81050570efSLawrence Stewart * Contains information about the sent segment for comparison with the 82050570efSLawrence Stewart * corresponding ack. 83050570efSLawrence Stewart */ 84050570efSLawrence Stewart struct txseginfo { 85050570efSLawrence Stewart /* Segment length. */ 863ac12506SJonathan T. Looney uint32_t len; 87050570efSLawrence Stewart /* Segment sequence number. */ 88050570efSLawrence Stewart tcp_seq seq; 89050570efSLawrence Stewart /* Time stamp indicating when the packet was sent. */ 90050570efSLawrence Stewart uint32_t tx_ts; 91050570efSLawrence Stewart /* Last received receiver ts (if the TCP option is used). */ 92050570efSLawrence Stewart uint32_t rx_ts; 93050570efSLawrence Stewart uint32_t flags; 94050570efSLawrence Stewart TAILQ_ENTRY (txseginfo) txsegi_lnk; 95050570efSLawrence Stewart }; 96050570efSLawrence Stewart 97050570efSLawrence Stewart /* Flags for struct txseginfo. */ 98050570efSLawrence Stewart #define TXSI_TSO 0x01 /* TSO was used for this entry. */ 99050570efSLawrence Stewart #define TXSI_RTT_MEASURE_START 0x02 /* Start a per RTT measurement. */ 100050570efSLawrence Stewart #define TXSI_RX_MEASURE_END 0x04 /* Measure the rx rate until this txsi. */ 101050570efSLawrence Stewart 102050570efSLawrence Stewart struct helper ertt_helper = { 103050570efSLawrence Stewart .mod_init = ertt_mod_init, 104050570efSLawrence Stewart .mod_destroy = ertt_mod_destroy, 105050570efSLawrence Stewart .h_flags = HELPER_NEEDS_OSD, 106050570efSLawrence Stewart .h_classes = HELPER_CLASS_TCP 107050570efSLawrence Stewart }; 108050570efSLawrence Stewart 109050570efSLawrence Stewart /* Define the helper hook info required by ERTT. */ 110050570efSLawrence Stewart struct hookinfo ertt_hooks[] = { 111050570efSLawrence Stewart { 112050570efSLawrence Stewart .hook_type = HHOOK_TYPE_TCP, 113050570efSLawrence Stewart .hook_id = HHOOK_TCP_EST_IN, 114050570efSLawrence Stewart .hook_udata = NULL, 115050570efSLawrence Stewart .hook_func = &ertt_packet_measurement_hook 116050570efSLawrence Stewart }, 117050570efSLawrence Stewart { 118050570efSLawrence Stewart .hook_type = HHOOK_TYPE_TCP, 119050570efSLawrence Stewart .hook_id = HHOOK_TCP_EST_OUT, 120050570efSLawrence Stewart .hook_udata = NULL, 121050570efSLawrence Stewart .hook_func = &ertt_add_tx_segment_info_hook 122050570efSLawrence Stewart } 123050570efSLawrence Stewart }; 124050570efSLawrence Stewart 125050570efSLawrence Stewart /* Flags to indicate how marked_packet_rtt should handle this txsi. */ 126050570efSLawrence Stewart #define MULTI_ACK 0x01 /* More than this txsi is acked. */ 127050570efSLawrence Stewart #define OLD_TXSI 0x02 /* TXSI is old according to timestamps. */ 128050570efSLawrence Stewart #define CORRECT_ACK 0X04 /* Acks this TXSI. */ 129050570efSLawrence Stewart #define FORCED_MEASUREMENT 0X08 /* Force an RTT measurement. */ 130050570efSLawrence Stewart 131050570efSLawrence Stewart /* 132050570efSLawrence Stewart * This fuction measures the RTT of a particular segment/ack pair, or the next 133050570efSLawrence Stewart * closest if this will yield an inaccurate result due to delayed acking or 134050570efSLawrence Stewart * other issues. 135050570efSLawrence Stewart */ 136050570efSLawrence Stewart static void inline 137050570efSLawrence Stewart marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, 138050570efSLawrence Stewart uint32_t *pmeasurenext, int *pmeasurenext_len, int *prtt_bytes_adjust, 139050570efSLawrence Stewart int mflag) 140050570efSLawrence Stewart { 141050570efSLawrence Stewart 142050570efSLawrence Stewart /* 143050570efSLawrence Stewart * If we can't measure this one properly due to delayed acking adjust 144050570efSLawrence Stewart * byte counters and flag to measure next txsi. Note that since the 145050570efSLawrence Stewart * marked packet's transmitted bytes are measured we need to subtract the 146050570efSLawrence Stewart * transmitted bytes. Then pretend the next txsi was marked. 147050570efSLawrence Stewart */ 148050570efSLawrence Stewart if (mflag & (MULTI_ACK|OLD_TXSI)) { 149050570efSLawrence Stewart *pmeasurenext = txsi->tx_ts; 150050570efSLawrence Stewart *pmeasurenext_len = txsi->len; 151050570efSLawrence Stewart *prtt_bytes_adjust += *pmeasurenext_len; 152050570efSLawrence Stewart } else { 153050570efSLawrence Stewart if (mflag & FORCED_MEASUREMENT) { 154ee24d3b8SLawrence Stewart e_t->markedpkt_rtt = tcp_ts_getticks() - 155ee24d3b8SLawrence Stewart *pmeasurenext + 1; 156050570efSLawrence Stewart e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt + 157050570efSLawrence Stewart *pmeasurenext_len - *prtt_bytes_adjust; 158050570efSLawrence Stewart } else { 159ee24d3b8SLawrence Stewart e_t->markedpkt_rtt = tcp_ts_getticks() - 160ee24d3b8SLawrence Stewart txsi->tx_ts + 1; 161050570efSLawrence Stewart e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt - 162050570efSLawrence Stewart *prtt_bytes_adjust; 163050570efSLawrence Stewart } 164050570efSLawrence Stewart e_t->marked_snd_cwnd = tp->snd_cwnd; 165050570efSLawrence Stewart 166050570efSLawrence Stewart /* 167050570efSLawrence Stewart * Reset the ERTT_MEASUREMENT_IN_PROGRESS flag to indicate to 168050570efSLawrence Stewart * add_tx_segment_info that a new measurement should be started. 169050570efSLawrence Stewart */ 170050570efSLawrence Stewart e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; 171050570efSLawrence Stewart /* 172050570efSLawrence Stewart * Set ERTT_NEW_MEASUREMENT to tell the congestion control 173050570efSLawrence Stewart * algorithm that a new marked RTT measurement has has been made 174050570efSLawrence Stewart * and is available for use. 175050570efSLawrence Stewart */ 176050570efSLawrence Stewart e_t->flags |= ERTT_NEW_MEASUREMENT; 177050570efSLawrence Stewart 178050570efSLawrence Stewart if (tp->t_flags & TF_TSO) { 179050570efSLawrence Stewart /* Temporarily disable TSO to aid a new measurment. */ 180050570efSLawrence Stewart tp->t_flags &= ~TF_TSO; 181050570efSLawrence Stewart /* Keep track that we've disabled it. */ 182050570efSLawrence Stewart e_t->flags |= ERTT_TSO_DISABLED; 183050570efSLawrence Stewart } 184050570efSLawrence Stewart } 185050570efSLawrence Stewart } 186050570efSLawrence Stewart 187050570efSLawrence Stewart /* 188050570efSLawrence Stewart * Ertt_packet_measurements uses a small amount of state kept on each packet 189050570efSLawrence Stewart * sent to match incoming acknowledgements. This enables more accurate and 190050570efSLawrence Stewart * secure round trip time measurements. The resulting measurement is used for 191050570efSLawrence Stewart * congestion control algorithms which require a more accurate time. 192050570efSLawrence Stewart * Ertt_packet_measurements is called via the helper hook in tcp_input.c 193050570efSLawrence Stewart */ 194050570efSLawrence Stewart static int 195050570efSLawrence Stewart ertt_packet_measurement_hook(int hhook_type, int hhook_id, void *udata, 196050570efSLawrence Stewart void *ctx_data, void *hdata, struct osd *hosd) 197050570efSLawrence Stewart { 198050570efSLawrence Stewart struct ertt *e_t; 199050570efSLawrence Stewart struct tcpcb *tp; 200050570efSLawrence Stewart struct tcphdr *th; 201050570efSLawrence Stewart struct tcpopt *to; 202050570efSLawrence Stewart struct tcp_hhook_data *thdp; 203050570efSLawrence Stewart struct txseginfo *txsi; 204050570efSLawrence Stewart int acked, measurenext_len, multiack, new_sacked_bytes, rtt_bytes_adjust; 205050570efSLawrence Stewart uint32_t measurenext, rts; 206050570efSLawrence Stewart tcp_seq ack; 207050570efSLawrence Stewart 208050570efSLawrence Stewart KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 209050570efSLawrence Stewart KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 210050570efSLawrence Stewart 211050570efSLawrence Stewart e_t = (struct ertt *)hdata; 212050570efSLawrence Stewart thdp = ctx_data; 213050570efSLawrence Stewart tp = thdp->tp; 214050570efSLawrence Stewart th = thdp->th; 215050570efSLawrence Stewart to = thdp->to; 216050570efSLawrence Stewart new_sacked_bytes = (tp->sackhint.last_sack_ack != 0); 217050570efSLawrence Stewart measurenext = measurenext_len = multiack = rts = rtt_bytes_adjust = 0; 218050570efSLawrence Stewart acked = th->th_ack - tp->snd_una; 219050570efSLawrence Stewart 220050570efSLawrence Stewart INP_WLOCK_ASSERT(tp->t_inpcb); 221050570efSLawrence Stewart 222050570efSLawrence Stewart /* Packet has provided new acknowledgements. */ 223050570efSLawrence Stewart if (acked > 0 || new_sacked_bytes) { 224050570efSLawrence Stewart if (acked == 0 && new_sacked_bytes) { 225050570efSLawrence Stewart /* Use last sacked data. */ 226050570efSLawrence Stewart ack = tp->sackhint.last_sack_ack; 227050570efSLawrence Stewart } else 228050570efSLawrence Stewart ack = th->th_ack; 229050570efSLawrence Stewart 230050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q); 231050570efSLawrence Stewart while (txsi != NULL) { 232050570efSLawrence Stewart rts = 0; 233050570efSLawrence Stewart 234050570efSLawrence Stewart /* Acknowledgement is acking more than this txsi. */ 235050570efSLawrence Stewart if (SEQ_GT(ack, txsi->seq + txsi->len)) { 236050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START || 237050570efSLawrence Stewart measurenext) { 238050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp, 239050570efSLawrence Stewart &measurenext, &measurenext_len, 240050570efSLawrence Stewart &rtt_bytes_adjust, MULTI_ACK); 241050570efSLawrence Stewart } 242050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 243050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi); 244050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q); 245050570efSLawrence Stewart continue; 246050570efSLawrence Stewart } 247050570efSLawrence Stewart 248050570efSLawrence Stewart /* 249050570efSLawrence Stewart * Guess if delayed acks are being used by the receiver. 250050570efSLawrence Stewart * 251050570efSLawrence Stewart * XXXDH: A simple heuristic that could be improved 252050570efSLawrence Stewart */ 253050570efSLawrence Stewart if (!new_sacked_bytes) { 254050570efSLawrence Stewart if (acked > tp->t_maxseg) { 255050570efSLawrence Stewart e_t->dlyack_rx += 256050570efSLawrence Stewart (e_t->dlyack_rx < DLYACK_SMOOTH) ? 257050570efSLawrence Stewart 1 : 0; 258050570efSLawrence Stewart multiack = 1; 259050570efSLawrence Stewart } else if (acked > txsi->len) { 260050570efSLawrence Stewart multiack = 1; 261050570efSLawrence Stewart e_t->dlyack_rx += 262050570efSLawrence Stewart (e_t->dlyack_rx < DLYACK_SMOOTH) ? 263050570efSLawrence Stewart 1 : 0; 264050570efSLawrence Stewart } else if (acked == tp->t_maxseg || 265050570efSLawrence Stewart acked == txsi->len) { 266050570efSLawrence Stewart e_t->dlyack_rx -= 267050570efSLawrence Stewart (e_t->dlyack_rx > 0) ? 1 : 0; 268050570efSLawrence Stewart } 269050570efSLawrence Stewart /* Otherwise leave dlyack_rx the way it was. */ 270050570efSLawrence Stewart } 271050570efSLawrence Stewart 272050570efSLawrence Stewart /* 273050570efSLawrence Stewart * Time stamps are only to help match the txsi with the 274050570efSLawrence Stewart * received acknowledgements. 275050570efSLawrence Stewart */ 276050570efSLawrence Stewart if (e_t->timestamp_errors < MAX_TS_ERR && 277050570efSLawrence Stewart (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 278050570efSLawrence Stewart /* 279050570efSLawrence Stewart * Note: All packets sent with the offload will 280050570efSLawrence Stewart * have the same time stamp. If we are sending 281050570efSLawrence Stewart * on a fast interface and the t_maxseg is much 282050570efSLawrence Stewart * smaller than one tick, this will be fine. The 283050570efSLawrence Stewart * time stamp would be the same whether we were 284050570efSLawrence Stewart * using tso or not. However, if the interface 285050570efSLawrence Stewart * is slow, this will cause problems with the 286050570efSLawrence Stewart * calculations. If the interface is slow, there 287050570efSLawrence Stewart * is not reason to be using tso, and it should 288050570efSLawrence Stewart * be turned off. 289050570efSLawrence Stewart */ 290050570efSLawrence Stewart /* 291050570efSLawrence Stewart * If there are too many time stamp errors, time 292050570efSLawrence Stewart * stamps won't be trusted 293050570efSLawrence Stewart */ 294050570efSLawrence Stewart rts = to->to_tsecr; 295050570efSLawrence Stewart /* Before this packet. */ 296050570efSLawrence Stewart if (!e_t->dlyack_rx && TSTMP_LT(rts, txsi->tx_ts)) 297050570efSLawrence Stewart /* When delayed acking is used, the 298050570efSLawrence Stewart * reflected time stamp is of the first 299050570efSLawrence Stewart * packet and thus may be before 300050570efSLawrence Stewart * txsi->tx_ts. 301050570efSLawrence Stewart */ 302050570efSLawrence Stewart break; 303050570efSLawrence Stewart if (TSTMP_GT(rts, txsi->tx_ts)) { 304050570efSLawrence Stewart /* 305050570efSLawrence Stewart * If reflected time stamp is later than 306050570efSLawrence Stewart * tx_tsi, then this txsi is old. 307050570efSLawrence Stewart */ 308050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START 309050570efSLawrence Stewart || measurenext) { 310050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp, 311050570efSLawrence Stewart &measurenext, &measurenext_len, 312050570efSLawrence Stewart &rtt_bytes_adjust, OLD_TXSI); 313050570efSLawrence Stewart } 314050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi, 315050570efSLawrence Stewart txsegi_lnk); 316050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi); 317050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q); 318050570efSLawrence Stewart continue; 319050570efSLawrence Stewart } 320050570efSLawrence Stewart if (rts == txsi->tx_ts && 321050570efSLawrence Stewart TSTMP_LT(to->to_tsval, txsi->rx_ts)) { 322050570efSLawrence Stewart /* 323050570efSLawrence Stewart * Segment received before sent! 324050570efSLawrence Stewart * Something is wrong with the received 325050570efSLawrence Stewart * timestamps so increment errors. If 326050570efSLawrence Stewart * this keeps up we will ignore 327050570efSLawrence Stewart * timestamps. 328050570efSLawrence Stewart */ 329050570efSLawrence Stewart e_t->timestamp_errors++; 330050570efSLawrence Stewart } 331050570efSLawrence Stewart } 332050570efSLawrence Stewart /* 333050570efSLawrence Stewart * Acknowledging a sequence number before this txsi. 334050570efSLawrence Stewart * If it is an old txsi that may have had the same seq 335050570efSLawrence Stewart * numbers, it should have been removed if time stamps 336050570efSLawrence Stewart * are being used. 337050570efSLawrence Stewart */ 338050570efSLawrence Stewart if (SEQ_LEQ(ack, txsi->seq)) 339050570efSLawrence Stewart break; /* Before first packet in txsi. */ 340050570efSLawrence Stewart 341050570efSLawrence Stewart /* 342050570efSLawrence Stewart * Only ack > txsi->seq and ack <= txsi->seq+txsi->len 343050570efSLawrence Stewart * past this point. 344050570efSLawrence Stewart * 345050570efSLawrence Stewart * If delayed acks are being used, an acknowledgement 346050570efSLawrence Stewart * for a single segment will have been delayed by the 347050570efSLawrence Stewart * receiver and will yield an inaccurate measurement. In 348050570efSLawrence Stewart * this case, we only make the measurement if more than 349050570efSLawrence Stewart * one segment is being acknowledged or sack is 350050570efSLawrence Stewart * currently being used. 351050570efSLawrence Stewart */ 352050570efSLawrence Stewart if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { 353050570efSLawrence Stewart /* Make an accurate new measurement. */ 354ee24d3b8SLawrence Stewart e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1; 355050570efSLawrence Stewart 356050570efSLawrence Stewart if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) 357050570efSLawrence Stewart e_t->minrtt = e_t->rtt; 358050570efSLawrence Stewart 359050570efSLawrence Stewart if (e_t->rtt > e_t->maxrtt || e_t->maxrtt == 0) 360050570efSLawrence Stewart e_t->maxrtt = e_t->rtt; 361050570efSLawrence Stewart } 362050570efSLawrence Stewart 363050570efSLawrence Stewart if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) 364050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp, 365050570efSLawrence Stewart &measurenext, &measurenext_len, 366050570efSLawrence Stewart &rtt_bytes_adjust, CORRECT_ACK); 367050570efSLawrence Stewart 368050570efSLawrence Stewart if (txsi->flags & TXSI_TSO) { 3693ac12506SJonathan T. Looney if (txsi->len > acked) { 370050570efSLawrence Stewart txsi->len -= acked; 371050570efSLawrence Stewart /* 372050570efSLawrence Stewart * This presumes ack for first bytes in 373050570efSLawrence Stewart * txsi, this may not be true but it 374050570efSLawrence Stewart * shouldn't cause problems for the 375050570efSLawrence Stewart * timing. 376050570efSLawrence Stewart * 377050570efSLawrence Stewart * We remeasure RTT even though we only 378050570efSLawrence Stewart * have a single txsi. The rationale 379050570efSLawrence Stewart * behind this is that it is better to 380050570efSLawrence Stewart * have a slightly inaccurate 381050570efSLawrence Stewart * measurement than no additional 382050570efSLawrence Stewart * measurement for the rest of the bulk 383050570efSLawrence Stewart * transfer. Since TSO is only used on 384050570efSLawrence Stewart * high speed interface cards, so the 385050570efSLawrence Stewart * packets should be transmitted at line 386050570efSLawrence Stewart * rate back to back with little 387050570efSLawrence Stewart * difference in transmission times (in 388050570efSLawrence Stewart * ticks). 389050570efSLawrence Stewart */ 390050570efSLawrence Stewart txsi->seq += acked; 391050570efSLawrence Stewart /* 392050570efSLawrence Stewart * Reset txsi measure flag so we don't 393050570efSLawrence Stewart * use it for another RTT measurement. 394050570efSLawrence Stewart */ 395050570efSLawrence Stewart txsi->flags &= ~TXSI_RTT_MEASURE_START; 396050570efSLawrence Stewart /* 397050570efSLawrence Stewart * There is still more data to be acked 398050570efSLawrence Stewart * from tso bulk transmission, so we 399050570efSLawrence Stewart * won't remove it from the TAILQ yet. 400050570efSLawrence Stewart */ 401050570efSLawrence Stewart break; 402050570efSLawrence Stewart } 4033ac12506SJonathan T. Looney txsi->len = 0; 404050570efSLawrence Stewart } 405050570efSLawrence Stewart 406050570efSLawrence Stewart TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); 407050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi); 408050570efSLawrence Stewart break; 409050570efSLawrence Stewart } 410050570efSLawrence Stewart 411050570efSLawrence Stewart if (measurenext) { 412050570efSLawrence Stewart /* 413050570efSLawrence Stewart * We need to do a RTT measurement. It won't be the best 414050570efSLawrence Stewart * if we do it here. 415050570efSLawrence Stewart */ 416050570efSLawrence Stewart marked_packet_rtt(txsi, e_t, tp, 417050570efSLawrence Stewart &measurenext, &measurenext_len, 418050570efSLawrence Stewart &rtt_bytes_adjust, FORCED_MEASUREMENT); 419050570efSLawrence Stewart } 420050570efSLawrence Stewart } 421050570efSLawrence Stewart 422050570efSLawrence Stewart return (0); 423050570efSLawrence Stewart } 424050570efSLawrence Stewart 425050570efSLawrence Stewart /* 426050570efSLawrence Stewart * Add information about a transmitted segment to a list. 427050570efSLawrence Stewart * This is called via the helper hook in tcp_output.c 428050570efSLawrence Stewart */ 429050570efSLawrence Stewart static int 430050570efSLawrence Stewart ertt_add_tx_segment_info_hook(int hhook_type, int hhook_id, void *udata, 431050570efSLawrence Stewart void *ctx_data, void *hdata, struct osd *hosd) 432050570efSLawrence Stewart { 433050570efSLawrence Stewart struct ertt *e_t; 434050570efSLawrence Stewart struct tcpcb *tp; 435050570efSLawrence Stewart struct tcphdr *th; 436050570efSLawrence Stewart struct tcpopt *to; 437050570efSLawrence Stewart struct tcp_hhook_data *thdp; 438050570efSLawrence Stewart struct txseginfo *txsi; 4393ac12506SJonathan T. Looney uint32_t len; 440050570efSLawrence Stewart int tso; 441050570efSLawrence Stewart 442050570efSLawrence Stewart KASSERT(ctx_data != NULL, ("%s: ctx_data is NULL!", __func__)); 443050570efSLawrence Stewart KASSERT(hdata != NULL, ("%s: hdata is NULL!", __func__)); 444050570efSLawrence Stewart 445050570efSLawrence Stewart e_t = (struct ertt *)hdata; 446050570efSLawrence Stewart thdp = ctx_data; 447050570efSLawrence Stewart tp = thdp->tp; 448050570efSLawrence Stewart th = thdp->th; 449050570efSLawrence Stewart to = thdp->to; 450050570efSLawrence Stewart len = thdp->len; 451050570efSLawrence Stewart tso = thdp->tso; 452050570efSLawrence Stewart 453050570efSLawrence Stewart INP_WLOCK_ASSERT(tp->t_inpcb); 454050570efSLawrence Stewart 455050570efSLawrence Stewart if (len > 0) { 456050570efSLawrence Stewart txsi = uma_zalloc(txseginfo_zone, M_NOWAIT); 457050570efSLawrence Stewart if (txsi != NULL) { 458050570efSLawrence Stewart /* Construct txsi setting the necessary flags. */ 459050570efSLawrence Stewart txsi->flags = 0; /* Needs to be initialised. */ 460050570efSLawrence Stewart txsi->seq = ntohl(th->th_seq); 461050570efSLawrence Stewart txsi->len = len; 462050570efSLawrence Stewart if (tso) 463050570efSLawrence Stewart txsi->flags |= TXSI_TSO; 464050570efSLawrence Stewart else if (e_t->flags & ERTT_TSO_DISABLED) { 465050570efSLawrence Stewart tp->t_flags |= TF_TSO; 466050570efSLawrence Stewart e_t->flags &= ~ERTT_TSO_DISABLED; 467050570efSLawrence Stewart } 468050570efSLawrence Stewart 469050570efSLawrence Stewart if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) { 470050570efSLawrence Stewart e_t->bytes_tx_in_rtt += len; 471050570efSLawrence Stewart } else { 472050570efSLawrence Stewart txsi->flags |= TXSI_RTT_MEASURE_START; 473050570efSLawrence Stewart e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; 474050570efSLawrence Stewart e_t->bytes_tx_in_rtt = len; 475050570efSLawrence Stewart } 476050570efSLawrence Stewart 477050570efSLawrence Stewart if (((tp->t_flags & TF_NOOPT) == 0) && 478050570efSLawrence Stewart (to->to_flags & TOF_TS)) { 479050570efSLawrence Stewart txsi->tx_ts = ntohl(to->to_tsval) - 480050570efSLawrence Stewart tp->ts_offset; 481050570efSLawrence Stewart txsi->rx_ts = ntohl(to->to_tsecr); 482050570efSLawrence Stewart } else { 483ee24d3b8SLawrence Stewart txsi->tx_ts = tcp_ts_getticks(); 484050570efSLawrence Stewart txsi->rx_ts = 0; /* No received time stamp. */ 485050570efSLawrence Stewart } 486050570efSLawrence Stewart TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); 487050570efSLawrence Stewart } 488050570efSLawrence Stewart } 489050570efSLawrence Stewart 490050570efSLawrence Stewart return (0); 491050570efSLawrence Stewart } 492050570efSLawrence Stewart 493050570efSLawrence Stewart static int 494050570efSLawrence Stewart ertt_mod_init(void) 495050570efSLawrence Stewart { 496050570efSLawrence Stewart 497050570efSLawrence Stewart txseginfo_zone = uma_zcreate("ertt_txseginfo", sizeof(struct txseginfo), 498050570efSLawrence Stewart NULL, NULL, NULL, NULL, 0, 0); 499050570efSLawrence Stewart 500050570efSLawrence Stewart return (0); 501050570efSLawrence Stewart } 502050570efSLawrence Stewart 503050570efSLawrence Stewart static int 504050570efSLawrence Stewart ertt_mod_destroy(void) 505050570efSLawrence Stewart { 506050570efSLawrence Stewart 507050570efSLawrence Stewart uma_zdestroy(txseginfo_zone); 508050570efSLawrence Stewart 509050570efSLawrence Stewart return (0); 510050570efSLawrence Stewart } 511050570efSLawrence Stewart 512050570efSLawrence Stewart static int 513050570efSLawrence Stewart ertt_uma_ctor(void *mem, int size, void *arg, int flags) 514050570efSLawrence Stewart { 515050570efSLawrence Stewart struct ertt *e_t; 516050570efSLawrence Stewart 517050570efSLawrence Stewart e_t = mem; 518050570efSLawrence Stewart 519050570efSLawrence Stewart TAILQ_INIT(&e_t->txsegi_q); 520050570efSLawrence Stewart e_t->timestamp_errors = 0; 521050570efSLawrence Stewart e_t->minrtt = 0; 522050570efSLawrence Stewart e_t->maxrtt = 0; 523050570efSLawrence Stewart e_t->rtt = 0; 524050570efSLawrence Stewart e_t->flags = 0; 525050570efSLawrence Stewart e_t->dlyack_rx = 0; 526050570efSLawrence Stewart e_t->bytes_tx_in_rtt = 0; 527050570efSLawrence Stewart e_t->markedpkt_rtt = 0; 528050570efSLawrence Stewart 529050570efSLawrence Stewart return (0); 530050570efSLawrence Stewart } 531050570efSLawrence Stewart 532050570efSLawrence Stewart static void 533050570efSLawrence Stewart ertt_uma_dtor(void *mem, int size, void *arg) 534050570efSLawrence Stewart { 535050570efSLawrence Stewart struct ertt *e_t; 536050570efSLawrence Stewart struct txseginfo *n_txsi, *txsi; 537050570efSLawrence Stewart 538050570efSLawrence Stewart e_t = mem; 539050570efSLawrence Stewart txsi = TAILQ_FIRST(&e_t->txsegi_q); 540050570efSLawrence Stewart while (txsi != NULL) { 541050570efSLawrence Stewart n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); 542050570efSLawrence Stewart uma_zfree(txseginfo_zone, txsi); 543050570efSLawrence Stewart txsi = n_txsi; 544050570efSLawrence Stewart } 545050570efSLawrence Stewart } 546050570efSLawrence Stewart 547050570efSLawrence Stewart KHELP_DECLARE_MOD_UMA(ertt, &ertt_helper, ertt_hooks, 1, sizeof(struct ertt), 548050570efSLawrence Stewart ertt_uma_ctor, ertt_uma_dtor); 549