1 /* $OpenBSD: tcp_output.c,v 1.144 2024/04/17 20:48:51 bluhm Exp $ */ 2 /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include "pf.h"
#include "stoeplitz.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kernel.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#if NPF > 0
#include <net/pfvar.h>
#endif

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
/*
 * TCPOUTFLAGS makes <netinet/tcp_fsm.h> define the tcp_outflags[] table
 * (per-state TH_* flag defaults) that tcp_output() indexes by tp->t_state.
 */
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>

#ifdef notyet
extern struct mbuf *m_copypack();
#endif

/* Duplicate-ACK threshold that triggers fast retransmit; set in tcp_input.c. */
extern int tcprexmtthresh;

#ifdef TCP_SACK_DEBUG
void tcp_print_holes(struct tcpcb *tp);

/*
 * Debug aid: dump the connection's SACK scoreboard (tp->snd_holes) to the
 * console.  For each hole, print its sequence range, the duplicate-ACK
 * count, and the retransmission pointer.  Prints nothing for an empty list.
 */
void
tcp_print_holes(struct tcpcb *tp)
{
	struct sackhole *p = tp->snd_holes;
	if (p == NULL)
		return;
	printf("Hole report: start--end dups rxmit\n");
	while (p) {
		printf("%x--%x d %d r %x\n", p->start, p->end, p->dups,
		    p->rxmit);
		p = p->next;
	}
	printf("\n");
}
#endif /* TCP_SACK_DEBUG */

/*
 * Returns pointer to a sackhole if there are any pending retransmissions;
 * NULL otherwise.
130 */ 131 struct sackhole * 132 tcp_sack_output(struct tcpcb *tp) 133 { 134 struct sackhole *p; 135 136 if (!tp->sack_enable) 137 return (NULL); 138 p = tp->snd_holes; 139 while (p) { 140 if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) { 141 if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ 142 p = p->next; 143 continue; 144 } 145 #ifdef TCP_SACK_DEBUG 146 if (p) 147 tcp_print_holes(tp); 148 #endif 149 return (p); 150 } 151 p = p->next; 152 } 153 return (NULL); 154 } 155 156 /* 157 * After a timeout, the SACK list may be rebuilt. This SACK information 158 * should be used to avoid retransmitting SACKed data. This function 159 * traverses the SACK list to see if snd_nxt should be moved forward. 160 */ 161 162 void 163 tcp_sack_adjust(struct tcpcb *tp) 164 { 165 struct sackhole *cur = tp->snd_holes; 166 if (cur == NULL) 167 return; /* No holes */ 168 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) 169 return; /* We're already beyond any SACKed blocks */ 170 /* 171 * Two cases for which we want to advance snd_nxt: 172 * i) snd_nxt lies between end of one hole and beginning of another 173 * ii) snd_nxt lies between end of last hole and rcv_lastsack 174 */ 175 while (cur->next) { 176 if (SEQ_LT(tp->snd_nxt, cur->end)) 177 return; 178 if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) 179 cur = cur->next; 180 else { 181 tp->snd_nxt = cur->next->start; 182 return; 183 } 184 } 185 if (SEQ_LT(tp->snd_nxt, cur->end)) 186 return; 187 tp->snd_nxt = tp->rcv_lastsack; 188 return; 189 } 190 191 /* 192 * Tcp output routine: figure out what should be sent and send it. 
193 */ 194 int 195 tcp_output(struct tcpcb *tp) 196 { 197 struct socket *so = tp->t_inpcb->inp_socket; 198 long len, win, txmaxseg; 199 int off, flags, error; 200 struct mbuf *m; 201 struct tcphdr *th; 202 u_int32_t optbuf[howmany(MAX_TCPOPTLEN, sizeof(u_int32_t))]; 203 u_char *opt = (u_char *)optbuf; 204 unsigned int optlen, hdrlen, packetlen; 205 int idle, sendalot = 0; 206 int i, sack_rxmit = 0; 207 struct sackhole *p; 208 uint64_t now; 209 #ifdef TCP_SIGNATURE 210 unsigned int sigoff; 211 #endif /* TCP_SIGNATURE */ 212 #ifdef TCP_ECN 213 int needect; 214 #endif 215 int tso; 216 217 if (tp->t_flags & TF_BLOCKOUTPUT) { 218 tp->t_flags |= TF_NEEDOUTPUT; 219 return (0); 220 } else 221 tp->t_flags &= ~TF_NEEDOUTPUT; 222 223 #if defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) 224 if (tp->sack_enable && (tp->t_flags & TF_SIGNATURE)) 225 return (EINVAL); 226 #endif /* defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */ 227 228 now = tcp_now(); 229 230 /* 231 * Determine length of data that should be transmitted, 232 * and flags that will be used. 233 * If there is some data or critical controls (SYN, RST) 234 * to send, then transmit; otherwise, investigate further. 235 */ 236 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 237 if (idle && (now - tp->t_rcvtime) >= tp->t_rxtcur) 238 /* 239 * We have been idle for "a while" and no acks are 240 * expected to clock out any data we send -- 241 * slow start to get ack "clock" running again. 242 */ 243 tp->snd_cwnd = 2 * tp->t_maxseg; 244 245 /* remember 'idle' for next invocation of tcp_output */ 246 if (idle && soissending(so)) { 247 tp->t_flags |= TF_LASTIDLE; 248 idle = 0; 249 } else 250 tp->t_flags &= ~TF_LASTIDLE; 251 252 again: 253 /* 254 * If we've recently taken a timeout, snd_max will be greater than 255 * snd_nxt. There may be SACK information that allows us to avoid 256 * resending already delivered data. Adjust snd_nxt accordingly. 
257 */ 258 if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) 259 tcp_sack_adjust(tp); 260 off = tp->snd_nxt - tp->snd_una; 261 win = ulmin(tp->snd_wnd, tp->snd_cwnd); 262 263 flags = tcp_outflags[tp->t_state]; 264 265 /* 266 * Send any SACK-generated retransmissions. If we're explicitly trying 267 * to send out new data (when sendalot is 1), bypass this function. 268 * If we retransmit in fast recovery mode, decrement snd_cwnd, since 269 * we're replacing a (future) new transmission with a retransmission 270 * now, and we previously incremented snd_cwnd in tcp_input(). 271 */ 272 if (tp->sack_enable && !sendalot) { 273 if (tp->t_dupacks >= tcprexmtthresh && 274 (p = tcp_sack_output(tp))) { 275 off = p->rxmit - tp->snd_una; 276 sack_rxmit = 1; 277 /* Coalesce holes into a single retransmission */ 278 len = min(tp->t_maxseg, p->end - p->rxmit); 279 if (SEQ_LT(tp->snd_una, tp->snd_last)) 280 tp->snd_cwnd -= tp->t_maxseg; 281 } 282 } 283 284 sendalot = 0; 285 tso = 0; 286 /* 287 * If in persist timeout with window of 0, send 1 byte. 288 * Otherwise, if window is small but nonzero 289 * and timer expired, we will send what we can 290 * and go to transmit state. 291 */ 292 if (tp->t_force) { 293 if (win == 0) { 294 /* 295 * If we still have some data to send, then 296 * clear the FIN bit. Usually this would 297 * happen below when it realizes that we 298 * aren't sending all the data. However, 299 * if we have exactly 1 byte of unset data, 300 * then it won't clear the FIN bit below, 301 * and if we are in persist state, we wind 302 * up sending the packet without recording 303 * that we sent the FIN bit. 304 * 305 * We can't just blindly clear the FIN bit, 306 * because if we don't have any more data 307 * to send then the probe will be the FIN 308 * itself. 
309 */ 310 if (off < so->so_snd.sb_cc) 311 flags &= ~TH_FIN; 312 win = 1; 313 } else { 314 TCP_TIMER_DISARM(tp, TCPT_PERSIST); 315 tp->t_rxtshift = 0; 316 } 317 } 318 319 if (!sack_rxmit) { 320 len = ulmin(so->so_snd.sb_cc, win) - off; 321 } 322 323 if (len < 0) { 324 /* 325 * If FIN has been sent but not acked, 326 * but we haven't been called to retransmit, 327 * len will be -1. Otherwise, window shrank 328 * after we sent into it. If window shrank to 0, 329 * cancel pending retransmit, pull snd_nxt back 330 * to (closed) window, and set the persist timer 331 * if it isn't already going. If the window didn't 332 * close completely, just wait for an ACK. 333 */ 334 len = 0; 335 if (win == 0) { 336 TCP_TIMER_DISARM(tp, TCPT_REXMT); 337 tp->t_rxtshift = 0; 338 tp->snd_nxt = tp->snd_una; 339 if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) 340 tcp_setpersist(tp); 341 } 342 } 343 344 /* 345 * Never send more than half a buffer full. This insures that we can 346 * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and 347 * therefore acks will never be delayed unless we run out of data to 348 * transmit. 349 */ 350 txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg); 351 352 if (len > txmaxseg) { 353 if (tcp_do_tso && 354 tp->t_inpcb->inp_options == NULL && 355 tp->t_inpcb->inp_outputopts6 == NULL && 356 #ifdef TCP_SIGNATURE 357 ((tp->t_flags & TF_SIGNATURE) == 0) && 358 #endif 359 len >= 2 * tp->t_maxseg && 360 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 361 !(flags & (TH_SYN|TH_RST|TH_FIN))) { 362 tso = 1; 363 /* avoid small chopped packets */ 364 if (len > (len / tp->t_maxseg) * tp->t_maxseg) { 365 len = (len / tp->t_maxseg) * tp->t_maxseg; 366 sendalot = 1; 367 } 368 } else { 369 len = txmaxseg; 370 sendalot = 1; 371 } 372 } 373 if (off + len < so->so_snd.sb_cc) 374 flags &= ~TH_FIN; 375 376 win = sbspace(so, &so->so_rcv); 377 378 /* 379 * Sender silly window avoidance. 
If connection is idle 380 * and can send all data, a maximum segment, 381 * at least a maximum default-size segment do it, 382 * or are forced, do it; otherwise don't bother. 383 * If peer's buffer is tiny, then send 384 * when window is at least half open. 385 * If retransmitting (possibly after persist timer forced us 386 * to send into a small window), then must resend. 387 */ 388 if (len) { 389 if (len >= txmaxseg) 390 goto send; 391 if ((idle || (tp->t_flags & TF_NODELAY)) && 392 len + off >= so->so_snd.sb_cc && !soissending(so) && 393 (tp->t_flags & TF_NOPUSH) == 0) 394 goto send; 395 if (tp->t_force) 396 goto send; 397 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) 398 goto send; 399 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 400 goto send; 401 if (sack_rxmit) 402 goto send; 403 } 404 405 /* 406 * Compare available window to amount of window 407 * known to peer (as advertised window less 408 * next expected input). If the difference is at least two 409 * max size segments, or at least 50% of the maximum possible 410 * window, then want to send a window update to peer. 411 */ 412 if (win > 0) { 413 /* 414 * "adv" is the amount we can increase the window, 415 * taking into account that we are limited by 416 * TCP_MAXWIN << tp->rcv_scale. 417 */ 418 long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) - 419 (tp->rcv_adv - tp->rcv_nxt); 420 421 if (adv >= (long) (2 * tp->t_maxseg)) 422 goto send; 423 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 424 goto send; 425 } 426 427 /* 428 * Send if we owe peer an ACK. 429 */ 430 if (tp->t_flags & TF_ACKNOW) 431 goto send; 432 if (flags & (TH_SYN|TH_RST)) 433 goto send; 434 if (SEQ_GT(tp->snd_up, tp->snd_una)) 435 goto send; 436 /* 437 * If our state indicates that FIN should be sent 438 * and we have not yet done so, or we're retransmitting the FIN, 439 * then we need to send. 
440 */ 441 if (flags & TH_FIN && 442 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 443 goto send; 444 /* 445 * In SACK, it is possible for tcp_output to fail to send a segment 446 * after the retransmission timer has been turned off. Make sure 447 * that the retransmission timer is set. 448 */ 449 if (SEQ_GT(tp->snd_max, tp->snd_una) && 450 TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 451 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 452 TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur); 453 return (0); 454 } 455 456 /* 457 * TCP window updates are not reliable, rather a polling protocol 458 * using ``persist'' packets is used to insure receipt of window 459 * updates. The three ``states'' for the output side are: 460 * idle not doing retransmits or persists 461 * persisting to move a small or zero window 462 * (re)transmitting and thereby not persisting 463 * 464 * tp->t_timer[TCPT_PERSIST] 465 * is set when we are in persist state. 466 * tp->t_force 467 * is set when we are called to send a persist packet. 468 * tp->t_timer[TCPT_REXMT] 469 * is set when we are retransmitting 470 * The output side is idle when both timers are zero. 471 * 472 * If send window is too small, there is data to transmit, and no 473 * retransmit or persist is pending, then go to persist state. 474 * If nothing happens soon, send when timer expires: 475 * if window is nonzero, transmit what we can, 476 * otherwise force out a byte. 477 */ 478 if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 && 479 TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) { 480 tp->t_rxtshift = 0; 481 tcp_setpersist(tp); 482 } 483 484 /* 485 * No reason to send a segment, just return. 486 */ 487 return (0); 488 489 send: 490 /* 491 * Before ESTABLISHED, force sending of initial options 492 * unless TCP set not to do any options. 493 * NOTE: we assume that the IP/TCP header plus TCP options 494 * always fit in a single mbuf, leaving room for a maximum 495 * link header, i.e. 
496 * max_linkhdr + sizeof(network header) + sizeof(struct tcphdr + 497 * optlen <= MHLEN 498 */ 499 optlen = 0; 500 501 switch (tp->pf) { 502 case 0: /*default to PF_INET*/ 503 case PF_INET: 504 hdrlen = sizeof(struct ip) + sizeof(struct tcphdr); 505 break; 506 #ifdef INET6 507 case PF_INET6: 508 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 509 break; 510 #endif /* INET6 */ 511 default: 512 return (EPFNOSUPPORT); 513 } 514 515 if (flags & TH_SYN) { 516 tp->snd_nxt = tp->iss; 517 if ((tp->t_flags & TF_NOOPT) == 0) { 518 u_int16_t mss; 519 520 opt[0] = TCPOPT_MAXSEG; 521 opt[1] = 4; 522 mss = htons((u_int16_t) tcp_mss(tp, 0)); 523 memcpy(opt + 2, &mss, sizeof(mss)); 524 optlen = 4; 525 526 if (flags & TH_ACK) 527 tcp_mss_update(tp); 528 /* 529 * If this is the first SYN of connection (not a SYN 530 * ACK), include SACK_PERMIT_HDR option. If this is a 531 * SYN ACK, include SACK_PERMIT_HDR option if peer has 532 * already done so. 533 */ 534 if (tp->sack_enable && ((flags & TH_ACK) == 0 || 535 (tp->t_flags & TF_SACK_PERMIT))) { 536 *((u_int32_t *) (opt + optlen)) = 537 htonl(TCPOPT_SACK_PERMIT_HDR); 538 optlen += 4; 539 } 540 if ((tp->t_flags & TF_REQ_SCALE) && 541 ((flags & TH_ACK) == 0 || 542 (tp->t_flags & TF_RCVD_SCALE))) { 543 *((u_int32_t *) (opt + optlen)) = htonl( 544 TCPOPT_NOP << 24 | 545 TCPOPT_WINDOW << 16 | 546 TCPOLEN_WINDOW << 8 | 547 tp->request_r_scale); 548 optlen += 4; 549 } 550 } 551 } 552 553 /* 554 * Send a timestamp and echo-reply if this is a SYN and our side 555 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side 556 * and our peer have sent timestamps in our SYN's. 557 */ 558 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 559 (flags & TH_RST) == 0 && 560 ((flags & (TH_SYN|TH_ACK)) == TH_SYN || 561 (tp->t_flags & TF_RCVD_TSTMP))) { 562 u_int32_t *lp = (u_int32_t *)(opt + optlen); 563 564 /* Form timestamp option as shown in appendix A of RFC 1323. 
*/ 565 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 566 *lp++ = htonl(now + tp->ts_modulate); 567 *lp = htonl(tp->ts_recent); 568 optlen += TCPOLEN_TSTAMP_APPA; 569 } 570 /* Set receive buffer autosizing timestamp. */ 571 if (tp->rfbuf_ts == 0) { 572 tp->rfbuf_ts = now; 573 tp->rfbuf_cnt = 0; 574 } 575 576 #ifdef TCP_SIGNATURE 577 if (tp->t_flags & TF_SIGNATURE) { 578 u_int8_t *bp = (u_int8_t *)(opt + optlen); 579 580 /* Send signature option */ 581 *(bp++) = TCPOPT_SIGNATURE; 582 *(bp++) = TCPOLEN_SIGNATURE; 583 sigoff = optlen + 2; 584 585 { 586 unsigned int i; 587 588 for (i = 0; i < 16; i++) 589 *(bp++) = 0; 590 } 591 592 593 /* Pad options list to the next 32 bit boundary and 594 * terminate it. 595 */ 596 *bp++ = TCPOPT_NOP; 597 *bp++ = TCPOPT_NOP; 598 599 optlen += TCPOLEN_SIGLEN; 600 } 601 #endif /* TCP_SIGNATURE */ 602 603 /* 604 * Send SACKs if necessary. This should be the last option processed. 605 * Only as many SACKs are sent as are permitted by the maximum options 606 * size. No more than three SACKs are sent. 
607 */ 608 if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED && 609 (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && 610 tp->rcv_numsacks) { 611 u_int32_t *lp = (u_int32_t *)(opt + optlen); 612 u_int32_t *olp = lp++; 613 int count = 0; /* actual number of SACKs inserted */ 614 int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; 615 616 tcpstat_inc(tcps_sack_snd_opts); 617 maxsack = min(maxsack, TCP_MAX_SACK); 618 for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { 619 struct sackblk sack = tp->sackblks[i]; 620 if (sack.start == 0 && sack.end == 0) 621 continue; 622 *lp++ = htonl(sack.start); 623 *lp++ = htonl(sack.end); 624 count++; 625 } 626 *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); 627 optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ 628 } 629 630 #ifdef DIAGNOSTIC 631 if (optlen > MAX_TCPOPTLEN) 632 panic("tcp_output: options too long"); 633 #endif /* DIAGNOSTIC */ 634 635 hdrlen += optlen; 636 637 /* 638 * Adjust data length if insertion of options will 639 * bump the packet length beyond the t_maxopd length. 640 * Clear the FIN bit because we cut off the tail of 641 * the segment. 642 */ 643 if (len > tp->t_maxopd - optlen) { 644 if (tso) { 645 if (len + hdrlen + max_linkhdr > MAXMCLBYTES) { 646 len = MAXMCLBYTES - hdrlen - max_linkhdr; 647 sendalot = 1; 648 } 649 } else { 650 len = tp->t_maxopd - optlen; 651 sendalot = 1; 652 } 653 flags &= ~TH_FIN; 654 } 655 656 #ifdef DIAGNOSTIC 657 if (max_linkhdr + hdrlen > MCLBYTES) 658 panic("tcphdr too big"); 659 #endif 660 661 /* 662 * Grab a header mbuf, attaching a copy of data to 663 * be transmitted, and initialize the header from 664 * the template for sends on this connection. 
665 */ 666 if (len) { 667 if (tp->t_force && len == 1) 668 tcpstat_inc(tcps_sndprobe); 669 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 670 tcpstat_pkt(tcps_sndrexmitpack, tcps_sndrexmitbyte, 671 len); 672 tp->t_sndrexmitpack++; 673 } else { 674 tcpstat_pkt(tcps_sndpack, tcps_sndbyte, len); 675 } 676 #ifdef notyet 677 if ((m = m_copypack(so->so_snd.sb_mb, off, 678 (int)len, max_linkhdr + hdrlen)) == 0) { 679 error = ENOBUFS; 680 goto out; 681 } 682 /* 683 * m_copypack left space for our hdr; use it. 684 */ 685 m->m_len += hdrlen; 686 m->m_data -= hdrlen; 687 #else 688 MGETHDR(m, M_DONTWAIT, MT_HEADER); 689 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 690 MCLGET(m, M_DONTWAIT); 691 if ((m->m_flags & M_EXT) == 0) { 692 m_freem(m); 693 m = NULL; 694 } 695 } 696 if (m == NULL) { 697 error = ENOBUFS; 698 goto out; 699 } 700 m->m_data += max_linkhdr; 701 m->m_len = hdrlen; 702 if (len <= m_trailingspace(m)) { 703 m_copydata(so->so_snd.sb_mb, off, (int) len, 704 mtod(m, caddr_t) + hdrlen); 705 m->m_len += len; 706 } else { 707 m->m_next = m_copym(so->so_snd.sb_mb, off, (int) len, 708 M_NOWAIT); 709 if (m->m_next == 0) { 710 (void) m_free(m); 711 error = ENOBUFS; 712 goto out; 713 } 714 } 715 if (so->so_snd.sb_mb->m_flags & M_PKTHDR) 716 m->m_pkthdr.ph_loopcnt = 717 so->so_snd.sb_mb->m_pkthdr.ph_loopcnt; 718 #endif 719 /* 720 * If we're sending everything we've got, set PUSH. 721 * (This will keep happy those implementations which only 722 * give data to the user when a buffer fills or 723 * a PUSH comes in.) 
724 */ 725 if (off + len == so->so_snd.sb_cc && !soissending(so)) 726 flags |= TH_PUSH; 727 tp->t_sndtime = now; 728 } else { 729 if (tp->t_flags & TF_ACKNOW) 730 tcpstat_inc(tcps_sndacks); 731 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 732 tcpstat_inc(tcps_sndctrl); 733 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 734 tcpstat_inc(tcps_sndurg); 735 else 736 tcpstat_inc(tcps_sndwinup); 737 738 MGETHDR(m, M_DONTWAIT, MT_HEADER); 739 if (m != NULL && max_linkhdr + hdrlen > MHLEN) { 740 MCLGET(m, M_DONTWAIT); 741 if ((m->m_flags & M_EXT) == 0) { 742 m_freem(m); 743 m = NULL; 744 } 745 } 746 if (m == NULL) { 747 error = ENOBUFS; 748 goto out; 749 } 750 m->m_data += max_linkhdr; 751 m->m_len = hdrlen; 752 } 753 m->m_pkthdr.ph_ifidx = 0; 754 m->m_pkthdr.len = hdrlen + len; 755 756 /* Enable TSO and specify the size of the resulting segments. */ 757 if (tso) { 758 SET(m->m_pkthdr.csum_flags, M_TCP_TSO); 759 m->m_pkthdr.ph_mss = tp->t_maxseg; 760 } 761 762 if (!tp->t_template) 763 panic("tcp_output"); 764 #ifdef DIAGNOSTIC 765 if (tp->t_template->m_len != hdrlen - optlen) 766 panic("tcp_output: template len != hdrlen - optlen"); 767 #endif /* DIAGNOSTIC */ 768 memcpy(mtod(m, caddr_t), mtod(tp->t_template, caddr_t), 769 tp->t_template->m_len); 770 th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len - 771 sizeof(struct tcphdr)); 772 773 /* 774 * Fill in fields, remembering maximum advertised 775 * window for use in delaying messages about window sizes. 776 * If resending a FIN, be sure not to use a new sequence number. 777 */ 778 if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && 779 (tp->snd_nxt == tp->snd_max)) 780 tp->snd_nxt--; 781 /* 782 * If we are doing retransmissions, then snd_nxt will 783 * not reflect the first unsent octet. For ACK only 784 * packets, we do not want the sequence number of the 785 * retransmitted packet, we want the sequence number 786 * of the next unsent octet. 
So, if there is no data 787 * (and no SYN or FIN), use snd_max instead of snd_nxt 788 * when filling in ti_seq. But if we are in persist 789 * state, snd_max might reflect one byte beyond the 790 * right edge of the window, so use snd_nxt in that 791 * case, since we know we aren't doing a retransmission. 792 * (retransmit and persist are mutually exclusive...) 793 */ 794 if (len || (flags & (TH_SYN|TH_FIN)) || 795 TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) 796 th->th_seq = htonl(tp->snd_nxt); 797 else 798 th->th_seq = htonl(tp->snd_max); 799 800 if (sack_rxmit) { 801 /* 802 * If sendalot was turned on (due to option stuffing), turn it 803 * off. Properly set th_seq field. Advance the ret'x pointer 804 * by len. 805 */ 806 if (sendalot) 807 sendalot = 0; 808 th->th_seq = htonl(p->rxmit); 809 p->rxmit += len; 810 tcpstat_pkt(tcps_sack_rexmits, tcps_sack_rexmit_bytes, len); 811 } 812 813 th->th_ack = htonl(tp->rcv_nxt); 814 if (optlen) { 815 memcpy(th + 1, opt, optlen); 816 th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; 817 } 818 #ifdef TCP_ECN 819 if (tcp_do_ecn) { 820 /* 821 * if we have received congestion experienced segs, 822 * set ECE bit. 823 */ 824 if (tp->t_flags & TF_RCVD_CE) { 825 flags |= TH_ECE; 826 tcpstat_inc(tcps_ecn_sndece); 827 } 828 if (!(tp->t_flags & TF_DISABLE_ECN)) { 829 /* 830 * if this is a SYN seg, set ECE and CWR. 831 * set only ECE for SYN-ACK if peer supports ECN. 832 */ 833 if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) 834 flags |= (TH_ECE|TH_CWR); 835 else if ((tp->t_flags & TF_ECN_PERMIT) && 836 (flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) 837 flags |= TH_ECE; 838 } 839 /* 840 * if we have reduced the congestion window, notify 841 * the peer by setting CWR bit. 842 */ 843 if ((tp->t_flags & TF_ECN_PERMIT) && 844 (tp->t_flags & TF_SEND_CWR)) { 845 flags |= TH_CWR; 846 tp->t_flags &= ~TF_SEND_CWR; 847 tcpstat_inc(tcps_ecn_sndcwr); 848 } 849 } 850 #endif 851 th->th_flags = flags; 852 853 /* 854 * Calculate receive window. 
Don't shrink window, 855 * but avoid silly window syndrome. 856 */ 857 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 858 win = 0; 859 if (win > (long)TCP_MAXWIN << tp->rcv_scale) 860 win = (long)TCP_MAXWIN << tp->rcv_scale; 861 if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) 862 win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); 863 if (flags & TH_RST) 864 win = 0; 865 th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); 866 if (th->th_win == 0) 867 tp->t_sndzerowin++; 868 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 869 u_int32_t urp = tp->snd_up - tp->snd_nxt; 870 if (urp > IP_MAXPACKET) 871 urp = IP_MAXPACKET; 872 th->th_urp = htons((u_int16_t)urp); 873 th->th_flags |= TH_URG; 874 } else 875 /* 876 * If no urgent pointer to send, then we pull 877 * the urgent pointer to the left edge of the send window 878 * so that it doesn't drift into the send window on sequence 879 * number wraparound. 880 */ 881 tp->snd_up = tp->snd_una; /* drag it along */ 882 883 #ifdef TCP_SIGNATURE 884 if (tp->t_flags & TF_SIGNATURE) { 885 int iphlen; 886 union sockaddr_union src, dst; 887 struct tdb *tdb; 888 889 bzero(&src, sizeof(union sockaddr_union)); 890 bzero(&dst, sizeof(union sockaddr_union)); 891 892 switch (tp->pf) { 893 case 0: /*default to PF_INET*/ 894 case AF_INET: 895 iphlen = sizeof(struct ip); 896 src.sa.sa_len = sizeof(struct sockaddr_in); 897 src.sa.sa_family = AF_INET; 898 src.sin.sin_addr = mtod(m, struct ip *)->ip_src; 899 dst.sa.sa_len = sizeof(struct sockaddr_in); 900 dst.sa.sa_family = AF_INET; 901 dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst; 902 break; 903 #ifdef INET6 904 case AF_INET6: 905 iphlen = sizeof(struct ip6_hdr); 906 src.sa.sa_len = sizeof(struct sockaddr_in6); 907 src.sa.sa_family = AF_INET6; 908 src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src; 909 dst.sa.sa_len = sizeof(struct sockaddr_in6); 910 dst.sa.sa_family = AF_INET6; 911 dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst; 912 break; 913 #endif /* 
INET6 */
	}

	/*
	 * Look up the security association for this flow; without one
	 * the signed segment cannot be sent.
	 */
	tdb = gettdbbysrcdst(rtable_l2(tp->t_inpcb->inp_rtableid),
	    0, &src, &dst, IPPROTO_TCP);
	if (tdb == NULL) {
		m_freem(m);
		return (EPERM);
	}

	/*
	 * Compute the TCP MD5 signature over the segment; sigoff locates
	 * the signature option payload inside the TCP options area.
	 */
	if (tcp_signature(tdb, tp->pf, m, th, iphlen, 0,
	    mtod(m, caddr_t) + hdrlen - optlen + sigoff) < 0) {
		m_freem(m);
		tdb_unref(tdb);
		return (EINVAL);
	}
	tdb_unref(tdb);
	}
#endif /* TCP_SIGNATURE */

	/* Defer checksumming until later (ip_output() or hardware) */
	m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT;

	/*
	 * In transmit state, time the transmission and arrange for
	 * the retransmit.  In persist state, just set snd_max.
	 */
	if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
		tcp_seq startseq = tp->snd_nxt;

		/*
		 * Advance snd_nxt over sequence space of this segment.
		 * SYN and FIN each occupy one sequence number.
		 */
		if (flags & (TH_SYN|TH_FIN)) {
			if (flags & TH_SYN)
				tp->snd_nxt++;
			if (flags & TH_FIN) {
				tp->snd_nxt++;
				tp->t_flags |= TF_SENTFIN;
			}
		}
		if (tp->sack_enable) {
			/*
			 * A SACK retransmission out of the hole (p) does not
			 * advance snd_nxt; skip straight to timer handling.
			 */
			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
				goto timer;
			}
		}
		tp->snd_nxt += len;
		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
			tp->snd_max = tp->snd_nxt;
			/*
			 * Time this transmission if not a retransmission and
			 * not currently timing anything.
			 */
			if (tp->t_rtttime == 0) {
				tp->t_rtttime = now;
				tp->t_rtseq = startseq;
				tcpstat_inc(tcps_segstimed);
			}
		}

		/*
		 * Set retransmit timer if not currently set,
		 * and not doing an ack or a keep-alive probe.
		 * Initial value for retransmit timer is smoothed
		 * round-trip time + 2 * round-trip time variance.
		 * Initialize shift counter which is used for backoff
		 * of retransmit time.
		 */
 timer:
		/*
		 * For a SACK retransmission that has not yet caught up to
		 * snd_max, (re)arm the retransmit timer and cancel any
		 * persist timer (the two are mutually exclusive).
		 */
		if (tp->sack_enable && sack_rxmit &&
		    TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
		    tp->snd_nxt != tp->snd_max) {
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
			if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
				TCP_TIMER_DISARM(tp, TCPT_PERSIST);
				tp->t_rxtshift = 0;
			}
		}

		if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
		    tp->snd_nxt != tp->snd_una) {
			TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
			if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
				TCP_TIMER_DISARM(tp, TCPT_PERSIST);
				tp->t_rxtshift = 0;
			}
		}

		if (len == 0 && so->so_snd.sb_cc &&
		    TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
		    TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
			/*
			 * Avoid a situation where we do not set persist timer
			 * after a zero window condition. For example:
			 * 1) A -> B: packet with enough data to fill the window
			 * 2) B -> A: ACK for #1 + new data (0 window
			 *    advertisement)
			 * 3) A -> B: ACK for #2, 0 len packet
			 *
			 * In this case, A will not activate the persist timer,
			 * because it chose to send a packet. Unless tcp_output
			 * is called for some other reason (delayed ack timer,
			 * another input packet from B, socket syscall), A will
			 * not send zero window probes.
			 *
			 * So, if you send a 0-length packet, but there is data
			 * in the socket buffer, and neither the rexmt or
			 * persist timer is already set, then activate the
			 * persist timer.
			 */
			tp->t_rxtshift = 0;
			tcp_setpersist(tp);
		}
	} else
		/* Persist state: only track the highest sequence sent. */
		if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
			tp->snd_max = tp->snd_nxt + len;

	tcp_update_sndspace(tp);

	/*
	 * Trace.
	 */
	if (so->so_options & SO_DEBUG)
		tcp_trace(TA_OUTPUT, tp->t_state, tp, tp, mtod(m, caddr_t), 0,
			len);

	/*
	 * Fill in IP length and desired time to live and
	 * send to IP level.  There should be a better way
	 * to handle ttl and tos; we could keep them in
	 * the template, but need a way to checksum without them.
	 */

#ifdef TCP_ECN
	/*
	 * if peer is ECN capable, set the ECT bit in the IP header.
	 * but don't set ECT for a pure ack, a retransmit or a window probe.
	 */
	needect = 0;
	if (tcp_do_ecn && (tp->t_flags & TF_ECN_PERMIT)) {
		if (len == 0 || SEQ_LT(tp->snd_nxt, tp->snd_max) ||
		    (tp->t_force && len == 1)) {
			/* don't set ECT */
		} else {
			needect = 1;
			tcpstat_inc(tcps_ecn_sndect);
		}
	}
#endif

	/* force routing table */
	m->m_pkthdr.ph_rtableid = tp->t_inpcb->inp_rtableid;

#if NPF > 0
	pf_mbuf_link_inpcb(m, tp->t_inpcb);
#endif

	switch (tp->pf) {
	case 0:	/*default to PF_INET*/
	case AF_INET:
		{
			struct ip *ip;

			ip = mtod(m, struct ip *);
			ip->ip_len = htons(m->m_pkthdr.len);
			packetlen = m->m_pkthdr.len;
			ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
			ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
#ifdef TCP_ECN
			if (needect)
				ip->ip_tos |= IPTOS_ECN_ECT0;
#endif
		}
#if NSTOEPLITZ > 0
		m->m_pkthdr.ph_flowid = tp->t_inpcb->inp_flowid;
		SET(m->m_pkthdr.csum_flags, M_FLOWID);
#endif
		error = ip_output(m, tp->t_inpcb->inp_options,
		    &tp->t_inpcb->inp_route,
		    (ip_mtudisc ? IP_MTUDISC : 0), NULL,
		    &tp->t_inpcb->inp_seclevel, 0);
		break;
#ifdef INET6
	case AF_INET6:
		{
			struct ip6_hdr *ip6;

			ip6 = mtod(m, struct ip6_hdr *);
			/* ip6_plen excludes the fixed IPv6 header. */
			ip6->ip6_plen = m->m_pkthdr.len -
				sizeof(struct ip6_hdr);
			packetlen = m->m_pkthdr.len;
			ip6->ip6_nxt = IPPROTO_TCP;
			ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb);
#ifdef TCP_ECN
			if (needect)
				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
#endif
		}
		error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
		    &tp->t_inpcb->inp_route, 0, NULL,
		    &tp->t_inpcb->inp_seclevel);
		break;
#endif /* INET6 */
	}

	if (error) {
out:
		if (error == ENOBUFS) {
			/*
			 * If the interface queue is full, or IP cannot
			 * get an mbuf, trigger TCP slow start.
			 */
			tp->snd_cwnd = tp->t_maxseg;
			return (0);
		}
		if (error == EMSGSIZE) {
			/*
			 * ip_output() will have already fixed the route
			 * for us.  tcp_mtudisc() will, as its last action,
			 * initiate retransmission, so it is important to
			 * not do so here.
			 */
			tcp_mtudisc(tp->t_inpcb, -1);
			return (0);
		}
		if ((error == EHOSTUNREACH || error == ENETDOWN) &&
		    TCPS_HAVERCVDSYN(tp->t_state)) {
			/* Soft error: remember it, report only if it persists. */
			tp->t_softerror = error;
			return (0);
		}

		/* Restart the delayed ACK timer, if necessary. */
		if (TCP_TIMER_ISARMED(tp, TCPT_DELACK))
			TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs);

		return (error);
	}

	/* Track the largest packet sent, for path MTU discovery. */
	if (packetlen > tp->t_pmtud_mtu_sent)
		tp->t_pmtud_mtu_sent = packetlen;

	tcpstat_inc(tcps_sndtotal);
	if (TCP_TIMER_ISARMED(tp, TCPT_DELACK))
		tcpstat_inc(tcps_delack);

	/*
	 * Data sent (as far as we can tell).
	 * If this advertises a larger window than any other segment,
	 * then remember the size of the advertised window.
	 * Any pending ACK has now been sent.
	 */
	if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + win;
	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_sndacktime = now;
	tp->t_flags &= ~TF_ACKNOW;
	TCP_TIMER_DISARM(tp, TCPT_DELACK);
	if (sendalot)
		goto again;
	return (0);
}

/*
 * Start or restart the persistence (zero-window probe) timer.
 * The timeout is derived from the smoothed RTT estimate, floored at
 * t_rttmin, scaled by tcp_backoff[t_rxtshift] and clamped to
 * [TCPTV_PERSMIN, TCPTV_PERSMAX].  Each call advances the backoff
 * shift up to TCP_MAXRXTSHIFT.  Must not be called while the
 * retransmit timer is armed; the two timers are mutually exclusive.
 */
void
tcp_setpersist(struct tcpcb *tp)
{
	int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + TCP_RTT_BASE_SHIFT);
	int msec;

	if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
		panic("tcp_output REXMT");
	/*
	 * Start/restart persistence timer.
	 */
	if (t < tp->t_rttmin)
		t = tp->t_rttmin;
	TCPT_RANGESET(msec, t * tcp_backoff[tp->t_rxtshift],
	    TCPTV_PERSMIN, TCPTV_PERSMAX);
	TCP_TIMER_ARM(tp, TCPT_PERSIST, msec);
	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
		tp->t_rxtshift++;
}

/*
 * Software TSO: chop one oversized TCP packet m0 into packets of at
 * most mss payload bytes each and queue them on ml (m0 first, trimmed
 * to the first segment).  On failure the whole list, including m0, is
 * purged and an errno is returned.  Only plain TCP over IPv4 without
 * fragments or IP options, or over IPv6 without an extension header
 * chain, is supported; anything else fails with EPROTOTYPE.
 */
int
tcp_chopper(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp,
    u_int mss)
{
	struct ip *ip = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
	struct tcphdr *th;
	int firstlen, iphlen, hlen, tlen, off;
	int error;

	ml_init(ml);
	ml_enqueue(ml, m0);

	ip = mtod(m0, struct ip *);
	switch (ip->ip_v) {
	case 4:
		iphlen = ip->ip_hl << 2;
		if (ISSET(ip->ip_off, htons(IP_OFFMASK | IP_MF)) ||
		    iphlen != sizeof(struct ip) || ip->ip_p != IPPROTO_TCP) {
			/* only TCP without fragment or IP option supported */
			error = EPROTOTYPE;
			goto bad;
		}
		break;
#ifdef INET6
	case 6:
		ip = NULL;
		ip6 = mtod(m0, struct ip6_hdr *);
		iphlen = sizeof(struct ip6_hdr);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			/* only TCP without IPv6 header chain supported */
			error = EPROTOTYPE;
			goto bad;
		}
		break;
#endif
	default:
		panic("%s: unknown ip version %d", __func__, ip->ip_v);
	}

	tlen = m0->m_pkthdr.len;
	if (tlen < iphlen + sizeof(struct tcphdr)) {
		error = ENOPROTOOPT;
		goto bad;
	}
	/* IP and TCP header should be contiguous, this check is paranoia */
	if (m0->m_len < iphlen + sizeof(*th)) {
		ml_dequeue(ml);
		if ((m0 = m_pullup(m0, iphlen + sizeof(*th))) == NULL) {
			error = ENOBUFS;
			goto bad;
		}
		ml_enqueue(ml, m0);
	}
	th = (struct tcphdr *)(mtod(m0, caddr_t) + iphlen);
	hlen = iphlen + (th->th_off << 2);
	if (tlen < hlen) {
		error = ENOPROTOOPT;
		goto bad;
	}
	firstlen = MIN(tlen - hlen, mss);

	CLR(m0->m_pkthdr.csum_flags, M_TCP_TSO);

	/*
	 * Loop through length of payload after first segment,
	 * make new header and copy data of each part and link onto chain.
	 */
	for (off = hlen + firstlen; off < tlen; off += mss) {
		struct mbuf *m;
		struct tcphdr *mhth;
		int len;

		len = MIN(tlen - off, mss);

		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m == NULL) {
			error = ENOBUFS;
			goto bad;
		}
		/* Enqueue before further setup so ml_purge() frees it on error. */
		ml_enqueue(ml, m);
		if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0)
			goto bad;

		/* IP and TCP header to the end, space for link layer header */
		m->m_len = hlen;
		m_align(m, hlen);

		/* copy and adjust TCP header */
		mhth = (struct tcphdr *)(mtod(m, caddr_t) + iphlen);
		memcpy(mhth, th, hlen - iphlen);
		/* advance sequence number by this segment's payload offset */
		mhth->th_seq = htonl(ntohl(th->th_seq) + (off - hlen));
		/* PUSH and FIN belong only on the final segment */
		if (off + len < tlen)
			CLR(mhth->th_flags, TH_PUSH|TH_FIN);

		/* add mbuf chain with payload */
		m->m_pkthdr.len = hlen + len;
		if ((m->m_next = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
			error = ENOBUFS;
			goto bad;
		}

		/* copy and adjust IP header, calculate checksum */
		SET(m->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
		if (ip) {
			struct ip *mhip;

			mhip = mtod(m, struct ip *);
			*mhip = *ip;
			mhip->ip_len = htons(hlen + len);
			/* each segment gets a fresh IP id */
			mhip->ip_id = htons(ip_randomid());
			in_hdr_cksum_out(m, ifp);
			in_proto_cksum_out(m, ifp);
		}
#ifdef INET6
		if (ip6) {
			struct ip6_hdr *mhip6;

			mhip6 = mtod(m, struct ip6_hdr *);
			*mhip6 = *ip6;
			mhip6->ip6_plen = htons(hlen - iphlen + len);
			in6_proto_cksum_out(m, ifp);
		}
#endif
	}

	/*
	 * Update first segment by trimming what's been copied out
	 * and updating header, then send each segment (in order).
	 */
	if (hlen + firstlen < tlen) {
		/* negative count trims the copied-out payload off the tail */
		m_adj(m0, hlen + firstlen - tlen);
		CLR(th->th_flags, TH_PUSH|TH_FIN);
	}
	/* adjust IP header, calculate checksum */
	SET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT);
	if (ip) {
		ip->ip_len = htons(m0->m_pkthdr.len);
		in_hdr_cksum_out(m0, ifp);
		in_proto_cksum_out(m0, ifp);
	}
#ifdef INET6
	if (ip6) {
		ip6->ip6_plen = htons(m0->m_pkthdr.len - iphlen);
		in6_proto_cksum_out(m0, ifp);
	}
#endif

	tcpstat_add(tcps_outpkttso, ml_len(ml));
	return 0;

 bad:
	tcpstat_inc(tcps_outbadtso);
	ml_purge(ml);
	return error;
}

/*
 * Output a TSO-marked TCP packet on ifp.  If the interface hardware
 * can checksum/segment this packet (in_ifcap_cksum()), hand the large
 * packet to if_output directly; otherwise segment it in software with
 * tcp_chopper() and send the resulting list.  A packet without
 * M_TCP_TSO, or whose stored mss exceeds the mtu, is left untouched
 * for the caller to fragment or fail (returns 0, *mp kept); in every
 * other case *mp is consumed and set to NULL.
 */
int
tcp_if_output_tso(struct ifnet *ifp, struct mbuf **mp, struct sockaddr *dst,
    struct rtentry *rt, uint32_t ifcap, u_int mtu)
{
	struct mbuf_list ml;
	int error;

	/* caller must fail later or fragment */
	if (!ISSET((*mp)->m_pkthdr.csum_flags, M_TCP_TSO))
		return 0;
	if ((*mp)->m_pkthdr.ph_mss > mtu) {
		CLR((*mp)->m_pkthdr.csum_flags, M_TCP_TSO);
		return 0;
	}

	/* network interface hardware will do TSO */
	if (in_ifcap_cksum(*mp, ifp, ifcap)) {
		if (ISSET(ifcap, IFCAP_TSOv4)) {
			in_hdr_cksum_out(*mp, ifp);
			in_proto_cksum_out(*mp, ifp);
		}
#ifdef INET6
		if (ISSET(ifcap, IFCAP_TSOv6))
			in6_proto_cksum_out(*mp, ifp);
#endif
		error = ifp->if_output(ifp, *mp, dst, rt);
		if (!error)
			tcpstat_inc(tcps_outhwtso);
		goto done;
	}

	/* as fallback do TSO in software */
	if ((error = tcp_chopper(*mp, &ml, ifp, (*mp)->m_pkthdr.ph_mss)) ||
	    (error = if_output_ml(ifp, &ml, dst, rt)))
		goto done;
	tcpstat_inc(tcps_outswtso);

 done:
	/* mbuf is consumed on all paths past the early returns */
	*mp = NULL;
	return error;
}