1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990 Regents of the University of California. 3 * All rights reserved. 4 * 5 * %sccs.include.redist.c% 6 * 7 * @(#)tcp_output.c 7.24 (Berkeley) 10/11/92 8 */ 9 10 #include <sys/param.h> 11 #include <sys/systm.h> 12 #include <sys/malloc.h> 13 #include <sys/mbuf.h> 14 #include <sys/protosw.h> 15 #include <sys/socket.h> 16 #include <sys/socketvar.h> 17 #include <sys/errno.h> 18 19 #include <net/route.h> 20 21 #include <netinet/in.h> 22 #include <netinet/in_systm.h> 23 #include <netinet/ip.h> 24 #include <netinet/in_pcb.h> 25 #include <netinet/ip_var.h> 26 #include <netinet/tcp.h> 27 #define TCPOUTFLAGS 28 #include <netinet/tcp_fsm.h> 29 #include <netinet/tcp_seq.h> 30 #include <netinet/tcp_timer.h> 31 #include <netinet/tcp_var.h> 32 #include <netinet/tcpip.h> 33 #include <netinet/tcp_debug.h> 34 35 #ifdef notyet 36 extern struct mbuf *m_copypack(); 37 #endif 38 39 /* 40 * Initial options. 41 */ 42 u_char tcp_initopt[4] = { TCPOPT_MAXSEG, 4, 0x0, 0x0, }; 43 44 /* 45 * Tcp output routine: figure out what should be sent and send it. 46 */ 47 tcp_output(tp) 48 register struct tcpcb *tp; 49 { 50 register struct socket *so = tp->t_inpcb->inp_socket; 51 register long len, win; 52 int off, flags, error; 53 register struct mbuf *m; 54 register struct tcpiphdr *ti; 55 u_char *opt; 56 unsigned optlen, hdrlen; 57 int idle, sendalot; 58 59 /* 60 * Determine length of data that should be transmitted, 61 * and flags that will be used. 62 * If there is some data or critical controls (SYN, RST) 63 * to send, then transmit; otherwise, investigate further. 64 */ 65 idle = (tp->snd_max == tp->snd_una); 66 if (idle && tp->t_idle >= tp->t_rxtcur) 67 /* 68 * We have been idle for "a while" and no acks are 69 * expected to clock out any data we send -- 70 * slow start to get ack "clock" running again. 71 */ 72 tp->snd_cwnd = tp->t_maxseg; 73 again: 74 sendalot = 0; 75 off = tp->snd_nxt - tp->snd_una; 76 win = min(tp->snd_wnd, tp->snd_cwnd); 77 78 /* 79 * If in persist timeout with window of 0, send 1 byte. 80 * Otherwise, if window is small but nonzero 81 * and timer expired, we will send what we can 82 * and go to transmit state. 83 */ 84 if (tp->t_force) { 85 if (win == 0) 86 win = 1; 87 else { 88 tp->t_timer[TCPT_PERSIST] = 0; 89 tp->t_rxtshift = 0; 90 } 91 } 92 93 flags = tcp_outflags[tp->t_state]; 94 len = min(so->so_snd.sb_cc, win) - off; 95 96 if (len < 0) { 97 /* 98 * If FIN has been sent but not acked, 99 * but we haven't been called to retransmit, 100 * len will be -1. Otherwise, window shrank 101 * after we sent into it. If window shrank to 0, 102 * cancel pending retransmit and pull snd_nxt 103 * back to (closed) window. We will enter persist 104 * state below. If the window didn't close completely, 105 * just wait for an ACK. 106 */ 107 len = 0; 108 if (win == 0) { 109 tp->t_timer[TCPT_REXMT] = 0; 110 tp->snd_nxt = tp->snd_una; 111 } 112 } 113 if (len > tp->t_maxseg) { 114 len = tp->t_maxseg; 115 sendalot = 1; 116 } 117 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) 118 flags &= ~TH_FIN; 119 120 win = sbspace(&so->so_rcv); 121 122 /* 123 * Sender silly window avoidance. If connection is idle 124 * and can send all data, a maximum segment, 125 * at least a maximum default-size segment do it, 126 * or are forced, do it; otherwise don't bother. 127 * If peer's buffer is tiny, then send 128 * when window is at least half open. 129 * If retransmitting (possibly after persist timer forced us 130 * to send into a small window), then must resend. 131 */ 132 if (len) { 133 if (len == tp->t_maxseg) 134 goto send; 135 if ((idle || tp->t_flags & TF_NODELAY) && 136 len + off >= so->so_snd.sb_cc) 137 goto send; 138 if (tp->t_force) 139 goto send; 140 if (len >= tp->max_sndwnd / 2) 141 goto send; 142 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 143 goto send; 144 } 145 146 /* 147 * Compare available window to amount of window 148 * known to peer (as advertised window less 149 * next expected input). If the difference is at least two 150 * max size segments, or at least 50% of the maximum possible 151 * window, then want to send a window update to peer. 152 */ 153 if (win > 0) { 154 long adv = win - (tp->rcv_adv - tp->rcv_nxt); 155 156 if (adv >= (long) (2 * tp->t_maxseg)) 157 goto send; 158 if (2 * adv >= (long) so->so_rcv.sb_hiwat) 159 goto send; 160 } 161 162 /* 163 * Send if we owe peer an ACK. 164 */ 165 if (tp->t_flags & TF_ACKNOW) 166 goto send; 167 if (flags & (TH_SYN|TH_RST)) 168 goto send; 169 if (SEQ_GT(tp->snd_up, tp->snd_una)) 170 goto send; 171 /* 172 * If our state indicates that FIN should be sent 173 * and we have not yet done so, or we're retransmitting the FIN, 174 * then we need to send. 175 */ 176 if (flags & TH_FIN && 177 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) 178 goto send; 179 180 /* 181 * TCP window updates are not reliable, rather a polling protocol 182 * using ``persist'' packets is used to insure receipt of window 183 * updates. The three ``states'' for the output side are: 184 * idle not doing retransmits or persists 185 * persisting to move a small or zero window 186 * (re)transmitting and thereby not persisting 187 * 188 * tp->t_timer[TCPT_PERSIST] 189 * is set when we are in persist state. 190 * tp->t_force 191 * is set when we are called to send a persist packet. 192 * tp->t_timer[TCPT_REXMT] 193 * is set when we are retransmitting 194 * The output side is idle when both timers are zero. 195 * 196 * If send window is too small, there is data to transmit, and no 197 * retransmit or persist is pending, then go to persist state. 198 * If nothing happens soon, send when timer expires: 199 * if window is nonzero, transmit what we can, 200 * otherwise force out a byte. 201 */ 202 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && 203 tp->t_timer[TCPT_PERSIST] == 0) { 204 tp->t_rxtshift = 0; 205 tcp_setpersist(tp); 206 } 207 208 /* 209 * No reason to send a segment, just return. 210 */ 211 return (0); 212 213 send: 214 /* 215 * Before ESTABLISHED, force sending of initial options 216 * unless TCP set not to do any options. 217 * NOTE: we assume that the IP/TCP header plus TCP options 218 * always fit in a single mbuf, leaving room for a maximum 219 * link header, i.e. 220 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN 221 */ 222 optlen = 0; 223 hdrlen = sizeof (struct tcpiphdr); 224 if (flags & TH_SYN && (tp->t_flags & TF_NOOPT) == 0) { 225 u_short mss; 226 227 opt = tcp_initopt; 228 optlen = sizeof (tcp_initopt); 229 hdrlen += sizeof (tcp_initopt); 230 mss = htons((u_short) tcp_mss(tp, 0)); 231 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss)); 232 #ifdef DIAGNOSTIC 233 if (max_linkhdr + hdrlen > MHLEN) 234 panic("tcphdr too big"); 235 #endif 236 } 237 238 /* 239 * Grab a header mbuf, attaching a copy of data to 240 * be transmitted, and initialize the header from 241 * the template for sends on this connection. 242 */ 243 if (len) { 244 if (tp->t_force && len == 1) 245 tcpstat.tcps_sndprobe++; 246 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 247 tcpstat.tcps_sndrexmitpack++; 248 tcpstat.tcps_sndrexmitbyte += len; 249 } else { 250 tcpstat.tcps_sndpack++; 251 tcpstat.tcps_sndbyte += len; 252 } 253 #ifdef notyet 254 if ((m = m_copypack(so->so_snd.sb_mb, off, 255 (int)len, max_linkhdr + hdrlen)) == 0) { 256 error = ENOBUFS; 257 goto out; 258 } 259 /* 260 * m_copypack left space for our hdr; use it. 261 */ 262 m->m_len += hdrlen; 263 m->m_data -= hdrlen; 264 #else 265 MGETHDR(m, M_DONTWAIT, MT_HEADER); 266 if (m == NULL) { 267 error = ENOBUFS; 268 goto out; 269 } 270 m->m_data += max_linkhdr; 271 m->m_len = hdrlen; 272 if (len <= MHLEN - hdrlen - max_linkhdr) { 273 m_copydata(so->so_snd.sb_mb, off, (int) len, 274 mtod(m, caddr_t) + hdrlen); 275 m->m_len += len; 276 } else { 277 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len); 278 if (m->m_next == 0) 279 len = 0; 280 } 281 #endif 282 /* 283 * If we're sending everything we've got, set PUSH. 284 * (This will keep happy those implementations which only 285 * give data to the user when a buffer fills or 286 * a PUSH comes in.) 287 */ 288 if (off + len == so->so_snd.sb_cc) 289 flags |= TH_PUSH; 290 } else { 291 if (tp->t_flags & TF_ACKNOW) 292 tcpstat.tcps_sndacks++; 293 else if (flags & (TH_SYN|TH_FIN|TH_RST)) 294 tcpstat.tcps_sndctrl++; 295 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 296 tcpstat.tcps_sndurg++; 297 else 298 tcpstat.tcps_sndwinup++; 299 300 MGETHDR(m, M_DONTWAIT, MT_HEADER); 301 if (m == NULL) { 302 error = ENOBUFS; 303 goto out; 304 } 305 m->m_data += max_linkhdr; 306 m->m_len = hdrlen; 307 } 308 m->m_pkthdr.rcvif = (struct ifnet *)0; 309 ti = mtod(m, struct tcpiphdr *); 310 if (tp->t_template == 0) 311 panic("tcp_output"); 312 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr)); 313 314 /* 315 * Fill in fields, remembering maximum advertised 316 * window for use in delaying messages about window sizes. 317 * If resending a FIN, be sure not to use a new sequence number. 318 */ 319 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 320 tp->snd_nxt == tp->snd_max) 321 tp->snd_nxt--; 322 ti->ti_seq = htonl(tp->snd_nxt); 323 ti->ti_ack = htonl(tp->rcv_nxt); 324 if (optlen) { 325 bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen); 326 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; 327 } 328 ti->ti_flags = flags; 329 /* 330 * Calculate receive window. Don't shrink window, 331 * but avoid silly window syndrome. 332 */ 333 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg) 334 win = 0; 335 if (win > TCP_MAXWIN) 336 win = TCP_MAXWIN; 337 if (win < (long)(tp->rcv_adv - tp->rcv_nxt)) 338 win = (long)(tp->rcv_adv - tp->rcv_nxt); 339 ti->ti_win = htons((u_short)win); 340 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 341 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 342 ti->ti_flags |= TH_URG; 343 } else 344 /* 345 * If no urgent pointer to send, then we pull 346 * the urgent pointer to the left edge of the send window 347 * so that it doesn't drift into the send window on sequence 348 * number wraparound. 349 */ 350 tp->snd_up = tp->snd_una; /* drag it along */ 351 352 /* 353 * Put TCP length in extended header, and then 354 * checksum extended header and data. 355 */ 356 if (len + optlen) 357 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) + 358 optlen + len)); 359 ti->ti_sum = in_cksum(m, (int)(hdrlen + len)); 360 361 /* 362 * In transmit state, time the transmission and arrange for 363 * the retransmit. In persist state, just set snd_max. 364 */ 365 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { 366 tcp_seq startseq = tp->snd_nxt; 367 368 /* 369 * Advance snd_nxt over sequence space of this segment. 370 */ 371 if (flags & (TH_SYN|TH_FIN)) { 372 if (flags & TH_SYN) 373 tp->snd_nxt++; 374 if (flags & TH_FIN) { 375 tp->snd_nxt++; 376 tp->t_flags |= TF_SENTFIN; 377 } 378 } 379 tp->snd_nxt += len; 380 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 381 tp->snd_max = tp->snd_nxt; 382 /* 383 * Time this transmission if not a retransmission and 384 * not currently timing anything. 385 */ 386 if (tp->t_rtt == 0) { 387 tp->t_rtt = 1; 388 tp->t_rtseq = startseq; 389 tcpstat.tcps_segstimed++; 390 } 391 } 392 393 /* 394 * Set retransmit timer if not currently set, 395 * and not doing an ack or a keep-alive probe. 396 * Initial value for retransmit timer is smoothed 397 * round-trip time + 2 * round-trip time variance. 398 * Initialize shift counter which is used for backoff 399 * of retransmit time. 400 */ 401 if (tp->t_timer[TCPT_REXMT] == 0 && 402 tp->snd_nxt != tp->snd_una) { 403 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; 404 if (tp->t_timer[TCPT_PERSIST]) { 405 tp->t_timer[TCPT_PERSIST] = 0; 406 tp->t_rxtshift = 0; 407 } 408 } 409 } else 410 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max)) 411 tp->snd_max = tp->snd_nxt + len; 412 413 /* 414 * Trace. 415 */ 416 if (so->so_options & SO_DEBUG) 417 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0); 418 419 /* 420 * Fill in IP length and desired time to live and 421 * send to IP level. There should be a better way 422 * to handle ttl and tos; we could keep them in 423 * the template, but need a way to checksum without them. 424 */ 425 m->m_pkthdr.len = hdrlen + len; 426 ((struct ip *)ti)->ip_len = m->m_pkthdr.len; 427 ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ 428 ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ 429 #if BSD >= 43 430 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, 431 so->so_options & SO_DONTROUTE); 432 #else 433 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route, 434 so->so_options & SO_DONTROUTE); 435 #endif 436 if (error) { 437 out: 438 if (error == ENOBUFS) { 439 tcp_quench(tp->t_inpcb); 440 return (0); 441 } 442 if ((error == EHOSTUNREACH || error == ENETDOWN) 443 && TCPS_HAVERCVDSYN(tp->t_state)) { 444 tp->t_softerror = error; 445 return (0); 446 } 447 return (error); 448 } 449 tcpstat.tcps_sndtotal++; 450 451 /* 452 * Data sent (as far as we can tell). 453 * If this advertises a larger window than any other segment, 454 * then remember the size of the advertised window. 455 * Any pending ACK has now been sent. 456 */ 457 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) 458 tp->rcv_adv = tp->rcv_nxt + win; 459 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); 460 if (sendalot) 461 goto again; 462 return (0); 463 } 464 465 tcp_setpersist(tp) 466 register struct tcpcb *tp; 467 { 468 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; 469 470 if (tp->t_timer[TCPT_REXMT]) 471 panic("tcp_output REXMT"); 472 /* 473 * Start/restart persistance timer. 474 */ 475 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], 476 t * tcp_backoff[tp->t_rxtshift], 477 TCPTV_PERSMIN, TCPTV_PERSMAX); 478 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 479 tp->t_rxtshift++; 480 } 481