1 /*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * %sccs.include.redist.c%
6 *
7 * @(#)tcp_output.c 8.4 (Berkeley) 05/24/95
8 */
9
10 #include <sys/param.h>
11 #include <sys/systm.h>
12 #include <sys/malloc.h>
13 #include <sys/mbuf.h>
14 #include <sys/protosw.h>
15 #include <sys/socket.h>
16 #include <sys/socketvar.h>
17 #include <sys/errno.h>
18
19 #include <net/route.h>
20
21 #include <netinet/in.h>
22 #include <netinet/in_systm.h>
23 #include <netinet/ip.h>
24 #include <netinet/in_pcb.h>
25 #include <netinet/ip_var.h>
26 #include <netinet/tcp.h>
27 #define TCPOUTFLAGS
28 #include <netinet/tcp_fsm.h>
29 #include <netinet/tcp_seq.h>
30 #include <netinet/tcp_timer.h>
31 #include <netinet/tcp_var.h>
32 #include <netinet/tcpip.h>
33 #include <netinet/tcp_debug.h>
34
#ifdef notyet
/*
 * Old-style declaration of the helper used only by the "notyet"
 * data-copy path in tcp_output().
 */
extern struct mbuf *m_copypack();
#endif


/*
 * Size, in bytes, of the on-stack buffer in which tcp_output()
 * assembles the TCP options for a segment.
 */
#define MAX_TCPOPTLEN 32 /* max # bytes that go in options */
41
42 /*
43 * Tcp output routine: figure out what should be sent and send it.
44 */
45 int
tcp_output(tp)46 tcp_output(tp)
47 register struct tcpcb *tp;
48 {
49 register struct socket *so = tp->t_inpcb->inp_socket;
50 register long len, win;
51 int off, flags, error;
52 register struct mbuf *m;
53 register struct tcpiphdr *ti;
54 u_char opt[MAX_TCPOPTLEN];
55 unsigned optlen, hdrlen;
56 int idle, sendalot;
57
58 /*
59 * Determine length of data that should be transmitted,
60 * and flags that will be used.
61 * If there is some data or critical controls (SYN, RST)
62 * to send, then transmit; otherwise, investigate further.
63 */
64 idle = (tp->snd_max == tp->snd_una);
65 if (idle && tp->t_idle >= tp->t_rxtcur)
66 /*
67 * We have been idle for "a while" and no acks are
68 * expected to clock out any data we send --
69 * slow start to get ack "clock" running again.
70 */
71 tp->snd_cwnd = tp->t_maxseg;
72 again:
73 sendalot = 0;
74 off = tp->snd_nxt - tp->snd_una;
75 win = min(tp->snd_wnd, tp->snd_cwnd);
76
77 flags = tcp_outflags[tp->t_state];
78 /*
79 * If in persist timeout with window of 0, send 1 byte.
80 * Otherwise, if window is small but nonzero
81 * and timer expired, we will send what we can
82 * and go to transmit state.
83 */
84 if (tp->t_force) {
85 if (win == 0) {
86 /*
87 * If we still have some data to send, then
88 * clear the FIN bit. Usually this would
89 * happen below when it realizes that we
90 * aren't sending all the data. However,
91 * if we have exactly 1 byte of unset data,
92 * then it won't clear the FIN bit below,
93 * and if we are in persist state, we wind
94 * up sending the packet without recording
95 * that we sent the FIN bit.
96 *
97 * We can't just blindly clear the FIN bit,
98 * because if we don't have any more data
99 * to send then the probe will be the FIN
100 * itself.
101 */
102 if (off < so->so_snd.sb_cc)
103 flags &= ~TH_FIN;
104 win = 1;
105 } else {
106 tp->t_timer[TCPT_PERSIST] = 0;
107 tp->t_rxtshift = 0;
108 }
109 }
110
111 len = min(so->so_snd.sb_cc, win) - off;
112
113 if (len < 0) {
114 /*
115 * If FIN has been sent but not acked,
116 * but we haven't been called to retransmit,
117 * len will be -1. Otherwise, window shrank
118 * after we sent into it. If window shrank to 0,
119 * cancel pending retransmit and pull snd_nxt
120 * back to (closed) window. We will enter persist
121 * state below. If the window didn't close completely,
122 * just wait for an ACK.
123 */
124 len = 0;
125 if (win == 0) {
126 tp->t_timer[TCPT_REXMT] = 0;
127 tp->snd_nxt = tp->snd_una;
128 }
129 }
130 if (len > tp->t_maxseg) {
131 len = tp->t_maxseg;
132 sendalot = 1;
133 }
134 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc))
135 flags &= ~TH_FIN;
136
137 win = sbspace(&so->so_rcv);
138
139 /*
140 * Sender silly window avoidance. If connection is idle
141 * and can send all data, a maximum segment,
142 * at least a maximum default-size segment do it,
143 * or are forced, do it; otherwise don't bother.
144 * If peer's buffer is tiny, then send
145 * when window is at least half open.
146 * If retransmitting (possibly after persist timer forced us
147 * to send into a small window), then must resend.
148 */
149 if (len) {
150 if (len == tp->t_maxseg)
151 goto send;
152 if ((idle || tp->t_flags & TF_NODELAY) &&
153 len + off >= so->so_snd.sb_cc)
154 goto send;
155 if (tp->t_force)
156 goto send;
157 if (len >= tp->max_sndwnd / 2)
158 goto send;
159 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
160 goto send;
161 }
162
163 /*
164 * Compare available window to amount of window
165 * known to peer (as advertised window less
166 * next expected input). If the difference is at least two
167 * max size segments, or at least 50% of the maximum possible
168 * window, then want to send a window update to peer.
169 */
170 if (win > 0) {
171 /*
172 * "adv" is the amount we can increase the window,
173 * taking into account that we are limited by
174 * TCP_MAXWIN << tp->rcv_scale.
175 */
176 long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
177 (tp->rcv_adv - tp->rcv_nxt);
178
179 if (adv >= (long) (2 * tp->t_maxseg))
180 goto send;
181 if (2 * adv >= (long) so->so_rcv.sb_hiwat)
182 goto send;
183 }
184
185 /*
186 * Send if we owe peer an ACK.
187 */
188 if (tp->t_flags & TF_ACKNOW)
189 goto send;
190 if (flags & (TH_SYN|TH_RST))
191 goto send;
192 if (SEQ_GT(tp->snd_up, tp->snd_una))
193 goto send;
194 /*
195 * If our state indicates that FIN should be sent
196 * and we have not yet done so, or we're retransmitting the FIN,
197 * then we need to send.
198 */
199 if (flags & TH_FIN &&
200 ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
201 goto send;
202
203 /*
204 * TCP window updates are not reliable, rather a polling protocol
205 * using ``persist'' packets is used to insure receipt of window
206 * updates. The three ``states'' for the output side are:
207 * idle not doing retransmits or persists
208 * persisting to move a small or zero window
209 * (re)transmitting and thereby not persisting
210 *
211 * tp->t_timer[TCPT_PERSIST]
212 * is set when we are in persist state.
213 * tp->t_force
214 * is set when we are called to send a persist packet.
215 * tp->t_timer[TCPT_REXMT]
216 * is set when we are retransmitting
217 * The output side is idle when both timers are zero.
218 *
219 * If send window is too small, there is data to transmit, and no
220 * retransmit or persist is pending, then go to persist state.
221 * If nothing happens soon, send when timer expires:
222 * if window is nonzero, transmit what we can,
223 * otherwise force out a byte.
224 */
225 if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 &&
226 tp->t_timer[TCPT_PERSIST] == 0) {
227 tp->t_rxtshift = 0;
228 tcp_setpersist(tp);
229 }
230
231 /*
232 * No reason to send a segment, just return.
233 */
234 return (0);
235
236 send:
237 /*
238 * Before ESTABLISHED, force sending of initial options
239 * unless TCP set not to do any options.
240 * NOTE: we assume that the IP/TCP header plus TCP options
241 * always fit in a single mbuf, leaving room for a maximum
242 * link header, i.e.
243 * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
244 */
245 optlen = 0;
246 hdrlen = sizeof (struct tcpiphdr);
247 if (flags & TH_SYN) {
248 tp->snd_nxt = tp->iss;
249 if ((tp->t_flags & TF_NOOPT) == 0) {
250 u_short mss;
251
252 opt[0] = TCPOPT_MAXSEG;
253 opt[1] = 4;
254 mss = htons((u_short) tcp_mss(tp, 0));
255 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
256 optlen = 4;
257
258 if ((tp->t_flags & TF_REQ_SCALE) &&
259 ((flags & TH_ACK) == 0 ||
260 (tp->t_flags & TF_RCVD_SCALE))) {
261 *((u_long *) (opt + optlen)) = htonl(
262 TCPOPT_NOP << 24 |
263 TCPOPT_WINDOW << 16 |
264 TCPOLEN_WINDOW << 8 |
265 tp->request_r_scale);
266 optlen += 4;
267 }
268 }
269 }
270
271 /*
272 * Send a timestamp and echo-reply if this is a SYN and our side
273 * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
274 * and our peer have sent timestamps in our SYN's.
275 */
276 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
277 (flags & TH_RST) == 0 &&
278 ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
279 (tp->t_flags & TF_RCVD_TSTMP))) {
280 u_long *lp = (u_long *)(opt + optlen);
281
282 /* Form timestamp option as shown in appendix A of RFC 1323. */
283 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
284 *lp++ = htonl(tcp_now);
285 *lp = htonl(tp->ts_recent);
286 optlen += TCPOLEN_TSTAMP_APPA;
287 }
288
289 hdrlen += optlen;
290
291 /*
292 * Adjust data length if insertion of options will
293 * bump the packet length beyond the t_maxseg length.
294 */
295 if (len > tp->t_maxseg - optlen) {
296 len = tp->t_maxseg - optlen;
297 sendalot = 1;
298 flags &= ~TH_FIN;
299 }
300
301
302 #ifdef DIAGNOSTIC
303 if (max_linkhdr + hdrlen > MHLEN)
304 panic("tcphdr too big");
305 #endif
306
307 /*
308 * Grab a header mbuf, attaching a copy of data to
309 * be transmitted, and initialize the header from
310 * the template for sends on this connection.
311 */
312 if (len) {
313 if (tp->t_force && len == 1)
314 tcpstat.tcps_sndprobe++;
315 else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
316 tcpstat.tcps_sndrexmitpack++;
317 tcpstat.tcps_sndrexmitbyte += len;
318 } else {
319 tcpstat.tcps_sndpack++;
320 tcpstat.tcps_sndbyte += len;
321 }
322 #ifdef notyet
323 if ((m = m_copypack(so->so_snd.sb_mb, off,
324 (int)len, max_linkhdr + hdrlen)) == 0) {
325 error = ENOBUFS;
326 goto out;
327 }
328 /*
329 * m_copypack left space for our hdr; use it.
330 */
331 m->m_len += hdrlen;
332 m->m_data -= hdrlen;
333 #else
334 MGETHDR(m, M_DONTWAIT, MT_HEADER);
335 if (m == NULL) {
336 error = ENOBUFS;
337 goto out;
338 }
339 m->m_data += max_linkhdr;
340 m->m_len = hdrlen;
341 if (len <= MHLEN - hdrlen - max_linkhdr) {
342 m_copydata(so->so_snd.sb_mb, off, (int) len,
343 mtod(m, caddr_t) + hdrlen);
344 m->m_len += len;
345 } else {
346 m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
347 if (m->m_next == 0) {
348 (void) m_free(m);
349 error = ENOBUFS;
350 goto out;
351 }
352 }
353 #endif
354 /*
355 * If we're sending everything we've got, set PUSH.
356 * (This will keep happy those implementations which only
357 * give data to the user when a buffer fills or
358 * a PUSH comes in.)
359 */
360 if (off + len == so->so_snd.sb_cc)
361 flags |= TH_PUSH;
362 } else {
363 if (tp->t_flags & TF_ACKNOW)
364 tcpstat.tcps_sndacks++;
365 else if (flags & (TH_SYN|TH_FIN|TH_RST))
366 tcpstat.tcps_sndctrl++;
367 else if (SEQ_GT(tp->snd_up, tp->snd_una))
368 tcpstat.tcps_sndurg++;
369 else
370 tcpstat.tcps_sndwinup++;
371
372 MGETHDR(m, M_DONTWAIT, MT_HEADER);
373 if (m == NULL) {
374 error = ENOBUFS;
375 goto out;
376 }
377 m->m_data += max_linkhdr;
378 m->m_len = hdrlen;
379 }
380 m->m_pkthdr.rcvif = (struct ifnet *)0;
381 ti = mtod(m, struct tcpiphdr *);
382 if (tp->t_template == 0)
383 panic("tcp_output");
384 bcopy((caddr_t)tp->t_template, (caddr_t)ti, sizeof (struct tcpiphdr));
385
386 /*
387 * Fill in fields, remembering maximum advertised
388 * window for use in delaying messages about window sizes.
389 * If resending a FIN, be sure not to use a new sequence number.
390 */
391 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
392 tp->snd_nxt == tp->snd_max)
393 tp->snd_nxt--;
394 /*
395 * If we are doing retransmissions, then snd_nxt will
396 * not reflect the first unsent octet. For ACK only
397 * packets, we do not want the sequence number of the
398 * retransmitted packet, we want the sequence number
399 * of the next unsent octet. So, if there is no data
400 * (and no SYN or FIN), use snd_max instead of snd_nxt
401 * when filling in ti_seq. But if we are in persist
402 * state, snd_max might reflect one byte beyond the
403 * right edge of the window, so use snd_nxt in that
404 * case, since we know we aren't doing a retransmission.
405 * (retransmit and persist are mutually exclusive...)
406 */
407 if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
408 ti->ti_seq = htonl(tp->snd_nxt);
409 else
410 ti->ti_seq = htonl(tp->snd_max);
411 ti->ti_ack = htonl(tp->rcv_nxt);
412 if (optlen) {
413 bcopy((caddr_t)opt, (caddr_t)(ti + 1), optlen);
414 ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2;
415 }
416 ti->ti_flags = flags;
417 /*
418 * Calculate receive window. Don't shrink window,
419 * but avoid silly window syndrome.
420 */
421 if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
422 win = 0;
423 if (win > (long)TCP_MAXWIN << tp->rcv_scale)
424 win = (long)TCP_MAXWIN << tp->rcv_scale;
425 if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
426 win = (long)(tp->rcv_adv - tp->rcv_nxt);
427 ti->ti_win = htons((u_short) (win>>tp->rcv_scale));
428 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
429 ti->ti_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
430 ti->ti_flags |= TH_URG;
431 } else
432 /*
433 * If no urgent pointer to send, then we pull
434 * the urgent pointer to the left edge of the send window
435 * so that it doesn't drift into the send window on sequence
436 * number wraparound.
437 */
438 tp->snd_up = tp->snd_una; /* drag it along */
439
440 /*
441 * Put TCP length in extended header, and then
442 * checksum extended header and data.
443 */
444 if (len + optlen)
445 ti->ti_len = htons((u_short)(sizeof (struct tcphdr) +
446 optlen + len));
447 ti->ti_sum = in_cksum(m, (int)(hdrlen + len));
448
449 /*
450 * In transmit state, time the transmission and arrange for
451 * the retransmit. In persist state, just set snd_max.
452 */
453 if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
454 tcp_seq startseq = tp->snd_nxt;
455
456 /*
457 * Advance snd_nxt over sequence space of this segment.
458 */
459 if (flags & (TH_SYN|TH_FIN)) {
460 if (flags & TH_SYN)
461 tp->snd_nxt++;
462 if (flags & TH_FIN) {
463 tp->snd_nxt++;
464 tp->t_flags |= TF_SENTFIN;
465 }
466 }
467 tp->snd_nxt += len;
468 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
469 tp->snd_max = tp->snd_nxt;
470 /*
471 * Time this transmission if not a retransmission and
472 * not currently timing anything.
473 */
474 if (tp->t_rtt == 0) {
475 tp->t_rtt = 1;
476 tp->t_rtseq = startseq;
477 tcpstat.tcps_segstimed++;
478 }
479 }
480
481 /*
482 * Set retransmit timer if not currently set,
483 * and not doing an ack or a keep-alive probe.
484 * Initial value for retransmit timer is smoothed
485 * round-trip time + 2 * round-trip time variance.
486 * Initialize shift counter which is used for backoff
487 * of retransmit time.
488 */
489 if (tp->t_timer[TCPT_REXMT] == 0 &&
490 tp->snd_nxt != tp->snd_una) {
491 tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
492 if (tp->t_timer[TCPT_PERSIST]) {
493 tp->t_timer[TCPT_PERSIST] = 0;
494 tp->t_rxtshift = 0;
495 }
496 }
497 } else
498 if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
499 tp->snd_max = tp->snd_nxt + len;
500
501 /*
502 * Trace.
503 */
504 if (so->so_options & SO_DEBUG)
505 tcp_trace(TA_OUTPUT, tp->t_state, tp, ti, 0);
506
507 /*
508 * Fill in IP length and desired time to live and
509 * send to IP level. There should be a better way
510 * to handle ttl and tos; we could keep them in
511 * the template, but need a way to checksum without them.
512 */
513 m->m_pkthdr.len = hdrlen + len;
514 #ifdef TUBA
515 if (tp->t_tuba_pcb)
516 error = tuba_output(m, tp);
517 else
518 #endif
519 {
520 ((struct ip *)ti)->ip_len = m->m_pkthdr.len;
521 ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */
522 ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */
523 #if BSD >= 43
524 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
525 so->so_options & SO_DONTROUTE, 0);
526 #else
527 error = ip_output(m, (struct mbuf *)0, &tp->t_inpcb->inp_route,
528 so->so_options & SO_DONTROUTE);
529 #endif
530 }
531 if (error) {
532 out:
533 if (error == ENOBUFS) {
534 tcp_quench(tp->t_inpcb, 0);
535 return (0);
536 }
537 if ((error == EHOSTUNREACH || error == ENETDOWN)
538 && TCPS_HAVERCVDSYN(tp->t_state)) {
539 tp->t_softerror = error;
540 return (0);
541 }
542 return (error);
543 }
544 tcpstat.tcps_sndtotal++;
545
546 /*
547 * Data sent (as far as we can tell).
548 * If this advertises a larger window than any other segment,
549 * then remember the size of the advertised window.
550 * Any pending ACK has now been sent.
551 */
552 if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
553 tp->rcv_adv = tp->rcv_nxt + win;
554 tp->last_ack_sent = tp->rcv_nxt;
555 tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
556 if (sendalot)
557 goto again;
558 return (0);
559 }
560
561 void
tcp_setpersist(tp)562 tcp_setpersist(tp)
563 register struct tcpcb *tp;
564 {
565 register t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
566
567 if (tp->t_timer[TCPT_REXMT])
568 panic("tcp_output REXMT");
569 /*
570 * Start/restart persistance timer.
571 */
572 TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
573 t * tcp_backoff[tp->t_rxtshift],
574 TCPTV_PERSMIN, TCPTV_PERSMAX);
575 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
576 tp->t_rxtshift++;
577 }
578