1 /* $OpenBSD: tcp_usrreq.c,v 1.231 2024/04/12 16:07:09 bluhm Exp $ */
2 /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33 *
34 * NRL grants permission for redistribution and use in source and binary
35 * forms, with or without modification, of the software and documentation
36 * created at NRL provided that the following conditions are met:
37 *
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgements:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * This product includes software developed at the Information
48 * Technology Division, US Naval Research Laboratory.
49 * 4. Neither the name of the NRL nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 * The views and conclusions contained in the software and documentation
66 * are those of the authors and should not be interpreted as representing
67 * official policies, either expressed or implied, of the US Naval
68 * Research Laboratory (NRL).
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/mbuf.h>
74 #include <sys/socket.h>
75 #include <sys/socketvar.h>
76 #include <sys/protosw.h>
77 #include <sys/stat.h>
78 #include <sys/sysctl.h>
79 #include <sys/domain.h>
80 #include <sys/kernel.h>
81 #include <sys/pool.h>
82 #include <sys/proc.h>
83
84 #include <net/if.h>
85 #include <net/if_var.h>
86 #include <net/route.h>
87
88 #include <netinet/in.h>
89 #include <netinet/in_var.h>
90 #include <netinet/ip.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/ip_var.h>
93 #include <netinet6/ip6_var.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_fsm.h>
96 #include <netinet/tcp_seq.h>
97 #include <netinet/tcp_timer.h>
98 #include <netinet/tcp_var.h>
99 #include <netinet/tcp_debug.h>
100
101 #ifdef INET6
102 #include <netinet6/in6_var.h>
103 #endif
104
/*
 * Default send and receive buffer space for TCP sockets; tcp_attach()
 * hands tcp_sendspace and tcp_recvspace to soreserve().  Both defaults
 * can be overridden at build time.
 *
 * The replacement lists are parenthesized so the macros act as a single
 * value when used inside a larger expression (e.g. "x % TCP_SENDSPACE").
 */
#ifndef TCP_SENDSPACE
#define	TCP_SENDSPACE	(1024 * 16)
#endif
u_int	tcp_sendspace = TCP_SENDSPACE;
#ifndef TCP_RECVSPACE
#define	TCP_RECVSPACE	(1024 * 16)
#endif
u_int	tcp_recvspace = TCP_RECVSPACE;
/* Not referenced in this part of the file; presumably the rcvbuf autosizing step. */
u_int	tcp_autorcvbuf_inc = 16 * 1024;
114
115 const struct pr_usrreqs tcp_usrreqs = {
116 .pru_attach = tcp_attach,
117 .pru_detach = tcp_detach,
118 .pru_bind = tcp_bind,
119 .pru_listen = tcp_listen,
120 .pru_connect = tcp_connect,
121 .pru_accept = tcp_accept,
122 .pru_disconnect = tcp_disconnect,
123 .pru_shutdown = tcp_shutdown,
124 .pru_rcvd = tcp_rcvd,
125 .pru_send = tcp_send,
126 .pru_abort = tcp_abort,
127 .pru_sense = tcp_sense,
128 .pru_rcvoob = tcp_rcvoob,
129 .pru_sendoob = tcp_sendoob,
130 .pru_control = in_control,
131 .pru_sockaddr = tcp_sockaddr,
132 .pru_peeraddr = tcp_peeraddr,
133 };
134
#ifdef INET6
/*
 * User-request hooks for the INET6 build.  Identical to tcp_usrreqs
 * except that pru_control is routed to in6_control().
 */
const struct pr_usrreqs tcp6_usrreqs = {
	/* socket setup and teardown */
	.pru_attach	= tcp_attach,
	.pru_detach	= tcp_detach,
	.pru_abort	= tcp_abort,

	/* connection management */
	.pru_bind	= tcp_bind,
	.pru_listen	= tcp_listen,
	.pru_connect	= tcp_connect,
	.pru_accept	= tcp_accept,
	.pru_disconnect	= tcp_disconnect,
	.pru_shutdown	= tcp_shutdown,

	/* data transfer */
	.pru_send	= tcp_send,
	.pru_rcvd	= tcp_rcvd,
	.pru_sendoob	= tcp_sendoob,
	.pru_rcvoob	= tcp_rcvoob,

	/* queries and control */
	.pru_sense	= tcp_sense,
	.pru_control	= in6_control,
	.pru_sockaddr	= tcp_sockaddr,
	.pru_peeraddr	= tcp_peeraddr,
};
#endif /* INET6 */
156
157 const struct sysctl_bounded_args tcpctl_vars[] = {
158 { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 },
159 { TCPCTL_SACK, &tcp_do_sack, 0, 1 },
160 { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 },
161 { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 },
162 { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 },
163 #ifdef TCP_ECN
164 { TCPCTL_ECN, &tcp_do_ecn, 0, 1 },
165 #endif
166 { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 },
167 { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX },
168 { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 },
169 { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 },
170 { TCPCTL_TSO, &tcp_do_tso, 0, 1 },
171 };
172
173 struct inpcbtable tcbtable;
174 #ifdef INET6
175 struct inpcbtable tcb6table;
176 #endif
177
178 int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *);
179 int tcp_ident(void *, size_t *, void *, size_t, int);
180
181 static inline int tcp_sogetpcb(struct socket *, struct inpcb **,
182 struct tcpcb **);
183
184 static inline int
tcp_sogetpcb(struct socket * so,struct inpcb ** rinp,struct tcpcb ** rtp)185 tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp)
186 {
187 struct inpcb *inp;
188 struct tcpcb *tp;
189
190 /*
191 * When a TCP is attached to a socket, then there will be
192 * a (struct inpcb) pointed at by the socket, and this
193 * structure will point at a subsidiary (struct tcpcb).
194 */
195 if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) {
196 if (so->so_error)
197 return so->so_error;
198 return EINVAL;
199 }
200
201 *rinp = inp;
202 *rtp = tp;
203
204 return 0;
205 }
206
207 /*
208 * Export internal TCP state information via a struct tcp_info without
209 * leaking any sensitive information. Sequence numbers are reported
210 * relative to the initial sequence number.
211 */
212 int
tcp_fill_info(struct tcpcb * tp,struct socket * so,struct mbuf * m)213 tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m)
214 {
215 struct proc *p = curproc;
216 struct tcp_info *ti;
217 u_int t = 1000; /* msec => usec */
218 uint64_t now;
219
220 if (sizeof(*ti) > MLEN) {
221 MCLGETL(m, M_WAITOK, sizeof(*ti));
222 if (!ISSET(m->m_flags, M_EXT))
223 return ENOMEM;
224 }
225 ti = mtod(m, struct tcp_info *);
226 m->m_len = sizeof(*ti);
227 memset(ti, 0, sizeof(*ti));
228 now = tcp_now();
229
230 ti->tcpi_state = tp->t_state;
231 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
232 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
233 if (tp->t_flags & TF_SACK_PERMIT)
234 ti->tcpi_options |= TCPI_OPT_SACK;
235 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
236 ti->tcpi_options |= TCPI_OPT_WSCALE;
237 ti->tcpi_snd_wscale = tp->snd_scale;
238 ti->tcpi_rcv_wscale = tp->rcv_scale;
239 }
240 #ifdef TCP_ECN
241 if (tp->t_flags & TF_ECN_PERMIT)
242 ti->tcpi_options |= TCPI_OPT_ECN;
243 #endif
244
245 ti->tcpi_rto = tp->t_rxtcur * t;
246 ti->tcpi_snd_mss = tp->t_maxseg;
247 ti->tcpi_rcv_mss = tp->t_peermss;
248
249 ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t;
250 ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t;
251 ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t;
252 ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t;
253
254 ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >>
255 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
256 ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >>
257 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT);
258 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
259 ti->tcpi_snd_cwnd = tp->snd_cwnd;
260
261 ti->tcpi_rcv_space = tp->rcv_wnd;
262
263 /*
264 * Provide only minimal information for unprivileged processes.
265 */
266 if (suser(p) != 0)
267 return 0;
268
269 /* FreeBSD-specific extension fields for tcp_info. */
270 ti->tcpi_snd_wnd = tp->snd_wnd;
271 ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss;
272 ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs;
273 /* missing tcpi_toe_tid */
274 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
275 ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
276 ti->tcpi_snd_zerowin = tp->t_sndzerowin;
277
278 /* OpenBSD extensions */
279 ti->tcpi_rttmin = tp->t_rttmin * t;
280 ti->tcpi_max_sndwnd = tp->max_sndwnd;
281 ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs;
282 ti->tcpi_rcv_up = tp->rcv_up - tp->irs;
283 ti->tcpi_snd_una = tp->snd_una - tp->iss;
284 ti->tcpi_snd_up = tp->snd_up - tp->iss;
285 ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss;
286 ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss;
287 ti->tcpi_snd_max = tp->snd_max - tp->iss;
288
289 ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */
290 ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t;
291 ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt;
292 ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t;
293
294 ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc;
295 ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat;
296 ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat;
297 ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat;
298 ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc;
299 ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat;
300 ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat;
301 ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat;
302
303 return 0;
304 }
305
306 int
tcp_ctloutput(int op,struct socket * so,int level,int optname,struct mbuf * m)307 tcp_ctloutput(int op, struct socket *so, int level, int optname,
308 struct mbuf *m)
309 {
310 int error = 0;
311 struct inpcb *inp;
312 struct tcpcb *tp;
313 int i;
314
315 inp = sotoinpcb(so);
316 if (inp == NULL)
317 return (ECONNRESET);
318 if (level != IPPROTO_TCP) {
319 #ifdef INET6
320 if (ISSET(inp->inp_flags, INP_IPV6))
321 error = ip6_ctloutput(op, so, level, optname, m);
322 else
323 #endif
324 error = ip_ctloutput(op, so, level, optname, m);
325 return (error);
326 }
327 tp = intotcpcb(inp);
328
329 switch (op) {
330
331 case PRCO_SETOPT:
332 switch (optname) {
333
334 case TCP_NODELAY:
335 if (m == NULL || m->m_len < sizeof (int))
336 error = EINVAL;
337 else if (*mtod(m, int *))
338 tp->t_flags |= TF_NODELAY;
339 else
340 tp->t_flags &= ~TF_NODELAY;
341 break;
342
343 case TCP_NOPUSH:
344 if (m == NULL || m->m_len < sizeof (int))
345 error = EINVAL;
346 else if (*mtod(m, int *))
347 tp->t_flags |= TF_NOPUSH;
348 else if (tp->t_flags & TF_NOPUSH) {
349 tp->t_flags &= ~TF_NOPUSH;
350 if (TCPS_HAVEESTABLISHED(tp->t_state))
351 error = tcp_output(tp);
352 }
353 break;
354
355 case TCP_MAXSEG:
356 if (m == NULL || m->m_len < sizeof (int)) {
357 error = EINVAL;
358 break;
359 }
360
361 i = *mtod(m, int *);
362 if (i > 0 && i <= tp->t_maxseg)
363 tp->t_maxseg = i;
364 else
365 error = EINVAL;
366 break;
367
368 case TCP_SACK_ENABLE:
369 if (m == NULL || m->m_len < sizeof (int)) {
370 error = EINVAL;
371 break;
372 }
373
374 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
375 error = EPERM;
376 break;
377 }
378
379 if (tp->t_flags & TF_SIGNATURE) {
380 error = EPERM;
381 break;
382 }
383
384 if (*mtod(m, int *))
385 tp->sack_enable = 1;
386 else
387 tp->sack_enable = 0;
388 break;
389 #ifdef TCP_SIGNATURE
390 case TCP_MD5SIG:
391 if (m == NULL || m->m_len < sizeof (int)) {
392 error = EINVAL;
393 break;
394 }
395
396 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
397 error = EPERM;
398 break;
399 }
400
401 if (*mtod(m, int *)) {
402 tp->t_flags |= TF_SIGNATURE;
403 tp->sack_enable = 0;
404 } else
405 tp->t_flags &= ~TF_SIGNATURE;
406 break;
407 #endif /* TCP_SIGNATURE */
408 default:
409 error = ENOPROTOOPT;
410 break;
411 }
412 break;
413
414 case PRCO_GETOPT:
415 switch (optname) {
416 case TCP_NODELAY:
417 m->m_len = sizeof(int);
418 *mtod(m, int *) = tp->t_flags & TF_NODELAY;
419 break;
420 case TCP_NOPUSH:
421 m->m_len = sizeof(int);
422 *mtod(m, int *) = tp->t_flags & TF_NOPUSH;
423 break;
424 case TCP_MAXSEG:
425 m->m_len = sizeof(int);
426 *mtod(m, int *) = tp->t_maxseg;
427 break;
428 case TCP_SACK_ENABLE:
429 m->m_len = sizeof(int);
430 *mtod(m, int *) = tp->sack_enable;
431 break;
432 case TCP_INFO:
433 error = tcp_fill_info(tp, so, m);
434 break;
435 #ifdef TCP_SIGNATURE
436 case TCP_MD5SIG:
437 m->m_len = sizeof(int);
438 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE;
439 break;
440 #endif
441 default:
442 error = ENOPROTOOPT;
443 break;
444 }
445 break;
446 }
447 return (error);
448 }
449
450 /*
451 * Attach TCP protocol to socket, allocating
452 * internet protocol control block, tcp control block,
453 * buffer space, and entering LISTEN state to accept connections.
454 */
455 int
tcp_attach(struct socket * so,int proto,int wait)456 tcp_attach(struct socket *so, int proto, int wait)
457 {
458 struct inpcbtable *table;
459 struct tcpcb *tp;
460 struct inpcb *inp;
461 int error;
462
463 if (so->so_pcb)
464 return EISCONN;
465 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 ||
466 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) ||
467 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) {
468 error = soreserve(so, tcp_sendspace, tcp_recvspace);
469 if (error)
470 return (error);
471 }
472
473 NET_ASSERT_LOCKED();
474 #ifdef INET6
475 if (so->so_proto->pr_domain->dom_family == PF_INET6)
476 table = &tcb6table;
477 else
478 #endif
479 table = &tcbtable;
480 error = in_pcballoc(so, table, wait);
481 if (error)
482 return (error);
483 inp = sotoinpcb(so);
484 tp = tcp_newtcpcb(inp, wait);
485 if (tp == NULL) {
486 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */
487
488 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */
489 in_pcbdetach(inp);
490 so->so_state |= nofd;
491 return (ENOBUFS);
492 }
493 tp->t_state = TCPS_CLOSED;
494 #ifdef INET6
495 if (ISSET(inp->inp_flags, INP_IPV6))
496 tp->pf = PF_INET6;
497 else
498 #endif
499 tp->pf = PF_INET;
500 if ((so->so_options & SO_LINGER) && so->so_linger == 0)
501 so->so_linger = TCP_LINGERTIME;
502
503 if (so->so_options & SO_DEBUG)
504 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0);
505 return (0);
506 }
507
508 int
tcp_detach(struct socket * so)509 tcp_detach(struct socket *so)
510 {
511 struct inpcb *inp;
512 struct tcpcb *otp = NULL, *tp;
513 int error;
514 short ostate;
515
516 soassertlocked(so);
517
518 if ((error = tcp_sogetpcb(so, &inp, &tp)))
519 return (error);
520
521 if (so->so_options & SO_DEBUG) {
522 otp = tp;
523 ostate = tp->t_state;
524 }
525
526 /*
527 * Detach the TCP protocol from the socket.
528 * If the protocol state is non-embryonic, then can't
529 * do this directly: have to initiate a PRU_DISCONNECT,
530 * which may finish later; embryonic TCB's can just
531 * be discarded here.
532 */
533 tp = tcp_dodisconnect(tp);
534
535 if (otp)
536 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0);
537 return (0);
538 }
539
540 /*
541 * Give the socket an address.
542 */
543 int
tcp_bind(struct socket * so,struct mbuf * nam,struct proc * p)544 tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p)
545 {
546 struct inpcb *inp;
547 struct tcpcb *tp;
548 int error;
549 short ostate;
550
551 soassertlocked(so);
552
553 if ((error = tcp_sogetpcb(so, &inp, &tp)))
554 return (error);
555
556 if (so->so_options & SO_DEBUG)
557 ostate = tp->t_state;
558
559 error = in_pcbbind(inp, nam, p);
560
561 if (so->so_options & SO_DEBUG)
562 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0);
563 return (error);
564 }
565
566 /*
567 * Prepare to accept connections.
568 */
569 int
tcp_listen(struct socket * so)570 tcp_listen(struct socket *so)
571 {
572 struct inpcb *inp;
573 struct tcpcb *tp, *otp = NULL;
574 int error;
575 short ostate;
576
577 soassertlocked(so);
578
579 if ((error = tcp_sogetpcb(so, &inp, &tp)))
580 return (error);
581
582 if (so->so_options & SO_DEBUG) {
583 otp = tp;
584 ostate = tp->t_state;
585 }
586
587 if (inp->inp_lport == 0)
588 if ((error = in_pcbbind(inp, NULL, curproc)))
589 goto out;
590
591 /*
592 * If the in_pcbbind() above is called, the tp->pf
593 * should still be whatever it was before.
594 */
595 tp->t_state = TCPS_LISTEN;
596
597 out:
598 if (otp)
599 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0);
600 return (error);
601 }
602
603 /*
604 * Initiate connection to peer.
605 * Create a template for use in transmissions on this connection.
606 * Enter SYN_SENT state, and mark socket as connecting.
607 * Start keep-alive timer, and seed output sequence space.
608 * Send initial segment on connection.
609 */
610 int
tcp_connect(struct socket * so,struct mbuf * nam)611 tcp_connect(struct socket *so, struct mbuf *nam)
612 {
613 struct inpcb *inp;
614 struct tcpcb *tp, *otp = NULL;
615 int error;
616 short ostate;
617
618 soassertlocked(so);
619
620 if ((error = tcp_sogetpcb(so, &inp, &tp)))
621 return (error);
622
623 if (so->so_options & SO_DEBUG) {
624 otp = tp;
625 ostate = tp->t_state;
626 }
627
628 #ifdef INET6
629 if (ISSET(inp->inp_flags, INP_IPV6)) {
630 struct sockaddr_in6 *sin6;
631
632 if ((error = in6_nam2sin6(nam, &sin6)))
633 goto out;
634 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
635 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
636 error = EINVAL;
637 goto out;
638 }
639 } else
640 #endif
641 {
642 struct sockaddr_in *sin;
643
644 if ((error = in_nam2sin(nam, &sin)))
645 goto out;
646 if ((sin->sin_addr.s_addr == INADDR_ANY) ||
647 (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
648 IN_MULTICAST(sin->sin_addr.s_addr) ||
649 in_broadcast(sin->sin_addr, inp->inp_rtableid)) {
650 error = EINVAL;
651 goto out;
652 }
653 }
654 error = in_pcbconnect(inp, nam);
655 if (error)
656 goto out;
657
658 tp->t_template = tcp_template(tp);
659 if (tp->t_template == 0) {
660 in_pcbunset_faddr(inp);
661 in_pcbdisconnect(inp);
662 error = ENOBUFS;
663 goto out;
664 }
665
666 so->so_state |= SS_CONNECTOUT;
667
668 /* Compute window scaling to request. */
669 tcp_rscale(tp, sb_max);
670
671 soisconnecting(so);
672 tcpstat_inc(tcps_connattempt);
673 tp->t_state = TCPS_SYN_SENT;
674 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
675 tcp_set_iss_tsm(tp);
676 tcp_sendseqinit(tp);
677 tp->snd_last = tp->snd_una;
678 error = tcp_output(tp);
679
680 out:
681 if (otp)
682 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0);
683 return (error);
684 }
685
686 /*
687 * Accept a connection. Essentially all the work is done at higher
688 * levels; just return the address of the peer, storing through addr.
689 */
690 int
tcp_accept(struct socket * so,struct mbuf * nam)691 tcp_accept(struct socket *so, struct mbuf *nam)
692 {
693 struct inpcb *inp;
694 struct tcpcb *tp;
695 int error;
696
697 soassertlocked(so);
698
699 if ((error = tcp_sogetpcb(so, &inp, &tp)))
700 return (error);
701
702 in_setpeeraddr(inp, nam);
703
704 if (so->so_options & SO_DEBUG)
705 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0);
706 return (0);
707 }
708
709 /*
710 * Initiate disconnect from peer.
711 * If connection never passed embryonic stage, just drop;
712 * else if don't need to let data drain, then can just drop anyways,
713 * else have to begin TCP shutdown process: mark socket disconnecting,
714 * drain unread data, state switch to reflect user close, and
715 * send segment (e.g. FIN) to peer. Socket will be really disconnected
716 * when peer sends FIN and acks ours.
717 *
718 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
719 */
720 int
tcp_disconnect(struct socket * so)721 tcp_disconnect(struct socket *so)
722 {
723 struct inpcb *inp;
724 struct tcpcb *tp, *otp = NULL;
725 int error;
726 short ostate;
727
728 soassertlocked(so);
729
730 if ((error = tcp_sogetpcb(so, &inp, &tp)))
731 return (error);
732
733 if (so->so_options & SO_DEBUG) {
734 otp = tp;
735 ostate = tp->t_state;
736 }
737
738 tp = tcp_dodisconnect(tp);
739
740 if (otp)
741 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0);
742 return (0);
743 }
744
745 /*
746 * Mark the connection as being incapable of further output.
747 */
748 int
tcp_shutdown(struct socket * so)749 tcp_shutdown(struct socket *so)
750 {
751 struct inpcb *inp;
752 struct tcpcb *tp, *otp = NULL;
753 int error;
754 short ostate;
755
756 soassertlocked(so);
757
758 if ((error = tcp_sogetpcb(so, &inp, &tp)))
759 return (error);
760
761 if (so->so_options & SO_DEBUG) {
762 otp = tp;
763 ostate = tp->t_state;
764 }
765
766 if (so->so_snd.sb_state & SS_CANTSENDMORE)
767 goto out;
768
769 socantsendmore(so);
770 tp = tcp_usrclosed(tp);
771 if (tp)
772 error = tcp_output(tp);
773
774 out:
775 if (otp)
776 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0);
777 return (error);
778 }
779
780 /*
781 * After a receive, possibly send window update to peer.
782 */
783 void
tcp_rcvd(struct socket * so)784 tcp_rcvd(struct socket *so)
785 {
786 struct inpcb *inp;
787 struct tcpcb *tp;
788 short ostate;
789
790 soassertlocked(so);
791
792 if (tcp_sogetpcb(so, &inp, &tp))
793 return;
794
795 if (so->so_options & SO_DEBUG)
796 ostate = tp->t_state;
797
798 /*
799 * soreceive() calls this function when a user receives
800 * ancillary data on a listening socket. We don't call
801 * tcp_output in such a case, since there is no header
802 * template for a listening socket and hence the kernel
803 * will panic.
804 */
805 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0)
806 (void) tcp_output(tp);
807
808 if (so->so_options & SO_DEBUG)
809 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0);
810 }
811
812 /*
813 * Do a send by putting data in output queue and updating urgent
814 * marker if URG set. Possibly send more data.
815 */
816 int
tcp_send(struct socket * so,struct mbuf * m,struct mbuf * nam,struct mbuf * control)817 tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam,
818 struct mbuf *control)
819 {
820 struct inpcb *inp;
821 struct tcpcb *tp;
822 int error;
823 short ostate;
824
825 soassertlocked(so);
826
827 if (control && control->m_len) {
828 error = EINVAL;
829 goto out;
830 }
831
832 if ((error = tcp_sogetpcb(so, &inp, &tp)))
833 goto out;
834
835 if (so->so_options & SO_DEBUG)
836 ostate = tp->t_state;
837
838 sbappendstream(so, &so->so_snd, m);
839 m = NULL;
840
841 error = tcp_output(tp);
842
843 if (so->so_options & SO_DEBUG)
844 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0);
845
846 out:
847 m_freem(control);
848 m_freem(m);
849
850 return (error);
851 }
852
853 /*
854 * Abort the TCP.
855 */
856 void
tcp_abort(struct socket * so)857 tcp_abort(struct socket *so)
858 {
859 struct inpcb *inp;
860 struct tcpcb *tp, *otp = NULL;
861 short ostate;
862
863 soassertlocked(so);
864
865 if (tcp_sogetpcb(so, &inp, &tp))
866 return;
867
868 if (so->so_options & SO_DEBUG) {
869 otp = tp;
870 ostate = tp->t_state;
871 }
872
873 tp = tcp_drop(tp, ECONNABORTED);
874
875 if (otp)
876 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0);
877 }
878
879 int
tcp_sense(struct socket * so,struct stat * ub)880 tcp_sense(struct socket *so, struct stat *ub)
881 {
882 struct inpcb *inp;
883 struct tcpcb *tp;
884 int error;
885
886 soassertlocked(so);
887
888 if ((error = tcp_sogetpcb(so, &inp, &tp)))
889 return (error);
890
891 ub->st_blksize = so->so_snd.sb_hiwat;
892
893 if (so->so_options & SO_DEBUG)
894 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0);
895 return (0);
896 }
897
898 int
tcp_rcvoob(struct socket * so,struct mbuf * m,int flags)899 tcp_rcvoob(struct socket *so, struct mbuf *m, int flags)
900 {
901 struct inpcb *inp;
902 struct tcpcb *tp;
903 int error;
904
905 soassertlocked(so);
906
907 if ((error = tcp_sogetpcb(so, &inp, &tp)))
908 return (error);
909
910 if ((so->so_oobmark == 0 &&
911 (so->so_rcv.sb_state & SS_RCVATMARK) == 0) ||
912 so->so_options & SO_OOBINLINE ||
913 tp->t_oobflags & TCPOOB_HADDATA) {
914 error = EINVAL;
915 goto out;
916 }
917 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
918 error = EWOULDBLOCK;
919 goto out;
920 }
921 m->m_len = 1;
922 *mtod(m, caddr_t) = tp->t_iobc;
923 if ((flags & MSG_PEEK) == 0)
924 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
925 out:
926 if (so->so_options & SO_DEBUG)
927 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0);
928 return (error);
929 }
930
931 int
tcp_sendoob(struct socket * so,struct mbuf * m,struct mbuf * nam,struct mbuf * control)932 tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam,
933 struct mbuf *control)
934 {
935 struct inpcb *inp;
936 struct tcpcb *tp;
937 int error;
938 short ostate;
939
940 soassertlocked(so);
941
942 if (control && control->m_len) {
943 error = EINVAL;
944 goto release;
945 }
946
947 if ((error = tcp_sogetpcb(so, &inp, &tp)))
948 goto release;
949
950 if (so->so_options & SO_DEBUG)
951 ostate = tp->t_state;
952
953 if (sbspace(so, &so->so_snd) < -512) {
954 error = ENOBUFS;
955 goto out;
956 }
957
958 /*
959 * According to RFC961 (Assigned Protocols),
960 * the urgent pointer points to the last octet
961 * of urgent data. We continue, however,
962 * to consider it to indicate the first octet
963 * of data past the urgent section.
964 * Otherwise, snd_up should be one lower.
965 */
966 sbappendstream(so, &so->so_snd, m);
967 m = NULL;
968 tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
969 tp->t_force = 1;
970 error = tcp_output(tp);
971 tp->t_force = 0;
972
973 out:
974 if (so->so_options & SO_DEBUG)
975 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0);
976
977 release:
978 m_freem(control);
979 m_freem(m);
980
981 return (error);
982 }
983
984 int
tcp_sockaddr(struct socket * so,struct mbuf * nam)985 tcp_sockaddr(struct socket *so, struct mbuf *nam)
986 {
987 struct inpcb *inp;
988 struct tcpcb *tp;
989 int error;
990
991 soassertlocked(so);
992
993 if ((error = tcp_sogetpcb(so, &inp, &tp)))
994 return (error);
995
996 in_setsockaddr(inp, nam);
997
998 if (so->so_options & SO_DEBUG)
999 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL,
1000 PRU_SOCKADDR, 0);
1001 return (0);
1002 }
1003
1004 int
tcp_peeraddr(struct socket * so,struct mbuf * nam)1005 tcp_peeraddr(struct socket *so, struct mbuf *nam)
1006 {
1007 struct inpcb *inp;
1008 struct tcpcb *tp;
1009 int error;
1010
1011 soassertlocked(so);
1012
1013 if ((error = tcp_sogetpcb(so, &inp, &tp)))
1014 return (error);
1015
1016 in_setpeeraddr(inp, nam);
1017
1018 if (so->so_options & SO_DEBUG)
1019 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0);
1020 return (0);
1021 }
1022
1023 /*
1024 * Initiate (or continue) disconnect.
1025 * If embryonic state, just send reset (once).
1026 * If in ``let data drain'' option and linger null, just drop.
1027 * Otherwise (hard), mark socket disconnecting and drop
1028 * current input data; switch states based on user close, and
1029 * send segment to peer (with FIN).
1030 */
1031 struct tcpcb *
tcp_dodisconnect(struct tcpcb * tp)1032 tcp_dodisconnect(struct tcpcb *tp)
1033 {
1034 struct socket *so = tp->t_inpcb->inp_socket;
1035
1036 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
1037 tp = tcp_close(tp);
1038 else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
1039 tp = tcp_drop(tp, 0);
1040 else {
1041 soisdisconnecting(so);
1042 sbflush(so, &so->so_rcv);
1043 tp = tcp_usrclosed(tp);
1044 if (tp)
1045 (void) tcp_output(tp);
1046 }
1047 return (tp);
1048 }
1049
1050 /*
1051 * User issued close, and wish to trail through shutdown states:
1052 * if never received SYN, just forget it. If got a SYN from peer,
1053 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
1054 * If already got a FIN from peer, then almost done; go to LAST_ACK
1055 * state. In all other cases, have already sent FIN to peer (e.g.
1056 * after PRU_SHUTDOWN), and just have to play tedious game waiting
1057 * for peer to send FIN or not respond to keep-alives, etc.
1058 * We can let the user exit from the close as soon as the FIN is acked.
1059 */
1060 struct tcpcb *
tcp_usrclosed(struct tcpcb * tp)1061 tcp_usrclosed(struct tcpcb *tp)
1062 {
1063
1064 switch (tp->t_state) {
1065
1066 case TCPS_CLOSED:
1067 case TCPS_LISTEN:
1068 case TCPS_SYN_SENT:
1069 tp->t_state = TCPS_CLOSED;
1070 tp = tcp_close(tp);
1071 break;
1072
1073 case TCPS_SYN_RECEIVED:
1074 case TCPS_ESTABLISHED:
1075 tp->t_state = TCPS_FIN_WAIT_1;
1076 break;
1077
1078 case TCPS_CLOSE_WAIT:
1079 tp->t_state = TCPS_LAST_ACK;
1080 break;
1081 }
1082 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
1083 soisdisconnected(tp->t_inpcb->inp_socket);
1084 /*
1085 * If we are in FIN_WAIT_2, we arrived here because the
1086 * application did a shutdown of the send side. Like the
1087 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
1088 * a full close, we start a timer to make sure sockets are
1089 * not left in FIN_WAIT_2 forever.
1090 */
1091 if (tp->t_state == TCPS_FIN_WAIT_2)
1092 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
1093 }
1094 return (tp);
1095 }
1096
1097 /*
1098 * Look up a socket for ident or tcpdrop, ...
1099 */
1100 int
tcp_ident(void * oldp,size_t * oldlenp,void * newp,size_t newlen,int dodrop)1101 tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
1102 {
1103 int error = 0;
1104 struct tcp_ident_mapping tir;
1105 struct inpcb *inp;
1106 struct tcpcb *tp = NULL;
1107 struct sockaddr_in *fin, *lin;
1108 #ifdef INET6
1109 struct sockaddr_in6 *fin6, *lin6;
1110 struct in6_addr f6, l6;
1111 #endif
1112
1113 NET_ASSERT_LOCKED();
1114
1115 if (dodrop) {
1116 if (oldp != NULL || *oldlenp != 0)
1117 return (EINVAL);
1118 if (newp == NULL)
1119 return (EPERM);
1120 if (newlen < sizeof(tir))
1121 return (ENOMEM);
1122 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
1123 return (error);
1124 } else {
1125 if (oldp == NULL)
1126 return (EINVAL);
1127 if (*oldlenp < sizeof(tir))
1128 return (ENOMEM);
1129 if (newp != NULL || newlen != 0)
1130 return (EINVAL);
1131 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
1132 return (error);
1133 }
1134 switch (tir.faddr.ss_family) {
1135 #ifdef INET6
1136 case AF_INET6:
1137 fin6 = (struct sockaddr_in6 *)&tir.faddr;
1138 error = in6_embedscope(&f6, fin6, NULL, NULL);
1139 if (error)
1140 return EINVAL; /*?*/
1141 lin6 = (struct sockaddr_in6 *)&tir.laddr;
1142 error = in6_embedscope(&l6, lin6, NULL, NULL);
1143 if (error)
1144 return EINVAL; /*?*/
1145 break;
1146 #endif
1147 case AF_INET:
1148 fin = (struct sockaddr_in *)&tir.faddr;
1149 lin = (struct sockaddr_in *)&tir.laddr;
1150 break;
1151 default:
1152 return (EINVAL);
1153 }
1154
1155 switch (tir.faddr.ss_family) {
1156 #ifdef INET6
1157 case AF_INET6:
1158 inp = in6_pcblookup(&tcb6table, &f6,
1159 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
1160 break;
1161 #endif
1162 case AF_INET:
1163 inp = in_pcblookup(&tcbtable, fin->sin_addr,
1164 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
1165 break;
1166 default:
1167 unhandled_af(tir.faddr.ss_family);
1168 }
1169
1170 if (dodrop) {
1171 if (inp && (tp = intotcpcb(inp)) &&
1172 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0))
1173 tp = tcp_drop(tp, ECONNABORTED);
1174 else
1175 error = ESRCH;
1176 in_pcbunref(inp);
1177 return (error);
1178 }
1179
1180 if (inp == NULL) {
1181 tcpstat_inc(tcps_pcbhashmiss);
1182 switch (tir.faddr.ss_family) {
1183 #ifdef INET6
1184 case AF_INET6:
1185 inp = in6_pcblookup_listen(&tcb6table,
1186 &l6, lin6->sin6_port, NULL, tir.rdomain);
1187 break;
1188 #endif
1189 case AF_INET:
1190 inp = in_pcblookup_listen(&tcbtable,
1191 lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
1192 break;
1193 }
1194 }
1195
1196 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) {
1197 tir.ruid = inp->inp_socket->so_ruid;
1198 tir.euid = inp->inp_socket->so_euid;
1199 } else {
1200 tir.ruid = -1;
1201 tir.euid = -1;
1202 }
1203
1204 *oldlenp = sizeof (tir);
1205 error = copyout((void *)&tir, oldp, sizeof (tir));
1206 in_pcbunref(inp);
1207 return (error);
1208 }
1209
1210 int
tcp_sysctl_tcpstat(void * oldp,size_t * oldlenp,void * newp)1211 tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
1212 {
1213 uint64_t counters[tcps_ncounters];
1214 struct tcpstat tcpstat;
1215 struct syn_cache_set *set;
1216 int i = 0;
1217
1218 #define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0)
1219
1220 memset(&tcpstat, 0, sizeof tcpstat);
1221 counters_read(tcpcounters, counters, nitems(counters), NULL);
1222 ASSIGN(tcps_connattempt);
1223 ASSIGN(tcps_accepts);
1224 ASSIGN(tcps_connects);
1225 ASSIGN(tcps_drops);
1226 ASSIGN(tcps_conndrops);
1227 ASSIGN(tcps_closed);
1228 ASSIGN(tcps_segstimed);
1229 ASSIGN(tcps_rttupdated);
1230 ASSIGN(tcps_delack);
1231 ASSIGN(tcps_timeoutdrop);
1232 ASSIGN(tcps_rexmttimeo);
1233 ASSIGN(tcps_persisttimeo);
1234 ASSIGN(tcps_persistdrop);
1235 ASSIGN(tcps_keeptimeo);
1236 ASSIGN(tcps_keepprobe);
1237 ASSIGN(tcps_keepdrops);
1238 ASSIGN(tcps_sndtotal);
1239 ASSIGN(tcps_sndpack);
1240 ASSIGN(tcps_sndbyte);
1241 ASSIGN(tcps_sndrexmitpack);
1242 ASSIGN(tcps_sndrexmitbyte);
1243 ASSIGN(tcps_sndrexmitfast);
1244 ASSIGN(tcps_sndacks);
1245 ASSIGN(tcps_sndprobe);
1246 ASSIGN(tcps_sndurg);
1247 ASSIGN(tcps_sndwinup);
1248 ASSIGN(tcps_sndctrl);
1249 ASSIGN(tcps_rcvtotal);
1250 ASSIGN(tcps_rcvpack);
1251 ASSIGN(tcps_rcvbyte);
1252 ASSIGN(tcps_rcvbadsum);
1253 ASSIGN(tcps_rcvbadoff);
1254 ASSIGN(tcps_rcvmemdrop);
1255 ASSIGN(tcps_rcvnosec);
1256 ASSIGN(tcps_rcvshort);
1257 ASSIGN(tcps_rcvduppack);
1258 ASSIGN(tcps_rcvdupbyte);
1259 ASSIGN(tcps_rcvpartduppack);
1260 ASSIGN(tcps_rcvpartdupbyte);
1261 ASSIGN(tcps_rcvoopack);
1262 ASSIGN(tcps_rcvoobyte);
1263 ASSIGN(tcps_rcvpackafterwin);
1264 ASSIGN(tcps_rcvbyteafterwin);
1265 ASSIGN(tcps_rcvafterclose);
1266 ASSIGN(tcps_rcvwinprobe);
1267 ASSIGN(tcps_rcvdupack);
1268 ASSIGN(tcps_rcvacktoomuch);
1269 ASSIGN(tcps_rcvacktooold);
1270 ASSIGN(tcps_rcvackpack);
1271 ASSIGN(tcps_rcvackbyte);
1272 ASSIGN(tcps_rcvwinupd);
1273 ASSIGN(tcps_pawsdrop);
1274 ASSIGN(tcps_predack);
1275 ASSIGN(tcps_preddat);
1276 ASSIGN(tcps_pcbhashmiss);
1277 ASSIGN(tcps_noport);
1278 ASSIGN(tcps_badsyn);
1279 ASSIGN(tcps_dropsyn);
1280 ASSIGN(tcps_rcvbadsig);
1281 ASSIGN(tcps_rcvgoodsig);
1282 ASSIGN(tcps_inswcsum);
1283 ASSIGN(tcps_outswcsum);
1284 ASSIGN(tcps_ecn_accepts);
1285 ASSIGN(tcps_ecn_rcvece);
1286 ASSIGN(tcps_ecn_rcvcwr);
1287 ASSIGN(tcps_ecn_rcvce);
1288 ASSIGN(tcps_ecn_sndect);
1289 ASSIGN(tcps_ecn_sndece);
1290 ASSIGN(tcps_ecn_sndcwr);
1291 ASSIGN(tcps_cwr_ecn);
1292 ASSIGN(tcps_cwr_frecovery);
1293 ASSIGN(tcps_cwr_timeout);
1294 ASSIGN(tcps_sc_added);
1295 ASSIGN(tcps_sc_completed);
1296 ASSIGN(tcps_sc_timed_out);
1297 ASSIGN(tcps_sc_overflowed);
1298 ASSIGN(tcps_sc_reset);
1299 ASSIGN(tcps_sc_unreach);
1300 ASSIGN(tcps_sc_bucketoverflow);
1301 ASSIGN(tcps_sc_aborted);
1302 ASSIGN(tcps_sc_dupesyn);
1303 ASSIGN(tcps_sc_dropped);
1304 ASSIGN(tcps_sc_collisions);
1305 ASSIGN(tcps_sc_retransmitted);
1306 ASSIGN(tcps_sc_seedrandom);
1307 ASSIGN(tcps_sc_hash_size);
1308 ASSIGN(tcps_sc_entry_count);
1309 ASSIGN(tcps_sc_entry_limit);
1310 ASSIGN(tcps_sc_bucket_maxlen);
1311 ASSIGN(tcps_sc_bucket_limit);
1312 ASSIGN(tcps_sc_uses_left);
1313 ASSIGN(tcps_conndrained);
1314 ASSIGN(tcps_sack_recovery_episode);
1315 ASSIGN(tcps_sack_rexmits);
1316 ASSIGN(tcps_sack_rexmit_bytes);
1317 ASSIGN(tcps_sack_rcv_opts);
1318 ASSIGN(tcps_sack_snd_opts);
1319 ASSIGN(tcps_sack_drop_opts);
1320 ASSIGN(tcps_outswtso);
1321 ASSIGN(tcps_outhwtso);
1322 ASSIGN(tcps_outpkttso);
1323 ASSIGN(tcps_outbadtso);
1324 ASSIGN(tcps_inswlro);
1325 ASSIGN(tcps_inhwlro);
1326 ASSIGN(tcps_inpktlro);
1327 ASSIGN(tcps_inbadlro);
1328
1329 #undef ASSIGN
1330
1331 mtx_enter(&syn_cache_mtx);
1332 set = &tcp_syn_cache[tcp_syn_cache_active];
1333 tcpstat.tcps_sc_hash_size = set->scs_size;
1334 tcpstat.tcps_sc_entry_count = set->scs_count;
1335 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit;
1336 tcpstat.tcps_sc_bucket_maxlen = 0;
1337 for (i = 0; i < set->scs_size; i++) {
1338 if (tcpstat.tcps_sc_bucket_maxlen <
1339 set->scs_buckethead[i].sch_length)
1340 tcpstat.tcps_sc_bucket_maxlen =
1341 set->scs_buckethead[i].sch_length;
1342 }
1343 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit;
1344 tcpstat.tcps_sc_uses_left = set->scs_use;
1345 mtx_leave(&syn_cache_mtx);
1346
1347 return (sysctl_rdstruct(oldp, oldlenp, newp,
1348 &tcpstat, sizeof(tcpstat)));
1349 }
1350
1351 /*
1352 * Sysctl for tcp variables.
1353 */
1354 int
tcp_sysctl(int * name,u_int namelen,void * oldp,size_t * oldlenp,void * newp,size_t newlen)1355 tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
1356 size_t newlen)
1357 {
1358 int error, nval;
1359
1360 /* All sysctl names at this level are terminal. */
1361 if (namelen != 1)
1362 return (ENOTDIR);
1363
1364 switch (name[0]) {
1365 case TCPCTL_KEEPINITTIME:
1366 NET_LOCK();
1367 nval = tcptv_keep_init / TCP_TIME(1);
1368 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1369 1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1)));
1370 if (!error)
1371 tcptv_keep_init = TCP_TIME(nval);
1372 NET_UNLOCK();
1373 return (error);
1374
1375 case TCPCTL_KEEPIDLE:
1376 NET_LOCK();
1377 nval = tcp_keepidle / TCP_TIME(1);
1378 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1379 1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1)));
1380 if (!error)
1381 tcp_keepidle = TCP_TIME(nval);
1382 NET_UNLOCK();
1383 return (error);
1384
1385 case TCPCTL_KEEPINTVL:
1386 NET_LOCK();
1387 nval = tcp_keepintvl / TCP_TIME(1);
1388 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval,
1389 1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1)));
1390 if (!error)
1391 tcp_keepintvl = TCP_TIME(nval);
1392 NET_UNLOCK();
1393 return (error);
1394
1395 case TCPCTL_BADDYNAMIC:
1396 NET_LOCK();
1397 error = sysctl_struct(oldp, oldlenp, newp, newlen,
1398 baddynamicports.tcp, sizeof(baddynamicports.tcp));
1399 NET_UNLOCK();
1400 return (error);
1401
1402 case TCPCTL_ROOTONLY:
1403 if (newp && securelevel > 0)
1404 return (EPERM);
1405 NET_LOCK();
1406 error = sysctl_struct(oldp, oldlenp, newp, newlen,
1407 rootonlyports.tcp, sizeof(rootonlyports.tcp));
1408 NET_UNLOCK();
1409 return (error);
1410
1411 case TCPCTL_IDENT:
1412 NET_LOCK();
1413 error = tcp_ident(oldp, oldlenp, newp, newlen, 0);
1414 NET_UNLOCK();
1415 return (error);
1416
1417 case TCPCTL_DROP:
1418 NET_LOCK();
1419 error = tcp_ident(oldp, oldlenp, newp, newlen, 1);
1420 NET_UNLOCK();
1421 return (error);
1422
1423 case TCPCTL_REASS_LIMIT:
1424 NET_LOCK();
1425 nval = tcp_reass_limit;
1426 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1427 if (!error && nval != tcp_reass_limit) {
1428 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0);
1429 if (!error)
1430 tcp_reass_limit = nval;
1431 }
1432 NET_UNLOCK();
1433 return (error);
1434
1435 case TCPCTL_SACKHOLE_LIMIT:
1436 NET_LOCK();
1437 nval = tcp_sackhole_limit;
1438 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);
1439 if (!error && nval != tcp_sackhole_limit) {
1440 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0);
1441 if (!error)
1442 tcp_sackhole_limit = nval;
1443 }
1444 NET_UNLOCK();
1445 return (error);
1446
1447 case TCPCTL_STATS:
1448 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));
1449
1450 case TCPCTL_SYN_USE_LIMIT:
1451 NET_LOCK();
1452 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1453 &tcp_syn_use_limit, 0, INT_MAX);
1454 if (!error && newp != NULL) {
1455 /*
1456 * Global tcp_syn_use_limit is used when reseeding a
1457 * new cache. Also update the value in active cache.
1458 */
1459 mtx_enter(&syn_cache_mtx);
1460 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit)
1461 tcp_syn_cache[0].scs_use = tcp_syn_use_limit;
1462 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit)
1463 tcp_syn_cache[1].scs_use = tcp_syn_use_limit;
1464 mtx_leave(&syn_cache_mtx);
1465 }
1466 NET_UNLOCK();
1467 return (error);
1468
1469 case TCPCTL_SYN_HASH_SIZE:
1470 NET_LOCK();
1471 nval = tcp_syn_hash_size;
1472 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
1473 &nval, 1, 100000);
1474 if (!error && nval != tcp_syn_hash_size) {
1475 /*
1476 * If global hash size has been changed,
1477 * switch sets as soon as possible. Then
1478 * the actual hash array will be reallocated.
1479 */
1480 mtx_enter(&syn_cache_mtx);
1481 if (tcp_syn_cache[0].scs_size != nval)
1482 tcp_syn_cache[0].scs_use = 0;
1483 if (tcp_syn_cache[1].scs_size != nval)
1484 tcp_syn_cache[1].scs_use = 0;
1485 tcp_syn_hash_size = nval;
1486 mtx_leave(&syn_cache_mtx);
1487 }
1488 NET_UNLOCK();
1489 return (error);
1490
1491 default:
1492 NET_LOCK();
1493 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
1494 name, namelen, oldp, oldlenp, newp, newlen);
1495 NET_UNLOCK();
1496 return (error);
1497 }
1498 /* NOTREACHED */
1499 }
1500
1501 /*
1502 * Scale the send buffer so that inflight data is not accounted against
1503 * the limit. The buffer will scale with the congestion window, if the
1504 * the receiver stops acking data the window will shrink and therefore
1505 * the buffer size will shrink as well.
1506 * In low memory situation try to shrink the buffer to the initial size
1507 * disabling the send buffer scaling as long as the situation persists.
1508 */
1509 void
tcp_update_sndspace(struct tcpcb * tp)1510 tcp_update_sndspace(struct tcpcb *tp)
1511 {
1512 struct socket *so = tp->t_inpcb->inp_socket;
1513 u_long nmax = so->so_snd.sb_hiwat;
1514
1515 if (sbchecklowmem()) {
1516 /* low on memory try to get rid of some */
1517 if (tcp_sendspace < nmax)
1518 nmax = tcp_sendspace;
1519 } else if (so->so_snd.sb_wat != tcp_sendspace)
1520 /* user requested buffer size, auto-scaling disabled */
1521 nmax = so->so_snd.sb_wat;
1522 else
1523 /* automatic buffer scaling */
1524 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
1525 tp->snd_una);
1526
1527 /* a writable socket must be preserved because of poll(2) semantics */
1528 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
1529 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
1530 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
1531 /* keep in sync with sbreserve() calculation */
1532 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
1533 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
1534 }
1535
1536 /* round to MSS boundary */
1537 nmax = roundup(nmax, tp->t_maxseg);
1538
1539 if (nmax != so->so_snd.sb_hiwat)
1540 sbreserve(so, &so->so_snd, nmax);
1541 }
1542
1543 /*
1544 * Scale the recv buffer by looking at how much data was transferred in
1545 * one approximated RTT. If more than a big part of the recv buffer was
1546 * transferred during that time we increase the buffer by a constant.
1547 * In low memory situation try to shrink the buffer to the initial size.
1548 */
1549 void
tcp_update_rcvspace(struct tcpcb * tp)1550 tcp_update_rcvspace(struct tcpcb *tp)
1551 {
1552 struct socket *so = tp->t_inpcb->inp_socket;
1553 u_long nmax = so->so_rcv.sb_hiwat;
1554
1555 if (sbchecklowmem()) {
1556 /* low on memory try to get rid of some */
1557 if (tcp_recvspace < nmax)
1558 nmax = tcp_recvspace;
1559 } else if (so->so_rcv.sb_wat != tcp_recvspace)
1560 /* user requested buffer size, auto-scaling disabled */
1561 nmax = so->so_rcv.sb_wat;
1562 else {
1563 /* automatic buffer scaling */
1564 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
1565 nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
1566 tcp_autorcvbuf_inc);
1567 }
1568
1569 /* a readable socket must be preserved because of poll(2) semantics */
1570 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
1571 nmax < so->so_snd.sb_lowat)
1572 nmax = so->so_snd.sb_lowat;
1573
1574 if (nmax == so->so_rcv.sb_hiwat)
1575 return;
1576
1577 /* round to MSS boundary */
1578 nmax = roundup(nmax, tp->t_maxseg);
1579 sbreserve(so, &so->so_rcv, nmax);
1580 }
1581