1 /* $OpenBSD: tcp_subr.c,v 1.174 2018/10/04 17:33:41 bluhm Exp $ */ 2 /* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */ 3 4 /* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71 #include <sys/param.h> 72 #include <sys/systm.h> 73 #include <sys/mbuf.h> 74 #include <sys/socket.h> 75 #include <sys/socketvar.h> 76 #include <sys/timeout.h> 77 #include <sys/protosw.h> 78 #include <sys/kernel.h> 79 #include <sys/pool.h> 80 81 #include <net/route.h> 82 83 #include <netinet/in.h> 84 #include <netinet/ip.h> 85 #include <netinet/in_pcb.h> 86 #include <netinet/ip_var.h> 87 #include <netinet/ip_icmp.h> 88 #include <netinet/tcp.h> 89 #include <netinet/tcp_fsm.h> 90 #include <netinet/tcp_seq.h> 91 #include <netinet/tcp_timer.h> 92 #include <netinet/tcp_var.h> 93 94 #ifdef INET6 95 #include <netinet6/ip6protosw.h> 96 #endif /* INET6 */ 97 98 #include <crypto/md5.h> 99 #include <crypto/sha2.h> 100 101 /* patchable/settable parameters for tcp */ 102 int tcp_mssdflt = TCP_MSS; 103 int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; 104 105 /* values controllable via sysctl */ 106 int tcp_do_rfc1323 = 1; 107 int tcp_do_sack = 1; /* RFC 2018 selective ACKs */ 108 int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ 109 #ifdef TCP_ECN 110 int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? */ 111 #endif 112 int tcp_do_rfc3390 = 2; /* Increase TCP's Initial Window to 10*mss */ 113 114 u_int32_t tcp_now = 1; 115 116 #ifndef TCB_INITIAL_HASH_SIZE 117 #define TCB_INITIAL_HASH_SIZE 128 118 #endif 119 120 int tcp_reass_limit = NMBCLUSTERS / 8; /* hardlimit for tcpqe_pool */ 121 int tcp_sackhole_limit = 32*1024; /* hardlimit for sackhl_pool */ 122 123 struct pool tcpcb_pool; 124 struct pool tcpqe_pool; 125 struct pool sackhl_pool; 126 127 struct cpumem *tcpcounters; /* tcp statistics */ 128 129 u_char tcp_secret[16]; 130 SHA2_CTX tcp_secret_ctx; 131 tcp_seq tcp_iss; 132 133 /* 134 * Tcp initialization 135 */ 136 void 137 tcp_init(void) 138 { 139 tcp_iss = 1; /* wrong */ 140 pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, IPL_SOFTNET, 0, 141 "tcpcb", NULL); 142 pool_init(&tcpqe_pool, sizeof(struct tcpqent), 0, IPL_SOFTNET, 0, 143 "tcpqe", NULL); 144 pool_sethardlimit(&tcpqe_pool, tcp_reass_limit, NULL, 0); 145 pool_init(&sackhl_pool, sizeof(struct sackhole), 0, IPL_SOFTNET, 0, 146 "sackhl", NULL); 147 pool_sethardlimit(&sackhl_pool, tcp_sackhole_limit, NULL, 0); 148 in_pcbinit(&tcbtable, TCB_INITIAL_HASH_SIZE); 149 tcpcounters = counters_alloc(tcps_ncounters); 150 151 arc4random_buf(tcp_secret, sizeof(tcp_secret)); 152 SHA512Init(&tcp_secret_ctx); 153 SHA512Update(&tcp_secret_ctx, tcp_secret, sizeof(tcp_secret)); 154 155 #ifdef INET6 156 /* 157 * Since sizeof(struct ip6_hdr) > sizeof(struct ip), we 158 * do max length checks/computations only on the former. 159 */ 160 if (max_protohdr < (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) 161 max_protohdr = (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); 162 if ((max_linkhdr + sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) > 163 MHLEN) 164 panic("tcp_init"); 165 166 icmp6_mtudisc_callback_register(tcp6_mtudisc_callback); 167 #endif /* INET6 */ 168 169 /* Initialize the compressed state engine. */ 170 syn_cache_init(); 171 172 /* Initialize timer state. */ 173 tcp_timer_init(); 174 } 175 176 /* 177 * Create template to be used to send tcp packets on a connection. 178 * Call after host entry created, allocates an mbuf and fills 179 * in a skeletal tcp/ip header, minimizing the amount of work 180 * necessary when the connection is used. 181 * 182 * To support IPv6 in addition to IPv4 and considering that the sizes of 183 * the IPv4 and IPv6 headers are not the same, we now use a separate pointer 184 * for the TCP header. Also, we made the former tcpiphdr header pointer 185 * into just an IP overlay pointer, with casting as appropriate for v6. rja 186 */ 187 struct mbuf * 188 tcp_template(struct tcpcb *tp) 189 { 190 struct inpcb *inp = tp->t_inpcb; 191 struct mbuf *m; 192 struct tcphdr *th; 193 194 CTASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= MHLEN); 195 CTASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= MHLEN); 196 197 if ((m = tp->t_template) == 0) { 198 m = m_get(M_DONTWAIT, MT_HEADER); 199 if (m == NULL) 200 return (0); 201 202 switch (tp->pf) { 203 case 0: /*default to PF_INET*/ 204 case AF_INET: 205 m->m_len = sizeof(struct ip); 206 break; 207 #ifdef INET6 208 case AF_INET6: 209 m->m_len = sizeof(struct ip6_hdr); 210 break; 211 #endif /* INET6 */ 212 } 213 m->m_len += sizeof (struct tcphdr); 214 } 215 216 switch(tp->pf) { 217 case AF_INET: 218 { 219 struct ipovly *ipovly; 220 221 ipovly = mtod(m, struct ipovly *); 222 223 bzero(ipovly->ih_x1, sizeof ipovly->ih_x1); 224 ipovly->ih_pr = IPPROTO_TCP; 225 ipovly->ih_len = htons(sizeof (struct tcphdr)); 226 ipovly->ih_src = inp->inp_laddr; 227 ipovly->ih_dst = inp->inp_faddr; 228 229 th = (struct tcphdr *)(mtod(m, caddr_t) + 230 sizeof(struct ip)); 231 } 232 break; 233 #ifdef INET6 234 case AF_INET6: 235 { 236 struct ip6_hdr *ip6; 237 238 ip6 = mtod(m, struct ip6_hdr *); 239 240 ip6->ip6_src = inp->inp_laddr6; 241 ip6->ip6_dst = inp->inp_faddr6; 242 ip6->ip6_flow = htonl(0x60000000) | 243 (inp->inp_flowinfo & IPV6_FLOWLABEL_MASK); 244 245 ip6->ip6_nxt = IPPROTO_TCP; 246 ip6->ip6_plen = htons(sizeof(struct tcphdr)); /*XXX*/ 247 ip6->ip6_hlim = in6_selecthlim(inp); /*XXX*/ 248 249 th = (struct tcphdr *)(mtod(m, caddr_t) + 250 sizeof(struct ip6_hdr)); 251 } 252 break; 253 #endif /* INET6 */ 254 } 255 256 th->th_sport = inp->inp_lport; 257 th->th_dport = inp->inp_fport; 258 th->th_seq = 0; 259 th->th_ack = 0; 260 th->th_x2 = 0; 261 th->th_off = 5; 262 th->th_flags = 0; 263 th->th_win = 0; 264 th->th_urp = 0; 265 th->th_sum = 0; 266 return (m); 267 } 268 269 /* 270 * Send a single message to the TCP at address specified by 271 * the given TCP/IP header. If m == 0, then we make a copy 272 * of the tcpiphdr at ti and send directly to the addressed host. 273 * This is used to force keep alive messages out using the TCP 274 * template for a connection tp->t_template. If flags are given 275 * then we send a message back to the TCP which originated the 276 * segment ti, and discard the mbuf containing it and any other 277 * attached mbufs. 278 * 279 * In any case the ack and sequence number of the transmitted 280 * segment are as specified by the parameters. 281 */ 282 void 283 tcp_respond(struct tcpcb *tp, caddr_t template, struct tcphdr *th0, 284 tcp_seq ack, tcp_seq seq, int flags, u_int rtableid) 285 { 286 int tlen; 287 int win = 0; 288 struct mbuf *m = NULL; 289 struct tcphdr *th; 290 struct ip *ip; 291 #ifdef INET6 292 struct ip6_hdr *ip6; 293 #endif 294 int af; /* af on wire */ 295 296 if (tp) { 297 struct socket *so = tp->t_inpcb->inp_socket; 298 win = sbspace(so, &so->so_rcv); 299 /* 300 * If this is called with an unconnected 301 * socket/tp/pcb (tp->pf is 0), we lose. 302 */ 303 af = tp->pf; 304 } else 305 af = (((struct ip *)template)->ip_v == 6) ? AF_INET6 : AF_INET; 306 307 m = m_gethdr(M_DONTWAIT, MT_HEADER); 308 if (m == NULL) 309 return; 310 m->m_data += max_linkhdr; 311 tlen = 0; 312 313 #define xchg(a,b,type) do { type t; t=a; a=b; b=t; } while (0) 314 switch (af) { 315 #ifdef INET6 316 case AF_INET6: 317 ip6 = mtod(m, struct ip6_hdr *); 318 th = (struct tcphdr *)(ip6 + 1); 319 tlen = sizeof(*ip6) + sizeof(*th); 320 if (th0) { 321 bcopy(template, ip6, sizeof(*ip6)); 322 bcopy(th0, th, sizeof(*th)); 323 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); 324 } else { 325 bcopy(template, ip6, tlen); 326 } 327 break; 328 #endif /* INET6 */ 329 case AF_INET: 330 ip = mtod(m, struct ip *); 331 th = (struct tcphdr *)(ip + 1); 332 tlen = sizeof(*ip) + sizeof(*th); 333 if (th0) { 334 bcopy(template, ip, sizeof(*ip)); 335 bcopy(th0, th, sizeof(*th)); 336 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, u_int32_t); 337 } else { 338 bcopy(template, ip, tlen); 339 } 340 break; 341 } 342 if (th0) 343 xchg(th->th_dport, th->th_sport, u_int16_t); 344 else 345 flags = TH_ACK; 346 #undef xchg 347 348 th->th_seq = htonl(seq); 349 th->th_ack = htonl(ack); 350 th->th_x2 = 0; 351 th->th_off = sizeof (struct tcphdr) >> 2; 352 th->th_flags = flags; 353 if (tp) 354 win >>= tp->rcv_scale; 355 if (win > TCP_MAXWIN) 356 win = TCP_MAXWIN; 357 th->th_win = htons((u_int16_t)win); 358 th->th_urp = 0; 359 360 if (tp && (tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && 361 (flags & TH_RST) == 0 && (tp->t_flags & TF_RCVD_TSTMP)) { 362 u_int32_t *lp = (u_int32_t *)(th + 1); 363 /* Form timestamp option as shown in appendix A of RFC 1323. */ 364 *lp++ = htonl(TCPOPT_TSTAMP_HDR); 365 *lp++ = htonl(tcp_now + tp->ts_modulate); 366 *lp = htonl(tp->ts_recent); 367 tlen += TCPOLEN_TSTAMP_APPA; 368 th->th_off = (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2; 369 } 370 371 m->m_len = tlen; 372 m->m_pkthdr.len = tlen; 373 m->m_pkthdr.ph_ifidx = 0; 374 m->m_pkthdr.csum_flags |= M_TCP_CSUM_OUT; 375 376 /* force routing table */ 377 if (tp) 378 m->m_pkthdr.ph_rtableid = tp->t_inpcb->inp_rtableid; 379 else 380 m->m_pkthdr.ph_rtableid = rtableid; 381 382 switch (af) { 383 #ifdef INET6 384 case AF_INET6: 385 ip6->ip6_flow = htonl(0x60000000); 386 ip6->ip6_nxt = IPPROTO_TCP; 387 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL); /*XXX*/ 388 ip6->ip6_plen = tlen - sizeof(struct ip6_hdr); 389 ip6->ip6_plen = htons(ip6->ip6_plen); 390 ip6_output(m, tp ? tp->t_inpcb->inp_outputopts6 : NULL, 391 tp ? &tp->t_inpcb->inp_route6 : NULL, 392 0, NULL, 393 tp ? tp->t_inpcb : NULL); 394 break; 395 #endif /* INET6 */ 396 case AF_INET: 397 ip->ip_len = htons(tlen); 398 ip->ip_ttl = ip_defttl; 399 ip->ip_tos = 0; 400 ip_output(m, NULL, 401 tp ? &tp->t_inpcb->inp_route : NULL, 402 ip_mtudisc ? IP_MTUDISC : 0, NULL, 403 tp ? tp->t_inpcb : NULL, 0); 404 break; 405 } 406 } 407 408 /* 409 * Create a new TCP control block, making an 410 * empty reassembly queue and hooking it to the argument 411 * protocol control block. 412 */ 413 struct tcpcb * 414 tcp_newtcpcb(struct inpcb *inp) 415 { 416 struct tcpcb *tp; 417 int i; 418 419 tp = pool_get(&tcpcb_pool, PR_NOWAIT|PR_ZERO); 420 if (tp == NULL) 421 return (NULL); 422 TAILQ_INIT(&tp->t_segq); 423 tp->t_maxseg = tcp_mssdflt; 424 tp->t_maxopd = 0; 425 426 for (i = 0; i < TCPT_NTIMERS; i++) 427 TCP_TIMER_INIT(tp, i); 428 429 tp->sack_enable = tcp_do_sack; 430 tp->t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0; 431 tp->t_inpcb = inp; 432 /* 433 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no 434 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives 435 * reasonable initial retransmit time. 436 */ 437 tp->t_srtt = TCPTV_SRTTBASE; 438 tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << 439 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1); 440 tp->t_rttmin = TCPTV_MIN; 441 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 442 TCPTV_MIN, TCPTV_REXMTMAX); 443 tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; 444 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; 445 446 tp->t_pmtud_mtu_sent = 0; 447 tp->t_pmtud_mss_acked = 0; 448 449 #ifdef INET6 450 /* we disallow IPv4 mapped address completely. */ 451 if ((inp->inp_flags & INP_IPV6) == 0) 452 tp->pf = PF_INET; 453 else 454 tp->pf = PF_INET6; 455 #else 456 tp->pf = PF_INET; 457 #endif 458 459 #ifdef INET6 460 if (inp->inp_flags & INP_IPV6) 461 inp->inp_ipv6.ip6_hlim = ip6_defhlim; 462 else 463 #endif /* INET6 */ 464 inp->inp_ip.ip_ttl = ip_defttl; 465 466 inp->inp_ppcb = (caddr_t)tp; 467 return (tp); 468 } 469 470 /* 471 * Drop a TCP connection, reporting 472 * the specified error. If connection is synchronized, 473 * then send a RST to peer. 474 */ 475 struct tcpcb * 476 tcp_drop(struct tcpcb *tp, int errno) 477 { 478 struct socket *so = tp->t_inpcb->inp_socket; 479 480 if (TCPS_HAVERCVDSYN(tp->t_state)) { 481 tp->t_state = TCPS_CLOSED; 482 (void) tcp_output(tp); 483 tcpstat_inc(tcps_drops); 484 } else 485 tcpstat_inc(tcps_conndrops); 486 if (errno == ETIMEDOUT && tp->t_softerror) 487 errno = tp->t_softerror; 488 so->so_error = errno; 489 return (tcp_close(tp)); 490 } 491 492 /* 493 * Close a TCP control block: 494 * discard all space held by the tcp 495 * discard internet protocol block 496 * wake up any sleepers 497 */ 498 struct tcpcb * 499 tcp_close(struct tcpcb *tp) 500 { 501 struct inpcb *inp = tp->t_inpcb; 502 struct socket *so = inp->inp_socket; 503 struct sackhole *p, *q; 504 505 /* free the reassembly queue, if any */ 506 tcp_freeq(tp); 507 508 tcp_canceltimers(tp); 509 syn_cache_cleanup(tp); 510 511 /* Free SACK holes. */ 512 q = p = tp->snd_holes; 513 while (p != 0) { 514 q = p->next; 515 pool_put(&sackhl_pool, p); 516 p = q; 517 } 518 519 m_free(tp->t_template); 520 /* Free tcpcb after all pending timers have been run. */ 521 TCP_TIMER_ARM(tp, TCPT_REAPER, 0); 522 523 inp->inp_ppcb = NULL; 524 soisdisconnected(so); 525 in_pcbdetach(inp); 526 return (NULL); 527 } 528 529 int 530 tcp_freeq(struct tcpcb *tp) 531 { 532 struct tcpqent *qe; 533 int rv = 0; 534 535 while ((qe = TAILQ_FIRST(&tp->t_segq)) != NULL) { 536 TAILQ_REMOVE(&tp->t_segq, qe, tcpqe_q); 537 m_freem(qe->tcpqe_m); 538 pool_put(&tcpqe_pool, qe); 539 rv = 1; 540 } 541 return (rv); 542 } 543 544 /* 545 * Compute proper scaling value for receiver window from buffer space 546 */ 547 548 void 549 tcp_rscale(struct tcpcb *tp, u_long hiwat) 550 { 551 tp->request_r_scale = 0; 552 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 553 TCP_MAXWIN << tp->request_r_scale < hiwat) 554 tp->request_r_scale++; 555 } 556 557 /* 558 * Notify a tcp user of an asynchronous error; 559 * store error as soft error, but wake up user 560 * (for now, won't do anything until can select for soft error). 561 */ 562 void 563 tcp_notify(struct inpcb *inp, int error) 564 { 565 struct tcpcb *tp = intotcpcb(inp); 566 struct socket *so = inp->inp_socket; 567 568 /* 569 * Ignore some errors if we are hooked up. 570 * If connection hasn't completed, has retransmitted several times, 571 * and receives a second error, give up now. This is better 572 * than waiting a long time to establish a connection that 573 * can never complete. 574 */ 575 if (tp->t_state == TCPS_ESTABLISHED && 576 (error == EHOSTUNREACH || error == ENETUNREACH || 577 error == EHOSTDOWN)) { 578 return; 579 } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 && 580 tp->t_rxtshift > 3 && tp->t_softerror) 581 so->so_error = error; 582 else 583 tp->t_softerror = error; 584 wakeup((caddr_t) &so->so_timeo); 585 sorwakeup(so); 586 sowwakeup(so); 587 } 588 589 #ifdef INET6 590 void 591 tcp6_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *d) 592 { 593 struct tcphdr th; 594 struct tcpcb *tp; 595 void (*notify)(struct inpcb *, int) = tcp_notify; 596 struct ip6_hdr *ip6; 597 const struct sockaddr_in6 *sa6_src = NULL; 598 struct sockaddr_in6 *sa6 = satosin6(sa); 599 struct inpcb *inp; 600 struct mbuf *m; 601 tcp_seq seq; 602 int off; 603 struct { 604 u_int16_t th_sport; 605 u_int16_t th_dport; 606 u_int32_t th_seq; 607 } *thp; 608 609 CTASSERT(sizeof(*thp) <= sizeof(th)); 610 if (sa->sa_family != AF_INET6 || 611 sa->sa_len != sizeof(struct sockaddr_in6) || 612 IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || 613 IN6_IS_ADDR_V4MAPPED(&sa6->sin6_addr)) 614 return; 615 if ((unsigned)cmd >= PRC_NCMDS) 616 return; 617 else if (cmd == PRC_QUENCH) { 618 /* 619 * Don't honor ICMP Source Quench messages meant for 620 * TCP connections. 621 */ 622 /* XXX there's no PRC_QUENCH in IPv6 */ 623 return; 624 } else if (PRC_IS_REDIRECT(cmd)) 625 notify = in_rtchange, d = NULL; 626 else if (cmd == PRC_MSGSIZE) 627 ; /* special code is present, see below */ 628 else if (cmd == PRC_HOSTDEAD) 629 d = NULL; 630 else if (inet6ctlerrmap[cmd] == 0) 631 return; 632 633 /* if the parameter is from icmp6, decode it. */ 634 if (d != NULL) { 635 struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d; 636 m = ip6cp->ip6c_m; 637 ip6 = ip6cp->ip6c_ip6; 638 off = ip6cp->ip6c_off; 639 sa6_src = ip6cp->ip6c_src; 640 } else { 641 m = NULL; 642 ip6 = NULL; 643 sa6_src = &sa6_any; 644 } 645 646 if (ip6) { 647 /* 648 * XXX: We assume that when ip6 is non NULL, 649 * M and OFF are valid. 650 */ 651 652 /* check if we can safely examine src and dst ports */ 653 if (m->m_pkthdr.len < off + sizeof(*thp)) 654 return; 655 656 bzero(&th, sizeof(th)); 657 m_copydata(m, off, sizeof(*thp), (caddr_t)&th); 658 659 /* 660 * Check to see if we have a valid TCP connection 661 * corresponding to the address in the ICMPv6 message 662 * payload. 663 */ 664 inp = in6_pcbhashlookup(&tcbtable, &sa6->sin6_addr, 665 th.th_dport, &sa6_src->sin6_addr, th.th_sport, rdomain); 666 if (cmd == PRC_MSGSIZE) { 667 /* 668 * Depending on the value of "valid" and routing table 669 * size (mtudisc_{hi,lo}wat), we will: 670 * - recalcurate the new MTU and create the 671 * corresponding routing entry, or 672 * - ignore the MTU change notification. 673 */ 674 icmp6_mtudisc_update((struct ip6ctlparam *)d, inp != NULL); 675 return; 676 } 677 if (inp) { 678 seq = ntohl(th.th_seq); 679 if (inp->inp_socket && 680 (tp = intotcpcb(inp)) && 681 SEQ_GEQ(seq, tp->snd_una) && 682 SEQ_LT(seq, tp->snd_max)) 683 notify(inp, inet6ctlerrmap[cmd]); 684 } else if (inet6ctlerrmap[cmd] == EHOSTUNREACH || 685 inet6ctlerrmap[cmd] == ENETUNREACH || 686 inet6ctlerrmap[cmd] == EHOSTDOWN) 687 syn_cache_unreach((struct sockaddr *)sa6_src, 688 sa, &th, rdomain); 689 } else { 690 (void) in6_pcbnotify(&tcbtable, sa6, 0, 691 sa6_src, 0, rdomain, cmd, NULL, notify); 692 } 693 } 694 #endif 695 696 void 697 tcp_ctlinput(int cmd, struct sockaddr *sa, u_int rdomain, void *v) 698 { 699 struct ip *ip = v; 700 struct tcphdr *th; 701 struct tcpcb *tp; 702 struct inpcb *inp; 703 struct in_addr faddr; 704 tcp_seq seq; 705 u_int mtu; 706 void (*notify)(struct inpcb *, int) = tcp_notify; 707 int errno; 708 709 if (sa->sa_family != AF_INET) 710 return; 711 faddr = satosin(sa)->sin_addr; 712 if (faddr.s_addr == INADDR_ANY) 713 return; 714 715 if ((unsigned)cmd >= PRC_NCMDS) 716 return; 717 errno = inetctlerrmap[cmd]; 718 if (cmd == PRC_QUENCH) 719 /* 720 * Don't honor ICMP Source Quench messages meant for 721 * TCP connections. 722 */ 723 return; 724 else if (PRC_IS_REDIRECT(cmd)) 725 notify = in_rtchange, ip = 0; 726 else if (cmd == PRC_MSGSIZE && ip_mtudisc && ip) { 727 /* 728 * Verify that the packet in the icmp payload refers 729 * to an existing TCP connection. 730 */ 731 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 732 seq = ntohl(th->th_seq); 733 inp = in_pcbhashlookup(&tcbtable, 734 ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, 735 rdomain); 736 if (inp && (tp = intotcpcb(inp)) && 737 SEQ_GEQ(seq, tp->snd_una) && 738 SEQ_LT(seq, tp->snd_max)) { 739 struct icmp *icp; 740 icp = (struct icmp *)((caddr_t)ip - 741 offsetof(struct icmp, icmp_ip)); 742 743 /* 744 * If the ICMP message advertises a Next-Hop MTU 745 * equal or larger than the maximum packet size we have 746 * ever sent, drop the message. 747 */ 748 mtu = (u_int)ntohs(icp->icmp_nextmtu); 749 if (mtu >= tp->t_pmtud_mtu_sent) 750 return; 751 if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) { 752 /* 753 * Calculate new MTU, and create corresponding 754 * route (traditional PMTUD). 755 */ 756 tp->t_flags &= ~TF_PMTUD_PEND; 757 icmp_mtudisc(icp, inp->inp_rtableid); 758 } else { 759 /* 760 * Record the information got in the ICMP 761 * message; act on it later. 762 * If we had already recorded an ICMP message, 763 * replace the old one only if the new message 764 * refers to an older TCP segment 765 */ 766 if (tp->t_flags & TF_PMTUD_PEND) { 767 if (SEQ_LT(tp->t_pmtud_th_seq, seq)) 768 return; 769 } else 770 tp->t_flags |= TF_PMTUD_PEND; 771 tp->t_pmtud_th_seq = seq; 772 tp->t_pmtud_nextmtu = icp->icmp_nextmtu; 773 tp->t_pmtud_ip_len = icp->icmp_ip.ip_len; 774 tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl; 775 return; 776 } 777 } else { 778 /* ignore if we don't have a matching connection */ 779 return; 780 } 781 notify = tcp_mtudisc, ip = 0; 782 } else if (cmd == PRC_MTUINC) 783 notify = tcp_mtudisc_increase, ip = 0; 784 else if (cmd == PRC_HOSTDEAD) 785 ip = 0; 786 else if (errno == 0) 787 return; 788 789 if (ip) { 790 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); 791 inp = in_pcbhashlookup(&tcbtable, 792 ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, 793 rdomain); 794 if (inp) { 795 seq = ntohl(th->th_seq); 796 if (inp->inp_socket && 797 (tp = intotcpcb(inp)) && 798 SEQ_GEQ(seq, tp->snd_una) && 799 SEQ_LT(seq, tp->snd_max)) 800 notify(inp, errno); 801 } else if (inetctlerrmap[cmd] == EHOSTUNREACH || 802 inetctlerrmap[cmd] == ENETUNREACH || 803 inetctlerrmap[cmd] == EHOSTDOWN) { 804 struct sockaddr_in sin; 805 806 bzero(&sin, sizeof(sin)); 807 sin.sin_len = sizeof(sin); 808 sin.sin_family = AF_INET; 809 sin.sin_port = th->th_sport; 810 sin.sin_addr = ip->ip_src; 811 syn_cache_unreach(sintosa(&sin), sa, th, rdomain); 812 } 813 } else 814 in_pcbnotifyall(&tcbtable, sa, rdomain, errno, notify); 815 } 816 817 818 #ifdef INET6 819 /* 820 * Path MTU Discovery handlers. 821 */ 822 void 823 tcp6_mtudisc_callback(struct sockaddr_in6 *sin6, u_int rdomain) 824 { 825 (void) in6_pcbnotify(&tcbtable, sin6, 0, 826 &sa6_any, 0, rdomain, PRC_MSGSIZE, NULL, tcp_mtudisc); 827 } 828 #endif /* INET6 */ 829 830 /* 831 * On receipt of path MTU corrections, flush old route and replace it 832 * with the new one. Retransmit all unacknowledged packets, to ensure 833 * that all packets will be received. 834 */ 835 void 836 tcp_mtudisc(struct inpcb *inp, int errno) 837 { 838 struct tcpcb *tp = intotcpcb(inp); 839 struct rtentry *rt; 840 int change = 0; 841 842 if (tp == NULL) 843 return; 844 845 rt = in_pcbrtentry(inp); 846 if (rt != NULL) { 847 int orig_maxseg = tp->t_maxseg; 848 849 /* 850 * If this was not a host route, remove and realloc. 851 */ 852 if ((rt->rt_flags & RTF_HOST) == 0) { 853 in_rtchange(inp, errno); 854 if ((rt = in_pcbrtentry(inp)) == NULL) 855 return; 856 } 857 if (orig_maxseg != tp->t_maxseg || 858 (rt->rt_locks & RTV_MTU)) 859 change = 1; 860 } 861 tcp_mss(tp, -1); 862 863 /* 864 * Resend unacknowledged packets 865 */ 866 tp->snd_nxt = tp->snd_una; 867 if (change || errno > 0) 868 tcp_output(tp); 869 } 870 871 void 872 tcp_mtudisc_increase(struct inpcb *inp, int errno) 873 { 874 struct tcpcb *tp = intotcpcb(inp); 875 struct rtentry *rt = in_pcbrtentry(inp); 876 877 if (tp != 0 && rt != 0) { 878 /* 879 * If this was a host route, remove and realloc. 880 */ 881 if (rt->rt_flags & RTF_HOST) 882 in_rtchange(inp, errno); 883 884 /* also takes care of congestion window */ 885 tcp_mss(tp, -1); 886 } 887 } 888 889 /* 890 * Generate new ISNs with a method based on RFC1948 891 */ 892 #define TCP_ISS_CONN_INC 4096 893 894 void 895 tcp_set_iss_tsm(struct tcpcb *tp) 896 { 897 SHA2_CTX ctx; 898 union { 899 uint8_t bytes[SHA512_DIGEST_LENGTH]; 900 uint32_t words[2]; 901 } digest; 902 u_int rdomain = rtable_l2(tp->t_inpcb->inp_rtableid); 903 904 ctx = tcp_secret_ctx; 905 SHA512Update(&ctx, &rdomain, sizeof(rdomain)); 906 SHA512Update(&ctx, &tp->t_inpcb->inp_lport, sizeof(u_short)); 907 SHA512Update(&ctx, &tp->t_inpcb->inp_fport, sizeof(u_short)); 908 if (tp->pf == AF_INET6) { 909 SHA512Update(&ctx, &tp->t_inpcb->inp_laddr6, 910 sizeof(struct in6_addr)); 911 SHA512Update(&ctx, &tp->t_inpcb->inp_faddr6, 912 sizeof(struct in6_addr)); 913 } else { 914 SHA512Update(&ctx, &tp->t_inpcb->inp_laddr, 915 sizeof(struct in_addr)); 916 SHA512Update(&ctx, &tp->t_inpcb->inp_faddr, 917 sizeof(struct in_addr)); 918 } 919 SHA512Final(digest.bytes, &ctx); 920 tcp_iss += TCP_ISS_CONN_INC; 921 tp->iss = digest.words[0] + tcp_iss; 922 tp->ts_modulate = digest.words[1]; 923 } 924 925 #ifdef TCP_SIGNATURE 926 int 927 tcp_signature_tdb_attach(void) 928 { 929 return (0); 930 } 931 932 int 933 tcp_signature_tdb_init(struct tdb *tdbp, struct xformsw *xsp, 934 struct ipsecinit *ii) 935 { 936 if ((ii->ii_authkeylen < 1) || (ii->ii_authkeylen > 80)) 937 return (EINVAL); 938 939 tdbp->tdb_amxkey = malloc(ii->ii_authkeylen, M_XDATA, M_NOWAIT); 940 if (tdbp->tdb_amxkey == NULL) 941 return (ENOMEM); 942 memcpy(tdbp->tdb_amxkey, ii->ii_authkey, ii->ii_authkeylen); 943 tdbp->tdb_amxkeylen = ii->ii_authkeylen; 944 945 return (0); 946 } 947 948 int 949 tcp_signature_tdb_zeroize(struct tdb *tdbp) 950 { 951 if (tdbp->tdb_amxkey) { 952 explicit_bzero(tdbp->tdb_amxkey, tdbp->tdb_amxkeylen); 953 free(tdbp->tdb_amxkey, M_XDATA, tdbp->tdb_amxkeylen); 954 tdbp->tdb_amxkey = NULL; 955 } 956 957 return (0); 958 } 959 960 int 961 tcp_signature_tdb_input(struct mbuf *m, struct tdb *tdbp, int skip, int protoff) 962 { 963 return (0); 964 } 965 966 int 967 tcp_signature_tdb_output(struct mbuf *m, struct tdb *tdbp, struct mbuf **mp, 968 int skip, int protoff) 969 { 970 return (EINVAL); 971 } 972 973 int 974 tcp_signature_apply(caddr_t fstate, caddr_t data, unsigned int len) 975 { 976 MD5Update((MD5_CTX *)fstate, (char *)data, len); 977 return 0; 978 } 979 980 int 981 tcp_signature(struct tdb *tdb, int af, struct mbuf *m, struct tcphdr *th, 982 int iphlen, int doswap, char *sig) 983 { 984 MD5_CTX ctx; 985 int len; 986 struct tcphdr th0; 987 988 MD5Init(&ctx); 989 990 switch(af) { 991 case 0: 992 case AF_INET: { 993 struct ippseudo ippseudo; 994 struct ip *ip; 995 996 ip = mtod(m, struct ip *); 997 998 ippseudo.ippseudo_src = ip->ip_src; 999 ippseudo.ippseudo_dst = ip->ip_dst; 1000 ippseudo.ippseudo_pad = 0; 1001 ippseudo.ippseudo_p = IPPROTO_TCP; 1002 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - iphlen); 1003 1004 MD5Update(&ctx, (char *)&ippseudo, 1005 sizeof(struct ippseudo)); 1006 break; 1007 } 1008 #ifdef INET6 1009 case AF_INET6: { 1010 struct ip6_hdr_pseudo ip6pseudo; 1011 struct ip6_hdr *ip6; 1012 1013 ip6 = mtod(m, struct ip6_hdr *); 1014 bzero(&ip6pseudo, sizeof(ip6pseudo)); 1015 ip6pseudo.ip6ph_src = ip6->ip6_src; 1016 ip6pseudo.ip6ph_dst = ip6->ip6_dst; 1017 in6_clearscope(&ip6pseudo.ip6ph_src); 1018 in6_clearscope(&ip6pseudo.ip6ph_dst); 1019 ip6pseudo.ip6ph_nxt = IPPROTO_TCP; 1020 ip6pseudo.ip6ph_len = htonl(m->m_pkthdr.len - iphlen); 1021 1022 MD5Update(&ctx, (char *)&ip6pseudo, 1023 sizeof(ip6pseudo)); 1024 break; 1025 } 1026 #endif 1027 } 1028 1029 th0 = *th; 1030 th0.th_sum = 0; 1031 1032 if (doswap) { 1033 th0.th_seq = htonl(th0.th_seq); 1034 th0.th_ack = htonl(th0.th_ack); 1035 th0.th_win = htons(th0.th_win); 1036 th0.th_urp = htons(th0.th_urp); 1037 } 1038 MD5Update(&ctx, (char *)&th0, sizeof(th0)); 1039 1040 len = m->m_pkthdr.len - iphlen - th->th_off * sizeof(uint32_t); 1041 1042 if (len > 0 && 1043 m_apply(m, iphlen + th->th_off * sizeof(uint32_t), len, 1044 tcp_signature_apply, (caddr_t)&ctx)) 1045 return (-1); 1046 1047 MD5Update(&ctx, tdb->tdb_amxkey, tdb->tdb_amxkeylen); 1048 MD5Final(sig, &ctx); 1049 1050 return (0); 1051 } 1052 #endif /* TCP_SIGNATURE */ 1053