1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 36 #ifdef TCP_OFFLOAD 37 #include <sys/param.h> 38 #include <sys/types.h> 39 #include <sys/kernel.h> 40 #include <sys/ktr.h> 41 #include <sys/module.h> 42 #include <sys/protosw.h> 43 #include <sys/refcount.h> 44 #include <sys/domain.h> 45 #include <sys/fnv_hash.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <net/ethernet.h> 50 #include <net/if.h> 51 #include <net/if_types.h> 52 #include <net/if_vlan_var.h> 53 #include <net/route.h> 54 #include <net/route/nhop.h> 55 #include <netinet/in.h> 56 #include <netinet/in_fib.h> 57 #include <netinet/in_pcb.h> 58 #include <netinet/ip.h> 59 #include <netinet/ip6.h> 60 #include <netinet6/in6_fib.h> 61 #include <netinet6/scope6_var.h> 62 #include <netinet/tcp_timer.h> 63 #define TCPSTATES 64 #include <netinet/tcp_fsm.h> 65 #include <netinet/tcp_var.h> 66 #include <netinet/toecore.h> 67 #include <netinet/cc/cc.h> 68 69 #include "common/common.h" 70 #include "common/t4_msg.h" 71 #include "common/t4_regs.h" 72 #include "t4_clip.h" 73 #include "tom/t4_tom_l2t.h" 74 #include "tom/t4_tom.h" 75 76 /* stid services */ 77 static int alloc_stid(struct adapter *, struct listen_ctx *, int); 78 static struct listen_ctx *lookup_stid(struct adapter *, int); 79 static void free_stid(struct adapter *, struct listen_ctx *); 80 81 /* lctx services */ 82 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 83 struct vi_info *); 84 static int free_lctx(struct adapter *, struct listen_ctx *); 85 static void hold_lctx(struct listen_ctx *); 86 static void listen_hash_add(struct adapter *, struct listen_ctx *); 87 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 88 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 89 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 90 91 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); 92 93 static int 94 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 95 { 96 struct tid_info *t = &sc->tids; 97 u_int stid, n, f, mask; 98 struct stid_region *sr = &lctx->stid_region; 99 100 /* 101 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 102 * the TCAM. The start of the stid region is properly aligned (the chip 103 * requires each region to be 128-cell aligned). 104 */ 105 n = isipv6 ? 2 : 1; 106 mask = n - 1; 107 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 108 ("%s: stid region (%u, %u) not properly aligned. n = %u", 109 __func__, t->stid_base, t->nstids, n)); 110 111 mtx_lock(&t->stid_lock); 112 if (n > t->nstids - t->stids_in_use) { 113 mtx_unlock(&t->stid_lock); 114 return (-1); 115 } 116 117 if (t->nstids_free_head >= n) { 118 /* 119 * This allocation will definitely succeed because the region 120 * starts at a good alignment and we just checked we have enough 121 * stids free. 122 */ 123 f = t->nstids_free_head & mask; 124 t->nstids_free_head -= n + f; 125 stid = t->nstids_free_head; 126 TAILQ_INSERT_HEAD(&t->stids, sr, link); 127 } else { 128 struct stid_region *s; 129 130 stid = t->nstids_free_head; 131 TAILQ_FOREACH(s, &t->stids, link) { 132 stid += s->used + s->free; 133 f = stid & mask; 134 if (s->free >= n + f) { 135 stid -= n + f; 136 s->free -= n + f; 137 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 138 goto allocated; 139 } 140 } 141 142 if (__predict_false(stid != t->nstids)) { 143 panic("%s: stids TAILQ (%p) corrupt." 144 " At %d instead of %d at the end of the queue.", 145 __func__, &t->stids, stid, t->nstids); 146 } 147 148 mtx_unlock(&t->stid_lock); 149 return (-1); 150 } 151 152 allocated: 153 sr->used = n; 154 sr->free = f; 155 t->stids_in_use += n; 156 t->stid_tab[stid] = lctx; 157 mtx_unlock(&t->stid_lock); 158 159 KASSERT(((stid + t->stid_base) & mask) == 0, 160 ("%s: EDOOFUS.", __func__)); 161 return (stid + t->stid_base); 162 } 163 164 static struct listen_ctx * 165 lookup_stid(struct adapter *sc, int stid) 166 { 167 struct tid_info *t = &sc->tids; 168 169 return (t->stid_tab[stid - t->stid_base]); 170 } 171 172 static void 173 free_stid(struct adapter *sc, struct listen_ctx *lctx) 174 { 175 struct tid_info *t = &sc->tids; 176 struct stid_region *sr = &lctx->stid_region; 177 struct stid_region *s; 178 179 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 180 181 mtx_lock(&t->stid_lock); 182 s = TAILQ_PREV(sr, stid_head, link); 183 if (s != NULL) 184 s->free += sr->used + sr->free; 185 else 186 t->nstids_free_head += sr->used + sr->free; 187 KASSERT(t->stids_in_use >= sr->used, 188 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 189 t->stids_in_use, sr->used)); 190 t->stids_in_use -= sr->used; 191 TAILQ_REMOVE(&t->stids, sr, link); 192 mtx_unlock(&t->stid_lock); 193 } 194 195 static struct listen_ctx * 196 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 197 { 198 struct listen_ctx *lctx; 199 200 INP_WLOCK_ASSERT(inp); 201 202 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 203 if (lctx == NULL) 204 return (NULL); 205 206 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 207 if (lctx->stid < 0) { 208 free(lctx, M_CXGBE); 209 return (NULL); 210 } 211 212 if (inp->inp_vflag & INP_IPV6 && 213 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 214 lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); 215 if (lctx->ce == NULL) { 216 free(lctx, M_CXGBE); 217 return (NULL); 218 } 219 } 220 221 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 222 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 223 refcount_init(&lctx->refcount, 1); 224 225 lctx->inp = inp; 226 lctx->vnet = inp->inp_socket->so_vnet; 227 in_pcbref(inp); 228 229 return (lctx); 230 } 231 232 /* Don't call this directly, use release_lctx instead */ 233 static int 234 free_lctx(struct adapter *sc, struct listen_ctx *lctx) 235 { 236 struct inpcb *inp = lctx->inp; 237 238 INP_WLOCK_ASSERT(inp); 239 KASSERT(lctx->refcount == 0, 240 ("%s: refcount %d", __func__, lctx->refcount)); 241 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 242 243 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 244 __func__, lctx->stid, lctx, lctx->inp); 245 246 if (lctx->ce) 247 t4_release_clip_entry(sc, lctx->ce); 248 free_stid(sc, lctx); 249 free(lctx, M_CXGBE); 250 251 return (in_pcbrele_wlocked(inp)); 252 } 253 254 static void 255 hold_lctx(struct listen_ctx *lctx) 256 { 257 258 refcount_acquire(&lctx->refcount); 259 } 260 261 static inline uint32_t 262 listen_hashfn(void *key, u_long mask) 263 { 264 265 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 266 } 267 268 /* 269 * Add a listen_ctx entry to the listen hash table. 270 */ 271 static void 272 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 273 { 274 struct tom_data *td = sc->tom_softc; 275 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 276 277 mtx_lock(&td->lctx_hash_lock); 278 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 279 td->lctx_count++; 280 mtx_unlock(&td->lctx_hash_lock); 281 } 282 283 /* 284 * Look for the listening socket's context entry in the hash and return it. 285 */ 286 static struct listen_ctx * 287 listen_hash_find(struct adapter *sc, struct inpcb *inp) 288 { 289 struct tom_data *td = sc->tom_softc; 290 int bucket = listen_hashfn(inp, td->listen_mask); 291 struct listen_ctx *lctx; 292 293 mtx_lock(&td->lctx_hash_lock); 294 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 295 if (lctx->inp == inp) 296 break; 297 } 298 mtx_unlock(&td->lctx_hash_lock); 299 300 return (lctx); 301 } 302 303 /* 304 * Removes the listen_ctx structure for inp from the hash and returns it. 305 */ 306 static struct listen_ctx * 307 listen_hash_del(struct adapter *sc, struct inpcb *inp) 308 { 309 struct tom_data *td = sc->tom_softc; 310 int bucket = listen_hashfn(inp, td->listen_mask); 311 struct listen_ctx *lctx, *l; 312 313 mtx_lock(&td->lctx_hash_lock); 314 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 315 if (lctx->inp == inp) { 316 LIST_REMOVE(lctx, link); 317 td->lctx_count--; 318 break; 319 } 320 } 321 mtx_unlock(&td->lctx_hash_lock); 322 323 return (lctx); 324 } 325 326 /* 327 * Releases a hold on the lctx. Must be called with the listening socket's inp 328 * locked. The inp may be freed by this function and it returns NULL to 329 * indicate this. 330 */ 331 static struct inpcb * 332 release_lctx(struct adapter *sc, struct listen_ctx *lctx) 333 { 334 struct inpcb *inp = lctx->inp; 335 int inp_freed = 0; 336 337 INP_WLOCK_ASSERT(inp); 338 if (refcount_release(&lctx->refcount)) 339 inp_freed = free_lctx(sc, lctx); 340 341 return (inp_freed ? NULL : inp); 342 } 343 344 static void 345 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) 346 { 347 struct mbuf *m = synqe->syn; 348 struct ifnet *ifp = m->m_pkthdr.rcvif; 349 struct vi_info *vi = ifp->if_softc; 350 struct port_info *pi = vi->pi; 351 struct wrqe *wr; 352 struct fw_flowc_wr *flowc; 353 struct sge_ofld_txq *ofld_txq; 354 struct sge_ofld_rxq *ofld_rxq; 355 const int nparams = 6; 356 const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 357 const u_int pfvf = sc->pf << S_FW_VIID_PFN; 358 359 INP_WLOCK_ASSERT(synqe->lctx->inp); 360 MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); 361 362 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 363 ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; 364 365 wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); 366 if (wr == NULL) { 367 /* XXX */ 368 panic("%s: allocation failure.", __func__); 369 } 370 flowc = wrtod(wr); 371 memset(flowc, 0, wr->wr_len); 372 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 373 V_FW_FLOWC_WR_NPARAMS(nparams)); 374 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 375 V_FW_WR_FLOWID(synqe->tid)); 376 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 377 flowc->mnemval[0].val = htobe32(pfvf); 378 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 379 flowc->mnemval[1].val = htobe32(pi->tx_chan); 380 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 381 flowc->mnemval[2].val = htobe32(pi->tx_chan); 382 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 383 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 384 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 385 flowc->mnemval[4].val = htobe32(512); 386 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 387 flowc->mnemval[5].val = htobe32(512); 388 389 synqe->flags |= TPF_FLOWC_WR_SENT; 390 t4_wrq_tx(sc, wr); 391 } 392 393 static void 394 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, 395 int rst_status) 396 { 397 struct adapter *sc = tod->tod_softc; 398 struct wrqe *wr; 399 struct cpl_abort_req *req; 400 401 INP_WLOCK_ASSERT(synqe->lctx->inp); 402 403 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 404 __func__, synqe, synqe->flags, synqe->tid, 405 synqe->flags & TPF_ABORT_SHUTDOWN ? 406 " (abort already in progress)" : ""); 407 if (synqe->flags & TPF_ABORT_SHUTDOWN) 408 return; /* abort already in progress */ 409 synqe->flags |= TPF_ABORT_SHUTDOWN; 410 411 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 412 send_flowc_wr_synqe(sc, synqe); 413 414 wr = alloc_wrqe(sizeof(*req), 415 &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); 416 if (wr == NULL) { 417 /* XXX */ 418 panic("%s: allocation failure.", __func__); 419 } 420 req = wrtod(wr); 421 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 422 req->rsvd0 = 0; /* don't have a snd_nxt */ 423 req->rsvd1 = 1; /* no data sent yet */ 424 req->cmd = rst_status; 425 426 t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); 427 } 428 429 static int 430 create_server(struct adapter *sc, struct listen_ctx *lctx) 431 { 432 struct wrqe *wr; 433 struct cpl_pass_open_req *req; 434 struct inpcb *inp = lctx->inp; 435 436 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 437 if (wr == NULL) { 438 log(LOG_ERR, "%s: allocation failure", __func__); 439 return (ENOMEM); 440 } 441 req = wrtod(wr); 442 443 INIT_TP_WR(req, 0); 444 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 445 req->local_port = inp->inp_lport; 446 req->peer_port = 0; 447 req->local_ip = inp->inp_laddr.s_addr; 448 req->peer_ip = 0; 449 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 450 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 451 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 452 453 t4_wrq_tx(sc, wr); 454 return (0); 455 } 456 457 static int 458 create_server6(struct adapter *sc, struct listen_ctx *lctx) 459 { 460 struct wrqe *wr; 461 struct cpl_pass_open_req6 *req; 462 struct inpcb *inp = lctx->inp; 463 464 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 465 if (wr == NULL) { 466 log(LOG_ERR, "%s: allocation failure", __func__); 467 return (ENOMEM); 468 } 469 req = wrtod(wr); 470 471 INIT_TP_WR(req, 0); 472 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 473 req->local_port = inp->inp_lport; 474 req->peer_port = 0; 475 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 476 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 477 req->peer_ip_hi = 0; 478 req->peer_ip_lo = 0; 479 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 480 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 481 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 482 483 t4_wrq_tx(sc, wr); 484 return (0); 485 } 486 487 static int 488 destroy_server(struct adapter *sc, struct listen_ctx *lctx) 489 { 490 struct wrqe *wr; 491 struct cpl_close_listsvr_req *req; 492 493 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 494 if (wr == NULL) { 495 /* XXX */ 496 panic("%s: allocation failure.", __func__); 497 } 498 req = wrtod(wr); 499 500 INIT_TP_WR(req, 0); 501 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 502 lctx->stid)); 503 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 504 req->rsvd = htobe16(0); 505 506 t4_wrq_tx(sc, wr); 507 return (0); 508 } 509 510 /* 511 * Start a listening server by sending a passive open request to HW. 512 * 513 * Can't take adapter lock here and access to sc->flags, 514 * sc->offload_map, if_capenable are all race prone. 515 */ 516 int 517 t4_listen_start(struct toedev *tod, struct tcpcb *tp) 518 { 519 struct adapter *sc = tod->tod_softc; 520 struct vi_info *vi; 521 struct port_info *pi; 522 struct inpcb *inp = tptoinpcb(tp); 523 struct listen_ctx *lctx; 524 int i, rc, v; 525 struct offload_settings settings; 526 527 INP_WLOCK_ASSERT(inp); 528 529 rw_rlock(&sc->policy_lock); 530 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 531 EVL_MAKETAG(0xfff, 0, 0), inp); 532 rw_runlock(&sc->policy_lock); 533 if (!settings.offload) 534 return (0); 535 536 /* Don't start a hardware listener for any loopback address. */ 537 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 538 return (0); 539 if (!(inp->inp_vflag & INP_IPV6) && 540 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 541 return (0); 542 if (sc->flags & KERN_TLS_ON) 543 return (0); 544 #if 0 545 ADAPTER_LOCK(sc); 546 if (IS_BUSY(sc)) { 547 log(LOG_ERR, "%s: listen request ignored, %s is busy", 548 __func__, device_get_nameunit(sc->dev)); 549 goto done; 550 } 551 552 KASSERT(uld_active(sc, ULD_TOM), 553 ("%s: TOM not initialized", __func__)); 554 #endif 555 556 /* 557 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 558 * such VI's queues to send the passive open and receive the reply to 559 * it. 560 * 561 * XXX: need a way to mark a port in use by offload. if_cxgbe should 562 * then reject any attempt to bring down such a port (and maybe reject 563 * attempts to disable IFCAP_TOE on that port too?). 564 */ 565 for_each_port(sc, i) { 566 pi = sc->port[i]; 567 for_each_vi(pi, v, vi) { 568 if (vi->flags & VI_INIT_DONE && 569 vi->ifp->if_capenable & IFCAP_TOE) 570 goto found; 571 } 572 } 573 goto done; /* no port that's UP with IFCAP_TOE enabled */ 574 found: 575 576 if (listen_hash_find(sc, inp) != NULL) 577 goto done; /* already setup */ 578 579 lctx = alloc_lctx(sc, inp, vi); 580 if (lctx == NULL) { 581 log(LOG_ERR, 582 "%s: listen request ignored, %s couldn't allocate lctx\n", 583 __func__, device_get_nameunit(sc->dev)); 584 goto done; 585 } 586 listen_hash_add(sc, lctx); 587 588 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 589 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 590 inp->inp_vflag); 591 592 if (inp->inp_vflag & INP_IPV6) 593 rc = create_server6(sc, lctx); 594 else 595 rc = create_server(sc, lctx); 596 if (rc != 0) { 597 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 598 __func__, device_get_nameunit(sc->dev), rc); 599 (void) listen_hash_del(sc, inp); 600 inp = release_lctx(sc, lctx); 601 /* can't be freed, host stack has a reference */ 602 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 603 goto done; 604 } 605 lctx->flags |= LCTX_RPL_PENDING; 606 done: 607 #if 0 608 ADAPTER_UNLOCK(sc); 609 #endif 610 return (0); 611 } 612 613 int 614 t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 615 { 616 struct listen_ctx *lctx; 617 struct adapter *sc = tod->tod_softc; 618 struct inpcb *inp = tptoinpcb(tp); 619 620 INP_WLOCK_ASSERT(inp); 621 622 lctx = listen_hash_del(sc, inp); 623 if (lctx == NULL) 624 return (ENOENT); /* no hardware listener for this inp */ 625 626 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 627 lctx, lctx->flags); 628 629 /* 630 * If the reply to the PASS_OPEN is still pending we'll wait for it to 631 * arrive and clean up when it does. 632 */ 633 if (lctx->flags & LCTX_RPL_PENDING) { 634 return (EINPROGRESS); 635 } 636 637 destroy_server(sc, lctx); 638 return (0); 639 } 640 641 static inline struct synq_entry * 642 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) 643 { 644 struct synq_entry *synqe; 645 646 INP_RLOCK_ASSERT(lctx->inp); 647 MPASS(flags == M_WAITOK || flags == M_NOWAIT); 648 649 synqe = malloc(sizeof(*synqe), M_CXGBE, flags); 650 if (__predict_true(synqe != NULL)) { 651 synqe->flags = TPF_SYNQE; 652 refcount_init(&synqe->refcnt, 1); 653 synqe->lctx = lctx; 654 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ 655 synqe->syn = NULL; 656 } 657 658 return (synqe); 659 } 660 661 static inline void 662 hold_synqe(struct synq_entry *synqe) 663 { 664 665 refcount_acquire(&synqe->refcnt); 666 } 667 668 static inline struct inpcb * 669 release_synqe(struct adapter *sc, struct synq_entry *synqe) 670 { 671 struct inpcb *inp; 672 673 MPASS(synqe->flags & TPF_SYNQE); 674 MPASS(synqe->lctx != NULL); 675 676 inp = synqe->lctx->inp; 677 MPASS(inp != NULL); 678 INP_WLOCK_ASSERT(inp); 679 680 if (refcount_release(&synqe->refcnt)) { 681 inp = release_lctx(sc, synqe->lctx); 682 m_freem(synqe->syn); 683 free(synqe, M_CXGBE); 684 } 685 686 return (inp); 687 } 688 689 void 690 t4_syncache_added(struct toedev *tod __unused, void *arg) 691 { 692 struct synq_entry *synqe = arg; 693 694 hold_synqe(synqe); 695 } 696 697 void 698 t4_syncache_removed(struct toedev *tod, void *arg) 699 { 700 struct adapter *sc = tod->tod_softc; 701 struct synq_entry *synqe = arg; 702 struct inpcb *inp = synqe->lctx->inp; 703 704 /* 705 * XXX: this is a LOR but harmless when running from the softclock. 706 */ 707 INP_WLOCK(inp); 708 inp = release_synqe(sc, synqe); 709 if (inp != NULL) 710 INP_WUNLOCK(inp); 711 } 712 713 int 714 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 715 { 716 struct synq_entry *synqe = arg; 717 718 if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { 719 struct tcpopt to; 720 struct ip *ip = mtod(m, struct ip *); 721 struct tcphdr *th; 722 723 if (ip->ip_v == IPVERSION) 724 th = (void *)(ip + 1); 725 else 726 th = (void *)((struct ip6_hdr *)ip + 1); 727 bzero(&to, sizeof(to)); 728 tcp_dooptions(&to, (void *)(th + 1), 729 (th->th_off << 2) - sizeof(*th), TO_SYN); 730 731 /* save these for later */ 732 synqe->iss = be32toh(th->th_seq); 733 synqe->irs = be32toh(th->th_ack) - 1; 734 synqe->ts = to.to_tsval; 735 } 736 737 m_freem(m); /* don't need this any more */ 738 return (0); 739 } 740 741 static int 742 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 743 struct mbuf *m) 744 { 745 struct adapter *sc = iq->adapter; 746 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 747 int stid = GET_TID(cpl); 748 unsigned int status = cpl->status; 749 struct listen_ctx *lctx = lookup_stid(sc, stid); 750 struct inpcb *inp = lctx->inp; 751 #ifdef INVARIANTS 752 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 753 #endif 754 755 KASSERT(opcode == CPL_PASS_OPEN_RPL, 756 ("%s: unexpected opcode 0x%x", __func__, opcode)); 757 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 758 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 759 760 INP_WLOCK(inp); 761 762 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 763 __func__, stid, status, lctx->flags); 764 765 lctx->flags &= ~LCTX_RPL_PENDING; 766 767 if (status != CPL_ERR_NONE) 768 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 769 770 #ifdef INVARIANTS 771 /* 772 * If the inp has been dropped (listening socket closed) then 773 * listen_stop must have run and taken the inp out of the hash. 774 */ 775 if (inp->inp_flags & INP_DROPPED) { 776 KASSERT(listen_hash_del(sc, inp) == NULL, 777 ("%s: inp %p still in listen hash", __func__, inp)); 778 } 779 #endif 780 781 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 782 if (release_lctx(sc, lctx) != NULL) 783 INP_WUNLOCK(inp); 784 return (status); 785 } 786 787 /* 788 * Listening socket stopped listening earlier and now the chip tells us 789 * it has started the hardware listener. Stop it; the lctx will be 790 * released in do_close_server_rpl. 791 */ 792 if (inp->inp_flags & INP_DROPPED) { 793 destroy_server(sc, lctx); 794 INP_WUNLOCK(inp); 795 return (status); 796 } 797 798 /* 799 * Failed to start hardware listener. Take inp out of the hash and 800 * release our reference on it. An error message has been logged 801 * already. 802 */ 803 if (status != CPL_ERR_NONE) { 804 listen_hash_del(sc, inp); 805 if (release_lctx(sc, lctx) != NULL) 806 INP_WUNLOCK(inp); 807 return (status); 808 } 809 810 /* hardware listener open for business */ 811 812 INP_WUNLOCK(inp); 813 return (status); 814 } 815 816 static int 817 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 818 struct mbuf *m) 819 { 820 struct adapter *sc = iq->adapter; 821 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 822 int stid = GET_TID(cpl); 823 unsigned int status = cpl->status; 824 struct listen_ctx *lctx = lookup_stid(sc, stid); 825 struct inpcb *inp = lctx->inp; 826 #ifdef INVARIANTS 827 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 828 #endif 829 830 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 831 ("%s: unexpected opcode 0x%x", __func__, opcode)); 832 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 833 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 834 835 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 836 837 if (status != CPL_ERR_NONE) { 838 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 839 __func__, status, stid); 840 return (status); 841 } 842 843 INP_WLOCK(inp); 844 inp = release_lctx(sc, lctx); 845 if (inp != NULL) 846 INP_WUNLOCK(inp); 847 848 return (status); 849 } 850 851 static void 852 done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 853 { 854 struct listen_ctx *lctx = synqe->lctx; 855 struct inpcb *inp = lctx->inp; 856 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 857 int ntids; 858 859 INP_WLOCK_ASSERT(inp); 860 ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; 861 862 remove_tid(sc, synqe->tid, ntids); 863 release_tid(sc, synqe->tid, lctx->ctrlq); 864 t4_l2t_release(e); 865 inp = release_synqe(sc, synqe); 866 if (inp) 867 INP_WUNLOCK(inp); 868 } 869 870 void 871 synack_failure_cleanup(struct adapter *sc, int tid) 872 { 873 struct synq_entry *synqe = lookup_tid(sc, tid); 874 875 INP_WLOCK(synqe->lctx->inp); 876 done_with_synqe(sc, synqe); 877 } 878 879 int 880 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 881 struct mbuf *m) 882 { 883 struct adapter *sc = iq->adapter; 884 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 885 unsigned int tid = GET_TID(cpl); 886 struct synq_entry *synqe = lookup_tid(sc, tid); 887 struct listen_ctx *lctx = synqe->lctx; 888 struct inpcb *inp = lctx->inp; 889 struct sge_ofld_txq *ofld_txq; 890 #ifdef INVARIANTS 891 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 892 #endif 893 894 KASSERT(opcode == CPL_ABORT_REQ_RSS, 895 ("%s: unexpected opcode 0x%x", __func__, opcode)); 896 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 897 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 898 899 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 900 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 901 902 if (negative_advice(cpl->status)) 903 return (0); /* Ignore negative advice */ 904 905 INP_WLOCK(inp); 906 907 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 908 909 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 910 send_flowc_wr_synqe(sc, synqe); 911 912 /* 913 * If we'd initiated an abort earlier the reply to it is responsible for 914 * cleaning up resources. Otherwise we tear everything down right here 915 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 916 */ 917 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 918 INP_WUNLOCK(inp); 919 goto done; 920 } 921 922 done_with_synqe(sc, synqe); 923 /* inp lock released by done_with_synqe */ 924 done: 925 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 926 return (0); 927 } 928 929 int 930 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 931 struct mbuf *m) 932 { 933 struct adapter *sc = iq->adapter; 934 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 935 unsigned int tid = GET_TID(cpl); 936 struct synq_entry *synqe = lookup_tid(sc, tid); 937 struct listen_ctx *lctx = synqe->lctx; 938 struct inpcb *inp = lctx->inp; 939 #ifdef INVARIANTS 940 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 941 #endif 942 943 KASSERT(opcode == CPL_ABORT_RPL_RSS, 944 ("%s: unexpected opcode 0x%x", __func__, opcode)); 945 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 946 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 947 948 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 949 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 950 951 INP_WLOCK(inp); 952 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 953 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 954 __func__, synqe, synqe->flags)); 955 956 done_with_synqe(sc, synqe); 957 /* inp lock released by done_with_synqe */ 958 959 return (0); 960 } 961 962 void 963 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 964 { 965 struct adapter *sc = tod->tod_softc; 966 struct synq_entry *synqe = arg; 967 struct inpcb *inp = sotoinpcb(so); 968 struct toepcb *toep = synqe->toep; 969 970 NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ 971 INP_WLOCK_ASSERT(inp); 972 KASSERT(synqe->flags & TPF_SYNQE, 973 ("%s: %p not a synq_entry?", __func__, arg)); 974 MPASS(toep->tid == synqe->tid); 975 976 offload_socket(so, toep); 977 make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); 978 toep->flags |= TPF_CPL_PENDING; 979 update_tid(sc, synqe->tid, toep); 980 synqe->flags |= TPF_SYNQE_EXPANDED; 981 inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 982 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; 983 inp->inp_flowid = synqe->rss_hash; 984 } 985 986 static void 987 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 988 { 989 bzero(to, sizeof(*to)); 990 991 if (t4opt->mss) { 992 to->to_flags |= TOF_MSS; 993 to->to_mss = be16toh(t4opt->mss); 994 } 995 996 if (t4opt->wsf > 0 && t4opt->wsf < 15) { 997 to->to_flags |= TOF_SCALE; 998 to->to_wscale = t4opt->wsf; 999 } 1000 1001 if (t4opt->tstamp) 1002 to->to_flags |= TOF_TS; 1003 1004 if (t4opt->sack) 1005 to->to_flags |= TOF_SACKPERM; 1006 } 1007 1008 static bool 1009 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) 1010 { 1011 u_int hlen = be32toh(cpl->hdr_len); 1012 1013 if (chip_id(sc) >= CHELSIO_T6) 1014 return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1015 else 1016 return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1017 } 1018 1019 static void 1020 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1021 struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) 1022 { 1023 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1024 const struct ether_header *eh; 1025 unsigned int hlen = be32toh(cpl->hdr_len); 1026 uintptr_t l3hdr; 1027 const struct tcphdr *tcp; 1028 1029 eh = (const void *)(cpl + 1); 1030 if (chip_id(sc) >= CHELSIO_T6) { 1031 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1032 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1033 } else { 1034 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1035 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1036 } 1037 1038 /* extract TOS (DiffServ + ECN) byte for AccECN */ 1039 if (iptos) { 1040 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1041 const struct ip *ip = (const void *)l3hdr; 1042 *iptos = ip->ip_tos; 1043 } 1044 #ifdef INET6 1045 else 1046 if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { 1047 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1048 *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1049 } 1050 #endif /* INET */ 1051 } 1052 1053 if (inc) { 1054 bzero(inc, sizeof(*inc)); 1055 inc->inc_fport = tcp->th_sport; 1056 inc->inc_lport = tcp->th_dport; 1057 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1058 const struct ip *ip = (const void *)l3hdr; 1059 1060 inc->inc_faddr = ip->ip_src; 1061 inc->inc_laddr = ip->ip_dst; 1062 } else { 1063 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1064 1065 inc->inc_flags |= INC_ISIPV6; 1066 inc->inc6_faddr = ip6->ip6_src; 1067 inc->inc6_laddr = ip6->ip6_dst; 1068 } 1069 } 1070 1071 if (th) { 1072 bcopy(tcp, th, sizeof(*th)); 1073 tcp_fields_to_host(th); /* just like tcp_input */ 1074 } 1075 } 1076 1077 static struct l2t_entry * 1078 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, 1079 struct in_conninfo *inc) 1080 { 1081 struct l2t_entry *e; 1082 struct sockaddr_in6 sin6; 1083 struct sockaddr *dst = (void *)&sin6; 1084 struct nhop_object *nh; 1085 1086 if (inc->inc_flags & INC_ISIPV6) { 1087 bzero(dst, sizeof(struct sockaddr_in6)); 1088 dst->sa_len = sizeof(struct sockaddr_in6); 1089 dst->sa_family = AF_INET6; 1090 1091 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1092 /* no need for route lookup */ 1093 e = t4_l2t_get(pi, ifp, dst); 1094 return (e); 1095 } 1096 1097 nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); 1098 if (nh == NULL) 1099 return (NULL); 1100 if (nh->nh_ifp != ifp) 1101 return (NULL); 1102 if (nh->nh_flags & NHF_GATEWAY) 1103 ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; 1104 else 1105 ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; 1106 } else { 1107 dst->sa_len = sizeof(struct sockaddr_in); 1108 dst->sa_family = AF_INET; 1109 1110 nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); 1111 if (nh == NULL) 1112 return (NULL); 1113 if (nh->nh_ifp != ifp) 1114 return (NULL); 1115 if (nh->nh_flags & NHF_GATEWAY) 1116 if (nh->gw_sa.sa_family == AF_INET) 1117 ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; 1118 else 1119 *((struct sockaddr_in6 *)dst) = nh->gw6_sa; 1120 else 1121 ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; 1122 } 1123 1124 e = t4_l2t_get(pi, ifp, dst); 1125 return (e); 1126 } 1127 1128 static int 1129 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, 1130 uint32_t opt2, int tid) 1131 { 1132 struct wrqe *wr; 1133 struct cpl_pass_accept_rpl *rpl; 1134 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1135 1136 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : 1137 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); 1138 if (wr == NULL) 1139 return (ENOMEM); 1140 rpl = wrtod(wr); 1141 1142 if (is_t4(sc)) 1143 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); 1144 else { 1145 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; 1146 1147 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); 1148 rpl5->iss = htobe32(synqe->iss); 1149 } 1150 rpl->opt0 = opt0; 1151 rpl->opt2 = opt2; 1152 1153 return (t4_l2t_send(sc, wr, e)); 1154 } 1155 1156 #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ 1157 if (!tunnel) { \ 1158 m_freem(m); \ 1159 m = NULL; \ 1160 } \ 1161 reject_reason = __LINE__; \ 1162 goto reject; \ 1163 } while (0) 1164 1165 /* 1166 * The context associated with a tid entry via insert_tid could be a synq_entry 1167 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1168 */ 1169 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1170 1171 /* 1172 * Incoming SYN on a listening socket. 1173 * 1174 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1175 * etc. 1176 */ 1177 static int 1178 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, 1179 struct mbuf *m) 1180 { 1181 struct adapter *sc = iq->adapter; 1182 struct toedev *tod; 1183 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1184 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1185 unsigned int tid = GET_TID(cpl); 1186 struct listen_ctx *lctx = lookup_stid(sc, stid); 1187 struct inpcb *inp; 1188 struct socket *so; 1189 struct in_conninfo inc; 1190 struct tcphdr th; 1191 struct tcpopt to; 1192 struct port_info *pi; 1193 struct vi_info *vi; 1194 struct ifnet *hw_ifp, *ifp; 1195 struct l2t_entry *e = NULL; 1196 struct synq_entry *synqe = NULL; 1197 int reject_reason, v, ntids; 1198 uint16_t vid, l2info; 1199 struct epoch_tracker et; 1200 #ifdef INVARIANTS 1201 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1202 #endif 1203 struct offload_settings settings; 1204 uint8_t iptos; 1205 1206 KASSERT(opcode == CPL_PASS_ACCEPT_REQ, 1207 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1208 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1209 1210 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, 1211 lctx); 1212 1213 /* 1214 * Figure out the port the SYN arrived on. We'll look for an exact VI 1215 * match in a bit but in case we don't find any we'll use the main VI as 1216 * the incoming ifnet. 1217 */ 1218 l2info = be16toh(cpl->l2info); 1219 pi = sc->port[G_SYN_INTF(l2info)]; 1220 hw_ifp = pi->vi[0].ifp; 1221 m->m_pkthdr.rcvif = hw_ifp; 1222 1223 CURVNET_SET(lctx->vnet); /* before any potential REJECT */ 1224 1225 /* 1226 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will 1227 * also hit the listener. We don't want to offload those. 1228 */ 1229 if (encapsulated_syn(sc, cpl)) { 1230 REJECT_PASS_ACCEPT_REQ(true); 1231 } 1232 1233 /* 1234 * Use the MAC index to lookup the associated VI. If this SYN didn't 1235 * match a perfect MAC filter, punt. 1236 */ 1237 if (!(l2info & F_SYN_XACT_MATCH)) { 1238 REJECT_PASS_ACCEPT_REQ(true); 1239 } 1240 for_each_vi(pi, v, vi) { 1241 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) 1242 goto found; 1243 } 1244 REJECT_PASS_ACCEPT_REQ(true); 1245 found: 1246 hw_ifp = vi->ifp; /* the cxgbe ifnet */ 1247 m->m_pkthdr.rcvif = hw_ifp; 1248 tod = TOEDEV(hw_ifp); 1249 1250 /* 1251 * Don't offload if the peer requested a TCP option that's not known to 1252 * the silicon. Send the SYN to the kernel instead. 1253 */ 1254 if (__predict_false(cpl->tcpopt.unknown)) 1255 REJECT_PASS_ACCEPT_REQ(true); 1256 1257 /* 1258 * Figure out if there is a pseudo interface (vlan, lagg, etc.) 1259 * involved. Don't offload if the SYN had a VLAN tag and the vid 1260 * doesn't match anything on this interface. 1261 * 1262 * XXX: lagg support, lagg + vlan support. 1263 */ 1264 vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); 1265 if (vid != 0xfff && vid != 0) { 1266 ifp = VLAN_DEVAT(hw_ifp, vid); 1267 if (ifp == NULL) 1268 REJECT_PASS_ACCEPT_REQ(true); 1269 } else 1270 ifp = hw_ifp; 1271 1272 /* 1273 * Don't offload if the ifnet that the SYN came in on is not in the same 1274 * vnet as the listening socket. 1275 */ 1276 if (lctx->vnet != ifp->if_vnet) 1277 REJECT_PASS_ACCEPT_REQ(true); 1278 1279 pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); 1280 if (inc.inc_flags & INC_ISIPV6) { 1281 1282 /* Don't offload if the ifcap isn't enabled */ 1283 if ((ifp->if_capenable & IFCAP_TOE6) == 0) 1284 REJECT_PASS_ACCEPT_REQ(true); 1285 1286 /* 1287 * SYN must be directed to an IP6 address on this ifnet. This 1288 * is more restrictive than in6_localip. 1289 */ 1290 NET_EPOCH_ENTER(et); 1291 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { 1292 NET_EPOCH_EXIT(et); 1293 REJECT_PASS_ACCEPT_REQ(true); 1294 } 1295 1296 ntids = 2; 1297 } else { 1298 1299 /* Don't offload if the ifcap isn't enabled */ 1300 if ((ifp->if_capenable & IFCAP_TOE4) == 0) 1301 REJECT_PASS_ACCEPT_REQ(true); 1302 1303 /* 1304 * SYN must be directed to an IP address on this ifnet. This 1305 * is more restrictive than in_localip. 1306 */ 1307 NET_EPOCH_ENTER(et); 1308 if (!in_ifhasaddr(ifp, inc.inc_laddr)) { 1309 NET_EPOCH_EXIT(et); 1310 REJECT_PASS_ACCEPT_REQ(true); 1311 } 1312 1313 ntids = 1; 1314 } 1315 1316 e = get_l2te_for_nexthop(pi, ifp, &inc); 1317 if (e == NULL) { 1318 NET_EPOCH_EXIT(et); 1319 REJECT_PASS_ACCEPT_REQ(true); 1320 } 1321 1322 /* Don't offload if the 4-tuple is already in use */ 1323 if (toe_4tuple_check(&inc, &th, ifp) != 0) { 1324 NET_EPOCH_EXIT(et); 1325 REJECT_PASS_ACCEPT_REQ(false); 1326 } 1327 1328 inp = lctx->inp; /* listening socket, not owned by TOE */ 1329 INP_RLOCK(inp); 1330 1331 /* Don't offload if the listening socket has closed */ 1332 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1333 INP_RUNLOCK(inp); 1334 NET_EPOCH_EXIT(et); 1335 REJECT_PASS_ACCEPT_REQ(false); 1336 } 1337 so = inp->inp_socket; 1338 rw_rlock(&sc->policy_lock); 1339 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 1340 EVL_MAKETAG(0xfff, 0, 0), inp); 1341 rw_runlock(&sc->policy_lock); 1342 if (!settings.offload) { 1343 INP_RUNLOCK(inp); 1344 NET_EPOCH_EXIT(et); 1345 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ 1346 } 1347 1348 synqe = alloc_synqe(sc, lctx, M_NOWAIT); 1349 if (synqe == NULL) { 1350 INP_RUNLOCK(inp); 1351 NET_EPOCH_EXIT(et); 1352 REJECT_PASS_ACCEPT_REQ(true); 1353 } 1354 MPASS(rss->hash_type == RSS_HASH_TCP); 1355 synqe->rss_hash = be32toh(rss->hash_val); 1356 atomic_store_int(&synqe->ok_to_respond, 0); 1357 1358 init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, 1359 &synqe->params); 1360 1361 /* 1362 * If all goes well t4_syncache_respond will get called during 1363 * syncache_add. Note that syncache_add releases the pcb lock. 1364 */ 1365 t4opt_to_tcpopt(&cpl->tcpopt, &to); 1366 toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); 1367 1368 if (atomic_load_int(&synqe->ok_to_respond) > 0) { 1369 uint64_t opt0; 1370 uint32_t opt2; 1371 1372 opt0 = calc_options0(vi, &synqe->params); 1373 opt2 = calc_options2(vi, &synqe->params); 1374 1375 insert_tid(sc, tid, synqe, ntids); 1376 synqe->tid = tid; 1377 synqe->syn = m; 1378 m = NULL; 1379 1380 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { 1381 remove_tid(sc, tid, ntids); 1382 m = synqe->syn; 1383 synqe->syn = NULL; 1384 NET_EPOCH_EXIT(et); 1385 REJECT_PASS_ACCEPT_REQ(true); 1386 } 1387 1388 CTR6(KTR_CXGBE, 1389 "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", 1390 __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); 1391 } else { 1392 NET_EPOCH_EXIT(et); 1393 REJECT_PASS_ACCEPT_REQ(false); 1394 } 1395 1396 NET_EPOCH_EXIT(et); 1397 CURVNET_RESTORE(); 1398 return (0); 1399 reject: 1400 CURVNET_RESTORE(); 1401 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, 1402 reject_reason); 1403 1404 if (e) 1405 t4_l2t_release(e); 1406 release_tid(sc, tid, lctx->ctrlq); 1407 if (synqe) { 1408 inp = synqe->lctx->inp; 1409 INP_WLOCK(inp); 1410 inp = release_synqe(sc, synqe); 1411 if (inp) 1412 INP_WUNLOCK(inp); 1413 } 1414 1415 if (m) { 1416 /* 1417 * The connection request hit a TOE listener but is being passed 1418 * on to the kernel sw stack instead of getting offloaded. 1419 */ 1420 m_adj(m, sizeof(*cpl)); 1421 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | 1422 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 1423 m->m_pkthdr.csum_data = 0xffff; 1424 hw_ifp->if_input(hw_ifp, m); 1425 } 1426 1427 return (reject_reason); 1428 } 1429 1430 static void 1431 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, 1432 const struct cpl_pass_establish *cpl, struct in_conninfo *inc, 1433 struct tcphdr *th, struct tcpopt *to) 1434 { 1435 uint16_t tcp_opt = be16toh(cpl->tcp_opt); 1436 uint8_t iptos; 1437 1438 /* start off with the original SYN */ 1439 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); 1440 1441 /* modify parts to make it look like the ACK to our SYN|ACK */ 1442 th->th_flags = TH_ACK; 1443 th->th_ack = synqe->iss + 1; 1444 th->th_seq = be32toh(cpl->rcv_isn); 1445 bzero(to, sizeof(*to)); 1446 if (G_TCPOPT_TSTAMP(tcp_opt)) { 1447 to->to_flags |= TOF_TS; 1448 to->to_tsecr = synqe->ts; 1449 } 1450 } 1451 1452 static int 1453 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, 1454 struct mbuf *m) 1455 { 1456 struct adapter *sc = iq->adapter; 1457 struct vi_info *vi; 1458 struct ifnet *ifp; 1459 const struct cpl_pass_establish *cpl = (const void *)(rss + 1); 1460 #if defined(KTR) || defined(INVARIANTS) 1461 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1462 #endif 1463 unsigned int tid = GET_TID(cpl); 1464 struct synq_entry *synqe = lookup_tid(sc, tid); 1465 struct listen_ctx *lctx = synqe->lctx; 1466 struct inpcb *inp = lctx->inp, *new_inp; 1467 struct socket *so; 1468 struct tcphdr th; 1469 struct tcpopt to; 1470 struct in_conninfo inc; 1471 struct toepcb *toep; 1472 struct epoch_tracker et; 1473 int rstreason; 1474 #ifdef INVARIANTS 1475 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1476 #endif 1477 1478 KASSERT(opcode == CPL_PASS_ESTABLISH, 1479 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1480 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1481 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1482 KASSERT(synqe->flags & TPF_SYNQE, 1483 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); 1484 1485 CURVNET_SET(lctx->vnet); 1486 NET_EPOCH_ENTER(et); /* for syncache_expand */ 1487 INP_WLOCK(inp); 1488 1489 CTR6(KTR_CXGBE, 1490 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", 1491 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); 1492 1493 ifp = synqe->syn->m_pkthdr.rcvif; 1494 vi = ifp->if_softc; 1495 KASSERT(vi->adapter == sc, 1496 ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 1497 1498 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1499 reset: 1500 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); 1501 INP_WUNLOCK(inp); 1502 NET_EPOCH_EXIT(et); 1503 CURVNET_RESTORE(); 1504 return (0); 1505 } 1506 1507 KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], 1508 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, 1509 synqe->params.rxq_idx, 1510 (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); 1511 1512 toep = alloc_toepcb(vi, M_NOWAIT); 1513 if (toep == NULL) 1514 goto reset; 1515 toep->tid = tid; 1516 toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1517 toep->vnet = lctx->vnet; 1518 bcopy(&synqe->params, &toep->params, sizeof(toep->params)); 1519 init_toepcb(vi, toep); 1520 1521 MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); 1522 MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); 1523 synqe->tcp_opt = cpl->tcp_opt; 1524 synqe->toep = toep; 1525 1526 /* Come up with something that syncache_expand should be ok with. */ 1527 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); 1528 if (inc.inc_flags & INC_ISIPV6) { 1529 if (lctx->ce == NULL) { 1530 toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); 1531 if (toep->ce == NULL) { 1532 free_toepcb(toep); 1533 goto reset; /* RST without a CLIP entry? */ 1534 } 1535 } else { 1536 t4_hold_clip_entry(sc, lctx->ce); 1537 toep->ce = lctx->ce; 1538 } 1539 } 1540 so = inp->inp_socket; 1541 KASSERT(so != NULL, ("%s: socket is NULL", __func__)); 1542 1543 rstreason = toe_syncache_expand(&inc, &to, &th, &so); 1544 if (rstreason < 0) { 1545 free_toepcb(toep); 1546 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); 1547 INP_WUNLOCK(inp); 1548 NET_EPOCH_EXIT(et); 1549 CURVNET_RESTORE(); 1550 return (0); 1551 } else if (rstreason == 0 || so == NULL) { 1552 free_toepcb(toep); 1553 goto reset; 1554 } 1555 1556 /* New connection inpcb is already locked by syncache_expand(). */ 1557 new_inp = sotoinpcb(so); 1558 INP_WLOCK_ASSERT(new_inp); 1559 MPASS(so->so_vnet == lctx->vnet); 1560 1561 /* 1562 * This is for expansion from syncookies. 1563 * 1564 * XXX: we've held the tcbinfo lock throughout so there's no risk of 1565 * anyone accept'ing a connection before we've installed our hooks, but 1566 * this somewhat defeats the purpose of having a tod_offload_socket :-( 1567 */ 1568 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { 1569 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); 1570 t4_offload_socket(TOEDEV(ifp), synqe, so); 1571 } 1572 1573 INP_WUNLOCK(new_inp); 1574 1575 /* Done with the synqe */ 1576 inp = release_synqe(sc, synqe); 1577 if (inp != NULL) 1578 INP_WUNLOCK(inp); 1579 NET_EPOCH_EXIT(et); 1580 CURVNET_RESTORE(); 1581 1582 return (0); 1583 } 1584 1585 void 1586 t4_init_listen_cpl_handlers(void) 1587 { 1588 1589 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); 1590 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); 1591 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 1592 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 1593 } 1594 1595 void 1596 t4_uninit_listen_cpl_handlers(void) 1597 { 1598 1599 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); 1600 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); 1601 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); 1602 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); 1603 } 1604 #endif 1605