1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2012 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 34 #ifdef TCP_OFFLOAD 35 #include <sys/param.h> 36 #include <sys/types.h> 37 #include <sys/kernel.h> 38 #include <sys/ktr.h> 39 #include <sys/module.h> 40 #include <sys/protosw.h> 41 #include <sys/refcount.h> 42 #include <sys/domain.h> 43 #include <sys/fnv_hash.h> 44 #include <sys/socket.h> 45 #include <sys/socketvar.h> 46 #include <sys/sysctl.h> 47 #include <net/ethernet.h> 48 #include <net/if.h> 49 #include <net/if_types.h> 50 #include <net/if_vlan_var.h> 51 #include <net/route.h> 52 #include <net/route/nhop.h> 53 #include <netinet/in.h> 54 #include <netinet/in_fib.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/ip.h> 57 #include <netinet/ip6.h> 58 #include <netinet6/in6_fib.h> 59 #include <netinet6/scope6_var.h> 60 #include <netinet/tcp_timer.h> 61 #define TCPSTATES 62 #include <netinet/tcp_fsm.h> 63 #include <netinet/tcp_var.h> 64 #include <netinet/toecore.h> 65 #include <netinet/cc/cc.h> 66 67 #include "common/common.h" 68 #include "common/t4_msg.h" 69 #include "common/t4_regs.h" 70 #include "t4_clip.h" 71 #include "tom/t4_tom_l2t.h" 72 #include "tom/t4_tom.h" 73 74 /* stid services */ 75 static int alloc_stid(struct adapter *, struct listen_ctx *, int); 76 static struct listen_ctx *lookup_stid(struct adapter *, int); 77 static void free_stid(struct adapter *, struct listen_ctx *); 78 79 /* lctx services */ 80 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 81 struct vi_info *); 82 static int free_lctx(struct adapter *, struct listen_ctx *); 83 static void hold_lctx(struct listen_ctx *); 84 static void listen_hash_add(struct adapter *, struct listen_ctx *); 85 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 86 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 87 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 88 89 static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int); 90 91 static int 92 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 93 { 94 struct tid_info *t = &sc->tids; 95 u_int stid, n, f, mask; 96 struct stid_region *sr = &lctx->stid_region; 97 98 /* 99 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 100 * the TCAM. The start of the stid region is properly aligned (the chip 101 * requires each region to be 128-cell aligned). 102 */ 103 n = isipv6 ? 2 : 1; 104 mask = n - 1; 105 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 106 ("%s: stid region (%u, %u) not properly aligned. n = %u", 107 __func__, t->stid_base, t->nstids, n)); 108 109 mtx_lock(&t->stid_lock); 110 if (n > t->nstids - t->stids_in_use) { 111 mtx_unlock(&t->stid_lock); 112 return (-1); 113 } 114 115 if (t->nstids_free_head >= n) { 116 /* 117 * This allocation will definitely succeed because the region 118 * starts at a good alignment and we just checked we have enough 119 * stids free. 120 */ 121 f = t->nstids_free_head & mask; 122 t->nstids_free_head -= n + f; 123 stid = t->nstids_free_head; 124 TAILQ_INSERT_HEAD(&t->stids, sr, link); 125 } else { 126 struct stid_region *s; 127 128 stid = t->nstids_free_head; 129 TAILQ_FOREACH(s, &t->stids, link) { 130 stid += s->used + s->free; 131 f = stid & mask; 132 if (s->free >= n + f) { 133 stid -= n + f; 134 s->free -= n + f; 135 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 136 goto allocated; 137 } 138 } 139 140 if (__predict_false(stid != t->nstids)) { 141 panic("%s: stids TAILQ (%p) corrupt." 142 " At %d instead of %d at the end of the queue.", 143 __func__, &t->stids, stid, t->nstids); 144 } 145 146 mtx_unlock(&t->stid_lock); 147 return (-1); 148 } 149 150 allocated: 151 sr->used = n; 152 sr->free = f; 153 t->stids_in_use += n; 154 t->stid_tab[stid] = lctx; 155 mtx_unlock(&t->stid_lock); 156 157 KASSERT(((stid + t->stid_base) & mask) == 0, 158 ("%s: EDOOFUS.", __func__)); 159 return (stid + t->stid_base); 160 } 161 162 static struct listen_ctx * 163 lookup_stid(struct adapter *sc, int stid) 164 { 165 struct tid_info *t = &sc->tids; 166 167 return (t->stid_tab[stid - t->stid_base]); 168 } 169 170 static void 171 free_stid(struct adapter *sc, struct listen_ctx *lctx) 172 { 173 struct tid_info *t = &sc->tids; 174 struct stid_region *sr = &lctx->stid_region; 175 struct stid_region *s; 176 177 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 178 179 mtx_lock(&t->stid_lock); 180 s = TAILQ_PREV(sr, stid_head, link); 181 if (s != NULL) 182 s->free += sr->used + sr->free; 183 else 184 t->nstids_free_head += sr->used + sr->free; 185 KASSERT(t->stids_in_use >= sr->used, 186 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 187 t->stids_in_use, sr->used)); 188 t->stids_in_use -= sr->used; 189 TAILQ_REMOVE(&t->stids, sr, link); 190 mtx_unlock(&t->stid_lock); 191 } 192 193 static struct listen_ctx * 194 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 195 { 196 struct listen_ctx *lctx; 197 198 INP_WLOCK_ASSERT(inp); 199 200 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 201 if (lctx == NULL) 202 return (NULL); 203 204 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 205 if (lctx->stid < 0) { 206 free(lctx, M_CXGBE); 207 return (NULL); 208 } 209 210 if (inp->inp_vflag & INP_IPV6 && 211 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 212 lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true); 213 if (lctx->ce == NULL) { 214 free(lctx, M_CXGBE); 215 return (NULL); 216 } 217 } 218 219 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 220 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 221 refcount_init(&lctx->refcount, 1); 222 223 lctx->inp = inp; 224 lctx->vnet = inp->inp_socket->so_vnet; 225 in_pcbref(inp); 226 227 return (lctx); 228 } 229 230 /* Don't call this directly, use release_lctx instead */ 231 static int 232 free_lctx(struct adapter *sc, struct listen_ctx *lctx) 233 { 234 struct inpcb *inp = lctx->inp; 235 236 INP_WLOCK_ASSERT(inp); 237 KASSERT(lctx->refcount == 0, 238 ("%s: refcount %d", __func__, lctx->refcount)); 239 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 240 241 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 242 __func__, lctx->stid, lctx, lctx->inp); 243 244 if (lctx->ce) 245 t4_release_clip_entry(sc, lctx->ce); 246 free_stid(sc, lctx); 247 free(lctx, M_CXGBE); 248 249 return (in_pcbrele_wlocked(inp)); 250 } 251 252 static void 253 hold_lctx(struct listen_ctx *lctx) 254 { 255 256 refcount_acquire(&lctx->refcount); 257 } 258 259 static inline uint32_t 260 listen_hashfn(void *key, u_long mask) 261 { 262 263 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 264 } 265 266 /* 267 * Add a listen_ctx entry to the listen hash table. 268 */ 269 static void 270 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 271 { 272 struct tom_data *td = sc->tom_softc; 273 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 274 275 mtx_lock(&td->lctx_hash_lock); 276 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 277 td->lctx_count++; 278 mtx_unlock(&td->lctx_hash_lock); 279 } 280 281 /* 282 * Look for the listening socket's context entry in the hash and return it. 283 */ 284 static struct listen_ctx * 285 listen_hash_find(struct adapter *sc, struct inpcb *inp) 286 { 287 struct tom_data *td = sc->tom_softc; 288 int bucket = listen_hashfn(inp, td->listen_mask); 289 struct listen_ctx *lctx; 290 291 mtx_lock(&td->lctx_hash_lock); 292 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 293 if (lctx->inp == inp) 294 break; 295 } 296 mtx_unlock(&td->lctx_hash_lock); 297 298 return (lctx); 299 } 300 301 /* 302 * Removes the listen_ctx structure for inp from the hash and returns it. 303 */ 304 static struct listen_ctx * 305 listen_hash_del(struct adapter *sc, struct inpcb *inp) 306 { 307 struct tom_data *td = sc->tom_softc; 308 int bucket = listen_hashfn(inp, td->listen_mask); 309 struct listen_ctx *lctx, *l; 310 311 mtx_lock(&td->lctx_hash_lock); 312 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 313 if (lctx->inp == inp) { 314 LIST_REMOVE(lctx, link); 315 td->lctx_count--; 316 break; 317 } 318 } 319 mtx_unlock(&td->lctx_hash_lock); 320 321 return (lctx); 322 } 323 324 /* 325 * Releases a hold on the lctx. Must be called with the listening socket's inp 326 * locked. The inp may be freed by this function and it returns NULL to 327 * indicate this. 328 */ 329 static struct inpcb * 330 release_lctx(struct adapter *sc, struct listen_ctx *lctx) 331 { 332 struct inpcb *inp = lctx->inp; 333 int inp_freed = 0; 334 335 INP_WLOCK_ASSERT(inp); 336 if (refcount_release(&lctx->refcount)) 337 inp_freed = free_lctx(sc, lctx); 338 339 return (inp_freed ? NULL : inp); 340 } 341 342 static void 343 send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe) 344 { 345 struct mbuf *m = synqe->syn; 346 if_t ifp = m->m_pkthdr.rcvif; 347 struct vi_info *vi = if_getsoftc(ifp); 348 struct port_info *pi = vi->pi; 349 struct wrqe *wr; 350 struct fw_flowc_wr *flowc; 351 struct sge_ofld_txq *ofld_txq; 352 struct sge_ofld_rxq *ofld_rxq; 353 const int nparams = 6; 354 const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 355 const u_int pfvf = sc->pf << S_FW_VIID_PFN; 356 357 INP_WLOCK_ASSERT(synqe->lctx->inp); 358 MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0); 359 360 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 361 ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx]; 362 363 wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq); 364 if (wr == NULL) { 365 /* XXX */ 366 panic("%s: allocation failure.", __func__); 367 } 368 flowc = wrtod(wr); 369 memset(flowc, 0, wr->wr_len); 370 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 371 V_FW_FLOWC_WR_NPARAMS(nparams)); 372 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 373 V_FW_WR_FLOWID(synqe->tid)); 374 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 375 flowc->mnemval[0].val = htobe32(pfvf); 376 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 377 flowc->mnemval[1].val = htobe32(pi->tx_chan); 378 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 379 flowc->mnemval[2].val = htobe32(pi->tx_chan); 380 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 381 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 382 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 383 flowc->mnemval[4].val = htobe32(512); 384 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 385 flowc->mnemval[5].val = htobe32(512); 386 387 synqe->flags |= TPF_FLOWC_WR_SENT; 388 t4_wrq_tx(sc, wr); 389 } 390 391 static void 392 send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe, 393 int rst_status) 394 { 395 struct adapter *sc = tod->tod_softc; 396 struct wrqe *wr; 397 struct cpl_abort_req *req; 398 399 INP_WLOCK_ASSERT(synqe->lctx->inp); 400 401 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 402 __func__, synqe, synqe->flags, synqe->tid, 403 synqe->flags & TPF_ABORT_SHUTDOWN ? 404 " (abort already in progress)" : ""); 405 if (synqe->flags & TPF_ABORT_SHUTDOWN) 406 return; /* abort already in progress */ 407 synqe->flags |= TPF_ABORT_SHUTDOWN; 408 409 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 410 send_flowc_wr_synqe(sc, synqe); 411 412 wr = alloc_wrqe(sizeof(*req), 413 &sc->sge.ofld_txq[synqe->params.txq_idx].wrq); 414 if (wr == NULL) { 415 /* XXX */ 416 panic("%s: allocation failure.", __func__); 417 } 418 req = wrtod(wr); 419 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 420 req->rsvd0 = 0; /* don't have a snd_nxt */ 421 req->rsvd1 = 1; /* no data sent yet */ 422 req->cmd = rst_status; 423 424 t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]); 425 } 426 427 static int 428 create_server(struct adapter *sc, struct listen_ctx *lctx) 429 { 430 struct wrqe *wr; 431 struct cpl_pass_open_req *req; 432 struct inpcb *inp = lctx->inp; 433 434 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 435 if (wr == NULL) { 436 log(LOG_ERR, "%s: allocation failure", __func__); 437 return (ENOMEM); 438 } 439 req = wrtod(wr); 440 441 INIT_TP_WR(req, 0); 442 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 443 req->local_port = inp->inp_lport; 444 req->peer_port = 0; 445 req->local_ip = inp->inp_laddr.s_addr; 446 req->peer_ip = 0; 447 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 448 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 449 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 450 451 t4_wrq_tx(sc, wr); 452 return (0); 453 } 454 455 static int 456 create_server6(struct adapter *sc, struct listen_ctx *lctx) 457 { 458 struct wrqe *wr; 459 struct cpl_pass_open_req6 *req; 460 struct inpcb *inp = lctx->inp; 461 462 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 463 if (wr == NULL) { 464 log(LOG_ERR, "%s: allocation failure", __func__); 465 return (ENOMEM); 466 } 467 req = wrtod(wr); 468 469 INIT_TP_WR(req, 0); 470 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 471 req->local_port = inp->inp_lport; 472 req->peer_port = 0; 473 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 474 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 475 req->peer_ip_hi = 0; 476 req->peer_ip_lo = 0; 477 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 478 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 479 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 480 481 t4_wrq_tx(sc, wr); 482 return (0); 483 } 484 485 static int 486 destroy_server(struct adapter *sc, struct listen_ctx *lctx) 487 { 488 struct wrqe *wr; 489 struct cpl_close_listsvr_req *req; 490 491 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 492 if (wr == NULL) { 493 /* XXX */ 494 panic("%s: allocation failure.", __func__); 495 } 496 req = wrtod(wr); 497 498 INIT_TP_WR(req, 0); 499 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 500 lctx->stid)); 501 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 502 req->rsvd = htobe16(0); 503 504 t4_wrq_tx(sc, wr); 505 return (0); 506 } 507 508 /* 509 * Start a listening server by sending a passive open request to HW. 510 * 511 * Can't take adapter lock here and access to sc->flags, 512 * sc->offload_map, if_capenable are all race prone. 513 */ 514 int 515 t4_listen_start(struct toedev *tod, struct tcpcb *tp) 516 { 517 struct adapter *sc = tod->tod_softc; 518 struct vi_info *vi; 519 struct port_info *pi; 520 struct inpcb *inp = tptoinpcb(tp); 521 struct listen_ctx *lctx; 522 int i, rc, v; 523 struct offload_settings settings; 524 525 INP_WLOCK_ASSERT(inp); 526 527 rw_rlock(&sc->policy_lock); 528 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 529 EVL_MAKETAG(0xfff, 0, 0), inp); 530 rw_runlock(&sc->policy_lock); 531 if (!settings.offload) 532 return (0); 533 534 /* Don't start a hardware listener for any loopback address. */ 535 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 536 return (0); 537 if (!(inp->inp_vflag & INP_IPV6) && 538 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 539 return (0); 540 if (sc->flags & KERN_TLS_ON) 541 return (0); 542 #if 0 543 ADAPTER_LOCK(sc); 544 if (IS_BUSY(sc)) { 545 log(LOG_ERR, "%s: listen request ignored, %s is busy", 546 __func__, device_get_nameunit(sc->dev)); 547 goto done; 548 } 549 550 KASSERT(uld_active(sc, ULD_TOM), 551 ("%s: TOM not initialized", __func__)); 552 #endif 553 554 /* 555 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 556 * such VI's queues to send the passive open and receive the reply to 557 * it. 558 * 559 * XXX: need a way to mark a port in use by offload. if_cxgbe should 560 * then reject any attempt to bring down such a port (and maybe reject 561 * attempts to disable IFCAP_TOE on that port too?). 562 */ 563 for_each_port(sc, i) { 564 pi = sc->port[i]; 565 for_each_vi(pi, v, vi) { 566 if (vi->flags & VI_INIT_DONE && 567 if_getcapenable(vi->ifp) & IFCAP_TOE) 568 goto found; 569 } 570 } 571 goto done; /* no port that's UP with IFCAP_TOE enabled */ 572 found: 573 574 if (listen_hash_find(sc, inp) != NULL) 575 goto done; /* already setup */ 576 577 lctx = alloc_lctx(sc, inp, vi); 578 if (lctx == NULL) { 579 log(LOG_ERR, 580 "%s: listen request ignored, %s couldn't allocate lctx\n", 581 __func__, device_get_nameunit(sc->dev)); 582 goto done; 583 } 584 listen_hash_add(sc, lctx); 585 586 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 587 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 588 inp->inp_vflag); 589 590 if (inp->inp_vflag & INP_IPV6) 591 rc = create_server6(sc, lctx); 592 else 593 rc = create_server(sc, lctx); 594 if (rc != 0) { 595 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 596 __func__, device_get_nameunit(sc->dev), rc); 597 (void) listen_hash_del(sc, inp); 598 inp = release_lctx(sc, lctx); 599 /* can't be freed, host stack has a reference */ 600 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 601 goto done; 602 } 603 lctx->flags |= LCTX_RPL_PENDING; 604 done: 605 #if 0 606 ADAPTER_UNLOCK(sc); 607 #endif 608 return (0); 609 } 610 611 int 612 t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 613 { 614 struct listen_ctx *lctx; 615 struct adapter *sc = tod->tod_softc; 616 struct inpcb *inp = tptoinpcb(tp); 617 618 INP_WLOCK_ASSERT(inp); 619 620 lctx = listen_hash_del(sc, inp); 621 if (lctx == NULL) 622 return (ENOENT); /* no hardware listener for this inp */ 623 624 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 625 lctx, lctx->flags); 626 627 /* 628 * If the reply to the PASS_OPEN is still pending we'll wait for it to 629 * arrive and clean up when it does. 630 */ 631 if (lctx->flags & LCTX_RPL_PENDING) { 632 return (EINPROGRESS); 633 } 634 635 destroy_server(sc, lctx); 636 return (0); 637 } 638 639 static inline struct synq_entry * 640 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) 641 { 642 struct synq_entry *synqe; 643 644 INP_RLOCK_ASSERT(lctx->inp); 645 MPASS(flags == M_WAITOK || flags == M_NOWAIT); 646 647 synqe = malloc(sizeof(*synqe), M_CXGBE, flags); 648 if (__predict_true(synqe != NULL)) { 649 synqe->flags = TPF_SYNQE; 650 refcount_init(&synqe->refcnt, 1); 651 synqe->lctx = lctx; 652 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ 653 synqe->syn = NULL; 654 } 655 656 return (synqe); 657 } 658 659 static inline void 660 hold_synqe(struct synq_entry *synqe) 661 { 662 663 refcount_acquire(&synqe->refcnt); 664 } 665 666 static inline struct inpcb * 667 release_synqe(struct adapter *sc, struct synq_entry *synqe) 668 { 669 struct inpcb *inp; 670 671 MPASS(synqe->flags & TPF_SYNQE); 672 MPASS(synqe->lctx != NULL); 673 674 inp = synqe->lctx->inp; 675 MPASS(inp != NULL); 676 INP_WLOCK_ASSERT(inp); 677 678 if (refcount_release(&synqe->refcnt)) { 679 inp = release_lctx(sc, synqe->lctx); 680 m_freem(synqe->syn); 681 free(synqe, M_CXGBE); 682 } 683 684 return (inp); 685 } 686 687 void 688 t4_syncache_added(struct toedev *tod __unused, void *arg) 689 { 690 struct synq_entry *synqe = arg; 691 692 hold_synqe(synqe); 693 } 694 695 void 696 t4_syncache_removed(struct toedev *tod, void *arg) 697 { 698 struct adapter *sc = tod->tod_softc; 699 struct synq_entry *synqe = arg; 700 struct inpcb *inp = synqe->lctx->inp; 701 702 /* 703 * XXX: this is a LOR but harmless when running from the softclock. 704 */ 705 INP_WLOCK(inp); 706 inp = release_synqe(sc, synqe); 707 if (inp != NULL) 708 INP_WUNLOCK(inp); 709 } 710 711 int 712 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 713 { 714 struct synq_entry *synqe = arg; 715 716 if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { 717 struct tcpopt to; 718 struct ip *ip = mtod(m, struct ip *); 719 struct tcphdr *th; 720 721 if (ip->ip_v == IPVERSION) 722 th = (void *)(ip + 1); 723 else 724 th = (void *)((struct ip6_hdr *)ip + 1); 725 bzero(&to, sizeof(to)); 726 tcp_dooptions(&to, (void *)(th + 1), 727 (th->th_off << 2) - sizeof(*th), TO_SYN); 728 729 /* save these for later */ 730 synqe->iss = be32toh(th->th_seq); 731 synqe->irs = be32toh(th->th_ack) - 1; 732 synqe->ts = to.to_tsval; 733 } 734 735 m_freem(m); /* don't need this any more */ 736 return (0); 737 } 738 739 static int 740 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 741 struct mbuf *m) 742 { 743 struct adapter *sc = iq->adapter; 744 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 745 int stid = GET_TID(cpl); 746 unsigned int status = cpl->status; 747 struct listen_ctx *lctx = lookup_stid(sc, stid); 748 struct inpcb *inp = lctx->inp; 749 #ifdef INVARIANTS 750 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 751 #endif 752 753 KASSERT(opcode == CPL_PASS_OPEN_RPL, 754 ("%s: unexpected opcode 0x%x", __func__, opcode)); 755 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 756 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 757 758 INP_WLOCK(inp); 759 760 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 761 __func__, stid, status, lctx->flags); 762 763 lctx->flags &= ~LCTX_RPL_PENDING; 764 765 if (status != CPL_ERR_NONE) 766 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 767 768 #ifdef INVARIANTS 769 /* 770 * If the inp has been dropped (listening socket closed) then 771 * listen_stop must have run and taken the inp out of the hash. 772 */ 773 if (inp->inp_flags & INP_DROPPED) { 774 KASSERT(listen_hash_del(sc, inp) == NULL, 775 ("%s: inp %p still in listen hash", __func__, inp)); 776 } 777 #endif 778 779 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 780 if (release_lctx(sc, lctx) != NULL) 781 INP_WUNLOCK(inp); 782 return (status); 783 } 784 785 /* 786 * Listening socket stopped listening earlier and now the chip tells us 787 * it has started the hardware listener. Stop it; the lctx will be 788 * released in do_close_server_rpl. 789 */ 790 if (inp->inp_flags & INP_DROPPED) { 791 destroy_server(sc, lctx); 792 INP_WUNLOCK(inp); 793 return (status); 794 } 795 796 /* 797 * Failed to start hardware listener. Take inp out of the hash and 798 * release our reference on it. An error message has been logged 799 * already. 800 */ 801 if (status != CPL_ERR_NONE) { 802 listen_hash_del(sc, inp); 803 if (release_lctx(sc, lctx) != NULL) 804 INP_WUNLOCK(inp); 805 return (status); 806 } 807 808 /* hardware listener open for business */ 809 810 INP_WUNLOCK(inp); 811 return (status); 812 } 813 814 static int 815 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 816 struct mbuf *m) 817 { 818 struct adapter *sc = iq->adapter; 819 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 820 int stid = GET_TID(cpl); 821 unsigned int status = cpl->status; 822 struct listen_ctx *lctx = lookup_stid(sc, stid); 823 struct inpcb *inp = lctx->inp; 824 #ifdef INVARIANTS 825 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 826 #endif 827 828 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 829 ("%s: unexpected opcode 0x%x", __func__, opcode)); 830 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 831 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 832 833 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 834 835 if (status != CPL_ERR_NONE) { 836 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 837 __func__, status, stid); 838 return (status); 839 } 840 841 INP_WLOCK(inp); 842 inp = release_lctx(sc, lctx); 843 if (inp != NULL) 844 INP_WUNLOCK(inp); 845 846 return (status); 847 } 848 849 static void 850 done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 851 { 852 struct listen_ctx *lctx = synqe->lctx; 853 struct inpcb *inp = lctx->inp; 854 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 855 int ntids; 856 857 INP_WLOCK_ASSERT(inp); 858 ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; 859 860 remove_tid(sc, synqe->tid, ntids); 861 release_tid(sc, synqe->tid, lctx->ctrlq); 862 t4_l2t_release(e); 863 inp = release_synqe(sc, synqe); 864 if (inp) 865 INP_WUNLOCK(inp); 866 } 867 868 void 869 synack_failure_cleanup(struct adapter *sc, int tid) 870 { 871 struct synq_entry *synqe = lookup_tid(sc, tid); 872 873 INP_WLOCK(synqe->lctx->inp); 874 done_with_synqe(sc, synqe); 875 } 876 877 int 878 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 879 struct mbuf *m) 880 { 881 struct adapter *sc = iq->adapter; 882 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 883 unsigned int tid = GET_TID(cpl); 884 struct synq_entry *synqe = lookup_tid(sc, tid); 885 struct listen_ctx *lctx = synqe->lctx; 886 struct inpcb *inp = lctx->inp; 887 struct sge_ofld_txq *ofld_txq; 888 #ifdef INVARIANTS 889 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 890 #endif 891 892 KASSERT(opcode == CPL_ABORT_REQ_RSS, 893 ("%s: unexpected opcode 0x%x", __func__, opcode)); 894 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 895 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 896 897 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 898 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 899 900 if (negative_advice(cpl->status)) 901 return (0); /* Ignore negative advice */ 902 903 INP_WLOCK(inp); 904 905 ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx]; 906 907 if (!(synqe->flags & TPF_FLOWC_WR_SENT)) 908 send_flowc_wr_synqe(sc, synqe); 909 910 /* 911 * If we'd initiated an abort earlier the reply to it is responsible for 912 * cleaning up resources. Otherwise we tear everything down right here 913 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 914 */ 915 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 916 INP_WUNLOCK(inp); 917 goto done; 918 } 919 920 done_with_synqe(sc, synqe); 921 /* inp lock released by done_with_synqe */ 922 done: 923 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 924 return (0); 925 } 926 927 int 928 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 929 struct mbuf *m) 930 { 931 struct adapter *sc = iq->adapter; 932 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 933 unsigned int tid = GET_TID(cpl); 934 struct synq_entry *synqe = lookup_tid(sc, tid); 935 struct listen_ctx *lctx = synqe->lctx; 936 struct inpcb *inp = lctx->inp; 937 #ifdef INVARIANTS 938 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 939 #endif 940 941 KASSERT(opcode == CPL_ABORT_RPL_RSS, 942 ("%s: unexpected opcode 0x%x", __func__, opcode)); 943 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 944 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 945 946 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 947 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 948 949 INP_WLOCK(inp); 950 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 951 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 952 __func__, synqe, synqe->flags)); 953 954 done_with_synqe(sc, synqe); 955 /* inp lock released by done_with_synqe */ 956 957 return (0); 958 } 959 960 void 961 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 962 { 963 struct adapter *sc = tod->tod_softc; 964 struct synq_entry *synqe = arg; 965 struct inpcb *inp = sotoinpcb(so); 966 struct toepcb *toep = synqe->toep; 967 968 NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ 969 INP_WLOCK_ASSERT(inp); 970 KASSERT(synqe->flags & TPF_SYNQE, 971 ("%s: %p not a synq_entry?", __func__, arg)); 972 MPASS(toep->tid == synqe->tid); 973 974 offload_socket(so, toep); 975 make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); 976 toep->flags |= TPF_CPL_PENDING; 977 update_tid(sc, synqe->tid, toep); 978 synqe->flags |= TPF_SYNQE_EXPANDED; 979 inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ? 980 M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4; 981 inp->inp_flowid = synqe->rss_hash; 982 } 983 984 static void 985 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 986 { 987 bzero(to, sizeof(*to)); 988 989 if (t4opt->mss) { 990 to->to_flags |= TOF_MSS; 991 to->to_mss = be16toh(t4opt->mss); 992 } 993 994 if (t4opt->wsf > 0 && t4opt->wsf < 15) { 995 to->to_flags |= TOF_SCALE; 996 to->to_wscale = t4opt->wsf; 997 } 998 999 if (t4opt->tstamp) 1000 to->to_flags |= TOF_TS; 1001 1002 if (t4opt->sack) 1003 to->to_flags |= TOF_SACKPERM; 1004 } 1005 1006 static bool 1007 encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl) 1008 { 1009 u_int hlen = be32toh(cpl->hdr_len); 1010 1011 if (chip_id(sc) >= CHELSIO_T6) 1012 return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1013 else 1014 return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header)); 1015 } 1016 1017 static void 1018 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1019 struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) 1020 { 1021 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1022 const struct ether_header *eh; 1023 unsigned int hlen = be32toh(cpl->hdr_len); 1024 uintptr_t l3hdr; 1025 const struct tcphdr *tcp; 1026 1027 eh = (const void *)(cpl + 1); 1028 if (chip_id(sc) >= CHELSIO_T6) { 1029 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1030 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1031 } else { 1032 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1033 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1034 } 1035 1036 /* extract TOS (DiffServ + ECN) byte for AccECN */ 1037 if (iptos) { 1038 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1039 const struct ip *ip = (const void *)l3hdr; 1040 *iptos = ip->ip_tos; 1041 } 1042 #ifdef INET6 1043 else 1044 if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) { 1045 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1046 *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; 1047 } 1048 #endif /* INET */ 1049 } 1050 1051 if (inc) { 1052 bzero(inc, sizeof(*inc)); 1053 inc->inc_fport = tcp->th_sport; 1054 inc->inc_lport = tcp->th_dport; 1055 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1056 const struct ip *ip = (const void *)l3hdr; 1057 1058 inc->inc_faddr = ip->ip_src; 1059 inc->inc_laddr = ip->ip_dst; 1060 } else { 1061 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1062 1063 inc->inc_flags |= INC_ISIPV6; 1064 inc->inc6_faddr = ip6->ip6_src; 1065 inc->inc6_laddr = ip6->ip6_dst; 1066 } 1067 } 1068 1069 if (th) { 1070 bcopy(tcp, th, sizeof(*th)); 1071 tcp_fields_to_host(th); /* just like tcp_input */ 1072 } 1073 } 1074 1075 static struct l2t_entry * 1076 get_l2te_for_nexthop(struct port_info *pi, if_t ifp, 1077 struct in_conninfo *inc) 1078 { 1079 struct l2t_entry *e; 1080 struct sockaddr_in6 sin6; 1081 struct sockaddr *dst = (void *)&sin6; 1082 struct nhop_object *nh; 1083 1084 if (inc->inc_flags & INC_ISIPV6) { 1085 bzero(dst, sizeof(struct sockaddr_in6)); 1086 dst->sa_len = sizeof(struct sockaddr_in6); 1087 dst->sa_family = AF_INET6; 1088 1089 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1090 /* no need for route lookup */ 1091 e = t4_l2t_get(pi, ifp, dst); 1092 return (e); 1093 } 1094 1095 nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0); 1096 if (nh == NULL) 1097 return (NULL); 1098 if (nh->nh_ifp != ifp) 1099 return (NULL); 1100 if (nh->nh_flags & NHF_GATEWAY) 1101 ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr; 1102 else 1103 ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr; 1104 } else { 1105 dst->sa_len = sizeof(struct sockaddr_in); 1106 dst->sa_family = AF_INET; 1107 1108 nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0); 1109 if (nh == NULL) 1110 return (NULL); 1111 if (nh->nh_ifp != ifp) 1112 return (NULL); 1113 if (nh->nh_flags & NHF_GATEWAY) 1114 if (nh->gw_sa.sa_family == AF_INET) 1115 ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr; 1116 else 1117 *((struct sockaddr_in6 *)dst) = nh->gw6_sa; 1118 else 1119 ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr; 1120 } 1121 1122 e = t4_l2t_get(pi, ifp, dst); 1123 return (e); 1124 } 1125 1126 static int 1127 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, 1128 uint32_t opt2, int tid) 1129 { 1130 struct wrqe *wr; 1131 struct cpl_pass_accept_rpl *rpl; 1132 struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1133 1134 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : 1135 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); 1136 if (wr == NULL) 1137 return (ENOMEM); 1138 rpl = wrtod(wr); 1139 1140 if (is_t4(sc)) 1141 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); 1142 else { 1143 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; 1144 1145 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); 1146 rpl5->iss = htobe32(synqe->iss); 1147 } 1148 rpl->opt0 = opt0; 1149 rpl->opt2 = opt2; 1150 1151 return (t4_l2t_send(sc, wr, e)); 1152 } 1153 1154 #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ 1155 if (!tunnel) { \ 1156 m_freem(m); \ 1157 m = NULL; \ 1158 } \ 1159 reject_reason = __LINE__; \ 1160 goto reject; \ 1161 } while (0) 1162 1163 /* 1164 * The context associated with a tid entry via insert_tid could be a synq_entry 1165 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1166 */ 1167 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1168 1169 /* 1170 * Incoming SYN on a listening socket. 1171 * 1172 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1173 * etc. 1174 */ 1175 static int 1176 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, 1177 struct mbuf *m) 1178 { 1179 struct adapter *sc = iq->adapter; 1180 struct toedev *tod; 1181 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1182 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1183 unsigned int tid = GET_TID(cpl); 1184 struct listen_ctx *lctx = lookup_stid(sc, stid); 1185 struct inpcb *inp; 1186 struct socket *so; 1187 struct in_conninfo inc; 1188 struct tcphdr th; 1189 struct tcpopt to; 1190 struct port_info *pi; 1191 struct vi_info *vi; 1192 if_t hw_ifp, ifp; 1193 struct l2t_entry *e = NULL; 1194 struct synq_entry *synqe = NULL; 1195 int reject_reason, v, ntids; 1196 uint16_t vid, l2info; 1197 struct epoch_tracker et; 1198 #ifdef INVARIANTS 1199 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1200 #endif 1201 struct offload_settings settings; 1202 uint8_t iptos; 1203 1204 KASSERT(opcode == CPL_PASS_ACCEPT_REQ, 1205 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1206 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1207 1208 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, 1209 lctx); 1210 1211 /* 1212 * Figure out the port the SYN arrived on. We'll look for an exact VI 1213 * match in a bit but in case we don't find any we'll use the main VI as 1214 * the incoming ifnet. 1215 */ 1216 l2info = be16toh(cpl->l2info); 1217 pi = sc->port[G_SYN_INTF(l2info)]; 1218 hw_ifp = pi->vi[0].ifp; 1219 m->m_pkthdr.rcvif = hw_ifp; 1220 1221 CURVNET_SET(lctx->vnet); /* before any potential REJECT */ 1222 1223 /* 1224 * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will 1225 * also hit the listener. We don't want to offload those. 1226 */ 1227 if (encapsulated_syn(sc, cpl)) { 1228 REJECT_PASS_ACCEPT_REQ(true); 1229 } 1230 1231 /* 1232 * Use the MAC index to lookup the associated VI. If this SYN didn't 1233 * match a perfect MAC filter, punt. 1234 */ 1235 if (!(l2info & F_SYN_XACT_MATCH)) { 1236 REJECT_PASS_ACCEPT_REQ(true); 1237 } 1238 for_each_vi(pi, v, vi) { 1239 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) 1240 goto found; 1241 } 1242 REJECT_PASS_ACCEPT_REQ(true); 1243 found: 1244 hw_ifp = vi->ifp; /* the cxgbe ifnet */ 1245 m->m_pkthdr.rcvif = hw_ifp; 1246 tod = TOEDEV(hw_ifp); 1247 1248 /* 1249 * Don't offload if the peer requested a TCP option that's not known to 1250 * the silicon. Send the SYN to the kernel instead. 1251 */ 1252 if (__predict_false(cpl->tcpopt.unknown)) 1253 REJECT_PASS_ACCEPT_REQ(true); 1254 1255 /* 1256 * Figure out if there is a pseudo interface (vlan, lagg, etc.) 1257 * involved. Don't offload if the SYN had a VLAN tag and the vid 1258 * doesn't match anything on this interface. 1259 * 1260 * XXX: lagg support, lagg + vlan support. 1261 */ 1262 vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); 1263 if (vid != 0xfff && vid != 0) { 1264 ifp = VLAN_DEVAT(hw_ifp, vid); 1265 if (ifp == NULL) 1266 REJECT_PASS_ACCEPT_REQ(true); 1267 } else 1268 ifp = hw_ifp; 1269 1270 /* 1271 * Don't offload if the ifnet that the SYN came in on is not in the same 1272 * vnet as the listening socket. 1273 */ 1274 if (lctx->vnet != if_getvnet(ifp)) 1275 REJECT_PASS_ACCEPT_REQ(true); 1276 1277 pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); 1278 if (inc.inc_flags & INC_ISIPV6) { 1279 1280 /* Don't offload if the ifcap isn't enabled */ 1281 if ((if_getcapenable(ifp) & IFCAP_TOE6) == 0) 1282 REJECT_PASS_ACCEPT_REQ(true); 1283 1284 /* 1285 * SYN must be directed to an IP6 address on this ifnet. This 1286 * is more restrictive than in6_localip. 1287 */ 1288 NET_EPOCH_ENTER(et); 1289 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) { 1290 NET_EPOCH_EXIT(et); 1291 REJECT_PASS_ACCEPT_REQ(true); 1292 } 1293 1294 ntids = 2; 1295 } else { 1296 1297 /* Don't offload if the ifcap isn't enabled */ 1298 if ((if_getcapenable(ifp) & IFCAP_TOE4) == 0) 1299 REJECT_PASS_ACCEPT_REQ(true); 1300 1301 /* 1302 * SYN must be directed to an IP address on this ifnet. This 1303 * is more restrictive than in_localip. 1304 */ 1305 NET_EPOCH_ENTER(et); 1306 if (!in_ifhasaddr(ifp, inc.inc_laddr)) { 1307 NET_EPOCH_EXIT(et); 1308 REJECT_PASS_ACCEPT_REQ(true); 1309 } 1310 1311 ntids = 1; 1312 } 1313 1314 e = get_l2te_for_nexthop(pi, ifp, &inc); 1315 if (e == NULL) { 1316 NET_EPOCH_EXIT(et); 1317 REJECT_PASS_ACCEPT_REQ(true); 1318 } 1319 1320 /* Don't offload if the 4-tuple is already in use */ 1321 if (toe_4tuple_check(&inc, &th, ifp) != 0) { 1322 NET_EPOCH_EXIT(et); 1323 REJECT_PASS_ACCEPT_REQ(false); 1324 } 1325 1326 inp = lctx->inp; /* listening socket, not owned by TOE */ 1327 INP_RLOCK(inp); 1328 1329 /* Don't offload if the listening socket has closed */ 1330 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1331 INP_RUNLOCK(inp); 1332 NET_EPOCH_EXIT(et); 1333 REJECT_PASS_ACCEPT_REQ(false); 1334 } 1335 so = inp->inp_socket; 1336 rw_rlock(&sc->policy_lock); 1337 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 1338 EVL_MAKETAG(0xfff, 0, 0), inp); 1339 rw_runlock(&sc->policy_lock); 1340 if (!settings.offload) { 1341 INP_RUNLOCK(inp); 1342 NET_EPOCH_EXIT(et); 1343 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ 1344 } 1345 1346 synqe = alloc_synqe(sc, lctx, M_NOWAIT); 1347 if (synqe == NULL) { 1348 INP_RUNLOCK(inp); 1349 NET_EPOCH_EXIT(et); 1350 REJECT_PASS_ACCEPT_REQ(true); 1351 } 1352 MPASS(rss->hash_type == RSS_HASH_TCP); 1353 synqe->rss_hash = be32toh(rss->hash_val); 1354 atomic_store_int(&synqe->ok_to_respond, 0); 1355 1356 init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx, 1357 &synqe->params); 1358 1359 /* 1360 * If all goes well t4_syncache_respond will get called during 1361 * syncache_add. Note that syncache_add releases the pcb lock. 1362 */ 1363 t4opt_to_tcpopt(&cpl->tcpopt, &to); 1364 toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); 1365 1366 if (atomic_load_int(&synqe->ok_to_respond) > 0) { 1367 uint64_t opt0; 1368 uint32_t opt2; 1369 1370 opt0 = calc_options0(vi, &synqe->params); 1371 opt2 = calc_options2(vi, &synqe->params); 1372 1373 insert_tid(sc, tid, synqe, ntids); 1374 synqe->tid = tid; 1375 synqe->syn = m; 1376 m = NULL; 1377 1378 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { 1379 remove_tid(sc, tid, ntids); 1380 m = synqe->syn; 1381 synqe->syn = NULL; 1382 NET_EPOCH_EXIT(et); 1383 REJECT_PASS_ACCEPT_REQ(true); 1384 } 1385 1386 CTR6(KTR_CXGBE, 1387 "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x", 1388 __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2)); 1389 } else { 1390 NET_EPOCH_EXIT(et); 1391 REJECT_PASS_ACCEPT_REQ(false); 1392 } 1393 1394 NET_EPOCH_EXIT(et); 1395 CURVNET_RESTORE(); 1396 return (0); 1397 reject: 1398 CURVNET_RESTORE(); 1399 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, 1400 reject_reason); 1401 1402 if (e) 1403 t4_l2t_release(e); 1404 release_tid(sc, tid, lctx->ctrlq); 1405 if (synqe) { 1406 inp = synqe->lctx->inp; 1407 INP_WLOCK(inp); 1408 inp = release_synqe(sc, synqe); 1409 if (inp) 1410 INP_WUNLOCK(inp); 1411 } 1412 1413 if (m) { 1414 /* 1415 * The connection request hit a TOE listener but is being passed 1416 * on to the kernel sw stack instead of getting offloaded. 1417 */ 1418 m_adj(m, sizeof(*cpl)); 1419 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | 1420 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 1421 m->m_pkthdr.csum_data = 0xffff; 1422 if_input(hw_ifp, m); 1423 } 1424 1425 return (reject_reason); 1426 } 1427 1428 static void 1429 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, 1430 const struct cpl_pass_establish *cpl, struct in_conninfo *inc, 1431 struct tcphdr *th, struct tcpopt *to) 1432 { 1433 uint16_t tcp_opt = be16toh(cpl->tcp_opt); 1434 uint8_t iptos; 1435 1436 /* start off with the original SYN */ 1437 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); 1438 1439 /* modify parts to make it look like the ACK to our SYN|ACK */ 1440 th->th_flags = TH_ACK; 1441 th->th_ack = synqe->iss + 1; 1442 th->th_seq = be32toh(cpl->rcv_isn); 1443 bzero(to, sizeof(*to)); 1444 if (G_TCPOPT_TSTAMP(tcp_opt)) { 1445 to->to_flags |= TOF_TS; 1446 to->to_tsecr = synqe->ts; 1447 } 1448 } 1449 1450 static int 1451 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, 1452 struct mbuf *m) 1453 { 1454 struct adapter *sc = iq->adapter; 1455 struct vi_info *vi; 1456 if_t ifp; 1457 const struct cpl_pass_establish *cpl = (const void *)(rss + 1); 1458 #if defined(KTR) || defined(INVARIANTS) 1459 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1460 #endif 1461 unsigned int tid = GET_TID(cpl); 1462 struct synq_entry *synqe = lookup_tid(sc, tid); 1463 struct listen_ctx *lctx = synqe->lctx; 1464 struct inpcb *inp = lctx->inp, *new_inp; 1465 struct socket *so; 1466 struct tcphdr th; 1467 struct tcpopt to; 1468 struct in_conninfo inc; 1469 struct toepcb *toep; 1470 struct epoch_tracker et; 1471 int rstreason; 1472 #ifdef INVARIANTS 1473 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1474 #endif 1475 1476 KASSERT(opcode == CPL_PASS_ESTABLISH, 1477 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1478 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1479 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1480 KASSERT(synqe->flags & TPF_SYNQE, 1481 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); 1482 1483 CURVNET_SET(lctx->vnet); 1484 NET_EPOCH_ENTER(et); /* for syncache_expand */ 1485 INP_WLOCK(inp); 1486 1487 CTR6(KTR_CXGBE, 1488 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", 1489 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); 1490 1491 ifp = synqe->syn->m_pkthdr.rcvif; 1492 vi = if_getsoftc(ifp); 1493 KASSERT(vi->adapter == sc, 1494 ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 1495 1496 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1497 reset: 1498 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST); 1499 INP_WUNLOCK(inp); 1500 NET_EPOCH_EXIT(et); 1501 CURVNET_RESTORE(); 1502 return (0); 1503 } 1504 1505 KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], 1506 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, 1507 synqe->params.rxq_idx, 1508 (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); 1509 1510 toep = alloc_toepcb(vi, M_NOWAIT); 1511 if (toep == NULL) 1512 goto reset; 1513 toep->tid = tid; 1514 toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx]; 1515 toep->vnet = lctx->vnet; 1516 bcopy(&synqe->params, &toep->params, sizeof(toep->params)); 1517 init_toepcb(vi, toep); 1518 1519 MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); 1520 MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); 1521 synqe->tcp_opt = cpl->tcp_opt; 1522 synqe->toep = toep; 1523 1524 /* Come up with something that syncache_expand should be ok with. */ 1525 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); 1526 if (inc.inc_flags & INC_ISIPV6) { 1527 if (lctx->ce == NULL) { 1528 toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true); 1529 if (toep->ce == NULL) { 1530 free_toepcb(toep); 1531 goto reset; /* RST without a CLIP entry? */ 1532 } 1533 } else { 1534 t4_hold_clip_entry(sc, lctx->ce); 1535 toep->ce = lctx->ce; 1536 } 1537 } 1538 so = inp->inp_socket; 1539 KASSERT(so != NULL, ("%s: socket is NULL", __func__)); 1540 1541 rstreason = toe_syncache_expand(&inc, &to, &th, &so); 1542 if (rstreason < 0) { 1543 free_toepcb(toep); 1544 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST); 1545 INP_WUNLOCK(inp); 1546 NET_EPOCH_EXIT(et); 1547 CURVNET_RESTORE(); 1548 return (0); 1549 } else if (rstreason == 0 || so == NULL) { 1550 free_toepcb(toep); 1551 goto reset; 1552 } 1553 1554 /* New connection inpcb is already locked by syncache_expand(). */ 1555 new_inp = sotoinpcb(so); 1556 INP_WLOCK_ASSERT(new_inp); 1557 MPASS(so->so_vnet == lctx->vnet); 1558 1559 /* 1560 * This is for expansion from syncookies. 1561 * 1562 * XXX: we've held the tcbinfo lock throughout so there's no risk of 1563 * anyone accept'ing a connection before we've installed our hooks, but 1564 * this somewhat defeats the purpose of having a tod_offload_socket :-( 1565 */ 1566 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { 1567 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); 1568 t4_offload_socket(TOEDEV(ifp), synqe, so); 1569 } 1570 1571 INP_WUNLOCK(new_inp); 1572 1573 /* Done with the synqe */ 1574 inp = release_synqe(sc, synqe); 1575 if (inp != NULL) 1576 INP_WUNLOCK(inp); 1577 NET_EPOCH_EXIT(et); 1578 CURVNET_RESTORE(); 1579 1580 return (0); 1581 } 1582 1583 void 1584 t4_init_listen_cpl_handlers(void) 1585 { 1586 1587 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); 1588 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); 1589 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 1590 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 1591 } 1592 1593 void 1594 t4_uninit_listen_cpl_handlers(void) 1595 { 1596 1597 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); 1598 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); 1599 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); 1600 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); 1601 } 1602 #endif 1603