1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2012 Chelsio Communications, Inc. 5 * All rights reserved. 6 * Written by: Navdeep Parhar <np@FreeBSD.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #include "opt_inet.h" 34 #include "opt_inet6.h" 35 36 #ifdef TCP_OFFLOAD 37 #include <sys/param.h> 38 #include <sys/types.h> 39 #include <sys/kernel.h> 40 #include <sys/ktr.h> 41 #include <sys/module.h> 42 #include <sys/protosw.h> 43 #include <sys/refcount.h> 44 #include <sys/domain.h> 45 #include <sys/fnv_hash.h> 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <net/ethernet.h> 50 #include <net/if.h> 51 #include <net/if_types.h> 52 #include <net/if_vlan_var.h> 53 #include <net/route.h> 54 #include <netinet/in.h> 55 #include <netinet/in_fib.h> 56 #include <netinet/in_pcb.h> 57 #include <netinet/ip.h> 58 #include <netinet/ip6.h> 59 #include <netinet6/in6_fib.h> 60 #include <netinet6/scope6_var.h> 61 #include <netinet/tcp_timer.h> 62 #define TCPSTATES 63 #include <netinet/tcp_fsm.h> 64 #include <netinet/tcp_var.h> 65 #include <netinet/toecore.h> 66 #include <netinet/cc/cc.h> 67 68 #include "common/common.h" 69 #include "common/t4_msg.h" 70 #include "common/t4_regs.h" 71 #include "t4_clip.h" 72 #include "tom/t4_tom_l2t.h" 73 #include "tom/t4_tom.h" 74 75 /* stid services */ 76 static int alloc_stid(struct adapter *, struct listen_ctx *, int); 77 static struct listen_ctx *lookup_stid(struct adapter *, int); 78 static void free_stid(struct adapter *, struct listen_ctx *); 79 80 /* lctx services */ 81 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *, 82 struct vi_info *); 83 static int free_lctx(struct adapter *, struct listen_ctx *); 84 static void hold_lctx(struct listen_ctx *); 85 static void listen_hash_add(struct adapter *, struct listen_ctx *); 86 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *); 87 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); 88 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); 89 90 static void send_reset_synqe(struct toedev *, struct synq_entry *); 91 92 static int 93 alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6) 94 { 95 struct tid_info *t = &sc->tids; 96 u_int stid, n, f, mask; 97 struct stid_region *sr = &lctx->stid_region; 98 99 /* 100 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in 101 * the TCAM. The start of the stid region is properly aligned (the chip 102 * requires each region to be 128-cell aligned). 103 */ 104 n = isipv6 ? 2 : 1; 105 mask = n - 1; 106 KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0, 107 ("%s: stid region (%u, %u) not properly aligned. n = %u", 108 __func__, t->stid_base, t->nstids, n)); 109 110 mtx_lock(&t->stid_lock); 111 if (n > t->nstids - t->stids_in_use) { 112 mtx_unlock(&t->stid_lock); 113 return (-1); 114 } 115 116 if (t->nstids_free_head >= n) { 117 /* 118 * This allocation will definitely succeed because the region 119 * starts at a good alignment and we just checked we have enough 120 * stids free. 121 */ 122 f = t->nstids_free_head & mask; 123 t->nstids_free_head -= n + f; 124 stid = t->nstids_free_head; 125 TAILQ_INSERT_HEAD(&t->stids, sr, link); 126 } else { 127 struct stid_region *s; 128 129 stid = t->nstids_free_head; 130 TAILQ_FOREACH(s, &t->stids, link) { 131 stid += s->used + s->free; 132 f = stid & mask; 133 if (s->free >= n + f) { 134 stid -= n + f; 135 s->free -= n + f; 136 TAILQ_INSERT_AFTER(&t->stids, s, sr, link); 137 goto allocated; 138 } 139 } 140 141 if (__predict_false(stid != t->nstids)) { 142 panic("%s: stids TAILQ (%p) corrupt." 143 " At %d instead of %d at the end of the queue.", 144 __func__, &t->stids, stid, t->nstids); 145 } 146 147 mtx_unlock(&t->stid_lock); 148 return (-1); 149 } 150 151 allocated: 152 sr->used = n; 153 sr->free = f; 154 t->stids_in_use += n; 155 t->stid_tab[stid] = lctx; 156 mtx_unlock(&t->stid_lock); 157 158 KASSERT(((stid + t->stid_base) & mask) == 0, 159 ("%s: EDOOFUS.", __func__)); 160 return (stid + t->stid_base); 161 } 162 163 static struct listen_ctx * 164 lookup_stid(struct adapter *sc, int stid) 165 { 166 struct tid_info *t = &sc->tids; 167 168 return (t->stid_tab[stid - t->stid_base]); 169 } 170 171 static void 172 free_stid(struct adapter *sc, struct listen_ctx *lctx) 173 { 174 struct tid_info *t = &sc->tids; 175 struct stid_region *sr = &lctx->stid_region; 176 struct stid_region *s; 177 178 KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used)); 179 180 mtx_lock(&t->stid_lock); 181 s = TAILQ_PREV(sr, stid_head, link); 182 if (s != NULL) 183 s->free += sr->used + sr->free; 184 else 185 t->nstids_free_head += sr->used + sr->free; 186 KASSERT(t->stids_in_use >= sr->used, 187 ("%s: stids_in_use (%u) < stids being freed (%u)", __func__, 188 t->stids_in_use, sr->used)); 189 t->stids_in_use -= sr->used; 190 TAILQ_REMOVE(&t->stids, sr, link); 191 mtx_unlock(&t->stid_lock); 192 } 193 194 static struct listen_ctx * 195 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) 196 { 197 struct listen_ctx *lctx; 198 199 INP_WLOCK_ASSERT(inp); 200 201 lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO); 202 if (lctx == NULL) 203 return (NULL); 204 205 lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6); 206 if (lctx->stid < 0) { 207 free(lctx, M_CXGBE); 208 return (NULL); 209 } 210 211 if (inp->inp_vflag & INP_IPV6 && 212 !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) { 213 lctx->ce = t4_hold_lip(sc, &inp->in6p_laddr, NULL); 214 if (lctx->ce == NULL) { 215 free(lctx, M_CXGBE); 216 return (NULL); 217 } 218 } 219 220 lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id]; 221 lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq]; 222 refcount_init(&lctx->refcount, 1); 223 224 lctx->inp = inp; 225 lctx->vnet = inp->inp_socket->so_vnet; 226 in_pcbref(inp); 227 228 return (lctx); 229 } 230 231 /* Don't call this directly, use release_lctx instead */ 232 static int 233 free_lctx(struct adapter *sc, struct listen_ctx *lctx) 234 { 235 struct inpcb *inp = lctx->inp; 236 237 INP_WLOCK_ASSERT(inp); 238 KASSERT(lctx->refcount == 0, 239 ("%s: refcount %d", __func__, lctx->refcount)); 240 KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid)); 241 242 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p", 243 __func__, lctx->stid, lctx, lctx->inp); 244 245 if (lctx->ce) 246 t4_release_lip(sc, lctx->ce); 247 free_stid(sc, lctx); 248 free(lctx, M_CXGBE); 249 250 return (in_pcbrele_wlocked(inp)); 251 } 252 253 static void 254 hold_lctx(struct listen_ctx *lctx) 255 { 256 257 refcount_acquire(&lctx->refcount); 258 } 259 260 static inline uint32_t 261 listen_hashfn(void *key, u_long mask) 262 { 263 264 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask); 265 } 266 267 /* 268 * Add a listen_ctx entry to the listen hash table. 269 */ 270 static void 271 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx) 272 { 273 struct tom_data *td = sc->tom_softc; 274 int bucket = listen_hashfn(lctx->inp, td->listen_mask); 275 276 mtx_lock(&td->lctx_hash_lock); 277 LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link); 278 td->lctx_count++; 279 mtx_unlock(&td->lctx_hash_lock); 280 } 281 282 /* 283 * Look for the listening socket's context entry in the hash and return it. 284 */ 285 static struct listen_ctx * 286 listen_hash_find(struct adapter *sc, struct inpcb *inp) 287 { 288 struct tom_data *td = sc->tom_softc; 289 int bucket = listen_hashfn(inp, td->listen_mask); 290 struct listen_ctx *lctx; 291 292 mtx_lock(&td->lctx_hash_lock); 293 LIST_FOREACH(lctx, &td->listen_hash[bucket], link) { 294 if (lctx->inp == inp) 295 break; 296 } 297 mtx_unlock(&td->lctx_hash_lock); 298 299 return (lctx); 300 } 301 302 /* 303 * Removes the listen_ctx structure for inp from the hash and returns it. 304 */ 305 static struct listen_ctx * 306 listen_hash_del(struct adapter *sc, struct inpcb *inp) 307 { 308 struct tom_data *td = sc->tom_softc; 309 int bucket = listen_hashfn(inp, td->listen_mask); 310 struct listen_ctx *lctx, *l; 311 312 mtx_lock(&td->lctx_hash_lock); 313 LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) { 314 if (lctx->inp == inp) { 315 LIST_REMOVE(lctx, link); 316 td->lctx_count--; 317 break; 318 } 319 } 320 mtx_unlock(&td->lctx_hash_lock); 321 322 return (lctx); 323 } 324 325 /* 326 * Releases a hold on the lctx. Must be called with the listening socket's inp 327 * locked. The inp may be freed by this function and it returns NULL to 328 * indicate this. 329 */ 330 static struct inpcb * 331 release_lctx(struct adapter *sc, struct listen_ctx *lctx) 332 { 333 struct inpcb *inp = lctx->inp; 334 int inp_freed = 0; 335 336 INP_WLOCK_ASSERT(inp); 337 if (refcount_release(&lctx->refcount)) 338 inp_freed = free_lctx(sc, lctx); 339 340 return (inp_freed ? NULL : inp); 341 } 342 343 static void 344 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe) 345 { 346 struct adapter *sc = tod->tod_softc; 347 struct mbuf *m = synqe->syn; 348 struct ifnet *ifp = m->m_pkthdr.rcvif; 349 struct vi_info *vi = ifp->if_softc; 350 struct port_info *pi = vi->pi; 351 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 352 struct wrqe *wr; 353 struct fw_flowc_wr *flowc; 354 struct cpl_abort_req *req; 355 int flowclen; 356 struct sge_wrq *ofld_txq; 357 struct sge_ofld_rxq *ofld_rxq; 358 const int nparams = 6; 359 const u_int pfvf = sc->pf << S_FW_VIID_PFN; 360 361 INP_WLOCK_ASSERT(synqe->lctx->inp); 362 363 CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s", 364 __func__, synqe, synqe->flags, synqe->tid, 365 synqe->flags & TPF_ABORT_SHUTDOWN ? 366 " (abort already in progress)" : ""); 367 if (synqe->flags & TPF_ABORT_SHUTDOWN) 368 return; /* abort already in progress */ 369 synqe->flags |= TPF_ABORT_SHUTDOWN; 370 371 ofld_txq = &sc->sge.ofld_txq[synqe->txqid]; 372 ofld_rxq = &sc->sge.ofld_rxq[synqe->rxqid]; 373 374 /* The wrqe will have two WRs - a flowc followed by an abort_req */ 375 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 376 377 wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq); 378 if (wr == NULL) { 379 /* XXX */ 380 panic("%s: allocation failure.", __func__); 381 } 382 flowc = wrtod(wr); 383 req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE)); 384 385 /* First the flowc ... */ 386 memset(flowc, 0, wr->wr_len); 387 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 388 V_FW_FLOWC_WR_NPARAMS(nparams)); 389 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 390 V_FW_WR_FLOWID(synqe->tid)); 391 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; 392 flowc->mnemval[0].val = htobe32(pfvf); 393 flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; 394 flowc->mnemval[1].val = htobe32(pi->tx_chan); 395 flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; 396 flowc->mnemval[2].val = htobe32(pi->tx_chan); 397 flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; 398 flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id); 399 flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF; 400 flowc->mnemval[4].val = htobe32(512); 401 flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS; 402 flowc->mnemval[5].val = htobe32(512); 403 synqe->flags |= TPF_FLOWC_WR_SENT; 404 405 /* ... then ABORT request */ 406 INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid); 407 req->rsvd0 = 0; /* don't have a snd_nxt */ 408 req->rsvd1 = 1; /* no data sent yet */ 409 req->cmd = CPL_ABORT_SEND_RST; 410 411 t4_l2t_send(sc, wr, e); 412 } 413 414 static int 415 create_server(struct adapter *sc, struct listen_ctx *lctx) 416 { 417 struct wrqe *wr; 418 struct cpl_pass_open_req *req; 419 struct inpcb *inp = lctx->inp; 420 421 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 422 if (wr == NULL) { 423 log(LOG_ERR, "%s: allocation failure", __func__); 424 return (ENOMEM); 425 } 426 req = wrtod(wr); 427 428 INIT_TP_WR(req, 0); 429 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid)); 430 req->local_port = inp->inp_lport; 431 req->peer_port = 0; 432 req->local_ip = inp->inp_laddr.s_addr; 433 req->peer_ip = 0; 434 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 435 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 436 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 437 438 t4_wrq_tx(sc, wr); 439 return (0); 440 } 441 442 static int 443 create_server6(struct adapter *sc, struct listen_ctx *lctx) 444 { 445 struct wrqe *wr; 446 struct cpl_pass_open_req6 *req; 447 struct inpcb *inp = lctx->inp; 448 449 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 450 if (wr == NULL) { 451 log(LOG_ERR, "%s: allocation failure", __func__); 452 return (ENOMEM); 453 } 454 req = wrtod(wr); 455 456 INIT_TP_WR(req, 0); 457 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid)); 458 req->local_port = inp->inp_lport; 459 req->peer_port = 0; 460 req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0]; 461 req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8]; 462 req->peer_ip_hi = 0; 463 req->peer_ip_lo = 0; 464 req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan)); 465 req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) | 466 F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id)); 467 468 t4_wrq_tx(sc, wr); 469 return (0); 470 } 471 472 static int 473 destroy_server(struct adapter *sc, struct listen_ctx *lctx) 474 { 475 struct wrqe *wr; 476 struct cpl_close_listsvr_req *req; 477 478 wr = alloc_wrqe(sizeof(*req), lctx->ctrlq); 479 if (wr == NULL) { 480 /* XXX */ 481 panic("%s: allocation failure.", __func__); 482 } 483 req = wrtod(wr); 484 485 INIT_TP_WR(req, 0); 486 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, 487 lctx->stid)); 488 req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id); 489 req->rsvd = htobe16(0); 490 491 t4_wrq_tx(sc, wr); 492 return (0); 493 } 494 495 /* 496 * Start a listening server by sending a passive open request to HW. 497 * 498 * Can't take adapter lock here and access to sc->flags, 499 * sc->offload_map, if_capenable are all race prone. 500 */ 501 int 502 t4_listen_start(struct toedev *tod, struct tcpcb *tp) 503 { 504 struct adapter *sc = tod->tod_softc; 505 struct vi_info *vi; 506 struct port_info *pi; 507 struct inpcb *inp = tp->t_inpcb; 508 struct listen_ctx *lctx; 509 int i, rc, v; 510 struct offload_settings settings; 511 512 INP_WLOCK_ASSERT(inp); 513 514 rw_rlock(&sc->policy_lock); 515 settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 516 EVL_MAKETAG(0xfff, 0, 0), inp); 517 rw_runlock(&sc->policy_lock); 518 if (!settings.offload) 519 return (0); 520 521 /* Don't start a hardware listener for any loopback address. */ 522 if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) 523 return (0); 524 if (!(inp->inp_vflag & INP_IPV6) && 525 IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr))) 526 return (0); 527 #if 0 528 ADAPTER_LOCK(sc); 529 if (IS_BUSY(sc)) { 530 log(LOG_ERR, "%s: listen request ignored, %s is busy", 531 __func__, device_get_nameunit(sc->dev)); 532 goto done; 533 } 534 535 KASSERT(uld_active(sc, ULD_TOM), 536 ("%s: TOM not initialized", __func__)); 537 #endif 538 539 /* 540 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first 541 * such VI's queues to send the passive open and receive the reply to 542 * it. 543 * 544 * XXX: need a way to mark a port in use by offload. if_cxgbe should 545 * then reject any attempt to bring down such a port (and maybe reject 546 * attempts to disable IFCAP_TOE on that port too?). 547 */ 548 for_each_port(sc, i) { 549 pi = sc->port[i]; 550 for_each_vi(pi, v, vi) { 551 if (vi->flags & VI_INIT_DONE && 552 vi->ifp->if_capenable & IFCAP_TOE) 553 goto found; 554 } 555 } 556 goto done; /* no port that's UP with IFCAP_TOE enabled */ 557 found: 558 559 if (listen_hash_find(sc, inp) != NULL) 560 goto done; /* already setup */ 561 562 lctx = alloc_lctx(sc, inp, vi); 563 if (lctx == NULL) { 564 log(LOG_ERR, 565 "%s: listen request ignored, %s couldn't allocate lctx\n", 566 __func__, device_get_nameunit(sc->dev)); 567 goto done; 568 } 569 listen_hash_add(sc, lctx); 570 571 CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x", 572 __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp, 573 inp->inp_vflag); 574 575 if (inp->inp_vflag & INP_IPV6) 576 rc = create_server6(sc, lctx); 577 else 578 rc = create_server(sc, lctx); 579 if (rc != 0) { 580 log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n", 581 __func__, device_get_nameunit(sc->dev), rc); 582 (void) listen_hash_del(sc, inp); 583 inp = release_lctx(sc, lctx); 584 /* can't be freed, host stack has a reference */ 585 KASSERT(inp != NULL, ("%s: inp freed", __func__)); 586 goto done; 587 } 588 lctx->flags |= LCTX_RPL_PENDING; 589 done: 590 #if 0 591 ADAPTER_UNLOCK(sc); 592 #endif 593 return (0); 594 } 595 596 int 597 t4_listen_stop(struct toedev *tod, struct tcpcb *tp) 598 { 599 struct listen_ctx *lctx; 600 struct adapter *sc = tod->tod_softc; 601 struct inpcb *inp = tp->t_inpcb; 602 603 INP_WLOCK_ASSERT(inp); 604 605 lctx = listen_hash_del(sc, inp); 606 if (lctx == NULL) 607 return (ENOENT); /* no hardware listener for this inp */ 608 609 CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid, 610 lctx, lctx->flags); 611 612 /* 613 * If the reply to the PASS_OPEN is still pending we'll wait for it to 614 * arrive and clean up when it does. 615 */ 616 if (lctx->flags & LCTX_RPL_PENDING) { 617 return (EINPROGRESS); 618 } 619 620 destroy_server(sc, lctx); 621 return (0); 622 } 623 624 static inline struct synq_entry * 625 alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags) 626 { 627 struct synq_entry *synqe; 628 629 INP_WLOCK_ASSERT(lctx->inp); 630 MPASS(flags == M_WAITOK || flags == M_NOWAIT); 631 632 synqe = malloc(sizeof(*synqe), M_CXGBE, flags); 633 if (__predict_true(synqe != NULL)) { 634 synqe->flags = TPF_SYNQE; 635 refcount_init(&synqe->refcnt, 1); 636 synqe->lctx = lctx; 637 hold_lctx(lctx); /* Every synqe has a ref on its lctx. */ 638 synqe->syn = NULL; 639 } 640 641 return (synqe); 642 } 643 644 static inline void 645 hold_synqe(struct synq_entry *synqe) 646 { 647 648 refcount_acquire(&synqe->refcnt); 649 } 650 651 static inline struct inpcb * 652 release_synqe(struct adapter *sc, struct synq_entry *synqe) 653 { 654 struct inpcb *inp; 655 656 MPASS(synqe->flags & TPF_SYNQE); 657 MPASS(synqe->lctx != NULL); 658 659 inp = synqe->lctx->inp; 660 MPASS(inp != NULL); 661 INP_WLOCK_ASSERT(inp); 662 663 if (refcount_release(&synqe->refcnt)) { 664 inp = release_lctx(sc, synqe->lctx); 665 m_freem(synqe->syn); 666 free(synqe, M_CXGBE); 667 } 668 669 return (inp); 670 } 671 672 void 673 t4_syncache_added(struct toedev *tod __unused, void *arg) 674 { 675 struct synq_entry *synqe = arg; 676 677 hold_synqe(synqe); 678 } 679 680 void 681 t4_syncache_removed(struct toedev *tod, void *arg) 682 { 683 struct adapter *sc = tod->tod_softc; 684 struct synq_entry *synqe = arg; 685 struct inpcb *inp = synqe->lctx->inp; 686 687 /* 688 * XXX: this is a LOR but harmless when running from the softclock. 689 */ 690 INP_WLOCK(inp); 691 inp = release_synqe(sc, synqe); 692 if (inp != NULL) 693 INP_WUNLOCK(inp); 694 } 695 696 int 697 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m) 698 { 699 struct synq_entry *synqe = arg; 700 701 if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) { 702 struct tcpopt to; 703 struct ip *ip = mtod(m, struct ip *); 704 struct tcphdr *th; 705 706 if (ip->ip_v == IPVERSION) 707 th = (void *)(ip + 1); 708 else 709 th = (void *)((struct ip6_hdr *)ip + 1); 710 bzero(&to, sizeof(to)); 711 tcp_dooptions(&to, (void *)(th + 1), 712 (th->th_off << 2) - sizeof(*th), TO_SYN); 713 714 /* save these for later */ 715 synqe->iss = be32toh(th->th_seq); 716 synqe->irs = be32toh(th->th_ack) - 1; 717 synqe->ts = to.to_tsval; 718 } 719 720 m_freem(m); /* don't need this any more */ 721 return (0); 722 } 723 724 static int 725 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss, 726 struct mbuf *m) 727 { 728 struct adapter *sc = iq->adapter; 729 const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1); 730 int stid = GET_TID(cpl); 731 unsigned int status = cpl->status; 732 struct listen_ctx *lctx = lookup_stid(sc, stid); 733 struct inpcb *inp = lctx->inp; 734 #ifdef INVARIANTS 735 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 736 #endif 737 738 KASSERT(opcode == CPL_PASS_OPEN_RPL, 739 ("%s: unexpected opcode 0x%x", __func__, opcode)); 740 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 741 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 742 743 INP_WLOCK(inp); 744 745 CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x", 746 __func__, stid, status, lctx->flags); 747 748 lctx->flags &= ~LCTX_RPL_PENDING; 749 750 if (status != CPL_ERR_NONE) 751 log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status); 752 753 #ifdef INVARIANTS 754 /* 755 * If the inp has been dropped (listening socket closed) then 756 * listen_stop must have run and taken the inp out of the hash. 757 */ 758 if (inp->inp_flags & INP_DROPPED) { 759 KASSERT(listen_hash_del(sc, inp) == NULL, 760 ("%s: inp %p still in listen hash", __func__, inp)); 761 } 762 #endif 763 764 if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) { 765 if (release_lctx(sc, lctx) != NULL) 766 INP_WUNLOCK(inp); 767 return (status); 768 } 769 770 /* 771 * Listening socket stopped listening earlier and now the chip tells us 772 * it has started the hardware listener. Stop it; the lctx will be 773 * released in do_close_server_rpl. 774 */ 775 if (inp->inp_flags & INP_DROPPED) { 776 destroy_server(sc, lctx); 777 INP_WUNLOCK(inp); 778 return (status); 779 } 780 781 /* 782 * Failed to start hardware listener. Take inp out of the hash and 783 * release our reference on it. An error message has been logged 784 * already. 785 */ 786 if (status != CPL_ERR_NONE) { 787 listen_hash_del(sc, inp); 788 if (release_lctx(sc, lctx) != NULL) 789 INP_WUNLOCK(inp); 790 return (status); 791 } 792 793 /* hardware listener open for business */ 794 795 INP_WUNLOCK(inp); 796 return (status); 797 } 798 799 static int 800 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss, 801 struct mbuf *m) 802 { 803 struct adapter *sc = iq->adapter; 804 const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1); 805 int stid = GET_TID(cpl); 806 unsigned int status = cpl->status; 807 struct listen_ctx *lctx = lookup_stid(sc, stid); 808 struct inpcb *inp = lctx->inp; 809 #ifdef INVARIANTS 810 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 811 #endif 812 813 KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL, 814 ("%s: unexpected opcode 0x%x", __func__, opcode)); 815 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 816 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 817 818 CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status); 819 820 if (status != CPL_ERR_NONE) { 821 log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n", 822 __func__, status, stid); 823 return (status); 824 } 825 826 INP_WLOCK(inp); 827 inp = release_lctx(sc, lctx); 828 if (inp != NULL) 829 INP_WUNLOCK(inp); 830 831 return (status); 832 } 833 834 static void 835 done_with_synqe(struct adapter *sc, struct synq_entry *synqe) 836 { 837 struct listen_ctx *lctx = synqe->lctx; 838 struct inpcb *inp = lctx->inp; 839 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 840 int ntids; 841 842 INP_WLOCK_ASSERT(inp); 843 ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1; 844 845 remove_tid(sc, synqe->tid, ntids); 846 release_tid(sc, synqe->tid, lctx->ctrlq); 847 t4_l2t_release(e); 848 inp = release_synqe(sc, synqe); 849 if (inp) 850 INP_WUNLOCK(inp); 851 } 852 853 void 854 synack_failure_cleanup(struct adapter *sc, int tid) 855 { 856 struct synq_entry *synqe = lookup_tid(sc, tid); 857 858 INP_WLOCK(synqe->lctx->inp); 859 done_with_synqe(sc, synqe); 860 } 861 862 int 863 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss, 864 struct mbuf *m) 865 { 866 struct adapter *sc = iq->adapter; 867 const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1); 868 unsigned int tid = GET_TID(cpl); 869 struct synq_entry *synqe = lookup_tid(sc, tid); 870 struct listen_ctx *lctx = synqe->lctx; 871 struct inpcb *inp = lctx->inp; 872 struct sge_wrq *ofld_txq; 873 #ifdef INVARIANTS 874 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 875 #endif 876 877 KASSERT(opcode == CPL_ABORT_REQ_RSS, 878 ("%s: unexpected opcode 0x%x", __func__, opcode)); 879 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 880 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 881 882 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 883 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 884 885 if (negative_advice(cpl->status)) 886 return (0); /* Ignore negative advice */ 887 888 INP_WLOCK(inp); 889 890 ofld_txq = &sc->sge.ofld_txq[synqe->txqid]; 891 892 /* 893 * If we'd initiated an abort earlier the reply to it is responsible for 894 * cleaning up resources. Otherwise we tear everything down right here 895 * right now. We owe the T4 a CPL_ABORT_RPL no matter what. 896 */ 897 if (synqe->flags & TPF_ABORT_SHUTDOWN) { 898 INP_WUNLOCK(inp); 899 goto done; 900 } 901 902 done_with_synqe(sc, synqe); 903 /* inp lock released by done_with_synqe */ 904 done: 905 send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); 906 return (0); 907 } 908 909 int 910 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss, 911 struct mbuf *m) 912 { 913 struct adapter *sc = iq->adapter; 914 const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1); 915 unsigned int tid = GET_TID(cpl); 916 struct synq_entry *synqe = lookup_tid(sc, tid); 917 struct listen_ctx *lctx = synqe->lctx; 918 struct inpcb *inp = lctx->inp; 919 #ifdef INVARIANTS 920 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 921 #endif 922 923 KASSERT(opcode == CPL_ABORT_RPL_RSS, 924 ("%s: unexpected opcode 0x%x", __func__, opcode)); 925 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 926 KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__)); 927 928 CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d", 929 __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status); 930 931 INP_WLOCK(inp); 932 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN, 933 ("%s: wasn't expecting abort reply for synqe %p (0x%x)", 934 __func__, synqe, synqe->flags)); 935 936 done_with_synqe(sc, synqe); 937 /* inp lock released by done_with_synqe */ 938 939 return (0); 940 } 941 942 void 943 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so) 944 { 945 struct adapter *sc = tod->tod_softc; 946 struct synq_entry *synqe = arg; 947 #ifdef INVARIANTS 948 struct inpcb *inp = sotoinpcb(so); 949 #endif 950 struct toepcb *toep = synqe->toep; 951 952 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ 953 INP_WLOCK_ASSERT(inp); 954 KASSERT(synqe->flags & TPF_SYNQE, 955 ("%s: %p not a synq_entry?", __func__, arg)); 956 MPASS(toep->tid == synqe->tid); 957 958 offload_socket(so, toep); 959 make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt); 960 toep->flags |= TPF_CPL_PENDING; 961 update_tid(sc, synqe->tid, toep); 962 synqe->flags |= TPF_SYNQE_EXPANDED; 963 } 964 965 static inline void 966 save_qids_in_synqe(struct synq_entry *synqe, struct vi_info *vi, 967 struct offload_settings *s) 968 { 969 uint32_t txqid, rxqid; 970 971 if (s->txq >= 0 && s->txq < vi->nofldtxq) 972 txqid = s->txq; 973 else 974 txqid = arc4random() % vi->nofldtxq; 975 txqid += vi->first_ofld_txq; 976 977 if (s->rxq >= 0 && s->rxq < vi->nofldrxq) 978 rxqid = s->rxq; 979 else 980 rxqid = arc4random() % vi->nofldrxq; 981 rxqid += vi->first_ofld_rxq; 982 983 synqe->txqid = txqid; 984 synqe->rxqid = rxqid; 985 } 986 987 static void 988 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to) 989 { 990 bzero(to, sizeof(*to)); 991 992 if (t4opt->mss) { 993 to->to_flags |= TOF_MSS; 994 to->to_mss = be16toh(t4opt->mss); 995 } 996 997 if (t4opt->wsf) { 998 to->to_flags |= TOF_SCALE; 999 to->to_wscale = t4opt->wsf; 1000 } 1001 1002 if (t4opt->tstamp) 1003 to->to_flags |= TOF_TS; 1004 1005 if (t4opt->sack) 1006 to->to_flags |= TOF_SACKPERM; 1007 } 1008 1009 /* 1010 * Options2 for passive open. 1011 */ 1012 static uint32_t 1013 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, 1014 const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode, 1015 struct cc_algo *cc, const struct offload_settings *s) 1016 { 1017 struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; 1018 uint32_t opt2 = 0; 1019 1020 /* 1021 * rx flow control, rx coalesce, congestion control, and tx pace are all 1022 * explicitly set by the driver. On T5+ the ISS is also set by the 1023 * driver to the value picked by the kernel. 1024 */ 1025 if (is_t4(sc)) { 1026 opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; 1027 opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; 1028 } else { 1029 opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ 1030 opt2 |= F_T5_ISS; /* ISS provided in CPL */ 1031 } 1032 1033 if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323))) 1034 opt2 |= F_SACK_EN; 1035 1036 if (tcpopt->tstamp && 1037 (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) 1038 opt2 |= F_TSTAMPS_EN; 1039 1040 if (tcpopt->wsf < 15 && V_tcp_do_rfc1323) 1041 opt2 |= F_WND_SCALE_EN; 1042 1043 if (th->th_flags & (TH_ECE | TH_CWR) && 1044 (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) 1045 opt2 |= F_CCTRL_ECN; 1046 1047 /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ 1048 1049 opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); 1050 1051 /* These defaults are subject to ULP specific fixups later. */ 1052 opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); 1053 1054 opt2 |= V_PACE(0); 1055 1056 if (s->cong_algo >= 0) 1057 opt2 |= V_CONG_CNTRL(s->cong_algo); 1058 else if (sc->tt.cong_algorithm >= 0) 1059 opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); 1060 else { 1061 if (strcasecmp(cc->name, "reno") == 0) 1062 opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); 1063 else if (strcasecmp(cc->name, "tahoe") == 0) 1064 opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); 1065 if (strcasecmp(cc->name, "newreno") == 0) 1066 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); 1067 if (strcasecmp(cc->name, "highspeed") == 0) 1068 opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); 1069 else { 1070 /* 1071 * Use newreno in case the algorithm selected by the 1072 * host stack is not supported by the hardware. 1073 */ 1074 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); 1075 } 1076 } 1077 1078 if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) 1079 opt2 |= V_RX_COALESCE(M_RX_COALESCE); 1080 1081 /* Note that ofld_rxq is already set according to s->rxq. */ 1082 opt2 |= F_RSS_QUEUE_VALID; 1083 opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id); 1084 1085 #ifdef USE_DDP_RX_FLOW_CONTROL 1086 if (ulp_mode == ULP_MODE_TCPDDP) 1087 opt2 |= F_RX_FC_DDP; 1088 #endif 1089 1090 if (ulp_mode == ULP_MODE_TLS) { 1091 opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); 1092 opt2 |= F_RX_FC_DISABLE; 1093 } 1094 1095 return (htobe32(opt2)); 1096 } 1097 1098 static void 1099 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, 1100 struct in_conninfo *inc, struct tcphdr *th) 1101 { 1102 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1103 const struct ether_header *eh; 1104 unsigned int hlen = be32toh(cpl->hdr_len); 1105 uintptr_t l3hdr; 1106 const struct tcphdr *tcp; 1107 1108 eh = (const void *)(cpl + 1); 1109 if (chip_id(sc) >= CHELSIO_T6) { 1110 l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen)); 1111 tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen)); 1112 } else { 1113 l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen)); 1114 tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); 1115 } 1116 1117 if (inc) { 1118 bzero(inc, sizeof(*inc)); 1119 inc->inc_fport = tcp->th_sport; 1120 inc->inc_lport = tcp->th_dport; 1121 if (((struct ip *)l3hdr)->ip_v == IPVERSION) { 1122 const struct ip *ip = (const void *)l3hdr; 1123 1124 inc->inc_faddr = ip->ip_src; 1125 inc->inc_laddr = ip->ip_dst; 1126 } else { 1127 const struct ip6_hdr *ip6 = (const void *)l3hdr; 1128 1129 inc->inc_flags |= INC_ISIPV6; 1130 inc->inc6_faddr = ip6->ip6_src; 1131 inc->inc6_laddr = ip6->ip6_dst; 1132 } 1133 } 1134 1135 if (th) { 1136 bcopy(tcp, th, sizeof(*th)); 1137 tcp_fields_to_host(th); /* just like tcp_input */ 1138 } 1139 } 1140 1141 static struct l2t_entry * 1142 get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp, 1143 struct in_conninfo *inc) 1144 { 1145 struct l2t_entry *e; 1146 struct sockaddr_in6 sin6; 1147 struct sockaddr *dst = (void *)&sin6; 1148 1149 if (inc->inc_flags & INC_ISIPV6) { 1150 struct nhop6_basic nh6; 1151 1152 bzero(dst, sizeof(struct sockaddr_in6)); 1153 dst->sa_len = sizeof(struct sockaddr_in6); 1154 dst->sa_family = AF_INET6; 1155 1156 if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) { 1157 /* no need for route lookup */ 1158 e = t4_l2t_get(pi, ifp, dst); 1159 return (e); 1160 } 1161 1162 if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr, 1163 0, 0, 0, &nh6) != 0) 1164 return (NULL); 1165 if (nh6.nh_ifp != ifp) 1166 return (NULL); 1167 ((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr; 1168 } else { 1169 struct nhop4_basic nh4; 1170 1171 dst->sa_len = sizeof(struct sockaddr_in); 1172 dst->sa_family = AF_INET; 1173 1174 if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0, 1175 &nh4) != 0) 1176 return (NULL); 1177 if (nh4.nh_ifp != ifp) 1178 return (NULL); 1179 ((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr; 1180 } 1181 1182 e = t4_l2t_get(pi, ifp, dst); 1183 return (e); 1184 } 1185 1186 static int 1187 send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0, 1188 uint32_t opt2, int tid) 1189 { 1190 struct wrqe *wr; 1191 struct cpl_pass_accept_rpl *rpl; 1192 struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx]; 1193 1194 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) : 1195 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]); 1196 if (wr == NULL) 1197 return (ENOMEM); 1198 rpl = wrtod(wr); 1199 1200 if (is_t4(sc)) 1201 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid); 1202 else { 1203 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl; 1204 1205 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); 1206 rpl5->iss = htobe32(synqe->iss); 1207 } 1208 rpl->opt0 = opt0; 1209 rpl->opt2 = opt2; 1210 1211 return (t4_l2t_send(sc, wr, e)); 1212 } 1213 1214 #define REJECT_PASS_ACCEPT_REQ(tunnel) do { \ 1215 if (!tunnel) { \ 1216 m_freem(m); \ 1217 m = NULL; \ 1218 } \ 1219 reject_reason = __LINE__; \ 1220 goto reject; \ 1221 } while (0) 1222 1223 /* 1224 * The context associated with a tid entry via insert_tid could be a synq_entry 1225 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags. 1226 */ 1227 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags)); 1228 1229 /* 1230 * Incoming SYN on a listening socket. 1231 * 1232 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe, 1233 * etc. 1234 */ 1235 static int 1236 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, 1237 struct mbuf *m) 1238 { 1239 struct adapter *sc = iq->adapter; 1240 struct toedev *tod; 1241 const struct cpl_pass_accept_req *cpl = mtod(m, const void *); 1242 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1243 unsigned int tid = GET_TID(cpl); 1244 struct listen_ctx *lctx = lookup_stid(sc, stid); 1245 struct inpcb *inp; 1246 struct socket *so; 1247 struct in_conninfo inc; 1248 struct tcphdr th; 1249 struct tcpopt to; 1250 struct port_info *pi; 1251 struct vi_info *vi; 1252 struct ifnet *hw_ifp, *ifp; 1253 struct l2t_entry *e = NULL; 1254 struct synq_entry *synqe = NULL; 1255 int reject_reason, v, ntids; 1256 uint16_t vid, l2info; 1257 struct epoch_tracker et; 1258 #ifdef INVARIANTS 1259 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1260 #endif 1261 struct offload_settings settings; 1262 1263 KASSERT(opcode == CPL_PASS_ACCEPT_REQ, 1264 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1265 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1266 1267 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid, 1268 lctx); 1269 1270 CURVNET_SET(lctx->vnet); /* before any potential REJECT */ 1271 1272 /* 1273 * Use the MAC index to lookup the associated VI. If this SYN didn't 1274 * match a perfect MAC filter, punt. 1275 */ 1276 l2info = be16toh(cpl->l2info); 1277 pi = sc->port[G_SYN_INTF(l2info)]; 1278 if (!(l2info & F_SYN_XACT_MATCH)) { 1279 REJECT_PASS_ACCEPT_REQ(false); 1280 } 1281 for_each_vi(pi, v, vi) { 1282 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info)) 1283 goto found; 1284 } 1285 REJECT_PASS_ACCEPT_REQ(false); 1286 found: 1287 hw_ifp = vi->ifp; /* the cxgbe ifnet */ 1288 m->m_pkthdr.rcvif = hw_ifp; 1289 tod = TOEDEV(hw_ifp); 1290 1291 /* 1292 * Don't offload if the peer requested a TCP option that's not known to 1293 * the silicon. Send the SYN to the kernel instead. 1294 */ 1295 if (__predict_false(cpl->tcpopt.unknown)) 1296 REJECT_PASS_ACCEPT_REQ(true); 1297 1298 /* 1299 * Figure out if there is a pseudo interface (vlan, lagg, etc.) 1300 * involved. Don't offload if the SYN had a VLAN tag and the vid 1301 * doesn't match anything on this interface. 1302 * 1303 * XXX: lagg support, lagg + vlan support. 1304 */ 1305 vid = EVL_VLANOFTAG(be16toh(cpl->vlan)); 1306 if (vid != 0xfff && vid != 0) { 1307 ifp = VLAN_DEVAT(hw_ifp, vid); 1308 if (ifp == NULL) 1309 REJECT_PASS_ACCEPT_REQ(true); 1310 } else 1311 ifp = hw_ifp; 1312 1313 /* 1314 * Don't offload if the ifnet that the SYN came in on is not in the same 1315 * vnet as the listening socket. 1316 */ 1317 if (lctx->vnet != ifp->if_vnet) 1318 REJECT_PASS_ACCEPT_REQ(true); 1319 1320 pass_accept_req_to_protohdrs(sc, m, &inc, &th); 1321 if (inc.inc_flags & INC_ISIPV6) { 1322 1323 /* Don't offload if the ifcap isn't enabled */ 1324 if ((ifp->if_capenable & IFCAP_TOE6) == 0) 1325 REJECT_PASS_ACCEPT_REQ(true); 1326 1327 /* 1328 * SYN must be directed to an IP6 address on this ifnet. This 1329 * is more restrictive than in6_localip. 1330 */ 1331 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) 1332 REJECT_PASS_ACCEPT_REQ(true); 1333 1334 ntids = 2; 1335 } else { 1336 1337 /* Don't offload if the ifcap isn't enabled */ 1338 if ((ifp->if_capenable & IFCAP_TOE4) == 0) 1339 REJECT_PASS_ACCEPT_REQ(true); 1340 1341 /* 1342 * SYN must be directed to an IP address on this ifnet. This 1343 * is more restrictive than in_localip. 1344 */ 1345 if (!in_ifhasaddr(ifp, inc.inc_laddr)) 1346 REJECT_PASS_ACCEPT_REQ(true); 1347 1348 ntids = 1; 1349 } 1350 1351 e = get_l2te_for_nexthop(pi, ifp, &inc); 1352 if (e == NULL) 1353 REJECT_PASS_ACCEPT_REQ(true); 1354 1355 /* Don't offload if the 4-tuple is already in use */ 1356 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for 4-tuple check */ 1357 if (toe_4tuple_check(&inc, &th, ifp) != 0) { 1358 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1359 REJECT_PASS_ACCEPT_REQ(false); 1360 } 1361 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1362 1363 inp = lctx->inp; /* listening socket, not owned by TOE */ 1364 INP_WLOCK(inp); 1365 1366 /* Don't offload if the listening socket has closed */ 1367 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1368 INP_WUNLOCK(inp); 1369 REJECT_PASS_ACCEPT_REQ(false); 1370 } 1371 so = inp->inp_socket; 1372 rw_rlock(&sc->policy_lock); 1373 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 1374 EVL_MAKETAG(0xfff, 0, 0), inp); 1375 rw_runlock(&sc->policy_lock); 1376 if (!settings.offload) { 1377 INP_WUNLOCK(inp); 1378 REJECT_PASS_ACCEPT_REQ(true); /* Rejected by COP. */ 1379 } 1380 1381 synqe = alloc_synqe(sc, lctx, M_NOWAIT); 1382 if (synqe == NULL) { 1383 INP_WUNLOCK(inp); 1384 REJECT_PASS_ACCEPT_REQ(true); 1385 } 1386 atomic_store_int(&synqe->ok_to_respond, 0); 1387 1388 /* 1389 * If all goes well t4_syncache_respond will get called during 1390 * syncache_add. Note that syncache_add releases the pcb lock. 1391 */ 1392 t4opt_to_tcpopt(&cpl->tcpopt, &to); 1393 toe_syncache_add(&inc, &to, &th, inp, tod, synqe); 1394 1395 if (atomic_load_int(&synqe->ok_to_respond) > 0) { 1396 uint64_t opt0; 1397 uint32_t opt2; 1398 u_int wnd; 1399 int rscale, mtu_idx, rx_credits; 1400 1401 mtu_idx = find_best_mtu_idx(sc, &inc, &settings); 1402 rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0; 1403 wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND); 1404 wnd = min(wnd, MAX_RCV_WND); 1405 rx_credits = min(wnd >> 10, M_RCV_BUFSIZ); 1406 1407 save_qids_in_synqe(synqe, vi, &settings); 1408 synqe->ulp_mode = select_ulp_mode(so, sc, &settings); 1409 1410 opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, 1411 synqe->ulp_mode, &settings); 1412 opt2 = calc_opt2p(sc, pi, synqe->rxqid, &cpl->tcpopt, &th, 1413 synqe->ulp_mode, CC_ALGO(intotcpcb(inp)), &settings); 1414 1415 insert_tid(sc, tid, synqe, ntids); 1416 synqe->tid = tid; 1417 synqe->l2e_idx = e->idx; 1418 synqe->rcv_bufsize = rx_credits; 1419 synqe->syn = m; 1420 m = NULL; 1421 1422 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) { 1423 remove_tid(sc, tid, ntids); 1424 m = synqe->syn; 1425 synqe->syn = NULL; 1426 REJECT_PASS_ACCEPT_REQ(true); 1427 } 1428 1429 CTR6(KTR_CXGBE, 1430 "%s: stid %u, tid %u, lctx %p, synqe %p, mode %d, SYNACK", 1431 __func__, stid, tid, lctx, synqe, synqe->ulp_mode); 1432 } else 1433 REJECT_PASS_ACCEPT_REQ(false); 1434 1435 CURVNET_RESTORE(); 1436 return (0); 1437 reject: 1438 CURVNET_RESTORE(); 1439 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, 1440 reject_reason); 1441 1442 if (e) 1443 t4_l2t_release(e); 1444 release_tid(sc, tid, lctx->ctrlq); 1445 if (synqe) { 1446 inp = synqe->lctx->inp; 1447 INP_WLOCK(inp); 1448 inp = release_synqe(sc, synqe); 1449 if (inp) 1450 INP_WUNLOCK(inp); 1451 } 1452 1453 if (m) { 1454 /* 1455 * The connection request hit a TOE listener but is being passed 1456 * on to the kernel sw stack instead of getting offloaded. 1457 */ 1458 m_adj(m, sizeof(*cpl)); 1459 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | 1460 CSUM_DATA_VALID | CSUM_PSEUDO_HDR); 1461 m->m_pkthdr.csum_data = 0xffff; 1462 hw_ifp->if_input(hw_ifp, m); 1463 } 1464 1465 return (reject_reason); 1466 } 1467 1468 static void 1469 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe, 1470 const struct cpl_pass_establish *cpl, struct in_conninfo *inc, 1471 struct tcphdr *th, struct tcpopt *to) 1472 { 1473 uint16_t tcp_opt = be16toh(cpl->tcp_opt); 1474 1475 /* start off with the original SYN */ 1476 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th); 1477 1478 /* modify parts to make it look like the ACK to our SYN|ACK */ 1479 th->th_flags = TH_ACK; 1480 th->th_ack = synqe->iss + 1; 1481 th->th_seq = be32toh(cpl->rcv_isn); 1482 bzero(to, sizeof(*to)); 1483 if (G_TCPOPT_TSTAMP(tcp_opt)) { 1484 to->to_flags |= TOF_TS; 1485 to->to_tsecr = synqe->ts; 1486 } 1487 } 1488 1489 static int 1490 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, 1491 struct mbuf *m) 1492 { 1493 struct adapter *sc = iq->adapter; 1494 struct vi_info *vi; 1495 struct ifnet *ifp; 1496 const struct cpl_pass_establish *cpl = (const void *)(rss + 1); 1497 #if defined(KTR) || defined(INVARIANTS) 1498 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid)); 1499 #endif 1500 unsigned int tid = GET_TID(cpl); 1501 struct synq_entry *synqe = lookup_tid(sc, tid); 1502 struct listen_ctx *lctx = synqe->lctx; 1503 struct inpcb *inp = lctx->inp, *new_inp; 1504 struct socket *so; 1505 struct tcphdr th; 1506 struct tcpopt to; 1507 struct in_conninfo inc; 1508 struct toepcb *toep; 1509 struct epoch_tracker et; 1510 #ifdef INVARIANTS 1511 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); 1512 #endif 1513 1514 KASSERT(opcode == CPL_PASS_ESTABLISH, 1515 ("%s: unexpected opcode 0x%x", __func__, opcode)); 1516 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 1517 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__)); 1518 KASSERT(synqe->flags & TPF_SYNQE, 1519 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); 1520 1521 CURVNET_SET(lctx->vnet); 1522 INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for syncache_expand */ 1523 INP_WLOCK(inp); 1524 1525 CTR6(KTR_CXGBE, 1526 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x", 1527 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags); 1528 1529 ifp = synqe->syn->m_pkthdr.rcvif; 1530 vi = ifp->if_softc; 1531 KASSERT(vi->pi->adapter == sc, 1532 ("%s: vi %p, sc %p mismatch", __func__, vi, sc)); 1533 1534 if (__predict_false(inp->inp_flags & INP_DROPPED)) { 1535 reset: 1536 send_reset_synqe(TOEDEV(ifp), synqe); 1537 INP_WUNLOCK(inp); 1538 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1539 CURVNET_RESTORE(); 1540 return (0); 1541 } 1542 1543 KASSERT(synqe->rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0], 1544 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, 1545 synqe->rxqid, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0]))); 1546 1547 toep = alloc_toepcb(vi, synqe->txqid, synqe->rxqid, M_NOWAIT); 1548 if (toep == NULL) 1549 goto reset; 1550 toep->tid = tid; 1551 toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx]; 1552 toep->vnet = lctx->vnet; 1553 set_ulp_mode(toep, synqe->ulp_mode); 1554 toep->opt0_rcv_bufsize = synqe->rcv_bufsize; 1555 1556 MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss); 1557 MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs); 1558 synqe->tcp_opt = cpl->tcp_opt; 1559 synqe->toep = toep; 1560 1561 /* Come up with something that syncache_expand should be ok with. */ 1562 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to); 1563 if (inc.inc_flags & INC_ISIPV6) 1564 toep->ce = t4_hold_lip(sc, &inc.inc6_laddr, lctx->ce); 1565 so = inp->inp_socket; 1566 KASSERT(so != NULL, ("%s: socket is NULL", __func__)); 1567 1568 if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) { 1569 free_toepcb(toep); 1570 goto reset; 1571 } 1572 1573 /* New connection inpcb is already locked by syncache_expand(). */ 1574 new_inp = sotoinpcb(so); 1575 INP_WLOCK_ASSERT(new_inp); 1576 MPASS(so->so_vnet == lctx->vnet); 1577 1578 /* 1579 * This is for expansion from syncookies. 1580 * 1581 * XXX: we've held the tcbinfo lock throughout so there's no risk of 1582 * anyone accept'ing a connection before we've installed our hooks, but 1583 * this somewhat defeats the purpose of having a tod_offload_socket :-( 1584 */ 1585 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { 1586 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); 1587 t4_offload_socket(TOEDEV(ifp), synqe, so); 1588 } 1589 1590 INP_WUNLOCK(new_inp); 1591 1592 /* Done with the synqe */ 1593 inp = release_synqe(sc, synqe); 1594 if (inp != NULL) 1595 INP_WUNLOCK(inp); 1596 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1597 CURVNET_RESTORE(); 1598 1599 return (0); 1600 } 1601 1602 void 1603 t4_init_listen_cpl_handlers(void) 1604 { 1605 1606 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl); 1607 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl); 1608 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req); 1609 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish); 1610 } 1611 1612 void 1613 t4_uninit_listen_cpl_handlers(void) 1614 { 1615 1616 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL); 1617 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL); 1618 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL); 1619 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL); 1620 } 1621 #endif 1622