1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2020, The University of Queensland
14 * Copyright (c) 2018, Joyent, Inc.
15 * Copyright 2020 RackTop Systems, Inc.
16 */
17
18 /*
19 * Mellanox Connect-X 4/5/6 driver.
20 */
21
22 /*
23 * The PRM for this family of parts is freely available, and can be found at:
24 * https://www.mellanox.com/related-docs/user_manuals/ \
25 * Ethernet_Adapters_Programming_Manual.pdf
26 */
27 /*
28 * ConnectX glossary
29 * -----------------
30 *
31 * WR Work Request: something we've asked the hardware to do by
32 * creating a Work Queue Entry (WQE), e.g. send or recv a packet
33 *
34 * WQE Work Queue Entry: a descriptor on a work queue descriptor ring
35 *
36 * WQ Work Queue: a descriptor ring that we can place WQEs on, usually
37 * either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
38 * types have different WQE structures, different commands for
39 * creating and destroying them, etc, but share a common context
40 * structure, counter setup and state graph.
41 * SQ Send Queue, a specific type of WQ that sends packets
42 * RQ Receive Queue, a specific type of WQ that receives packets
43 *
44 * CQ Completion Queue: completions of WRs from a WQ are reported to
45 * one of these, as a CQE on its entry ring.
46 * CQE Completion Queue Entry: an entry in a CQ ring. Contains error
47 * info, as well as packet size, the ID of the WQ, and the index
48 * of the WQE which completed. Does not contain any packet data.
49 *
50 * EQ Event Queue: a ring of event structs from the hardware informing
51 * us when particular events happen. Many events can point at
52 * a particular CQ which we should then go look at.
53 * EQE Event Queue Entry: an entry on the EQ ring
54 *
55 * UAR User Access Region, a page of the device's PCI BAR which is
56 * tied to particular EQ/CQ/WQ sets and contains doorbells to
57 * ring to arm them for interrupts or wake them up for new work
58 *
59 * RQT RQ Table, a collection of indexed RQs used to refer to the group
60 * as a single unit (for e.g. hashing/RSS).
61 *
62 * TIR Transport Interface Receive, a bucket of resources for the
63 * reception of packets. TIRs have to point at either a single RQ
64 * or a table of RQs (RQT). They then serve as a target for flow
65 * table entries (FEs). TIRs that point at an RQT also contain the
66 * settings for hashing for RSS.
67 *
68 * TIS Transport Interface Send, a bucket of resources associated with
69 * the transmission of packets. In particular, the temporary
70 * resources used for LSO internally in the card are accounted to
71 * a TIS.
72 *
73 * FT Flow Table, a collection of FEs and FGs that can be referred to
74 * as a single entity (e.g. used as a target from another flow
75 * entry or set as the "root" table to handle incoming or outgoing
76 * packets). Packets arriving at a FT are matched against the
77 * FEs in the table until either one matches with a terminating
78 * action or all FEs are exhausted (it's first-match-wins but with
79 * some actions that are non-terminal, like counting actions).
80 *
81 * FG Flow Group, a group of FEs which share a common "mask" (i.e.
82 * they match on the same attributes of packets coming into the 83 * flow). 84 * 85 * FE Flow Entry, an individual set of values to match against 86 * packets entering the flow table, combined with an action to 87 * take upon a successful match. The action we use most is 88 * "forward", which sends the packets to a TIR or another flow 89 * table and then stops further processing within the FE's FT. 90 * 91 * lkey/mkey A reference to something similar to a page table but in the 92 * device's internal onboard MMU. Since Connect-X parts double as 93 * IB cards (lots of RDMA) they have extensive onboard memory mgmt 94 * features which we try very hard not to use. For our WQEs we use 95 * the "reserved" lkey, which is a special value which indicates 96 * that addresses we give are linear addresses and should not be 97 * translated. 98 * 99 * PD Protection Domain, an IB concept. We have to allocate one to 100 * provide as a parameter for new WQs, but we don't do anything 101 * with it. 102 * 103 * TDOM/TD Transport Domain, an IB concept. We allocate one in order to 104 * provide it as a parameter to TIR/TIS creation, but we don't do 105 * anything with it. 106 */ 107 /* 108 * 109 * Data flow overview 110 * ------------------ 111 * 112 * This driver is a MAC ring-enabled driver which maps rings to send and recv 113 * queues in hardware on the device. 114 * 115 * Each SQ and RQ is set up to report to its own individual CQ, to ensure 116 * sufficient space, and simplify the logic needed to work out which buffer 117 * was completed. 118 * 119 * The CQs are then round-robin allocated onto EQs, of which we set up one per 120 * interrupt that the system gives us for the device. Normally this means we 121 * have 8 EQs. 122 * 123 * When we have >= 8 EQs available, we try to allocate only RX or only TX 124 * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion. 125 * 126 * EQ #0 is reserved for all event types other than completion events, and has 127 * no CQs associated with it at any time. EQs #1 and upwards are only used for 128 * handling CQ completion events. 129 * 130 * +------+ +------+ +------+ +---------+ 131 * | SQ 0 |---->| CQ 0 |-----+ | EQ 0 |------> | MSI-X 0 | mlxcx_intr_0 132 * +------+ +------+ | +------+ +---------+ 133 * | 134 * +------+ +------+ | 135 * | SQ 1 |---->| CQ 1 |---+ | +------+ 136 * +------+ +------+ | +---> | | 137 * | | | 138 * +------+ +------+ | | EQ 1 | +---------+ 139 * | SQ 2 |---->| CQ 2 |---------> | |------> | MSI-X 1 | mlxcx_intr_n 140 * +------+ +------+ | +---> | | +---------+ 141 * | | +------+ 142 * | | 143 * ... | | 144 * | | +------+ 145 * +------+ +------+ +-----> | | 146 * | RQ 0 |---->| CQ 3 |---------> | | +---------+ 147 * +------+ +------+ | | EQ 2 |------> | MSI-X 2 | mlxcx_intr_n 148 * | | | +---------+ 149 * +------+ +------+ | +-> | | 150 * | RQ 1 |---->| CQ 4 |-----+ | +------+ 151 * +------+ +------+ | 152 * | .... 153 * +------+ +------+ | 154 * | RQ 2 |---->| CQ 5 |-------+ 155 * +------+ +------+ 156 * 157 * ... (note this diagram does not show RX-only or TX-only EQs) 158 * 159 * For TX, we advertise all of the SQs we create as plain rings to MAC with 160 * no TX groups. This puts MAC in "virtual group" mode where it will allocate 161 * and use the rings as it sees fit. 162 * 163 * For RX, we advertise actual groups in order to make use of hardware 164 * classification. 165 * 166 * The hardware classification we use is based around Flow Tables, and we 167 * currently ignore all of the eswitch features of the card. 
The NIC VPORT 168 * is always set to promisc mode so that the eswitch sends us all of the 169 * traffic that arrives on the NIC, and we use flow entries to manage 170 * everything. 171 * 172 * We use 2 layers of flow tables for classification: traffic arrives at the 173 * root RX flow table which contains MAC address filters. Those then send 174 * matched traffic to the per-group L1 VLAN filter tables which contain VLAN 175 * presence and VID filters. 176 * 177 * Since these parts only support doing RSS hashing on a single protocol at a 178 * time, we have to use a third layer of flow tables as well to break traffic 179 * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc) 180 * so that it can be sent to the appropriate TIR for hashing. 181 * 182 * Incoming packets 183 * + +---------+ +---------+ 184 * | +->| group 0 | | group 0 | 185 * | | | vlan ft | +-->| hash ft | 186 * v | | L1 | | | L2 | 187 * +----+----+ | +---------+ | +---------+ +-----+ +-----+------+ 188 * | eswitch | | | | | | TCPv6 |--->| TIR |--->| | RQ0 | 189 * +----+----+ | | | | +---------+ +-----+ | +------+ 190 * | | | | | | UDPv6 |--->| TIR |--->| | RQ1 | 191 * | | | | | +---------+ +-----+ | +------+ 192 * | | | | | | TCPv4 |--->| TIR |--->| | RQ2 | 193 * v | | | | +---------+ +-----+ | RQT +------+ 194 * +----+----+ | +---------+ | | UDPv4 |--->| TIR |--->| | ... | 195 * | root rx | | | default |--+ +---------+ +-----+ | | | 196 * | flow tb | | +---------+ | | IPv6 |--->| TIR |--->| | | 197 * | L0 | | | promisc |--+ +---------+ +-----+ | | | 198 * +---------+ | +---------+ ^ | IPv4 |--->| TIR |--->| | | 199 * | bcast |---|---------------+ +---------+ +-----+ +-----+------+ 200 * +---------+ | ^ | other |-+ 201 * | MAC 0 |---+ | +---------+ | +-----+ +-----+ 202 * +---------+ | +->| TIR |--->| RQ0 | 203 * | MAC 1 |-+ | +-----+ +-----+ 204 * +---------+ | +---------------+ 205 * | MAC 2 |-+ | ^ 206 * +---------+ | | | 207 * | MAC 3 |-+ | +---------+ | +---------+ 208 * +---------+ | | | group 1 | | | group 1 | 209 * | ..... | +--->| vlan ft | | +>| hash ft | 210 * | | | | L1 | | | | L2 | 211 * +---------+ | +---------+ | | +---------+ +-----+ +-----+------+ 212 * | promisc |---+ | VLAN 0 |----+ | TCPv6 |--->| TIR |--->| | RQ3 | 213 * +---------+ +---------+ | +---------+ +-----+ | +------+ 214 * | ..... | | | UDPv6 |--->| TIR |--->| | RQ4 | 215 * | | | +---------+ +-----+ | +------+ 216 * | | | | TCPv4 |--->| TIR |--->| | RQ5 | 217 * | | | +---------+ +-----+ | RQT +------+ 218 * +---------+ | | UDPv4 |--->| TIR |--->| | ... | 219 * | | | +---------+ +-----+ | | | 220 * +---------+ | | IPv6 |--->| TIR |--->| | | 221 * | promisc |--+ +---------+ +-----+ | | | 222 * +---------+ | IPv4 |--->| TIR |--->| | | 223 * +---------+ +-----+ +-----+------+ 224 * | other |-+ 225 * +---------+ | 226 * ....... | +-----+ +-----+ 227 * +->| TIR |--->| RQ3 | 228 * +-----+ +-----+ 229 * 230 * Note that the "promisc" flow entries are only set/enabled when promisc 231 * mode is enabled for the NIC. All promisc flow entries point directly at 232 * group 0's hashing flowtable (so all promisc-only traffic lands on group 0, 233 * the "default group" in MAC). 234 * 235 * The "default" entry in the L1 VLAN filter flow tables is used when there 236 * are no VLANs set for the group, to accept any traffic regardless of tag. It 237 * is deleted as soon as a VLAN filter is added (and re-instated if the 238 * last VLAN filter is removed). 
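 *
 * As a concrete (and deliberately simplified) illustration of how the driver
 * programs one of these layers, installing a unicast MAC steering rule in the
 * root table conceptually looks like the sketch below. This is only an
 * outline using the same structures the real code uses; the actual logic also
 * deals with entry reservation, multiple destination groups and locking:
 *
 *     mlxcx_flow_entry_t *fe = <a free entry in the root table's MAC group>;
 *     bcopy(mac, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
 *     fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
 *     fe->mlfe_dest[0].mlfed_flow = group->mlg_rx_vlan_ft;
 *     fe->mlfe_ndest = 1;
 *     fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
 *     (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
 *
 * Traffic matching the entry then continues into that group's VLAN table and
 * on through the hash tables and TIRs shown in the diagram above.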
239 *
240 * The actual descriptor ring structures for RX on Connect-X4 don't contain any
241 * space for packet data (they're a collection of scatter pointers only). TX
242 * descriptors contain some space for "inline headers" (and the card requires
243 * us to put at least the L2 Ethernet headers there for the eswitch to look at)
244 * but all the rest of the data comes from the gather pointers.
245 *
246 * When we get completions back they simply contain the ring index number of
247 * the WR (work request) which completed. So, we manage the buffers for actual
248 * packet data completely independently of the descriptors in this driver. When
249 * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
250 * with the WQE index that we put it at, and therefore don't have to look at
251 * the original descriptor at all when handling completions.
252 *
253 * For RX, we create sufficient packet data buffers to fill 150% of the
254 * available descriptors for each ring. These are all pre-set-up for DMA and
255 * have an mblk_t associated with them (with desballoc()).
256 *
257 * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
258 * large enough), or we copy it into a pre-allocated buffer set up in the same
259 * way as for RX.
260 */
261
262 /*
263 * Buffer lifecycle: RX
264 * --------------------
265 *
266 * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
267 * straightforward.
268 *
269 * It is created (and has all its memory allocated) at the time of starting up
270 * the RX ring it belongs to. Then it is placed on the "free" list in the
271 * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
272 * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
273 * before making a WQE for it.
274 *
275 * After a completion event occurs, the packet is either discarded (and the
276 * buffer_t returned to the free list), or it is readied for loaning to MAC.
277 *
278 * Once MAC and the rest of the system have finished with the packet, they call
279 * freemsg() on its mblk, which will call mlxcx_buf_mp_return and return the
280 * buffer_t to the free list.
281 *
282 * At detach/teardown time, buffers are only ever destroyed from the free list.
283 *
284 *
285 * +
286 * |
287 * | mlxcx_buf_create
288 * |
289 * v
290 * +----+----+
291 * | created |
292 * +----+----+
293 * |
294 * |
295 * | mlxcx_buf_return
296 * |
297 * v
298 * mlxcx_buf_destroy +----+----+
299 * +---------| free |<---------------+
300 * | +----+----+ |
301 * | | |
302 * | | | mlxcx_buf_return
303 * v | mlxcx_buf_take |
304 * +---+--+ v |
305 * | dead | +---+---+ |
306 * +------+ | on WQ |- - - - - - - - >O
307 * +---+---+ ^
308 * | |
309 * | |
310 * | mlxcx_buf_loan | mlxcx_buf_mp_return
311 * v |
312 * +-------+--------+ |
313 * | on loan to MAC |----------->O
314 * +----------------+ freemsg()
315 *
316 */
317
318 /*
319 * Buffer lifecycle: TX
320 * --------------------
321 *
322 * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
323 * "foreign" buffers.
324 *
325 * The former have their memory allocated and DMA bound by this driver, while
326 * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
327 * not owned by us, though we do DMA bind it (and take responsibility for
328 * un-binding it when we're done with them).
329 *
330 * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
331 * SQ. Thus, there is a separate free list and mutex for each kind.
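 *
 * Which kind of buffer gets used for a given mblk is the copy-vs-bind
 * decision mentioned in the data flow overview above. Conceptually (this is
 * only a sketch; the real code also builds the buffer chain, fills in the
 * inline headers and handles failures), the choice keys off the
 * "tx_bind_threshold" property loaded into mldp_tx_bind_threshold later in
 * this file:
 *
 *     if (MBLKL(mp) >= mlxp->mlx_props.mldp_tx_bind_threshold) {
 *             <take a buffer from the SQ's foreign shard and DMA bind
 *              the mblk's data into it>
 *     } else {
 *             <take a buffer from the SQ's local shard and bcopy the
 *              mblk's data into its pre-bound memory>
 *     }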
332 * 333 * Since a TX packet might consist of multiple mblks, we translate each mblk 334 * into exactly one buffer_t. The buffer_ts are chained together in the same 335 * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t. 336 * 337 * Each chain of TX buffers may consist of foreign or driver buffers, in any 338 * mixture. 339 * 340 * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes 341 * it from the rest of the chain buffers. 342 * 343 * TX buffer chains are always returned to the free list by 344 * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and 345 * freeing all of the members. 346 * 347 * We only call freemsg() once, on the head of the TX buffer chain's original 348 * mblk. This is true whether we copied it or bound it in a foreign buffer. 349 */ 350 351 /* 352 * Startup and command interface 353 * ----------------------------- 354 * 355 * The command interface is the primary way in which we give control orders to 356 * the hardware (e.g. actions like "create this queue" or "delete this flow 357 * entry"). The command interface is never used to transmit or receive packets 358 * -- that takes place only on the queues that are set up through it. 359 * 360 * In mlxcx_cmd.c we implement our use of the command interface on top of a 361 * simple taskq. Since it's not performance critical, we busy-wait on command 362 * completions and only process a single command at a time. 363 * 364 * If this becomes a problem later we can wire command completions up to EQ 0 365 * once we have interrupts running. 366 * 367 * The startup/attach process for this card involves a bunch of different steps 368 * which are summarised pretty well in the PRM. We have to send a number of 369 * commands which do different things to start the card up, give it some pages 370 * of our own memory for it to use, then start creating all the entities that 371 * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs 372 * and TDoms. 373 */ 374 375 /* 376 * UARs 377 * ---- 378 * 379 * The pages of the PCI BAR other than the first few are reserved for use as 380 * "UAR" sections in this device. Each UAR section can be used as a set of 381 * doorbells for our queues. 382 * 383 * Currently we just make one single UAR for all of our queues. It doesn't 384 * seem to be a major limitation yet. 385 * 386 * When we're sending packets through an SQ, the PRM is not awful clear about 387 * exactly how we're meant to use the first 16 bytes of the Blueflame buffers 388 * (it's clear on the pattern of alternation you're expected to use between 389 * even and odd for Blueflame sends, but not for regular doorbells). 390 * 391 * Currently we don't do the even-odd alternating pattern for ordinary 392 * doorbells, and we don't use Blueflame at all. This seems to work fine, at 393 * least on Connect-X4 Lx. 
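 *
 * Concretely, "ringing a doorbell" is nothing more than a 32- or 64-bit write
 * at a fixed offset within the UAR page, performed with the
 * mlxcx_uar_put32()/mlxcx_uar_put64() helpers defined later in this file.
 * As a rough sketch (the offset name and value encoding here are illustrative
 * only), re-arming a CQ after processing its completions looks something
 * like:
 *
 *     mlxcx_uar_put32(mlxp, &mlxp->mlx_uar, MLXCX_UAR_CQ_ARM, armval);
 *
 * where armval packs the CQ number and the current consumer counter so the
 * device knows which completions we have already seen.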
394 */ 395 396 /* 397 * Lock ordering 398 * ------------- 399 * 400 * Interrupt side: 401 * 402 * - mleq_mtx 403 * - mlcq_mtx 404 * - mlcq_bufbmtx 405 * - mlwq_mtx 406 * - mlbs_mtx 407 * - mlp_mtx 408 * 409 * GLD side: 410 * 411 * - mlp_mtx 412 * - mlg_mtx 413 * - mlg_*.mlft_mtx 414 * - mlp_*.mlft_mtx 415 * - mlwq_mtx 416 * - mlbs_mtx 417 * - mlcq_bufbmtx 418 * - mleq_mtx 419 * - mlcq_mtx 420 * 421 */ 422 423 #include <sys/modctl.h> 424 #include <sys/conf.h> 425 #include <sys/devops.h> 426 #include <sys/sysmacros.h> 427 #include <sys/time.h> 428 429 #include <sys/mac_provider.h> 430 431 #include <mlxcx.h> 432 433 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP); 434 435 #define MLXCX_MODULE_NAME "mlxcx" 436 /* 437 * We give this to the firmware, so it has to be in a fixed format that it 438 * understands. 439 */ 440 #define MLXCX_DRIVER_VERSION "illumos,mlxcx,1.0.0,1,000,000000" 441 442 /* 443 * Firmware may take a while to reclaim pages. Try a set number of times. 444 */ 445 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */ 446 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */ 447 448 static void *mlxcx_softstate; 449 450 /* 451 * Fault detection thresholds. 452 */ 453 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT; 454 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT; 455 456 static void 457 mlxcx_load_prop_defaults(mlxcx_t *mlxp) 458 { 459 mlxcx_drv_props_t *p = &mlxp->mlx_props; 460 mlxcx_port_t *port = &mlxp->mlx_ports[0]; 461 462 VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0); 463 VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0); 464 465 /* 466 * Currently we have different queue size defaults for two 467 * categories of queues. One set for devices which support a 468 * maximum speed of 10Gb/s, and another for those above that. 469 */ 470 if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G | 471 MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) { 472 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G; 473 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G; 474 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G; 475 } else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G | 476 MLXCX_PROTO_10G)) != 0) { 477 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 478 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 479 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 480 } else { 481 mlxcx_warn(mlxp, "Encountered a port with a speed we don't " 482 "recognize. Proto: 0x%x", port->mlp_max_proto); 483 p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT; 484 p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT; 485 p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT; 486 } 487 } 488 489 /* 490 * Properties which may have different defaults based on hardware 491 * characteristics. 
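 *
 * Like the rest of the tunables loaded in mlxcx_load_props() below, these are
 * fetched with ddi_getprop(), so they should be overridable from the driver's
 * .conf file if need be; e.g. a line like (value purely illustrative)
 *
 *     cq_size_shift = 12;
 *
 * would take precedence over the speed-based default chosen in
 * mlxcx_load_prop_defaults().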
492 */ 493 static void 494 mlxcx_load_model_props(mlxcx_t *mlxp) 495 { 496 mlxcx_drv_props_t *p = &mlxp->mlx_props; 497 498 mlxcx_load_prop_defaults(mlxp); 499 500 p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 501 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift", 502 p->mldp_cq_size_shift_default); 503 p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 504 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift", 505 p->mldp_sq_size_shift_default); 506 p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 507 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift", 508 p->mldp_rq_size_shift_default); 509 } 510 511 static void 512 mlxcx_load_props(mlxcx_t *mlxp) 513 { 514 mlxcx_drv_props_t *p = &mlxp->mlx_props; 515 516 p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 517 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift", 518 MLXCX_EQ_SIZE_SHIFT_DFLT); 519 p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 520 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec", 521 MLXCX_CQEMOD_PERIOD_USEC_DFLT); 522 p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 523 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count", 524 MLXCX_CQEMOD_COUNT_DFLT); 525 p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 526 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec", 527 MLXCX_INTRMOD_PERIOD_USEC_DFLT); 528 529 p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 530 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups", 531 MLXCX_TX_NGROUPS_DFLT); 532 p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 533 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group", 534 MLXCX_TX_NRINGS_PER_GROUP_DFLT); 535 536 p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 537 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large", 538 MLXCX_RX_NGROUPS_LARGE_DFLT); 539 p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 540 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small", 541 MLXCX_RX_NGROUPS_SMALL_DFLT); 542 p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY, 543 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 544 "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT); 545 p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY, 546 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 547 "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT); 548 549 p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 550 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift", 551 MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT); 552 553 p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 554 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold", 555 MLXCX_TX_BIND_THRESHOLD_DFLT); 556 557 p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 558 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift", 559 MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT); 560 561 p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 562 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 563 "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT); 564 p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 565 mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, 566 "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT); 567 p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY, 568 mlxp->mlx_dip, DDI_PROP_CANSLEEP | 
DDI_PROP_DONTPASS, 569 "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT); 570 571 p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip, 572 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion", 573 MLXCX_RX_PER_CQ_DEFAULT); 574 575 if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN || 576 p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) { 577 mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is " 578 "out of range. Defaulting to: %d. Valid values are from " 579 "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT, 580 MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX); 581 p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT; 582 } 583 } 584 585 void 586 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...) 587 { 588 va_list ap; 589 590 va_start(ap, fmt); 591 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 592 vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap); 593 } else { 594 vcmn_err(CE_NOTE, fmt, ap); 595 } 596 va_end(ap); 597 } 598 599 void 600 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...) 601 { 602 va_list ap; 603 604 va_start(ap, fmt); 605 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 606 vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap); 607 } else { 608 vcmn_err(CE_WARN, fmt, ap); 609 } 610 va_end(ap); 611 } 612 613 void 614 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...) 615 { 616 va_list ap; 617 618 va_start(ap, fmt); 619 if (mlxp != NULL && mlxp->mlx_dip != NULL) { 620 vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap); 621 } else { 622 vcmn_err(CE_PANIC, fmt, ap); 623 } 624 va_end(ap); 625 } 626 627 uint16_t 628 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off) 629 { 630 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 631 return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr)); 632 } 633 634 uint32_t 635 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off) 636 { 637 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 638 return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr)); 639 } 640 641 uint64_t 642 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off) 643 { 644 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 645 return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr)); 646 } 647 648 void 649 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val) 650 { 651 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 652 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 653 } 654 655 void 656 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val) 657 { 658 uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base; 659 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 660 } 661 662 void 663 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val) 664 { 665 /* 666 * The UAR is always inside the first BAR, which we mapped as 667 * mlx_regs 668 */ 669 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 670 (uintptr_t)mlxp->mlx_regs_base; 671 ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val); 672 } 673 674 void 675 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val) 676 { 677 uintptr_t addr = off + (uintptr_t)mlu->mlu_base + 678 (uintptr_t)mlxp->mlx_regs_base; 679 ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val); 680 } 681 682 static void 683 mlxcx_fm_fini(mlxcx_t *mlxp) 684 { 685 if (mlxp->mlx_fm_caps == 0) 686 return; 687 688 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 689 ddi_fm_handler_unregister(mlxp->mlx_dip); 690 691 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 692 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) 693 pci_ereport_teardown(mlxp->mlx_dip); 694 695 ddi_fm_fini(mlxp->mlx_dip); 696 697 mlxp->mlx_fm_caps = 0; 698 } 699 700 void 701 mlxcx_fm_ereport(mlxcx_t *mlxp, const char 
*detail) 702 { 703 uint64_t ena; 704 char buf[FM_MAX_CLASS]; 705 706 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 707 return; 708 709 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); 710 ena = fm_ena_generate(0, FM_ENA_FMT1); 711 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 712 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 713 NULL); 714 } 715 716 static int 717 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg) 718 { 719 /* 720 * as the driver can always deal with an error in any dma or 721 * access handle, we can just return the fme_status value. 722 */ 723 pci_ereport_post(dip, err, NULL); 724 return (err->fme_status); 725 } 726 727 static void 728 mlxcx_fm_init(mlxcx_t *mlxp) 729 { 730 ddi_iblock_cookie_t iblk; 731 int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 732 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE; 733 734 mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip, 735 DDI_PROP_DONTPASS, "fm_capable", def); 736 737 if (mlxp->mlx_fm_caps < 0) { 738 mlxp->mlx_fm_caps = 0; 739 } 740 mlxp->mlx_fm_caps &= def; 741 742 if (mlxp->mlx_fm_caps == 0) 743 return; 744 745 ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk); 746 if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) || 747 DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 748 pci_ereport_setup(mlxp->mlx_dip); 749 } 750 if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) { 751 ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb, 752 (void *)mlxp); 753 } 754 } 755 756 static void 757 mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s) 758 { 759 mlxcx_buffer_t *buf; 760 761 mutex_enter(&s->mlbs_mtx); 762 while (!list_is_empty(&s->mlbs_busy)) 763 cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx); 764 while ((buf = list_head(&s->mlbs_free)) != NULL) { 765 mlxcx_buf_destroy(mlxp, buf); 766 } 767 list_destroy(&s->mlbs_free); 768 list_destroy(&s->mlbs_busy); 769 mutex_exit(&s->mlbs_mtx); 770 771 cv_destroy(&s->mlbs_free_nonempty); 772 mutex_destroy(&s->mlbs_mtx); 773 } 774 775 static void 776 mlxcx_teardown_bufs(mlxcx_t *mlxp) 777 { 778 mlxcx_buf_shard_t *s; 779 780 while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) { 781 mlxcx_mlbs_teardown(mlxp, s); 782 kmem_free(s, sizeof (mlxcx_buf_shard_t)); 783 } 784 list_destroy(&mlxp->mlx_buf_shards); 785 786 kmem_cache_destroy(mlxp->mlx_bufs_cache); 787 } 788 789 static void 790 mlxcx_teardown_pages(mlxcx_t *mlxp) 791 { 792 uint_t nzeros = 0; 793 794 mutex_enter(&mlxp->mlx_pagemtx); 795 796 while (mlxp->mlx_npages > 0) { 797 int32_t req, ret; 798 uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES]; 799 800 ASSERT0(avl_is_empty(&mlxp->mlx_pages)); 801 req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES); 802 803 if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) { 804 mlxcx_warn(mlxp, "hardware refused to return pages, " 805 "leaking %u remaining pages", mlxp->mlx_npages); 806 goto out; 807 } 808 809 for (int32_t i = 0; i < ret; i++) { 810 mlxcx_dev_page_t *mdp, probe; 811 bzero(&probe, sizeof (probe)); 812 probe.mxdp_pa = pas[i]; 813 814 mdp = avl_find(&mlxp->mlx_pages, &probe, NULL); 815 816 if (mdp != NULL) { 817 avl_remove(&mlxp->mlx_pages, mdp); 818 mlxp->mlx_npages--; 819 mlxcx_dma_free(&mdp->mxdp_dma); 820 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 821 } else { 822 mlxcx_panic(mlxp, "hardware returned a page " 823 "with PA 0x%" PRIx64 " but we have no " 824 "record of giving out such a page", pas[i]); 825 } 826 } 827 828 /* 829 * If no pages were returned, note that fact. 
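 * Firmware is allowed to take some time to hand pages back, so an
 * empty return isn't immediately fatal: we wait mlxcx_reclaim_delay
 * microseconds and retry, and only give up (leaking whatever pages
 * remain) once we have seen mlxcx_reclaim_tries empty returns.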
830 */ 831 if (ret == 0) { 832 nzeros++; 833 if (nzeros > mlxcx_reclaim_tries) { 834 mlxcx_warn(mlxp, "hardware refused to return " 835 "pages, leaking %u remaining pages", 836 mlxp->mlx_npages); 837 goto out; 838 } 839 delay(drv_usectohz(mlxcx_reclaim_delay)); 840 } 841 } 842 843 avl_destroy(&mlxp->mlx_pages); 844 845 out: 846 mutex_exit(&mlxp->mlx_pagemtx); 847 mutex_destroy(&mlxp->mlx_pagemtx); 848 } 849 850 static boolean_t 851 mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 852 { 853 ddi_device_acc_attr_t acc; 854 ddi_dma_attr_t attr; 855 boolean_t ret; 856 size_t sz, i; 857 858 VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC); 859 860 mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift; 861 mleq->mleq_nents = (1 << mleq->mleq_entshift); 862 sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t); 863 ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0); 864 865 mlxcx_dma_acc_attr(mlxp, &acc); 866 mlxcx_dma_queue_attr(mlxp, &attr); 867 868 ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc, 869 B_TRUE, sz, B_TRUE); 870 if (!ret) { 871 mlxcx_warn(mlxp, "failed to allocate EQ memory"); 872 return (B_FALSE); 873 } 874 875 mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va; 876 877 for (i = 0; i < mleq->mleq_nents; ++i) 878 mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT; 879 880 mleq->mleq_state |= MLXCX_EQ_ALLOC; 881 882 return (B_TRUE); 883 } 884 885 static void 886 mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq) 887 { 888 VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC); 889 if (mleq->mleq_state & MLXCX_EQ_CREATED) 890 VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED); 891 892 mlxcx_dma_free(&mleq->mleq_dma); 893 mleq->mleq_ent = NULL; 894 895 mleq->mleq_state &= ~MLXCX_EQ_ALLOC; 896 } 897 898 void 899 mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft) 900 { 901 mlxcx_flow_group_t *fg; 902 mlxcx_flow_entry_t *fe; 903 int i; 904 905 ASSERT(mutex_owned(&ft->mlft_mtx)); 906 907 for (i = ft->mlft_nents - 1; i >= 0; --i) { 908 fe = &ft->mlft_ent[i]; 909 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 910 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 911 mlxcx_panic(mlxp, "failed to delete flow " 912 "entry %u on table %u", i, 913 ft->mlft_num); 914 } 915 } 916 } 917 918 while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) { 919 if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED && 920 !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) { 921 if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) { 922 mlxcx_panic(mlxp, "failed to destroy flow " 923 "group %u", fg->mlfg_num); 924 } 925 } 926 kmem_free(fg, sizeof (mlxcx_flow_group_t)); 927 } 928 list_destroy(&ft->mlft_groups); 929 if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED && 930 !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) { 931 if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) { 932 mlxcx_panic(mlxp, "failed to destroy flow table %u", 933 ft->mlft_num); 934 } 935 } 936 kmem_free(ft->mlft_ent, ft->mlft_entsize); 937 ft->mlft_ent = NULL; 938 mutex_exit(&ft->mlft_mtx); 939 mutex_destroy(&ft->mlft_mtx); 940 kmem_free(ft, sizeof (mlxcx_flow_table_t)); 941 } 942 943 static void 944 mlxcx_teardown_ports(mlxcx_t *mlxp) 945 { 946 uint_t i; 947 mlxcx_port_t *p; 948 mlxcx_flow_table_t *ft; 949 950 for (i = 0; i < mlxp->mlx_nports; ++i) { 951 p = &mlxp->mlx_ports[i]; 952 if (!(p->mlp_init & MLXCX_PORT_INIT)) 953 continue; 954 mutex_enter(&p->mlp_mtx); 955 if ((ft = p->mlp_rx_flow) != NULL) { 956 mutex_enter(&ft->mlft_mtx); 957 /* 958 * teardown_flow_table() will destroy the mutex, so 959 * we don't release it here. 
960 */ 961 mlxcx_teardown_flow_table(mlxp, ft); 962 } 963 mutex_exit(&p->mlp_mtx); 964 mutex_destroy(&p->mlp_mtx); 965 p->mlp_init &= ~MLXCX_PORT_INIT; 966 } 967 968 kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size); 969 mlxp->mlx_ports = NULL; 970 } 971 972 static void 973 mlxcx_teardown_wqs(mlxcx_t *mlxp) 974 { 975 mlxcx_work_queue_t *mlwq; 976 977 while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) { 978 mlxcx_wq_teardown(mlxp, mlwq); 979 } 980 list_destroy(&mlxp->mlx_wqs); 981 } 982 983 static void 984 mlxcx_teardown_cqs(mlxcx_t *mlxp) 985 { 986 mlxcx_completion_queue_t *mlcq; 987 988 while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) { 989 mlxcx_cq_teardown(mlxp, mlcq); 990 } 991 list_destroy(&mlxp->mlx_cqs); 992 } 993 994 static void 995 mlxcx_teardown_eqs(mlxcx_t *mlxp) 996 { 997 mlxcx_event_queue_t *mleq; 998 uint_t i; 999 1000 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1001 mleq = &mlxp->mlx_eqs[i]; 1002 mutex_enter(&mleq->mleq_mtx); 1003 if ((mleq->mleq_state & MLXCX_EQ_CREATED) && 1004 !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) { 1005 if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) { 1006 mlxcx_warn(mlxp, "failed to destroy " 1007 "event queue idx %u eqn %u", 1008 i, mleq->mleq_num); 1009 } 1010 } 1011 if (mleq->mleq_state & MLXCX_EQ_ALLOC) { 1012 mlxcx_eq_rele_dma(mlxp, mleq); 1013 } 1014 mutex_exit(&mleq->mleq_mtx); 1015 } 1016 } 1017 1018 static void 1019 mlxcx_teardown_checktimers(mlxcx_t *mlxp) 1020 { 1021 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) 1022 ddi_periodic_delete(mlxp->mlx_eq_checktimer); 1023 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) 1024 ddi_periodic_delete(mlxp->mlx_cq_checktimer); 1025 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) 1026 ddi_periodic_delete(mlxp->mlx_wq_checktimer); 1027 } 1028 1029 static void 1030 mlxcx_teardown(mlxcx_t *mlxp) 1031 { 1032 uint_t i; 1033 dev_info_t *dip = mlxp->mlx_dip; 1034 1035 if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) { 1036 mlxcx_teardown_groups(mlxp); 1037 mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS; 1038 } 1039 1040 if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) { 1041 mlxcx_teardown_checktimers(mlxp); 1042 mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS; 1043 } 1044 1045 if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) { 1046 mlxcx_teardown_wqs(mlxp); 1047 mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS; 1048 } 1049 1050 if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) { 1051 mlxcx_teardown_cqs(mlxp); 1052 mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS; 1053 } 1054 1055 if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) { 1056 mlxcx_teardown_bufs(mlxp); 1057 mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS; 1058 } 1059 1060 if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) { 1061 mlxcx_teardown_ports(mlxp); 1062 mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS; 1063 } 1064 1065 if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) { 1066 mlxcx_teardown_eqs(mlxp); 1067 mlxcx_intr_teardown(mlxp); 1068 mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS; 1069 } 1070 1071 if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) { 1072 if (mlxp->mlx_uar.mlu_allocated) { 1073 if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) { 1074 mlxcx_warn(mlxp, "failed to release UAR"); 1075 } 1076 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) 1077 mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx); 1078 } 1079 if (mlxp->mlx_pd.mlpd_allocated && 1080 !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) { 1081 mlxcx_warn(mlxp, "failed to release PD"); 1082 } 1083 if (mlxp->mlx_tdom.mltd_allocated && 1084 !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) { 1085 mlxcx_warn(mlxp, "failed to release TDOM"); 1086 } 1087 mlxp->mlx_attach &= 
~MLXCX_ATTACH_UAR_PD_TD; 1088 } 1089 1090 if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) { 1091 if (!mlxcx_cmd_teardown_hca(mlxp)) { 1092 mlxcx_warn(mlxp, "failed to send teardown HCA " 1093 "command during device detach"); 1094 } 1095 mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA; 1096 } 1097 1098 if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) { 1099 mlxcx_teardown_pages(mlxp); 1100 mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST; 1101 } 1102 1103 if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) { 1104 if (!mlxcx_cmd_disable_hca(mlxp)) { 1105 mlxcx_warn(mlxp, "failed to send DISABLE HCA command " 1106 "during device detach"); 1107 } 1108 mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA; 1109 } 1110 1111 if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) { 1112 mlxcx_cmd_queue_fini(mlxp); 1113 mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD; 1114 } 1115 1116 if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) { 1117 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 1118 mlxp->mlx_caps = NULL; 1119 mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS; 1120 } 1121 1122 if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) { 1123 ddi_regs_map_free(&mlxp->mlx_regs_handle); 1124 mlxp->mlx_regs_handle = NULL; 1125 mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS; 1126 } 1127 1128 if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) { 1129 pci_config_teardown(&mlxp->mlx_cfg_handle); 1130 mlxp->mlx_cfg_handle = NULL; 1131 mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG; 1132 } 1133 1134 if (mlxp->mlx_attach & MLXCX_ATTACH_FM) { 1135 mlxcx_fm_fini(mlxp); 1136 mlxp->mlx_attach &= ~MLXCX_ATTACH_FM; 1137 } 1138 1139 VERIFY3S(mlxp->mlx_attach, ==, 0); 1140 ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst); 1141 ddi_set_driver_private(dip, NULL); 1142 } 1143 1144 static boolean_t 1145 mlxcx_regs_map(mlxcx_t *mlxp) 1146 { 1147 off_t memsize; 1148 int ret; 1149 ddi_device_acc_attr_t da; 1150 1151 if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) != 1152 DDI_SUCCESS) { 1153 mlxcx_warn(mlxp, "failed to get register set size"); 1154 return (B_FALSE); 1155 } 1156 1157 /* 1158 * All data in the main BAR is kept in big-endian even though it's a PCI 1159 * device. 
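 * Mapping it with DDI_STRUCTURE_BE_ACC below means the DDI access
 * routines (and hence the mlxcx_get*()/mlxcx_put*() wrappers above)
 * handle any byte-swapping for us, so the rest of the driver sees
 * register values in host byte order.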
1160 */
1161 bzero(&da, sizeof (ddi_device_acc_attr_t));
1162 da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
1163 da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
1164 da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
1165 if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
1166 da.devacc_attr_access = DDI_FLAGERR_ACC;
1167 } else {
1168 da.devacc_attr_access = DDI_DEFAULT_ACC;
1169 }
1170
1171 ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
1172 &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);
1173
1174 if (ret != DDI_SUCCESS) {
1175 mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
1176 return (B_FALSE);
1177 }
1178
1179 return (B_TRUE);
1180 }
1181
1182 static boolean_t
1183 mlxcx_check_issi(mlxcx_t *mlxp)
1184 {
1185 uint32_t issi;
1186
1187 if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1188 mlxcx_warn(mlxp, "failed to get ISSI");
1189 return (B_FALSE);
1190 }
1191
1192 if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1193 mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1194 "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1195 return (B_FALSE);
1196 }
1197
1198 if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1199 mlxcx_warn(mlxp, "failed to set ISSI to %u",
1200 MLXCX_CURRENT_ISSI);
1201 return (B_FALSE);
1202 }
1203
1204 return (B_TRUE);
1205 }
1206
1207 boolean_t
1208 mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages)
1209 {
1210 ddi_device_acc_attr_t acc;
1211 ddi_dma_attr_t attr;
1212 int32_t i;
1213 list_t plist;
1214 mlxcx_dev_page_t *mdp;
1215 const ddi_dma_cookie_t *ck;
1216
1217 /*
1218 * If there are no pages required, then we're done here.
1219 */
1220 if (npages <= 0) {
1221 return (B_TRUE);
1222 }
1223
1224 list_create(&plist, sizeof (mlxcx_dev_page_t),
1225 offsetof(mlxcx_dev_page_t, mxdp_list));
1226
1227 for (i = 0; i < npages; i++) {
1228 mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
1229 mlxcx_dma_acc_attr(mlxp, &acc);
1230 mlxcx_dma_page_attr(mlxp, &attr);
1231 if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
1232 B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
1233 mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
1234 npages);
1235 kmem_free(mdp, sizeof (mlxcx_dev_page_t));
1236 goto cleanup_npages;
1237 }
1238 ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
1239 mdp->mxdp_pa = ck->dmac_laddress;
1240
1241 list_insert_tail(&plist, mdp);
1242 }
1243
1244 /*
1245 * Now that all of the pages have been allocated, give them to hardware
1246 * in chunks.
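 * Each MANAGE_PAGES command can only carry MLXCX_MANAGE_PAGES_MAX_PAGES
 * addresses, so we hand over at most that many pages per command until
 * npages is exhausted.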
1247 */ 1248 while (npages > 0) { 1249 mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES]; 1250 int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages); 1251 1252 for (i = 0; i < togive; i++) { 1253 pages[i] = list_remove_head(&plist); 1254 } 1255 1256 if (!mlxcx_cmd_give_pages(mlxp, 1257 MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) { 1258 mlxcx_warn(mlxp, "!hardware refused our gift of %u " 1259 "pages!", togive); 1260 for (i = 0; i < togive; i++) { 1261 list_insert_tail(&plist, pages[i]); 1262 } 1263 goto cleanup_npages; 1264 } 1265 1266 mutex_enter(&mlxp->mlx_pagemtx); 1267 for (i = 0; i < togive; i++) { 1268 avl_add(&mlxp->mlx_pages, pages[i]); 1269 } 1270 mlxp->mlx_npages += togive; 1271 mutex_exit(&mlxp->mlx_pagemtx); 1272 npages -= togive; 1273 } 1274 1275 list_destroy(&plist); 1276 1277 return (B_TRUE); 1278 1279 cleanup_npages: 1280 while ((mdp = list_remove_head(&plist)) != NULL) { 1281 mlxcx_dma_free(&mdp->mxdp_dma); 1282 kmem_free(mdp, sizeof (mlxcx_dev_page_t)); 1283 } 1284 list_destroy(&plist); 1285 return (B_FALSE); 1286 } 1287 1288 static boolean_t 1289 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type) 1290 { 1291 int32_t npages; 1292 1293 if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) { 1294 mlxcx_warn(mlxp, "failed to determine boot pages"); 1295 return (B_FALSE); 1296 } 1297 1298 return (mlxcx_give_pages(mlxp, npages)); 1299 } 1300 1301 static int 1302 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags) 1303 { 1304 mlxcx_t *mlxp = cookie; 1305 mlxcx_buffer_t *b = arg; 1306 1307 bzero(b, sizeof (mlxcx_buffer_t)); 1308 b->mlb_mlx = mlxp; 1309 b->mlb_state = MLXCX_BUFFER_INIT; 1310 list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t), 1311 offsetof(mlxcx_buffer_t, mlb_tx_chain_entry)); 1312 1313 return (0); 1314 } 1315 1316 static void 1317 mlxcx_bufs_cache_destr(void *arg, void *cookie) 1318 { 1319 mlxcx_t *mlxp = cookie; 1320 mlxcx_buffer_t *b = arg; 1321 VERIFY3P(b->mlb_mlx, ==, mlxp); 1322 VERIFY(b->mlb_state == MLXCX_BUFFER_INIT); 1323 list_destroy(&b->mlb_tx_chain); 1324 } 1325 1326 mlxcx_buf_shard_t * 1327 mlxcx_mlbs_create(mlxcx_t *mlxp) 1328 { 1329 mlxcx_buf_shard_t *s; 1330 1331 s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP); 1332 1333 mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER, 1334 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1335 list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t), 1336 offsetof(mlxcx_buffer_t, mlb_entry)); 1337 list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t), 1338 offsetof(mlxcx_buffer_t, mlb_entry)); 1339 cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL); 1340 1341 list_insert_tail(&mlxp->mlx_buf_shards, s); 1342 1343 return (s); 1344 } 1345 1346 static boolean_t 1347 mlxcx_setup_bufs(mlxcx_t *mlxp) 1348 { 1349 char namebuf[KSTAT_STRLEN]; 1350 1351 (void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache", 1352 ddi_get_instance(mlxp->mlx_dip)); 1353 mlxp->mlx_bufs_cache = kmem_cache_create(namebuf, 1354 sizeof (mlxcx_buffer_t), sizeof (uint64_t), 1355 mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr, 1356 NULL, mlxp, NULL, 0); 1357 1358 list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t), 1359 offsetof(mlxcx_buf_shard_t, mlbs_entry)); 1360 1361 return (B_TRUE); 1362 } 1363 1364 static void 1365 mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum, 1366 const char *state, uint8_t statenum) 1367 { 1368 uint64_t ena; 1369 char buf[FM_MAX_CLASS]; 1370 1371 if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps)) 1372 return; 1373 1374 (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", 1375 
MLXCX_FM_SERVICE_MLXCX, "qstate.err"); 1376 ena = fm_ena_generate(0, FM_ENA_FMT1); 1377 1378 ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP, 1379 FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0, 1380 "state", DATA_TYPE_STRING, state, 1381 "state_num", DATA_TYPE_UINT8, statenum, 1382 "qtype", DATA_TYPE_STRING, qtype, 1383 "qnum", DATA_TYPE_UINT32, qnum, 1384 NULL); 1385 ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED); 1386 } 1387 1388 static void 1389 mlxcx_eq_check(void *arg) 1390 { 1391 mlxcx_t *mlxp = (mlxcx_t *)arg; 1392 mlxcx_event_queue_t *eq; 1393 mlxcx_eventq_ctx_t ctx; 1394 const char *str; 1395 1396 uint_t i; 1397 1398 for (i = 0; i < mlxp->mlx_intr_count; ++i) { 1399 eq = &mlxp->mlx_eqs[i]; 1400 if (!(eq->mleq_state & MLXCX_EQ_CREATED) || 1401 (eq->mleq_state & MLXCX_EQ_DESTROYED)) 1402 continue; 1403 mutex_enter(&eq->mleq_mtx); 1404 if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) { 1405 mutex_exit(&eq->mleq_mtx); 1406 continue; 1407 } 1408 1409 str = "???"; 1410 switch (ctx.mleqc_status) { 1411 case MLXCX_EQ_STATUS_OK: 1412 break; 1413 case MLXCX_EQ_STATUS_WRITE_FAILURE: 1414 str = "WRITE_FAILURE"; 1415 break; 1416 } 1417 if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) { 1418 mlxcx_fm_qstate_ereport(mlxp, "event", 1419 eq->mleq_num, str, ctx.mleqc_status); 1420 mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)", 1421 eq->mleq_intr_index, ctx.mleqc_status, str); 1422 } 1423 1424 if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED && 1425 (eq->mleq_state & MLXCX_EQ_ARMED)) { 1426 if (eq->mleq_cc == eq->mleq_check_disarm_cc && 1427 ++eq->mleq_check_disarm_cnt >= 3) { 1428 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1429 mlxcx_warn(mlxp, "EQ %u isn't armed", 1430 eq->mleq_intr_index); 1431 } 1432 eq->mleq_check_disarm_cc = eq->mleq_cc; 1433 } else { 1434 eq->mleq_check_disarm_cc = 0; 1435 eq->mleq_check_disarm_cnt = 0; 1436 } 1437 1438 mutex_exit(&eq->mleq_mtx); 1439 } 1440 } 1441 1442 static void 1443 mlxcx_cq_check(void *arg) 1444 { 1445 mlxcx_t *mlxp = (mlxcx_t *)arg; 1446 mlxcx_completion_queue_t *cq; 1447 mlxcx_completionq_ctx_t ctx; 1448 const char *str, *type; 1449 uint_t v; 1450 1451 for (cq = list_head(&mlxp->mlx_cqs); cq != NULL; 1452 cq = list_next(&mlxp->mlx_cqs, cq)) { 1453 mutex_enter(&cq->mlcq_mtx); 1454 if (!(cq->mlcq_state & MLXCX_CQ_CREATED) || 1455 (cq->mlcq_state & MLXCX_CQ_DESTROYED) || 1456 (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) { 1457 mutex_exit(&cq->mlcq_mtx); 1458 continue; 1459 } 1460 if (cq->mlcq_fm_repd_qstate) { 1461 mutex_exit(&cq->mlcq_mtx); 1462 continue; 1463 } 1464 if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) { 1465 mutex_exit(&cq->mlcq_mtx); 1466 continue; 1467 } 1468 if (cq->mlcq_wq != NULL) { 1469 mlxcx_work_queue_t *wq = cq->mlcq_wq; 1470 if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ) 1471 type = "rx "; 1472 else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ) 1473 type = "tx "; 1474 else 1475 type = ""; 1476 } else { 1477 type = ""; 1478 } 1479 1480 str = "???"; 1481 v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS); 1482 switch (v) { 1483 case MLXCX_CQC_STATUS_OK: 1484 break; 1485 case MLXCX_CQC_STATUS_OVERFLOW: 1486 str = "OVERFLOW"; 1487 break; 1488 case MLXCX_CQC_STATUS_WRITE_FAIL: 1489 str = "WRITE_FAIL"; 1490 break; 1491 case MLXCX_CQC_STATUS_INVALID: 1492 str = "INVALID"; 1493 break; 1494 } 1495 if (v != MLXCX_CQC_STATUS_OK) { 1496 mlxcx_fm_qstate_ereport(mlxp, "completion", 1497 cq->mlcq_num, str, v); 1498 mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)", 1499 type, cq->mlcq_num, v, str); 1500 cq->mlcq_fm_repd_qstate = B_TRUE; 1501 } 1502 1503 v 
= get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE); 1504 if (v != MLXCX_CQC_STATE_ARMED && 1505 (cq->mlcq_state & MLXCX_CQ_ARMED) && 1506 !(cq->mlcq_state & MLXCX_CQ_POLLING)) { 1507 if (cq->mlcq_cc == cq->mlcq_check_disarm_cc && 1508 ++cq->mlcq_check_disarm_cnt >= 3) { 1509 mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL); 1510 mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed", 1511 type, cq->mlcq_num, cq); 1512 } 1513 cq->mlcq_check_disarm_cc = cq->mlcq_cc; 1514 } else { 1515 cq->mlcq_check_disarm_cnt = 0; 1516 cq->mlcq_check_disarm_cc = 0; 1517 } 1518 mutex_exit(&cq->mlcq_mtx); 1519 } 1520 } 1521 1522 void 1523 mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq) 1524 { 1525 mlxcx_sq_ctx_t ctx; 1526 mlxcx_sq_state_t state; 1527 1528 ASSERT(mutex_owned(&sq->mlwq_mtx)); 1529 1530 if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx)) 1531 return; 1532 1533 ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num); 1534 state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE); 1535 switch (state) { 1536 case MLXCX_SQ_STATE_RST: 1537 if (sq->mlwq_state & MLXCX_WQ_STARTED) { 1538 mlxcx_fm_qstate_ereport(mlxp, "send", 1539 sq->mlwq_num, "RST", state); 1540 sq->mlwq_fm_repd_qstate = B_TRUE; 1541 } 1542 break; 1543 case MLXCX_SQ_STATE_RDY: 1544 if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) { 1545 mlxcx_fm_qstate_ereport(mlxp, "send", 1546 sq->mlwq_num, "RDY", state); 1547 sq->mlwq_fm_repd_qstate = B_TRUE; 1548 } 1549 break; 1550 case MLXCX_SQ_STATE_ERR: 1551 mlxcx_fm_qstate_ereport(mlxp, "send", 1552 sq->mlwq_num, "ERR", state); 1553 sq->mlwq_fm_repd_qstate = B_TRUE; 1554 break; 1555 default: 1556 mlxcx_fm_qstate_ereport(mlxp, "send", 1557 sq->mlwq_num, "???", state); 1558 sq->mlwq_fm_repd_qstate = B_TRUE; 1559 break; 1560 } 1561 } 1562 1563 void 1564 mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq) 1565 { 1566 mlxcx_rq_ctx_t ctx; 1567 mlxcx_rq_state_t state; 1568 1569 ASSERT(mutex_owned(&rq->mlwq_mtx)); 1570 1571 if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx)) 1572 return; 1573 1574 ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num); 1575 state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE); 1576 switch (state) { 1577 case MLXCX_RQ_STATE_RST: 1578 if (rq->mlwq_state & MLXCX_WQ_STARTED) { 1579 mlxcx_fm_qstate_ereport(mlxp, "receive", 1580 rq->mlwq_num, "RST", state); 1581 rq->mlwq_fm_repd_qstate = B_TRUE; 1582 } 1583 break; 1584 case MLXCX_RQ_STATE_RDY: 1585 if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) { 1586 mlxcx_fm_qstate_ereport(mlxp, "receive", 1587 rq->mlwq_num, "RDY", state); 1588 rq->mlwq_fm_repd_qstate = B_TRUE; 1589 } 1590 break; 1591 case MLXCX_RQ_STATE_ERR: 1592 mlxcx_fm_qstate_ereport(mlxp, "receive", 1593 rq->mlwq_num, "ERR", state); 1594 rq->mlwq_fm_repd_qstate = B_TRUE; 1595 break; 1596 default: 1597 mlxcx_fm_qstate_ereport(mlxp, "receive", 1598 rq->mlwq_num, "???", state); 1599 rq->mlwq_fm_repd_qstate = B_TRUE; 1600 break; 1601 } 1602 } 1603 1604 static void 1605 mlxcx_wq_check(void *arg) 1606 { 1607 mlxcx_t *mlxp = (mlxcx_t *)arg; 1608 mlxcx_work_queue_t *wq; 1609 1610 for (wq = list_head(&mlxp->mlx_wqs); wq != NULL; 1611 wq = list_next(&mlxp->mlx_wqs, wq)) { 1612 mutex_enter(&wq->mlwq_mtx); 1613 if (!(wq->mlwq_state & MLXCX_WQ_CREATED) || 1614 (wq->mlwq_state & MLXCX_WQ_DESTROYED) || 1615 (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) { 1616 mutex_exit(&wq->mlwq_mtx); 1617 continue; 1618 } 1619 if (wq->mlwq_fm_repd_qstate) { 1620 mutex_exit(&wq->mlwq_mtx); 1621 continue; 1622 } 1623 switch (wq->mlwq_type) { 1624 case MLXCX_WQ_TYPE_SENDQ: 1625 mlxcx_check_sq(mlxp, wq); 1626 break; 1627 case 
MLXCX_WQ_TYPE_RECVQ: 1628 mlxcx_check_rq(mlxp, wq); 1629 break; 1630 } 1631 mutex_exit(&wq->mlwq_mtx); 1632 } 1633 } 1634 1635 static boolean_t 1636 mlxcx_setup_checktimers(mlxcx_t *mlxp) 1637 { 1638 if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) { 1639 mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp, 1640 mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC, 1641 DDI_IPL_0); 1642 } 1643 if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) { 1644 mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp, 1645 mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC, 1646 DDI_IPL_0); 1647 } 1648 if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) { 1649 mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp, 1650 mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC, 1651 DDI_IPL_0); 1652 } 1653 return (B_TRUE); 1654 } 1655 1656 int 1657 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1) 1658 { 1659 const mlxcx_flow_entry_t *left = arg0; 1660 const mlxcx_flow_entry_t *right = arg1; 1661 int bcmpr; 1662 1663 bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac, 1664 sizeof (left->mlfe_dmac)); 1665 if (bcmpr < 0) 1666 return (-1); 1667 if (bcmpr > 0) 1668 return (1); 1669 if (left->mlfe_vid < right->mlfe_vid) 1670 return (-1); 1671 if (left->mlfe_vid > right->mlfe_vid) 1672 return (1); 1673 return (0); 1674 } 1675 1676 int 1677 mlxcx_grmac_compare(const void *arg0, const void *arg1) 1678 { 1679 const mlxcx_group_mac_t *left = arg0; 1680 const mlxcx_group_mac_t *right = arg1; 1681 int bcmpr; 1682 1683 bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac, 1684 sizeof (left->mlgm_mac)); 1685 if (bcmpr < 0) 1686 return (-1); 1687 if (bcmpr > 0) 1688 return (1); 1689 return (0); 1690 } 1691 1692 int 1693 mlxcx_page_compare(const void *arg0, const void *arg1) 1694 { 1695 const mlxcx_dev_page_t *p0 = arg0; 1696 const mlxcx_dev_page_t *p1 = arg1; 1697 1698 if (p0->mxdp_pa < p1->mxdp_pa) 1699 return (-1); 1700 if (p0->mxdp_pa > p1->mxdp_pa) 1701 return (1); 1702 return (0); 1703 } 1704 1705 static boolean_t 1706 mlxcx_setup_ports(mlxcx_t *mlxp) 1707 { 1708 uint_t i, j; 1709 mlxcx_port_t *p; 1710 mlxcx_flow_table_t *ft; 1711 mlxcx_flow_group_t *fg; 1712 mlxcx_flow_entry_t *fe; 1713 1714 VERIFY3U(mlxp->mlx_nports, >, 0); 1715 mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t); 1716 mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP); 1717 1718 for (i = 0; i < mlxp->mlx_nports; ++i) { 1719 p = &mlxp->mlx_ports[i]; 1720 p->mlp_num = i; 1721 p->mlp_init |= MLXCX_PORT_INIT; 1722 mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER, 1723 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 1724 mutex_enter(&p->mlp_mtx); 1725 if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) { 1726 mutex_exit(&p->mlp_mtx); 1727 goto err; 1728 } 1729 if (!mlxcx_cmd_query_port_mtu(mlxp, p)) { 1730 mutex_exit(&p->mlp_mtx); 1731 goto err; 1732 } 1733 if (!mlxcx_cmd_query_port_status(mlxp, p)) { 1734 mutex_exit(&p->mlp_mtx); 1735 goto err; 1736 } 1737 if (!mlxcx_cmd_query_port_speed(mlxp, p)) { 1738 mutex_exit(&p->mlp_mtx); 1739 goto err; 1740 } 1741 if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p, 1742 MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) { 1743 mutex_exit(&p->mlp_mtx); 1744 goto err; 1745 } 1746 1747 mutex_exit(&p->mlp_mtx); 1748 } 1749 1750 for (i = 0; i < mlxp->mlx_nports; ++i) { 1751 p = &mlxp->mlx_ports[i]; 1752 mutex_enter(&p->mlp_mtx); 1753 p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t), 1754 KM_SLEEP)); 1755 mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER, 1756 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 
1757 1758 mutex_enter(&ft->mlft_mtx); 1759 1760 ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX; 1761 ft->mlft_port = p; 1762 ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift; 1763 if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift) 1764 ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift; 1765 ft->mlft_nents = (1 << ft->mlft_entshift); 1766 ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t); 1767 ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP); 1768 list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t), 1769 offsetof(mlxcx_flow_group_t, mlfg_entry)); 1770 1771 for (j = 0; j < ft->mlft_nents; ++j) { 1772 ft->mlft_ent[j].mlfe_table = ft; 1773 ft->mlft_ent[j].mlfe_index = j; 1774 } 1775 1776 if (!mlxcx_cmd_create_flow_table(mlxp, ft)) { 1777 mutex_exit(&ft->mlft_mtx); 1778 mutex_exit(&p->mlp_mtx); 1779 goto err; 1780 } 1781 1782 if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) { 1783 mutex_exit(&ft->mlft_mtx); 1784 mutex_exit(&p->mlp_mtx); 1785 goto err; 1786 } 1787 1788 /* 1789 * We match broadcast at the top of the root flow table, then 1790 * all multicast/unicast MACs, then the promisc entry is down 1791 * the very bottom. 1792 * 1793 * This way when promisc is on, that entry simply catches any 1794 * remaining traffic that earlier flows haven't matched. 1795 */ 1796 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1797 list_insert_tail(&ft->mlft_groups, fg); 1798 fg->mlfg_table = ft; 1799 fg->mlfg_size = 1; 1800 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1801 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1802 mutex_exit(&ft->mlft_mtx); 1803 mutex_exit(&p->mlp_mtx); 1804 goto err; 1805 } 1806 p->mlp_bcast = fg; 1807 fe = list_head(&fg->mlfg_entries); 1808 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1809 (void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac)); 1810 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1811 1812 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1813 list_insert_tail(&ft->mlft_groups, fg); 1814 fg->mlfg_table = ft; 1815 fg->mlfg_size = ft->mlft_nents - 2; 1816 fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC; 1817 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1818 mutex_exit(&ft->mlft_mtx); 1819 mutex_exit(&p->mlp_mtx); 1820 goto err; 1821 } 1822 p->mlp_umcast = fg; 1823 1824 fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP); 1825 list_insert_tail(&ft->mlft_groups, fg); 1826 fg->mlfg_table = ft; 1827 fg->mlfg_size = 1; 1828 if (!mlxcx_setup_flow_group(mlxp, ft, fg)) { 1829 mutex_exit(&ft->mlft_mtx); 1830 mutex_exit(&p->mlp_mtx); 1831 goto err; 1832 } 1833 p->mlp_promisc = fg; 1834 fe = list_head(&fg->mlfg_entries); 1835 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 1836 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1837 1838 avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare, 1839 sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t, 1840 mlfe_dmac_entry)); 1841 1842 mutex_exit(&ft->mlft_mtx); 1843 mutex_exit(&p->mlp_mtx); 1844 } 1845 1846 return (B_TRUE); 1847 1848 err: 1849 mlxcx_teardown_ports(mlxp); 1850 return (B_FALSE); 1851 } 1852 1853 void 1854 mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g) 1855 { 1856 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1857 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1858 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1859 mlxcx_flow_entry_t *fe; 1860 mlxcx_group_vlan_t *v; 1861 1862 ASSERT(mutex_owned(&g->mlg_mtx)); 1863 1864 mutex_enter(&ft->mlft_mtx); 1865 1866 if (!list_is_empty(&g->mlg_rx_vlans)) { 1867 fe = list_head(&dfg->mlfg_entries); 1868 (void) 
mlxcx_cmd_set_flow_table_entry(mlxp, fe); 1869 } 1870 1871 while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) { 1872 fe = v->mlgv_fe; 1873 ASSERT3P(fe->mlfe_table, ==, ft); 1874 ASSERT3P(fe->mlfe_group, ==, fg); 1875 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1876 1877 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1878 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1879 } 1880 1881 mutex_exit(&ft->mlft_mtx); 1882 } 1883 1884 boolean_t 1885 mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, 1886 boolean_t tagged, uint16_t vid) 1887 { 1888 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1889 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1890 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1891 mlxcx_flow_entry_t *fe; 1892 mlxcx_group_vlan_t *v; 1893 boolean_t found = B_FALSE; 1894 1895 ASSERT(mutex_owned(&g->mlg_mtx)); 1896 1897 mutex_enter(&ft->mlft_mtx); 1898 1899 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1900 v = list_next(&g->mlg_rx_vlans, v)) { 1901 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1902 found = B_TRUE; 1903 break; 1904 } 1905 } 1906 if (!found) { 1907 mutex_exit(&ft->mlft_mtx); 1908 return (B_FALSE); 1909 } 1910 1911 list_remove(&g->mlg_rx_vlans, v); 1912 1913 /* 1914 * If this is the last VLAN entry, we have to go back to accepting 1915 * any VLAN (which means re-enabling the default entry). 1916 * 1917 * Do this before we remove the flow entry for the last specific 1918 * VLAN so that we don't lose any traffic in the transition. 1919 */ 1920 if (list_is_empty(&g->mlg_rx_vlans)) { 1921 fe = list_head(&dfg->mlfg_entries); 1922 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 1923 list_insert_tail(&g->mlg_rx_vlans, v); 1924 mutex_exit(&ft->mlft_mtx); 1925 return (B_FALSE); 1926 } 1927 } 1928 1929 fe = v->mlgv_fe; 1930 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED); 1931 ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED); 1932 ASSERT3P(fe->mlfe_table, ==, ft); 1933 ASSERT3P(fe->mlfe_group, ==, fg); 1934 1935 if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) { 1936 list_insert_tail(&g->mlg_rx_vlans, v); 1937 fe = list_head(&dfg->mlfg_entries); 1938 if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) { 1939 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 1940 } 1941 mutex_exit(&ft->mlft_mtx); 1942 return (B_FALSE); 1943 } 1944 1945 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 1946 1947 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 1948 1949 mutex_exit(&ft->mlft_mtx); 1950 return (B_TRUE); 1951 } 1952 1953 boolean_t 1954 mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged, 1955 uint16_t vid) 1956 { 1957 mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft; 1958 mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg; 1959 mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg; 1960 mlxcx_flow_entry_t *fe; 1961 mlxcx_group_vlan_t *v; 1962 boolean_t found = B_FALSE; 1963 boolean_t first = B_FALSE; 1964 1965 ASSERT(mutex_owned(&g->mlg_mtx)); 1966 1967 mutex_enter(&ft->mlft_mtx); 1968 1969 for (v = list_head(&g->mlg_rx_vlans); v != NULL; 1970 v = list_next(&g->mlg_rx_vlans, v)) { 1971 if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) { 1972 mutex_exit(&ft->mlft_mtx); 1973 return (B_TRUE); 1974 } 1975 } 1976 if (list_is_empty(&g->mlg_rx_vlans)) 1977 first = B_TRUE; 1978 1979 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 1980 fe = list_next(&fg->mlfg_entries, fe)) { 1981 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 1982 found = B_TRUE; 1983 break; 1984 } 1985 } 1986 if (!found) { 1987 mutex_exit(&ft->mlft_mtx); 1988 return (B_FALSE); 1989 } 
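/*
 * Flow entry slots cycle through a simple state pattern in this file:
 * RESERVED marks a slot claimed by software, DIRTY marks an entry whose
 * fields still have to be pushed to the hardware, and CREATED (presumably
 * set once mlxcx_cmd_set_flow_table_entry() succeeds) marks entries that
 * exist in the hardware table. The claim path used below is roughly:
 *
 *	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;	claim the slot
 *	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;	needs a SET command
 *	mlxcx_cmd_set_flow_table_entry(mlxp, fe);	program the hardware
 *
 * and on failure both flags are cleared again so the slot can be reused.
 */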
1990 1991 v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP); 1992 v->mlgv_fe = fe; 1993 v->mlgv_tagged = tagged; 1994 v->mlgv_vid = vid; 1995 1996 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 1997 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 1998 fe->mlfe_vid = vid; 1999 if (tagged) { 2000 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN; 2001 } else { 2002 fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE; 2003 } 2004 2005 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2006 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2007 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2008 kmem_free(v, sizeof (mlxcx_group_vlan_t)); 2009 mutex_exit(&ft->mlft_mtx); 2010 return (B_FALSE); 2011 } 2012 2013 list_insert_tail(&g->mlg_rx_vlans, v); 2014 2015 /* 2016 * If the vlan list was empty for this group before adding this one, 2017 * then we no longer want the "default" entry to allow all VLANs 2018 * through. 2019 */ 2020 if (first) { 2021 fe = list_head(&dfg->mlfg_entries); 2022 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2023 } 2024 2025 mutex_exit(&ft->mlft_mtx); 2026 return (B_TRUE); 2027 } 2028 2029 void 2030 mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port, 2031 mlxcx_ring_group_t *group) 2032 { 2033 mlxcx_flow_entry_t *fe; 2034 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2035 mlxcx_group_mac_t *gm, *ngm; 2036 2037 ASSERT(mutex_owned(&port->mlp_mtx)); 2038 ASSERT(mutex_owned(&group->mlg_mtx)); 2039 2040 mutex_enter(&ft->mlft_mtx); 2041 2042 gm = avl_first(&group->mlg_rx_macs); 2043 for (; gm != NULL; gm = ngm) { 2044 ngm = AVL_NEXT(&group->mlg_rx_macs, gm); 2045 2046 ASSERT3P(gm->mlgm_group, ==, group); 2047 fe = gm->mlgm_fe; 2048 ASSERT3P(fe->mlfe_table, ==, ft); 2049 2050 avl_remove(&group->mlg_rx_macs, gm); 2051 list_remove(&fe->mlfe_ring_groups, gm); 2052 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2053 2054 fe->mlfe_ndest = 0; 2055 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2056 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2057 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2058 gm->mlgm_group->mlg_rx_vlan_ft; 2059 } 2060 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2061 2062 if (fe->mlfe_ndest > 0) { 2063 (void) mlxcx_cmd_set_flow_table_entry(mlxp, fe); 2064 continue; 2065 } 2066 2067 /* 2068 * There are no more ring groups left for this MAC (it wasn't 2069 * attached to any other groups since ndest == 0), so clean up 2070 * its flow entry. 
2071 */ 2072 avl_remove(&port->mlp_dmac_fe, fe); 2073 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2074 list_destroy(&fe->mlfe_ring_groups); 2075 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2076 } 2077 2078 mutex_exit(&ft->mlft_mtx); 2079 } 2080 2081 boolean_t 2082 mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2083 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2084 { 2085 mlxcx_flow_entry_t *fe; 2086 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2087 mlxcx_group_mac_t *gm, probe; 2088 2089 ASSERT(mutex_owned(&port->mlp_mtx)); 2090 ASSERT(mutex_owned(&group->mlg_mtx)); 2091 2092 bzero(&probe, sizeof (probe)); 2093 bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac)); 2094 2095 mutex_enter(&ft->mlft_mtx); 2096 2097 gm = avl_find(&group->mlg_rx_macs, &probe, NULL); 2098 if (gm == NULL) { 2099 mutex_exit(&ft->mlft_mtx); 2100 return (B_FALSE); 2101 } 2102 ASSERT3P(gm->mlgm_group, ==, group); 2103 ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac))); 2104 2105 fe = gm->mlgm_fe; 2106 ASSERT3P(fe->mlfe_table, ==, ft); 2107 ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac))); 2108 2109 list_remove(&fe->mlfe_ring_groups, gm); 2110 avl_remove(&group->mlg_rx_macs, gm); 2111 kmem_free(gm, sizeof (mlxcx_group_mac_t)); 2112 2113 fe->mlfe_ndest = 0; 2114 for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL; 2115 gm = list_next(&fe->mlfe_ring_groups, gm)) { 2116 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = 2117 gm->mlgm_group->mlg_rx_vlan_ft; 2118 } 2119 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2120 2121 if (fe->mlfe_ndest > 0) { 2122 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2123 mutex_exit(&ft->mlft_mtx); 2124 return (B_FALSE); 2125 } 2126 mutex_exit(&ft->mlft_mtx); 2127 return (B_TRUE); 2128 } 2129 2130 /* 2131 * There are no more ring groups left for this MAC (it wasn't attached 2132 * to any other groups since ndest == 0), so clean up its flow entry. 
2133 */ 2134 avl_remove(&port->mlp_dmac_fe, fe); 2135 (void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe); 2136 list_destroy(&fe->mlfe_ring_groups); 2137 2138 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2139 2140 mutex_exit(&ft->mlft_mtx); 2141 2142 return (B_TRUE); 2143 } 2144 2145 boolean_t 2146 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port, 2147 mlxcx_ring_group_t *group, const uint8_t *macaddr) 2148 { 2149 mlxcx_flow_group_t *fg; 2150 mlxcx_flow_entry_t *fe, probe; 2151 mlxcx_flow_table_t *ft = port->mlp_rx_flow; 2152 mlxcx_group_mac_t *gm; 2153 boolean_t found = B_FALSE; 2154 2155 ASSERT(mutex_owned(&port->mlp_mtx)); 2156 ASSERT(mutex_owned(&group->mlg_mtx)); 2157 2158 bzero(&probe, sizeof (probe)); 2159 bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac)); 2160 2161 mutex_enter(&ft->mlft_mtx); 2162 2163 fe = avl_find(&port->mlp_dmac_fe, &probe, NULL); 2164 2165 if (fe == NULL) { 2166 fg = port->mlp_umcast; 2167 for (fe = list_head(&fg->mlfg_entries); fe != NULL; 2168 fe = list_next(&fg->mlfg_entries, fe)) { 2169 if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) { 2170 found = B_TRUE; 2171 break; 2172 } 2173 } 2174 if (!found) { 2175 mutex_exit(&ft->mlft_mtx); 2176 return (B_FALSE); 2177 } 2178 list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t), 2179 offsetof(mlxcx_group_mac_t, mlgm_fe_entry)); 2180 fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED; 2181 fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD; 2182 bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)); 2183 2184 avl_add(&port->mlp_dmac_fe, fe); 2185 } 2186 2187 fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft; 2188 fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY; 2189 2190 if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) { 2191 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY; 2192 if (--fe->mlfe_ndest == 0) { 2193 fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED; 2194 } 2195 mutex_exit(&ft->mlft_mtx); 2196 return (B_FALSE); 2197 } 2198 2199 gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP); 2200 gm->mlgm_group = group; 2201 gm->mlgm_fe = fe; 2202 bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)); 2203 avl_add(&group->mlg_rx_macs, gm); 2204 list_insert_tail(&fe->mlfe_ring_groups, gm); 2205 2206 mutex_exit(&ft->mlft_mtx); 2207 2208 return (B_TRUE); 2209 } 2210 2211 boolean_t 2212 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft, 2213 mlxcx_flow_group_t *fg) 2214 { 2215 mlxcx_flow_entry_t *fe; 2216 uint_t i, idx; 2217 2218 ASSERT(mutex_owned(&ft->mlft_mtx)); 2219 ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED); 2220 ASSERT3P(fg->mlfg_table, ==, ft); 2221 2222 if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents) 2223 return (B_FALSE); 2224 fg->mlfg_start_idx = ft->mlft_next_ent; 2225 2226 if (!mlxcx_cmd_create_flow_group(mlxp, fg)) { 2227 return (B_FALSE); 2228 } 2229 2230 list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t), 2231 offsetof(mlxcx_flow_entry_t, mlfe_group_entry)); 2232 for (i = 0; i < fg->mlfg_size; ++i) { 2233 idx = fg->mlfg_start_idx + i; 2234 fe = &ft->mlft_ent[idx]; 2235 fe->mlfe_group = fg; 2236 list_insert_tail(&fg->mlfg_entries, fe); 2237 } 2238 fg->mlfg_avail = fg->mlfg_size; 2239 ft->mlft_next_ent += fg->mlfg_size; 2240 2241 return (B_TRUE); 2242 } 2243 2244 static boolean_t 2245 mlxcx_setup_eq0(mlxcx_t *mlxp) 2246 { 2247 mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0]; 2248 2249 mutex_enter(&mleq->mleq_mtx); 2250 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2251 /* mlxcx_teardown_eqs() will clean this up */ 2252 mutex_exit(&mleq->mleq_mtx); 2253 return (B_FALSE); 
2254 } 2255 mleq->mleq_mlx = mlxp; 2256 mleq->mleq_uar = &mlxp->mlx_uar; 2257 mleq->mleq_events = 2258 (1ULL << MLXCX_EVENT_PAGE_REQUEST) | 2259 (1ULL << MLXCX_EVENT_PORT_STATE) | 2260 (1ULL << MLXCX_EVENT_INTERNAL_ERROR) | 2261 (1ULL << MLXCX_EVENT_PORT_MODULE) | 2262 (1ULL << MLXCX_EVENT_SENDQ_DRAIN) | 2263 (1ULL << MLXCX_EVENT_LAST_WQE) | 2264 (1ULL << MLXCX_EVENT_CQ_ERROR) | 2265 (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) | 2266 (1ULL << MLXCX_EVENT_PAGE_FAULT) | 2267 (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) | 2268 (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) | 2269 (1ULL << MLXCX_EVENT_NIC_VPORT) | 2270 (1ULL << MLXCX_EVENT_DOORBELL_CONGEST); 2271 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2272 /* mlxcx_teardown_eqs() will clean this up */ 2273 mutex_exit(&mleq->mleq_mtx); 2274 return (B_FALSE); 2275 } 2276 if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) { 2277 /* 2278 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and 2279 * eq_rele_dma 2280 */ 2281 mutex_exit(&mleq->mleq_mtx); 2282 return (B_FALSE); 2283 } 2284 mlxcx_arm_eq(mlxp, mleq); 2285 mutex_exit(&mleq->mleq_mtx); 2286 return (B_TRUE); 2287 } 2288 2289 int 2290 mlxcx_cq_compare(const void *arg0, const void *arg1) 2291 { 2292 const mlxcx_completion_queue_t *left = arg0; 2293 const mlxcx_completion_queue_t *right = arg1; 2294 2295 if (left->mlcq_num < right->mlcq_num) { 2296 return (-1); 2297 } 2298 if (left->mlcq_num > right->mlcq_num) { 2299 return (1); 2300 } 2301 return (0); 2302 } 2303 2304 static boolean_t 2305 mlxcx_setup_eqs(mlxcx_t *mlxp) 2306 { 2307 uint_t i; 2308 mlxcx_event_queue_t *mleq; 2309 2310 ASSERT3S(mlxp->mlx_intr_count, >, 0); 2311 2312 for (i = 1; i < mlxp->mlx_intr_count; ++i) { 2313 mleq = &mlxp->mlx_eqs[i]; 2314 mutex_enter(&mleq->mleq_mtx); 2315 if (!mlxcx_eq_alloc_dma(mlxp, mleq)) { 2316 mutex_exit(&mleq->mleq_mtx); 2317 return (B_FALSE); 2318 } 2319 mleq->mleq_uar = &mlxp->mlx_uar; 2320 if (!mlxcx_cmd_create_eq(mlxp, mleq)) { 2321 /* mlxcx_teardown() will handle calling eq_rele_dma */ 2322 mutex_exit(&mleq->mleq_mtx); 2323 return (B_FALSE); 2324 } 2325 if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 && 2326 !mlxcx_cmd_set_int_mod(mlxp, i, 2327 mlxp->mlx_props.mldp_intrmod_period_usec)) { 2328 mutex_exit(&mleq->mleq_mtx); 2329 return (B_FALSE); 2330 } 2331 if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) { 2332 mutex_exit(&mleq->mleq_mtx); 2333 return (B_FALSE); 2334 } 2335 mlxcx_arm_eq(mlxp, mleq); 2336 mutex_exit(&mleq->mleq_mtx); 2337 } 2338 2339 mlxp->mlx_next_eq = 1; 2340 2341 return (B_TRUE); 2342 } 2343 2344 /* 2345 * Snapshot all of the hardware capabilities that we care about and then modify 2346 * the HCA capabilities to get things moving. 
2347 */ 2348 static boolean_t 2349 mlxcx_init_caps(mlxcx_t *mlxp) 2350 { 2351 mlxcx_caps_t *c; 2352 2353 mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP); 2354 2355 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2356 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) { 2357 mlxcx_warn(mlxp, "failed to obtain current HCA general caps"); 2358 } 2359 2360 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL, 2361 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) { 2362 mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps"); 2363 } 2364 2365 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2366 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) { 2367 mlxcx_warn(mlxp, "failed to obtain current HCA eth caps"); 2368 } 2369 2370 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET, 2371 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) { 2372 mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps"); 2373 } 2374 2375 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2376 MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) { 2377 mlxcx_warn(mlxp, "failed to obtain current HCA flow caps"); 2378 } 2379 2380 if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW, 2381 MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) { 2382 mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps"); 2383 } 2384 2385 /* 2386 * Check the caps meet our requirements. 2387 */ 2388 const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general; 2389 2390 if (gen->mlcap_general_log_pg_sz != 12) { 2391 mlxcx_warn(mlxp, "!hardware has page size != 4k " 2392 "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz); 2393 goto err; 2394 } 2395 if (gen->mlcap_general_cqe_version != 1) { 2396 mlxcx_warn(mlxp, "!hardware does not support CQE v1 " 2397 "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version); 2398 goto err; 2399 } 2400 if (gen->mlcap_general_port_type != 2401 MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) { 2402 mlxcx_warn(mlxp, "!hardware has non-ethernet ports"); 2403 goto err; 2404 } 2405 mlxp->mlx_nports = gen->mlcap_general_num_ports; 2406 mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F)); 2407 2408 c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir); 2409 2410 c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2411 MLXCX_ETH_CAP_CSUM_CAP); 2412 c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags, 2413 MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN); 2414 2415 c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2416 mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP)); 2417 if (c->mlc_max_lso_size == 1) { 2418 c->mlc_max_lso_size = 0; 2419 c->mlc_lso = B_FALSE; 2420 } else { 2421 c->mlc_lso = B_TRUE; 2422 } 2423 2424 c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth. 2425 mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP)); 2426 2427 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2428 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) { 2429 mlxcx_warn(mlxp, "!hardware does not support rx flow tables"); 2430 goto err; 2431 } 2432 if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2433 mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) { 2434 mlxcx_warn(mlxp, "!hardware does not support modifying rx " 2435 "flow table entries"); 2436 goto err; 2437 } 2438 2439 c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx. 2440 mlcap_flow_prop_log_max_ft_size; 2441 c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow. 
2442 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow); 2443 c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow. 2444 mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination); 2445 2446 return (B_TRUE); 2447 2448 err: 2449 kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t)); 2450 return (B_FALSE); 2451 } 2452 2453 static int 2454 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2455 { 2456 mlxcx_t *mlxp; 2457 2458 if (cmd != DDI_DETACH) 2459 return (DDI_FAILURE); 2460 2461 mlxp = ddi_get_driver_private(dip); 2462 if (mlxp == NULL) { 2463 mlxcx_warn(NULL, "asked to detach, but missing instance " 2464 "private data"); 2465 return (DDI_FAILURE); 2466 } 2467 2468 if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) { 2469 if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) { 2470 return (DDI_FAILURE); 2471 } 2472 mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL; 2473 } 2474 2475 mlxcx_teardown(mlxp); 2476 return (DDI_SUCCESS); 2477 } 2478 2479 static size_t 2480 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp) 2481 { 2482 size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large + 2483 mlxp->mlx_props.mldp_rx_ngroups_small; 2484 size_t tirlim, flowlim, gflowlim; 2485 2486 tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP; 2487 if (tirlim < ngroups) { 2488 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2489 "on number of TIRs available", tirlim); 2490 ngroups = tirlim; 2491 } 2492 2493 flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2; 2494 if (flowlim < ngroups) { 2495 mlxcx_note(mlxp, "limiting number of rx groups to %u based " 2496 "on max size of RX flow tables", flowlim); 2497 ngroups = flowlim; 2498 } 2499 2500 do { 2501 gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2; 2502 if (gflowlim < ngroups) { 2503 mlxcx_note(mlxp, "limiting number of rx groups to %u " 2504 "based on max total RX flows", gflowlim); 2505 --ngroups; 2506 } 2507 } while (gflowlim < ngroups); 2508 2509 return (ngroups); 2510 } 2511 2512 static int 2513 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2514 { 2515 mlxcx_t *mlxp; 2516 uint_t i; 2517 int inst, ret; 2518 2519 if (cmd != DDI_ATTACH) 2520 return (DDI_FAILURE); 2521 2522 inst = ddi_get_instance(dip); 2523 ret = ddi_soft_state_zalloc(mlxcx_softstate, inst); 2524 if (ret != 0) 2525 return (ret); 2526 2527 mlxp = ddi_get_soft_state(mlxcx_softstate, inst); 2528 if (mlxp == NULL) 2529 return (DDI_FAILURE); 2530 mlxp->mlx_dip = dip; 2531 mlxp->mlx_inst = inst; 2532 ddi_set_driver_private(dip, mlxp); 2533 2534 mlxcx_load_props(mlxp); 2535 2536 mlxcx_fm_init(mlxp); 2537 mlxp->mlx_attach |= MLXCX_ATTACH_FM; 2538 2539 if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) != 2540 DDI_SUCCESS) { 2541 mlxcx_warn(mlxp, "failed to initialize PCI config space"); 2542 goto err; 2543 } 2544 mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG; 2545 2546 if (!mlxcx_regs_map(mlxp)) { 2547 goto err; 2548 } 2549 mlxp->mlx_attach |= MLXCX_ATTACH_REGS; 2550 2551 if (!mlxcx_cmd_queue_init(mlxp)) { 2552 goto err; 2553 } 2554 mlxp->mlx_attach |= MLXCX_ATTACH_CMD; 2555 2556 if (!mlxcx_cmd_enable_hca(mlxp)) { 2557 goto err; 2558 } 2559 mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA; 2560 2561 if (!mlxcx_check_issi(mlxp)) { 2562 goto err; 2563 } 2564 2565 /* 2566 * We have to get our interrupts now so we know what priority to 2567 * create pagemtx with. */
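/*
 * More specifically: any mutex that one of our interrupt handlers may take
 * has to be created with the priority of that interrupt (via DDI_INTR_PRI),
 * so the priority in mlx_intr_pri has to be known before pagemtx and the
 * other driver mutexes below can be initialized.
 */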
2568 */ 2569 if (!mlxcx_intr_setup(mlxp)) { 2570 goto err; 2571 } 2572 mlxp->mlx_attach |= MLXCX_ATTACH_INTRS; 2573 2574 mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER, 2575 DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2576 avl_create(&mlxp->mlx_pages, mlxcx_page_compare, 2577 sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree)); 2578 mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST; 2579 2580 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) { 2581 goto err; 2582 } 2583 2584 if (!mlxcx_init_caps(mlxp)) { 2585 goto err; 2586 } 2587 mlxp->mlx_attach |= MLXCX_ATTACH_CAPS; 2588 2589 if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) { 2590 goto err; 2591 } 2592 2593 if (!mlxcx_cmd_init_hca(mlxp)) { 2594 goto err; 2595 } 2596 mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA; 2597 2598 if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) { 2599 goto err; 2600 } 2601 2602 /* 2603 * The User Access Region (UAR) is needed so we can ring EQ and CQ 2604 * doorbells. 2605 */ 2606 if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) { 2607 goto err; 2608 } 2609 for (i = 0; i < MLXCX_BF_PER_UAR; ++i) { 2610 mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL, 2611 MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri)); 2612 } 2613 mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD; 2614 2615 /* 2616 * Set up event queue #0 -- it's special and only handles control 2617 * type events, like PAGE_REQUEST (which we will probably get during 2618 * the commands below). 2619 * 2620 * This will enable and arm the interrupt on EQ 0, too. 2621 */ 2622 if (!mlxcx_setup_eq0(mlxp)) { 2623 goto err; 2624 } 2625 2626 /* 2627 * Allocate a protection and transport domain. These don't really do 2628 * anything for us (they're IB concepts), but we need to give their 2629 * ID numbers in other commands. 2630 */ 2631 if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) { 2632 goto err; 2633 } 2634 if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) { 2635 goto err; 2636 } 2637 /* 2638 * Fetch the "reserved" lkey that lets us give linear addresses in 2639 * work queue entries, rather than having to mess with the NIC's 2640 * internal MMU. 2641 */ 2642 if (!mlxcx_cmd_query_special_ctxs(mlxp)) { 2643 goto err; 2644 } 2645 2646 /* 2647 * Query our port information and current state, populate the 2648 * mlxcx_port_t structs. 2649 * 2650 * This also sets up the root flow tables and flow groups. 2651 */ 2652 if (!mlxcx_setup_ports(mlxp)) { 2653 goto err; 2654 } 2655 mlxp->mlx_attach |= MLXCX_ATTACH_PORTS; 2656 2657 mlxcx_load_model_props(mlxp); 2658 2659 /* 2660 * Set up, enable and arm the rest of the interrupt EQs which will 2661 * service events from CQs. 2662 * 2663 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be 2664 * cleaned up. 2665 */ 2666 if (!mlxcx_setup_eqs(mlxp)) { 2667 goto err; 2668 } 2669 2670 /* Completion queues */ 2671 list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t), 2672 offsetof(mlxcx_completion_queue_t, mlcq_entry)); 2673 mlxp->mlx_attach |= MLXCX_ATTACH_CQS; 2674 2675 /* Work queues (send queues, receive queues) */ 2676 list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t), 2677 offsetof(mlxcx_work_queue_t, mlwq_entry)); 2678 mlxp->mlx_attach |= MLXCX_ATTACH_WQS; 2679 2680 /* Set up periodic fault check timers which check the queue states */ 2681 if (!mlxcx_setup_checktimers(mlxp)) { 2682 goto err; 2683 } 2684 mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS; 2685 2686 /* 2687 * Construct our arrays of mlxcx_ring_group_ts, which represent the 2688 * "groups" we advertise to MAC. 
2689 */ 2690 mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp); 2691 mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups * 2692 sizeof (mlxcx_ring_group_t); 2693 mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP); 2694 2695 mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups; 2696 mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups * 2697 sizeof (mlxcx_ring_group_t); 2698 mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP); 2699 2700 mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS; 2701 2702 /* 2703 * Sets up the free/busy buffers list for keeping track of packet 2704 * buffers. 2705 */ 2706 if (!mlxcx_setup_bufs(mlxp)) 2707 goto err; 2708 mlxp->mlx_attach |= MLXCX_ATTACH_BUFS; 2709 2710 /* 2711 * Before we tell MAC about our rings/groups, we need to do enough 2712 * setup on them to be sure about the numbers and configuration that 2713 * we have. This will do basically everything short of allocating 2714 * packet buffers and starting the rings up. 2715 */ 2716 for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) { 2717 if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i])) 2718 goto err; 2719 } 2720 for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) { 2721 if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i])) 2722 goto err; 2723 } 2724 2725 /* 2726 * Finally, tell MAC that we exist! 2727 */ 2728 if (!mlxcx_register_mac(mlxp)) { 2729 goto err; 2730 } 2731 mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL; 2732 2733 return (DDI_SUCCESS); 2734 2735 err: 2736 mlxcx_teardown(mlxp); 2737 return (DDI_FAILURE); 2738 } 2739 2740 static struct cb_ops mlxcx_cb_ops = { 2741 .cb_open = nulldev, 2742 .cb_close = nulldev, 2743 .cb_strategy = nodev, 2744 .cb_print = nodev, 2745 .cb_dump = nodev, 2746 .cb_read = nodev, 2747 .cb_write = nodev, 2748 .cb_ioctl = nodev, 2749 .cb_devmap = nodev, 2750 .cb_mmap = nodev, 2751 .cb_segmap = nodev, 2752 .cb_chpoll = nochpoll, 2753 .cb_prop_op = ddi_prop_op, 2754 .cb_flag = D_MP, 2755 .cb_rev = CB_REV, 2756 .cb_aread = nodev, 2757 .cb_awrite = nodev 2758 }; 2759 2760 static struct dev_ops mlxcx_dev_ops = { 2761 .devo_rev = DEVO_REV, 2762 .devo_refcnt = 0, 2763 .devo_getinfo = NULL, 2764 .devo_identify = nulldev, 2765 .devo_probe = nulldev, 2766 .devo_attach = mlxcx_attach, 2767 .devo_detach = mlxcx_detach, 2768 .devo_reset = nodev, 2769 .devo_power = ddi_power, 2770 .devo_quiesce = ddi_quiesce_not_supported, 2771 .devo_cb_ops = &mlxcx_cb_ops 2772 }; 2773 2774 static struct modldrv mlxcx_modldrv = { 2775 .drv_modops = &mod_driverops, 2776 .drv_linkinfo = "Mellanox Connect-X 4/5/6", 2777 .drv_dev_ops = &mlxcx_dev_ops 2778 }; 2779 2780 static struct modlinkage mlxcx_modlinkage = { 2781 .ml_rev = MODREV_1, 2782 .ml_linkage = { &mlxcx_modldrv, NULL } 2783 }; 2784 2785 int 2786 _init(void) 2787 { 2788 int ret; 2789 2790 ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0); 2791 if (ret != 0) { 2792 return (ret); 2793 } 2794 2795 mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME); 2796 2797 if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2798 mac_fini_ops(&mlxcx_dev_ops); 2799 ddi_soft_state_fini(&mlxcx_softstate); 2800 return (ret); 2801 } 2802 2803 return (DDI_SUCCESS); 2804 } 2805 2806 int 2807 _info(struct modinfo *modinfop) 2808 { 2809 return (mod_info(&mlxcx_modlinkage, modinfop)); 2810 } 2811 2812 int 2813 _fini(void) 2814 { 2815 int ret; 2816 2817 if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) { 2818 return (ret); 2819 } 2820 2821 mac_fini_ops(&mlxcx_dev_ops); 2822 2823 ddi_soft_state_fini(&mlxcx_softstate); 2824 2825 return 
(DDI_SUCCESS); 2826 } 2827
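/*
 * A note on the load/unload ordering above: mac_init_ops() prepares
 * mlxcx_dev_ops for use with the GLDv3 framework and so has to run before
 * mod_install(); it is undone with mac_fini_ops() only once mod_install()
 * has failed or mod_remove() has succeeded. A rough sketch of the
 * mirror-image ordering, with error handling elided:
 *
 *	_init:	ddi_soft_state_init(...); mac_init_ops(...); mod_install(...);
 *	_fini:	mod_remove(...); mac_fini_ops(...); ddi_soft_state_fini(...);
 */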