xref: /illumos-gate/usr/src/uts/common/io/mlxcx/mlxcx.c (revision e1086107)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2020, The University of Queensland
14  * Copyright (c) 2018, Joyent, Inc.
15  * Copyright 2020 RackTop Systems, Inc.
16  */
17 
18 /*
19  * Mellanox Connect-X 4/5/6 driver.
20  */
21 
22 /*
23  * The PRM for this family of parts is freely available, and can be found at:
24  * https://www.mellanox.com/related-docs/user_manuals/ \
25  *   Ethernet_Adapters_Programming_Manual.pdf
26  */
27 /*
28  * ConnectX glossary
29  * -----------------
30  *
31  * WR		Work Request: something we've asked the hardware to do by
32  *		creating a Work Queue Entry (WQE), e.g. send or recv a packet
33  *
34  * WQE		Work Queue Entry: a descriptor on a work queue descriptor ring
35  *
36  * WQ		Work Queue: a descriptor ring that we can place WQEs on, usually
37  *		either a Send Queue (SQ) or Receive Queue (RQ). Different WQ
38  *		types have different WQE structures, different commands for
39  *		creating and destroying them, etc, but share a common context
40  *		structure, counter setup and state graph.
41  * SQ		Send Queue, a specific type of WQ that sends packets
42  * RQ		Receive Queue, a specific type of WQ that receives packets
43  *
44  * CQ		Completion Queue: completion of WRs from a WQ are reported to
45  *		one of these, as a CQE on its entry ring.
46  * CQE		Completion Queue Entry: an entry in a CQ ring. Contains error
47  *		info, as well as packet size, the ID of the WQ, and the index
48  *		of the WQE which completed. Does not contain any packet data.
49  *
50  * EQ		Event Queue: a ring of event structs from the hardware informing
 51  *		us when particular events happen. Many events can point at
 52  *		a particular CQ which we should then go look at.
53  * EQE		Event Queue Entry: an entry on the EQ ring
54  *
55  * UAR		User Access Region, a page of the device's PCI BAR which is
56  *		tied to particular EQ/CQ/WQ sets and contains doorbells to
57  *		ring to arm them for interrupts or wake them up for new work
58  *
59  * RQT		RQ Table, a collection of indexed RQs used to refer to the group
60  *		as a single unit (for e.g. hashing/RSS).
61  *
 62  * TIR		Transport Interface Receive, a bucket of resources for the
63  *		reception of packets. TIRs have to point at either a single RQ
64  *		or a table of RQs (RQT). They then serve as a target for flow
65  *		table entries (FEs). TIRs that point at an RQT also contain the
66  *		settings for hashing for RSS.
67  *
68  * TIS		Transport Interface Send, a bucket of resources associated with
69  *		the transmission of packets. In particular, the temporary
70  *		resources used for LSO internally in the card are accounted to
71  *		a TIS.
72  *
73  * FT		Flow Table, a collection of FEs and FGs that can be referred to
74  *		as a single entity (e.g. used as a target from another flow
75  *		entry or set as the "root" table to handle incoming or outgoing
76  *		packets). Packets arriving at a FT are matched against the
77  *		FEs in the table until either one matches with a terminating
78  *		action or all FEs are exhausted (it's first-match-wins but with
79  *		some actions that are non-terminal, like counting actions).
80  *
81  * FG		Flow Group, a group of FEs which share a common "mask" (i.e.
82  *		they match on the same attributes of packets coming into the
83  *		flow).
84  *
85  * FE		Flow Entry, an individual set of values to match against
86  *		packets entering the flow table, combined with an action to
87  *		take upon a successful match. The action we use most is
88  *		"forward", which sends the packets to a TIR or another flow
89  *		table and then stops further processing within the FE's FT.
90  *
91  * lkey/mkey	A reference to something similar to a page table but in the
92  *		device's internal onboard MMU. Since Connect-X parts double as
93  *		IB cards (lots of RDMA) they have extensive onboard memory mgmt
94  *		features which we try very hard not to use. For our WQEs we use
95  *		the "reserved" lkey, which is a special value which indicates
96  *		that addresses we give are linear addresses and should not be
97  *		translated.
98  *
99  * PD		Protection Domain, an IB concept. We have to allocate one to
100  *		provide as a parameter for new WQs, but we don't do anything
101  *		with it.
102  *
103  * TDOM/TD	Transport Domain, an IB concept. We allocate one in order to
104  *		provide it as a parameter to TIR/TIS creation, but we don't do
105  *		anything with it.
106  */
107 /*
108  *
109  * Data flow overview
110  * ------------------
111  *
112  * This driver is a MAC ring-enabled driver which maps rings to send and recv
113  * queues in hardware on the device.
114  *
115  * Each SQ and RQ is set up to report to its own individual CQ, to ensure
116  * sufficient space, and simplify the logic needed to work out which buffer
117  * was completed.
118  *
119  * The CQs are then round-robin allocated onto EQs, of which we set up one per
120  * interrupt that the system gives us for the device. Normally this means we
121  * have 8 EQs.
122  *
123  * When we have >= 8 EQs available, we try to allocate only RX or only TX
124  * CQs on each one. The EQs are chosen for RX and TX in an alternating fashion.
125  *
126  * EQ #0 is reserved for all event types other than completion events, and has
127  * no CQs associated with it at any time. EQs #1 and upwards are only used for
128  * handling CQ completion events.
129  *
130  * +------+     +------+           +------+        +---------+
131  * | SQ 0 |---->| CQ 0 |-----+     | EQ 0 |------> | MSI-X 0 |     mlxcx_intr_0
132  * +------+     +------+     |     +------+        +---------+
133  *                           |
134  * +------+     +------+     |
135  * | SQ 1 |---->| CQ 1 |---+ |     +------+
136  * +------+     +------+   | +---> |      |
137  *                         |       |      |
138  * +------+     +------+   |       | EQ 1 |        +---------+
139  * | SQ 2 |---->| CQ 2 |---------> |      |------> | MSI-X 1 |     mlxcx_intr_n
140  * +------+     +------+   | +---> |      |        +---------+
141  *                         | |     +------+
142  *                         | |
143  *   ...                   | |
144  *                         | |     +------+
145  * +------+     +------+   +-----> |      |
146  * | RQ 0 |---->| CQ 3 |---------> |      |        +---------+
147  * +------+     +------+     |     | EQ 2 |------> | MSI-X 2 |     mlxcx_intr_n
148  *                           |     |      |        +---------+
149  * +------+     +------+     | +-> |      |
150  * | RQ 1 |---->| CQ 4 |-----+ |   +------+
151  * +------+     +------+       |
152  *                             |     ....
153  * +------+     +------+       |
154  * | RQ 2 |---->| CQ 5 |-------+
155  * +------+     +------+
156  *
157  *   ... (note this diagram does not show RX-only or TX-only EQs)
158  *
159  * For TX, we advertise all of the SQs we create as plain rings to MAC with
160  * no TX groups. This puts MAC in "virtual group" mode where it will allocate
161  * and use the rings as it sees fit.
162  *
163  * For RX, we advertise actual groups in order to make use of hardware
164  * classification.
165  *
166  * The hardware classification we use is based around Flow Tables, and we
167  * currently ignore all of the eswitch features of the card. The NIC VPORT
168  * is always set to promisc mode so that the eswitch sends us all of the
169  * traffic that arrives on the NIC, and we use flow entries to manage
170  * everything.
171  *
172  * We use 2 layers of flow tables for classification: traffic arrives at the
173  * root RX flow table which contains MAC address filters. Those then send
174  * matched traffic to the per-group L1 VLAN filter tables which contain VLAN
175  * presence and VID filters.
176  *
177  * Since these parts only support doing RSS hashing on a single protocol at a
178  * time, we have to use a third layer of flow tables as well to break traffic
179  * down by L4 and L3 protocol (TCPv6, TCPv4, UDPv6, UDPv4, IPv6, IPv4 etc)
180  * so that it can be sent to the appropriate TIR for hashing.
181  *
182  * Incoming packets
183  *        +           +---------+      +---------+
184  *        |        +->| group 0 |      | group 0 |
185  *        |        |  | vlan ft |  +-->| hash ft |
186  *        v        |  |   L1    |  |   |   L2    |
187  *   +----+----+   |  +---------+  |   +---------+    +-----+    +-----+------+
188  *   | eswitch |   |  |         |  |   |  TCPv6  |--->| TIR |--->|     |  RQ0 |
189  *   +----+----+   |  |         |  |   +---------+    +-----+    |     +------+
190  *        |        |  |         |  |   |  UDPv6  |--->| TIR |--->|     |  RQ1 |
191  *        |        |  |         |  |   +---------+    +-----+    |     +------+
192  *        |        |  |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ2 |
193  *        v        |  |         |  |   +---------+    +-----+    | RQT +------+
194  *   +----+----+   |  +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
195  *   | root rx |   |  | default |--+   +---------+    +-----+    |     |      |
196  *   | flow tb |   |  +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
197  *   |    L0   |   |  | promisc |--+   +---------+    +-----+    |     |      |
198  *   +---------+   |  +---------+  ^   |  IPv4   |--->| TIR |--->|     |      |
199  *   |  bcast  |---|---------------+   +---------+    +-----+    +-----+------+
200  *   +---------+   |               ^   |  other  |-+
201  *   |  MAC 0  |---+               |   +---------+ |  +-----+    +-----+
202  *   +---------+                   |               +->| TIR |--->| RQ0 |
203  *   |  MAC 1  |-+                 |                  +-----+    +-----+
204  *   +---------+ | +---------------+
205  *   |  MAC 2  |-+ |               ^
206  *   +---------+ | |               |
207  *   |  MAC 3  |-+ |  +---------+  |   +---------+
208  *   +---------+ | |  | group 1 |  |   | group 1 |
209  *   |  .....  | +--->| vlan ft |  | +>| hash ft |
210  *   |         |   |  |   L1    |  | | |   L2    |
211  *   +---------+   |  +---------+  | | +---------+    +-----+    +-----+------+
212  *   | promisc |---+  | VLAN 0  |----+ |  TCPv6  |--->| TIR |--->|     |  RQ3 |
213  *   +---------+      +---------+  |   +---------+    +-----+    |     +------+
214  *                    |  .....  |  |   |  UDPv6  |--->| TIR |--->|     |  RQ4 |
215  *                    |         |  |   +---------+    +-----+    |     +------+
216  *                    |         |  |   |  TCPv4  |--->| TIR |--->|     |  RQ5 |
217  *                    |         |  |   +---------+    +-----+    | RQT +------+
218  *                    +---------+  |   |  UDPv4  |--->| TIR |--->|     |  ... |
219  *                    |         |  |   +---------+    +-----+    |     |      |
220  *                    +---------+  |   |  IPv6   |--->| TIR |--->|     |      |
221  *                    | promisc |--+   +---------+    +-----+    |     |      |
222  *                    +---------+      |  IPv4   |--->| TIR |--->|     |      |
223  *                                     +---------+    +-----+    +-----+------+
224  *                                     |  other  |-+
225  *                                     +---------+ |
226  *                      .......                    |  +-----+    +-----+
227  *                                                 +->| TIR |--->| RQ3 |
228  *                                                    +-----+    +-----+
229  *
230  * Note that the "promisc" flow entries are only set/enabled when promisc
231  * mode is enabled for the NIC. All promisc flow entries point directly at
232  * group 0's hashing flowtable (so all promisc-only traffic lands on group 0,
233  * the "default group" in MAC).
234  *
235  * The "default" entry in the L1 VLAN filter flow tables is used when there
236  * are no VLANs set for the group, to accept any traffic regardless of tag. It
237  * is deleted as soon as a VLAN filter is added (and re-instated if the
238  * last VLAN filter is removed).
239  *
240  * The actual descriptor ring structures for RX on Connect-X4 don't contain any
241  * space for packet data (they're a collection of scatter pointers only). TX
242  * descriptors contain some space for "inline headers" (and the card requires
243  * us to put at least the L2 Ethernet headers there for the eswitch to look at)
244  * but all the rest of the data comes from the gather pointers.
245  *
246  * When we get completions back they simply contain the ring index number of
247  * the WR (work request) which completed. So, we manage the buffers for actual
248  * packet data completely independently of the descriptors in this driver. When
249  * a WR is enqueued in a WQE (work queue entry), we stamp the packet data buffer
250  * with the WQE index that we put it at, and therefore don't have to look at
251  * the original descriptor at all when handling completions.
252  *
253  * For RX, we create sufficient packet data buffers to fill 150% of the
254  * available descriptors for each ring. These all are pre-set-up for DMA and
255  * have an mblk_t associated with them (with desballoc()).
256  *
257  * For TX we either borrow the mblk's memory and DMA bind it (if the packet is
258  * large enough), or we copy it into a pre-allocated buffer set up in the same
 259  * way as for RX.
260  */
261 
262 /*
263  * Buffer lifecycle: RX
264  * --------------------
265  *
266  * The lifecycle of an mlxcx_buffer_t (packet buffer) used for RX is pretty
267  * straightforward.
268  *
269  * It is created (and has all its memory allocated) at the time of starting up
270  * the RX ring it belongs to. Then it is placed on the "free" list in the
271  * mlxcx_buffer_shard_t associated with its RQ. When mlxcx_rq_refill() wants
272  * more buffers to add to the RQ, it takes one off and marks it as "on WQ"
273  * before making a WQE for it.
274  *
275  * After a completion event occurs, the packet is either discarded (and the
276  * buffer_t returned to the free list), or it is readied for loaning to MAC.
277  *
278  * Once MAC and the rest of the system have finished with the packet, they call
279  * freemsg() on its mblk, which will call mlxcx_buf_mp_return and return the
280  * buffer_t to the free list.
281  *
 282  * At detach/teardown time, buffers are only ever destroyed from the free list.
283  *
284  *
285  *                         +
286  *                         |
287  *                         | mlxcx_buf_create
288  *                         |
289  *                         v
290  *                    +----+----+
291  *                    | created |
292  *                    +----+----+
293  *                         |
294  *                         |
295  *                         | mlxcx_buf_return
296  *                         |
297  *                         v
298  * mlxcx_buf_destroy  +----+----+
299  *          +---------|  free   |<---------------+
300  *          |         +----+----+                |
301  *          |              |                     |
302  *          |              |                     | mlxcx_buf_return
303  *          v              | mlxcx_buf_take      |
304  *      +---+--+           v                     |
305  *      | dead |       +---+---+                 |
306  *      +------+       | on WQ |- - - - - - - - >O
307  *                     +---+---+                 ^
308  *                         |                     |
309  *                         |                     |
310  *                         | mlxcx_buf_loan      | mlxcx_buf_mp_return
311  *                         v                     |
312  *                 +-------+--------+            |
313  *                 | on loan to MAC |----------->O
314  *                 +----------------+  freemsg()
315  *
316  */
317 
318 /*
319  * Buffer lifecycle: TX
320  * --------------------
321  *
322  * mlxcx_buffer_ts used for TX are divided into two kinds: regular buffers, and
323  * "foreign" buffers.
324  *
325  * The former have their memory allocated and DMA bound by this driver, while
326  * the latter (the "foreign" buffers) are on loan from MAC. Their memory is
327  * not owned by us, though we do DMA bind it (and take responsibility for
328  * un-binding it when we're done with them).
329  *
330  * We use separate mlxcx_buf_shard_ts for foreign and local buffers on each
331  * SQ. Thus, there is a separate free list and mutex for each kind.
332  *
333  * Since a TX packet might consist of multiple mblks, we translate each mblk
334  * into exactly one buffer_t. The buffer_ts are chained together in the same
335  * order as the mblks, using the mlb_tx_chain/mlb_tx_chain_entry list_t.
336  *
337  * Each chain of TX buffers may consist of foreign or driver buffers, in any
338  * mixture.
339  *
340  * The head of a TX buffer chain has mlb_tx_head == itself, which distinguishes
341  * it from the rest of the chain buffers.
342  *
343  * TX buffer chains are always returned to the free list by
344  * mlxcx_buf_return_chain(), which takes care of walking the mlb_tx_chain and
345  * freeing all of the members.
346  *
347  * We only call freemsg() once, on the head of the TX buffer chain's original
348  * mblk. This is true whether we copied it or bound it in a foreign buffer.
349  */
350 
351 /*
352  * Startup and command interface
353  * -----------------------------
354  *
355  * The command interface is the primary way in which we give control orders to
356  * the hardware (e.g. actions like "create this queue" or "delete this flow
357  * entry"). The command interface is never used to transmit or receive packets
358  * -- that takes place only on the queues that are set up through it.
359  *
360  * In mlxcx_cmd.c we implement our use of the command interface on top of a
361  * simple taskq. Since it's not performance critical, we busy-wait on command
362  * completions and only process a single command at a time.
363  *
364  * If this becomes a problem later we can wire command completions up to EQ 0
365  * once we have interrupts running.
366  *
367  * The startup/attach process for this card involves a bunch of different steps
368  * which are summarised pretty well in the PRM. We have to send a number of
369  * commands which do different things to start the card up, give it some pages
370  * of our own memory for it to use, then start creating all the entities that
371  * we need to use like EQs, CQs, WQs, as well as their dependencies like PDs
372  * and TDoms.
373  */
374 
375 /*
376  * UARs
377  * ----
378  *
379  * The pages of the PCI BAR other than the first few are reserved for use as
380  * "UAR" sections in this device. Each UAR section can be used as a set of
381  * doorbells for our queues.
382  *
383  * Currently we just make one single UAR for all of our queues. It doesn't
384  * seem to be a major limitation yet.
385  *
 386  * When we're sending packets through an SQ, the PRM is not awfully clear about
387  * exactly how we're meant to use the first 16 bytes of the Blueflame buffers
388  * (it's clear on the pattern of alternation you're expected to use between
389  * even and odd for Blueflame sends, but not for regular doorbells).
390  *
391  * Currently we don't do the even-odd alternating pattern for ordinary
392  * doorbells, and we don't use Blueflame at all. This seems to work fine, at
393  * least on Connect-X4 Lx.
394  */
395 
396 /*
397  * Lock ordering
398  * -------------
399  *
400  * Interrupt side:
401  *
402  *  - mleq_mtx
403  *    - mlcq_mtx
404  *      - mlcq_bufbmtx
405  *      - mlwq_mtx
406  *        - mlbs_mtx
407  *    - mlp_mtx
408  *
409  * GLD side:
410  *
411  *  - mlp_mtx
412  *    - mlg_mtx
413  *      - mlg_*.mlft_mtx
414  *    - mlp_*.mlft_mtx
415  *    - mlwq_mtx
416  *      - mlbs_mtx
417  *      - mlcq_bufbmtx
418  *  - mleq_mtx
419  *    - mlcq_mtx
420  *
421  */
422 
423 #include <sys/modctl.h>
424 #include <sys/conf.h>
425 #include <sys/devops.h>
426 #include <sys/sysmacros.h>
427 #include <sys/time.h>
428 
429 #include <sys/mac_provider.h>
430 
431 #include <mlxcx.h>
432 
433 CTASSERT((1 << MLXCX_RX_HASH_FT_SIZE_SHIFT) >= MLXCX_TIRS_PER_GROUP);
434 
435 #define	MLXCX_MODULE_NAME	"mlxcx"
436 /*
437  * We give this to the firmware, so it has to be in a fixed format that it
438  * understands.
439  */
440 #define	MLXCX_DRIVER_VERSION	"illumos,mlxcx,1.0.0,1,000,000000"
441 
442 /*
443  * Firmware may take a while to reclaim pages. Try a set number of times.
444  */
445 clock_t mlxcx_reclaim_delay = 1000 * 50; /* 50 ms in us */
446 uint_t mlxcx_reclaim_tries = 100; /* Wait at most 5000ms */
447 
448 static void *mlxcx_softstate;
449 
450 /*
451  * Fault detection thresholds.
452  */
453 uint_t mlxcx_doorbell_tries = MLXCX_DOORBELL_TRIES_DFLT;
454 uint_t mlxcx_stuck_intr_count = MLXCX_STUCK_INTR_COUNT_DFLT;
455 
456 static void
457 mlxcx_load_prop_defaults(mlxcx_t *mlxp)
458 {
459 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
460 	mlxcx_port_t *port = &mlxp->mlx_ports[0];
461 
462 	VERIFY((mlxp->mlx_attach & MLXCX_ATTACH_PORTS) != 0);
463 	VERIFY((mlxp->mlx_attach & (MLXCX_ATTACH_CQS | MLXCX_ATTACH_WQS)) == 0);
464 
465 	/*
466 	 * Currently we have different queue size defaults for two
467 	 * categories of queues. One set for devices which support a
468 	 * maximum speed of 10Gb/s, and another for those above that.
469 	 */
470 	if ((port->mlp_max_proto & (MLXCX_PROTO_25G | MLXCX_PROTO_40G |
471 	    MLXCX_PROTO_50G | MLXCX_PROTO_100G)) != 0) {
472 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_25G;
473 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_25G;
474 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_25G;
475 	} else if ((port->mlp_max_proto & (MLXCX_PROTO_100M | MLXCX_PROTO_1G |
476 	    MLXCX_PROTO_10G)) != 0) {
477 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
478 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
479 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
480 	} else {
481 		mlxcx_warn(mlxp, "Encountered a port with a speed we don't "
482 		    "recognize. Proto: 0x%x", port->mlp_max_proto);
483 		p->mldp_cq_size_shift_default = MLXCX_CQ_SIZE_SHIFT_DFLT;
484 		p->mldp_rq_size_shift_default = MLXCX_RQ_SIZE_SHIFT_DFLT;
485 		p->mldp_sq_size_shift_default = MLXCX_SQ_SIZE_SHIFT_DFLT;
486 	}
487 }
488 
489 /*
490  * Properties which may have different defaults based on hardware
491  * characteristics.
492  */
493 static void
494 mlxcx_load_model_props(mlxcx_t *mlxp)
495 {
496 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
497 
498 	mlxcx_load_prop_defaults(mlxp);
499 
500 	p->mldp_cq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
501 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cq_size_shift",
502 	    p->mldp_cq_size_shift_default);
503 	p->mldp_sq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
504 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "sq_size_shift",
505 	    p->mldp_sq_size_shift_default);
506 	p->mldp_rq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
507 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rq_size_shift",
508 	    p->mldp_rq_size_shift_default);
509 }
510 
511 static void
512 mlxcx_load_props(mlxcx_t *mlxp)
513 {
514 	mlxcx_drv_props_t *p = &mlxp->mlx_props;
515 
516 	p->mldp_eq_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
517 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "eq_size_shift",
518 	    MLXCX_EQ_SIZE_SHIFT_DFLT);
519 	p->mldp_cqemod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
520 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_period_usec",
521 	    MLXCX_CQEMOD_PERIOD_USEC_DFLT);
522 	p->mldp_cqemod_count = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
523 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "cqemod_count",
524 	    MLXCX_CQEMOD_COUNT_DFLT);
525 	p->mldp_intrmod_period_usec = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
526 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "intrmod_period_usec",
527 	    MLXCX_INTRMOD_PERIOD_USEC_DFLT);
528 
529 	p->mldp_tx_ngroups = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
530 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_ngroups",
531 	    MLXCX_TX_NGROUPS_DFLT);
532 	p->mldp_tx_nrings_per_group = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
533 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_nrings_per_group",
534 	    MLXCX_TX_NRINGS_PER_GROUP_DFLT);
535 
536 	p->mldp_rx_ngroups_large = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
537 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_large",
538 	    MLXCX_RX_NGROUPS_LARGE_DFLT);
539 	p->mldp_rx_ngroups_small = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
540 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_ngroups_small",
541 	    MLXCX_RX_NGROUPS_SMALL_DFLT);
542 	p->mldp_rx_nrings_per_large_group = ddi_getprop(DDI_DEV_T_ANY,
543 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
544 	    "rx_nrings_per_large_group", MLXCX_RX_NRINGS_PER_LARGE_GROUP_DFLT);
545 	p->mldp_rx_nrings_per_small_group = ddi_getprop(DDI_DEV_T_ANY,
546 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
547 	    "rx_nrings_per_small_group", MLXCX_RX_NRINGS_PER_SMALL_GROUP_DFLT);
548 
549 	p->mldp_ftbl_root_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
550 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_root_size_shift",
551 	    MLXCX_FTBL_ROOT_SIZE_SHIFT_DFLT);
552 
553 	p->mldp_tx_bind_threshold = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
554 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "tx_bind_threshold",
555 	    MLXCX_TX_BIND_THRESHOLD_DFLT);
556 
557 	p->mldp_ftbl_vlan_size_shift = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
558 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "ftbl_vlan_size_shift",
559 	    MLXCX_FTBL_VLAN_SIZE_SHIFT_DFLT);
560 
561 	p->mldp_eq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
562 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
563 	    "eq_check_interval_sec", MLXCX_EQ_CHECK_INTERVAL_SEC_DFLT);
564 	p->mldp_cq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
565 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
566 	    "cq_check_interval_sec", MLXCX_CQ_CHECK_INTERVAL_SEC_DFLT);
567 	p->mldp_wq_check_interval_sec = ddi_getprop(DDI_DEV_T_ANY,
568 	    mlxp->mlx_dip, DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS,
569 	    "wq_check_interval_sec", MLXCX_WQ_CHECK_INTERVAL_SEC_DFLT);
570 
571 	p->mldp_rx_per_cq = ddi_getprop(DDI_DEV_T_ANY, mlxp->mlx_dip,
572 	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "rx_limit_per_completion",
573 	    MLXCX_RX_PER_CQ_DEFAULT);
574 
575 	if (p->mldp_rx_per_cq < MLXCX_RX_PER_CQ_MIN ||
576 	    p->mldp_rx_per_cq > MLXCX_RX_PER_CQ_MAX) {
577 		mlxcx_warn(mlxp, "!rx_limit_per_completion = %u is "
578 		    "out of range. Defaulting to: %d. Valid values are from "
579 		    "%d to %d", p->mldp_rx_per_cq, MLXCX_RX_PER_CQ_DEFAULT,
580 		    MLXCX_RX_PER_CQ_MIN, MLXCX_RX_PER_CQ_MAX);
581 		p->mldp_rx_per_cq = MLXCX_RX_PER_CQ_DEFAULT;
582 	}
583 }
584 
585 void
586 mlxcx_note(mlxcx_t *mlxp, const char *fmt, ...)
587 {
588 	va_list ap;
589 
590 	va_start(ap, fmt);
591 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
592 		vdev_err(mlxp->mlx_dip, CE_NOTE, fmt, ap);
593 	} else {
594 		vcmn_err(CE_NOTE, fmt, ap);
595 	}
596 	va_end(ap);
597 }
598 
599 void
600 mlxcx_warn(mlxcx_t *mlxp, const char *fmt, ...)
601 {
602 	va_list ap;
603 
604 	va_start(ap, fmt);
605 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
606 		vdev_err(mlxp->mlx_dip, CE_WARN, fmt, ap);
607 	} else {
608 		vcmn_err(CE_WARN, fmt, ap);
609 	}
610 	va_end(ap);
611 }
612 
613 void
614 mlxcx_panic(mlxcx_t *mlxp, const char *fmt, ...)
615 {
616 	va_list ap;
617 
618 	va_start(ap, fmt);
619 	if (mlxp != NULL && mlxp->mlx_dip != NULL) {
620 		vdev_err(mlxp->mlx_dip, CE_PANIC, fmt, ap);
621 	} else {
622 		vcmn_err(CE_PANIC, fmt, ap);
623 	}
624 	va_end(ap);
625 }
626 
627 uint16_t
628 mlxcx_get16(mlxcx_t *mlxp, uintptr_t off)
629 {
630 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
631 	return (ddi_get16(mlxp->mlx_regs_handle, (void *)addr));
632 }
633 
634 uint32_t
635 mlxcx_get32(mlxcx_t *mlxp, uintptr_t off)
636 {
637 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
638 	return (ddi_get32(mlxp->mlx_regs_handle, (void *)addr));
639 }
640 
641 uint64_t
642 mlxcx_get64(mlxcx_t *mlxp, uintptr_t off)
643 {
644 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
645 	return (ddi_get64(mlxp->mlx_regs_handle, (void *)addr));
646 }
647 
648 void
649 mlxcx_put32(mlxcx_t *mlxp, uintptr_t off, uint32_t val)
650 {
651 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
652 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
653 }
654 
655 void
656 mlxcx_put64(mlxcx_t *mlxp, uintptr_t off, uint64_t val)
657 {
658 	uintptr_t addr = off + (uintptr_t)mlxp->mlx_regs_base;
659 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
660 }
661 
662 void
663 mlxcx_uar_put32(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint32_t val)
664 {
665 	/*
666 	 * The UAR is always inside the first BAR, which we mapped as
667 	 * mlx_regs
668 	 */
669 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
670 	    (uintptr_t)mlxp->mlx_regs_base;
671 	ddi_put32(mlxp->mlx_regs_handle, (void *)addr, val);
672 }
673 
674 void
675 mlxcx_uar_put64(mlxcx_t *mlxp, mlxcx_uar_t *mlu, uintptr_t off, uint64_t val)
676 {
677 	uintptr_t addr = off + (uintptr_t)mlu->mlu_base +
678 	    (uintptr_t)mlxp->mlx_regs_base;
679 	ddi_put64(mlxp->mlx_regs_handle, (void *)addr, val);
680 }
681 
682 static void
683 mlxcx_fm_fini(mlxcx_t *mlxp)
684 {
685 	if (mlxp->mlx_fm_caps == 0)
686 		return;
687 
688 	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
689 		ddi_fm_handler_unregister(mlxp->mlx_dip);
690 
691 	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
692 	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps))
693 		pci_ereport_teardown(mlxp->mlx_dip);
694 
695 	ddi_fm_fini(mlxp->mlx_dip);
696 
697 	mlxp->mlx_fm_caps = 0;
698 }
699 
700 void
701 mlxcx_fm_ereport(mlxcx_t *mlxp, const char *detail)
702 {
703 	uint64_t ena;
704 	char buf[FM_MAX_CLASS];
705 
706 	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
707 		return;
708 
709 	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail);
710 	ena = fm_ena_generate(0, FM_ENA_FMT1);
711 	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
712 	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
713 	    NULL);
714 }
715 
716 static int
717 mlxcx_fm_errcb(dev_info_t *dip, ddi_fm_error_t *err, const void *arg)
718 {
719 	/*
720 	 * as the driver can always deal with an error in any dma or
721 	 * access handle, we can just return the fme_status value.
722 	 */
723 	pci_ereport_post(dip, err, NULL);
724 	return (err->fme_status);
725 }
726 
/*
 * Set up FMA support for this instance.  The capability set comes from the
 * "fm_capable" property, clamped to the capabilities this driver actually
 * implements.  If nothing is left enabled we skip FMA registration entirely
 * (mlxcx_fm_fini() relies on mlx_fm_caps == 0 to detect this).
 */
static void
mlxcx_fm_init(mlxcx_t *mlxp)
{
	ddi_iblock_cookie_t iblk;
	int def = DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE;

	mlxp->mlx_fm_caps = ddi_prop_get_int(DDI_DEV_T_ANY, mlxp->mlx_dip,
	    DDI_PROP_DONTPASS, "fm_capable", def);

	/* A negative property value makes no sense; treat it as "none". */
	if (mlxp->mlx_fm_caps < 0) {
		mlxp->mlx_fm_caps = 0;
	}
	/* Never advertise more than this driver supports. */
	mlxp->mlx_fm_caps &= def;

	if (mlxp->mlx_fm_caps == 0)
		return;

	ddi_fm_init(mlxp->mlx_dip, &mlxp->mlx_fm_caps, &iblk);
	if (DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps) ||
	    DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		pci_ereport_setup(mlxp->mlx_dip);
	}
	if (DDI_FM_ERRCB_CAP(mlxp->mlx_fm_caps)) {
		ddi_fm_handler_register(mlxp->mlx_dip, mlxcx_fm_errcb,
		    (void *)mlxp);
	}
}
755 
/*
 * Destroy a single buffer shard: wait for all busy (in-flight) buffers to
 * come back, destroy every buffer on the free list, then dismantle the
 * lists, CV and mutex.
 */
static void
mlxcx_mlbs_teardown(mlxcx_t *mlxp, mlxcx_buf_shard_t *s)
{
	mlxcx_buffer_t *buf;

	mutex_enter(&s->mlbs_mtx);
	/*
	 * NOTE(review): this assumes mlbs_free_nonempty is signalled whenever
	 * a buffer moves off mlbs_busy back to mlbs_free -- confirm against
	 * the buffer-return path.
	 */
	while (!list_is_empty(&s->mlbs_busy))
		cv_wait(&s->mlbs_free_nonempty, &s->mlbs_mtx);
	/* mlxcx_buf_destroy() is expected to unlink buf from mlbs_free. */
	while ((buf = list_head(&s->mlbs_free)) != NULL) {
		mlxcx_buf_destroy(mlxp, buf);
	}
	list_destroy(&s->mlbs_free);
	list_destroy(&s->mlbs_busy);
	mutex_exit(&s->mlbs_mtx);

	cv_destroy(&s->mlbs_free_nonempty);
	mutex_destroy(&s->mlbs_mtx);
}
774 
775 static void
776 mlxcx_teardown_bufs(mlxcx_t *mlxp)
777 {
778 	mlxcx_buf_shard_t *s;
779 
780 	while ((s = list_remove_head(&mlxp->mlx_buf_shards)) != NULL) {
781 		mlxcx_mlbs_teardown(mlxp, s);
782 		kmem_free(s, sizeof (mlxcx_buf_shard_t));
783 	}
784 	list_destroy(&mlxp->mlx_buf_shards);
785 
786 	kmem_cache_destroy(mlxp->mlx_bufs_cache);
787 }
788 
/*
 * Reclaim and free all pages previously handed to the hardware via
 * mlxcx_give_pages().  Pages are returned in batches of up to
 * MLXCX_MANAGE_PAGES_MAX_PAGES; each PA the hardware hands back is looked
 * up in the mlx_pages AVL tree so its DMA memory can be freed.  If the
 * hardware refuses to return pages, we eventually give up and leak the
 * remainder rather than hanging teardown forever.
 */
static void
mlxcx_teardown_pages(mlxcx_t *mlxp)
{
	uint_t nzeros = 0;

	mutex_enter(&mlxp->mlx_pagemtx);

	while (mlxp->mlx_npages > 0) {
		int32_t req, ret;
		uint64_t pas[MLXCX_MANAGE_PAGES_MAX_PAGES];

		ASSERT0(avl_is_empty(&mlxp->mlx_pages));
		req = MIN(mlxp->mlx_npages, MLXCX_MANAGE_PAGES_MAX_PAGES);

		if (!mlxcx_cmd_return_pages(mlxp, req, pas, &ret)) {
			mlxcx_warn(mlxp, "hardware refused to return pages, "
			    "leaking %u remaining pages", mlxp->mlx_npages);
			goto out;
		}

		for (int32_t i = 0; i < ret; i++) {
			mlxcx_dev_page_t *mdp, probe;
			bzero(&probe, sizeof (probe));
			probe.mxdp_pa = pas[i];

			/* Find our record of this page by physical address. */
			mdp = avl_find(&mlxp->mlx_pages, &probe, NULL);

			if (mdp != NULL) {
				avl_remove(&mlxp->mlx_pages, mdp);
				mlxp->mlx_npages--;
				mlxcx_dma_free(&mdp->mxdp_dma);
				kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			} else {
				mlxcx_panic(mlxp, "hardware returned a page "
				    "with PA 0x%" PRIx64 " but we have no "
				    "record of giving out such a page", pas[i]);
			}
		}

		/*
		 * If no pages were returned, note that fact.  After more than
		 * mlxcx_reclaim_tries empty replies (with a short delay
		 * between attempts) we give up and leak what's left.
		 */
		if (ret == 0) {
			nzeros++;
			if (nzeros > mlxcx_reclaim_tries) {
				mlxcx_warn(mlxp, "hardware refused to return "
				    "pages, leaking %u remaining pages",
				    mlxp->mlx_npages);
				goto out;
			}
			delay(drv_usectohz(mlxcx_reclaim_delay));
		}
	}

	/* Only destroyed when empty; on the leak paths entries may remain. */
	avl_destroy(&mlxp->mlx_pages);

out:
	mutex_exit(&mlxp->mlx_pagemtx);
	mutex_destroy(&mlxp->mlx_pagemtx);
}
849 
/*
 * Allocate the DMA ring backing an event queue.  The entry count is fixed
 * by the mldp_eq_size_shift property, and every entry's owner byte is
 * pre-set to MLXCX_EQ_OWNER_INIT before the queue is handed to hardware.
 * On success the queue is flagged MLXCX_EQ_ALLOC; mlxcx_eq_rele_dma()
 * undoes this.
 */
static boolean_t
mlxcx_eq_alloc_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	boolean_t ret;
	size_t sz, i;

	VERIFY0(mleq->mleq_state & MLXCX_EQ_ALLOC);

	mleq->mleq_entshift = mlxp->mlx_props.mldp_eq_size_shift;
	mleq->mleq_nents = (1 << mleq->mleq_entshift);
	sz = mleq->mleq_nents * sizeof (mlxcx_eventq_ent_t);
	/* The ring must occupy a whole number of hardware pages. */
	ASSERT3U(sz & (MLXCX_HW_PAGE_SIZE - 1), ==, 0);

	mlxcx_dma_acc_attr(mlxp, &acc);
	mlxcx_dma_queue_attr(mlxp, &attr);

	ret = mlxcx_dma_alloc(mlxp, &mleq->mleq_dma, &attr, &acc,
	    B_TRUE, sz, B_TRUE);
	if (!ret) {
		mlxcx_warn(mlxp, "failed to allocate EQ memory");
		return (B_FALSE);
	}

	mleq->mleq_ent = (mlxcx_eventq_ent_t *)mleq->mleq_dma.mxdb_va;

	for (i = 0; i < mleq->mleq_nents; ++i)
		mleq->mleq_ent[i].mleqe_owner = MLXCX_EQ_OWNER_INIT;

	mleq->mleq_state |= MLXCX_EQ_ALLOC;

	return (B_TRUE);
}
884 
/*
 * Free the DMA ring behind an event queue.  The queue must be in the
 * ALLOC state, and if it was ever created in hardware it must have been
 * destroyed first -- we must not free memory the device could still write.
 */
static void
mlxcx_eq_rele_dma(mlxcx_t *mlxp, mlxcx_event_queue_t *mleq)
{
	VERIFY(mleq->mleq_state & MLXCX_EQ_ALLOC);
	if (mleq->mleq_state & MLXCX_EQ_CREATED)
		VERIFY(mleq->mleq_state & MLXCX_EQ_DESTROYED);

	mlxcx_dma_free(&mleq->mleq_dma);
	mleq->mleq_ent = NULL;

	mleq->mleq_state &= ~MLXCX_EQ_ALLOC;
}
897 
/*
 * Tear down a flow table: delete all created entries (in reverse index
 * order), destroy its groups and then the table itself in hardware, and
 * free all associated memory.  The caller must hold mlft_mtx on entry;
 * the mutex is destroyed and ft freed here, so the caller must not touch
 * either afterwards.
 */
void
mlxcx_teardown_flow_table(mlxcx_t *mlxp, mlxcx_flow_table_t *ft)
{
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;
	int i;

	ASSERT(mutex_owned(&ft->mlft_mtx));

	for (i = ft->mlft_nents - 1; i >= 0; --i) {
		fe = &ft->mlft_ent[i];
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
				mlxcx_panic(mlxp, "failed to delete flow "
				    "entry %u on table %u", i,
				    ft->mlft_num);
			}
		}
	}

	while ((fg = list_remove_head(&ft->mlft_groups)) != NULL) {
		if (fg->mlfg_state & MLXCX_FLOW_GROUP_CREATED &&
		    !(fg->mlfg_state & MLXCX_FLOW_GROUP_DESTROYED)) {
			if (!mlxcx_cmd_destroy_flow_group(mlxp, fg)) {
				mlxcx_panic(mlxp, "failed to destroy flow "
				    "group %u", fg->mlfg_num);
			}
		}
		kmem_free(fg, sizeof (mlxcx_flow_group_t));
	}
	list_destroy(&ft->mlft_groups);
	if (ft->mlft_state & MLXCX_FLOW_TABLE_CREATED &&
	    !(ft->mlft_state & MLXCX_FLOW_TABLE_DESTROYED)) {
		if (!mlxcx_cmd_destroy_flow_table(mlxp, ft)) {
			mlxcx_panic(mlxp, "failed to destroy flow table %u",
			    ft->mlft_num);
		}
	}
	kmem_free(ft->mlft_ent, ft->mlft_entsize);
	ft->mlft_ent = NULL;
	mutex_exit(&ft->mlft_mtx);
	mutex_destroy(&ft->mlft_mtx);
	kmem_free(ft, sizeof (mlxcx_flow_table_t));
}
942 
/*
 * Undo mlxcx_setup_ports(): destroy each port's root RX flow table and its
 * mutex, then free the port array.  Only ports that completed init
 * (MLXCX_PORT_INIT) are touched, which makes this safe to call from the
 * mlxcx_setup_ports() error path as well as from full teardown.
 */
static void
mlxcx_teardown_ports(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		if (!(p->mlp_init & MLXCX_PORT_INIT))
			continue;
		mutex_enter(&p->mlp_mtx);
		if ((ft = p->mlp_rx_flow) != NULL) {
			mutex_enter(&ft->mlft_mtx);
			/*
			 * teardown_flow_table() will destroy the mutex, so
			 * we don't release it here.
			 */
			mlxcx_teardown_flow_table(mlxp, ft);
		}
		mutex_exit(&p->mlp_mtx);
		mutex_destroy(&p->mlp_mtx);
		p->mlp_init &= ~MLXCX_PORT_INIT;
	}

	kmem_free(mlxp->mlx_ports, mlxp->mlx_ports_size);
	mlxp->mlx_ports = NULL;
}
971 
972 static void
973 mlxcx_teardown_wqs(mlxcx_t *mlxp)
974 {
975 	mlxcx_work_queue_t *mlwq;
976 
977 	while ((mlwq = list_head(&mlxp->mlx_wqs)) != NULL) {
978 		mlxcx_wq_teardown(mlxp, mlwq);
979 	}
980 	list_destroy(&mlxp->mlx_wqs);
981 }
982 
983 static void
984 mlxcx_teardown_cqs(mlxcx_t *mlxp)
985 {
986 	mlxcx_completion_queue_t *mlcq;
987 
988 	while ((mlcq = list_head(&mlxp->mlx_cqs)) != NULL) {
989 		mlxcx_cq_teardown(mlxp, mlcq);
990 	}
991 	list_destroy(&mlxp->mlx_cqs);
992 }
993 
/*
 * Destroy all event queues in hardware (where created and not already
 * destroyed) and release their DMA rings.  EQs live in a fixed array with
 * one entry per interrupt vector, so unlike the CQ/WQ teardowns there is
 * no list to drain here.  Failure to destroy an EQ is only warned about;
 * its DMA is still released.
 */
static void
mlxcx_teardown_eqs(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq;
	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if ((mleq->mleq_state & MLXCX_EQ_CREATED) &&
		    !(mleq->mleq_state & MLXCX_EQ_DESTROYED)) {
			if (!mlxcx_cmd_destroy_eq(mlxp, mleq)) {
				mlxcx_warn(mlxp, "failed to destroy "
				    "event queue idx %u eqn %u",
				    i, mleq->mleq_num);
			}
		}
		if (mleq->mleq_state & MLXCX_EQ_ALLOC) {
			mlxcx_eq_rele_dma(mlxp, mleq);
		}
		mutex_exit(&mleq->mleq_mtx);
	}
}
1017 
1018 static void
1019 mlxcx_teardown_checktimers(mlxcx_t *mlxp)
1020 {
1021 	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0)
1022 		ddi_periodic_delete(mlxp->mlx_eq_checktimer);
1023 	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0)
1024 		ddi_periodic_delete(mlxp->mlx_cq_checktimer);
1025 	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0)
1026 		ddi_periodic_delete(mlxp->mlx_wq_checktimer);
1027 }
1028 
/*
 * Undo mlxcx_attach().  Each MLXCX_ATTACH_* bit in mlx_attach records a
 * setup phase that completed; we undo only phases that were reached,
 * clearing each bit as we go, which makes this usable both from detach
 * and from a partially-failed attach.  Hardware-facing failures are warned
 * about but do not stop the teardown.
 */
static void
mlxcx_teardown(mlxcx_t *mlxp)
{
	uint_t i;
	dev_info_t *dip = mlxp->mlx_dip;

	if (mlxp->mlx_attach & MLXCX_ATTACH_GROUPS) {
		mlxcx_teardown_groups(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_GROUPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CHKTIMERS) {
		mlxcx_teardown_checktimers(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CHKTIMERS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_WQS) {
		mlxcx_teardown_wqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_WQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CQS) {
		mlxcx_teardown_cqs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CQS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_BUFS) {
		mlxcx_teardown_bufs(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_BUFS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PORTS) {
		mlxcx_teardown_ports(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PORTS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INTRS) {
		mlxcx_teardown_eqs(mlxp);
		mlxcx_intr_teardown(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INTRS;
	}

	/* Release the UAR, protection domain and transport domain. */
	if (mlxp->mlx_attach & MLXCX_ATTACH_UAR_PD_TD) {
		if (mlxp->mlx_uar.mlu_allocated) {
			if (!mlxcx_cmd_dealloc_uar(mlxp, &mlxp->mlx_uar)) {
				mlxcx_warn(mlxp, "failed to release UAR");
			}
			for (i = 0; i < MLXCX_BF_PER_UAR; ++i)
				mutex_destroy(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx);
		}
		if (mlxp->mlx_pd.mlpd_allocated &&
		    !mlxcx_cmd_dealloc_pd(mlxp, &mlxp->mlx_pd)) {
			mlxcx_warn(mlxp, "failed to release PD");
		}
		if (mlxp->mlx_tdom.mltd_allocated &&
		    !mlxcx_cmd_dealloc_tdom(mlxp, &mlxp->mlx_tdom)) {
			mlxcx_warn(mlxp, "failed to release TDOM");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_UAR_PD_TD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_INIT_HCA) {
		if (!mlxcx_cmd_teardown_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send teardown HCA "
			    "command during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_INIT_HCA;
	}

	/* Must come after TEARDOWN_HCA: the device holds pages until then. */
	if (mlxp->mlx_attach & MLXCX_ATTACH_PAGE_LIST) {
		mlxcx_teardown_pages(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PAGE_LIST;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_ENABLE_HCA) {
		if (!mlxcx_cmd_disable_hca(mlxp)) {
			mlxcx_warn(mlxp, "failed to send DISABLE HCA command "
			    "during device detach");
		}
		mlxp->mlx_attach &= ~MLXCX_ATTACH_ENABLE_HCA;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CMD) {
		mlxcx_cmd_queue_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CMD;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_CAPS) {
		kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
		mlxp->mlx_caps = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_CAPS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_REGS) {
		ddi_regs_map_free(&mlxp->mlx_regs_handle);
		mlxp->mlx_regs_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_REGS;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_PCI_CONFIG) {
		pci_config_teardown(&mlxp->mlx_cfg_handle);
		mlxp->mlx_cfg_handle = NULL;
		mlxp->mlx_attach &= ~MLXCX_ATTACH_PCI_CONFIG;
	}

	if (mlxp->mlx_attach & MLXCX_ATTACH_FM) {
		mlxcx_fm_fini(mlxp);
		mlxp->mlx_attach &= ~MLXCX_ATTACH_FM;
	}

	/* Every phase must have been undone by now. */
	VERIFY3S(mlxp->mlx_attach, ==, 0);
	ddi_soft_state_free(mlxcx_softstate, mlxp->mlx_inst);
	ddi_set_driver_private(dip, NULL);
}
1143 
/*
 * Map the device's main register BAR (MLXCX_REG_NUMBER) into the kernel,
 * setting mlx_regs_base and mlx_regs_handle on success.  Access-handle
 * error checking is enabled when FMA access checking is on.
 */
static boolean_t
mlxcx_regs_map(mlxcx_t *mlxp)
{
	off_t memsize;
	int ret;
	ddi_device_acc_attr_t da;

	if (ddi_dev_regsize(mlxp->mlx_dip, MLXCX_REG_NUMBER, &memsize) !=
	    DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to get register set size");
		return (B_FALSE);
	}

	/*
	 * All data in the main BAR is kept in big-endian even though it's a PCI
	 * device, so we ask for DDI_STRUCTURE_BE_ACC and let the access layer
	 * handle byte order.
	 */
	bzero(&da, sizeof (ddi_device_acc_attr_t));
	da.devacc_attr_version = DDI_DEVICE_ATTR_V0;
	da.devacc_attr_endian_flags = DDI_STRUCTURE_BE_ACC;
	da.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
	if (DDI_FM_ACC_ERR_CAP(mlxp->mlx_fm_caps)) {
		da.devacc_attr_access = DDI_FLAGERR_ACC;
	} else {
		da.devacc_attr_access = DDI_DEFAULT_ACC;
	}

	ret = ddi_regs_map_setup(mlxp->mlx_dip, MLXCX_REG_NUMBER,
	    &mlxp->mlx_regs_base, 0, memsize, &da, &mlxp->mlx_regs_handle);

	if (ret != DDI_SUCCESS) {
		mlxcx_warn(mlxp, "failed to map device registers: %d", ret);
		return (B_FALSE);
	}

	return (B_TRUE);
}
1181 
1182 static boolean_t
1183 mlxcx_check_issi(mlxcx_t *mlxp)
1184 {
1185 	uint32_t issi;
1186 
1187 	if (!mlxcx_cmd_query_issi(mlxp, &issi)) {
1188 		mlxcx_warn(mlxp, "failed to get ISSI");
1189 		return (B_FALSE);
1190 	}
1191 
1192 	if ((issi & (1 << MLXCX_CURRENT_ISSI)) == 0) {
1193 		mlxcx_warn(mlxp, "hardware does not support software ISSI, "
1194 		    "hw vector 0x%x, sw version %u", issi, MLXCX_CURRENT_ISSI);
1195 		return (B_FALSE);
1196 	}
1197 
1198 	if (!mlxcx_cmd_set_issi(mlxp, MLXCX_CURRENT_ISSI)) {
1199 		mlxcx_warn(mlxp, "failed to set ISSI to %u",
1200 		    MLXCX_CURRENT_ISSI);
1201 		return (B_FALSE);
1202 	}
1203 
1204 	return (B_TRUE);
1205 }
1206 
/*
 * Allocate npages pages of DMA memory and donate them to the hardware via
 * MANAGE_PAGES.  Pages accepted by the device are tracked in the mlx_pages
 * AVL tree and counted in mlx_npages for later reclaim by
 * mlxcx_teardown_pages().  On failure, all not-yet-donated pages are freed
 * and B_FALSE is returned; already-donated pages remain in the tree.
 */
boolean_t
mlxcx_give_pages(mlxcx_t *mlxp, int32_t npages)
{
	ddi_device_acc_attr_t acc;
	ddi_dma_attr_t attr;
	int32_t i;
	list_t plist;
	mlxcx_dev_page_t *mdp;
	const ddi_dma_cookie_t *ck;

	/*
	 * If there are no pages required, then we're done here.
	 */
	if (npages <= 0) {
		return (B_TRUE);
	}

	list_create(&plist, sizeof (mlxcx_dev_page_t),
	    offsetof(mlxcx_dev_page_t, mxdp_list));

	for (i = 0; i < npages; i++) {
		mdp = kmem_zalloc(sizeof (mlxcx_dev_page_t), KM_SLEEP);
		mlxcx_dma_acc_attr(mlxp, &acc);
		mlxcx_dma_page_attr(mlxp, &attr);
		if (!mlxcx_dma_alloc(mlxp, &mdp->mxdp_dma, &attr, &acc,
		    B_TRUE, MLXCX_HW_PAGE_SIZE, B_TRUE)) {
			mlxcx_warn(mlxp, "failed to allocate 4k page %u/%u", i,
			    npages);
			kmem_free(mdp, sizeof (mlxcx_dev_page_t));
			goto cleanup_npages;
		}
		/* Remember the physical address -- it's the reclaim key. */
		ck = mlxcx_dma_cookie_one(&mdp->mxdp_dma);
		mdp->mxdp_pa = ck->dmac_laddress;

		list_insert_tail(&plist, mdp);
	}

	/*
	 * Now that all of the pages have been allocated, give them to hardware
	 * in chunks.
	 */
	while (npages > 0) {
		mlxcx_dev_page_t *pages[MLXCX_MANAGE_PAGES_MAX_PAGES];
		int32_t togive = MIN(MLXCX_MANAGE_PAGES_MAX_PAGES, npages);

		for (i = 0; i < togive; i++) {
			pages[i] = list_remove_head(&plist);
		}

		if (!mlxcx_cmd_give_pages(mlxp,
		    MLXCX_MANAGE_PAGES_OPMOD_GIVE_PAGES, togive, pages)) {
			mlxcx_warn(mlxp, "!hardware refused our gift of %u "
			    "pages!", togive);
			/* Put the rejected batch back so it gets freed. */
			for (i = 0; i < togive; i++) {
				list_insert_tail(&plist, pages[i]);
			}
			goto cleanup_npages;
		}

		mutex_enter(&mlxp->mlx_pagemtx);
		for (i = 0; i < togive; i++) {
			avl_add(&mlxp->mlx_pages, pages[i]);
		}
		mlxp->mlx_npages += togive;
		mutex_exit(&mlxp->mlx_pagemtx);
		npages -= togive;
	}

	list_destroy(&plist);

	return (B_TRUE);

cleanup_npages:
	while ((mdp = list_remove_head(&plist)) != NULL) {
		mlxcx_dma_free(&mdp->mxdp_dma);
		kmem_free(mdp, sizeof (mlxcx_dev_page_t));
	}
	list_destroy(&plist);
	return (B_FALSE);
}
1287 
1288 static boolean_t
1289 mlxcx_init_pages(mlxcx_t *mlxp, uint_t type)
1290 {
1291 	int32_t npages;
1292 
1293 	if (!mlxcx_cmd_query_pages(mlxp, type, &npages)) {
1294 		mlxcx_warn(mlxp, "failed to determine boot pages");
1295 		return (B_FALSE);
1296 	}
1297 
1298 	return (mlxcx_give_pages(mlxp, npages));
1299 }
1300 
1301 static int
1302 mlxcx_bufs_cache_constr(void *arg, void *cookie, int kmflags)
1303 {
1304 	mlxcx_t *mlxp = cookie;
1305 	mlxcx_buffer_t *b = arg;
1306 
1307 	bzero(b, sizeof (mlxcx_buffer_t));
1308 	b->mlb_mlx = mlxp;
1309 	b->mlb_state = MLXCX_BUFFER_INIT;
1310 	list_create(&b->mlb_tx_chain, sizeof (mlxcx_buffer_t),
1311 	    offsetof(mlxcx_buffer_t, mlb_tx_chain_entry));
1312 
1313 	return (0);
1314 }
1315 
1316 static void
1317 mlxcx_bufs_cache_destr(void *arg, void *cookie)
1318 {
1319 	mlxcx_t *mlxp = cookie;
1320 	mlxcx_buffer_t *b = arg;
1321 	VERIFY3P(b->mlb_mlx, ==, mlxp);
1322 	VERIFY(b->mlb_state == MLXCX_BUFFER_INIT);
1323 	list_destroy(&b->mlb_tx_chain);
1324 }
1325 
/*
 * Allocate and initialise a new buffer shard, linking it onto the
 * instance's mlx_buf_shards list.  The shard mutex is initialised at
 * interrupt priority so it can be taken from interrupt context.
 * Torn down by mlxcx_mlbs_teardown() via mlxcx_teardown_bufs().
 */
mlxcx_buf_shard_t *
mlxcx_mlbs_create(mlxcx_t *mlxp)
{
	mlxcx_buf_shard_t *s;

	s = kmem_zalloc(sizeof (mlxcx_buf_shard_t), KM_SLEEP);

	mutex_init(&s->mlbs_mtx, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
	list_create(&s->mlbs_busy, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	list_create(&s->mlbs_free, sizeof (mlxcx_buffer_t),
	    offsetof(mlxcx_buffer_t, mlb_entry));
	cv_init(&s->mlbs_free_nonempty, NULL, CV_DRIVER, NULL);

	list_insert_tail(&mlxp->mlx_buf_shards, s);

	return (s);
}
1345 
1346 static boolean_t
1347 mlxcx_setup_bufs(mlxcx_t *mlxp)
1348 {
1349 	char namebuf[KSTAT_STRLEN];
1350 
1351 	(void) snprintf(namebuf, KSTAT_STRLEN, "mlxcx%d_bufs_cache",
1352 	    ddi_get_instance(mlxp->mlx_dip));
1353 	mlxp->mlx_bufs_cache = kmem_cache_create(namebuf,
1354 	    sizeof (mlxcx_buffer_t), sizeof (uint64_t),
1355 	    mlxcx_bufs_cache_constr, mlxcx_bufs_cache_destr,
1356 	    NULL, mlxp, NULL, 0);
1357 
1358 	list_create(&mlxp->mlx_buf_shards, sizeof (mlxcx_buf_shard_t),
1359 	    offsetof(mlxcx_buf_shard_t, mlbs_entry));
1360 
1361 	return (B_TRUE);
1362 }
1363 
/*
 * Post a "qstate.err" FM ereport recording that queue qnum (of kind qtype:
 * "event", "completion", "send" or "receive" -- see the callers below) was
 * found in an unexpected hardware state, and mark this instance's service
 * as degraded.  No-op when ereports are not enabled.
 */
static void
mlxcx_fm_qstate_ereport(mlxcx_t *mlxp, const char *qtype, uint32_t qnum,
    const char *state, uint8_t statenum)
{
	uint64_t ena;
	char buf[FM_MAX_CLASS];

	if (!DDI_FM_EREPORT_CAP(mlxp->mlx_fm_caps))
		return;

	(void) snprintf(buf, FM_MAX_CLASS, "%s.%s",
	    MLXCX_FM_SERVICE_MLXCX, "qstate.err");
	ena = fm_ena_generate(0, FM_ENA_FMT1);

	ddi_fm_ereport_post(mlxp->mlx_dip, buf, ena, DDI_NOSLEEP,
	    FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERS0,
	    "state", DATA_TYPE_STRING, state,
	    "state_num", DATA_TYPE_UINT8, statenum,
	    "qtype", DATA_TYPE_STRING, qtype,
	    "qnum", DATA_TYPE_UINT32, qnum,
	    NULL);
	ddi_fm_service_impact(mlxp->mlx_dip, DDI_SERVICE_DEGRADED);
}
1387 
/*
 * Periodic health check for event queues (installed by
 * mlxcx_setup_checktimers()).  For each live EQ we query its context from
 * hardware, post an ereport if its status is bad, and try to detect EQs
 * that we believe are armed but which hardware reports as disarmed: only
 * after three consecutive checks with no consumer-counter movement do we
 * report a stall.
 */
static void
mlxcx_eq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_event_queue_t *eq;
	mlxcx_eventq_ctx_t ctx;
	const char *str;

	uint_t i;

	for (i = 0; i < mlxp->mlx_intr_count; ++i) {
		eq = &mlxp->mlx_eqs[i];
		/*
		 * NOTE(review): mleq_state is read here without mleq_mtx;
		 * presumably benign for a periodic check (worst case we skip
		 * this tick) -- confirm against the EQ state transitions.
		 */
		if (!(eq->mleq_state & MLXCX_EQ_CREATED) ||
		    (eq->mleq_state & MLXCX_EQ_DESTROYED))
			continue;
		mutex_enter(&eq->mleq_mtx);
		if (!mlxcx_cmd_query_eq(mlxp, eq, &ctx)) {
			mutex_exit(&eq->mleq_mtx);
			continue;
		}

		/* Translate the hardware status for the warning message. */
		str = "???";
		switch (ctx.mleqc_status) {
		case MLXCX_EQ_STATUS_OK:
			break;
		case MLXCX_EQ_STATUS_WRITE_FAILURE:
			str = "WRITE_FAILURE";
			break;
		}
		if (ctx.mleqc_status != MLXCX_EQ_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "event",
			    eq->mleq_num, str, ctx.mleqc_status);
			mlxcx_warn(mlxp, "EQ %u is in bad status: %x (%s)",
			    eq->mleq_intr_index, ctx.mleqc_status, str);
		}

		if (ctx.mleqc_state != MLXCX_EQ_ST_ARMED &&
		    (eq->mleq_state & MLXCX_EQ_ARMED)) {
			if (eq->mleq_cc == eq->mleq_check_disarm_cc &&
			    ++eq->mleq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "EQ %u isn't armed",
				    eq->mleq_intr_index);
			}
			eq->mleq_check_disarm_cc = eq->mleq_cc;
		} else {
			/* EQ is healthy; reset the stall detector. */
			eq->mleq_check_disarm_cc = 0;
			eq->mleq_check_disarm_cnt = 0;
		}

		mutex_exit(&eq->mleq_mtx);
	}
}
1441 
/*
 * Periodic health check for completion queues (installed by
 * mlxcx_setup_checktimers()).  Mirrors mlxcx_eq_check(): query each live
 * CQ's context, ereport bad status (at most once per queue, via
 * mlcq_fm_repd_qstate), and detect CQs stuck disarmed for three
 * consecutive checks with no consumer-counter movement.  CQs being polled
 * are exempt from the arm check since polling mode legitimately leaves
 * them disarmed.
 */
static void
mlxcx_cq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_completion_queue_t *cq;
	mlxcx_completionq_ctx_t ctx;
	const char *str, *type;
	uint_t v;

	for (cq = list_head(&mlxp->mlx_cqs); cq != NULL;
	    cq = list_next(&mlxp->mlx_cqs, cq)) {
		mutex_enter(&cq->mlcq_mtx);
		if (!(cq->mlcq_state & MLXCX_CQ_CREATED) ||
		    (cq->mlcq_state & MLXCX_CQ_DESTROYED) ||
		    (cq->mlcq_state & MLXCX_CQ_TEARDOWN)) {
			mutex_exit(&cq->mlcq_mtx);
			continue;
		}
		/* Already reported on this queue; don't spam. */
		if (cq->mlcq_fm_repd_qstate) {
			mutex_exit(&cq->mlcq_mtx);
			continue;
		}
		if (!mlxcx_cmd_query_cq(mlxp, cq, &ctx)) {
			mutex_exit(&cq->mlcq_mtx);
			continue;
		}
		/* Label messages by the direction of the attached WQ. */
		if (cq->mlcq_wq != NULL) {
			mlxcx_work_queue_t *wq = cq->mlcq_wq;
			if (wq->mlwq_type == MLXCX_WQ_TYPE_RECVQ)
				type = "rx ";
			else if (wq->mlwq_type == MLXCX_WQ_TYPE_SENDQ)
				type = "tx ";
			else
				type = "";
		} else {
			type = "";
		}

		str = "???";
		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATUS);
		switch (v) {
		case MLXCX_CQC_STATUS_OK:
			break;
		case MLXCX_CQC_STATUS_OVERFLOW:
			str = "OVERFLOW";
			break;
		case MLXCX_CQC_STATUS_WRITE_FAIL:
			str = "WRITE_FAIL";
			break;
		case MLXCX_CQC_STATUS_INVALID:
			str = "INVALID";
			break;
		}
		if (v != MLXCX_CQC_STATUS_OK) {
			mlxcx_fm_qstate_ereport(mlxp, "completion",
			    cq->mlcq_num, str, v);
			mlxcx_warn(mlxp, "%sCQ 0x%x is in bad status: %x (%s)",
			    type, cq->mlcq_num, v, str);
			cq->mlcq_fm_repd_qstate = B_TRUE;
		}

		v = get_bits32(ctx.mlcqc_flags, MLXCX_CQ_CTX_STATE);
		if (v != MLXCX_CQC_STATE_ARMED &&
		    (cq->mlcq_state & MLXCX_CQ_ARMED) &&
		    !(cq->mlcq_state & MLXCX_CQ_POLLING)) {
			if (cq->mlcq_cc == cq->mlcq_check_disarm_cc &&
			    ++cq->mlcq_check_disarm_cnt >= 3) {
				mlxcx_fm_ereport(mlxp, DDI_FM_DEVICE_STALL);
				mlxcx_warn(mlxp, "%sCQ 0x%x (%p) isn't armed",
				    type, cq->mlcq_num, cq);
			}
			cq->mlcq_check_disarm_cc = cq->mlcq_cc;
		} else {
			/* CQ is healthy; reset the stall detector. */
			cq->mlcq_check_disarm_cnt = 0;
			cq->mlcq_check_disarm_cc = 0;
		}
		mutex_exit(&cq->mlcq_mtx);
	}
}
1521 
/*
 * Verify a send queue's hardware state agrees with our software view
 * (MLXCX_WQ_STARTED): RST while started or RDY while stopped is reported,
 * as is the ERR state or anything unrecognised.  Setting
 * mlwq_fm_repd_qstate stops mlxcx_wq_check() from re-reporting the same
 * queue.  The caller must hold mlwq_mtx.
 */
void
mlxcx_check_sq(mlxcx_t *mlxp, mlxcx_work_queue_t *sq)
{
	mlxcx_sq_ctx_t ctx;
	mlxcx_sq_state_t state;

	ASSERT(mutex_owned(&sq->mlwq_mtx));

	if (!mlxcx_cmd_query_sq(mlxp, sq, &ctx))
		return;

	/* The SQ should still be attached to the CQ we gave it. */
	ASSERT3U(from_be24(ctx.mlsqc_cqn), ==, sq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlsqc_flags, MLXCX_SQ_STATE);
	switch (state) {
	case MLXCX_SQ_STATE_RST:
		if (sq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RST", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_RDY:
		if (!(sq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "send",
			    sq->mlwq_num, "RDY", state);
			sq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_SQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "ERR", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "send",
		    sq->mlwq_num, "???", state);
		sq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}
1562 
/*
 * Verify a receive queue's hardware state agrees with our software view;
 * the receive-side counterpart of mlxcx_check_sq().  The caller must hold
 * mlwq_mtx.
 */
void
mlxcx_check_rq(mlxcx_t *mlxp, mlxcx_work_queue_t *rq)
{
	mlxcx_rq_ctx_t ctx;
	mlxcx_rq_state_t state;

	ASSERT(mutex_owned(&rq->mlwq_mtx));

	if (!mlxcx_cmd_query_rq(mlxp, rq, &ctx))
		return;

	/* The RQ should still be attached to the CQ we gave it. */
	ASSERT3U(from_be24(ctx.mlrqc_cqn), ==, rq->mlwq_cq->mlcq_num);
	state = get_bits32(ctx.mlrqc_flags, MLXCX_RQ_STATE);
	switch (state) {
	case MLXCX_RQ_STATE_RST:
		if (rq->mlwq_state & MLXCX_WQ_STARTED) {
			mlxcx_fm_qstate_ereport(mlxp, "receive",
			    rq->mlwq_num, "RST", state);
			rq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_RQ_STATE_RDY:
		if (!(rq->mlwq_state & MLXCX_WQ_STARTED)) {
			mlxcx_fm_qstate_ereport(mlxp, "receive",
			    rq->mlwq_num, "RDY", state);
			rq->mlwq_fm_repd_qstate = B_TRUE;
		}
		break;
	case MLXCX_RQ_STATE_ERR:
		mlxcx_fm_qstate_ereport(mlxp, "receive",
		    rq->mlwq_num, "ERR", state);
		rq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	default:
		mlxcx_fm_qstate_ereport(mlxp, "receive",
		    rq->mlwq_num, "???", state);
		rq->mlwq_fm_repd_qstate = B_TRUE;
		break;
	}
}
1603 
/*
 * Periodic health check for work queues (installed by
 * mlxcx_setup_checktimers()).  Dispatches each live, not-yet-reported WQ
 * to the type-specific checker (mlxcx_check_sq()/mlxcx_check_rq()).
 */
static void
mlxcx_wq_check(void *arg)
{
	mlxcx_t *mlxp = (mlxcx_t *)arg;
	mlxcx_work_queue_t *wq;

	for (wq = list_head(&mlxp->mlx_wqs); wq != NULL;
	    wq = list_next(&mlxp->mlx_wqs, wq)) {
		mutex_enter(&wq->mlwq_mtx);
		if (!(wq->mlwq_state & MLXCX_WQ_CREATED) ||
		    (wq->mlwq_state & MLXCX_WQ_DESTROYED) ||
		    (wq->mlwq_state & MLXCX_WQ_TEARDOWN)) {
			mutex_exit(&wq->mlwq_mtx);
			continue;
		}
		/* Already reported on this queue; don't spam. */
		if (wq->mlwq_fm_repd_qstate) {
			mutex_exit(&wq->mlwq_mtx);
			continue;
		}
		switch (wq->mlwq_type) {
		case MLXCX_WQ_TYPE_SENDQ:
			mlxcx_check_sq(mlxp, wq);
			break;
		case MLXCX_WQ_TYPE_RECVQ:
			mlxcx_check_rq(mlxp, wq);
			break;
		}
		mutex_exit(&wq->mlwq_mtx);
	}
}
1634 
/*
 * Install the periodic EQ/CQ/WQ health checks.  Each timer is only created
 * when its check-interval property is non-zero;
 * mlxcx_teardown_checktimers() applies the same test when deleting them.
 * Intervals are configured in seconds; ddi_periodic_add() wants
 * nanoseconds.
 */
static boolean_t
mlxcx_setup_checktimers(mlxcx_t *mlxp)
{
	if (mlxp->mlx_props.mldp_eq_check_interval_sec > 0) {
		mlxp->mlx_eq_checktimer = ddi_periodic_add(mlxcx_eq_check, mlxp,
		    mlxp->mlx_props.mldp_eq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_cq_check_interval_sec > 0) {
		mlxp->mlx_cq_checktimer = ddi_periodic_add(mlxcx_cq_check, mlxp,
		    mlxp->mlx_props.mldp_cq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	if (mlxp->mlx_props.mldp_wq_check_interval_sec > 0) {
		mlxp->mlx_wq_checktimer = ddi_periodic_add(mlxcx_wq_check, mlxp,
		    mlxp->mlx_props.mldp_wq_check_interval_sec * NANOSEC,
		    DDI_IPL_0);
	}
	return (B_TRUE);
}
1655 
1656 int
1657 mlxcx_dmac_fe_compare(const void *arg0, const void *arg1)
1658 {
1659 	const mlxcx_flow_entry_t *left = arg0;
1660 	const mlxcx_flow_entry_t *right = arg1;
1661 	int bcmpr;
1662 
1663 	bcmpr = memcmp(left->mlfe_dmac, right->mlfe_dmac,
1664 	    sizeof (left->mlfe_dmac));
1665 	if (bcmpr < 0)
1666 		return (-1);
1667 	if (bcmpr > 0)
1668 		return (1);
1669 	if (left->mlfe_vid < right->mlfe_vid)
1670 		return (-1);
1671 	if (left->mlfe_vid > right->mlfe_vid)
1672 		return (1);
1673 	return (0);
1674 }
1675 
1676 int
1677 mlxcx_grmac_compare(const void *arg0, const void *arg1)
1678 {
1679 	const mlxcx_group_mac_t *left = arg0;
1680 	const mlxcx_group_mac_t *right = arg1;
1681 	int bcmpr;
1682 
1683 	bcmpr = memcmp(left->mlgm_mac, right->mlgm_mac,
1684 	    sizeof (left->mlgm_mac));
1685 	if (bcmpr < 0)
1686 		return (-1);
1687 	if (bcmpr > 0)
1688 		return (1);
1689 	return (0);
1690 }
1691 
1692 int
1693 mlxcx_page_compare(const void *arg0, const void *arg1)
1694 {
1695 	const mlxcx_dev_page_t *p0 = arg0;
1696 	const mlxcx_dev_page_t *p1 = arg1;
1697 
1698 	if (p0->mxdp_pa < p1->mxdp_pa)
1699 		return (-1);
1700 	if (p0->mxdp_pa > p1->mxdp_pa)
1701 		return (1);
1702 	return (0);
1703 }
1704 
/*
 * Discover and initialise all ports.  First pass: query each port's vport
 * context, MTU, link status and speed, and enable promisc modification on
 * the vport context.  Second pass: build each port's root NIC RX flow
 * table containing three groups -- broadcast (one entry), unicast/multicast
 * MACs, and a final single-entry catch-all used for promiscuous mode.
 * On any failure everything is unwound via mlxcx_teardown_ports().
 */
static boolean_t
mlxcx_setup_ports(mlxcx_t *mlxp)
{
	uint_t i, j;
	mlxcx_port_t *p;
	mlxcx_flow_table_t *ft;
	mlxcx_flow_group_t *fg;
	mlxcx_flow_entry_t *fe;

	VERIFY3U(mlxp->mlx_nports, >, 0);
	mlxp->mlx_ports_size = mlxp->mlx_nports * sizeof (mlxcx_port_t);
	mlxp->mlx_ports = kmem_zalloc(mlxp->mlx_ports_size, KM_SLEEP);

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		p->mlp_num = i;
		/* Marked INIT early so teardown_ports() cleans us up. */
		p->mlp_init |= MLXCX_PORT_INIT;
		mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));
		mutex_enter(&p->mlp_mtx);
		if (!mlxcx_cmd_query_nic_vport_ctx(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_mtu(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_status(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_query_port_speed(mlxp, p)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		if (!mlxcx_cmd_modify_nic_vport_ctx(mlxp, p,
		    MLXCX_MODIFY_NIC_VPORT_CTX_PROMISC)) {
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		mutex_exit(&p->mlp_mtx);
	}

	for (i = 0; i < mlxp->mlx_nports; ++i) {
		p = &mlxp->mlx_ports[i];
		mutex_enter(&p->mlp_mtx);
		p->mlp_rx_flow = (ft = kmem_zalloc(sizeof (mlxcx_flow_table_t),
		    KM_SLEEP));
		mutex_init(&ft->mlft_mtx, NULL, MUTEX_DRIVER,
		    DDI_INTR_PRI(mlxp->mlx_intr_pri));

		mutex_enter(&ft->mlft_mtx);

		ft->mlft_type = MLXCX_FLOW_TABLE_NIC_RX;
		ft->mlft_port = p;
		/* Configured root table size, capped to what hardware allows */
		ft->mlft_entshift = mlxp->mlx_props.mldp_ftbl_root_size_shift;
		if (ft->mlft_entshift > mlxp->mlx_caps->mlc_max_rx_ft_shift)
			ft->mlft_entshift = mlxp->mlx_caps->mlc_max_rx_ft_shift;
		ft->mlft_nents = (1 << ft->mlft_entshift);
		ft->mlft_entsize = ft->mlft_nents * sizeof (mlxcx_flow_entry_t);
		ft->mlft_ent = kmem_zalloc(ft->mlft_entsize, KM_SLEEP);
		list_create(&ft->mlft_groups, sizeof (mlxcx_flow_group_t),
		    offsetof(mlxcx_flow_group_t, mlfg_entry));

		for (j = 0; j < ft->mlft_nents; ++j) {
			ft->mlft_ent[j].mlfe_table = ft;
			ft->mlft_ent[j].mlfe_index = j;
		}

		if (!mlxcx_cmd_create_flow_table(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		if (!mlxcx_cmd_set_flow_table_root(mlxp, ft)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}

		/*
		 * We match broadcast at the top of the root flow table, then
		 * all multicast/unicast MACs, then the promisc entry is down
		 * the very bottom.
		 *
		 * This way when promisc is on, that entry simply catches any
		 * remaining traffic that earlier flows haven't matched.
		 */
		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_bcast = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		/* The broadcast entry matches ff:ff:ff:ff:ff:ff exactly. */
		(void) memset(fe->mlfe_dmac, 0xff, sizeof (fe->mlfe_dmac));
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		/* Everything except the broadcast and promisc slots. */
		fg->mlfg_size = ft->mlft_nents - 2;
		fg->mlfg_mask |= MLXCX_FLOW_MATCH_DMAC;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_umcast = fg;

		fg = kmem_zalloc(sizeof (mlxcx_flow_group_t), KM_SLEEP);
		list_insert_tail(&ft->mlft_groups, fg);
		fg->mlfg_table = ft;
		fg->mlfg_size = 1;
		if (!mlxcx_setup_flow_group(mlxp, ft, fg)) {
			mutex_exit(&ft->mlft_mtx);
			mutex_exit(&p->mlp_mtx);
			goto err;
		}
		p->mlp_promisc = fg;
		fe = list_head(&fg->mlfg_entries);
		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		avl_create(&p->mlp_dmac_fe, mlxcx_dmac_fe_compare,
		    sizeof (mlxcx_flow_entry_t), offsetof(mlxcx_flow_entry_t,
		    mlfe_dmac_entry));

		mutex_exit(&ft->mlft_mtx);
		mutex_exit(&p->mlp_mtx);
	}

	return (B_TRUE);

err:
	mlxcx_teardown_ports(mlxp);
	return (B_FALSE);
}
1852 
/*
 * Remove every VLAN filter entry attached to ring group g, returning the
 * group to its "accept any VLAN" state.
 *
 * Caller must hold the group mutex; the flow table mutex is taken here.
 */
void
mlxcx_remove_all_vlan_entries(mlxcx_t *mlxp, mlxcx_ring_group_t *g)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/*
	 * Re-enable the default ("any VLAN") entry first so that traffic is
	 * not dropped while the specific VLAN entries below are deleted.
	 */
	if (!list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
	}

	while ((v = list_remove_head(&g->mlg_rx_vlans)) != NULL) {
		fe = v->mlgv_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);
		ASSERT3P(fe->mlfe_group, ==, fg);
		kmem_free(v, sizeof (mlxcx_group_vlan_t));

		/* Best-effort: keep tearing down even if a command fails. */
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
1883 
/*
 * Remove the VLAN filter entry matching (tagged, vid) from ring group g.
 *
 * If this was the group's last VLAN entry, the default "allow any VLAN"
 * flow entry is re-enabled before the specific entry is deleted, so the
 * group never drops traffic during the transition.
 *
 * Caller must hold the group mutex. Returns B_FALSE if the VLAN was not
 * found or a hardware command failed (the VLAN list is restored in that
 * case).
 */
boolean_t
mlxcx_remove_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g,
    boolean_t tagged, uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Find the tracking struct for this (tagged, vid) pair. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_remove(&g->mlg_rx_vlans, v);

	/*
	 * If this is the last VLAN entry, we have to go back to accepting
	 * any VLAN (which means re-enabling the default entry).
	 *
	 * Do this before we remove the flow entry for the last specific
	 * VLAN so that we don't lose any traffic in the transition.
	 */
	if (list_is_empty(&g->mlg_rx_vlans)) {
		fe = list_head(&dfg->mlfg_entries);
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			/* Roll back: put the VLAN back on the list. */
			list_insert_tail(&g->mlg_rx_vlans, v);
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
	}

	fe = v->mlgv_fe;
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED);
	ASSERT(fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED);
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT3P(fe->mlfe_group, ==, fg);

	if (!mlxcx_cmd_delete_flow_table_entry(mlxp, fe)) {
		/*
		 * Deletion failed: restore the VLAN list entry and undo the
		 * default-entry re-enable we may have done above.
		 */
		list_insert_tail(&g->mlg_rx_vlans, v);
		fe = list_head(&dfg->mlfg_entries);
		if (fe->mlfe_state & MLXCX_FLOW_ENTRY_CREATED) {
			(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	kmem_free(v, sizeof (mlxcx_group_vlan_t));

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
1952 
/*
 * Add a VLAN filter entry for (tagged, vid) to ring group g. An entry
 * with tagged == B_FALSE matches untagged frames (MLXCX_VLAN_TYPE_NONE).
 *
 * When the group's VLAN list goes from empty to non-empty, the default
 * "accept any VLAN" entry is disabled -- but only after the new specific
 * entry is in place, so no wanted traffic is lost in the transition.
 *
 * Caller must hold the group mutex. Returns B_TRUE if the VLAN was added
 * (or was already present); B_FALSE if no free flow entries remain or the
 * hardware update failed.
 */
boolean_t
mlxcx_add_vlan_entry(mlxcx_t *mlxp, mlxcx_ring_group_t *g, boolean_t tagged,
    uint16_t vid)
{
	mlxcx_flow_table_t *ft = g->mlg_rx_vlan_ft;
	mlxcx_flow_group_t *fg = g->mlg_rx_vlan_fg;
	mlxcx_flow_group_t *dfg = g->mlg_rx_vlan_def_fg;
	mlxcx_flow_entry_t *fe;
	mlxcx_group_vlan_t *v;
	boolean_t found = B_FALSE;
	boolean_t first = B_FALSE;

	ASSERT(mutex_owned(&g->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	/* Already present? Treat it as success. */
	for (v = list_head(&g->mlg_rx_vlans); v != NULL;
	    v = list_next(&g->mlg_rx_vlans, v)) {
		if (v->mlgv_tagged == tagged && v->mlgv_vid == vid) {
			mutex_exit(&ft->mlft_mtx);
			return (B_TRUE);
		}
	}
	if (list_is_empty(&g->mlg_rx_vlans))
		first = B_TRUE;

	/* Reserve an unused flow entry from the group's VLAN flow group. */
	for (fe = list_head(&fg->mlfg_entries); fe != NULL;
	    fe = list_next(&fg->mlfg_entries, fe)) {
		if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
			found = B_TRUE;
			break;
		}
	}
	if (!found) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	v = kmem_zalloc(sizeof (mlxcx_group_vlan_t), KM_SLEEP);
	v->mlgv_fe = fe;
	v->mlgv_tagged = tagged;
	v->mlgv_vid = vid;

	fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
	fe->mlfe_vid = vid;
	if (tagged) {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_CVLAN;
	} else {
		fe->mlfe_vlan_type = MLXCX_VLAN_TYPE_NONE;
	}

	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
		/* Hardware rejected the entry: release the reservation. */
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
		kmem_free(v, sizeof (mlxcx_group_vlan_t));
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}

	list_insert_tail(&g->mlg_rx_vlans, v);

	/*
	 * If the vlan list was empty for this group before adding this one,
	 * then we no longer want the "default" entry to allow all VLANs
	 * through.
	 */
	if (first) {
		fe = list_head(&dfg->mlfg_entries);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	}

	mutex_exit(&ft->mlft_mtx);
	return (B_TRUE);
}
2028 
/*
 * Remove every unicast/multicast MAC flow destination belonging to this
 * ring group. Entries still used by other ring groups are rewritten with
 * the reduced destination list; entries with no remaining destinations
 * are deleted from hardware and released.
 *
 * Caller must hold both the port and group mutexes.
 */
void
mlxcx_remove_all_umcast_entries(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, *ngm;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_first(&group->mlg_rx_macs);
	for (; gm != NULL; gm = ngm) {
		/* gm is freed and reused below, so save the next node now. */
		ngm = AVL_NEXT(&group->mlg_rx_macs, gm);

		ASSERT3P(gm->mlgm_group, ==, group);
		fe = gm->mlgm_fe;
		ASSERT3P(fe->mlfe_table, ==, ft);

		avl_remove(&group->mlg_rx_macs, gm);
		list_remove(&fe->mlfe_ring_groups, gm);
		kmem_free(gm, sizeof (mlxcx_group_mac_t));

		/*
		 * Rebuild the destination list from the ring groups still
		 * attached to this MAC (gm is reused here as a cursor over
		 * the entry's remaining groups; the outer loop continues
		 * from ngm).
		 */
		fe->mlfe_ndest = 0;
		for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
		    gm = list_next(&fe->mlfe_ring_groups, gm)) {
			fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
			    gm->mlgm_group->mlg_rx_vlan_ft;
		}
		fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

		if (fe->mlfe_ndest > 0) {
			(void) mlxcx_cmd_set_flow_table_entry(mlxp, fe);
			continue;
		}

		/*
		 * There are no more ring groups left for this MAC (it wasn't
		 * attached to any other groups since ndest == 0), so clean up
		 * its flow entry.
		 */
		avl_remove(&port->mlp_dmac_fe, fe);
		(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
		list_destroy(&fe->mlfe_ring_groups);
		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
	}

	mutex_exit(&ft->mlft_mtx);
}
2080 
/*
 * Remove macaddr's flow-table destination for this ring group. If other
 * ring groups still reference the MAC, the flow entry is just rewritten
 * with the reduced destination list; otherwise the entry is deleted from
 * hardware and released.
 *
 * Caller must hold both the port and group mutexes. Returns B_FALSE if
 * the MAC was not found on this group or the hardware update failed.
 */
boolean_t
mlxcx_remove_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
    mlxcx_ring_group_t *group, const uint8_t *macaddr)
{
	mlxcx_flow_entry_t *fe;
	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
	mlxcx_group_mac_t *gm, probe;

	ASSERT(mutex_owned(&port->mlp_mtx));
	ASSERT(mutex_owned(&group->mlg_mtx));

	/* Look up this group's tracking struct for the MAC by address. */
	bzero(&probe, sizeof (probe));
	bcopy(macaddr, probe.mlgm_mac, sizeof (probe.mlgm_mac));

	mutex_enter(&ft->mlft_mtx);

	gm = avl_find(&group->mlg_rx_macs, &probe, NULL);
	if (gm == NULL) {
		mutex_exit(&ft->mlft_mtx);
		return (B_FALSE);
	}
	ASSERT3P(gm->mlgm_group, ==, group);
	ASSERT0(bcmp(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac)));

	fe = gm->mlgm_fe;
	ASSERT3P(fe->mlfe_table, ==, ft);
	ASSERT0(bcmp(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac)));

	list_remove(&fe->mlfe_ring_groups, gm);
	avl_remove(&group->mlg_rx_macs, gm);
	kmem_free(gm, sizeof (mlxcx_group_mac_t));

	/*
	 * Rebuild the destination list from the ring groups still attached
	 * to this MAC (gm is reused as a list cursor here).
	 */
	fe->mlfe_ndest = 0;
	for (gm = list_head(&fe->mlfe_ring_groups); gm != NULL;
	    gm = list_next(&fe->mlfe_ring_groups, gm)) {
		fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow =
		    gm->mlgm_group->mlg_rx_vlan_ft;
	}
	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;

	if (fe->mlfe_ndest > 0) {
		/*
		 * NOTE(review): if this set fails, gm has already been freed
		 * and removed from the lists, so our bookkeeping and the
		 * hardware state disagree until the entry is next rewritten
		 * -- confirm callers tolerate this on the B_FALSE path.
		 */
		if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
			mutex_exit(&ft->mlft_mtx);
			return (B_FALSE);
		}
		mutex_exit(&ft->mlft_mtx);
		return (B_TRUE);
	}

	/*
	 * There are no more ring groups left for this MAC (it wasn't attached
	 * to any other groups since ndest == 0), so clean up its flow entry.
	 */
	avl_remove(&port->mlp_dmac_fe, fe);
	(void) mlxcx_cmd_delete_flow_table_entry(mlxp, fe);
	list_destroy(&fe->mlfe_ring_groups);

	fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;

	mutex_exit(&ft->mlft_mtx);

	return (B_TRUE);
}
2144 
2145 boolean_t
2146 mlxcx_add_umcast_entry(mlxcx_t *mlxp, mlxcx_port_t *port,
2147     mlxcx_ring_group_t *group, const uint8_t *macaddr)
2148 {
2149 	mlxcx_flow_group_t *fg;
2150 	mlxcx_flow_entry_t *fe, probe;
2151 	mlxcx_flow_table_t *ft = port->mlp_rx_flow;
2152 	mlxcx_group_mac_t *gm;
2153 	boolean_t found = B_FALSE;
2154 
2155 	ASSERT(mutex_owned(&port->mlp_mtx));
2156 	ASSERT(mutex_owned(&group->mlg_mtx));
2157 
2158 	bzero(&probe, sizeof (probe));
2159 	bcopy(macaddr, probe.mlfe_dmac, sizeof (probe.mlfe_dmac));
2160 
2161 	mutex_enter(&ft->mlft_mtx);
2162 
2163 	fe = avl_find(&port->mlp_dmac_fe, &probe, NULL);
2164 
2165 	if (fe == NULL) {
2166 		fg = port->mlp_umcast;
2167 		for (fe = list_head(&fg->mlfg_entries); fe != NULL;
2168 		    fe = list_next(&fg->mlfg_entries, fe)) {
2169 			if (!(fe->mlfe_state & MLXCX_FLOW_ENTRY_RESERVED)) {
2170 				found = B_TRUE;
2171 				break;
2172 			}
2173 		}
2174 		if (!found) {
2175 			mutex_exit(&ft->mlft_mtx);
2176 			return (B_FALSE);
2177 		}
2178 		list_create(&fe->mlfe_ring_groups, sizeof (mlxcx_group_mac_t),
2179 		    offsetof(mlxcx_group_mac_t, mlgm_fe_entry));
2180 		fe->mlfe_state |= MLXCX_FLOW_ENTRY_RESERVED;
2181 		fe->mlfe_action = MLXCX_FLOW_ACTION_FORWARD;
2182 		bcopy(macaddr, fe->mlfe_dmac, sizeof (fe->mlfe_dmac));
2183 
2184 		avl_add(&port->mlp_dmac_fe, fe);
2185 	}
2186 
2187 	fe->mlfe_dest[fe->mlfe_ndest++].mlfed_flow = group->mlg_rx_vlan_ft;
2188 	fe->mlfe_state |= MLXCX_FLOW_ENTRY_DIRTY;
2189 
2190 	if (!mlxcx_cmd_set_flow_table_entry(mlxp, fe)) {
2191 		fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_DIRTY;
2192 		if (--fe->mlfe_ndest == 0) {
2193 			fe->mlfe_state &= ~MLXCX_FLOW_ENTRY_RESERVED;
2194 		}
2195 		mutex_exit(&ft->mlft_mtx);
2196 		return (B_FALSE);
2197 	}
2198 
2199 	gm = kmem_zalloc(sizeof (mlxcx_group_mac_t), KM_SLEEP);
2200 	gm->mlgm_group = group;
2201 	gm->mlgm_fe = fe;
2202 	bcopy(macaddr, gm->mlgm_mac, sizeof (gm->mlgm_mac));
2203 	avl_add(&group->mlg_rx_macs, gm);
2204 	list_insert_tail(&fe->mlfe_ring_groups, gm);
2205 
2206 	mutex_exit(&ft->mlft_mtx);
2207 
2208 	return (B_TRUE);
2209 }
2210 
2211 boolean_t
2212 mlxcx_setup_flow_group(mlxcx_t *mlxp, mlxcx_flow_table_t *ft,
2213     mlxcx_flow_group_t *fg)
2214 {
2215 	mlxcx_flow_entry_t *fe;
2216 	uint_t i, idx;
2217 
2218 	ASSERT(mutex_owned(&ft->mlft_mtx));
2219 	ASSERT(ft->mlft_state & MLXCX_FLOW_TABLE_CREATED);
2220 	ASSERT3P(fg->mlfg_table, ==, ft);
2221 
2222 	if (ft->mlft_next_ent + fg->mlfg_size > ft->mlft_nents)
2223 		return (B_FALSE);
2224 	fg->mlfg_start_idx = ft->mlft_next_ent;
2225 
2226 	if (!mlxcx_cmd_create_flow_group(mlxp, fg)) {
2227 		return (B_FALSE);
2228 	}
2229 
2230 	list_create(&fg->mlfg_entries, sizeof (mlxcx_flow_entry_t),
2231 	    offsetof(mlxcx_flow_entry_t, mlfe_group_entry));
2232 	for (i = 0; i < fg->mlfg_size; ++i) {
2233 		idx = fg->mlfg_start_idx + i;
2234 		fe = &ft->mlft_ent[idx];
2235 		fe->mlfe_group = fg;
2236 		list_insert_tail(&fg->mlfg_entries, fe);
2237 	}
2238 	fg->mlfg_avail = fg->mlfg_size;
2239 	ft->mlft_next_ent += fg->mlfg_size;
2240 
2241 	return (B_TRUE);
2242 }
2243 
/*
 * Set up event queue 0, which is dedicated to control-path events (page
 * requests, port state changes, errors, etc). This also enables and arms
 * interrupt vector 0. On failure, allocated resources are left for
 * mlxcx_teardown_eqs() to reclaim.
 */
static boolean_t
mlxcx_setup_eq0(mlxcx_t *mlxp)
{
	mlxcx_event_queue_t *mleq = &mlxp->mlx_eqs[0];

	mutex_enter(&mleq->mleq_mtx);
	if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mleq->mleq_mlx = mlxp;
	mleq->mleq_uar = &mlxp->mlx_uar;
	/* Subscribe EQ 0 to every control/async event class we handle. */
	mleq->mleq_events =
	    (1ULL << MLXCX_EVENT_PAGE_REQUEST) |
	    (1ULL << MLXCX_EVENT_PORT_STATE) |
	    (1ULL << MLXCX_EVENT_INTERNAL_ERROR) |
	    (1ULL << MLXCX_EVENT_PORT_MODULE) |
	    (1ULL << MLXCX_EVENT_SENDQ_DRAIN) |
	    (1ULL << MLXCX_EVENT_LAST_WQE) |
	    (1ULL << MLXCX_EVENT_CQ_ERROR) |
	    (1ULL << MLXCX_EVENT_WQ_CATASTROPHE) |
	    (1ULL << MLXCX_EVENT_PAGE_FAULT) |
	    (1ULL << MLXCX_EVENT_WQ_INVALID_REQ) |
	    (1ULL << MLXCX_EVENT_WQ_ACCESS_VIOL) |
	    (1ULL << MLXCX_EVENT_NIC_VPORT) |
	    (1ULL << MLXCX_EVENT_DOORBELL_CONGEST);
	if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
		/* mlxcx_teardown_eqs() will clean this up */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	if (ddi_intr_enable(mlxp->mlx_intr_handles[0]) != DDI_SUCCESS) {
		/*
		 * mlxcx_teardown_eqs() will handle calling cmd_destroy_eq and
		 * eq_rele_dma
		 */
		mutex_exit(&mleq->mleq_mtx);
		return (B_FALSE);
	}
	mlxcx_arm_eq(mlxp, mleq);
	mutex_exit(&mleq->mleq_mtx);
	return (B_TRUE);
}
2288 
2289 int
2290 mlxcx_cq_compare(const void *arg0, const void *arg1)
2291 {
2292 	const mlxcx_completion_queue_t *left = arg0;
2293 	const mlxcx_completion_queue_t *right = arg1;
2294 
2295 	if (left->mlcq_num < right->mlcq_num) {
2296 		return (-1);
2297 	}
2298 	if (left->mlcq_num > right->mlcq_num) {
2299 		return (1);
2300 	}
2301 	return (0);
2302 }
2303 
/*
 * Set up, enable and arm the remaining event queues (1 .. intr_count-1),
 * which service completion events from CQs. EQ 0 is special (control
 * events only) and is handled by mlxcx_setup_eq0(), hence the loop
 * starting at 1. On failure, partially set up EQs are left for teardown
 * to reclaim.
 */
static boolean_t
mlxcx_setup_eqs(mlxcx_t *mlxp)
{
	uint_t i;
	mlxcx_event_queue_t *mleq;

	ASSERT3S(mlxp->mlx_intr_count, >, 0);

	for (i = 1; i < mlxp->mlx_intr_count; ++i) {
		mleq = &mlxp->mlx_eqs[i];
		mutex_enter(&mleq->mleq_mtx);
		if (!mlxcx_eq_alloc_dma(mlxp, mleq)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mleq->mleq_uar = &mlxp->mlx_uar;
		if (!mlxcx_cmd_create_eq(mlxp, mleq)) {
			/* mlxcx_teardown() will handle calling eq_rele_dma */
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		/* Apply interrupt moderation if it has been configured. */
		if (mlxp->mlx_props.mldp_intrmod_period_usec != 0 &&
		    !mlxcx_cmd_set_int_mod(mlxp, i,
		    mlxp->mlx_props.mldp_intrmod_period_usec)) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		if (ddi_intr_enable(mlxp->mlx_intr_handles[i]) != DDI_SUCCESS) {
			mutex_exit(&mleq->mleq_mtx);
			return (B_FALSE);
		}
		mlxcx_arm_eq(mlxp, mleq);
		mutex_exit(&mleq->mleq_mtx);
	}

	/* EQ 0 is reserved for control events; CQ assignment starts at 1. */
	mlxp->mlx_next_eq = 1;

	return (B_TRUE);
}
2343 
2344 /*
2345  * Snapshot all of the hardware capabilities that we care about and then modify
2346  * the HCA capabilities to get things moving.
2347  */
2348 static boolean_t
2349 mlxcx_init_caps(mlxcx_t *mlxp)
2350 {
2351 	mlxcx_caps_t *c;
2352 
2353 	mlxp->mlx_caps = c = kmem_zalloc(sizeof (mlxcx_caps_t), KM_SLEEP);
2354 
2355 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2356 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_hca_cur)) {
2357 		mlxcx_warn(mlxp, "failed to obtain current HCA general caps");
2358 	}
2359 
2360 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_GENERAL,
2361 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_hca_max)) {
2362 		mlxcx_warn(mlxp, "failed to obtain maximum HCA general caps");
2363 	}
2364 
2365 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2366 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_ether_cur)) {
2367 		mlxcx_warn(mlxp, "failed to obtain current HCA eth caps");
2368 	}
2369 
2370 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_ETHERNET,
2371 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_ether_max)) {
2372 		mlxcx_warn(mlxp, "failed to obtain maximum HCA eth caps");
2373 	}
2374 
2375 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2376 	    MLXCX_HCA_CAP_MODE_CURRENT, &c->mlc_nic_flow_cur)) {
2377 		mlxcx_warn(mlxp, "failed to obtain current HCA flow caps");
2378 	}
2379 
2380 	if (!mlxcx_cmd_query_hca_cap(mlxp, MLXCX_HCA_CAP_NIC_FLOW,
2381 	    MLXCX_HCA_CAP_MODE_MAX, &c->mlc_nic_flow_max)) {
2382 		mlxcx_warn(mlxp, "failed to obtain maximum HCA flow caps");
2383 	}
2384 
2385 	/*
2386 	 * Check the caps meet our requirements.
2387 	 */
2388 	const mlxcx_hca_cap_general_caps_t *gen = &c->mlc_hca_cur.mhc_general;
2389 
2390 	if (gen->mlcap_general_log_pg_sz != 12) {
2391 		mlxcx_warn(mlxp, "!hardware has page size != 4k "
2392 		    "(log_pg_sz = %u)", (uint_t)gen->mlcap_general_log_pg_sz);
2393 		goto err;
2394 	}
2395 	if (gen->mlcap_general_cqe_version != 1) {
2396 		mlxcx_warn(mlxp, "!hardware does not support CQE v1 "
2397 		    "(cqe_ver = %u)", (uint_t)gen->mlcap_general_cqe_version);
2398 		goto err;
2399 	}
2400 	if (gen->mlcap_general_port_type !=
2401 	    MLXCX_CAP_GENERAL_PORT_TYPE_ETHERNET) {
2402 		mlxcx_warn(mlxp, "!hardware has non-ethernet ports");
2403 		goto err;
2404 	}
2405 	mlxp->mlx_nports = gen->mlcap_general_num_ports;
2406 	mlxp->mlx_max_sdu = (1 << (gen->mlcap_general_log_max_msg & 0x1F));
2407 
2408 	c->mlc_max_tir = (1 << gen->mlcap_general_log_max_tir);
2409 
2410 	c->mlc_checksum = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2411 	    MLXCX_ETH_CAP_CSUM_CAP);
2412 	c->mlc_vxlan = get_bit32(c->mlc_ether_cur.mhc_eth.mlcap_eth_flags,
2413 	    MLXCX_ETH_CAP_TUNNEL_STATELESS_VXLAN);
2414 
2415 	c->mlc_max_lso_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2416 	    mlcap_eth_flags, MLXCX_ETH_CAP_MAX_LSO_CAP));
2417 	if (c->mlc_max_lso_size == 1) {
2418 		c->mlc_max_lso_size = 0;
2419 		c->mlc_lso = B_FALSE;
2420 	} else {
2421 		c->mlc_lso = B_TRUE;
2422 	}
2423 
2424 	c->mlc_max_rqt_size = (1 << get_bits32(c->mlc_ether_cur.mhc_eth.
2425 	    mlcap_eth_flags, MLXCX_ETH_CAP_RSS_IND_TBL_CAP));
2426 
2427 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2428 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_SUPPORT)) {
2429 		mlxcx_warn(mlxp, "!hardware does not support rx flow tables");
2430 		goto err;
2431 	}
2432 	if (!get_bit32(c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2433 	    mlcap_flow_prop_flags, MLXCX_FLOW_CAP_PROPS_MODIFY)) {
2434 		mlxcx_warn(mlxp, "!hardware does not support modifying rx "
2435 		    "flow table entries");
2436 		goto err;
2437 	}
2438 
2439 	c->mlc_max_rx_ft_shift = c->mlc_nic_flow_cur.mhc_flow.mlcap_flow_nic_rx.
2440 	    mlcap_flow_prop_log_max_ft_size;
2441 	c->mlc_max_rx_flows = (1 << c->mlc_nic_flow_cur.mhc_flow.
2442 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_flow);
2443 	c->mlc_max_rx_fe_dest = (1 << c->mlc_nic_flow_cur.mhc_flow.
2444 	    mlcap_flow_nic_rx.mlcap_flow_prop_log_max_destination);
2445 
2446 	return (B_TRUE);
2447 
2448 err:
2449 	kmem_free(mlxp->mlx_caps, sizeof (mlxcx_caps_t));
2450 	return (B_FALSE);
2451 }
2452 
2453 static int
2454 mlxcx_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2455 {
2456 	mlxcx_t *mlxp;
2457 
2458 	if (cmd != DDI_DETACH)
2459 		return (DDI_FAILURE);
2460 
2461 	mlxp = ddi_get_driver_private(dip);
2462 	if (mlxp == NULL) {
2463 		mlxcx_warn(NULL, "asked to detach, but missing instance "
2464 		    "private data");
2465 		return (DDI_FAILURE);
2466 	}
2467 
2468 	if (mlxp->mlx_attach & MLXCX_ATTACH_MAC_HDL) {
2469 		if (mac_unregister(mlxp->mlx_mac_hdl) != DDI_SUCCESS) {
2470 			return (DDI_FAILURE);
2471 		}
2472 		mlxp->mlx_attach &= ~MLXCX_ATTACH_MAC_HDL;
2473 	}
2474 
2475 	mlxcx_teardown(mlxp);
2476 	return (DDI_SUCCESS);
2477 }
2478 
2479 static size_t
2480 mlxcx_calc_rx_ngroups(mlxcx_t *mlxp)
2481 {
2482 	size_t ngroups = mlxp->mlx_props.mldp_rx_ngroups_large +
2483 	    mlxp->mlx_props.mldp_rx_ngroups_small;
2484 	size_t tirlim, flowlim, gflowlim;
2485 
2486 	tirlim = mlxp->mlx_caps->mlc_max_tir / MLXCX_TIRS_PER_GROUP;
2487 	if (tirlim < ngroups) {
2488 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2489 		    "on number of TIRs available", tirlim);
2490 		ngroups = tirlim;
2491 	}
2492 
2493 	flowlim = (1 << mlxp->mlx_caps->mlc_max_rx_ft_shift) - 2;
2494 	if (flowlim < ngroups) {
2495 		mlxcx_note(mlxp, "limiting number of rx groups to %u based "
2496 		    "on max size of RX flow tables", flowlim);
2497 		ngroups = flowlim;
2498 	}
2499 
2500 	do {
2501 		gflowlim = mlxp->mlx_caps->mlc_max_rx_flows - 16 * ngroups - 2;
2502 		if (gflowlim < ngroups) {
2503 			mlxcx_note(mlxp, "limiting number of rx groups to %u "
2504 			    "based on max total RX flows", gflowlim);
2505 			--ngroups;
2506 		}
2507 	} while (gflowlim < ngroups);
2508 
2509 	return (ngroups);
2510 }
2511 
2512 static int
2513 mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2514 {
2515 	mlxcx_t *mlxp;
2516 	uint_t i;
2517 	int inst, ret;
2518 
2519 	if (cmd != DDI_ATTACH)
2520 		return (DDI_FAILURE);
2521 
2522 	inst = ddi_get_instance(dip);
2523 	ret = ddi_soft_state_zalloc(mlxcx_softstate, inst);
2524 	if (ret != 0)
2525 		return (ret);
2526 
2527 	mlxp = ddi_get_soft_state(mlxcx_softstate, inst);
2528 	if (mlxp == NULL)
2529 		return (DDI_FAILURE);
2530 	mlxp->mlx_dip = dip;
2531 	mlxp->mlx_inst = inst;
2532 	ddi_set_driver_private(dip, mlxp);
2533 
2534 	mlxcx_load_props(mlxp);
2535 
2536 	mlxcx_fm_init(mlxp);
2537 	mlxp->mlx_attach |= MLXCX_ATTACH_FM;
2538 
2539 	if (pci_config_setup(mlxp->mlx_dip, &mlxp->mlx_cfg_handle) !=
2540 	    DDI_SUCCESS) {
2541 		mlxcx_warn(mlxp, "failed to initial PCI config space");
2542 		goto err;
2543 	}
2544 	mlxp->mlx_attach |= MLXCX_ATTACH_PCI_CONFIG;
2545 
2546 	if (!mlxcx_regs_map(mlxp)) {
2547 		goto err;
2548 	}
2549 	mlxp->mlx_attach |= MLXCX_ATTACH_REGS;
2550 
2551 	if (!mlxcx_cmd_queue_init(mlxp)) {
2552 		goto err;
2553 	}
2554 	mlxp->mlx_attach |= MLXCX_ATTACH_CMD;
2555 
2556 	if (!mlxcx_cmd_enable_hca(mlxp)) {
2557 		goto err;
2558 	}
2559 	mlxp->mlx_attach |= MLXCX_ATTACH_ENABLE_HCA;
2560 
2561 	if (!mlxcx_check_issi(mlxp)) {
2562 		goto err;
2563 	}
2564 
2565 	/*
2566 	 * We have to get our interrupts now so we know what priority to
2567 	 * create pagemtx with.
2568 	 */
2569 	if (!mlxcx_intr_setup(mlxp)) {
2570 		goto err;
2571 	}
2572 	mlxp->mlx_attach |= MLXCX_ATTACH_INTRS;
2573 
2574 	mutex_init(&mlxp->mlx_pagemtx, NULL, MUTEX_DRIVER,
2575 	    DDI_INTR_PRI(mlxp->mlx_intr_pri));
2576 	avl_create(&mlxp->mlx_pages, mlxcx_page_compare,
2577 	    sizeof (mlxcx_dev_page_t), offsetof(mlxcx_dev_page_t, mxdp_tree));
2578 	mlxp->mlx_attach |= MLXCX_ATTACH_PAGE_LIST;
2579 
2580 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_BOOT)) {
2581 		goto err;
2582 	}
2583 
2584 	if (!mlxcx_init_caps(mlxp)) {
2585 		goto err;
2586 	}
2587 	mlxp->mlx_attach |= MLXCX_ATTACH_CAPS;
2588 
2589 	if (!mlxcx_init_pages(mlxp, MLXCX_QUERY_PAGES_OPMOD_INIT)) {
2590 		goto err;
2591 	}
2592 
2593 	if (!mlxcx_cmd_init_hca(mlxp)) {
2594 		goto err;
2595 	}
2596 	mlxp->mlx_attach |= MLXCX_ATTACH_INIT_HCA;
2597 
2598 	if (!mlxcx_cmd_set_driver_version(mlxp, MLXCX_DRIVER_VERSION)) {
2599 		goto err;
2600 	}
2601 
2602 	/*
2603 	 * The User Access Region (UAR) is needed so we can ring EQ and CQ
2604 	 * doorbells.
2605 	 */
2606 	if (!mlxcx_cmd_alloc_uar(mlxp, &mlxp->mlx_uar)) {
2607 		goto err;
2608 	}
2609 	for (i = 0; i < MLXCX_BF_PER_UAR; ++i) {
2610 		mutex_init(&mlxp->mlx_uar.mlu_bf[i].mbf_mtx, NULL,
2611 		    MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
2612 	}
2613 	mlxp->mlx_attach |= MLXCX_ATTACH_UAR_PD_TD;
2614 
2615 	/*
2616 	 * Set up event queue #0 -- it's special and only handles control
2617 	 * type events, like PAGE_REQUEST (which we will probably get during
2618 	 * the commands below).
2619 	 *
2620 	 * This will enable and arm the interrupt on EQ 0, too.
2621 	 */
2622 	if (!mlxcx_setup_eq0(mlxp)) {
2623 		goto err;
2624 	}
2625 
2626 	/*
2627 	 * Allocate a protection and transport domain. These don't really do
2628 	 * anything for us (they're IB concepts), but we need to give their
2629 	 * ID numbers in other commands.
2630 	 */
2631 	if (!mlxcx_cmd_alloc_pd(mlxp, &mlxp->mlx_pd)) {
2632 		goto err;
2633 	}
2634 	if (!mlxcx_cmd_alloc_tdom(mlxp, &mlxp->mlx_tdom)) {
2635 		goto err;
2636 	}
2637 	/*
2638 	 * Fetch the "reserved" lkey that lets us give linear addresses in
2639 	 * work queue entries, rather than having to mess with the NIC's
2640 	 * internal MMU.
2641 	 */
2642 	if (!mlxcx_cmd_query_special_ctxs(mlxp)) {
2643 		goto err;
2644 	}
2645 
2646 	/*
2647 	 * Query our port information and current state, populate the
2648 	 * mlxcx_port_t structs.
2649 	 *
2650 	 * This also sets up the root flow tables and flow groups.
2651 	 */
2652 	if (!mlxcx_setup_ports(mlxp)) {
2653 		goto err;
2654 	}
2655 	mlxp->mlx_attach |= MLXCX_ATTACH_PORTS;
2656 
2657 	mlxcx_load_model_props(mlxp);
2658 
2659 	/*
2660 	 * Set up, enable and arm the rest of the interrupt EQs which will
2661 	 * service events from CQs.
2662 	 *
2663 	 * The MLXCX_ATTACH_INTRS flag covers checking if these need to be
2664 	 * cleaned up.
2665 	 */
2666 	if (!mlxcx_setup_eqs(mlxp)) {
2667 		goto err;
2668 	}
2669 
2670 	/* Completion queues */
2671 	list_create(&mlxp->mlx_cqs, sizeof (mlxcx_completion_queue_t),
2672 	    offsetof(mlxcx_completion_queue_t, mlcq_entry));
2673 	mlxp->mlx_attach |= MLXCX_ATTACH_CQS;
2674 
2675 	/* Work queues (send queues, receive queues) */
2676 	list_create(&mlxp->mlx_wqs, sizeof (mlxcx_work_queue_t),
2677 	    offsetof(mlxcx_work_queue_t, mlwq_entry));
2678 	mlxp->mlx_attach |= MLXCX_ATTACH_WQS;
2679 
2680 	/* Set up periodic fault check timers which check the queue states */
2681 	if (!mlxcx_setup_checktimers(mlxp)) {
2682 		goto err;
2683 	}
2684 	mlxp->mlx_attach |= MLXCX_ATTACH_CHKTIMERS;
2685 
2686 	/*
2687 	 * Construct our arrays of mlxcx_ring_group_ts, which represent the
2688 	 * "groups" we advertise to MAC.
2689 	 */
2690 	mlxp->mlx_rx_ngroups = mlxcx_calc_rx_ngroups(mlxp);
2691 	mlxp->mlx_rx_groups_size = mlxp->mlx_rx_ngroups *
2692 	    sizeof (mlxcx_ring_group_t);
2693 	mlxp->mlx_rx_groups = kmem_zalloc(mlxp->mlx_rx_groups_size, KM_SLEEP);
2694 
2695 	mlxp->mlx_tx_ngroups = mlxp->mlx_props.mldp_tx_ngroups;
2696 	mlxp->mlx_tx_groups_size = mlxp->mlx_tx_ngroups *
2697 	    sizeof (mlxcx_ring_group_t);
2698 	mlxp->mlx_tx_groups = kmem_zalloc(mlxp->mlx_tx_groups_size, KM_SLEEP);
2699 
2700 	mlxp->mlx_attach |= MLXCX_ATTACH_GROUPS;
2701 
2702 	/*
2703 	 * Sets up the free/busy buffers list for keeping track of packet
2704 	 * buffers.
2705 	 */
2706 	if (!mlxcx_setup_bufs(mlxp))
2707 		goto err;
2708 	mlxp->mlx_attach |= MLXCX_ATTACH_BUFS;
2709 
2710 	/*
2711 	 * Before we tell MAC about our rings/groups, we need to do enough
2712 	 * setup on them to be sure about the numbers and configuration that
2713 	 * we have. This will do basically everything short of allocating
2714 	 * packet buffers and starting the rings up.
2715 	 */
2716 	for (i = 0; i < mlxp->mlx_tx_ngroups; ++i) {
2717 		if (!mlxcx_tx_group_setup(mlxp, &mlxp->mlx_tx_groups[i]))
2718 			goto err;
2719 	}
2720 	for (i = 0; i < mlxp->mlx_rx_ngroups; ++i) {
2721 		if (!mlxcx_rx_group_setup(mlxp, &mlxp->mlx_rx_groups[i]))
2722 			goto err;
2723 	}
2724 
2725 	/*
2726 	 * Finally, tell MAC that we exist!
2727 	 */
2728 	if (!mlxcx_register_mac(mlxp)) {
2729 		goto err;
2730 	}
2731 	mlxp->mlx_attach |= MLXCX_ATTACH_MAC_HDL;
2732 
2733 	return (DDI_SUCCESS);
2734 
2735 err:
2736 	mlxcx_teardown(mlxp);
2737 	return (DDI_FAILURE);
2738 }
2739 
/*
 * cb_ops: mlxcx exposes no character/block device interface of its own
 * (all access is via the MAC framework), so every entry point is a stub.
 */
static struct cb_ops mlxcx_cb_ops = {
	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_flag = D_MP,
	.cb_rev = CB_REV,
	.cb_aread = nodev,
	.cb_awrite = nodev
};
2759 
/*
 * dev_ops: standard DDI driver entry points; this structure is passed to
 * mac_init_ops()/mod_install() in _init() below.
 */
static struct dev_ops mlxcx_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = mlxcx_attach,
	.devo_detach = mlxcx_detach,
	.devo_reset = nodev,
	.devo_power = ddi_power,
	.devo_quiesce = ddi_quiesce_not_supported,
	.devo_cb_ops = &mlxcx_cb_ops
};
2773 
/* Driver module description for the module framework. */
static struct modldrv mlxcx_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "Mellanox Connect-X 4/5/6",
	.drv_dev_ops = &mlxcx_dev_ops
};
2779 
/* Single-driver module linkage used by mod_install()/mod_remove(). */
static struct modlinkage mlxcx_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &mlxcx_modldrv, NULL }
};
2784 
2785 int
2786 _init(void)
2787 {
2788 	int ret;
2789 
2790 	ret = ddi_soft_state_init(&mlxcx_softstate, sizeof (mlxcx_t), 0);
2791 	if (ret != 0) {
2792 		return (ret);
2793 	}
2794 
2795 	mac_init_ops(&mlxcx_dev_ops, MLXCX_MODULE_NAME);
2796 
2797 	if ((ret = mod_install(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2798 		mac_fini_ops(&mlxcx_dev_ops);
2799 		ddi_soft_state_fini(&mlxcx_softstate);
2800 		return (ret);
2801 	}
2802 
2803 	return (DDI_SUCCESS);
2804 }
2805 
/* Report module information (via our modlinkage) to the framework. */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&mlxcx_modlinkage, modinfop));
}
2811 
2812 int
2813 _fini(void)
2814 {
2815 	int ret;
2816 
2817 	if ((ret = mod_remove(&mlxcx_modlinkage)) != DDI_SUCCESS) {
2818 		return (ret);
2819 	}
2820 
2821 	mac_fini_ops(&mlxcx_dev_ops);
2822 
2823 	ddi_soft_state_fini(&mlxcx_softstate);
2824 
2825 	return (DDI_SUCCESS);
2826 }
2827