xref: /illumos-gate/usr/src/uts/common/sys/ib/clients/ibd/ibd.h (revision abcc7ef9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #ifndef _SYS_IB_CLIENTS_IBD_H
27 #define	_SYS_IB_CLIENTS_IBD_H
28 
29 #ifdef __cplusplus
30 extern "C" {
31 #endif
32 
33 /* The following macros are used in both ibd.c and ibd_cm.c */
34 
35 /*
36  * Completion queue polling control
37  */
38 #define	IBD_CQ_POLLING			0x1
39 #define	IBD_REDO_CQ_POLLING		0x2
40 
41 /*
42  * Maximum length for returning chained mps back to crossbow.
43  * Also used as the maximum number of rx wc's polled at a time.
44  */
45 #define	IBD_MAX_RX_MP_LEN		16
46 
47 /*
48  * When doing multiple-send-wr, this value determines how many to do at
49  * a time (in a single ibt_post_send).
50  */
51 #define	IBD_MAX_TX_POST_MULTIPLE	4
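
/*
 * Illustrative sketch (not part of the driver): a Tx flush can hand the
 * HCA up to IBD_MAX_TX_POST_MULTIPLE work requests in one call, e.g.
 *
 *	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
 *	uint_t		n, posted;
 *
 *	(fill wrs[0..n-1] from queued swqes, n <= IBD_MAX_TX_POST_MULTIPLE)
 *
 *	if (ibt_post_send(chan_hdl, wrs, n, &posted) != IBT_SUCCESS)
 *		only the first "posted" WRs were accepted; requeue the rest.
 *
 * The channel handle and the requeue policy above are assumptions, not
 * the driver's actual code.
 */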
52 
53 /*
54  * Flag bits for resources to reap
55  */
56 #define	IBD_RSRC_SWQE			0x1
57 #define	IBD_RSRC_LSOBUF			0x2
58 #define	IBD_RSRC_RC_SWQE		0x4
59 #define	IBD_RSRC_RC_TX_LARGEBUF		0x8
60 
61 /*
62  * Async operation types
63  */
64 #define	IBD_ASYNC_GETAH			1
65 #define	IBD_ASYNC_JOIN			2
66 #define	IBD_ASYNC_LEAVE			3
67 #define	IBD_ASYNC_PROMON		4
68 #define	IBD_ASYNC_PROMOFF		5
69 #define	IBD_ASYNC_REAP			6
70 #define	IBD_ASYNC_TRAP			7
71 #define	IBD_ASYNC_SCHED			8
72 #define	IBD_ASYNC_LINK			9
73 #define	IBD_ASYNC_EXIT			10
74 #define	IBD_ASYNC_RC_TOO_BIG		11
75 #define	IBD_ASYNC_RC_CLOSE_ACT_CHAN		12
76 #define	IBD_ASYNC_RC_RECYCLE_ACE		13
77 
78 /*
79  * Miscellaneous constants
80  */
81 #define	IBD_SEND			0
82 #define	IBD_RECV			1
83 
84 /* Tunable defaults and limits */
85 #define	IBD_LINK_MODE_UD		0
86 #define	IBD_LINK_MODE_RC		1
87 
88 #define	IBD_DEF_LINK_MODE		IBD_LINK_MODE_RC
89 #define	IBD_DEF_LSO_POLICY		B_TRUE
90 #define	IBD_DEF_NUM_LSO_BUFS		1024
91 #define	IBD_DEF_CREATE_BCAST_GROUP	B_TRUE
92 #define	IBD_DEF_COALESCE_COMPLETIONS	B_TRUE
93 #define	IBD_DEF_UD_RX_COMP_COUNT	4
94 #define	IBD_DEF_UD_RX_COMP_USEC		10
95 #define	IBD_DEF_UD_TX_COMP_COUNT	16
96 #define	IBD_DEF_UD_TX_COMP_USEC		300
97 #define	IBD_DEF_RC_RX_COMP_COUNT	4
98 #define	IBD_DEF_RC_RX_COMP_USEC		10
99 #define	IBD_DEF_RC_TX_COMP_COUNT	10
100 #define	IBD_DEF_RC_TX_COMP_USEC		300
101 #define	IBD_DEF_UD_TX_COPY_THRESH	4096
102 #define	IBD_DEF_RC_RX_COPY_THRESH	4096
103 #define	IBD_DEF_RC_TX_COPY_THRESH	4096
104 #define	IBD_DEF_UD_NUM_RWQE		4000
105 #define	IBD_DEF_UD_NUM_SWQE		4000
106 #define	IBD_DEF_RC_ENABLE_SRQ		B_TRUE
107 #define	IBD_DEF_RC_NUM_RWQE		2047
108 #define	IBD_DEF_RC_NUM_SWQE		511
109 #define	IBD_DEF_NUM_AH			256
110 #define	IBD_DEF_HASH_SIZE		32
111 #define	IBD_DEF_RC_NUM_SRQ		(IBD_DEF_RC_NUM_RWQE - 1)
112 #define	IBD_DEF_RC_RX_RWQE_THRESH	(IBD_DEF_RC_NUM_RWQE >> 2)
113 
114 /* Tunable limits */
115 #define	IBD_MIN_NUM_LSO_BUFS		512
116 #define	IBD_MAX_NUM_LSO_BUFS		4096
117 #define	IBD_MIN_UD_TX_COPY_THRESH	2048
118 #define	IBD_MAX_UD_TX_COPY_THRESH	65536
119 #define	IBD_MIN_UD_NUM_SWQE		512
120 #define	IBD_MAX_UD_NUM_SWQE		8000
121 #define	IBD_MIN_UD_NUM_RWQE		512
122 #define	IBD_MAX_UD_NUM_RWQE		8000
123 #define	IBD_MIN_NUM_AH			32
124 #define	IBD_MAX_NUM_AH			8192
125 #define	IBD_MIN_HASH_SIZE		32
126 #define	IBD_MAX_HASH_SIZE		1024
127 
128 #define	IBD_MIN_RC_NUM_SWQE		511
129 #define	IBD_MAX_RC_NUM_SWQE		8000
130 #define	IBD_MIN_RC_NUM_RWQE		511
131 #define	IBD_MAX_RC_NUM_RWQE		8000
132 #define	IBD_MIN_RC_RX_COPY_THRESH	1500
133 #define	IBD_MAX_RC_RX_COPY_THRESH	65520
134 #define	IBD_MIN_RC_TX_COPY_THRESH	1500
135 #define	IBD_MAX_RC_TX_COPY_THRESH	65520
136 #define	IBD_MIN_RC_NUM_SRQ		(IBD_MIN_RC_NUM_RWQE - 1)
137 #define	IBD_MIN_RC_RX_RWQE_THRESH	(IBD_MIN_RC_NUM_RWQE >> 2)
138 
139 /*
140  * Thresholds
141  *
142  * When waiting for resources (swqes or lso buffers) to become available,
143  * the first two thresholds below determine how many must be free before
144  * the network layer is told it can resume sending packets. The IBD_TX_POLL_THRESH
145  * determines how low the available swqes should go before we start polling
146  * the completion queue.
147  */
148 #define	IBD_FREE_LSOS_THRESH		8
149 #define	IBD_FREE_SWQES_THRESH		20
150 #define	IBD_TX_POLL_THRESH		80
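
/*
 * Illustrative sketch (not part of the driver) of how these thresholds
 * might be consulted; the field names come from ibd_state_t/ibd_list_t
 * defined later in this file, but the exact control flow is an assumption.
 *
 *	if (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)
 *		poll/drain the send completion queue
 *
 *	if (state->id_tx_list.dl_pending_sends &&
 *	    state->id_tx_list.dl_cnt > IBD_FREE_SWQES_THRESH)
 *		mac_tx_update(state->id_mh);	resume the network layer
 */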
151 
152 #ifdef DEBUG
153 void debug_print(int l, char *fmt, ...);
154 #define	DPRINT		debug_print
155 #else
156 #define	DPRINT		0 &&
157 #endif
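
/*
 * DPRINT is used like a printf-style call, e.g. DPRINT(10, "ibd: no free
 * swqe").  Under DEBUG it calls debug_print(); otherwise it expands to
 * "0 && (args)", so the argument list must still compile but is never
 * evaluated at run time.
 */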
158 
159 /*
160  * AH and MCE active list manipulation:
161  *
162  * Multicast disable requests and MCG delete traps are two cases
163  * where the active AH entry for the mcg (if any unreferenced one exists)
164  * will be moved to the free list (to force the next Tx to the mcg to
165  * join the MCG in SendOnly mode). Port up handling will also move AHs
166  * from active to free list.
167  *
168  * In the case when some transmits are still pending on an entry
169  * for an mcg, but a multicast disable has already been issued on the
170  * mcg, there are some options to consider to preserve the join state
171  * to ensure the emitted packet is properly routed on the IBA fabric.
172  * For the AH, we can
173  * 1. take out of active list at multicast disable time.
174  * 2. take out of active list only when last pending Tx completes.
175  * For the MCE, we can
176  * 3. take out of active list at multicast disable time.
177  * 4. take out of active list only when last pending Tx completes.
178  * 5. move from active list to stale list at multicast disable time.
179  * We choose to use 2,4. We use option 4 so that if a multicast enable
180  * is tried before the pending Tx completes, the enable code finds the
181  * mce in the active list and just has to make sure it will not be reaped
182  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
183  * a stale list (#5) that would be checked in the enable code would need
184  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
185  * after the multicast disable would try to put an AH in the active list,
186  * and associate the mce it finds in the active list to this new AH,
187  * whereas the mce is already associated with the previous AH (taken off
188  * the active list), and will be removed once the pending Tx's complete
189  * (unless a reference count on mce's is implemented). One implication of
190  * using 2,4 is that new Tx's posted before the pending Tx's complete will
191  * grab new references on the AH, further delaying the leave.
192  *
193  * In the case of mcg delete (or create) trap when the port is sendonly
194  * joined, the AH and MCE handling is different: the AH and MCE has to be
195  * immediately taken off the active lists (forcing a join and path lookup
196  * at the next Tx is the only guaranteed means of ensuring a proper Tx
197  * to an mcg as it is repeatedly created and deleted and goes through
198  * reincarnations).
199  *
200  * When a port is already sendonly joined, and a multicast enable is
201  * attempted, the same mce structure is promoted; this ensures only a
202  * single mce on the active list tracks the most powerful join state.
203  *
204  * In the case of port up event handling, the MCE for sendonly membership
205  * is freed up, and the ACE is put into the free list as soon as possible
206  * (depending on whether posted Tx's have completed). For fullmembership
207  * MCE's though, the ACE is similarly handled; but the MCE is kept around
208  * (a re-JOIN is attempted) only if the DLPI leave has not already been
209  * done; else the mce is deconstructed (mc_fullreap case).
210  *
211  * MCG creation and deletion trap handling:
212  *
213  * These traps are unreliable (meaning sometimes the trap might never
214  * be delivered to the subscribed nodes) and may arrive out-of-order
215  * since they use UD transport. An alternative to relying on these
216  * unreliable traps is to poll for mcg presence every so often, but
217  * instead of doing that, we try to be as conservative as possible
218  * while handling the traps, and hope that the traps do arrive at
219  * the subscribed nodes soon. Note that if a node is fullmember
220  * joined to an mcg, it cannot possibly receive an mcg create/delete
221  * trap for that mcg (by fullmember definition); if it does, it is
222  * an old trap from a previous incarnation of the mcg.
223  *
224  * Whenever a trap is received, the driver cleans up its sendonly
225  * membership to the group; we choose to do a sendonly leave even
226  * on a creation trap to handle the case of a prior deletion of the mcg
227  * having gone unnoticed. Consider an example scenario:
228  * T1: MCG M is deleted, and fires off deletion trap D1.
229  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
230  * T3: Node N tries to transmit to M, joining in sendonly mode.
231  * T4: MCG M is deleted, and fires off deletion trap D2.
232  * T5: N receives a deletion trap, but cannot distinguish D1 from D2.
233  *     If the trap is D2, then a LEAVE is not required, since the mcg
234  *     is already deleted; but if it is D1, a LEAVE is required. A safe
235  *     approach is to always LEAVE, but the SM may be confused if it
236  *     receives a LEAVE without a prior JOIN.
237  *
238  * Management of the non-membership to an mcg is similar to the above,
239  * except that if the interface is in promiscuous mode, it is required
240  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
241  * if the re-join attempt fails (in which case a warning message needs
242  * to be printed), it is not clear whether it failed due to the mcg not
243  * existing, or some fabric/hca issues, due to the delayed nature of
244  * trap delivery. Querying the SA to establish presence/absence of the
245  * mcg is also racy at best. Thus, the driver just prints a warning
246  * message when it cannot rejoin after receiving a create trap, although
247  * this might be (on rare occasions) a mis-warning if the create trap is
248  * received after the mcg was deleted.
249  */
250 
251 /*
252  * Implementation of atomic "recycle" bits and reference count
253  * on address handles. This utilizes the fact that max reference
254  * count on any handle is limited by number of send wqes, thus
255  * high bits in the ac_ref field can be used as the recycle bits,
256  * and only the low bits hold the number of pending Tx requests.
257  * This atomic AH reference counting allows the Tx completion
258  * handler not to acquire the id_ac_mutex to process every completion,
259  * thus reducing lock contention problems between completion and
260  * the Tx path.
261  */
262 #define	CYCLEVAL		0x80000
263 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
264 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
265 #define	GET_REF(ace)		((ace)->ac_ref)
266 #define	GET_REF_CYCLE(ace) (				\
267 	/*						\
268 	 * Make sure "cycle" bit is set.		\
269 	 */						\
270 	ASSERT(CYCLE_SET(ace)),				\
271 	((ace)->ac_ref & ~(CYCLEVAL))			\
272 )
273 #define	INC_REF(ace, num) {				\
274 	atomic_add_32(&(ace)->ac_ref, num);		\
275 }
276 #define	SET_CYCLE_IF_REF(ace) (				\
277 	CYCLE_SET(ace) ? B_TRUE :			\
278 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
279 		CYCLEVAL ?				\
280 		/*					\
281 		 * Clear the "cycle" bit we just set;	\
282 		 * ref count known to be 0 from above.	\
283 		 */					\
284 		CLEAR_REFCYCLE(ace), B_FALSE :		\
285 		/*					\
286 		 * We set "cycle" bit; let caller know.	\
287 		 */					\
288 		B_TRUE					\
289 )
290 #define	DEC_REF_DO_CYCLE(ace) (				\
291 	atomic_dec_32_nv(&ace->ac_ref) == CYCLEVAL ?	\
292 		/*					\
293 		 * Ref count known to be 0 from above.	\
294 		 */					\
295 		B_TRUE :				\
296 		B_FALSE					\
297 )
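
/*
 * Illustrative sketch (not part of the driver) of how the macros above
 * fit together; the surrounding control flow is an assumption.
 *
 *	Tx path (under id_ac_mutex):	INC_REF(ace, 1);
 *
 *	Tx completion handler:		if (DEC_REF_DO_CYCLE(ace))
 *						the last pending Tx just
 *						completed on an ACE marked
 *						for recycling; hand it back
 *						(e.g. queue an async request)
 *
 *	Recycle/free path:		if (SET_CYCLE_IF_REF(ace))
 *						Tx's still pending; the cycle
 *						bit is now set and the last
 *						DEC_REF_DO_CYCLE reports it
 *					else
 *						no references; the ACE can be
 *						reclaimed immediately
 */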
298 
299 /*
300  * Address handle entries maintained by the driver are kept in the
301  * free and active lists. Each entry starts out in the free list;
302  * it migrates to the active list when primed using ibt_get_paths()
303  * and ibt_modify_ud_dest() for transmission to a specific destination.
304  * In the active list, the entry has a reference count indicating the
305  * number of ongoing/uncompleted transmits that reference it. The
306  * entry is left in the active list even after the reference count
307  * goes to 0, since successive transmits can find it there and do
308  * not need to set up another entry (ie the path information is
309  * cached using the active list). Entries on the active list are
310  * also hashed using the destination link address as a key for faster
311  * lookups during transmits.
312  *
313  * For any destination address (unicast or multicast, whatever the
314  * join states), there will be at most one entry in the active list.
315  * Entries with a 0 reference count on the active list can be reused
316  * for a transmit to a new destination, if the free list is empty.
317  *
318  * The AH free list insertion/deletion is protected with the id_ac_mutex,
319  * since the async thread and Tx callback handlers insert/delete. The
320  * active list does not need a lock (all operations are done by the
321  * async thread) but updates to the reference count are atomically
322  * done (increments done by Tx path, decrements by the Tx callback handler).
323  */
324 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
325 	list_insert_head(&state->id_ah_free, ce)
326 #define	IBD_ACACHE_GET_FREE(state) \
327 	list_get_head(&state->id_ah_free)
328 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
329 	int _ret_;						\
330 	list_insert_head(&state->id_ah_active, ce);		\
331 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
332 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
333 	ASSERT(_ret_ == 0);					\
334 	state->id_ac_hot_ace = ce;				\
335 }
336 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
337 	list_remove(&state->id_ah_active, ce);			\
338 	if (state->id_ac_hot_ace == ce)				\
339 		state->id_ac_hot_ace = NULL;			\
340 	(void) mod_hash_remove(state->id_ah_active_hash,	\
341 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
342 }
343 #define	IBD_ACACHE_GET_ACTIVE(state) \
344 	list_get_head(&state->id_ah_active)
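
/*
 * Illustrative sketch (not part of the driver): priming a free ACE for a
 * new destination and making it visible on the active list.  The path
 * setup (ibt_get_paths()/ibt_modify_ud_dest()) and error handling are
 * omitted, ce/mac are assumed locals, and the exact locking shown is an
 * assumption.
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ce = (ibd_ace_t *)IBD_ACACHE_GET_FREE(state);
 *	if (ce != NULL) {
 *		ce->ac_mac = *mac;			destination to cache
 *		IBD_ACACHE_INSERT_ACTIVE(state, ce);
 *		INC_REF(ce, 1);				reference for this Tx
 *	}
 *	mutex_exit(&state->id_ac_mutex);
 */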
345 
346 /*
347  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
348  * front of optional src/tgt link layer address. Right now Solaris inserts
349  * padding by default at the end. The routine that does this is nce_xmit()
350  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
351  * the packet comes down from IP layer to the IBD driver, it is in the
352  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
353  * The OPT_ND_HDR_T is 2 bytes, followed by [22 bytes of ipoib_machdr]. As a
354  * result, machdr is not 4-byte aligned and has 2 bytes of padding at the end.
355  *
356  * The send routine in the IBD driver changes this packet as follows:
357  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
358  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
359  * aligned.
360  *
361  * On the receive side, ibd_process_rx takes the above packet, removes the
362  * two bytes of front padding and inserts them at the end, since the IP
363  * layer does not understand padding at the front.
364  */
365 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
366 	uchar_t 	*nd_lla_ptr;					\
367 	icmp6_t 	*icmp6;						\
368 	nd_opt_hdr_t	*opt;						\
369 	int 		i;						\
370 									\
371 	icmp6 = (icmp6_t *)&ip6h[1];					\
372 	len -= sizeof (nd_neighbor_advert_t);				\
373 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
374 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
375 	    (len != 0)) {						\
376 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
377 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
378 		ASSERT(opt != NULL);					\
379 		nd_lla_ptr = (uchar_t *)&opt[1];			\
380 		if (type == IBD_SEND) {					\
381 			for (i = IPOIB_ADDRL; i > 0; i--)		\
382 				*(nd_lla_ptr + i + 1) =			\
383 				    *(nd_lla_ptr + i - 1);		\
384 		} else {						\
385 			for (i = 0; i < IPOIB_ADDRL; i++)		\
386 				*(nd_lla_ptr + i) =			\
387 				    *(nd_lla_ptr + i + 2);		\
388 		}							\
389 		*(nd_lla_ptr + i) = 0;					\
390 		*(nd_lla_ptr + i + 1) = 0;				\
391 	}								\
392 }
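
/*
 * Illustrative sketch (not part of the driver): applying the macro on the
 * send side once the IPv6 header has been located.  The way ip6h and len
 * are obtained here is an assumption.
 *
 *	ip6_t	*ip6h;		points at the IPv6 header in the mblk
 *	int	len = ntohs(ip6h->ip6_plen);
 *
 *	if (ip6h->ip6_nxt == IPPROTO_ICMPV6)
 *		IBD_PAD_NSNA(ip6h, len, IBD_SEND);
 */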
393 
394 
395 /*
396  * IETF-defined IPoIB encapsulation header, with 2 bytes of ethertype
397  * followed by 2 reserved bytes. This is at the start of the
398  * datagram sent to and received over the wire by the driver.
399  */
400 typedef struct ipoib_header {
401 	ushort_t	ipoib_type;
402 	ushort_t	ipoib_mbz;
403 } ipoib_hdr_t;
404 
405 #define	IPOIB_HDRSIZE	sizeof (struct ipoib_header)
406 
407 /*
408  * IETF defined IPoIB link address; IBA QPN, followed by GID,
409  * which has a prefix and suffix, as reported via ARP.
410  */
411 typedef struct ipoib_mac {
412 	uint32_t	ipoib_qpn;
413 	uint32_t	ipoib_gidpref[2];
414 	uint32_t	ipoib_gidsuff[2];
415 } ipoib_mac_t;
416 
417 #define	IPOIB_ADDRL	sizeof (struct ipoib_mac)
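
/*
 * Illustrative sketch (not part of the driver): composing the 20-byte
 * address from a QPN and an ib_gid_t in network byte order.  The RC flag
 * handling (IBD_MAC_ADDR_RC, defined below) is left out, and the exact
 * conversion helper the driver uses is not shown here.
 *
 *	mac.ipoib_qpn = htonl(qpn);
 *	mac.ipoib_gidpref[0] = htonl((uint32_t)(gid.gid_prefix >> 32));
 *	mac.ipoib_gidpref[1] = htonl((uint32_t)gid.gid_prefix);
 *	mac.ipoib_gidsuff[0] = htonl((uint32_t)(gid.gid_guid >> 32));
 *	mac.ipoib_gidsuff[1] = htonl((uint32_t)gid.gid_guid);
 */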
418 
419 /*
420  * Pseudo header prepended to datagram in DLIOCRAW transmit path
421  * and when GLD hands the datagram to the gldm_send entry point.
422  */
423 typedef struct ipoib_ptxhdr {
424 	ipoib_mac_t	ipoib_dest;
425 	ipoib_hdr_t	ipoib_rhdr;
426 } ipoib_ptxhdr_t;
427 
428 #define	IPOIBDLSAP(p, offset)	((ipoib_ptxhdr_t *)((caddr_t)(p)+offset))
429 
430 /*
431  * The pseudo-GRH structure that sits before the data in the
432  * receive buffer, and is overlaid on top of the real GRH.
433  * The driver sets the ipoib_vertcflow to 0 if the pseudo-GRH
434  * does not hold valid information. If it is indicated valid,
435  * the driver must additionally provide the sender's qpn in
436  * network byte order in ipoib_sqpn, and not touch the
437  * remaining parts which were DMA'ed in by the IBA hardware.
438  */
439 typedef struct ipoib_pgrh {
440 	uint32_t	ipoib_vertcflow;
441 	uint32_t	ipoib_sqpn;
442 	uint32_t	ipoib_sgid_pref[2];
443 	uint32_t	ipoib_sgid_suff[2];
444 	uint32_t	ipoib_dgid_pref[2];
445 	uint32_t	ipoib_dgid_suff[2];
446 } ipoib_pgrh_t;
447 
448 /*
449  * The GRH is also DMA'ed into recv buffers, so space needs
450  * to be allocated for it.
451  */
452 #define	IPOIB_GRH_SIZE	sizeof (ipoib_pgrh_t)
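
/*
 * Illustrative sketch (not part of the driver): how a receive buffer is
 * laid out when the rx completion path looks at it.  The rwqe/copybuf
 * names are from the structures defined later in this file.
 *
 *	ipoib_pgrh_t	*pgrh = (ipoib_pgrh_t *)rwqe->rwqe_copybuf.ic_bufaddr;
 *	ipoib_hdr_t	*hdr = (ipoib_hdr_t *)&pgrh[1];
 *	uchar_t		*payload = (uchar_t *)&hdr[1];
 *
 *	if (pgrh->ipoib_vertcflow != 0)
 *		pgrh->ipoib_sqpn holds the sender's QPN (network order)
 */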
453 
454 /* support the RC (reliable connected) mode */
455 #define	IBD_MAC_ADDR_RC		0x80000000
456 /* support the UC (unreliable connected) mode */
457 #define	IBD_MAC_ADDR_UC		0x40000000
458 
459 #define	IBD_RC_SERVICE_ID 0x100000000000000ULL
460 
461 /*
462  * Legacy OFED used a wrong service ID (one additional zero digit) for
463  * many years. To interoperate with legacy OFED, we also support this wrong
464  * service ID here.
465  */
466 #define	IBD_RC_SERVICE_ID_OFED_INTEROP 0x1000000000000000ULL
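
/*
 * Because a connecting peer may use either of the two service IDs above,
 * the listener side keeps a registration for each (see rc_listen_hdl /
 * rc_listen_bind and their _OFED_interop counterparts in ibd_state_s
 * below).
 */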
467 
468 #define	IBD_RC_MIN_CQ_SIZE	0x7f
469 
470 /* Number of ibt_wc_t provided for each RC channel */
471 #define	IBD_RC_MAX_CQ_WC	0x3f
472 
473 #if defined(_KERNEL) && !defined(_BOOT)
474 
475 #include <sys/ib/ibtl/ibti.h>
476 #include <sys/ib/ib_pkt_hdrs.h>
477 #include <sys/list.h>
478 #include <sys/mac_provider.h>
479 #include <sys/mac_ib.h>
480 #include <sys/modhash.h>
481 
482 /* State of a reliable connected channel (ibd_rc_chan_t->chan_state) */
483 typedef enum {
484 	IBD_RC_STATE_INIT = 0,
485 
486 	/* Active side */
487 	IBD_RC_STATE_ACT_REP_RECV,	/* reply received */
488 	IBD_RC_STATE_ACT_ESTAB,		/* established, ready to send */
489 	IBD_RC_STATE_ACT_REJECT,	/* rejected */
490 	/* Someone else is closing this channel, please don't re-close it */
491 	IBD_RC_STATE_ACT_CLOSING,
492 	IBD_RC_STATE_ACT_CLOSED,
493 	IBD_RC_STATE_ACT_ERROR,
494 
495 	/* Passive side */
496 	IBD_RC_STATE_PAS_REQ_RECV,	/* request received */
497 	IBD_RC_STATE_PAS_ESTAB,		/* established, ready to receive */
498 	IBD_RC_STATE_PAS_REJECT,	/* rejected */
499 
500 	IBD_RC_STATE_PAS_CLOSED
501 } ibd_rc_chan_state_t;
502 
503 /*
504  * Structure to encapsulate various types of async requests.
505  */
506 typedef struct ibd_acache_rq {
507 	struct list_node 	rq_list; 	/* list of pending work */
508 	int			rq_op;		/* what operation */
509 	ipoib_mac_t		rq_mac;
510 	ib_gid_t		rq_gid;
511 	void			*rq_ptr;
512 	void			*rq_ptr2;
513 } ibd_req_t;
514 
515 typedef struct ibd_mcache {
516 	struct list_node	mc_list;	/* full/non list */
517 	uint8_t			mc_jstate;
518 	boolean_t		mc_fullreap;
519 	ibt_mcg_info_t		mc_info;
520 	ibd_req_t		mc_req;		/* to queue LEAVE req */
521 } ibd_mce_t;
522 
523 typedef struct ibd_acache_s {
524 	struct list_node	ac_list;	/* free/active list */
525 	ibt_ud_dest_hdl_t	ac_dest;
526 	ipoib_mac_t		ac_mac;
527 	uint32_t		ac_ref;
528 	ibd_mce_t		*ac_mce;	/* for MCG AHs */
529 
530 	/* For Reliable Connected mode */
531 	struct ibd_rc_chan_s	*ac_chan;
532 	/* protect tx_too_big_ongoing */
533 	kmutex_t		tx_too_big_mutex;
534 	/* Deal with too big packet */
535 	boolean_t		tx_too_big_ongoing;
536 } ibd_ace_t;
537 
538 #define	IBD_MAX_SQSEG	59
539 #define	IBD_MAX_RQSEG	1
540 
541 typedef enum {
542 	IBD_WQE_SEND,
543 	IBD_WQE_RECV
544 } ibd_wqe_type_t;
545 
546 typedef enum {
547 	IBD_WQE_TXBUF = 1,
548 	IBD_WQE_LSOBUF = 2,
549 	IBD_WQE_MAPPED = 3,
550 	IBD_WQE_RC_COPYBUF = 4
551 } ibd_wqe_buftype_t;
552 
553 #ifdef DEBUG
554 typedef struct ibd_rc_stat_s {
555 	kstat_named_t		rc_rcv_trans_byte;
556 	kstat_named_t		rc_rcv_trans_pkt;
557 	kstat_named_t		rc_rcv_copy_byte;
558 	kstat_named_t		rc_rcv_copy_pkt;
559 	kstat_named_t		rc_rcv_alloc_fail;
560 
561 	kstat_named_t		rc_rcq_invoke;
562 	kstat_named_t		rc_rcq_err;	/* fail in rcq handler */
563 	kstat_named_t		rc_scq_invoke;
564 
565 	kstat_named_t		rc_rwqe_short;	/* short rwqe */
566 
567 	kstat_named_t		rc_xmt_bytes;
568 	/* pkt size <= state->id_rc_tx_copy_thresh */
569 	kstat_named_t		rc_xmt_small_pkt;
570 	kstat_named_t		rc_xmt_fragmented_pkt;
571 	/* fail in ibt_map_mem_iov() */
572 	kstat_named_t		rc_xmt_map_fail_pkt;
573 	/* succ in ibt_map_mem_iov() */
574 	kstat_named_t		rc_xmt_map_succ_pkt;
575 
576 	kstat_named_t		rc_ace_not_found;	/* ace not found */
577 	/* no swqe even after recycle */
578 	kstat_named_t		rc_scq_no_swqe;
579 	/* no tx large buf even after recycle */
580 	kstat_named_t		rc_scq_no_largebuf;
581 
582 	/* short swqe in ibd_send() */
583 	kstat_named_t		rc_swqe_short;
584 	/* call mac_tx_update() when there is enough swqe */
585 	kstat_named_t		rc_swqe_mac_update;
586 	/* short large buf in ibd_send() */
587 	kstat_named_t		rc_xmt_buf_short;
588 	/* call mac_tx_update() when there is enough Tx large buffers */
589 	kstat_named_t		rc_xmt_buf_mac_update;
590 
591 	kstat_named_t		rc_conn_succ;	/* # of successful connects */
592 	kstat_named_t		rc_conn_fail;	/* # of failed connects */
593 	/* ace->ac_chan == NULL for unicast packet */
594 	kstat_named_t		rc_null_conn;
595 	/* not in active established state */
596 	kstat_named_t		rc_no_estab_conn;
597 
598 	kstat_named_t		rc_act_close;	/* call ibd_rc_act_close() */
599 	kstat_named_t		rc_pas_close;	/* call ibd_rc_pas_close() */
600 	kstat_named_t		rc_delay_ace_recycle;
601 	kstat_named_t		rc_act_close_simultaneous;
602 
603 	kstat_named_t		rc_reset_cnt;	/* # of Reset RC channel */
604 } ibd_rc_stat_t;
605 #endif
606 
607 typedef struct ibd_rc_chan_list_s {
608 	/* This mutex protects chan_list and ibd_rc_chan_t.next */
609 	kmutex_t		chan_list_mutex;
610 	struct ibd_rc_chan_s	*chan_list;
611 } ibd_rc_chan_list_t;
612 
613 typedef struct ibd_rc_tx_largebuf_s {
614 	struct ibd_rc_tx_largebuf_s	*lb_next;
615 	uint8_t				*lb_buf;
616 } ibd_rc_tx_largebuf_t;
617 
618 /*
619  * Pre-registered copybuf used for send and receive
620  */
621 typedef struct ibd_copybuf_s {
622 	ibt_wr_ds_t		ic_sgl;
623 	uint8_t			*ic_bufaddr;
624 } ibd_copybuf_t;
625 
626 typedef struct ibd_wqe_s {
627 	struct ibd_wqe_s	*w_next;
628 	ibd_copybuf_t		w_copybuf;
629 	mblk_t			*im_mblk;
630 } ibd_wqe_t;
631 
632 /*
633  * Send WQE
634  */
635 typedef struct ibd_swqe_s {
636 	ibd_wqe_t		w_ibd_swqe;
637 	ibd_wqe_buftype_t	w_buftype;
638 	ibt_send_wr_t		w_swr;
639 	ibd_ace_t		*w_ahandle;
640 	ibt_mi_hdl_t		w_mi_hdl;
641 	ibt_wr_ds_t		w_sgl[IBD_MAX_SQSEG];
642 	ibd_rc_tx_largebuf_t	*w_rc_tx_largebuf;
643 } ibd_swqe_t;
644 
645 #define	swqe_next		w_ibd_swqe.w_next
646 #define	swqe_copybuf		w_ibd_swqe.w_copybuf
647 #define	swqe_im_mblk		w_ibd_swqe.im_mblk
648 #define	SWQE_TO_WQE(swqe)	(ibd_wqe_t *)&((swqe)->w_ibd_swqe)
649 #define	WQE_TO_SWQE(wqe)	(ibd_swqe_t *)wqe
650 
651 /*
652  * Receive WQE
653  */
654 typedef struct ibd_rwqe_s {
655 	ibd_wqe_t		w_ibd_rwqe;
656 	struct ibd_state_s	*w_state;
657 	ibt_recv_wr_t		w_rwr;
658 	frtn_t			w_freemsg_cb;
659 	boolean_t		w_freeing_wqe;
660 	struct ibd_rc_chan_s	*w_chan;
661 } ibd_rwqe_t;
662 
663 #define	rwqe_next		w_ibd_rwqe.w_next
664 #define	rwqe_copybuf		w_ibd_rwqe.w_copybuf
665 #define	rwqe_im_mblk		w_ibd_rwqe.im_mblk
666 #define	RWQE_TO_WQE(rwqe)	(ibd_wqe_t *)&((rwqe)->w_ibd_rwqe)
667 #define	WQE_TO_RWQE(wqe)	(ibd_rwqe_t *)wqe
668 
669 typedef struct ibd_list_s {
670 	kmutex_t		dl_mutex;
671 	ibd_wqe_t		*dl_head;
672 	union {
673 		boolean_t	pending_sends;
674 		uint32_t	bufs_outstanding;
675 	} ustat;
676 	uint32_t		dl_cnt;
677 } ibd_list_t;
678 
679 #define	dl_pending_sends	ustat.pending_sends
680 #define	dl_bufs_outstanding	ustat.bufs_outstanding
681 
682 /*
683  * LSO buffers
684  *
685  * Under normal circumstances we should never need to use any buffer
686  * that's larger than MTU.  Unfortunately, IB HCAs impose limits
687  * on SGL length that are much smaller than those of regular
688  * ethernet NICs.  Since the network layer doesn't care to limit the
689  * number of mblk fragments in any send mp chain, we end up having to
690  * use these larger-than-MTU sized (larger than id_tx_buf_sz actually)
691  * buffers occasionally.
692  */
693 typedef struct ibd_lsobuf_s {
694 	struct ibd_lsobuf_s *lb_next;
695 	uint8_t		*lb_buf;
696 	int		lb_isfree;
697 } ibd_lsobuf_t;
698 
699 typedef struct ibd_lsobkt_s {
700 	uint8_t		*bkt_mem;
701 	ibd_lsobuf_t	*bkt_bufl;
702 	ibd_lsobuf_t	*bkt_free_head;
703 	ibt_mr_hdl_t	bkt_mr_hdl;
704 	ibt_mr_desc_t	bkt_mr_desc;
705 	uint_t		bkt_nelem;
706 	uint_t		bkt_nfree;
707 } ibd_lsobkt_t;
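
/*
 * Illustrative sketch (not part of the driver): pulling LSO buffers off
 * the bucket's free list under id_lso_lock.  Underflow handling and the
 * SGL fill are omitted; the control flow shown is an assumption.
 *
 *	ibd_lsobuf_t	*lb;
 *
 *	mutex_enter(&state->id_lso_lock);
 *	while (nneeded-- > 0 && state->id_lso->bkt_nfree > 0) {
 *		lb = state->id_lso->bkt_free_head;
 *		state->id_lso->bkt_free_head = lb->lb_next;
 *		lb->lb_isfree = 0;
 *		state->id_lso->bkt_nfree--;
 *		use lb->lb_buf as one SGL entry of the send WR
 *	}
 *	mutex_exit(&state->id_lso_lock);
 */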
708 
709 #define	IBD_PORT_DRIVER		0x1
710 #define	IBD_PARTITION_OBJ	0x2
711 
712 /*
713  * Posting to a single software rx post queue causes lock contention,
714  * so break it out into an array of queues.
715  *
716  * Try to ensure rx_queue structs fall in different cache lines using a filler.
717  * Note: the RX_QUEUE_CACHE_LINE needs to change if the struct changes.
718  */
719 #define	RX_QUEUE_CACHE_LINE \
720 	(64 - (sizeof (kmutex_t) + sizeof (ibd_wqe_t *) + sizeof (uint_t)))
721 typedef struct ibd_rx_queue_s {
722 	kmutex_t		rx_post_lock;
723 	ibd_wqe_t		*rx_head;
724 	uint_t			rx_cnt;
725 	uint8_t			rx_pad[RX_QUEUE_CACHE_LINE];
726 } ibd_rx_queue_t;
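
/*
 * Illustrative sketch (not part of the driver): spreading rx re-posts
 * across the queue array so that concurrent callers rarely contend on
 * the same rx_post_lock.  The index selection is an assumption.
 *
 *	ibd_rx_queue_t	*rxp;
 *
 *	rxp = state->id_rx_queues +
 *	    (state->id_rx_post_queue_index++ % state->id_rx_nqueues);
 *	mutex_enter(&rxp->rx_post_lock);
 *	rwqe->rwqe_next = rxp->rx_head;
 *	rxp->rx_head = RWQE_TO_WQE(rwqe);
 *	rxp->rx_cnt++;
 *	mutex_exit(&rxp->rx_post_lock);
 */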
727 
728 /*
729  * This structure maintains information per port per HCA
730  * (per network interface).
731  */
732 typedef struct ibd_state_s {
733 	uint_t			id_type;
734 	dev_info_t		*id_dip;
735 	ibt_clnt_hdl_t		id_ibt_hdl;
736 	ibt_hca_hdl_t		id_hca_hdl;
737 	ibt_pd_hdl_t		id_pd_hdl;
738 	kmem_cache_t		*id_req_kmc;
739 
740 	ibd_list_t		id_tx_rel_list;
741 
742 	uint32_t		id_running;
743 
744 	uint32_t		id_max_sqseg;
745 	uint32_t		id_max_sqseg_hiwm;
746 	ibd_list_t		id_tx_list;
747 	ddi_softintr_t		id_tx;
748 	uint32_t		id_tx_sends;
749 
750 	kmutex_t		id_txpost_lock;
751 	ibd_swqe_t		*id_tx_head;
752 	ibd_swqe_t		*id_tx_tail;
753 	int			id_tx_busy;
754 
755 	uint_t			id_tx_buf_sz;
756 	uint8_t			*id_tx_bufs;
757 	ibd_swqe_t		*id_tx_wqes;
758 	ibt_mr_hdl_t		id_tx_mr_hdl;
759 	ibt_mr_desc_t		id_tx_mr_desc;
760 
761 	kmutex_t		id_lso_lock;
762 	ibd_lsobkt_t		*id_lso;
763 
764 	kmutex_t		id_scq_poll_lock;
765 	int			id_scq_poll_busy;
766 
767 	ibt_cq_hdl_t		id_scq_hdl;
768 	ibt_wc_t		*id_txwcs;
769 	uint32_t		id_txwcs_size;
770 
771 	int			id_rx_nqueues;
772 	ibd_rx_queue_t		*id_rx_queues;
773 	int			id_rx_post_queue_index;
774 	uint32_t		id_rx_post_active;
775 
776 	ibd_rwqe_t		*id_rx_wqes;
777 	uint8_t			*id_rx_bufs;
778 	ibt_mr_hdl_t		id_rx_mr_hdl;
779 	ibt_mr_desc_t		id_rx_mr_desc;
780 	uint_t			id_rx_buf_sz;
781 	/*
782 	 * id_ud_num_rwqe
783 	 * Number of "receive WQE" elements that will be allocated and used
784 	 * by ibd. This parameter is limited by the maximum channel size of
785 	 * the HCA. Each buffer in the receive wqe will be of MTU size.
786 	 */
787 	uint32_t		id_ud_num_rwqe;
788 	ibd_list_t		id_rx_list;
789 	ddi_softintr_t		id_rx;
790 	uint32_t		id_rx_bufs_outstanding_limit;
791 	uint32_t		id_rx_allocb;
792 	uint32_t		id_rx_allocb_failed;
793 	ibd_list_t		id_rx_free_list;
794 
795 	kmutex_t		id_rcq_poll_lock;
796 	int			id_rcq_poll_busy;
797 	uint32_t		id_rxwcs_size;
798 	ibt_wc_t		*id_rxwcs;
799 	ibt_cq_hdl_t		id_rcq_hdl;
800 
801 	ibt_channel_hdl_t	id_chnl_hdl;
802 	ib_pkey_t		id_pkey;
803 	uint16_t		id_pkix;
804 	uint8_t			id_port;
805 	ibt_mcg_info_t		*id_mcinfo;
806 
807 	mac_handle_t		id_mh;
808 	mac_resource_handle_t	id_rh;
809 	ib_gid_t		id_sgid;
810 	ib_qpn_t		id_qpnum;
811 	ipoib_mac_t		id_macaddr;
812 	ib_gid_t		id_mgid;
813 	ipoib_mac_t		id_bcaddr;
814 
815 	int			id_mtu;
816 	uchar_t			id_scope;
817 
818 	kmutex_t		id_acache_req_lock;
819 	kcondvar_t		id_acache_req_cv;
820 	struct list		id_req_list;
821 	kt_did_t		id_async_thrid;
822 
823 	kmutex_t		id_ac_mutex;
824 	ibd_ace_t		*id_ac_hot_ace;
825 	struct list		id_ah_active;
826 	struct list		id_ah_free;
827 	ipoib_mac_t		id_ah_addr;
828 	ibd_req_t		id_ah_req;
829 	char			id_ah_op;
830 	uint64_t		id_ah_error;
831 	ibd_ace_t		*id_ac_list;
832 	mod_hash_t		*id_ah_active_hash;
833 
834 	kmutex_t		id_mc_mutex;
835 	struct list		id_mc_full;
836 	struct list		id_mc_non;
837 
838 	kmutex_t		id_trap_lock;
839 	kcondvar_t		id_trap_cv;
840 	boolean_t		id_trap_stop;
841 	uint32_t		id_trap_inprog;
842 
843 	char			id_prom_op;
844 
845 	kmutex_t		id_sched_lock;
846 	int			id_sched_needed;
847 	int			id_sched_cnt;
848 	int			id_sched_lso_cnt;
849 
850 	kmutex_t		id_link_mutex;
851 	link_state_t		id_link_state;
852 	uint64_t		id_link_speed;
853 
854 	uint64_t		id_num_intrs;
855 	uint64_t		id_tx_short;
856 	/*
857 	 * id_ud_num_swqe
858 	 * Number of "send WQE" elements that will be allocated and used by
859 	 * ibd. When tuning this parameter, the size of pre-allocated, pre-
860 	 * mapped copy buffer in each of these send wqes must be taken into
861 	 * account. This copy buffer size is determined by the value of
862 	 * IBD_TX_BUF_SZ (this is currently set to the same value as
863 	 * ibd_tx_copy_thresh, but may be changed independently if needed).
864 	 */
865 	uint32_t		id_ud_num_swqe;
866 
867 	uint64_t		id_xmt_bytes;
868 	uint64_t		id_rcv_bytes;
869 	uint64_t		id_multi_xmt;
870 	uint64_t		id_brd_xmt;
871 	uint64_t		id_multi_rcv;
872 	uint64_t		id_brd_rcv;
873 	uint64_t		id_xmt_pkt;
874 	uint64_t		id_rcv_pkt;
875 
876 	uint32_t		id_hwcksum_capab;
877 	boolean_t		id_lso_policy;
878 	boolean_t		id_lso_capable;
879 	uint_t			id_lso_maxlen;
880 	int			id_hca_res_lkey_capab;
881 	ibt_lkey_t		id_res_lkey;
882 
883 	boolean_t		id_bgroup_created;
884 	kmutex_t		id_macst_lock;
885 	kcondvar_t		id_macst_cv;
886 	uint32_t		id_mac_state;
887 
888 	/* For Reliable Connected Mode */
889 	boolean_t		id_enable_rc;
890 	boolean_t		rc_enable_srq;
891 
892 	int			rc_mtu;
893 	uint32_t		rc_tx_max_sqseg;
894 	/*
895 	 * In IPoIB over Reliable Connected mode, the mac address carries an
896 	 * "IBD_MAC_ADDR_RC" prefix. But for the loopback filter in
897 	 * ibd_process_rx(), the input mac address should not include the
898 	 * "IBD_MAC_ADDR_RC" prefix.
899 	 *
900 	 * So we introduce rc_macaddr_loopback for the loopback filter in
901 	 * IPoIB over Reliable Connected mode.
902 	 *
903 	 * rc_macaddr_loopback is id_macaddr without the "IBD_MAC_ADDR_RC" prefix.
904 	 */
905 	ipoib_mac_t		rc_macaddr_loopback;
906 
907 	ibt_srv_hdl_t		rc_listen_hdl;
908 	ibt_sbind_hdl_t		rc_listen_bind;
909 	ibt_srv_hdl_t		rc_listen_hdl_OFED_interop;
910 	ibt_sbind_hdl_t		rc_listen_bind_OFED_interop;
911 
912 	ibd_rc_chan_list_t	rc_pass_chan_list;
913 	/* obsolete active channel list */
914 	ibd_rc_chan_list_t	rc_obs_act_chan_list;
915 
916 	kmutex_t		rc_ace_recycle_lock;
917 	ibd_ace_t		*rc_ace_recycle;
918 
919 	/* Send */
920 	/*
921 	 * This mutex protects rc_tx_largebuf_free_head, rc_tx_largebuf_nfree
922 	 * and ibd_rc_tx_largebuf_t->lb_next
923 	 */
924 	kmutex_t		rc_tx_large_bufs_lock;
925 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_free_head;
926 	uint_t			rc_tx_largebuf_nfree;
927 	/* The chunk of whole Tx large buffers */
928 	uint8_t			*rc_tx_mr_bufs;
929 	ibt_mr_hdl_t		rc_tx_mr_hdl;
930 	ibt_mr_desc_t		rc_tx_mr_desc;
931 	ibd_rc_tx_largebuf_t	*rc_tx_largebuf_desc_base;	/* base addr */
932 
933 	boolean_t		rc_enable_iov_map;
934 	uint_t			rc_max_sqseg_hiwm;
935 
936 	/* For SRQ */
937 	uint32_t 		rc_srq_size;
938 	ibt_srq_hdl_t		rc_srq_hdl;
939 	ibd_list_t		rc_srq_rwqe_list;
940 	ibd_list_t		rc_srq_free_list;
941 	ibd_rwqe_t		*rc_srq_rwqes;
942 	uint8_t			*rc_srq_rx_bufs;
943 	ibt_mr_hdl_t		rc_srq_rx_mr_hdl;
944 	ibt_mr_desc_t		rc_srq_rx_mr_desc;
945 
946 	/* For chained receive */
947 	kmutex_t		rc_rx_lock;
948 	mblk_t			*rc_rx_mp;
949 	mblk_t			*rc_rx_mp_tail;
950 	uint32_t		rc_rx_mp_len;
951 
952 	/* Counters for RC mode */
953 	/* RX */
954 	/*
955 	 * # of Received packets. These packets are directly transferred to GLD
956 	 * without copying them
957 	 */
958 	uint64_t		rc_rcv_trans_byte;
959 	uint64_t		rc_rcv_trans_pkt;
960 	/*
961 	 * # of Received packets. We allocate new buffers for these packets,
962 	 * copy their contents into the new buffers, then transfer them to GLD
963 	 */
964 	uint64_t		rc_rcv_copy_byte;
965 	uint64_t		rc_rcv_copy_pkt;
966 	uint64_t		rc_rcv_alloc_fail;
967 
968 #ifdef DEBUG
969 	uint64_t		rc_rwqe_short;	/* short rwqe */
970 #endif
971 
972 	/* # of Receive CQ handler invocations */
973 	uint64_t		rc_rcq_invoke;
974 	/* wc->wc_status != IBT_WC_SUCCESS */
975 	uint64_t		rc_rcq_err;
976 
977 	/* Tx */
978 	uint64_t		rc_xmt_bytes;
979 
980 	/* pkt size <= ibd_rc_tx_copy_thresh */
981 	uint64_t		rc_xmt_small_pkt;
982 	uint64_t		rc_xmt_fragmented_pkt;
983 	/* fail in ibt_map_mem_iov() */
984 	uint64_t		rc_xmt_map_fail_pkt;
985 	/* succ in ibt_map_mem_iov() */
986 	uint64_t		rc_xmt_map_succ_pkt;
987 
988 	uint64_t		rc_ace_not_found;
989 
990 	uint64_t		rc_xmt_drop_too_long_pkt;
991 	uint64_t		rc_xmt_icmp_too_long_pkt;
992 	uint64_t		rc_xmt_reenter_too_long_pkt;
993 
994 	/* short swqe in ibd_send() */
995 	uint64_t		rc_swqe_short;
996 	/* call mac_tx_update when there is enough swqe */
997 	uint64_t		rc_swqe_mac_update;
998 	/* short tx large copy buf in ibd_send() */
999 	uint64_t		rc_xmt_buf_short;
1000 	/* call mac_tx_update when there is enough Tx copy buf */
1001 	uint64_t		rc_xmt_buf_mac_update;
1002 
1003 	/* No swqe even after calling the swqe recycle function */
1004 	uint64_t		rc_scq_no_swqe;
1005 	/* No large Tx buf even after calling the swqe recycle function */
1006 	uint64_t		rc_scq_no_largebuf;
1007 	/* # of Send CQ handler invocations */
1008 	uint64_t		rc_scq_invoke;
1009 
1010 	/* Connection setup and close */
1011 	uint64_t		rc_conn_succ;	/* # of successful connects */
1012 	uint64_t		rc_conn_fail;	/* # of failed connects */
1013 	/* ace->ac_chan == NULL for unicast packet */
1014 	uint64_t		rc_null_conn;
1015 	/* not in active established state */
1016 	uint64_t		rc_no_estab_conn;
1017 
1018 	uint64_t		rc_act_close;	/* call ibd_rc_act_close() */
1019 	uint64_t		rc_pas_close;	/* call ibd_rc_pas_close() */
1020 	uint64_t		rc_delay_ace_recycle;
1021 	uint64_t		rc_act_close_simultaneous;
1022 
1023 	/* # of RC channel resets */
1024 	uint64_t		rc_reset_cnt;
1025 
1026 #ifdef DEBUG
1027 	kstat_t 		*rc_ksp;
1028 #endif
1029 	ib_guid_t		id_hca_guid;
1030 	ib_guid_t		id_port_guid;
1031 	datalink_id_t		id_dlinkid;
1032 	datalink_id_t		id_plinkid;
1033 	int			id_port_inst;
1034 	struct ibd_state_s	*id_next;
1035 	boolean_t		id_force_create;
1036 	boolean_t		id_bgroup_present;
1037 	uint_t			id_hca_max_chan_sz;
1038 
1039 	/*
1040 	 * UD Mode Tunables
1041 	 *
1042 	 * id_ud_tx_copy_thresh
1043 	 * This sets the threshold at which ibd will attempt to do a bcopy
1044 	 * of the outgoing data into a pre-mapped buffer. IPoIB driver's
1045 	 * send behavior is restricted by various parameters, so this value
1046 	 * should be changed only after careful consideration. For
1047 	 * instance, IB HCAs currently impose a relatively small limit
1048 	 * (when compared to ethernet NICs) on the length of the SGL for
1049 	 * transmit. On the other hand, the ip stack could send down mp
1050 	 * chains that are quite long when LSO is enabled.
1051 	 *
1052 	 * id_num_lso_bufs
1053 	 * Number of "larger-than-MTU" copy buffers to use for cases when the
1054 	 * outgoing mblk chain is too fragmented to be used with
1055 	 * ibt_map_mem_iov() and too large to be used with regular MTU-sized
1056 	 * copy buffers. It is not recommended to tune this variable without
1057 	 * understanding the application environment and/or memory resources.
1058 	 * The size of each of these lso buffers is determined by the value of
1059 	 * IBD_LSO_BUFSZ.
1060 	 *
1061 	 * id_num_ah
1062 	 * Number of AH cache entries to allocate
1063 	 *
1064 	 * id_hash_size
1065 	 * Hash table size for the active AH list
1066 	 *
1067 	 */
1068 	uint_t id_ud_tx_copy_thresh;
1069 	uint_t id_num_lso_bufs;
1070 	uint_t id_num_ah;
1071 	uint_t id_hash_size;
1072 
1073 	boolean_t id_create_broadcast_group;
1074 
1075 	boolean_t id_allow_coalesce_comp_tuning;
1076 	uint_t id_ud_rx_comp_count;
1077 	uint_t id_ud_rx_comp_usec;
1078 	uint_t id_ud_tx_comp_count;
1079 	uint_t id_ud_tx_comp_usec;
1080 
1081 	/* RC Mode Tunables */
1082 
1083 	uint_t id_rc_rx_comp_count;
1084 	uint_t id_rc_rx_comp_usec;
1085 	uint_t id_rc_tx_comp_count;
1086 	uint_t id_rc_tx_comp_usec;
1087 	/*
1088 	 * id_rc_tx_copy_thresh
1089 	 * This sets the threshold at which ibd will attempt to do a bcopy
1090 	 * of the outgoing data into a pre-mapped buffer.
1091 	 *
1092 	 * id_rc_rx_copy_thresh
1093 	 * If (the size of incoming buffer <= id_rc_rx_copy_thresh), ibd
1094 	 * will attempt to allocate a buffer and do a bcopy of the incoming
1095 	 * data into the allocated buffer.
1096 	 *
1097 	 * id_rc_rx_rwqe_thresh
1098 	 * If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd
1099 	 * will attempt to allocate a buffer and do a bcopy of the incoming
1100 	 * data into the allocated buffer.
1101 	 *
1102 	 * id_rc_num_swqe
1103 	 * 1) Send CQ size = ibd_rc_num_swqe
1104 	 * 2) The send queue size = ibd_rc_num_swqe -1
1105 	 * 3) Number of pre-allocated Tx buffers for ibt_post_send() =
1106 	 * ibd_rc_num_swqe - 1.
1107 	 *
1108 	 * id_rc_num_rwqe
1109 	 * 1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
1110 	 * via ibt_post_receive() for receive queue of each RC channel.
1111 	 * 2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
1112 	 *
1113 	 * For SRQ
1114 	 * If using SRQ, we allocate ibd_rc_num_srq number of buffers (the
1115 	 * size of each buffer is equal to the RC mtu) and post them via
1116 	 * ibt_post_srq().
1117 	 *
1118 	 * id_rc_num_srq
1119 	 * ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe,
1120 	 * otherwise it will cause failures with the following warnings:
1121 	 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
1122 	 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic
1123 	 * error
1124 	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
1125 	 * catastrophic channel error
1126 	 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff
1127 	 * completion queue error
1128 	 */
1129 	uint_t id_rc_tx_copy_thresh;
1130 	uint_t id_rc_rx_copy_thresh;
1131 	uint_t id_rc_rx_rwqe_thresh;
1132 	uint_t id_rc_num_swqe;
1133 	uint_t id_rc_num_rwqe;
1134 	uint_t id_rc_num_srq;
1135 } ibd_state_t;
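
/*
 * Worked example of the RC tunable relationships described above, using
 * the defaults from the top of this file: with IBD_DEF_RC_NUM_RWQE (2047)
 * the receive CQ size is 2047, IBD_DEF_RC_NUM_SRQ is 2047 - 1 = 2046 and
 * IBD_DEF_RC_RX_RWQE_THRESH is 2047 >> 2 = 511; with IBD_DEF_RC_NUM_SWQE
 * (511) the send CQ size is 511 and the send queue holds 511 - 1 = 510
 * pre-allocated Tx buffers.
 */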
1136 
1137 /*
1138  * Structures to track global IBTF data, data that is shared
1139  * among the IBD device instances.  This includes the one ibt_hdl
1140  * and the list of service registrations.
1141  */
1142 typedef struct ibd_service_s {
1143 	struct ibd_service_s	*is_link;
1144 	ibt_srv_hdl_t		is_srv_hdl;
1145 	ib_svc_id_t		is_sid;
1146 	uint_t			is_ref_cnt;
1147 } ibd_service_t;
1148 
1149 typedef struct ibd_global_state_s {
1150 	kmutex_t	ig_mutex;
1151 	ibt_clnt_hdl_t	ig_ibt_hdl;
1152 	uint_t		ig_ibt_hdl_ref_cnt;
1153 	ibd_service_t	*ig_service_list;
1154 } ibd_global_state_t;
1155 
1156 typedef struct ibd_rc_msg_hello_s {
1157 	uint32_t reserved_qpn;
1158 	uint32_t rx_mtu;
1159 } ibd_rc_msg_hello_t;
1160 
1161 typedef struct ibd_rc_chan_s {
1162 	struct ibd_rc_chan_s	*next;
1163 	/* channel hdl that we'll be using for Reliable Connected Mode */
1164 	ibt_channel_hdl_t	chan_hdl;
1165 	struct ibd_state_s	*state;
1166 	ibd_ace_t		*ace;
1167 	ibd_rc_chan_state_t	chan_state;
1168 
1169 	/* used to detect duplicate connections */
1170 	ib_gid_t		requester_gid;
1171 	ib_pkey_t		requester_pkey;
1172 
1173 	ibd_list_t		tx_wqe_list;	/* free wqe list */
1174 	ibd_list_t		tx_rel_list;	/* for swqe recycle */
1175 
1176 	ibd_swqe_t		*tx_wqes;
1177 
1178 	/* start address of Tx Buffers */
1179 	uint8_t			*tx_mr_bufs;
1180 	ibt_mr_hdl_t		tx_mr_hdl;
1181 	ibt_mr_desc_t		tx_mr_desc;
1182 
1183 	ibt_cq_hdl_t		scq_hdl;	/* Tx completion queue */
1184 	ibt_wc_t		tx_wc[IBD_RC_MAX_CQ_WC];
1185 	ddi_softintr_t		scq_softintr;
1186 
1187 	uint32_t		tx_trans_error_cnt;
1188 
1189 	/* For chained send */
1190 	kmutex_t		tx_post_lock;
1191 	ibd_swqe_t		*tx_head;
1192 	ibd_swqe_t		*tx_tail;
1193 	int			tx_busy;
1194 
1195 	/* For tx buffer recycle */
1196 	kmutex_t		tx_poll_lock;
1197 	int			tx_poll_busy;
1198 
1199 	/* Rx */
1200 	ibd_list_t		rx_wqe_list;	/* used by ibt_post_recv */
1201 	ibd_list_t		rx_free_list;	/* free rwqe list */
1202 
1203 	ibt_cq_hdl_t		rcq_hdl;	/* Rx completion queue */
1204 	ibt_wc_t		rx_wc[IBD_RC_MAX_CQ_WC];
1205 
1206 	ibd_rwqe_t		*rx_rwqes;	/* the chunk of whole rwqes */
1207 	uint8_t			*rx_bufs;	/* the chunk of whole Rx bufs */
1208 	ibt_mr_hdl_t		rx_mr_hdl;	/* ibt_mr_hdl_t for rx_bufs */
1209 	ibt_mr_desc_t		rx_mr_desc;	/* ibt_mr_desc_t for rx_bufs */
1210 
1211 	/* For chained receive */
1212 	kmutex_t		rx_lock;
1213 	mblk_t			*rx_mp;
1214 	mblk_t			*rx_mp_tail;
1215 	uint32_t		rx_mp_len;
1216 
1217 	uint32_t 		rcq_size;
1218 	uint32_t 		scq_size;
1219 	/*
1220 	 * We need two channels for each connection.
1221 	 * One channel for Tx; another channel for Rx.
1222 	 * If "is_tx_chan == B_TRUE", this is a Tx channel.
1223 	 */
1224 	boolean_t		is_tx_chan;
1225 } ibd_rc_chan_t;
1226 
1227 /*
1228  * The following functions are defined in "ibd.c".
1229  * They are also used by "ibd_cm.c"
1230  */
1231 void ibd_print_warn(ibd_state_t *, char *, ...);
1232 void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
1233 void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
1234 boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
1235 void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
1236 ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
1237 
1238 /*
1239  * The following functions are defined in "ibd_cm.c".
1240  * They are also used in "ibd.c".
1241  */
1242 void ibd_async_rc_process_too_big(ibd_state_t *, ibd_req_t *);
1243 void ibd_async_rc_close_act_chan(ibd_state_t *, ibd_req_t *);
1244 void ibd_async_rc_recycle_ace(ibd_state_t *, ibd_req_t *);
1245 
1246 /* Connection Setup/Close Functions */
1247 ibt_status_t ibd_rc_listen(ibd_state_t *);
1248 void ibd_rc_stop_listen(ibd_state_t *);
1249 ibt_status_t ibd_rc_connect(ibd_state_t *, ibd_ace_t *, ibt_path_info_t *,
1250     uint64_t);
1251 void ibd_rc_try_connect(ibd_state_t *, ibd_ace_t *,  ibt_path_info_t *);
1252 void ibd_rc_signal_act_close(ibd_state_t *, ibd_ace_t *);
1253 void ibd_rc_signal_ace_recycle(ibd_state_t *, ibd_ace_t *);
1254 void ibd_rc_close_all_chan(ibd_state_t *);
1255 
1256 /* Receive Functions */
1257 int ibd_rc_init_srq_list(ibd_state_t *);
1258 void ibd_rc_fini_srq_list(ibd_state_t *);
1259 int ibd_rc_repost_srq_free_list(ibd_state_t *);
1260 
1261 /* Send Functions */
1262 int ibd_rc_init_tx_largebuf_list(ibd_state_t *);
1263 void ibd_rc_fini_tx_largebuf_list(ibd_state_t *);
1264 ibd_swqe_t *ibd_rc_acquire_swqes(ibd_rc_chan_t *);
1265 void ibd_rc_post_send(ibd_rc_chan_t *, ibd_swqe_t *);
1266 void ibd_rc_drain_scq(ibd_rc_chan_t *, ibt_cq_hdl_t);
1267 void ibd_rc_tx_cleanup(ibd_swqe_t *);
1268 
1269 /* Others */
1270 void ibd_rc_get_conf(ibd_state_t *);
1271 int ibd_rc_init_stats(ibd_state_t *);
1272 
1273 #endif /* _KERNEL && !_BOOT */
1274 
1275 #ifdef __cplusplus
1276 }
1277 #endif
1278 
1279 #endif	/* _SYS_IB_CLIENTS_IBD_H */
1280