xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision 230167a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * An implementation of the IPoIB standard based on PSARC 2001/289.
28  */
29 
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41 
42 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h>	/* for offsetof */
44 #include <sys/disp.h>		/* for async thread pri */
45 #include <sys/atomic.h>		/* for atomic_add*() */
46 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h>		/* for netinet/ip.h below */
48 #include <netinet/ip.h>		/* for struct ip */
49 #include <netinet/udp.h>	/* for struct udphdr */
50 #include <inet/common.h>	/* for inet/ip.h below */
51 #include <inet/ip.h>		/* for ipha_t */
52 #include <inet/ip6.h>		/* for ip6_t */
53 #include <inet/tcp.h>		/* for tcph_t */
54 #include <netinet/icmp6.h>	/* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57 
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62 
63 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
64 
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71 
72 /*
73  * The write-up below includes details on the following:
74  * 1. The dladm administrative model.
75  * 2. Late HCA initialization feature.
76  * 3. Brussels support and its implications for the current architecture.
77  *
78  * 1. The dladm administrative model.
79  * ------------------------------------------
80  * With the dladm model, ibnex will create one ibd instance per port. These
81  * instances will be created independent of the port state.
82  *
83  * The ibd driver is two-faceted: one side works as the port driver and the
84  * other as the partition object driver.
85  *
86  * The port instance is a child of the HCA, and will have an entry in the devfs.
87  * A DDI attach only happens for the port driver, and its attach is
88  * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89  * handled in ibd_port_unattach().
90  *
91  * The partition object is only a registrant to the mac layer via mac_register()
92  * and does not have an entry in the device tree. There is no DDI softstate
93  * managed by the DDI framework for the partition objects. However, the state is
94  * managed inside the ibd driver, and every partition object hangs off the
95  * "ibd_objlist_head".
96  *
97  * The partition object first comes into existence when a user runs the
98  * 'create-part' subcommand of dladm. This is like invoking the attach entry
99  * point of the partition object. The partition object goes away with the
100  * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101  * point of the partition object.
102  *
103  * The create-part and delete-part subcommands result in dld ioctls that end up
104  * calling ibd_create_partition() and ibd_delete_partition() respectively.
105  * These ioctls are registered with the dld layer in _init() via a call to
106  * dld_ioc_register().
107  *
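 * As an illustration (the link names here are hypothetical and the exact
 * dladm option letters should be checked against dladm(1M)), a command such
 * as
 *     dladm create-part -l ibp0 -P 0x8001 p8001.ibp0
 * reaches ibd_create_partition() through the registered dld ioctl, and
 *     dladm delete-part p8001.ibp0
 * reaches ibd_delete_partition().
 *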
108  * The port instance by itself cannot be plumbed. Only the partition
109  * objects can be plumbed; they alone participate in I/O, not the
110  * port driver.
111  *
112  * There are some info ioctls supported in ibd which are used by dladm(1M) to
113  * display useful information. The info entry point for ibd is
114  * ibd_get_partition_info().
115  *
116  * 2. Late HCA initialization feature.
117  * ------------------------------------
118  * As mentioned in section 1, the user creates the partition objects via
119  * dladm(1M). It is possible that:
120  * a) The physical port itself is down and the SM cannot be reached.
121  * b) The PKEY specified by the user has not been created in the SM yet.
122  * c) An IPoIB broadcast group for the specified PKEY is not present.
123  *
124  * In all of the above cases, complete initialization of the partition object is
125  * not possible. However, the new model allows the creation of partition
126  * objects even in such cases, but defers their initialization until later.
127  * When such a partition object is plumbed, the link state will be displayed as
128  * "down".
129  * The driver, at this point, listens for events that herald the
130  * availability of resources:
131  * i)   LINK_UP when the link becomes available
132  * ii)  PORT_CHANGE when the PKEY has been created
133  * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134  *      created
135  * Events i) and ii) are delivered via ibd_async_handler(); event iii) via
136  * ibd_snet_notices_handler().
137  * The driver handles these events (as and when they arrive) and completes the
138  * initialization of the partition object and transitions it to a usable state.
139  *
140  * 3. Brussels support and its implications for the current architecture.
141  * ---------------------------------------------------------------------
142  * The Brussels support introduces two new interfaces to the ibd driver:
143  * ibd_m_getprop() and ibd_m_setprop().
144  * These interfaces allow setting and retrieval of certain properties.
145  * Some of them are public properties while most others are private properties
146  * meant to be used by developers. Tuning the latter kind can cause
147  * performance issues and should not be done without understanding the
148  * implications. All properties are specific to an instance of either the
149  * partition object or the port driver.
150  *
151  * The public properties are: mtu and linkmode.
152  * mtu is a read-only property.
153  * linkmode can take two values - UD and CM.
154  *
155  * Changing the linkmode requires some bookkeeping in the driver. The
156  * capabilities need to be re-reported to the mac layer. This is done by
157  * calling mac_capab_update().  The maxsdu is updated by calling
158  * mac_maxsdu_update().
159  * The private properties retain their values across the change of linkmode.
160  * NOTE:
161  * - The port driver does not support any property apart from mtu.
162  * - All other properties are only meant for the partition object.
163  * - The properties cannot be set when an instance is plumbed. The
164  * instance has to be unplumbed to effect any setting.
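 *
 * As an illustrative sketch (the property name and values are taken from the
 * description above, but the exact dladm invocation and the partition link
 * name are assumptions), switching an unplumbed partition object to connected
 * mode might look like:
 *     dladm set-linkprop -p linkmode=cm p8001.ibp0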
165  */
166 
167 /*
168  * Driver wide tunables
169  *
170  * ibd_tx_softintr
171  * ibd_rx_softintr
172  *     The softintr mechanism allows ibd to avoid event queue overflows if
173  *     the receive/completion handlers are expensive. These are enabled
174  *     by default.
175  *
176  * ibd_log_sz
177  *     This specifies the size of the ibd log buffer in bytes. The buffer is
178  *     allocated and logging is enabled only when IBD_LOGGING is defined.
179  *
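 * Both tunables are plain module globals; as an illustrative note (not a
 * documented interface), they can be overridden at boot through the usual
 * /etc/system "set <module>:ibd_rx_softintr = 0" mechanism, or patched at
 * run time with mdb, where <module> is the name this driver is loaded under.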
180  */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183 
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187 
188 #ifdef IBD_LOGGING
189 #define	IBD_LOG_SZ			ibd_log_sz
190 #endif
191 
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define	IBD_RX_POST_CNT			8
194 
195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 #define	IBD_LOG_RX_POST			4
197 
198 /* Minimum number of receive work requests the driver must always have */
199 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
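/*
 * With the values above, IBD_RWQE_MIN works out to (8 << 4) * 4 = 512
 * receive work requests.
 */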
200 
201 /*
202  * LSO parameters
203  */
204 #define	IBD_LSO_MAXLEN			65536
205 #define	IBD_LSO_BUFSZ			8192
206 
207 /*
208  * Async operation states
209  */
210 #define	IBD_OP_NOTSTARTED		0
211 #define	IBD_OP_ONGOING			1
212 #define	IBD_OP_COMPLETED		2
213 #define	IBD_OP_ERRORED			3
214 #define	IBD_OP_ROUTERED			4
215 
216 /*
217  * State of IBD driver initialization during attach/m_start
218  */
219 #define	IBD_DRV_STATE_INITIALIZED	0x000001
220 #define	IBD_DRV_RXINTR_ADDED		0x000002
221 #define	IBD_DRV_TXINTR_ADDED		0x000004
222 #define	IBD_DRV_IBTL_ATTACH_DONE	0x000008
223 #define	IBD_DRV_HCA_OPENED		0x000010
224 #define	IBD_DRV_PD_ALLOCD		0x000020
225 #define	IBD_DRV_MAC_REGISTERED		0x000040
226 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x000080
227 #define	IBD_DRV_BCAST_GROUP_FOUND	0x000100
228 #define	IBD_DRV_ACACHE_INITIALIZED	0x000200
229 #define	IBD_DRV_CQS_ALLOCD		0x000400
230 #define	IBD_DRV_UD_CHANNEL_SETUP	0x000800
231 #define	IBD_DRV_TXLIST_ALLOCD		0x001000
232 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x002000
233 #define	IBD_DRV_RXLIST_ALLOCD		0x004000
234 #define	IBD_DRV_BCAST_GROUP_JOINED	0x008000
235 #define	IBD_DRV_ASYNC_THR_CREATED	0x010000
236 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x020000
237 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x040000
238 #define	IBD_DRV_STARTED			0x080000
239 #define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
240 #define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
241 #define	IBD_DRV_RC_LISTEN		0x400000
242 #ifdef DEBUG
243 #define	IBD_DRV_RC_PRIVATE_STATE	0x800000
244 #endif
245 #define	IBD_DRV_IN_DELETION		0x1000000
246 #define	IBD_DRV_IN_LATE_HCA_INIT 	0x2000000
247 #define	IBD_DRV_REQ_LIST_INITED 	0x4000000
248 
249 /*
250  * Start/stop in-progress flags; note that restart must always remain
251  * the OR of start and stop flag values.
252  */
253 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
254 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
255 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
256 #define	IBD_DRV_DELETE_IN_PROGRESS	IBD_DRV_RESTART_IN_PROGRESS
257 
258 /*
259  * Miscellaneous constants
260  */
261 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
262 #define	IBD_DEF_MAX_SDU			2044
263 #define	IBD_DEF_MAX_MTU			(IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
264 #define	IBD_DEF_RC_MAX_SDU		65520
265 #define	IBD_DEF_RC_MAX_MTU		(IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
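/*
 * IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header, so the UD and RC
 * MTUs above work out to 2048 and 65524 bytes respectively.
 */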
266 #define	IBD_DEFAULT_QKEY		0xB1B
267 #ifdef IBD_LOGGING
268 #define	IBD_DMAX_LINE			100
269 #endif
270 
271 /*
272  * Enumerations for link states
273  */
274 typedef enum {
275 	IBD_LINK_DOWN,
276 	IBD_LINK_UP,
277 	IBD_LINK_UP_ABSENT
278 } ibd_link_op_t;
279 
280 /*
281  * Driver State Pointer
282  */
283 void *ibd_list;
284 
285 /*
286  * Driver Global Data
287  */
288 ibd_global_state_t ibd_gstate;
289 
290 /*
291  * Partition object list
292  */
293 ibd_state_t	*ibd_objlist_head = NULL;
294 kmutex_t	ibd_objlist_lock;
295 
296 /*
297  * Logging
298  */
299 #ifdef IBD_LOGGING
300 kmutex_t ibd_lbuf_lock;
301 uint8_t *ibd_lbuf;
302 uint32_t ibd_lbuf_ndx;
303 #endif
304 
305 /*
306  * Required system entry points
307  */
308 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
309 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
310 
311 /*
312  * Required driver entry points for GLDv3
313  */
314 static int ibd_m_stat(void *, uint_t, uint64_t *);
315 static int ibd_m_start(void *);
316 static void ibd_m_stop(void *);
317 static int ibd_m_promisc(void *, boolean_t);
318 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
319 static int ibd_m_unicst(void *, const uint8_t *);
320 static mblk_t *ibd_m_tx(void *, mblk_t *);
321 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
322 
323 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
324     const void *);
325 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
326 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
327     mac_prop_info_handle_t);
328 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
329     const void *);
330 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
331 
332 /*
333  * Private driver entry points for GLDv3
334  */
335 
336 /*
337  * Initialization
338  */
339 static int ibd_state_init(ibd_state_t *, dev_info_t *);
340 static int ibd_init_txlist(ibd_state_t *);
341 static int ibd_init_rxlist(ibd_state_t *);
342 static int ibd_acache_init(ibd_state_t *);
343 #ifdef IBD_LOGGING
344 static void ibd_log_init(void);
345 #endif
346 
347 /*
348  * Termination/cleanup
349  */
350 static void ibd_state_fini(ibd_state_t *);
351 static void ibd_fini_txlist(ibd_state_t *);
352 static void ibd_fini_rxlist(ibd_state_t *);
353 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
354 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
355 static void ibd_acache_fini(ibd_state_t *);
356 #ifdef IBD_LOGGING
357 static void ibd_log_fini(void);
358 #endif
359 
360 /*
361  * Allocation/acquire/map routines
362  */
363 static int ibd_alloc_tx_copybufs(ibd_state_t *);
364 static int ibd_alloc_rx_copybufs(ibd_state_t *);
365 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
366 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
367 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
368     uint32_t *);
369 
370 /*
371  * Free/release/unmap routines
372  */
373 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
374 static void ibd_free_tx_copybufs(ibd_state_t *);
375 static void ibd_free_rx_copybufs(ibd_state_t *);
376 static void ibd_free_rx_rsrcs(ibd_state_t *);
377 static void ibd_free_tx_lsobufs(ibd_state_t *);
378 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
379 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
380 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
381 
382 /*
383  * Handlers/callback routines
384  */
385 static uint_t ibd_intr(caddr_t);
386 static uint_t ibd_tx_recycle(caddr_t);
387 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
388 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
389 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
390 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
391 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
392 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
393 static void ibd_freemsg_cb(char *);
394 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
395     ibt_async_event_t *);
396 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
397     ibt_async_event_t *);
398 static void ibd_snet_notices_handler(void *, ib_gid_t,
399     ibt_subnet_event_code_t, ibt_subnet_event_t *);
400 
401 /*
402  * Send/receive routines
403  */
404 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
405 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
406 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
407 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
408 
409 /*
410  * Threads
411  */
412 static void ibd_async_work(ibd_state_t *);
413 
414 /*
415  * Async tasks
416  */
417 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
418 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
419 static void ibd_async_setprom(ibd_state_t *);
420 static void ibd_async_unsetprom(ibd_state_t *);
421 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
422 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
423 static void ibd_async_txsched(ibd_state_t *);
424 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
425 
426 /*
427  * Async task helpers
428  */
429 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
430 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
431 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
432 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
433     ipoib_mac_t *, ipoib_mac_t *);
434 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
435 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
436 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
437 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
438 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
439 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
440 static uint64_t ibd_get_portspeed(ibd_state_t *);
441 static boolean_t ibd_async_safe(ibd_state_t *);
442 static void ibd_async_done(ibd_state_t *);
443 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
444 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
445 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
446 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
447 
448 /*
449  * Helpers for attach/start routines
450  */
451 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
452 static int ibd_record_capab(ibd_state_t *);
453 static int ibd_get_port_details(ibd_state_t *);
454 static int ibd_alloc_cqs(ibd_state_t *);
455 static int ibd_setup_ud_channel(ibd_state_t *);
456 static int ibd_start(ibd_state_t *);
457 static int ibd_undo_start(ibd_state_t *, link_state_t);
458 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
459 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
460 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
461 static void ibd_part_unattach(ibd_state_t *state);
462 static int ibd_port_attach(dev_info_t *);
463 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
464 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
465 static int ibd_part_busy(ibd_state_t *);
466 
467 /*
468  * Miscellaneous helpers
469  */
470 static int ibd_sched_poll(ibd_state_t *, int, int);
471 static void ibd_resume_transmission(ibd_state_t *);
472 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
473 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
474 static void *list_get_head(list_t *);
475 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
476 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
477 
478 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
479 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
480 
481 #ifdef IBD_LOGGING
482 static void ibd_log(const char *, ...);
483 #endif
484 
485 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
486     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
487 
488 /* Module Driver Info */
489 static struct modldrv ibd_modldrv = {
490 	&mod_driverops,			/* This one is a driver */
491 	"InfiniBand GLDv3 Driver",	/* short description */
492 	&ibd_dev_ops			/* driver specific ops */
493 };
494 
495 /* Module Linkage */
496 static struct modlinkage ibd_modlinkage = {
497 	MODREV_1, (void *)&ibd_modldrv, NULL
498 };
499 
500 /*
501  * Module (static) info passed to IBTL during ibt_attach
502  */
503 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
504 	IBTI_V_CURR,
505 	IBT_NETWORK,
506 	ibd_async_handler,
507 	NULL,
508 	"IBPART"
509 };
510 
511 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
512 	IBTI_V_CURR,
513 	IBT_NETWORK,
514 	ibdpd_async_handler,
515 	NULL,
516 	"IPIB"
517 };
518 
519 /*
520  * GLDv3 entry points
521  */
522 #define	IBD_M_CALLBACK_FLAGS	\
523 	(MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
524 
525 static mac_callbacks_t ibd_m_callbacks = {
526 	IBD_M_CALLBACK_FLAGS,
527 	ibd_m_stat,
528 	ibd_m_start,
529 	ibd_m_stop,
530 	ibd_m_promisc,
531 	ibd_m_multicst,
532 	ibd_m_unicst,
533 	ibd_m_tx,
534 	NULL,
535 	NULL,
536 	ibd_m_getcapab,
537 	NULL,
538 	NULL,
539 	ibd_m_setprop,
540 	ibd_m_getprop,
541 	ibd_m_propinfo
542 };
543 
544 /* Private properties */
545 char *ibd_priv_props[] = {
546 	"_ibd_broadcast_group",
547 	"_ibd_coalesce_completions",
548 	"_ibd_create_broadcast_group",
549 	"_ibd_hash_size",
550 	"_ibd_lso_enable",
551 	"_ibd_num_ah",
552 	"_ibd_num_lso_bufs",
553 	"_ibd_rc_enable_srq",
554 	"_ibd_rc_num_rwqe",
555 	"_ibd_rc_num_srq",
556 	"_ibd_rc_num_swqe",
557 	"_ibd_rc_rx_comp_count",
558 	"_ibd_rc_rx_comp_usec",
559 	"_ibd_rc_rx_copy_thresh",
560 	"_ibd_rc_rx_rwqe_thresh",
561 	"_ibd_rc_tx_comp_count",
562 	"_ibd_rc_tx_comp_usec",
563 	"_ibd_rc_tx_copy_thresh",
564 	"_ibd_ud_num_rwqe",
565 	"_ibd_ud_num_swqe",
566 	"_ibd_ud_rx_comp_count",
567 	"_ibd_ud_rx_comp_usec",
568 	"_ibd_ud_tx_comp_count",
569 	"_ibd_ud_tx_comp_usec",
570 	"_ibd_ud_tx_copy_thresh",
571 	NULL
572 };
573 
574 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
575 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
576 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
577 
578 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
579 	{IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
580 	    ibd_create_partition, secpolicy_dl_config},
581 	{IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
582 	    ibd_delete_partition, secpolicy_dl_config},
583 	{IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
584 	    ibd_get_partition_info, NULL}
585 };
586 
587 /*
588  * Fill/clear <scope> and <p_key> in multicast/broadcast address
589  */
590 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
591 {							\
592 	*(uint32_t *)((char *)(maddr) + 4) |=		\
593 	    htonl((uint32_t)(scope) << 16);		\
594 	*(uint32_t *)((char *)(maddr) + 8) |=		\
595 	    htonl((uint32_t)(pkey) << 16);		\
596 }
597 
598 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
599 {							\
600 	*(uint32_t *)((char *)(maddr) + 4) &=		\
601 	    htonl(~((uint32_t)0xF << 16));		\
602 	*(uint32_t *)((char *)(maddr) + 8) &=		\
603 	    htonl(~((uint32_t)0xFFFF << 16));		\
604 }
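
/*
 * These macros operate on the 16-byte MGID embedded in the IPoIB multicast
 * address: the scope value lands in the low nibble of GID byte 1 and the
 * P_Key in GID bytes 4-5, which is what the byte-4 and byte-8 offsets above
 * (relative to the start of the 20-byte IPoIB address) select.
 */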
605 
606 /*
607  * Rudimentary debugging support
608  */
609 #ifdef DEBUG
610 int ibd_debuglevel = 100;
611 void
612 debug_print(int l, char *fmt, ...)
613 {
614 	va_list ap;
615 
616 	if (l < ibd_debuglevel)
617 		return;
618 	va_start(ap, fmt);
619 	vcmn_err(CE_CONT, fmt, ap);
620 	va_end(ap);
621 }
622 #endif
623 
624 /*
625  * Common routine to print warning messages; adds in the HCA GUID, port number
626  * and PKEY so that the IBA interface can be identified.
627  */
628 void
629 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
630 {
631 	ib_guid_t hca_guid;
632 	char ibd_print_buf[256];
633 	int len;
634 	va_list ap;
635 
636 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
637 	    0, "hca-guid", 0);
638 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
639 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
640 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
641 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
642 	va_start(ap, fmt);
643 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
644 	    fmt, ap);
645 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
646 	va_end(ap);
647 }
648 
649 /*
650  * Warlock directives
651  */
652 
653 /*
654  * id_lso_lock
655  *
656  * state->id_lso->bkt_nfree may be accessed without a lock to
657  * determine the threshold at which we have to ask the nw layer
658  * to resume transmission (see ibd_resume_transmission()).
659  */
660 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
661     ibd_state_t::id_lso))
662 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
663 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
664 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
665 
666 /*
667  * id_scq_poll_lock
668  */
669 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
670     ibd_state_t::id_scq_poll_busy))
671 
672 /*
673  * id_txpost_lock
674  */
675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
676     ibd_state_t::id_tx_head))
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
678     ibd_state_t::id_tx_busy))
679 
680 /*
681  * id_acache_req_lock
682  */
683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
684     ibd_state_t::id_acache_req_cv))
685 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
686     ibd_state_t::id_req_list))
687 _NOTE(SCHEME_PROTECTS_DATA("atomic",
688     ibd_acache_s::ac_ref))
689 
690 /*
691  * id_ac_mutex
692  *
693  * This mutex is actually supposed to protect id_ah_op as well,
694  * but this path of the code isn't clean (see update of id_ah_op
695  * in ibd_async_acache(), immediately after the call to
696  * ibd_async_mcache()). For now, we'll skip this check by
697  * declaring that id_ah_op is protected by some internal scheme
698  * that warlock isn't aware of.
699  */
700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
701     ibd_state_t::id_ah_active))
702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
703     ibd_state_t::id_ah_free))
704 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
705     ibd_state_t::id_ah_addr))
706 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
707     ibd_state_t::id_ah_op))
708 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
709     ibd_state_t::id_ah_error))
710 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
711     ibd_state_t::id_ac_hot_ace))
712 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
713 
714 /*
715  * id_mc_mutex
716  */
717 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
718     ibd_state_t::id_mc_full))
719 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
720     ibd_state_t::id_mc_non))
721 
722 /*
723  * id_trap_lock
724  */
725 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
726     ibd_state_t::id_trap_cv))
727 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
728     ibd_state_t::id_trap_stop))
729 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
730     ibd_state_t::id_trap_inprog))
731 
732 /*
733  * id_prom_op
734  */
735 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
736     ibd_state_t::id_prom_op))
737 
738 /*
739  * id_sched_lock
740  */
741 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
742     ibd_state_t::id_sched_needed))
743 
744 /*
745  * id_link_mutex
746  */
747 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
748     ibd_state_t::id_link_state))
749 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
750 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
751     ibd_state_t::id_link_speed))
752 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
753 
754 /*
755  * id_tx_list.dl_mutex
756  */
757 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
758     ibd_state_t::id_tx_list.dl_head))
759 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
760     ibd_state_t::id_tx_list.dl_pending_sends))
761 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
762     ibd_state_t::id_tx_list.dl_cnt))
763 
764 /*
765  * id_rx_list.dl_mutex
766  */
767 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
768     ibd_state_t::id_rx_list.dl_bufs_outstanding))
769 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
770     ibd_state_t::id_rx_list.dl_cnt))
771 
772 
773 /*
774  * Items protected by atomic updates
775  */
776 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
777     ibd_state_s::id_brd_rcv
778     ibd_state_s::id_brd_xmt
779     ibd_state_s::id_multi_rcv
780     ibd_state_s::id_multi_xmt
781     ibd_state_s::id_num_intrs
782     ibd_state_s::id_rcv_bytes
783     ibd_state_s::id_rcv_pkt
784     ibd_state_s::id_rx_post_queue_index
785     ibd_state_s::id_tx_short
786     ibd_state_s::id_xmt_bytes
787     ibd_state_s::id_xmt_pkt
788     ibd_state_s::rc_rcv_trans_byte
789     ibd_state_s::rc_rcv_trans_pkt
790     ibd_state_s::rc_rcv_copy_byte
791     ibd_state_s::rc_rcv_copy_pkt
792     ibd_state_s::rc_xmt_bytes
793     ibd_state_s::rc_xmt_small_pkt
794     ibd_state_s::rc_xmt_fragmented_pkt
795     ibd_state_s::rc_xmt_map_fail_pkt
796     ibd_state_s::rc_xmt_map_succ_pkt))
797 
798 /*
799  * Non-mutex protection schemes for data elements. Almost all of
800  * these are non-shared items.
801  */
802 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
803     callb_cpr
804     ib_gid_s
805     ib_header_info
806     ibd_acache_rq
807     ibd_acache_s::ac_mce
808     ibd_acache_s::ac_chan
809     ibd_mcache::mc_fullreap
810     ibd_mcache::mc_jstate
811     ibd_mcache::mc_req
812     ibd_rwqe_s
813     ibd_swqe_s
814     ibd_wqe_s
815     ibt_wr_ds_s::ds_va
816     ibt_wr_lso_s
817     ipoib_mac::ipoib_qpn
818     mac_capab_lso_s
819     msgb::b_next
820     msgb::b_cont
821     msgb::b_rptr
822     msgb::b_wptr
823     ibd_state_s::id_bgroup_created
824     ibd_state_s::id_mac_state
825     ibd_state_s::id_mtu
826     ibd_state_s::id_ud_num_rwqe
827     ibd_state_s::id_ud_num_swqe
828     ibd_state_s::id_qpnum
829     ibd_state_s::id_rcq_hdl
830     ibd_state_s::id_rx_buf_sz
831     ibd_state_s::id_rx_bufs
832     ibd_state_s::id_rx_mr_hdl
833     ibd_state_s::id_rx_wqes
834     ibd_state_s::id_rxwcs
835     ibd_state_s::id_rxwcs_size
836     ibd_state_s::id_rx_nqueues
837     ibd_state_s::id_rx_queues
838     ibd_state_s::id_scope
839     ibd_state_s::id_scq_hdl
840     ibd_state_s::id_tx_buf_sz
841     ibd_state_s::id_tx_bufs
842     ibd_state_s::id_tx_mr_hdl
843     ibd_state_s::id_tx_rel_list.dl_cnt
844     ibd_state_s::id_tx_wqes
845     ibd_state_s::id_txwcs
846     ibd_state_s::id_txwcs_size
847     ibd_state_s::rc_listen_hdl
848     ibd_state_s::rc_listen_hdl_OFED_interop
849     ibd_state_s::rc_srq_size
850     ibd_state_s::rc_srq_rwqes
851     ibd_state_s::rc_srq_rx_bufs
852     ibd_state_s::rc_srq_rx_mr_hdl
853     ibd_state_s::rc_tx_largebuf_desc_base
854     ibd_state_s::rc_tx_mr_bufs
855     ibd_state_s::rc_tx_mr_hdl
856     ipha_s
857     icmph_s
858     ibt_path_info_s::pi_sid
859     ibd_rc_chan_s::ace
860     ibd_rc_chan_s::chan_hdl
861     ibd_rc_chan_s::state
862     ibd_rc_chan_s::chan_state
863     ibd_rc_chan_s::is_tx_chan
864     ibd_rc_chan_s::rcq_hdl
865     ibd_rc_chan_s::rcq_size
866     ibd_rc_chan_s::scq_hdl
867     ibd_rc_chan_s::scq_size
868     ibd_rc_chan_s::requester_gid
869     ibd_rc_chan_s::requester_pkey
870     ibd_rc_chan_s::rx_bufs
871     ibd_rc_chan_s::rx_mr_hdl
872     ibd_rc_chan_s::rx_rwqes
873     ibd_rc_chan_s::tx_wqes
874     ibd_rc_chan_s::tx_mr_bufs
875     ibd_rc_chan_s::tx_mr_hdl
876     ibd_rc_chan_s::tx_rel_list.dl_cnt
877     ibd_rc_chan_s::tx_trans_error_cnt
878     ibd_rc_tx_largebuf_s::lb_buf
879     ibd_rc_msg_hello_s
880     ibt_cm_return_args_s))
881 
882 /*
883  * ibd_rc_chan_s::next is protected by two mutexes:
884  * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
885  * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
886  */
887 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
888     ibd_rc_chan_s::next))
889 
890 /*
891  * ibd_state_s.rc_tx_large_bufs_lock
892  */
893 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
894     ibd_state_s::rc_tx_largebuf_free_head))
895 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
896     ibd_state_s::rc_tx_largebuf_nfree))
897 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
898     ibd_rc_tx_largebuf_s::lb_next))
899 
900 /*
901  * ibd_acache_s.tx_too_big_mutex
902  */
903 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
904     ibd_acache_s::tx_too_big_ongoing))
905 
906 /*
907  * tx_wqe_list.dl_mutex
908  */
909 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
910     ibd_rc_chan_s::tx_wqe_list.dl_head))
911 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
912     ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
913 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
914     ibd_rc_chan_s::tx_wqe_list.dl_cnt))
915 
916 /*
917  * ibd_state_s.rc_ace_recycle_lock
918  */
919 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
920     ibd_state_s::rc_ace_recycle))
921 
922 /*
923  * rc_srq_rwqe_list.dl_mutex
924  */
925 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
926     ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
927 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
928     ibd_state_t::rc_srq_rwqe_list.dl_cnt))
929 
930 /*
931  * Non-mutex protection schemes for data elements. They are counters
932  * for problem diagnosis. They don't need to be protected.
933  */
934 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
935     ibd_state_s::rc_rcv_alloc_fail
936     ibd_state_s::rc_rcq_invoke
937     ibd_state_s::rc_rcq_err
938     ibd_state_s::rc_ace_not_found
939     ibd_state_s::rc_xmt_drop_too_long_pkt
940     ibd_state_s::rc_xmt_icmp_too_long_pkt
941     ibd_state_s::rc_xmt_reenter_too_long_pkt
942     ibd_state_s::rc_swqe_short
943     ibd_state_s::rc_swqe_mac_update
944     ibd_state_s::rc_xmt_buf_short
945     ibd_state_s::rc_xmt_buf_mac_update
946     ibd_state_s::rc_scq_no_swqe
947     ibd_state_s::rc_scq_no_largebuf
948     ibd_state_s::rc_scq_invoke
949     ibd_state_s::rc_conn_succ
950     ibd_state_s::rc_conn_fail
951     ibd_state_s::rc_null_conn
952     ibd_state_s::rc_no_estab_conn
953     ibd_state_s::rc_act_close
954     ibd_state_s::rc_pas_close
955     ibd_state_s::rc_delay_ace_recycle
956     ibd_state_s::rc_act_close_simultaneous
957     ibd_state_s::rc_reset_cnt))
958 
959 #ifdef DEBUG
960 /*
961  * Non-mutex protection schemes for data elements. They are counters
962  * for problem diagnosis. They don't need to be protected.
963  */
964 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
965     ibd_state_s::rc_rwqe_short
966     ibd_rc_stat_s::rc_rcv_trans_byte
967     ibd_rc_stat_s::rc_rcv_trans_pkt
968     ibd_rc_stat_s::rc_rcv_copy_byte
969     ibd_rc_stat_s::rc_rcv_copy_pkt
970     ibd_rc_stat_s::rc_rcv_alloc_fail
971     ibd_rc_stat_s::rc_rcq_invoke
972     ibd_rc_stat_s::rc_rcq_err
973     ibd_rc_stat_s::rc_scq_invoke
974     ibd_rc_stat_s::rc_rwqe_short
975     ibd_rc_stat_s::rc_xmt_bytes
976     ibd_rc_stat_s::rc_xmt_small_pkt
977     ibd_rc_stat_s::rc_xmt_fragmented_pkt
978     ibd_rc_stat_s::rc_xmt_map_fail_pkt
979     ibd_rc_stat_s::rc_xmt_map_succ_pkt
980     ibd_rc_stat_s::rc_ace_not_found
981     ibd_rc_stat_s::rc_scq_no_swqe
982     ibd_rc_stat_s::rc_scq_no_largebuf
983     ibd_rc_stat_s::rc_swqe_short
984     ibd_rc_stat_s::rc_swqe_mac_update
985     ibd_rc_stat_s::rc_xmt_buf_short
986     ibd_rc_stat_s::rc_xmt_buf_mac_update
987     ibd_rc_stat_s::rc_conn_succ
988     ibd_rc_stat_s::rc_conn_fail
989     ibd_rc_stat_s::rc_null_conn
990     ibd_rc_stat_s::rc_no_estab_conn
991     ibd_rc_stat_s::rc_act_close
992     ibd_rc_stat_s::rc_pas_close
993     ibd_rc_stat_s::rc_delay_ace_recycle
994     ibd_rc_stat_s::rc_act_close_simultaneous
995     ibd_rc_stat_s::rc_reset_cnt))
996 #endif
997 
998 int
999 _init()
1000 {
1001 	int status;
1002 
1003 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
1004 	    PAGESIZE), 0);
1005 	if (status != 0) {
1006 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
1007 		return (status);
1008 	}
1009 
1010 	mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
1011 
1012 	mac_init_ops(&ibd_dev_ops, "ibp");
1013 	status = mod_install(&ibd_modlinkage);
1014 	if (status != 0) {
1015 		DPRINT(10, "_init:failed in mod_install()");
1016 		ddi_soft_state_fini(&ibd_list);
1017 		mac_fini_ops(&ibd_dev_ops);
1018 		return (status);
1019 	}
1020 
1021 	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1022 	mutex_enter(&ibd_gstate.ig_mutex);
1023 	ibd_gstate.ig_ibt_hdl = NULL;
1024 	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1025 	ibd_gstate.ig_service_list = NULL;
1026 	mutex_exit(&ibd_gstate.ig_mutex);
1027 
1028 	if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1029 	    DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1030 		return (EIO);
1031 	}
1032 
1033 	ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1034 
1035 #ifdef IBD_LOGGING
1036 	ibd_log_init();
1037 #endif
1038 	return (0);
1039 }
1040 
1041 int
1042 _info(struct modinfo *modinfop)
1043 {
1044 	return (mod_info(&ibd_modlinkage, modinfop));
1045 }
1046 
1047 int
1048 _fini()
1049 {
1050 	int status;
1051 
1052 	status = mod_remove(&ibd_modlinkage);
1053 	if (status != 0)
1054 		return (status);
1055 
1056 	ibt_unregister_part_attr_cb();
1057 
1058 	mac_fini_ops(&ibd_dev_ops);
1059 	mutex_destroy(&ibd_objlist_lock);
1060 	ddi_soft_state_fini(&ibd_list);
1061 	mutex_destroy(&ibd_gstate.ig_mutex);
1062 #ifdef IBD_LOGGING
1063 	ibd_log_fini();
1064 #endif
1065 	return (0);
1066 }
1067 
1068 /*
1069  * Convert the GID part of the mac address from network byte order
1070  * to host order.
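 * (The 20-byte IPoIB address is a 4-byte QPN followed by the 16-byte GID;
 * the GID itself is an 8-byte subnet prefix plus an 8-byte GUID, which is
 * what the two bcopy()s below pick apart.)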
1071  */
1072 static void
1073 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1074 {
1075 	ib_sn_prefix_t nbopref;
1076 	ib_guid_t nboguid;
1077 
1078 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1079 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1080 	dgid->gid_prefix = b2h64(nbopref);
1081 	dgid->gid_guid = b2h64(nboguid);
1082 }
1083 
1084 /*
1085  * Create the IPoIB address in network byte order from host order inputs.
1086  */
1087 static void
1088 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1089     ib_guid_t guid)
1090 {
1091 	ib_sn_prefix_t nbopref;
1092 	ib_guid_t nboguid;
1093 
1094 	mac->ipoib_qpn = htonl(qpn);
1095 	nbopref = h2b64(prefix);
1096 	nboguid = h2b64(guid);
1097 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1098 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1099 }
1100 
1101 /*
1102  * Send to the appropriate all-routers group when the IBA multicast group
1103  * does not exist, based on whether the target group is v4 or v6.
1104  */
1105 static boolean_t
1106 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1107     ipoib_mac_t *rmac)
1108 {
1109 	boolean_t retval = B_TRUE;
1110 	uint32_t adjscope = state->id_scope << 16;
1111 	uint32_t topword;
1112 
1113 	/*
1114 	 * Copy the first 4 bytes in without assuming any alignment of
1115 	 * input mac address; this will have IPoIB signature, flags and
1116 	 * scope bits.
1117 	 */
1118 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1119 	topword = ntohl(topword);
1120 
1121 	/*
1122 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1123 	 */
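	/*
	 * Note that INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP is
	 * 224.0.0.2 - 224.0.0.0 = 2, so the GID constructed here ends in
	 * the all-routers group suffix ::2.
	 */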
1124 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1125 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1126 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1127 		    ((uint32_t)(state->id_pkey << 16))),
1128 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1129 	else
1130 		/*
1131 		 * Does not have proper bits in the mgid address.
1132 		 */
1133 		retval = B_FALSE;
1134 
1135 	return (retval);
1136 }
1137 
1138 /*
1139  * Membership states for different mcg's are tracked by two lists:
1140  * the "non" list is used for promiscuous mode, when all mcg traffic
1141  * needs to be inspected. This type of membership is never used for
1142  * transmission, so there can not be an AH in the active list
1143  * corresponding to a member in this list. This list does not need
1144  * any protection, since all operations are performed by the async
1145  * thread.
1146  *
1147  * "Full" and "SendOnly" membership is tracked using a single list,
1148  * the "full" list. This is because this single list can then be
1149  * searched during transmit to a multicast group (if an AH for the
1150  * mcg is not found in the active list), since at least one type
1151  * of membership must be present before initiating the transmit.
1152  * This list is also emptied during driver detach, since sendonly
1153  * membership acquired during transmit is dropped at detach time
1154  * along with ipv4 broadcast full membership. Insert/deletes to
1155  * this list are done only by the async thread, but it is also
1156  * searched in program context (see multicast disable case), thus
1157  * the id_mc_mutex protects the list. The driver detach path also
1158  * deconstructs the "full" list, but it ensures that the async
1159  * thread will not be accessing the list (by blocking out mcg
1160  * trap handling and making sure no more Tx reaping will happen).
1161  *
1162  * Currently, an IBA attach is done in the SendOnly case too,
1163  * although this is not required.
1164  */
1165 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
1166 	list_insert_head(&state->id_mc_full, mce)
1167 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1168 	list_insert_head(&state->id_mc_non, mce)
1169 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1170 	ibd_mcache_find(mgid, &state->id_mc_full)
1171 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1172 	ibd_mcache_find(mgid, &state->id_mc_non)
1173 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1174 	list_remove(&state->id_mc_full, mce)
1175 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1176 	list_remove(&state->id_mc_non, mce)
1177 
1178 static void *
1179 list_get_head(list_t *list)
1180 {
1181 	list_node_t *lhead = list_head(list);
1182 
1183 	if (lhead != NULL)
1184 		list_remove(list, lhead);
1185 	return (lhead);
1186 }
1187 
1188 /*
1189  * Queueing the work request here is always guaranteed to succeed.
1190  */
1191 void
1192 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1193 {
1194 	/* Initialize request */
1195 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1196 	ptr->rq_op = op;
1197 
1198 	/*
1199 	 * Queue provided slot onto request pool.
1200 	 */
1201 	mutex_enter(&state->id_acache_req_lock);
1202 	list_insert_tail(&state->id_req_list, ptr);
1203 
1204 	/* Go, fetch, async thread */
1205 	cv_signal(&state->id_acache_req_cv);
1206 	mutex_exit(&state->id_acache_req_lock);
1207 }
1208 
1209 /*
1210  * Main body of the per interface async thread.
1211  */
1212 static void
1213 ibd_async_work(ibd_state_t *state)
1214 {
1215 	ibd_req_t *ptr;
1216 	callb_cpr_t cprinfo;
1217 
1218 	mutex_enter(&state->id_acache_req_lock);
1219 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1220 	    callb_generic_cpr, "ibd_async_work");
1221 
1222 	for (;;) {
1223 		ptr = list_get_head(&state->id_req_list);
1224 		if (ptr != NULL) {
1225 			mutex_exit(&state->id_acache_req_lock);
1226 
1227 			/*
1228 			 * If we are in late hca initialization mode, do not
1229 			 * process any async request other than TRAP. TRAP
1230 			 * is used for indicating creation of a broadcast group;
1231 			 * in which case, we need to join/create the group.
1232 			 */
1233 			if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1234 			    (ptr->rq_op != IBD_ASYNC_TRAP)) {
1235 				goto free_req_and_continue;
1236 			}
1237 
1238 			/*
1239 			 * Once we have done the operation, there is no
1240 			 * guarantee the request slot is going to be valid,
1241 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1242 			 * TRAP).
1243 			 *
1244 			 * Perform the request.
1245 			 */
1246 			switch (ptr->rq_op) {
1247 				case IBD_ASYNC_GETAH:
1248 					ibd_async_acache(state, &ptr->rq_mac);
1249 					break;
1250 				case IBD_ASYNC_JOIN:
1251 				case IBD_ASYNC_LEAVE:
1252 					ibd_async_multicast(state,
1253 					    ptr->rq_gid, ptr->rq_op);
1254 					break;
1255 				case IBD_ASYNC_PROMON:
1256 					ibd_async_setprom(state);
1257 					break;
1258 				case IBD_ASYNC_PROMOFF:
1259 					ibd_async_unsetprom(state);
1260 					break;
1261 				case IBD_ASYNC_REAP:
1262 					ibd_async_reap_group(state,
1263 					    ptr->rq_ptr, ptr->rq_gid,
1264 					    IB_MC_JSTATE_FULL);
1265 					/*
1266 					 * the req buf is contained in the mce
1267 					 * structure, so we do not need
1268 					 * to free it here.
1269 					 */
1270 					ptr = NULL;
1271 					break;
1272 				case IBD_ASYNC_TRAP:
1273 					ibd_async_trap(state, ptr);
1274 					break;
1275 				case IBD_ASYNC_SCHED:
1276 					ibd_async_txsched(state);
1277 					break;
1278 				case IBD_ASYNC_LINK:
1279 					ibd_async_link(state, ptr);
1280 					break;
1281 				case IBD_ASYNC_EXIT:
1282 					mutex_enter(&state->id_acache_req_lock);
1283 #ifndef __lock_lint
1284 					CALLB_CPR_EXIT(&cprinfo);
1285 #else
1286 					mutex_exit(&state->id_acache_req_lock);
1287 #endif
1288 					return;
1289 				case IBD_ASYNC_RC_TOO_BIG:
1290 					ibd_async_rc_process_too_big(state,
1291 					    ptr);
1292 					break;
1293 				case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1294 					ibd_async_rc_close_act_chan(state, ptr);
1295 					break;
1296 				case IBD_ASYNC_RC_RECYCLE_ACE:
1297 					ibd_async_rc_recycle_ace(state, ptr);
1298 					break;
1299 			}
1300 free_req_and_continue:
1301 			if (ptr != NULL)
1302 				kmem_cache_free(state->id_req_kmc, ptr);
1303 
1304 			mutex_enter(&state->id_acache_req_lock);
1305 		} else {
1306 #ifndef __lock_lint
1307 			/*
1308 			 * Nothing to do: wait till new request arrives.
1309 			 */
1310 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1311 			cv_wait(&state->id_acache_req_cv,
1312 			    &state->id_acache_req_lock);
1313 			CALLB_CPR_SAFE_END(&cprinfo,
1314 			    &state->id_acache_req_lock);
1315 #endif
1316 		}
1317 	}
1318 
1319 	/*NOTREACHED*/
1320 	_NOTE(NOT_REACHED)
1321 }
1322 
1323 /*
1324  * Return whether it is safe to queue requests to the async daemon; primarily
1325  * for subnet trap and async event handling. Disallow requests before the
1326  * daemon is created, and when interface deinitialization starts.
1327  */
1328 static boolean_t
1329 ibd_async_safe(ibd_state_t *state)
1330 {
1331 	mutex_enter(&state->id_trap_lock);
1332 	if (state->id_trap_stop) {
1333 		mutex_exit(&state->id_trap_lock);
1334 		return (B_FALSE);
1335 	}
1336 	state->id_trap_inprog++;
1337 	mutex_exit(&state->id_trap_lock);
1338 	return (B_TRUE);
1339 }
1340 
1341 /*
1342  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1343  * trap or event handling to complete before killing the async thread and
1344  * deconstructing the mcg/ace list.
1345  */
1346 static void
1347 ibd_async_done(ibd_state_t *state)
1348 {
1349 	mutex_enter(&state->id_trap_lock);
1350 	if (--state->id_trap_inprog == 0)
1351 		cv_signal(&state->id_trap_cv);
1352 	mutex_exit(&state->id_trap_lock);
1353 }
1354 
1355 /*
1356  * Hash functions:
1357  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1358  * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
1359  * These operate on mac addresses input into ibd_send, but there is no
1360  * guarantee on the alignment of the ipoib_mac_t structure.
1361  */
1362 /*ARGSUSED*/
1363 static uint_t
1364 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1365 {
1366 	ulong_t ptraddr = (ulong_t)key;
1367 	uint_t hval;
1368 
1369 	/*
1370 	 * If the input address is 4 byte aligned, we can just dereference
1371 	 * it. This is most common, since IP will send in a 4 byte aligned
1372  * IP header, which implies the 24 byte IPoIB pseudo header will be
1373 	 * 4 byte aligned too.
1374 	 */
1375 	if ((ptraddr & 3) == 0)
1376 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1377 
1378 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1379 	return (hval);
1380 }
1381 
1382 static int
1383 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1384 {
1385 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1386 		return (0);
1387 	else
1388 		return (1);
1389 }
1390 
1391 /*
1392  * Initialize all the per interface caches and lists; AH cache,
1393  * MCG list etc.
1394  */
1395 static int
1396 ibd_acache_init(ibd_state_t *state)
1397 {
1398 	ibd_ace_t *ce;
1399 	int i;
1400 
1401 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1402 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1403 	mutex_enter(&state->id_ac_mutex);
1404 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1405 	    offsetof(ibd_ace_t, ac_list));
1406 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1407 	    offsetof(ibd_ace_t, ac_list));
1408 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1409 	    state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1410 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1411 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1412 	    offsetof(ibd_mce_t, mc_list));
1413 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1414 	    offsetof(ibd_mce_t, mc_list));
1415 	state->id_ac_hot_ace = NULL;
1416 
1417 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1418 	    state->id_num_ah, KM_SLEEP);
1419 	for (i = 0; i < state->id_num_ah; i++, ce++) {
1420 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1421 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1422 			mutex_exit(&state->id_ac_mutex);
1423 			ibd_acache_fini(state);
1424 			return (DDI_FAILURE);
1425 		} else {
1426 			CLEAR_REFCYCLE(ce);
1427 			ce->ac_mce = NULL;
1428 			mutex_init(&ce->tx_too_big_mutex, NULL,
1429 			    MUTEX_DRIVER, NULL);
1430 			IBD_ACACHE_INSERT_FREE(state, ce);
1431 		}
1432 	}
1433 	mutex_exit(&state->id_ac_mutex);
1434 	return (DDI_SUCCESS);
1435 }
1436 
1437 static void
1438 ibd_acache_fini(ibd_state_t *state)
1439 {
1440 	ibd_ace_t *ptr;
1441 
1442 	mutex_enter(&state->id_ac_mutex);
1443 
1444 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1445 		ASSERT(GET_REF(ptr) == 0);
1446 		mutex_destroy(&ptr->tx_too_big_mutex);
1447 		(void) ibt_free_ud_dest(ptr->ac_dest);
1448 	}
1449 
1450 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1451 		ASSERT(GET_REF(ptr) == 0);
1452 		mutex_destroy(&ptr->tx_too_big_mutex);
1453 		(void) ibt_free_ud_dest(ptr->ac_dest);
1454 	}
1455 
1456 	list_destroy(&state->id_ah_free);
1457 	list_destroy(&state->id_ah_active);
1458 	list_destroy(&state->id_mc_full);
1459 	list_destroy(&state->id_mc_non);
1460 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1461 	mutex_exit(&state->id_ac_mutex);
1462 	mutex_destroy(&state->id_ac_mutex);
1463 	mutex_destroy(&state->id_mc_mutex);
1464 }
1465 
1466 /*
1467  * Search AH active hash list for a cached path to input destination.
1468  * If we are "just looking", hold == F. When we are in the Tx path,
1469  * we set hold == T to grab a reference on the AH so that it cannot
1470  * be recycled to a new destination while the Tx request is posted.
1471  */
1472 ibd_ace_t *
1473 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1474 {
1475 	ibd_ace_t *ptr;
1476 
1477 	ASSERT(mutex_owned(&state->id_ac_mutex));
1478 
1479 	/*
1480 	 * Do hash search.
1481 	 */
1482 	if (mod_hash_find(state->id_ah_active_hash,
1483 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1484 		if (hold)
1485 			INC_REF(ptr, num);
1486 		return (ptr);
1487 	}
1488 	return (NULL);
1489 }
1490 
1491 /*
1492  * This is called by the tx side; if an initialized AH is found in
1493  * the active list, it is locked down and can be used; if no entry
1494  * is found, an async request is queued to do path resolution.
1495  */
1496 static ibd_ace_t *
1497 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1498 {
1499 	ibd_ace_t *ptr;
1500 	ibd_req_t *req;
1501 
1502 	/*
1503 	 * Only attempt to print when we can; in the mdt pattr case, the
1504 	 * address is not aligned properly.
1505 	 */
1506 	if (((ulong_t)mac & 3) == 0) {
1507 		DPRINT(4,
1508 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1509 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1510 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1511 		    htonl(mac->ipoib_gidsuff[1]));
1512 	}
1513 
1514 	mutex_enter(&state->id_ac_mutex);
1515 
1516 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1517 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1518 		INC_REF(ptr, numwqe);
1519 		mutex_exit(&state->id_ac_mutex);
1520 		return (ptr);
1521 	}
1522 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1523 		state->id_ac_hot_ace = ptr;
1524 		mutex_exit(&state->id_ac_mutex);
1525 		return (ptr);
1526 	}
1527 
1528 	/*
1529 	 * Implementation of a single outstanding async request; if
1530 	 * the operation is not started yet, queue a request and move
1531 	 * to ongoing state. Remember in id_ah_addr for which address
1532 	 * we are queueing the request, in case we need to flag an error.
1533 	 * Any further requests, for the same or different address, until
1534 	 * the operation completes, are sent back to GLDv3 to be retried.
1535 	 * The async thread will update id_ah_op with an error indication
1536 	 * or will set it to indicate the next look up can start; either
1537 	 * way, it will mac_tx_update() so that all blocked requests come
1538 	 * back here.
1539 	 */
1540 	*err = EAGAIN;
1541 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1542 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1543 		if (req != NULL) {
1544 			/*
1545 			 * We did not even find the entry; queue a request
1546 			 * for it.
1547 			 */
1548 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1549 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1550 			state->id_ah_op = IBD_OP_ONGOING;
1551 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1552 		}
1553 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1554 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1555 		/*
1556 		 * Check the status of the pathrecord lookup request
1557 		 * we had queued before.
1558 		 */
1559 		if (state->id_ah_op == IBD_OP_ERRORED) {
1560 			*err = EFAULT;
1561 			state->id_ah_error++;
1562 		} else {
1563 			/*
1564 			 * IBD_OP_ROUTERED case: We need to send to the
1565 			 * all-router MCG. If we can find the AH for
1566 			 * the mcg, the Tx will be attempted. If we
1567 			 * do not find the AH, we return NORESOURCES
1568 			 * to retry.
1569 			 */
1570 			ipoib_mac_t routermac;
1571 
1572 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1573 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1574 			    numwqe);
1575 		}
1576 		state->id_ah_op = IBD_OP_NOTSTARTED;
1577 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1578 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1579 		/*
1580 		 * This case can happen when we get a higher band
1581 		 * packet. The easiest way is to reset the state machine
1582 		 * to accommodate the higher priority packet.
1583 		 */
1584 		state->id_ah_op = IBD_OP_NOTSTARTED;
1585 	}
1586 	mutex_exit(&state->id_ac_mutex);
1587 
1588 	return (ptr);
1589 }
1590 
1591 /*
1592  * Grab a not-currently-in-use AH/PathRecord from the active
1593  * list to recycle to a new destination. Only the async thread
1594  * executes this code.
1595  */
1596 static ibd_ace_t *
1597 ibd_acache_get_unref(ibd_state_t *state)
1598 {
1599 	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1600 	boolean_t try_rc_chan_recycle = B_FALSE;
1601 
1602 	ASSERT(mutex_owned(&state->id_ac_mutex));
1603 
1604 	/*
1605 	 * Do plain linear search.
1606 	 */
1607 	while (ptr != NULL) {
1608 		/*
1609 		 * Note that it is possible that the "cycle" bit
1610 		 * is set on the AH w/o any reference count. The
1611 		 * mcg must have been deleted, and the tx cleanup
1612 		 * just decremented the reference count to 0, but
1613 		 * hasn't gotten around to grabbing the id_ac_mutex
1614 		 * to move the AH into the free list.
1615 		 */
1616 		if (GET_REF(ptr) == 0) {
1617 			if (ptr->ac_chan != NULL) {
1618 				ASSERT(state->id_enable_rc == B_TRUE);
1619 				if (!try_rc_chan_recycle) {
1620 					try_rc_chan_recycle = B_TRUE;
1621 					ibd_rc_signal_ace_recycle(state, ptr);
1622 				}
1623 			} else {
1624 				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1625 				break;
1626 			}
1627 		}
1628 		ptr = list_prev(&state->id_ah_active, ptr);
1629 	}
1630 	return (ptr);
1631 }
1632 
1633 /*
1634  * Invoked to clean up AH from active list in case of multicast
1635  * disable, to handle sendonly memberships during mcg traps, and for
1636  * port up processing of multicast and unicast AHs.
1637  * Normally, the AH is taken off the active list, and put into
1638  * the free list to be recycled for a new destination. In case
1639  * Tx requests on the AH have not completed yet, the AH is marked
1640  * for reaping (which will put the AH on the free list) once the Tx's
1641  * complete; in this case, depending on the "force" input, we take
1642  * out the AH from the active list right now, or leave it also for
1643  * the reap operation. Returns TRUE if the AH is taken off the active
1644  * list (and either put into the free list right now, or arranged for
1645  * later), FALSE otherwise.
1646  */
1647 boolean_t
1648 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1649 {
1650 	ibd_ace_t *acactive;
1651 	boolean_t ret = B_TRUE;
1652 
1653 	ASSERT(mutex_owned(&state->id_ac_mutex));
1654 
1655 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1656 
1657 		/*
1658 		 * Note that the AH might already have the cycle bit set
1659 		 * on it; this might happen if sequences of multicast
1660 		 * enables and disables are coming so fast, that posted
1661 		 * Tx's to the mcg have not completed yet, and the cycle
1662 		 * bit is set successively by each multicast disable.
1663 		 */
1664 		if (SET_CYCLE_IF_REF(acactive)) {
1665 			if (!force) {
1666 				/*
1667 				 * The ace is kept on the active list, further
1668 				 * Tx's can still grab a reference on it; the
1669 				 * ace is reaped when all pending Tx's
1670 				 * referencing the AH complete.
1671 				 */
1672 				ret = B_FALSE;
1673 			} else {
1674 				/*
1675 				 * In the mcg trap case, we always pull the
1676 				 * In the mcg trap case, as well as in the
1677 				 * port up multi/unicast case, we always pull
1678 				 * the AH from the active list.
1679 				ASSERT(acactive->ac_chan == NULL);
1680 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1681 				acactive->ac_mce = NULL;
1682 			}
1683 		} else {
1684 			/*
1685 			 * The ref count is 0, so reclaim the ace
1686 			 * immediately after pulling it out of
1687 			 * the active list.
1688 			 */
1689 			ASSERT(acactive->ac_chan == NULL);
1690 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1691 			acactive->ac_mce = NULL;
1692 			IBD_ACACHE_INSERT_FREE(state, acactive);
1693 		}
1694 
1695 	}
1696 	return (ret);
1697 }
1698 
1699 /*
1700  * Helper function for async path record lookup. If we are trying to
1701  * Tx to a MCG, check our membership, possibly trying to join the
1702  * group if required. If that fails, try to send the packet to the
1703  * all router group (indicated by the redirect output), pointing
1704  * the input mac address to the router mcg address.
1705  */
1706 static ibd_mce_t *
1707 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1708 {
1709 	ib_gid_t mgid;
1710 	ibd_mce_t *mce;
1711 	ipoib_mac_t routermac;
1712 
1713 	*redirect = B_FALSE;
1714 	ibd_n2h_gid(mac, &mgid);
1715 
1716 	/*
1717 	 * Check the FullMember+SendOnlyNonMember list.
1718 	 * Since we are the only one who manipulates the
1719 	 * id_mc_full list, no locks are needed.
1720 	 */
1721 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1722 	if (mce != NULL) {
1723 		DPRINT(4, "ibd_async_mcache : already joined to group");
1724 		return (mce);
1725 	}
1726 
1727 	/*
1728 	 * Not found; try to join(SendOnlyNonMember) and attach.
1729 	 */
1730 	DPRINT(4, "ibd_async_mcache : not joined to group");
1731 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1732 	    NULL) {
1733 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1734 		return (mce);
1735 	}
1736 
1737 	/*
1738 	 * MCGroup not present; try to join the all-router group. If
1739 	 * any of the following steps succeed, we will be redirecting
1740 	 * to the all router group.
1741 	 */
1742 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1743 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1744 		return (NULL);
1745 	*redirect = B_TRUE;
1746 	ibd_n2h_gid(&routermac, &mgid);
1747 	bcopy(&routermac, mac, IPOIB_ADDRL);
1748 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1749 	    mgid.gid_prefix, mgid.gid_guid);
1750 
1751 	/*
1752 	 * Are we already joined to the router group?
1753 	 */
1754 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1755 		DPRINT(4, "ibd_async_mcache : using already joined router"
1756 		    " group\n");
1757 		return (mce);
1758 	}
1759 
1760 	/*
1761 	 * Can we join(SendOnlyNonMember) the router group?
1762 	 */
1763 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1764 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1765 	    NULL) {
1766 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1767 		return (mce);
1768 	}
1769 
1770 	return (NULL);
1771 }
1772 
1773 /*
1774  * Async path record lookup code.
1775  */
1776 static void
1777 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1778 {
1779 	ibd_ace_t *ce;
1780 	ibd_mce_t *mce = NULL;
1781 	ibt_path_attr_t path_attr;
1782 	ibt_path_info_t path_info;
1783 	ib_gid_t destgid;
1784 	char ret = IBD_OP_NOTSTARTED;
1785 
1786 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1787 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1788 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1789 	    htonl(mac->ipoib_gidsuff[1]));
1790 
1791 	/*
1792 	 * Check whether we are trying to transmit to a MCG.
1793 	 * In that case, we need to make sure we are a member of
1794 	 * the MCG.
1795 	 */
1796 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1797 		boolean_t redirected;
1798 
1799 		/*
1800 		 * If we can not find or join the group or even
1801 		 * redirect, error out.
1802 		 */
1803 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1804 		    NULL) {
1805 			state->id_ah_op = IBD_OP_ERRORED;
1806 			return;
1807 		}
1808 
1809 		/*
1810 		 * If we got redirected, we need to determine whether
1811 		 * the AH for the new mcg is already in the cache, in which
1812 		 * case we need not pull it in; otherwise proceed to get the
1813 		 * path for the new mcg. There is no guarantee that
1814 		 * if the AH is currently in the cache, it will still be
1815 		 * there when we look in ibd_acache_lookup(), but that's
1816 		 * okay, we will come back here.
1817 		 */
1818 		if (redirected) {
1819 			ret = IBD_OP_ROUTERED;
1820 			DPRINT(4, "ibd_async_acache :  redirected to "
1821 			    "%08X:%08X:%08X:%08X:%08X",
1822 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1823 			    htonl(mac->ipoib_gidpref[1]),
1824 			    htonl(mac->ipoib_gidsuff[0]),
1825 			    htonl(mac->ipoib_gidsuff[1]));
1826 
1827 			mutex_enter(&state->id_ac_mutex);
1828 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1829 				state->id_ah_op = IBD_OP_ROUTERED;
1830 				mutex_exit(&state->id_ac_mutex);
1831 				DPRINT(4, "ibd_async_acache : router AH found");
1832 				return;
1833 			}
1834 			mutex_exit(&state->id_ac_mutex);
1835 		}
1836 	}
1837 
1838 	/*
1839 	 * Get an AH from the free list.
1840 	 */
1841 	mutex_enter(&state->id_ac_mutex);
1842 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1843 		/*
1844 		 * No free ones; try to grab an unreferenced active
1845 		 * one. Maybe we need to make the active list LRU,
1846 		 * but that will create more work for Tx callbacks.
1847 		 * Is there a way of not having to pull out the
1848 		 * entry from the active list, but just indicate it
1849 		 * is being recycled? Yes, but that creates one more
1850 		 * check in the fast lookup path.
1851 		 */
1852 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1853 			/*
1854 			 * Pretty serious shortage now.
1855 			 */
1856 			state->id_ah_op = IBD_OP_NOTSTARTED;
1857 			mutex_exit(&state->id_ac_mutex);
1858 			DPRINT(10, "ibd_async_acache : failed to find AH "
1859 			    "slot\n");
1860 			return;
1861 		}
1862 		/*
1863 		 * We could check whether ac_mce points to a SendOnly
1864 		 * member and drop that membership now. Or do it lazily
1865 		 * at detach time.
1866 		 */
1867 		ce->ac_mce = NULL;
1868 	}
1869 	mutex_exit(&state->id_ac_mutex);
1870 	ASSERT(ce->ac_mce == NULL);
1871 
1872 	/*
1873 	 * Update the entry.
1874 	 */
1875 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1876 
1877 	bzero(&path_info, sizeof (path_info));
1878 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1879 	path_attr.pa_sgid = state->id_sgid;
1880 	path_attr.pa_num_dgids = 1;
1881 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1882 	path_attr.pa_dgids = &destgid;
1883 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1884 	path_attr.pa_pkey = state->id_pkey;
1885 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1886 	    &path_info, NULL) != IBT_SUCCESS) {
1887 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1888 		goto error;
1889 	}
1890 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1891 	    ntohl(ce->ac_mac.ipoib_qpn),
1892 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1893 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1894 		goto error;
1895 	}
1896 
1897 	/*
1898 	 * mce is set whenever an AH is being associated with a
1899 	 * MCG; this will come in handy when we leave the MCG. The
1900 	 * lock protects Tx fastpath from scanning the active list.
1901 	 */
1902 	if (mce != NULL)
1903 		ce->ac_mce = mce;
1904 
1905 	/*
1906 	 * Initiate an RC mode connection for a unicast address.
1907 	 */
1908 	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1909 	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1910 		ASSERT(ce->ac_chan == NULL);
1911 		DPRINT(10, "ibd_async_acache: call "
1912 		    "ibd_rc_try_connect(ace=%p)", ce);
1913 		ibd_rc_try_connect(state, ce, &path_info);
1914 		if (ce->ac_chan == NULL) {
1915 			DPRINT(10, "ibd_async_acache: failed to set up RC"
1916 			    " channel");
1917 			state->rc_conn_fail++;
1918 			goto error;
1919 		}
1920 	}
1921 
1922 	mutex_enter(&state->id_ac_mutex);
1923 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1924 	state->id_ah_op = ret;
1925 	mutex_exit(&state->id_ac_mutex);
1926 	return;
1927 error:
1928 	/*
1929 	 * We might want to drop SendOnly membership here if we
1930 	 * joined above. The lock protects Tx callbacks inserting
1931 	 * into the free list.
1932 	 */
1933 	mutex_enter(&state->id_ac_mutex);
1934 	state->id_ah_op = IBD_OP_ERRORED;
1935 	IBD_ACACHE_INSERT_FREE(state, ce);
1936 	mutex_exit(&state->id_ac_mutex);
1937 }
1938 
1939 /*
1940  * While restoring the port's presence on the subnet on a port up, it is
1941  * possible that the port goes down again.
1942  */
1943 static void
1944 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1945 {
1946 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1947 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1948 	    LINK_STATE_UP;
1949 	ibd_mce_t *mce, *pmce;
1950 	ibd_ace_t *ace, *pace;
1951 
1952 	DPRINT(10, "ibd_async_link(): %d", opcode);
1953 
1954 	/*
1955 	 * On a link up, revalidate the link speed/width. No point doing
1956 	 * this on a link down, since we will be unable to do SA operations,
1957 	 * defaulting to the lowest speed. Also notice that we update our
1958 	 * notion of speed before calling mac_link_update(), which will do
1959 	 * necessary higher level notifications for speed changes.
1960 	 */
1961 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1962 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1963 		state->id_link_speed = ibd_get_portspeed(state);
1964 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1965 	}
1966 
1967 	/*
1968 	 * Do all the work required to establish our presence on
1969 	 * the subnet.
1970 	 */
1971 	if (opcode == IBD_LINK_UP_ABSENT) {
1972 		/*
1973 		 * If in promiscuous mode ...
1974 		 */
1975 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1976 			/*
1977 			 * Drop all nonmembership.
1978 			 */
1979 			ibd_async_unsetprom(state);
1980 
1981 			/*
1982 			 * Then, try to regain nonmembership to all mcg's.
1983 			 */
1984 			ibd_async_setprom(state);
1985 
1986 		}
1987 
1988 		/*
1989 		 * Drop all sendonly membership (which also gets rid of the
1990 		 * AHs); try to reacquire all full membership.
1991 		 */
1992 		mce = list_head(&state->id_mc_full);
1993 		while ((pmce = mce) != NULL) {
1994 			mce = list_next(&state->id_mc_full, mce);
1995 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1996 				ibd_leave_group(state,
1997 				    pmce->mc_info.mc_adds_vect.av_dgid,
1998 				    IB_MC_JSTATE_SEND_ONLY_NON);
1999 			else
2000 				ibd_reacquire_group(state, pmce);
2001 		}
2002 
2003 		/*
2004 		 * Recycle all active AHs to free list (and if there are
2005 		 * pending posts, make sure they will go into the free list
2006 		 * once the Tx's complete). Grab the lock to prevent
2007 		 * concurrent Tx's as well as Tx cleanups.
2008 		 */
2009 		mutex_enter(&state->id_ac_mutex);
2010 		ace = list_head(&state->id_ah_active);
2011 		while ((pace = ace) != NULL) {
2012 			boolean_t cycled = B_TRUE;
2013 
2014 			ace = list_next(&state->id_ah_active, ace);
2015 			mce = pace->ac_mce;
2016 			if (pace->ac_chan != NULL) {
2017 				ASSERT(mce == NULL);
2018 				ASSERT(state->id_enable_rc == B_TRUE);
2019 				if (pace->ac_chan->chan_state ==
2020 				    IBD_RC_STATE_ACT_ESTAB) {
2021 					INC_REF(pace, 1);
2022 					IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2023 					pace->ac_chan->chan_state =
2024 					    IBD_RC_STATE_ACT_CLOSING;
2025 					ibd_rc_signal_act_close(state, pace);
2026 				} else {
2027 					state->rc_act_close_simultaneous++;
2028 					DPRINT(40, "ibd_async_link: other "
2029 					    "thread is closing it, ace=%p, "
2030 					    "ac_chan=%p, chan_state=%d",
2031 					    pace, pace->ac_chan,
2032 					    pace->ac_chan->chan_state);
2033 				}
2034 			} else {
2035 				cycled = ibd_acache_recycle(state,
2036 				    &pace->ac_mac, B_TRUE);
2037 			}
2038 			/*
2039 			 * If this is for an mcg, it must be for a fullmember,
2040 			 * since we got rid of send-only members above when
2041 			 * processing the mce list.
2042 			 */
2043 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2044 			    IB_MC_JSTATE_FULL)));
2045 
2046 			/*
2047 			 * Check if the fullmember mce needs to be torn down,
2048 			 * ie whether the DLPI disable has already been done.
2049 			 * If so, do some of the work of tx_cleanup, namely
2050 			 * causing leave (which will fail), detach and
2051 			 * mce-freeing. tx_cleanup will put the AH into free
2052 			 * list. The reason to duplicate some of this
2053 			 * tx_cleanup work is because we want to delete the
2054 			 * AH right now instead of waiting for tx_cleanup, to
2055 			 * force subsequent Tx's to reacquire an AH.
2056 			 */
2057 			if ((mce != NULL) && (mce->mc_fullreap))
2058 				ibd_async_reap_group(state, mce,
2059 				    mce->mc_info.mc_adds_vect.av_dgid,
2060 				    mce->mc_jstate);
2061 		}
2062 		mutex_exit(&state->id_ac_mutex);
2063 	}
2064 
2065 	/*
2066 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2067 	 * (which stops further events from being delivered) before
2068 	 * mac_unregister(). At this point, it is guaranteed that mac_register
2069 	 * has already been done.
2070 	 */
2071 	mutex_enter(&state->id_link_mutex);
2072 	state->id_link_state = lstate;
2073 	mac_link_update(state->id_mh, lstate);
2074 	mutex_exit(&state->id_link_mutex);
2075 
2076 	ibd_async_done(state);
2077 }
2078 
2079 /*
2080  * Check the pkey table to see if we can find the pkey we're looking for.
2081  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2082  * failure.
2083  */
2084 static int
2085 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2086     uint16_t *pkix)
2087 {
2088 	uint16_t ndx;
2089 
2090 	ASSERT(pkix != NULL);
2091 
2092 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2093 		if (pkey_tbl[ndx] == pkey) {
2094 			*pkix = ndx;
2095 			return (0);
2096 		}
2097 	}
2098 	return (-1);
2099 }
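
/*
 * For illustration (hypothetical values): with a pkey table of
 * { 0xffff, 0x8001 } and pkey_tbl_sz = 2, ibd_locate_pkey(tbl, 2, 0x8001,
 * &pkix) returns 0 and sets pkix to 1, while looking up a pkey that is
 * not present returns -1 and leaves pkix untouched.
 */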
2100 
2101 /*
2102  * Late HCA Initialization:
2103  * If plumb had succeeded without the availability of an active port or the
2104  * pkey, and either of their availability is now being indicated via PORT_UP
2105  * or PORT_CHANGE respectively, try a start of the interface.
2106  *
2107  * Normal Operation:
2108  * When the link is notified up, we need to do a few things, based
2109  * on the port's current p_init_type_reply claiming a reinit has been
2110  * done or not. The reinit steps are:
2111  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2112  *    the old Pkey and GID0 are correct.
2113  * 2. Register for mcg traps (already done by ibmf).
2114  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2115  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2116  * 4. Give up all sendonly memberships.
2117  * 5. Acquire all full memberships.
2118  * 6. In promiscuous mode, acquire all non memberships.
2119  * 7. Recycle all AHs to free list.
2120  */
2121 static void
2122 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2123 {
2124 	ibt_hca_portinfo_t *port_infop = NULL;
2125 	ibt_status_t ibt_status;
2126 	uint_t psize, port_infosz;
2127 	ibd_link_op_t opcode;
2128 	ibd_req_t *req;
2129 	link_state_t new_link_state = LINK_STATE_UP;
2130 	uint8_t itreply;
2131 	uint16_t pkix;
2132 	int ret;
2133 
2134 	/*
2135 	 * Let's not race with a plumb or an unplumb; if we detect a
2136 	 * pkey relocation event later on here, we may have to restart.
2137 	 */
2138 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2139 
2140 	mutex_enter(&state->id_link_mutex);
2141 
2142 	/*
2143 	 * If the link state is unknown, a plumb has not yet been attempted
2144 	 * on the interface. Nothing to do.
2145 	 */
2146 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2147 		mutex_exit(&state->id_link_mutex);
2148 		goto link_mod_return;
2149 	}
2150 
2151 	/*
2152 	 * If the link state is down, we are not in late HCA init, and the
2153 	 * interface was never successfully started, there is nothing to do.
2154 	 */
2155 	if ((state->id_link_state == LINK_STATE_DOWN) &&
2156 	    ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2157 	    ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2158 		mutex_exit(&state->id_link_mutex);
2159 		goto link_mod_return;
2160 	}
2161 
2162 	/*
2163 	 * If this routine was called in response to a port down event,
2164 	 * we just need to check whether the new state should be reported.
2165 	 */
2166 	if (code == IBT_ERROR_PORT_DOWN) {
2167 		new_link_state = LINK_STATE_DOWN;
2168 		goto update_link_state;
2169 	}
2170 
2171 	/*
2172 	 * If it's not a port down event we've received, try to get the port
2173 	 * attributes first. If we fail here, the port is as good as down.
2174 	 * Otherwise, if the link went down by the time the handler gets
2175 	 * here, give up - we cannot even validate the pkey/gid since those
2176 	 * are not valid and this is as bad as a port down anyway.
2177 	 */
2178 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2179 	    &port_infop, &psize, &port_infosz);
2180 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2181 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2182 		new_link_state = LINK_STATE_DOWN;
2183 		goto update_link_state;
2184 	}
2185 
2186 	/*
2187 	 * If in the previous attempt the pkey was not found, either due to the
2188 	 * port state being down or due to its absence in the pkey table,
2189 	 * look for it now and try to start the interface.
2190 	 */
2191 	if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2192 		mutex_exit(&state->id_link_mutex);
2193 		if ((ret = ibd_start(state)) != 0) {
2194 			DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2195 			    "init, ret=%d", ret);
2196 		}
2197 		ibt_free_portinfo(port_infop, port_infosz);
2198 		goto link_mod_return;
2199 	}
2200 
2201 	/*
2202 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2203 	 * PreserveContentReply are 0, we don't know anything about the
2204 	 * data loaded into the port attributes, so we need to verify
2205 	 * if gid0 and pkey are still valid.
2206 	 */
2207 	itreply = port_infop->p_init_type_reply;
2208 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2209 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2210 		/*
2211 		 * Check to see if the subnet part of GID0 has changed. If
2212 		 * not, check the simple case first to see if the pkey
2213 		 * index is the same as before; finally check to see if the
2214 		 * pkey has been relocated to a different index in the table.
2215 		 */
2216 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2217 		if (bcmp(port_infop->p_sgid_tbl,
2218 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2219 
2220 			new_link_state = LINK_STATE_DOWN;
2221 
2222 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2223 		    state->id_pkey) {
2224 
2225 			new_link_state = LINK_STATE_UP;
2226 
2227 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2228 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2229 
2230 			ibt_free_portinfo(port_infop, port_infosz);
2231 			mutex_exit(&state->id_link_mutex);
2232 
2233 			/*
2234 			 * Currently a restart is required if our pkey has moved
2235 			 * in the pkey table. If we get the ibt_recycle_ud() to
2236 			 * work as documented (expected), we may be able to
2237 			 * avoid a complete restart.  Note that we've already
2238 			 * marked both the start and stop 'in-progress' flags,
2239 			 * so it is ok to go ahead and do this restart.
2240 			 */
2241 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2242 			if ((ret = ibd_start(state)) != 0) {
2243 				DPRINT(10, "ibd_restart: cannot restart, "
2244 				    "ret=%d", ret);
2245 			}
2246 
2247 			goto link_mod_return;
2248 		} else {
2249 			new_link_state = LINK_STATE_DOWN;
2250 		}
2251 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2252 	}
2253 
2254 update_link_state:
2255 	if (port_infop) {
2256 		ibt_free_portinfo(port_infop, port_infosz);
2257 	}
2258 
2259 	/*
2260 	 * If we're reporting a link up, check InitTypeReply to see if
2261 	 * the SM has ensured that the port's presence in mcg, traps,
2262 	 * etc. is intact.
2263 	 */
2264 	if (new_link_state == LINK_STATE_DOWN) {
2265 		opcode = IBD_LINK_DOWN;
2266 	} else {
2267 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2268 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2269 			opcode = IBD_LINK_UP;
2270 		} else {
2271 			opcode = IBD_LINK_UP_ABSENT;
2272 		}
2273 	}
2274 
2275 	/*
2276 	 * If the old state is the same as the new state, and the SM indicated
2277 	 * no change in the port parameters, nothing to do.
2278 	 */
2279 	if ((state->id_link_state == new_link_state) && (opcode !=
2280 	    IBD_LINK_UP_ABSENT)) {
2281 		mutex_exit(&state->id_link_mutex);
2282 		goto link_mod_return;
2283 	}
2284 
2285 	/*
2286 	 * Ok, so there was a link state change; see if it's safe to ask
2287 	 * the async thread to do the work
2288 	 */
2289 	if (!ibd_async_safe(state)) {
2290 		state->id_link_state = new_link_state;
2291 		mutex_exit(&state->id_link_mutex);
2292 		goto link_mod_return;
2293 	}
2294 
2295 	mutex_exit(&state->id_link_mutex);
2296 
2297 	/*
2298 	 * Queue up a request for ibd_async_link() to handle this link
2299 	 * state change event
2300 	 */
2301 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2302 	req->rq_ptr = (void *)opcode;
2303 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2304 
2305 link_mod_return:
2306 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2307 }
2308 
2309 /*
2310  * For the port up/down events, IBTL guarantees there will not be concurrent
2311  * invocations of the handler. IBTL might coalesce link transition events,
2312  * and not invoke the handler for _each_ up/down transition, but it will
2313  * invoke the handler with the last known state.
2314  */
2315 static void
2316 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2317     ibt_async_code_t code, ibt_async_event_t *event)
2318 {
2319 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2320 
2321 	switch (code) {
2322 	case IBT_ERROR_CATASTROPHIC_CHAN:
2323 		ibd_print_warn(state, "catastrophic channel error");
2324 		break;
2325 	case IBT_ERROR_CQ:
2326 		ibd_print_warn(state, "completion queue error");
2327 		break;
2328 	case IBT_PORT_CHANGE_EVENT:
2329 		/*
2330 		 * Events will be delivered to all instances that have
2331 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2332 		 * Only need to do work for our port; IBTF will deliver
2333 		 * events for other ports on the hca we have ibt_open_hca'ed
2334 		 * too. Note that id_port is initialized in ibd_attach()
2335 		 * before we do an ibt_open_hca() in ibd_attach().
2336 		 */
2337 		ASSERT(state->id_hca_hdl == hca_hdl);
2338 		if (state->id_port != event->ev_port)
2339 			break;
2340 
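		/*
		 * Of the possible port changes, only a pkey table change is
		 * acted upon here; ibd_link_mod() revalidates the pkey and
		 * restarts the interface if the pkey has moved in the table
		 * (or starts it from late HCA init).
		 */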
2341 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2342 		    IBT_PORT_CHANGE_PKEY) {
2343 			ibd_link_mod(state, code);
2344 		}
2345 		break;
2346 	case IBT_ERROR_PORT_DOWN:
2347 	case IBT_CLNT_REREG_EVENT:
2348 	case IBT_EVENT_PORT_UP:
2349 		/*
2350 		 * Events will be delivered to all instances that have
2351 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2352 		 * Only need to do work for our port; IBTF will deliver
2353 		 * events for other ports on the hca we have ibt_open_hca'ed
2354 		 * too. Note that id_port is initialized in ibd_attach()
2355 		 * before we do an ibt_open_hca() in ibd_attach().
2356 		 */
2357 		ASSERT(state->id_hca_hdl == hca_hdl);
2358 		if (state->id_port != event->ev_port)
2359 			break;
2360 
2361 		ibd_link_mod(state, code);
2362 		break;
2363 
2364 	case IBT_HCA_ATTACH_EVENT:
2365 	case IBT_HCA_DETACH_EVENT:
2366 		/*
2367 		 * When a new card is plugged into the system, attach_event is
2368 		 * invoked. Additionally, a cfgadm needs to be run to make the
2369 		 * card known to the system, and an ifconfig needs to be run to
2370 		 * plumb up any ibd interfaces on the card. In the case of card
2371 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2372 		 * unplumb the ibd interfaces on the card; when the card is
2373 		 * actually unplugged, the detach_event is invoked;
2374 		 * additionally, if any ibd instances are still active on the
2375 		 * card (eg there were no associated RCM scripts), the driver's
2376 		 * detach routine is invoked.
2377 		 */
2378 		break;
2379 	default:
2380 		break;
2381 	}
2382 }
2383 
2384 static int
2385 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2386 {
2387 	mac_register_t *macp;
2388 	int ret;
2389 
2390 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2391 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2392 		return (DDI_FAILURE);
2393 	}
2394 
2395 	/*
2396 	 * Note that when we register with mac during attach, we don't
2397 	 * have the id_macaddr yet, so we'll simply be registering a
2398 	 * zero macaddr that we'll overwrite later during plumb (in
2399 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2400 	 * update the mac layer with the correct mtu during plumb.
2401 	 */
2402 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2403 	macp->m_driver = state;
2404 	macp->m_dip = dip;
2405 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2406 	macp->m_callbacks = &ibd_m_callbacks;
2407 	macp->m_min_sdu = 0;
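	/*
	 * Advertise the maximum SDU: the port driver registers the default
	 * RC max SDU, an RC-enabled partition advertises the RC MTU less
	 * the IPoIB header, and everything else uses the UD default.
	 */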
2408 	if (state->id_type == IBD_PORT_DRIVER) {
2409 		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2410 	} else if (state->id_enable_rc) {
2411 		macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2412 	} else {
2413 		macp->m_max_sdu = IBD_DEF_MAX_SDU;
2414 	}
2415 	macp->m_priv_props = ibd_priv_props;
2416 
2417 	/*
2418 	 *  Register ourselves with the GLDv3 interface
2419 	 */
2420 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2421 		mac_free(macp);
2422 		DPRINT(10,
2423 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2424 		return (DDI_FAILURE);
2425 	}
2426 
2427 	mac_free(macp);
2428 	return (DDI_SUCCESS);
2429 }
2430 
2431 static int
2432 ibd_record_capab(ibd_state_t *state)
2433 {
2434 	ibt_hca_attr_t hca_attrs;
2435 	ibt_status_t ibt_status;
2436 
2437 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2438 
2439 	/*
2440 	 * Query the HCA and fetch its attributes
2441 	 */
2442 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2443 	ASSERT(ibt_status == IBT_SUCCESS);
2444 
2445 	/*
2446 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2447 	 *    full checksum offload.
2448 	 */
2449 	if (state->id_enable_rc) {
2450 		state->id_hwcksum_capab = 0;
2451 	} else {
2452 		if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2453 		    == IBT_HCA_CKSUM_FULL) {
2454 			state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2455 		}
2456 	}
2457 
2458 	/*
2459 	 * 2. Set LSO policy, capability and maximum length
2460 	 */
2461 	if (state->id_enable_rc) {
2462 		state->id_lso_capable = B_FALSE;
2463 		state->id_lso_maxlen = 0;
2464 	} else {
2465 		if (hca_attrs.hca_max_lso_size > 0) {
2466 			state->id_lso_capable = B_TRUE;
2467 			if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2468 				state->id_lso_maxlen = IBD_LSO_MAXLEN;
2469 			else
2470 				state->id_lso_maxlen =
2471 				    hca_attrs.hca_max_lso_size;
2472 		} else {
2473 			state->id_lso_capable = B_FALSE;
2474 			state->id_lso_maxlen = 0;
2475 		}
2476 	}
2477 
2478 	/*
2479 	 * 3. Set Reserved L_Key capability
2480 	 */
2481 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2482 		state->id_hca_res_lkey_capab = 1;
2483 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2484 		state->rc_enable_iov_map = B_TRUE;
2485 	} else {
2486 		/* If no reserved lkey, we will not use ibt_map_mem_iov */
2487 		state->rc_enable_iov_map = B_FALSE;
2488 	}
2489 
2490 	/*
2491 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2492 	 *    size information is provided by the hca
2493 	 */
2494 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2495 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2496 		state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2497 	} else {
2498 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2499 		state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2500 	}
2501 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2502 		state->id_max_sqseg = IBD_MAX_SQSEG;
2503 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2504 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2505 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2506 	}
2507 	if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2508 		state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2509 	} else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2510 		ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2511 		    "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2512 	}
2513 
2514 	/*
2515 	 * Translating the virtual address regions into physical regions
2516 	 * for using the Reserved LKey feature results in a wr sgl that
2517 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2518 	 * we'll fix a high-water mark (65%) for when we should stop.
2519 	 */
2520 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2521 	state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2522 
2523 	/*
2524 	 * 5. Set number of recv and send wqes after checking hca maximum
2525 	 *    channel size. Store the max channel size in the state so that it
2526 	 *    can be referred to when the swqe/rwqe change is requested via
2527 	 *    dladm.
2528 	 */
2529 
2530 	state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2531 
2532 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2533 		state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2534 
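	/*
	 * Cap the number of rx buffers that may be outstanding at any time
	 * so that at least IBD_RWQE_MIN rwqes always remain with the driver.
	 */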
2535 	state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2536 	    IBD_RWQE_MIN;
2537 
2538 	if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2539 		state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2540 
2541 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2542 
2543 	return (DDI_SUCCESS);
2544 }
2545 
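/*
 * Report DDI_FAILURE if any rx buffers (UD rx list or RC SRQ list) are
 * still outstanding, so that the caller treats the partition as busy.
 */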
2546 static int
2547 ibd_part_busy(ibd_state_t *state)
2548 {
2549 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2550 		DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2551 		return (DDI_FAILURE);
2552 	}
2553 
2554 	if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2555 		DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2556 		return (DDI_FAILURE);
2557 	}
2558 
2559 	return (DDI_SUCCESS);
2560 }
2561 
2562 
2563 static void
2564 ibd_part_unattach(ibd_state_t *state)
2565 {
2566 	uint32_t progress = state->id_mac_state;
2567 	ibt_status_t ret;
2568 
2569 	/* make sure rx resources are freed */
2570 	ibd_free_rx_rsrcs(state);
2571 
2572 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2573 		ASSERT(state->id_enable_rc);
2574 		ibd_rc_fini_srq_list(state);
2575 		state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2576 	}
2577 
2578 	if (progress & IBD_DRV_MAC_REGISTERED) {
2579 		(void) mac_unregister(state->id_mh);
2580 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2581 	}
2582 
2583 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2584 		/*
2585 		 * No new async requests will be posted since the device
2586 		 * link state has been marked as unknown; completion handlers
2587 		 * have been turned off, so Tx handler will not cause any
2588 		 * more IBD_ASYNC_REAP requests.
2589 		 *
2590 		 * Queue a request for the async thread to exit, which will
2591 		 * be serviced after any pending ones. This can take a while,
2592 		 * especially if the SM is unreachable, since IBMF will slowly
2593 		 * time out each SM request issued by the async thread.  Reap
2594 		 * the thread before continuing on; we do not want it to be
2595 		 * lingering in modunloaded code.
2596 		 */
2597 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2598 		thread_join(state->id_async_thrid);
2599 
2600 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2601 	}
2602 
2603 	if (progress & IBD_DRV_REQ_LIST_INITED) {
2604 		list_destroy(&state->id_req_list);
2605 		mutex_destroy(&state->id_acache_req_lock);
2606 		cv_destroy(&state->id_acache_req_cv);
2607 		state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2608 	}
2609 
2610 	if (progress & IBD_DRV_PD_ALLOCD) {
2611 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2612 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2613 			ibd_print_warn(state, "failed to free "
2614 			    "protection domain, ret=%d", ret);
2615 		}
2616 		state->id_pd_hdl = NULL;
2617 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2618 	}
2619 
2620 	if (progress & IBD_DRV_HCA_OPENED) {
2621 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2622 		    IBT_SUCCESS) {
2623 			ibd_print_warn(state, "failed to close "
2624 			    "HCA device, ret=%d", ret);
2625 		}
2626 		state->id_hca_hdl = NULL;
2627 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2628 	}
2629 
2630 	mutex_enter(&ibd_gstate.ig_mutex);
2631 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2632 		if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2633 		    IBT_SUCCESS) {
2634 			ibd_print_warn(state,
2635 			    "ibt_detach() failed, ret=%d", ret);
2636 		}
2637 		state->id_ibt_hdl = NULL;
2638 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2639 		ibd_gstate.ig_ibt_hdl_ref_cnt--;
2640 	}
2641 	if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2642 	    (ibd_gstate.ig_ibt_hdl != NULL)) {
2643 		if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2644 		    IBT_SUCCESS) {
2645 			ibd_print_warn(state, "ibt_detach(): global "
2646 			    "failed, ret=%d", ret);
2647 		}
2648 		ibd_gstate.ig_ibt_hdl = NULL;
2649 	}
2650 	mutex_exit(&ibd_gstate.ig_mutex);
2651 
2652 	if (progress & IBD_DRV_TXINTR_ADDED) {
2653 		ddi_remove_softintr(state->id_tx);
2654 		state->id_tx = NULL;
2655 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2656 	}
2657 
2658 	if (progress & IBD_DRV_RXINTR_ADDED) {
2659 		ddi_remove_softintr(state->id_rx);
2660 		state->id_rx = NULL;
2661 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2662 	}
2663 
2664 #ifdef DEBUG
2665 	if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2666 		kstat_delete(state->rc_ksp);
2667 		state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2668 	}
2669 #endif
2670 
2671 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2672 		ibd_state_fini(state);
2673 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2674 	}
2675 }
2676 
2677 int
2678 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2679 {
2680 	ibt_status_t ret;
2681 	int rv;
2682 	kthread_t *kht;
2683 
2684 	/*
2685 	 * Initialize mutexes and condition variables
2686 	 */
2687 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2688 		DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2689 		return (DDI_FAILURE);
2690 	}
2691 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2692 
2693 	/*
2694 	 * Allocate rx,tx softintr
2695 	 */
2696 	if (ibd_rx_softintr == 1) {
2697 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2698 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2699 			DPRINT(10, "ibd_part_attach: failed in "
2700 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2701 			return (DDI_FAILURE);
2702 		}
2703 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2704 	}
2705 	if (ibd_tx_softintr == 1) {
2706 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2707 		    NULL, NULL, ibd_tx_recycle,
2708 		    (caddr_t)state)) != DDI_SUCCESS) {
2709 			DPRINT(10, "ibd_part_attach: failed in "
2710 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2711 			return (DDI_FAILURE);
2712 		}
2713 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2714 	}
2715 
2716 	/*
2717 	 * Attach to IBTL
2718 	 */
2719 	mutex_enter(&ibd_gstate.ig_mutex);
2720 	if (ibd_gstate.ig_ibt_hdl == NULL) {
2721 		if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2722 		    &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2723 			DPRINT(10, "ibd_part_attach: global: failed in "
2724 			    "ibt_attach(), ret=%d", ret);
2725 			mutex_exit(&ibd_gstate.ig_mutex);
2726 			return (DDI_FAILURE);
2727 		}
2728 	}
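	/*
	 * Each instance also attaches with its own IBTF handle below; the
	 * global ig_ibt_hdl (taken on the first attach above) is reference
	 * counted via ig_ibt_hdl_ref_cnt and is only detached in
	 * ibd_part_unattach() when the last instance goes away.
	 */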
2729 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2730 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2731 		DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2732 		    ret);
2733 		mutex_exit(&ibd_gstate.ig_mutex);
2734 		return (DDI_FAILURE);
2735 	}
2736 	ibd_gstate.ig_ibt_hdl_ref_cnt++;
2737 	mutex_exit(&ibd_gstate.ig_mutex);
2738 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2739 
2740 	/*
2741 	 * Open the HCA
2742 	 */
2743 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2744 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2745 		DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2746 		    ret);
2747 		return (DDI_FAILURE);
2748 	}
2749 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2750 
2751 #ifdef DEBUG
2752 	/* Initialize Driver Counters for Reliable Connected Mode */
2753 	if (state->id_enable_rc) {
2754 		if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2755 			DPRINT(10, "ibd_part_attach: failed in "
2756 			    "ibd_rc_init_stats");
2757 			return (DDI_FAILURE);
2758 		}
2759 		state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2760 	}
2761 #endif
2762 
2763 	/*
2764 	 * Record capabilities
2765 	 */
2766 	(void) ibd_record_capab(state);
2767 
2768 	/*
2769 	 * Allocate a protection domain on the HCA
2770 	 */
2771 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2772 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2773 		DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2774 		    ret);
2775 		return (DDI_FAILURE);
2776 	}
2777 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2778 
2779 
2780 	/*
2781 	 * We need to initialize the req_list that is required for the
2782 	 * operation of the async_thread.
2783 	 */
2784 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2785 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2786 	list_create(&state->id_req_list, sizeof (ibd_req_t),
2787 	    offsetof(ibd_req_t, rq_list));
2788 	state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2789 
2790 	/*
2791 	 * Create the async thread; thread_create never fails.
2792 	 */
2793 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2794 	    TS_RUN, minclsyspri);
2795 	state->id_async_thrid = kht->t_did;
2796 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2797 
2798 	return (DDI_SUCCESS);
2799 }
2800 
2801 /*
2802  * Attach device to the IO framework.
2803  */
2804 static int
2805 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2806 {
2807 	int ret;
2808 
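	/*
	 * Only DDI_ATTACH is handled; DDI_RESUME fails since ibd does not
	 * support suspend/resume (see ibd_detach()).
	 */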
2809 	switch (cmd) {
2810 		case DDI_ATTACH:
2811 			ret = ibd_port_attach(dip);
2812 			break;
2813 		default:
2814 			ret = DDI_FAILURE;
2815 			break;
2816 	}
2817 	return (ret);
2818 }
2819 
2820 /*
2821  * Detach device from the IO framework.
2822  */
2823 static int
2824 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2825 {
2826 	ibd_state_t *state;
2827 	int instance;
2828 
2829 	/*
2830 	 * IBD doesn't support suspend/resume
2831 	 */
2832 	if (cmd != DDI_DETACH)
2833 		return (DDI_FAILURE);
2834 
2835 	/*
2836 	 * Get the instance softstate
2837 	 */
2838 	instance = ddi_get_instance(dip);
2839 	state = ddi_get_soft_state(ibd_list, instance);
2840 
2841 	/*
2842 	 * Release all resources we're holding still.  Note that if we'd
2843 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2844 	 * so far, we should find all the flags we need in id_mac_state.
2845 	 */
2846 	return (ibd_port_unattach(state, dip));
2847 }
2848 
2849 /*
2850  * Pre ibt_attach() driver initialization
2851  */
2852 static int
2853 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2854 {
2855 	char buf[64];
2856 
2857 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2858 	state->id_link_state = LINK_STATE_UNKNOWN;
2859 
2860 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2861 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2862 	state->id_trap_stop = B_TRUE;
2863 	state->id_trap_inprog = 0;
2864 
2865 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2866 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2867 	state->id_dip = dip;
2868 
2869 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2870 
2871 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2872 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2873 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2874 	state->id_tx_busy = 0;
2875 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2876 
2877 	state->id_rx_list.dl_bufs_outstanding = 0;
2878 	state->id_rx_list.dl_cnt = 0;
2879 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2880 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
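	/*
	 * Create the kmem cache from which ibd_req_t work requests for the
	 * async thread are allocated; the cache name embeds the instance
	 * number and the pkey.
	 */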
2881 	(void) sprintf(buf, "ibd_req%d_%x", ddi_get_instance(dip),
2882 	    state->id_pkey);
2883 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2884 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2885 
2886 	/* For Reliable Connected Mode */
2887 	mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2888 	mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2889 	mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2890 	mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2891 	mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2892 	    MUTEX_DRIVER, NULL);
2893 
2894 	/*
2895 	 * Set the default link mode to RC. If this fails during connection
2896 	 * setup, the link mode is automatically transitioned to UD.
2897 	 * Also set the RC MTU.
2898 	 */
2899 	state->id_enable_rc = IBD_DEF_LINK_MODE;
2900 	state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2901 	state->id_mtu = IBD_DEF_MAX_MTU;
2902 
2903 	/* Initialize all tunables to their defaults */
2904 	state->id_lso_policy = IBD_DEF_LSO_POLICY;
2905 	state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2906 	state->id_num_ah = IBD_DEF_NUM_AH;
2907 	state->id_hash_size = IBD_DEF_HASH_SIZE;
2908 	state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2909 	state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2910 	state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2911 	state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2912 	state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2913 	state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2914 	state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2915 	state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2916 	state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2917 	state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2918 	state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2919 	state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2920 	state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2921 	state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2922 	state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2923 	state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2924 	state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2925 	state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2926 	state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2927 	state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2928 
2929 	return (DDI_SUCCESS);
2930 }
2931 
2932 /*
2933  * Post ibt_detach() driver deconstruction
2934  */
2935 static void
2936 ibd_state_fini(ibd_state_t *state)
2937 {
2938 	kmem_cache_destroy(state->id_req_kmc);
2939 
2940 	mutex_destroy(&state->id_rx_list.dl_mutex);
2941 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2942 
2943 	mutex_destroy(&state->id_txpost_lock);
2944 	mutex_destroy(&state->id_tx_list.dl_mutex);
2945 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2946 	mutex_destroy(&state->id_lso_lock);
2947 
2948 	mutex_destroy(&state->id_sched_lock);
2949 	mutex_destroy(&state->id_scq_poll_lock);
2950 	mutex_destroy(&state->id_rcq_poll_lock);
2951 
2952 	cv_destroy(&state->id_trap_cv);
2953 	mutex_destroy(&state->id_trap_lock);
2954 	mutex_destroy(&state->id_link_mutex);
2955 
2956 	/* For Reliable Connected Mode */
2957 	mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2958 	mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2959 	mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2960 	mutex_destroy(&state->rc_tx_large_bufs_lock);
2961 	mutex_destroy(&state->rc_rx_lock);
2962 }
2963 
2964 /*
2965  * Fetch link speed from SA for snmp ifspeed reporting.
2966  */
2967 static uint64_t
2968 ibd_get_portspeed(ibd_state_t *state)
2969 {
2970 	int			ret;
2971 	ibt_path_info_t		path;
2972 	ibt_path_attr_t		path_attr;
2973 	uint8_t			num_paths;
2974 	uint64_t		ifspeed;
2975 
2976 	/*
2977 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2978 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2979 	 * 2000000000. Start with that as default.
2980 	 */
2981 	ifspeed = 2000000000;
2982 
2983 	bzero(&path_attr, sizeof (path_attr));
2984 
2985 	/*
2986 	 * Get the port speed from Loopback path information.
2987 	 */
2988 	path_attr.pa_dgids = &state->id_sgid;
2989 	path_attr.pa_num_dgids = 1;
2990 	path_attr.pa_sgid = state->id_sgid;
2991 
2992 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2993 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2994 		goto earlydone;
2995 
2996 	if (num_paths < 1)
2997 		goto earlydone;
2998 
2999 	/*
3000 	 * In case SA does not return an expected value, report the default
3001 	 * speed as 1X.
3002 	 */
3003 	ret = 1;
3004 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3005 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
3006 			ret = 1;
3007 			break;
3008 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
3009 			ret = 4;
3010 			break;
3011 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
3012 			ret = 12;
3013 			break;
3014 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
3015 			ret = 2;
3016 			break;
3017 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
3018 			ret = 8;
3019 			break;
3020 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
3021 			ret = 16;
3022 			break;
3023 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
3024 			ret = 24;
3025 			break;
3026 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
3027 			ret = 32;
3028 			break;
3029 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
3030 			ret = 48;
3031 			break;
3032 	}
3033 
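	/*
	 * For example, a 4X DDR link (IBT_SRATE_20) yields ret = 8, so the
	 * reported ifspeed is 8 * 2 Gbps = 16 Gbps of data rate.
	 */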
3034 	ifspeed *= ret;
3035 
3036 earlydone:
3037 	return (ifspeed);
3038 }
3039 
3040 /*
3041  * Search input mcg list (id_mc_full or id_mc_non) for an entry
3042  * representing the input mcg mgid.
3043  */
3044 static ibd_mce_t *
3045 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3046 {
3047 	ibd_mce_t *ptr = list_head(mlist);
3048 
3049 	/*
3050 	 * Do plain linear search.
3051 	 */
3052 	while (ptr != NULL) {
3053 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3054 		    sizeof (ib_gid_t)) == 0)
3055 			return (ptr);
3056 		ptr = list_next(mlist, ptr);
3057 	}
3058 	return (NULL);
3059 }
3060 
3061 /*
3062  * Execute IBA JOIN.
3063  */
3064 static ibt_status_t
3065 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3066 {
3067 	ibt_mcg_attr_t mcg_attr;
3068 
3069 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3070 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3071 	mcg_attr.mc_mgid = mgid;
3072 	mcg_attr.mc_join_state = mce->mc_jstate;
3073 	mcg_attr.mc_scope = state->id_scope;
3074 	mcg_attr.mc_pkey = state->id_pkey;
3075 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3076 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3077 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3078 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3079 	    NULL, NULL));
3080 }
3081 
3082 /*
3083  * This code JOINs the port in the proper way (depending on the join
3084  * state) so that IBA fabric will forward mcg packets to/from the port.
3085  * It also attaches the QPN to the mcg so it can receive those mcg
3086  * packets. This code makes sure not to attach the mcg to the QP if
3087  * that has been previously done due to the mcg being joined with a
3088  * different join state, even though this is not required by SWG_0216,
3089  * refid 3610.
3090  */
3091 static ibd_mce_t *
3092 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3093 {
3094 	ibt_status_t ibt_status;
3095 	ibd_mce_t *mce, *tmce, *omce = NULL;
3096 	boolean_t do_attach = B_TRUE;
3097 
3098 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3099 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3100 
3101 	/*
3102 	 * For enable_multicast Full member joins, we need to do some
3103 	 * extra work. If there is already an mce on the list that
3104 	 * indicates full membership, that means the membership has
3105 	 * not yet been dropped (since the disable_multicast was issued)
3106 	 * because there are pending Tx's to the mcg; in that case, just
3107 	 * mark the mce not to be reaped when the Tx completion queues
3108 	 * an async reap operation.
3109 	 *
3110 	 * If there is already an mce on the list indicating sendonly
3111 	 * membership, try to promote to full membership. Be careful
3112 	 * not to deallocate the old mce, since there might be an AH
3113 	 * pointing to it; instead, update the old mce with new data
3114 	 * that tracks the full membership.
3115 	 */
3116 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3117 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3118 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3119 			ASSERT(omce->mc_fullreap);
3120 			omce->mc_fullreap = B_FALSE;
3121 			return (omce);
3122 		} else {
3123 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3124 		}
3125 	}
3126 
3127 	/*
3128 	 * Allocate the ibd_mce_t to track this JOIN.
3129 	 */
3130 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3131 	mce->mc_fullreap = B_FALSE;
3132 	mce->mc_jstate = jstate;
3133 
3134 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3135 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3136 		    ibt_status);
3137 		kmem_free(mce, sizeof (ibd_mce_t));
3138 		return (NULL);
3139 	}
3140 
3141 	/*
3142 	 * Is an IBA attach required? Not if the interface is already joined
3143 	 * to the mcg in a different appropriate join state.
3144 	 */
3145 	if (jstate == IB_MC_JSTATE_NON) {
3146 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3147 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3148 			do_attach = B_FALSE;
3149 	} else if (jstate == IB_MC_JSTATE_FULL) {
3150 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3151 			do_attach = B_FALSE;
3152 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3153 		do_attach = B_FALSE;
3154 	}
3155 
3156 	if (do_attach) {
3157 		/*
3158 		 * Do the IBA attach.
3159 		 */
3160 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3161 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3162 		    &mce->mc_info)) != IBT_SUCCESS) {
3163 			DPRINT(10, "ibd_join_group : failed qp attachment "
3164 			    "%d\n", ibt_status);
3165 			/*
3166 			 * NOTE that we should probably preserve the join info
3167 			 * in the list and later try to leave again at detach
3168 			 * time.
3169 			 */
3170 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3171 			    state->id_sgid, jstate);
3172 			kmem_free(mce, sizeof (ibd_mce_t));
3173 			return (NULL);
3174 		}
3175 	}
3176 
3177 	/*
3178 	 * Insert the ibd_mce_t in the proper list.
3179 	 */
3180 	if (jstate == IB_MC_JSTATE_NON) {
3181 		IBD_MCACHE_INSERT_NON(state, mce);
3182 	} else {
3183 		/*
3184 		 * Set up the mc_req fields used for reaping the
3185 		 * mcg in case of delayed tx completion (see
3186 		 * ibd_tx_cleanup()). Also done for sendonly join in
3187 		 * case we are promoted to fullmembership later and
3188 		 * keep using the same mce.
3189 		 */
3190 		mce->mc_req.rq_gid = mgid;
3191 		mce->mc_req.rq_ptr = mce;
3192 		/*
3193 		 * Check whether this is the case where we are trying to
3194 		 * join as a full member, but were already joined send only.
3195 		 * We try to drop our SendOnly membership, but it is
3196 		 * possible that the mcg does not exist anymore (and
3197 		 * the subnet trap never reached us), so the leave
3198 		 * operation might fail.
3199 		 */
3200 		if (omce != NULL) {
3201 			(void) ibt_leave_mcg(state->id_sgid, mgid,
3202 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3203 			omce->mc_jstate = IB_MC_JSTATE_FULL;
3204 			bcopy(&mce->mc_info, &omce->mc_info,
3205 			    sizeof (ibt_mcg_info_t));
3206 			kmem_free(mce, sizeof (ibd_mce_t));
3207 			return (omce);
3208 		}
3209 		mutex_enter(&state->id_mc_mutex);
3210 		IBD_MCACHE_INSERT_FULL(state, mce);
3211 		mutex_exit(&state->id_mc_mutex);
3212 	}
3213 
3214 	return (mce);
3215 }
3216 
3217 /*
3218  * Called during port up event handling to attempt to reacquire full
3219  * membership to an mcg. Stripped down version of ibd_join_group().
3220  * Note that it is possible that the mcg might have gone away, and
3221  * gets recreated at this point.
3222  */
3223 static void
3224 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3225 {
3226 	ib_gid_t mgid;
3227 
3228 	/*
3229 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
3230 	 * reap/leave is going to try to leave the group. We could prevent
3231 	 * that by adding a boolean flag into ibd_mce_t, if required.
3232 	 */
3233 	if (mce->mc_fullreap)
3234 		return;
3235 
3236 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
3237 
3238 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3239 	    mgid.gid_guid);
3240 
3241 	/* While reacquiring, leave and then join the MCG */
3242 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3243 	    mce->mc_jstate);
3244 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3245 		ibd_print_warn(state, "Failure on port up to rejoin "
3246 		    "multicast gid %016llx:%016llx",
3247 		    (u_longlong_t)mgid.gid_prefix,
3248 		    (u_longlong_t)mgid.gid_guid);
3249 }
3250 
3251 /*
3252  * This code handles delayed Tx completion cleanups for mcg's to which
3253  * disable_multicast has been issued, regular mcg related cleanups during
3254  * disable_multicast, disable_promiscuous and mcg traps, as well as
3255  * cleanups during driver detach time. Depending on the join state,
3256  * it deletes the mce from the appropriate list and issues the IBA
3257  * leave/detach; except in the disable_multicast case when the mce
3258  * is left on the active list for a subsequent Tx completion cleanup.
3259  */
3260 static void
3261 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3262     uint8_t jstate)
3263 {
3264 	ibd_mce_t *tmce;
3265 	boolean_t do_detach = B_TRUE;
3266 
3267 	/*
3268 	 * Before detaching, we must check whether the other list
3269 	 * contains the mcg; if we detach blindly, the consumer
3270 	 * who set up the other list will also stop receiving
3271 	 * traffic.
3272 	 */
3273 	if (jstate == IB_MC_JSTATE_FULL) {
3274 		/*
3275 		 * The following check is only relevant while coming
3276 		 * from the Tx completion path in the reap case.
3277 		 */
3278 		if (!mce->mc_fullreap)
3279 			return;
3280 		mutex_enter(&state->id_mc_mutex);
3281 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3282 		mutex_exit(&state->id_mc_mutex);
3283 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3284 			do_detach = B_FALSE;
3285 	} else if (jstate == IB_MC_JSTATE_NON) {
3286 		IBD_MCACHE_PULLOUT_NON(state, mce);
3287 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3288 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3289 			do_detach = B_FALSE;
3290 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3291 		mutex_enter(&state->id_mc_mutex);
3292 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3293 		mutex_exit(&state->id_mc_mutex);
3294 		do_detach = B_FALSE;
3295 	}
3296 
3297 	/*
3298 	 * If we are reacting to a mcg trap and leaving our sendonly or
3299 	 * non membership, the mcg is possibly already gone, so attempting
3300 	 * to leave might fail. On the other hand, we must try to leave
3301 	 * anyway, since this might be a trap from long ago, and we could
3302 	 * have potentially sendonly joined to a recent incarnation of
3303 	 * the mcg and are about to lose track of this information.
3304 	 */
3305 	if (do_detach) {
3306 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3307 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3308 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3309 	}
3310 
3311 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3312 	kmem_free(mce, sizeof (ibd_mce_t));
3313 }
3314 
3315 /*
3316  * Async code executed due to multicast and promiscuous disable requests
3317  * and mcg trap handling; also executed during driver detach. Mostly, a
3318  * leave and detach is done; except for the fullmember case when Tx
3319  * requests are pending, in which case arrangements are made for subsequent
3320  * cleanup on Tx completion.
3321  */
3322 static void
3323 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3324 {
3325 	ipoib_mac_t mcmac;
3326 	boolean_t recycled;
3327 	ibd_mce_t *mce;
3328 
3329 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3330 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3331 
3332 	if (jstate == IB_MC_JSTATE_NON) {
3333 		recycled = B_TRUE;
3334 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3335 		/*
3336 		 * In case we are handling a mcg trap, we might not find
3337 		 * the mcg in the non list.
3338 		 */
3339 		if (mce == NULL) {
3340 			return;
3341 		}
3342 	} else {
3343 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3344 
3345 		/*
3346 		 * In case we are handling a mcg trap, make sure the trap
3347 		 * is not arriving late; if we have an mce that indicates
3348 		 * that we are already a fullmember, that would be a clear
3349 		 * indication that the trap arrived late (i.e., is for a
3350 		 * previous incarnation of the mcg).
3351 		 */
3352 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3353 			if ((mce == NULL) || (mce->mc_jstate ==
3354 			    IB_MC_JSTATE_FULL)) {
3355 				return;
3356 			}
3357 		} else {
3358 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3359 
3360 			/*
3361 			 * If join group failed, mce will be NULL here.
3362 			 * This is because in the GLDv3 driver, set multicast
3363 			 * always returns success.
3364 			 */
3365 			if (mce == NULL) {
3366 				return;
3367 			}
3368 
3369 			mce->mc_fullreap = B_TRUE;
3370 		}
3371 
3372 		/*
3373 		 * If no pending Tx's remain that reference the AH
3374 		 * for the mcg, recycle it from active to free list.
3375 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3376 		 * so the last completing Tx will cause an async reap
3377 		 * operation to be invoked, at which time we will drop our
3378 		 * membership to the mcg so that the pending Tx's complete
3379 		 * successfully. Refer to comments on "AH and MCE active
3380 		 * list manipulation" at top of this file. The lock protects
3381 		 * against Tx fast path and Tx cleanup code.
3382 		 */
3383 		mutex_enter(&state->id_ac_mutex);
3384 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3385 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3386 		    IB_MC_JSTATE_SEND_ONLY_NON));
3387 		mutex_exit(&state->id_ac_mutex);
3388 	}
3389 
3390 	if (recycled) {
3391 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3392 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3393 		ibd_async_reap_group(state, mce, mgid, jstate);
3394 	}
3395 }
3396 
3397 /*
3398  * Find the broadcast address as defined by IPoIB; this implicitly
3399  * determines the IBA scope, mtu, tclass, etc. of the link the
3400  * interface is going to be a member of.
3401  */
3402 static ibt_status_t
3403 ibd_find_bgroup(ibd_state_t *state)
3404 {
3405 	ibt_mcg_attr_t mcg_attr;
3406 	uint_t numg;
3407 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3408 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3409 	    IB_MC_SCOPE_GLOBAL };
3410 	int i, mcgmtu;
3411 	boolean_t found = B_FALSE;
3412 	int ret;
3413 	ibt_mcg_info_t mcg_info;
3414 
3415 	state->id_bgroup_created = B_FALSE;
3416 	state->id_bgroup_present = B_FALSE;
3417 
3418 query_bcast_grp:
3419 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3420 	mcg_attr.mc_pkey = state->id_pkey;
3421 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3422 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3423 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3424 
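	/*
	 * Sketch of the broadcast MGID assembled below (assuming the
	 * standard 0xff10401b value for IB_MCGID_IPV4_PREFIX):
	 *
	 *   gid_prefix = ff1S:401b:PPPP:0000  (S = scope, PPPP = pkey)
	 *   gid_guid   = 0000:0000:ffff:ffff  (IB_MGID_IPV4_LOWGRP_MASK)
	 *
	 * i.e. the well-known group ff1S:401b:PPPP::ffff:ffff, tried with
	 * each scope in scopes[] until the query succeeds.
	 */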
3425 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3426 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3427 
3428 		/*
3429 		 * Look for the IPoIB broadcast group.
3430 		 */
3431 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3432 		state->id_mgid.gid_prefix =
3433 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3434 		    ((uint64_t)state->id_scope << 48) |
3435 		    ((uint32_t)(state->id_pkey << 16)));
3436 		mcg_attr.mc_mgid = state->id_mgid;
3437 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3438 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3439 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3440 			found = B_TRUE;
3441 			break;
3442 		}
3443 	}
3444 
3445 	if (!found) {
3446 		if (state->id_create_broadcast_group) {
3447 			/*
3448 			 * If we created the broadcast group, but failed to
3449 			 * find it, we can't do anything except leave the
3450 			 * one we created and return failure.
3451 			 */
3452 			if (state->id_bgroup_created) {
3453 				ibd_print_warn(state, "IPoIB broadcast group "
3454 				    "absent. Unable to query after create.");
3455 				goto find_bgroup_fail;
3456 			}
3457 
3458 			/*
3459 			 * Create the ipoib broadcast group if it didn't exist
3460 			 */
3461 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3462 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3463 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3464 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3465 			mcg_attr.mc_pkey = state->id_pkey;
3466 			mcg_attr.mc_flow = 0;
3467 			mcg_attr.mc_sl = 0;
3468 			mcg_attr.mc_tclass = 0;
3469 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3470 			state->id_mgid.gid_prefix =
3471 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3472 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3473 			    ((uint32_t)(state->id_pkey << 16)));
3474 			mcg_attr.mc_mgid = state->id_mgid;
3475 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3476 
3477 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3478 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3479 				ibd_print_warn(state, "IPoIB broadcast group "
3480 				    "absent, create failed: ret = %d\n", ret);
3481 				state->id_bgroup_created = B_FALSE;
3482 				return (IBT_FAILURE);
3483 			}
3484 			state->id_bgroup_created = B_TRUE;
3485 			goto query_bcast_grp;
3486 		} else {
3487 			ibd_print_warn(state, "IPoIB broadcast group absent");
3488 			return (IBT_FAILURE);
3489 		}
3490 	}
3491 
3492 	/*
3493 	 * Verify that the mcg mtu <= id_mtu, then adopt the mcg mtu as id_mtu.
3494 	 */
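	/*
	 * mc_mtu is an IB MTU enum rather than a byte count; 128 << enum
	 * converts it to bytes (e.g. an enum value of 4 yields 2048).
	 */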
3495 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3496 	if (state->id_mtu < mcgmtu) {
3497 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3498 		    "greater than port's maximum MTU %d", mcgmtu,
3499 		    state->id_mtu);
3500 		ibt_free_mcg_info(state->id_mcinfo, 1);
3501 		goto find_bgroup_fail;
3502 	}
3503 	state->id_mtu = mcgmtu;
3504 	state->id_bgroup_present = B_TRUE;
3505 
3506 	return (IBT_SUCCESS);
3507 
3508 find_bgroup_fail:
3509 	if (state->id_bgroup_created) {
3510 		(void) ibt_leave_mcg(state->id_sgid,
3511 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3512 		    IB_MC_JSTATE_FULL);
3513 	}
3514 
3515 	return (IBT_FAILURE);
3516 }
3517 
3518 static int
3519 ibd_alloc_tx_copybufs(ibd_state_t *state)
3520 {
3521 	ibt_mr_attr_t mem_attr;
3522 
3523 	/*
3524 	 * Allocate one big chunk for all regular tx copy bufs
3525 	 */
3526 	state->id_tx_buf_sz = state->id_mtu;
3527 	if (state->id_lso_policy && state->id_lso_capable &&
3528 	    (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3529 		state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3530 	}
3531 
3532 	state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3533 	    state->id_tx_buf_sz, KM_SLEEP);
3534 
3535 	state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3536 	    sizeof (ibd_swqe_t), KM_SLEEP);
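	/*
	 * Note: id_tx_bufs and id_tx_wqes are parallel arrays; in
	 * ibd_init_txlist(), swqe i is wired to the copy-buffer slice at
	 * id_tx_bufs + i * id_tx_buf_sz, all covered by the single lkey
	 * from the registration below.
	 */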
3537 
3538 	/*
3539 	 * Do one memory registration on the entire txbuf area
3540 	 */
3541 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3542 	mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3543 	mem_attr.mr_as = NULL;
3544 	mem_attr.mr_flags = IBT_MR_SLEEP;
3545 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3546 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3547 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3548 		kmem_free(state->id_tx_wqes,
3549 		    state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3550 		kmem_free(state->id_tx_bufs,
3551 		    state->id_ud_num_swqe * state->id_tx_buf_sz);
3552 		state->id_tx_bufs = NULL;
3553 		return (DDI_FAILURE);
3554 	}
3555 
3556 	return (DDI_SUCCESS);
3557 }
3558 
3559 static int
3560 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3561 {
3562 	ibt_mr_attr_t mem_attr;
3563 	ibd_lsobuf_t *buflist;
3564 	ibd_lsobuf_t *lbufp;
3565 	ibd_lsobuf_t *tail;
3566 	ibd_lsobkt_t *bktp;
3567 	uint8_t *membase;
3568 	uint8_t *memp;
3569 	uint_t memsz;
3570 	int i;
3571 
3572 	/*
3573 	 * Allocate the lso bucket
3574 	 */
3575 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3576 
3577 	/*
3578 	 * Allocate the entire lso memory and register it
3579 	 */
3580 	memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3581 	membase = kmem_zalloc(memsz, KM_SLEEP);
3582 
3583 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3584 	mem_attr.mr_len = memsz;
3585 	mem_attr.mr_as = NULL;
3586 	mem_attr.mr_flags = IBT_MR_SLEEP;
3587 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3588 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3589 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3590 		kmem_free(membase, memsz);
3591 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3592 		return (DDI_FAILURE);
3593 	}
3594 
3595 	mutex_enter(&state->id_lso_lock);
3596 
3597 	/*
3598 	 * Now allocate the buflist.  Note that the elements in the buflist and
3599 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3600 	 * can always derive the address of a buflist entry from the address of
3601 	 * an lso buffer.
3602 	 */
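	/*
	 * Concretely: buffer k lives at bkt_mem + k * IBD_LSO_BUFSZ, so a
	 * buffer address va maps back to bkt_bufl[(va - bkt_mem) /
	 * IBD_LSO_BUFSZ]; ibd_release_lsobufs() relies on exactly this.
	 */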
3603 	buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3604 	    KM_SLEEP);
3605 
3606 	/*
3607 	 * Set up the lso buf chain
3608 	 */
3609 	memp = membase;
3610 	lbufp = buflist;
3611 	for (i = 0; i < state->id_num_lso_bufs; i++) {
3612 		lbufp->lb_isfree = 1;
3613 		lbufp->lb_buf = memp;
3614 		lbufp->lb_next = lbufp + 1;
3615 
3616 		tail = lbufp;
3617 
3618 		memp += IBD_LSO_BUFSZ;
3619 		lbufp++;
3620 	}
3621 	tail->lb_next = NULL;
3622 
3623 	/*
3624 	 * Set up the LSO buffer information in ibd state
3625 	 */
3626 	bktp->bkt_bufl = buflist;
3627 	bktp->bkt_free_head = buflist;
3628 	bktp->bkt_mem = membase;
3629 	bktp->bkt_nelem = state->id_num_lso_bufs;
3630 	bktp->bkt_nfree = bktp->bkt_nelem;
3631 
3632 	state->id_lso = bktp;
3633 	mutex_exit(&state->id_lso_lock);
3634 
3635 	return (DDI_SUCCESS);
3636 }
3637 
3638 /*
3639  * Statically allocate Tx buffer list(s).
3640  */
3641 static int
3642 ibd_init_txlist(ibd_state_t *state)
3643 {
3644 	ibd_swqe_t *swqe;
3645 	ibt_lkey_t lkey;
3646 	int i;
3647 	uint_t len;
3648 	uint8_t *bufaddr;
3649 
3650 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3651 		return (DDI_FAILURE);
3652 
3653 	if (state->id_lso_policy && state->id_lso_capable) {
3654 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3655 			state->id_lso_capable = B_FALSE;
3656 	}
3657 
3658 	mutex_enter(&state->id_tx_list.dl_mutex);
3659 	state->id_tx_list.dl_head = NULL;
3660 	state->id_tx_list.dl_pending_sends = B_FALSE;
3661 	state->id_tx_list.dl_cnt = 0;
3662 	mutex_exit(&state->id_tx_list.dl_mutex);
3663 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3664 	state->id_tx_rel_list.dl_head = NULL;
3665 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3666 	state->id_tx_rel_list.dl_cnt = 0;
3667 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3668 
3669 	/*
3670 	 * Allocate and setup the swqe list
3671 	 */
3672 	lkey = state->id_tx_mr_desc.md_lkey;
3673 	bufaddr = state->id_tx_bufs;
3674 	len = state->id_tx_buf_sz;
3675 	swqe = state->id_tx_wqes;
3676 	mutex_enter(&state->id_tx_list.dl_mutex);
3677 	for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3678 		swqe->swqe_next = NULL;
3679 		swqe->swqe_im_mblk = NULL;
3680 
3681 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3682 		    bufaddr;
3683 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3684 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3685 
3686 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3687 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3688 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3689 
3690 		/* These are set in send */
3691 		swqe->w_swr.wr_nds = 0;
3692 		swqe->w_swr.wr_sgl = NULL;
3693 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3694 
3695 		/* add to list */
3696 		state->id_tx_list.dl_cnt++;
3697 		swqe->swqe_next = state->id_tx_list.dl_head;
3698 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3699 	}
3700 	mutex_exit(&state->id_tx_list.dl_mutex);
3701 
3702 	return (DDI_SUCCESS);
3703 }
3704 
3705 static int
3706 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3707     uint32_t *nds_p)
3708 {
3709 	ibd_lsobkt_t *bktp;
3710 	ibd_lsobuf_t *lbufp;
3711 	ibd_lsobuf_t *nextp;
3712 	ibt_lkey_t lso_lkey;
3713 	uint_t frag_sz;
3714 	uint_t num_needed;
3715 	int i;
3716 
3717 	ASSERT(sgl_p != NULL);
3718 	ASSERT(nds_p != NULL);
3719 	ASSERT(req_sz != 0);
3720 
3721 	/*
3722 	 * Determine how many bufs we'd need for the size requested
3723 	 */
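	/*
	 * For example, req_sz = 2 * IBD_LSO_BUFSZ + 100 gives num_needed = 3
	 * and frag_sz = 100; the last SGL entry is trimmed to frag_sz
	 * further below.
	 */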
3724 	num_needed = req_sz / IBD_LSO_BUFSZ;
3725 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3726 		num_needed++;
3727 
3728 	mutex_enter(&state->id_lso_lock);
3729 
3730 	/*
3731 	 * If we don't have enough lso bufs, return failure
3732 	 */
3733 	ASSERT(state->id_lso != NULL);
3734 	bktp = state->id_lso;
3735 	if (bktp->bkt_nfree < num_needed) {
3736 		mutex_exit(&state->id_lso_lock);
3737 		return (-1);
3738 	}
3739 
3740 	/*
3741 	 * Pick the first 'num_needed' bufs from the free list
3742 	 */
3743 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3744 	lbufp = bktp->bkt_free_head;
3745 	for (i = 0; i < num_needed; i++) {
3746 		ASSERT(lbufp->lb_isfree != 0);
3747 		ASSERT(lbufp->lb_buf != NULL);
3748 
3749 		nextp = lbufp->lb_next;
3750 
3751 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3752 		sgl_p[i].ds_key = lso_lkey;
3753 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3754 
3755 		lbufp->lb_isfree = 0;
3756 		lbufp->lb_next = NULL;
3757 
3758 		lbufp = nextp;
3759 	}
3760 	bktp->bkt_free_head = lbufp;
3761 
3762 	/*
3763 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3764 	 * to adjust the last sgl entry's length. Since we know we need at least
3765 	 * one, the i-1 use below is ok.
3766 	 */
3767 	if (frag_sz) {
3768 		sgl_p[i-1].ds_len = frag_sz;
3769 	}
3770 
3771 	/*
3772 	 * Update nfree count and return
3773 	 */
3774 	bktp->bkt_nfree -= num_needed;
3775 
3776 	mutex_exit(&state->id_lso_lock);
3777 
3778 	*nds_p = num_needed;
3779 
3780 	return (0);
3781 }
3782 
3783 static void
3784 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3785 {
3786 	ibd_lsobkt_t *bktp;
3787 	ibd_lsobuf_t *lbufp;
3788 	uint8_t *lso_mem_end;
3789 	uint_t ndx;
3790 	int i;
3791 
3792 	mutex_enter(&state->id_lso_lock);
3793 
3794 	bktp = state->id_lso;
3795 	ASSERT(bktp != NULL);
3796 
3797 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3798 	for (i = 0; i < nds; i++) {
3799 		uint8_t *va;
3800 
3801 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3802 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3803 
3804 		/*
3805 		 * Figure out the buflist element this sgl buffer corresponds
3806 		 * to and put it back at the head
3807 		 */
3808 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3809 		lbufp = bktp->bkt_bufl + ndx;
3810 
3811 		ASSERT(lbufp->lb_isfree == 0);
3812 		ASSERT(lbufp->lb_buf == va);
3813 
3814 		lbufp->lb_isfree = 1;
3815 		lbufp->lb_next = bktp->bkt_free_head;
3816 		bktp->bkt_free_head = lbufp;
3817 	}
3818 	bktp->bkt_nfree += nds;
3819 
3820 	mutex_exit(&state->id_lso_lock);
3821 }
3822 
3823 static void
3824 ibd_free_tx_copybufs(ibd_state_t *state)
3825 {
3826 	/*
3827 	 * Unregister txbuf mr
3828 	 */
3829 	if (ibt_deregister_mr(state->id_hca_hdl,
3830 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3831 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3832 	}
3833 	state->id_tx_mr_hdl = NULL;
3834 
3835 	/*
3836 	 * Free txbuf memory
3837 	 */
3838 	kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3839 	    sizeof (ibd_swqe_t));
3840 	kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3841 	    state->id_tx_buf_sz);
3842 	state->id_tx_wqes = NULL;
3843 	state->id_tx_bufs = NULL;
3844 }
3845 
3846 static void
3847 ibd_free_tx_lsobufs(ibd_state_t *state)
3848 {
3849 	ibd_lsobkt_t *bktp;
3850 
3851 	mutex_enter(&state->id_lso_lock);
3852 
3853 	if ((bktp = state->id_lso) == NULL) {
3854 		mutex_exit(&state->id_lso_lock);
3855 		return;
3856 	}
3857 
3858 	/*
3859 	 * First, free the buflist
3860 	 */
3861 	ASSERT(bktp->bkt_bufl != NULL);
3862 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3863 
3864 	/*
3865 	 * Unregister the LSO memory and free it
3866 	 */
3867 	ASSERT(bktp->bkt_mr_hdl != NULL);
3868 	if (ibt_deregister_mr(state->id_hca_hdl,
3869 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3870 		DPRINT(10,
3871 		    "ibd_free_tx_lsobufs: ibt_deregister_mr failed");
3872 	}
3873 	ASSERT(bktp->bkt_mem);
3874 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3875 
3876 	/*
3877 	 * Finally free the bucket
3878 	 */
3879 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3880 	state->id_lso = NULL;
3881 
3882 	mutex_exit(&state->id_lso_lock);
3883 }
3884 
3885 /*
3886  * Free the statically allocated Tx buffer list.
3887  */
3888 static void
3889 ibd_fini_txlist(ibd_state_t *state)
3890 {
3891 	/*
3892 	 * Free the allocated swqes
3893 	 */
3894 	mutex_enter(&state->id_tx_list.dl_mutex);
3895 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3896 	state->id_tx_list.dl_head = NULL;
3897 	state->id_tx_list.dl_pending_sends = B_FALSE;
3898 	state->id_tx_list.dl_cnt = 0;
3899 	state->id_tx_rel_list.dl_head = NULL;
3900 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3901 	state->id_tx_rel_list.dl_cnt = 0;
3902 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3903 	mutex_exit(&state->id_tx_list.dl_mutex);
3904 
3905 	ibd_free_tx_lsobufs(state);
3906 	ibd_free_tx_copybufs(state);
3907 }
3908 
3909 /*
3910  * Post a NULL-terminated list of rwqes.
3911  */
3912 static void
3913 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3914 {
3915 	uint_t		i;
3916 	uint_t		num_posted;
3917 	ibt_status_t	ibt_status;
3918 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3919 
3920 	while (rwqe) {
3921 		/* Post up to IBD_RX_POST_CNT receive work requests */
3922 		for (i = 0; i < IBD_RX_POST_CNT; i++) {
3923 			wrs[i] = rwqe->w_rwr;
3924 			rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3925 			if (rwqe == NULL) {
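				/*
				 * Count the wr just copied into wrs[i]
				 * before ending this batch.
				 */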
3926 				i++;
3927 				break;
3928 			}
3929 		}
3930 
3931 		/*
3932 		 * If posting fails for some reason, we'll never receive a
3933 		 * completion notification, so we'll need to clean up. But
3934 		 * we need to make sure we don't clean up nodes whose
3935 		 * wrs have been successfully posted. We assume that the
3936 		 * hca driver returns on the first failure to post and
3937 		 * therefore the first 'num_posted' entries don't need
3938 		 * cleanup here.
3939 		 */
3940 		atomic_add_32(&state->id_rx_list.dl_cnt, i);
3941 
3942 		num_posted = 0;
3943 		ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3944 		    &num_posted);
3945 		if (ibt_status != IBT_SUCCESS) {
3946 			/* This cannot happen unless the device has an error. */
3947 			ibd_print_warn(state, "ibd_post_recv: FATAL: "
3948 			    "posting multiple wrs failed: "
3949 			    "requested=%d, done=%d, ret=%d",
3950 			    IBD_RX_POST_CNT, num_posted, ibt_status);
3951 			atomic_add_32(&state->id_rx_list.dl_cnt,
3952 			    num_posted - i);
3953 		}
3954 	}
3955 }
3956 
3957 /*
3958  * Grab a list of rwqes from the array of lists, and post the list.
3959  */
3960 static void
3961 ibd_post_recv_intr(ibd_state_t *state)
3962 {
3963 	ibd_rx_queue_t	*rxp;
3964 	ibd_rwqe_t *list;
3965 
3966 	/* rotate through the rx_queue array, expecting an adequate number of queued rwqes */
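	/*
	 * id_rx_nqueues is a power of two (1 << IBD_LOG_RX_POST), so
	 * masking with (id_rx_nqueues - 1) wraps the index correctly.
	 */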
3967 	state->id_rx_post_queue_index =
3968 	    (state->id_rx_post_queue_index + 1) &
3969 	    (state->id_rx_nqueues - 1);
3970 
3971 	rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3972 	mutex_enter(&rxp->rx_post_lock);
3973 	list = WQE_TO_RWQE(rxp->rx_head);
3974 	rxp->rx_head = NULL;
3975 	rxp->rx_cnt = 0;
3976 	mutex_exit(&rxp->rx_post_lock);
3977 	ibd_post_recv_list(state, list);
3978 }
3979 
3980 /* macro explained below */
3981 #define	RX_QUEUE_HASH(rwqe) \
3982 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3983 
3984 /*
3985  * Add a rwqe to one of the Rx lists.  If the list is large enough
3986  * (close to IBD_RX_POST_CNT), post the list to the hardware.
3987  *
3988  * Note: one of 2^N lists is chosen via a hash.  This is done
3989  * because a single list would see contention.  If the first list is busy
3990  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3991  *
3992  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3993  * even distribution of mapping rwqes to the 2^N queues.
3994  */
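/*
 * Sketch: if id_rx_nqueues were 8, RX_QUEUE_HASH() would reduce to bits
 * 8..10 of the rwqe pointer, so rwqes from different parts of the wqe
 * array tend to land on different queues.
 */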
3995 static void
3996 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3997 {
3998 	ibd_rx_queue_t	*rxp;
3999 
4000 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
4001 
4002 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
4003 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
4004 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4005 		mutex_enter(&rxp->rx_post_lock);
4006 	}
4007 	rwqe->rwqe_next = rxp->rx_head;
4008 	if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4009 		uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4010 
4011 		/* only call ibt_post_recv() every Nth time through here */
4012 		if ((active & (state->id_rx_nqueues - 1)) == 0) {
4013 			rxp->rx_head = NULL;
4014 			rxp->rx_cnt = 0;
4015 			mutex_exit(&rxp->rx_post_lock);
4016 			ibd_post_recv_list(state, rwqe);
4017 			return;
4018 		}
4019 	}
4020 	rxp->rx_head = RWQE_TO_WQE(rwqe);
4021 	mutex_exit(&rxp->rx_post_lock);
4022 }
4023 
4024 static int
4025 ibd_alloc_rx_copybufs(ibd_state_t *state)
4026 {
4027 	ibt_mr_attr_t mem_attr;
4028 	int i;
4029 
4030 	/*
4031 	 * Allocate one big chunk for all regular rx copy bufs
4032 	 */
4033 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
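	/*
	 * UD receive buffers must leave room for the GRH that the HCA may
	 * deposit ahead of the IPoIB payload, hence IPOIB_GRH_SIZE is
	 * added on top of the MTU.
	 */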
4034 
4035 	state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4036 	    state->id_rx_buf_sz, KM_SLEEP);
4037 
4038 	state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4039 	    sizeof (ibd_rwqe_t), KM_SLEEP);
4040 
4041 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4042 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4043 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
4044 	for (i = 0; i < state->id_rx_nqueues; i++) {
4045 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4046 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4047 	}
4048 
4049 	/*
4050 	 * Do one memory registration on the entire rxbuf area
4051 	 */
4052 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4053 	mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4054 	mem_attr.mr_as = NULL;
4055 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4056 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4057 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4058 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4059 		kmem_free(state->id_rx_wqes,
4060 		    state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4061 		kmem_free(state->id_rx_bufs,
4062 		    state->id_ud_num_rwqe * state->id_rx_buf_sz);
4063 		state->id_rx_bufs = NULL;
4064 		state->id_rx_wqes = NULL;
4065 		return (DDI_FAILURE);
4066 	}
4067 
4068 	return (DDI_SUCCESS);
4069 }
4070 
4071 /*
4072  * Allocate the statically allocated Rx buffer list.
4073  */
4074 static int
4075 ibd_init_rxlist(ibd_state_t *state)
4076 {
4077 	ibd_rwqe_t *rwqe, *next;
4078 	ibd_wqe_t *list;
4079 	ibt_lkey_t lkey;
4080 	int i;
4081 	uint_t len;
4082 	uint8_t *bufaddr;
4083 
4084 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4085 	if (state->id_rx_free_list.dl_head != NULL) {
4086 		/* rx rsrcs were never freed.  Just repost them */
4087 		len = state->id_rx_buf_sz;
4088 		list = state->id_rx_free_list.dl_head;
4089 		state->id_rx_free_list.dl_head = NULL;
4090 		state->id_rx_free_list.dl_cnt = 0;
4091 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4092 		for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4093 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4094 			if ((rwqe->rwqe_im_mblk = desballoc(
4095 			    rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4096 			    &rwqe->w_freemsg_cb)) == NULL) {
4097 				/* allow freemsg_cb to free the rwqes */
4098 				if (atomic_dec_32_nv(&state->id_running) != 0) {
4099 					cmn_err(CE_WARN, "ibd_init_rxlist: "
4100 					    "id_running was not 1\n");
4101 				}
4102 				DPRINT(10, "ibd_init_rxlist : "
4103 				    "failed in desballoc()");
4104 				for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4105 				    rwqe = next) {
4106 					next = WQE_TO_RWQE(rwqe->rwqe_next);
4107 					if (rwqe->rwqe_im_mblk) {
4108 						atomic_inc_32(&state->
4109 						    id_rx_list.
4110 						    dl_bufs_outstanding);
4111 						freemsg(rwqe->rwqe_im_mblk);
4112 					} else
4113 						ibd_free_rwqe(state, rwqe);
4114 				}
4115 				atomic_inc_32(&state->id_running);
4116 				return (DDI_FAILURE);
4117 			}
4118 		}
4119 		ibd_post_recv_list(state, WQE_TO_RWQE(list));
4120 		return (DDI_SUCCESS);
4121 	}
4122 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4123 
4124 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4125 		return (DDI_FAILURE);
4126 
4127 	/*
4128 	 * Allocate and setup the rwqe list
4129 	 */
4130 	len = state->id_rx_buf_sz;
4131 	lkey = state->id_rx_mr_desc.md_lkey;
4132 	rwqe = state->id_rx_wqes;
4133 	bufaddr = state->id_rx_bufs;
4134 	list = NULL;
4135 	for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4136 		rwqe->w_state = state;
4137 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4138 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4139 
4140 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4141 
4142 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4143 		    &rwqe->w_freemsg_cb)) == NULL) {
4144 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4145 			/* allow freemsg_cb to free the rwqes */
4146 			if (atomic_dec_32_nv(&state->id_running) != 0) {
4147 				cmn_err(CE_WARN, "ibd_init_rxlist: "
4148 				    "id_running was not 1\n");
4149 			}
4152 			for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4153 			    rwqe = next) {
4154 				next = WQE_TO_RWQE(rwqe->rwqe_next);
4155 				freemsg(rwqe->rwqe_im_mblk);
4156 			}
4157 			atomic_inc_32(&state->id_running);
4158 
4159 			/* remove reference to free'd rwqes */
4160 			mutex_enter(&state->id_rx_free_list.dl_mutex);
4161 			state->id_rx_free_list.dl_head = NULL;
4162 			state->id_rx_free_list.dl_cnt = 0;
4163 			mutex_exit(&state->id_rx_free_list.dl_mutex);
4164 
4165 			ibd_fini_rxlist(state);
4166 			return (DDI_FAILURE);
4167 		}
4168 
4169 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4170 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
4171 		    (ib_vaddr_t)(uintptr_t)bufaddr;
4172 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4173 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4174 		rwqe->w_rwr.wr_nds = 1;
4175 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4176 
4177 		rwqe->rwqe_next = list;
4178 		list = RWQE_TO_WQE(rwqe);
4179 	}
4180 	ibd_post_recv_list(state, WQE_TO_RWQE(list));
4181 
4182 	return (DDI_SUCCESS);
4183 }
4184 
4185 static void
4186 ibd_free_rx_copybufs(ibd_state_t *state)
4187 {
4188 	int i;
4189 
4190 	/*
4191 	 * Unregister rxbuf mr
4192 	 */
4193 	if (ibt_deregister_mr(state->id_hca_hdl,
4194 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
4195 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4196 	}
4197 	state->id_rx_mr_hdl = NULL;
4198 
4199 	/*
4200 	 * Free rxbuf memory
4201 	 */
4202 	for (i = 0; i < state->id_rx_nqueues; i++) {
4203 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4204 		mutex_destroy(&rxp->rx_post_lock);
4205 	}
4206 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4207 	    sizeof (ibd_rx_queue_t));
4208 	kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4209 	    sizeof (ibd_rwqe_t));
4210 	kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4211 	    state->id_rx_buf_sz);
4212 	state->id_rx_queues = NULL;
4213 	state->id_rx_wqes = NULL;
4214 	state->id_rx_bufs = NULL;
4215 }
4216 
4217 static void
4218 ibd_free_rx_rsrcs(ibd_state_t *state)
4219 {
4220 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4221 	if (state->id_rx_free_list.dl_head == NULL) {
4222 		/* already freed */
4223 		mutex_exit(&state->id_rx_free_list.dl_mutex);
4224 		return;
4225 	}
4226 	ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4227 	ibd_free_rx_copybufs(state);
4228 	state->id_rx_free_list.dl_cnt = 0;
4229 	state->id_rx_free_list.dl_head = NULL;
4230 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4231 }
4232 
4233 /*
4234  * Free the statically allocated Rx buffer list.
4235  */
4236 static void
4237 ibd_fini_rxlist(ibd_state_t *state)
4238 {
4239 	ibd_rwqe_t *rwqe;
4240 	int i;
4241 
4242 	/* run through the rx_queue's, calling freemsg() */
4243 	for (i = 0; i < state->id_rx_nqueues; i++) {
4244 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4245 		mutex_enter(&rxp->rx_post_lock);
4246 		for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4247 		    rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4248 			freemsg(rwqe->rwqe_im_mblk);
4249 			rxp->rx_cnt--;
4250 		}
4251 		rxp->rx_head = NULL;
4252 		mutex_exit(&rxp->rx_post_lock);
4253 	}
4254 
4255 	/* cannot free rx resources unless gld returned everything */
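	/* (the add-of-zero below is simply an atomic read of the counter) */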
4256 	if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4257 		ibd_free_rx_rsrcs(state);
4258 }
4259 
4260 /*
4261  * Free an allocated recv wqe.
4262  */
4263 /* ARGSUSED */
4264 static void
4265 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4266 {
4267 	/*
4268 	 * desballoc() failed (no memory).
4269 	 *
4270 	 * This rwqe is placed on a free list so that it
4271 	 * can be reinstated when memory is available.
4272 	 *
4273 	 * NOTE: no code currently exists to reinstate
4274 	 * these "lost" rwqes.
4275 	 */
4276 	mutex_enter(&state->id_rx_free_list.dl_mutex);
4277 	state->id_rx_free_list.dl_cnt++;
4278 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4279 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4280 	mutex_exit(&state->id_rx_free_list.dl_mutex);
4281 }
4282 
4283 /*
4284  * IBA Rx completion queue handler. Guaranteed to be single
4285  * threaded and nonreentrant for this CQ.
4286  */
4287 /* ARGSUSED */
4288 static void
4289 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4290 {
4291 	ibd_state_t *state = (ibd_state_t *)arg;
4292 
4293 	atomic_inc_64(&state->id_num_intrs);
4294 
4295 	if (ibd_rx_softintr == 1) {
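		/*
		 * If a poll is already in progress, just flag it to
		 * re-poll (IBD_REDO_CQ_POLLING); otherwise trigger the
		 * Rx softintr to do the polling.
		 */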
4296 		mutex_enter(&state->id_rcq_poll_lock);
4297 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4298 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4299 			mutex_exit(&state->id_rcq_poll_lock);
4300 			return;
4301 		} else {
4302 			mutex_exit(&state->id_rcq_poll_lock);
4303 			ddi_trigger_softintr(state->id_rx);
4304 		}
4305 	} else
4306 		(void) ibd_intr((caddr_t)state);
4307 }
4308 
4309 /*
4310  * CQ handler for Tx completions, when the Tx CQ is in
4311  * interrupt driven mode.
4312  */
4313 /* ARGSUSED */
4314 static void
4315 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4316 {
4317 	ibd_state_t *state = (ibd_state_t *)arg;
4318 
4319 	atomic_inc_64(&state->id_num_intrs);
4320 
4321 	if (ibd_tx_softintr == 1) {
4322 		mutex_enter(&state->id_scq_poll_lock);
4323 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4324 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4325 			mutex_exit(&state->id_scq_poll_lock);
4326 			return;
4327 		} else {
4328 			mutex_exit(&state->id_scq_poll_lock);
4329 			ddi_trigger_softintr(state->id_tx);
4330 		}
4331 	} else
4332 		(void) ibd_tx_recycle((caddr_t)state);
4333 }
4334 
4335 /*
4336  * Multicast group create/delete trap handler. These will be delivered
4337  * on a kernel thread (handling can thus block) and can be invoked
4338  * concurrently. The handler can be invoked anytime after it is
4339  * registered and before ibt_detach().
4340  */
4341 /* ARGSUSED */
4342 static void
4343 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4344     ibt_subnet_event_t *event)
4345 {
4346 	ibd_state_t *state = (ibd_state_t *)arg;
4347 	ibd_req_t *req;
4348 
4349 	/*
4350 	 * The trap handler will get invoked once for every event for
4351 	 * every port. The input "gid" is the GID0 of the port the
4352 	 * trap came in on; we just need to act on traps that came
4353 	 * to our port, meaning the port on which the ipoib interface
4354 	 * resides. Since ipoib uses GID0 of the port, we just match
4355 	 * the gids to check whether we need to handle the trap.
4356 	 */
4357 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4358 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4359 		return;
4360 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4361 
4362 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4363 
4364 	switch (code) {
4365 		case IBT_SM_EVENT_UNAVAILABLE:
4366 			/*
4367 			 * If we are in promiscuous mode or have
4368 			 * sendnonmembers, we need to print a warning
4369 			 * message right now. Else, just store the
4370 			 * information, print when we enter promiscuous
4371 			 * mode or attempt nonmember send. We might
4372 			 * also want to stop caching sendnonmember.
4373 			 */
4374 			ibd_print_warn(state, "IBA multicast support "
4375 			    "degraded due to unavailability of multicast "
4376 			    "traps");
4377 			break;
4378 		case IBT_SM_EVENT_AVAILABLE:
4379 			/*
4380 			 * If we printed a warning message above or
4381 			 * while trying to nonmember send or get into
4382 			 * promiscuous mode, print an okay message.
4383 			 */
4384 			ibd_print_warn(state, "IBA multicast support "
4385 			    "restored due to availability of multicast "
4386 			    "traps");
4387 			break;
4388 		case IBT_SM_EVENT_MCG_CREATED:
4389 		case IBT_SM_EVENT_MCG_DELETED:
4390 			/*
4391 			 * If it is a "deleted" event and we are in late hca
4392 			 * init, nothing to do.
4393 			 */
4394 			if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4395 			    IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4396 			    IBT_SM_EVENT_MCG_DELETED)) {
4397 				break;
4398 			}
4399 			/*
4400 			 * Common processing of creation/deletion traps.
4401 			 * First check if the instance is being
4402 			 * [de]initialized; back off then, without doing
4403 			 * anything more, since we are not sure if the
4404 			 * async thread is around, or whether we might
4405 			 * be racing with the detach code in ibd_m_stop()
4406 			 * that scans the mcg list.
4407 			 */
4408 			if (!ibd_async_safe(state))
4409 				return;
4410 
4411 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4412 			req->rq_gid = event->sm_notice_gid;
4413 			req->rq_ptr = (void *)code;
4414 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4415 			break;
4416 	}
4417 }
4418 
4419 static void
4420 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4421 {
4422 	ib_gid_t mgid = req->rq_gid;
4423 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4424 	int ret;
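	/* the IPoIB MGID prefix carries the P_Key in bits 16..31 */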
4425 	ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4426 
4427 	DPRINT(10, "ibd_async_trap : %d\n", code);
4428 
4429 	/*
4430 	 * Check if we have already joined the IPoIB broadcast group for our
4431 	 * PKEY. If joined, perform the rest of the operation.
4432 	 * Else, the interface is not initialized. Do the initialization here
4433 	 * by calling ibd_start() and return.
4434 	 */
4435 
4436 	if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4437 	    IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4438 	    (code == IBT_SM_EVENT_MCG_CREATED)) {
4439 		/*
4440 		 * If we are in late HCA init and a notification for the
4441 		 * creation of a MCG came in, check if it is the IPoIB MCG for
4442 		 * this pkey. If not, return.
4443 		 */
4444 		if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4445 		    state->id_pkey)) {
4446 			ibd_async_done(state);
4447 			return;
4448 		}
4449 		ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4450 		/*
4451 		 * Check if there is still a necessity to start the interface.
4452 		 * It is possible that the user attempted unplumb at just about
4453 		 * the same time, and if unplumb succeeded, we have nothing to
4454 		 * do.
4455 		 */
4456 		if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4457 		    IBD_DRV_IN_LATE_HCA_INIT) &&
4458 		    ((ret = ibd_start(state)) != 0)) {
4459 			DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4460 			    "init, ret=%d", ret);
4461 		}
4462 		ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4463 		ibd_async_done(state);
4464 		return;
4465 	}
4466 
4467 	/*
4468 	 * Atomically search the nonmember and sendonlymember lists and
4469 	 * delete.
4470 	 */
4471 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4472 
4473 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4474 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4475 
4476 		/*
4477 		 * If in promiscuous mode, try to join/attach to the new
4478 		 * mcg. Given the unreliable out-of-order mode of trap
4479 		 * delivery, we can never be sure whether it is a problem
4480 		 * if the join fails. Thus, we warn the admin of a failure
4481 		 * if this was a creation trap. Note that the trap might
4482 		 * actually be reporting a long past event, and the mcg
4483 		 * might already have been deleted, thus we might be warning
4484 		 * in vain.
4485 		 */
4486 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4487 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4488 			ibd_print_warn(state, "IBA promiscuous mode missed "
4489 			    "new multicast gid %016llx:%016llx",
4490 			    (u_longlong_t)mgid.gid_prefix,
4491 			    (u_longlong_t)mgid.gid_guid);
4492 	}
4493 
4494 	/*
4495 	 * Free the request slot allocated by the subnet event thread.
4496 	 */
4497 	ibd_async_done(state);
4498 }
4499 
4500 /*
4501  * GLDv3 entry point to get capabilities.
4502  */
4503 static boolean_t
4504 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4505 {
4506 	ibd_state_t *state = arg;
4507 
4508 	if (state->id_type == IBD_PORT_DRIVER)
4509 		return (B_FALSE);
4510 
4511 	switch (cap) {
4512 	case MAC_CAPAB_HCKSUM: {
4513 		uint32_t *txflags = cap_data;
4514 
4515 		/*
4516 		 * We either do full checksum or no checksum at all
4517 		 */
4518 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4519 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4520 		else
4521 			return (B_FALSE);
4522 		break;
4523 	}
4524 
4525 	case MAC_CAPAB_LSO: {
4526 		mac_capab_lso_t *cap_lso = cap_data;
4527 
4528 		/*
4529 		 * In addition to the capability and policy, since LSO
4530 		 * relies on hw checksum, we'll not enable LSO if we
4531 		 * don't have hw checksum.  Of course, if the HCA doesn't
4532 		 * provide the reserved lkey capability, enabling LSO will
4533 		 * actually affect performance adversely, so we'll disable
4534 		 * LSO even for that case.
4535 		 */
4536 		if (!state->id_lso_policy || !state->id_lso_capable)
4537 			return (B_FALSE);
4538 
4539 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4540 			return (B_FALSE);
4541 
4542 		if (state->id_hca_res_lkey_capab == 0) {
4543 			ibd_print_warn(state, "no reserved-lkey capability, "
4544 			    "disabling LSO");
4545 			return (B_FALSE);
4546 		}
4547 
4548 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4549 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4550 		break;
4551 	}
4552 
4553 	default:
4554 		return (B_FALSE);
4555 	}
4556 
4557 	return (B_TRUE);
4558 }
4559 
4560 /*
4561  * callback function for set/get of properties
4562  */
4563 static int
4564 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4565     uint_t pr_valsize, const void *pr_val)
4566 {
4567 	ibd_state_t *state = arg;
4568 	int err = 0;
4569 	uint32_t link_mode;
4570 
4571 	/* Cannot set properties on a port driver */
4572 	if (state->id_type == IBD_PORT_DRIVER) {
4573 		return (ENOTSUP);
4574 	}
4575 
4576 	switch (pr_num) {
4577 		case MAC_PROP_IB_LINKMODE:
4578 			if (state->id_mac_state & IBD_DRV_STARTED) {
4579 				err = EBUSY;
4580 				break;
4581 			}
4582 			if (pr_val == NULL) {
4583 				err = EINVAL;
4584 				break;
4585 			}
4586 			bcopy(pr_val, &link_mode, sizeof (link_mode));
4587 			if (link_mode != IBD_LINK_MODE_UD &&
4588 			    link_mode != IBD_LINK_MODE_RC) {
4589 				err = EINVAL;
4590 			} else {
4591 				if (link_mode == IBD_LINK_MODE_RC) {
4592 					if (state->id_enable_rc) {
4593 						return (0);
4594 					}
4595 					state->id_enable_rc = 1;
4596 					/* inform MAC framework of new MTU */
4597 					err = mac_maxsdu_update(state->id_mh,
4598 					    state->rc_mtu - IPOIB_HDRSIZE);
4599 				} else {
4600 					if (!state->id_enable_rc) {
4601 						return (0);
4602 					}
4603 					state->id_enable_rc = 0;
4604 					err = mac_maxsdu_update(state->id_mh,
4605 					    state->id_mtu - IPOIB_HDRSIZE);
4606 				}
4607 				(void) ibd_record_capab(state);
4608 				mac_capab_update(state->id_mh);
4609 			}
4610 			break;
4611 		case MAC_PROP_PRIVATE:
4612 			err = ibd_set_priv_prop(state, pr_name,
4613 			    pr_valsize, pr_val);
4614 			break;
4615 		default:
4616 			err = ENOTSUP;
4617 			break;
4618 	}
4619 	return (err);
4620 }
4621 
4622 static int
4623 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4624     uint_t pr_valsize, void *pr_val)
4625 {
4626 	ibd_state_t *state = arg;
4627 	int err = 0;
4628 
4629 	switch (pr_num) {
4630 		case MAC_PROP_MTU:
4631 			break;
4632 		default:
4633 			if (state->id_type == IBD_PORT_DRIVER) {
4634 				return (ENOTSUP);
4635 			}
4636 			break;
4637 	}
4638 
4639 	switch (pr_num) {
4640 		case MAC_PROP_IB_LINKMODE:
4641 			*(uint_t *)pr_val = state->id_enable_rc;
4642 			break;
4643 		case MAC_PROP_PRIVATE:
4644 			err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4645 			    pr_val);
4646 			break;
4647 		default:
4648 			err = ENOTSUP;
4649 			break;
4650 	}
4651 	return (err);
4652 }
4653 
4654 static void
4655 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4656     mac_prop_info_handle_t prh)
4657 {
4658 	ibd_state_t *state = arg;
4659 
4660 	switch (pr_num) {
4661 	case MAC_PROP_IB_LINKMODE: {
4662 		mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4663 		break;
4664 	}
4665 	case MAC_PROP_MTU: {
4666 		uint32_t min, max;
4667 		if (state->id_type == IBD_PORT_DRIVER) {
4668 			min = 1500;
4669 			max = IBD_DEF_RC_MAX_SDU;
4670 		} else if (state->id_enable_rc) {
4671 			min = max = IBD_DEF_RC_MAX_SDU;
4672 		} else {
4673 			min = max = state->id_mtu - IPOIB_HDRSIZE;
4674 		}
4675 		mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4676 		mac_prop_info_set_range_uint32(prh, min, max);
4677 		break;
4678 	}
4679 	case MAC_PROP_PRIVATE: {
4680 		char valstr[64];
4681 		int value;
4682 
4683 		if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4684 			mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4685 			return;
4686 		} else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4687 			value = IBD_DEF_COALESCE_COMPLETIONS;
4688 		} else if (strcmp(pr_name,
4689 		    "_ibd_create_broadcast_group") == 0) {
4690 			value = IBD_DEF_CREATE_BCAST_GROUP;
4691 		} else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4692 			value = IBD_DEF_HASH_SIZE;
4693 		} else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4694 			value = IBD_DEF_LSO_POLICY;
4695 		} else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4696 			value = IBD_DEF_NUM_AH;
4697 		} else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4698 			value = IBD_DEF_NUM_LSO_BUFS;
4699 		} else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4700 			value = IBD_DEF_RC_ENABLE_SRQ;
4701 		} else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4702 			value = IBD_DEF_RC_NUM_RWQE;
4703 		} else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4704 			value = IBD_DEF_RC_NUM_SRQ;
4705 		} else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4706 			value = IBD_DEF_RC_NUM_SWQE;
4707 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4708 			value = IBD_DEF_RC_RX_COMP_COUNT;
4709 		} else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4710 			value = IBD_DEF_RC_RX_COMP_USEC;
4711 		} else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4712 			value = IBD_DEF_RC_RX_COPY_THRESH;
4713 		} else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4714 			value = IBD_DEF_RC_RX_RWQE_THRESH;
4715 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4716 			value = IBD_DEF_RC_TX_COMP_COUNT;
4717 		} else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4718 			value = IBD_DEF_RC_TX_COMP_USEC;
4719 		} else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4720 			value = IBD_DEF_RC_TX_COPY_THRESH;
4721 		} else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4722 			value = IBD_DEF_UD_NUM_RWQE;
4723 		} else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4724 			value = IBD_DEF_UD_NUM_SWQE;
4725 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4726 			value = IBD_DEF_UD_RX_COMP_COUNT;
4727 		} else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4728 			value = IBD_DEF_UD_RX_COMP_USEC;
4729 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4730 			value = IBD_DEF_UD_TX_COMP_COUNT;
4731 		} else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4732 			value = IBD_DEF_UD_TX_COMP_USEC;
4733 		} else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4734 			value = IBD_DEF_UD_TX_COPY_THRESH;
4735 		} else {
4736 			return;
4737 		}
4738 
4739 		(void) snprintf(valstr, sizeof (valstr), "%d", value);
4740 		mac_prop_info_set_default_str(prh, valstr);
4741 		break;
4742 	}
4743 	} /* switch (pr_num) */
4744 }
4745 
4746 /* ARGSUSED2 */
4747 static int
4748 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4749     uint_t pr_valsize, const void *pr_val)
4750 {
4751 	int err = 0;
4752 	long result;
4753 
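	/*
	 * Each private property below follows the same pattern: reject
	 * the set if the driver is started (or, for the *_comp_* knobs,
	 * if coalesce-completion tuning is disabled), parse the string
	 * with ddi_strtol(), range-check the result and store it.
	 */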
4754 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4755 		if (pr_val == NULL) {
4756 			return (EINVAL);
4757 		}
4758 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4759 		if (result < 0 || result > 1) {
4760 			err = EINVAL;
4761 		} else {
4762 			state->id_allow_coalesce_comp_tuning = (result == 1) ?
4763 			    B_TRUE: B_FALSE;
4764 		}
4765 		return (err);
4766 	}
4767 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4768 		if (state->id_mac_state & IBD_DRV_STARTED) {
4769 			return (EBUSY);
4770 		}
4771 		if (pr_val == NULL) {
4772 			return (EINVAL);
4773 		}
4774 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4775 		if (result < 0 || result > 1) {
4776 			err = EINVAL;
4777 		} else {
4778 			state->id_create_broadcast_group = (result == 1) ?
4779 			    B_TRUE: B_FALSE;
4780 		}
4781 		return (err);
4782 	}
4783 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4784 		if (state->id_mac_state & IBD_DRV_STARTED) {
4785 			return (EBUSY);
4786 		}
4787 		if (pr_val == NULL) {
4788 			return (EINVAL);
4789 		}
4790 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4791 		if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4792 			err = EINVAL;
4793 		} else {
4794 			state->id_hash_size = (uint32_t)result;
4795 		}
4796 		return (err);
4797 	}
4798 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4799 		if (state->id_mac_state & IBD_DRV_STARTED) {
4800 			return (EBUSY);
4801 		}
4802 		if (pr_val == NULL) {
4803 			return (EINVAL);
4804 		}
4805 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4806 		if (result < 0 || result > 1) {
4807 			err = EINVAL;
4808 		} else {
4809 			state->id_lso_policy = (result == 1) ?
4810 			    B_TRUE: B_FALSE;
4811 		}
4812 		mac_capab_update(state->id_mh);
4813 		return (err);
4814 	}
4815 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4816 		if (state->id_mac_state & IBD_DRV_STARTED) {
4817 			return (EBUSY);
4818 		}
4819 		if (pr_val == NULL) {
4820 			return (EINVAL);
4821 		}
4822 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4823 		if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4824 			err = EINVAL;
4825 		} else {
4826 			state->id_num_ah = (uint32_t)result;
4827 		}
4828 		return (err);
4829 	}
4830 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4831 		if (state->id_mac_state & IBD_DRV_STARTED) {
4832 			return (EBUSY);
4833 		}
4834 		if (!state->id_lso_policy || !state->id_lso_capable) {
4835 			return (EINVAL);
4836 		}
4837 		if (pr_val == NULL) {
4838 			return (EINVAL);
4839 		}
4840 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4841 		if (result < IBD_MIN_NUM_LSO_BUFS ||
4842 		    result > IBD_MAX_NUM_LSO_BUFS) {
4843 			err = EINVAL;
4844 		} else {
4845 			state->id_num_lso_bufs = (uint32_t)result;
4846 		}
4847 		return (err);
4848 	}
4849 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4850 		if (state->id_mac_state & IBD_DRV_STARTED) {
4851 			return (EBUSY);
4852 		}
4853 		if (pr_val == NULL) {
4854 			return (EINVAL);
4855 		}
4856 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4857 		if (result < 0 || result > 1) {
4858 			err = EINVAL;
4859 		} else {
4860 			state->rc_enable_srq = (result == 1) ?
4861 			    B_TRUE: B_FALSE;
4862 		}
4863 		if (!state->rc_enable_srq) {
4864 			state->id_rc_num_srq = 0;
4865 		}
4866 		return (err);
4867 	}
4868 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4869 		if (state->id_mac_state & IBD_DRV_STARTED) {
4870 			return (EBUSY);
4871 		}
4872 		if (pr_val == NULL) {
4873 			return (EINVAL);
4874 		}
4875 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4876 		if (result < IBD_MIN_RC_NUM_RWQE ||
4877 		    result > IBD_MAX_RC_NUM_RWQE) {
4878 			err = EINVAL;
4879 		} else {
4880 			state->id_rc_num_rwqe = (uint32_t)result;
4881 			if (state->id_allow_coalesce_comp_tuning &&
4882 			    state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4883 				state->id_rc_rx_comp_count =
4884 				    state->id_rc_num_rwqe;
4885 			if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4886 				state->id_rc_num_srq =
4887 				    state->id_rc_num_rwqe - 1;
4888 			/*
4889 			 * If rx_rwqe_threshold is greater than the number of
4890 			 * rwqes, pull it back to 25% of number of rwqes.
4891 			 */
4892 			if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4893 				state->id_rc_rx_rwqe_thresh =
4894 				    (state->id_rc_num_rwqe >> 2);
4895 
4896 		}
4897 		return (err);
4898 	}
4899 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4900 		if (state->id_mac_state & IBD_DRV_STARTED) {
4901 			return (EBUSY);
4902 		}
4903 		if (pr_val == NULL) {
4904 			return (EINVAL);
4905 		}
4906 		if (!state->rc_enable_srq)
4907 			return (EINVAL);
4908 
4909 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4910 		if (result < IBD_MIN_RC_NUM_SRQ ||
4911 		    result >= state->id_rc_num_rwqe) {
4912 			err = EINVAL;
4913 		} else
4914 			state->id_rc_num_srq = (uint32_t)result;
4915 		return (err);
4916 	}
4917 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4918 		if (state->id_mac_state & IBD_DRV_STARTED) {
4919 			return (EBUSY);
4920 		}
4921 		if (pr_val == NULL) {
4922 			return (EINVAL);
4923 		}
4924 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4925 		if (result < IBD_MIN_RC_NUM_SWQE ||
4926 		    result > IBD_MAX_RC_NUM_SWQE) {
4927 			err = EINVAL;
4928 		} else {
4929 			state->id_rc_num_swqe = (uint32_t)result;
4930 			if (state->id_allow_coalesce_comp_tuning &&
4931 			    state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4932 				state->id_rc_tx_comp_count =
4933 				    state->id_rc_num_swqe;
4934 		}
4935 		return (err);
4936 	}
4937 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4938 		if (!state->id_allow_coalesce_comp_tuning) {
4939 			return (ENOTSUP);
4940 		}
4941 		if (pr_val == NULL) {
4942 			return (EINVAL);
4943 		}
4944 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4945 		if (result < 1 || result > state->id_rc_num_rwqe) {
4946 			err = EINVAL;
4947 		} else {
4948 			state->id_rc_rx_comp_count = (uint32_t)result;
4949 		}
4950 		return (err);
4951 	}
4952 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4953 		if (!state->id_allow_coalesce_comp_tuning) {
4954 			return (ENOTSUP);
4955 		}
4956 		if (pr_val == NULL) {
4957 			return (EINVAL);
4958 		}
4959 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4960 		if (result < 1) {
4961 			err = EINVAL;
4962 		} else {
4963 			state->id_rc_rx_comp_usec = (uint32_t)result;
4964 		}
4965 		return (err);
4966 	}
4967 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4968 		if (state->id_mac_state & IBD_DRV_STARTED) {
4969 			return (EBUSY);
4970 		}
4971 		if (pr_val == NULL) {
4972 			return (EINVAL);
4973 		}
4974 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4975 		if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4976 		    result > state->rc_mtu) {
4977 			err = EINVAL;
4978 		} else {
4979 			state->id_rc_rx_copy_thresh = (uint32_t)result;
4980 		}
4981 		return (err);
4982 	}
4983 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4984 		if (state->id_mac_state & IBD_DRV_STARTED) {
4985 			return (EBUSY);
4986 		}
4987 		if (pr_val == NULL) {
4988 			return (EINVAL);
4989 		}
4990 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4991 		if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4992 		    result >= state->id_rc_num_rwqe) {
4993 			err = EINVAL;
4994 		} else {
4995 			state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4996 		}
4997 		return (err);
4998 	}
4999 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5000 		if (!state->id_allow_coalesce_comp_tuning) {
5001 			return (ENOTSUP);
5002 		}
5003 		if (pr_val == NULL) {
5004 			return (EINVAL);
5005 		}
5006 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5007 		if (result < 1 || result > state->id_rc_num_swqe) {
5008 			err = EINVAL;
5009 		} else {
5010 			state->id_rc_tx_comp_count = (uint32_t)result;
5011 		}
5012 		return (err);
5013 	}
5014 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5015 		if (!state->id_allow_coalesce_comp_tuning) {
5016 			return (ENOTSUP);
5017 		}
5018 		if (pr_val == NULL) {
5019 			return (EINVAL);
5020 		}
5021 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5022 		if (result < 1)
5023 			err = EINVAL;
5024 		else {
5025 			state->id_rc_tx_comp_usec = (uint32_t)result;
5026 		}
5027 		return (err);
5028 	}
5029 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5030 		if (state->id_mac_state & IBD_DRV_STARTED) {
5031 			return (EBUSY);
5032 		}
5033 		if (pr_val == NULL) {
5034 			return (EINVAL);
5035 		}
5036 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5037 		if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5038 		    result > state->rc_mtu) {
5039 			err = EINVAL;
5040 		} else {
5041 			state->id_rc_tx_copy_thresh = (uint32_t)result;
5042 		}
5043 		return (err);
5044 	}
5045 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5046 		if (state->id_mac_state & IBD_DRV_STARTED) {
5047 			return (EBUSY);
5048 		}
5049 		if (pr_val == NULL) {
5050 			return (EINVAL);
5051 		}
5052 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5053 		if (result < IBD_MIN_UD_NUM_RWQE ||
5054 		    result > IBD_MAX_UD_NUM_RWQE) {
5055 			err = EINVAL;
5056 		} else {
5057 			if (result > state->id_hca_max_chan_sz) {
5058 				state->id_ud_num_rwqe =
5059 				    state->id_hca_max_chan_sz;
5060 			} else {
5061 				state->id_ud_num_rwqe = (uint32_t)result;
5062 			}
5063 			if (state->id_allow_coalesce_comp_tuning &&
5064 			    state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5065 				state->id_ud_rx_comp_count =
5066 				    state->id_ud_num_rwqe;
5067 		}
5068 		return (err);
5069 	}
5070 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5071 		if (state->id_mac_state & IBD_DRV_STARTED) {
5072 			return (EBUSY);
5073 		}
5074 		if (pr_val == NULL) {
5075 			return (EINVAL);
5076 		}
5077 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5078 		if (result < IBD_MIN_UD_NUM_SWQE ||
5079 		    result > IBD_MAX_UD_NUM_SWQE) {
5080 			err = EINVAL;
5081 		} else {
5082 			if (result > state->id_hca_max_chan_sz) {
5083 				state->id_ud_num_swqe =
5084 				    state->id_hca_max_chan_sz;
5085 			} else {
5086 				state->id_ud_num_swqe = (uint32_t)result;
5087 			}
5088 			if (state->id_allow_coalesce_comp_tuning &&
5089 			    state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5090 				state->id_ud_tx_comp_count =
5091 				    state->id_ud_num_swqe;
5092 		}
5093 		return (err);
5094 	}
5095 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5096 		if (!state->id_allow_coalesce_comp_tuning) {
5097 			return (ENOTSUP);
5098 		}
5099 		if (pr_val == NULL) {
5100 			return (EINVAL);
5101 		}
5102 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5103 		if (result < 1 || result > state->id_ud_num_rwqe) {
5104 			err = EINVAL;
5105 		} else {
5106 			state->id_ud_rx_comp_count = (uint32_t)result;
5107 		}
5108 		return (err);
5109 	}
5110 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5111 		if (!state->id_allow_coalesce_comp_tuning) {
5112 			return (ENOTSUP);
5113 		}
5114 		if (pr_val == NULL) {
5115 			return (EINVAL);
5116 		}
5117 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5118 		if (result < 1) {
5119 			err = EINVAL;
5120 		} else {
5121 			state->id_ud_rx_comp_usec = (uint32_t)result;
5122 		}
5123 		return (err);
5124 	}
5125 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5126 		if (!state->id_allow_coalesce_comp_tuning) {
5127 			return (ENOTSUP);
5128 		}
5129 		if (pr_val == NULL) {
5130 			return (EINVAL);
5131 		}
5132 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5133 		if (result < 1 || result > state->id_ud_num_swqe) {
5134 			err = EINVAL;
5135 		} else {
5136 			state->id_ud_tx_comp_count = (uint32_t)result;
5137 		}
5138 		return (err);
5139 	}
5140 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5141 		if (!state->id_allow_coalesce_comp_tuning) {
5142 			return (ENOTSUP);
5143 		}
5144 		if (pr_val == NULL) {
5145 			return (EINVAL);
5146 		}
5147 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5148 		if (result < 1) {
5149 			err = EINVAL;
5150 		} else {
5151 			state->id_ud_tx_comp_usec = (uint32_t)result;
5152 		}
5153 		return (err);
5154 	}
5155 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5156 		if (state->id_mac_state & IBD_DRV_STARTED) {
5157 			return (EBUSY);
5158 		}
5159 		if (pr_val == NULL) {
5160 			return (EINVAL);
5161 		}
5162 		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5163 		if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5164 		    result > IBD_MAX_UD_TX_COPY_THRESH) {
5165 			err = EINVAL;
5166 		} else {
5167 			state->id_ud_tx_copy_thresh = (uint32_t)result;
5168 		}
5169 		return (err);
5170 	}
5171 	return (ENOTSUP);
5172 }
5173 
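/*
 * Return the current value of a driver-private ("_ibd_*") property as a
 * decimal string in pr_val.
 */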
5174 static int
5175 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5176     void *pr_val)
5177 {
5178 	int err = ENOTSUP;
5179 	int value;
5180 
5181 	if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5182 		value = state->id_bgroup_present;
5183 		err = 0;
5184 		goto done;
5185 	}
5186 	if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5187 		value = state->id_allow_coalesce_comp_tuning;
5188 		err = 0;
5189 		goto done;
5190 	}
5191 	if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5192 		value = state->id_create_broadcast_group;
5193 		err = 0;
5194 		goto done;
5195 	}
5196 	if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5197 		value = state->id_hash_size;
5198 		err = 0;
5199 		goto done;
5200 	}
5201 	if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5202 		value = state->id_lso_policy;
5203 		err = 0;
5204 		goto done;
5205 	}
5206 	if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5207 		value = state->id_num_ah;
5208 		err = 0;
5209 		goto done;
5210 	}
5211 	if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5212 		value = state->id_num_lso_bufs;
5213 		err = 0;
5214 		goto done;
5215 	}
5216 	if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5217 		value = state->rc_enable_srq;
5218 		err = 0;
5219 		goto done;
5220 	}
5221 	if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5222 		value = state->id_rc_num_rwqe;
5223 		err = 0;
5224 		goto done;
5225 	}
5226 	if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5227 		value = state->id_rc_num_srq;
5228 		err = 0;
5229 		goto done;
5230 	}
5231 	if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5232 		value = state->id_rc_num_swqe;
5233 		err = 0;
5234 		goto done;
5235 	}
5236 	if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5237 		value = state->id_rc_rx_comp_count;
5238 		err = 0;
5239 		goto done;
5240 	}
5241 	if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5242 		value = state->id_rc_rx_comp_usec;
5243 		err = 0;
5244 		goto done;
5245 	}
5246 	if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5247 		value = state->id_rc_rx_copy_thresh;
5248 		err = 0;
5249 		goto done;
5250 	}
5251 	if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5252 		value = state->id_rc_rx_rwqe_thresh;
5253 		err = 0;
5254 		goto done;
5255 	}
5256 	if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5257 		value = state->id_rc_tx_comp_count;
5258 		err = 0;
5259 		goto done;
5260 	}
5261 	if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5262 		value = state->id_rc_tx_comp_usec;
5263 		err = 0;
5264 		goto done;
5265 	}
5266 	if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5267 		value = state->id_rc_tx_copy_thresh;
5268 		err = 0;
5269 		goto done;
5270 	}
5271 	if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5272 		value = state->id_ud_num_rwqe;
5273 		err = 0;
5274 		goto done;
5275 	}
5276 	if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5277 		value = state->id_ud_num_swqe;
5278 		err = 0;
5279 		goto done;
5280 	}
5281 	if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5282 		value = state->id_ud_rx_comp_count;
5283 		err = 0;
5284 		goto done;
5285 	}
5286 	if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5287 		value = state->id_ud_rx_comp_usec;
5288 		err = 0;
5289 		goto done;
5290 	}
5291 	if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5292 		value = state->id_ud_tx_comp_count;
5293 		err = 0;
5294 		goto done;
5295 	}
5296 	if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5297 		value = state->id_ud_tx_comp_usec;
5298 		err = 0;
5299 		goto done;
5300 	}
5301 	if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5302 		value = state->id_ud_tx_copy_thresh;
5303 		err = 0;
5304 		goto done;
5305 	}
5306 done:
5307 	if (err == 0) {
5308 		(void) snprintf(pr_val, pr_valsize, "%d", value);
5309 	}
5310 	return (err);
5311 }
5312 
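/*
 * Query the HCA for the current port attributes. If the link is active,
 * validate the pkey and record the pkey index, link state, MTU, SGID and
 * port speed; otherwise mark the link down so that these get picked up
 * later on a PORT_UP/CHANGE event.
 */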
5313 static int
5314 ibd_get_port_details(ibd_state_t *state)
5315 {
5316 	ibt_hca_portinfo_t *port_infop;
5317 	ibt_status_t ret;
5318 	uint_t psize, port_infosz;
5319 
5320 	mutex_enter(&state->id_link_mutex);
5321 
5322 	/*
5323 	 * Query for port information
5324 	 */
5325 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5326 	    &port_infop, &psize, &port_infosz);
5327 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
5328 		mutex_exit(&state->id_link_mutex);
5329 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5330 		    "failed, ret=%d", ret);
5331 		return (ENETDOWN);
5332 	}
5333 
5334 	/*
5335 	 * If the link is active, verify the pkey
5336 	 */
5337 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5338 		if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5339 		    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5340 			state->id_link_state = LINK_STATE_DOWN;
5341 		} else {
5342 			state->id_link_state = LINK_STATE_UP;
5343 		}
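		/*
		 * p_mtu is the IB-encoded MTU (1 = 256 bytes, 2 = 512, ...,
		 * 5 = 4096), so 128 << p_mtu yields the MTU in bytes; for
		 * example, a p_mtu of 4 gives 128 << 4 = 2048 bytes.
		 */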
5344 		state->id_mtu = (128 << port_infop->p_mtu);
5345 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5346 		state->id_sgid = *port_infop->p_sgid_tbl;
5347 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5348 		/*
5349 		 * Now that the port is active, record the port speed
5350 		 */
5351 		state->id_link_speed = ibd_get_portspeed(state);
5352 	} else {
5353 		/* Make sure that these are handled in PORT_UP/CHANGE */
5354 		state->id_mtu = 0;
5355 		state->id_link_state = LINK_STATE_DOWN;
5356 		state->id_link_speed = 0;
5357 	}
5358 	mutex_exit(&state->id_link_mutex);
5359 	ibt_free_portinfo(port_infop, port_infosz);
5360 
5361 	return (0);
5362 }
5363 
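/*
 * Allocate the send and receive completion queues for the UD channel,
 * sized from the configured number of swqe's/rwqe's (and clamped to what
 * the HCA supports), and set up their interrupt moderation parameters.
 */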
5364 static int
5365 ibd_alloc_cqs(ibd_state_t *state)
5366 {
5367 	ibt_hca_attr_t hca_attrs;
5368 	ibt_cq_attr_t cq_attr;
5369 	ibt_status_t ret;
5370 	uint32_t real_size;
5371 	uint_t num_rwqe_change = 0;
5372 	uint_t num_swqe_change = 0;
5373 
5374 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5375 	ASSERT(ret == IBT_SUCCESS);
5376 
5377 	/*
5378 	 * Allocate Rx/combined CQ:
5379 	 * Theoretically, there is no point in having more than #rwqe
5380 	 * plus #swqe cqe's, except that the CQ will be signaled for
5381 	 * overflow when the last wqe completes, if none of the previous
5382 	 * cqe's have been polled. Thus, we keep the number of wqe's posted
5383 	 * slightly below the CQ size to make sure such overflow does not occur.
5384 	 */
5385 	cq_attr.cq_sched = NULL;
5386 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5387 
5388 	/*
5389 	 * Allocate Receive CQ.
5390 	 */
5391 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5392 		cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5393 	} else {
5394 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5395 		num_rwqe_change = state->id_ud_num_rwqe;
5396 		state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5397 	}
5398 
5399 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5400 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5401 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5402 		    "failed, ret=%d\n", ret);
5403 		return (DDI_FAILURE);
5404 	}
5405 
5406 	if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5407 	    state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5408 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5409 		    "moderation failed, ret=%d\n", ret);
5410 	}
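	/*
	 * The count/usec pairs passed to ibt_modify_cq() above and below
	 * request completion (interrupt) coalescing: roughly, the HCA may
	 * hold off the CQ notification until that many completions have
	 * accumulated or that much time has elapsed. These values come from
	 * the _ibd_ud_rx/tx_comp_count and _ibd_ud_rx/tx_comp_usec private
	 * properties.
	 */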
5411 
5412 	/* make the #rx wc's the same as max rx chain size */
5413 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5414 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5415 	    state->id_rxwcs_size, KM_SLEEP);
5416 
5417 	/*
5418 	 * Allocate Send CQ.
5419 	 */
5420 	if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5421 		cq_attr.cq_size = state->id_ud_num_swqe + 1;
5422 	} else {
5423 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5424 		num_swqe_change = state->id_ud_num_swqe;
5425 		state->id_ud_num_swqe = cq_attr.cq_size - 1;
5426 	}
5427 
5428 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5429 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5430 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5431 		    "failed, ret=%d\n", ret);
5432 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5433 		    state->id_rxwcs_size);
5434 		(void) ibt_free_cq(state->id_rcq_hdl);
5435 		return (DDI_FAILURE);
5436 	}
5437 	if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5438 	    state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5439 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5440 		    "moderation failed, ret=%d\n", ret);
5441 	}
5442 
5443 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
5444 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5445 	    state->id_txwcs_size, KM_SLEEP);
5446 
5447 	/*
5448 	 * Print a message in case we could not allocate as many wqe's
5449 	 * as were requested.
5450 	 */
5451 	if (num_rwqe_change) {
5452 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5453 		    "%d", state->id_ud_num_rwqe, num_rwqe_change);
5454 	}
5455 	if (num_swqe_change) {
5456 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
5457 		    "%d", state->id_ud_num_swqe, num_swqe_change);
5458 	}
5459 
5460 	return (DDI_SUCCESS);
5461 }
5462 
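/*
 * Allocate the UD channel (QP) used for all IPoIB-UD traffic, binding it
 * to the send/receive CQs and the broadcast group's qkey, and record the
 * resulting QP number (which forms part of the interface's mac address).
 */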
5463 static int
5464 ibd_setup_ud_channel(ibd_state_t *state)
5465 {
5466 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
5467 	ibt_ud_chan_query_attr_t ud_chan_attr;
5468 	ibt_status_t ret;
5469 
5470 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
5471 	if (state->id_hca_res_lkey_capab)
5472 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5473 	if (state->id_lso_policy && state->id_lso_capable)
5474 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5475 
5476 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
5477 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5478 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5479 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_ud_num_swqe;
5480 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_ud_num_rwqe;
5481 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
5482 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
5483 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
5484 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
5485 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
5486 	ud_alloc_attr.ud_clone_chan	= NULL;
5487 
5488 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5489 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5490 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5491 		    "failed, ret=%d\n", ret);
5492 		return (DDI_FAILURE);
5493 	}
5494 
5495 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5496 	    &ud_chan_attr)) != IBT_SUCCESS) {
5497 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5498 		    "failed, ret=%d\n", ret);
5499 		(void) ibt_free_channel(state->id_chnl_hdl);
5500 		return (DDI_FAILURE);
5501 	}
5502 
5503 	state->id_qpnum = ud_chan_attr.ud_qpn;
5504 
5505 	return (DDI_SUCCESS);
5506 }
5507 
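/*
 * Undo, in roughly the reverse order, whatever ibd_start() managed to set
 * up, using the IBD_DRV_* progress bits recorded in id_mac_state to decide
 * which pieces need to be torn down.
 */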
5508 static int
5509 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5510 {
5511 	uint32_t progress = state->id_mac_state;
5512 	uint_t attempts;
5513 	ibt_status_t ret;
5514 	ib_gid_t mgid;
5515 	ibd_mce_t *mce;
5516 	uint8_t jstate;
5517 
5518 	if (atomic_dec_32_nv(&state->id_running) != 0)
5519 		cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5520 
5521 	/*
5522 	 * Before we try to stop/undo whatever we did in ibd_start(),
5523 	 * we need to mark the link state appropriately to prevent the
5524 	 * IP layer from using this instance for any new transfers. Note
5525 	 * that if the original state of the link was "up" when we get
5526 	 * here, we'll set the final link state to "unknown", to behave
5527 	 * in the same fashion as other Ethernet drivers.
5528 	 */
5529 	mutex_enter(&state->id_link_mutex);
5530 	if (cur_link_state == LINK_STATE_DOWN) {
5531 		state->id_link_state = cur_link_state;
5532 	} else {
5533 		state->id_link_state = LINK_STATE_UNKNOWN;
5534 	}
5535 	mutex_exit(&state->id_link_mutex);
5536 	bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5537 	mac_link_update(state->id_mh, state->id_link_state);
5538 
5539 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5540 	if (progress & IBD_DRV_STARTED) {
5541 		state->id_mac_state &= (~IBD_DRV_STARTED);
5542 	}
5543 
5544 	if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5545 		state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5546 	}
5547 
5548 	/* Stop listen under Reliable Connected Mode */
5549 	if (progress & IBD_DRV_RC_LISTEN) {
5550 		ASSERT(state->id_enable_rc);
5551 		if (state->rc_listen_hdl != NULL) {
5552 			ibd_rc_stop_listen(state);
5553 		}
5554 		state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5555 	}
5556 
5557 	if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5558 		(void) ibd_rc_close_all_chan(state);
5559 	}
5560 
5561 	/*
5562 	 * First, stop receive interrupts; this stops the driver from
5563 	 * handing up buffers to higher layers.  Wait for receive buffers
5564 	 * to be returned and give up after 1 second.
5565 	 */
5566 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5567 		attempts = 10;
5568 		while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5569 		    0) > 0) {
5570 			delay(drv_usectohz(100000));
5571 			if (--attempts == 0) {
5572 				/*
5573 				 * There are pending bufs with the network
5574 				 * layer and we have no choice but to wait
5575 				 * until it is done with them. Reap all the
5576 				 * Tx/Rx completions that were posted since
5577 				 * we turned off the notification and
5578 				 * return failure.
5579 				 */
5580 				cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5581 				DPRINT(2, "ibd_undo_start: "
5582 				    "reclaiming failed");
5583 				break;
5584 			}
5585 		}
5586 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5587 	}
5588 
5589 	if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5590 		ibd_rc_fini_tx_largebuf_list(state);
5591 		state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5592 	}
5593 
5594 	if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5595 		ASSERT(state->id_enable_rc);
5596 		if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5597 			ibd_rc_fini_srq_list(state);
5598 			state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5599 		} else {
5600 			cmn_err(CE_CONT, "ibd_undo_start: srq bufs "
5601 			    "outstanding\n");
5602 		}
5603 	}
5604 
5605 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5606 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5607 
5608 		mutex_enter(&state->id_trap_lock);
5609 		state->id_trap_stop = B_TRUE;
5610 		while (state->id_trap_inprog > 0)
5611 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5612 		mutex_exit(&state->id_trap_lock);
5613 
5614 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5615 	}
5616 
5617 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5618 		/*
5619 		 * Flushing the channel ensures that all pending WQE's
5620 		 * are marked with flush_error and handed to the CQ. It
5621 		 * does not guarantee the invocation of the CQ handler.
5622 		 * This call is guaranteed to return successfully for
5623 		 * UD QPNs.
5624 		 */
5625 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5626 		    IBT_SUCCESS) {
5627 			DPRINT(10, "ibd_undo_start: flush_channel "
5628 			    "failed, ret=%d", ret);
5629 		}
5630 
5631 		/*
5632 		 * Give some time for the TX CQ handler to process the
5633 		 * completions.
5634 		 */
5635 		mutex_enter(&state->id_tx_list.dl_mutex);
5636 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5637 		attempts = 10;
5638 		while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5639 		    != state->id_ud_num_swqe) {
5640 			if (--attempts == 0)
5641 				break;
5642 			mutex_exit(&state->id_tx_rel_list.dl_mutex);
5643 			mutex_exit(&state->id_tx_list.dl_mutex);
5644 			delay(drv_usectohz(100000));
5645 			mutex_enter(&state->id_tx_list.dl_mutex);
5646 			mutex_enter(&state->id_tx_rel_list.dl_mutex);
5647 		}
5648 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5649 		if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5650 		    state->id_ud_num_swqe) {
5651 			cmn_err(CE_WARN, "tx resources not freed\n");
5652 		}
5653 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5654 		mutex_exit(&state->id_tx_list.dl_mutex);
5655 
5656 		attempts = 10;
5657 		while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5658 			if (--attempts == 0)
5659 				break;
5660 			delay(drv_usectohz(100000));
5661 		}
5662 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5663 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5664 			cmn_err(CE_WARN, "rx resources not freed\n");
5665 		}
5666 
5667 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5668 	}
5669 
5670 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5671 		/*
5672 		 * Drop all residual full/non membership. This includes full
5673 		 * membership to the broadcast group, and any nonmembership
5674 		 * acquired during transmits. We do this after the Tx completion
5675 		 * handlers are done, since those might result in some late
5676 		 * leaves; this also eliminates a potential race with that
5677 		 * path wrt the mc full list insert/delete. Trap handling
5678 		 * has also been suppressed at this point. Thus, no locks
5679 		 * are required while traversing the mc full list.
5680 		 */
5681 		DPRINT(2, "ibd_undo_start: clear full cache entries");
5682 		mce = list_head(&state->id_mc_full);
5683 		while (mce != NULL) {
5684 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
5685 			jstate = mce->mc_jstate;
5686 			mce = list_next(&state->id_mc_full, mce);
5687 			ibd_leave_group(state, mgid, jstate);
5688 		}
5689 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5690 	}
5691 
5692 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
5693 		ibd_fini_rxlist(state);
5694 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5695 	}
5696 
5697 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
5698 		ibd_fini_txlist(state);
5699 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5700 	}
5701 
5702 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5703 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5704 		    IBT_SUCCESS) {
5705 			DPRINT(10, "ibd_undo_start: free_channel "
5706 			    "failed, ret=%d", ret);
5707 		}
5708 
5709 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5710 	}
5711 
5712 	if (progress & IBD_DRV_CQS_ALLOCD) {
5713 		kmem_free(state->id_txwcs,
5714 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
5715 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5716 		    IBT_SUCCESS) {
5717 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
5718 			    "failed, ret=%d", ret);
5719 		}
5720 
5721 		kmem_free(state->id_rxwcs,
5722 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
5723 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5724 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5725 			    "ret=%d", ret);
5726 		}
5727 
5728 		state->id_txwcs = NULL;
5729 		state->id_rxwcs = NULL;
5730 		state->id_scq_hdl = NULL;
5731 		state->id_rcq_hdl = NULL;
5732 
5733 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5734 	}
5735 
5736 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5737 		mutex_enter(&state->id_ac_mutex);
5738 		mod_hash_destroy_hash(state->id_ah_active_hash);
5739 		mutex_exit(&state->id_ac_mutex);
5740 		ibd_acache_fini(state);
5741 
5742 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5743 	}
5744 
5745 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5746 		/*
5747 		 * If we'd created the ipoib broadcast group and had
5748 		 * successfully joined it, leave it now
5749 		 */
5750 		if (state->id_bgroup_created) {
5751 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5752 			jstate = IB_MC_JSTATE_FULL;
5753 			(void) ibt_leave_mcg(state->id_sgid, mgid,
5754 			    state->id_sgid, jstate);
5755 		}
5756 		ibt_free_mcg_info(state->id_mcinfo, 1);
5757 
5758 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5759 	}
5760 
5761 	return (DDI_SUCCESS);
5762 }
5763 
5764 /*
5765  * This pair of routines is used to set/clear the condition that
5766  * the caller is likely to do something to change the id_mac_state.
5767  * If there's already someone doing either a start or a stop (possibly
5768  * due to the async handler detecting a pkey relocation event, a plumb
5769  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5770  * that's done.
5771  */
5772 static void
5773 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5774 {
5775 	mutex_enter(&state->id_macst_lock);
5776 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5777 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5778 
5779 	state->id_mac_state |= flag;
5780 	mutex_exit(&state->id_macst_lock);
5781 }
5782 
5783 static void
5784 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5785 {
5786 	mutex_enter(&state->id_macst_lock);
5787 	state->id_mac_state &= (~flag);
5788 	cv_signal(&state->id_macst_cv);
5789 	mutex_exit(&state->id_macst_lock);
5790 }
5791 
5792 /*
5793  * GLDv3 entry point to start hardware.
5794  */
5795 /*ARGSUSED*/
5796 static int
5797 ibd_m_start(void *arg)
5798 {
5799 	ibd_state_t *state = arg;
5800 	int	ret;
5801 
5802 	if (state->id_type == IBD_PORT_DRIVER)
5803 		return (EINVAL);
5804 
5805 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5806 	if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5807 		ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5808 		return (EIO);
5809 	}
5810 
5811 	ret = ibd_start(state);
5812 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5813 	return (ret);
5814 }
5815 
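/*
 * Bring the interface up: obtain port details, find and join the IPoIB
 * broadcast group, allocate CQs, the UD channel and the Tx/Rx buffer
 * lists, and (for RC mode) the SRQ/large-buffer resources, recording each
 * step in id_mac_state so that a failure can be unwound by
 * ibd_undo_start(). If the port or pkey isn't available yet, defer to
 * late HCA initialization and report the link as down.
 */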
5816 static int
5817 ibd_start(ibd_state_t *state)
5818 {
5819 	int err;
5820 	ibt_status_t ret;
5821 	int late_hca_init = 0;
5822 
5823 	if (state->id_mac_state & IBD_DRV_STARTED)
5824 		return (DDI_SUCCESS);
5825 
5826 	/*
5827 	 * We do not increment the running flag when calling ibd_start() as
5828 	 * a result of some event which moves the state away from late HCA
5829 	 * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5830 	 */
5831 	if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5832 	    (atomic_inc_32_nv(&state->id_running) != 1)) {
5833 		DPRINT(10, "ibd_start: id_running is non-zero");
5834 		cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5835 		atomic_dec_32(&state->id_running);
5836 		return (EINVAL);
5837 	}
5838 
5839 	/*
5840 	 * Get port details; if we fail here, something bad happened.
5841 	 * Fail plumb.
5842 	 */
5843 	if ((err = ibd_get_port_details(state)) != 0) {
5844 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5845 		goto start_fail;
5846 	}
5847 	/*
5848 	 * If state->id_link_state is DOWN, it indicates that either the port
5849 	 * is down, or the pkey is not available. In both cases, resort to late
5850 	 * initialization. Register for subnet notices, and return success.
5851 	 */
5852 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5853 	if (state->id_link_state == LINK_STATE_DOWN) {
5854 		late_hca_init = 1;
5855 		goto late_hca_init_return;
5856 	}
5857 
5858 	/*
5859 	 * Find the IPoIB broadcast group
5860 	 */
5861 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5862 		/* Resort to late initialization */
5863 		late_hca_init = 1;
5864 		goto reg_snet_notices;
5865 	}
5866 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5867 
5868 	/*
5869 	 * Initialize per-interface caches and lists; if we fail here,
5870 	 * it is most likely due to a lack of resources
5871 	 */
5872 	if (ibd_acache_init(state) != DDI_SUCCESS) {
5873 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
5874 		err = ENOMEM;
5875 		goto start_fail;
5876 	}
5877 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5878 
5879 	/*
5880 	 * Allocate send and receive completion queues
5881 	 */
5882 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5883 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5884 		err = ENOMEM;
5885 		goto start_fail;
5886 	}
5887 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5888 
5889 	/*
5890 	 * Setup a UD channel
5891 	 */
5892 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5893 		err = ENOMEM;
5894 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5895 		goto start_fail;
5896 	}
5897 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5898 
5899 	/*
5900 	 * Allocate and initialize the tx buffer list
5901 	 */
5902 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
5903 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5904 		err = ENOMEM;
5905 		goto start_fail;
5906 	}
5907 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5908 
5909 	/*
5910 	 * Create the send cq handler here
5911 	 */
5912 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5913 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5914 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5915 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5916 		    "failed, ret=%d", ret);
5917 		err = EINVAL;
5918 		goto start_fail;
5919 	}
5920 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5921 
5922 	/*
5923 	 * Allocate and initialize the rx buffer list
5924 	 */
5925 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5926 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5927 		err = ENOMEM;
5928 		goto start_fail;
5929 	}
5930 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5931 
5932 	/*
5933 	 * Join IPoIB broadcast group
5934 	 */
5935 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5936 		DPRINT(10, "ibd_start: ibd_join_group() failed");
5937 		err = ENOTACTIVE;
5938 		goto start_fail;
5939 	}
5940 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5941 
5942 	/*
5943 	 * When we did mac_register() in ibd_attach(), we didn't register
5944 	 * the real macaddr and we didn't have the true port mtu. Now that
5945 	 * we're almost ready, set the local mac address and broadcast
5946 	 * address and update GLDv3 about the real values of these
5947 	 * parameters.
5948 	 */
5949 	if (state->id_enable_rc) {
5950 		ibd_h2n_mac(&state->id_macaddr,
5951 		    IBD_MAC_ADDR_RC + state->id_qpnum,
5952 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5953 		ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
5954 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5955 	} else {
5956 		ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
5957 		    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5958 	}
5959 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
5960 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
5961 
5962 	if (!state->id_enable_rc) {
5963 		(void) mac_maxsdu_update(state->id_mh, state->id_mtu
5964 		    - IPOIB_HDRSIZE);
5965 	}
5966 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
5967 
5968 	/*
5969 	 * Setup the receive cq handler
5970 	 */
5971 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5972 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
5973 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5974 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
5975 		    "failed, ret=%d", ret);
5976 		err = EINVAL;
5977 		goto start_fail;
5978 	}
5979 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
5980 
5981 reg_snet_notices:
5982 	/*
5983 	 * In the case of the normal initialization sequence, set up the
5984 	 * subnet notices handler after we've initialized the acache/
5985 	 * mcache and started the async thread, both of which are required for
5986 	 * the trap handler to function properly.
5987 	 *
5988 	 * Now that the async thread has been started (and we've already done
5989 	 * a mac_register() during attach so mac_tx_update() can be called
5990 	 * if necessary without any problem), we can enable the trap handler
5991 	 * to queue requests to the async thread.
5992 	 *
5993 	 * In the case of late HCA initialization, the subnet notices handler
5994 	 * will only handle MCG created/deleted events, and the action taken
5995 	 * on those events is to start the interface. So the acache/mcache
5996 	 * initialization is not a prerequisite for registering the subnet
5997 	 * notices handler in that case. Also, if we are in ibd_start() as a
5998 	 * result of, say, some event handled after entering the late HCA
5999 	 * initialization phase, there is no need to register again.
6000 	 */
6001 	if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6002 		ibt_register_subnet_notices(state->id_ibt_hdl,
6003 		    ibd_snet_notices_handler, state);
6004 		mutex_enter(&state->id_trap_lock);
6005 		state->id_trap_stop = B_FALSE;
6006 		mutex_exit(&state->id_trap_lock);
6007 		state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6008 	}
6009 
6010 late_hca_init_return:
6011 	if (late_hca_init == 1) {
6012 		state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6013 		/*
6014 		 * In case of late initialization, mark the link state as down,
6015 		 * irrespective of the actual link state as reported in the
6016 		 * port_info.
6017 		 */
6018 		state->id_link_state = LINK_STATE_DOWN;
6019 		mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6020 		mac_link_update(state->id_mh, state->id_link_state);
6021 		return (DDI_SUCCESS);
6022 	}
6023 
6024 	if (state->id_enable_rc) {
6025 		if (state->rc_enable_srq) {
6026 			if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6027 				if (ibd_rc_repost_srq_free_list(state) !=
6028 				    IBT_SUCCESS) {
6029 					err = ENOMEM;
6030 					goto start_fail;
6031 				}
6032 			} else {
6033 				/* Allocate SRQ resource */
6034 				if (ibd_rc_init_srq_list(state) !=
6035 				    IBT_SUCCESS) {
6036 					err = ENOMEM;
6037 					goto start_fail;
6038 				}
6039 				state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6040 			}
6041 		}
6042 
6043 		if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6044 			DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6045 			    "failed");
6046 			err = ENOMEM;
6047 			goto start_fail;
6048 		}
6049 		state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6050 
6051 		/* RC: begin to listen only after everything is available */
6052 		if (ibd_rc_listen(state) != IBT_SUCCESS) {
6053 			DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6054 			err = EINVAL;
6055 			goto start_fail;
6056 		}
6057 		state->id_mac_state |= IBD_DRV_RC_LISTEN;
6058 	}
6059 
6060 	/*
6061 	 * Indicate link status to GLDv3 and higher layers. By default,
6062 	 * we assume we are in the up state (which must have been true at
6063 	 * least at the time the broadcast mcg's were probed); if there
6064 	 * were any up/down transitions till the time we come here, the
6065 	 * async handler will have updated the last known state, which we
6066 	 * use to tell GLDv3. The async handler will not send any
6067 	 * notifications to GLDv3 till we reach here in the initialization
6068 	 * sequence.
6069 	 */
6070 	mac_link_update(state->id_mh, state->id_link_state);
6071 	state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6072 	state->id_mac_state |= IBD_DRV_STARTED;
6073 
6074 	return (DDI_SUCCESS);
6075 
6076 start_fail:
6077 	/*
6078 	 * If we ran into a problem during ibd_start() and then ran into
6079 	 * some other problem while undoing our partial work, we can't
6080 	 * do anything about it.  Ignore any errors we might get from
6081 	 * ibd_undo_start() and just return the original error we got.
6082 	 */
6083 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
6084 	return (err);
6085 }
6086 
6087 /*
6088  * GLDv3 entry point to stop hardware from receiving packets.
6089  */
6090 /*ARGSUSED*/
6091 static void
6092 ibd_m_stop(void *arg)
6093 {
6094 	ibd_state_t *state = (ibd_state_t *)arg;
6095 
6096 	if (state->id_type == IBD_PORT_DRIVER)
6097 		return;
6098 
6099 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6100 
6101 	(void) ibd_undo_start(state, state->id_link_state);
6102 
6103 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6104 }
6105 
6106 /*
6107  * GLDv3 entry point to modify device's mac address. We do not
6108  * allow address modifications.
6109  */
6110 static int
6111 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6112 {
6113 	ibd_state_t *state = arg;
6114 
6115 	if (state->id_type == IBD_PORT_DRIVER)
6116 		return (EINVAL);
6117 
6118 	/*
6119 	 * Don't bother even comparing the macaddr if we haven't
6120 	 * completed ibd_m_start().
6121 	 */
6122 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6123 		return (0);
6124 
6125 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6126 		return (0);
6127 	else
6128 		return (EINVAL);
6129 }
6130 
6131 /*
6132  * The blocking part of the IBA join/leave operations is done out
6133  * of here on the async thread.
6134  */
6135 static void
6136 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6137 {
6138 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6139 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6140 
6141 	if (op == IBD_ASYNC_JOIN) {
6142 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6143 			ibd_print_warn(state, "Join multicast group failed :"
6144 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6145 		}
6146 	} else {
6147 		/*
6148 		 * Here, we must search for the proper mcg_info and
6149 		 * use that to leave the group.
6150 		 */
6151 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6152 	}
6153 }
6154 
6155 /*
6156  * GLDv3 entry point for multicast enable/disable requests.
6157  * This function queues the operation to the async thread and
6158  * returns success for a valid multicast address.
6159  */
6160 static int
6161 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6162 {
6163 	ibd_state_t *state = (ibd_state_t *)arg;
6164 	ipoib_mac_t maddr, *mcast;
6165 	ib_gid_t mgid;
6166 	ibd_req_t *req;
6167 
6168 	if (state->id_type == IBD_PORT_DRIVER)
6169 		return (EINVAL);
6170 
6171 	/*
6172 	 * If we haven't completed ibd_m_start(), the async thread wouldn't
6173 	 * have been started and id_bcaddr wouldn't be set, so there's
6174 	 * no point in continuing.
6175 	 */
6176 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6177 		return (0);
6178 
6179 	/*
6180 	 * The incoming multicast address might not be aligned properly
6181 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6182 	 * it to look like one though, to get the offsets of the mc gid,
6183 	 * since we know we are not going to dereference any values with
6184 	 * the ipoib_mac_t pointer.
6185 	 */
6186 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6187 	mcast = &maddr;
6188 
6189 	/*
6190 	 * Check validity of MCG address. We could additionally check
6191 	 * that an enable/disable is not being issued on the "broadcast"
6192 	 * mcg, but since this operation is only invokable by privileged
6193 	 * programs anyway, we allow the flexibility to those dlpi apps.
6194 	 * Note that we do not validate the "scope" of the IBA mcg.
6195 	 */
6196 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6197 		return (EINVAL);
6198 
6199 	/*
6200 	 * fill in multicast pkey and scope
6201 	 */
6202 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6203 
6204 	/*
6205 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6206 	 * nothing (i.e. we stay JOINed to the broadcast group via the join
6207 	 * done in ibd_m_start()), to mimic ethernet behavior. IPv4
6208 	 * specifically requires us to be joined to broadcast groups at all times.
6209 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6210 	 * depends on this.
6211 	 */
6212 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6213 		return (0);
6214 
6215 	ibd_n2h_gid(mcast, &mgid);
6216 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6217 	if (req == NULL)
6218 		return (ENOMEM);
6219 
6220 	req->rq_gid = mgid;
6221 
6222 	if (add) {
6223 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6224 		    mgid.gid_prefix, mgid.gid_guid);
6225 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6226 	} else {
6227 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
6228 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6229 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6230 	}
6231 	return (0);
6232 }
6233 
6234 /*
6235  * The blocking part of the IBA promiscuous operations is done
6236  * out of here on the async thread. This path is taken either in
6237  * response to a dlpi request or in response to
6238  * a port up/down event.
6239  */
6240 static void
6241 ibd_async_unsetprom(ibd_state_t *state)
6242 {
6243 	ibd_mce_t *mce = list_head(&state->id_mc_non);
6244 	ib_gid_t mgid;
6245 
6246 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6247 
6248 	while (mce != NULL) {
6249 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
6250 		mce = list_next(&state->id_mc_non, mce);
6251 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6252 	}
6253 	state->id_prom_op = IBD_OP_NOTSTARTED;
6254 }
6255 
6256 /*
6257  * The blocking part of the IBA promiscuous operations is done
6258  * out of here on the async thread. This path is taken either in
6259  * response to a dlpi request or in response to
6260  * a port up/down event.
6261  */
6262 static void
6263 ibd_async_setprom(ibd_state_t *state)
6264 {
6265 	ibt_mcg_attr_t mcg_attr;
6266 	ibt_mcg_info_t *mcg_info;
6267 	ib_gid_t mgid;
6268 	uint_t numg;
6269 	int i;
6270 	char ret = IBD_OP_COMPLETED;
6271 
6272 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
6273 
6274 	/*
6275 	 * Obtain all active MC groups on the IB fabric with
6276 	 * specified criteria (scope + Pkey + Qkey + mtu).
6277 	 */
6278 	bzero(&mcg_attr, sizeof (mcg_attr));
6279 	mcg_attr.mc_pkey = state->id_pkey;
6280 	mcg_attr.mc_scope = state->id_scope;
6281 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6282 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6283 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6284 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6285 	    IBT_SUCCESS) {
6286 		ibd_print_warn(state, "Could not get list of IBA multicast "
6287 		    "groups");
6288 		ret = IBD_OP_ERRORED;
6289 		goto done;
6290 	}
6291 
6292 	/*
6293 	 * Iterate over the returned mcg's and join as NonMember
6294 	 * to the IP mcg's.
6295 	 */
6296 	for (i = 0; i < numg; i++) {
6297 		/*
6298 		 * Do a NonMember JOIN on the MC group.
6299 		 */
6300 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
6301 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6302 			ibd_print_warn(state, "IBA promiscuous mode missed "
6303 			    "multicast gid %016llx:%016llx",
6304 			    (u_longlong_t)mgid.gid_prefix,
6305 			    (u_longlong_t)mgid.gid_guid);
6306 	}
6307 
6308 	ibt_free_mcg_info(mcg_info, numg);
6309 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6310 done:
6311 	state->id_prom_op = ret;
6312 }
6313 
6314 /*
6315  * GLDv3 entry point for multicast promiscuous enable/disable requests.
6316  * GLDv3 assumes phys state receives more packets than multi state,
6317  * which is not true for IPoIB. Thus, treat the multi and phys
6318  * promiscuous states the same way to work with GLDv3's assumption.
6319  */
6320 static int
6321 ibd_m_promisc(void *arg, boolean_t on)
6322 {
6323 	ibd_state_t *state = (ibd_state_t *)arg;
6324 	ibd_req_t *req;
6325 
6326 	if (state->id_type == IBD_PORT_DRIVER)
6327 		return (EINVAL);
6328 
6329 	/*
6330 	 * The async thread won't have been started if we haven't
6331 	 * completed ibd_m_start()
6332 	 */
6333 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6334 		return (0);
6335 
6336 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6337 	if (req == NULL)
6338 		return (ENOMEM);
6339 	if (on) {
6340 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6341 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6342 	} else {
6343 		DPRINT(1, "ibd_m_promisc : unset_promisc");
6344 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6345 	}
6346 
6347 	return (0);
6348 }
6349 
6350 /*
6351  * GLDv3 entry point for gathering statistics.
6352  */
6353 static int
6354 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6355 {
6356 	ibd_state_t *state = (ibd_state_t *)arg;
6357 
6358 	switch (stat) {
6359 	case MAC_STAT_IFSPEED:
6360 		*val = state->id_link_speed;
6361 		break;
6362 	case MAC_STAT_MULTIRCV:
6363 		*val = state->id_multi_rcv;
6364 		break;
6365 	case MAC_STAT_BRDCSTRCV:
6366 		*val = state->id_brd_rcv;
6367 		break;
6368 	case MAC_STAT_MULTIXMT:
6369 		*val = state->id_multi_xmt;
6370 		break;
6371 	case MAC_STAT_BRDCSTXMT:
6372 		*val = state->id_brd_xmt;
6373 		break;
6374 	case MAC_STAT_RBYTES:
6375 		*val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6376 		    + state->rc_rcv_copy_byte;
6377 		break;
6378 	case MAC_STAT_IPACKETS:
6379 		*val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6380 		    + state->rc_rcv_copy_pkt;
6381 		break;
6382 	case MAC_STAT_OBYTES:
6383 		*val = state->id_xmt_bytes + state->rc_xmt_bytes;
6384 		break;
6385 	case MAC_STAT_OPACKETS:
6386 		*val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6387 		    state->rc_xmt_fragmented_pkt +
6388 		    state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6389 		break;
6390 	case MAC_STAT_OERRORS:
6391 		*val = state->id_ah_error;	/* failed AH translation */
6392 		break;
6393 	case MAC_STAT_IERRORS:
6394 		*val = 0;
6395 		break;
6396 	case MAC_STAT_NOXMTBUF:
6397 		*val = state->id_tx_short + state->rc_swqe_short +
6398 		    state->rc_xmt_buf_short;
6399 		break;
6400 	case MAC_STAT_NORCVBUF:
6401 	default:
6402 		return (ENOTSUP);
6403 	}
6404 
6405 	return (0);
6406 }
6407 
6408 static void
6409 ibd_async_txsched(ibd_state_t *state)
6410 {
6411 	ibd_resume_transmission(state);
6412 }
6413 
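/*
 * Called when the send path has had to stop for lack of swqe's or LSO
 * buffers. If the resource we were short on has been replenished past its
 * threshold, clear the corresponding bit in id_sched_needed and ask GLDv3
 * to retry transmission via mac_tx_update().
 */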
6414 static void
6415 ibd_resume_transmission(ibd_state_t *state)
6416 {
6417 	int flag;
6418 	int met_thresh = 0;
6419 	int thresh = 0;
6420 	int ret = -1;
6421 
6422 	mutex_enter(&state->id_sched_lock);
6423 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
6424 		mutex_enter(&state->id_tx_list.dl_mutex);
6425 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
6426 		met_thresh = state->id_tx_list.dl_cnt +
6427 		    state->id_tx_rel_list.dl_cnt;
6428 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6429 		mutex_exit(&state->id_tx_list.dl_mutex);
6430 		thresh = IBD_FREE_SWQES_THRESH;
6431 		flag = IBD_RSRC_SWQE;
6432 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6433 		ASSERT(state->id_lso != NULL);
6434 		mutex_enter(&state->id_lso_lock);
6435 		met_thresh = state->id_lso->bkt_nfree;
6436 		thresh = IBD_FREE_LSOS_THRESH;
6437 		mutex_exit(&state->id_lso_lock);
6438 		flag = IBD_RSRC_LSOBUF;
6439 		if (met_thresh > thresh)
6440 			state->id_sched_lso_cnt++;
6441 	}
6442 	if (met_thresh > thresh) {
6443 		state->id_sched_needed &= ~flag;
6444 		state->id_sched_cnt++;
6445 		ret = 0;
6446 	}
6447 	mutex_exit(&state->id_sched_lock);
6448 
6449 	if (ret == 0)
6450 		mac_tx_update(state->id_mh);
6451 }
6452 
6453 /*
6454  * Release the send wqe back into the free list.
6455  */
6456 static void
6457 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6458 {
6459 	/*
6460 	 * Add back on Tx list for reuse.
6461 	 */
6462 	ASSERT(tail->swqe_next == NULL);
6463 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6464 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6465 	tail->swqe_next = state->id_tx_rel_list.dl_head;
6466 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6467 	state->id_tx_rel_list.dl_cnt += n;
6468 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
6469 }
6470 
6471 /*
6472  * Acquire a send wqe from the free list.
6473  * Returns the send wqe pointer, or NULL if none is available.
6474  */
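/*
 * Note that free swqe's live on two lists, each with its own mutex:
 * id_tx_list, which the send path consumes, and id_tx_rel_list, which the
 * Tx completion path refills (see ibd_release_swqe()). This keeps the two
 * paths from contending on a single lock; when id_tx_list runs dry, the
 * entire id_tx_rel_list is transferred over in one shot below.
 */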
6475 static ibd_swqe_t *
6476 ibd_acquire_swqe(ibd_state_t *state)
6477 {
6478 	ibd_swqe_t *wqe;
6479 
6480 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
6481 	if (state->id_tx_rel_list.dl_head != NULL) {
6482 		/* transfer id_tx_rel_list to id_tx_list */
6483 		state->id_tx_list.dl_head =
6484 		    state->id_tx_rel_list.dl_head;
6485 		state->id_tx_list.dl_cnt =
6486 		    state->id_tx_rel_list.dl_cnt;
6487 		state->id_tx_list.dl_pending_sends = B_FALSE;
6488 
6489 		/* clear id_tx_rel_list */
6490 		state->id_tx_rel_list.dl_head = NULL;
6491 		state->id_tx_rel_list.dl_cnt = 0;
6492 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6493 
6494 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6495 		state->id_tx_list.dl_cnt -= 1;
6496 		state->id_tx_list.dl_head = wqe->swqe_next;
6497 	} else {	/* no free swqe */
6498 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
6499 		state->id_tx_list.dl_pending_sends = B_TRUE;
6500 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6501 		state->id_tx_short++;
6502 		wqe = NULL;
6503 	}
6504 	return (wqe);
6505 }
6506 
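/*
 * Fill in the LSO portion of the UD send work request: the destination
 * handle, the MSS handed down by the stack, and the LSO header (IPoIB +
 * IP + TCP). If the header lies entirely within the first mblk it is
 * referenced in place; otherwise it is copied into a freshly allocated
 * buffer that is freed later by ibd_free_lsohdr().
 */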
6507 static int
6508 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6509     ibt_ud_dest_hdl_t ud_dest)
6510 {
6511 	mblk_t	*nmp;
6512 	int iph_len, tcph_len;
6513 	ibt_wr_lso_t *lso;
6514 	uintptr_t ip_start, tcp_start;
6515 	uint8_t *dst;
6516 	uint_t pending, mblen;
6517 
6518 	/*
6519 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6520 	 * we need to adjust it here for lso.
6521 	 */
6522 	lso = &(node->w_swr.wr.ud_lso);
6523 	lso->lso_ud_dest = ud_dest;
6524 	lso->lso_mss = mss;
6525 
6526 	/*
6527 	 * Calculate the LSO header size and set it in the UD LSO structure.
6528 	 * Note that the only assumption we make is that each of the IPoIB,
6529 	 * IP and TCP headers will be contained in a single mblk fragment;
6530 	 * together, the headers may span multiple mblk fragments.
6531 	 */
6532 	nmp = mp;
6533 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6534 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6535 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
6536 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
6537 		nmp = nmp->b_cont;
6538 
6539 	}
6540 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6541 
6542 	tcp_start = ip_start + iph_len;
6543 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6544 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6545 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
6546 		nmp = nmp->b_cont;
6547 	}
6548 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6549 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6550 
6551 	/*
6552 	 * If the lso header fits entirely within a single mblk fragment,
6553 	 * we'll avoid an additional copy of the lso header here and just
6554 	 * pass the b_rptr of the mblk directly.
6555 	 *
6556 	 * If this isn't true, we'd have to allocate for it explicitly.
6557 	 */
6558 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
6559 		lso->lso_hdr = mp->b_rptr;
6560 	} else {
6561 		/* On work completion, remember to free this allocated hdr */
6562 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6563 		if (lso->lso_hdr == NULL) {
6564 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6565 			    "sz = %d", lso->lso_hdr_sz);
6566 			lso->lso_hdr_sz = 0;
6567 			lso->lso_mss = 0;
6568 			return (-1);
6569 		}
6570 	}
6571 
6572 	/*
6573 	 * Copy in the lso header only if we need to
6574 	 */
6575 	if (lso->lso_hdr != mp->b_rptr) {
6576 		dst = lso->lso_hdr;
6577 		pending = lso->lso_hdr_sz;
6578 
6579 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6580 			mblen = MBLKL(nmp);
6581 			if (pending > mblen) {
6582 				bcopy(nmp->b_rptr, dst, mblen);
6583 				dst += mblen;
6584 				pending -= mblen;
6585 			} else {
6586 				bcopy(nmp->b_rptr, dst, pending);
6587 				break;
6588 			}
6589 		}
6590 	}
6591 
6592 	return (0);
6593 }
6594 
6595 static void
6596 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6597 {
6598 	ibt_wr_lso_t *lso;
6599 
6600 	if ((!node) || (!mp))
6601 		return;
6602 
6603 	/*
6604 	 * Free any header space that we might've allocated if we
6605 	 * did an LSO
6606 	 */
6607 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6608 		lso = &(node->w_swr.wr.ud_lso);
6609 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6610 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6611 			lso->lso_hdr = NULL;
6612 			lso->lso_hdr_sz = 0;
6613 		}
6614 	}
6615 }
6616 
6617 static void
6618 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6619 {
6620 	uint_t		i;
6621 	uint_t		num_posted;
6622 	uint_t		n_wrs;
6623 	ibt_status_t	ibt_status;
6624 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
6625 	ibd_swqe_t	*tx_head, *elem;
6626 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
6627 
6628 	/* post the one request, then check for more */
6629 	ibt_status = ibt_post_send(state->id_chnl_hdl,
6630 	    &node->w_swr, 1, NULL);
6631 	if (ibt_status != IBT_SUCCESS) {
6632 		ibd_print_warn(state, "ibd_post_send: "
6633 		    "posting one wr failed: ret=%d", ibt_status);
6634 		ibd_tx_cleanup(state, node);
6635 	}
6636 
6637 	tx_head = NULL;
6638 	for (;;) {
6639 		if (tx_head == NULL) {
6640 			mutex_enter(&state->id_txpost_lock);
6641 			tx_head = state->id_tx_head;
6642 			if (tx_head == NULL) {
6643 				state->id_tx_busy = 0;
6644 				mutex_exit(&state->id_txpost_lock);
6645 				return;
6646 			}
6647 			state->id_tx_head = NULL;
6648 			mutex_exit(&state->id_txpost_lock);
6649 		}
6650 
6651 		/*
6652 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6653 		 * at a time if possible, and keep posting them.
6654 		 */
6655 		for (n_wrs = 0, elem = tx_head;
6656 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6657 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6658 			nodes[n_wrs] = elem;
6659 			wrs[n_wrs] = elem->w_swr;
6660 		}
6661 		tx_head = elem;
6662 
6663 		ASSERT(n_wrs != 0);
6664 
6665 		/*
6666 		 * If posting fails for some reason, we'll never receive a
6667 		 * completion notification, so we'll need to clean up. But
6668 		 * we need to make sure we don't clean up nodes whose
6669 		 * wrs have been successfully posted. We assume that the
6670 		 * hca driver returns on the first failure to post and
6671 		 * therefore the first 'num_posted' entries don't need
6672 		 * cleanup here.
6673 		 */
6674 		num_posted = 0;
6675 		ibt_status = ibt_post_send(state->id_chnl_hdl,
6676 		    wrs, n_wrs, &num_posted);
6677 		if (ibt_status != IBT_SUCCESS) {
6678 			ibd_print_warn(state, "ibd_post_send: "
6679 			    "posting multiple wrs failed: "
6680 			    "requested=%d, done=%d, ret=%d",
6681 			    n_wrs, num_posted, ibt_status);
6682 
6683 			for (i = num_posted; i < n_wrs; i++)
6684 				ibd_tx_cleanup(state, nodes[i]);
6685 		}
6686 	}
6687 }
6688 
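/*
 * Build the scatter/gather list for a send: either DMA-bind the mblk
 * chain directly with ibt_map_mem_iov(), or copy the packet into the
 * swqe's pre-mapped copy buffer (or, for larger packets, into a set of
 * pre-mapped LSO buffers). Any LSO header bytes already accounted for in
 * the work request (lsohdr_sz) are skipped here.
 */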
6689 static int
6690 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6691     uint_t lsohdr_sz)
6692 {
6693 	ibt_wr_ds_t *sgl;
6694 	ibt_status_t ibt_status;
6695 	mblk_t *nmp;
6696 	mblk_t *data_mp;
6697 	uchar_t *bufp;
6698 	size_t blksize;
6699 	size_t skip;
6700 	size_t avail;
6701 	uint_t pktsize;
6702 	uint_t frag_len;
6703 	uint_t pending_hdr;
6704 	int nmblks;
6705 	int i;
6706 
6707 	/*
6708 	 * Let's skip ahead to the data if this is LSO
6709 	 */
6710 	data_mp = mp;
6711 	pending_hdr = 0;
6712 	if (lsohdr_sz) {
6713 		pending_hdr = lsohdr_sz;
6714 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
6715 			frag_len = nmp->b_wptr - nmp->b_rptr;
6716 			if (frag_len > pending_hdr)
6717 				break;
6718 			pending_hdr -= frag_len;
6719 		}
6720 		data_mp = nmp;	/* start of data past lso header */
6721 		ASSERT(data_mp != NULL);
6722 	}
6723 
6724 	/*
6725 	 * Calculate the size of message data and number of msg blocks
6726 	 */
6727 	pktsize = 0;
6728 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
6729 	    nmp = nmp->b_cont, nmblks++) {
6730 		pktsize += MBLKL(nmp);
6731 	}
6732 	pktsize -= pending_hdr;
6733 
6734 	/*
6735 	 * We only do ibt_map_mem_iov() if the pktsize is above the
6736 	 * "copy-threshold", and if the number of mp fragments is less than
6737 	 * the maximum acceptable.
6738 	 */
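	/*
	 * The idea (presumably) being that for small packets a bcopy into a
	 * pre-registered buffer is cheaper than binding and later unbinding
	 * the mblk chain for DMA, while large packets with a manageable
	 * number of fragments are better off avoiding the copy.
	 */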
6739 	if ((state->id_hca_res_lkey_capab) &&
6740 	    (pktsize > state->id_ud_tx_copy_thresh) &&
6741 	    (nmblks < state->id_max_sqseg_hiwm)) {
6742 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6743 		ibt_iov_attr_t iov_attr;
6744 
6745 		iov_attr.iov_as = NULL;
6746 		iov_attr.iov = iov_arr;
6747 		iov_attr.iov_buf = NULL;
6748 		iov_attr.iov_list_len = nmblks;
6749 		iov_attr.iov_wr_nds = state->id_max_sqseg;
6750 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6751 		iov_attr.iov_flags = IBT_IOV_SLEEP;
6752 
6753 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6754 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6755 			iov_arr[i].iov_len = MBLKL(nmp);
6756 			if (i == 0) {
6757 				iov_arr[i].iov_addr += pending_hdr;
6758 				iov_arr[i].iov_len -= pending_hdr;
6759 			}
6760 		}
6761 
6762 		node->w_buftype = IBD_WQE_MAPPED;
6763 		node->w_swr.wr_sgl = node->w_sgl;
6764 
6765 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6766 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6767 		if (ibt_status != IBT_SUCCESS) {
6768 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6769 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6770 			goto ibd_copy_path;
6771 		}
6772 
6773 		return (0);
6774 	}
6775 
6776 ibd_copy_path:
6777 	if (pktsize <= state->id_tx_buf_sz) {
6778 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6779 		node->w_swr.wr_nds = 1;
6780 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6781 		node->w_buftype = IBD_WQE_TXBUF;
6782 
6783 		/*
6784 		 * Even though this is the copy path for transfers less than
6785 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
6786 		 * is possible the first data mblk fragment (data_mp) still
6787 		 * contains part of the LSO header that we need to skip.
6788 		 */
6789 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6790 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6791 			blksize = MBLKL(nmp) - pending_hdr;
6792 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6793 			bufp += blksize;
6794 			pending_hdr = 0;
6795 		}
6796 
6797 		return (0);
6798 	}
6799 
6800 	/*
6801 	 * Copy path for transfers greater than id_tx_buf_sz
6802 	 */
6803 	node->w_swr.wr_sgl = node->w_sgl;
6804 	if (ibd_acquire_lsobufs(state, pktsize,
6805 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6806 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6807 		return (-1);
6808 	}
6809 	node->w_buftype = IBD_WQE_LSOBUF;
6810 
6811 	/*
6812 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
6813 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
6814 	 * need to skip part of the LSO header in the first fragment
6815 	 * as before.
6816 	 */
6817 	nmp = data_mp;
6818 	skip = pending_hdr;
6819 	for (i = 0; i < node->w_swr.wr_nds; i++) {
6820 		sgl = node->w_swr.wr_sgl + i;
6821 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6822 		avail = IBD_LSO_BUFSZ;
6823 		while (nmp && avail) {
6824 			blksize = MBLKL(nmp) - skip;
6825 			if (blksize > avail) {
6826 				bcopy(nmp->b_rptr + skip, bufp, avail);
6827 				skip += avail;
6828 				avail = 0;
6829 			} else {
6830 				bcopy(nmp->b_rptr + skip, bufp, blksize);
6831 				skip = 0;
6832 				avail -= blksize;
6833 				bufp += blksize;
6834 				nmp = nmp->b_cont;
6835 			}
6836 		}
6837 	}
6838 
6839 	return (0);
6840 }
6841 
6842 /*
6843  * Schedule a completion queue polling to reap the resource we're
6844  * short on.  If we implement the change to reap tx completions
6845  * in a separate thread, we'll need to wake up that thread here.
6846  */
6847 static int
6848 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6849 {
6850 	ibd_req_t *req;
6851 
6852 	mutex_enter(&state->id_sched_lock);
6853 	state->id_sched_needed |= resource_type;
6854 	mutex_exit(&state->id_sched_lock);
6855 
6856 	/*
6857 	 * If we are asked to queue a work entry, we need to do it
6858 	 */
6859 	if (q_flag) {
6860 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6861 		if (req == NULL)
6862 			return (-1);
6863 
6864 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6865 	}
6866 
6867 	return (0);
6868 }
6869 
6870 /*
6871  * The passed in packet has this format:
6872  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6873  */
6874 static boolean_t
6875 ibd_send(ibd_state_t *state, mblk_t *mp)
6876 {
6877 	ibd_ace_t *ace;
6878 	ibd_swqe_t *node;
6879 	ipoib_mac_t *dest;
6880 	ib_header_info_t *ipibp;
6881 	ip6_t *ip6h;
6882 	uint_t pktsize;
6883 	uint32_t mss;
6884 	uint32_t hckflags;
6885 	uint32_t lsoflags = 0;
6886 	uint_t lsohdr_sz = 0;
6887 	int ret, len;
6888 	boolean_t dofree = B_FALSE;
6889 	boolean_t rc;
6890 	/* if (rc_chan == NULL) send by UD; else send by RC; */
6891 	ibd_rc_chan_t *rc_chan;
6892 	int nmblks;
6893 	mblk_t *nmp;
6894 
6895 	/*
6896 	 * If we aren't done with the device initialization and start,
6897 	 * we shouldn't be here.
6898 	 */
6899 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6900 		return (B_FALSE);
6901 
6902 	/*
6903 	 * Obtain an address handle for the destination.
6904 	 */
6905 	ipibp = (ib_header_info_t *)mp->b_rptr;
6906 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
6907 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6908 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6909 
6910 	rc_chan = NULL;
6911 	ace = ibd_acache_lookup(state, dest, &ret, 1);
6912 	if (state->id_enable_rc && (ace != NULL) &&
6913 	    (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6914 		if (ace->ac_chan == NULL) {
6915 			state->rc_null_conn++;
6916 		} else {
6917 			if (ace->ac_chan->chan_state ==
6918 			    IBD_RC_STATE_ACT_ESTAB) {
6919 				rc_chan = ace->ac_chan;
6920 				mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6921 				node = WQE_TO_SWQE(
6922 				    rc_chan->tx_wqe_list.dl_head);
6923 				if (node != NULL) {
6924 					rc_chan->tx_wqe_list.dl_cnt -= 1;
6925 					rc_chan->tx_wqe_list.dl_head =
6926 					    node->swqe_next;
6927 				} else {
6928 					node = ibd_rc_acquire_swqes(rc_chan);
6929 				}
6930 				mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6931 
6932 				if (node == NULL) {
6933 					state->rc_swqe_short++;
6934 					mutex_enter(&state->id_sched_lock);
6935 					state->id_sched_needed |=
6936 					    IBD_RSRC_RC_SWQE;
6937 					mutex_exit(&state->id_sched_lock);
6938 					ibd_dec_ref_ace(state, ace);
6939 					return (B_FALSE);
6940 				}
6941 			} else {
6942 				state->rc_no_estab_conn++;
6943 			}
6944 		}
6945 	}
6946 
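	/*
	 * No RC channel was selected (RC disabled, no address handle, a
	 * multicast destination, or no established RC connection); fall
	 * back to acquiring a UD swqe from the main tx list.
	 */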
6947 	if (rc_chan == NULL) {
6948 		mutex_enter(&state->id_tx_list.dl_mutex);
6949 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
6950 		if (node != NULL) {
6951 			state->id_tx_list.dl_cnt -= 1;
6952 			state->id_tx_list.dl_head = node->swqe_next;
6953 		} else {
6954 			node = ibd_acquire_swqe(state);
6955 		}
6956 		mutex_exit(&state->id_tx_list.dl_mutex);
6957 		if (node == NULL) {
6958 			/*
6959 			 * If we don't have an swqe available, schedule a
6960 			 * transmit completion queue cleanup and hold off on
6961 			 * sending more packets until we have some free swqes
6962 			 */
6963 			if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
6964 				if (ace != NULL) {
6965 					ibd_dec_ref_ace(state, ace);
6966 				}
6967 				return (B_FALSE);
6968 			}
6969 
6970 			/*
6971 			 * If a poll cannot be scheduled, we have no choice but
6972 			 * to drop this packet
6973 			 */
6974 			ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
6975 			if (ace != NULL) {
6976 				ibd_dec_ref_ace(state, ace);
6977 			}
6978 			return (B_TRUE);
6979 		}
6980 	}
6981 
6982 	/*
6983 	 * Initialize the commonly used fields in swqe to NULL to protect
6984 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
6985 	 * failure.
6986 	 */
6987 	node->swqe_im_mblk = NULL;
6988 	node->w_swr.wr_nds = 0;
6989 	node->w_swr.wr_sgl = NULL;
6990 	node->w_swr.wr_opcode = IBT_WRC_SEND;
6991 
6992 	/*
6993 	 * Calculate the size of message data and number of msg blocks
6994 	 */
6995 	pktsize = 0;
6996 	for (nmblks = 0, nmp = mp; nmp != NULL;
6997 	    nmp = nmp->b_cont, nmblks++) {
6998 		pktsize += MBLKL(nmp);
6999 	}
7000 
7001 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7002 		atomic_inc_64(&state->id_brd_xmt);
7003 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7004 		atomic_inc_64(&state->id_multi_xmt);
7005 
7006 	if (ace != NULL) {
7007 		node->w_ahandle = ace;
7008 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7009 	} else {
7010 		DPRINT(5,
7011 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7012 		    ((ret == EFAULT) ? "failed" : "queued"),
7013 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7014 		    htonl(dest->ipoib_gidpref[1]),
7015 		    htonl(dest->ipoib_gidsuff[0]),
7016 		    htonl(dest->ipoib_gidsuff[1]));
7017 		state->rc_ace_not_found++;
7018 		node->w_ahandle = NULL;
7019 
7020 		/*
7021 		 * If ibd_acache_lookup() returns EFAULT, it means ibd cannot
7022 		 * find a path for the specified dest address and we should
7023 		 * drop this kind of packet.  We should also drop the packet
7024 		 * if we cannot schedule a poll via the async thread.  In the
7025 		 * normal case, ibd returns the packet to the upper layer and
7026 		 * waits for AH creation.
7027 		 *
7028 		 * Note that we always queue a work slot entry for the async
7029 		 * thread when we fail AH lookup (even in intr mode); this is
7030 		 * due to the convoluted way the code currently looks for AH.
7031 		 */
7032 		if (ret == EFAULT) {
7033 			dofree = B_TRUE;
7034 			rc = B_TRUE;
7035 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7036 			dofree = B_TRUE;
7037 			rc = B_TRUE;
7038 		} else {
7039 			dofree = B_FALSE;
7040 			rc = B_FALSE;
7041 		}
7042 		goto ibd_send_fail;
7043 	}
7044 
7045 	/*
7046 	 * For ND6 packets, padding is at the front of the source lladdr;
7047 	 * insert the padding here.
7048 	 */
7049 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7050 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7051 			if (!pullupmsg(mp, IPV6_HDR_LEN +
7052 			    sizeof (ib_header_info_t))) {
7053 				DPRINT(10, "ibd_send: pullupmsg failure ");
7054 				dofree = B_TRUE;
7055 				rc = B_TRUE;
7056 				goto ibd_send_fail;
7057 			}
7058 			ipibp = (ib_header_info_t *)mp->b_rptr;
7059 		}
7060 		ip6h = (ip6_t *)((uchar_t *)ipibp +
7061 		    sizeof (ib_header_info_t));
7062 		len = ntohs(ip6h->ip6_plen);
7063 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7064 			mblk_t	*pad;
7065 
7066 			pad = allocb(4, 0);
			if (pad == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
7067 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7068 			linkb(mp, pad);
7069 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
7070 			    IPV6_HDR_LEN + len + 4) {
7071 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7072 				    IPV6_HDR_LEN + len + 4)) {
7073 					DPRINT(10, "ibd_send: pullupmsg "
7074 					    "failure ");
7075 					dofree = B_TRUE;
7076 					rc = B_TRUE;
7077 					goto ibd_send_fail;
7078 				}
7079 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7080 				    sizeof (ib_header_info_t));
7081 			}
7082 
7083 			/* LINTED: E_CONSTANT_CONDITION */
7084 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7085 		}
7086 	}
7087 
7088 	ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7089 	mp->b_rptr += sizeof (ib_addrs_t);
7090 	pktsize -= sizeof (ib_addrs_t);
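	/*
	 * Skip past the address header (ib_addrs_t) at the front of the
	 * message; only the bytes beyond it go on the wire, so pktsize is
	 * reduced accordingly.
	 */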
7091 
7092 	if (rc_chan) {	/* send in RC mode */
7093 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7094 		ibt_iov_attr_t iov_attr;
7095 		uint_t		i;
7096 		size_t	blksize;
7097 		uchar_t *bufp;
7098 		ibd_rc_tx_largebuf_t *lbufp;
7099 
7100 		atomic_add_64(&state->rc_xmt_bytes, pktsize);
7101 
7102 		/*
7103 		 * The upper layer does the Tx checksum; we don't need to do
7104 		 * any checksumming here.
7105 		 */
7106 		ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7107 
7108 		/*
7109 		 * If the pktsize is at or below the copy threshold, use the
7110 		 * pre-mapped copybuf; otherwise do ibt_map_mem_iov() provided
7111 		 * the number of mp fragments is less than the maximum acceptable.
7112 		 */
7113 		if (pktsize <= state->id_rc_tx_copy_thresh) {
7114 			atomic_inc_64(&state->rc_xmt_small_pkt);
7115 			/*
7116 			 * Only unicast packets are processed in Reliable
7117 			 * Connected mode.
7118 			 */
7119 			node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7120 			node->w_swr.wr_nds = 1;
7121 			node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7122 			node->w_buftype = IBD_WQE_TXBUF;
7123 
7124 			bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7125 			for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7126 				blksize = MBLKL(nmp);
7127 				bcopy(nmp->b_rptr, bufp, blksize);
7128 				bufp += blksize;
7129 			}
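			/*
			 * The data now lives in the pre-mapped copybuf, so
			 * the original mblk chain can be freed; swqe_im_mblk
			 * is left NULL so that Tx cleanup does not attempt
			 * to free it again.
			 */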
7130 			freemsg(mp);
7131 			ASSERT(node->swqe_im_mblk == NULL);
7132 		} else {
7133 			if ((state->rc_enable_iov_map) &&
7134 			    (nmblks < state->rc_max_sqseg_hiwm)) {
7135 
7136 				/* do ibt_map_mem_iov() */
7137 				iov_attr.iov_as = NULL;
7138 				iov_attr.iov = iov_arr;
7139 				iov_attr.iov_buf = NULL;
7140 				iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7141 				iov_attr.iov_lso_hdr_sz = 0;
7142 				iov_attr.iov_flags = IBT_IOV_SLEEP;
7143 
7144 				i = 0;
7145 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7146 					iov_arr[i].iov_len = MBLKL(nmp);
7147 					if (iov_arr[i].iov_len != 0) {
7148 						iov_arr[i].iov_addr = (caddr_t)
7149 						    (void *)nmp->b_rptr;
7150 						i++;
7151 					}
7152 				}
7153 				iov_attr.iov_list_len = i;
7154 				node->w_swr.wr_sgl = node->w_sgl;
7155 
7156 				ret = ibt_map_mem_iov(state->id_hca_hdl,
7157 				    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7158 				    &node->w_mi_hdl);
7159 				if (ret != IBT_SUCCESS) {
7160 					atomic_inc_64(
7161 					    &state->rc_xmt_map_fail_pkt);
7162 					DPRINT(30, "ibd_send: ibt_map_mem_iov("
7163 					    ") failed, nmblks=%d, real_nmblks"
7164 					    "=%d, ret=0x%x", nmblks, i, ret);
7165 					goto ibd_rc_large_copy;
7166 				}
7167 
7168 				atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7169 				node->w_buftype = IBD_WQE_MAPPED;
7170 				node->swqe_im_mblk = mp;
7171 			} else {
7172 				atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7173 ibd_rc_large_copy:
7174 				mutex_enter(&state->rc_tx_large_bufs_lock);
7175 				if (state->rc_tx_largebuf_nfree == 0) {
7176 					state->rc_xmt_buf_short++;
7177 					mutex_exit
7178 					    (&state->rc_tx_large_bufs_lock);
7179 					mutex_enter(&state->id_sched_lock);
7180 					state->id_sched_needed |=
7181 					    IBD_RSRC_RC_TX_LARGEBUF;
7182 					mutex_exit(&state->id_sched_lock);
7183 					dofree = B_FALSE;
7184 					rc = B_FALSE;
7185 					/*
7186 					 * If we don't have Tx large bufs,
7187 					 * return failure. node->w_buftype
7188 					 * should not be IBD_WQE_RC_COPYBUF,
7189 					 * otherwise it will cause a problem
7190 					 * in ibd_rc_tx_cleanup().
7191 					 */
7192 					node->w_buftype = IBD_WQE_TXBUF;
7193 					goto ibd_send_fail;
7194 				}
7195 
7196 				lbufp = state->rc_tx_largebuf_free_head;
7197 				ASSERT(lbufp->lb_buf != NULL);
7198 				state->rc_tx_largebuf_free_head =
7199 				    lbufp->lb_next;
7200 				lbufp->lb_next = NULL;
7201 				/* Update nfree count */
7202 				state->rc_tx_largebuf_nfree --;
7203 				mutex_exit(&state->rc_tx_large_bufs_lock);
7204 				bufp = lbufp->lb_buf;
7205 				node->w_sgl[0].ds_va =
7206 				    (ib_vaddr_t)(uintptr_t)bufp;
7207 				node->w_sgl[0].ds_key =
7208 				    state->rc_tx_mr_desc.md_lkey;
7209 				node->w_sgl[0].ds_len = pktsize;
7210 				node->w_swr.wr_sgl = node->w_sgl;
7211 				node->w_swr.wr_nds = 1;
7212 				node->w_buftype = IBD_WQE_RC_COPYBUF;
7213 				node->w_rc_tx_largebuf = lbufp;
7214 
7215 				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7216 					blksize = MBLKL(nmp);
7217 					if (blksize != 0) {
7218 						bcopy(nmp->b_rptr, bufp,
7219 						    blksize);
7220 						bufp += blksize;
7221 					}
7222 				}
7223 				freemsg(mp);
7224 				ASSERT(node->swqe_im_mblk == NULL);
7225 			}
7226 		}
7227 
7228 		node->swqe_next = NULL;
7229 		mutex_enter(&rc_chan->tx_post_lock);
7230 		if (rc_chan->tx_busy) {
7231 			if (rc_chan->tx_head) {
7232 				rc_chan->tx_tail->swqe_next =
7233 				    SWQE_TO_WQE(node);
7234 			} else {
7235 				rc_chan->tx_head = node;
7236 			}
7237 			rc_chan->tx_tail = node;
7238 			mutex_exit(&rc_chan->tx_post_lock);
7239 		} else {
7240 			rc_chan->tx_busy = 1;
7241 			mutex_exit(&rc_chan->tx_post_lock);
7242 			ibd_rc_post_send(rc_chan, node);
7243 		}
7244 
7245 		return (B_TRUE);
7246 	} /* send by RC */
7247 
7248 	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7249 		/*
7250 		 * The packet is too long.  The packet size from GLD should be
7251 		 * <= state->id_mtu + sizeof (ib_addrs_t).
7252 		 */
7253 		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7254 			ibd_req_t *req;
7255 
7256 			mutex_enter(&ace->tx_too_big_mutex);
7257 			if (ace->tx_too_big_ongoing) {
7258 				mutex_exit(&ace->tx_too_big_mutex);
7259 				state->rc_xmt_reenter_too_long_pkt++;
7260 				dofree = B_TRUE;
7261 			} else {
7262 				ace->tx_too_big_ongoing = B_TRUE;
7263 				mutex_exit(&ace->tx_too_big_mutex);
7264 				state->rc_xmt_icmp_too_long_pkt++;
7265 
7266 				req = kmem_cache_alloc(state->id_req_kmc,
7267 				    KM_NOSLEEP);
7268 				if (req == NULL) {
7269 					ibd_print_warn(state, "ibd_send: alloc "
7270 					    "ibd_req_t fail");
7271 					/* Drop it. */
7272 					dofree = B_TRUE;
7273 				} else {
7274 					req->rq_ptr = mp;
7275 					req->rq_ptr2 = ace;
7276 					ibd_queue_work_slot(state, req,
7277 					    IBD_ASYNC_RC_TOO_BIG);
7278 					dofree = B_FALSE;
7279 				}
7280 			}
7281 		} else {
7282 			ibd_print_warn(state, "Reliable Connected mode is on. "
7283 			ibd_print_warn(state, "Reliable Connected mode is on. "
7284 			    "Multicast packet length %d is too long to send "
7285 			    "(> %d), drop it",
7286 			    pktsize, state->id_mtu);
7287 			/* Drop it. */
7288 			dofree = B_TRUE;
7289 		}
7290 		rc = B_TRUE;
7291 		goto ibd_send_fail;
7292 	}
7293 
7294 	atomic_add_64(&state->id_xmt_bytes, pktsize);
7295 	atomic_inc_64(&state->id_xmt_pkt);
7296 
7297 	/*
7298 	 * Do LSO and checksum related work here.  For an LSO send, set the
7299 	 * UD destination, the opcode and the LSO header information in the
7300 	 * work request.
7301 	 */
7302 	mac_lso_get(mp, &mss, &lsoflags);
7303 	if ((lsoflags & HW_LSO) != HW_LSO) {
7304 		node->w_swr.wr_opcode = IBT_WRC_SEND;
7305 		lsohdr_sz = 0;
7306 	} else {
7307 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7308 			/*
7309 			 * The routine can only fail if there's no memory; we
7310 			 * can only drop the packet if this happens
7311 			 */
7312 			ibd_print_warn(state,
7313 			    "ibd_send: no memory, lso posting failed");
7314 			dofree = B_TRUE;
7315 			rc = B_TRUE;
7316 			goto ibd_send_fail;
7317 		}
7318 
7319 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7320 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7321 	}
7322 
7323 	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7324 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7325 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7326 	else
7327 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7328 
7329 	/*
7330 	 * Prepare the sgl for posting; the routine can only fail if there's
7331 	 * no lso buf available for posting.  If this is the case, we should
7332 	 * reschedule for lso bufs to become available and then try again.
7333 	 */
7334 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7335 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7336 			dofree = B_TRUE;
7337 			rc = B_TRUE;
7338 		} else {
7339 			dofree = B_FALSE;
7340 			rc = B_FALSE;
7341 		}
7342 		goto ibd_send_fail;
7343 	}
7344 	node->swqe_im_mblk = mp;
7345 
7346 	/*
7347 	 * Queue the wqe to hardware; since we can now simply queue a
7348 	 * post instead of doing it serially, we cannot assume anything
7349 	 * about the 'node' after ibd_post_send() returns.
7350 	 */
7351 	node->swqe_next = NULL;
7352 
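	/*
	 * If another thread currently owns the posting path (id_tx_busy
	 * set), chain this wqe onto the id_tx_head/id_tx_tail list for that
	 * thread to post; otherwise become the posting thread ourselves.
	 */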
7353 	mutex_enter(&state->id_txpost_lock);
7354 	if (state->id_tx_busy) {
7355 		if (state->id_tx_head) {
7356 			state->id_tx_tail->swqe_next =
7357 			    SWQE_TO_WQE(node);
7358 		} else {
7359 			state->id_tx_head = node;
7360 		}
7361 		state->id_tx_tail = node;
7362 		mutex_exit(&state->id_txpost_lock);
7363 	} else {
7364 		state->id_tx_busy = 1;
7365 		mutex_exit(&state->id_txpost_lock);
7366 		ibd_post_send(state, node);
7367 	}
7368 
7369 	return (B_TRUE);
7370 
7371 ibd_send_fail:
7372 	if (node && mp)
7373 		ibd_free_lsohdr(node, mp);
7374 
7375 	if (dofree)
7376 		freemsg(mp);
7377 
7378 	if (node != NULL) {
7379 		if (rc_chan) {
7380 			ibd_rc_tx_cleanup(node);
7381 		} else {
7382 			ibd_tx_cleanup(state, node);
7383 		}
7384 	}
7385 
7386 	return (rc);
7387 }
7388 
7389 /*
7390  * GLDv3 entry point for transmitting datagram.
7391  */
7392 static mblk_t *
7393 ibd_m_tx(void *arg, mblk_t *mp)
7394 {
7395 	ibd_state_t *state = (ibd_state_t *)arg;
7396 	mblk_t *next;
7397 
7398 	if (state->id_type == IBD_PORT_DRIVER) {
7399 		freemsgchain(mp);
7400 		return (NULL);
7401 	}
7402 
7403 	if ((state->id_link_state != LINK_STATE_UP) ||
7404 	    !(state->id_mac_state & IBD_DRV_STARTED)) {
7405 		freemsgchain(mp);
7406 		mp = NULL;
7407 	}
7408 
7409 	while (mp != NULL) {
7410 		next = mp->b_next;
7411 		mp->b_next = NULL;
7412 		if (ibd_send(state, mp) == B_FALSE) {
7413 			/* Send fail */
7414 			mp->b_next = next;
7415 			break;
7416 		}
7417 		mp = next;
7418 	}
7419 
7420 	return (mp);
7421 }
7422 
7423 /*
7424  * This handles Tx and Rx completions.  With separate CQs, this handles
7425  * only Rx completions.
7426  */
7427 static uint_t
7428 ibd_intr(caddr_t arg)
7429 {
7430 	ibd_state_t *state = (ibd_state_t *)arg;
7431 
7432 	ibd_poll_rcq(state, state->id_rcq_hdl);
7433 
7434 	return (DDI_INTR_CLAIMED);
7435 }
7436 
7437 /*
7438  * Poll and fully drain the send cq
7439  */
7440 static void
7441 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7442 {
7443 	ibt_wc_t *wcs = state->id_txwcs;
7444 	uint_t numwcs = state->id_txwcs_size;
7445 	ibd_wqe_t *wqe;
7446 	ibd_swqe_t *head, *tail;
7447 	ibt_wc_t *wc;
7448 	uint_t num_polled;
7449 	int i;
7450 
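	/*
	 * Completed swqes are chained through swqe_next and released in a
	 * single ibd_tx_cleanup_list() call per poll batch.
	 */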
7451 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7452 		head = tail = NULL;
7453 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7454 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7455 			if (wc->wc_status != IBT_WC_SUCCESS) {
7456 				/*
7457 				 * Channel being torn down.
7458 				 */
7459 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7460 					DPRINT(5, "ibd_drain_scq: flush error");
7461 					DPRINT(10, "ibd_drain_scq: Bad "
7462 					    "status %d", wc->wc_status);
7463 				} else {
7464 					DPRINT(10, "ibd_drain_scq: "
7465 					    "unexpected wc_status %d",
7466 					    wc->wc_status);
7467 				}
7468 				/*
7469 				 * Fallthrough to invoke the Tx handler to
7470 				 * release held resources, e.g., AH refcount.
7471 				 */
7472 			}
7473 			/*
7474 			 * Add this swqe to the list to be cleaned up.
7475 			 */
7476 			if (head)
7477 				tail->swqe_next = wqe;
7478 			else
7479 				head = WQE_TO_SWQE(wqe);
7480 			tail = WQE_TO_SWQE(wqe);
7481 		}
7482 		tail->swqe_next = NULL;
7483 		ibd_tx_cleanup_list(state, head, tail);
7484 
7485 		/*
7486 		 * Resume any blocked transmissions if possible
7487 		 */
7488 		ibd_resume_transmission(state);
7489 	}
7490 }
7491 
7492 /*
7493  * Poll and fully drain the receive cq
7494  */
7495 static void
7496 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7497 {
7498 	ibt_wc_t *wcs = state->id_rxwcs;
7499 	uint_t numwcs = state->id_rxwcs_size;
7500 	ibd_rwqe_t *rwqe;
7501 	ibt_wc_t *wc;
7502 	uint_t num_polled;
7503 	int i;
7504 	mblk_t *head, *tail, *mp;
7505 
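	/*
	 * Received mblks are chained via b_next and handed to the mac layer
	 * with a single mac_rx() call per poll batch.
	 */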
7506 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7507 		head = tail = NULL;
7508 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7509 			rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7510 			if (wc->wc_status != IBT_WC_SUCCESS) {
7511 				/*
7512 				 * Channel being torn down.
7513 				 */
7514 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7515 					DPRINT(5, "ibd_drain_rcq: "
7516 					    "expected flushed rwqe");
7517 				} else {
7518 					DPRINT(5, "ibd_drain_rcq: "
7519 					    "unexpected wc_status %d",
7520 					    wc->wc_status);
7521 				}
7522 				atomic_inc_32(
7523 				    &state->id_rx_list.dl_bufs_outstanding);
7524 				freemsg(rwqe->rwqe_im_mblk);
7525 				continue;
7526 			}
7527 			mp = ibd_process_rx(state, rwqe, wc);
7528 			if (mp == NULL)
7529 				continue;
7530 
7531 			/*
7532 			 * Add this mp to the list to send to the nw layer.
7533 			 */
7534 			if (head)
7535 				tail->b_next = mp;
7536 			else
7537 				head = mp;
7538 			tail = mp;
7539 		}
7540 		if (head)
7541 			mac_rx(state->id_mh, state->id_rh, head);
7542 
7543 		/*
7544 		 * Account for the number of rwqes polled.
7545 		 * Post more here if the list is less than one fourth full.
7546 		 */
7547 		if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7548 		    (state->id_ud_num_rwqe / 4))
7549 			ibd_post_recv_intr(state);
7550 	}
7551 }
7552 
7553 /*
7554  * Common code for interrupt handling as well as for polling
7555  * for all completed wqe's while detaching.
7556  */
7557 static void
7558 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7559 {
7560 	int flag, redo_flag;
7561 	int redo = 1;
7562 
7563 	flag = IBD_CQ_POLLING;
7564 	redo_flag = IBD_REDO_CQ_POLLING;
7565 
7566 	mutex_enter(&state->id_scq_poll_lock);
7567 	if (state->id_scq_poll_busy & flag) {
7568 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7569 		state->id_scq_poll_busy |= redo_flag;
7570 		mutex_exit(&state->id_scq_poll_lock);
7571 		return;
7572 	}
7573 	state->id_scq_poll_busy |= flag;
7574 	mutex_exit(&state->id_scq_poll_lock);
7575 
7576 	/*
7577 	 * In some cases (e.g. detaching), this code can be invoked on
7578 	 * any cpu after disabling cq notification (thus no concurrency
7579 	 * exists). Apart from that, the following applies normally:
7580 	 * Transmit completion handling could be from any cpu if
7581 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7582 	 * is interrupt driven.
7583 	 */
7584 
7585 	/*
7586 	 * Poll and drain the CQ
7587 	 */
7588 	ibd_drain_scq(state, cq_hdl);
7589 
7590 	/*
7591 	 * Enable CQ notifications and redrain the cq to catch any
7592 	 * completions we might have missed after the ibd_drain_scq()
7593 	 * above and before the ibt_enable_cq_notify() that follows.
7594 	 * Finally, service any new requests to poll the cq that
7595 	 * could've come in after the ibt_enable_cq_notify().
7596 	 */
7597 	do {
7598 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7599 		    IBT_SUCCESS) {
7600 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7601 		}
7602 
7603 		ibd_drain_scq(state, cq_hdl);
7604 
7605 		mutex_enter(&state->id_scq_poll_lock);
7606 		if (state->id_scq_poll_busy & redo_flag)
7607 			state->id_scq_poll_busy &= ~redo_flag;
7608 		else {
7609 			state->id_scq_poll_busy &= ~flag;
7610 			redo = 0;
7611 		}
7612 		mutex_exit(&state->id_scq_poll_lock);
7613 
7614 	} while (redo);
7615 }
7616 
7617 /*
7618  * Common code for interrupt handling as well as for polling
7619  * for all completed wqe's while detaching.
7620  */
7621 static void
7622 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7623 {
7624 	int flag, redo_flag;
7625 	int redo = 1;
7626 
7627 	flag = IBD_CQ_POLLING;
7628 	redo_flag = IBD_REDO_CQ_POLLING;
7629 
7630 	mutex_enter(&state->id_rcq_poll_lock);
7631 	if (state->id_rcq_poll_busy & flag) {
7632 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7633 		state->id_rcq_poll_busy |= redo_flag;
7634 		mutex_exit(&state->id_rcq_poll_lock);
7635 		return;
7636 	}
7637 	state->id_rcq_poll_busy |= flag;
7638 	mutex_exit(&state->id_rcq_poll_lock);
7639 
7640 	/*
7641 	 * Poll and drain the CQ
7642 	 */
7643 	ibd_drain_rcq(state, rcq);
7644 
7645 	/*
7646 	 * Enable CQ notifications and redrain the cq to catch any
7647 	 * completions we might have missed after the ibd_drain_rcq()
7648 	 * above and before the ibt_enable_cq_notify() that follows.
7649 	 * Finally, service any new requests to poll the cq that
7650 	 * could've come in after the ibt_enable_cq_notify().
7651 	 */
7652 	do {
7653 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7654 		    IBT_SUCCESS) {
7655 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7656 		}
7657 
7658 		ibd_drain_rcq(state, rcq);
7659 
7660 		mutex_enter(&state->id_rcq_poll_lock);
7661 		if (state->id_rcq_poll_busy & redo_flag)
7662 			state->id_rcq_poll_busy &= ~redo_flag;
7663 		else {
7664 			state->id_rcq_poll_busy &= ~flag;
7665 			redo = 0;
7666 		}
7667 		mutex_exit(&state->id_rcq_poll_lock);
7668 
7669 	} while (redo);
7670 }
7671 
7672 /*
7673  * Unmap the memory area associated with a given swqe.
7674  */
7675 void
7676 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7677 {
7678 	ibt_status_t stat;
7679 
7680 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7681 
7682 	if (swqe->w_mi_hdl) {
7683 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7684 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
7685 			DPRINT(10,
7686 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7687 		}
7688 		swqe->w_mi_hdl = NULL;
7689 	}
7690 	swqe->w_swr.wr_nds = 0;
7691 }
7692 
7693 void
7694 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7695 {
7696 	/*
7697 	 * The recycling logic can be eliminated from here
7698 	 * and put into the async thread if we create another
7699 	 * list to hold ACE's for unjoined mcg's.
7700 	 */
7701 	if (DEC_REF_DO_CYCLE(ace)) {
7702 		ibd_mce_t *mce;
7703 
7704 		/*
7705 		 * Check with the lock taken: we decremented
7706 		 * reference count without the lock, and some
7707 		 * transmitter might already have bumped the
7708 		 * reference count (possible in case of multicast
7709 		 * disable when we leave the AH on the active
7710 		 * list). If not still 0, get out, leaving the
7711 		 * recycle bit intact.
7712 		 *
7713 		 * Atomically transition the AH from active
7714 		 * to free list, and queue a work request to
7715 		 * leave the group and destroy the mce. No
7716 		 * transmitter can be looking at the AH or
7717 		 * the MCE in between, since we have the
7718 		 * ac_mutex lock. In the SendOnly reap case,
7719 		 * it is not necessary to hold the ac_mutex
7720 		 * and recheck the ref count (since the AH was
7721 		 * taken off the active list), we just do it
7722 		 * to have uniform processing with the Full
7723 		 * reap case.
7724 		 */
7725 		mutex_enter(&state->id_ac_mutex);
7726 		mce = ace->ac_mce;
7727 		if (GET_REF_CYCLE(ace) == 0) {
7728 			CLEAR_REFCYCLE(ace);
7729 			/*
7730 			 * Identify the case of fullmember reap as
7731 			 * opposed to mcg trap reap. Also, port up
7732 			 * might set ac_mce to NULL to indicate Tx
7733 			 * cleanup should do no more than put the
7734 			 * AH in the free list (see ibd_async_link).
7735 			 */
7736 			if (mce != NULL) {
7737 				ace->ac_mce = NULL;
7738 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7739 				/*
7740 				 * mc_req was initialized at mce
7741 				 * creation time.
7742 				 */
7743 				ibd_queue_work_slot(state,
7744 				    &mce->mc_req, IBD_ASYNC_REAP);
7745 			}
7746 			IBD_ACACHE_INSERT_FREE(state, ace);
7747 		}
7748 		mutex_exit(&state->id_ac_mutex);
7749 	}
7750 }
7751 
7752 /*
7753  * Common code that deals with clean ups after a successful or
7754  * erroneous transmission attempt.
7755  */
7756 static void
7757 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7758 {
7759 	ibd_ace_t *ace = swqe->w_ahandle;
7760 
7761 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7762 
7763 	/*
7764 	 * If this was a dynamic mapping in ibd_send(), we need to
7765 	 * unmap here. If this was an lso buffer we'd used for sending,
7766 	 * we need to release the lso buf to the pool, since the resource
7767 	 * is scarce. However, if this was simply a normal send using
7768 	 * the copybuf (present in each swqe), we don't need to release it.
7769 	 */
7770 	if (swqe->swqe_im_mblk != NULL) {
7771 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
7772 			ibd_unmap_mem(state, swqe);
7773 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7774 			ibd_release_lsobufs(state,
7775 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7776 		}
7777 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7778 		freemsg(swqe->swqe_im_mblk);
7779 		swqe->swqe_im_mblk = NULL;
7780 	}
7781 
7782 	/*
7783 	 * Drop the reference count on the AH; it can be reused
7784 	 * now for a different destination if there are no more
7785 	 * posted sends that will use it. This can be eliminated
7786 	 * if we can always associate each Tx buffer with an AH.
7787 	 * The ace can be null if we are cleaning up from the
7788 	 * ibd_send() error path.
7789 	 */
7790 	if (ace != NULL) {
7791 		ibd_dec_ref_ace(state, ace);
7792 	}
7793 
7794 	/*
7795 	 * Release the send wqe for reuse.
7796 	 */
7797 	swqe->swqe_next = NULL;
7798 	ibd_release_swqe(state, swqe, swqe, 1);
7799 }
7800 
7801 static void
7802 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7803 {
7804 	ibd_ace_t *ace;
7805 	ibd_swqe_t *swqe;
7806 	int n = 0;
7807 
7808 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7809 
7810 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7811 
7812 		/*
7813 		 * If this was a dynamic mapping in ibd_send(), we need to
7814 		 * unmap here. If this was an lso buffer we'd used for sending,
7815 		 * we need to release the lso buf to the pool, since the
7816 		 * resource is scarce. However, if this was simply a normal
7817 		 * send using the copybuf (present in each swqe), we don't need
7818 		 * to release it.
7819 		 */
7820 		if (swqe->swqe_im_mblk != NULL) {
7821 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
7822 				ibd_unmap_mem(state, swqe);
7823 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7824 				ibd_release_lsobufs(state,
7825 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7826 			}
7827 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7828 			freemsg(swqe->swqe_im_mblk);
7829 			swqe->swqe_im_mblk = NULL;
7830 		}
7831 
7832 		/*
7833 		 * Drop the reference count on the AH; it can be reused
7834 		 * now for a different destination if there are no more
7835 		 * posted sends that will use it. This can be eliminated
7836 		 * if we can always associate each Tx buffer with an AH.
7837 		 * The ace can be null if we are cleaning up from the
7838 		 * ibd_send() error path.
7839 		 */
7840 		ace = swqe->w_ahandle;
7841 		if (ace != NULL) {
7842 			ibd_dec_ref_ace(state, ace);
7843 		}
7844 		n++;
7845 	}
7846 
7847 	/*
7848 	 * Release the send wqes for reuse.
7849 	 */
7850 	ibd_release_swqe(state, head, tail, n);
7851 }
7852 
7853 /*
7854  * Processing to be done after receipt of a packet; hand off to GLD
7855  * in the format expected by GLD.  The received packet has this
7856  * format: 2b sap :: 00 :: data.
7857  */
7858 static mblk_t *
7859 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7860 {
7861 	ib_header_info_t *phdr;
7862 	mblk_t *mp;
7863 	ipoib_hdr_t *ipibp;
7864 	ipha_t *iphap;
7865 	ip6_t *ip6h;
7866 	int len;
7867 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7868 	uint32_t bufs;
7869 
7870 	/*
7871 	 * Track the number of buffers handed up that need to be returned.
7872 	 */
7873 	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7874 
7875 	/* Never run out of rwqes, use allocb when running low */
7876 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
7877 		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7878 		atomic_inc_32(&state->id_rx_allocb);
7879 		mp = allocb(pkt_len, BPRI_HI);
7880 		if (mp) {
7881 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7882 			ibd_post_recv(state, rwqe);
7883 		} else {	/* no memory */
7884 			atomic_inc_32(&state->id_rx_allocb_failed);
7885 			ibd_post_recv(state, rwqe);
7886 			return (NULL);
7887 		}
7888 	} else {
7889 		mp = rwqe->rwqe_im_mblk;
7890 	}
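	/*
	 * At this point mp either wraps the rwqe's receive buffer directly
	 * (the common case) or is a fresh allocb() copy; in the copy case
	 * the rwqe has already been reposted above.
	 */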
7891 
7892 
7893 	/*
7894 	 * Adjust write pointer depending on how much data came in.
7895 	 */
7896 	mp->b_wptr = mp->b_rptr + pkt_len;
7897 
7898 	/*
7899 	 * Make sure this is NULL or we're in trouble.
7900 	 */
7901 	if (mp->b_next != NULL) {
7902 		ibd_print_warn(state,
7903 		    "ibd_process_rx: got duplicate mp from rcq?");
7904 		mp->b_next = NULL;
7905 	}
7906 
7907 	/*
7908 	 * The IB link delivers an IB link-layer header called the
7909 	 * Global Routing Header (GRH).
7910 	 * The ibd driver uses the information in the GRH to build
7911 	 * the Header_info structure and passes it with the datagram
7912 	 * up to GLDv3.
7913 	 * If the GRH is not valid, indicate this to GLDv3 by setting
7914 	 * the VerTcFlow field to 0.
7915 	 */
7916 	phdr = (ib_header_info_t *)mp->b_rptr;
7917 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7918 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7919 
7920 		/* if it is loop back packet, just drop it. */
7921 		if (state->id_enable_rc) {
7922 			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7923 			    &state->rc_macaddr_loopback,
7924 			    IPOIB_ADDRL) == 0) {
7925 				freemsg(mp);
7926 				return (NULL);
7927 			}
7928 		} else {
7929 			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7930 			    IPOIB_ADDRL) == 0) {
7931 				freemsg(mp);
7932 				return (NULL);
7933 			}
7934 		}
7935 
7936 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7937 		    sizeof (ipoib_mac_t));
7938 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
7939 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
7940 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
7941 		} else {
7942 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
7943 		}
7944 	} else {
7945 		/*
7946 		 * It cannot be an IBA multicast packet; it must have been
7947 		 * unicast to us.  Just copy the interface address to dst.
7948 		 */
7949 		phdr->ib_grh.ipoib_vertcflow = 0;
7950 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
7951 		    sizeof (ipoib_mac_t));
7952 	}
7953 
7954 	/*
7955 	 * For ND6 packets, padding is at the front of the source/target
7956 	 * lladdr.  However, the inet6 layer is not aware of it, so remove
7957 	 * the padding from such packets.
7958 	 */
7959 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
7960 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
7961 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7962 		len = ntohs(ip6h->ip6_plen);
7963 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7964 			/* LINTED: E_CONSTANT_CONDITION */
7965 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
7966 		}
7967 	}
7968 
7969 	/*
7970 	 * Update statistics
7971 	 */
7972 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
7973 	atomic_inc_64(&state->id_rcv_pkt);
7974 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7975 		atomic_inc_64(&state->id_brd_rcv);
7976 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7977 		atomic_inc_64(&state->id_multi_rcv);
7978 
7979 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7980 	/*
7981 	 * Set the receive checksum status in mp.
7982 	 * Hardware checksumming can be considered valid only if:
7983 	 * 1. The CQE.IP_OK bit is set
7984 	 * 2. CQE.CKSUM = 0xffff
7985 	 * 3. No IPv6 routing header is present in the packet
7986 	 * 4. There are no IP options in the IP header
7987 	 */
7988 
7989 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
7990 	    (wc->wc_cksum == 0xFFFF) &&
7991 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
7992 		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
7993 	}
7994 
7995 	return (mp);
7996 }
7997 
7998 /*
7999  * Callback code invoked from STREAMS when the receive data buffer is
8000  * free for recycling.
8001  */
8002 static void
8003 ibd_freemsg_cb(char *arg)
8004 {
8005 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8006 	ibd_state_t *state = rwqe->w_state;
8007 
8008 	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8009 
8010 	/*
8011 	 * If the driver is stopped, just free the rwqe.
8012 	 */
8013 	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8014 		DPRINT(6, "ibd_freemsg: wqe being freed");
8015 		rwqe->rwqe_im_mblk = NULL;
8016 		ibd_free_rwqe(state, rwqe);
8017 		return;
8018 	}
8019 
8020 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8021 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8022 	if (rwqe->rwqe_im_mblk == NULL) {
8023 		ibd_free_rwqe(state, rwqe);
8024 		DPRINT(6, "ibd_freemsg: desballoc failed");
8025 		return;
8026 	}
8027 
8028 	ibd_post_recv(state, rwqe);
8029 }
8030 
8031 static uint_t
8032 ibd_tx_recycle(caddr_t arg)
8033 {
8034 	ibd_state_t *state = (ibd_state_t *)arg;
8035 
8036 	/*
8037 	 * Poll for completed entries
8038 	 */
8039 	ibd_poll_scq(state, state->id_scq_hdl);
8040 
8041 	return (DDI_INTR_CLAIMED);
8042 }
8043 
8044 #ifdef IBD_LOGGING
8045 static void
8046 ibd_log_init(void)
8047 {
8048 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8049 	ibd_lbuf_ndx = 0;
8050 
8051 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8052 }
8053 
8054 static void
8055 ibd_log_fini(void)
8056 {
8057 	if (ibd_lbuf)
8058 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
8059 	ibd_lbuf_ndx = 0;
8060 	ibd_lbuf = NULL;
8061 
8062 	mutex_destroy(&ibd_lbuf_lock);
8063 }
8064 
8065 static void
8066 ibd_log(const char *fmt, ...)
8067 {
8068 	va_list	ap;
8069 	uint32_t off;
8070 	uint32_t msglen;
8071 	char tmpbuf[IBD_DMAX_LINE];
8072 
8073 	if (ibd_lbuf == NULL)
8074 		return;
8075 
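	/*
	 * Format the message into a local buffer first, then reserve space
	 * in the circular log buffer under ibd_lbuf_lock; the copy into the
	 * reserved region is done after the lock is dropped.
	 */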
8076 	va_start(ap, fmt);
8077 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8078 	va_end(ap);
8079 
8080 	if (msglen >= IBD_DMAX_LINE)
8081 		msglen = IBD_DMAX_LINE - 1;
8082 
8083 	mutex_enter(&ibd_lbuf_lock);
8084 
8085 	off = ibd_lbuf_ndx;		/* current msg should go here */
8086 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8087 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8088 
8089 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
8090 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
8091 
8092 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8093 		ibd_lbuf_ndx = 0;
8094 
8095 	mutex_exit(&ibd_lbuf_lock);
8096 
8097 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
8098 }
8099 #endif
8100 
8101 /* ARGSUSED */
8102 static int
8103 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8104     int *rvalp)
8105 {
8106 	ibd_create_ioctl_t	*cmd = karg;
8107 	ibd_state_t		*state, *port_state, *p;
8108 	int			i, err, rval = 0;
8109 	mac_register_t		*macp;
8110 	ibt_hca_portinfo_t 	*pinfop = NULL;
8111 	ibt_status_t 		ibt_status;
8112 	uint_t 			psize, pinfosz;
8113 	boolean_t		force_create = B_FALSE;
8114 
8115 	cmd->ibdioc.ioc_status = 0;
8116 
8117 	if (cmd->ibdioc.ioc_port_inst < 0) {
8118 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8119 		return (EINVAL);
8120 	}
8121 	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8122 	if (port_state == NULL) {
8123 		DPRINT(10, "ibd_create_partition: failed to get state %d",
8124 		    cmd->ibdioc.ioc_port_inst);
8125 		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8126 		return (EINVAL);
8127 	}
8128 
8129 	/* Limited PKeys not supported */
8130 	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8131 		rval = EINVAL;
8132 		goto part_create_return;
8133 	}
8134 
8135 	if (cmd->ioc_force_create == 0) {
8136 		/*
8137 		 * Check if the port pkey table contains the pkey for which
8138 		 * this partition is being created.
8139 		 */
8140 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8141 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8142 
8143 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8144 			rval = EINVAL;
8145 			goto part_create_return;
8146 		}
8147 
8148 		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8149 			rval = ENETDOWN;
8150 			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8151 			goto part_create_return;
8152 		}
8153 
8154 		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8155 			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8156 				break;
8157 			}
8158 		}
8159 		if (i == pinfop->p_pkey_tbl_sz) {
8160 			rval = EINVAL;
8161 			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8162 			goto part_create_return;
8163 		}
8164 	} else {
8165 		force_create = B_TRUE;
8166 	}
8167 
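	/*
	 * A partition object is identified by its port instance and pkey;
	 * refuse to create a duplicate.
	 */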
8168 	mutex_enter(&ibd_objlist_lock);
8169 	for (p = ibd_objlist_head; p; p = p->id_next) {
8170 		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8171 		    (p->id_pkey == cmd->ioc_pkey)) {
8172 			mutex_exit(&ibd_objlist_lock);
8173 			rval = EEXIST;
8174 			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8175 			goto part_create_return;
8176 		}
8177 	}
8178 	mutex_exit(&ibd_objlist_lock);
8179 
8180 	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8181 
8182 	state->id_type		= IBD_PARTITION_OBJ;
8183 
8184 	state->id_plinkid	= cmd->ioc_partid;
8185 	state->id_dlinkid	= cmd->ibdioc.ioc_linkid;
8186 	state->id_port_inst	= cmd->ibdioc.ioc_port_inst;
8187 
8188 	state->id_dip		= port_state->id_dip;
8189 	state->id_port		= port_state->id_port;
8190 	state->id_pkey		= cmd->ioc_pkey;
8191 	state->id_hca_guid	= port_state->id_hca_guid;
8192 	state->id_port_guid	= port_state->id_port_guid;
8193 	state->id_force_create	= force_create;
8194 
8195 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8196 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8197 
8198 	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8199 		rval = EIO;
8200 		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8201 		goto fail;
8202 	}
8203 
8204 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8205 		rval = EAGAIN;
8206 		goto fail;
8207 	}
8208 
8209 	macp->m_type_ident	= MAC_PLUGIN_IDENT_IB;
8210 	macp->m_dip		= port_state->id_dip;
8211 	macp->m_instance	= (uint_t)-1;
8212 	macp->m_driver		= state;
8213 	macp->m_src_addr	= (uint8_t *)&state->id_macaddr;
8214 	macp->m_callbacks	= &ibd_m_callbacks;
8215 	macp->m_min_sdu		= 0;
8216 	if (state->id_enable_rc) {
8217 		macp->m_max_sdu		= IBD_DEF_RC_MAX_SDU;
8218 	} else {
8219 		macp->m_max_sdu		= IBD_DEF_MAX_SDU;
8220 	}
8221 	macp->m_priv_props = ibd_priv_props;
8222 
8223 	err = mac_register(macp, &state->id_mh);
8224 	mac_free(macp);
8225 
8226 	if (err != 0) {
8227 		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8228 		    err);
8229 		rval = err;
8230 		goto fail;
8231 	}
8232 
8233 	err = dls_devnet_create(state->id_mh,
8234 	    cmd->ioc_partid, crgetzoneid(credp));
8235 	if (err != 0) {
8236 		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8237 		    "%d", err);
8238 		rval = err;
8239 		(void) mac_unregister(state->id_mh);
8240 		goto fail;
8241 	}
8242 
8243 	/*
8244 	 * Add the new partition state structure to the list
8245 	 */
8246 	mutex_enter(&ibd_objlist_lock);
8247 	if (ibd_objlist_head)
8248 		state->id_next = ibd_objlist_head;
8249 
8250 	ibd_objlist_head = state;
8251 	mutex_exit(&ibd_objlist_lock);
8252 
8253 part_create_return:
8254 	if (pinfop) {
8255 		ibt_free_portinfo(pinfop, pinfosz);
8256 	}
8257 	return (rval);
8258 
8259 fail:
8260 	if (pinfop) {
8261 		ibt_free_portinfo(pinfop, pinfosz);
8262 	}
8263 	ibd_part_unattach(state);
8264 	kmem_free(state, sizeof (ibd_state_t));
8265 	return (rval);
8266 }
8267 
8268 /* ARGSUSED */
8269 static int
8270 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8271     int *rvalp)
8272 {
8273 	int err;
8274 	datalink_id_t tmpid;
8275 	ibd_state_t *node, *prev;
8276 	ibd_delete_ioctl_t *cmd = karg;
8277 
8278 	prev = NULL;
8279 
8280 	mutex_enter(&ibd_objlist_lock);
8281 	node = ibd_objlist_head;
8282 
8283 	/* Find the ibd state structure corresponding to the partition */
8284 	while (node != NULL) {
8285 		if (node->id_plinkid == cmd->ioc_partid)
8286 			break;
8287 		prev = node;
8288 		node = node->id_next;
8289 	}
8290 
8291 	if (node == NULL) {
8292 		mutex_exit(&ibd_objlist_lock);
8293 		return (ENOENT);
8294 	}
8295 
8296 	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8297 		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8298 		    "%d", err);
8299 		mutex_exit(&ibd_objlist_lock);
8300 		return (err);
8301 	}
8302 
8303 	/*
8304 	 * Call ibd_part_unattach() only after making sure that the instance has
8305 	 * not been started yet and is also not in late hca init mode.
8306 	 */
8307 	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8308 
8309 	err = 0;
8310 	if ((node->id_mac_state & IBD_DRV_STARTED) ||
8311 	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8312 	    (ibd_part_busy(node) != DDI_SUCCESS) ||
8313 	    ((err = mac_disable(node->id_mh)) != 0)) {
8314 		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8315 		    crgetzoneid(credp));
8316 		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8317 		mutex_exit(&ibd_objlist_lock);
8318 		return (err != 0 ? err : EBUSY);
8319 	}
8320 
8321 	node->id_mac_state |= IBD_DRV_IN_DELETION;
8322 
8323 	ibd_part_unattach(node);
8324 
8325 	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8326 
8327 	/* Remove the partition state structure from the linked list */
8328 	if (prev == NULL)
8329 		ibd_objlist_head = node->id_next;
8330 	else
8331 		prev->id_next = node->id_next;
8332 	mutex_exit(&ibd_objlist_lock);
8333 
8334 	if ((err = mac_unregister(node->id_mh)) != 0) {
8335 		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8336 		    err);
8337 	}
8338 
8339 	cv_destroy(&node->id_macst_cv);
8340 	mutex_destroy(&node->id_macst_lock);
8341 
8342 	kmem_free(node, sizeof (ibd_state_t));
8343 
8344 	return (0);
8345 }
8346 
8347 /* ARGSUSED */
8348 static int
8349 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8350     int *rvalp)
8351 {
8352 	ibd_ioctl_t		cmd;
8353 	ibpart_ioctl_t		partioc;
8354 	ibport_ioctl_t		portioc;
8355 #ifdef _MULTI_DATAMODEL
8356 	ibport_ioctl32_t	portioc32;
8357 #endif
8358 	ibd_state_t		*state, *port_state;
8359 	int			size;
8360 	ibt_hca_portinfo_t 	*pinfop = NULL;
8361 	ibt_status_t 		ibt_status;
8362 	uint_t 			psize, pinfosz;
8363 	int			rval = 0;
8364 
8365 	size = sizeof (ibd_ioctl_t);
8366 	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8367 		return (EFAULT);
8368 	}
8369 	cmd.ioc_status = 0;
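	/*
	 * Three info subcommands are handled here: partition attributes
	 * (IBD_INFO_CMD_IBPART), the port pkey table contents
	 * (IBD_INFO_CMD_IBPORT), and the pkey table size
	 * (IBD_INFO_CMD_PKEYTBLSZ).
	 */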
8370 	switch (cmd.ioc_info_cmd) {
8371 	case IBD_INFO_CMD_IBPART:
8372 		size = sizeof (ibpart_ioctl_t);
8373 		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8374 			return (EFAULT);
8375 		}
8376 
8377 		mutex_enter(&ibd_objlist_lock);
8378 		/* Find the ibd state structure corresponding to the partition */
8379 		for (state = ibd_objlist_head; state; state = state->id_next) {
8380 			if (state->id_plinkid == cmd.ioc_linkid) {
8381 				break;
8382 			}
8383 		}
8384 
8385 		if (state == NULL) {
8386 			mutex_exit(&ibd_objlist_lock);
8387 			return (ENOENT);
8388 		}
8389 
8390 		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8391 		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8392 		partioc.ibdioc.ioc_portnum = state->id_port;
8393 		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8394 		partioc.ibdioc.ioc_portguid = state->id_port_guid;
8395 		partioc.ibdioc.ioc_status = 0;
8396 		partioc.ioc_partid = state->id_plinkid;
8397 		partioc.ioc_pkey = state->id_pkey;
8398 		partioc.ioc_force_create = state->id_force_create;
8399 		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8400 			mutex_exit(&ibd_objlist_lock);
8401 			return (EFAULT);
8402 		}
8403 		mutex_exit(&ibd_objlist_lock);
8404 
8405 		break;
8406 
8407 	case IBD_INFO_CMD_IBPORT:
8408 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8409 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8410 			DPRINT(10, "ibd_get_partition_info: failed to get"
8411 			    " state %d", cmd.ioc_port_inst);
8412 			size = sizeof (ibd_ioctl_t);
8413 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8414 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8415 			    mode)) {
8416 				return (EFAULT);
8417 			}
8418 			return (EINVAL);
8419 		}
8420 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8421 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8422 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8423 			rval = EINVAL;
			goto fail;
8424 		}
8425 #ifdef _MULTI_DATAMODEL
8426 		switch (ddi_model_convert_from(mode & FMODELS)) {
8427 		case DDI_MODEL_ILP32: {
8428 			size = sizeof (ibport_ioctl32_t);
8429 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8430 				rval = EFAULT;
8431 				goto fail;
8432 			}
8433 			portioc32.ibdioc.ioc_status = 0;
8434 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8435 			portioc32.ibdioc.ioc_hcaguid =
8436 			    port_state->id_hca_guid;
8437 			portioc32.ibdioc.ioc_portguid =
8438 			    port_state->id_port_guid;
8439 			if (portioc32.ioc_pkey_tbl_sz !=
8440 			    pinfop->p_pkey_tbl_sz) {
8441 				rval = EINVAL;
8442 				size = sizeof (ibd_ioctl_t);
8443 				portioc32.ibdioc.ioc_status =
8444 				    IBD_INVALID_PKEY_TBL_SIZE;
8445 				if (ddi_copyout((void *)&portioc32.ibdioc,
8446 				    (void *)arg, size, mode)) {
8447 					rval = EFAULT;
8448 					goto fail;
8449 				}
8450 				goto fail;
8451 			}
8452 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8453 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8454 			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8455 			    mode)) {
8456 				rval = EFAULT;
8457 				goto fail;
8458 			}
8459 			size = sizeof (ibport_ioctl32_t);
8460 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8461 			    mode)) {
8462 				rval = EFAULT;
8463 				goto fail;
8464 			}
8465 			break;
8466 		}
8467 		case DDI_MODEL_NONE:
8468 			size = sizeof (ibport_ioctl_t);
8469 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8470 				rval = EFAULT;
8471 				goto fail;
8472 			}
8473 			portioc.ibdioc.ioc_status = 0;
8474 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8475 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8476 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8477 			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8478 				rval = EINVAL;
8479 				size = sizeof (ibd_ioctl_t);
8480 				portioc.ibdioc.ioc_status =
8481 				    IBD_INVALID_PKEY_TBL_SIZE;
8482 				if (ddi_copyout((void *)&portioc.ibdioc,
8483 				    (void *)arg, size, mode)) {
8484 					rval = EFAULT;
8485 					goto fail;
8486 				}
8487 				goto fail;
8488 			}
8489 			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8490 			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8491 			    (void *)(portioc.ioc_pkeys), size, mode)) {
8492 				rval = EFAULT;
8493 				goto fail;
8494 			}
8495 			size = sizeof (ibport_ioctl_t);
8496 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8497 			    mode)) {
8498 				rval = EFAULT;
8499 				goto fail;
8500 			}
8501 			break;
8502 		}
8503 #else /* ! _MULTI_DATAMODEL */
8504 		size = sizeof (ibport_ioctl_t);
8505 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8506 			rval = EFAULT;
8507 			goto fail;
8508 		}
8509 		portioc.ibdioc.ioc_status = 0;
8510 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8511 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8512 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8513 		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8514 			rval = EINVAL;
8515 			size = sizeof (ibd_ioctl_t);
8516 			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8517 			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8518 			    size, mode)) {
8519 				rval = EFAULT;
8520 				goto fail;
8521 			}
8522 			goto fail;
8523 		}
8524 		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8525 		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8526 		    (void *)(portioc.ioc_pkeys), size, mode)) {
8527 			rval = EFAULT;
8528 			goto fail;
8529 		}
8530 		size = sizeof (ibport_ioctl_t);
8531 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8532 		    mode)) {
8533 			rval = EFAULT;
8534 			goto fail;
8535 		}
8536 #endif /* _MULTI_DATAMODEL */
8537 
8538 		break;
8539 
8540 	case IBD_INFO_CMD_PKEYTBLSZ:
8541 		if ((cmd.ioc_port_inst < 0) || ((port_state =
8542 		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8543 			DPRINT(10, "ibd_get_partition_info: failed to get"
8544 			    " state %d", cmd.ioc_port_inst);
8545 			size = sizeof (ibd_ioctl_t);
8546 			cmd.ioc_status = IBD_INVALID_PORT_INST;
8547 			if (ddi_copyout((void *)&cmd, (void *)arg, size,
8548 			    mode)) {
8549 				return (EFAULT);
8550 			}
8551 			return (EINVAL);
8552 		}
8553 		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8554 		    port_state->id_port, &pinfop, &psize, &pinfosz);
8555 		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8556 			rval = EINVAL;
			goto fail;
8557 		}
8558 #ifdef _MULTI_DATAMODEL
8559 		switch (ddi_model_convert_from(mode & FMODELS)) {
8560 		case DDI_MODEL_ILP32: {
8561 			size = sizeof (ibport_ioctl32_t);
8562 			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8563 				rval = EFAULT;
8564 				goto fail;
8565 			}
8566 			portioc32.ibdioc.ioc_status = 0;
8567 			portioc32.ibdioc.ioc_portnum = port_state->id_port;
8568 			portioc32.ibdioc.ioc_hcaguid =
8569 			    port_state->id_hca_guid;
8570 			portioc32.ibdioc.ioc_portguid =
8571 			    port_state->id_port_guid;
8572 			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8573 			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8574 			    mode)) {
8575 				rval = EFAULT;
8576 				goto fail;
8577 			}
8578 			break;
8579 		}
8580 		case DDI_MODEL_NONE:
8581 			size = sizeof (ibport_ioctl_t);
8582 			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8583 				rval = EFAULT;
8584 				goto fail;
8585 			}
8586 			portioc.ibdioc.ioc_status = 0;
8587 			portioc.ibdioc.ioc_portnum = port_state->id_port;
8588 			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8589 			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8590 			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8591 			if (ddi_copyout((void *)&portioc, (void *)arg, size,
8592 			    mode)) {
8593 				rval = EFAULT;
8594 				goto fail;
8595 			}
8596 			break;
8597 		}
8598 #else /* ! _MULTI_DATAMODEL */
8599 		size = sizeof (ibport_ioctl_t);
8600 		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8601 			rval = EFAULT;
8602 			goto fail;
8603 		}
8604 		portioc.ibdioc.ioc_status = 0;
8605 		portioc.ibdioc.ioc_portnum = port_state->id_port;
8606 		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8607 		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8608 		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8609 		if (ddi_copyout((void *)&portioc, (void *)arg, size,
8610 		    mode)) {
8611 			rval = EFAULT;
8612 			goto fail;
8613 		}
8614 #endif /* _MULTI_DATAMODEL */
8615 		break;
8616 
8617 	default:
8618 		return (EINVAL);
8619 
8620 	} /* switch (cmd.ioc_info_cmd) */
8621 fail:
8622 	if (pinfop) {
8623 		ibt_free_portinfo(pinfop, pinfosz);
8624 	}
8625 	return (rval);
8626 }
8627 
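/*
 * IBTF asynchronous event handler for the port driver instance.  Only
 * port up/down transitions are of interest here; the current link state
 * is re-read and, if it changed, propagated to the MAC layer via
 * mac_link_update().
 */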
8628 /* ARGSUSED */
8629 static void
8630 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8631     ibt_async_code_t code, ibt_async_event_t *event)
8632 {
8633 	ibd_state_t *state = (ibd_state_t *)arg;
8634 	link_state_t	lstate;
8635 
8636 	switch (code) {
8637 	case IBT_EVENT_PORT_UP:
8638 	case IBT_ERROR_PORT_DOWN:
8639 		if (ibd_get_port_state(state, &lstate) != 0)
8640 			break;
8641 
8642 		if (state->id_link_state != lstate) {
8643 			state->id_link_state = lstate;
8644 			mac_link_update(state->id_mh, lstate);
8645 		}
8646 		break;
8647 	default:
8648 		break;
8649 	}
8650 }
8651 
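/*
 * Query the HCA for the current state of the port backing this instance.
 * The SGID and link speed cached in the softstate are refreshed as a side
 * effect; *lstate is set to LINK_STATE_UP only when the port is ACTIVE.
 * Returns 0 on success, -1 if the port query fails.
 */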
8652 static int
8653 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8654 {
8655 	ibt_hca_portinfo_t *port_infop;
8656 	uint_t psize, port_infosz;
8657 	ibt_status_t	ret;
8658 
8659 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8660 	    &port_infop, &psize, &port_infosz);
8661 	if ((ret != IBT_SUCCESS) || (psize != 1))
8662 		return (-1);
8663 
8664 	state->id_sgid = *port_infop->p_sgid_tbl;
8665 	state->id_link_speed = ibd_get_portspeed(state);
8666 
8667 	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8668 		*lstate = LINK_STATE_UP;
8669 	else
8670 		*lstate = LINK_STATE_DOWN;
8671 
8672 	ibt_free_portinfo(port_infop, port_infosz);
8673 	return (0);
8674 }
8675 
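/*
 * Attach handler for a per-port ibd instance.  The port number and the
 * HCA/port GUIDs are read from the "port-number", "hca-guid" and
 * "port-guid" properties on the devinfo node, after which the driver
 * attaches to IBTF, opens the HCA, samples the initial link state and
 * registers with the MAC layer.  Partial progress is rolled back through
 * ibd_port_unattach() on failure.
 */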
8676 static int
8677 ibd_port_attach(dev_info_t *dip)
8678 {
8679 	ibd_state_t		*state;
8680 	link_state_t		lstate;
8681 	int			instance;
8682 	ibt_status_t		ret;
8683 
8684 	/*
8685 	 * Allocate softstate structure
8686 	 */
8687 	instance = ddi_get_instance(dip);
8688 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8689 		DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8690 		return (DDI_FAILURE);
8691 	}
8692 
8693 	state = ddi_get_soft_state(ibd_list, instance);
8694 
8695 	state->id_dip = dip;
8696 	state->id_type = IBD_PORT_DRIVER;
8697 
8698 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8699 	    "port-number", 0)) == 0) {
8700 		DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8701 		    state->id_port);
8702 		goto done;
8703 	}
8704 	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8705 	    "hca-guid", 0)) == 0) {
8706 		DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8707 		    state->id_hca_guid);
8708 		goto done;
8709 	}
8710 	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8711 	    "port-guid", 0)) == 0) {
8712 		DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8713 		    state->id_port_guid);
8714 		goto done;
8715 	}
8716 
8717 	/*
8718 	 * Attach to IBTL
8719 	 */
8720 	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8721 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
8722 		DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8723 		    ret);
8724 		goto done;
8725 	}
8726 
8727 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8728 
8729 	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8730 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
8731 		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8732 		    ret);
8733 		goto done;
8734 	}
8735 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
8736 
8737 	/* Update link status */
8738 
8739 	if (ibd_get_port_state(state, &lstate) != 0) {
8740 		DPRINT(10, "ibd_port_attach: ibd_get_port_state() failed");
8742 		goto done;
8743 	}
8744 	state->id_link_state = lstate;
8745 	/*
8746 	 * Register ibd interfaces with the Nemo framework
8747 	 */
8748 	if (ibd_register_mac(state, dip) != DDI_SUCCESS) {
8749 		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8750 		goto done;
8751 	}
8752 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8753 
8754 	mac_link_update(state->id_mh, lstate);
8755 
8756 	return (DDI_SUCCESS);
8757 done:
8758 	(void) ibd_port_unattach(state, dip);
8759 	return (DDI_FAILURE);
8760 }
8761 
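/*
 * Undo whatever ibd_port_attach() managed to complete, as recorded in the
 * id_mac_state progress flags: unregister from the MAC layer, close the
 * HCA and detach from IBTF, then release the softstate.
 */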
8762 static int
8763 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8764 {
8765 	int instance;
8766 	uint32_t progress = state->id_mac_state;
8767 	ibt_status_t ret;
8768 
8769 	if (progress & IBD_DRV_MAC_REGISTERED) {
8770 		(void) mac_unregister(state->id_mh);
8771 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8772 	}
8773 
8774 	if (progress & IBD_DRV_HCA_OPENED) {
8775 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8776 		    IBT_SUCCESS) {
8777 			ibd_print_warn(state, "failed to close "
8778 			    "HCA device, ret=%d", ret);
8779 		}
8780 		state->id_hca_hdl = NULL;
8781 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8782 	}
8783 
8784 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8785 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8786 			ibd_print_warn(state,
8787 			    "ibt_detach() failed, ret=%d", ret);
8788 		}
8789 		state->id_ibt_hdl = NULL;
8790 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8791 	}
8792 	instance = ddi_get_instance(dip);
8793 	ddi_soft_state_free(ibd_list, instance);
8794 
8795 	return (DDI_SUCCESS);
8796 }
8797 
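/*
 * Given a datalink id, locate the matching partition object on the
 * object list and return its attributes (link ids, port, P_Key and
 * HCA/port GUIDs).  IBT_NO_SUCH_OBJECT is returned when no partition
 * matches; ibd_objlist_lock is held across the lookup and copy.
 */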
8798 ibt_status_t
8799 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8800 {
8801 	ibd_state_t	*state;
8802 
8803 	mutex_enter(&ibd_objlist_lock);
8804 
8805 	/* Find the ibd state structure corresponding to the partition */
8806 	for (state = ibd_objlist_head; state; state = state->id_next) {
8807 		if (state->id_plinkid == linkid) {
8808 			break;
8809 		}
8810 	}
8811 
8812 	if (state == NULL) {
8813 		mutex_exit(&ibd_objlist_lock);
8814 		return (IBT_NO_SUCH_OBJECT);
8815 	}
8816 
8817 	attr->pa_dlinkid = state->id_dlinkid;
8818 	attr->pa_plinkid = state->id_plinkid;
8819 	attr->pa_port = state->id_port;
8820 	attr->pa_hca_guid = state->id_hca_guid;
8821 	attr->pa_port_guid = state->id_port_guid;
8822 	attr->pa_pkey = state->id_pkey;
8823 
8824 	mutex_exit(&ibd_objlist_lock);
8825 
8826 	return (IBT_SUCCESS);
8827 }
8828 
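/*
 * Return a snapshot of the attributes of every partition object currently
 * on the object list.  The array is allocated here with KM_SLEEP; the
 * caller is expected to free sizeof (ibt_part_attr_t) * (*nparts) bytes
 * when done.  *attr_list is NULL and *nparts is 0 when no partitions
 * exist.
 */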
8829 ibt_status_t
8830 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8831 {
8832 	ibd_state_t	*state;
8833 	int		n = 0;
8834 	ibt_part_attr_t	*attr;
8835 
8836 	mutex_enter(&ibd_objlist_lock);
8837 
8838 	for (state = ibd_objlist_head; state; state = state->id_next)
8839 		n++;
8840 
8841 	*nparts = n;
8842 	if (n == 0) {
8843 		*attr_list = NULL;
8844 		mutex_exit(&ibd_objlist_lock);
8845 		return (IBT_SUCCESS);
8846 	}
8847 
8848 	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8849 	attr = *attr_list;
8850 	for (state = ibd_objlist_head; state; state = state->id_next) {
8851 #ifdef DEBUG
8852 		ASSERT(n > 0);
8853 		n--;
8854 #endif
8855 		attr->pa_dlinkid = state->id_dlinkid;
8856 		attr->pa_plinkid = state->id_plinkid;
8857 		attr->pa_port = state->id_port;
8858 		attr->pa_hca_guid = state->id_hca_guid;
8859 		attr->pa_port_guid = state->id_port_guid;
8860 		attr->pa_pkey = state->id_pkey;
8861 		attr++;
8862 	}
8863 
8864 	mutex_exit(&ibd_objlist_lock);
8865 	return (IBT_SUCCESS);
8866 }
8867
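/*
 * Illustrative sketch (not part of the driver): how a kernel caller might
 * consume and release the list returned by ibd_get_all_part_attr().  It
 * assumes only what the function above guarantees -- a KM_SLEEP array of
 * *nparts entries that the caller must kmem_free() with a matching size.
 *
 *	ibt_part_attr_t *attr_list;
 *	int nparts, i;
 *
 *	if (ibd_get_all_part_attr(&attr_list, &nparts) == IBT_SUCCESS) {
 *		for (i = 0; i < nparts; i++) {
 *			// inspect attr_list[i].pa_pkey, pa_port,
 *			// pa_hca_guid, pa_port_guid, pa_dlinkid, ...
 *		}
 *		if (nparts > 0)
 *			kmem_free(attr_list,
 *			    sizeof (ibt_part_attr_t) * nparts);
 *	}
 */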